ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • XGBoost로 분류 모델 예시(산탄데르 은행 고객 만족 여부 분류 모델)
    Python 데이터 분석 2022. 11. 23. 12:20

     

     

    # Santander Bank customer-satisfaction classification model.
    # Label column: TARGET - 0 (satisfied), 1 (unsatisfied).
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    import matplotlib.pyplot as plt

    # Load the training data and take a first look at its shape and dtypes.
    df = pd.read_csv('train.csv', encoding='latin-1')
    print(df.head(3), df.shape)  # (76020, 371)
    print(df.info())
    print()

    # Class balance: the unsatisfied class (1) is a small minority.
    print(df['TARGET'].value_counts())  # 0 : 73012, 1 : 3008
    unsatisfied_cnt = (df['TARGET'] == 1).sum()
    total_cnt = df['TARGET'].count()
    print(f'불만족 비율은 {unsatisfied_cnt / total_cnt:.2f}')  # unsatisfied ratio: 0.04

    # Tip: pd.set_option('display.max_columns', 500) shows the columns that
    # the summaries below would otherwise elide with "...".
    print(df.describe())  # var3 has a suspicious minimum of -999999

    # Replace the -999999 sentinel in var3 with 2 (var3's median and both
    # quartiles in the describe() output). The result is assigned back rather
    # than using `df['var3'].replace(..., inplace=True)`: that form mutates an
    # intermediate Series, which warns on pandas 2.x and leaves `df` untouched
    # under Copy-on-Write.
    df['var3'] = df['var3'].replace(-999999, 2)

    # ID is a row identifier, not a feature.
    df = df.drop(columns='ID')

    # Select features and label by name; TARGET is the last column, so this
    # matches a positional split but does not depend on column order.
    x_features = df.drop(columns='TARGET')
    y_labels = df['TARGET']
    print(x_features.shape, y_labels.shape)  # (76020, 369) (76020,)
    
    # Hold out 20% of the rows for evaluation; the fixed seed makes the split
    # reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        x_features,
        y_labels,
        test_size=0.2,
        random_state=12,
    )
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)  # (60816, 369) (15204, 369) (60816,) (15204,)

    # Share of each label within the train and test splits, to check that both
    # keep roughly the same class balance.
    train_cnt = y_train.count()
    test_cnt = y_test.count()
    train_label_ratio = y_train.value_counts().div(train_cnt)
    test_label_ratio = y_test.value_counts().div(test_cnt)
    print('train 데이터 레이블 분포 비율 :', train_label_ratio)  # 0 : 0.960257, 1 : 0.039743
    print('test 데이터 레이블 분포 비율 :', test_label_ratio)  # 0 : 0.961129, 1 : 0.038871
    
    # Model: gradient-boosted trees evaluated with ROC AUC.
    from xgboost import XGBClassifier
    from sklearn.metrics import roc_auc_score
    from sklearn import metrics

    # n_estimators is kept at 5 because the data set has so many columns.
    # eval_metric and early_stopping_rounds are set on the estimator: passing
    # them to fit() was deprecated in XGBoost 1.6 and removed in 2.0.
    # Early stopping halts training once the metric on the last eval_set entry
    # has not improved for 2 consecutive rounds.
    xgb_clf = XGBClassifier(
        n_estimators=5,
        random_state=12,
        eval_metric='auc',
        early_stopping_rounds=2,
    )
    # NOTE(review): the test split doubles as the early-stopping set here, so
    # the reported test scores are slightly optimistic; a separate validation
    # split would avoid that.
    xgb_clf.fit(
        x_train,
        y_train,
        eval_set=[(x_train, y_train), (x_test, y_test)],
    )

    # AUC is computed from the predicted probability of the positive class (1).
    xgb_roc_auc = roc_auc_score(y_test, xgb_clf.predict_proba(x_test)[:, 1])
    print(f'ROC AUC : {xgb_roc_auc:.4f}')
    pred = xgb_clf.predict(x_test)
    print('예측값 :', pred[:5])
    print('실제값 :', y_test.iloc[:5].values)

    # The accuracy equals the share of label 0 in the test split (0.961129),
    # i.e. it is no better than always predicting 0. On data this imbalanced,
    # ROC AUC is the more informative number.
    acc = metrics.accuracy_score(y_test, pred)
    print('acc :', acc)  # 0.9611

    # Next steps:
    # - Find the best parameters with GridSearchCV, then rebuild the model.
    # - Identify the important variables and reduce the feature set.
    # - Apply dimensionality reduction to groups of similar variables.
    # ...
    
    
    <console>
       ID  var3  var15  ...  saldo_medio_var44_ult3     var38  TARGET
    0   1     2     23  ...                     0.0  39205.17       0
    1   3     2     34  ...                     0.0  49278.03       0
    2   4     2     23  ...                     0.0  67333.77       0
    
    [3 rows x 371 columns] (76020, 371)
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 76020 entries, 0 to 76019
    Columns: 371 entries, ID to TARGET
    dtypes: float64(111), int64(260)
    memory usage: 215.2 MB
    None
    
    0    73012
    1     3008
    Name: TARGET, dtype: int64
    불만족 비율은 0.04
                      ID           var3  ...         var38        TARGET
    count   76020.000000   76020.000000  ...  7.602000e+04  76020.000000
    mean    75964.050723   -1523.199277  ...  1.172358e+05      0.039569
    std     43781.947379   39033.462364  ...  1.826646e+05      0.194945
    min         1.000000 -999999.000000  ...  5.163750e+03      0.000000
    25%     38104.750000       2.000000  ...  6.787061e+04      0.000000
    50%     76043.000000       2.000000  ...  1.064092e+05      0.000000
    75%    113748.750000       2.000000  ...  1.187563e+05      0.000000
    max    151838.000000     238.000000  ...  2.203474e+07      1.000000
    
    [8 rows x 371 columns]
    (76020, 369) (76020,)
    (60816, 369) (15204, 369) (60816,) (15204,)
    train 데이터 레이블 분포 비율 : 0    0.960257
    1    0.039743
    Name: TARGET, dtype: float64
    test 데이터 레이블 분포 비율 : 0    0.961129
    1    0.038871
    Name: TARGET, dtype: float64
    
    [0]	validation_0-auc:0.82412	validation_1-auc:0.82760
    [1]	validation_0-auc:0.83365	validation_1-auc:0.83537
    [2]	validation_0-auc:0.83787	validation_1-auc:0.83725
    [3]	validation_0-auc:0.84191	validation_1-auc:0.83865
    [4]	validation_0-auc:0.84388	validation_1-auc:0.83987
    ROC AUC : 0.8399
    예측값 : [0 0 0 0 0]
    실제값 : [0 0 0 0 0]
    acc : 0.9611286503551697

     

     

     

    댓글

Designed by Tistory.