Python 데이터 분석

XGBoost로 분류 모델 예시(산탄데르 은행 고객 만족 여부 분류 모델)

코딩탕탕 2022. 11. 23. 12:20

 

 

# 산탄데르 은행 고객 만족 여부 분류 모델
# label name : TARGET - 0(만족), 1(불만족)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load the Santander customer-satisfaction training set and take a first look at it.
df = pd.read_csv('train.csv', encoding='latin-1')
print(df.head(3), df.shape)  # recorded run: (76020, 371)
print(df.info())
print()

# Class balance of the label column (recorded run: 0 -> 73012 rows, 1 -> 3008 rows).
print(df['TARGET'].value_counts())
unsatisfied_cnt = (df['TARGET'] == 1).sum()  # number of rows labelled 1 (unsatisfied)
total_cnt = df['TARGET'].count()
print('불만족 비율은 {0:.2f}'.format(unsatisfied_cnt / total_cnt))  # recorded run prints 0.04

# pd.set_option('display.max_columns', 500)  # uncomment to display every column
print(df.describe())  # var3 has a minimum of -999999, which looks like a placeholder/outlier

# Replace the -999999 placeholder in var3 with 2, the value describe() reports for
# its 25%/50%/75% quantiles. The result is assigned back to the column on purpose:
# calling replace(..., inplace=True) on the selection df['var3'] is a chained
# in-place update, which pandas warns about and which no longer modifies df once
# Copy-on-Write is active.
df['var3'] = df['var3'].replace(-999999, 2)
# print(df.describe())  # optional re-check after the replacement

# Remove the ID column so it is not part of the feature matrix built from df.
df = df.drop(columns='ID')

# The label is the last column of df; everything before it is a feature.
label_pos = -1
x_features = df.iloc[:, :label_pos]
y_labels = df.iloc[:, label_pos]
feature_shape, label_shape = x_features.shape, y_labels.shape
print(feature_shape, label_shape)  # recorded run: (76020, 369) (76020,)

# train / test split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
split_parts = train_test_split(x_features, y_labels, test_size=0.2, random_state=12)
x_train, x_test, y_train, y_test = split_parts
# recorded run: (60816, 369) (15204, 369) (60816,) (15204,)
print(*(part.shape for part in split_parts))

# Per-class share of the label in each split, to check both splits have a similar balance.
train_cnt, test_cnt = y_train.count(), y_test.count()
train_ratio = y_train.value_counts() / train_cnt  # recorded run: 0 -> 0.960257, 1 -> 0.039743
test_ratio = y_test.value_counts() / test_cnt     # recorded run: 0 -> 0.961129, 1 -> 0.038871
print('train 데이터 레이블 분포 비율 :', train_ratio)
print('test 데이터 레이블 분포 비율 :', test_ratio)

# model
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn import metrics

# n_estimators is deliberately tiny (5) because the data set has many columns.
# eval_metric and early_stopping_rounds are given to the constructor: passing them
# to fit() was deprecated in XGBoost 1.6 and removed in 2.0, where it raises a
# TypeError.
# early_stopping_rounds=2 stops training once the evaluation metric has failed to
# improve for 2 consecutive boosting rounds; with several eval_set entries the
# last one is the one monitored.
xgb_clf = XGBClassifier(
    n_estimators=5,
    random_state=12,
    eval_metric='auc',
    early_stopping_rounds=2,
)
# NOTE(review): the test split is also the early-stopping validation set here, so
# the scores below are measured on data that influenced training length. A separate
# validation split would give an unbiased estimate.
xgb_clf.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)])

# ROC AUC is computed from the predicted probability of class 1 (unsatisfied).
# The variable holds the AUC score itself, not curve points.
positive_proba = xgb_clf.predict_proba(x_test)[:, 1]
xgb_roc_curve = roc_auc_score(y_test, positive_proba)
print('ROC AUC : {0:.4f}'.format(xgb_roc_curve))  # recorded run: 0.8399

pred = xgb_clf.predict(x_test)
print('예측값 :', pred[:5])
print('실제값 :', y_test[:5].values)

# NOTE(review): in the recorded run the accuracy (0.96112865...) equals the share of
# class 0 in the test split (0.961129), i.e. what always predicting 0 would score;
# ROC AUC is the more informative number for this imbalanced label.
acc = metrics.accuracy_score(y_test, pred)
print('acc :', acc)  # recorded run: 0.9611

# GridSearchCV로 best parameter 구한 후 모델 작성
# 중요변수를 알아내 feature를 줄이는 작업
# 성격이 유사한 변수들에 대해 차원축소를 하여 feature를 줄이는 작업
# ...


<console>
   ID  var3  var15  ...  saldo_medio_var44_ult3     var38  TARGET
0   1     2     23  ...                     0.0  39205.17       0
1   3     2     34  ...                     0.0  49278.03       0
2   4     2     23  ...                     0.0  67333.77       0

[3 rows x 371 columns] (76020, 371)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 371 entries, ID to TARGET
dtypes: float64(111), int64(260)
memory usage: 215.2 MB
None

0    73012
1     3008
Name: TARGET, dtype: int64
불만족 비율은 0.04
                  ID           var3  ...         var38        TARGET
count   76020.000000   76020.000000  ...  7.602000e+04  76020.000000
mean    75964.050723   -1523.199277  ...  1.172358e+05      0.039569
std     43781.947379   39033.462364  ...  1.826646e+05      0.194945
min         1.000000 -999999.000000  ...  5.163750e+03      0.000000
25%     38104.750000       2.000000  ...  6.787061e+04      0.000000
50%     76043.000000       2.000000  ...  1.064092e+05      0.000000
75%    113748.750000       2.000000  ...  1.187563e+05      0.000000
max    151838.000000     238.000000  ...  2.203474e+07      1.000000

[8 rows x 371 columns]
(76020, 369) (76020,)
(60816, 369) (15204, 369) (60816,) (15204,)
train 데이터 레이블 분포 비율 : 0    0.960257
1    0.039743
Name: TARGET, dtype: float64
test 데이터 레이블 분포 비율 : 0    0.961129
1    0.038871
Name: TARGET, dtype: float64

[0]	validation_0-auc:0.82412	validation_1-auc:0.82760
[1]	validation_0-auc:0.83365	validation_1-auc:0.83537
[2]	validation_0-auc:0.83787	validation_1-auc:0.83725
[3]	validation_0-auc:0.84191	validation_1-auc:0.83865
[4]	validation_0-auc:0.84388	validation_1-auc:0.83987
ROC AUC : 0.8399
예측값 : [0 0 0 0 0]
실제값 : [0 0 0 0 0]
acc : 0.9611286503551697