# 산탄데르 은행 고객 만족 여부 분류 모델
# label name : TARGET - 0(만족), 1(불만족)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Load the Santander customer-satisfaction training data.
# Label column: TARGET (0 = satisfied, 1 = unsatisfied).
df = pd.read_csv('train.csv', encoding='latin-1')
print(df.head(3), df.shape)  # (76020, 371)

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly instead of being wrapped in print() (which printed "None").
df.info()
print()

# Class balance of the label.
print(df['TARGET'].value_counts())  # 0 : 73012, 1 : 3008
unsatified_cnt = (df['TARGET'] == 1).sum()
total_cnt = df['TARGET'].count()
print('불만족 비율은 {0:.2f}'.format(unsatified_cnt / total_cnt))  # prints ratio 0.04

# Tip: pd.set_option('display.max_columns', 500) shows every column in describe().
print(df.describe())  # var3 looks suspicious: min is -999999

# -999999 in var3 is treated as a sentinel and replaced with 2, the value found
# at the 25%/50%/75% quantiles. The result is assigned back to the column:
# calling replace(..., inplace=True) on the df['var3'] selection is a chained
# in-place update that pandas warns about and that does not modify df under
# Copy-on-Write.
df['var3'] = df['var3'].replace(-999999, 2)

# ID is only a row identifier, so it is removed from the feature set.
df = df.drop(columns='ID')

# TARGET is the last column: everything before it is a feature.
x_features = df.iloc[:, :-1]
y_labels = df.iloc[:, -1]
print(x_features.shape, y_labels.shape)  # (76020, 369) (76020,)
# train / test split: hold out 20% of the rows; random_state pins the shuffle.
split_parts = train_test_split(x_features, y_labels, test_size=0.2, random_state=12)
x_train, x_test, y_train, y_test = split_parts
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
# -> (60816, 369) (15204, 369) (60816,) (15204,)

# Share of each label value in both partitions, printed so the two
# distributions can be compared by eye.
train_cnt = y_train.count()
test_cnt = y_test.count()
train_label_ratio = y_train.value_counts() / train_cnt
test_label_ratio = y_test.value_counts() / test_cnt
print('train 데이터 레이블 분포 비율 :', train_label_ratio)  # 0 : 0.960257, 1 : 0.039743
print('test 데이터 레이블 분포 비율 :', test_label_ratio)  # 0 : 0.961129, 1 : 0.038871
# model
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn import metrics

# n_estimators is kept at 5 because the number of columns is large.
# eval_metric and early_stopping_rounds are configured on the estimator:
# passing them to fit() was deprecated in XGBoost 1.6 and removed in 2.0.
# early_stopping_rounds=2 stops boosting once the monitored metric has not
# improved for 2 consecutive rounds.
xgb_clf = XGBClassifier(
    n_estimators=5,
    random_state=12,
    eval_metric='auc',
    early_stopping_rounds=2,
)

# NOTE(review): XGBoost monitors the LAST eval_set entry for early stopping,
# which here is the test split, so the reported test score is slightly
# optimistic. Consider carving a validation split out of the training data.
xgb_clf.fit(
    x_train,
    y_train,
    eval_set=[(x_train, y_train), (x_test, y_test)],
)

# ROC AUC is computed from the predicted probability of class 1 (unsatisfied).
test_proba = xgb_clf.predict_proba(x_test)[:, 1]
xgb_roc_curve = roc_auc_score(y_test, test_proba)
print('ROC AUC : {0:.4f}'.format(xgb_roc_curve))

pred = xgb_clf.predict(x_test)
print('예측값 :', pred[:5])
print('실제값 :', y_test[:5].values)

# NOTE(review): roughly 96.1% of the test labels are 0, so an accuracy of
# about 0.9611 matches what always predicting 0 would score; ROC AUC is the
# more informative metric for this imbalanced label.
acc = metrics.accuracy_score(y_test, pred)
print('acc :', acc)  # 0.9611
# GridSearchCV로 best parameter 구한 후 모델 작성
# 중요변수를 알아내 feature를 줄이는 작업
# 성격이 유사한 변수들에 대해 차원축소를 하여 feature를 줄이는 작업
# ...
<console>
ID var3 var15 ... saldo_medio_var44_ult3 var38 TARGET
0 1 2 23 ... 0.0 39205.17 0
1 3 2 34 ... 0.0 49278.03 0
2 4 2 23 ... 0.0 67333.77 0
[3 rows x 371 columns] (76020, 371)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 371 entries, ID to TARGET
dtypes: float64(111), int64(260)
memory usage: 215.2 MB
None
0 73012
1 3008
Name: TARGET, dtype: int64
불만족 비율은 0.04
ID var3 ... var38 TARGET
count 76020.000000 76020.000000 ... 7.602000e+04 76020.000000
mean 75964.050723 -1523.199277 ... 1.172358e+05 0.039569
std 43781.947379 39033.462364 ... 1.826646e+05 0.194945
min 1.000000 -999999.000000 ... 5.163750e+03 0.000000
25% 38104.750000 2.000000 ... 6.787061e+04 0.000000
50% 76043.000000 2.000000 ... 1.064092e+05 0.000000
75% 113748.750000 2.000000 ... 1.187563e+05 0.000000
max 151838.000000 238.000000 ... 2.203474e+07 1.000000
[8 rows x 371 columns]
(76020, 369) (76020,)
(60816, 369) (15204, 369) (60816,) (15204,)
train 데이터 레이블 분포 비율 : 0 0.960257
1 0.039743
Name: TARGET, dtype: float64
test 데이터 레이블 분포 비율 : 0 0.961129
1 0.038871
Name: TARGET, dtype: float64
[0] validation_0-auc:0.82412 validation_1-auc:0.82760
[1] validation_0-auc:0.83365 validation_1-auc:0.83537
[2] validation_0-auc:0.83787 validation_1-auc:0.83725
[3] validation_0-auc:0.84191 validation_1-auc:0.83865
[4] validation_0-auc:0.84388 validation_1-auc:0.83987
ROC AUC : 0.8399
예측값 : [0 0 0 0 0]
실제값 : [0 0 0 0 0]
acc : 0.9611286503551697