Python 데이터 분석
XGBoost로 분류 모델 예시(kaggle.com이 제공하는 'glass datasets')
코딩탕탕
2022. 11. 23. 14:40
<작성자 코드>
# [XGBoost 문제]
# kaggle.com이 제공하는 'glass datasets'
# 유리 식별 데이터베이스로 여러 가지 특징들에 의해 7 가지의 label(Type)로 분리된다.
#
# RI Na Mg Al Si K Ca Ba Fe Type
# ...
# glass.csv 파일을 읽어 분류 작업을 수행하시오.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import plot_importance
# Load the glass identification data and print a quick overview of it.
glass = pd.read_csv('../testdata/glass.csv')
print(glass.head(3), glass.shape)  # (214, 10)
print(glass.info())
print(glass.describe())
print(glass.corr())

# Split the frame into the feature columns and the 'Type' label column.
features = glass.drop('Type', axis=1)
labels = glass['Type']

# Re-encode the 'Type' labels as consecutive integers starting at 0.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
labels = encoder.fit_transform(labels)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=12
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)  # (171, 9) (43, 9) (171,) (43,)

# Build a tree-based (gbtree) gradient-boosting classifier and train it.
model = xgb.XGBClassifier(booster='gbtree', max_depth=6, n_estimators=500)
model.fit(x_train, y_train)

# Compare the first ten predictions with the first ten true labels.
pred = model.predict(x_test)
print('예측값 :', pred[:10])
print('실제값 :', y_test[:10])

# Accuracy on the held-out set.
print('정확도 확인 방법 1')
from sklearn import metrics
acc = metrics.accuracy_score(y_test, pred)
print('acc :', acc)

# Plot feature importance on an explicitly sized figure.
fig, ax = plt.subplots(figsize=(10, 12))
plot_importance(model, ax=ax)
plt.show()
<console>
RI Na Mg Al Si K Ca Ba Fe Type
0 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0.0 0.0 1
1 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0.0 0.0 1
2 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0.0 0.0 1 (214, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 RI 214 non-null float64
1 Na 214 non-null float64
2 Mg 214 non-null float64
3 Al 214 non-null float64
4 Si 214 non-null float64
5 K 214 non-null float64
6 Ca 214 non-null float64
7 Ba 214 non-null float64
8 Fe 214 non-null float64
9 Type 214 non-null int64
dtypes: float64(9), int64(1)
memory usage: 16.8 KB
None
RI Na Mg ... Ba Fe Type
count 214.000000 214.000000 214.000000 ... 214.000000 214.000000 214.000000
mean 1.518365 13.407850 2.684533 ... 0.175047 0.057009 2.780374
std 0.003037 0.816604 1.442408 ... 0.497219 0.097439 2.103739
min 1.511150 10.730000 0.000000 ... 0.000000 0.000000 1.000000
25% 1.516522 12.907500 2.115000 ... 0.000000 0.000000 1.000000
50% 1.517680 13.300000 3.480000 ... 0.000000 0.000000 2.000000
75% 1.519157 13.825000 3.600000 ... 0.000000 0.100000 3.000000
max 1.533930 17.380000 4.490000 ... 3.150000 0.510000 7.000000
[8 rows x 10 columns]
RI Na Mg ... Ba Fe Type
RI 1.000000 -0.191885 -0.122274 ... -0.000386 0.143010 -0.164237
Na -0.191885 1.000000 -0.273732 ... 0.326603 -0.241346 0.502898
Mg -0.122274 -0.273732 1.000000 ... -0.492262 0.083060 -0.744993
Al -0.407326 0.156794 -0.481799 ... 0.479404 -0.074402 0.598829
Si -0.542052 -0.069809 -0.165927 ... -0.102151 -0.094201 0.151565
K -0.289833 -0.266087 0.005396 ... -0.042618 -0.007719 -0.010054
Ca 0.810403 -0.275442 -0.443750 ... -0.112841 0.124968 0.000952
Ba -0.000386 0.326603 -0.492262 ... 1.000000 -0.058692 0.575161
Fe 0.143010 -0.241346 0.083060 ... -0.058692 1.000000 -0.188278
Type -0.164237 0.502898 -0.744993 ... 0.575161 -0.188278 1.000000
[10 rows x 10 columns]
(171, 9) (43, 9) (171,) (43,)
예측값 : [1 1 1 0 5 1 2 5 0 0]
실제값 : [1 1 4 0 5 4 2 5 0 0]
정확도 확인 방법 1
acc : 0.8837209302325582
<선생님 코드>
# [XGBoost 문제] 이걸로 풀어야 함
# kaggle.com이 제공하는 'glass datasets'
# 유리 식별 데이터베이스로 여러 가지 특징들에 의해 7가지의 label(Type)로 분리된다.
# RI Na Mg Al Si K Ca Ba Fe Type
# ...
# glass.csv 파일을 읽어 분류 작업을 수행하시오.
import pandas as pd
import numpy as np
# Import from the public package path; `sklearn.model_selection._split` is a
# private module whose location is not part of the supported API.
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt

# Load the glass identification data.
data = pd.read_csv("../testdata/glass.csv")
print(data.columns)

x = data.drop('Type', axis=1)  # the Type column is excluded from the features
y = data['Type']
print(set(y))  # {1, 2, 3, 5, 6, 7}

# Re-encode the non-contiguous Type labels as consecutive integers from 0.
le = LabelEncoder()
y = le.fit_transform(y)
print(y[:3], set(y))  # {0, 1, 2, 3, 4, 5}

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=12)

model = xgb.XGBClassifier(booster='gbtree', n_estimators=500, random_state=12)
model.fit(x_train, y_train)
print()

y_pred = model.predict(x_test)
# '실제값' means "actual" and '예측값' means "predicted"; each label is now
# paired with the matching array (the two were swapped before).
print('실제값 :', np.array(y_test[:5]))
print('예측값:', y_pred[:5])
print('정확도 :', metrics.accuracy_score(y_test, y_pred))

# For a multiclass target, roc_auc_score needs multi_class set to 'ovr' or 'ovo';
# otherwise it raises "ValueError: multi_class must be in ('ovo', 'ovr')".
roc_auc = roc_auc_score(y_test, model.predict_proba(x_test), multi_class="ovr")
print('ROC AUC : {0:.4f}'.format(roc_auc))

# Plot feature importance.
plot_importance(model)
plt.show()
<console>
Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type'], dtype='object')
{1, 2, 3, 5, 6, 7}
[0 0 0] {0, 1, 2, 3, 4, 5}
실제값 : [1 1 4 0 5]
예측값: [1 1 4 0 5]
정확도 : 0.8
ROC AUC : 0.9565