import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
# Train and evaluate a random-forest classifier that predicts the 'quality'
# column of the red-wine dataset from all remaining columns.

# The path is relative to the current working directory.
df = pd.read_csv('../testdata/winequality-red.csv')
print(df.head(3))
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
df.info()

# Features: every column except the target. Target: the 'quality' column.
df_x = df.drop(columns=['quality'])
df_y = df['quality']

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
train_x, test_x, train_y, test_y = train_test_split(
    df_x, df_y, test_size=0.3, random_state=1
)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)  # (1117, 11) (479, 11) (1117,) (479,)

# Seed the forest as well: without random_state the bootstrap samples and
# feature sub-sampling differ on every run, so the accuracy, cross-validation
# scores and importances printed below would not be reproducible.
model = RandomForestClassifier(
    n_estimators=500, criterion='entropy', random_state=1
)
model.fit(train_x, train_y)

pred = model.predict(test_x)
print('예측값 :', pred[:5])
# .iloc makes the positional slice explicit (the index was shuffled by the split).
print('실제값 :', np.array(test_y.iloc[:5]))

# Accuracy on the held-out test set.
print('acc :', accuracy_score(test_y, pred))

# 5-fold cross-validation over the full dataset (not just the training split).
cross_vali = cross_val_score(model, df_x, df_y, cv=5)
print(cross_vali)
print(np.mean(cross_vali))

# Feature importances of the forest fitted on the training split,
# in the same order as the columns of df_x.
print('특성(변수) 중요도 :', model.feature_importances_)
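# <console>
# NOTE(review): recorded console output, kept as comments so the file parses.
# The shapes below ((160, 9) / (40, 9)) do not match the split shapes noted
# in the script above, so this appears to come from a different run or
# dataset -- confirm before relying on it.
# STA AGE SEX RACE SER CAN CRN INF CPR HRA
# ID
# 8 0 27 1 1 0 0 0 1 0 88
# 12 0 39 0 1 0 0 0 0 0 80
# 14 0 27 0 1 1 0 0 0 0 70
# (160, 9) (40, 9) (160, 1) (40, 1)
# acc : 0.85
# RandomForestClassifier(criterion='entropy', n_estimators=500)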