Python 데이터 분석

RandomForest 예제 1 - Red Wine quality 데이터

코딩탕탕 2022. 11. 22. 18:02

 

 

 

 

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# Random-forest classification of red-wine quality: load the data, hold out a
# test split, fit the model, then report accuracy, 5-fold cross-validation
# scores, and per-feature importances.

# The path is relative to the current working directory.
df = pd.read_csv('../testdata/winequality-red.csv')
print(df.head(3))
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Features are every column except the target; 'quality' is the class label.
df_x = df.drop(columns=['quality'])
df_y = df['quality']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    df_x, df_y, test_size=0.3, random_state=1
)
# The original author's run reported: (1117, 11) (479, 11) (1117,) (479,)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

# Seed the forest as well: without random_state the bootstrap samples and
# feature subsets differ on every run, so the scores below would not be
# reproducible even though the split is fixed.
model = RandomForestClassifier(
    n_estimators=500, criterion='entropy', random_state=1
)
model.fit(train_x, train_y)

pred = model.predict(test_x)
print('예측값 :', pred[:5])
print('실제값 :', np.array(test_y[:5]))

# Accuracy on the held-out test split.
print('acc :', accuracy_score(test_y, pred))

# 5-fold cross-validation over the full dataset. cross_val_score fits clones
# of the estimator, so the model fitted above is left untouched.
cross_vali = cross_val_score(model, df_x, df_y, cv=5)
print(cross_vali)
print(np.mean(cross_vali))

# Feature importances of the model fitted on the training split, labelled with
# their column names and sorted so the most influential features come first.
importances = pd.Series(model.feature_importances_, index=df_x.columns)
print('특성(변수) 중요도 :')
print(importances.sort_values(ascending=False))


<console>
    STA  AGE  SEX  RACE  SER  CAN  CRN  INF  CPR  HRA
ID                                                   
8     0   27    1     1    0    0    0    1    0   88
12    0   39    0     1    0    0    0    0    0   80
14    0   27    0     1    1    0    0    0    0   70
(160, 9) (40, 9) (160, 1) (40, 1)
acc : 0.85
RandomForestClassifier(criterion='entropy', n_estimators=500)