Python Data Analysis
Python Data Analysis Basics 62 - Random Forest Example (Titanic)
코딩탕탕
2022. 11. 22. 14:59
# Compare LogisticRegression, DecisionTree, and RandomForest classifiers on the Titanic dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
df = pd.read_csv('../testdata/titanic_data.csv')
df.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace = True)
print(df.describe())
print(df.info())
print(df.isnull().sum()) # Age 177 Cabin 687 Embarked 2
# Handle nulls: replace with the mean, 0, 'N', etc.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Cabin'] = df['Cabin'].fillna('N')
df['Embarked'] = df['Embarked'].fillna('N')
print(df.head(5))
# print(df.isnull().sum())
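# (Optional sketch, not part of the run captured below) The same imputation could be done
# with sklearn's SimpleImputer instead of the fillna() calls above:
# from sklearn.impute import SimpleImputer
# age_imputer = SimpleImputer(strategy = 'mean')          # mean imputation for the numeric Age column
# df[['Age']] = age_imputer.fit_transform(df[['Age']])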
# Dtype: object - inspect the value distributions of Sex, Cabin, Embarked
print('Sex', df['Sex'].value_counts()) # male 577, female 314
print('Cabin', df['Cabin'].value_counts()) # Cabin values are too granular - simplify by keeping only the first letter
print('Embarked', df['Embarked'].value_counts()) # S 644, C 168, Q 77, N 2
df['Cabin'] = df['Cabin'].str[:1]
print(df.head(5))
print()
# How did sex affect the survival probability?
print(df.groupby(['Sex', 'Survived'])['Survived'].count())
print(233 / (81 + 233)) # female : 74.2%
print(109 / (468 + 109)) # male : 18.9%
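# (Sketch) The same rates can be computed directly instead of hard-coding the group counts:
# print(df.groupby('Sex')['Survived'].mean())  # female ~0.742, male ~0.189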
# Visualize survival probability by sex
sns.barplot(x = 'Sex', y = 'Survived', data = df, ci = 95)
plt.show()
# How did age and Pclass affect the survival probability? (a sketch follows below)
print(df.head(3))
print()
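# (Optional sketch, not executed for the output below) One way to look at the Pclass/Sex question:
# sns.barplot(x = 'Pclass', y = 'Survived', hue = 'Sex', data = df)
# plt.show()
# For Age, binning with pd.cut() into age groups before plotting makes the pattern easier to read.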
# Convert string (object) columns to numeric (categorical) codes
from sklearn import preprocessing
# print(set(df['Cabin']))
def labelFunc(datas):  # label-encode the object columns (Cabin, Sex, Embarked) into integer codes
    cols = ['Cabin', 'Sex', 'Embarked']
    for c in cols:
        lab = preprocessing.LabelEncoder()
        lab = lab.fit(datas[c])
        datas[c] = lab.transform(datas[c])
    return datas
df = labelFunc(df)
print(df.head(3))
print(df['Cabin'].unique()) # [7 2 4 6 3 0 1 5 8]
print(df['Sex'].unique()) # [1 0]
print(df['Embarked'].unique()) # [3 0 2 1]
print()
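# (Sketch) LabelEncoder imposes an arbitrary integer order on the categories; for nominal columns
# such as Embarked, one-hot encoding is a common alternative (shown here only as a suggestion):
# df_onehot = pd.get_dummies(df, columns = ['Cabin', 'Sex', 'Embarked'])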
feature_df = df.drop(['Survived'], axis = 'columns')
print(feature_df.head(2))
label_df = df['Survived']
print(label_df.head(2))
x_train, x_test, y_train, y_test = train_test_split(feature_df, label_df, test_size = 0.2, random_state = 1)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape) # (712, 8) (179, 8) (712,) (179,)
print('---------')
# Compare the LogisticRegression, DecisionTree, and RandomForest models
logmodel = LogisticRegression(solver = 'lbfgs', max_iter = 500).fit(x_train, y_train)
decmodel = DecisionTreeClassifier().fit(x_train, y_train)
rfmodel = RandomForestClassifier().fit(x_train, y_train)
logpredict = logmodel.predict(x_test)
print('LogisticRegression acc : {0:.5f}'.format(accuracy_score(y_test, logpredict)))
decpredict = decmodel.predict(x_test)
print('DecisionTree acc : {0:.5f}'.format(accuracy_score(y_test, decpredict)))
rfpredict = rfmodel.predict(x_test)
print('RandomForest acc : {0:.5f}'.format(accuracy_score(y_test, rfpredict)))
<console>
Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Survived 891 non-null int64
1 Pclass 891 non-null int64
2 Sex 891 non-null object
3 Age 714 non-null float64
4 SibSp 891 non-null int64
5 Parch 891 non-null int64
6 Fare 891 non-null float64
7 Cabin 204 non-null object
8 Embarked 889 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB
None
Survived 0
Pclass 0
Sex 0
Age 177
SibSp 0
Parch 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
Survived Pclass Sex Age SibSp Parch Fare Cabin Embarked
0 0 3 male 22.0 1 0 7.2500 N S
1 1 1 female 38.0 1 0 71.2833 C85 C
2 1 3 female 26.0 0 0 7.9250 N S
3 1 1 female 35.0 1 0 53.1000 C123 S
4 0 3 male 35.0 0 0 8.0500 N S
Sex male 577
female 314
Name: Sex, dtype: int64
Cabin N 687
C23 C25 C27 4
G6 4
B96 B98 4
C22 C26 3
...
E34 1
C7 1
C54 1
E36 1
C148 1
Name: Cabin, Length: 148, dtype: int64
Embarked S 644
C 168
Q 77
N 2
Name: Embarked, dtype: int64
Survived Pclass Sex Age SibSp Parch Fare Cabin Embarked
0 0 3 male 22.0 1 0 7.2500 N S
1 1 1 female 38.0 1 0 71.2833 C C
2 1 3 female 26.0 0 0 7.9250 N S
3 1 1 female 35.0 1 0 53.1000 C S
4 0 3 male 35.0 0 0 8.0500 N S
Sex Survived
female 0 81
1 233
male 0 468
1 109
Name: Survived, dtype: int64
0.7420382165605095
0.18890814558058924
Survived Pclass Sex Age SibSp Parch Fare Cabin Embarked
0 0 3 male 22.0 1 0 7.2500 N S
1 1 1 female 38.0 1 0 71.2833 C C
2 1 3 female 26.0 0 0 7.9250 N S
Survived Pclass Sex Age SibSp Parch Fare Cabin Embarked
0 0 3 1 22.0 1 0 7.2500 7 3
1 1 1 0 38.0 1 0 71.2833 2 0
2 1 3 0 26.0 0 0 7.9250 7 3
[7 2 4 6 3 0 1 5 8]
[1 0]
[3 0 2 1]
Pclass Sex Age SibSp Parch Fare Cabin Embarked
0 3 1 22.0 1 0 7.2500 7 3
1 1 0 38.0 1 0 71.2833 2 0
0 0
1 1
Name: Survived, dtype: int64
(712, 8) (179, 8) (712,) (179,)
---------
LogisticRegression acc : 0.79888
DecisionTree acc : 0.72626
RandomForest acc : 0.75978
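As a follow-up, here is a small sketch that was not part of the run above: it lists the feature importances learned by the Random Forest and compares the three models with 5-fold cross-validation, which is less sensitive to a single train/test split. It assumes the variables from the script (rfmodel, logmodel, decmodel, feature_df, label_df) are still in scope.

# Feature importances of the fitted RandomForest, highest first
import pandas as pd
from sklearn.model_selection import cross_val_score

importances = pd.Series(rfmodel.feature_importances_, index = feature_df.columns)
print(importances.sort_values(ascending = False))

# 5-fold cross-validated accuracy for each model on the full feature/label data
for name, model in [('LogisticRegression', logmodel), ('DecisionTree', decmodel), ('RandomForest', rfmodel)]:
    scores = cross_val_score(model, feature_df, label_df, cv = 5, scoring = 'accuracy')
    print('{0} cv acc : {1:.5f}'.format(name, scores.mean()))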