Python 데이터 분석

Python 데이터분석 기초 62 - Random forest 예제(titanic)

코딩탕탕 2022. 11. 22. 14:59

 

 

# titanic dataset으로 LogisticRegression, DecisionTree, RandomForest 분류 모델 비교

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from conda.common._logic import TRUE

# Load the Titanic passenger data and drop the identifier-like columns
# (PassengerId, Name, Ticket) that are not used as model features.
df = pd.read_csv('../testdata/titanic_data.csv')
df.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)
print(df.describe())
print(df.info())  # info() prints its report itself and returns None, so "None" is printed too
print(df.isnull().sum())  # Age 177  Cabin 687  Embarked 2

# Fill missing values: Age with the column mean, Cabin/Embarked with the
# placeholder category 'N'.
# The result is assigned back to the column instead of calling
# df[col].fillna(..., inplace=True): that form operates on a selected column
# object, is deprecated in pandas 2.x, and does not update df at all when
# Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Cabin'] = df['Cabin'].fillna('N')
df['Embarked'] = df['Embarked'].fillna('N')
print(df.head(5))
# print(df.isnull().sum())

# Inspect the value distribution of the object-dtype columns: Sex, Cabin, Embarked.
print('Sex', df['Sex'].value_counts())  # male 577, female 314
print('Cabin', df['Cabin'].value_counts())  # too many distinct cabin values -> keep only the first letter
print('Embarked', df['Embarked'].value_counts())

# Reduce each cabin value to its first character (the 'N' placeholder stays 'N').
df['Cabin'] = df['Cabin'].str[:1]
print(df.head(5))

print()
# How did sex affect the chance of survival?
print(df.groupby(['Sex', 'Survived'])['Survived'].count())
# Survived is 0/1, so the per-group mean is the survival rate. Computing it
# from the data avoids hard-coding counts copied from an earlier run.
survival_rate_by_sex = df.groupby('Sex')['Survived'].mean()
print(survival_rate_by_sex['female'])  # female: about 74.2%
print(survival_rate_by_sex['male'])    # male: about 18.9%
# Visualize the survival rate by sex.
# The `ci` keyword is deprecated since seaborn 0.12 (replaced by `errorbar`);
# a 95% confidence interval is the default in both old and new versions, so
# the argument is simply omitted.
sns.barplot(x='Sex', y='Survived', data=df)
plt.show()

# TODO: how did age and Pclass affect the chance of survival? (not analysed yet)

print(df.head(3))
print()
# 문자열(object) 데이터를 숫자형으로 변환(범주형)하기
from sklearn import preprocessing
# print(set(df['Cabin']))

def labelFunc(datas):
    """Label-encode the 'Cabin', 'Sex' and 'Embarked' columns of *datas*.

    Each of the three columns is overwritten with the output of a freshly
    fitted ``preprocessing.LabelEncoder``. The frame passed in is modified
    in place, and the same object is returned for convenience.
    """
    for col_name in ('Cabin', 'Sex', 'Embarked'):
        column_values = datas[col_name]
        # One encoder per column: fit() returns the fitted encoder, which is
        # then used to transform that same column.
        datas[col_name] = preprocessing.LabelEncoder().fit(column_values).transform(column_values)
    return datas

# Convert the categorical text columns to integer codes.
df = labelFunc(df)
print(df.head(3))
# Show the distinct codes per encoded column. The recorded run printed
# [7 2 4 6 3 0 1 5 8] for Cabin, [1 0] for Sex and [3 0 2 1] for Embarked.
for encoded_col in ('Cabin', 'Sex', 'Embarked'):
    print(df[encoded_col].unique())

print()
# Separate the predictors from the target column 'Survived'.
feature_df = df.drop(columns=['Survived'])
print(feature_df.head(2))
label_df = df['Survived']
print(label_df.head(2))

# Hold out 20% of the rows for testing; the split is seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    feature_df,
    label_df,
    test_size=0.2,
    random_state=1,
)
# Recorded run: (712, 8) (179, 8) (712,) (179,)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

print('---------')
# Compare three classifiers on the same train/test split:
# LogisticRegression, DecisionTree and RandomForest.
logmodel = LogisticRegression(solver='lbfgs', max_iter=500).fit(x_train, y_train)
# The tree-based models are randomized. They are seeded with the same value
# as the train/test split so the comparison is reproducible between runs.
# Accuracies may therefore differ slightly from figures produced by an
# earlier unseeded run.
decmodel = DecisionTreeClassifier(random_state=1).fit(x_train, y_train)
rfmodel = RandomForestClassifier(random_state=1).fit(x_train, y_train)

# Score each fitted model on the held-out test set.
logpredict = logmodel.predict(x_test)
print('LogisticRegression acc : {0:.5f}'.format(accuracy_score(y_test, logpredict)))
decpredict = decmodel.predict(x_test)
print('DecisionTree acc : {0:.5f}'.format(accuracy_score(y_test, decpredict)))
rfpredict = rfmodel.predict(x_test)
print('RandomForest acc : {0:.5f}'.format(accuracy_score(y_test, rfpredict)))


<console>
         Survived      Pclass         Age       SibSp       Parch        Fare
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std      0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB
None
Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64
   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Embarked
0         0       3    male  22.0      1      0   7.2500     N        S
1         1       1  female  38.0      1      0  71.2833   C85        C
2         1       3  female  26.0      0      0   7.9250     N        S
3         1       1  female  35.0      1      0  53.1000  C123        S
4         0       3    male  35.0      0      0   8.0500     N        S
Sex male      577
female    314
Name: Sex, dtype: int64
Cabin N              687
C23 C25 C27      4
G6               4
B96 B98          4
C22 C26          3
              ... 
E34              1
C7               1
C54              1
E36              1
C148             1
Name: Cabin, Length: 148, dtype: int64
Embarked S    644
C    168
Q     77
N      2
Name: Embarked, dtype: int64
   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Embarked
0         0       3    male  22.0      1      0   7.2500     N        S
1         1       1  female  38.0      1      0  71.2833     C        C
2         1       3  female  26.0      0      0   7.9250     N        S
3         1       1  female  35.0      1      0  53.1000     C        S
4         0       3    male  35.0      0      0   8.0500     N        S

Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
Name: Survived, dtype: int64
0.7420382165605095
0.18890814558058924
   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Embarked
0         0       3    male  22.0      1      0   7.2500     N        S
1         1       1  female  38.0      1      0  71.2833     C        C
2         1       3  female  26.0      0      0   7.9250     N        S

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Cabin  Embarked
0         0       3    1  22.0      1      0   7.2500      7         3
1         1       1    0  38.0      1      0  71.2833      2         0
2         1       3    0  26.0      0      0   7.9250      7         3
[7 2 4 6 3 0 1 5 8]
[1 0]
[3 0 2 1]

   Pclass  Sex   Age  SibSp  Parch     Fare  Cabin  Embarked
0       3    1  22.0      1      0   7.2500      7         3
1       1    0  38.0      1      0  71.2833      2         0
0    0
1    1
Name: Survived, dtype: int64
(712, 8) (179, 8) (712,) (179,)
---------
LogisticRegression acc : 0.79888
DecisionTree acc : 0.72626
RandomForest acc : 0.75978

 

성별 생존 시각화