ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • Python 데이터분석 기초 62 - Random forest 예제(titanic)
    Python 데이터 분석 2022. 11. 22. 14:59

     

     

    # titanic dataset으로 LogisticRegression, DecisionTree, RandomForest 분류 모델 비교
    
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from conda.common._logic import TRUE
    
    # Load the Titanic passenger table and discard the columns that are not
    # used in this analysis (PassengerId, Name, Ticket).
    df = pd.read_csv('../testdata/titanic_data.csv').drop(
        columns=['PassengerId', 'Name', 'Ticket'])

    # Quick overview: summary statistics, column dtypes, missing-value counts.
    print(df.describe())
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() also emits a trailing "None" line.
    print(df.info())
    # Missing values noted for this dataset: Age 177, Cabin 687, Embarked 2.
    print(df.isna().sum())
    
    # Null handling: replace missing values with the column mean (Age) or with
    # an 'N' placeholder (Cabin, Embarked).
    #
    # The filled column is assigned back to df instead of calling
    # df[col].fillna(..., inplace=True): that form operates on an intermediate
    # Series (chained assignment), which recent pandas versions warn about and
    # which does not update df at all once Copy-on-Write is enabled.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    print(df.head(5))
    # To confirm that no missing values remain, re-check df.isnull().sum() here.
    
    # The object-dtype columns are Sex, Cabin and Embarked: print how often
    # each distinct value occurs (Sex: male 577, female 314).
    for col in ('Sex', 'Cabin', 'Embarked'):
        print(col, df[col].value_counts())

    # Cabin has too many distinct values to be useful as-is, so simplify it by
    # keeping only the first character of each entry.
    df['Cabin'] = df['Cabin'].str[:1]
    print(df.head(5))
    
    print()
    # How did sex affect the chance of survival?
    print(df.groupby(['Sex', 'Survived'])['Survived'].count())
    # Survived is coded 0/1, so its mean within each Sex group is that group's
    # survival rate. Deriving it from df keeps the figures correct if the data
    # changes, instead of re-typing the counts printed above by hand.
    survival_rate = df.groupby('Sex')['Survived'].mean()
    print(survival_rate['female'])  # female: 74.2% on the original data
    print(survival_rate['male'])    # male: 18.9% on the original data
    # Visualise the survival rate by sex.
    # No `ci` argument is passed: a 95% confidence interval is already the
    # barplot default, and `ci` itself is deprecated in seaborn >= 0.12 in
    # favour of `errorbar`.
    sns.barplot(x='Sex', y='Survived', data=df)
    plt.show()

    # How did age and Pclass affect the chance of survival?
    # (No analysis for this question is implemented in this script.)

    print(df.head(3))
    print()
    # 문자열(object) 데이터를 숫자형으로 변환(범주형)하기
    from sklearn import preprocessing
    # print(set(df['Cabin']))
    
    def labelFunc(datas, cols=('Cabin', 'Sex', 'Embarked')):
        """Label-encode categorical columns of a DataFrame in place.

        Each column listed in ``cols`` is replaced by the integer codes that
        a freshly fitted ``preprocessing.LabelEncoder`` assigns to its values.
        A separate encoder is fitted per column.

        Args:
            datas: DataFrame containing every column named in ``cols``.
                It is modified in place.
            cols: Names of the columns to encode. Defaults to the three
                string columns of the Titanic data used in this script.

        Returns:
            The same ``datas`` object, with the selected columns encoded.
        """
        for c in cols:
            # fit_transform learns the classes and maps the values in one call.
            datas[c] = preprocessing.LabelEncoder().fit_transform(datas[c])
        return datas
    
    # Replace the string categories with integer codes, then inspect the result.
    df = labelFunc(df)
    print(df.head(3))
    # Distinct codes per encoded column. Values noted for this dataset:
    #   Cabin    -> [7 2 4 6 3 0 1 5 8]
    #   Sex      -> [1 0]
    #   Embarked -> [3 0 2 1]
    for encoded_col in ('Cabin', 'Sex', 'Embarked'):
        print(df[encoded_col].unique())
    
    print()
    # Separate the target column (Survived) from the predictor columns.
    label_df = df['Survived']
    feature_df = df.drop(columns='Survived')
    print(feature_df.head(2))
    print(label_df.head(2))

    # Hold out 20% of the rows for testing; random_state fixes the split.
    x_train, x_test, y_train, y_test = train_test_split(
        feature_df, label_df, test_size=0.2, random_state=1)
    # Shapes noted for this dataset: (712, 8) (179, 8) (712,) (179,)
    print(*(part.shape for part in (x_train, x_test, y_train, y_test)))
    
    print('---------')
    # Compare three classifiers on the same train/test split:
    # LogisticRegression, DecisionTree and RandomForest.
    logmodel = LogisticRegression(solver='lbfgs', max_iter=500).fit(x_train, y_train)
    # The tree and the forest are randomised estimators. Their random_state is
    # fixed (as it already is for the train/test split) so that repeated runs
    # print the same accuracies and the comparison is reproducible.
    decmodel = DecisionTreeClassifier(random_state=1).fit(x_train, y_train)
    rfmodel = RandomForestClassifier(random_state=1).fit(x_train, y_train)

    # Accuracy of each model on the held-out test rows.
    logpredict = logmodel.predict(x_test)
    print(f'LogisticRegression acc : {accuracy_score(y_test, logpredict):.5f}')
    decpredict = decmodel.predict(x_test)
    print(f'DecisionTree acc : {accuracy_score(y_test, decpredict):.5f}')
    rfredict = rfmodel.predict(x_test)
    print(f'RandomForest acc : {accuracy_score(y_test, rfredict):.5f}')
    
    
    
    <console>
             Survived      Pclass         Age       SibSp       Parch        Fare
    count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
    mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
    std      0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
    min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
    25%      0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
    50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
    75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
    max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 891 entries, 0 to 890
    Data columns (total 9 columns):
     #   Column    Non-Null Count  Dtype  
    ---  ------    --------------  -----  
     0   Survived  891 non-null    int64  
     1   Pclass    891 non-null    int64  
     2   Sex       891 non-null    object 
     3   Age       714 non-null    float64
     4   SibSp     891 non-null    int64  
     5   Parch     891 non-null    int64  
     6   Fare      891 non-null    float64
     7   Cabin     204 non-null    object 
     8   Embarked  889 non-null    object 
    dtypes: float64(2), int64(4), object(3)
    memory usage: 62.8+ KB
    None
    Survived      0
    Pclass        0
    Sex           0
    Age         177
    SibSp         0
    Parch         0
    Fare          0
    Cabin       687
    Embarked      2
    dtype: int64
       Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Embarked
    0         0       3    male  22.0      1      0   7.2500     N        S
    1         1       1  female  38.0      1      0  71.2833   C85        C
    2         1       3  female  26.0      0      0   7.9250     N        S
    3         1       1  female  35.0      1      0  53.1000  C123        S
    4         0       3    male  35.0      0      0   8.0500     N        S
    Sex male      577
    female    314
    Name: Sex, dtype: int64
    Cabin N              687
    C23 C25 C27      4
    G6               4
    B96 B98          4
    C22 C26          3
                  ... 
    E34              1
    C7               1
    C54              1
    E36              1
    C148             1
    Name: Cabin, Length: 148, dtype: int64
    Embarked S    644
    C    168
    Q     77
    N      2
    Name: Embarked, dtype: int64
       Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Embarked
    0         0       3    male  22.0      1      0   7.2500     N        S
    1         1       1  female  38.0      1      0  71.2833     C        C
    2         1       3  female  26.0      0      0   7.9250     N        S
    3         1       1  female  35.0      1      0  53.1000     C        S
    4         0       3    male  35.0      0      0   8.0500     N        S
    
    Sex     Survived
    female  0            81
            1           233
    male    0           468
            1           109
    Name: Survived, dtype: int64
    0.7420382165605095
    0.18890814558058924
       Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Embarked
    0         0       3    male  22.0      1      0   7.2500     N        S
    1         1       1  female  38.0      1      0  71.2833     C        C
    2         1       3  female  26.0      0      0   7.9250     N        S
    
       Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Cabin  Embarked
    0         0       3    1  22.0      1      0   7.2500      7         3
    1         1       1    0  38.0      1      0  71.2833      2         0
    2         1       3    0  26.0      0      0   7.9250      7         3
    [7 2 4 6 3 0 1 5 8]
    [1 0]
    [3 0 2 1]
    
       Pclass  Sex   Age  SibSp  Parch     Fare  Cabin  Embarked
    0       3    1  22.0      1      0   7.2500      7         3
    1       1    0  38.0      1      0  71.2833      2         0
    0    0
    1    1
    Name: Survived, dtype: int64
    (712, 8) (179, 8) (712,) (179,)
    ---------
    LogisticRegression acc : 0.79888
    DecisionTree acc : 0.72626
    RandomForest acc : 0.75978

     

    성별 생존 시각화

    댓글

Designed by Tistory.