TensorFlow

TensorFlow 기초 23 - iris dataset 분류 모델 여러개 생성 후 성능 비교. 최종 모델 ROC curve 표현(모델 = 함수 사용)

코딩탕탕 2022. 12. 6. 12:28

 

 

# iris dataset 분류 모델 여러개 생성 후 성능 비교. 최종 모델 ROC curve 표현

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the iris dataset and peek at what the Bunch object exposes.
iris = load_iris()
print(iris.keys())

x, y = iris.data, iris.target
print(x[:2])
print(y[:2])

names = iris.target_names  # species names
print(names)  # ['setosa' 'versicolor' 'virginica']
feature_names = iris.feature_names
print(feature_names)  # ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

# One-hot encode the labels.
# Alternatives: keras to_categorical, numpy eye(), pandas get_dummies().
print(y[:1], y.shape)  # [0] (150,)
onehot = OneHotEncoder(categories='auto')
y_sparse = onehot.fit_transform(y.reshape(-1, 1))  # the encoder expects a 2-D column
y = y_sparse.toarray()
print(y[:1], y.shape)  # [[1. 0. 0.]] (150, 3)

# Standardize the features.
# NOTE: the scaler is fitted on all samples before the split below, so the
# test rows contribute to the mean/std that the training data is scaled with.
scaler = StandardScaler()
x_scaler = scaler.fit(x).transform(x)
print(x_scaler[:2])

# Train / test split (70% / 30%, fixed seed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(
    x_scaler,
    y,
    test_size=0.3,
    random_state=1,
)

# Column counts give the network's input and output sizes.
n_features, n_classes = x_train.shape[1], y_train.shape[1]
print('feature 수 : {}, label 수 : {}'.format(n_features, n_classes))

print('model')
from keras.models import Sequential
from keras.layers import Dense

def create_model_func(input_dim, output_dim, out_nodes, n, model_name='model'):
    """Return a zero-argument factory that builds a compiled dense classifier.

    The factory form lets the same architecture be instantiated repeatedly
    (fresh weights each time), e.g. once per model in a comparison loop or
    as a ``build_fn`` for a scikit-learn wrapper.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output classes (units of the softmax layer).
        out_nodes: Units in each hidden layer.
        n: Number of hidden ReLU layers. With ``n=0`` the network is just
            the softmax output layer.
        model_name: Name given to the ``Sequential`` model.

    Returns:
        A callable taking no arguments that returns a new model compiled
        with the Adam optimizer, categorical cross-entropy loss and the
        ``'acc'`` metric.
    """
    def create_model():
        model = Sequential(name=model_name)

        # Hidden layers first, then the softmax output layer.
        layer_specs = [(out_nodes, 'relu')] * n + [(output_dim, 'softmax')]

        for position, (units, activation) in enumerate(layer_specs):
            if position == 0:
                # Only the first layer declares the input size; later layers
                # take theirs from the previous layer. Doing it here (rather
                # than inside the hidden-layer loop only) also covers n == 0.
                model.add(Dense(units=units, input_dim=input_dim, activation=activation))
            else:
                model.add(Dense(units=units, activation=activation))

        # The metric is named 'acc' so history keys are 'acc' / 'val_acc'.
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

        return model
    return create_model
    
# One model factory per depth: 1, 2 and 3 hidden layers of 10 units each.
models = [
    create_model_func(n_features, n_classes, 10, depth, 'model_{}'.format(depth))
    for depth in range(1, 4)
]
print(len(models))

# Print the architecture of every candidate (each call builds a fresh model).
for build_model in models:
    print()
    candidate = build_model()
    candidate.summary()

# Train every candidate and keep its training history next to the fitted
# model, keyed by model name, for the plots further down.
history_dict = {}
for build_model in models:
    model = build_model()
    print('모델명 :', model.name)
    fit_history = model.fit(
        x_train,
        y_train,
        batch_size=5,
        epochs=50,
        validation_split=0.3,
        verbose=0,
    )
    # evaluate() returns [loss, acc] for the single 'acc' metric.
    test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
    print('test loss :', test_loss)
    print('test acc :', test_acc)
    history_dict[model.name] = [fit_history, model]

print(history_dict)

# Visualize model performance: validation accuracy (top) and validation
# loss (bottom) per epoch, one line per model.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 6))

for model_name, (fit_history, _fitted_model) in history_dict.items():
    curves = fit_history.history
    print('h_d :', curves['acc'])

    ax1.plot(curves['val_acc'], label=model_name)
    ax2.plot(curves['val_loss'], label=model_name)

# Axis labels and legends do not depend on the model, so set them once
# after all lines are drawn instead of on every iteration.
ax1.set_ylabel('val acc')
ax2.set_ylabel('val loss')
ax2.set_xlabel('epochs')
ax1.legend()
ax2.legend()

plt.show()

# Compare the trained models with ROC curves.
from sklearn.metrics import roc_curve, auc

plt.figure()
plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a visual reference

for model_name, (_, model) in history_dict.items():
    y_pred = model.predict(x_test)
    # Flatten the one-hot labels and the softmax outputs so every
    # (sample, class) pair is one binary decision; this gives a single
    # pooled curve per model.
    fpr, tpr, _ = roc_curve(y_test.ravel(), y_pred.ravel())
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='{}, AUC value : {:.3f}'.format(model_name, roc_auc))

plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC curve')
plt.legend()
plt.show()

print()
# Compare the model depths with k-fold cross-validation.
# NOTE(review): keras.wrappers.scikit_learn appears to exist only in older
# Keras releases - confirm against the installed version.
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

# (number of hidden layers, report format) for each cross-validated model.
cv_runs = (
    (1, 'acc : {:0.2f} (+/-{:0.2f})'),
    (2, 'acc2 : {:0.2f} (+/-{:0.2f})'),
    (3, 'acc3 : {:0.2f} (+/-{:0.2f})'),
)

for n_hidden, report_fmt in cv_runs:
    create_model = create_model_func(n_features, n_classes, 10, n_hidden)
    estimator = KerasClassifier(build_fn=create_model, epochs=50, batch_size=10, verbose=0)
    # NOTE(review): y is one-hot here, so cv=10 presumably uses plain,
    # unshuffled KFold rather than stratified folds - confirm against the
    # scikit-learn documentation.
    scores = cross_val_score(estimator, x_scaler, y, cv=10)
    print(report_fmt.format(scores.mean(), scores.std()))

print('-------')
# After the comparison above, pick the best model and build the final model ...



<console>
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]]
[0 0]
['setosa' 'versicolor' 'virginica']
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
[0] (150,)
[[1. 0. 0.]] (150, 3)
[[-0.90068117  1.01900435 -1.34022653 -1.3154443 ]
 [-1.14301691 -0.13197948 -1.34022653 -1.3154443 ]]
feature 수 : 4, label 수 : 3
model
3

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense (Dense)               (None, 10)                50        
                                                                 
 dense_1 (Dense)             (None, 3)                 33        
                                                                 
=================================================================
Total params: 83
Trainable params: 83
Non-trainable params: 0
_________________________________________________________________

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense_2 (Dense)             (None, 10)                50        
                                                                 
 dense_3 (Dense)             (None, 10)                110       
                                                                 
 dense_4 (Dense)             (None, 3)                 33        
                                                                 
=================================================================
Total params: 193
Trainable params: 193
Non-trainable params: 0
_________________________________________________________________

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense_5 (Dense)             (None, 10)                50        
                                                                 
 dense_6 (Dense)             (None, 10)                110       
                                                                 
 dense_7 (Dense)             (None, 10)                110       
                                                                 
 dense_8 (Dense)             (None, 3)                 33        
                                                                 
=================================================================
Total params: 303
Trainable params: 303
Non-trainable params: 0
_________________________________________________________________
모델명 : model_1
test loss : 0.4027646481990814
test acc : 0.800000011920929
모델명 : model_2
test loss : 0.27040863037109375
test acc : 0.9111111164093018
모델명 : model_3
test loss : 0.23972351849079132
test acc : 0.9333333373069763
{'model_1': [<keras.callbacks.History object at 0x000001EBD3F64130>, <keras.engine.sequential.Sequential object at 0x000001EBD3FDA250>], 'model_2': [<keras.callbacks.History object at 0x000001EBD54F5EB0>, <keras.engine.sequential.Sequential object at 0x000001EBD4012C70>], 'model_3': [<keras.callbacks.History object at 0x000001EBD690BE20>, <keras.engine.sequential.Sequential object at 0x000001EBD54F59D0>]}
h_d : [0.21917808055877686, 0.4109589159488678, 0.45205479860305786, 0.4794520437717438, 0.5479452013969421, 0.6301369667053223, 0.7123287916183472, 0.7397260069847107, 0.767123281955719, 0.7945205569267273, 0.7945205569267273, 0.7945205569267273, 0.8082191944122314, 0.835616409778595, 0.8493150472640991, 0.8493150472640991, 0.8630136847496033, 0.8630136847496033, 0.8767123222351074, 0.8767123222351074, 0.8767123222351074, 0.8767123222351074, 0.8767123222351074, 0.8767123222351074, 0.8767123222351074, 0.8767123222351074, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.9041095972061157]
h_d : [0.5479452013969421, 0.6438356041908264, 0.7123287916183472, 0.7397260069847107, 0.7397260069847107, 0.7397260069847107, 0.7397260069847107, 0.7123287916183472, 0.7123287916183472, 0.7123287916183472, 0.7123287916183472, 0.7123287916183472, 0.7260273694992065, 0.7260273694992065, 0.7397260069847107, 0.7397260069847107, 0.7534246444702148, 0.767123281955719, 0.7945205569267273, 0.8082191944122314, 0.8082191944122314, 0.835616409778595, 0.835616409778595, 0.8630136847496033, 0.8630136847496033, 0.8630136847496033, 0.8630136847496033, 0.8767123222351074, 0.8767123222351074, 0.8904109597206116, 0.8904109597206116, 0.9178082346916199, 0.9178082346916199, 0.9178082346916199, 0.9178082346916199, 0.9178082346916199, 0.9178082346916199, 0.9178082346916199, 0.9178082346916199, 0.9178082346916199, 0.9178082346916199, 0.9452054500579834, 0.9452054500579834, 0.9452054500579834, 0.9452054500579834, 0.9452054500579834, 0.9452054500579834, 0.9589040875434875, 0.9589040875434875, 0.9589040875434875]
h_d : [0.5890411138534546, 0.6438356041908264, 0.6438356041908264, 0.6849315166473389, 0.6849315166473389, 0.7397260069847107, 0.7808219194412231, 0.7945205569267273, 0.7945205569267273, 0.7945205569267273, 0.8082191944122314, 0.8219178318977356, 0.835616409778595, 0.8493150472640991, 0.8630136847496033, 0.8904109597206116, 0.8767123222351074, 0.8767123222351074, 0.8767123222351074, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.8767123222351074, 0.8904109597206116, 0.8904109597206116, 0.8904109597206116, 0.9178082346916199, 0.9041095972061157, 0.9178082346916199, 0.9178082346916199, 0.931506872177124, 0.931506872177124, 0.9452054500579834, 0.9452054500579834, 0.9452054500579834, 0.9452054500579834, 0.9452054500579834, 0.9589040875434875, 0.9589040875434875, 0.9726027250289917, 0.9726027250289917, 0.9726027250289917, 0.9726027250289917, 0.9726027250289917, 0.9726027250289917, 0.9726027250289917, 0.9726027250289917, 0.9726027250289917, 0.9726027250289917, 0.9726027250289917]

1/2 [==============>...............] - ETA: 0s
2/2 [==============================] - 0s 997us/step

1/2 [==============>...............] - ETA: 0s
2/2 [==============================] - 0s 0s/step

1/2 [==============>...............] - ETA: 0s
2/2 [==============================] - 0s 997us/step

acc : 0.85 (+/-0.16)
acc : 0.92 (+/-0.08)
acc : 0.93 (+/-0.07)

여러 모델을 만들어서 for문을 돌린 뒤 가장 뛰어난 성능을 가진 모델을 찾아보았다.

 

one-hot 인코딩 방법

sklearn : OneHotEncoder()

keras : to_categorical()

numpy : eye()

pandas : get_dummies()

 

val acc, val loss 모델 3개 시각화

 

ROC curve 시각화