# IMDB Movie Review Sentiment Analysis
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import imdb
(X_train, y_train), (X_test, y_test) = imdb.load_data()
print('Number of training reviews : {}'.format(len(X_train)))
print('Number of test reviews : {}'.format(len(X_test)))
num_classes = len(set(y_train))
print('Number of categories : {}'.format(num_classes))
print(set(y_train)) # {0, 1}
print(X_train[:1])
print(y_train[:1])
reviews_length = [len(review) for review in X_train]
print('Maximum review length : {}'.format(np.max(reviews_length)))
print('Average review length : {}'.format(np.mean(reviews_length)))
plt.subplot(1, 2, 1)
plt.boxplot(reviews_length)
plt.subplot(1, 2, 2)
plt.hist(reviews_length, bins=50)
plt.show()
plt.show()
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
print("각 레이블에 대한 빈도수:")
print(np.asarray((unique_elements, counts_elements)))
word_to_index = imdb.get_word_index()
index_to_word = {}
for key, value in word_to_index.items():
    # Looking an index up in index_to_word recovers the original word before preprocessing.
    index_to_word[value + 3] = key
# imdb.load_data() reserves indices 0-2 for special tokens, so every word index is shifted up by 3.
print('Most frequent word : {}'.format(index_to_word[4]))
print('3938th most frequent word : {}'.format(index_to_word[3941]))
# Check which words the integers in the first training review, X_train[0], originally were.
for index, token in enumerate(("<pad>", "<sos>", "<unk>")):
    index_to_word[index] = token
print(' '.join([index_to_word[index] for index in X_train[0]]))
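# A quick sanity check of that offset (an illustrative addition, not in the
# original run): the raw index of 'the' in imdb.get_word_index() is 1, which
# the loader stores as 1 + 3 = 4.
print(word_to_index['the'])                     # 1
print(index_to_word[word_to_index['the'] + 3])  # 'the'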
# Sentiment classification of IMDB reviews with LSTM/GRU
import re
from tensorflow.keras.datasets import imdb
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
vocab_size = 10000
max_len = 500
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
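# Illustrative addition: by default pad_sequences pre-pads shorter sequences
# with 0 and pre-truncates longer ones, so both toy sequences below come out
# at length 5.
print(pad_sequences([[1, 2, 3], [1, 2, 3, 4, 5, 6]], maxlen=5))
# [[0 0 1 2 3]
#  [2 3 4 5 6]]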
embedding_dim = 100
hidden_units = 128
# Model 1 : RNN (GRU)
# model = Sequential()
# model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
# model.add(GRU(hidden_units, activation='tanh'))
# model.add(Dense(1, activation='sigmoid'))
# Model 2 : CNN (Conv1D)
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dropout
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(Conv1D(filters=256, kernel_size=3, padding='valid', strides=1, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.summary()
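# Where the parameter counts in the summary come from (worked out by hand):
#   Embedding : vocab_size * embedding_dim                       = 10000*100       = 1,000,000
#   Conv1D    : kernel_size * embedding_dim * filters + filters  = 3*100*256 + 256 = 77,056
#   Dense(64) : 256*64 + 64 = 16,448 ; Dense(1) : 64 + 1 = 65
# The Conv1D output length is max_len - kernel_size + 1 = 498 because padding='valid'.
assert vocab_size * embedding_dim == 1_000_000
assert 3 * embedding_dim * 256 + 256 == 77_056
assert 1_000_000 + 77_056 + 16_448 + 65 == 1_093_569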
# Stop once val_loss has not improved for 3 epochs; checkpoint the model with the best val_acc.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=3)
mc = ModelCheckpoint('rnn16_model.h5', monitor='val_acc', mode='max', verbose=0, save_best_only=True)
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train, epochs=50, callbacks=[es, mc], batch_size=64, validation_split=0.2)
print('evaluate : ', model.evaluate(X_test, y_test))
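# Optional addition: plot the training curves recorded in `history`, reusing
# the matplotlib import from the top of the script.
epochs = range(1, len(history.history['loss']) + 1)
plt.plot(epochs, history.history['loss'], label='train loss')
plt.plot(epochs, history.history['val_loss'], label='val loss')
plt.xlabel('epoch')
plt.legend()
plt.show()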
pred = model.predict(X_test[:20])
print('Predicted :', np.where(pred > 0.5, 1, 0).flatten())
print('Actual    :', y_test[:20])
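# Optional addition: the share of these 20 predictions that match the labels.
print('Match rate :', np.mean(np.where(pred > 0.5, 1, 0).flatten() == y_test[:20]))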
def sentiment_predict(new_sentence):
    # Strip everything except alphanumerics and spaces, then lowercase.
    new_sentence = re.sub('[^0-9a-zA-Z ]', '', new_sentence).lower()
    encoded = []
    # Tokenize on whitespace and integer-encode each word.
    for word in new_sentence.split():
        try:
            # The vocabulary is capped at 10,000, and load_data() shifted every index by 3.
            if word_to_index[word] + 3 < vocab_size:
                encoded.append(word_to_index[word] + 3)
            else:
                # Words whose shifted index falls outside the vocabulary become <unk>.
                encoded.append(2)
        # Words missing from the vocabulary also become <unk>.
        except KeyError:
            encoded.append(2)
    pad_sequence = pad_sequences([encoded], maxlen=max_len)
    score = float(model.predict(pad_sequence)[0][0])  # predicted probability of a positive review
    if score > 0.5:
        print("{:.2f}% likely a positive review.".format(score * 100))
    else:
        print("{:.2f}% likely a negative review.".format((1 - score) * 100))
test_input = "This movie was just way too overrated. The fighting was not professional and in slow motion. I was expecting more from a 200 million budget movie. The little sister of T.Challa was just trying too hard to be funny. The story was really dumb as well. Don't watch this movie if you are going because others say its great unless you are a Black Panther fan or Marvels fan."
sentiment_predict(test_input)
test_input = " I was lucky enough to be included in the group to see the advanced screening in Melbourne on the 15th of April, 2012. And, firstly, I need to say a big thank-you to Disney and Marvel Studios. \
Now, the film... how can I even begin to explain how I feel about this film? It is, as the title of this review says a 'comic book triumph'. I went into the film with very, very high expectations and I was not disappointed. \
Seeing Joss Whedon's direction and envisioning of the film come to life on the big screen is perfect. The script is amazingly detailed and laced with sharp wit a humor. The special effects are literally mind-blowing and the action scenes are both hard-hitting and beautifully choreographed."
sentiment_predict(test_input)
<console>
Number of training reviews : 25000
Number of test reviews : 25000
Number of categories : 2
{0, 1}
[list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32])]
[1]
Maximum review length : 2494
Average review length : 238.71364
Frequency of each label:
[[    0     1]
 [12500 12500]]
Most frequent word : the
3938th most frequent word : suited
Model: "sequential"
_________________________________________________________________
 Layer (type)                       Output Shape          Param #
=================================================================
 embedding (Embedding)              (None, 500, 100)      1000000
 conv1d (Conv1D)                    (None, 498, 256)      77056
 global_max_pooling1d               (None, 256)           0
 (GlobalMaxPooling1D)
 dropout (Dropout)                  (None, 256)           0
 dense (Dense)                      (None, 64)            16448
 dropout_1 (Dropout)                (None, 64)            0
 dense_1 (Dense)                    (None, 1)             65
=================================================================
Total params: 1,093,569
Trainable params: 1,093,569
Non-trainable params: 0
_________________________________________________________________
evaluate : [0.32060739398002625, 0.8897200226783752]
1/1 [==============================] - 0s 61ms/step
Predicted : [0 1 1 0 1 1 1 0 1 1 1 0 0 1 1 0 1 0 0 0]
Actual    : [0 1 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 0]
1/1 [==============================] - 0s 9ms/step
99.97% likely a negative review.
1/1 [==============================] - 0s 9ms/step
99.64% likely a positive review.