# Predicting the Samsung Electronics closing stock price with an LSTM
# KRX: 005930
# !pip install finance-datareader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import FinanceDataReader as fdr
STOCK_CODE = '005930'
stock_data = fdr.DataReader(STOCK_CODE)
print(stock_data.head())
print(stock_data.tail())
print('Correlation:\n', stock_data.corr(method='pearson'))
stock_data.reset_index(inplace=True)
stock_data.drop(['Change'], axis='columns', inplace=True)  # the Change column is not used as a feature
print(stock_data.head(3))
print(stock_data.info())
# Split the Date column into year, month, and day
stock_data['year'] = stock_data['Date'].dt.year
stock_data['month'] = stock_data['Date'].dt.month
stock_data['day'] = stock_data['Date'].dt.day
print(stock_data.head(3))
print(stock_data.shape) # (6000, 9)
# Visualize the closing-price trend from 1998 onward
df = stock_data.loc[stock_data['year'] >= 1998]
plt.figure(figsize=(6,4))
sns.lineplot(x=df['year'], y=df['Close'], label='Close')
plt.xlabel('year')
plt.ylabel('Close')
plt.legend()
plt.show()
# Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scale_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
df_scaled = scaler.fit_transform(stock_data[scale_cols])
df_scaled = pd.DataFrame(df_scaled)
df_scaled.columns = scale_cols
print(df_scaled.head(3))
only_close = ['Close']
close_scaled = scaler.fit_transform(stock_data[only_close])  # refit on Close only so predictions can be inverse-transformed later
print('scaled values :', close_scaled[:5].ravel())
print('restored values :', scaler.inverse_transform(close_scaled[:5]).ravel())
print('original values :', stock_data['Close'].values[:5])
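# For reference, MinMaxScaler maps each column to [0, 1] via
# x_scaled = (x - x_min) / (x_max - x_min), and inverse_transform undoes it with
# x = x_scaled * (x_max - x_min) + x_min. Quick check against the fitted
# Close-only scaler (illustrative only):
print('manual scaled values :',
      (stock_data['Close'].values[:5] - scaler.data_min_[0]) / (scaler.data_max_[0] - scaler.data_min_[0]))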
# Predict the next day's close from the previous 20 days
TEST_SIZE = 200  # hold out the most recent 200 days as the test set
train = df_scaled[:-TEST_SIZE]  # everything except the last 200 days
test = df_scaled[-TEST_SIZE:]   # the most recent 200 days
print(train.shape) # (5800, 5)
print(test.shape) # (200, 5)
def make_dataset(data, label, window_size=20):
    # Slide a window of `window_size` rows over the data: each window becomes one
    # sample and its label is taken from the row immediately after the window.
    feature_list = []
    label_list = []
    for i in range(len(data) - window_size):
        feature_list.append(np.array(data.iloc[i:i + window_size]))
        label_list.append(np.array(label.iloc[i + window_size]))
    return np.array(feature_list), np.array(label_list)
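# Quick illustration of the sliding window on toy data (not the stock series):
# with 5 rows and window_size=2 we get 5 - 2 = 3 samples, each made of 2
# consecutive rows, and the label is the value right after each window.
toy = pd.DataFrame({'x': [1, 2, 3, 4, 5]})
toy_feature, toy_label = make_dataset(toy[['x']], toy[['x']], window_size=2)
print(toy_feature.shape, toy_label.ravel())  # (3, 2, 1) [3 4 5]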
# feature, label
feature_cols = ['Open', 'High', 'Low', 'Volume']
label_cols = ['Close']
train_feature = train[feature_cols]
train_label = train[label_cols]
test_feature = test[feature_cols]
test_label = test[label_cols]
train_feature, train_label = make_dataset(train_feature, train_label, 20)
print(train_feature[:2])
print(train_label[:2])
print(train_feature.shape, train_label.shape) # (5780, 20, 4) (5780, 1)
# train / test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_feature, train_label, test_size=0.2, shuffle=False)  # shuffle=False keeps the chronological order of the time series
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape) # (4624, 20, 4) (1156, 20, 4) (4624, 1) (1156, 1)
test_feature, test_label = make_dataset(test_feature, test_label, 20)
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.callbacks import EarlyStopping, ModelCheckpoint
model = Sequential()
model.add(LSTM(units=16, activation='tanh', input_shape=(train_feature.shape[1], train_feature.shape[2]), return_sequences=False))  # input_shape = (20 timesteps, 4 features)
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='linear'))
# 'mse' is sensitive to outliers. The Huber loss is differentiable everywhere while remaining robust to outliers.
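# With the Keras default delta=1.0, Huber(e) = 0.5 * e**2 for |e| <= delta and
# delta * (|e| - 0.5 * delta) otherwise. For an outlier residual e = 3 this gives
# a squared error of 9 but a Huber loss of only 2.5, so large errors grow
# linearly instead of quadratically.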
from keras.losses import Huber
loss = Huber()
# model.compile(optimizer='adam', loss='mse', metrics=['mse'])
model.compile(optimizer='adam', loss=loss, metrics=['mse'])
es = EarlyStopping(monitor='val_loss', mode='auto', patience=3)  # stop when val_loss has not improved for 3 epochs
mchkpoint = ModelCheckpoint('nlp18.h5', monitor='val_loss', save_best_only=True, verbose=0)  # keep only the best weights
history = model.fit(x_train, y_train, epochs=50, batch_size=8, validation_data=(x_test,y_test), verbose=2,
callbacks=[es, mchkpoint])
# Visualize training / validation loss
plt.figure(figsize=(6, 4))
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.legend()
plt.show()
# predict
from sklearn.metrics import r2_score
pred = model.predict(test_feature, verbose=0)
print('R² (coefficient of determination) :', r2_score(test_label, pred))
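# R² by hand for reference: 1 - SS_res / SS_tot (should agree with r2_score above)
ss_res = np.sum((test_label - pred) ** 2)
ss_tot = np.sum((test_label - np.mean(test_label)) ** 2)
print('manual R² :', 1 - ss_res / ss_tot)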
print('pred :', np.round(pred[:10].flatten(), 2))
print('pred (inverse-scaled) :', scaler.inverse_transform(pred[:10]).flatten())
print('real (inverse-scaled) :', scaler.inverse_transform(test_label[:10]).flatten())
# Visualize real vs. predicted closes
plt.figure(figsize=(6, 4))
plt.plot(test_label[:20], label='real')
plt.plot(pred[:20].flatten(), label='pred')
plt.legend()
plt.show()
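# A minimal sketch of forecasting the next trading day's close: feed the model the
# most recent 20-day feature window and invert the Close-only scaler to get KRW.
last_window = np.expand_dims(df_scaled[feature_cols].values[-20:], axis=0)  # shape (1, 20, 4)
next_close_scaled = model.predict(last_window, verbose=0)
print('estimated next-day close (KRW):', scaler.inverse_transform(next_close_scaled).ravel())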
<console>
Open High Low Close Volume Change
Date
1998-09-18 727 747 680 690 1432270 NaN
1998-09-19 660 706 653 697 794390 0.010145
1998-09-21 689 689 661 669 828650 -0.040172
1998-09-22 657 669 637 638 1026950 -0.046338
1998-09-23 643 662 624 649 1719730 0.017241
Open High Low Close Volume Change
Date
2022-12-14 59800 60600 59800 60500 8207485 0.013400
2022-12-15 59800 60200 59300 59300 8716039 -0.019835
2022-12-16 58300 59500 58300 59500 13033596 0.003373
2022-12-19 59500 59900 59100 59500 7696187 0.000000
2022-12-20 59000 59100 58600 58800 5367011 -0.011765
Correlation:
               Open      High       Low     Close    Volume    Change
Open 1.000000 0.999884 0.999915 0.998152 0.722377 -0.025527
High 0.999884 1.000000 0.999882 0.998273 0.725217 -0.020439
Low 0.999915 0.999882 1.000000 0.998274 0.720598 -0.021182
Close 0.998152 0.998273 0.998274 1.000000 0.721871 -0.015309
Volume 0.722377 0.725217 0.720598 0.721871 1.000000 -0.003857
Change -0.025527 -0.020439 -0.021182 -0.015309 -0.003857 1.000000
Date Open High Low Close Volume
0 1998-09-18 727 747 680 690 1432270
1 1998-09-19 660 706 653 697 794390
2 1998-09-21 689 689 661 669 828650
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 6000 non-null datetime64[ns]
1 Open 6000 non-null int64
2 High 6000 non-null int64
3 Low 6000 non-null int64
4 Close 6000 non-null int64
5 Volume 6000 non-null int64
dtypes: datetime64[ns](1), int64(5)
memory usage: 281.4 KB
None
Date Open High Low Close Volume year month day
0 1998-09-18 727 747 680 690 1432270 1998 9 18
1 1998-09-19 660 706 653 697 794390 1998 9 19
2 1998-09-21 689 689 661 669 828650 1998 9 21
(6000, 9)
Open High Low Close Volume
0 0.008051 0.007717 0.007598 0.000575 0.015860
1 0.007309 0.007293 0.007296 0.000653 0.008797
2 0.007630 0.007118 0.007385 0.000343 0.009176
scaled values : [0.00057546 0.00065293 0.00034306 0. 0.00012173]
restored values : [690. 697. 669. 638. 649.]
original values : [690 697 669 638 649]
(5800, 5)
(200, 5)
[[[0.00805094 0.00771694 0.00759777 0.01586016]
[0.00730897 0.00729339 0.00729609 0.00879663]
[0.00763012 0.00711777 0.00738547 0.00917601]
[0.00727575 0.00691116 0.00711732 0.01137187]
[0.00712071 0.00683884 0.00697207 0.01904333]
[0.00803987 0.0075 0.00791061 0.01235021]
[0.00827243 0.00786157 0.00801117 0.01733912]
[0.00795127 0.00757231 0.00795531 0.0033217 ]
[0.00820598 0.00780992 0.00795531 0.00601365]
[0.00791805 0.00771694 0.00792179 0.00650742]
[0.00816168 0.00769628 0.00804469 0.00534227]
[0.00797342 0.00747934 0.00782123 0.00431831]
[0.00771872 0.00733471 0.00767598 0.00396551]
[0.00797342 0.00760331 0.00791061 0.00868069]
[0.00805094 0.00769628 0.00811173 0.00855689]
[0.00848283 0.00840909 0.00830168 0.01628106]
[0.00894795 0.00852273 0.00887151 0.00901599]
[0.00885936 0.00876033 0.00858101 0.01349 ]
[0.00922481 0.00897727 0.00900559 0.01499311]
[0.00922481 0.0088843 0.00909497 0.01206717]]
[[0.00730897 0.00729339 0.00729609 0.00879663]
[0.00763012 0.00711777 0.00738547 0.00917601]
[0.00727575 0.00691116 0.00711732 0.01137187]
[0.00712071 0.00683884 0.00697207 0.01904333]
[0.00803987 0.0075 0.00791061 0.01235021]
[0.00827243 0.00786157 0.00801117 0.01733912]
[0.00795127 0.00757231 0.00795531 0.0033217 ]
[0.00820598 0.00780992 0.00795531 0.00601365]
[0.00791805 0.00771694 0.00792179 0.00650742]
[0.00816168 0.00769628 0.00804469 0.00534227]
[0.00797342 0.00747934 0.00782123 0.00431831]
[0.00771872 0.00733471 0.00767598 0.00396551]
[0.00797342 0.00760331 0.00791061 0.00868069]
[0.00805094 0.00769628 0.00811173 0.00855689]
[0.00848283 0.00840909 0.00830168 0.01628106]
[0.00894795 0.00852273 0.00887151 0.00901599]
[0.00885936 0.00876033 0.00858101 0.01349 ]
[0.00922481 0.00897727 0.00900559 0.01499311]
[0.00922481 0.0088843 0.00909497 0.01206717]
[0.00971207 0.00987603 0.00975419 0.01851357]]]
[[0.00353025]
[0.00417211]]
(5780, 20, 4) (5780, 1)
(4624, 20, 4) (1156, 20, 4) (4624, 1) (1156, 1)
2022-12-20 12:32:53.248658: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Epoch 1/50
578/578 - 5s - loss: 3.2635e-04 - mse: 6.5270e-04 - val_loss: 0.0061 - val_mse: 0.0121 - 5s/epoch - 9ms/step
Epoch 2/50
578/578 - 2s - loss: 2.4637e-05 - mse: 4.9273e-05 - val_loss: 0.0035 - val_mse: 0.0070 - 2s/epoch - 3ms/step
Epoch 3/50
578/578 - 2s - loss: 2.3148e-05 - mse: 4.6295e-05 - val_loss: 0.0025 - val_mse: 0.0050 - 2s/epoch - 4ms/step
Epoch 4/50
578/578 - 2s - loss: 2.1270e-05 - mse: 4.2541e-05 - val_loss: 0.0015 - val_mse: 0.0030 - 2s/epoch - 4ms/step
Epoch 5/50
578/578 - 2s - loss: 1.9913e-05 - mse: 3.9827e-05 - val_loss: 0.0011 - val_mse: 0.0023 - 2s/epoch - 4ms/step
Epoch 6/50
578/578 - 2s - loss: 1.8818e-05 - mse: 3.7635e-05 - val_loss: 9.4901e-04 - val_mse: 0.0019 - 2s/epoch - 3ms/step
Epoch 7/50
578/578 - 2s - loss: 1.7018e-05 - mse: 3.4036e-05 - val_loss: 6.5133e-04 - val_mse: 0.0013 - 2s/epoch - 4ms/step
Epoch 8/50
578/578 - 2s - loss: 1.7787e-05 - mse: 3.5574e-05 - val_loss: 4.7323e-04 - val_mse: 9.4647e-04 - 2s/epoch - 4ms/step
Epoch 9/50
578/578 - 2s - loss: 1.5573e-05 - mse: 3.1145e-05 - val_loss: 5.5514e-04 - val_mse: 0.0011 - 2s/epoch - 3ms/step
Epoch 10/50
578/578 - 2s - loss: 1.5467e-05 - mse: 3.0933e-05 - val_loss: 9.2030e-04 - val_mse: 0.0018 - 2s/epoch - 3ms/step
Epoch 11/50
578/578 - 2s - loss: 1.5910e-05 - mse: 3.1819e-05 - val_loss: 9.5229e-04 - val_mse: 0.0019 - 2s/epoch - 3ms/step
R² (coefficient of determination) : 0.5338959831493618
pred : [0.73 0.73 0.72 0.73 0.72 0.71 0.71 0.71 0.71 0.71]
pred (inverse-scaled) : [66518.586 66166.58 66002.34 66203.74 65722.97 65136.285 64868.406
 64739.223 64469.695 64387.508]
real (inverse-scaled) : [69100. 69300. 69200. 68500. 68000. 67800. 67900. 67000. 68700. 67500.]