# 방법4 : linregress를 사용. model O
from scipy import stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Predict exam score from IQ with a simple linear regression
# (scipy.stats.linregress).
score_iq = pd.read_csv('../testdata/score_iq.csv')
print(score_iq.head(3))
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None".
score_iq.info()
print(score_iq.corr())  # pairwise correlation matrix of the columns

x = score_iq.iq
y = score_iq.score
# Pearson correlation coefficient between iq and score
# (off-diagonal element of the 2x2 correlation matrix).
print(np.corrcoef(x, y)[0, 1])

# Fit the model: score = slope * iq + intercept.
model = stats.linregress(x, y)
print(model)
print('slope :', model.slope)
print('intercept :', model.intercept)
print('rvalue :', model.rvalue)
# The p-value tests the null hypothesis that the slope is zero. A value
# below 0.05 means the linear relationship is statistically significant;
# it indicates association and does not by itself establish causation.
print('pvalue :', model.pvalue)
print('stderr :', model.stderr)

# Scatter plot of the observations with the fitted regression line on top.
y_hat = model.slope * x + model.intercept
plt.scatter(x, y)
plt.plot(x, y_hat, c='red')
plt.show()

# Score prediction. The linregress result has no predict() method, so the
# fitted equation is evaluated by hand for individual IQ values ...
for iq_value in (140, 125):
    print('점수 예측 :', model.slope * iq_value + model.intercept)
print()

# ... or with np.polyval for several values at once. Passing the DataFrame
# yields a 2-D array with one predicted score per row.
new_df = pd.DataFrame({'iq': [140, 125, 123, 100, 95]})
print('점수 예측 :', np.polyval([model.slope, model.intercept], new_df))
<console>
sid score iq academy game tv
0 10001 90 140 2 1 0
1 10002 75 125 1 3 3
2 10003 77 120 1 0 4
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 sid 150 non-null int64
1 score 150 non-null int64
2 iq 150 non-null int64
3 academy 150 non-null int64
4 game 150 non-null int64
5 tv 150 non-null int64
dtypes: int64(6)
memory usage: 7.2 KB
None
sid score iq academy game tv
sid 1.000000 -0.014399 -0.007048 -0.004398 0.018806 0.024565
score -0.014399 1.000000 0.882220 0.896265 -0.298193 -0.819752
iq -0.007048 0.882220 1.000000 0.671783 -0.031516 -0.585033
academy -0.004398 0.896265 0.671783 1.000000 -0.351315 -0.948551
game 0.018806 -0.298193 -0.031516 -0.351315 1.000000 0.239217
tv 0.024565 -0.819752 -0.585033 -0.948551 0.239217 1.000000
0.8822203446134701
LinregressResult(slope=0.6514309527270075, intercept=-2.8564471221974657, rvalue=0.8822203446134699, pvalue=2.8476895206683644e-50, stderr=0.028577934409305443, intercept_stderr=3.546211918048538)
slope : 0.6514309527270075
intercept : -2.8564471221974657
rvalue : 0.8822203446134699
pvalue : 2.8476895206683644e-50
stderr : 0.028577934409305443
점수 예측 : 88.34388625958358
점수 예측 : 78.57242196867847
점수 예측 : [[88.34388626]
[78.57242197]
[77.26956006]
[62.28664815]
[59.02949339]]
iq에 따른 시험점수 상관계수 시각화
추세선 시각화