# 선형회귀분석 : iris dataset으로 모델 생성
# 약한 상관관계 변수, 강한 상관관계 변수로 모델 작성
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
# Load the iris example dataset through seaborn; the result is a pandas
# DataFrame with four numeric measurement columns plus a string 'species'.
iris = sns.load_dataset('iris')
print(iris.head(3))
print(type(iris))
# Pairwise correlation matrix. The string 'species' column must be excluded
# explicitly: pandas >= 2.0 no longer drops non-numeric columns silently and
# raises ValueError when DataFrame.corr() meets them.
print(iris.select_dtypes(include='number').corr())
# Exercise 1: regress sepal_length on sepal_width, a weakly correlated pair.
print('연습1 : 약한 상관관계 변수 - sepal_length, sepal_width')
model1 = smf.ols(formula='sepal_length ~ sepal_width', data=iris)
result1 = model1.fit()
print('요약결과1 :', result1.summary())
# R-squared is about 0.0138, so the model explains very little of the variance.
print('R-squared :', result1.rsquared)
# The sepal_width p-value is about 0.152 (> 0.05): not a significant predictor.
print('p-value :', result1.pvalues)

# Compare the first observed values with this weak model's fitted values.
fitted1 = result1.predict()
print('실제값 :', iris['sepal_length'].head(5).to_numpy())
print('예측값 :', fitted1[:5])

# Visualize model 1: observations plus the fitted regression line in red.
plt.scatter(iris['sepal_width'], iris['sepal_length'])
plt.plot(iris['sepal_width'], fitted1, color='red')
plt.show()
# Exercise 2: regress sepal_length on petal_width, a strongly correlated pair.
print('\n연습2 : 강한 상관관계 변수 - sepal_length, petal_width')
result2 = smf.ols(formula = 'sepal_length ~ petal_width', data = iris).fit()
# Label corrected from the copy-pasted '요약결과1': this is the second model.
print('요약결과2 :', result2.summary())
print('R-squared :', result2.rsquared) # about 0.669, so explanatory power is high.
print('p-value :', result2.pvalues) # petal_width is about 2.33e-37 < 0.05, so it is significant.
# Compare the first observed values with the fitted values of this model
# (unlike exercise 1, this one is statistically meaningful).
print('실제값 :', iris.sepal_length[:5].values)
print('예측값 :', result2.predict()[:5])
# Visualize model 2: observations plus the fitted regression line in blue.
plt.scatter(iris.petal_width, iris.sepal_length)
plt.plot(iris.petal_width, result2.predict(), c = 'b') # draw the trend line
plt.show()
# Predict sepal_length for previously unseen petal_width values using model 2.
print('새로운 값(petal_width)으로 결과 예측(sepal_length)')
# The column name must match the predictor name used in the model formula.
# NOTE(review): the larger inputs (5.5, 7.7) look well beyond typical iris
# petal widths, so those predictions are extrapolations - confirm intent.
new_data = pd.DataFrame([1.1, 3.3, 5.5, 7.7], columns=['petal_width'])
y_pred = result2.predict(exog=new_data)
print('예측 결과 :', y_pred.to_numpy())
<console>
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
<class 'pandas.core.frame.DataFrame'>
sepal_length sepal_width petal_length petal_width
sepal_length 1.000000 -0.117570 0.871754 0.817941
sepal_width -0.117570 1.000000 -0.428440 -0.366126
petal_length 0.871754 -0.428440 1.000000 0.962865
petal_width 0.817941 -0.366126 0.962865 1.000000
연습1 : 약한 상관관계 변수 - sepal_length, sepal_width
요약결과1 : OLS Regression Results
==============================================================================
Dep. Variable: sepal_length R-squared: 0.014
Model: OLS Adj. R-squared: 0.007
Method: Least Squares F-statistic: 2.074
Date: Tue, 15 Nov 2022 Prob (F-statistic): 0.152
Time: 14:46:20 Log-Likelihood: -183.00
No. Observations: 150 AIC: 370.0
Df Residuals: 148 BIC: 376.0
Df Model: 1
Covariance Type: nonrobust
===============================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------
Intercept 6.5262 0.479 13.628 0.000 5.580 7.473
sepal_width -0.2234 0.155 -1.440 0.152 -0.530 0.083
==============================================================================
Omnibus: 4.389 Durbin-Watson: 0.952
Prob(Omnibus): 0.111 Jarque-Bera (JB): 4.237
Skew: 0.360 Prob(JB): 0.120
Kurtosis: 2.600 Cond. No. 24.2
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
R-squared : 0.013822654141080748
p-value : Intercept 6.469702e-28
sepal_width 1.518983e-01
dtype: float64
실제값 : [5.1 4.9 4.7 4.6 5. ]
예측값 : [5.74445884 5.85613937 5.81146716 5.83380326 5.72212273]
연습2 : 강한 상관관계 변수 - sepal_length, petal_width
요약결과1 : OLS Regression Results
==============================================================================
Dep. Variable: sepal_length R-squared: 0.669
Model: OLS Adj. R-squared: 0.667
Method: Least Squares F-statistic: 299.2
Date: Tue, 15 Nov 2022 Prob (F-statistic): 2.33e-37
Time: 14:46:21 Log-Likelihood: -101.11
No. Observations: 150 AIC: 206.2
Df Residuals: 148 BIC: 212.2
Df Model: 1
Covariance Type: nonrobust
===============================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------
Intercept 4.7776 0.073 65.506 0.000 4.634 4.922
petal_width 0.8886 0.051 17.296 0.000 0.787 0.990
==============================================================================
Omnibus: 2.390 Durbin-Watson: 1.917
Prob(Omnibus): 0.303 Jarque-Bera (JB): 1.939
Skew: 0.254 Prob(JB): 0.379
Kurtosis: 3.229 Cond. No. 3.70
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
R-squared : 0.6690276860464136
p-value : Intercept 3.340431e-111
petal_width 2.325498e-37
dtype: float64
실제값 : [5.1 4.9 4.7 4.6 5. ]
예측값 : [4.95534547 4.95534547 4.95534547 4.95534547 4.95534547]
새로운 값(petal_width)으로 결과 예측(sepal_length)
예측 결과 : [ 5.75506769 7.70994425 9.66482081 11.61969737]
iris.sepal_width, iris.sepal_length의 약한 상관관계의 시각화(안 좋은 모델)
iris.petal_width, iris.sepal_length 강한 상관관계의 시각화(좋은 모델)