Python 데이터 분석

Python 데이터분석 기초 66 - random 함수로 무작위 데이터를 생성하여 분석하기 (체질량지수(BMI))

코딩탕탕 2022. 11. 23. 16:26

 

 

"""
#  BMI : 체질량지수는 자신의 몸무게(kg)를 키(m)의 제곱으로 나눈 값입니다.

# 예)
print(71 / ((178/100)*(178/100))) # 체질량지수: 22.4


import random
random.seed(12)
def calc_bmi(h, w):
    # Classify a person by body-mass index.
    #   h: height in centimetres, w: weight in kilograms.
    # Returns 'thin' for BMI below 18.5, 'normal' for BMI below 25.0,
    # and 'fat' for everything else.
    height_m = h / 100
    index = w / height_m ** 2
    if index < 18.5:
        category = 'thin'
    elif index < 25.0:
        category = 'normal'
    else:
        category = 'fat'
    return category

# print(calc_bmi(178, 71))

# Generate 50,000 random (height, weight) samples, label each one with
# calc_bmi, and write them to bmi.csv as "height,weight,label" rows.
cnt = {'thin': 0, 'normal': 0, 'fat': 0}  # per-label tally of generated rows

# The with-block closes (and flushes) the file even if an error interrupts
# the loop, which the previous open()/close() pair did not guarantee.
with open('bmi.csv', 'w') as fp:
    fp.write('height,weight,label\n')
    for _ in range(50000):
        # Draw height first, then weight, so a fixed seed reproduces the
        # same file as before.
        h = random.randint(150, 200)  # height in cm
        w = random.randint(35, 100)   # weight in kg
        label = calc_bmi(h, w)
        cnt[label] += 1
        fp.write('{0},{1},{2}\n'.format(h, w, label))
"""


import pandas as pd
import numpy as np
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


# Load the generated samples and take a first look at them.
tbl = pd.read_csv('bmi.csv')
print(tbl.head(3), tbl.shape)  # shape: (50000, 3)
print(tbl.describe())

# Target column (still the raw string labels at this point).
label = tbl['label']
print(label.head(3))

# Scale both features into roughly [0, 1] by dividing by the largest
# value the generator can produce (100 kg, 200 cm).
w = tbl['weight'] / 100
h = tbl['height'] / 200
print(w.head(3))
print(h.head(3))

# Feature matrix: weight first, height second.
wh = pd.concat([w, h], axis='columns')
print(wh.head(3), wh.shape)  # shape: (50000, 2)

# Encode the string labels as integer class ids.
label_codes = {'thin': 0, 'normal': 1, 'fat': 2}
label = label.map(label_codes)
print(label.head(3))

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    wh,
    label,
    test_size=0.3,
    random_state=1,
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)  # (35000, 2) (15000, 2) (35000,) (15000,)

print()
# Fit a support-vector classifier on the training portion.
model = svm.SVC(C=0.1)
model.fit(x_train, y_train)

# Compare the first ten predictions with the true labels.
pred = model.predict(x_test)
print('예측값 :', pred[:10])
print('실제값 :', y_test.iloc[:10].values)

# Overall accuracy on the held-out set.
acc = metrics.accuracy_score(y_test, pred)
print('acc :', acc)  # 0.9909

print()
# 3-fold cross-validation over the full data set.
from sklearn import model_selection
cross_vali = model_selection.cross_val_score(model, wh, label, cv=3)
print('각각의 검증 정확도 :', cross_vali)
print('평균 검증 정확도 :', cross_vali.mean())

# Visualization: one scatter series per BMI class (x = weight, y = height).
# Re-index the table that is already in memory by its label column instead of
# parsing bmi.csv from disk a second time; the resulting frame matches
# pd.read_csv('bmi.csv', index_col=2).
tbl2 = tbl.set_index('label')

def scatter_func(lbl, color):
    """Plot every sample whose label is ``lbl`` in the given colour.

    Args:
        lbl: Class label to select from the index of ``tbl2``
            ('thin', 'normal' or 'fat'); also used as the legend entry.
        color: Matplotlib colour for the points.

    Raises:
        KeyError: If ``lbl`` does not occur in the index of ``tbl2``.
    """
    b = tbl2.loc[lbl]
    plt.scatter(b['weight'], b['height'], c=color, label=lbl)

scatter_func('fat', 'red')
scatter_func('normal', 'yellow')
scatter_func('thin', 'blue')
plt.legend()
plt.show()

# Predict the class of two unseen people.
# The raw values get the same scaling as the training features
# (weight / 100, height / 200) and the same column order.
raw_samples = pd.DataFrame({'weight': [66, 55], 'height': [170, 180]})
new_data = raw_samples.assign(
    weight=raw_samples['weight'] / 100,
    height=raw_samples['height'] / 200,
)
new_pred = model.predict(new_data)
print('새로운 예측값 :', new_pred)





<console>
   height  weight   label
0     180      69  normal
1     192      79  normal
2     159      83     fat (50000, 3)
             height        weight
count  50000.000000  50000.000000
mean     174.924900     67.577460
std       14.733304     19.049192
min      150.000000     35.000000
25%      162.000000     51.000000
50%      175.000000     68.000000
75%      188.000000     84.000000
max      200.000000    100.000000
0    normal
1    normal
2       fat
Name: label, dtype: object
0    0.69
1    0.79
2    0.83
Name: weight, dtype: float64
0    0.900
1    0.960
2    0.795
Name: height, dtype: float64
   weight  height
0    0.69   0.900
1    0.79   0.960
2    0.83   0.795 (50000, 2)
0    1
1    1
2    2
Name: label, dtype: int64
(35000, 2) (15000, 2) (35000,) (15000,)

예측값 : [2 0 1 1 0 0 2 1 0 0]
실제값 : [2 0 1 1 0 0 2 1 0 0]
acc : 0.9909333333333333

각각의 검증 정확도 : [0.99232015 0.99346013 0.99057962]
평균 검증 정확도 : 0.9921199691930799
새로운 예측값 : [1 0]

 

시각화

 

 

random 모듈로 임의의 키와 몸무게 데이터를 생성하고, 체질량지수(BMI)에 따라 thin / normal / fat으로 라벨링한 뒤 SVM(SVC) 분류 모델로 학습·예측해 보았다.