Python 데이터 분석
Python 데이터분석 기초 76 - 밀도 기반 클러스터링(DBSCAN)
코딩탕탕
2022. 11. 28. 10:39
DBSCAN(Density-based spatial clustering of applications with noise)
밀도 기반 클러스터링은 점들이 촘촘하게 몰려 있어 밀도가 높은 영역을 하나의 군집으로 묶는 방식이다. 쉽게 설명하면, 어느 점을 기준으로 반경 x 내에 점이 n개 이상 있으면 하나의 군집으로 인식하는 방식이다.
# 밀도 기반 클러스터링 : 데이터가 비선형인 경우 일반적인 계층적/비계층적 클러스터링이 불가. 이를 해결하기 위한 방안.
import matplotlib.pylab as plt
from matplotlib import style
import numpy as np
from sklearn.datasets import make_moons
from sklearn.cluster import KMeans, DBSCAN
# Sample data: make_moons generates two interleaving half-circles, a shape
# that is not linearly separable. `y` holds the true cluster id of each point.
x, y = make_moons(n_samples=200, noise=0.05, random_state=0)
print(x)
# .tolist() converts NumPy scalars to plain ints so the set prints as {0, 1}
# on every NumPy version (NumPy 2 changed the repr of NumPy scalars).
print('실제 군집 id :', set(y.tolist()))
# Uncomment to inspect the raw points before clustering:
# plt.scatter(x[:, 0], x[:, 1])
# plt.show()

# Cluster with KMeans.
# n_init is pinned to 10 (the default at the time this was written) so the
# result stays reproducible and no FutureWarning is raised on scikit-learn
# versions that changed the default to 'auto'.
km = KMeans(n_clusters=2, n_init=10, random_state=0)
pred1 = km.fit_predict(x)
print('예측 군집 id :', pred1[:10])
print('예측 군집 id :', set(pred1.tolist()))
# Kmeans 군집결과 시각화
def plotFunc(x, pr, centers=None):
    """Scatter-plot a two-cluster labelling of 2-D points and show the figure.

    Args:
        x: Array of points; column 0 is drawn on the horizontal axis and
            column 1 on the vertical axis.
        pr: Array of cluster labels aligned with the rows of ``x``. Labels 0
            and 1 are drawn as 'cluster1' and 'cluster2'. Label -1 (the label
            DBSCAN assigns to noise points) is drawn as 'noise' when present.
        centers: Optional array of centroid coordinates (one row per
            centroid). When omitted, no centroids are drawn. This keeps the
            function independent of any particular fitted model, so a
            DBSCAN result is not decorated with KMeans centroids.
    """
    plt.scatter(x[pr == 0, 0], x[pr == 0, 1], s=40, c='blue', marker='o', label='cluster1')
    plt.scatter(x[pr == 1, 0], x[pr == 1, 1], s=40, c='red', marker='s', label='cluster2')

    # Only add the noise series when there is something to show, so the
    # legend does not gain an empty entry for KMeans results.
    noise = pr == -1
    if noise.any():
        plt.scatter(x[noise, 0], x[noise, 1], s=40, c='gray', marker='x', label='noise')

    if centers is not None:
        plt.scatter(centers[:, 0], centers[:, 1], s=60, c='black', marker='+', label='centroid')

    plt.legend()
    plt.show()


# KMeans does not separate the two moons cleanly.
plotFunc(x, pred1, centers=km.cluster_centers_)

# Cluster with DBSCAN.
# eps: maximum distance between two samples for them to count as neighbours.
# min_samples: number of neighbours a point needs to be a core point.
ds = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')
pred2 = ds.fit_predict(x)
# DBSCAN has no centroids, so none are passed here.
plotFunc(x, pred2)
<console>
[[ 0.81680544 0.5216447 ]
[ 1.61859642 -0.37982927]
[-0.02126953 0.27372826]
[-1.02181041 -0.07543984]
[ 1.76654633 -0.17069874]
[ 1.8820287 -0.04238449]
[ 0.97481551 0.20999374]
[ 0.88798782 -0.48936735]
[ 0.89865156 0.36637762]
[ 1.11638974 -0.53460385]
[-0.36380036 0.82790185]
[ 0.24702417 -0.23856676]
[ 1.81658658 -0.13088387]
[ 1.2163905 -0.40685761]
[-0.8236696 0.64235178]
[ 0.98065583 0.20850838]
[ 0.54097175 0.88616823]
[ 0.348031 -0.30101351]
[ 0.35181497 0.88827765]
[-0.77708642 0.82253872]
[ 1.92590813 0.01214146]
[ 0.86096723 -0.47653216]
[ 0.19990695 0.99672359]
[ 1.2895301 -0.37071087]
[-0.27847636 1.02538452]
[ 0.24187916 -0.07627812]
[ 1.84988768 -0.09773674]
[ 1.88406869 0.0449402 ]
[ 0.165822 -0.08613126]
[ 0.13861369 0.89639036]
[ 0.89087024 0.52265882]
[-0.22806587 0.84091882]
[ 0.98279208 -0.46457771]
[ 0.04237749 0.19457898]
[ 0.76422612 0.67223332]
[ 1.91108938 0.21178339]
[ 0.43608432 -0.23007221]
[ 0.96186938 0.09923426]
[-0.84336684 0.52414334]
[-0.04122466 0.35721873]
[ 0.55507653 -0.42493298]
[-0.4388286 0.85940389]
[ 0.6532646 0.71235382]
[ 0.10274835 0.06721414]
[ 1.5486824 -0.34012196]
[-0.37318371 0.95506411]
[ 1.01706978 0.19210044]
[-0.71923685 0.65476676]
[ 0.16135772 -0.10771978]
[ 0.86434045 -0.4594568 ]
[-0.69717533 0.80133734]
[ 0.32791175 -0.19619019]
[ 1.98046734 0.03848682]
[-0.90479784 0.05723938]
[ 1.04515397 -0.50020349]
[ 0.7534213 0.65688005]
[ 0.54968577 0.73635744]
[ 1.24038086 -0.47577903]
[ 0.24918868 0.94246199]
[-0.20756105 0.99290594]
[ 0.35136403 -0.29065432]
[-1.01628753 0.16290244]
[ 1.78137056 -0.1244931 ]
[ 0.87423825 0.53065346]
[ 1.09997644 -0.46733763]
[-1.07022744 0.2365448 ]
[-0.15869858 1.01497482]
[ 1.46569247 -0.3808977 ]
[ 0.03025209 0.97792142]
[-0.9365943 0.45674926]
[ 0.66038307 -0.46576222]
[-0.99144728 0.40662094]
[ 0.46339847 -0.46605416]
[-0.132006 0.52447234]
[ 0.81566997 -0.42821617]
[-0.94820947 0.37717096]
[ 0.05300205 0.18597406]
[ 0.92648634 0.40988975]
[ 0.60689997 0.78279323]
[ 0.72961391 -0.37215252]
[ 1.9796026 0.12425417]
[-0.02053902 0.97601558]
[ 0.63818364 -0.49916763]
[ 2.00639179 0.44597642]
[ 0.02315539 0.24035667]
[-0.35883877 1.02716833]
[ 0.95414653 0.04177433]
[-0.33921532 0.96308888]
[ 0.59950492 -0.39774852]
[ 1.99019644 0.39360049]
[ 0.33125729 0.9365782 ]
[ 0.99460422 0.35063363]
[ 1.98845457 0.2628361 ]
[-0.67473718 0.76419738]
[ 2.00751107 0.3651166 ]
[ 1.78298331 -0.11490401]
[ 1.73616653 -0.22781554]
[ 0.40646216 -0.25422904]
[-1.02505346 0.24337404]
[ 0.06414296 0.07759793]
[ 1.30092145 -0.58089757]
[ 1.97425572 0.30889897]
[ 0.03228388 1.07937745]
[ 1.03086156 -0.02389082]
[-0.90062492 0.30653639]
[ 0.08068561 0.29131373]
[-0.98807765 0.1039765 ]
[-0.47394435 0.96143212]
[ 1.54651932 -0.35008497]
[ 0.23332453 0.89648984]
[-0.58481687 0.80318956]
[ 0.0374878 1.02322111]
[-0.01943215 1.07001032]
[-0.85323667 0.39896937]
[ 0.92635535 0.37695326]
[ 1.43250553 -0.50148981]
[ 0.60622756 0.66229531]
[ 1.94401554 0.13685573]
[ 0.57984414 -0.39868907]
[ 0.74317519 0.50998316]
[ 0.87116686 0.54105191]
[-0.71045745 0.57281877]
[-0.03081568 0.33644614]
[-0.0298505 0.99553114]
[-0.06313347 0.42194174]
[-0.79223214 0.68354165]
[ 0.92098434 0.04171051]
[ 0.17794377 0.04536893]
[ 1.34934828 -0.3941652 ]
[ 1.98387143 0.50898445]
[ 1.00104892 0.27158454]
[-0.5425424 0.76257612]
[-0.9969011 0.47226403]
[ 0.23408511 -0.15381658]
[ 1.21437019 -0.40862022]
[ 1.60101745 -0.17940652]
[ 1.15844202 -0.40408591]
[-1.00922523 0.2161359 ]
[ 2.01865957 0.50313426]
[ 0.88839866 0.39017093]
[ 0.10170896 -0.01206481]
[-0.01241966 0.47064905]
[ 0.44566504 0.94595998]
[-0.3569344 0.98319206]
[-0.43845037 0.88374167]
[ 1.01534178 0.06687469]
[ 0.2310607 0.01153495]
[ 1.35098772 -0.44520507]
[ 0.25423421 1.0205525 ]
[-0.00586456 0.24919627]
[ 0.4752852 -0.37028432]
[ 1.68071768 -0.34775296]
[ 0.84564282 0.45629647]
[ 0.34218757 0.90613948]
[ 0.58741368 -0.35078742]
[-0.17818292 0.96641541]
[ 1.25865528 -0.4740009 ]
[ 0.33542814 -0.18023343]
[ 0.52630774 0.94876068]
[ 0.6424051 0.77717105]
[ 0.15770292 0.04709417]
[ 1.11178863 -0.5065278 ]
[ 0.60370903 0.83759912]
[ 1.48247118 -0.32721961]
[ 0.39793421 -0.36876588]
[ 1.67240934 -0.09328043]
[ 0.47551295 0.85547255]
[ 0.70605116 -0.42241887]
[ 1.56418943 -0.34860626]
[ 0.94012854 -0.57508877]
[ 0.61400301 0.83833823]
[-1.07139757 0.02669316]
[-0.91308996 0.52626435]
[-0.74824469 0.51823742]
[ 0.14688241 0.0297201 ]
[ 0.94362014 -0.44829425]
[ 1.84489829 0.40601924]
[-0.66827347 0.69085682]
[-0.7362418 0.59951884]
[ 0.60146482 0.72551706]
[ 1.47437703 -0.37541022]
[-0.88760005 0.50864517]
[ 1.92892164 0.18201791]
[ 1.78673422 -0.27470711]
[ 1.95130228 0.26574549]
[ 0.33471666 0.98057089]
[-0.16884749 0.89206411]
[ 0.77063994 -0.51750338]
[-0.88700503 0.36696366]
[-0.62886492 0.79087211]
[-0.93006783 0.38754885]
[ 0.42447858 0.93268774]
[ 0.80861392 0.53599924]
[ 0.94000928 0.27111431]
[-0.01609181 0.37369612]
[-0.53633385 0.86026837]
[ 1.88281749 0.24435589]
[ 0.17575161 -0.007231 ]
[ 0.12423604 1.00790161]
[ 1.62152568 -0.22328525]]
실제 군집 id : {0, 1}
예측 군집 id : [1 1 0 0 1 1 1 1 1 1]
예측 군집 id : {0, 1}


KMeans를 사용하면 완전한 분리가 되지 않는다.

DBSCAN을 사용하면 완벽한 군집분류가 완성된다.