import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
# Build a toy data set of two well-separated square clusters with ten points
# each: one drawn uniformly from [0.5, 1.5) on both axes, the other from
# [3.5, 4.5) on both axes. The four draws stay in this order so that a
# seeded run produces the same samples as before.
c1x = np.random.uniform(0.5, 1.5, (1, 10))
c1y = np.random.uniform(0.5, 1.5, (1, 10))
c2x = np.random.uniform(3.5, 4.5, (1, 10))
c2y = np.random.uniform(3.5, 4.5, (1, 10))
# Concatenate the per-cluster coordinates into (1, 20) rows, then stack and
# transpose so that X holds one (x, y) sample per row: shape (20, 2).
x = np.hstack((c1x, c2x))
y = np.hstack((c1y, c2y))
X = np.vstack((x, y)).T
# Candidate cluster counts to evaluate: k = 1 .. 9.
K = range(1, 10)
# Average dispersion obtained for each k, in the same order as K.
meanDispersions = []
for k in K:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    # cdist returns the Euclidean distance from every sample to every fitted
    # centre; the row-wise minimum is each sample's distance to its nearest
    # centre. The dispersion for this k is the mean of those distances over
    # all samples (plain distances, not squared ones).
    nearest = cdist(X, kmeans.cluster_centers_, 'euclidean').min(axis=1)
    meanDispersions.append(nearest.mean())
# Draw the elbow curve (average dispersion against k) on the current axes
# and display the figure.
axes = plt.gca()
axes.plot(K, meanDispersions, 'bx-')
axes.set_xlabel('k')
axes.set_ylabel('Average Dispersion')
axes.set_title('Selecting k with the Elbow Method')
plt.show()
X为:
[[0.84223858 1.18059879]
 [0.84834276 0.84499409]
 [1.13263229 1.34316399]
 [0.95487981 0.59743761]
 [0.81646041 1.32361288]
 [0.90405171 0.54047701]
 [1.2723004  1.3461647 ]
 [0.52939142 1.03325549]
 [0.84592514 0.74344317]
 [1.07882783 1.4286598 ]
 [3.71702311 3.97510452]
 [3.95476036 3.83842502]
 [4.4297804  3.91854623]
 [4.08686159 4.15798624]
 [3.90406684 3.84413461]
 [4.32395689 4.06825926]
 [4.23112269 3.78578326]
 [3.70602931 4.08608482]
 [3.58690191 4.37072349]
 [4.38564657 4.02168693]]
随着 k 的增加，纵轴上的平均离差呈下降趋势并最终趋于稳定；曲线拐点（即“肘部”）所对应的 k 值，可以认为是相对最佳的聚类数量。