# K-means实验原理

k-means算法是基于原型的聚类算法。给定聚类簇数k后，k-means算法把数据样本划分到k个簇中，使得组内平方和（within-cluster sum of squares）最小。直观来看，该式衡量了簇内样本围绕簇均值向量的紧密程度，越小则簇内样本相似程度越高。

k-means算法需要手动选择一个超参数k，选取不同的初始均值向量也可能会得到不同的聚类结果。

# 代码如下

```python
#导入相关库
from sklearn.cluster import KMeans
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import csv

# Read the dataset: each row is (feature1, feature2, species-name).
# Map the species name (3rd column) to an integer class label so the
# clustering result can be compared against ground truth.
target2num = {'Iris-setosa': 0,
              'Iris-versicolor': 1,
              'Iris-virginica': 2}

with open('iris_2features.txt') as csvfile:
    # BUG FIX: the original did `list(lines)` with `lines` never defined;
    # the intended call is csv.reader over the opened file.
    dataset = list(csv.reader(csvfile))

# First two columns are numeric features, third is the class name.
# NOTE: np.float was removed in NumPy 1.20+; the builtin float is the
# documented replacement and is behaviorally identical here (float64).
x = np.array([sample[0:2] for sample in dataset], dtype=float)
y = np.array([target2num[sample[2]] for sample in dataset])

#Rand指数
def rand(y1, y2):
m = y1.shape[0]
a, d = 0, 0
for i in range(m):
for j in range(i+1, m):
if y1[i] == y1[j] and y2[i] == y2[j]:
a += 1
elif y1[i] != y1[j] and y2[i] != y2[j]:
d += 1
return 2*(a+d)/(m*(m-1))

# K-means clustering: first show the ground-truth labeling, then cluster
# with several values of k and report one external index (Rand, vs the
# true labels) and one internal index (Davies-Bouldin, labels-free).
print('真实情况')
plt.scatter(x[:, 0], x[:, 1], c=y)
plt.show()

ks = [2, 3, 4, 5]

for k in ks:
    print('k = {}'.format(k))
    # fit_predict returns the cluster index assigned to each sample
    y_pred = KMeans(n_clusters=k).fit_predict(x)
    ri = rand(y, y_pred)
    print('Rand指数 = {}'.format(ri))
    # Davies-Bouldin: lower is better; needs no ground-truth labels
    db = metrics.davies_bouldin_score(x, y_pred)
    print('DB指数 = {}'.format(db))
    plt.scatter(x[:, 0], x[:, 1], c=y_pred)
    plt.show()

```

THE END