1 k-Means算法
k-Means算法是一种经典的聚类算法,也称为K均值聚类算法。
k-Means的工作原理:
假设建立一个坐标系,这个坐标系的横坐标是价格,纵坐标是评论。然后根据每个物品的这两项特征将物品放置在该坐标系中,那么如何将这些物品划分为k个类。此时K为自定义。例如,可以定义k为2,既将所有的物品划分为两类。
首先,随机选择两个中心点A、B,这两点称为聚类中心。初始的聚类中心是随机选择的,很大概率上并不是真正的类中心,因此这两点会在后续的聚类过程中不断调整,直至趋于真正的聚类中心。
其次,分别计算各个物品到两个聚类中心A、B的距离,将其划分到距离较近的聚类中心一类。例如,点1到A的距离小于点1到B的距离,所以点1划分为A类。
再次,对每一类计算重心位置,并将该类的聚类中心移动到重心处,得到新的聚类中心A'、B'。然后再次计算各个点到两个聚类中心A'、B'的距离,按距离最短重新划分类别,并据此再次更新得到聚类中心A''、B''。
最后,不断迭代直至收敛,即聚类中心不再变化。
步骤简述:
1 初始化聚类中心(随机选择)
2 计算样本点到各个聚类中心的距离
3 将样本点归为距离较近的聚类中心一类
4 移动聚类中心到类别的重心位置,调整聚类中心
5 重复234直至聚类中心不再变化。
代码1:
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

# Demo: generate 100 random 2-D points and partition them into 3 clusters
# with k-means.
#
# KMeans signature for reference:
# sklearn.cluster.KMeans(n_clusters=8, init='k-means++', n_init=10,
#                        max_iter=300, tol=0.0001, random_state=None,
#                        copy_x=True, algorithm='auto')

# BUG FIX: the original snippet used `data` without ever creating it,
# even though the comment promised "100 random 2-column points".
data = np.random.rand(100, 2)

estimator = KMeans(n_clusters=3)
# Fit the model and predict the cluster index for every sample.
res = estimator.fit_predict(data)
# Cluster label assigned to each point (same as `res` after fit_predict).
label_pred = estimator.labels_
# Coordinates of the 3 cluster centers.
centroids = estimator.cluster_centers_
# Sum of squared distances of samples to their closest cluster center.
inertia = estimator.inertia_

print(label_pred)
print(centroids)
print(inertia)

# Color each point by its cluster label.
colors = {0: 'red', 1: 'black', 2: 'blue'}
for point, label in zip(data, label_pred):
    plt.scatter(point[0], point[1], color=colors[int(label)])
plt.show()
代码2:
import numpy as np
import matplotlib.pyplot as plt
# Not referenced directly, but importing it registers the '3d' projection
# on older matplotlib versions.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
from sklearn.cluster import KMeans
from sklearn import datasets

# Demo: cluster the iris dataset with three differently-configured KMeans
# estimators and show each result as a 3-D scatter plot, followed by the
# ground-truth species labels.
iris = datasets.load_iris()
X = iris.data
y = iris.target

estimators = [('k_means_iris_8', KMeans(n_clusters=8)),
              ('k_means_iris_3', KMeans(n_clusters=3)),
              ('k_means_iris_bad_init', KMeans(n_clusters=3, n_init=1,
                                               init='random'))]

fignum = 1
titles = ['8 clusters', '3 clusters', '3 clusters, bad initialization']
for name, est in estimators:
    fig = plt.figure(fignum, figsize=(4, 3))
    # FIX: constructing Axes3D(fig, ...) directly is deprecated; the
    # supported API is add_subplot(projection='3d') + view_init().
    ax = fig.add_subplot(projection='3d')
    ax.view_init(elev=48, azim=134)
    est.fit(X)
    labels = est.labels_
    # FIX: np.float was removed in NumPy 1.20 -- use the builtin float.
    ax.scatter(X[:, 3], X[:, 0], X[:, 2],
               c=labels.astype(float), edgecolor='k')
    # FIX: ax.w_xaxis and friends were removed in matplotlib 3.8; hide
    # the tick labels through the public Axes API instead.
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.set_zticklabels([])
    ax.set_xlabel('Petal width')
    ax.set_ylabel('Sepal length')
    ax.set_zlabel('Petal length')
    ax.set_title(titles[fignum - 1])
    fignum = fignum + 1

# Plot the ground truth.
fig = plt.figure(fignum, figsize=(4, 3))
ax = fig.add_subplot(projection='3d')
ax.view_init(elev=48, azim=134)
for name, label in [('Setosa', 0),
                    ('Versicolour', 1),
                    ('Virginica', 2)]:
    # Place the species name at the centroid of its true class.
    ax.text3D(X[y == label, 3].mean(),
              X[y == label, 0].mean(),
              X[y == label, 2].mean() + 2, name,
              horizontalalignment='center',
              bbox=dict(alpha=.2, edgecolor='w', facecolor='w'))
# Reorder the labels to have colors matching the cluster results.
y = np.choose(y, [1, 2, 0]).astype(float)
ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y, edgecolor='k')
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.set_xlabel('Petal width')
ax.set_ylabel('Sepal length')
ax.set_zlabel('Petal length')
ax.set_title('Ground Truth')
# FIX: fig.show() only works in interactive mode; plt.show() blocks and
# displays all figures.
plt.show()
代码3:
周志华西瓜书 【西瓜数据集4.0】
number,density,sugarcontent
1,0.697,0.460
2,0.774,0.376
3, 0.634,0.264
4,0.608,0.318
5,0.556,0.215
6,0.403,0.237
7,0.481,0.149
8,0.437,0.211
9,0.666,0.091
10,0.243,0.267
11,0.245,0.057
12,0.343,0.099
13,0.639,0.161
14,0.657,0.198
15,0.360,0.370
16,0.593,0.042
17,0.719,0.103
18,0.359,0.188
19,0.339,0.241
20,0.282,0.257
21,0.748,0.232
22,0.714,0.346
23,0.483,0.312
24,0.478,0.437
25,0.525,0.369
26,0.751,0.489
27,0.532,0.472
28,0.473,0.376
29,0.725,0.445
30,0.446,0.459
import numpy as np
import matplotlib.pyplot as plt
# Kept from the original snippet; not needed here since no 3-D plot is drawn.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
from sklearn.cluster import KMeans
import pandas as pd

# Demo: cluster the watermelon dataset 4.0 (density, sugar content) from
# Zhou Zhihua's book into 3 groups with k-means.
# NOTE(review): assumes xigua.csv has columns (number, density, sugarcontent).
xigua = pd.read_csv('xigua.csv')
estimator = KMeans(n_clusters=3, max_iter=500)

a1 = xigua.values
# Columns 1:3 are the two features (density, sugar content); column 0 is
# just the sample id and must not be clustered on.
features = a1[:, 1:3]
print(features)

# Fit the model and predict the cluster index for every sample.
res = estimator.fit_predict(features)
# Cluster label assigned to each point.
label_pred = estimator.labels_
# Coordinates of the 3 cluster centers.
centroids = estimator.cluster_centers_
# Sum of squared distances of samples to their closest cluster center.
inertia = estimator.inertia_

print(label_pred)
print(centroids)
print(inertia)

# BUG FIX: the original plotted columns 0 and 1 (sample id vs density)
# even though clustering used columns 1 and 2 (density vs sugar content),
# so the scatter plot did not show the clustered feature space.
colors = {0: 'red', 1: 'black', 2: 'yellow'}
for i in range(len(a1)):
    plt.scatter(a1[i][1], a1[i][2], color=colors[int(label_pred[i])])
plt.show()
领取专属 10元无门槛券
私享最新 技术干货