
数据挖掘(第八章聚类分析)
# External clustering metrics printout.
# NOTE(review): this line was a garbled run of three fused print statements,
# with the adjusted-Rand print duplicated and mislabeled "AMI" while calling
# metrics.adjusted_rand_score (which is the adjusted Rand index, ARI).
# De-duplicated and corrected the label.
print("NMI指数:%0.3f" % metrics.normalized_mutual_info_score(y, y_pred))
print("调整兰德指数ARI:%0.3f" % metrics.adjusted_rand_score(y, y_pred))
# --- 1. K-means clustering on a synthetic Blobs data set -------------------
# Clustering is unsupervised: it needs no class labels or label data.
# Typical pipeline: preprocessing -> similarity measure -> parameter tuning
# -> run the algorithm -> evaluation -> interpretation / application.
# K-means is a partition-based algorithm; it suits convex / spherical
# clusters, and a smaller SSE means higher intra-cluster similarity
# (good clustering = high intra-cluster, low inter-cluster similarity).
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# 1. Build the data set: 200 points drawn from 4 Gaussian blobs.
n_samples = 200
X, y = make_blobs(n_samples=n_samples, random_state=9, centers=4, cluster_std=1)

# 2. Create the K-means model and fit/predict cluster labels.
model = KMeans(n_clusters=4, random_state=12345)
y_pred = model.fit_predict(X)

# 3. Report results: SSE (sum of squared distances to centroids) and centroids.
print('sse的值:', model.inertia_)
print('质心:', model.cluster_centers_)

# 4. Plot each predicted cluster with its own marker and colour.
plt.figure(figsize=(5, 5))
plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK text in the figure
plt.rcParams['axes.unicode_minus'] = False
for cluster_id, (mk, col) in enumerate(zip('Dosv', 'gbmr')):
    pts = X[y_pred == cluster_id]
    plt.scatter(pts[:, 0], pts[:, 1], marker=mk, color=col)
plt.title('K-means,k=4')
plt.show()
# --- 2. Performance metrics for clustering ---------------------------------
# Internal (unsupervised) metrics — no ground-truth labels required:
#   2.1 silhouette coefficient, range [-1, 1]; closer to 1 = better clustering
#   2.2 Calinski-Harabasz index; larger = tighter clusters, better separated
# External metrics — compared against the true labels:
#   2.3 Rand index RI, range [0, 1]; 1 when the clustering matches perfectly
#   2.4 mutual information MI
# 2.5 Evaluate the clustering produced above (uses X, y, y_pred from it).
from sklearn import metrics

print('1.内部度量指标')
print("轮廓系数: %0.3f " % metrics.silhouette_score(X, y_pred))
print("CH系数: %0.3f " % metrics.calinski_harabasz_score(X, y_pred))
print("2.外部度量指标")
print('AR: %0.3f' % metrics.adjusted_rand_score(y, y_pred))
print("NMI指数:%0.3f" % metrics.normalized_mutual_info_score(y, y_pred))
# --- 3. DBSCAN: density-based clustering on a noisy Moons data set ---------
# DBSCAN handles clusters of arbitrary shape, determines the number of
# clusters automatically (no k to choose), and labels outliers as noise.
#   3.1 eps        : neighbourhood radius; pick near the knee of a
#                    k-distance curve
#   3.2 min_samples: governs whether a point is core / border / noise
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
from sklearn import metrics

plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK text
plt.rcParams['axes.unicode_minus'] = False    # keep '-' from showing as a box

# 1. Data set: two interleaving half moons plus 2 injected noise points.
n_samples = 200
X, y = make_moons(n_samples=n_samples, random_state=9, noise=0.1)
# Prepend two outliers (delete these two lines for a noise-free run).
X = np.insert(X, 0, values=np.array([[1.5, 0.5], [-0.5, 0]]), axis=0)
y = np.insert(y, 0, [0, 0], axis=0)

# 2. Fit DBSCAN; label -1 marks noise, other labels are cluster ids (0, 1, …).
model = DBSCAN(eps=0.2, min_samples=4)
y_pred = model.fit_predict(X)
# Count clusters found, excluding the noise label.
n_clusters_ = len(set(y_pred)) - (1 if -1 in y_pred else 0)

# 3. Evaluate. BUG FIX: the label previously read "AMI", but the metric
# printed is metrics.adjusted_rand_score — the adjusted Rand index (ARI).
print('聚类的簇数: %d' % n_clusters_)
print('轮廓系数: %0.3f' % metrics.silhouette_score(X, y_pred))
print('调整兰德指数ARI: %0.3f' % metrics.adjusted_rand_score(y, y_pred))

# 4. Plot. Boolean mask of core samples so cores can be drawn larger.
core_samples_mask = np.zeros_like(model.labels_, dtype=bool)
core_samples_mask[model.core_sample_indices_] = True

set_marker = ['o', 'v', 'x', 'D', '>', 'p', '<']
set_color = ['b', 'r', 'm', 'g', 'c', 'k', 'tan']

# Original data set coloured by the true labels.
plt.figure(figsize=(5, 5))
for i in range(n_clusters_):
    plt.scatter(X[y == i][:, 0], X[y == i][:, 1], marker=set_marker[i],
                color='none', edgecolors=set_color[i])
plt.title(' Moons数据集(带2个噪声点)', fontsize=14)

# DBSCAN result: core points large, border/noise points small, noise black.
plt.figure(figsize=(5, 5))
unique_labels = set(y_pred)
i = -1  # marker index (wraps around if more labels than markers)
for k, col in zip(unique_labels, set_color[0: len(unique_labels)]):
    if k == -1:
        col = 'k'  # draw noise points in black
    class_member_mask = (y_pred == k)
    i += 1
    if i >= len(unique_labels):
        i = 0
    # Core points of cluster k.
    xcore = X[class_member_mask & core_samples_mask]
    plt.plot(xcore[:, 0], xcore[:, 1], set_marker[i], markerfacecolor=col,
             markeredgecolor='k', markersize=8)
    # Border points of cluster k (and the noise points when k == -1).
    xncore = X[class_member_mask & ~core_samples_mask]
    plt.plot(xncore[:, 0], xncore[:, 1], set_marker[i], markerfacecolor=col,
             markeredgecolor='k', markersize=4)
plt.title('DBSCAN算法的聚类结果: 识别的簇= %d' % n_clusters_, fontsize=14)
plt.show()
# --- 4. Gaussian Mixture Model (GMM) clustering ----------------------------
# A probabilistic, generative clustering algorithm.
#   4.1 vs K-means: the number of components must still be specified, but
#       clusters may be elliptical rather than spherical.
#   4.2 choosing the number of components: BIC (Bayesian information criterion)
#   4.3 demo: GMM on the same Blobs data set as the K-means example.
from sklearn.mixture import GaussianMixture
from sklearn.datasets import make_blobs
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
from utils import draw_ellipse  # project-local helper that draws one Gaussian's ellipse

# 1. Data set: 200 points in 4 Gaussian blobs.
n_samples = 200
X, y = make_blobs(n_samples=n_samples, random_state=9, centers=4, cluster_std=1)

# 2. Fit a 4-component, full-covariance GMM and predict cluster labels.
K = 4
model = GaussianMixture(n_components=K, covariance_type='full', random_state=15)
y_pred = model.fit_predict(X)

# 3. Evaluate. BUG FIX: the label previously read "AMI", but the metric
# printed is metrics.adjusted_rand_score — the adjusted Rand index (ARI).
print("轮廓系数:%0.3f" % metrics.silhouette_score(X, y_pred))
print("调整兰德指数ARI:%0.3f" % metrics.adjusted_rand_score(y, y_pred))

# 4. Plot the predicted clusters and overlay the fitted Gaussian ellipses.
plt.figure(figsize=(5, 5))
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
colors = ['red', 'blue', 'green', 'purple']
markers = ['o', '^', 's', 'p']
for i in range(K):
    plt.scatter(X[y_pred == i][:, 0], X[y_pred == i][:, 1],
                marker=markers[i], color=colors[i])
# One ellipse per mixture component (weight w is unused by draw_ellipse here).
for p, c, w in zip(model.means_, model.covariances_, model.weights_):
    draw_ellipse(p, c, alpha=0.05)
plt.title("GMM聚类结果,k=%d" % K, fontsize=14)
plt.show()
更多推荐
所有评论(0)