文本聚类

mac2025-12-27  6

# -*- coding: utf-8 -*-
# @Time : 2019/11/1 13:23
# @Author : Chicker
# @FileName: clusterr.py
# @Software: PyCharm
# @Blog :http://blog.csdn.net/u010105243/article/
"""K-Means clustering of a CSV feature table with t-SNE visualisation.

The script standardises every feature column, clusters the rows with
K-Means and projects them to 2-D / 3-D with t-SNE for plotting.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.manifold import TSNE
# Imported for its side effect: older Matplotlib releases only register the
# '3d' projection once mplot3d has been imported.
from mpl_toolkits.mplot3d import Axes3D

data_path = r'E:\研一上\数据挖掘\dataset\Case2-clustering\ALS.csv'
eps = 1e-8

# Number of clusters used by the visualisations and one colour per cluster.
_N_CLUSTERS = 5
_CLUSTER_COLORS = ('g', 'r', 'b', 'y', 'm')


def preprocess(data_path):
    """Load the CSV and z-score every feature column.

    Args:
        data_path: Path of the CSV file. The first column is dropped
            (presumably a row identifier -- confirm against the dataset).

    Returns:
        A ``(data_norm, data)`` tuple: the standardised DataFrame and the
        raw DataFrame, both without the first CSV column.
    """
    df = pd.read_csv(data_path)
    data = df.iloc[:, 1:]
    # eps belongs in the divisor: it guards against a zero standard
    # deviation (constant column) instead of shifting every value by eps.
    data_norm = data.apply(lambda x: (x - np.mean(x)) / (np.std(x) + eps))
    return data_norm, data


def myplot(data_path):
    """Show histograms of the first (up to) eight standardised features.

    Args:
        data_path: Path of the CSV file passed to :func:`preprocess`.
    """
    data_norm, _ = preprocess(data_path)
    # The grid is 2 x 4; do not index past the last column on narrow data.
    n_plots = min(8, data_norm.shape[1])
    for i in range(n_plots):
        plt.subplot(2, 4, i + 1)
        plt.hist(data_norm.iloc[:, i].values, color='m')
        plt.title(data_norm.columns[i])
    plt.show()


def definek(data_path):
    """Plot the mean distance to the nearest centre for k = 1..20.

    The curve is meant for picking the number of clusters by eye.

    Args:
        data_path: Path of the CSV file passed to :func:`preprocess`.
    """
    cluster_counts = list(range(1, 21))
    data_norm, _ = preprocess(data_path)
    # Cluster the standardised features so no single column dominates the
    # Euclidean distance.
    data_norm_values = data_norm.values.astype('float64')
    distance = []
    for k in cluster_counts:
        model = KMeans(n_clusters=k, random_state=10)
        model.fit(data_norm_values)
        nearest = np.min(cdist(data_norm_values, model.cluster_centers_),
                         axis=1)
        distance.append(np.mean(nearest))
    # The x-axis carries the real cluster count (1..20), not a 0-based index.
    plt.plot(cluster_counts, distance)
    plt.title('Distance with different clusters')
    plt.xlabel('clusters')
    plt.ylabel('Distance')
    plt.show()


def _cluster(data_path):
    """Fit K-Means on the standardised features.

    Returns ``(columns, features, labels)``: the feature column names, the
    float64 feature matrix and the 1-D array of cluster ids per row.
    """
    data_norm, _ = preprocess(data_path)
    features = data_norm.values.astype('float64')
    model = KMeans(n_clusters=_N_CLUSTERS, random_state=10)
    model.fit(features)
    labels = model.predict(features)
    return list(data_norm.columns), features, labels


def _embed(features, n_components):
    """Project *features* to *n_components* dimensions with t-SNE."""
    # The iteration count is left at the library default (1000); the keyword
    # for it differs between scikit-learn releases (n_iter vs. max_iter).
    tsne = TSNE(n_components=n_components, perplexity=30.0,
                early_exaggeration=12.0, learning_rate=200.0,
                n_iter_without_progress=300, min_grad_norm=1e-7,
                metric="euclidean", init="random", verbose=0,
                random_state=None, method='barnes_hut', angle=0.5)
    return tsne.fit_transform(features)


def _scatter_clusters(target, embedding, labels):
    """Draw one scatter series per cluster on *target* (pyplot or an Axes)."""
    for cluster_id, color in enumerate(_CLUSTER_COLORS):
        points = embedding[labels == cluster_id]
        # Unpack one coordinate column per embedding dimension (2-D or 3-D).
        target.scatter(*points.T, c=color)


def vis(data_path):
    """Cluster the data, save the labelled rows and show a 2-D t-SNE plot.

    Writes ``./result.csv`` containing the clustered feature values plus a
    ``label`` column.

    Args:
        data_path: Path of the CSV file passed to :func:`preprocess`.
    """
    columns, data_norm_values, label = _cluster(data_path)
    # column_stack promotes the labels to float64, matching the feature dtype.
    result = np.column_stack((data_norm_values, label))
    result_df = pd.DataFrame(result, columns=columns + ['label'])
    result_df.to_csv('./result.csv')

    # The cluster centres are not embedded: t-SNE on five points is not
    # meaningful with perplexity 30 and the projection was never plotted.
    low_dims_data = _embed(data_norm_values, 2)
    _scatter_clusters(plt, low_dims_data, label)
    plt.show()


def vis_triples(data_path):
    """Cluster the data and show a 3-D t-SNE scatter plot.

    Args:
        data_path: Path of the CSV file passed to :func:`preprocess`.
    """
    _, data_norm_values, label = _cluster(data_path)
    low_dims_data = _embed(data_norm_values, 3)

    fig = plt.figure()
    # add_subplot attaches the axes to the figure; a bare Axes3D(fig) is no
    # longer added automatically by current Matplotlib and renders nothing.
    ax = fig.add_subplot(111, projection='3d')
    _scatter_clusters(ax, low_dims_data, label)
    ax.set_xlabel('feature1')
    ax.set_ylabel('feature2')
    ax.set_zlabel('feature3')
    plt.show()


def main():
    """Run the 3-D visualisation on the configured dataset."""
    vis_triples(data_path)


if __name__ == '__main__':
    main()
最新回复(0)