I have been studying the principles of KPCA (kernel PCA).
For the theory, see this article: https://zhuanlan.zhihu.com/p/59775730. Below is my Python implementation of KPCA.
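Before the code, a quick summary of what training does, in my own notation but matching the steps listed in the docstring and implemented in fit_transform below: given $m$ training samples, build the kernel matrix and center it,

$$K_{ij} = \kappa(x_i, x_j), \qquad \hat{K} = K - \mathbf{1}_m K - K\mathbf{1}_m + \mathbf{1}_m K \mathbf{1}_m,$$

where $\mathbf{1}_m$ is the $m \times m$ matrix with every entry equal to $1/m$. Then take the $d'$ largest eigenpairs $(\lambda_j, \alpha_j)$ of $\hat{K}$, scale $w_j = \alpha_j / \sqrt{\lambda_j}$, and project the training data as $Z = \hat{K} W$. A test sample is projected with the same $W$ after its kernel values against the training set are centered using the training-set statistics.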
#!/usr/bin/env python
# coding: utf-8

# In[1]:

'''
myKPCA
2019.10.31
Reference: Zhihua Zhou. Machine learning[M]. Tsinghua University Press, 2016
Training steps: 1. compute the kernel matrix K; 2. center it to get hat_K;
                3. take the k largest eigenvalues and eigenvectors of hat_K
Dataset: HNR
Notes: 'linear' and 'poly' run at a reasonable speed and give decent accuracy.
       'rbf' is slow because the kernel matrix is built with a for loop and has not been optimized.
As with PCA, the reduced dimension can be chosen via the reconstruction threshold.
'''

import numpy as np
# from scipy.spatial.distance import pdist, squareform

'''
class myKPCA
Three kernel functions are available; the polynomial and Gaussian kernels take an extra
parameter sigma. The default kernel is 'linear'.
As with PCA, the reduced dimension d' can either be given directly or be chosen via the
reconstruction threshold.
'''
class myKPCA:
    '''
    Initialize the class.
    Input:
        kernel: kernel function, default 'linear'
            'linear' ==> linear kernel; 'poly' ==> polynomial kernel; 'rbf' ==> Gaussian kernel
        n_components: the dimension after dimensionality reduction;
            if n_components=0 it is chosen via the reconstruction threshold
        t: reconstruction threshold, default 0.95
        sigma: width parameter of the rbf kernel / degree of the polynomial kernel
    '''
    def __init__(self, kernel='linear', n_components=0, t=0.95, sigma=1):
        self.kernel = kernel
        self.n_components = n_components
        self.t = t
        self.sigma = sigma
        self.w = []    # [x_train num_sample, n_components]
        self.K = []    # [x_train num_sample, x_train num_sample]
        self.X = []    # [x_train num_sample, x_train num_feature]

    '''
    rbf, linear and poly kernel functions
    Input:
        X1: numpy.ndarray, size: [X1_num_sample, num_feature]
        X2: numpy.ndarray, size: [X2_num_sample, num_feature]
    Returns:
        k: numpy.ndarray, size: [X2_num_sample, X1_num_sample]
    '''
    def rbf(self, X1, X2):
        m = X1.shape[0]
        p = X2.shape[0]
        '''
        pdist computes the Euclidean distances faster than the for loop below, but ...
        k = pdist(X1, 'euclidean')
        k = squareform(k)
        k = np.exp( -k/(2*self.sigma**2) )
        '''
        k = np.ones([p, m])
        for i in range(p):
            sq_dist = np.sum(np.square(X1 - X2[i, :]), axis=1)   # squared Euclidean distances to every sample in X1
            # k[i, :] = np.exp( -sq_dist/(2*self.sigma**2) )     # this form matches the textbook expression
            k[i, :] = np.exp( -self.sigma*sq_dist )              # equivalent parameterization, sigma playing the role of gamma
        return k

    def linear(self, X1, X2):
        k = np.dot(X2, X1.T)
        return k

    def poly(self, X1, X2):
        k = np.dot(X2, X1.T)
        k = k**self.sigma
        return k

    def kernel_fun(self, X1, X2):
        if self.kernel == 'rbf':
            return self.rbf(X1, X2)
        elif self.kernel == 'poly':
            return self.poly(X1, X2)
        elif self.kernel == 'linear':
            return self.linear(X1, X2)
        else:
            raise ValueError('unknown kernel: ' + str(self.kernel))

    '''
    refact: choose n_components via the reconstruction threshold when n_components=0
    Input:
        A: numpy.ndarray, eigenvalues sorted in descending order, size: [num_sample]
    '''
    def refact(self, A):
        if self.n_components == 0:
            d = A.shape[0]
            Lambd = np.sum(A)
            for i in range(d):
                lambd = np.sum(A[0:i+1])
                tt = lambd/Lambd
                if tt >= self.t:
                    self.n_components = i + 1
                    return
            self.n_components = int(d/3)

    '''
    fit_transform: fit on the training data and return the projected data
    Input:
        X: numpy.ndarray, size: [num_sample, num_feature]
    Return:
        new_X: data after dimensionality reduction
               numpy.ndarray, size: [num_sample, n_components]
    '''
    def fit_transform(self, X):
        m = X.shape[0]
        self.X = X
        K = self.kernel_fun(X, X)   # compute the kernel matrix K
        self.K = K                  # keep K: it is needed to center the test kernel matrix and is expensive to recompute
        one_n = np.ones((m, m))/m
        K = K - one_n.dot(K) - K.dot(one_n) + one_n.dot(K).dot(one_n)   # center the kernel matrix
        A, U = np.linalg.eig(K)     # A holds the eigenvalues, the columns of U are the eigenvectors
        A = np.real(A)              # the centered kernel matrix is symmetric, so imaginary parts are numerical noise
        top_A_idx = A.argsort()[::-1]                # indices of the eigenvalues from largest to smallest
        self.refact(A[top_A_idx])                    # the reconstruction threshold needs the eigenvalues in descending order
        top_A_idx = top_A_idx[:self.n_components]    # indices of the d' largest eigenvalues
        top_A = A[top_A_idx]                         # the d' largest eigenvalues
        self.w = U[:, top_A_idx]                     # the corresponding eigenvectors
        self.w = self.w/np.sqrt(top_A)               # scale so each direction has unit norm in feature space
        self.w = np.real(self.w)
        new_X = np.dot(K, self.w)                    # project the centered kernel matrix
        return new_X

    '''
    transform: project new data with the fitted model
    Input:
        X_test: numpy.ndarray, size: [x_test_num_sample, num_feature]
    Return:
        x: data after dimensionality reduction
           numpy.ndarray, size: [x_test_num_sample, n_components]
    '''
    def transform(self, X_test):
        K = self.kernel_fun(self.X, X_test)
        N = self.X.shape[0]
        L = X_test.shape[0]
        one_NL = np.ones((N, L))/N
        one_N = np.ones((N, N))/N
        hat_K = K - K.dot(one_N) - np.dot(one_NL.T, self.K) + one_NL.T.dot(self.K).dot(one_N)   # center the test kernel matrix with the training-set statistics
        x = np.dot(hat_K, self.w)
        return x

    '''
    get_n_components
    Input: None
    Return: n_components
    '''
    def get_n_components(self):
        return self.n_components

Next come the tests.
# In[2]:

# Start with a simple check: the following dataset is used to verify myKPCA
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=100, random_state=123)
# scikit_kpca = KernelPCA(n_components=2, kernel='rbf', gamma=15)
scikit_kpca = myKPCA(n_components=2, kernel='rbf', sigma=0.15)
X_skernpca = scikit_kpca.fit_transform(X)
plt.scatter(X_skernpca[y == 0, 0], X_skernpca[y == 0, 1], color='red', marker='^', alpha=0.5)
plt.scatter(X_skernpca[y == 1, 0], X_skernpca[y == 1, 1], color='blue', marker='o', alpha=0.5)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.tight_layout()
plt.show()


# In[3]:

# HNR handwritten-digit dataset. sklearn's KernelPCA is fast, and myKPCA with 'linear' is also
# reasonably fast, but myKPCA with 'rbf' takes a very long time. The accuracies are about the same.
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import KernelPCA

# ============================== softmax from sklearn =============================
data = np.load('HNR/HNRdata.npz')
trainx = data['train_x']
trainy = data['train_y']
testx = data['test_x']
testy = data['test_y']

# select the classes "1", "2", "3" for training and testing
n_class = 3
tr_index = np.where(trainy[:, 0:n_class] == 1)[0]
te_index = np.where(testy[:, 0:n_class] == 1)[0]
train_x = trainx[tr_index]
train_y = trainy[tr_index]
test_x = testx[te_index]
test_y = testy[te_index]
train_y = train_y[:, 0:n_class]

np.random.seed(555)
kpca = myKPCA(n_components=50)
train_x = kpca.fit_transform(train_x)
test_x = kpca.transform(test_x)

# multi-class classification with softmax regression
def softmax_classify(X, Y, x, y):
    '''
    X: train_x, size: [num_samples, num_features]
    Y: train_y, size: [num_samples, 1]
    x: test_x,  size: [num_samples, num_features]
    y: test_y,  size: [num_samples, n_class] (one-hot)
    '''
    softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=0.1)   # create the softmax regression instance
    softmax_reg.fit(X, Y)
    predict = softmax_reg.predict(x)    # predict
    temp_y = np.argmax(y, axis=1)       # convert one-hot labels to categories for evaluation
    accuracy = np.mean(predict == temp_y)
    print('softmax accuracy:', accuracy)

train_y = np.argmax(train_y, axis=1)    # convert one-hot labels to categories for training
softmax_classify(train_x, train_y, test_x, test_y)
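A note on the rbf slowness mentioned above: the kernel matrix is built row by row in a Python loop, which is what makes the HNR experiment with kernel='rbf' so slow. Below is a possible vectorized replacement, just a sketch; rbf_vectorized is my own standalone helper (not part of the class), and its sigma plays the same gamma-like role as in the active line of myKPCA.rbf.

import numpy as np

def rbf_vectorized(X1, X2, sigma=1.0):
    '''
    Gaussian kernel without the Python loop: k[i, j] = exp(-sigma * ||X2[i] - X1[j]||^2).
    X1: [m, num_feature], X2: [p, num_feature]  ->  k: [p, m]
    '''
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, evaluated for all pairs via broadcasting
    sq1 = np.sum(X1**2, axis=1)                                      # shape [m]
    sq2 = np.sum(X2**2, axis=1)                                      # shape [p]
    sq_dist = sq2[:, None] + sq1[None, :] - 2.0 * np.dot(X2, X1.T)   # shape [p, m]
    np.maximum(sq_dist, 0, out=sq_dist)                              # clip tiny negative values from rounding
    return np.exp(-sigma * sq_dist)

Dropping this in as the body of self.rbf (with sigma=self.sigma) should speed up the rbf experiments considerably, though I have not benchmarked it here.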