最近需要做AUC的显著性检验,delong test是比较常见的AUC显著性检验的方法。我大体研究了一下。 原理: 1.以两个不同的模型对肿瘤良恶性进行分类,其中vgg的AUC为 A 1 A_1 A1,svm结果得到的AUC为 A 2 A_2 A2,delong test就是首先计算两者AUC差值 θ = A 1 − A 2 \theta = A_1-A_2 θ=A1−A2 2.然后根据计算出 A 1 A_1 A1和 A 2 A_2 A2的方差 v a r ( A 1 ) var(A_1) var(A1), v a r ( A 2 ) var(A_2) var(A2),以及两者的协方差 c o v ( A 1 , A 2 ) ) cov(A_1,A_2)) cov(A1,A2)),关于AUC方差和协方差的计算方法我理解不深就不解释了。 3.然后算出 z z z值 z = ( A 1 − A 2 ) v a r ( A 1 ) + v a r ( A 2 ) − 2 c o v ( A 1 , A 2 ) z = \frac{(A_1-A_2)} {var(A_1)+var(A_2)-2cov(A_1,A_2)} z=var(A1)+var(A2)−2cov(A1,A2)(A1−A2) 4。然后将Z值分布作为正太分布,做显著性检验,得到P值,如果p值<0.05说明两个AUC之间存在显著性差异。
在github上找到的代码链接 delong test. delong test相关参考文章 《Comparing the Areas Under Two or More Correlated Receiver Operating Characteristic Curves: A Nonparametric Approach》 《Fast Implementation of DeLong’s Algorithm for Comparing the Areas Under Correlated Receiver Operating Characteristic Curves》 github上的代码有一些问题,我简单做了一些修改,并用自己的数据分别用medclac和文中的代码做了实验,两者计算的AUC,以及CI,pvalue都是一样的。 python代码结果图
medcalc结果图
以下是代码块。
#!/usr/bin/env python3 # -*- coding: utf-8 -*- import pandas as pd import numpy as np import scipy.stats from scipy import stats # AUC comparison adapted from # https://github.com/Netflix/vmaf/ def compute_midrank(x): """Computes midranks. Args: x - a 1D numpy array Returns: array of midranks """ J = np.argsort(x) Z = x[J] N = len(x) T = np.zeros(N, dtype=np.float) i = 0 while i < N: j = i while j < N and Z[j] == Z[i]: j += 1 T[i:j] = 0.5*(i + j - 1) i = j T2 = np.empty(N, dtype=np.float) # Note(kazeevn) +1 is due to Python using 0-based indexing # instead of 1-based in the AUC formula in the paper T2[J] = T + 1 return T2 def compute_midrank_weight(x, sample_weight): """Computes midranks. Args: x - a 1D numpy array Returns: array of midranks """ J = np.argsort(x) Z = x[J] cumulative_weight = np.cumsum(sample_weight[J]) N = len(x) T = np.zeros(N, dtype=np.float) i = 0 while i < N: j = i while j < N and Z[j] == Z[i]: j += 1 T[i:j] = cumulative_weight[i:j].mean() i = j T2 = np.empty(N, dtype=np.float) T2[J] = T return T2 def fastDeLong(predictions_sorted_transposed, label_1_count, sample_weight=None): if sample_weight is None: return fastDeLong_no_weights(predictions_sorted_transposed, label_1_count) else: return fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight) def fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight): """ The fast version of DeLong's method for computing the covariance of unadjusted AUC. Args: predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples] sorted such as the examples with label "1" are first Returns: (AUC value, DeLong covariance) Reference: @article{sun2014fast, title={Fast Implementation of DeLong's Algorithm for Comparing the Areas Under Correlated Receiver Oerating Characteristic Curves}, author={Xu Sun and Weichao Xu}, journal={IEEE Signal Processing Letters}, volume={21}, number={11}, pages={1389--1393}, year={2014}, publisher={IEEE} } """ # Short variables are named as they are in the paper m = label_1_count n = predictions_sorted_transposed.shape[1] - m positive_examples = predictions_sorted_transposed[:, :m] negative_examples = predictions_sorted_transposed[:, m:] k = predictions_sorted_transposed.shape[0] tx = np.empty([k, m], dtype=np.float) ty = np.empty([k, n], dtype=np.float) tz = np.empty([k, m + n], dtype=np.float) for r in range(k): tx[r, :] = compute_midrank_weight(positive_examples[r, :], sample_weight[:m]) ty[r, :] = compute_midrank_weight(negative_examples[r, :], sample_weight[m:]) tz[r, :] = compute_midrank_weight(predictions_sorted_transposed[r, :], sample_weight) total_positive_weights = sample_weight[:m].sum() total_negative_weights = sample_weight[m:].sum() pair_weights = np.dot(sample_weight[:m, np.newaxis], sample_weight[np.newaxis, m:]) total_pair_weights = pair_weights.sum() aucs = (sample_weight[:m]*(tz[:, :m] - tx)).sum(axis=1) / total_pair_weights v01 = (tz[:, :m] - tx[:, :]) / total_negative_weights v10 = 1. - (tz[:, m:] - ty[:, :]) / total_positive_weights sx = np.cov(v01) sy = np.cov(v10) delongcov = sx / m + sy / n return aucs, delongcov def fastDeLong_no_weights(predictions_sorted_transposed, label_1_count): """ The fast version of DeLong's method for computing the covariance of unadjusted AUC. Args: predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples] sorted such as the examples with label "1" are first Returns: (AUC value, DeLong covariance) Reference: @article{sun2014fast, title={Fast Implementation of DeLong's Algorithm for Comparing the Areas Under Correlated Receiver Oerating Characteristic Curves}, author={Xu Sun and Weichao Xu}, journal={IEEE Signal Processing Letters}, volume={21}, number={11}, pages={1389--1393}, year={2014}, publisher={IEEE} } """ # Short variables are named as they are in the paper m = label_1_count n = predictions_sorted_transposed.shape[1] - m positive_examples = predictions_sorted_transposed[:, :m] negative_examples = predictions_sorted_transposed[:, m:] k = predictions_sorted_transposed.shape[0] tx = np.empty([k, m], dtype=np.float) ty = np.empty([k, n], dtype=np.float) tz = np.empty([k, m + n], dtype=np.float) for r in range(k): tx[r, :] = compute_midrank(positive_examples[r, :]) ty[r, :] = compute_midrank(negative_examples[r, :]) tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :]) aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n v01 = (tz[:, :m] - tx[:, :]) / n v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m sx = np.cov(v01) sy = np.cov(v10) delongcov = sx / m + sy / n return aucs, delongcov def calc_pvalue(aucs, sigma): """Computes log(10) of p-values. Args: aucs: 1D array of AUCs sigma: AUC DeLong covariances Returns: log10(pvalue) """ l = np.array([[1, -1]]) z = np.abs(np.diff(aucs)) / (np.sqrt(np.dot(np.dot(l, sigma), l.T)) + 1e-8) pvalue = 2 * (1 - scipy.stats.norm.cdf(np.abs(z))) # print(10**(np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10))) return pvalue def compute_ground_truth_statistics(ground_truth, sample_weight=None): assert np.array_equal(np.unique(ground_truth), [0, 1]) order = (-ground_truth).argsort() label_1_count = int(ground_truth.sum()) if sample_weight is None: ordered_sample_weight = None else: ordered_sample_weight = sample_weight[order] return order, label_1_count, ordered_sample_weight def delong_roc_variance(ground_truth, predictions): """ Computes ROC AUC variance for a single set of predictions Args: ground_truth: np.array of 0 and 1 predictions: np.array of floats of the probability of being class 1 """ sample_weight = None order, label_1_count, ordered_sample_weight = compute_ground_truth_statistics( ground_truth, sample_weight) predictions_sorted_transposed = predictions[np.newaxis, order] aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count) assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers" return aucs[0], delongcov def delong_roc_test(ground_truth, predictions_one, predictions_two): """ Computes log(p-value) for hypothesis that two ROC AUCs are different Args: ground_truth: np.array of 0 and 1 predictions_one: predictions of the first model, np.array of floats of the probability of being class 1 predictions_two: predictions of the second model, np.array of floats of the probability of being class 1 """ sample_weight = None order, label_1_count,ordered_sample_weight = compute_ground_truth_statistics(ground_truth) predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order] aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count,sample_weight) return calc_pvalue(aucs, delongcov) def delong_roc_ci(y_true,y_pred): aucs, auc_cov = delong_roc_variance(y_true, y_pred) auc_std = np.sqrt(auc_cov) lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2) ci = stats.norm.ppf( lower_upper_q, loc=aucs, scale=auc_std) ci[ci > 1] = 1 return aucs,ci #examples 具体用法 y_true= np.load('自己的数据路径') y_pred_1 = np.load('自己的数据路径') y_pred _2 = np.load('自己的数据路径') alpha = .95 def delong_roc_ci(y_true,y_pred): aucs, auc_cov = delong_roc_variance(y_true, y_pred) auc_std = np.sqrt(auc_cov) lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2) ci = stats.norm.ppf( lower_upper_q, loc=aucs, scale=auc_std) ci[ci > 1] = 1 return aucs,ci #pvalue pvalue = delong_roc_test(y_true,y_pred_1,y_pred_2) # aucs, auc_cov = delong_roc_variance(y_true, y_pred) auc_1, auc_cov_1 = delong_roc_variance(y_true, y_pred_1) auc_2, auc_cov_2 = delong_roc_variance(y_true, y_pred_2) auc_std = np.sqrt(auc_cov_1) lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2) # ci = stats.norm.ppf( lower_upper_q, loc=auc_1, scale=auc_std) ci[ci > 1] = 1 print('95% AUC CI:', ci) print('AUC:', auc_1) print('p_value:', pvalue)