代码:
def trainNB0(trainMatrix, trainCategory):
    """Train a two-class naive Bayes model from document word vectors.

    For each class, every word's count is divided by the total number of
    word occurrences seen in that class, giving the per-word conditional
    probabilities P(w_i | class).

    Args:
        trainMatrix: Sequence of equal-length numeric word vectors, one per
            training document.
        trainCategory: Sequence of labels aligned with ``trainMatrix``.
            A label equal to 1 marks the abusive class; any other value is
            counted as the non-abusive class.

    Returns:
        A tuple ``(p0Vect, p1Vect, pAbusive)``:
        ``p0Vect`` holds P(w_i | class 0) (non-abusive), ``p1Vect`` holds
        P(w_i | class 1) (abusive), and ``pAbusive`` is
        ``sum(trainCategory) / len(trainMatrix)``, i.e. the prior of the
        abusive class when labels are 0/1.

    Raises:
        IndexError: If ``trainMatrix`` is empty, or ``trainCategory`` is
            shorter than ``trainMatrix``.

    Note:
        No smoothing is applied, so a word never seen in a class gets
        probability 0 for that class. If a class has no word occurrences at
        all, its vector is all zeros rather than the NaN produced by 0/0.
    """
    numTrainDocs = len(trainMatrix)      # number of training documents
    numWords = len(trainMatrix[0])       # number of entries per word vector
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    # Per-word occurrence counts for each class (numerators).
    p0Num = np.zeros(numWords)
    p1Num = np.zeros(numWords)
    # Total word occurrences for each class (denominators).
    p0Denom = 0.0
    p1Denom = 0.0

    for i, docVector in enumerate(trainMatrix):
        if trainCategory[i] == 1:
            # Statistics for P(w0|1), P(w1|1), P(w2|1), ...
            p1Num += docVector
            p1Denom += np.sum(docVector)
        else:
            # Statistics for P(w0|0), P(w1|0), P(w2|0), ...
            p0Num += docVector
            p0Denom += np.sum(docVector)

    # Guard the division: a class with no observed words would otherwise
    # yield 0/0 = NaN for every entry.
    p1Vect = p1Num / p1Denom if p1Denom else np.zeros(numWords)
    p0Vect = p0Num / p0Denom if p0Denom else np.zeros(numWords)
    return p0Vect, p1Vect, pAbusive