The source code and data used in this book can be found at:
http://guidetodatamining.com/
https://www.zybuluo.com/hainingwyx/note/559139
Training phase:
Merge the documents labeled with the same hypothesis (class) into a single text file.
Count the total number of word occurrences n in that file and build the vocabulary.
For each word w_k in the vocabulary, count its occurrences in the text, denoted n_k.
For each word w_k in the vocabulary (stop words removed), compute P(w_k|h) = (n_k + 1) / (n + |Vocabulary|); a sketch of this computation follows below.
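A minimal sketch of that training computation, assuming `counts`, `total`, and `vocabulary` have been built as described above (the names are illustrative, not from the book's code):

```python
def word_probabilities(counts, total, vocabulary):
    """Laplace-smoothed P(w_k | h) = (n_k + 1) / (n + |Vocabulary|).

    counts:     occurrences n_k of each word in the merged class file
    total:      total word count n for the class
    vocabulary: all words kept after training
    """
    denominator = total + len(vocabulary)
    return {w: float(counts.get(w, 0) + 1) / denominator
            for w in vocabulary}
```

The add-one (Laplace) smoothing keeps a word that never co-occurred with a class from forcing the whole product to zero.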
Classification phase:
For a new document, compute P(h) · ∏ P(w_k|h) for each hypothesis h and pick the hypothesis with the largest value. If the probabilities become extremely small, Python cannot represent their product (floating-point underflow), so we sum logarithms instead; see the sketch after this paragraph. Stop words: when stop words are pure noise, removing them reduces the amount of processing and improves performance. In particular cases, though, stop words deserve a second look: sex offenders, for example, use words like "me" and "you" more often than the general population.
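A minimal sketch of the log trick with hypothetical numbers: multiplying many small per-word probabilities underflows to 0.0, while summing their logarithms stays representable.

```python
import math

probs = [1e-7] * 60        # 60 hypothetical word probabilities for one class
product = 1.0
log_sum = 0.0
for p in probs:
    product *= p           # underflows to 0.0 well before the loop ends
    log_sum += math.log(p) # stays representable

print(product)             # 0.0
print(log_sum)             # about -967.1; compare these sums across classes
```

Since log is monotonic, the class with the largest log sum is also the class with the largest product, so nothing is lost by the switch.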
```python
def classify(self, itemVector, numVector):
    """Return the class we think itemVector is in"""
    results = []
    sqrt2pi = math.sqrt(2 * math.pi)
    for (category, prior) in self.prior.items():
        prob = prior
        col = 1
        # categorical attributes: multiply in the conditional probabilities
        for attrValue in itemVector:
            if not attrValue in self.conditional[category][col]:
                # we did not find any instances of this attribute value
                # occurring with this category, so prob = 0
                prob = 0
            else:
                prob = prob * self.conditional[category][col][attrValue]
            col += 1
        col = 1
        # numeric attributes: multiply in the normal probability density
        for x in numVector:
            mean = self.means[category][col]
            ssd = self.ssd[category][col]
            ePart = math.pow(math.e, -(x - mean)**2 / (2 * ssd**2))
            prob = prob * ((1.0 / (sqrt2pi * ssd)) * ePart)
            col += 1
        results.append((prob, category))
    # return the category with the highest probability
    #print(results)
    return max(results)[1]

# test code
bT.classify(testDir + 'rec.motorcycles/104673')
```
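The numeric branch above multiplies in the normal probability density. A minimal standalone sketch of that piece, with illustrative names (not from the book's code):

```python
import math

def pdf(x, mean, ssd):
    """Probability density of x under a normal distribution with the
    given mean and sample standard deviation, matching the ePart
    computation in classify() above."""
    e_part = math.exp(-(x - mean)**2 / (2 * ssd**2))
    return (1.0 / (math.sqrt(2 * math.pi) * ssd)) * e_part
```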
10-fold cross-validation:

```python
from __future__ import print_function
import os, codecs, math

class BayesText:
    # input: training directory, stop word list, bucket number to ignore
    def __init__(self, trainingdir, stopwordlist, ignoreBucket):
        """This class implements a naive Bayes approach to text
        classification. trainingdir is the training data. Each subdirectory
        of trainingdir is titled with the name of the classification
        category -- those subdirectories in turn contain the text files
        for that category. The stopwordlist is a list of words (one per
        line) that will be removed before any counting takes place."""
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}
        f = open(stopwordlist)
        for line in f:
            self.stopwords[line.strip()] = 1
        f.close()
        categories = os.listdir(trainingdir)
        # filter out files that are not directories
        # (in this program: neg and pos)
        self.categories = [filename for filename in categories
                           if os.path.isdir(trainingdir + filename)]
        print("Counting ...")
        for category in self.categories:
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category,
                                                 ignoreBucket)
        # eliminate any word in the vocabulary
        # that doesn't occur at least 3 times
        toDelete = []
        for word in self.vocabulary:
            if self.vocabulary[word] < 3:
                # mark the word for deletion; we can't delete from a
                # dictionary we are currently iterating over
                toDelete.append(word)
        for word in toDelete:
            del self.vocabulary[word]
        # now compute probabilities
        vocabLength = len(self.vocabulary)
        for category in self.categories:
            denominator = self.totals[category] + vocabLength
            for word in self.vocabulary:
                if word in self.prob[category]:
                    count = self.prob[category][word]
                else:
                    count = 1
                self.prob[category][word] = (float(count + 1)
                                             / denominator)

    def train(self, trainingdir, category, bucketNumberToIgnore):
        """counts word occurrences for a particular category"""
        ignore = "%i" % bucketNumberToIgnore
        currentdir = trainingdir + category
        directories = os.listdir(currentdir)
        counts = {}
        total = 0
        for directory in directories:
            if directory != ignore:
                currentBucket = trainingdir + category + "/" + directory
                files = os.listdir(currentBucket)
                for file in files:
                    f = codecs.open(currentBucket + '/' + file,
                                    'r', 'iso8859-1')
                    for line in f:
                        tokens = line.split()
                        for token in tokens:
                            # strip punctuation and lowercase the token
                            token = token.strip('\'".,?:-')
                            token = token.lower()
                            if token != '' and not token in self.stopwords:
                                self.vocabulary.setdefault(token, 0)
                                self.vocabulary[token] += 1
                                counts.setdefault(token, 0)
                                counts[token] += 1
                                total += 1
                    f.close()
        return (counts, total)

    def classify(self, filename):
        results = {}
        for category in self.categories:
            results[category] = 0
        f = codecs.open(filename, 'r', 'iso8859-1')
        for line in f:
            tokens = line.split()
            for token in tokens:
                token = token.strip('\'".,?:-').lower()
                if token in self.vocabulary:
                    for category in self.categories:
                        if self.prob[category][token] == 0:
                            print("%s %s" % (category, token))
                        results[category] += math.log(
                            self.prob[category][token])
        f.close()
        results = list(results.items())
        results.sort(key=lambda tuple: tuple[1], reverse=True)
        # for debugging, change this to return the entire list
        return results[0][0]

    # input: test directory for one category, current category, test bucket
    # return: classification results for this category, e.g. {0: 12, 1: 23}
    def testCategory(self, direc, category, bucketNumber):
        results = {}
        directory = direc + ("%i/" % bucketNumber)
        files = os.listdir(directory)
        total = 0
        for file in files:
            total += 1
            result = self.classify(directory + file)
            results.setdefault(result, 0)
            results[result] += 1
        return results

    # input: test directory, number of the held-out bucket
    # return: classification results for every category,
    #         e.g. {1: {0: 12, 1: 23}, ...}
    def test(self, testdir, bucketNumber):
        """Test all files in the test directory--that directory is
        organized into subdirectories--each subdir is a classification
        category"""
        results = {}
        categories = os.listdir(testdir)
        # filter out files that are not directories
        categories = [filename for filename in categories
                      if os.path.isdir(testdir + filename)]
        for category in categories:
            results[category] = self.testCategory(
                testdir + category + '/', category, bucketNumber)
        return results

def tenfold(dataPrefix, stoplist):
    results = {}
    for i in range(0, 10):
        # train on nine buckets, test on the held-out bucket i
        bT = BayesText(dataPrefix, stoplist, i)
        r = bT.test(dataPrefix, i)
        for (key, value) in r.items():
            results.setdefault(key, {})
            for (ckey, cvalue) in value.items():
                results[key].setdefault(ckey, 0)
                results[key][ckey] += cvalue
    # print the accumulated confusion matrix
    categories = list(results.keys())
    categories.sort()
    print("\n       Classified as: ")
    header = "          "
    subheader = "        +"
    for category in categories:
        header += "% 2s   " % category
        subheader += "-----+"
    print(header)
    print(subheader)
    total = 0.0
    correct = 0.0
    for category in categories:
        row = " %s    |" % category
        for c2 in categories:
            if c2 in results[category]:
                count = results[category][c2]
            else:
                count = 0
            row += " %3i |" % count
            total += count
            if c2 == category:
                correct += count
        print(row)
    print(subheader)
    print("\n%5.3f percent correct" % ((correct * 100) / total))
    print("total of %i instances" % total)

# change these to match your directory structure
prefixPath = "data/review_polarity/"
theDir = prefixPath + "txt_sentoken/"
stoplistfile = prefixPath + "stopwords25.txt"
tenfold(theDir, stoplistfile)
```
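A usage note on the directory layout this script expects (inferred from the code above, not stated in the text): `txt_sentoken/` contains one subdirectory per category (`neg` and `pos`), and each category directory contains ten buckets named `0` through `9` holding the text files. `tenfold` then trains on nine buckets and tests on the held-out bucket for each of the ten folds, accumulating a single confusion matrix across folds. A quick check of that layout:

```python
import os

prefixPath = "data/review_polarity/"
theDir = prefixPath + "txt_sentoken/"

# expected layout: txt_sentoken/<category>/<bucket 0-9>/<text files>
for category in sorted(os.listdir(theDir)):
    if os.path.isdir(theDir + category):
        buckets = sorted(os.listdir(theDir + category))
        print(category, buckets)  # e.g. neg ['0', '1', ..., '9']
```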