The source code and data used in this book can be found at:
http://guidetodatamining.com/
https://www.zybuluo.com/hainingwyx/note/559139
Training phase:
Merge the documents labeled with the same hypothesis (class) into a single text file.
Count the total number of word occurrences n in that file and build the vocabulary.
For each word w_k in the vocabulary, count its occurrences in the text, denoted n_k.
For each word w_k in the vocabulary (stop words removed), compute P(w_k|h) = (n_k + 1) / (n + |Vocabulary|); a sketch of this computation follows below.
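A minimal sketch of that training computation, assuming `counts`, `total`, and `vocabulary` have been built as described above (the names are illustrative, not from the book's code):

```python
def word_probabilities(counts, total, vocabulary):
    """Laplace-smoothed P(w_k | h) = (n_k + 1) / (n + |Vocabulary|).

    counts:     occurrences n_k of each word in the merged class file
    total:      total word count n for the class
    vocabulary: all words kept after training
    """
    denominator = total + len(vocabulary)
    return {w: float(counts.get(w, 0) + 1) / denominator
            for w in vocabulary}
```

The add-one (Laplace) smoothing keeps a word that never co-occurred with a class from forcing the whole product to zero.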
Classification phase:
For a new document, compute P(h) · ∏ P(w_k|h) for each hypothesis h and pick the hypothesis with the largest value. If the probabilities become extremely small, Python cannot represent their product (floating-point underflow), so we sum logarithms instead; see the sketch after this paragraph. Stop words: when stop words are pure noise, removing them reduces the amount of processing and improves performance. In particular cases, though, stop words deserve a second look: sex offenders, for example, use words like "me" and "you" more often than the general population.
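A minimal sketch of the log trick with hypothetical numbers: multiplying many small per-word probabilities underflows to 0.0, while summing their logarithms stays representable.

```python
import math

probs = [1e-7] * 60        # 60 hypothetical word probabilities for one class
product = 1.0
log_sum = 0.0
for p in probs:
    product *= p           # underflows to 0.0 well before the loop ends
    log_sum += math.log(p) # stays representable

print(product)             # 0.0
print(log_sum)             # about -967.1; compare these sums across classes
```

Since log is monotonic, the class with the largest log sum is also the class with the largest product, so nothing is lost by the switch.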
```python
def classify(self, itemVector, numVector):
    """Return the class we think itemVector is in"""
    results = []
    sqrt2pi = math.sqrt(2 * math.pi)
    for (category, prior) in self.prior.items():
        prob = prior
        col = 1
        # categorical attributes: multiply in the conditional probabilities
        for attrValue in itemVector:
            if not attrValue in self.conditional[category][col]:
                # we did not find any instances of this attribute value
                # occurring with this category, so prob = 0
                prob = 0
            else:
                prob = prob * self.conditional[category][col][attrValue]
            col += 1
        col = 1
        # numeric attributes: multiply in the normal probability density
        for x in numVector:
            mean = self.means[category][col]
            ssd = self.ssd[category][col]
            ePart = math.pow(math.e, -(x - mean)**2 / (2 * ssd**2))
            prob = prob * ((1.0 / (sqrt2pi * ssd)) * ePart)
            col += 1
        results.append((prob, category))
    # return the category with the highest probability
    #print(results)
    return max(results)[1]

# test code
bT.classify(testDir + 'rec.motorcycles/104673')
```
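The numeric branch above multiplies in the normal probability density. A minimal standalone sketch of that piece, with illustrative names (not from the book's code):

```python
import math

def pdf(x, mean, ssd):
    """Probability density of x under a normal distribution with the
    given mean and sample standard deviation, matching the ePart
    computation in classify() above."""
    e_part = math.exp(-(x - mean)**2 / (2 * ssd**2))
    return (1.0 / (math.sqrt(2 * math.pi) * ssd)) * e_part
```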
10-fold cross-validation:

```python
from __future__ import print_function
import os, codecs, math

class BayesText:
    # input: training directory, stop word list, bucket number to ignore
    def __init__(self, trainingdir, stopwordlist, ignoreBucket):
        """This class implements a naive Bayes approach to text
        classification. trainingdir is the training data. Each subdirectory
        of trainingdir is titled with the name of the classification
        category -- those subdirectories in turn contain the text files
        for that category. The stopwordlist is a list of words (one per
        line) that will be removed before any counting takes place."""
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}
        f = open(stopwordlist)
        for line in f:
            self.stopwords[line.strip()] = 1
        f.close()
        categories = os.listdir(trainingdir)
        # filter out files that are not directories
        # (in this program: neg and pos)
        self.categories = [filename for filename in categories
                           if os.path.isdir(trainingdir + filename)]
        print("Counting ...")
        for category in self.categories:
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category,
                                                 ignoreBucket)
        # eliminate any word in the vocabulary
        # that doesn't occur at least 3 times
        toDelete = []
        for word in self.vocabulary:
            if self.vocabulary[word] < 3:
                # mark the word for deletion; we can't delete from a
                # dictionary we are currently iterating over
                toDelete.append(word)
        for word in toDelete:
            del self.vocabulary[word]
        # now compute probabilities
        vocabLength = len(self.vocabulary)
        for category in self.categories:
            denominator = self.totals[category] + vocabLength
            for word in self.vocabulary:
                if word in self.prob[category]:
                    count = self.prob[category][word]
                else:
                    count = 1
                self.prob[category][word] = (float(count + 1)
                                             / denominator)

    def train(self, trainingdir, category, bucketNumberToIgnore):
        """counts word occurrences for a particular category"""
        ignore = "%i" % bucketNumberToIgnore
        currentdir = trainingdir + category
        directories = os.listdir(currentdir)
        counts = {}
        total = 0
        for directory in directories:
            if directory != ignore:
                currentBucket = trainingdir + category + "/" + directory
                files = os.listdir(currentBucket)
                for file in files:
                    f = codecs.open(currentBucket + '/' + file,
                                    'r', 'iso8859-1')
                    for line in f:
                        tokens = line.split()
                        for token in tokens:
                            # strip punctuation and lowercase the token
                            token = token.strip('\'".,?:-')
                            token = token.lower()
                            if token != '' and not token in self.stopwords:
                                self.vocabulary.setdefault(token, 0)
                                self.vocabulary[token] += 1
                                counts.setdefault(token, 0)
                                counts[token] += 1
                                total += 1
                    f.close()
        return (counts, total)

    def classify(self, filename):
        results = {}
        for category in self.categories:
            results[category] = 0
        f = codecs.open(filename, 'r', 'iso8859-1')
        for line in f:
            tokens = line.split()
            for token in tokens:
                token = token.strip('\'".,?:-').lower()
                if token in self.vocabulary:
                    for category in self.categories:
                        if self.prob[category][token] == 0:
                            print("%s %s" % (category, token))
                        results[category] += math.log(
                            self.prob[category][token])
        f.close()
        results = list(results.items())
        results.sort(key=lambda tuple: tuple[1], reverse=True)
        # for debugging, change this to return the entire list
        return results[0][0]

    # input: test directory for one category, current category, test bucket
    # return: classification results for this category, e.g. {0: 12, 1: 23}
    def testCategory(self, direc, category, bucketNumber):
        results = {}
        directory = direc + ("%i/" % bucketNumber)
        files = os.listdir(directory)
        total = 0
        for file in files:
            total += 1
            result = self.classify(directory + file)
            results.setdefault(result, 0)
            results[result] += 1
        return results

    # input: test directory, number of the held-out bucket
    # return: classification results for every category,
    #         e.g. {1: {0: 12, 1: 23}, ...}
    def test(self, testdir, bucketNumber):
        """Test all files in the test directory--that directory is
        organized into subdirectories--each subdir is a classification
        category"""
        results = {}
        categories = os.listdir(testdir)
        # filter out files that are not directories
        categories = [filename for filename in categories
                      if os.path.isdir(testdir + filename)]
        for category in categories:
            results[category] = self.testCategory(
                testdir + category + '/', category, bucketNumber)
        return results

def tenfold(dataPrefix, stoplist):
    results = {}
    for i in range(0, 10):
        # train on nine buckets, test on the held-out bucket i
        bT = BayesText(dataPrefix, stoplist, i)
        r = bT.test(dataPrefix, i)
        for (key, value) in r.items():
            results.setdefault(key, {})
            for (ckey, cvalue) in value.items():
                results[key].setdefault(ckey, 0)
                results[key][ckey] += cvalue
    # print the accumulated confusion matrix
    categories = list(results.keys())
    categories.sort()
    print("\n       Classified as: ")
    header = "          "
    subheader = "        +"
    for category in categories:
        header += "% 2s   " % category
        subheader += "-----+"
    print(header)
    print(subheader)
    total = 0.0
    correct = 0.0
    for category in categories:
        row = " %s    |" % category
        for c2 in categories:
            if c2 in results[category]:
                count = results[category][c2]
            else:
                count = 0
            row += " %3i |" % count
            total += count
            if c2 == category:
                correct += count
        print(row)
    print(subheader)
    print("\n%5.3f percent correct" % ((correct * 100) / total))
    print("total of %i instances" % total)

# change these to match your directory structure
prefixPath = "data/review_polarity/"
theDir = prefixPath + "txt_sentoken/"
stoplistfile = prefixPath + "stopwords25.txt"
tenfold(theDir, stoplistfile)
```
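A usage note on the directory layout this script expects (inferred from the code above, not stated in the text): `txt_sentoken/` contains one subdirectory per category (`neg` and `pos`), and each category directory contains ten buckets named `0` through `9` holding the text files. `tenfold` then trains on nine buckets and tests on the held-out bucket for each of the ten folds, accumulating a single confusion matrix across folds. A quick check of that layout:

```python
import os

prefixPath = "data/review_polarity/"
theDir = prefixPath + "txt_sentoken/"

# expected layout: txt_sentoken/<category>/<bucket 0-9>/<text files>
for category in sorted(os.listdir(theDir)):
    if os.path.isdir(theDir + category):
        buckets = sorted(os.listdir(theDir + category))
        print(category, buckets)  # e.g. neg ['0', '1', ..., '9']
```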