python 小说词频统计，jieba库实例

mac2026-02-16 16

以《我的微信连三界》这本小说为例，进行字符统计

对中文、英文、标点符号、数字等分别统计，统计结果暂时储存在字典 countchr 中；用 jieba 库的分词功能将文本中所有可能的词（和字符）分离出来，统计每个词（和字符）出现的频率，降序排列并保存：“词频”保存在 “jieba lcut.txt” 中，“字符频率”保存在 “jieba lcut1.txt” 中；最后将高频字词和 countchr 一起保存在 “countchar.txt” 中。

运行结果如下

这样看着不是很舒服，用 Excel 处理了下。因为小说里有很多微信聊天的情景，平均每个段落才21字，比较短；主角名字占了20万字的篇幅，有点小惊讶；“的”字出现频率最高，这个在预料之中。

在网上随机找了十几本小说试了下, 有如下规律

标点符号约占五分之一；主角名出现次数一般是最多的；“的”、“了”、“是”、“他”这四个字使用频率都很高；平均每章2000~3000字，按掌阅一章1毛多一点计算，平摊下来5分/千字。假设作者每分钟可以码60字，每天稳定更两章，每天需要码字2小时。

完整代码如下

import ast
import asyncio  # kept: only referenced by the abandoned async experiment noted in wdwxlsj
import os
import time

import jieba

# Encoding used for every file the script opens.  None keeps the platform's
# locale default (the historical behaviour); set it to 'utf-8' or 'gbk' to
# match the novel file explicitly.
FILE_ENCODING = None

# Punctuation tables.  Individual punctuation marks are not stored in the
# frequency dictionaries; only their totals are counted.
pcd = {
    3: '·、【】！￥—～……（）；‘’：“”《》，。？、',  # Chinese punctuation
    4: " `~!@#$%^&*()_+-={}|:%<>?[]\\;',./×",  # English punctuation
}

# Global counters, filled in by wdwxlsj().
hs0 = {
    1: 0,  # Chinese characters
    2: 0,  # English letters
    3: 0,  # Chinese punctuation marks
    4: 0,  # English punctuation marks
    5: 0,  # digits
    6: 0,  # number of lines
    7: 0,  # ratio of Chinese characters to all characters
}

# Modelled on Word's character statistics.
path = input('please input the path of your file: ')
path_is_file = os.path.isfile(path)
print(path_is_file)
if not path_is_file:
    # Fall back to the default novel and its directory.
    path = r'C:\Users\QQ\Desktop\ls\py\我的微信连三界狼烟新书\我的微信连三界狼烟新书.txt'
    rootpath = r'C:\Users\QQ\Desktop\ls\py\我的微信连三界狼烟新书'
else:
    rootpath = os.path.dirname(path)
print(rootpath)


def wdwxlsj():
    """Collect character and word statistics for the novel at ``path``.

    Reads the module-level ``path`` / ``rootpath`` and updates the
    module-level ``hs0`` counters.  Writes three kinds of output next to the
    novel: the word frequencies ("jieba lcut.txt"), the single-character
    frequencies ("jieba lcut1.txt") and a summary ("countchar.txt", plus the
    two top-100 files produced by ``hfwords``).  Finally prints how often a
    fixed list of words occurs in the text.
    """
    # Specific words whose raw occurrence count is reported at the end.
    sl = ['林海', '凡间', '地仙', '地府', '天仙', '散仙', '金仙', '天劫',
          '馨月', '林儿', '脸皮', '不好意思', '齐天大圣', '微信', '手机',
          '太上老君', ]
    hss1 = {}  # occurrences of each word of `sl` in the file
    hsw = {}   # occurrences of each single-character token
    hsc = {}   # occurrences of each multi-character token (word)

    # os.path.join keeps the outputs beside the novel even when the
    # directory part is empty, and works on non-Windows systems too.
    path2 = os.path.join(rootpath, 'jieba lcut.txt')
    path3 = os.path.join(rootpath, 'jieba lcut1.txt')
    print(path2)

    with open(path, 'r', encoding=FILE_ENCODING) as f:
        string = f.read()
        f.seek(0)
        lines = f.readlines()

    for word in sl:
        hss1[word] = string.count(word)
    print(len(lines))

    for line in lines:
        # Letters, digits and Chinese characters are counted per character;
        # an earlier token-based attempt missed consecutive letters and
        # numbers above 9.
        for ch in line:
            if '\u4e00' <= ch <= '\u9fa5':
                hs0[1] += 1
            elif 'A' <= ch <= 'Z' or 'a' <= ch <= 'z':
                hs0[2] += 1
            elif '0' <= ch <= '9':
                hs0[5] += 1

        # Punctuation, line breaks and word/character frequencies are
        # counted per jieba token.
        for token in jieba.lcut(line):
            if token in pcd[3]:
                hs0[3] += 1
            elif token in pcd[4]:
                hs0[4] += 1
            elif token == '\n':
                hs0[6] += 1
            elif len(token) > 1:
                hsc[token] = hsc.get(token, 0) + 1
            else:
                hsw[token] = hsw.get(token, 0) + 1
    # NOTE: running the loop above through asyncio was tried and did not
    # shorten the run time (presumably because the work is CPU-bound).

    # Guard against an empty file.
    hs0[7] = hs0[1] / len(string) if string else 0.0

    hsc = sort(hsc)  # descending by count
    hsw = sort(hsw)
    # The files must be closed before countchar() reads them back.
    with open(path2, 'w', encoding=FILE_ENCODING) as fs:
        fs.write(str(hsc))
    with open(path3, 'w', encoding=FILE_ENCODING) as fw:
        fw.write(str(hsw))

    def countchar():
        """Write the summary file "countchar.txt" and the top-100 lists."""
        countchr = {
            '中文': hs0[1],
            'english letter': hs0[2],
            '中文标点符号': hs0[3],
            'english punctuation marks': hs0[4],
            '数字': hs0[5],
            '行数': hs0[6],
            '中文字数占总字符数的比例': hs0[7],
            '总字符数': len(string),
            # A text without any line break is treated as one paragraph.
            '平均每个段落字数': hs0[1] // hs0[6] if hs0[6] else hs0[1],
        }
        summary_path = os.path.join(rootpath, 'countchar.txt')
        with open(summary_path, 'w', encoding=FILE_ENCODING) as f:
            for label, value in countchr.items():
                print(type(value), value)
                f.write('{}: {}\n'.format(label, unit(value)))
            for name in ('jieba lcut.txt', 'jieba lcut1.txt'):
                filename, content = hfwords(os.path.join(rootpath, name), 100)
                print(filename, content)
                f.write('\n' + filename + '\n' + content)

    countchar()
    book = os.path.splitext(os.path.basename(path))[0]
    print('以下词在小说《{}》中出现次数:'.format(book))
    for word, count in hss1.items():
        print('{:<5}: {}次'.format(word, count))


def sort(hs):
    """Return a new dict ordered by count, highest first.

    Entries with equal counts are ordered by key, descending, which matches
    sorting ``(count, key)`` tuples in reverse.
    """
    ordered = sorted(hs.items(), key=lambda item: (item[1], item[0]),
                     reverse=True)
    return dict(ordered)


def hfwords(file, num, spacing=5):
    """Extract the ``num`` most frequent entries of a frequency file.

    ``file`` must contain the ``str()`` of a dict that is already ordered by
    descending count (as written by ``wdwxlsj``).  The entries are formatted
    ``spacing`` per line, written to a text file next to ``file`` and
    returned.

    Returns:
        ``(filename, content)`` - the title used for the output file (without
        extension) and the formatted text.  If the table holds fewer than
        ``num`` entries, all of them are returned.
    """
    with open(file, 'r', encoding=FILE_ENCODING) as f:
        # literal_eval only accepts Python literals, unlike eval().
        wf = ast.literal_eval(f.read())  # words frequency

    # The first key tells whether this is the word table or the character
    # table; an empty table is labelled as characters.
    first_key = next(iter(wf), '')
    folder = os.path.dirname(file)
    book = os.path.basename(folder)
    if len(first_key) > 1:
        filename = '《{}》中出现频率最高的{}个词'.format(book, num)
    else:
        filename = '《{}》中出现频率最高的{}个字'.format(book, num)

    parts = []
    for count, (word, freq) in enumerate(wf.items(), start=1):
        parts.append('{}:{}次'.format(word, unit(freq)))
        parts.append('\n' if count % spacing == 0 else ' ')
        if count >= num:
            break
    content = ''.join(parts)

    out_path = os.path.join(folder, '{}.txt'.format(filename))
    with open(out_path, 'w', encoding=FILE_ENCODING) as f:
        f.write(content)
    return filename, content


def unit(num):
    """Format ``num`` with a suitable unit.

    Float ratios below 1 become percentages, values below 500 are returned
    unchanged, larger values are expressed in thousands (千) or ten-thousands
    (万).  Integer zero stays ``0`` instead of being shown as a percentage.
    """
    if isinstance(num, float) and num < 1:
        return '{:.2%}'.format(num)
    if num < 500:
        return num
    if num < 5000:
        return '{:.2f}千'.format(num / 1000)
    return '{:.2f}万'.format(num / 10000)


def main():
    """Run the statistics and report the elapsed time."""
    start = time.perf_counter()
    wdwxlsj()
    d = time.perf_counter() - start
    print('runtime : {} minutes {} seconds'.format(d // 60, d % 60))


if __name__ == '__main__':
    main()

可以改进的地方:

增加句型统计模块，检索句子结构、段落结构相似度以及句型重复率；计算平均每章节的字数，提取章节名和章节编号等；构建章节名格式管理模块，对每个章节名进行格式的标准化；统计小说主要人物名在各章节的分布情况，找出人物出场频率较高的章节，由此对章节进行分篇或分集处理；将统计结果图表化。

最新回复(0)