以《我的微信连三界》这本小说为例,进行字符统计
对中文、英文、标点符号、数字等分别统计, 统计结果暂时储存在字典 countchr 中。用 jieba 库的分词功能将文本中所有可能的词(和字符)分离出来, 统计每个词(和字符)出现的频率, 降序排列并保存: “词频”保存在 “jieba lcut.txt” 中, “字符频率”保存在 “jieba lcut1.txt” 中。最后将高频字词和 countchr 一起保存在 “countchar.txt” 中。
运行结果如下
这样看着不是很舒服, 用 Excel 处理了下。因为小说里有很多微信聊天的情景, 平均每个段落才21字, 比较短。主角名字占了20万字的篇幅, 有点小惊讶; “的”字出现频率最高, 这个在预料之中。
在网上随机找了十几本小说试了下, 有如下规律
标点符号约占五分之一; 主角名出现次数一般是最多的; “的”、“了”、“是”、“他”这四个字使用频率都很高; 平均每章2000~3000字, 按掌阅一章1毛多一点计算, 平摊下来5分/千字。假设作者每分钟可以码60字, 每天稳定更两章, 每天需要码字2小时。
完整代码如下
import os
import time
import jieba
import asyncio
# Punctuation lookup tables. Each key is the ``hs0`` counter under which the
# characters are tallied (3: Chinese punctuation, 4: English punctuation).
# NOTE(review): entry 3 contains several ASCII marks (! ( ) ; : , ?) that are
# also listed in entry 4 and are therefore always counted as Chinese
# punctuation -- possibly full-width originals lost in copy/paste; confirm.
pcd = {
    3: '·、【】!¥—~……();‘’:“”《》,。?、',
    # The backslash is escaped explicitly: a bare "\;" is an invalid escape
    # sequence that Python warns about. The runtime string is unchanged.
    4: ''' `~!@#$%^&*()_+-={}|:%<>?[]\\;',./×''',
}

# Running totals filled in by wdwxlsj().
hs0 = {
    1: 0,  # Chinese characters
    2: 0,  # English letters
    3: 0,  # Chinese punctuation tokens
    4: 0,  # English punctuation tokens
    5: 0,  # digits
    6: 0,  # newline tokens (reported as the number of lines)
    7: 0,  # share of Chinese characters among all characters
}

path = input('please input the path of your file: ')
file_exists = os.path.isfile(path)
print(file_exists)
if not file_exists:
    # Fall back to the hard-coded sample novel when the input is not a file.
    path = r'C:\Users\QQ\Desktop\ls\py\我的微信连三界 狼烟新书\我的微信连三界 狼烟新书.txt'
    rootpath = r'C:\Users\QQ\Desktop\ls\py\我的微信连三界 狼烟新书'
else:
    # Resolve to an absolute path first: for a bare file name dirname() would
    # be '' and every output path built as rootpath + '\\name' would then
    # point at the root of the current drive instead of the novel's folder.
    path = os.path.abspath(path)
    rootpath = os.path.dirname(path)
print(rootpath)
def wdwxlsj():
    """Collect character and word statistics for the novel at ``path``.

    Reads the module-level ``path``, updates the module-level ``hs0``
    counters, and writes three reports into ``rootpath``:

    * ``jieba lcut.txt``  -- frequencies of multi-character tokens
    * ``jieba lcut1.txt`` -- frequencies of single-character tokens
    * ``countchar.txt``   -- the summary counters plus the top-100 lists
      produced by ``hfwords``

    Finally prints how often each word of a fixed watch list occurs.
    """
    watch_list = ['林海', '凡间', '地仙', '地府', '天仙', '散仙', '金仙', '天劫', '馨月', '林儿',
                  '脸皮', '不好意思', '齐天大圣', '微信', '手机', '太上老君', ]
    # Both frequency files live next to the novel; rootpath is that folder.
    path2 = rootpath + '\\{}.txt'.format('jieba lcut')
    path3 = rootpath + r'\{}.txt'.format('jieba lcut1')
    print(path2)

    with open(path, 'r') as f:
        string = f.read()
        f.seek(0)
        lines = f.readlines()
    print(len(lines))

    hss1 = {word: string.count(word) for word in watch_list}
    hsc = {}  # multi-character tokens -> occurrences
    hsw = {}  # single-character tokens -> occurrences

    # NOTE: the source recorded that an asyncio variant of this loop was
    # tried and gave no speed-up.
    for line in lines:
        for ch in line:
            code = ord(ch)
            if 19968 <= code <= 40869:
                hs0[1] += 1
            # Upper bound is inclusive so that 'Z' (90) is counted as well.
            elif 65 <= code <= 90 or 97 <= code <= 122:
                hs0[2] += 1
            elif 48 <= code <= 57:
                hs0[5] += 1
        for token in jieba.lcut(line):
            if token in pcd[3]:
                hs0[3] += 1
            elif token in pcd[4]:
                hs0[4] += 1
            elif token == '\n':
                hs0[6] += 1
            elif len(token) > 1:
                hsc[token] = hsc.get(token, 0) + 1
            else:
                hsw[token] = hsw.get(token, 0) + 1

    # An empty file has no characters at all; report a ratio of 0.0 instead
    # of dividing by zero.
    hs0[7] = hs0[1] / len(string) if string else 0.0

    # The frequency files are closed before countchar() runs because
    # hfwords() reads them back from disk.
    with open(path2, 'w') as fs, open(path3, 'w') as fw:
        fs.write(str(sort(hsc)))
        fw.write(str(sort(hsw)))

    def countchar():
        """Write the summary counters and both top-100 lists to countchar.txt."""
        with open(rootpath + r'\countchar.txt', 'w') as f:
            countchr = {
                '中文': hs0[1],
                'english letter': hs0[2],
                '中文标点符号': hs0[3],
                'english punctuation marks': hs0[4],
                '数字': hs0[5],
                '行数': hs0[6],
                '中文字数占总字符数的比例': hs0[7],
                '总字符数': len(string),
                # A text without any newline token is treated as a single
                # paragraph rather than dividing by zero.
                '平均每个段落字数': hs0[1] // max(hs0[6], 1),
            }
            for label, value in countchr.items():
                print(type(value), value)
                f.write('{}: {}\n'.format(label, unit(value)))
            for suffix in [r'\jieba lcut.txt', r'\jieba lcut1.txt']:
                filename, content = hfwords(rootpath + suffix, 100)
                print(filename, content)
                f.write('\n' + filename + '\n' + content)

    countchar()

    # splitext drops the extension whatever its length.
    title = os.path.splitext(os.path.split(path)[1])[0]
    print('以下词在小说《{}》中出现次数:'.format(title))
    for word, occurrences in hss1.items():
        print('{:<5}: {}次'.format(word, occurrences))
def sort(hs):
    """Return a new dict holding the items of *hs* in descending count order.

    Items are ranked by value first and, for equal values, by key; both in
    descending order. The input mapping is left untouched.
    """
    ranked = sorted(
        hs.items(),
        key=lambda item: (item[1], item[0]),
        reverse=True,
    )
    return {key: value for key, value in ranked}
def hfwords(file, num, spacing=5):
    """Build the "top *num*" report for a frequency file.

    *file* must contain the ``str()`` form of a dict mapping token -> count,
    already ordered by descending count. The first *num* entries are
    formatted as ``token:count次``, *spacing* entries per line, written to a
    text file next to *file* (named after the report title) and returned.

    Args:
        file: Path of the frequency file to read.
        num: Maximum number of entries to report.
        spacing: Number of entries per output line.

    Returns:
        A ``(title, content)`` tuple. When the file holds fewer than *num*
        entries, all of them are reported; an empty dict yields empty content.
    """
    import ast
    from itertools import islice

    with open(file, 'r') as f:
        # literal_eval only accepts Python literals, so a tampered frequency
        # file cannot execute arbitrary code the way eval() would.
        wf = ast.literal_eval(f.read())

    # The first key decides whether this is the word list or the
    # single-character list; an empty dict falls into the character branch.
    first_key = next(iter(wf), '')
    folder = os.path.dirname(file)
    book = os.path.basename(folder)
    if len(first_key) > 1:
        filename = '《{}》中出现频率最高的{}个词'.format(book, num)
    else:
        filename = '《{}》中出现频率最高的{}个字'.format(book, num)

    pieces = []
    top_entries = islice(wf.items(), max(num, 0))
    for position, (token, count) in enumerate(top_entries, start=1):
        pieces.append('{}:{}次'.format(token, unit(count)))
        # Break the line after every `spacing` entries.
        pieces.append('\n' if position % spacing == 0 else ' ')
    content = ''.join(pieces)

    # Always write and return, even when fewer than `num` entries exist.
    with open(folder + '\\{}.txt'.format(filename), 'w') as f:
        f.write(content)
    return filename, content
def unit(num):
    """Format a statistic for display.

    * non-integer values below 1 are ratios and become a percentage
      (``0.25`` -> ``'25.00%'``)
    * values below 500 are returned unchanged
    * values below 5000 are expressed in thousands (``'x.xx千'``)
    * anything larger is expressed in ten-thousands (``'x.xx万'``)

    Integer counts are never treated as ratios, so a count of ``0`` is
    reported as ``0`` rather than as ``'0.00%'``.
    """
    if num < 1 and not isinstance(num, int):
        return '{:.2%}'.format(num)
    if num < 500:
        return num
    if num < 5000:
        return '{:.2f}千'.format(num / 1000)
    return '{:.2f}万'.format(num / 10000)
def main():
    """Run the analysis once and print how long it took."""
    began = time.perf_counter()
    try:
        wdwxlsj()
    except UnboundLocalError as exc:
        # Report the problem but still print the elapsed time below.
        print(exc)
    elapsed = time.perf_counter() - began
    minutes, seconds = divmod(elapsed, 60)
    print('runtime : {} minutes {} seconds'.format(minutes, seconds))


main()
可以改进的地方:
增加句型统计模块, 检索句子结构、段落结构相似度以及句型重复率; 计算平均每章节的字数, 提取章节名和章节编号等, 构建章节名格式管理模块, 对每个章节名进行格式的标准化; 统计小说主要人物名在各章节的分布情况, 找出人物出场频率较高的章节, 由此对章节进行分篇或分集处理; 将统计结果图表化。