当前位置 : 主页 > 编程语言 > python >

Python 读取 TXT 文档进行词频统计

来源:互联网 收集:自由互联 发布时间:2022-07-17
本文示例读取 hamlet.txt,将文本转为小写并把标点符号替换为空格,去除停用词后统计词频,打印出现次数最多的 10 个单词,并将完整的统计结果写入文本文件"词频统计结果.txt"。


去除停用词
# Stop-word list: very common English words (plus the stray "s" left behind
# when an apostrophe is stripped) that are removed before ranking.
excludes = [
    'the', 'and', 'to', 'of', 'i',
    'a', 'in', 'it', 'that', 'is',
    'you', 'my', 'with', 'not', 'his',
    'this', 'but', 'for', 'me', 's',
    'he', 'be', 'as', 'so', 'him',
    'your',
]
def getText():
    """Return the text of ``hamlet.txt`` lower-cased with punctuation blanked.

    The file is read from the current working directory using the platform
    default encoding. Every punctuation character listed below is replaced
    by a single space, so a later ``str.split()`` yields bare words.

    Returns:
        str: the normalised file contents.

    Raises:
        OSError: if ``hamlet.txt`` cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises.
    with open('hamlet.txt', 'r') as source:
        txt = source.read().lower()

    # The earlier literal ended in `\'"""`, which Python parses as `...'"`
    # followed by an empty string, so the double quote was never part of the
    # set. It is listed explicitly here, together with '!', so that tokens
    # such as `lord!` and `"to` are not tallied as separate words.
    punctuation = '!~@#$%^&*()_-+=<>?/,.:;{}[]|\'"'

    # One translate() pass replaces every listed character with a space.
    blank_out = str.maketrans(punctuation, ' ' * len(punctuation))
    return txt.translate(blank_out)

# Split the normalised text on whitespace and tally every token.
hamletTxt = getText()
words = hamletTxt.split()

# Total number of tokens, stop words included.
sumcount = len(words)

# Map each distinct word to its number of occurrences, in first-seen order.
counts = {}
for word in words:
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1

# Build a filtered copy of the tallies without the stop words; `counts`
# itself is left untouched.
stop_words = set(excludes)  # set membership is O(1) per lookup
counts_ex = {
    key: value for key, value in counts.items() if key not in stop_words
}

# Rank the remaining words by frequency, most frequent first. sorted() is
# stable, so words with equal counts keep their first-seen order.
items = sorted(counts_ex.items(), key=lambda x: x[1], reverse=True)

# Print the ten most frequent words. Slicing (rather than indexing 0..9)
# avoids an IndexError when fewer than ten distinct words remain.
for word, count in items[:10]:
    print('{0:<10}{1:>5}'.format(word, count))

# Write the statistics to a text file: a short header followed by one
# "word<TAB>count" row per remaining word, in ranked order.
lines = [
    '单词种类:' + str(len(items)) + '\n',
    '单词总数:' + str(sumcount) + '\n',
    '词频排序如下:\n',
    'word\tcounts\n',
]
lines.extend(
    '\t'.join([str(entry[0]), str(entry[1])]) + '\n' for entry in items
)

# The context manager closes the file even if writing fails. The encoding is
# given explicitly because the report contains Chinese text, so the output
# does not depend on the platform's default locale encoding.
with open('词频统计结果.txt', "w", encoding='utf-8') as outfile:
    outfile.writelines(lines)

# Report completion only after the data has actually been written.
print('\n统计完成!\n')


上一篇:python+csv/Excel——数据持久化
下一篇:没有了
网友评论