# -*- coding: utf-8 -*-
"""Segment Chinese text, drop stop words, extract keywords, and render a word cloud.

Pipeline: strip HTML from the raw text with PyQuery, cut it into words with
jieba, filter stop words, rank keywords with TextRank, then draw and save a
word cloud image.
"""
import jieba
import jieba.analyse
from collections import Counter  # used by the optional frequency count below
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
from pyquery import PyQuery

# Raw input text (may contain HTML markup; tags are stripped below).
cc = '''
文本
'''

jieba.load_userdict('./userdict.txt')  # load external user dictionary


def stopwordslist(filepath):
    """Read a stop-word file (one word per line, UTF-8) into a list of words."""
    # `with` guarantees the file handle is closed even on error.
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]


def movestopwords(sentence):
    """Return the words of *sentence* minus stop words, tabs and 1-char tokens.

    sentence: iterable of segmented words (e.g. the generator from jieba.cut).
    """
    # Load once per call; a set gives O(1) membership tests in the loop.
    stopwords = set(stopwordslist('./stop_words.txt'))  # path of the stop-word file
    return [word for word in sentence
            if word not in stopwords and word != '\t' and len(word) > 1]


words = jieba.cut(PyQuery(cc).text())  # PyQuery(...).text() strips HTML tags
word_list = movestopwords(words)       # remove stop words
words_split = " ".join(word_list)      # join token list back into one string

# keywords = jieba.analyse.extract_tags(words_split, topK=100, withWeight=True)  # TF-IDF variant
keywords = jieba.analyse.textrank(words_split, topK=100, withWeight=True,
                                  allowPOS=('ns', 'n', 'vn', 'v'))  # TextRank algorithm
for word, weight in keywords:
    print(word, weight)

# mycount = Counter(word_list)  # word-frequency count
# for key, val in mycount.most_common(100):  # top-100, in descending order
#     print(key, val)

# alice_mask = np.array(Image.open("./zhihu.png"))  # mask image

wc = WordCloud(
    # width=800,
    # height=600,
    background_color="#000000",  # background colour
    max_words=50,                # maximum number of words (default 200)
    max_font_size=400,           # largest font size
    min_font_size=10,            # smallest font size (default 4)
    # colormap='bone',           # string or matplotlib colormap, default "viridis"
    random_state=42,             # number of random colouring states
    # mask=plt.imread("./zhihu.png"),  # alternative way to read the mask image
    # mask=alice_mask,           # optional shape mask
    font_path='./SimHei.ttf',    # a CJK-capable font is required for Chinese text
)
my_wordcloud = wc.generate(words_split)  # generate cloud from word frequencies
plt.imshow(my_wordcloud)                 # display the cloud
plt.axis("off")                          # hide the axes
plt.show()
wc.to_file('zzz.png')                    # save the image to disk