当前位置 : 主页 > 网络推广 > seo >

jieba文本分词,去除停用词,添加用户词

来源:互联网 收集:自由互联 发布时间:2021-06-16
import jieba from collections import Counter from wordcloud import WordCloud import matplotlib.pyplot as plt from PIL import Image import numpy as np import jieba.analyse from pyquery import PyQuery cc = '''文本''' jieba.load…
import jieba
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import jieba.analyse
from pyquery import PyQuery

# Input text to analyse; it is passed through PyQuery later, so it may contain HTML.
cc = '''文本'''

# Load the external user dictionary so jieba also recognises the custom words.
jieba.load_userdict("./userdict.txt")

# Build the stop-word list
def stopwordslist(filepath):
    """Read a stop-word file and return its lines as a list.

    Args:
        filepath: Path of a UTF-8 encoded text file with one stop word
            per line.

    Returns:
        A list with one entry per line of the file, each stripped of
        leading and trailing whitespace. Blank lines are kept as empty
        strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails.
    with open(filepath, "r", encoding="utf-8") as stop_file:
        return [line.strip() for line in stop_file]

# Remove stop words from a segmented sentence
def movestopwords(sentence, stopwords=None):
    """Return the words of *sentence* that survive stop-word filtering.

    A word is kept when it is longer than one character and is not a
    stop word. The length test also discards a lone tab character.

    Args:
        sentence: Iterable of word strings, e.g. the generator returned
            by ``jieba.cut``.
        stopwords: Optional iterable of stop words. When omitted, the
            words are loaded from ``./stop_words.txt`` through
            ``stopwordslist``.

    Returns:
        A list of the kept words, in their original order.
    """
    if stopwords is None:
        stopwords = stopwordslist("./stop_words.txt")  # default stop-word file
    # A set makes each membership test O(1) instead of scanning a list.
    stopword_set = set(stopwords)
    return [
        word
        for word in sentence
        if len(word) > 1 and word not in stopword_set
    ]


# Strip HTML tags with PyQuery, then segment the plain text with jieba.
words = jieba.cut(PyQuery(cc).text())
word_list = movestopwords(words)  # drop stop words and one-character tokens
words_split = " ".join(word_list)  # join the kept words into one space-separated string
# Alternative keyword extraction using TF-IDF:
# keywords = jieba.analyse.extract_tags(words_split, topK=100, withWeight=True)
# TextRank keyword extraction; allowPOS takes part-of-speech tags as strings.
keywords = jieba.analyse.textrank(
    words_split,
    topK=100,
    withWeight=True,
    allowPOS=("ns", "n", "vn", "v"),
)

# With withWeight=True each item is indexed as (word, weight).
for word, weight in keywords:
    print(word, weight)

# mycount = Counter(word_list)  # count word frequencies
# for key, val in mycount.most_common(100):  # ordered by frequency (top 100 entries)
#     print(key, val)


# alice_mask = np.array(Image.open("./zhihu.png"))  # optional mask image
wc = WordCloud(
    # width=800,
    # height=600,
    background_color="#000000",  # canvas background colour
    max_words=50,  # maximum number of words drawn (library default is 200)
    max_font_size=400,  # largest font size
    min_font_size=10,  # smallest font size (library default is 4)
    # colormap="bone",  # string or matplotlib colormap, default="viridis"
    random_state=42,  # fixed seed for the random generator, so runs are reproducible
    # mask=plt.imread("./zhihu.png"),  # read the mask image
    # mask=alice_mask,  # apply the mask
    font_path="./SimHei.ttf",  # font file; presumably chosen so Chinese glyphs render - confirm
)


my_wordcloud = wc.generate(words_split)  # build the cloud from word frequencies in the text
plt.imshow(my_wordcloud)  # draw the word cloud
plt.axis("off")  # hide both axes
plt.show()
wc.to_file("zzz.png")  # save the rendered image to disk
网友评论