当前位置 : 主页 > 网络推广 > seo >

文本分析——分词、统计词频、词云

来源:互联网 收集:自由互联 发布时间:2021-06-16
本文演示如何用 Python 对文本数据进行预处理、jieba 分词、TF-IDF 统计词频,并用 wordcloud 生成词云图。

导入包

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from pandas import Series, DataFrame  

import string
import re
import jieba
import jieba.analyse
import datetime
from wordcloud import WordCloud, ImageColorGenerator
import codecs

导入文件和数据

# Load the work-order data from the Excel workbook into a DataFrame.
source_file = 'Gongdan.xlsx'
gongdan = pd.read_excel(source_file)

数据预处理

# Preprocessing: keep only Chinese characters in each row, then drop rows
# that end up empty.
#
# Fixes two defects of the original version:
#  * it used two different CJK ranges (\u4e00-\u9fff when cleaning rows but
#    \u4e00-\u9fa5 when re-cleaning the joined text), silently dropping the
#    extension characters U+9FA6-U+9FFF in the second pass — one range is
#    now used consistently;
#  * the pd.isnull() check was dead code, because every value had already
#    been forced through str() and so could no longer be NaN.
_cjk = re.compile(u'[\u4e00-\u9fff]+')  # CJK Unified Ideographs block
gongdan['content'] = [''.join(_cjk.findall(str(i))) for i in gongdan['content']]
# Only empty strings can remain after the str() conversion above.
gongdan = gongdan[gongdan['content'] != '']

content = gongdan['content']

# One big string of Chinese-only text for jieba; already clean, so no
# second regex pass is needed.
cont = ''.join(content)

分词并去除停用词

# Load stop words (one per line, UTF-8) into a set for O(1) membership tests.
# A context manager replaces the manual open/close pair, so the file handle
# is released even if reading raises; jieba.cut already yields str, so the
# original str() re-conversion was redundant.
stopwords = set()
with codecs.open('stopwords.txt', 'r', 'utf-8') as fr:
    for word in fr:
        stopwords.add(word.strip())

# A user dictionary improves segmentation of domain-specific terms.
jieba.load_userdict("dict.txt")
# Precise mode with HMM enabled for out-of-vocabulary words.
text = list(jieba.cut(cont, cut_all=False, HMM=True))
# Drop stop words and stray single spaces in one pass.
text = [w for w in text if w not in stopwords and w != ' ']

Tfidf 算法

from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer splits on whitespace, so re-join the jieba tokens with
# spaces and wrap them as a one-document corpus.
test = ' '.join(text)
tlist = [test]

vectorizer = CountVectorizer()    # term-frequency matrix: a[i][j] = freq of word j in doc i
transformer = TfidfTransformer()  # converts raw counts into tf-idf weights
tfidf = transformer.fit_transform(vectorizer.fit_transform(tlist))

# NOTE(review): get_feature_names() is deprecated in scikit-learn >= 1.0 in
# favor of get_feature_names_out() — confirm the installed version before
# changing it here.
word = vectorizer.get_feature_names()  # vocabulary of the bag-of-words model
weight = tfidf.toarray()               # dense tf-idf matrix, one row per document

# Map each word to its tf-idf weight. zip() over the row replaces the
# original O(docs * vocab) explicit-index double loop; with several
# documents the last row wins, exactly as before (here there is only one).
tfidf_list = {}
for row in weight:
    tfidf_list.update(zip(word, row))

词云

# Font with CJK glyph coverage — required for Chinese words to render.
font_path = 'yahei.ttf'

from PIL import Image
# The mask image: its non-white region defines where words may be drawn,
# and its size determines the size of the saved picture.
back_coloring = np.array(Image.open('circle.jpg'))

# Collect the renderer settings in one place, then build the cloud.
wc_options = dict(
    font_path=font_path,          # font used to draw the words
    background_color="white",
    max_words=60,                 # cap on how many words appear
    mask=back_coloring,
    stopwords=stopwords,
    max_font_size=100,            # largest font size used
    random_state=42,              # deterministic layout
    width=1000,
    height=860,
    margin=2,                     # spacing between words
)
wc = WordCloud(**wc_options)

# Size words by their tf-idf weight rather than raw counts.
wc.generate_from_frequencies(tfidf_list)

# Display the word cloud in the current figure, then save it to disk.
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.figure()  # open a fresh figure for the next plot
wc.to_file("w.png")

# create coloring from image: reuse the mask picture's own colors
image_colors = ImageColorGenerator(back_coloring)
# recolor wordcloud and show
# we could also give color_func=image_colors directly in the constructor
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.figure()  # third figure: the raw mask image for comparison
plt.imshow(back_coloring, cmap=plt.cm.gray, interpolation="bilinear")
plt.axis("off")
plt.show()  # blocks until all three figure windows are closed
网友评论