参考链接 https://github.com/fxsjy/jieba #encoding=utf-8 from __future__ import print_function, unicode_literals import sys sys.path.append("../") import jieba jieba.load_userdict("dict.txt") #载入自定义字典 import jieba.posseg as pseg #
参考链接 https://github.com/fxsjy/jieba
#encoding=utf-8 from __future__ import print_function, unicode_literals import sys sys.path.append("../") import jieba jieba.load_userdict("dict.txt") #载入自定义字典 import jieba.posseg as pseg # """ #dict.txt 文件 保存类型为 utf-8 # 云计算 5 # 李小福 2 nr # 创新办 3 i # easy_install 3 eng # 好用 300 # 韩玉赏鉴 3 nz # 八一双鹿 3 nz # 台中 # 凱特琳 nz # Edu Trust认证 2000 # """ jieba.add_word('石墨烯') #动态添加词典 jieba.add_word('凱特琳') jieba.del_word('自定义词')#动态删除词典中某词 test_sent = ( "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n" "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n" "「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。" ) words = jieba.cut(test_sent) #分词 print('/'.join(words)) print("="*40) result = pseg.cut(test_sent) for w in result: print(w.word, "/", w.flag, ", ", end=' ') print("\n" + "="*40) # terms = jieba.cut('easy_install is great') print('/'.join(terms)) terms = jieba.cut('python 的正则表达式是好用的') print('/'.join(terms)) # print("="*40) # test frequency tune testlist = [ ('今天天气不错', ('今天', '天气')), ('如果放到post中将出错。', ('中', '将')), ('我们中出了一个叛徒', ('中', '出')), ] # for sent, seg in testlist: print('/'.join(jieba.cut(sent, HMM=False))) word = ''.join(seg) print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True))) print('/'.join(jieba.cut(sent, HMM=False))) print("-"*40)
# Demo script: the same jieba tokenizer run in its different cut modes.
import jieba

jieba.load_userdict("dict.txt")

sentence = "我来到北京清华大学"

# Full mode (cut_all=True).
full_mode = jieba.cut(sentence, cut_all=True)
print("Full Mode: " + "/ ".join(full_mode))

# Accurate mode (cut_all=False).
accurate_mode = jieba.cut(sentence, cut_all=False)
print("Default Mode: " + "/ ".join(accurate_mode))

# No cut_all argument: accurate mode is the default.
default_mode = jieba.cut("他来到了网易杭研大厦")
print(", ".join(default_mode))

# Search-engine mode.
search_mode = jieba.cut_for_search(
    "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"
)
print(", ".join(search_mode))