
jieba fine-grained segmentation / add_word has no effect / forced segmentation

import jieba

def fenci(one_string):
    one_string = one_string.replace(" ", "")  # remove all spaces (a single replace handles every occurrence)

    def isAllZh(s):  # return True only if every character is Chinese
        for c in s:
            if not ('\u4e00' <= c <= '\u9fa5'):
                return False
        return True

    final_result = []
    temp_list = jieba.lcut(one_string)
    for word in temp_list:
        if not isAllZh(word):
            continue
        # if jieba.get_FREQ(word) == 1:
        #     print(word)
        # Force a finer cut for words longer than 3 characters, and for
        # multi-character words that are missing from jieba's dictionary.
        if (len(word) > 1 and jieba.get_FREQ(word) in (None, 0)) or len(word) > 3:
            jieba.del_word(word)  # force jieba to stop treating it as a single word
            final_result.extend(jieba.lcut(word))
        else:
            final_result.append(word)
    return final_result
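A minimal usage sketch, reusing the two example sentences from below (the exact output depends on your jieba dictionary version):

print(fenci('丰田太省了'))
print(fenci('我们中出了一个叛徒'))

Note that jieba.del_word modifies the shared default dictionary, so words deleted inside fenci stay deleted for later cuts in the same process.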

In practice the result seems to be more or less the same as simply calling jieba.lcut with HMM=False:

print(jieba.lcut('丰田太省了', HMM=False))
print(jieba.lcut('我们中出了一个叛徒', HMM=False))
print(jieba.lcut('丰田太省了', HMM=True))
print(jieba.lcut('我们中出了一个叛徒', HMM=True))
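For comparison only (not part of the original approach), jieba's built-in full mode and search-engine mode also produce finer cuts without deleting dictionary entries; a minimal sketch assuming a standard jieba installation:

import jieba

print(jieba.lcut('丰田太省了', cut_all=True))       # full mode: emit every dictionary word found
print(jieba.lcut_for_search('我们中出了一个叛徒'))  # search-engine mode: re-cuts long words into shorter ones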