当前位置 : 主页 > 网络推广 > seo >

中文分词系列(一)

来源:互联网 收集:自由互联 发布时间:2021-06-16
关于中文分词的一些资料网上资料很多,大家可以自己去了解了解,今天这里只关注代码怎么写。 中文分词主要可以归纳为“规则分词”、“统计分词”以及“规则+统计”三个主要派

关于中文分词的一些资料网上资料很多,大家可以自己去了解了解,今天这里只关注代码怎么写。

中文分词主要可以归纳为“规则分词”、“统计分词”以及“规则+统计”三个主要派别,今天主要了解“规则分词”中常见的正向、逆向和双向最大化匹配,这三个都是基于现有词典做的,所以得准备一个中文词典,一行一个词。

一.正向最大化匹配

  描述:

    1.找到词典中最长的词,记下长度L

    2.从“左向右”取长度为L的字符串,查找词典进行匹配,若匹配成功,则将这个词切分出来。若匹配失败,将这个字符串的最后一个字符去掉,将剩下的串作为新的匹配串进行匹配。如此重复下去,直到切完。

二.逆向最大化匹配

  描述:

    1.找到词典中最长的词,记下长度L

    2.从“右向左”取长度为L的字符串,查找词典进行匹配,若匹配成功,则将这个词切分出来。若匹配失败,将这个字符串的最前面一个字符去掉,将剩下的串作为新的匹配串进行匹配。如此重复下去,直到切完。

三.双向最大化匹配

  描述:

    1.将正向和逆向的切分结果进行比较,取切分词数较少的作为结果;词数相同时,取单字词较少的那个结果。

四.代码采用python

    1.load 词典

  

 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 
 4 #这是词典路径
 5 dictPath = ../resource/dict.txt
 6 
 7 def loadDict():
 8     print(load dict...)
 9     dictionary = dict()
10     maximum = 0
11     # read resource
12     with open(dictPath, r, encoding=utf8) as f:
13         for line in f:
14             line = line.strip()
15             if not line:
16                 continue
17             str = line.split( )
18             dictionary[str[0]] = str[2]
19             wordLength = len(line)
20             if wordLength > maximum:
21                 maximum = wordLength #词典中最长的词的长度
22     return dictionary, maximum
View Code

    2.核心方法

 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 
 4 from word_segmentation.regulation.ReverseMaximumMatchMethod import RMM
 5 from word_segmentation.regulation.MaximumMatchMethod import MM
 6 from word_segmentation.regulation.BiDirectctionMatchMethod import BDMM
 7 from word_segmentation.util.LoadDict import loadDict
 8 
 9 class RegulationMatch(object):
10     def __init__(self):
11         self.dictionary, self.maximum = loadDict()
12 
13     def cut(self, text, method):
14         #逆向
15         if method == RMM:
16             return RMM.cut(text, self.dictionary, self.maximum)
17         #正向
18         if method == MM:
19             return MM.cut(text, self.dictionary, self.maximum)
20         #双向
21         if method == BDMM:
22             return BDMM.cut(text, self.dictionary, self.maximum)
View Code
 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 
 4 ‘‘‘
 5 词和词性
 6 ‘‘‘
 7 class Word(object) :
 8     def __init__(self, token, property):
 9         self.__token = token
10         self.__property = property
11     #单词
12     def getToken(self):
13         return self.__token
14     #词性
15     def getProperty(self):
16         return self.__property
View Code
 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 
 4 from word_segmentation.regulation.Word import Word
 5 
 6 ‘‘‘
 7 正向最大化匹配
 8 MaximumMatchMethod
 9 ‘‘‘
10 class MM(object):
11     def __init__(self):
12         pass
13 
14     @staticmethod
15     def cut(text, dictionary, maximum):
16         result = []
17         textLength = len(text)
18         start = 0
19         while textLength > 0:
20             word = None
21             for size in range(maximum, 0, -1):
22                 if textLength - size < 0:
23                     continue
24                 piece = text[start:(start + size)]
25                 if dictionary.__contains__(piece):
26                     word = piece
27                     result.append(Word(piece, dictionary.get(piece)))
28                     textLength -= size
29                     start += size
30                     break
31             if word is None:
32                 textLength -= 1
33         return result
View Code
 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 
 4 from word_segmentation.regulation.Word import Word
 5 
 6 ‘‘‘
 7 逆向最大化匹配
 8 ReverseMaximumMatchMethod
 9 ‘‘‘
10 class RMM(object):
11     def __init__(self):
12         pass
13 
14     @staticmethod
15     def cut(text, dictionary, maximum):
16         result = []
17         textLength = len(text)
18         while textLength > 0:
19             word = None
20             for size in range(maximum, 0, -1):
21                 if textLength - size < 0:
22                     continue
23                 piece = text[(textLength - size) : textLength]
24                 if dictionary.__contains__(piece):
25                     word = piece
26                     result.append(Word(piece, dictionary.get(piece)))
27                     textLength -= size
28                     break
29             if word is None:
30                 textLength -= 1
31         return result[::-1]
View Code
 1 # -*- coding:utf-8 -*-
 2 
 3 from word_segmentation.regulation.MaximumMatchMethod import MM
 4 from word_segmentation.regulation.ReverseMaximumMatchMethod import RMM
 5 
 6 ‘‘‘
 7     比较正向最大匹配和逆向最大匹配结果:
 8     1.如果分词数量结果不同,那么取分词数量较少的那个
 9     2.如果分词数量结果相同
10         a.分词结果相同,可以返回任何一个
11         b.分词结果不同,返回单字数比较少的那个
12         c.分词结果不同,单字数相同,返回谁呢(可以返回逆向分词结果)
13 ‘‘‘
14 class BDMM(object):
15     def __init__(self):
16         pass
17 
18     @staticmethod
19     def cut(text, dictionary, maximum):
20         mmResult = MM.cut(text, dictionary, maximum)
21         rmmResult = RMM.cut(text, dictionary, maximum)
22         mmSegment = []
23         rmmSegment = []
24         for word in mmResult:
25             mmSegment.append(word.getToken())
26             # print(‘token = %s, property = %s‘ %(word.getToken(), word.getProperty()))
27         for word in rmmResult:
28             rmmSegment.append(word.getToken())
29 
30         if mmSegment.__len__() < rmmSegment.__len__():
31             return mmResult
32         elif mmSegment.__len__() == rmmSegment.__len__():
33             flag = True
34             for segment in mmSegment:
35                 if segment not in rmmSegment:
36                     flag = False
37                     break
38             if flag:
39                 return mmResult
40             else:
41                 mmSingleWords = 0
42                 rmmSingleWords = 0
43                 for word in mmSegment:
44                     if len(word) == 1:
45                         mmSingleWords += 1
46                 for word in rmmSegment:
47                     if len(word) == 1:
48                         rmmSingleWords += 1
49                 if mmSingleWords < rmmSingleWords:
50                     return mmResult
51                 else:
52                     return rmmResult
53         else:
54             return rmmResult
View Code
 1 #!/usr/bin/env python
 2 # -*- coding:utf-8 -*-
 3 
 4 from word_segmentation.regulation.ReverseMaximumMatchMethod import RMM
 5 import word_segmentation.regulation.MaximumMatchMethod
 6 import word_segmentation.regulation.BiDirectctionMatchMethod
 7 from word_segmentation.regulation.RegulationMatchMthod import RegulationMatch
 8 
 9 def test():
10     pass
11 if __name__ == __main__:
12     text = 各国有各国的困难…
13     print(分词:)
14     print(各国有各国的困难…)
15     regulation = RegulationMatch()
16     mmResult = regulation.cut(text, MM)
17     rmmResult = regulation.cut(text, RMM)
18     bdmmResult = regulation.cut(text, BDMM)
19     mmSegment = []
20     rmmSegment = []
21     bdmmSegment = []
22     for word in mmResult:
23         mmSegment.append(word.getToken())
24         #print(‘token = %s, property = %s‘ %(word.getToken(), word.getProperty()))
25     for word in rmmResult:
26         rmmSegment.append(word.getToken())
27     for word in bdmmResult:
28         bdmmSegment.append(word.getToken())
29 
30     print(正向匹配: %s  % mmSegment)
31     print(逆向匹配: %s  % rmmSegment)
32     print(双向匹配: %s % bdmmSegment)
View Code
网友评论