NLTK 分句、分词、词干提取、词形还原 print ( " ==========案例1:分句、分词=============== " ) import nltk.tokenize as tk doc = " Are you curious about tokenization? "" Let's see how it works! " " We need to analyze a
NLTK 分句、分词、词干提取、词形还原
# NLTK demo script: sentence splitting, word tokenization, stemming and
# lemmatization. Running it prints the results of each step to stdout.
#
# NOTE(review): the tokenizers and the WordNet lemmatizer load NLTK data
# packages at run time (presumably "punkt"/"punkt_tab" and "wordnet",
# installable via nltk.download) -- confirm for the installed NLTK version.
import nltk.stem as ns  # provides the WordNet lemmatizer
import nltk.stem.lancaster as lc
import nltk.stem.porter as pt
import nltk.stem.snowball as sb
import nltk.tokenize as tk


def _print_numbered(items):
    """Print each item on its own line, prefixed with a 1-based index."""
    for number, item in enumerate(items, 1):
        print("%2d" % number, item)


print("==========案例1:分句、分词===============")

# Sample text, built from adjacent string literals (implicit concatenation).
# The apostrophe in "Let's" is a plain ASCII quote: the typographic quote that
# had replaced it would change how the tokenizers split the word.
doc = (
    "Are you curious about tokenization? "
    "Let's see how it works! "
    "We need to analyze a couple of sentences "
    "with punctuations to see it in action."
)
print(doc)

# Split into sentences: tk.sent_tokenize(doc)
# Q: how does tk.sent_tokenize() know where a sentence ends?
# A (author's note): 1. the next word starts with a capital letter;
#                    2. the sentence ends with a punctuation mark.
# NOTE(review): this is a simplified explanation of the tokenizer -- confirm
# against the NLTK documentation.
_print_numbered(tk.sent_tokenize(doc))
print("-----------------------------")

# Split into words: tk.word_tokenize(doc)
_print_numbered(tk.word_tokenize(doc))

# Split into words and punctuation: tk.WordPunctTokenizer().tokenize(doc)
tokenizer = tk.WordPunctTokenizer()
_print_numbered(tokenizer.tokenize(doc))

print("=============案例2:词干提取、词型还原===================")

words = ['table', 'probably', 'wolves', 'playing', 'is',
         'dog', 'the', 'beaches', 'grounded', 'dreamt', 'envision']
print(words)

print("----------词干提取-------------")
# Stem: the core part of a noun or verb once the parts related to number and
# tense are removed. A stem is not necessarily a valid word.
# Three stemmers are run side by side so their output can be compared.
pt_stemmer = pt.PorterStemmer()             # Porter stemmer
lc_stemmer = lc.LancasterStemmer()          # Lancaster stemmer
sb_stemmer = sb.SnowballStemmer("english")  # Snowball stemmer
for word in words:
    pt_stem = pt_stemmer.stem(word)
    lc_stem = lc_stemmer.stem(word)
    sb_stem = sb_stemmer.stem(word)
    print("%8s %8s %8s %8s" % (word, pt_stem, lc_stem, sb_stem))

print("----------词型还原器---------------")
# Lemmatization: plural noun -> singular noun; participle -> base verb form.
# Unlike a stem, a lemma is always a valid word.
lemmatizer = ns.WordNetLemmatizer()
for word in words:
    # Treat the word as a noun and reduce it to its singular form.
    n_lemma = lemmatizer.lemmatize(word, pos='n')
    # Treat the word as a verb and reduce it to its base form.
    v_lemma = lemmatizer.lemmatize(word, pos='v')
    print('%8s %8s %8s' % (word, n_lemma, v_lemma))