NLP分词jar包很大,不建议使用maven下载,几百M,直接官网下载 package testimport edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}import org.apache.commons.l
Stanford CoreNLP 中文分词所需的 jar 包体积很大(几百 MB),不建议通过 Maven 下载,建议直接从官网下载。
package test

import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}
import org.apache.commons.lang.StringUtils
import util.EmojiFilter

import scala.collection.mutable.ListBuffer
import scala.util.control.Breaks.{break, breakable}
import scala.collection.JavaConverters._
import scala.collection.mutable.{ArrayBuffer, ListBuffer}

/**
  * Demo of word segmentation with Stanford CoreNLP: a text is annotated by the
  * pipeline and the resulting tokens are filtered by stop words and by stop
  * part-of-speech tags ("natures").
  *
  * Created by liuwei on 2017/8/23.
  */
object NLPTest {

  /** Stop words that are always filtered out, in addition to caller-supplied ones. */
  private val BuiltInStopWords: Set[String] = Set("了", "的")

  /**
    * Part-of-speech tags that are always filtered out.
    * NOTE(review): "PU" is presumably the punctuation tag of the model's tag set — confirm.
    */
  private val BuiltInStopNatures: Set[String] = Set("PU")

  /**
    * The pipeline is built once, on first use, instead of once per `nlp` call:
    * constructing it loads every model named in the properties file, which is
    * by far the most expensive step of the whole procedure.
    */
  private lazy val pipeline: StanfordCoreNLP =
    new StanfordCoreNLP("StanfordCoreNLP-chinese.properties")

  def main(args: Array[String]): Unit = {
    val string = "test环境服务器启动方式更新为supervisor启动"
    val res = nlp(string, List.empty[String], List.empty[String])
    println(res)
  }

  /**
    * Segments `content` into words and removes stop words and words whose
    * part-of-speech tag is a stop nature.
    *
    * @param content        text to segment; `null` or empty yields an empty list
    * @param stopWordList   extra stop words, added to the built-in ones
    * @param stopNatureList extra stop part-of-speech tags; each entry may hold
    *                       several tags separated by commas
    * @return the remaining words, in token order
    */
  private def nlp(content: String, stopWordList: List[String], stopNatureList: List[String]): List[String] =
    if (StringUtils.isEmpty(content)) List.empty[String]
    else {
      val stopWords: Set[String] = BuiltInStopWords ++ stopWordList
      val stopNatures: Set[String] = BuiltInStopNatures ++ stopNatureList.flatMap(_.split(","))

      // Emoji are stripped before the text is handed to the pipeline.
      val document = new Annotation(EmojiFilter.filterEmoji(content))
      // Run all annotators configured for the pipeline on this text.
      pipeline.annotate(document)

      // `get` is a Java call that may return null when no tokens were produced;
      // treat that as "no tokens" rather than failing on the conversion.
      val tokens: List[CoreLabel] =
        Option(document.get(classOf[CoreAnnotations.TokensAnnotation]))
          .map(_.asScala.toList)
          .getOrElse(List.empty[CoreLabel])

      tokens.collect {
        case token if isKept(token.word, stopWords) && isKept(token.tag, stopNatures) =>
          token.word
      }
    }

  /**
    * A value passes the filter when it is present and not in the stop set.
    * A `null` word or tag is rejected, which keeps the behaviour of the earlier
    * implementation that stored `null` inside both stop lists.
    */
  private def isKept(value: String, stopSet: Set[String]): Boolean =
    value != null && !stopSet.contains(value)
}