当前位置 : 主页 > 网络推广 > seo >

NLP分词

来源:互联网 收集:自由互联 发布时间:2021-06-16
NLP分词所需的jar包很大(几百MB),不建议使用maven下载,建议直接从官网下载。下文给出基于 Stanford CoreNLP(StanfordCoreNLP、Annotation、CoreLabel)的 Scala 分词示例代码。

NLP分词所依赖的 Stanford CoreNLP jar包(含中文模型)体积很大,有几百MB,不建议通过maven下载,建议直接从官网下载后手动加入工程。

package test

import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}
import org.apache.commons.lang.StringUtils
import util.EmojiFilter

import scala.collection.mutable.ListBuffer
import scala.util.control.Breaks.{break, breakable}
import scala.collection.JavaConverters._
import scala.collection.mutable.{ArrayBuffer, ListBuffer}

/**
  * Created by liuwei on 2017/8/23.
  */
object NLPTest {

  /** Stop words that are always filtered out, in addition to the caller-supplied ones. */
  private val DefaultStopWords: Set[String] = Set("了", "的")

  /** Part-of-speech tags that are always filtered out, in addition to the caller-supplied ones. */
  private val DefaultStopNatures: Set[String] = Set("PU")

  /** Name of the properties file handed to the `StanfordCoreNLP` constructor. */
  private val PipelineProperties: String = "StanfordCoreNLP-chinese.properties"

  /**
    * Demo entry point: segments a fixed sample sentence with no extra stop words
    * or stop tags and prints the resulting word list.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val sample = "test环境服务器启动方式更新为supervisor启动"
    val words = nlp(sample, List.empty[String], List.empty[String])
    println(words)
  }

  /**
    * Segments `content` into words with a Stanford CoreNLP pipeline and removes
    * stop words and tokens carrying a stop part-of-speech tag.
    *
    * The text is first passed through `EmojiFilter.filterEmoji`, then annotated.
    * Tokens are returned in document order; duplicates are kept.
    *
    * @param content        text to segment; an empty or null string yields an empty list
    * @param stopWordList   extra stop words, merged with the built-in ones ("了", "的")
    * @param stopNatureList extra stop POS tags; each entry may hold several tags
    *                       separated by commas, merged with the built-in one ("PU")
    * @return the surviving words, or an empty list when there is nothing to segment
    */
  private def nlp(content: String, stopWordList: List[String], stopNatureList: List[String]): List[String] = {
    if (StringUtils.isEmpty(content)) {
      List.empty[String]
    } else {
      // Sets give constant-time membership tests for the per-token checks below.
      val stopWords: Set[String] = DefaultStopWords ++ stopWordList
      val stopNatures: Set[String] = DefaultStopNatures ++ stopNatureList.flatMap(_.split(","))

      // NOTE(review): a new pipeline is built on every call, as in the original code;
      // this is presumably expensive, so consider reusing one instance if nlp is called often.
      val pipeline = new StanfordCoreNLP(PipelineProperties)
      val document: Annotation = new Annotation(EmojiFilter.filterEmoji(content))

      // Run all annotators on this text.
      pipeline.annotate(document)

      // The annotation lookup can return null (e.g. when no tokens were produced);
      // treat that as "no tokens" instead of failing with a NullPointerException.
      val rawTokens: java.util.List[CoreLabel] = document.get(classOf[CoreAnnotations.TokensAnnotation])
      val tokens: List[CoreLabel] = Option(rawTokens).map(_.asScala.toList).getOrElse(List.empty[CoreLabel])

      tokens
        .filter(token => isKept(token.word, token.tag, stopWords, stopNatures))
        .map(_.word)
    }
  }

  /**
    * Decides whether a token survives stop filtering.
    *
    * A token with a null word or a null tag is always dropped; otherwise it is kept
    * only when neither its word nor its tag is in the respective stop set.
    */
  private def isKept(word: String, tag: String, stopWords: Set[String], stopNatures: Set[String]): Boolean =
    word != null && tag != null && !stopWords.contains(word) && !stopNatures.contains(tag)

}
网友评论