Stemming and lemmatisation with the Stanford NLP library

Time: 2016-11-18 18:09:08

Tags: scala apache-spark stanford-nlp

I am using the Stanford NLP library for stemming and lemmatisation, and I followed the example in the documentation:

import java.util.Properties

import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer

import edu.stanford.nlp.ling.CoreAnnotations.{LemmaAnnotation, SentencesAnnotation, TokensAnnotation}
import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}

def plainTextToLemmas(text: String, stopWords: Set[String]): List[String] = {
  val props = new Properties()
  props.put("annotators", "tokenize, ssplit, pos, lemma")
  val pipeline = new StanfordCoreNLP(props)
  // empty annotation with the given text
  val doc = new Annotation(text)
  // run the annotators on the text
  pipeline.annotate(doc)
  val lemmas = new ArrayBuffer[String]()
  val sentences = doc.get(classOf[SentencesAnnotation])
  for (sentence <- sentences; token <- sentence.get(classOf[TokensAnnotation])) {
    val lemma = token.get(classOf[LemmaAnnotation])
    // keep lemmas longer than two characters that are not stop words
    if (lemma.length > 2 && !stopWords.contains(lemma)) {
      lemmas += lemma.toLowerCase
    }
  }
  lemmas.toList
}

val x = sentence.map(plainTextToLemmas(_, stopWords))

However, it does not handle sentences where there is no space after the full stop. Is there a way to fix this? Is there also an option to filter out HTML tags? Adding them to the stop words does not work.
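
One possible workaround (a minimal sketch, not an answer from the original post): clean the raw text before it reaches the pipeline, stripping HTML tags and inserting a space after a full stop that is immediately followed by a capital letter, so that ssplit can find the sentence boundary. The cleanText helper and its regexes below are illustrative assumptions; a real HTML parser such as jsoup would be more robust than the tag regex, and the full-stop rule will also split abbreviations such as "U.S.".

// Hypothetical pre-cleaning step: strip markup and restore a space after full stops.
def cleanText(raw: String): String = {
  raw
    .replaceAll("<[^>]+>", " ")           // crude removal of HTML/XML tags
    .replaceAll("\\.(?=\\p{Lu})", ". ")   // "end.Start" -> "end. Start" (also splits "U.S.")
    .replaceAll("\\s+", " ")              // collapse repeated whitespace
    .trim
}

val x = sentence.map(s => plainTextToLemmas(cleanText(s), stopWords))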

0 Answers:

No answers