我正在尝试用 Scala 编写基于 n-gram 的 Dissociated Press(文本解离)算法。如何为大文件生成 n-gram?例如,文件内容为 "the bee is the bee of the bees"。
你能给我一些提示怎么做吗? 抱歉给你带来不便。
答案 0 :(得分:13)
您的问题本可以描述得更具体一些,不过下面是我的尝试。
// Sample sentence whose word bigrams are printed below.
val words = "the bee is the bee of the bees"
// Slide a two-word window over the tokens. Each window is joined with a single
// space so the word boundary survives (a bare mkString would print "thebee").
val bigrams = words.split(' ').sliding(2).map(_.mkString(" ")).toList
bigrams.foreach(println)
答案 1 :(得分:4)
您可以尝试使用参数n
// Sample sentence to break into n-grams.
val words = "the bee is the bee of the bees"
// Tokens obtained by splitting on single spaces.
val w = words.split(" ")
// Largest n-gram size to generate.
val n = 4
// For every size from 1 to n, slide a window of that size over the tokens;
// the windows are collected in order of increasing size.
val ngrams = (1 to n).flatMap { size =>
  w.sliding(size).map(_.toList)
}
// Print one n-gram per line.
ngrams.foreach(gram => println(gram))
List(the)
List(bee)
List(is)
List(the)
List(bee)
List(of)
List(the)
List(bees)
List(the, bee)
List(bee, is)
List(is, the)
List(the, bee)
List(bee, of)
List(of, the)
List(the, bees)
List(the, bee, is)
List(bee, is, the)
List(is, the, bee)
List(the, bee, of)
List(bee, of, the)
List(of, the, bees)
List(the, bee, is, the)
List(bee, is, the, bee)
List(is, the, bee, of)
List(the, bee, of, the)
List(bee, of, the, bees)
答案 2 :(得分:3)
这是一种基于流的方法。计算n-gram时,这不需要太多内存。
/** Demo program: lazily enumerates the word n-grams (sizes 2 to n) of a sentence and prints them. */
object ngramstream extends App {

  /**
   * Applies `f` to every element of `st`, front to back.
   *
   * @param st stream of n-grams to consume
   * @param f  side-effecting action invoked once per element
   * @return an empty stream, reached once `st` has been exhausted
   */
  @scala.annotation.tailrec
  def process(st: Stream[Array[String]])(f: Array[String] => Unit): Stream[Array[String]] =
    st match {
      case x #:: xs =>
        f(x)
        process(xs)(f)
      case _ => Stream.empty[Array[String]]
    }

  /**
   * Builds a stream of all n-grams of `words` with sizes from 2 up to `n`,
   * ordered by increasing size and, within one size, by position.
   *
   * @param n     largest n-gram size requested
   * @param words tokens to build the n-grams from
   * @return the n-grams; empty when `n` or the number of words is below 2
   */
  def ngrams(n: Int, words: Array[String]): Stream[Array[String]] = {
    // A window wider than the input makes sliding return one short window holding
    // all the words, which would show up as duplicated, undersized "n-grams".
    // Never ask for a window larger than the number of words.
    val largest = math.min(n, words.length)
    // exclude 1-grams
    (2 to largest).toStream.flatMap(size => words.sliding(size).toStream)
  }

  val words = "the bee is the bee of the bees"
  val n = 4

  // A def rather than a val: Stream memoizes evaluated elements, so a val would
  // keep every n-gram reachable from the object for as long as it lives.
  def ngrams2: Stream[Array[String]] = ngrams(n, words.split(" "))

  process(ngrams2) { x =>
    println(x.toList)
  }
}
输出:
List(the, bee)
List(bee, is)
List(is, the)
List(the, bee)
List(bee, of)
List(of, the)
List(the, bees)
List(the, bee, is)
List(bee, is, the)
List(is, the, bee)
List(the, bee, of)
List(bee, of, the)
List(of, the, bees)
List(the, bee, is, the)
List(bee, is, the, bee)
List(is, the, bee, of)
List(the, bee, of, the)
List(bee, of, the, bees)