我正在使用 Scalding 计算相似字符串对之间的编辑距离。CSV 文件中总共有 10 000 000 个字符串。为了减少计算量,我使用以下算法:以每个字符串的前 3 个字符作为键进行分组,只在同一组内两两计算编辑距离,并只保留编辑距离不超过 1 的字符串对。
当我在 HDFS 上运行此算法时,处理 1 000 000 个字符串没有问题。但当字符串数量达到 10 000 000 个时,节点管理器报错,称我的“map”任务尝试分配的物理内存超过了节点的可用内存。我知道这是因为 .groupBy('key) { _.mapList ...}
这段代码会生成大量组合。显然,这个算法的可扩展性并不好。
请问还有哪些方法可以减少这个任务的计算量?
import cascading.tuple.Fields
import com.twitter.scalding._
/**
 * Prototype Scalding job that finds pairs of near-identical strings.
 *
 * Words are bucketed by their first three characters, and only words that share
 * a bucket are compared with each other. Pairs whose Levenshtein edit distance
 * is at most [[maxDistance]] are written as CSV rows `(str1, str2, dist)` to
 * [[outputStr]].
 *
 * NOTE(review): `mapList` still hands the whole group to `editDistances` as one
 * in-memory `List`, so a very large bucket can still exhaust a task's memory.
 * The changes below only avoid materializing the quadratic number of pairs.
 */
class Proto1(args: Args) extends Job(args) {

  /** Destination path of the CSV output. */
  val outputStr = "tmp/out.txt"

  /** Plain-text sink for the same path; currently unused (see the commented-out write below). */
  val output = TextLine(outputStr)

  /** Largest edit distance for which a pair is still reported. */
  private val maxDistance = 1

  /** Hard-coded sample input used by this prototype. */
  val wordsList = List(
    ("aaaa"),
    ("aaaa"),
    ("aaaad"),
    ("aaaab"),
    ("aaabb"),
    ("aabbcc"),
    ("aaabccdd"),
    ("aaabbccdde"),
    ("aaabbddd"),
    ("bbbb"),
    ("bbbb"),
    ("bbbaaa"),
    ("bbaaabb"),
    ("bbbcccc"),
    ("bbbddde"),
    ("bbbdddd"),
    ("bbbdddf"),
    ("bbbdddd"),
    ("ccccc"),
    ("cccaaa"),
    ("ccccaabbb"),
    ("ccbbbddd"),
    ("cdddeee"),
    ("ddd"),
    ("ddd")
  )

  val orderedPipe =
    IterableSource[(String)](wordsList, ('word))
      // Blocking key: the first three characters of the word.
      .map('word -> 'key) { word: String => word.take(3) }
      .debug
      // Compare words only against other words that share the same key.
      .groupBy('key) { _.mapList[String, List[(String, String, Int)]]('word -> 'list)(editDistances) }
      // Drop keys for which no pair was close enough.
      .filterNot('key, 'list) { fields: (String, List[(String, String, Int)]) =>
        fields._2.isEmpty
      }
      // One output row per matching pair.
      .flatMapTo('list -> ('str1, 'str2, 'dist)) { list: List[(String, String, Int)] => list }
      .debug
      //.write(output)
      .write(Csv(outputStr))

  /**
   * Returns every pair of words from `list` whose edit distance is at most
   * [[maxDistance]], together with that distance.
   *
   * Pairs are streamed from the `combinations` iterator and filtered as they are
   * produced, so only the matching pairs are kept in memory instead of all
   * n * (n - 1) / 2 combinations. Because the edit distance of two strings is
   * never smaller than the difference of their lengths, pairs whose lengths
   * differ by more than [[maxDistance]] are rejected without running the
   * dynamic-programming computation.
   *
   * `combinations` treats equal elements as interchangeable, so a word that
   * occurs several times in `list` yields a single `(w, w, 0)` pair.
   *
   * @param list words that share the same grouping key
   * @return the matching `(first, second, distance)` triples, in `combinations` order
   */
  def editDistances(list: List[String]): List[(String, String, Int)] =
    list
      .combinations(2)
      .collect {
        case List(first, second) if math.abs(first.length - second.length) <= maxDistance =>
          (first, second, editDistance(first, second))
      }
      .filter { case (_, _, distance) => distance <= maxDistance }
      .toList

  /**
   * Computes the Levenshtein edit distance between `a` and `b`, i.e. the minimum
   * number of single-character insertions, deletions and substitutions needed
   * to turn one string into the other.
   *
   * Only two rows of the dynamic-programming table are kept at a time, so the
   * working memory is proportional to `b.length` rather than
   * `a.length * b.length`.
   *
   * @param a first string (may be empty)
   * @param b second string (may be empty)
   * @return the edit distance; equals the other string's length when one is empty
   */
  def editDistance(a: String, b: String): Int = {
    import scala.math.min

    val n = b.length
    // Row 0 of the table: turning the empty prefix of `a` into b.take(j) costs j insertions.
    var previous = Array.tabulate(n + 1)(identity)
    var current = new Array[Int](n + 1)

    for (i <- 1 to a.length) {
      // Column 0: turning a.take(i) into the empty string costs i deletions.
      current(0) = i
      for (j <- 1 to n) {
        current(j) =
          if (a(i - 1) == b(j - 1)) previous(j - 1)
          else 1 + min(min(previous(j), current(j - 1)), previous(j - 1))
      }
      // Reuse the two buffers: the row just filled becomes the previous row.
      val swap = previous
      previous = current
      current = swap
    }
    previous(n)
  }
}
有什么想法吗?