// Walk the input one character at a time, buffering characters in `line` until a newline
// completes the line. Each completed line is matched against the document-marker regexes,
// which drive the startDoc / endDoc / startText / endText flags.
content.foreach { ch =>
  if (ch != '\n') {
    // Still inside a line: keep buffering.
    line += ch
  } else {
    // Newline reached: snapshot the buffered line and reset the buffer for the next one.
    val completedLine = line.mkString
    line.clear()
    completedLine match {
      case docStartRegex(_*) =>
        startDoc = true
        endText = false
        endDoc = false
      case docnoRegex(rawId) =>
        docID = rawId.trim
      case docTextStartRegex(_*) =>
        startText = true
      case docTextEndRegex(_*) =>
        endText = true
        startText = false
      case docEndRegex(_*) =>
        endDoc = true
        startDoc = false
        // Join the collected text lines before the buffer is cleared below.
        val joinedText = textChunk.mkString(" ")
        es_json = Json.obj(
          "_index" -> "ES_SPARK_AP",
          "_type" -> "document",
          "_id" -> docID,
          "_source" -> Json.obj("text" -> joinedText)
        )
        // es_json is only assigned here; handing it to a consumer (the intended
        // `yield es_json`) is not done by this loop.
        textChunk.clear()
      case _ if startDoc && !endDoc && startText =>
        // Ordinary line inside an open document's text section: collect it.
        textChunk += completedLine.trim
      case _ =>
        ()
    }
  }
}
上面的 for 循环解析一个文本文件，并为循环中解析出的每个文本块（chunk）创建一个 JSON 对象。这些 JSON 随后会被发送到 Elasticsearch 做进一步处理。在 Python 中，我们可以借助生成器轻松地用 yield 逐个产出 JSON：
def func():
    """Generator sketch: yield one JSON payload per loop iteration.

    `num` and `JSON` are placeholders that this snippet does not define.
    """
    for i in range(num):
        # ... some computations ...
        yield {
            JSON  ## JSON is yielded
        }
for json in func():  ## we parse through the generator here.
    # Each value is handed to process() as soon as the generator yields it.
    process(json)
我不明白如何在 Scala 中以类似的方式使用 yield？
答案 0 :(得分:0)
如果您想要惰性（延迟）地返回结果，Scala 使用 Iterator 类型来实现。特别是如果您想逐行处理，我会先用 .lines（在 Scala 2.13 / JDK 11 及以上请使用 .linesIterator）把内容拆分成行：
// Placeholder for the raw text to be parsed.
val content: String = ???
// Lazily maps each line of `content` to a Json value. A single generator is enough:
// `linesIterator` already yields whole lines, whereas a second generator over a line
// would iterate its characters. `linesIterator` is used instead of `lines` because on
// JDK 11+ `String.lines` resolves to the Java method, which returns a java.util.stream.Stream.
val results: Iterator[Json] =
  for {
    line <- content.linesIterator
  } yield {
    line match {
      case docEndRegex(_*) => ??? // build the Json for this line; remaining cases elided
    }
  }
您也可以直接使用一个函数：
/** Converts one input line to its Json value.
  *
  * Only the literal lines "hi" and "bye" are handled; any other input raises a
  * `MatchError`.
  *
  * @param line the line to convert
  * @return the Json object built for that line
  */
def toJson(line: String): Json =
  if (line == "hi") Json.obj("line" -> "hi")
  else if (line == "bye") Json.obj("what" -> "a jerk")
  else throw new MatchError(line)
// Lazily converts every line of `content` with `toJson`. One generator over
// `linesIterator` binds `line` to a whole line (String); a nested generator over the
// line would bind it to individual characters, which `toJson` cannot accept.
val results: Iterator[Json] =
  for {
    line <- content.linesIterator
  } yield toJson(line)
这等价于：
// `linesIterator` rather than `lines`: on JDK 11+ `String.lines` is the Java method
// returning a java.util.stream.Stream, not a Scala Iterator[String].
content.linesIterator.map(line => toJson(line))
或者，在 Python 中大致等效的写法是：
lines = (line.strip() for line in content.split("\n"))
# Generator expression: toJson is applied to each line only as `jsons` is iterated.
jsons = (toJson(line) for line in lines)