I'm trying to read a stream from a Kafka source containing JSON records, using the pattern from the book Learning Spark:
import spark.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
import java.util.Date
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import com.fasterxml.jackson.databind.DeserializationFeature
// .master() sets the master URL; a plain .config("master", ...) key is not read by Spark
val spark = SparkSession.builder
  .appName("StreamingRetailTransactions")
  .master("local[*]")
  .getOrCreate()
val df = spark.readStream.
  format("kafka").
  option("kafka.bootstrap.servers", ...).
  option("subscribe", "transactions_load").
  option("kafka.security.protocol", "SASL_SSL").
  ...
  load()
case class Invoice(
  invoiceNo: Int,
  stockCode: Int,
  description: String,
  ...
  storeId: Int,
  transactionId: String
)
Then...
val df2 = df.selectExpr("CAST(value AS String)").as[String]
val df3 = df2.mapPartitions(records => {
  val mapper = new ObjectMapper with ScalaObjectMapper
  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
  mapper.registerModule(DefaultScalaModule)
  records.flatMap(record => {
    try {
      Some(mapper.readValue(record, classOf[Invoice]))
    } catch {
      case e: Exception => None
    }
  })
}, true)
val query = df3.writeStream.format("console").start()
However, I'm running into this error:
df2: org.apache.spark.sql.Dataset[String] = [value: string]
<console>:63: error: missing parameter type
val df3 = df2.mapPartitions(records => {
^
Any ideas?
Answer 0 (score: 1)
Why are you passing a boolean as an argument? The mapPartitions method on a Dataset accepts only a function:
func: Iterator[T] => Iterator[U]
as its parameter.
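For reference, here is a minimal sketch contrasting the two APIs. The small Dataset below is hypothetical and only illustrates which overloads compile; spark and the imports are assumed to be the same as in the question:

// The optional "preservesPartitioning" boolean exists only on RDD.mapPartitions;
// passing it to Dataset.mapPartitions breaks overload resolution, so the compiler
// can no longer infer the type of `records` and reports "missing parameter type".
import spark.implicits._

val ds: org.apache.spark.sql.Dataset[String] = spark.createDataset(Seq("a", "bb", "ccc"))

// Dataset API: a single argument, func: Iterator[T] => Iterator[U]
val lengths = ds.mapPartitions(records => records.map(_.length))

// RDD API: this is where the boolean flag lives
val rddLengths = ds.rdd.mapPartitions(records => records.map(_.length), preservesPartitioning = true)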
Remove the 'true' and try again from the previous step:
val df3 = df2.mapPartitions(records => {
  val mapper = new ObjectMapper with ScalaObjectMapper
  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
  mapper.registerModule(DefaultScalaModule)
  records.flatMap(record => {
    try {
      Some(mapper.readValue(record, classOf[Invoice]))
    } catch {
      case _: Exception => None
    }
  })
})
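Note that Dataset.mapPartitions also needs an implicit Encoder[Invoice]; import spark.implicits._ provides one because Invoice is a case class. For completeness, a sketch of wiring the fixed df3 back into the console sink, reusing the names from the question (outputMode and awaitTermination are additions here, not part of the original snippet):

// Records that fail to deserialize become None and are dropped by flatMap,
// so df3 contains only successfully parsed Invoice rows.
val query = df3.writeStream
  .format("console")
  .outputMode("append")
  .start()

query.awaitTermination()  // keep the driver alive so the streaming query keeps running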