I know this question has been asked many times, but none of the answers helped in my case.
Below is my Spark code:
class ParseLogs extends java.io.Serializable {
def formLogLine(logLine: String): (String,String,String,Int,String,String,String,Int,Float,String,String,Float,Int,String,Int,Float,String) = {
//some logic
//return value
(recordKey._2.toString().replace("\"", ""),recordKey._3,recordKey._4,recordKey._5,recordKey._6,recordKey._8,sbcId,recordKey._10,recordKey._11,recordKey._12,recordKey._13.trim(),LogTransferTime,contentAccessed,OTT,dataTypeId,recordKey._14,logCaptureTime1)
}
}
val inputDf = spark.readStream
.format("kafka")
.option("kafka.bootstrap.servers", brokers)
.option("subscribe", topic)
.option("startingOffsets", "earliest")
.load()
val myDf = inputDf.selectExpr("CAST(value AS STRING)")
val df1 = myDf.map(line => new ParseLogs().formLogLine(line.get(0).toString()))
And I get this error:
User class threw exception: org.apache.spark.sql.streaming.StreamingQueryException: Text data source supports only a single column, and you have 17 columns.;
Answer (score: 1)
Use a UDF to convert the logLine into what you need. For example:
spark.sqlContext.udf.register("YOURLOGIC", (logLine: String) => {
//some logic
(recordKey._2.toString().replace("\"",""),recordKey._3,recordKey._4,recordKey._5,recordKey._6,recordKey._8,sbcId,recordKey._10,recordKey._11,recordKey._12,recordKey._13.trim(),LogTransferTime,contentAccessed,OTT,dataTypeId,recordKey._14,logCaptureTime1)
})
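As a variation that is not in the original answer: if the function returns a case class instead of a tuple, Spark maps it to a struct whose fields keep the case class's names, so the later select can use names rather than positions. The LogRecord type and the comma-split parsing below are simplified placeholders, not the real formLogLine logic:

// LogRecord and the split-based parsing are illustrative assumptions only
case class LogRecord(recordKey: String, sbcId: String, dataTypeId: Int)

spark.sqlContext.udf.register("YOURLOGIC", (logLine: String) => {
  val parts = logLine.split(",") // stand-in for the real parsing logic
  LogRecord(parts(0).replace("\"", ""), parts(1), parts(2).trim.toInt)
})

Either way, the streaming read stays the same: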
val inputDf = spark.readStream
.format("kafka")
.option("kafka.bootstrap.servers", brokers)
.option("subscribe", topic)
.option("startingOffsets", "earliest")
.load()
val myDf = inputDf.selectExpr("CAST(value AS STRING)")
val df1 = myDf.selectExpr("YOURLOGIC(value) as result")
val result = df1.select(
  df1("result").getItem(0),
  df1("result").getItem(1),
  df1("result").getItem(2),
  df1("result").getItem(3),
  // ...and so on for the remaining fields; with 17 items the last index is 16
  df1("result").getItem(16))