我正在尝试在一个类中定义一个成员函数,该类在从json文件解析数据时将用作UDF。我使用trait来定义一组方法和一个类来覆盖这些方法。
trait geouastr {
def getGeoLocation(ipAddress: String): Map[String, String]
def uaParser(ua: String): Map[String, String]
}
class GeoUAData(appName: String, sc: SparkContext, conf: SparkConf, combinedCSV: String) extends geouastr with Serializable {
val spark = SparkSession.builder.config(conf).getOrCreate()
val GEOIP_FILE_COMBINED = combinedCSV;
val logger = LogFactory.getLog(this.getClass)
val allDF = spark.
read.
option("header","true").
option("inferSchema", "true").
csv(GEOIP_FILE_COMBINED).cache
val emptyMap = Map(
"country" -> "",
"state" -> "",
"city" -> "",
"zipCode" -> "",
"latitude" -> 0.0.toString(),
"longitude" -> 0.0.toString())
override def getGeoLocation(ipAddress: String): Map[String, String] = {
val ipLong = ipToLong(ipAddress)
try {
logger.error("Entering UDF " + ipAddress + " allDF " + allDF.count())
val resultDF = allDF.
filter(allDF("network").cast("long") <= ipLong.get).
filter(allDF("broadcast") >= ipLong.get).
select(allDF("country_name"), allDF("subdivision_1_name"),allDF("city_name"),
allDF("postal_code"),allDF("latitude"),allDF("longitude"))
val matchingDF = resultDF.take(1)
val matchRow = matchingDF(0)
logger.error("Lookup for " + ipAddress + " Map " + matchRow.toString())
val geoMap = Map(
"country" -> nullCheck(matchRow.getAs[String](0)),
"state" -> nullCheck(matchRow.getAs[String](1)),
"city" -> nullCheck(matchRow.getAs[String](2)),
"zipCode" -> nullCheck(matchRow.getAs[String](3)),
"latitude" -> matchRow.getAs[Double](4).toString(),
"longitude" -> matchRow.getAs[Double](5).toString())
} catch {
case (nse: NoSuchElementException) => {
logger.error("No such element", nse)
emptyMap
}
case (npe: NullPointerException) => {
logger.error("NPE for " + ipAddress + " allDF " + allDF.count(),npe)
emptyMap
}
case (ex: Exception) => {
logger.error("Generic exception " + ipAddress,ex)
emptyMap
}
}
}
def nullCheck(input: String): String = {
if(input != null) input
else ""
}
override def uaParser(ua: String): Map[String, String] = {
val client = Parser.get.parse(ua)
return Map(
"os"->client.os.family,
"device"->client.device.family,
"browser"->client.userAgent.family)
}
def ipToLong(ip: String): Option[Long] = {
Try(ip.split('.').ensuring(_.length == 4)
.map(_.toLong).ensuring(_.forall(x => x >= 0 && x < 256))
.zip(Array(256L * 256L * 256L, 256L * 256L, 256L, 1L))
.map { case (x, y) => x * y }
.sum).toOption
}
}
我注意到uaParser工作正常,而getGeoLocation返回emptyMap(运行到NPE)。添加片段,显示我在main方法中如何使用它。
val appName = "SampleApp"
val conf: SparkConf = new SparkConf().setAppName(appName)
val sc: SparkContext = new SparkContext(conf)
val spark = SparkSession.builder.config(conf).enableHiveSupport().getOrCreate()
val geouad = new GeoUAData(appName, sc, conf, args(1))
val uaParser = Sparkudf(geouad.uaParser(_: String))
val geolocation = Sparkudf(geouad.getGeoLocation(_: String))
val sampleRdd = sc.textFile(args(0))
val json = sampleRdd.filter(_.nonEmpty)
import spark.implicits._
val sampleDF = spark.read.json(json)
val columns = sampleDF.select($"user-agent", $"source_ip")
.withColumn("sourceIp", $"source_ip")
.withColumn("geolocation", geolocation($"source_ip"))
.withColumn("uaParsed", uaParser($"user-agent"))
.withColumn("device", ($"uaParsed") ("device"))
.withColumn("os", ($"uaParsed") ("os"))
.withColumn("browser", ($"uaParsed") ("browser"))
.withColumn("country" , ($"geolocation")("country"))
.withColumn("state" , ($"geolocation")("state"))
.withColumn("city" , ($"geolocation")("city"))
.withColumn("zipCode" , ($"geolocation")("zipCode"))
.withColumn("latitude" , ($"geolocation")("latitude"))
.withColumn("longitude" , ($"geolocation")("longitude"))
.drop("geolocation")
.drop("uaParsed")
问题: 1.我们应该从一个类切换到另一个对象来定义UDF吗? (我可以把它保持为单身) 2.类成员函数可以用作UDF吗? 3.当调用这样的UDF时,像allDF这样的类成员会保持初始化吗? 4. Val被声明为成员变量 - 它是否会在构建geouad时初始化?
我是Scala的新手,提前感谢您的指导/建议。
答案 0 :(得分:0)
不,定义class
不需要从object
切换到UDF
,只有在调用UDF
时才会有所不同。
是的,您可以将类成员函数用作UDF
,但首先需要将该函数注册为UDF
。
spark.sqlContext.udf.register(“registeredName”,Class Method _)
不,在调用UDF
是的,类变量val将在调用geouad
并执行某些操作时初始化。