I am trying to join two tables to get the country code for each IP address; screenshots of both tables are attached. How can I join these two tables? I am using Zeppelin.
/** Converts a dotted IP string into a single numeric value.
  *
  * The input is split on '.', each segment is parsed as an Int, reduced
  * modulo 256, and weighted by 256^(3 - index), so the first segment is the
  * most significant. The weighted segments are summed.
  *
  * @param dottedIP dot-separated address text, e.g. "1.2.3.4"
  * @return the sum of the weighted segments
  * @throws NumberFormatException if a segment is not a valid integer
  */
def ipToLong(dottedIP: String): Long =
  dottedIP
    .split("\\.")
    .zipWithIndex
    .foldLeft(0L) { case (total, (segment, position)) =>
      // Weight is 256^(3 - position): index 0 is the highest-order octet.
      total + ((segment.toInt % 256) * Math.pow(256, 3 - position)).toLong
    }
// Read the audit CSV as raw text lines. `sc` is expected to be the
// SparkContext already in scope (it is not defined in this file).
val rdd1 = sc.textFile("/user/mamta/mamta_audit/mamta_audit.csv")
// One audit row; every column is kept as the raw CSV string.
case class IPCode(Date_key:String,LogID:String,Activity:String,SourceIP:String)
// Split each line on commas and map the first four fields onto IPCode.
// The -1 limit keeps trailing empty fields: with the default limit a row whose
// last column is empty loses that field and cols(3) would throw
// ArrayIndexOutOfBoundsException.
// NOTE(review): a plain comma split does not handle quoted fields or a header
// row — confirm the file has neither.
val sal1 = rdd1
  .map(_.split(",", -1))
  .map(cols => IPCode(cols(0), cols(1), cols(2), cols(3)))
  .toDF("Date_key","LogID","Activity","SourceIP")
// Read the IP-range-to-location CSV as raw text lines. `sc` is expected to be
// the SparkContext already in scope (it is not defined in this file).
val rdd2 = sc.textFile("/user/mamta/IP_LocationCode.csv")
// One location row; every column is kept as the raw CSV string.
case class IPLoc(ip_from:String,
ip_to:String,
Country_Code:String,
Region_Name:String,
City_Name:String
)
// Split each line on commas and map the first five fields onto IPLoc.
// The -1 limit keeps trailing empty fields: with the default limit a row whose
// last column (City_Name) is empty loses that field and cols(4) would throw
// ArrayIndexOutOfBoundsException.
// NOTE(review): a plain comma split does not handle quoted fields or a header
// row — confirm the file has neither.
val sal2 = rdd2
  .map(_.split(",", -1))
  .map(cols => IPLoc(cols(0), cols(1), cols(2), cols(3), cols(4)))
  .toDF("ip_from","ip_to","Country_Code","Region_Name","City_Name")
I tried this, but it gives me null values for ip_from, ip_to and the country code:
// SourceIP, ip_from and ip_to are all string columns, so comparing them directly
// is a lexicographic string comparison, not a numeric range test; rows then fail
// to match and the left join fills the right-hand columns with null.
// Convert every IP column to a number before comparing.
// NOTE(review): assumes each column holds either dotted-quad IPv4 text
// ("1.2.3.4") or an already-numeric IP value ("16909060") — confirm against the data.
val ipAsNumber = org.apache.spark.sql.functions.udf { (raw: String) =>
  Option(raw).map(_.trim).flatMap { text =>
    text.split("\\.") match {
      // No dots: treat the value as an already-numeric IP.
      case Array(whole) => scala.util.Try(whole.toLong).toOption
      // Four dot-separated parts: fold the octets into one number, most significant first.
      case octets if octets.length == 4 =>
        scala.util.Try(octets.map(_.toInt)).toOption
          .filter(_.forall(o => o >= 0 && o <= 255))
          .map(_.foldLeft(0L)((acc, o) => acc * 256L + o))
      // Anything else is unparseable; None becomes SQL null, which never satisfies the join.
      case _ => None
    }
  }
}
sal1.join(sal2,
ipAsNumber(sal1("SourceIP")) >= ipAsNumber(sal2("ip_from")) && ipAsNumber(sal1("SourceIP")) <= ipAsNumber(sal2("ip_to")),
"left"
).show()
答案 0 :(得分:1)
您的 ipToLong 方法需要转换为 UDF,才能应用于连接条件中的 IP 列,如下所示:
// Sample source addresses for the demo: three dotted-quad strings and one
// deliberately malformed entry, in a single-column DataFrame named "SourceIP".
val sal1 = Seq("109.175.191.0", "invalid.ip", "187.42.62.209", "89.142.219.5")
  .toDF("SourceIP")
// Sample lookup table for the demo: (range start, range end, country label).
// The last row carries malformed addresses on purpose.
val sal2 = Seq[(String, String, String)](
  ("75.0.0.0",  "89.255.255.255",  "Country A"),
  ("90.0.0.0",  "129.255.255.255", "Country B"),
  ("130.0.0.0", "199.255.255.255", "Country C"),
  ("bad.ip",    "bad.ip",          "Country Z")
).toDF("ip_from", "ip_to", "country")
import org.apache.spark.sql.functions._
/** Builds a Spark UDF that maps a dotted-quad IPv4 string to its numeric value.
  *
  * The UDF returns -1 for anything that is not a valid IPv4 address: null,
  * text that does not match the four-octet pattern, or an octet above 255.
  * Whitespace around the address is tolerated.
  *
  * @return a UDF taking one string column and producing a long column
  */
def ipToLongUDF = {
  // Compiled once per UDF instance instead of once per row. The capture groups
  // exclude the surrounding whitespace that the pattern allows, so parsing the
  // octets cannot fail on an input such as " 1.2.3.4 ".
  val patternIPv4 = """\s*(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})\s*""".r
  udf(
    (ip: String) => ip match {
      case null => -1L
      case patternIPv4(a, b, c, d) =>
        val octets = Seq(a, b, c, d).map(_.toInt)
        // Integer arithmetic only; the first octet is the most significant.
        if (octets.forall(_ <= 255)) octets.foldLeft(0L)((acc, octet) => acc * 256L + octet)
        else -1L
      case _ => -1L
    }
  )
}
// Left-join each source address to the range that contains it, comparing the
// numeric forms of the addresses. The `>= 0` guard keeps source rows whose
// converted value is negative from matching any range.
val toNumeric = ipToLongUDF
val sourceValue = toNumeric(sal1("SourceIP"))
val rangeStart = toNumeric(sal2("ip_from"))
val rangeEnd = toNumeric(sal2("ip_to"))
val withinRange = sourceValue >= 0 && sourceValue >= rangeStart && sourceValue <= rangeEnd
sal1.join(sal2, withinRange, "left").show
// +-------------+---------+---------------+---------+
// | SourceIP| ip_from| ip_to| country|
// +-------------+---------+---------------+---------+
// |109.175.191.0| 90.0.0.0|129.255.255.255|Country B|
// | invalid.ip| null| null| null|
// |187.42.62.209|130.0.0.0|199.255.255.255|Country C|
// | 89.142.219.5| 75.0.0.0| 89.255.255.255|Country A|
// +-------------+---------+---------------+---------+