Input sample dataset
+--------------+-------+-----------+
| col_a        | col_b | col_label |
+--------------+-------+-----------+
| aa           | hy    | Ford      |
| bb           | NA    | Mahindra  |
|              | my    | Ford      |
| de           | cz    | Toyota    |
| NA Except xy | mg    | Hyundai   |
| NA           | gh    | Toyota    |
+--------------+-------+-----------+
Expected output dataset
+--------------+-------+-------+-------+-----------+------+----------+--------+---------+-----+------+--------+--------+--------+---------+
| col_a        | col_b | col_c | col_d | col_brand | Ford | Mahindra | Toyota | Hyundai | BMW | AUDI | Nissan | flag_a | flag_b | flag_ab |
+--------------+-------+-------+-------+-----------+------+----------+--------+---------+-----+------+--------+--------+--------+---------+
| aa           | hy    | aahy  |       | Ford      | yes  | no       | no     | no      | no  | no   | no     | 0      | 0      | 1       |
| bb           | NA    | bb    |       | Mahindra  | no   | yes      | no     | no      | no  | no   | no     | 1      | 0      | 0       |
| NA           | my    | my    |       | Ford      | yes  | no       | no     | no      | no  | no   | no     | 0      | 1      | 0       |
| de           | cz    | decz  |       | Toyota    | no   | no       | yes    | no      | no  | no   | no     | 0      | 0      | 1       |
| NA except xy | mg    | mg    | xy    | Hyundai   | no   | no       | no     | yes     | no  | no   | no     | 0      | 1      | 0       |
| NA           | gh    | gh    |       | Toyota    | no   | no       | yes    | no      | no  | no   | no     | 0      | 1      | 0       |
+--------------+-------+-------+-------+-----------+------+----------+--------+---------+-----+------+--------+--------+--------+---------+
Notes:
Column 1: col_a remains unchanged.
Column 2: col_b remains unchanged.
Column 3: col_c => concat(col_a, col_b); values that are NA, "NA except ...", or NULL must be treated as blank ("").
Column 4: col_d => capture any data that follows "NA except" in col_a.
Column 5: col_brand remains unchanged.
Columns 6 to 12 => the static list of brands is Ford, Mahindra, Toyota, Hyundai, BMW, AUDI, Nissan; compare col_brand against this static list and assign yes or no in the corresponding row.
Columns 13 to 15 (flag_a, flag_b and flag_ab) => assign 1 or 0 based on which of col_a and col_b hold a real value (flag_a = 1 when only col_a does, flag_b = 1 when only col_b does, flag_ab = 1 when both do); NA, "NA except ...", and NULL must be treated as blank ("").
Thanks
Answer 0 (score: 2)
Check the following SQL (run via spark-shell), in which 'NA' and 'NA except' are handled case-insensitively, both when setting the temporary flag_a/flag_b (in the CTE) and when used as the delimiter to retrieve col_d:
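For reference, df is assumed to already hold the sample input with the question's column names (col_a, col_b, col_label); a minimal sketch to build it in spark-shell might look like this:
// Sketch only: recreate the question's sample input by hand; in spark-shell the implicits needed for toDF are already in scope.
// The blank col_a in the third row is represented as an empty string here (it could also be null).
val df = Seq(
  ("aa", "hy", "Ford"),
  ("bb", "NA", "Mahindra"),
  ("", "my", "Ford"),
  ("de", "cz", "Toyota"),
  ("NA Except xy", "mg", "Hyundai"),
  ("NA", "gh", "Toyota")
).toDF("col_a", "col_b", "col_label")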
df.createOrReplaceTempView("df_table")
spark.sql("""
WITH t1 AS (
SELECT IF(col_a = '' OR col_a is NULL, 'NA', col_a) as col_a
, IF(col_b = '' OR col_b is NULL, 'NA', col_b) as col_b
, IF(col_a rlike '^(?i)NA(?:\\s+except\\b|$)' OR col_a = '' OR col_a is NULL, 0, 1) as flag_a
, IF(col_b rlike '^(?i)NA(?:\\s+except\\b|$)' OR col_b = '' OR col_b is NULL, 0, 1) as flag_b
, col_label as col_brand
FROM df_table
)
SELECT col_a
, col_b
, concat(IF(flag_a > 0, col_a, ''), IF(flag_b > 0, col_b, '')) as col_c
, coalesce(split(col_a, '(?i)NA except ')[1], '') as col_d
, col_brand
, IF(col_brand = 'Ford', 'yes', 'no') as Ford
, IF(col_brand = 'Mahindra', 'yes', 'no') as Mahindra
, IF(col_brand = 'Toyota', 'yes', 'no') as Toyota
, IF(col_brand = 'Hyundai', 'yes', 'no') as Hyundai
, IF(col_brand = 'BMW', 'yes', 'no') as BMW
, IF(col_brand = 'AUDI', 'yes', 'no') as AUDI
, IF(col_brand = 'Nissan', 'yes', 'no') as Nissan
, IF(flag_a = 1 AND flag_b = 0, 1, 0) as flag_a
, IF(flag_a = 0 AND flag_b = 1, 1, 0) as flag_b
, IF(flag_a = 1 AND flag_b = 1, 1, 0) as flag_ab
FROM t1
""").show
+------------+-----+-----+-----+---------+----+--------+------+-------+---+----+------+------+------+-------+
| col_a|col_b|col_c|col_d|col_brand|Ford|Mahindra|Toyota|Hyundai|BMW|AUDI|Nissan|flag_a|flag_b|flag_ab|
+------------+-----+-----+-----+---------+----+--------+------+-------+---+----+------+------+------+-------+
| aa| hy| aahy| | Ford| yes| no| no| no| no| no| no| 0| 0| 1|
| bb| NA| bb| | Mahindra| no| yes| no| no| no| no| no| 1| 0| 0|
| NA| my| my| | Ford| yes| no| no| no| no| no| no| 0| 1| 0|
| de| cz| decz| | Toyota| no| no| yes| no| no| no| no| 0| 0| 1|
|NA Except xy| mg| mg| xy| Hyundai| no| no| no| yes| no| no| no| 0| 1| 0|
| NA| gh| gh| | Toyota| no| no| yes| no| no| no| no| 0| 1| 0|
+------------+-----+-----+-----+---------+----+--------+------+-------+---+----+------+------+------+-------+
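A side note, not part of the original answer: if the static brand list ever changes, the seven IF expressions above could also be generated from a list instead of being written by hand. A rough DataFrame-API sketch under that assumption (brands and withBrandFlags are illustrative names, df is the same input DataFrame):
import org.apache.spark.sql.functions._
// Sketch: derive one yes/no column per brand from the question's static list
val brands = Seq("Ford", "Mahindra", "Toyota", "Hyundai", "BMW", "AUDI", "Nissan")
val withBrandFlags = brands.foldLeft(df.withColumnRenamed("col_label", "col_brand")) { (acc, b) =>
  acc.withColumn(b, when(col("col_brand") === b, "yes").otherwise("no"))
}
withBrandFlags.show(false)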
Answer 1 (score: 1)
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
object sampleApp {
def main(args: Array[String]) = {
val spark = SparkSession.builder().master("local").getOrCreate()
import spark.sqlContext.implicits._
val df = Seq(("aa","hy","Ford"),("bb","NA","Manindra"),("","my","Ford"),("de","cz","Toyota"),
("NA Except xy","mg","Hyundai"),("NA","gf","Toyota")).toDF("col_a","col_b","col_brand")
// UDF: blank ("") any value containing "na", "except" or "between" (case-insensitive)
// so that it does not contribute to col_c
val chkrules = udf((a: String) => {
val a1 = (a.toLowerCase.contains("na") || a.toLowerCase.contains("except") || a.toLowerCase.contains("between")) match {
case true => ""
case false => a
}
a1
})
// Generate every n-bit binary string by repeated doubling (for n = 2: "00", "10", "01", "11"),
// then drop the all-zero pattern, which stands for "no real value in any column"
def generateArr(n: Int): Array[String] = {
if (n <= 0) return Array.empty[String]
var arr: Array[String] = Array("0", "1")
var i: Int = 2
var j: Int = 0
while (i < (1 << n)) {
// copy the existing i entries so each prefix can be extended with both "0" and "1"
j = 0
while (j < i) {
arr = arr :+ arr(j)
j += 1
}
// the first half gets a trailing "0" ...
j = 0
while (j < i) {
arr(j) = arr(j) + "0"
j += 1
}
// ... the second half gets a trailing "1"
j = i
while (j < 2 * i) {
arr(j) = arr(j) + "1"
j += 1
}
i = i << 1
}
arr.filter(_ != "0" * n)
}
// Pair each bit pattern with its flag-column name, keeping a 1-based position
def create_flg(n_size: Int, cols: String) = {
val flgBin = generateArr(n_size)
val colsArr = cols.split(",")
(flgBin zip colsArr.toIterable).zipWithIndex.map { case (c_flg, i) => (i + 1, c_flg._1, c_flg._2) }
}
val flgValues = create_flg(2, "col_a,col_b,col_ab") //output: Array((1,"10","col_a"), (2,"01","col_b"), (3,"11","col_ab"))
val flgClms = Seq("flg_a","flg_b","flg_ab").zipWithIndex
// UDF: encode which of (col_a, col_b) hold a real value as a bit string, then
// return 1 for the matching flag position (flg_a, flg_b, flg_ab) and 0 for the others
val flgChk = udf((a:String,b:String) =>{
val ar = Seq(a,b)
val a1 = ar.map(x =>
(x.toLowerCase.contains("na") || x.toLowerCase.contains("except") || x.toLowerCase.contains("between")|| x.trim == "") match {
case true => 0
case false => 1
}) //output: 00 or 10 or 01 or 11
val out = flgValues.toSeq.sortBy(_._1).map(kv => {
kv._2 == a1.mkString("") match {
case true => 1
case false => 0
}
})
out
})
println("Input:")
df.show(false)
// col_c: concat of the cleaned col_a/col_b; col_d: text after "NA Except"; flg: [flg_a, flg_b, flg_ab]
val finaldf = df.withColumn("col_c",concat(chkrules($"col_a"),chkrules($"col_b")))
.withColumn("col_d",regexp_extract($"col_a",".*NA Except\\s*([^\n\r]*)",1)).withColumn("flg",flgChk($"col_a",$"col_b"))
println("Creating flg column")
finaldf.show(false)
// keep every column as-is, except "flg", which is expanded into flg_a, flg_b and flg_ab
val clms = finaldf.columns.map(z => z == "flg" match{
case false => Seq(col(z))
case true => flgClms.map { case (i,j) => col(z).getItem(j).as(i)}
})
//finaldf.select(clms.flatten:+ $"flg" :_*).show(false)
// pivot col_brand into one yes/no column per brand value present in the data
val final_df_pivot = finaldf.groupBy(clms.flatten:_*)
.pivot("col_brand").agg(coalesce(first(lit("yes")),lit("no")))
println("Output:")
final_df_pivot.show(false)
}
}
Input:
+------------+-----+---------+
|col_a |col_b|col_brand|
+------------+-----+---------+
|aa |hy |Ford |
|bb |NA |Manindra |
| |my |Ford |
|de |cz |Toyota |
|NA Except xy|mg |Hyundai |
|NA |gf |Toyota |
+------------+-----+---------+
Creating flg column:
+------------+-----+---------+-----+-----+---------+
|col_a |col_b|col_brand|col_c|col_d|flg |
+------------+-----+---------+-----+-----+---------+
|aa |hy |Ford |aahy | |[0, 0, 1]|
|bb |NA |Manindra |bb | |[1, 0, 0]|
| |my |Ford |my | |[0, 1, 0]|
|de |cz |Toyota |decz | |[0, 0, 1]|
|NA Except xy|mg |Hyundai |mg |xy |[0, 1, 0]|
|NA |gf |Toyota |gf | |[0, 1, 0]|
+------------+-----+---------+-----+-----+---------+
Output:
+------------+-----+---------+-----+-----+-----+-----+------+----+-------+--------+------+
|col_a |col_b|col_brand|col_c|col_d|flg_a|flg_b|flg_ab|Ford|Hyundai|Manindra|Toyota|
+------------+-----+---------+-----+-----+-----+-----+------+----+-------+--------+------+
|NA Except xy|mg |Hyundai |mg |xy |0 |1 |0 |no |yes |no |no |
|bb |NA |Manindra |bb | |1 |0 |0 |no |no |yes |no |
|aa |hy |Ford |aahy | |0 |0 |1 |yes |no |no |no |
| |my |Ford |my | |0 |1 |0 |yes |no |no |no |
|de |cz |Toyota |decz | |0 |0 |1 |no |no |no |yes |
|NA |gf |Toyota |gf | |0 |1 |0 |no |no |no |yes |
+------------+-----+---------+-----+-----+-----+-----+------+----+-------+--------+------+
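One closing observation, again my note rather than the answer's: pivot derives its output columns from the values actually present in col_brand, which is why the BMW, AUDI and Nissan columns from the expected output do not appear above. If all seven brand columns are wanted regardless of the data, the value list can be passed to pivot explicitly, roughly like this (allBrands and final_df_pivot_all are illustrative names; the list must match the spelling used in the data, which in this answer's sample is "Manindra"):
// Sketch: give pivot an explicit value list so every brand column is emitted,
// even for brands that never occur in the data
val allBrands = Seq("Ford", "Manindra", "Toyota", "Hyundai", "BMW", "AUDI", "Nissan")
val final_df_pivot_all = finaldf.groupBy(clms.flatten: _*)
  .pivot("col_brand", allBrands)
  .agg(coalesce(first(lit("yes")), lit("no")))
final_df_pivot_all.show(false)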