Joining multiple DataFrames in Spark

Time: 2017-03-11 12:18:55

Tags: scala apache-spark apache-spark-sql

I have multiple DataFrames loaded from CSV files, and I want to join them on a common column. I just want to make the process automatic. This is what I did:

val spark = SparkSession.builder.master("local").appName("my-spark-app").getOrCreate()
import spark.implicits._

val df1 = spark.read.option("inferSchema", "true").option("header", "true").csv("C:/Users/mhattabi/Desktop/dataTestCsvFile/dataTest1.txt")
val df2 = spark.read.option("inferSchema", "true").option("header", "true").csv("C:/Users/mhattabi/Desktop/dataTestCsvFile/dataTest2.txt")
val df3 = spark.read.option("inferSchema", "true").option("header", "true").csv("C:/Users/mhattabi/Desktop/dataTestCsvFile/dataTest3.txt")

df1.show
df2.show
df3.show

val df = List(df1, df2, df3).reduce((a, b) => a.join(b, Seq("time"), joinType = "outer"))
df.show

The problem is that the result only joins two of the DataFrames, not all three. Thanks.
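(For reference, here is a minimal self-contained sketch of the same reduce-based join, with hypothetical in-memory frames standing in for the CSV files. If only two frames end up joined, the usual cause is that the key column is not named "time" in every file.)

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder.master("local").appName("join-sketch").getOrCreate()
import spark.implicits._

// Hypothetical stand-ins for the three CSV files; all share a "time" column.
val df1 = Seq((1, "a"), (2, "b")).toDF("time", "v1")
val df2 = Seq((2, "c"), (3, "d")).toDF("time", "v2")
val df3 = Seq((1, "e"), (3, "f")).toDF("time", "v3")

// reduce applies the outer join pairwise, so every "time" value seen in
// any frame survives into the final result (missing cells become null).
val joined = List(df1, df2, df3).reduce((a, b) => a.join(b, Seq("time"), joinType = "outer"))
joined.show()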

1 Answer:

Answer 0 (score: 1)

Here is the solution:

import org.apache.spark.sql.DataFrame

val df1 = spark.read.option("inferSchema", "true").option("header", "true").csv("C:/Users/mhattabi/Desktop/dataTestCsvFile/dataTest1.txt")
val df2 = spark.read.option("inferSchema", "true").option("header", "true").csv("C:/Users/mhattabi/Desktop/dataTestCsvFile/dataTest2.txt")
val df3 = spark.read.option("inferSchema", "true").option("header", "true").csv("C:/Users/mhattabi/Desktop/dataTestCsvFile/dataTest3.txt")

val df_result = recursiveJoinOnDate(List(df1, df2, df3))
df_result.show

// Recursively outer-joins all the DataFrames in the list on the shared time
// column (here a column named "time.1"; the backticks keep the dot from
// being read as nested-field access). The name must match the actual
// column name in the files.
def recursiveJoinOnDate(list: List[DataFrame]): DataFrame =
  if (list.isEmpty) null
  else if (list.size > 1) list.head.join(recursiveJoinOnDate(list.tail), Seq("`time.1`"), joinType = "outer")
  else list.head
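A slightly more idiomatic variant of the same recursion uses pattern matching (a sketch: joinAllOnTime is a hypothetical name, and it assumes the key column is literally called "time"):

import org.apache.spark.sql.DataFrame

// Sketch: same pairwise outer join, but the empty case fails fast
// instead of returning null.
def joinAllOnTime(frames: List[DataFrame]): DataFrame = frames match {
  case Nil          => throw new IllegalArgumentException("nothing to join")
  case head :: Nil  => head
  case head :: tail => head.join(joinAllOnTime(tail), Seq("time"), joinType = "outer")
}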