我有以下数据样本:
文件名:sample.txt
| TRANSACTION_ID | ITEM_ID | AUC_END_DT | BD_ID | BD_SITE |
+----------------+--------------+------------+-----------+---------+
| 320562466 | 7322548247 | 5/22/2005 | 32148826 | 77 |
| 569643695009 | 190558793670 | 7/31/2011 | 112644812 | 0 |
以下是我正在运行的查询:
select * from table_name where item_id = '$item_id';
我需要将此 sample.txt 文件转换为序列文件（SequenceFile），然后为该序列文件创建 DataFrame 以进行进一步分析。
// Schema for one data row of sample.txt (columns in file order:
// TRANSACTION_ID | ITEM_ID | AUC_END_DT | BD_ID | BD_SITE).
// NOTE(review): transaction_id is declared Double although the sample values
// (e.g. 320562466, 569643695009) are integral — confirm whether Long was intended.
case class db_col( transaction_id:Double,
item_id:Long,
auc_end_dt:String, // date kept as raw text, e.g. "5/22/2005" — not parsed anywhere in this file
bd_id:Long,
bd_site:Int)
object V_bd {
  /**
    * Entry point: loads the pipe-delimited sample.txt, builds a DataFrame
    * of `db_col` rows, registers it as a temp table, and shows the rows
    * matching the ITEM_ID passed as args(0).
    */
  def main(args: Array[String]) {
    val item_id_args = args(0)
    val conf = new SparkConf().setAppName("POC_Naren").setMaster("local")
    val sc = new SparkContext(conf)
    val ssc = new SQLContext(sc)
    import ssc.implicits._

    val dw_bid_base_rdd = sc.textFile("C:/Users/Downloads/sqlscript/reference/data/sample.txt")

    // Keep only data rows: drop the "+----+" separator line and the header
    // row (the original code fed these straight into .toDouble and crashed).
    val data_rows_rdd = dw_bid_base_rdd
      .filter(row => row.startsWith("|") && !row.contains("TRANSACTION_ID"))

    // Each data row begins with '|', so split("\\|") produces an empty first
    // token — the real columns sit at indices 1..5, not 0..4.
    val bd_trans_rdd = data_rows_rdd.map(row => row.split("\\|"))
    val bd_col_rdd = bd_trans_rdd.map(p =>
      db_col(
        p(1).trim.toDouble, // TRANSACTION_ID
        p(2).trim.toLong,   // ITEM_ID  (original passed the raw, untrimmed String to a Long field)
        p(3).trim,          // AUC_END_DT
        p(4).trim.toLong,   // BD_ID    (original had typos: p(.3) and .tolong)
        p(5).trim.toInt))   // BD_SITE
    val bd_df_rdd = bd_col_rdd.toDF()
    bd_df_rdd.registerTempTable("bd_table")

    // Fixes vs. the original query string:
    //  - query the temp table actually registered above ("bd_table", not "table_name")
    //  - use the s-interpolator so $item_id_args is substituted instead of sent literally
    //  - compare the numeric item_id column without quotes
    //  - drop the trailing ';', which Spark SQL rejects
    val bd_table_query = ssc.sql(s"select * from bd_table where item_id = $item_id_args")
    bd_table_query.show()
  }
}
答案 0（得分：2）：
您需要将 DataFrame 转换为 RDD[(K, V)]。示例：
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, DataFrame}
// Placeholder for the DataFrame produced earlier in the pipeline.
val bd_table_query : DataFrame = ???
// SequenceFiles store key/value records, so the DataFrame must first become
// a pair RDD. BUG FIX: the original mapped over an undefined `df`; it must
// use the `bd_table_query` value declared just above.
val rdd : RDD[(Int,String)] = bd_table_query.rdd.map {
case r : Row => (r.getAs[Int](0),r.getAs[String](1)) // pick your own key/value columns and types
}
然后你可以保存RDD:
// Write the pair RDD out as a Hadoop SequenceFile at "output.seq".
// NOTE(review): saveAsSequenceFile is provided via Spark's implicit
// SequenceFileRDDFunctions for RDD[(K, V)] — assumes K and V are
// Writable-convertible (Int/String are); confirm the implicits are in scope.
rdd.saveAsSequenceFile("output.seq")