我有一个矩阵,列数和行数是未知的
Matrix的一个例子是:
[5,1.3]
[1,5.2]
我想将其转换为DataFrame,列名可以是任意的,如何实现呢?这是我期望的结果:
+----+-----+
| _1 | _2  |
+----+-----+
| 5  | 1.3 |
| 1  | 5.2 |
+----+-----+
答案 0 :(得分:1)
我建议先将矩阵转换为RDD,再将RDD转换为DataFrame。这不是最理想的方法,但在Spark 2.0.0中可以正常工作。
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
/**
 * Small demo application that converts an MLlib [[Matrix]] of any size into a
 * Spark DataFrame with one DataFrame row per matrix row and auto-generated
 * column names `_1`, `_2`, ...
 */
object mat2df {

  /**
   * Converts a matrix into a DataFrame.
   *
   * The number of columns is taken from the matrix itself, so the conversion
   * works for any dimensions (including an empty matrix) instead of being
   * tied to a fixed tuple arity.
   *
   * @param spark session used to create the RDD and the DataFrame
   * @param m     matrix to convert
   * @return a DataFrame with `m.numCols` non-nullable `Double` columns named
   *         `_1` .. `_n`, containing one row per row of `m`
   */
  def toDataFrame(spark: SparkSession, m: Matrix): DataFrame = {
    // Column names follow the tuple-style naming Spark uses for unnamed columns.
    val schema = StructType(
      (1 to m.numCols).map(i => StructField(s"_$i", DoubleType, nullable = false))
    )
    // rowIter yields one vector per matrix row; materialise it strictly
    // before handing the data to parallelize.
    val localRows = m.rowIter.map(v => Row.fromSeq(v.toArray.toSeq)).toList
    spark.createDataFrame(spark.sparkContext.parallelize(localRows), schema)
  }

  def main(args: Array[String]): Unit = {
    // A single session owns the underlying SparkContext; no separate context is needed.
    val spark = SparkSession.builder.appName("mat2df").master("local[1]").getOrCreate()
    try {
      // Matrices.dense expects the values in column-major order:
      // column 1 = (5, 1), column 2 = (1.3, 5.2).
      val values = Array(5.0, 1.0, 1.3, 5.2)
      val mat = Matrices.dense(2, 2, values)
      val df = toDataFrame(spark, mat) // matrix to dataframe
      df.show()
    } finally {
      spark.stop()
    }
  }
}
答案 1 :(得分:1)
/**
 * Converts an MLlib matrix into a DataFrame with one DataFrame row per matrix
 * row and one nullable `Double` column per matrix column.
 *
 * @param sc            Spark context used to distribute the matrix rows
 * @param matrix        matrix to convert; any dimensions are accepted
 * @param m_nodeColName prefix for the generated column names; column `i`
 *                      (zero-based) is named `<m_nodeColName>_<i>`
 * @return a DataFrame with `matrix.numCols` columns and `matrix.numRows` rows
 */
def matrixToDataFrame(sc: SparkContext, matrix: Matrix, m_nodeColName: String): DataFrame = {
  // Iterate over rows (not columns) so the DataFrame keeps the matrix orientation.
  val rows = sc.parallelize(matrix.rowIter.toList).map(v => Row.fromSeq(v.toArray.toSeq))

  // One field per matrix column, built without mutable state.
  val schema = StructType(
    (0 until matrix.numCols).map { i =>
      StructField(s"${m_nodeColName}_$i", DoubleType, nullable = true)
    }
  )

  // Obtain the session from the caller's context configuration rather than
  // from an external holder object; getOrCreate returns an existing session
  // when one is already active.
  val spark = SparkSession.builder.config(sc.getConf).getOrCreate()
  spark.createDataFrame(rows, schema)
}