如何在Java / Scala中将CSV文件值解析为MatrixEntry

时间:2015-05-12 10:40:25

标签: java scala csv apache-spark

我在Scala中获得了此代码,我必须将其更改为Java:

private async void btnNextQuestion_Click(object sender, RoutedEventArgs e)
    {
        // Activate the progress ring, then navigate to the question page.
        // The navigation parameter is "surveyId|nextFieldId".
        await Task.Run(StartProgrssRing);
        Frame.Navigate(typeof(Question), $"{_survey.Id}|{_nextField.Id}");
    }

    // Turns on the indeterminate progress ring.
    // (The "Progrss" typo in the name is kept: callers reference it by this name.)
    private void StartProgrssRing() => ProgressRing1.IsActive = true;

我已经做到了这一点:

import au.com.bytecode.opencsv.CSVParser
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.SingularValueDecomposition
import org.apache.spark.mllib.linalg.Vector
import scala.collection.immutable.List
import java.io._
import java.nio.file.{Paths, Files}
import java.nio.charset.StandardCharsets
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.Matrices
import org.apache.spark.mllib.linalg.DenseMatrix
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix

def exportMatrix(matrix:Array[Double], filename: String, numCols:Int, numRows:Int) = {
  // Writes a column-major matrix (as produced by MLlib's Matrix.toArray)
  // to `filename` as CSV, with a synthetic header row "word0,word1,...".
  //
  // matrix   - the numRows*numCols values in column-major order
  // filename - destination path (overwritten if it exists)
  // numCols  - number of columns to emit
  // numRows  - number of rows to emit
  val pw = new PrintWriter(filename)
  try {
    // Header row: word0,word1,...,word{numCols-1}
    for (columnIndex <- 0 until numCols) {
      pw.print("word" + columnIndex)
      if (columnIndex == numCols - 1) pw.println() else pw.print(",")
    }

    // Data rows. The array is column-major, so element (row, col)
    // lives at index numRows * col + row.
    for (rowIndex <- 0 until numRows; columnIndex <- 0 until numCols) {
      pw.print(matrix(numRows * columnIndex + rowIndex))
      if (columnIndex == numCols - 1) pw.println() else pw.print(",")
    }
    pw.flush()
  } finally {
    // Always release the file handle, even if a write throws.
    pw.close()
  }
}

def exportRowMatrix(matrix:RDD[String], fileName: String) = {
  // Collects the RDD to the driver and writes one element per line to
  // `fileName`. NOTE: collect() materialises the whole RDD in driver
  // memory — only suitable for matrices that fit on a single machine.
  val pw = new PrintWriter(fileName)
  try {
    matrix.collect().foreach(pw.println)
    pw.flush()
  } finally {
    // Always release the file handle, even if a write throws.
    pw.close()
  }
}

// Load the sparse matrix. Each line is "rowIndex colIndex value",
// whitespace-separated, with 1-based (Matrix Market style) indices.
val csv = sc.textFile("hdfs://myhost/sparse.csv").cache()  // original file

// Parse each line into a MatrixEntry. mapPartitions creates one CSVParser
// per partition instead of one per line (CSVParser is not serializable-cheap).
val data = csv.mapPartitions(lines => {
    val parser = new CSVParser(' ')
    lines.map(line => parser.parseLine(line))
  }).map(fields => {
    // Convert 1-based input indices to the 0-based indices MatrixEntry expects.
    // Use toDouble (not toInt): MatrixEntry's value is a Double and Matrix
    // Market files may contain non-integer entries.
    MatrixEntry(fields(0).toLong - 1, fields(1).toLong - 1, fields(2).toDouble)
  }
)

val indexedRowMatrix: IndexedRowMatrix = new CoordinateMatrix(data).toIndexedRowMatrix()

// Truncated SVD keeping the top 100 singular values; computeU = true also
// materialises the (distributed) left singular vectors.
val svd: SingularValueDecomposition[IndexedRowMatrix, Matrix] = indexedRowMatrix.computeSVD(100, computeU = true)

val U: IndexedRowMatrix = svd.U // distributed left singular vectors
val S: Vector = svd.s // singular values, stored in a local dense vector
val V: Matrix = svd.V // right singular vectors, a local dense matrix

val sArray: Array[Double] = S.toArray // column-major (trivially: one row)
val vArray: Array[Double] = V.toArray // column-major

// One comma-separated line per row of U, for exportRowMatrix.
val rdd = U.rows.map(row => row.vector.toArray.mkString(","))

exportMatrix(sArray, "../S.csv", S.size, 1)
exportMatrix(vArray, "../V.csv", V.numCols.toInt, V.numRows.toInt)
exportRowMatrix(rdd, "../U.csv")

// D = V * diag(S): scales each right singular vector by its singular value.
val diag = Matrices.diag(S)
val D = new DenseMatrix(diag.numRows, diag.numCols, diag.toArray)
val multiplyResult = V.multiply(D)

val dArray = multiplyResult.toArray
exportMatrix(dArray, "../D.csv", multiplyResult.numCols, multiplyResult.numRows)

我的问题是:

  • 如何将Matrix Market格式的每一行解析为MatrixEntry?它应该由csv.mapPartitions()完成吗?
  • 如何在Java中定义像exportMatrix这样的函数?它和普通的Java函数一样吗?

0 个答案:

没有答案