我正在使用Spark从hdfs读取csv文件。它进入了FSDataInputStream对象。我不能使用textfile()方法,因为它通过换行符拆分csv文件,我正在读取在文本字段中包含换行符的csv文件。来自sourcefourge的Opencsv处理单元格内的换行,这是一个不错的项目,但它接受一个Reader作为输入。我需要将它转换为字符串,以便我可以将它作为StringReader传递给opencsv。那么,HDFS文件 - > FSdataINputStream - >字符串 - > StringReader - >一个opencsv字符串列表。以下是代码......
import java.io._
import org.apache.spark.sql.SQLContext
import org.apache.hadoop.fs._
import org.apache.hadoop.conf._
import com.opencsv._
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import java.lang.StringBuilder
val conf = new Configuration()
val hdfsCoreSitePath = new Path("core-site.xml")
val hdfsHDFSSitePath = new Path("hdfs-site.xml")
conf.addResource(hdfsCoreSitePath)
conf.addResource(hdfsHDFSSitePath)
val fileSystem = FileSystem.get(conf)
val csvPath = new Path("/raw_data/project_name/csv/file_name.csv")
val csvFile = fileSystem.open(csvPath)
val fileLen = fileSystem.getFileStatus(csvPath).getLen().toInt
var b = Array.fill[Byte](2048)(0)
var j = 1
val stringBuilder = new StringBuilder()
var bufferString = ""
csvFile.seek(0)
csvFile.read(b)
var bufferString = new String(b,"UTF-8")
stringBuilder.append(bufferString)
while(j != -1) {b = Array.fill[Byte](2048)(0);j=csvFile.read(b);bufferString = new String(b,"UTF-8");stringBuilder.append(bufferString)}
val stringBuilderClean = new StringBuilder()
stringBuilderClean = stringBuilder.substring(0,fileLen)
val reader: Reader = new StringReader(stringBuilderClean.toString()).asInstanceOf[Reader]
val csv = new CSVReader(reader)
val javaContext = new JavaSparkContext(sc)
val sqlContext = new SQLContext(sc)
val javaRDD = javaContext.parallelize(csv.readAll())
//do a bunch of transformations on the RDD
它有效,但我怀疑它是可扩展的。这让我想知道有一个驱动程序有多大限制,它通过一个jvm管理所有数据。我对那些非常熟悉spark的人的问题是:
当您在整个数据集中对数据进行数据操作时会发生什么情况,甚至在它被放入输入RDD之前?它被视为任何其他程序,并且会像疯了一样交换出来我想?
您如何使任何火花程序可扩展?您是否总是需要将数据直接提取到输入RDD中?
答案 0 :(得分:3)
您的代码将数据加载到内存中,然后Spark驱动程序将拆分并将每个数据部分发送给执行程序,原因是它不可扩展。
有两种方法可以解决您的问题。
编写自定义InputFormat以支持CSV文件格式
import java.io.{InputStreamReader, IOException}
import com.google.common.base.Charsets
import com.opencsv.{CSVParser, CSVReader}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Seekable, Path, FileSystem}
import org.apache.hadoop.io.compress._
import org.apache.hadoop.io.{ArrayWritable, Text, LongWritable}
import org.apache.hadoop.mapred._
class CSVInputFormat extends FileInputFormat[LongWritable, ArrayWritable] with JobConfigurable {
private var compressionCodecs: CompressionCodecFactory = _
def configure(conf: JobConf) {
compressionCodecs = new CompressionCodecFactory(conf)
}
protected override def isSplitable(fs: FileSystem, file: Path): Boolean = {
val codec: CompressionCodec = compressionCodecs.getCodec(file)
if (null == codec) {
return true
}
codec.isInstanceOf[SplittableCompressionCodec]
}
@throws(classOf[IOException])
def getRecordReader(genericSplit: InputSplit, job: JobConf, reporter: Reporter): RecordReader[LongWritable, ArrayWritable] = {
reporter.setStatus(genericSplit.toString)
val delimiter: String = job.get("textinputformat.record.delimiter")
var recordDelimiterBytes: Array[Byte] = null
if (null != delimiter) {
recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8)
}
new CsvLineRecordReader(job, genericSplit.asInstanceOf[FileSplit], recordDelimiterBytes)
}
}
class CsvLineRecordReader(job: Configuration, split: FileSplit, recordDelimiter: Array[Byte])
extends RecordReader[LongWritable, ArrayWritable] {
private val compressionCodecs = new CompressionCodecFactory(job)
private val maxLineLength = job.getInt(org.apache.hadoop.mapreduce.lib.input.
LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE)
private var filePosition: Seekable = _
private val file = split.getPath
private val codec = compressionCodecs.getCodec(file)
private val isCompressedInput = codec != null
private val fs = file.getFileSystem(job)
private val fileIn = fs.open(file)
private var start = split.getStart
private var pos: Long = 0L
private var end = start + split.getLength
private var reader: CSVReader = _
private var decompressor: Decompressor = _
private lazy val CSVSeparator =
if (recordDelimiter == null)
CSVParser.DEFAULT_SEPARATOR
else
recordDelimiter(0).asInstanceOf[Char]
if (isCompressedInput) {
decompressor = CodecPool.getDecompressor(codec)
if (codec.isInstanceOf[SplittableCompressionCodec]) {
val cIn = (codec.asInstanceOf[SplittableCompressionCodec])
.createInputStream(fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK)
reader = new CSVReader(new InputStreamReader(cIn), CSVSeparator)
start = cIn.getAdjustedStart
end = cIn.getAdjustedEnd
filePosition = cIn
}else {
reader = new CSVReader(new InputStreamReader(codec.createInputStream(fileIn, decompressor)), CSVSeparator)
filePosition = fileIn
}
} else {
fileIn.seek(start)
reader = new CSVReader(new InputStreamReader(fileIn), CSVSeparator)
filePosition = fileIn
}
@throws(classOf[IOException])
private def getFilePosition: Long = {
if (isCompressedInput && null != filePosition) {
filePosition.getPos
}else
pos
}
private def nextLine: Option[Array[String]] = {
if (getFilePosition < end){
//readNext automatical split the line to elements
reader.readNext() match {
case null => None
case elems => Some(elems)
}
} else
None
}
override def next(key: LongWritable, value: ArrayWritable): Boolean =
nextLine
.exists { elems =>
key.set(pos)
val lineLength = elems.foldRight(0)((a, b) => a.length + 1 + b)
pos += lineLength
value.set(elems.map(new Text(_)))
if (lineLength < maxLineLength) true else false
}
@throws(classOf[IOException])
def getProgress: Float =
if (start == end)
0.0f
else
Math.min(1.0f, (getFilePosition - start) / (end - start).toFloat)
override def getPos: Long = pos
override def createKey(): LongWritable = new LongWritable
override def close(): Unit = {
try {
if (reader != null) {
reader.close
}
} finally {
if (decompressor != null) {
CodecPool.returnDecompressor(decompressor)
}
}
}
override def createValue(): ArrayWritable = new ArrayWritable(classOf[Text])
}
简单的测试示例:
val arrayRdd = sc.hadoopFile("source path", classOf[CSVInputFormat], classOf[LongWritable], classOf[ArrayWritable],
sc.defaultMinPartitions).map(_._2.get().map(_.toString))
arrayRdd.collect().foreach(e => println(e.mkString(",")))
我喜欢的另一种方式是使用由数据库编写的spark-csv,这对CSV文件格式很有支持,你可以在github页面中进行一些练习。
针对spark-csv进行了更新,使用univocity作为parserLib ,可以处理多行单元格
val df = sqlContext.read
.format("com.databricks.spark.csv")
.option("header", "true") // Use first line of all files as header
.option("parserLib", "univocity")
.option("inferSchema", "true") // Automatically infer data types
.load("source path")
答案 1 :(得分:1)
当您在整个数据集中对数据进行数据操作之前,甚至在它被放入输入RDD之前会发生什么?它被视为任何其他程序,并且会像疯了一样交换出来吗?
将整个数据集加载到本地内存中。所以,如果你有记忆,它就有效。
您如何使任何火花程序可扩展?
您已选择了spark可以加载的数据格式,或者您更改了应用程序,以便它可以直接将数据格式加载到spark或两者中。
在这种情况下,您可以查看创建一个自定义InputFormat
,它可以分割换行以外的其他内容。我想你也想看看你如何编写你的数据,以便在记录边界而不是新行的HDFS中进行分区。
但我怀疑最简单的答案是以不同方式编码数据。 JSON Lines或在写入期间对CSV文件中的换行进行编码或Avro或...使用Spark&amp; HDFS。