I read in the Parquet documentation that only the data for the columns I query should be read and processed. But when I look at the Spark UI, I see that the whole file is read.
Below is the code that writes a Parquet file and then reads it back with Spark SQL.
import java.io.FileInputStream

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.spark.{SparkConf, SparkContext}

object ParquetFileCreator_simple {
  // Builds one Avro record matching the schema. Note that appType is
  // declared in a.avsc as an enum with symbols WAP/WEB/APP.
  def datagenerate(schema: Schema, ind: Long): GenericRecord = {
    val data: GenericRecord = new GenericData.Record(schema)
    data.put("first", "Pg20 " + ind)
    data.put("appType", "WAP" + ind)
    data
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").set("spark.app.name", "merger").set("spark.eventLog.enabled", "true")
    val sc = new SparkContext(conf)
    val sqlc = new org.apache.spark.sql.SQLContext(sc)

    val schemaPath = "/home/a.avsc"
    val schema = new Schema.Parser().parse(new FileInputStream(schemaPath))

    // Write 50M records into a single Parquet file through the Avro bridge.
    val outFile = "/home/parquet_simple.parquet"
    val outPath = new org.apache.hadoop.fs.Path(outFile)
    val writer = new AvroParquetWriter[GenericRecord](outPath, schema)
    for (ind <- 1 to 50000000) {
      val r = datagenerate(schema, ind)
      writer.write(r)
    }
    writer.close()

    // Read the file back and query a single column.
    val df = sqlc.read.parquet(outFile)
    df.registerTempTable("nestedread")
    //var results = df.select("address.pincode")
    val results = sqlc.sql("SELECT first FROM nestedread")
    results.count()
    //results.foreach(x => println(x))

    // Keep the driver alive so the Spark UI can be inspected.
    Thread.sleep(60000)
  }
}
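One way I know to check whether the scan is actually pruned is to print the query plan before triggering the job; with pruning in effect, the Parquet scan in the physical plan should request only the first column. A minimal sketch, reusing results from the code above:

// Print the logical and physical plans; if column pruning is applied,
// the Parquet scan should list only `first` in its output, not `appType`.
results.explain(true)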
My Avro schema is: a.avsc
{
  "type": "record",
  "name": "FullName",
  "namespace": "com.snapdeal.event.avro",
  "fields": [{
    "name": "first",
    "type": ["string", "null"]
  }, {
    "name": "appType",
    "type": {
      "name": "app_types",
      "type": "enum",
      "symbols": ["WAP", "WEB", "APP"]
    }
  }]
}
I ran this locally. I first created a 1.7 GB file, and the same file is then read back by Parquet.
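To see how much of that 1.7 GB actually belongs to each column on disk, the Parquet footer can be inspected; the total size of the first column's chunks is roughly what a pruned scan should need to read. A minimal sketch, assuming parquet-hadoop on the classpath (readFooter is the older, since-deprecated API from that era):

import scala.collection.JavaConverters._
import org.apache.hadoop.conf.Configuration
import org.apache.parquet.hadoop.ParquetFileReader

val footer = ParquetFileReader.readFooter(
  new Configuration(), new org.apache.hadoop.fs.Path("/home/parquet_simple.parquet"))
// Each row group reports the on-disk size of every column chunk.
for (block <- footer.getBlocks.asScala; col <- block.getColumns.asScala) {
  println(col.getPath + " -> " + col.getTotalSize + " bytes")
}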