我正在使用Google大查询平台,并且我想使用Scala Google大查询客户端加载spark RDD 我写了以下代码:
import org.apache.spark.SparkConf
import org.apache.spark.sql
import org.apache.spark.sql.SparkSession
import org.slf4j.LoggerFactory
import com.google.api.services.bigquery.model.TableFieldSchema
import com.google.api.services.bigquery.model.TableSchema
import com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration
import com.google.cloud.hadoop.io.bigquery.BigQueryFileFormat
import com.google.cloud.hadoop.io.bigquery.GsonBigQueryInputFormat
import com.google.cloud.hadoop.io.bigquery.output.BigQueryOutputConfiguration
import com.google.cloud.hadoop.io.bigquery.output.IndirectBigQueryOutputFormat
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
import com.google.api.services.bigquery.model.TableFieldSchema
import com.google.api.services.bigquery.model.TableSchema
import java.util
object Main {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
val sparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
val sc = sparkSession.sparkContext
val conf = sparkSession.sparkContext.hadoopConfiguration
import sparkSession.implicits._
val testdata = sc.parallelize(Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1)
val dfha = testdata.toDF()
// Input parameters.
val projectId = conf.get("fs.gs.project.id")
val bucket = conf.get("fs.gs.system.bucket")
conf.set(BigQueryConfiguration.PROJECT_ID_KEY, projectId)
conf.set(BigQueryConfiguration.GCS_BUCKET_KEY, bucket)
val outputTableId = projectId + ":wordcount_dataset.wordcount_output"
// Temp output bucket that is deleted upon completion of job.
val outputGcsPath = ("gs://" + bucket + "/hadoop/tmp/bigquery/wordcountoutput")
// Output configuration.
val outputTableFieldSchema = new util.ArrayList[TableFieldSchema]
outputTableFieldSchema.add(new TableFieldSchema().setName("Word").setType("STRING"))
outputTableFieldSchema.add(new TableFieldSchema().setName("Count").setType("STRING"))
val outputSchema = new TableSchema().setFields(outputTableFieldSchema)
conf.set("mapreduce.job.outputformat.class",
classOf[IndirectBigQueryOutputFormat[_, _]].getName)
conf.set(BigQueryConfiguration.OUTPUT_TABLE_WRITE_DISPOSITION_KEY,
"WRITE_TRUNCATE")
BigQueryOutputConfiguration.configure(conf, outputTableId, outputSchema, outputGcsPath, BigQueryFileFormat.CSV, classOf[TextOutputFormat[_, _]])
testdata.saveAsNewAPIHadoopDataset(conf)
}
}
提交到dataproc时,出现以下错误。能否请你帮我:
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.BigQueryFactory: Bigquery connector version 0.10.8-hadoop2
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.BigQueryFactory: Creating BigQuery from default credential.
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.BigQueryFactory: Creating BigQuery from given credential.
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.output.ForwardingBigQueryFileOutputFormat: Delegating functionality to 'TextOutputFormat'.
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.output.ForwardingBigQueryFileOutputFormat: Delegating functionality to 'TextOutputFormat'.
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.BigQueryFactory: Creating BigQuery from default credential.
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.BigQueryFactory: Creating BigQuery from given credential.
18/07/05 09:00:27 INFO com.google.cloud.hadoop.io.bigquery.BigQueryHelper: Importing into table 'renault-ftt:wordcount_dataset.wordcount_output' from 1 paths; path[0] is 'gs://dataproc-a68f6af1-5c30-4a63-97b6-4db2526a61ef-eu/hadoop/tmp/bigquery/wordcountoutput/part-r-00000'; awaitCompletion: true
18/07/05 09:00:27 INFO com.google.cloud.hadoop.io.bigquery.BigQueryHelper: Using provided import schema '{fields=[{"name":"Word","type":"STRING"}, {"name":"Count","type":"STRING"}]}'.
18/07/05 09:00:41 ERROR org.apache.spark.internal.io.SparkHadoopMapReduceWriter: Aborting job job_20180705090013_0000.
java.io.IOException: Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the error stream for more details.
at com.google.cloud.hadoop.io.bigquery.BigQueryUtils.waitForJobCompletion(BigQueryUtils.java:108)
at com.google.cloud.hadoop.io.bigquery.BigQueryHelper.importFromGcs(BigQueryHelper.java:183)
at com.google.cloud.hadoop.io.bigquery.output.IndirectBigQueryOutputCommitter.commitJob(IndirectBigQueryOutputCommitter.java:70)
at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:142)
at org.apache.spark.internal.io.SparkHadoopMapReduceWriter$.write(SparkHadoopMapReduceWriter.scala:101)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.PairRDDFunctions.saveAsNewAPIHadoopDataset(PairRDDFunctions.scala:1084)
at com.renault.ftt.example.Main$.main(Main.scala:129)
at com.renault.ftt.example.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:775)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:119)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
18/07/05 09:00:42 INFO com.google.cloud.hadoop.io.bigquery.output.ForwardingBigQueryFileOutputCommitter: Found GCS output data at 'gs://dataproc-a68f6af1-5c30-4a63-97b6-4db2526a61ef-eu/hadoop/tmp/bigquery/wordcountoutput', attempting to clean up.
18/07/05 09:00:42 INFO com.google.cloud.hadoop.io.bigquery.output.ForwardingBigQueryFileOutputCommitter: Successfully deleted GCS output path 'gs://dataproc-a68f6af1-5c30-4a63-97b6-4db2526a61ef-eu/hadoop/tmp/bigquery/wordcountoutput'.
Exception in thread "main" org.apache.spark.SparkException: Job aborted.
at org.apache.spark.internal.io.SparkHadoopMapReduceWriter$.write(SparkHadoopMapReduceWriter.scala:107)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.PairRDDFunctions.saveAsNewAPIHadoopDataset(PairRDDFunctions.scala:1084)
at com.renault.ftt.example.Main$.main(Main.scala:129)
at com.renault.ftt.example.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:775)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:119)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.IOException: Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the error stream for more details.
at com.google.cloud.hadoop.io.bigquery.BigQueryUtils.waitForJobCompletion(BigQueryUtils.java:108)
at com.google.cloud.hadoop.io.bigquery.BigQueryHelper.importFromGcs(BigQueryHelper.java:183)
at com.google.cloud.hadoop.io.bigquery.output.IndirectBigQueryOutputCommitter.commitJob(IndirectBigQueryOutputCommitter.java:70)
at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:142)
at org.apache.spark.internal.io.SparkHadoopMapReduceWriter$.write(SparkHadoopMapReduceWriter.scala:101)
... 18 more
18/07/05 09:00:43 INFO org.spark_project.jetty.server.AbstractConnector: Stopped Spark@717cfabd{HTTP/1.1,[http/1.1]}{0.0.0.0:4040}
我不明白此错误的原因,有人可以帮助我