Saving a Spark RDD to a BigQuery table

Date: 2018-07-05 09:28:03

Tags: scala google-bigquery

I am working with the Google BigQuery platform and want to load a Spark RDD into a BigQuery table using the Google BigQuery client for Scala. I wrote the following code:

import org.apache.spark.SparkConf
import org.apache.spark.sql
import org.apache.spark.sql.SparkSession
import org.slf4j.LoggerFactory
import com.google.api.services.bigquery.model.TableFieldSchema
import com.google.api.services.bigquery.model.TableSchema
import com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration
import com.google.cloud.hadoop.io.bigquery.BigQueryFileFormat
import com.google.cloud.hadoop.io.bigquery.GsonBigQueryInputFormat
import com.google.cloud.hadoop.io.bigquery.output.BigQueryOutputConfiguration
import com.google.cloud.hadoop.io.bigquery.output.IndirectBigQueryOutputFormat
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
import java.util


object Main {


  def main(args: Array[String]): Unit = {


    val sparkConf = new SparkConf()
    val sparkSession = SparkSession.builder().config(sparkConf).getOrCreate()

    val sc = sparkSession.sparkContext
    val conf = sparkSession.sparkContext.hadoopConfiguration
    import sparkSession.implicits._
    val testdata = sc.parallelize(Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1)

    val dfha = testdata.toDF()
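    // Note: this DataFrame is not used below; the raw pair RDD testdata is what gets written out.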

    // Input parameters.

    val projectId = conf.get("fs.gs.project.id")
    val bucket = conf.get("fs.gs.system.bucket")


    conf.set(BigQueryConfiguration.PROJECT_ID_KEY, projectId)
    conf.set(BigQueryConfiguration.GCS_BUCKET_KEY, bucket)


    val outputTableId = projectId + ":wordcount_dataset.wordcount_output"
    // Temp output bucket that is deleted upon completion of job.
    val outputGcsPath = ("gs://" + bucket + "/hadoop/tmp/bigquery/wordcountoutput")



    // Output configuration.


    val outputTableFieldSchema = new util.ArrayList[TableFieldSchema]
    outputTableFieldSchema.add(new TableFieldSchema().setName("Word").setType("STRING"))
    outputTableFieldSchema.add(new TableFieldSchema().setName("Count").setType("STRING"))
    val outputSchema = new TableSchema().setFields(outputTableFieldSchema)


    conf.set("mapreduce.job.outputformat.class",
      classOf[IndirectBigQueryOutputFormat[_, _]].getName)


    conf.set(BigQueryConfiguration.OUTPUT_TABLE_WRITE_DISPOSITION_KEY,
      "WRITE_TRUNCATE")


    BigQueryOutputConfiguration.configure(conf, outputTableId, outputSchema, outputGcsPath, BigQueryFileFormat.CSV, classOf[TextOutputFormat[_, _]])


    // Write the raw (String, String) pair RDD; TextOutputFormat serializes each
    // pair as key<TAB>value by default.
    testdata.saveAsNewAPIHadoopDataset(conf)
  }
}

When I submit the job to Dataproc, I get the following error. Could you please help me:

18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.BigQueryFactory: Bigquery connector version 0.10.8-hadoop2
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.BigQueryFactory: Creating BigQuery from default credential.
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.BigQueryFactory: Creating BigQuery from given credential.
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.output.ForwardingBigQueryFileOutputFormat: Delegating functionality to 'TextOutputFormat'.
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.output.ForwardingBigQueryFileOutputFormat: Delegating functionality to 'TextOutputFormat'.
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.BigQueryFactory: Creating BigQuery from default credential.
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.BigQueryFactory: Creating BigQuery from given credential.
18/07/05 09:00:27 INFO com.google.cloud.hadoop.io.bigquery.BigQueryHelper: Importing into table 'renault-ftt:wordcount_dataset.wordcount_output' from 1 paths; path[0] is 'gs://dataproc-a68f6af1-5c30-4a63-97b6-4db2526a61ef-eu/hadoop/tmp/bigquery/wordcountoutput/part-r-00000'; awaitCompletion: true
18/07/05 09:00:27 INFO com.google.cloud.hadoop.io.bigquery.BigQueryHelper: Using provided import schema '{fields=[{"name":"Word","type":"STRING"}, {"name":"Count","type":"STRING"}]}'.
18/07/05 09:00:41 ERROR org.apache.spark.internal.io.SparkHadoopMapReduceWriter: Aborting job job_20180705090013_0000.
java.io.IOException: Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the error stream for more details.
    at com.google.cloud.hadoop.io.bigquery.BigQueryUtils.waitForJobCompletion(BigQueryUtils.java:108)
    at com.google.cloud.hadoop.io.bigquery.BigQueryHelper.importFromGcs(BigQueryHelper.java:183)
    at com.google.cloud.hadoop.io.bigquery.output.IndirectBigQueryOutputCommitter.commitJob(IndirectBigQueryOutputCommitter.java:70)
    at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:142)
    at org.apache.spark.internal.io.SparkHadoopMapReduceWriter$.write(SparkHadoopMapReduceWriter.scala:101)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1085)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
    at org.apache.spark.rdd.PairRDDFunctions.saveAsNewAPIHadoopDataset(PairRDDFunctions.scala:1084)
    at com.renault.ftt.example.Main$.main(Main.scala:129)
    at com.renault.ftt.example.Main.main(Main.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:775)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:119)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
18/07/05 09:00:42 INFO com.google.cloud.hadoop.io.bigquery.output.ForwardingBigQueryFileOutputCommitter: Found GCS output data at 'gs://dataproc-a68f6af1-5c30-4a63-97b6-4db2526a61ef-eu/hadoop/tmp/bigquery/wordcountoutput', attempting to clean up.
18/07/05 09:00:42 INFO com.google.cloud.hadoop.io.bigquery.output.ForwardingBigQueryFileOutputCommitter: Successfully deleted GCS output path 'gs://dataproc-a68f6af1-5c30-4a63-97b6-4db2526a61ef-eu/hadoop/tmp/bigquery/wordcountoutput'.
Exception in thread "main" org.apache.spark.SparkException: Job aborted.
    at org.apache.spark.internal.io.SparkHadoopMapReduceWriter$.write(SparkHadoopMapReduceWriter.scala:107)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1085)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
    at org.apache.spark.rdd.PairRDDFunctions.saveAsNewAPIHadoopDataset(PairRDDFunctions.scala:1084)
    at com.renault.ftt.example.Main$.main(Main.scala:129)
    at com.renault.ftt.example.Main.main(Main.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:775)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:119)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.IOException: Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the error stream for more details.
    at com.google.cloud.hadoop.io.bigquery.BigQueryUtils.waitForJobCompletion(BigQueryUtils.java:108)
    at com.google.cloud.hadoop.io.bigquery.BigQueryHelper.importFromGcs(BigQueryHelper.java:183)
    at com.google.cloud.hadoop.io.bigquery.output.IndirectBigQueryOutputCommitter.commitJob(IndirectBigQueryOutputCommitter.java:70)
    at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:142)
    at org.apache.spark.internal.io.SparkHadoopMapReduceWriter$.write(SparkHadoopMapReduceWriter.scala:101)
    ... 18 more
18/07/05 09:00:43 INFO org.spark_project.jetty.server.AbstractConnector: Stopped Spark@717cfabd{HTTP/1.1,[http/1.1]}{0.0.0.0:4040}

I do not understand what is causing this error. Can anyone help me?
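
My best guess so far (unverified) is that the CSV load fails because TextOutputFormat writes each (key, value) pair as key<TAB>value, so BigQuery sees one column per row instead of the two comma-separated columns declared in the schema. A minimal sketch of the workarounds I am considering (untested; NullWritable and Text come from org.apache.hadoop.io) would be to either change the separator or build the CSV line myself before saving:

import org.apache.hadoop.io.{NullWritable, Text}

// Option A (untested): make TextOutputFormat put a comma between key and value.
conf.set("mapreduce.output.textoutputformat.separator", ",")

// Option B (untested): build the CSV line explicitly; with a NullWritable key,
// TextOutputFormat writes only the value, i.e. one "word,count" line per record.
val csvLines = testdata.map { case (word, count) =>
  (NullWritable.get(), new Text(s"$word,$count"))
}
csvLines.saveAsNewAPIHadoopDataset(conf)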

0 Answers:

No answers yet.