Migrating data between Cassandra cluster tables using a Spark job

Date: 2018-08-19 16:16:49

Tags: apache-spark cassandra-3.0

I want to migrate data from a table in one Cassandra cluster to a table in another using a Spark job.

I wrote the code below. It works fine when the source and target tables are on the same host, but it fails when they are on different hosts. I can read from both tables, but when I try to write the result to the target table the write fails with an "Invalid argument" error and the stage fails. I have cross-checked the number of columns, and all the details appear to be correct.

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.cassandra._

object Test2 {
  def main(args: Array[String]): Unit = {
    println("Hello World")

    val conf = new SparkConf().setAppName("Data_Migration_1").setMaster("local[*]")
    val sc = new SparkContext(conf)

    val spark = SparkSession
      .builder()
      .appName("Spark SQL data Migration program")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()

    // Read the source table
    val df_read1 = spark.read
      .format("org.apache.spark.sql.cassandra")
      .option("spark.cassandra.connection.host", "127.0.0.132")
      .option("spark.cassandra.connection.port", "9042")
      .option("keyspace", "sparkdb")
      .option("table", "emp1")
      .load()
    df_read1.show()
    println("total Records in Table1 = " + df_read1.count())

    // Read the target table
    val df_read2 = spark.read
      .format("org.apache.spark.sql.cassandra")
      .option("spark.cassandra.connection.host", "127.0.0.132")
      .option("spark.cassandra.connection.port", "9042")
      .option("keyspace", "sparkdb2")
      .option("table", "emp3")
      .load()
    df_read2.show()
    println("total Records in Table2 = " + df_read2.count())

    println("cassandra Read Happened successfully")

    // Keep only the source rows whose emp_id is not yet in the target table
    val df3 = df_read1.join(df_read2, df_read1("emp_id") === df_read2("emp_id"), "leftanti")
    df3.show()
    println("total differenced Records in 2 Tables = " + df3.count())

    // Dump the delta to CSV for inspection
    df3.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").save("C:/spark_windows_proj/DataM_HFT/File1.csv")

    // Append the delta to the target table
    df3.write
      .format("org.apache.spark.sql.cassandra")
      .mode("append")
      .option("confirm.truncate", "true") // only takes effect with overwrite mode
      .option("spark.cassandra.connection.host", "127.0.0.132")
      .option("spark.cassandra.connection.port", "9042")
      .option("keyspace", "sparkdb2")
      .option("table", "emp3")
      .save()

    println("cassandra write Happened successfully")
  }
}
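
For reference, here is a minimal sketch of the two-cluster variant I am aiming for, where each read and write carries its own connection host (the hosts 10.0.0.1 and 10.0.0.2 are placeholders, not my real addresses):

import org.apache.spark.sql.SparkSession

object TwoClusterSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("TwoClusterSketch")
      .master("local[*]")
      .getOrCreate()

    // Read from the source cluster (placeholder host)
    val srcDf = spark.read
      .format("org.apache.spark.sql.cassandra")
      .option("spark.cassandra.connection.host", "10.0.0.1")
      .option("spark.cassandra.connection.port", "9042")
      .option("keyspace", "sparkdb")
      .option("table", "emp1")
      .load()

    // Write to the target cluster (different placeholder host)
    srcDf.write
      .format("org.apache.spark.sql.cassandra")
      .mode("append")
      .option("spark.cassandra.connection.host", "10.0.0.2")
      .option("spark.cassandra.connection.port", "9042")
      .option("keyspace", "sparkdb2")
      .option("table", "emp3")
      .save()

    spark.stop()
  }
}

My understanding is that the connector also supports named clusters (a "cluster" option combined with spark.setCassandraConf) for multi-cluster setups, in case per-operation host options are not honored by a given connector version.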

Please also let me know if there is a better way to achieve this. My tables actually have 18 columns, including some timestamp and null values. When I try the insert I get the error below (the same stack trace is then repeated in the task-failure WARN and in the DAGScheduler job-abort message):

java.lang.IllegalArgumentException: requirement failed: Invalid row size: 18 instead of 17.
    at scala.Predef$.require(Predef.scala:224)
    at com.datastax.spark.connector.writer.SqlRowWriter.readColumnValues(SqlRowWriter.scala:23)
    at com.datastax.spark.connector.writer.SqlRowWriter.readColumnValues(SqlRowWriter.scala:12)
    at com.datastax.spark.connector.writer.BoundStatementBuilder.bind(BoundStatementBuilder.scala:99)
    at com.datastax.spark.connector.writer.GroupingBatchBuilder.next(GroupingBatchBuilder.scala:106)
    at com.datastax.spark.connector.writer.GroupingBatchBuilder.next(GroupingBatchBuilder.scala:31)
    at scala.collection.Iterator$class.foreach(Iterator.scala:891)
    at com.datastax.spark.connector.writer.GroupingBatchBuilder.foreach(GroupingBatchBuilder.scala:31)
    at com.datastax.spark.connector.writer.TableWriter$$anonfun$writeInternal$1.apply(TableWriter.scala:233)
    at com.datastax.spark.connector.writer.TableWriter$$anonfun$writeInternal$1.apply(TableWriter.scala:210)
    at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$withSessionDo$1.apply(CassandraConnector.scala:112)
    at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$withSessionDo$1.apply(CassandraConnector.scala:111)
    at com.datastax.spark.connector.cql.CassandraConnector.closeResourceAfterUse(CassandraConnector.scala:145)
    at com.datastax.spark.connector.cql.CassandraConnector.withSessionDo(CassandraConnector.scala:111)
    at com.datastax.spark.connector.writer.TableWriter.writeInternal(TableWriter.scala:210)
    at com.datastax.spark.connector.writer.TableWriter.insert(TableWriter.scala:197)
    at com.datastax.spark.connector.writer.TableWriter.write(TableWriter.scala:183)
    at com.datastax.spark.connector.RDDFunctions$$anonfun$saveToCassandra$1.apply(RDDFunctions.scala:36)
    at com.datastax.spark.connector.RDDFunctions$$anonfun$saveToCassandra$1.apply(RDDFunctions.scala:36)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

18/08/20 15:40:20 INFO TaskSetManager: Starting task 119.0 in stage 24.0 (TID 575, localhost, executor driver, partition 119, ANY, 8082 bytes)
18/08/20 15:40:20 INFO Executor: Running task 119.0 in stage 24.0 (TID 575)
18/08/20 15:40:20 INFO ShuffleBlockFetcherIterator: Getting 0 non-empty blocks out of 4 blocks
18/08/20 15:40:20 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
18/08/20 15:40:20 WARN TaskSetManager: Lost task 118.0 in stage 24.0 (TID 574, localhost, executor driver): java.lang.IllegalArgumentException: requirement failed: Invalid row size: 18 instead of 17.
18/08/20 15:40:20 ERROR TaskSetManager: Task 118 in stage 24.0 failed 1 times; aborting job
18/08/20 15:40:20 INFO TaskSchedulerImpl: Cancelling stage 24
18/08/20 15:40:20 INFO Executor: Executor is trying to kill task 119.0 in stage 24.0 (TID 575), reason: stage cancelled
18/08/20 15:40:20 INFO TaskSchedulerImpl: Stage 24 was cancelled
18/08/20 15:40:20 INFO DAGScheduler: ResultStage 24 (runJob at RDDFunctions.scala:36) failed in 2.169 s due to Job aborted due to stage failure: Task 118 in stage 24.0 failed 1 times, most recent failure: Lost task 118.0 in stage 24.0 (TID 574, localhost, executor driver): java.lang.IllegalArgumentException: requirement failed: Invalid row size: 18 instead of 17.
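
The "Invalid row size: 18 instead of 17" message suggests that df3 carries one more column than the target table sparkdb2.emp3 (a leftanti join keeps only the left side's columns, so the extra column would come from the source table itself). A minimal sketch of trimming the DataFrame to the target schema before the write, reusing df_read2 and df3 from the program above and assuming the target's column names all exist in the source:

    // Keep only the columns that exist in the target table, so the row size
    // matches what the connector expects (17 instead of 18).
    val targetColumns = df_read2.columns                      // the 17 columns of sparkdb2.emp3
    val df3Aligned = df3.select(targetColumns.map(df3.col): _*)

    // This would replace the df3.write block above.
    df3Aligned.write
      .format("org.apache.spark.sql.cassandra")
      .mode("append")
      .option("spark.cassandra.connection.host", "127.0.0.132")
      .option("spark.cassandra.connection.port", "9042")
      .option("keyspace", "sparkdb2")
      .option("table", "emp3")
      .save()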

0 Answers:

No answers yet.