I want to use a Spark job to migrate data from a table in one Cassandra cluster to a table in another.
I wrote the code below. It works fine when the source and target tables are on the same host, but fails when they are on different hosts. I am able to read from the target table, but when I try to write the result into it, I get errors such as "invalid argument" and the stage fails. I have cross-checked the number of parameters, and all the details appear to be correct.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.cassandra._

object Test2 {
  def main(args: Array[String]): Unit = {
    // A single SparkSession is enough; it creates and carries the SparkContext
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("Data_Migration_1")
      .getOrCreate()
    // Read the source table (sparkdb.emp1)
    val df_read1 = spark.read
      .format("org.apache.spark.sql.cassandra")
      .option("spark.cassandra.connection.host", "127.0.0.132")
      .option("spark.cassandra.connection.port", "9042")
      .option("keyspace", "sparkdb")
      .option("table", "emp1")
      .load()

    df_read1.show()  // show() returns Unit, so there is nothing to println
    println("total Records in Table1 = " + df_read1.count())

    // Read the target table (sparkdb2.emp3)
    val df_read2 = spark.read
      .format("org.apache.spark.sql.cassandra")
      .option("spark.cassandra.connection.host", "127.0.0.132")
      .option("spark.cassandra.connection.port", "9042")
      .option("keyspace", "sparkdb2")
      .option("table", "emp3")
      .load()

    df_read2.show()
    println("total Records in Table2 = " + df_read2.count())
    println("cassandra Read Happened successfully")
    // Left anti join: rows of emp1 whose emp_id is missing from emp3;
    // the result carries only df_read1's columns
    val df3 = df_read1.join(df_read2, df_read1("emp_id") === df_read2("emp_id"), "leftanti")
    df3.show()
    println("total differenced Records in 2 Tables = " + df3.count())

    // Dump the delta for inspection (Spark 2.x ships a built-in csv source;
    // the path becomes a directory of part files)
    df3.coalesce(1).write.mode("overwrite").format("csv").save("C:/spark_windows_proj/DataM_HFT/File1.csv")

    // Append the delta to the target table
    // (confirm.truncate is only honored together with SaveMode.Overwrite)
    df3.write
      .format("org.apache.spark.sql.cassandra")
      .mode("append")
      .option("confirm.truncate", "true")
      .option("spark.cassandra.connection.host", "127.0.0.132")
      .option("spark.cassandra.connection.port", "9042")
      .option("keyspace", "sparkdb2")
      .option("table", "emp3")
      .save()
    println("cassandra write Happened successfully")
  }
}
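For the different-host case, one approach I am considering, based on the connector's multi-cluster documentation, is to register a named connection configuration per cluster and select it with the cluster option. A minimal sketch, which I have not verified end to end (the second host, 127.0.0.133, is a placeholder):

import com.datastax.spark.connector.cql.CassandraConnectorConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.cassandra._

val spark = SparkSession.builder().master("local[*]").appName("Two_Cluster_Sketch").getOrCreate()

// One named connection configuration per cluster
spark.setCassandraConf("SourceCluster", CassandraConnectorConf.ConnectionHostParam.option("127.0.0.132"))
spark.setCassandraConf("TargetCluster", CassandraConnectorConf.ConnectionHostParam.option("127.0.0.133"))  // placeholder host

// Read from the source cluster...
val src = spark.read
  .format("org.apache.spark.sql.cassandra")
  .options(Map("cluster" -> "SourceCluster", "keyspace" -> "sparkdb", "table" -> "emp1"))
  .load()

// ...and append into the target cluster
src.write
  .format("org.apache.spark.sql.cassandra")
  .mode("append")
  .options(Map("cluster" -> "TargetCluster", "keyspace" -> "sparkdb2", "table" -> "emp3"))
  .save()

As far as I understand, the per-read/write .option("spark.cassandra.connection.host", ...) calls in my code above should also work; the named configs just keep the two clusters' settings from colliding.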
Also, please let me know if there is a better way to achieve this. I actually have 18 columns, some with timestamps and null values. When I try the insert, I get the following error:

java.lang.IllegalArgumentException: requirement failed: Invalid row size: 18 instead of 17.
    at scala.Predef$.require(Predef.scala:224)
    at com.datastax.spark.connector.writer.SqlRowWriter.readColumnValues(SqlRowWriter.scala:23)
    at com.datastax.spark.connector.writer.SqlRowWriter.readColumnValues(SqlRowWriter.scala:12)
    at com.datastax.spark.connector.writer.BoundStatementBuilder.bind(BoundStatementBuilder.scala:99)
    at com.datastax.spark.connector.writer.GroupingBatchBuilder.next(GroupingBatchBuilder.scala:106)
    at com.datastax.spark.connector.writer.GroupingBatchBuilder.next(GroupingBatchBuilder.scala:31)
    at scala.collection.Iterator$class.foreach(Iterator.scala:891)
    at com.datastax.spark.connector.writer.GroupingBatchBuilder.foreach(GroupingBatchBuilder.scala:31)
    at com.datastax.spark.connector.writer.TableWriter$$anonfun$writeInternal$1.apply(TableWriter.scala:233)
    at com.datastax.spark.connector.writer.TableWriter$$anonfun$writeInternal$1.apply(TableWriter.scala:210)
    at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$withSessionDo$1.apply(CassandraConnector.scala:112)
    at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$withSessionDo$1.apply(CassandraConnector.scala:111)
    at com.datastax.spark.connector.cql.CassandraConnector.closeResourceAfterUse(CassandraConnector.scala:145)
    at com.datastax.spark.connector.cql.CassandraConnector.withSessionDo(CassandraConnector.scala:111)
    at com.datastax.spark.connector.writer.TableWriter.writeInternal(TableWriter.scala:210)
    at com.datastax.spark.connector.writer.TableWriter.insert(TableWriter.scala:197)
    at com.datastax.spark.connector.writer.TableWriter.write(TableWriter.scala:183)
    at com.datastax.spark.connector.RDDFunctions$$anonfun$saveToCassandra$1.apply(RDDFunctions.scala:36)
    at com.datastax.spark.connector.RDDFunctions$$anonfun$saveToCassandra$1.apply(RDDFunctions.scala:36)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
18/08/20 15:40:20 INFO TaskSetManager: Starting task 119.0 in stage 24.0 (TID 575, localhost, executor driver, partition 119, ANY, 8082 bytes)
18/08/20 15:40:20 INFO Executor: Running task 119.0 in stage 24.0 (TID 575)
18/08/20 15:40:20 INFO ShuffleBlockFetcherIterator: Getting 0 non-empty blocks out of 4 blocks
18/08/20 15:40:20 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
18/08/20 15:40:20 WARN TaskSetManager: Lost task 118.0 in stage 24.0 (TID 574, localhost, executor driver): java.lang.IllegalArgumentException: requirement failed: Invalid row size: 18 instead of 17.
    (stack trace identical to the one above)
18/08/20 15:40:20 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks out of 4 blocks
18/08/20 15:40:20 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
18/08/20 15:40:20 ERROR TaskSetManager: Task 118 in stage 24.0 failed 1 times; aborting job
18/08/20 15:40:20 INFO TaskSchedulerImpl: Cancelling stage 24
18/08/20 15:40:20 INFO Executor: Executor is trying to kill task 119.0 in stage 24.0 (TID 575), reason: Stage cancelled
18/08/20 15:40:20 INFO TaskSchedulerImpl: Stage 24 was cancelled
18/08/20 15:40:20 INFO DAGScheduler: ResultStage 24 (runJob at RDDFunctions.scala:36) failed in 2.169 s due to Job aborted due to stage failure: Task 118 in stage 24.0 failed 1 times, most recent failure: Lost task 118.0 in stage 24.0 (TID 574, localhost, executor driver): java.lang.IllegalArgumentException: requirement failed: Invalid row size: 18 instead of 17.
    (stack trace identical to the one above)
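Since the error complains about 18 columns against a 17-column table, I suspect the frame I write has one column more than sparkdb2.emp3. A minimal alignment sketch I plan to try before the write, assuming emp3's 17 column names are a subset of the source's 18 (df3 and df_read2 as defined in the code above):

import org.apache.spark.sql.functions.col

// Compare the schemas first: the frame being written must match the target table exactly
df3.printSchema()
df_read2.printSchema()

// Keep only the columns the target table actually has, in the target's order
val aligned = df3.select(df_read2.columns.map(col): _*)

aligned.write
  .format("org.apache.spark.sql.cassandra")
  .mode("append")
  .option("spark.cassandra.connection.host", "127.0.0.132")
  .option("spark.cassandra.connection.port", "9042")
  .option("keyspace", "sparkdb2")
  .option("table", "emp3")
  .save()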