I am using MongoDB (the MongoDB Spark connector) to write a simple CSV (shown below) into Azure Cosmos DB.
I am loading it into a collection, Products, which has the partition key productid (the shard key). This is only a proof of concept; the real application could be 500K lines of more complex data.
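An illustrative sample of the CSV layout (the column names mirror the Product case class in the code below; the data row is a made-up placeholder, not real data):
productName,productid,image,brand,category,styleId,age
Sample Shirt,1001,1001.jpg,SampleBrand,Apparel,1001,Adults-Men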
My program is as follows:
import com.mongodb.spark._
import com.mongodb.spark.config.WriteConfig
import org.apache.spark.sql.{Dataset, Encoders, SparkSession}
import org.bson.codecs.configuration.CodecRegistries.{fromProviders, fromRegistries}
import org.mongodb.scala.bson.codecs.{DEFAULT_CODEC_REGISTRY, Macros}
object SparkCosmos extends App {

  case class Product(productName: String,
                     productid: String,
                     image: String,
                     brand: String,
                     category: String,
                     styleId: BigInt,
                     age: String)

  val sparkSession = SparkSession.builder()
    .appName("SparkMongo")
    .master("local[*]")
    .config("spark.mongodb.output.collection", "Products")
    .config("spark.mongodb.output.database", "PLUM")
    .config("spark.mongodb.output.uri", "mongodb://xxxxxxxx-dev-azure-cosmosdb.documents.azure.com:10255/?ssl=true&replicaSet=globaldb")
    .getOrCreate()

  // Encoder for the typed Dataset and a codec registry for the Product case class
  val productEncoder = Encoders.product[Product]
  val productCodecProvider = Macros.createCodecProviderIgnoreNone[Product]()
  val codecRegistry = fromRegistries(fromProviders(productCodecProvider), DEFAULT_CODEC_REGISTRY)

  // Read the CSV into a Dataset[Product]
  val ds: Dataset[Product] = sparkSession.read.option("header", "true").csv("styles.csv").as(productEncoder)
  ds.show()

  // Write the Dataset's underlying RDD to Cosmos DB (MongoDB API)
  val writeConfig = WriteConfig(Map(
    "uri" -> "mongodb://xxxxxxxx-dev-azure-cosmosdb.documents.azure.com:10255/?ssl=true&replicaSet=globaldb",
    "database" -> "PLUM",
    "collection" -> "Products",
    "writeConcern.w" -> "majority"))

  ds.rdd.saveToMongoDB(writeConfig)
}
Now, when I run the program as a job on Spark, it fails with "Can't find a codec for class Product":
Caused by: org.bson.codecs.configuration.CodecConfigurationException: Can't find a codec for class Main$Product.
at org.bson.codecs.configuration.CodecCache.getOrThrow(CodecCache.java:46)
at org.bson.codecs.configuration.ProvidersCodecRegistry.get(ProvidersCodecRegistry.java:63)
at org.bson.codecs.configuration.ProvidersCodecRegistry.get(ProvidersCodecRegistry.java:37)
at com.mongodb.internal.operation.Operations.getCodec(Operations.java:525)
at com.mongodb.internal.operation.Operations.insertMany(Operations.java:373)
at com.mongodb.internal.operation.SyncOperations.insertMany(SyncOperations.java:205)
at com.mongodb.client.internal.MongoCollectionImpl.executeInsertMany(MongoCollectionImpl.java:524)
at com.mongodb.client.internal.MongoCollectionImpl.insertMany(MongoCollectionImpl.java:508)
at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1$$anonfun$apply$2.apply(MongoSpark.scala:119)
at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1$$anonfun$apply$2.apply(MongoSpark.scala:119)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1.apply(MongoSpark.scala:119)
at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1.apply(MongoSpark.scala:118)
at com.mongodb.spark.MongoConnector$$anonfun$withCollectionDo$1.apply(MongoConnector.scala:189)
at com.mongodb.spark.MongoConnector$$anonfun$withCollectionDo$1.apply(MongoConnector.scala:187)
at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:174)
at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:174)
at com.mongodb.spark.MongoConnector.withMongoClientDo(MongoConnector.scala:157)
at com.mongodb.spark.MongoConnector.withDatabaseDo(MongoConnector.scala:174)
at com.mongodb.spark.MongoConnector.withCollectionDo(MongoConnector.scala:187)
at com.mongodb.spark.MongoSpark$$anonfun$save$1.apply(MongoSpark.scala:118)
at com.mongodb.spark.MongoSpark$$anonfun$save$1.apply(MongoSpark.scala:117)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$28.apply(RDD.scala:935)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Can anyone help me understand why the Dataset raises this error even though the encoder and the codec registry are defined?
For reference, my build.sbt:
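Roughly, the relevant dependencies look like this (a sketch based on the imports above; the versions shown are placeholders and the exact ones in my project may differ):

scalaVersion := "2.11.12"  // placeholder

libraryDependencies ++= Seq(
  "org.apache.spark"  %% "spark-sql"             % "2.4.4",  // placeholder version
  "org.mongodb.spark" %% "mongo-spark-connector" % "2.4.1",  // placeholder version
  "org.mongodb.scala" %% "mongo-scala-driver"    % "2.7.0"   // placeholder version
)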