使用MongoSpark,在2个不同大小的不同数据集上运行相同的代码,导致其中一个抛出E11000 duplicate key error
。
在我们继续之前,以下是代码:
object ScrapeHubCompanyImporter {
def importData(path: String, companyMongoUrl: String): Unit = {
val spark = SparkSession.builder()
.master("local[*]")
.config("spark.mongodb.input.uri", companyMongoUrl)
.config("spark.mongodb.output.uri", companyMongoUrl)
.config("spark.mongodb.input.partitionerOptions.partitionKey", "profileUrl")
.getOrCreate()
import spark.implicits._
val websiteToDomainTransformer = udf((website: String) => {
val tldExtract = SplitHost.fromURL(website)
if (tldExtract.domain == "") {
null
} else {
tldExtract.domain + "." + tldExtract.tld
}
})
val jsonDF =
spark
.read
.json(path)
.filter { row =>
row.getAs[String]("canonical_url") != null
}
.dropDuplicates(Seq("canonical_url"))
.select(
toHttpsUdf($"canonical_url").as("profileUrl"),
$"city",
$"country",
$"founded",
$"hq".as("headquartes"),
$"industry",
$"company_id".as("companyId"),
$"name",
$"postal",
$"size",
$"specialties",
$"state",
$"street_1",
$"street_2",
$"type",
$"website"
)
.filter { row => row.getAs[String]("website") != null }
.withColumn("domain", websiteToDomainTransformer($"website"))
.filter(row => row.getAs[String]("domain") != null)
.as[ScrapeHubCompanyDataRep]
val jsonColsSet = jsonDF.columns.toSet
val mongoData = MongoSpark
.load[LinkedinCompanyRep](spark)
.withColumn("companyUrl", toHttpsUdf($"companyUrl"))
.as[CompanyRep]
val mongoColsSet = mongoData.columns.toSet
val union = jsonDF.joinWith(
mongoData,
jsonDF("companyUrl") === mongoData("companyUrl"),
joinType = "left")
.map { t =>
val scrapeHub = t._1
val liCompanyRep = if (t._2 != null ) {
t._2
} else {
CompanyRep(domain = scrapeHub.domain)
}
CompanyRep(
_id = pickValue(liCompanyRep._id, None),
city = pickValue(scrapeHub.city, liCompanyRep.city),
country = pickValue(scrapeHub.country, liCompanyRep.country),
postal = pickValue(scrapeHub.postal, liCompanyRep.postal),
domain = scrapeHub.domain,
founded = pickValue(scrapeHub.founded, liCompanyRep.founded),
headquartes = pickValue(scrapeHub.headquartes, liCompanyRep.headquartes),
headquarters = liCompanyRep.headquarters,
industry = pickValue(scrapeHub.industry, liCompanyRep.industry),
linkedinId = pickValue(scrapeHub.companyId, liCompanyRep.companyId),
companyUrl = Option(scrapeHub.companyUrl),
name = pickValue(scrapeHub.name, liCompanyRep.name),
size = pickValue(scrapeHub.size, liCompanyRep.size),
specialties = pickValue(scrapeHub.specialties, liCompanyRep.specialties),
street_1 = pickValue(scrapeHub.street_1, liCompanyRep.street_1),
street_2 = pickValue(scrapeHub.street_2, liCompanyRep.street_2),
state = pickValue(scrapeHub.state, liCompanyRep.state),
`type` = pickValue(scrapeHub.`type`, liCompanyRep.`type`),
website = pickValue(scrapeHub.website, liCompanyRep.website),
updatedDate = None,
scraped = Some(true)
)
}
val idToMongoId = udf { st: String =>
if (st != null) {
ObjectId(st)
} else {
null
}
}
val saveReady =
union
.map { rep =>
rep.copy(
updatedDate = Some(new Timestamp(System.currentTimeMillis)),
scraped = Some(true),
headquarters = generateCompanyHeadquarters(rep)
)
}
.dropDuplicates(Seq("companyUrl"))
MongoSpark.save(
saveReady.withColumn("_id", idToMongoId($"_id")),
WriteConfig(Map(
"uri" -> companyMongoUrl
)))
}
def generateCompanyHeadquarters(companyRep: CompanyRep): Option[CompanyHeadquarters] = {
val hq = CompanyHeadquarters(
country = companyRep.country,
geographicArea = companyRep.state,
city = companyRep.city,
postalCode = companyRep.postal,
line1 = companyRep.street_1,
line2 = companyRep.street_2
)
CompanyHeadquarters
.unapply(hq)
.get
.productIterator.toSeq.exists {
case a: Option[_] => a.isDefined
case _ => false
} match {
case true => Some(hq)
case false => None
}
}
def pickValue(left: Option[String], right: Option[String]): Option[String] = {
def _noneIfNull(opt: Option[String]): Option[String] = {
if (opt != null) {
opt
} else {
None
}
}
val lOpt = _noneIfNull(left)
val rOpt = _noneIfNull(right)
lOpt match {
case Some(l) => Option(l)
case None => rOpt match {
case Some(r) => Option(r)
case None => None
}
}
}
}
此问题围绕companyUrl
,它是集合中的唯一键之一,另一个是_id
键。问题是Spark会尝试在700gb数据集上保存大量重复项,但如果我在本地运行一个非常小的数据集,我就无法复制该问题。我试图了解最新情况,我如何确保对companyUrl
上的所有现有公司进行分组,并确保在整个数据集中全局删除重复项。
修改 以下是一些出现的情况:
EDIT2
companyUrl
字段周围发生重复错误。
编辑3
我把这缩小为合并阶段的问题。查看已标记为具有重复companyUrl
的记录,其中一些记录不在目标集合中,但仍以某种方式将重复记录写入集合。在其他情况下,新记录的_id
字段与具有相同companyUrl
的旧记录不匹配。