Question

使用MongoSpark，在2个不同大小的不同数据集上运行相同的代码，导致其中一个抛出E11000 duplicate key error。

在我们继续之前，以下是代码：

object ScrapeHubCompanyImporter {
  /**
    * Merges scraped company JSON with the company documents already stored in
    * MongoDB and writes the merged records back to the same MongoDB URI.
    *
    * Steps, in order:
    *  1. read the JSON at `path`, drop rows lacking `canonical_url`, `website`
    *     or a derivable `domain`, and rename columns;
    *  2. load the existing documents through `MongoSpark.load`;
    *  3. left-join the scraped rows with the stored rows on `companyUrl` and
    *     merge field by field, preferring the scraped value (see `pickValue`);
    *  4. stamp `updatedDate`/`scraped`/`headquarters`, de-duplicate on
    *     `companyUrl`, and save with `MongoSpark.save`.
    *
    * @param path            location of the scraped JSON input passed to `spark.read.json`
    * @param companyMongoUrl MongoDB URI used for both reading and writing
    */
  def importData(path: String, companyMongoUrl: String): Unit = {
    // NOTE(review): the master is hard-coded to local[*]; presumably this should
    // come from the launcher when the job runs against the large data set — confirm.
    val spark = SparkSession.builder()
      .master("local[*]")
      .config("spark.mongodb.input.uri", companyMongoUrl)
      .config("spark.mongodb.output.uri", companyMongoUrl)
      .config("spark.mongodb.input.partitionerOptions.partitionKey", "profileUrl")
      .getOrCreate()
    import spark.implicits._

    // UDF turning a website URL into "<domain>.<tld>"; yields null when the
    // extracted domain part is the empty string.
    val websiteToDomainTransformer = udf((website: String) => {
      val tldExtract = SplitHost.fromURL(website)
      if (tldExtract.domain == "") {
        null
      } else {
        tldExtract.domain + "." + tldExtract.tld
      }
    })

    // Scraped input, cleaned and projected onto the columns used below.
    val jsonDF =
      spark
        .read
        .json(path)
        .filter { row =>
          row.getAs[String]("canonical_url") != null
        }
        // NOTE(review): de-duplication runs on the raw canonical_url, before
        // toHttpsUdf is applied in the select below. If toHttpsUdf can map two
        // distinct raw URLs to the same value, such rows survive this step and
        // are only collapsed by the later dropDuplicates on companyUrl — confirm.
        .dropDuplicates(Seq("canonical_url"))
        .select(
          toHttpsUdf($"canonical_url").as("profileUrl"),
          $"city",
          $"country",
          $"founded",
          // "headquartes" (sic) matches the field name used in the merge below.
          $"hq".as("headquartes"),
          $"industry",
          $"company_id".as("companyId"),
          $"name",
          $"postal",
          $"size",
          $"specialties",
          $"state",
          $"street_1",
          $"street_2",
          $"type",
          $"website"
        )
        .filter { row => row.getAs[String]("website") != null }
        .withColumn("domain", websiteToDomainTransformer($"website"))
        // Rows whose domain could not be derived are discarded.
        .filter(row => row.getAs[String]("domain") != null)
        .as[ScrapeHubCompanyDataRep]

    // NOTE(review): jsonColsSet is not read anywhere else in this method.
    val jsonColsSet = jsonDF.columns.toSet

    // Existing company documents, with companyUrl passed through toHttpsUdf so
    // both join sides go through the same UDF.
    val mongoData = MongoSpark
      .load[LinkedinCompanyRep](spark)
      .withColumn("companyUrl", toHttpsUdf($"companyUrl"))
      .as[CompanyRep]

    // NOTE(review): mongoColsSet is not read anywhere else in this method.
    val mongoColsSet = mongoData.columns.toSet

    // Left join: every scraped row is kept; t._2 is null when no stored
    // document has the same companyUrl.
    // NOTE(review): the select above names the URL column "profileUrl", while
    // the join and the merge read "companyUrl" from the scraped side — verify
    // against the definition of ScrapeHubCompanyDataRep.
    val union = jsonDF.joinWith(
      mongoData,
      jsonDF("companyUrl") === mongoData("companyUrl"),
      joinType = "left")
      .map { t =>
        val scrapeHub = t._1
        // Use a placeholder carrying only the domain when there is no match.
        val liCompanyRep = if (t._2 != null ) {
          t._2
        } else {
          CompanyRep(domain = scrapeHub.domain)
        }

        // Scraped values win; stored values only fill the gaps. The _id is
        // taken from the stored document and is therefore empty for new rows.
        CompanyRep(
          _id = pickValue(liCompanyRep._id, None),
          city = pickValue(scrapeHub.city, liCompanyRep.city),
          country = pickValue(scrapeHub.country, liCompanyRep.country),
          postal = pickValue(scrapeHub.postal, liCompanyRep.postal),
          domain = scrapeHub.domain,
          founded = pickValue(scrapeHub.founded, liCompanyRep.founded),
          headquartes = pickValue(scrapeHub.headquartes, liCompanyRep.headquartes),
          headquarters = liCompanyRep.headquarters,
          industry = pickValue(scrapeHub.industry, liCompanyRep.industry),
          linkedinId = pickValue(scrapeHub.companyId, liCompanyRep.companyId),
          companyUrl = Option(scrapeHub.companyUrl),
          name = pickValue(scrapeHub.name, liCompanyRep.name),
          size = pickValue(scrapeHub.size, liCompanyRep.size),
          specialties = pickValue(scrapeHub.specialties, liCompanyRep.specialties),
          street_1 = pickValue(scrapeHub.street_1, liCompanyRep.street_1),
          street_2 = pickValue(scrapeHub.street_2, liCompanyRep.street_2),
          state = pickValue(scrapeHub.state, liCompanyRep.state),
          `type` = pickValue(scrapeHub.`type`, liCompanyRep.`type`),
          website = pickValue(scrapeHub.website, liCompanyRep.website),
          updatedDate = None,
          scraped = Some(true)
        )
      }

    // UDF converting the string _id into an ObjectId, passing null through.
    val idToMongoId = udf { st: String =>
      if (st != null) {
        ObjectId(st)
      } else {
        null
      }
    }

    // Final records: timestamped, headquarters rebuilt from the merged address
    // fields, and de-duplicated on companyUrl.
    val saveReady =
      union
      .map { rep =>
          rep.copy(
            updatedDate = Some(new Timestamp(System.currentTimeMillis)),
            scraped = Some(true),
            headquarters = generateCompanyHeadquarters(rep)
          )
      }
      .dropDuplicates(Seq("companyUrl"))

    // NOTE(review): rows without a stored match reach the writer with a null
    // _id. How the connector writes such rows should be checked against the
    // unique companyUrl key on the target collection — confirm.
    MongoSpark.save(
      saveReady.withColumn("_id", idToMongoId($"_id")),
      WriteConfig(Map(
        "uri" -> companyMongoUrl
      )))
  }

  /**
    * Builds a [[CompanyHeadquarters]] from the address fields of `companyRep`.
    *
    * @param companyRep company record whose `country`, `state`, `city`,
    *                   `postal`, `street_1` and `street_2` values are copied
    *                   into the headquarters
    * @return `Some(headquarters)` when at least one `Option`-typed field of the
    *         headquarters is defined, `None` otherwise
    */
  def generateCompanyHeadquarters(companyRep: CompanyRep): Option[CompanyHeadquarters] = {
    val hq = CompanyHeadquarters(
      country = companyRep.country,
      geographicArea = companyRep.state,
      city = companyRep.city,
      postalCode = companyRep.postal,
      line1 = companyRep.street_1,
      line2 = companyRep.street_2
    )

    // Inspect every extracted field; only Option-typed fields can mark the
    // headquarters as populated, anything else is ignored.
    val hasAnyValue = CompanyHeadquarters.unapply(hq).exists { fields =>
      fields.productIterator.exists {
        case field: Option[_] => field.isDefined
        case _ => false
      }
    }

    if (hasAnyValue) Some(hq) else None
  }

  /**
    * Returns the first usable value out of `left` and `right`, preferring `left`.
    *
    * A side counts as unusable when it is a null reference, `None`, or a `Some`
    * wrapping null. A `Some(null)` on the left therefore falls back to `right`
    * instead of ending the lookup with `None`.
    *
    * @param left  preferred value; may itself be null
    * @param right fallback value; may itself be null
    * @return the left value when usable, otherwise the right value when usable,
    *         otherwise `None`
    */
  def pickValue(left: Option[String], right: Option[String]): Option[String] = {
    // Collapse a null reference, None and Some(null) into None.
    def normalize(opt: Option[String]): Option[String] =
      Option(opt).flatMap(inner => inner.flatMap(value => Option(value)))

    normalize(left).orElse(normalize(right))
  }
}

此问题围绕companyUrl展开，它是集合中的两个唯一键之一，另一个是_id。问题在于，Spark在处理700GB的数据集时会尝试保存大量重复记录，但我在本地用一个非常小的数据集运行时却无法复现该问题。我想弄清楚到底发生了什么，以及如何确保按companyUrl对所有现有公司进行分组，并在整个数据集范围内全局去重。

编辑：以下是出现的几种情况：

公司已存在于Mongo中，而读取的文件包含其更新后的数据 -> 此处可能出现重复键错误
公司不在Mongo中，但存在于文件中 -> 此处也可能出现重复键错误。

编辑2：重复键错误发生在companyUrl字段上。

编辑3：我已将问题范围缩小到合并阶段。检查那些被报告为companyUrl重复的记录后发现，其中一些记录原本并不在目标集合中，但仍有重复记录以某种方式被写入了集合。在另一些情况下，新记录的_id字段与具有相同companyUrl的旧记录的_id不一致。

MongoSpark仅在海量数据集上重复键错误

0 个答案: