I need to run lookup checks against my source DF: roughly 10 lookup checks, one for each of 10 columns present in the source DF. So I iterate in a for loop and, for each lookup check, verify whether data exists in the corresponding source DF column.
The program runs fine for 7 iterations (7 lookups). From the 8th lookup onward it stalls for much longer, and then I get a connection reset by peer error.
I tried caching unionDF (step 5), but even with the cache the job still degrades after 7 lookups (see the lineage sketch after the code).
Here is the logic:
// Build the HiveContext once, outside the loop, rather than per iteration.
val hc = new HiveContext(sc)
for (lookupTableValues <- larry) {
  println(lookupTableValues)
  lookupCheckCount += 1
  val lookupParts = lookupTableValues.split(";")
  val lookupTable = lookupParts(0)
  lookUpCheckMap(lookupTable) = lookupCheckCount
  if (!lookupParts(1).equalsIgnoreCase("sqoop")) {
    lookupKeys = lookupParts.drop(1)
    parquetDf1 = sqlContext.read.parquet(basePath + "/" + lookupTable)
    // count() scans the whole lookup table on every iteration.
    println(parquetDf1.count())
    println(parquetDf1.rdd.getNumPartitions)
  } else {
    lookupKeys = lookupParts.drop(2)
    hc.sql("use " + sqoopedDatabase)
    parquetDf1 = hc.sql("select * from " + lookupTable)
  }
  var joinKey: Column = null
  var nullColumnFilter: Column = null
  var lookUpKeyForPickup = ""
  var lookUpKeyForPickupAlias = ""
  if (lookupKeys.length == 1) {
    val lookupKey = lookupKeys.head
    println(lookupKey)
    val firstlookupKey = lookupKey.split("\\=")(0)
    val secondlookupKey = lookupKey.split("\\=")(1)
    joinKey = upper(dfjoin(firstlookupKey)) === upper(parquetDf1(secondlookupKey))
    lookUpKeyForPickup = secondlookupKey
    lookUpKeyForPickupAlias = lookupTable + ":" + firstlookupKey
    if (lookupTableCounter == 0) {
      filterNullDF = dfjoin.filter(dfjoin(firstlookupKey).isNull || dfjoin(firstlookupKey) === "")
      filterDF = dfjoin.except(filterNullDF)
    } else {
      filterNullDF = unionDF.filter(dfjoin(firstlookupKey).isNull || dfjoin(firstlookupKey) === "")
      filterDF = unionDF.except(filterNullDF)
    }
    // Rows with a null/empty key get a 9999 marker instead of being joined.
    filterNullDF = filterNullDF.withColumn(lookUpKeyForPickupAlias, lit(9999))
    joinedDF = filterDF.as("d1").join(parquetDf1.as("d2"), joinKey, "left_outer")
      .select($"d1.*", parquetDf1(lookUpKeyForPickup) as lookUpKeyForPickupAlias)
    // repartition returns a new DataFrame; the result must be reassigned.
    joinedDF = joinedDF.repartition(4)
    unionDF = joinedDF.unionAll(filterNullDF)
    // count() and show() are both actions: each re-runs the accumulated plan.
    println(unionDF.count())
    unionDF.show()
  }
  else {
    for (lookupKey <- lookupKeys) {
      val firstlookupKey = lookupKey.split("\\=")(0)
      val secondlookupKey = lookupKey.split("\\=")(1)
      if (joinKey == null) {
        joinKey = upper(dfjoin(firstlookupKey)) === upper(parquetDf1(secondlookupKey))
        lookUpKeyForPickup = secondlookupKey
        lookUpKeyForPickupAlias = lookupTable + ":" + firstlookupKey
        nullColumnFilter = dfjoin(firstlookupKey).isNull || dfjoin(firstlookupKey) === ""
      } else {
        joinKey = joinKey && upper(dfjoin(firstlookupKey)) === upper(parquetDf1(secondlookupKey))
        nullColumnFilter = nullColumnFilter && (dfjoin(firstlookupKey).isNull || dfjoin(firstlookupKey) === "")
      }
    }
    if (lookupTableCounter == 0) {
      filterNullDF = dfjoin.filter(nullColumnFilter)
      filterDF = dfjoin.except(filterNullDF)
    } else {
      filterNullDF = unionDF.filter(nullColumnFilter)
      filterDF = unionDF.except(filterNullDF)
    }
    filterNullDF = filterNullDF.withColumn(lookUpKeyForPickupAlias, lit(9999))
    joinedDF = filterDF.as("d1").join(parquetDf1.as("d2"), joinKey, "left_outer")
      .select($"d1.*", parquetDf1(lookUpKeyForPickup) as lookUpKeyForPickupAlias)
    // repartition returns a new DataFrame; the result must be reassigned.
    joinedDF = joinedDF.repartition(4)
    // The old unionDF is evicted before the new one has been materialized,
    // so the next action recomputes the full lineage anyway.
    if (unionDF != null) unionDF.unpersist() // null on the first iteration
    unionDF = joinedDF.unionAll(filterNullDF)
    unionDF.cache()
    unionDF.show()
    println(unionDF.rdd.getNumPartitions)
  }
  lookupTableCounter += 1
  println(lookupTableCounter)
}
}
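
If I understand the behavior correctly, cache() does not shorten the logical plan: every iteration appends another union/except on top of the previous plan, so the plan Spark has to analyze and recompute grows with each lookup, which would explain why the job crawls from the 8th lookup on. Below is a rough sketch of the lineage-truncation workaround I am considering. It assumes Spark 1.x (DataFrame.checkpoint() only exists from Spark 2.1) and reuses sqlContext from the code above; truncateLineage is just my own helper name, not an existing API:

import org.apache.spark.sql.DataFrame

// Sketch: rebuild the DataFrame from its RDD and schema so the new plan no
// longer carries the accumulated union/except chain, then materialize it.
def truncateLineage(df: DataFrame): DataFrame = {
  val flat = sqlContext.createDataFrame(df.rdd, df.schema)
  flat.cache()
  flat.count() // action to materialize and populate the cache
  flat
}

// Intended use at the end of each iteration, replacing the plain cache():
// unionDF = truncateLineage(joinedDF.unionAll(filterNullDF))

Writing unionDF out to parquet and reading it back each iteration should have the same lineage-cutting effect, at the cost of extra I/O. Would either approach be the right fix here, or am I misreading the cause?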