我有一个数据帧,为了执行自定义计算,我将其转换为RDD。在改用这种做法之前,我曾使用UDF(创建新列)来完成计算,但注意到那样很慢。因此我先转换为RDD再转换回DataFrame,但是我注意到在将RDD转换回DataFrame期间,执行似乎停滞不前。
# Connect to the Spark master and create (or reuse) the session.
conf = SparkConf().setMaster(server_location).setAppName("MoleculesTests")
ss = SparkSession.builder.config(conf = conf).getOrCreate()
# Ship helpers.py so it is importable on every executor.
ss.sparkContext.addPyFile("helpers.py")
def mapper(line):
    """Parse one line of the .smi file into a Row whose `name` field is the first space-separated token (the SMILES string)."""
    first_token = line.split(' ')[0]
    return Row(name=str(first_token))
def calculate_tanimoto(smiles1, smiles2):
    """Return the Tanimoto similarity between two SMILES strings.

    Both inputs are parsed with RDKit, converted to Morgan fingerprints
    (radius 2), and compared with the Tanimoto metric.

    Returns 0.0 when either SMILES cannot be processed, so that the
    downstream ``similarity >= SIMILARITY_THRESHOLD`` filter simply drops
    the pair.  (The original implicitly returned None on error, which
    makes the ``>=`` comparison raise on Python 3.)
    """
    try:
        mol1 = Chem.MolFromSmiles(smiles1)
        mol2 = Chem.MolFromSmiles(smiles2)
        # MolFromSmiles returns None for invalid SMILES; the fingerprint
        # call then raises, which is caught below.
        fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2)
        fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2)
        similarity = DataStructs.FingerprintSimilarity(fp1, fp2, metric=DataStructs.TanimotoSimilarity)
        return similarity
    except Exception as e:
        # Report the offending pair for later inspection; do not crash
        # the whole Spark task over one bad molecule.
        print(str(e))
        print("Error Smiles1", smiles1, " 2", smiles2)
        return 0.0
# NOTE(review): CREATE_VECTORS appears unused in this snippet -- confirm.
CREATE_VECTORS = False
# Keep only pairs whose Tanimoto similarity reaches this threshold.
SIMILARITY_THRESHOLD = 0.3
dataFile = '../mols/compounds18.smi'
lines = ss.sparkContext.textFile(dataFile)
smiles = lines.map(mapper)
schemaSmiles = ss.createDataFrame(smiles).cache()
schemaSmiles.createOrReplaceTempView("smiles")
# Drop the header row, whose name column literally contains 'smiles'.
valid_smiles = ss.sql("SELECT * FROM smiles WHERE name != 'smiles'")
valid_smiles_id = valid_smiles.select("*").withColumn("id", monotonically_increasing_id())
# Self-join on id to enumerate unordered compound pairs (the upper
# triangle of the similarity matrix, diagonal included since the test is
# `<=`), yielding source_id/source_smile/target_id/target_smile.
# NOTE(review): this is a full cartesian join filtered afterwards -- it
# materializes O(n^2) candidate rows, which dominates runtime for large
# inputs (~3.1M rows for 2500 compounds).
combinations = valid_smiles_id.alias("source").join(valid_smiles_id.alias("target") )\
.where("source.Id <= target.Id")\
.select(f.col("source.Id").alias("source_id"), f.col("source.Name").alias("source_smile"), f.col("target.Id").alias("target_id"),f.col("target.Name").alias("target_smile"))
# Drop to the RDD API to run the RDKit similarity on each candidate pair.
combinations_rdd = combinations.rdd.map(tuple)
# Python 3 removed tuple-parameter unpacking in lambdas (PEP 3113), so the
# original `lambda (a, b, c): ...` forms are SyntaxErrors; index the tuple
# explicitly instead.  Row layout: (source_id, source_smile, target_id,
# target_smile).  Also guard against a None similarity from a failed
# calculation before comparing against the threshold.
similarities_fp = combinations_rdd\
    .map(lambda row: (row[0], row[2], calculate_tanimoto(row[1], row[3])))\
    .filter(lambda triple: triple[2] is not None and triple[2] >= SIMILARITY_THRESHOLD)\
    .cache()
# Schema fixes versus the original (these were why createDataFrame seemed
# to hang / fail during the RDD -> DataFrame conversion):
#  * ids come from monotonically_increasing_id(), which yields 64-bit
#    values -> LongType, not IntegerType;
#  * the tanimoto value is a Python float -> DoubleType, not StringType;
#  * field names now match the downstream groupby (source_id/target_id).
schema = StructType([
    StructField("source_id", LongType(), False),
    StructField("target_id", LongType(), False),
    StructField("tanimoto", DoubleType(), False),
])
# Back to a DataFrame; with matching types the conversion verifies cleanly.
combinations_sim = ss.createDataFrame(similarities_fp, schema=schema).cache()
# show() prints and returns None, so call it directly instead of print()ing it.
combinations_sim.show(n=10)
# For each source compound, collect the set of sufficiently-similar targets.
combinations_sim = combinations_sim.groupby(combinations_sim.source_id).agg(f.collect_set("target_id"))
为2500种化合物运行此代码时,执行似乎卡住了。