from pyspark import SparkConf, SparkContext
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
def parseLine(line):
    """Return the SMILES column from one tab-separated input line.

    Args:
        line: A single line of the input file. Columns are separated by
            tab characters and the SMILES string is read from the second
            column (index 1).

    Returns:
        The text of the second tab-delimited column.

    Raises:
        IndexError: If the line has fewer than two tab-delimited columns.
    """
    fields = line.split('\t')
    canonical_smiles = fields[1]
    return canonical_smiles
def transform(smile):
    """Compute the radius-2 Morgan fingerprint for a SMILES string.

    Args:
        smile: SMILES string describing a single molecule.

    Returns:
        The fingerprint object returned by
        ``AllChem.GetMorganFingerprint(mol, 2)``.

    Raises:
        ValueError: If RDKit cannot parse ``smile`` into a molecule.
    """
    m1 = Chem.MolFromSmiles(smile)
    if m1 is None:
        # MolFromSmiles reports a parse failure by returning None instead of
        # raising; fail here with the offending input rather than letting
        # None reach the fingerprint call and produce an opaque error.
        raise ValueError("Could not parse SMILES: {!r}".format(smile))
    fp1_4 = AllChem.GetMorganFingerprint(m1, 2)
    return fp1_4
# Set the configuration: "local" master, application name "Similarity".
conf = SparkConf().setMaster("local").setAppName("Similarity")
sc = SparkContext(conf=conf)

lines = sc.textFile("PS22_smiles_only_3.tsv")  # RDD with one element per line
all_data = lines.map(parseLine)  # transformation: keep only the SMILES column

# The original mapped an undefined name `similarity` (NameError at runtime);
# `transform` is the function in this file that turns a SMILES string into a
# fingerprint. Note that, despite its name, this RDD holds one fingerprint per
# molecule, not pairwise similarity values.
similarity_fcp4 = all_data.map(transform)  # transformation

results = similarity_fcp4.collect()  # action: brings every element to the driver
for result in results:
    print(result)
到目前为止,这是我的代码。它读取一个 TSV 文件,然后把 SMILES 列的每一行转换为 ECFP4 指纹。我卡住的地方是:如何将 RDD 中的每个元素与所有其他元素逐一比较。在普通的 Python 中,我的代码是一个 for 循环:
# Inner loop of the plain-Python version: pairs row i with every later row j
# and computes the Dice similarity of their Morgan fingerprints.
# NOTE(review): i, df, m2 and fp1_4 are defined outside this excerpt; m2 is
# presumably the molecule built from row j -- confirm against the full script.
for j in range(i + 1, df.shape[0]):
    fp2_4 = AllChem.GetMorganFingerprint(m2, 2)
    sim_ecfp4 = DataStructs.DiceSimilarity(fp1_4, fp2_4)
我如何在pyspark中做到这一点?