我想查找两个数据帧中是否存在重叠值。我的代码大约需要两个小时才能运行完。如何使其更快？
条件：如果 list2['chr']（chr1）等于 'chr' + list1['chr']（1），并且 list1['position']（100028246）大于或等于 list2['start']（100028246）且小于或等于 list2['end']（100029244），则输出：list1['snp']（rs75631842） list1['chr']（1） list1['position']（100028246） 'Yes'；否则输出：list1['snp'] list1['chr'] list1['position'] 'No'。
list2<tab separated>
chr10 100027038 100027943
chr1 100028246 100029244
list1<tab separated>
snp chr position
rs6604985 1 1552755
rs75631842 1 100028246
Output
rs6604985 1 1552755 No
rs75631842 1 100028246 Yes
def snp_enhancer_overlap_region(self, H3K27ac_file, snp_list):
    """Report, for every SNP, whether it falls inside any H3K27ac region.

    A SNP overlaps when a region on the same chromosome satisfies
    ``start <= position <= end`` (both bounds inclusive).  One line per SNP
    is printed in input order, ending in ``Yes`` or ``No``.

    Args:
        H3K27ac_file: Path to a headerless, tab-separated file with the
            columns chromosome (e.g. ``chr1``), start, end.
        snp_list: Path to a tab-separated file with a header row providing
            the columns ``snp``, ``chr`` (without the ``chr`` prefix) and
            ``position``.

    Returns:
        The SNP table as a DataFrame with an extra ``overlap`` column
        holding ``'Yes'`` or ``'No'``.

    Note:
        If the instance already carries ``H3K27ac_file`` / ``snp_list``
        attributes they take precedence over the arguments, which keeps the
        behaviour of the earlier implementation that only read the
        attributes.
    """
    marks_path = getattr(self, 'H3K27ac_file', H3K27ac_file)
    snps_path = getattr(self, 'snp_list', snp_list)

    marks = pd.read_csv(marks_path, sep="\t", header=None,
                        names=['chr', 'start', 'end'])
    snps = pd.read_csv(snps_path, sep="\t", header=0)

    # Normalise reversed intervals so that start <= end everywhere.
    reversed_rows = marks['start'] > marks['end']
    if reversed_rows.any():
        marks.loc[reversed_rows, ['start', 'end']] = (
            marks.loc[reversed_rows, ['end', 'start']].values
        )

    regions_by_chrom = {chrom: group for chrom, group in marks.groupby('chr')}

    # The SNP file stores bare chromosome names ("1"); regions use "chr1".
    snp_chrom = 'chr' + snps['chr'].astype(str)
    overlaps = pd.Series(False, index=snps.index, dtype=bool)

    for chrom, snp_group in snps.groupby(snp_chrom):
        regions = regions_by_chrom.get(chrom)
        if regions is None:
            continue  # no regions on this chromosome: every SNP stays "No"

        regions = regions.sort_values('start')
        starts = regions['start'].to_numpy()
        # furthest_end[i] is the largest end among the i+1 regions with the
        # smallest starts, so a single comparison answers "does any region
        # that starts at or before the SNP also reach it?".
        furthest_end = regions['end'].cummax().to_numpy()

        positions = snp_group['position'].to_numpy()
        # Number of regions whose start is <= position (binary search).
        started = starts.searchsorted(positions, side='right')
        # Where started == 0 the index wraps to -1; the first term masks
        # that lookup out, so its value is irrelevant.
        hits = (started > 0) & (furthest_end[started - 1] >= positions)
        overlaps.loc[snp_group.index] = hits

    labels = overlaps.map({True: 'Yes', False: 'No'})
    for snp, chrom, position, label in zip(snps['snp'], snps['chr'],
                                           snps['position'], labels):
        print(snp, "\t", chrom, "\t", position, label)

    return snps.assign(overlap=labels)
# Driver: build the overlap helper and run the SNP-vs-region check.
# The arguments are the region file ('list2') and the SNP file ('list1'),
# in that order.
enhancer_overlap = Enhancer_overlap()
data = enhancer_overlap.snp_enhancer_overlap_region('list2', 'list1')