import distance
from collections import defaultdict
my_list = ['ACAA', 'TCAA','TCAT','TGAT','TCGA','TGGA','GCGA','AAAA','GGGG','GGGC']
counts = {'ACAA':60, 'TCAA':3,'TCAT':30,'TGAT':8,'TCGA':1,'TGGA':1,'GCGA':8,'AAAA':5,'GGGG':8,'GGGC':1}
adj_list = defaultdict(list)
for strng1 in my_list:
for strng2 in my_list:
if distance.hamming(strng1, strng2) == 1 and counts[strng1] >= (counts[strng2]*2):
adj_list[strng1].append(strng2)
我有这个实现来获取方向性邻接列表。预期结果:
ACAA: TCAA
TCAA: TCGA
TCAT: TCAA, TGAT
TGAT
TCGA: TGGA
TGGA: TCGA
GCGA: TCGA
AAAA
GGGG: GGGC
GGGC
是否有更快的实施?对于大型数据集,这变得非常慢。在cython中重写这个会加速吗?如果是,有人可以帮助我开始使用cython吗?
答案 0 :(得分:1)
我不了解Cython,但您可以避免在内循环中访问dict项目:
adj_list = defaultdict(list)
for strng1 in my_list:
a1 = adj_list[strng1]
c1 = counts[strng1]
for strng2 in my_list:
if distance.hamming(strng1, strng2) == 1 and c1 >= (counts[strng2]*2):
a1.append(strng2)
你甚至可以通过仅在下半部分迭代并执行对称附加来削减更多。这样你可以节省50%的距离计算,因为它是对称的。你只在上面的矩阵三角形上执行它(对角线排除,我认为一个字符串与自身的距离是0)而不是完整的矩阵。
for i,strng1 in enumerate(my_list):
...
for j in range(i+1,len(my_list)):
我的尝试,我不确定,但应该关闭:
adj_list = defaultdict(list)
for i,strng1 in enumerate(my_list):
a1 = adj_list[strng1]
c1 = counts[strng1]
for j in range(i+1,len(my_list)):
strng2 = my_list[j]
if distance.hamming(strng1, strng2) == 1:
c2 = counts[strng2]
if c1 >= (c2*2):
a1.append(strng2)
if c2 >= (c1*2):
adj_list[strng1].append(strng2)
crysis405编辑:
原件:
def adj_lst(my_list, counts):
adj_list = defaultdict(list)
for strng1 in my_list:
a1 = adj_list[strng1]
c1 = counts[strng1]
for strng2 in my_list:
if distance.hamming(strng1, strng2) == 1 and c1 >= (counts[strng2]*2):
adj_list[strng1].append(strng2)
建议的改进:
def adj_lst_fast(my_list, counts):
adj_list_fast = defaultdict(list)
for i,strng1 in enumerate(my_list):
a1 = adj_list_fast[strng1]
c1 = counts[strng1]
for j in range(i+1,len(my_list)):
strng2 = my_list[j]
if distnace.hamming(strng1, strng2:
c2 = counts[strng2]
if c1 >= (c2*2):
adj_list_fast[strng1].append(strng2)
elif c2 >= (c1*2):
adj_list_fast[strng2].append(strng1)
性能:
print(timeit.timeit('adj_lst(my_list, counts)', number = 10000,
setup="from __main__ import adj_lst, my_list, counts"))
1.2892486669989012
print(timeit.timeit('adj_lst_fast(my_list, counts)', number = 10000,
setup="from __main__ import adj_lst_fast, my_list, counts"))
0.6437049919986748