# Example input: each entry is a pair of sequences of equal length.
sample = [
    ['CGG', 'ATT'],
    ['GCGC', 'TAAA'],
]

# Per sequence pair: relative frequency of each base, first sequence then second.
d1 = [
    [{'G': 0.66, 'C': 0.33}, {'A': 0.33, 'T': 0.66}],
    [{'G': 0.5, 'C': 0.5}, {'A': 0.75, 'T': 0.25}],
]

# Per sequence pair: relative frequency of each (base, base) combination.
d2 = [
    {('C', 'A'): 0.33, ('G', 'T'): 0.66},
    {('G', 'T'): 0.25, ('C', 'A'): 0.5, ('G', 'A'): 0.25},
]
问题:
考虑第一对:['CGG','ATT']
如何计算a,其中a是:
float(a) = (freq of pairs) - ((freq of C in CGG) * (freq of A in ATT))
eg. in CA pairs, float (a) = (freq of CA pairs) - ((freq of C in CGG) * (freq of A in ATT))
Output a = (0.33) - ((0.33) * (0.33)) = 0.222222
为任何一个组合(CA对或GT对)计算“a”
Final Output for sample : a = [0.2222, - 0.125]
如何计算b,其中b为:
float (b) = (float(a)^2) / ((freq of C in CGG) * (freq of G in CGG) * (freq of A in ATT) * (freq of T in ATT))
Output b = 1
为整个列表执行此操作
Final Output for sample : b = [1, 0.3333]
我不知道如何从d1和d2中提取所需的值并执行数学运算。
我尝试编写以下代码来计算 a 的值:
float a = {k: float(d1[k][0]) - d2[k][0] * d2[k][1]for k in d1.viewkeys() & d2.viewkeys()}
但是,它不起作用。另外,我更喜欢for循环而不是理解
我还尝试为上述计算编写了一个(缺陷很多的)for 循环:
# Asker's attempt at collecting one a-value per entry of d2.
# As written this is not valid Python: `float (a) = ...` assigns to a call
# expression and `base[j][]` has an empty subscript.
float_a = []
# NOTE: enumerate() yields (index, item), so here `pair` is the index and
# `i` is the dict taken from d2 -- the names are the reverse of their use below.
for pair,i in enumerate(d2):
for base,j in enumerate(d1):
float (a) = pair[i][0] - base[j][] * base[j+1][]
float_a.append(a)
# Asker's attempt at deriving the b-values from float_a; also not valid Python.
float_b = []
# NOTE: enumerate() is not unpacked here, so `floata` is an (index, value) tuple.
for floata in enumerate(float_a):
for base,j in enumerate(d1):
# NOTE: this subtracts the product, whereas the formula for b given in the
# question divides a squared by it.
float (b) = (float(a) * float(a)) - (base[j] * base[j+1]*base[j+2]*base[j+3])
float_b.append(b)
答案 0 :(得分:1)
通常当有多个公式和中间步骤这样的棘手问题时,我喜欢通过将工作分成几个函数来模块化它。以下是生成的注释代码,用于处理原始问题和注释中的案例:
from collections import Counter
def get_base_freq(seq):
    """Return the relative frequency of each base in a sequence.

    Args:
        seq: A sized iterable of hashable symbols, typically a string
            such as ``'CGG'``.

    Returns:
        A dict mapping each distinct symbol to ``count / len(seq)`` as a
        float. An empty ``seq`` yields an empty dict.
    """
    seq_len = len(seq)
    base_counts = Counter(seq)
    # float() keeps this a true division under Python 2 as well.
    base_freqs = {base: float(count) / seq_len for base, count in base_counts.items()}
    return base_freqs
def get_pair_freq(seq1, seq2):
    """Return the relative frequency of each aligned base pair.

    The two sequences are walked in lockstep with ``zip`` and every
    ``(base_from_seq1, base_from_seq2)`` tuple is counted.

    Args:
        seq1: First sequence (any iterable of hashable symbols).
        seq2: Second sequence, aligned position by position with ``seq1``.

    Returns:
        A dict mapping each observed pair tuple to its share of all counted
        pairs, as a float. Returns an empty dict when no pairs are formed.
    """
    pairs = list(zip(seq1, seq2))
    # zip() stops at the shorter input, so normalise by the number of pairs
    # actually counted; this keeps the frequencies summing to 1 even when the
    # sequences differ in length (the surplus tail is ignored).
    pair_total = len(pairs)
    pair_counts = Counter(pairs)
    pair_freqs = {pair: float(count) / pair_total for pair, count in pair_counts.items()}
    return pair_freqs
def calc_a(d1, d2):
    """Compute the a-value for one sequence pair.

    Takes the first pair yielded by iterating ``d2`` and returns its
    frequency minus the product of the two individual base frequencies.
    Which pair comes first follows the dict's iteration order.

    Args:
        d1: Two-element sequence of base-frequency dicts; ``d1[0]`` is for
            the first sequence and ``d1[1]`` for the second.
        d2: Dict mapping ``(base1, base2)`` tuples to pair frequencies.

    Returns:
        ``d2[(base1, base2)] - d1[0][base1] * d1[1][base2]`` as a float.

    Raises:
        ValueError: If ``d2`` is empty.
        KeyError: If a base of the chosen pair is missing from ``d1``.
    """
    if not d2:
        raise ValueError("d2 must contain at least one pair frequency")
    # items() is a non-indexable view on Python 3, so pull the first entry
    # through an iterator instead of subscripting; this works on Python 2 too.
    first_pair, pair_freq = next(iter(d2.items()))
    base1, base2 = first_pair
    a = pair_freq - (d1[0][base1] * d1[1][base2])
    return a
def calc_b(a, d1):
    """Compute the b-value: ``a`` squared over the product of all base frequencies.

    Args:
        a: The a-value for the sequence pair.
        d1: Two-element sequence of base-frequency dicts; every value in
            ``d1[0]`` and ``d1[1]`` is multiplied into the denominator.

    Returns:
        ``a * a / product`` as a float.

    Raises:
        ZeroDivisionError: If any frequency in ``d1`` is zero.
    """
    denom = 1
    # dict.values() is a view on Python 3 and cannot be concatenated with +,
    # so walk the two dicts one after the other instead.
    for base_freqs in (d1[0], d1[1]):
        for val in base_freqs.values():
            denom *= val
    b = a * a / float(denom)
    return b
if __name__ == "__main__":
    # Demo run: compute the b-value for each sequence pair and print the list.
    sample = [['CGG', 'ATT'], ['GCGC', 'TAAA'], ['ACAA', 'CAAC']]
    b_result = []
    for seq_pair in sample:
        # Per-sequence base frequencies, then frequencies of the aligned pairs.
        d1 = [get_base_freq(seq) for seq in seq_pair]
        d2 = get_pair_freq(*seq_pair)
        a = calc_a(d1, d2)
        b = calc_b(a, d1)
        b_result.append(b)
    # Call form of print runs on Python 3 and prints the same on Python 2.
    print(b_result)
如果有任何事情需要澄清,或者我没有考虑过的情况失败,请告诉我。