Python Dict和For循环与FASTA文件

时间:2014-11-18 20:15:52

标签: python for-loop dictionary fasta


L: 139002 (10.7%) 

A: 123885 (9.6%) 

G: 95475 (7.4%) 

V: 91683 (7.1%) 

I: 77836 (6.0%)


# Tally how often each of the 20 one-letter amino-acid codes occurs in the file.
ecoli = open("/home/file_pathway").read()
counts = dict()
# NOTE: read() returns a single string, so this loop visits one character at a
# time rather than one line at a time; split() then yields that character, or
# an empty list when the character is whitespace.
# NOTE(review): nothing here skips lines starting with ">", so letters on
# those lines are tallied as well.
for line in ecoli:
    words = line.split()
    for word in words:
        if word in ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]:
            if word not in counts:
                counts[word] = 1
            else:
                # Increment only letters that were seen before; incrementing
                # inside the first-seen branch would pin every tally at 2.
                counts[word] += 1

# Emit "<letter> <count>" per line; this form prints the same text under
# Python 2 and Python 3.
for key in counts:
    print("{} {}".format(key, counts[key]))


3 个答案:

答案 0 :(得分:3)


from collections import defaultdict

# The 20 one-letter codes to tally; any other character is ignored.
# Built once so the membership test inside the loop stays O(1).
AMINO_ACIDS = {"A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"}

counts = defaultdict(int)  # missing keys start at 0, so no "first seen" branch is needed
with open("input.fasta") as ecoli: # will close your file automatically
    for line in ecoli: # iterate over file object, no need to read all contents into memory
        if line.startswith(">"): # skip lines that start with >
            continue
        for char in line: # just iterate over the characters in the line
            if char in AMINO_ACIDS:
                counts[char] += 1

# float() keeps the division below a true division on Python 2 as well.
total = float(sum(counts.values()))
for key, val in counts.items():
    print("{}: {}, ({:.1%})".format(key, val, val / total))

您也可以使用 collections.Counter(dict 的子类),因为跳过以 > 开头的行之后,其余各行只包含您感兴趣的内容:

from collections import Counter

counts = Counter()
with open("input.fasta") as ecoli: # will close your file automatically
    for line in ecoli: # iterate over file object, no need to read all contents into memory
        if line.startswith(">"): # skip lines that start with >
            continue
        # Assumes every remaining line holds only sequence letters, so each
        # character (minus trailing whitespace/newline) can be tallied as-is.
        counts.update(line.rstrip())

# float() keeps the division below a true division on Python 2 as well.
total = float(sum(counts.values()))
for key, val in counts.items():
    print("{}: {}, ({:.1%})".format(key, val, val / total))

答案 1 :(得分:0)




# Read the whole file up front; the with-block closes the handle afterwards.
with open("/home/file_pathway.faa") as fasta_file:
    ecoli = fasta_file.read()

# One-letter amino-acid codes to tally (these are not nucleic-acid symbols).
amino_acids = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
# Start every code at zero so letters that never occur are still present.
counts = dict.fromkeys(amino_acids, 0)
total = 0

for line in ecoli.split('\n'):
    # Any line containing ">" anywhere is skipped, not just lines starting with it.
    if ">" not in line:
        # total is the raw length of the line, so characters outside
        # amino_acids contribute to it as well.
        total += len(line)
        for acid in counts:
            counts[acid] += line.count(acid)

答案 2 :(得分:-1)


from collections import Counter

# NOTE: `filename` is not defined in this snippet; set it to the FASTA path first.
acids = ""                      # dunno if this is the right terminology
with open(filename, 'r') as ecoli_file:
    for line in ecoli_file:
        if line.startswith('>'):
            continue            # skip lines that start with >
        # from what I saw in the FASTA files, the character-check is
        # not necessary anymore...
        acids += line.strip()   # stripping newline and possible whitespaces
counter = Counter(acids)        # and all the magic is done.
# float() keeps the division below a true division on Python 2 as well.
total = float(sum(counter.values()))
for k, v in counter.items():
    print("{}: {} ({:.1%})".format(k, v, v / total))


from collections import Counter

# NOTE: `filename` is not defined in this snippet; set it to the FASTA path first.
with open(filename) as f:
    # Feed Counter every character of every line that does not start with '>',
    # after stripping surrounding whitespace (including the newline).
    counter = Counter(c for line in f if not line.startswith('>')
                      for c in line.strip())
# Report each character with its count and its share of the total.
# float() keeps the division a true division on Python 2 as well.
total = float(sum(counter.values()))
for k, v in counter.items():
    print("{}: {} ({:.1%})".format(k, v, v / total))