如何编辑Python脚本?

时间:2018-12-06 12:56:12

标签: python sequence edit fasta id

我在修改一个可以正常运行的 Python 脚本时遇到了困难。

我有2个文件:

  1. 包含ID的.txt文件
  2. 一个具有Fasta序列及其ID的.fasta文件。

此脚本的目的是比较这两个文件：当第一个文件中的ID与第二个文件中某条序列的ID匹配时，输出应为该ID、完整的序列及其ID。

我目前的脚本输出的是第一个文件中的ID，以及只带有序列ID的序列文本。

enter image description here

这是脚本:

# Print, for every FASTA record whose accession appears in the ID list,
# one line per matching ID-list row:
#     <fasta accession> <ID-list column 2> <ID-list column 1> <full sequence>

# Load the ID list once (instead of re-reading it for every FASTA line).
# Keys are the second whitespace-separated column, which is the value that
# gets compared with the FASTA accession; values are the first-column
# entries of every row carrying that key, in file order.
wanted = {}
with open('homosapiens_output1.txt', 'r') as f1:
    next(f1, None)  # ignores the first line
    for line1 in f1:
        fields = line1.split()
        if len(fields) < 2:
            continue  # blank or incomplete row: nothing to compare
        accession_y, accession_z = fields[0], fields[1]
        wanted.setdefault(accession_z, []).append(accession_y)


def _emit(accession_x, sequence_parts):
    """Print one output line per ID-list row that matches accession_x."""
    for accession_y in wanted.get(accession_x, ()):
        # accession_z equals accession_x for a match, so it is printed twice
        # to keep the "x z y sequence" column layout.
        print(accession_x + " " + accession_x + " " + accession_y + " "
              + "".join(sequence_parts))


accession_x = None   # accession of the record currently being read
sequence_parts = []  # sequence lines collected for that record

with open('uniprot_reviewed_taxonomy_9606.fasta', 'r') as f:
    for line in f:
        line = line.strip()
        if line.startswith('>'):
            # A new header closes the previous record, so report it now
            # with its complete (multi-line) sequence.
            _emit(accession_x, sequence_parts)
            full_name = line.split('|')
            # Headers without a '|' have no accession field to compare.
            accession_x = full_name[1] if len(full_name) > 1 else None
            sequence_parts = []
        elif accession_x is not None:
            sequence_parts.append(line)

# The last record has no following header to trigger its output.
_emit(accession_x, sequence_parts)

您能帮我修改这个脚本，使输出的内容为ID、完整的Fasta序列及其ID吗？

1 个答案:

答案 0 :(得分:0)

下面是一个小的测试示例，演示了该如何实现。

代码:

# Fixture: a three-record FASTA file.
test_content = """>tr|Q53XC5|Q53XC5_HUMAN Bone morphogenetic protein 4 OS=Homo sapiens OX=9606 GN=BMP4 PE=2 SV=1
MIPGNRMLMVVLLCQVLLGGASHASLIPETGKKKVAEIQGHAGGRRSGQSHELLRDFEAT
LLQMFGLRRRPQPSKSAVIPDYMRDLYRLQSGEEEEEQIHSTGLEYPERPASRANTVRSF
HHEEHLENIPGTSENSAFRFLFNLSSIPENEVISSAELRLFREQVDQGPDWERGFHRINI
YEVMKPPAEVVPGHLITRLLDTRLVHHNVTRWETFDVSPAVLRWTREKQPNYGLAIEVTH
LHQTRTHQGQHVRISRSLPQGSGNWAQLRPLLVTFGHDGRGHALTRRRRAKRSPKHHSQR
ARKKNKNCRRHSLYVDFSDVGWNDWIVAPPGYQAFYCHGDCPFPLADHLNSTNHAIVQTL
VNSVNSSIPKACCVPTELSAISMLYLDEYDKVVLKNYQEMVVEGCGCR
>tr|A8K571|A8K571_HUMAN Bone morphogenetic protein 7 (Osteogenic protein 1), isoform CRA_b OS=Homo sapiens OX=9606 GN=BMP7 PE=2 SV=1
MHVRSLRAAAPHSFVALWAPLFLLRSALADFSLDNEVHSSFIHRRLRSQERREMQREILS
ILGLPHRPRPHLQGKHNSAPMFMLDLYNAMAVEEGGGPGGQGFSYPYKAVFSTQGPPLAS
LQDSHFLTDADMVMSFVNLVEHDKEFFHPRYHHREFRFDLSKIPEGEAVTAAEFRIYKDY
IRERFDNETFRISVYQVLQEHLGRESDLFLLDSRTLWASEEGWLVFDITATSNHWVVNPR
HNLGLQLSVETLDGQSINPKLAGLIGRHGPQNKQPFMVAFFKATEVHFRSIRSTGSKQRS
QNRSKTPKNQEALRMANVAENSSSDQRQACKKHELYVSFRDLGWQDWIIAPEGYAAYYCE
GECAFPLNSYMNATNHAIVQTLVHFINPETVPKPCCAPTQLNAISVLYFDDSSNVILKKY
RNMVVRACGCH
>tr|A8K660|A8K660_HUMAN Adiponectin OS=Homo sapiens OX=9606 GN=ADIPOQ PE=2 SV=1
MLLLGAVLLLLALPGHDQETTTQGPGVLLPLPKGACTGWMAGIPGHPGHNGAPGRDGRDG
TPGEKGEKGDPGLIGPKGDIGETGVPGAEGPRGFPGIQGRKGEPGEGAYVYRSAFSVGLE
TYVTIPNMPIRFTKIFYNQQNHYDGSTGKFHCNIPGLYYFAYHITVYMKDVKVSLFKKDK
AMLFTYDQYQENNVDQASGSVLLHLEVGDQVWLQVYGEGERNGLYADNDNDSTFTGFLLY
HDTN
"""

# Fixture: the accessions to look up, one per line.
test_ids = 'A8K660\nQ53XC5\n'

# Write both fixtures to disk so the steps below read real files.
for fixture_path, fixture_text in (
    ('test_sequences.fasta', test_content),
    ('test_ids.txt', test_ids),
):
    with open(fixture_path, 'w') as f:
        f.write(fixture_text)

# Step 1: read the FASTA file and index every sequence by its accession,
# i.e. the second '|'-separated field of the header line.
with open('test_sequences.fasta', 'r') as f:
    records = f.read().split('>')

sequences = {}
for record in records:
    if not record:
        continue  # splitting on '>' leaves an empty chunk before the first record
    header, _, body = record.partition('\n')
    sequences[header.split('|')[1]] = body

# Step 2: read the requested ids, dropping the trailing newline of each line.
with open('test_ids.txt', 'r') as f:
    ids = [raw_id.strip() for raw_id in f]

# Step 3: keep only the ids that actually have a sequence.
filtered_ids = [id_ for id_ in ids if id_ in sequences]

# Step 4: write the matching records, each under a minimal '>|<id>' header.
with open('filtered_ids.txt', 'w') as f:
    f.writelines('>|' + id_ + '\n' + sequences[id_] for id_ in filtered_ids)

# the final function:

def ids_filter(ids_file, seq_file, out_file):
    """Copy the FASTA records whose accession is listed in ``ids_file``.

    The accession of a record is the second ``|``-separated field of its
    header line (``>db|ACCESSION|name ...``). Matching records are written
    to ``out_file`` in the order their ids appear in ``ids_file``, each
    under a minimal ``>|ACCESSION`` header followed by its sequence lines.

    Args:
        ids_file: Path to a text file with one accession per line.
            Surrounding whitespace is ignored and blank lines are skipped.
        seq_file: Path to the FASTA file to search.
        out_file: Path of the file to create (overwritten if it exists).

    Returns:
        None. The result is the content written to ``out_file``.

    Notes:
        Headers that contain no ``|`` have no accession field; such records
        are skipped instead of raising ``IndexError``. If an accession
        occurs more than once in ``seq_file``, the last record wins.
    """
    sequences = {}
    current_id = None
    with open(seq_file, 'r') as f:
        # Parse line by line: only a '>' at the start of a line opens a
        # record, so a '>' inside a description (e.g. "5->3") is harmless.
        for line in f:
            if line.startswith('>'):
                fields = line.split('|')
                current_id = fields[1].strip() if len(fields) > 1 else None
                if current_id is not None:
                    sequences[current_id] = []
            elif current_id is not None:
                sequences[current_id].append(line)

    with open(ids_file, 'r') as f:
        ids = [line.strip() for line in f]

    # Keep the order (and any repetition) of the requested ids.
    filtered_ids = [id_ for id_ in ids if id_ and id_ in sequences]

    with open(out_file, 'w') as f:
        for id_ in filtered_ids:
            body = ''.join(sequences[id_])
            # The last record of a file may lack a trailing newline; add one
            # so the next header never gets glued onto the sequence.
            if body and not body.endswith('\n'):
                body += '\n'
            f.write('>|' + id_ + '\n')
            f.write(body)