从在线资源中提取序列

时间:2018-02-19 16:05:44

标签: python bioinformatics data-retrieval

我有一个包含约150个真核生物GFF文件的数据库。我已经解析了这些文件,提取出每个基因最长异构体(isoform)的ID,共得到约400万个序列ID。我尝试使用以下代码从UniProt中获取这些序列:

import requests

# Query endpoint on the UniProt site. The host is uniprot.org, matching the
# `url` used by the mapping script further down; the original pointed at
# uniprot.com.
baseURL = 'http://www.uniprot.org/uniprot/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
    'Content-Type': 'text/html',
}

# sample.txt is read one identifier per line; the context manager closes the
# file once the loop finishes.
with open('sample.txt', 'r') as sample:
    for line in sample:
        query = line.strip()
        if not query:
            continue  # a blank line carries no identifier to look up
        # NOTE(review): confirm that the endpoint accepts the search term
        # under 'id'; the service may expect it under 'query' instead.
        payload = {
            'id': query,
            'format': 'tab',
            'columns': 'id,entry_name',
        }
        # The payload has to be sent as query-string parameters. The original
        # built it but never passed it, so every request fetched the same
        # bare URL regardless of the identifier.
        result = requests.get(baseURL, params=payload, headers=headers)
        print(result.text)

import uniprot
import pprint
import requests
import sys, argparse
import os
import json
import shutil

# Base URL of the UniProt site; _map() appends a tool path ('mapping/') to it.
url = 'http://www.uniprot.org/'

def _map(query, f, t, format='tab'):
    """Post an ID-mapping request and return the raw response text.

    Args:
        query: Identifiers to map, as one string (idmap() joins them with
            single spaces).
        f: Source identifier type, sent as the ``from`` form field.
        t: Target identifier type, sent as the ``to`` form field.
        format: Response format requested from the service.

    Returns:
        The body of the HTTP response as text.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    tool = 'mapping/'
    data = {
        'from': f,
        'to': t,
        'format': format,
        'query': query,
    }
    response = requests.post(url + tool, data=data)
    # Fail loudly on an HTTP error instead of handing an error page to the
    # caller, which would otherwise try to parse it as mapping rows.
    response.raise_for_status()
    return response.text

# Collect every identifier from sample.txt, one per line. The original
# rebound `seqids` to the stripped line on each iteration, leaving a single
# string (the last line) instead of a list of IDs.
seqids = []
with open('sample.txt', 'r') as i:
    for line in i:
        seqid = line.strip()
        if seqid:  # ignore blank lines
            seqids.append(seqid)

def idmap(ids, f, t, format='tab'):
    """Map identifiers via ``_map`` and group the results by source ID.

    Args:
        ids: A single identifier or a list/tuple of identifiers.
        f: Source identifier type, forwarded to ``_map``.
        t: Target identifier type, forwarded to ``_map``.
        format: Response format, forwarded to ``_map``.

    Returns:
        A dict mapping each source identifier found in the response to the
        list of all target identifiers reported for it.

    Raises:
        ValueError: If a non-empty response row does not consist of exactly
            two tab-separated columns.
    """
    if not isinstance(ids, (list, tuple)):
        ids = [ids]
    page = _map(' '.join(ids), f, t, format)
    result = {}
    # The first line of the response is skipped as a header row.
    for row in page.splitlines()[1:]:
        if not row.strip():
            continue
        key, value = row.split('\t')
        # Accumulate values per key. The original reset the list to [] on
        # every row, so only the last mapping for each key survived.
        result.setdefault(key, []).append(value)
    return result


# Number of identifiers submitted to the mapping service per request.
batch_size = 400

# Map the identifiers batch by batch. Stepping with range() also submits the
# final, shorter batch; the original `while e <= len(seqids)` loop dropped it,
# so an input with fewer than 400 IDs was never mapped at all.
res = {}
for b in range(0, len(seqids), batch_size):
    a = idmap(seqids[b:b + batch_size], f='ENSEMBLGENOME_PRO_ID', t="ACC")
    res.update(a)

# Each record is printed (as before) and also written to the 'onion' file,
# which the original opened but never wrote to or closed.
with open('onion', 'w') as onion:
    for item in res.values():
        uniprot_data = uniprot.batch_uniprot_metadata(item, 'cache')
        for key in uniprot_data:
            # Assumes each metadata entry is a dict keyed by field name, as
            # the original key-membership tests imply -- TODO confirm.
            entry = uniprot_data[key]
            # Skip entries lacking a field. The original would silently reuse
            # the value left over from a previous entry, or hit a NameError
            # on the very first one.
            if not all(name in entry for name in ('accs', 'organism', 'sequence')):
                continue

            # Keep only the first accession.
            accs = entry['accs']
            if isinstance(accs, (list, tuple)):
                accession = str(accs[0]) if accs else ''
            else:
                accession = str(accs).split(",", 1)[0]

            # Underscore-join the organism name, drop dots, and cut anything
            # from the first "_(" onwards.
            orgs = str(entry['organism']).replace(" ", "_").replace(".", "")
            organism = orgs.split("_(", 1)[0]

            seqs = str(entry['sequence'])

            record = ">" + accession + '\t' + organism + "\n" + seqs
            print(record)
            onion.write(record + "\n")

                    #data = str(uniprot_data[key][subkey]).strip('\n')
                    #print data

                    #for i in clean_data:
                    #   term = line.split('\t')
                    #   seq = str(term[0])
                    #   species = str(term[1])
                    #   seqID = str(term[2])
                    #   print ">" + species + "___" + seqID + "\n" + seq
        #else:
        #   print key

但是,我一直收到空白文件或乱码文本输出。有谁知道一个简单的方法来快速做到这一点?理想情况下,我的输出将是:

Query    ID    Organism    Sequence

我已经尝试了好几个星期,实在不知道该怎么办了。

谢谢

0 个答案:

没有答案