我有一个大约150个真核生物GFF文件的数据库,我已经解析了它们,以提取每个基因最长的同种型的ID,导致大约400万个序列ID。我尝试使用以下代码从UNIPROT中提取这些内容:
import requests
baseURL = 'http://www.uniprot.com/uniprot/'
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36','Content-Type': 'text/html',
}
sample = open('sample.txt','r')
for line in sample:
query = line.strip()
payload = {'id': query,
'format':'tab',
'columns': 'id,entry_name'}
result = requests.get(baseURL, headers = headers)
print result.text
和
import uniprot
import pprint
import requests
import sys, argparse
import os
import json
import shutil
url = 'http://www.uniprot.org/'
def _map(query, f, t, format='tab'):
tool = 'mapping/'
data = {
'from':f,
'to':t,
'format':format,
'query':query
}
response = requests.post(url + tool, data=data)
page = response.text
return page
seqids = []
i = open('sample.txt','r')
for line in i:
seqids = line.strip()
def idmap(ids, f, t, format='tab'):
if type(ids) is not list:
ids = [ids]
page = _map(' '.join(ids), f, t, format)
result = dict()
for row in page.splitlines()[1:]:
key, value = row.split('\t')
result[key]=[]
if key in result:
result[key].append(value)
else:
result[key].append(value)
return result
special=["\[u\'","\'\]","\."]
b=0
e=400
res={}
while e<=len(seqids):
a=idmap(seqids[b:e],f='ENSEMBLGENOME_PRO_ID',t="ACC")
res.update(a)
b+=400
e+=400
onion = open('onion','w')
items = []
items = res.values()
for item in items:
reiterate = dict()
uniprot_data = uniprot.batch_uniprot_metadata(item, 'cache')
org = ['organism']
acc = ['accs']
seq = ['sequence']
subdict = {}
reiterate = dict()
for key in uniprot_data:
if key in uniprot_data:
for acc_key in uniprot_data[key]:
if acc_key in acc:
separator = ","
accession = str(uniprot_data[key][acc_key]).replace("[","").replace("]","").replace("u'","").replace("'","").replace("', u'","__OR__").split(separator,1)[0]
for org_key in uniprot_data[key]:
if org_key in org:
orgs = str(uniprot_data[key][org_key]).replace(" ","_").replace(".","")
seperator = "_("
organism = orgs.split(seperator,1)[0]
for seq_key in uniprot_data[key]:
if seq_key in seq:
seqs = str(uniprot_data[key][seq_key])
print ">" + accession +'\t'+ organism + "\n" + seqs
#data = str(uniprot_data[key][subkey]).strip('\n')
#print data
#for i in clean_data:
# term = line.split('\t')
# seq = str(term[0])
# species = str(term[1])
# seqID = str(term[2])
# print ">" + species + "___" + seqID + "\n" + seq
#else:
# print key
但是,我一直收到空白文件或乱码文本输出。有谁知道一个简单的方法来快速做到这一点?理想情况下,我的输出将是:
Query ID Organism Sequence
我现在已经尝试了几个星期而且我不知所措。
谢谢