我目前正在开展一个项目,我需要从PubMed下载几千个引文。我目前正在使用BioPython并编写了这段代码:
from Bio import Entrez
from Bio import Medline
from pandas import *
from sys import argv
import os
Entrez.email = "email"
df = read_csv("/Users/.../Desktop/sr_dataset/adhd/excluded/adhdExcluded.csv")
i=0
withoutMesh = 0
withoutMeshID = ""
withoutAbstract = 0
withoutAbstractID = ""
path = '/Users/.../Desktop/sr_dataset/adhd/excluded'
for index, row in df.iterrows():
print (row.id)
handle = Entrez.efetch(db="pubmed",rettype="medline",retmode="text", id=str(row.id))
records = Medline.parse(handle)
for record in records:
try:
abstract = str(record["AB"])
except:
abstract = "none"
withoutAbstract = withoutAbstract +1
withoutAbstractID = withoutAbstractID + str(row.id) + "\n"
try:
title = str(record["TI"])
except:
title = "none"
try:
mesh = str(record["MH"])
except:
mesh = "none"
withoutMesh = withoutMesh +1
withoutMeshID = withoutMeshID + str(row.id) + "\n"
filename= str(row.id) + '.txt'
filename = os.path.join(path, filename)
file = open(filename, "w")
output = "title: "+str(title) + "\n\n" + "abstract: "+str(abstract) + "\n\n" + "mesh: "+str(mesh) + "\n\n"
file.write(output)
file.close()
print (i)
i=i+1
filename = os.path.join(path, "overview.txt")
file = open(filename, "w")
output = "Without MeSH terms:" + str(withoutMesh) + "\n" + "ID's: "+str(withoutMeshID) + "\n\n" + "Without abstract: "+str(withoutAbstract) + "\n" + "ID's: "+str(withoutAbstractID)
file.write(output)
file.close()
代码适用于表格中的前几百行,但随后停止执行,我收到的错误是:
Traceback (most recent call last):
File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 1254, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/Users/.../anaconda/lib/python3.5/http/client.py", line 1106, in request
self._send_request(method, url, body, headers)
File "/Users/.../anaconda/lib/python3.5/http/client.py", line 1151, in _send_request
self.endheaders(body)
File "/Users/.../anaconda/lib/python3.5/http/client.py", line 1102, in endheaders
self._send_output(message_body)
File "/Users/.../anaconda/lib/python3.5/http/client.py", line 934, in _send_output
self.send(msg)
File "/Users/.../anaconda/lib/python3.5/http/client.py", line 877, in send
self.connect()
File "/Users/.../anaconda/lib/python3.5/http/client.py", line 1260, in connect
server_hostname=server_hostname)
File "/Users/.../anaconda/lib/python3.5/ssl.py", line 377, in wrap_socket
_context=self)
File "/Users/.../anaconda/lib/python3.5/ssl.py", line 752, in __init__
self.do_handshake()
File "/Users/.../anaconda/lib/python3.5/ssl.py", line 988, in do_handshake
self._sslobj.do_handshake()
File "/Users/.../anaconda/lib/python3.5/ssl.py", line 633, in do_handshake
self._sslobj.do_handshake()
ConnectionResetError: [Errno 54] Connection reset by peer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/.../Desktop/sr_dataset/ace_inhibitor/excluded/pumbedMedline.py", line 18, in <module>
handle = Entrez.efetch(db="pubmed",rettype="medline",retmode="text", id=str(row.id))
File "/Users/.../anaconda/lib/python3.5/site-packages/biopython-1.68-py3.5-macosx-10.6-x86_64.egg/Bio/Entrez/__init__.py", line 180, in efetch
return _open(cgi, variables, post=post)
File "/Users/.../anaconda/lib/python3.5/site-packages/biopython-1.68-py3.5-macosx-10.6-x86_64.egg/Bio/Entrez/__init__.py", line 524, in _open
handle = _urlopen(cgi)
File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 466, in open
response = self._open(req, data)
File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 484, in _open
'_open', req)
File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 444, in _call_chain
result = func(*args)
File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 1297, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 1256, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 54] Connection reset by peer>
以下是CSV文件的前几列:
id
10029645
10073846
10078088
10080457
10088066
...
答案 0 :(得分:1)
Biopython确实遵循&#34;每秒最多三个查询规则&#34;为避免滥用NCBI服务器,但您已经错过了我们的教程http://biopython.org/DIST/docs/tutorial/Tutorial.html中关于指南的第一个要点:
&#34;对于任何超过100个请求的系列,请在周末或 美国高峰时段之外。这取决于你服从。&#34;
也就是说,有时您会从Entrez获得间歇性错误,并建议使用try / except块来处理此问题。本教程中有一个例子。