我的代码:
import sqlite3, os, urllib.request
from xml.dom import minidom
if os.path.exists("data.db"):
con = sqlite3.connect("data.db")
cursor = con.cursor()
sql = "SELECT * FROM data WHERE test= '123'"
cursor.execute(sql)
else:
print("ERROR")
for dsatz in cursor:
#print(dsatz)
link = 'http://test.org/publication/' + dsatz[0] + '' + dsatz[1] +'/bib'
#print(link)
web_data = urllib.request.urlopen(link)
xmldoc = minidom.parse(web_data)
di = xmldoc.getElementsByTagName("document-id")[:1]
for x in di:
publicationcountry = x.getElementsByTagName("country")[0].firstChild.data
publicationdocnumber = x.getElementsByTagName("doc-number")[0].firstChild.data
punlicationkind = x.getElementsByTagName("kind")[0].firstChild.data
publicationdate = x.getElementsByTagName("date")[0].firstChild.data
sql = "INSERT INTO link_xml_data VALUES('" \
+ publicationcountry + "', '" \
+ str(publicationdocnumber) + "', '" \
+ punlicationkind + "')"
con.close()
但在15个链接之后我得到了错误:
Traceback (most recent call last):
File "C:\Users\j\3.py", line 34, in <module>
web_data = urllib.request.urlopen(link)
File "C:\Users\j\Python35-32\lib\urllib\request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\j\Python35-32\lib\urllib\request.py", line 472, in open
response = meth(req, response)
File "C:\Users\j\Python35-32\lib\urllib\request.py", line 582, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\j\Python35-32\lib\urllib\request.py", line 510, in error
return self._call_chain(*args)
File "C:\Users\j\Python35-32\lib\urllib\request.py", line 444, in _call_chain
result = func(*args)
File "C:\Users\j\Python35-32\lib\urllib\request.py", line 590, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
我应该添加或更改什么?
答案 0 :(得分:0)
Web服务器告诉您该链接是被禁止的。您的代码(可能)没有任何问题。
某些链接是否始终有效,其他链接是否总是失败,或者模式是否随时间而变化?
获得403 Forbidden响应后,您是否尝试过返回并重新请求其中一个早期成功的链接?
也许服务器最终会将您识别为网络刮刀,并告诉您要离开?