今天早些时候,我还可以用下面的代码从Google Patents中提取数据:
import urllib2
# Google search URL for the query ininventor:"John-Mudd".
# NOTE(review): tbm=pts / tbs=ptso:us presumably select patent search limited
# to US patents -- confirm against Google's URL parameters.
url = 'http://www.google.com/search?tbo=p&q=ininventor:"John-Mudd"&hl=en&tbm=pts&source=lnt&tbs=ptso:us'
# Build the request with an explicit User-Agent header ("foobar") rather than
# relying on urllib2's default.
req = urllib2.Request(url, headers={'User-Agent' : "foobar"})
# Issue the request. Redirects are followed by urllib2; any final non-2xx
# status is raised as urllib2.HTTPError (this is the call that fails with
# "HTTP Error 503: Service Unavailable" in the traceback shown below).
response = urllib2.urlopen(req)
现在再运行它时,却得到下面的503错误。这段代码我总共只循环执行了大约30次(我想获取一份30人名单中每个人所拥有的全部专利)。
HTTPError Traceback (most recent call last)
<ipython-input-4-01f83e2c218f> in <module>()
----> 1 response = urllib2.urlopen(req)
C:\Python27\lib\urllib2.pyc in urlopen(url, data, timeout)
124 if _opener is None:
125 _opener = build_opener()
--> 126 return _opener.open(url, data, timeout)
127
128 def install_opener(opener):
C:\Python27\lib\urllib2.pyc in open(self, fullurl, data, timeout)
404 for processor in self.process_response.get(protocol, []):
405 meth = getattr(processor, meth_name)
--> 406 response = meth(req, response)
407
408 return response
C:\Python27\lib\urllib2.pyc in http_response(self, request, response)
517 if not (200 <= code < 300):
518 response = self.parent.error(
--> 519 'http', request, response, code, msg, hdrs)
520
521 return response
C:\Python27\lib\urllib2.pyc in error(self, proto, *args)
436 http_err = 0
437 args = (dict, proto, meth_name) + args
--> 438 result = self._call_chain(*args)
439 if result:
440 return result
C:\Python27\lib\urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
376 func = getattr(handler, meth_name)
377
--> 378 result = func(*args)
379 if result is not None:
380 return result
C:\Python27\lib\urllib2.pyc in http_error_302(self, req, fp, code, msg, headers)
623 fp.close()
624
--> 625 return self.parent.open(new, timeout=req.timeout)
626
627 http_error_301 = http_error_303 = http_error_307 = http_error_302
C:\Python27\lib\urllib2.pyc in open(self, fullurl, data, timeout)
404 for processor in self.process_response.get(protocol, []):
405 meth = getattr(processor, meth_name)
--> 406 response = meth(req, response)
407
408 return response
C:\Python27\lib\urllib2.pyc in http_response(self, request, response)
517 if not (200 <= code < 300):
518 response = self.parent.error(
--> 519 'http', request, response, code, msg, hdrs)
520
521 return response
C:\Python27\lib\urllib2.pyc in error(self, proto, *args)
442 if http_err:
443 args = (dict, 'default', 'http_error_default') + orig_args
--> 444 return self._call_chain(*args)
445
446 # XXX probably also want an abstract factory that knows when it makes
C:\Python27\lib\urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
376 func = getattr(handler, meth_name)
377
--> 378 result = func(*args)
379 if result is not None:
380 return result
C:\Python27\lib\urllib2.pyc in http_error_default(self, req, fp, code, msg, hdrs)
525 class HTTPDefaultErrorHandler(BaseHandler):
526 def http_error_default(self, req, fp, code, msg, hdrs):
--> 527 raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
528
529 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 503: Service Unavailable
答案 0 :(得分:4)
遗憾的是,谷歌的服务条款(TOS)禁止自动查询。几乎可以肯定,它检测到了你在“图谋不轨”。
答案 1 :(得分:1)
纯属猜测:
您是否检查过响应中有没有“Retry-After”标头?对于503响应,这种情况是很有可能出现的。
14.37 Retry-After
Retry-After响应标头字段可与503(服务不可用)响应一起使用,以指示服务预计在多长时间内对发出请求的客户端不可用。该字段也可以与任何3xx(重定向)响应一起使用,以指示用户代理在发出重定向后的请求之前至少需要等待的时间。该字段的值可以是HTTP日期,也可以是响应时间之后的整数秒数(十进制)。 Retry-After = "Retry-After" ":" ( HTTP-date | delta-seconds )
其用法的两个例子是 Retry-After: Fri, 31 Dec 1999 23:59:59 GMT 和 Retry-After: 120
在后一个例子中,延迟是2分钟。