class HttpProxyMiddleware(object):
    """Scrapy-style downloader middleware that rotates through a proxy list.

    Proxies are read once, at construction time, from the file named by the
    ``PROXY_LIST`` setting (one ``host:port`` entry per line). Requests that
    do not already carry a ``proxy`` meta key are assigned the currently
    selected proxy. When a download raises, the selection advances to the
    next proxy (wrapping around) and a copy of the request is returned so it
    can be scheduled again.
    """

    # NOTE(review): the original carried a "never retry these errors" comment
    # here with no accompanying definition; every exception is retried, with
    # no upper bound on the number of attempts.

    def __init__(self, settings):
        """Load the proxy list named by ``settings.get('PROXY_LIST')``.

        Args:
            settings: object exposing ``get(name)``; ``PROXY_LIST`` must be
                the path of a text file with one ``host:port`` per line.

        Raises:
            IOError/OSError: if the proxy list file cannot be opened.
        """
        socket.setdefaulttimeout(3)
        self.proxies = []
        # Start at the first entry; starting at 1 skipped it and raised
        # IndexError for a single-entry list.
        self.proxy_index = 0
        self.proxy_list = settings.get('PROXY_LIST')
        # ``with`` guarantees the file is closed even if reading fails.
        with open(self.proxy_list) as fin:
            for line in fin:
                line = line.strip()
                if not line:
                    # A blank line would otherwise become the bogus proxy
                    # 'http://'.
                    continue
                self.proxies.append('http://%s' % line)
        print(self.proxies)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from ``crawler.settings``."""
        return cls(crawler.settings)

    def process_request(self, request, spider):
        """Attach the currently selected proxy to *request*.

        Requests that already have a ``proxy`` meta key are left untouched.

        Raises:
            ValueError: if the proxy list is empty.
        """
        if 'proxy' in request.meta:
            return
        if not self.proxies:
            raise ValueError('All proxies are unusable, cannot proceed')
        # The modulo keeps the lookup in range even if proxy_index was set
        # past the end of the list from outside.
        proxy_address = self.proxies[self.proxy_index % len(self.proxies)]
        print(proxy_address)
        request.meta['proxy'] = proxy_address

    def process_exception(self, request, exception, spider):
        """Advance to the next proxy and return a retry copy of *request*.

        The copy has ``dont_filter`` set so a duplicate filter does not drop
        it. If the failed request was using one of this middleware's
        proxies, the copy is pointed at the newly selected proxy.
        """
        print('not working')
        failed_proxy = request.meta.get('proxy')
        new_request = request.copy()
        new_request.dont_filter = True
        if self.proxies:
            # Wrap around instead of running off the end of the list.
            self.proxy_index = (self.proxy_index + 1) % len(self.proxies)
            if failed_proxy in self.proxies:
                # The copy inherits meta['proxy']; without replacing it,
                # process_request would return early and the retry would
                # reuse the proxy that just failed. A proxy the caller set
                # explicitly (not from our list) is left alone.
                new_request.meta['proxy'] = self.proxies[self.proxy_index]
        return new_request
我有一份来自 HMA 的完整代理列表,并尝试用上面这个中间件来抓取亚马逊的商品。然而事实证明它几乎不起作用:在 `process_request` 中打印出代理地址之后,程序会停顿很长时间,最终以失败告终。
我还编写了下面的程序来逐个测试列表中的代理 IP。在这个测试程序里,所有代理 IP 都能正常工作。
def main(proxy_list='/users/zehuapan/desktop/amazon/amazon/proxy_list.txt',
         valid_list='/users/zehuapan/desktop/amazon/amazon/valid_proxy_list.txt'):
    """Probe every proxy in *proxy_list* and save the working ones.

    Args:
        proxy_list: path of a text file with one ``host:port`` per line.
        valid_list: path of the file that receives the proxies accepted by
            ``check_validity``, one ``http://host:port`` per line. It is
            overwritten.

    Raises:
        IOError/OSError: if either file cannot be opened.
    """
    socket.setdefaulttimeout(3)

    # Read the whole list first so the input file is not held open while the
    # (slow) network probes run; ``with`` closes it even on error.
    with open(proxy_list) as fin:
        entries = fin.read().splitlines()

    proxies = []
    for entry in entries:
        entry = entry.strip()
        if not entry:
            # A blank line would otherwise be probed as the bogus 'http://'.
            continue
        candidate = 'http://%s' % entry
        print(candidate)
        if check_validity(candidate):
            proxies.append(candidate)
    print(proxies)

    with open(valid_list, 'w+') as out:
        for proxy in proxies:
            out.write(proxy + '\n')
def check_validity(proxy):
    """Try to fetch https://www.amazon.com through *proxy*.

    Args:
        proxy: proxy URL such as ``'http://host:port'``.

    Returns:
        ``True`` if the page was fetched; the integer HTTP status code if the
        server answered with an HTTP error; ``False`` on any other failure
        (connection refused, timeout, ...).

    NOTE(review): a non-zero status code is truthy, so a caller that treats
    the result as a boolean will accept a proxy that got e.g. a 503.
    """
    try:
        # Map BOTH schemes to the proxy. The probe URL is HTTPS; with only
        # an 'http' entry urllib2 connects directly and never touches the
        # proxy, which makes every proxy look valid.
        proxy_handler = urllib2.ProxyHandler({'http': proxy, 'https': proxy})
        opener = urllib2.build_opener(proxy_handler)
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        # Kept from the original: this installs the opener process-wide.
        urllib2.install_opener(opener)
        req = urllib2.Request('https://www.amazon.com')
        sock = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        print('Error code:  %s' % (e.code,))
        return e.code
    except Exception as detail:
        print('ERROR: %s' % (detail,))
        return False
    # Release the connection; the original never closed the response.
    sock.close()
    return True
# Run the proxy check only when this file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    main()
答案 0 :(得分:0)
代理完全没有任何响应是不太可能的。在发出数百个请求之后,可用代理连接亚马逊的速度会迅速下降。当代理仍然有效时,你可能会收到状态码为 200 的验证码页面;而当代理被封禁时,你会收到 503 状态码。此外,某个代理可能在首页和商品列表页上可用,但在商品详情页上失败。希望对你有帮助。