我编写了一段代码来旋转多线程爬虫的代理,但它看起来不太好,我想看看我能改进什么。
我的想法:
1)使用代理发出大量请求(随机范围),然后更改
2)如果被阻止,请更改代理(将其从代理列表中删除)并重试。
3)如果发生HTTP错误,请使用相同的代理重试
4)如果发生代理错误,请更改代理(将其从代理列表中删除),然后重试。
通常情况下,它看起来很不错,但我发现可能会出现一些问题:
1)make_request函数调用itsel,在某些情况下可能导致infinte循环
2)代理错误处理不当
这是我的代码:
import requests
import threading
import random
import time
import logging
import os
class Crawler():
def __init__(self):
self.user_agents = []
with open('user_agents.txt', 'r') as inpt:
for line in inpt:
if line.strip():
self.user_agents.append(line.strip())
self.proxies = []
with open('proxies.txt', 'r') as inpt:
for line in inpt:
if not line.strip():
continue
self.proxies.append({"http": ''.join(["http://",
line.strip()]),
"https": ''.join(["https://",
line.strip()])})
self.headers = {'User-agent': random.choice(self.user_agents)}
self.session = requests.Session()
self.counter = 0
self.current_proxy = None
self.lock = threading.Lock()
self.set_proxy()
def make_request(self, method, url, **kwargs):
"""Request a page and return its content
@method - string, POST or GET
@url - string
@return: string, HTML page source
or bytes for binary files
"""
# make only 10 to 20 requests using a proxy
with self.lock:
if self.counter > random.randrange(10, 20):
self.set_proxy()
else:
self.counter += 1
try:
if method == 'GET':
if kwargs.get('download', False):
req = self.session.get(url,
headers=self.headers,
stream=True, verify=False)
return req.raw
req = self.session.get(url,
headers=self.headers,
verify=False,
**kwargs)
else:
req = self.session.post(url,
headers=self.headers,
verify=False,
**kwargs)
if req.status_code == 407:
logging.exception('make_request[Proxy Authentication]')
os._exit(1)
if req.encoding not in ['utf8', 'utf-8', None]:
html = req.content.decode(req.encoding)
else:
html = req.text
if 'Access Denied' in html:
# website's error message. proxy blocked
with self.lock:
self.set_proxy(remove=True)
time.sleep(1)
return self.make_request(method, url, **kwargs)
else:
return html
except requests.exceptions.HTTPError as e:
if e.response.status_code == 403:
# access forbidden. proxy blocked
with self.lock:
self.set_proxy(remove_proxy=True)
time.sleep(1)
return self.make_request(method, url, **kwargs)
elif e.response.status_code == 404:
logging.exception(' '.join([
'make_request[HTTPError]',
url, str(e)]))
return
elif e.response.status_code == 429:
# too many requests. proxy blocked
with self.lock:
self.set_proxy(remove_proxy=True)
time.sleep(1)
return self.make_request(method, url, **kwargs)
else:
logging.exception(' '.join([
'make_request[unknown HTTPError]',
url, str(e)]))
return None
except requests.exceptions.InvalidURL as e:
logging.exception(' '.join([
'make_request[InvalidURL]',
url, str(e)]))
return None
except requests.exceptions.Timeout:
time.sleep(1)
return self.make_request(method, url, **kwargs)
except requests.exceptions.ConnectionError as e:
# Connection refused
if '403 Forbidden' in str(e):
logging.exception('make_requests[403 forbidden]')
os._exit(1)
with self.lock:
self.set_proxy()
time.sleep(1)
return self.make_request(method, url, **kwargs)
except Exception as e:
logging.exception(' '.join([
'make_request[unknown Exception]',
url, str(e)]))
return None
def set_proxy(self, remove_proxy=False):
"""Get a random proxy from the list"""
if remove_proxy:
try:
self.proxies.remove(self.current_proxy)
except:
pass
while True:
if self.proxies:
proxy = random.choice(self.proxies)
if not self.is_alive(proxy):
continue
self.current_proxy = proxy
self.session = requests.Session()
self.session.proxies = self.current_proxy
self.headers = {'User-agent': random.choice(self.user_agents)}
self.counter = 0
break
else:
logging.exception('EMPTY PROXY LIST')
os._exit(1)
break
def is_alive(self, proxy):
"""Check if a proxy is alive or not
@proxy - dict
@return: True if alive, False otherwise
"""
try:
requests.get('http://www.google.com',
proxies=proxy, timeout=5)
return True
except:
return False
由于