我用网站列表编写了一个用于网络抓取的脚本,但是在54个请求之后它停止了并给了我域错误。我当时正在考虑跳过某个域,但这是怎么回事?
我尝试删除停止它的站点,但似乎错误仍然存在
请帮助!
import csv
import traceback
from random import randint
from time import sleep
from requests_html import HTML, HTMLSession, requests
BASE_URL = 'http://www.statshow.com/www/'
HEADERS = {
'User-agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
def write_csv(data, filename='domainresult.csv', mode='a'):
with open(filename, mode=mode, encoding='utf-8', newline='') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(data)
def get_domains(filename='domains.txt'):
domains = list()
with open(filename, encoding='utf-8') as f:
for line in f:
domains.append(line.strip())
return domains
def get_proxies():
url = 'https://free-proxy-list.net/' #<--- Will take proxies from this website and run them one by one until one of them work
session = HTMLSession()
response = session.get(url, headers=HEADERS)
proxies = set()
while not proxies:
for i in response.html.xpath('//tbody/tr'):
if i.xpath('.//td[7][contains(text(),"yes")]'):
proxy = ":".join([
i.xpath('.//td[1]/text()')[0],
i.xpath('.//td[2]/text()')[0]
])
proxies.add(proxy)
if not proxies:
wait = randint(1, 3)
print('Can not find proxy, waiting for {} before next finding'.
format(wait))
sleep(wait)
else:
return proxies
def get_domain_info(domain, proxy):
session = HTMLSession()
print('Checking {}'.format(domain))
r = session.get(
BASE_URL + domain,
headers=HEADERS,
proxies={
"http": proxy,
"https": proxy
})
pageviews, visitors = [
x.text.strip() for x in r.html.find('div#box_2 .red_bold')
]
write_csv([domain, pageviews, visitors])
def main():
write_csv(['Domain', 'Monthly Pageviews', 'Monthly Visitors'], mode='w') # <---- this will display on the header of the CSV file
domains = get_domains()
if not domains:
print('Please put domains in domains.txt file')
return
proxies = get_proxies()
for domain in domains:
is_success = False
while not is_success:
try:
if not proxies:
proxies = get_proxies()
proxy = proxies.pop()
get_domain_info(domain, proxy)
is_success = True
except:
print('Some error occurs, checking domain {} again'.format(
domain))
finally:
sleep(randint(1, 3)) # <--- Sleeper, configured for 1-2 seconds
if __name__ == "__main__":
main()