我在 Ubuntu 14.04 上使用 Python 2.7 配合旋转代理进行网页抓取……几分钟后抓取报错:
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', BadStatusLine("''",))
if keyword1 in text and keyword2 in text and keyword3 in text:
print("LINK SCRAPED")
print(text, "link scraped")
found = True
break
except requests.exceptions.ConnectionError as err:
print("Encountered ConnectionError, retrying: {}".format(err))
如果这不是实现 try/except 的正确方法,我的理解是否正确:只有 request 调用放进 try 子句,其余代码全部放在 except 之后?
答案 0(得分:2):
您可以使用try / except语句处理错误,而不是重新启动脚本。
例如:
# Minimal retry pattern: keep the try body down to the single statement
# that can actually raise, and handle only the specific exception.
try:
    response = requests.get(url)  # the one failing line goes inside ``try``
except requests.exceptions.ConnectionError as err:
    print("Encountered ConnectionError, retrying: {}".format(err))
然后重试原来的调用。
更新:根据您更新的代码示例,以下是我要做的事情:
# Scrape the Fabspy sitemap through a rotating proxy, retrying each HTTP
# request on ConnectionError instead of letting the script die.
from bs4 import BeautifulSoup
import requests
import smtplib
import urllib2
from random import randint
import time
from lxml import etree
from time import sleep
import random

proxies = {'https': '100.00.00.000:00000'}

hdr1 = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

hdrs = [hdr1]  # , hdr2, hdr3, hdr4, hdr5, hdr6, hdr7]

# BUG FIX: random.choice(hdrs) returns a whole header *dict*; the original
# code stuffed that dict into the 'User-Agent' field, sending a Python dict
# repr as the UA header. Extract the User-Agent string instead.
ua = random.choice(hdrs)['User-Agent']
head = {
    'Connection': 'close',
    'User-Agent': ua,
}


def _get_with_retry(url, delay):
    """GET *url* through the proxy, retrying forever on ConnectionError.

    *delay* is a zero-argument callable returning the number of seconds to
    sleep before the next attempt, so each call site keeps its own pacing
    (fixed 1 s for request 1, random 4-6 s for request 2, as before).
    Returns the successful ``requests.Response``.
    """
    while True:
        try:
            return requests.get(url, proxies=proxies, headers=head)
        except requests.exceptions.ConnectionError as err:
            print('Encountered ConnectionError, retrying: {}'.format(err))
            time.sleep(delay())


##### REQUEST 1 ####
a = _get_with_retry('https://store.fabspy.com/sitemap.xml', lambda: 1)

scrape = BeautifulSoup(a.text, 'lxml')
links = scrape.find_all('loc')
sitemap = None
for link in links:
    if 'products' in link.text:
        sitemap = str(link.text)
        break
# ROBUSTNESS: the original crashed with a NameError when no 'products'
# entry existed; fail with an explicit message instead.
if sitemap is None:
    raise RuntimeError('no products sitemap found in sitemap.xml')

keyword1 = 'not'
keyword2 = 'on'
keyword3 = 'site'

######### REQUEST 2 #########
r = _get_with_retry(sitemap, lambda: randint(4, 6))

soup = BeautifulSoup(r.text, 'lxml')
links = soup.find_all('loc')
for link in links:
    text = link.text
    # Report the first product URL containing all three keywords.
    if keyword1 in text and keyword2 in text and keyword3 in text:
        print(text, 'link scraped')
        break