Can someone help me understand how to capture the response status code of every crawled request in my Scrapy spider? I am able to get output for responses with code 200, but if the site returns a 404 nothing gets written, and the same happens for 301 and 302.
Below is the code I use for other websites; I have substituted a domain containing my name for reference.
import scrapy
import requests
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.spiders import Rule, CrawlSpider


class TestSpider(CrawlSpider):
    name = 'TestSpider'
    handle_httpstatus_list = [404]
    resp_log_file = 'C:\\resp'
    ok_log_file = 'C:\\alright'
    bad_log_file = 'C:\\badresp'
    redirect_log_file = 'C:\\redirect'
    allowed_domains = ['santhosh.com']
    start_urls = ['santhosh.com/']

    # This spider has one rule: extract all (unique and canonicalized) links,
    # follow them and parse them using the parse_item method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_item"
        )
    ]

    def parse_item(self, response):
        # The list of items that are found on the particular page
        items = []
        res = Selector(response)
        self.append(self.resp_log_file, str(response))
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
        try:
            if response.status == 404:
                ## 404s are also tracked separately
                self.append(self.bad_log_file, response.url)
            elif response.status == 200:
                ## write to ok_log_file
                self.append(self.ok_log_file, response.url)
            elif response.status == 302:
                ## write to redirect_log_file
                self.append(self.redirect_log_file, response.url)
            else:
                self.append(self.bad_log_file, response.url)
        except Exception, e:
            pass
        return None

    def append(self, file, string):
        print " Writing content to File "
        file = open(file, 'a')
        file.write(string+"\n")
        file.close()
I have seen questions related to capturing response codes, but they are not quite the same as mine, so I created this new post. If there is an existing question that already covers this, please feel free to ignore this one and redirect me there. Thanks in advance!
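For context, a minimal sketch of why 404/301/302 never reach the callback above, assuming Scrapy's default downloader middlewares: HttpErrorMiddleware drops non-2xx responses whose status is not in handle_httpstatus_list, and RedirectMiddleware follows 3xx responses before any callback runs, but it also leaves alone any status listed there. So one way to let the callback see those codes is to extend the list (the spider name here is hypothetical, and note that start_urls need a scheme; with a CrawlSpider the start URLs themselves are handled by the built-in parse(), as the answer below also observes):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider


class StatusAwareSpider(CrawlSpider):
    # Hypothetical spider for illustration only.
    name = 'StatusAwareSpider'
    allowed_domains = ['santhosh.com']
    start_urls = ['http://santhosh.com/']  # a bare 'santhosh.com/' would raise "Missing scheme"

    # Statuses listed here are passed to the callback instead of being
    # filtered out by HttpErrorMiddleware; RedirectMiddleware also skips
    # 301/302/303 responses that appear in this list, so they get logged
    # but are no longer followed.
    handle_httpstatus_list = [404, 301, 302, 303]

    rules = [
        Rule(LinkExtractor(canonicalize=True, unique=True),
             follow=True, callback='parse_item')
    ]

    def parse_item(self, response):
        # response.status is now visible for 200, 404 and the 3xx codes above.
        self.logger.info('%s %s', response.status, response.url)

The trade-off is that the listed redirect codes are reported but no longer followed, so the pages behind them are not crawled.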
Answer 0 (score: 0)
I tried the code and I can see that it sends 404 and 301 responses to parse(), not to parse_item(), but I didn't have a page containing broken links, so the LinkExtractor never kicked in. I used httpbin.org to generate pages with different status codes. Perhaps if I had a page with broken URLs the LinkExtractor would run and I might get different results.
#!/usr/bin/env python3

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.spiders import Rule, CrawlSpider
#from scrapy.commands.view import open_in_browser


class MySpider(CrawlSpider):
    name = 'MySpider'
    handle_httpstatus_list = [404, 301, 302, 303]
    all_responses_log = './responses_all.log'
    ok_responses_log = './responses_ok.log'
    bad_responses_log = './responses_bad.log'
    redirects_responses_log = './responses_redirect.log'
    start_urls = [
        'http://httpbin.org/status/301',
        'http://httpbin.org/status/302',
        'http://httpbin.org/status/303',
        'http://httpbin.org/status/404',
        'http://httpbin.org/status/200',
    ]

    # This spider has one rule: extract all (unique and canonicalized) links,
    # follow them and parse them using the parse_item method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_item"
        )
    ]

    def parse(self, response):
        print('parse url:', response.url)
        self.test_status('parse()', response)

    def parse_item(self, response):
        print('parse item url:', response.url)
        self.test_status('parse_item()', response)
        # The list of items that are found on the particular page
        items = []
        res = Selector(response)
        self.append(self.all_responses_log, str(response))
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)

    def test_status(self, text, response):
        try:
            if response.status == 404:
                log = self.bad_responses_log
            elif response.status == 200:
                log = self.ok_responses_log
            #elif 299 < response.status < 400:
            elif response.status in (301, 302, 303, 307):
                log = self.redirects_responses_log
            else:
                log = self.redirects_responses_log
            message = "{} | {} | {}\n".format(response.status, text, response.url)
            self.append(log, message)
        except Exception as e:
            print('Error:', e)

    def append(self, filename, string):
        print('Writing log:', filename)
        with open(filename, 'a') as f:
            f.write(string)


# --- it runs without a project; statuses are appended to the log files above ---
from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
})
c.crawl(MySpider)
c.start()
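If you want to record every raw status (including the 301/302 hops that RedirectMiddleware normally consumes) while still letting the crawl follow redirects, one option is a small downloader middleware registered closer to the downloader than RedirectMiddleware (default priority 600), so its process_response runs first and still sees the redirect response. This is a minimal sketch, not part of the answer above; the class names, logger name and priority number are assumptions:

#!/usr/bin/env python3
# Standalone sketch: log every raw response status via a downloader middleware
# while RedirectMiddleware still follows the redirects afterwards.
import logging

import scrapy
from scrapy.crawler import CrawlerProcess

logger = logging.getLogger('statuslog')


class StatusLogMiddleware:
    """Records the status of every response before redirects are resolved."""

    def process_response(self, request, response, spider):
        # With priority 901 (> RedirectMiddleware's 600) this middleware sits
        # closer to the downloader, so its process_response runs first and
        # still sees the raw 301/302 before the redirect is followed.
        logger.info('%s %s', response.status, response.url)
        return response


class RedirectFollowingSpider(scrapy.Spider):
    # Hypothetical spider, reusing httpbin.org like the answer above.
    name = 'RedirectFollowingSpider'
    start_urls = ['http://httpbin.org/redirect/2']

    def parse(self, response):
        # Only the final 200 response arrives here; the intermediate hops were
        # logged by StatusLogMiddleware and are also available in
        # response.meta.get('redirect_urls').
        self.logger.info('final: %s %s', response.status, response.url)


if __name__ == '__main__':
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'DOWNLOADER_MIDDLEWARES': {
            # '__main__.<class name>' resolves in a single-file script;
            # in a project you would use the module path instead.
            '__main__.StatusLogMiddleware': 901,
        },
    })
    process.crawl(RedirectFollowingSpider)
    process.start()

Compared with handle_httpstatus_list, the trade-off is reversed: redirects are still followed, so the spider callbacks only ever see the final response, while the middleware log captures each intermediate status.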