Using Python Scrapy to crawl a website and capture different response status codes

Time: 2017-12-20 14:19:52

Tags: python scrapy response

Can someone help me understand how to capture the response status code of every request my Scrapy spider crawls? I am able to get output for status code 200, but when the site returns a 404 nothing is written at all, and the same goes for 301 and 302.

Below is the code I implemented for another website, with a domain containing my name substituted in for reference.

import scrapy
import requests
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.spiders import Rule, CrawlSpider



class TestSpider(CrawlSpider):
    name = 'TestSpider' 
    handle_httpstatus_list = [404]
    resp_log_file = 'C:\\resp'
    ok_log_file = 'C:\\alright'
    bad_log_file = 'C:\\badresp'
    redirect_log_file = 'C:\\redirect'

    allowed_domains = ['santhosh.com']
    start_urls = ['http://santhosh.com/']

    # This spider has one rule: extract all (unique and canonicalized) links, follow them and parse them using the parse_item method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_item"
        )
    ]

    def parse_item(self, response):
        # The list of items that are found on the particular page
        items = []
        res = Selector(response)
        self.append(self.resp_log_file, str(response))
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
        try:
            if response.status == 404:
                ## 404s are also tracked separately
                self.append(self.bad_log_file, response.url)
            elif response.status == 200:
                ## write to ok_log_file
                self.append(self.ok_log_file, response.url)
            elif response.status == 302:
                ## write to redirect_log_file
                self.append(self.redirect_log_file, response.url)
            else:
                self.append(self.bad_log_file, response.url)
        except Exception as e:
            pass

        return None


    def append(self, file, string):
        print " Writing content to File "
        file = open(file, 'a')
        file.write(string+"\n")
        file.close()

I have seen questions related to capturing response codes, but none of them are quite the same as my case, so I created this new post. If there is an existing question that covers this, please ignore this one and redirect me there. Thanks in advance!
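
For reference, here is a stripped-down, per-request sketch of the direction I have been experimenting with (my own assumption, not tested against the real site; the spider name is a placeholder). It passes the status handling through Request.meta instead of the class attribute and disables redirect following so 301/302 stay visible:

import scrapy

class MetaStatusSpider(scrapy.Spider):
    # Placeholder name; only the meta keys below matter for this sketch.
    name = 'MetaStatusSpider'

    def start_requests(self):
        yield scrapy.Request(
            'http://santhosh.com/',
            meta={
                'handle_httpstatus_list': [404, 301, 302],  # let these statuses reach the callback
                'dont_redirect': True,                       # keep the 301/302 response instead of following it
            },
            callback=self.parse_item,
        )

    def parse_item(self, response):
        self.logger.info('%s %s', response.status, response.url)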

1 answer:

Answer 0: (score: 0)

I tried the code and I can see that it sends the 404 and 301 responses to parse(), not to parse_item(), but I did not have a page containing broken links, so the LinkExtractor never kicked in.

I used httpbin.org to generate pages with different status codes.

Perhaps if I had a page with broken URLs, so that the LinkExtractor could run, I would get different results.
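
As a quick sanity check (a small sketch with the requests library, separate from the spider below), httpbin's /status/<code> endpoint simply replies with the requested code:

import requests

# httpbin.org/status/<code> answers with exactly that status code.
for code in (200, 301, 404):
    r = requests.get('http://httpbin.org/status/{}'.format(code),
                     allow_redirects=False)  # keep the 301 visible instead of following it
    print(code, '->', r.status_code)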

#!/usr/bin/env python3

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.spiders import Rule, CrawlSpider
#from scrapy.commands.view import open_in_browser

class MySpider(CrawlSpider):

    name = 'MySpider' 

    handle_httpstatus_list = [404, 301, 302, 303]

    all_responses_log = './responses_all.log'
    ok_responses_log  = './responses_ok.log'
    bad_responses_log = './responses_bad.log'
    redirects_responses_log = './responses_redirect.log'

    start_urls = [
        'http://httpbin.org/status/301',
        'http://httpbin.org/status/302',
        'http://httpbin.org/status/303',

        'http://httpbin.org/status/404',
        'http://httpbin.org/status/200',
    ]

    # This spider has one rule: extract all (unique and canonicalized) links, follow them and parse them using the parse_item method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_item"
        )
    ]

    def parse(self, response):
        print('parse url:', response.url)

        self.test_status('parse()', response)

    def parse_item(self, response):
        print('parse item url:', response.url)

        self.test_status('parse_item()', response)

        # The list of items that are found on the particular page
        items = []
        res = Selector(response)
        self.append(self.all_responses_log, str(response))
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)

    def test_status(self, text, response):
        try:
            if response.status == 404:
                log = self.bad_responses_log
            elif response.status == 200:
                log = self.ok_responses_log
            #elif 299 < response.status < 400:
            elif response.status in (301, 302, 303, 307):
                log = self.redirects_responses_log
            else:
                log = self.bad_responses_log

            message = "{} | {} | {}\n".format(response.status, text, response.url)
            self.append(log, message)
        except Exception as e:
            print('Error:', e)

    def append(self, filename, string):
        print('Writing log:', filename)
        with open(filename, 'a') as f:
            f.write(string)


# --- it runs standalone, without a Scrapy project ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
})
c.crawl(MySpider)
c.start()
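
If the goal is only to log the status of every downloaded response, another possible approach (a sketch based on my own assumptions, not part of the answer above; the spider name and URLs are placeholders) is to hook Scrapy's response_received signal instead of relying on callback routing, with redirects left unfollowed so 301/302 stay visible:

import scrapy
from scrapy import signals

class StatusLogSpider(scrapy.Spider):
    # Placeholder name and URLs, for illustration only.
    name = 'StatusLogSpider'
    start_urls = [
        'http://httpbin.org/status/200',
        'http://httpbin.org/status/404',
    ]

    custom_settings = {
        'HTTPERROR_ALLOW_ALL': True,   # pass non-2xx responses to the spider instead of dropping them
        'REDIRECT_ENABLED': False,     # keep 301/302 responses instead of following them
    }

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # response_received fires for every response the downloader hands back.
        crawler.signals.connect(spider.log_status, signal=signals.response_received)
        return spider

    def log_status(self, response, request, spider):
        spider.logger.info('%s | %s', response.status, response.url)

    def parse(self, response):
        pass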