Scrapy won't terminate, but keeps printing log stats

Time: 2018-11-02 10:43:20

Tags: python-3.x scrapy scrapy-spider

I wrote a spider that crawls a site to a certain depth and downloads PDF/DOC files using Scrapy's built-in file downloader (the FilesPipeline). It works well on every site I have tried except for one URL (http://www.imerys.com).

scrapy_pdf.py

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.shell import inspect_response
# from scrapy_splash import SplashRequest
from scrapy.http import Request
# from urllib.parse import urlencode, parse_qs
# from O365 import Message
import subprocess
import datetime
import re
import pandas as pd
from ..items import PdfCrawlerItem

def check_link(url):
    # URL relevancy check; the actual filtering logic is omitted in the question,
    # so a placeholder result is returned here to keep the snippet runnable
    check = True
    return check

def extract_domain(url):
    url = url.replace("http://","")
    url = url.replace("https://","")
    url = url.replace("www.","")
    if url[-1] == '/':
        url = url[0:-1]
    return url.strip()

class MySpider(CrawlSpider):
    name = 'pdf_extractor'
    rules = (
        Rule(LinkExtractor(tags="a", deny_extensions=[]), callback='parse_document', follow=True),
    )

    def __init__(self, ip, **kwargs):
        domain = extract_domain(ip)
        self.domain = domain
        subprocess.call(["mkdir","/home/dev/scrapy-inbuild-downloader-example/pdf_crawler/documents/"+domain])
        self.start_time = datetime.datetime.now()
        self.start_urls =  [ip] # py36
        self.allowed_domains = [domain]
        super().__init__(**kwargs)  # python3

    def parse_document(self, response):
        # default to b'' so .decode() does not fail when the header is missing
        content_type = response.headers.get('Content-Type', b'').decode("utf-8")
        url = response.url
        if content_type == "application/pdf" or content_type == "application/msword":
            # print("checking url: %s"%url)
            check = check_link(url)
            if check:
                # print("pass url: %s"%url)
                name = response.headers.get('Content-Disposition',None)
                if name:
                    name = name.decode("utf-8")
                    name = re.findall(r"filename=(.*)", name)
                    if name:
                        name = name[0].replace("\"",'').replace('\'','')
                        if name.endswith('.pdf') or name.endswith('.doc') or name.endswith('.docx'):
                            pass
                        else:
                            name = name + '.pdf' if content_type == "application/pdf" else name + '.docx'
                    else:
                        name = url.split('/')[-1]
                else:
                    name = url.split('/')[-1]
                item = PdfCrawlerItem()
                item['file_urls'] = url
                item['name'] = self.domain+"/"+name
                print(item)
                return item
            # else:
                # print("checking url: %s"%url)

    def close(self, spider, reason):  # overridden to get a notification after the job finishes
        time = datetime.datetime.now() - self.start_time
        time = time.total_seconds() / 3600.
        print("total time:", time)

items.py

import scrapy

class PdfCrawlerItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    file_urls = scrapy.Field()
    name = scrapy.Field()

pipelines.py

from scrapy.pipelines.files import FilesPipeline
from scrapy import Request

class PdfCrawlerPipeline(FilesPipeline):       
    def file_path(self, request, response=None, info=None):
        return request.meta.get('filename','')

    def get_media_requests(self, item, info):
        file_url = item['file_urls']
        meta = {'filename': item['name']}
        yield Request(url=file_url, meta=meta)

    # def item_completed(self, results, item, info):
        # print(item['name'])
        # return item
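
For context, the custom pipeline also has to be enabled in settings.py, which was not posted in the question. A minimal sketch, assuming the project module is named pdf_crawler (inferred from the directory path used in the spider) and that FILES_STORE points at the documents folder the spider creates subdirectories in; both names are assumptions:

# settings.py (sketch, not part of the question; module name and path are assumptions)
ITEM_PIPELINES = {
    'pdf_crawler.pipelines.PdfCrawlerPipeline': 1,
}
# FilesPipeline stores files relative to FILES_STORE; file_path() above returns "<domain>/<name>"
FILES_STORE = '/home/dev/scrapy-inbuild-downloader-example/pdf_crawler/documents'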

Logs

2018-11-02 16:05:33 [scrapy.extensions.logstats] INFO: Crawled 1796 pages (at 343 pages/min), scraped 18 items (at 3 items/min)
2018-11-02 16:06:33 [scrapy.extensions.logstats] INFO: Crawled 1796 pages (at 0 pages/min), scraped 18 items (at 0 items/min)


{'downloader/exception_count': 5,
 'downloader/exception_type_count/twisted.web._newclient.ResponseFailed': 3,
 'downloader/exception_type_count/twisted.web._newclient.ResponseNeverReceived': 2,
 'downloader/request_bytes': 724471,
 'downloader/request_count': 1819,
 'downloader/request_method_count/GET': 1819,
 'downloader/response_bytes': 448477779,
 'downloader/response_count': 1814,
 'downloader/response_status_count/200': 1776,
 'downloader/response_status_count/301': 8,
 'downloader/response_status_count/302': 3,
 'downloader/response_status_count/404': 18,
 'downloader/response_status_count/500': 9,
 'dupefilter/filtered': 24148,
 'file_count': 18,
 'file_status_count/downloaded': 15,
 'file_status_count/uptodate': 3,
 'finish_reason': 'shutdown',
 'finish_time': datetime.datetime(2018, 11, 2, 10, 10, 56, 530946),
 'httperror/response_ignored_count': 19,
 'httperror/response_ignored_status_count/404': 16,
 'httperror/response_ignored_status_count/500': 3,
 'item_scraped_count': 18,
 'log_count/DEBUG': 240624,
 'log_count/ERROR': 1,
 'log_count/INFO': 31,
 'log_count/WARNING': 3,
 'memusage/max': 258433024,
 'memusage/startup': 84455424,
 'offsite/domains': 58,
 'offsite/filtered': 1536,
 'request_depth_max': 2,
 'response_received_count': 1797,
 'retry/count': 10,
 'retry/max_reached': 4,
 'retry/reason_count/500 Internal Server Error': 6,
 'retry/reason_count/twisted.web._newclient.ResponseFailed': 2,
 'retry/reason_count/twisted.web._newclient.ResponseNeverReceived': 2,
 'scheduler/dequeued': 1792,
 'scheduler/dequeued/memory': 1792,
 'scheduler/enqueued': 1794,
 'scheduler/enqueued/memory': 1794,
 'start_time': datetime.datetime(2018, 11, 2, 10, 7, 7, 304081)}

I can't figure out what is wrong with the code above, since it works fine on all the other sites I have tested.

1 Answer:

Answer 0 (score: 0)

I solved the problem by setting DOWNLOAD_TIMEOUT back to its default value of 180 seconds. I had raised it to 1800 seconds for testing purposes and forgot to revert the change, so each stalled request waited 1800 seconds (30 minutes) before timing out.
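
In other words, the relevant entry in settings.py just needs to go back to the default. A minimal sketch (only DOWNLOAD_TIMEOUT itself comes from the answer; the comments restate it):

# settings.py
# 180 seconds is Scrapy's default download timeout; the crawl only appeared to hang
# because the value had been raised to 1800 seconds (30 minutes) for testing.
DOWNLOAD_TIMEOUT = 180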