When the spider runs, the data is extracted from the page just fine, but when the pipeline kicks in something goes wrong... I get the following error:
Traceback (most recent call last):
  File "C:\Users\EAgnelli\AppData\Local\Continuum\anaconda3\envs\tensorflow\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
TypeError: close_spider() missing 1 required positional argument: 'reason'
I'm sending the requests through Scrapy Splash to execute some JavaScript on the page and then extract the link information... but this is the first time I've run into this error.
Here is my spider:
import scrapy
import scrapy_splash
from scrapy.linkextractors import LinkExtractor
from cointelegraph_spider.items import CointelegraphSpiderItem
import sqlite3 as sq3


class CointelegraphspiderSpider(scrapy.Spider):
    name = 'cointelegraphspider'
    allowed_domains = ['cointelegraph.com']
    start_urls = ['http://cointelegraph.com/']

    def start_requests(self):
        """
        Doc string
        """
        # Execute the Lua script for the "Load More" button
        script = """
        function main(splash, args)
            assert(splash:go(args.url))
            splash:wait(0.5)
            local num_clicks = 2
            local delay = 1.5
            local load_more = splash:jsfunc(
                [[
                function ()
                {
                    var el = document.getElementsByClassName('post-preview-list-navigation__btn post-preview-list-navigation__btn_load-more');
                    el[0].click();
                }
                ]]
            )
            for _ = 1, num_clicks do
                load_more()
                splash:wait(delay)
            end
            return
            {
                html = splash:html(),
            }
        end
        """
        for url in self.start_urls:
            yield scrapy_splash.SplashRequest(
                url=url,
                callback=self.parse_main_page,
                args={
                    'wait': 3,
                    'lua_source': script,
                    #'timeout': 3600  # Here the max-timeout is 60 -- to increase it launch the docker with --max-timeout xxxxx
                },
                endpoint="execute",
            )

    def parse_main_page(self, response):
        """
        Doc string
        """
        # Convert Splash response into html response object
        html = scrapy.Selector(response)
        # Check DB for existing records
        conn = sq3.connect("D:\\DCC\\Projects\\crypto_projects\\master_data.db")
        db_links = conn.execute("select link from cointelegraph").fetchall()  # list of tuples
        db_links = [elem[0] for elem in db_links]  # flattening list
        print("DB LINKS! ", db_links)
        #db_links = ["aaa",]
        conn.close()  # close connection
        # Extract all links to be followed
        news_links = LinkExtractor(
            restrict_xpaths=['//ul[@class="post-preview-list-cards"]/li/div/article/a',  # Main Body
                             '//div[@class="main-news-tabs__wrp"]/ul/li/div/a']  # "Editor's Choice" & "Hot Stories"
        ).extract_links(html.response)
        for link in news_links[:2]:
            # Follow only new links
            if link.url not in db_links:
                yield scrapy.Request(link.url, callback=self.parse_article)

    def parse_article(self, response):
        """
        Doc string
        """
        # Create Item for Pipeline
        item = CointelegraphSpiderItem()
        item['author'] = response.xpath('//div[@class="name"]/a/text()').extract_first().strip()
        item['timestamp'] = response.xpath('//div/@datetime').extract_first().split('t')[0]  # %Y-%m-%d
        item['title'] = response.xpath('//h1[@class="header"]/text()').extract_first().strip()
        item['body'] = ' '.join(response.xpath('//div[@class="post-full-text contents js-post-full-text"]/p//text()').extract())
        item['quotes'] = ';;;'.join(response.xpath('//div[@class="post-full-text contents js-post-full-text"]/blockquote//text()').extract())
        item['int_links'] = ';;;'.join(response.xpath('//div[@class="post-full-text contents js-post-full-text"]/p/a/@href').extract())
        _tmp = [elem.replace('#', '') for elem in response.xpath('//div[@class="tags"]/ul/li/a/text()').extract()]
        item['tags'] = ';;;'.join([elem.replace(' ', '') for elem in _tmp])
        item['link'] = response.url
        item['news_id'] = str(hash(item['link']))
        yield item
Here is my pipeline:
import sqlite3 as sq3
import sqlite3_functions as sq_f
import logging
from scrapy.exceptions import DropItem


class CointelegraphSpiderPipeline(object):
    """
    Doc string
    """
    def __init__(self, stats):
        """
        Doc string
        """
        self.stats = stats
        self.db_file = 'D:\\DCC\\Projects\\crypto_projects\\master_data.db'
        self.conn = sq3.connect(self.db_file)
        self.table_name = 'cointelegraph'
        self.commit_counter = 0

    @classmethod
    def from_crawler(cls, crawler):
        """
        Doc string
        """
        stats = crawler.stats
        return stats  #cls(crawler.stats)

    def open_spider(self, spider):
        """
        Doc string
        """
        print("I'm starting the pipeline")
        logging.INFO("Starting Pipeline...")

    def process_item(self, item, spider):
        """
        Doc string
        """
        item_checked = True
        try:
            # Sanity Check
            for key, value in item.items():
                print("Inside the loop!!!")
                if value == '':
                    item_checked = False
                    raise DropItem("Item '{0}:{1}' has empty data - Link: {3}".format(key, value, item['link']))
                else:
                    logging.INFO("Item check OK")
                    item_checked = True
            # Insert row and increase counter
            if item_checked:
                self.conn = sq_f.insert_row(self.db_file, table_name=self.table_name, conn=self.conn, **item)
                self.commit_counter += 1
                self.conn.commit()
                # Commit every 500 inserted rows
                if self.commit_counter % 500 == 0:
                    self.conn.commit()
                print(item)
        except Exception as e:
            logging.WARNING(e)

    def close_spider(self, spider):
        """
        Doc string
        """
        logging.INFO("Commiting rows...")
        self.conn.commit()
        logging.INFO("Saving spider stats...")
        print(self.stats.get_stats())
        logging.INFO("Closing pipeline..")
        self.conn.close()
And my settings:
BOT_NAME = 'cointelegraph_spider'
SPIDER_MODULES = ['cointelegraph_spider.spiders']
NEWSPIDER_MODULE = 'cointelegraph_spider.spiders'
# Splash Settings
SPLASH_URL = 'http://localhost:8050'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3699.0 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
    #'cointelegraph_spider.middlewares.CointelegraphSpiderSpiderMiddleware': 543,
}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    #'cointelegraph_spider.middlewares.CointelegraphSpiderDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'cointelegraph_spider.pipelines.CointelegraphSpiderPipeline': 300,
}
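(A side note on the commented-out 'timeout' argument in the spider above: the 60-second cap mentioned there is Splash's server-side max-timeout, which is set when the container is launched, for example docker run -it -p 8050:8050 scrapinghub/splash --max-timeout 3600, where 3600 is only an illustrative value.)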
Answer (score: 0):
While a Scrapy pipeline is expected to have a close_spider(self, spider) method, the actual signal callback must be close_spider(self, spider, reason). Something in your code is turning the pipeline's close_spider method into a direct signal callback. You can fix this by adjusting the method signature to include reason:
def close_spider(self, spider, reason):
    pass
See the signals documentation on spider_closed and Scrapy's Pipeline.close_spider.
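For reference, a minimal sketch (assumed code, not the asker's actual project) of a pipeline whose close_spider works both as the regular pipeline hook and as a spider_closed signal callback: the signal passes the extra reason argument, so giving it a default keeps both call styles valid.

import logging

from scrapy import signals


class ExamplePipeline(object):
    """Sketch only: illustrates the signature fix suggested above."""

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        # Conventional pattern: return a pipeline *instance*
        # (cf. the commented-out cls(crawler.stats) in the question).
        pipeline = cls(crawler.stats)
        # Connecting the method to spider_closed is one way it becomes a
        # direct signal callback, which then receives (spider, reason).
        crawler.signals.connect(pipeline.close_spider, signal=signals.spider_closed)
        return pipeline

    def process_item(self, item, spider):
        return item

    def close_spider(self, spider, reason='finished'):
        # Called with just `spider` by the pipeline machinery, and with
        # `spider` and `reason` when invoked as a signal callback.
        logging.info("Spider closed (reason: %s)", reason)

Note that if the same method is registered both as a pipeline hook and as a signal callback it will run twice on shutdown, so in practice you would keep only one of the two registrations.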