Question

我是Scrapy的新手，并尝试使用Scrapy抓取几个链接作为测试。每当我运行scrapy crawl tier1时，我得到“TypeError：object（）不带参数”，如下所示：

Traceback (most recent call last):
  File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/Users/btaek/TaeksProgramming/adv/crawler/adv_crawler/adv_crawler/spiders/tier1_crawler.py", line 93, in parse
    mk_loader.add_xpath('title', 'h1[@class="top_title"]')   # Title of the article
  File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 167, in add_xpath
    self.add_value(field_name, values, *processors, **kw)
  File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 77, in add_value
    self._add_value(field_name, value)
  File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 91, in _add_value
    processed_value = self._process_input_value(field_name, value)
  File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 150, in _process_input_value
    return proc(value)
  File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/processors.py", line 28, in __call__
    next_values += arg_to_iter(func(v))
TypeError: object() takes no parameters
2017-08-23 17:25:02 [tier1-parse-logger] INFO: Entered the parse function to parse and index: http://news.mk.co.kr/newsRead.php?sc=30000001&year=2017&no=535166
2017-08-23 17:25:02 [tier1-parse-logger] ERROR: Error (object() takes no parameters) when trying to parse <<date>> from a mk article: http://news.mk.co.kr/newsRead.php?sc=30000001&year=2017&no=535166
2017-08-23 17:25:02 [tier1-parse-logger] ERROR: Error (object() takes no parameters) when trying to parse <<author>> from a mk article: http://news.mk.co.kr/newsRead.php?sc=30000001&year=2017&no=535166
2017-08-23 17:25:02 [scrapy.core.scraper] ERROR: Spider error processing <GET http://news.mk.co.kr/newsRead.php?sc=30000001&year=2017&no=535166> (referer: None)
Traceback (most recent call last):
  File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/Users/btaek/TaeksProgramming/adv/crawler/adv_crawler/adv_crawler/spiders/tier1_crawler.py", line 93, in parse
    mk_loader.add_xpath('title', 'h1[@class="top_title"]')   # Title of the article
  File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 167, in add_xpath
    self.add_value(field_name, values, *processors, **kw)
  File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 77, in add_value
    self._add_value(field_name, value)
  File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 91, in _add_value
    processed_value = self._process_input_value(field_name, value)
  File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 150, in _process_input_value
    return proc(value)
  File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/processors.py", line 28, in __call__
    next_values += arg_to_iter(func(v))
TypeError: object() takes no parameters

而且，我的蜘蛛文件（tier1_crawler.py）：

# -*- coding: utf-8 -*-

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import os
sys.path.append(os.path.abspath('..'))

import logging
import scrapy
from scrapy.loader import ItemLoader
from adv_crawler.items import AdvCrawlerItem
from datetime import datetime, date, time

t1_parse_logger = logging.getLogger("tier1-parse-logger")
t1_parse_logger.LOG_FILE = "Tier1-log.txt"

content_type_dic = {
                    'news': 'news',
                    }


class Tier1Crawler(scrapy.Spider):
    name = "tier1"

    def start_requests(self):
        urls = ['http://news.mk.co.kr/newsRead.php?sc=30000001&year=2017&no=535982',
                'http://news.mk.co.kr/newsRead.php?sc=30000001&year=2017&no=535166',
                ]

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)


    def parse(self, response):
        t1_parse_logger.info("Entered the parse function to parse and index: %s" % response.url)   # Log at the beginning of the parse function
        item_loader = ItemLoader(item=AdvCrawlerItem(), response=response)
        if 'mk.co.kr' in response.url:
            mk_loader = item_loader.nested_xpath('//div[@id="top_header"]/div[@class="news_title"]/div[@class="news_title_text"]')
            try:
                mk_loader.add_xpath('date', 'div[@class="news_title_author"]/ul/li[@class="lasttime"]')
            except AttributeError:   # if the date is not in "lasttime" li tag
                mk_loader.add_xpath('date', 'div[@class="news_title_author"]/ul/li[@class="lasttime1"]')
            except Exception as e:   # in case the error is not AttributeError
                t1_parse_logger.error("Error "+"("+str(e)+")"+" when trying to parse <<date>> from a mk article: %s" % response.url)

            try:
                mk_loader.add_xpath('author', 'div[@class="news_title_author"]/ul/li[@class="author"]')
            except AttributeError:   # in case there is no author (some mk articles have no author)
                item_loader.add_value('author', "None")   # ir error, replace with the line below
                # item['author'] = "None"   # if the above gives any error, replace the above with this line
            except Exception as e:   # in case the error is not AttributeError
                t1_parse_logger.error("Error "+"("+str(e)+")"+" when trying to parse <<author>> from a mk article: %s" % response.url)

            item_loader.add_xpath('content', '//div[@id="Content"]/div[@class="left_content"]/div[@id="article_body"]/div[@class="art_txt"]')   # Content of the article (entire contents)
            mk_loader.add_xpath('title', 'h1[@class="top_title"]')   # Title of the article

        item_loader.add_value('content_type', content_type_dic['news'])
        item_loader.add_value('timestamp', str(datetime.now()))   # timestamp of when the document is being indexed
        item_loader.add_value('url', response.url)   # url of the article

        t1_parse_logger.info("Parsed and indexed: %s" % response.url)

        return item_loader.load_item()

而且，我的items.py文件：

# -*- coding: utf-8 -*-

import scrapy
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from w3lib.html import remove_tags

def filter_date(value):
    if isinstance(value, unicode):
        (year, month, day) = str(value.split(" ")[-2]).split(".")
        return year+"-"+month+"-"+day

def filter_utf(value):
    if isinstance(value, unicode):
        return value.encode('utf-8')

class AdvCrawlerItem(scrapy.Item):
    author = scrapy.Field(input_processor=MapCompose(remove_tags, TakeFirst, filter_utf),) # Name of the publisher/author
    content = scrapy.Field(input_processor=MapCompose(remove_tags, Join, filter_utf),) # Content of the article (entire contents)
    content_type = scrapy.Field()
    date = scrapy.Field(input_processor=MapCompose(remove_tags, TakeFirst, filter_date),)
    timestamp = scrapy.Field()  # timestamp of when the document is being indexed
    title = scrapy.Field(input_processor=MapCompose(remove_tags, TakeFirst, filter_utf),)   # title of the article
    url = scrapy.Field()   # url of the article

并且，pipelines.py文件：

import json
from scrapy import signals
from scrapy.exporters import JsonLinesItemExporter

class AdvCrawlerJsonExportPipeline(object):
    def open_spider(self, spider):
        self.file = open('crawled-articles1.txt', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dummps(dict(item)) + "\n"
        self.file.write(line)
        return item

我知道“TypeError：object（）不接受任何参数”当类的__init__方法根本没有被定义或者没有被定义为接受参数时，通常会抛出错误。

但是，在上面的情况下，我该如何修复错误？我使用项目加载器或嵌套项加载器做错了吗？

Answer 1

使用scrapy处理器时，您应该使用这些类来创建进行处理的对象：

# wrong
field = Field(output_processor=MapCompose(TakeFirst))
# right
field = Field(output_processor=MapCompose(TakeFirst()))
                                                   ^^

scrapy TypeError：object（）不带参数

1 个答案: