我是Scrapy的新手,并尝试使用Scrapy抓取几个链接作为测试。每当我运行scrapy crawl tier1
时,我得到“TypeError:object()不带参数”,如下所示:
Traceback (most recent call last):
File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/Users/btaek/TaeksProgramming/adv/crawler/adv_crawler/adv_crawler/spiders/tier1_crawler.py", line 93, in parse
mk_loader.add_xpath('title', 'h1[@class="top_title"]') # Title of the article
File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 167, in add_xpath
self.add_value(field_name, values, *processors, **kw)
File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 77, in add_value
self._add_value(field_name, value)
File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 91, in _add_value
processed_value = self._process_input_value(field_name, value)
File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 150, in _process_input_value
return proc(value)
File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/processors.py", line 28, in __call__
next_values += arg_to_iter(func(v))
TypeError: object() takes no parameters
2017-08-23 17:25:02 [tier1-parse-logger] INFO: Entered the parse function to parse and index: http://news.mk.co.kr/newsRead.php?sc=30000001&year=2017&no=535166
2017-08-23 17:25:02 [tier1-parse-logger] ERROR: Error (object() takes no parameters) when trying to parse <<date>> from a mk article: http://news.mk.co.kr/newsRead.php?sc=30000001&year=2017&no=535166
2017-08-23 17:25:02 [tier1-parse-logger] ERROR: Error (object() takes no parameters) when trying to parse <<author>> from a mk article: http://news.mk.co.kr/newsRead.php?sc=30000001&year=2017&no=535166
2017-08-23 17:25:02 [scrapy.core.scraper] ERROR: Spider error processing <GET http://news.mk.co.kr/newsRead.php?sc=30000001&year=2017&no=535166> (referer: None)
Traceback (most recent call last):
File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/Users/btaek/TaeksProgramming/adv/crawler/adv_crawler/adv_crawler/spiders/tier1_crawler.py", line 93, in parse
mk_loader.add_xpath('title', 'h1[@class="top_title"]') # Title of the article
File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 167, in add_xpath
self.add_value(field_name, values, *processors, **kw)
File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 77, in add_value
self._add_value(field_name, value)
File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 91, in _add_value
processed_value = self._process_input_value(field_name, value)
File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/__init__.py", line 150, in _process_input_value
return proc(value)
File "/Users/btaek/TaeksProgramming/adv/crawler/lib/python2.7/site-packages/scrapy/loader/processors.py", line 28, in __call__
next_values += arg_to_iter(func(v))
TypeError: object() takes no parameters
而且,我的蜘蛛文件(tier1_crawler.py):
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import os
sys.path.append(os.path.abspath('..'))
import logging
import scrapy
from scrapy.loader import ItemLoader
from adv_crawler.items import AdvCrawlerItem
from datetime import datetime, date, time
t1_parse_logger = logging.getLogger("tier1-parse-logger")
t1_parse_logger.LOG_FILE = "Tier1-log.txt"
content_type_dic = {
'news': 'news',
}
class Tier1Crawler(scrapy.Spider):
name = "tier1"
def start_requests(self):
urls = ['http://news.mk.co.kr/newsRead.php?sc=30000001&year=2017&no=535982',
'http://news.mk.co.kr/newsRead.php?sc=30000001&year=2017&no=535166',
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
t1_parse_logger.info("Entered the parse function to parse and index: %s" % response.url) # Log at the beginning of the parse function
item_loader = ItemLoader(item=AdvCrawlerItem(), response=response)
if 'mk.co.kr' in response.url:
mk_loader = item_loader.nested_xpath('//div[@id="top_header"]/div[@class="news_title"]/div[@class="news_title_text"]')
try:
mk_loader.add_xpath('date', 'div[@class="news_title_author"]/ul/li[@class="lasttime"]')
except AttributeError: # if the date is not in "lasttime" li tag
mk_loader.add_xpath('date', 'div[@class="news_title_author"]/ul/li[@class="lasttime1"]')
except Exception as e: # in case the error is not AttributeError
t1_parse_logger.error("Error "+"("+str(e)+")"+" when trying to parse <<date>> from a mk article: %s" % response.url)
try:
mk_loader.add_xpath('author', 'div[@class="news_title_author"]/ul/li[@class="author"]')
except AttributeError: # in case there is no author (some mk articles have no author)
item_loader.add_value('author', "None") # ir error, replace with the line below
# item['author'] = "None" # if the above gives any error, replace the above with this line
except Exception as e: # in case the error is not AttributeError
t1_parse_logger.error("Error "+"("+str(e)+")"+" when trying to parse <<author>> from a mk article: %s" % response.url)
item_loader.add_xpath('content', '//div[@id="Content"]/div[@class="left_content"]/div[@id="article_body"]/div[@class="art_txt"]') # Content of the article (entire contents)
mk_loader.add_xpath('title', 'h1[@class="top_title"]') # Title of the article
item_loader.add_value('content_type', content_type_dic['news'])
item_loader.add_value('timestamp', str(datetime.now())) # timestamp of when the document is being indexed
item_loader.add_value('url', response.url) # url of the article
t1_parse_logger.info("Parsed and indexed: %s" % response.url)
return item_loader.load_item()
而且,我的items.py文件:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from w3lib.html import remove_tags
def filter_date(value):
if isinstance(value, unicode):
(year, month, day) = str(value.split(" ")[-2]).split(".")
return year+"-"+month+"-"+day
def filter_utf(value):
if isinstance(value, unicode):
return value.encode('utf-8')
class AdvCrawlerItem(scrapy.Item):
author = scrapy.Field(input_processor=MapCompose(remove_tags, TakeFirst, filter_utf),) # Name of the publisher/author
content = scrapy.Field(input_processor=MapCompose(remove_tags, Join, filter_utf),) # Content of the article (entire contents)
content_type = scrapy.Field()
date = scrapy.Field(input_processor=MapCompose(remove_tags, TakeFirst, filter_date),)
timestamp = scrapy.Field() # timestamp of when the document is being indexed
title = scrapy.Field(input_processor=MapCompose(remove_tags, TakeFirst, filter_utf),) # title of the article
url = scrapy.Field() # url of the article
并且,pipelines.py文件:
import json
from scrapy import signals
from scrapy.exporters import JsonLinesItemExporter
class AdvCrawlerJsonExportPipeline(object):
def open_spider(self, spider):
self.file = open('crawled-articles1.txt', 'w')
def close_spider(self, spider):
self.file.close()
def process_item(self, item, spider):
line = json.dummps(dict(item)) + "\n"
self.file.write(line)
return item
我知道“TypeError:object()不接受任何参数”当类的__init__
方法根本没有被定义或者没有被定义为接受参数时,通常会抛出错误。
但是,在上面的情况下,我该如何修复错误?我使用项目加载器或嵌套项加载器做错了吗?
答案 0 :(得分:3)
使用scrapy处理器时,您应该使用这些类来创建进行处理的对象:
# wrong
field = Field(output_processor=MapCompose(TakeFirst))
# right
field = Field(output_processor=MapCompose(TakeFirst()))
^^