我正尝试从主页上抓取一些帖子,我需要的内容几乎都在主页上。但还有一个我需要的日期字段位于每篇帖子所链接的页面上。我尝试用如下的回调来获取它:
from scrapy.spider import BaseSpider
from macnn_com.items import MacnnComItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import MapCompose, Join
from scrapy.http.request import Request
class MacnnSpider(BaseSpider):
    """Spider that collects post entries from the macnn.com front page.

    Every post node on the start page is fed through an item loader. For
    each post a follow-up request to the post's own URL is scheduled, with
    the loader attached in ``meta`` so that ``parse_link`` can add the
    heading text found on that page under the ``datums`` field.
    """

    name = 'macnn_com'
    allowed_domains = ['macnn.com']
    start_urls = ['http://www.macnn.com']

    # XPath selecting one node per post on the front page.
    posts_list_xpath = '//div[@class="post"]'

    # Item field name -> XPath evaluated relative to a single post node.
    item_fields = {
        'title': './/h1/a/text()',
        'link': './/h1/a/@href',
        'summary': './/p/text()',
        'image': './/div[@class="post_img"]/div[@class="post_img_border"]/a/img/@original',
    }

    def parse(self, response):
        """Yield a follow-up request and an item for each titled post.

        For every post node that has a title, the fields listed in
        ``item_fields`` are added to a fresh loader. A request for the
        post's link is yielded first (carrying the loader in ``meta``),
        followed by the item loaded from the front-page data alone.
        """
        page = HtmlXPathSelector(response)
        for post in page.select(self.posts_list_xpath):
            item_loader = XPathItemLoader(MacnnComItem(), selector=post)

            # Strip each extracted value on input; join the values on output.
            item_loader.default_input_processor = MapCompose(unicode.strip)
            item_loader.default_output_processor = Join()

            # Posts without a title are ignored entirely.
            if item_loader.get_xpath('.//h1/a/text()') != []:
                for field, xpath in self.item_fields.iteritems():
                    item_loader.add_xpath(field, xpath)

                post_url = item_loader.get_xpath('.//h1/a/@href')[0]
                yield Request(
                    post_url,
                    callback=self.parse_link,
                    meta={'loader': item_loader},
                )
                # Disabled: item_loader.add_value('datums', request)
                yield item_loader.load_item()

    def parse_link(self, response):
        """Add the linked page's heading text to the loader from ``meta``.

        NOTE: this returns the loader object itself, not an item produced
        by ``load_item()``.
        """
        item_loader = response.meta["loader"]
        heading_xpath = "//div[@class='post_header']/h2/text()"
        heading = HtmlXPathSelector(response).select(heading_xpath).extract()
        item_loader.add_value('datums', heading)
        return item_loader
但是我得到了如下错误:
ERROR: Spider must return Request, BaseItem or None, got 'XPathItemLoader' in <GET http://www.macnn.com/articles/13/06/14/sidebar.makes.it.easier.to.jump.between.columns/>
我在这里做错了什么?
答案 0 :(得分:1)
`parse_link` 需要返回一个 item(通过 `loader.load_item()` 得到),而不是 loader 本身:
def parse_link(self, response):
    """Add the linked page's heading text to the loader and return the item.

    The loader is taken from ``response.meta["loader"]``. The text nodes
    matched by the heading XPath are added under the ``datums`` field, and
    the item built by ``load_item()`` is returned rather than the loader.
    """
    item_loader = response.meta["loader"]
    heading_xpath = "//div[@class='post-header']/h2/text()"
    heading = HtmlXPathSelector(response).select(heading_xpath).extract()
    item_loader.add_value('datums', heading)
    return item_loader.load_item()