from string import join
from scrapy.contrib.spiders.crawl import CrawlSpider
from scrapy.selector import Selector
from scrapy.http.request import Request
from article.items import ArticleItem

class ArticleSpider(CrawlSpider):
    name = "article"
    allowed_domains = ["http://joongang.joins.com"]
    j_classifications = ['politics','money','society','culture']
    start_urls = ["http://news.joins.com/%s" % classification for
                  classification in j_classifications]

    def parse_item(self, response):
        sel = Selector(response)
        urls = sel.xpath('//div[@class="bd"]/ul/li/strong')
        items = []
        for url in urls:
            item = ArticleItem()
            item['url'] = url.xpath('a/@href').extract()
            items.append(item)
        request = Request(items['url'], callback=self.parse_item2)
        request.meta['item'] = items
        return request

    def parse_item2(self, response):
        item = response.meta['item']
        sel = Selector(response)
        articles = sel.xpath('//div[@id=article_body]')
        for article in articles:
            item['article'] = article.xpath('text()').extract()
            items.append(item)
        return item
This is article-scraping code; I am still getting used to Scrapy. The parse_item method is meant to send each article URL to parse_item2 via the Request function, but this code does not work. The Item class does implement url = Field() and article = Field(). How do I solve this problem? PS: the web tags are accurate; I tested them in the scrapy shell.
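For reference, the ArticleItem the question describes would be defined roughly like this in article/items.py (a minimal sketch; only the url and article fields are confirmed by the question, everything else is assumed):

# article/items.py -- minimal sketch; only the url and article fields
# are confirmed by the question, the rest is an assumption
from scrapy.item import Item, Field

class ArticleItem(Item):
    url = Field()      # article link collected in parse_item
    article = Field()  # article body text filled in parse_item2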
Answer 0 (score: 0)
There is a problem in your code:

request = Request(items['url'], callback=self.parse_item2)

items is a list of item objects, so indexing it with the string 'url' raises a TypeError. You can do it with a second for loop,
for itm in items:
    # itm['url'] holds the list returned by extract(); take the first href
    request = Request(itm['url'][0], callback=self.parse_item2)
    request.meta['item'] = itm  # pass the single item, not the whole list
    yield request
or yield the requests directly from the first for loop,
for url in urls:
    item = ArticleItem()
    item['url'] = url.xpath('a/@href').extract()
    # use item (the single object), not items, and take the first extracted href
    request = Request(item['url'][0], callback=self.parse_item2)
    request.meta['item'] = item
    yield request
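Either way, request.meta is Scrapy's standard mechanism for handing a partially filled item from one callback to the next; parse_item2 then reads it back with response.meta['item']. Putting both fixes together, the two callbacks might look like the sketch below (assumptions: the question's selectors are correct as stated, ArticleItem defines url and article, and parse_item is actually wired up to receive the start-page responses, e.g. via a CrawlSpider Rule, which the question does not show):

def parse_item(self, response):
    sel = Selector(response)
    urls = sel.xpath('//div[@class="bd"]/ul/li/strong')
    for url in urls:
        item = ArticleItem()
        item['url'] = url.xpath('a/@href').extract()
        # extract() returns a list; Request needs a single URL string
        request = Request(item['url'][0], callback=self.parse_item2)
        request.meta['item'] = item
        yield request

def parse_item2(self, response):
    item = response.meta['item']
    sel = Selector(response)
    # quote the id value inside the XPath, and drop the stray items.append(item)
    articles = sel.xpath('//div[@id="article_body"]')
    for article in articles:
        item['article'] = article.xpath('text()').extract()
    yield item

Note also that allowed_domains should contain bare domain names ("joongang.joins.com"), not full URLs with an http:// scheme, otherwise the offsite middleware will filter your requests.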