我试图使用Scrapy从网站上抓取信息。一般结构如下:
<item>
<title>........</title>
<link>.........</link>
<category>......</category>
<category>.......</category>
<pubdate>.........</pubdate>
</item>
网站的XML中有26个这样的项目。我想抓取每个项目的链接、标题、类别和发布日期,并存储到CSV文件中。我的蜘蛛类如下:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from testscraper.items import testscraperItem
class MySpider(BaseSpider):
    """Spider that builds one ``testscraperItem`` per ``<item>`` node.

    For every node matched by ``//item`` the title, link, publication date
    and categories are extracted and stored on a fresh ``testscraperItem``.
    """

    name = "Test_scraper"
    # allowed_domains is meant to hold bare domain names, not full URLs.
    allowed_domains = ["nytimes.com"]
    start_urls = ["http://nytimes.com/feed/"]

    def parse(self, response):
        """Extract the fields of every ``<item>`` node in ``response``.

        Args:
            response: the downloaded feed page.

        Returns:
            A list of ``testscraperItem`` objects. Each field holds the list
            returned by ``extract()`` for the matching XPath expression.
        """
        data = []
        hxs = HtmlXPathSelector(response)
        items = hxs.select('//item')
        for item in items:
            struct = testscraperItem()
            title = item.select('./title/text()').extract()
            link = item.select('./link/@href').extract()
            # NOTE(review): this lookup is the one reported to come back
            # empty. Presumably the HTML selector normalizes tag names to
            # lower case, so the mixed-case 'pubDate' never matches; confirm
            # against the selector's parser before changing the expression.
            pubdate = item.select('./pubDate/text()').extract()
            topics = item.select('./category/text()').extract()
            struct["title"] = title
            struct["link"] = link
            struct["pubdate"] = pubdate
            struct["topics"] = topics
            data.append(struct)
        return data
一切正常,只是我无法抓取发布日期标记(得到的是空值)。此标记的示例值为:
<pubDate>Thu, 19 Feb 2015 19:29:08 GMT</pubDate>
我使用 response.xpath 尝试了以下代码,我能够提取pubdate标记:
def parse(self, response):
    """Collect the text of every ``<pubDate>`` found beneath an ``<item>``.

    Args:
        response: the downloaded feed page.

    Returns:
        A list with one ``testscraperItem`` per matched ``<pubDate>`` text
        node, whose ``pubdate`` field holds the extracted text.
    """
    items = []
    for pubdate in response.xpath('//item//pubDate/text()'):
        # A fresh item is created for every match and appended to the
        # result; otherwise `item` is undefined and the list stays empty.
        item = testscraperItem()
        item["pubdate"] = pubdate.extract()
        items.append(item)
    return items
为什么我在逐个遍历项目时无法提取pubDate标记的内容,而在对整个网页整体提取时却可以?我真的被难住了,非常希望能得到这方面的帮助。谢谢!出于其他目的,我必须遍历每个项目,因此代码片段2不可行——我必须沿用我编写的第一个代码片段的结构。
答案 0 :(得分:1)
它看起来很像一个 XML Feed。如果是这种情况,则需要使用 XMLFeedSpider:
from scrapy import Item, Field
from scrapy.contrib.spiders import XMLFeedSpider
from testscraper.items import testscraperItem
class MySpider(XMLFeedSpider):
    """Feed spider that turns every ``<item>`` node into a ``testscraperItem``."""

    name = "Test_scraper"
    itertag = 'item'
    allowed_domains = ["dealbook.nytimes.com"]
    start_urls = ["http://dealbook.nytimes.com/feed/"]

    def parse_nodes(self, response, nodes):
        """Parse each node and tag every resulting item with its position.

        Args:
            response: the downloaded feed page.
            nodes: iterable of selectors, one per ``itertag`` node.

        Yields:
            Each item produced by ``process_results``, with its ``index``
            key set to the 1-based position of the node it came from.
        """
        # Imported locally: the helper is called below but is not among the
        # imports visible next to this class.
        from scrapy.utils.spider import iterate_spider_output

        for index, selector in enumerate(nodes, start=1):
            ret = iterate_spider_output(self.parse_node(response, selector))
            for result_item in self.process_results(response, ret):
                # NOTE(review): assumes the item class declares an `index`
                # field; verify against testscraperItem.
                result_item['index'] = index
                yield result_item

    def parse_node(self, response, selector):
        """Build a ``testscraperItem`` from a single ``<item>`` node.

        Args:
            response: the downloaded feed page (not used here).
            selector: selector positioned on one ``<item>`` node.

        Yields:
            One ``testscraperItem`` whose fields hold the lists returned by
            ``extract()`` for the title, link, pubDate and category lookups.
        """
        struct = testscraperItem()
        struct["title"] = selector.xpath('./title/text()').extract()
        # NOTE(review): the sample output shows this list coming back empty;
        # the feed's <link> appears to carry its value as element text, so
        # './link/text()' may be what is wanted. Confirm against the feed.
        struct["link"] = selector.xpath('./link/@href').extract()
        struct["pubdate"] = selector.xpath('./pubDate/text()').extract()
        struct["topics"] = selector.xpath('./category/text()').extract()
        yield struct
输出:
{'link': [],
'pubdate': [u'Fri, 20 Feb 2015 18:02:28 GMT'],
'title': [u'Currency\u2019s Weakness Troubles China\u2019s Policy Makers'],
'topics': [u'China',
u'Renminbi (Currency)',
u'Economic Conditions and Trends',
u"People's Bank of China",
u'Xi Jinping']}
{'link': [],
'pubdate': [u'Thu, 19 Feb 2015 15:58:15 GMT'],
'title': [u'New Rules Spur a Humbling Overhaul of Wall St. Banks'],
'topics': [u'Banking and Financial Institutions',
u'Dodd-Frank Wall Street Reform and Consumer Protection Act (2010)',
u'Executive Compensation',
u'Regulation and Deregulation of Industry',
u'Goldman Sachs Group Inc',
u'JPMorgan Chase & Company',
u'Federal Reserve System',
u'Federal Deposit Insurance Corp']}
...