我有一个包含域名列表的文件。我需要抓取域名(即整个网站)来获取rss链接。递归抓取网站的每个页面以从每个页面获取rss链接并写入对应于该域的json文件这是我的代码仅适用于一个网站:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
class AppleSpider(CrawlSpider):
    """Recursively crawl apple.com and collect RSS <link> hrefs from each page.

    Yields one AppleItem per crawled page, carrying the page URL and any
    RSS/Atom feed links found in the page's <head>.
    """
    name = 'apple'
    allowed_domains = ['apple.com']
    start_urls = ['http://apple.com']
    # FIX: the rules were commented out. Without rules a CrawlSpider follows
    # nothing and never calls parse_item -- which is why `scrapy crawl apple
    # -o items.json` produced only an opening "[" and no items.
    rules = [Rule(SgmlLinkExtractor(allow=()), follow=True, callback='parse_item')]

    def parse_item(self, response):
        """Build an AppleItem with the response URL and any RSS link hrefs.

        :param response: the downloaded page response
        :returns: AppleItem with 'reference_link' (page URL) and
                  'rss_link' (list of extracted hrefs, possibly empty)
        """
        sel = HtmlXPathSelector(response)
        # FIX: the attribute test value must be quoted inside the XPath
        # expression; the unquoted form is not valid XPath and matched nothing.
        sites = sel.select('/html/head/link[@type="application/rss+xml"]/@href').extract()
        # NOTE(review): AppleItem is never imported in this snippet -- it must
        # come from the project's items module (e.g. `from apple.items import
        # AppleItem`); confirm the import exists in the real file.
        item = AppleItem()
        item['reference_link'] = response.url
        item['rss_link'] = sites
        return item
尝试运行 scrapy crawl apple -o items.json -t json
但items.json只包含一个括号[
这是我的items.py文件:
from scrapy.item import Item, Field
class AppleItem(Item):
    """Container for one crawled page's RSS information."""
    # URL of the page the RSS links were extracted from.
    reference_link = Field()
    # List of RSS/Atom feed hrefs found on that page (may be empty).
    rss_link = Field()
答案 0（得分：1）
您的XPath表达式需要在 "application/rss+xml" 测试值周围加上引号。
尝试类似:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
class AppleItem(Item):
    """Container for one crawled page's RSS information."""
    # URL of the page the RSS links were extracted from.
    reference_link = Field()
    # List of RSS/Atom feed hrefs found on that page (may be empty).
    rss_link = Field()
class AppleSpider(CrawlSpider):
    """Recursively crawls apple.com, recording RSS feed links page by page."""
    name = 'apple'
    allowed_domains = ['apple.com']
    start_urls = ['http://apple.com']
    rules = [Rule(SgmlLinkExtractor(allow=()), follow=True, callback='parse_item')]

    def parse_item(self, response):
        """Return an AppleItem holding the page URL and any RSS hrefs found."""
        hxs = HtmlXPathSelector(response)
        # Note the quoted attribute value -- required for a valid XPath test.
        feed_hrefs = hxs.select(
            '/html/head/link[@type="application/rss+xml"]/@href'
        ).extract()
        item = AppleItem(reference_link=response.url, rss_link=feed_hrefs)
        return item