Scrapy newbie here. I'm currently trying to extend my CrawlSpider so that it can read multiple arguments from a text file (rather than typing each argument into the command line by hand, as in scrapy crawl crawl5 -a start_url="argument").
At the moment I can pass in a single argument and generate some items, but I'd like some guidance on two things: how to feed the spider its arguments from a file, and how to keep the items each argument returns separate.
In short, my goal is to mimic running my CrawlSpider several times while keeping the results per argument apart.
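The rough approach I have in mind is a small runner script like the sketch below (untested, and the file name words.txt plus the output/ directory are placeholders I made up). It reads one search term per line and queues one crawl per term through CrawlerProcess; the %(start_url)s placeholder in FEED_URI should keep each term's items in a separate file, since Scrapy fills it in from the spider attribute of the same name:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('FEED_FORMAT', 'json')
# %(start_url)s is interpolated from the spider attribute of the same name,
# so every argument should end up in its own output file.
settings.set('FEED_URI', 'output/%(start_url)s.json')

process = CrawlerProcess(settings)

# words.txt (placeholder name): one search term per line.
with open('words.txt') as f:
    for line in f:
        word = line.strip()
        if word:
            process.crawl('crawl5', start_url=word)

process.start()  # runs every queued crawl and blocks until they all finish

I realise this runs the crawls concurrently rather than strictly one after another, which I assume is fine as long as the outputs stay separate, but please correct me if that's the wrong way to go about it.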
Edit: here is my code; as you can see, it's a scraper for thesaurus.com.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from thesaurus.items import ThesaurusItem
class MySpider(CrawlSpider):
    name = 'crawl5'

    def __init__(self, *args, **kwargs):
        # Build the start URL and the crawl rules from the -a start_url argument.
        self.start_urls = ["http://www.thesaurus.com/browse/%s" % kwargs.get('start_url')]
        self.allowed_domains = ["thesaurus.com"]
        self.rules = (
            # restrict_xpaths must select the <a> elements themselves (not
            # their @href), and the id test needs the @ prefix.
            Rule(LinkExtractor(restrict_xpaths="//div[@id='paginator']//a")),
            # Follow the numbered result pages (/2, /10, ...) into parse_item.
            Rule(LinkExtractor(allow=('http://www.thesaurus.com/browse/%s/.$' % kwargs.get('start_url'),
                                      'http://www.thesaurus.com/browse/%s/..$' % kwargs.get('start_url'))),
                 callback='parse_item', follow=True),
        )
        # The parent constructor compiles self.rules, so call it last.
        super(MySpider, self).__init__(*args, **kwargs)
    def parse_start_url(self, response):
        # The rules don't match the very first page, so parse it here as well.
        return self.parse_item(response)

    def parse_item(self, response):
        for sel in response.xpath("//div[contains(@class, 'syn_of_syns')]"):
            item = ThesaurusItem()
            item['mainsynonym'] = sel.xpath("div/div/div/a/text()").extract()
            item['definition'] = sel.xpath("div/div/div[@class='def']/text()").extract()
            item['secondarysynonym'] = sel.xpath('div/div/ul/li/a/text()').extract()
            yield item
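The other idea I had for keeping the results apart is to tag every item with the argument that produced it, so that even one combined feed could be split afterwards. That would make parse_item look something like the sketch below (untested; searchterm is a hypothetical extra field I would first have to add to ThesaurusItem):

    def parse_item(self, response):
        for sel in response.xpath("//div[contains(@class, 'syn_of_syns')]"):
            item = ThesaurusItem()
            # 'searchterm' is a hypothetical field, not in ThesaurusItem yet;
            # self.start_url is set from the -a argument by Spider.__init__.
            item['searchterm'] = getattr(self, 'start_url', None)
            item['mainsynonym'] = sel.xpath("div/div/div/a/text()").extract()
            item['definition'] = sel.xpath("div/div/div[@class='def']/text()").extract()
            item['secondarysynonym'] = sel.xpath('div/div/ul/li/a/text()').extract()
            yield item

Is that a sensible way to do it, or is there a more idiomatic Scrapy approach?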