Passing arguments to my spider in Scrapy

Asked: 2017-02-13 08:26:32

Tags: python scrapy

I have written a Scrapy spider, but I need to add the ability to read some arguments from the command line and use them to populate some static fields in my spider class. I also need to override the initializer so that it fills in some of the spider's fields.

import scrapy
from scrapy.spiders import Spider
from scrapy.http import Request
import re


class TutsplusItem(scrapy.Item):
    title = scrapy.Field()


class MySpider(Spider):
    name = "tutsplus"
    allowed_domains = ["bbc.com"]
    start_urls = ["http://www.bbc.com/"]

    def parse(self, response):
        links = response.xpath('//a/@href').extract()
        # We stored already crawled links in this list
        crawledLinks = []

        for link in links:
            # If it is a proper link and is not checked yet, yield it to the Spider
            # if linkPattern.match(link) and not link in crawledLinks:
            if link not in crawledLinks:
                link = "http://www.bbc.com" + link
                crawledLinks.append(link)
                yield Request(link, self.parse)

        titles = response.xpath('//a[contains(@class, "media__link")]/text()').extract()
        for title in titles:
            item = TutsplusItem()
            item["title"] = title
            print("Title is : %s" % title)
            yield item

It should then be possible to run it like this:

scrapy runspider crawler.py arg1 arg2

How can I achieve this?

1 Answer:

Answer 0 (score: 0)

You can do this by overriding your spider's __init__ method.

class MySpider(Spider):
    name = "tutsplus"
    allowed_domains = ["bbc.com"]
    start_urls = ["http://www.bbc.com/"]
    arg1 = None
    arg2 = None

    def __init__(self, arg1, arg2, *args, **kwargs):
        # Values passed on the command line with -a arg1=... -a arg2=...
        # arrive here as keyword arguments.
        self.arg1 = arg1
        self.arg2 = arg2
        super(MySpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        links = response.xpath('//a/@href').extract()
        # We stored already crawled links in this list
        crawledLinks = []

        for link in links:
            # If it is a proper link and is not checked yet, yield it to the Spider
            # if linkPattern.match(link) and not link in crawledLinks:
            if link not in crawledLinks:
                link = "http://www.bbc.com" + link
                crawledLinks.append(link)
                yield Request(link, self.parse)

        titles = response.xpath('//a[contains(@class, "media__link")]/text()').extract()
        for title in titles:
            item = TutsplusItem()
            item["title"] = title
            print("Title is : %s" % title)
            yield item

Then run your spider:

scrapy crawl tutsplus -a arg1=arg1 -a arg2=arg2
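
For reference, overriding __init__ is only strictly necessary when the values need to be validated or transformed, because Scrapy's base Spider constructor already copies any -a key=value pairs onto the instance as attributes. Below is a minimal sketch of that variant (the spider name "argsdemo" and the attribute names arg1/arg2 are just illustrative, not from the original answer):

import scrapy


class ArgsSpider(scrapy.Spider):
    name = "argsdemo"
    start_urls = ["http://www.bbc.com/"]

    def parse(self, response):
        # arg1 and arg2 were supplied on the command line with -a and are
        # available as plain instance attributes set by the base class.
        self.logger.info("arg1=%s, arg2=%s",
                         getattr(self, "arg1", None),
                         getattr(self, "arg2", None))

The same -a syntax also works with the runspider command used in the question, e.g. scrapy runspider crawler.py -a arg1=foo -a arg2=bar.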