在scrapy中将url传递给蜘蛛

时间:2017-11-03 20:54:40

标签: python scrapy

我正在尝试将url传递给我的蜘蛛但是在查看了其他人在堆栈溢出中所做的事后,我总是收到以下错误:

raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)

有人可以解释我做错了什么吗?只是想学习如何设置这个api。

import scrapy
from drinkRecipe.items import DrinkrecipeItem
class RecipeSpider(scrapy.Spider):
    name = "recipe"
    allowed_domains = ["www.1001cocktails.com"]


    def __init__(self,*args, **kwargs):
        self.start_urls = [kwargs.get('searchUrl')]
        super(RecipeSpider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('searchUrl')]
        print self.start_urls


    def parse(self, response):
        for selector in response.xpath('//a[contains(@title,"Cocktail")]'):
            href = selector.xpath('@href').extract()[0]
            if href is not None:
                href = response.urljoin(href)
                yield scrapy.Request(url=href, callback=self.parse_Recipe)

    def parse_Recipe (self,response):
        item = DrinkrecipeItem()
        item['name'] = response.xpath('//h1/text()').extract()[0][:-17]

        measurements = response.xpath('//td[@class="normal2"]/text()').re(r'.*oz.*')
        ingredients = []
        recipe = ""
        for selector  in response.xpath('//a[contains(@alt,"Recipes with")]'):
            ingredient = selector.xpath('u/text()').extract()[0]
            try:
                ingredients.append(str(ingredient))
            except:
                continue
        for i in range(len(measurements)):
            measurement = str(measurements[i]).replace("                  ","")
            recipe += measurement+ingredients[i]+":"
    item['recipe'] = recipe
        yield item

运行蜘蛛的主要脚本

from scrapy import signals
from scrapy.crawler import CrawlerProcess, Crawler
from scrapy.utils.project import get_project_settings
from drinkRecipe import DrinkrecipeItem,RecipeSpider

items = []


def scrapedItems(item, response, spider):
    items.append(item)

def main():
    settings = get_project_settings()
    crawler = Crawler(RecipeSpider(searchUrl='http://www.1001cocktails.com/recipes/cocktails/cocktails_list_vu.php3?&start=0'))
    crawler.signals.connect(scrapedItems,signal=signals.item_scraped)
    process = CrawlerProcess(settings)
    process.crawl(crawler)
    process.start()
    print '-'*50
    print items
if __name__ == '__main__':
    main()

1 个答案:

答案 0 :(得分:0)

所以我认为我是一个白痴因为没有意识到我应该在我的脚本中通过进程爬虫的arugument。