How can I make start_urls in Scrapy pick up URLs generated by another Python function?

Asked: 2014-04-02 09:34:07

Tags: python web-scraping scrapy

Here is my code for getting item URLs (link3) from eBay:

import urllib2
from bs4 import BeautifulSoup  # BeautifulSoup 4 is needed for .select()


def url_soup(url):
    source = urllib2.urlopen(url).read()
    soup = BeautifulSoup(source)
    # collect every item link; the original overwrote link3 on each iteration,
    # keeping only the last URL
    links = ['http://www.ebay.com/' + a['href'] for a in soup.select('a.ListItemLink')]
    return links


Dept={"All Departments":"0","Apparel":"5438","Auto":"91083","Baby":"5427","Beauty":"1085666",
"Books":"3920","Electronics":"3944","Gifts":"1094765","Grocery":"976759","Health":"976760",
"Home":"4044","Home Improvement":"1072864","Jwelery":"3891","Movies":"4096","Music":"4104",
"Party":"2637","Patio":"5428","Pets":"5440","Pharmacy":"5431","Photo Center":"5426",
"Sports":"4125","Toys":"4171","Video Games":"2636"}

def gen_url(keyword, domain):
    if domain in Dept:
        main_url = ('http://www.ebay.com/search/search-ng.do?search_query=%s'
                    '&ic=16_0&Find=Find&search_constraint=%s') % (keyword, Dept[domain])
        # fetch only when the department is known; previously main_url
        # could be undefined at this point
        return url_soup(main_url)

gen_url('Bags', 'Apparel')
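
For reference, gen_url('Bags', 'Apparel') builds and fetches a URL of this form (5438 being the Apparel code from Dept):

http://www.ebay.com/search/search-ng.do?search_query=Bags&ic=16_0&Find=Find&search_constraint=5438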

Now I want my spider to use those link3 URLs as its start_urls each time. P.S. I'm new to Scrapy!

1 Answer:

Answer 0 (score: 5):

You need to define a start_requests() method to build the URLs your spider starts from dynamically.

For example, you could have something like this:

from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spider import BaseSpider


class MySpider(BaseSpider):
    name = "my_spider"
    domains = ['Auto']
    departments = {"All Departments": "0", "Apparel": "5438", "Auto": "91083", "Baby": "5427", "Beauty": "1085666",
                   "Books": "3920", "Electronics": "3944", "Gifts": "1094765", "Grocery": "976759", "Health": "976760",
                   "Home": "4044", "Home Improvement": "1072864", "Jwelery": "3891", "Movies": "4096", "Music": "4104",
                   "Party": "2637", "Patio": "5428", "Pets": "5440", "Pharmacy": "5431", "Photo Center": "5426",
                   "Sports": "4125", "Toys": "4171", "Video Games": "2636"}
    keyword = 'Auto'

    allowed_domains = ['ebay.com']

    def start_requests(self):
        # build the starting requests dynamically instead of using start_urls
        for domain in self.domains:
            if domain in self.departments:
                url = 'http://www.ebay.com/search/search-ng.do?search_query=%s&ic=16_0&Find=Find&search_constraint=%s' % (self.keyword, self.departments.get(domain))
                print "YIELDING"
                yield Request(url)

    def parse(self, response):
        print "IN PARSE"
        sel = Selector(response)
        links = sel.xpath('//a[@class="ListItemLink"]/@href')
        for link in links:
            # extract() on a single Selector already returns the string;
            # link.extract()[0] would take only its first character
            href = link.extract()
            yield Request('http://www.ebay.com/' + href, self.parse_data)

    def parse_data(self, response):
        # do your actual crawling here
        print "IN PARSE DATA"

Hope that helps.