Scrapy: send a condition from start_requests(self) to parse

Asked: 2016-07-28 16:28:49

Tags: python scrapy conditional scrape

I am scraping a website whose pages have different rows depending on the type of item being scraped. I have a working scraper that looks like the first code block below, but I would like to fetch a type from the database in start_requests(self) and send it along to the parse function. I have 11 different types; each one has a different number of rows in one table on part of the page, while the rows in the other tables on the page are the same for all types. I have tried to show what I mean in the second code block.

How do I fetch the type from the database in start_requests and pass it to parse?

First code block

# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc


class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["http://www.nevermind.com"]
    start_urls = []

    def start_requests(self):

        # Get InfoID from the database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID FROM dbo.infostage")

        rows = self.cursor.fetchall()

        for row in rows:
            url = 'http://www.nevermind.com/info/'
            yield self.make_requests_from_url(url + row[0])

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input item path

        itemPool = []

        InfoID = ''.join(response.url)
        id = InfoID[29:len(InfoID) - 1]

        for info in infodata:
            item = infoItem()

            # Details
            item['id'] = id  # response.url
            item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
            item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
            item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
            item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
            item['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()

            itemPool.append(item)
            yield item

Second code block
This does not work, but I do not know how to make it work. Do I create a global list, or a new function?

# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc


class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["http://www.nevermind.com"]
    start_urls = []

    def start_requests(self):

        # Get InfoID and type from the database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID, type FROM dbo.infostage")

        rows = self.cursor.fetchall()

        for row in rows:
            url = 'http://www.nevermind.com/info/'
            type = row[1]  # how do I send this value to the parse function?
            yield self.make_requests_from_url(url + row[0])

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input base path

        itemPool = []

        InfoID = ''.join(response.url)
        id = InfoID[29:len(InfoID) - 1]

        for info in infodata:
            item = infoItem()

            # Details
            item['id'] = id  # response.url

            # Here I need a condition that comes from start_requests(self).
            # If the condition is met, scrape the following fields; otherwise scrape the others.
            if type == 'type1':
                # This is where I would like to use it.
                # I have 11 different types that each have a different number of rows
                # for one table on part of the page, while the rows in the other tables
                # on the page are the same for all types.
                # Type 1
                item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
                item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
                item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
                item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
                item['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
                item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
            else:
                item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
                item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
                item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()

            itemPool.append(item)
            yield item


Thanks everyone for your help and insight!

1 Answer:

Answer 0 (score: 2)

You can use request.meta:

def make_requests_from_url(self, url, type, callback):
    # requires `import scrapy` at the top of the spider module
    request = scrapy.Request(url, callback)
    request.meta['type'] = type
    return request
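
In start_requests this overridden helper could then be called with the extra arguments, for example (assuming the callback is the spider's own parse method):

    yield self.make_requests_from_url(url + row[0], row[1], self.parse)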

In parse you can then access the type via response.meta['type'].
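
Putting it together, here is a minimal sketch of how the meta approach from this answer could be wired into the spider from the question. The URL, type values, XPaths, and field names are placeholders carried over from the question, and the hardcoded rows stand in for the pyodbc query, so treat this as an illustration rather than a drop-in implementation:

    import scrapy
    from scrapy.spiders import Spider


    class InfoMetaSketchSpider(Spider):
        name = "info_meta_sketch"

        def start_requests(self):
            # In the real spider these rows would come from the pyodbc query
            # shown in the question; hardcoded here to keep the sketch self-contained.
            rows = [('12345', 'type1'), ('67890', 'type2')]
            for info_id, info_type in rows:
                url = 'http://www.nevermind.com/info/' + info_id
                # Attach the type to the request so parse() can read it later.
                yield scrapy.Request(url, callback=self.parse, meta={'type': info_type})

        def parse(self, response):
            info_type = response.meta['type']  # value set in start_requests
            for info in response.xpath('div[2]/div[2]'):
                # A plain dict is used instead of the question's infoItem
                # to keep the example self-contained.
                item = {'id': response.url}
                if info_type == 'type1':
                    # Type 1 pages: scrape the extra rows as well.
                    item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
                    item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
                else:
                    # Other types: scrape only the shared rows.
                    item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
                yield item

Each request carries its own meta dictionary and Scrapy copies it onto the corresponding response, so the value read in parse always matches the database row that generated that particular URL.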