Question

我试图从子网站抓取所有细节。对于每个项目是一个新的子网站，我想从中检索其他信息。我将所有信息存储到每个项目和每个细节的字典中。

问题是我在收到项目词典项目时会收到重复的项目，我希望它与我的功能有关，而不仅仅是更新的词典项目。但是，我成功地只将一个更新的dict项目解析回该项目。而是我获得了每个更新的dict项目的新项目。我希望问题与我的 for loop 或我的' def parse_item_sub '函数有关。

快速介绍我在代码中所做的事情;

从数据库中回复id
将网站划分为项目。因为该网站包含几个相同的项目，但各不相同信息。我创建了额外的sub_items，它们被刮掉了进入主项目，作为词典。
遍历包含href的每个dict项，以从该站点获取信息并将其刮到字典中的dict。 ' item ['subsite_dic'] [position] '此后返回项目是否可以返回项目的更新dict项目？

守则

from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc


class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["http://www.nevermind.com"]
    start_urls = []

    def start_requests(self):

        #Get infoID and Type from database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID, category FROM dbo.StageItem")

        rows = self.cursor.fetchall()

        for row in rows:
            url = 'http://www.nevermind.com/info/'
            InfoID = row[0]
            category = row[1]
            yield self.make_requests_from_url(url+InfoID, InfoID, category, self.parse)

    def make_requests_from_url(self, url, InfoID, category, callback):
        request = Request(url, callback)
        request.meta['InfoID'] = InfoID
        request.meta['category'] = category
        return request

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input item path

        itemPool = []

        InfoID = response.meta['InfoID']
        category = response.meta['category']

        for info in infodata:
            item = infoItem()
            item1, item2, item3 = [InfoItemSubSite() for i in range(3)]

            # Stem Details
            item['id'] = InfoID
            item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
            item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
            item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()

            item1['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
            item1['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item1['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract()

            item2['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item2['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
            item2['field7'] = info.xpath('tr[7]/td[2]/p/b/@href').extract()     

            item3['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item3['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
            item3['field7'] = info.xpath('tr[7]/td[2]/p/b/@href').extract() 

            item['subsite_dic'] = [dict(item1), dict(item2), dict(item3)]

            sub_href = [d['field6'] for d in item['subsite_dic']]

            # Below is the code that should send each dictionary item to get further information
            # I think it might be here that I have a problem
            position = 0
            for i in sub_href:
                if i:
                    url = 'http://www.nevermind.com/info/'+''.join(i[0])

                    yield Request(url, self.parse_item_sub, meta={'item':item,'position':position})
                    print i[0], position, url
                    position = position +1     
                else:
                    position = position +1

            itemPool.append(item)
            yield item

        pass

        # Function to extract additional info from the subsite, and return it to the original item.
        # This works and all the right information is extracted
        def parse_item_sub(self, response, category):
            hxs = Selector(response)
            position = response.meta['position']
            item = response.meta['item']

            subsite = item['subsite_dic'][position]

            item['subsite_field11'] = i.xpath('/td[2]/span/@title').extract()            
            item['subsite_field12'] = i.xpath('/tr[4]/td[2]/text()').extract()
            item['subsite_field13'] = i.xpath('/div[5]/a[1]/@href').extract()

            response.meta['item'] = item

            yield item

            pass

Scrapy返回更新的dict项目

0 个答案: