我试图从子网站抓取所有细节。对于每个项目是一个新的子网站,我想从中检索其他信息。我将所有信息存储到每个项目和每个细节的字典中。
问题是我在收到项目词典项目时会收到重复的项目,我希望它与我的功能有关,而不仅仅是更新的词典项目。但是,我成功地只将一个更新的dict项目解析回该项目。而是我获得了每个更新的dict项目的新项目。我希望问题与我的 for loop 或我的' def parse_item_sub '函数有关。
快速介绍我在代码中所做的事情;
守则
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc
class scrapeInfo(Spider):
name = "info"
allowed_domains = ["http://www.nevermind.com"]
start_urls = []
def start_requests(self):
#Get infoID and Type from database
self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
self.cursor = self.conn.cursor()
self.cursor.execute("SELECT InfoID, category FROM dbo.StageItem")
rows = self.cursor.fetchall()
for row in rows:
url = 'http://www.nevermind.com/info/'
InfoID = row[0]
category = row[1]
yield self.make_requests_from_url(url+InfoID, InfoID, category, self.parse)
def make_requests_from_url(self, url, InfoID, category, callback):
request = Request(url, callback)
request.meta['InfoID'] = InfoID
request.meta['category'] = category
return request
def parse(self, response):
hxs = Selector(response)
infodata = hxs.xpath('div[2]/div[2]') # input item path
itemPool = []
InfoID = response.meta['InfoID']
category = response.meta['category']
for info in infodata:
item = infoItem()
item1, item2, item3 = [InfoItemSubSite() for i in range(3)]
# Stem Details
item['id'] = InfoID
item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
item1['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
item1['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
item1['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract()
item2['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
item2['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
item2['field7'] = info.xpath('tr[7]/td[2]/p/b/@href').extract()
item3['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
item3['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
item3['field7'] = info.xpath('tr[7]/td[2]/p/b/@href').extract()
item['subsite_dic'] = [dict(item1), dict(item2), dict(item3)]
sub_href = [d['field6'] for d in item['subsite_dic']]
# Below is the code that should send each dictionary item to get further information
# I think it might be here that I have a problem
position = 0
for i in sub_href:
if i:
url = 'http://www.nevermind.com/info/'+''.join(i[0])
yield Request(url, self.parse_item_sub, meta={'item':item,'position':position})
print i[0], position, url
position = position +1
else:
position = position +1
itemPool.append(item)
yield item
pass
# Function to extract additional info from the subsite, and return it to the original item.
# This works and all the right information is extracted
def parse_item_sub(self, response, category):
hxs = Selector(response)
position = response.meta['position']
item = response.meta['item']
subsite = item['subsite_dic'][position]
item['subsite_field11'] = i.xpath('/td[2]/span/@title').extract()
item['subsite_field12'] = i.xpath('/tr[4]/td[2]/text()').extract()
item['subsite_field13'] = i.xpath('/div[5]/a[1]/@href').extract()
response.meta['item'] = item
yield item
pass