I'm building a scraper. I want it to go through all the available pages on the site, [i] filling in a number of data fields for each product, and [ii], for each product, drilling into the corresponding product URL and populating a few more data fields. I want all the data for each product in the same {}. Instead, what the crawler is doing is performing [i] and then [ii], so that part [ii] gets populated in a separate {}.
I'd like to somehow feed the data from [i] into [ii]. Using request.meta['item'] = item looks like the way to do it, but I haven't managed to make it work yet.
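The pattern I have in mind is roughly this (a minimal sketch with made-up selectors and callback names, not my actual spider):

def parse_page(self, response):
    item = CrawlerItem()
    # fields from the listing page
    item['title'] = response.xpath('//h1/text()').extract_first()
    detail_url = response.xpath('//a[@class="detail"]/@href').extract_first()
    # carry the half-filled item into the follow-up request
    yield Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

def parse_detail(self, response):
    item = response.meta['item']  # same item, recovered from meta
    item['telephone'] = response.xpath('//span[@class="tel"]/text()').extract_first()
    yield item  # one {} holding fields from both pages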
I have the following code:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from maxine.items import CrawlerItem

class Crawler1Spider(CrawlSpider):
    name = "crawler1"
    allowed_domains = ["website.com"]
    start_urls = (
        'starturl.com',
    )

    rules = [
        # visit each page
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="listnavpagenum"]')), callback='parse_item', follow=True),
        # click on each product link
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="exhib_status exhib_status_interiors"]')), callback='parse_detail', follow=True),
    ]

    def parse_item(self, response):
        sel = Selector(response)
        elements = sel.xpath('//div[@class="ez_listitem_wrapper"]')
        items = []
        results = []
        n = 0
        for element in elements:
            item = CrawlerItem()
            n = n + 1
            # work out how to put images into image folder
            item['title'] = element.css('a.exhib_status.exhib_status_interiors').xpath('text()').extract_first()
            item['title_code'] = element.xpath('.//div[@class="ez_merge8"]/text()').extract_first()
            item['item_url'] = element.xpath('//div[@class="ez_merge4"]/a/@href').extract_first()
            item['count'] = n
            yield item
            #items.append(item)
        #return items

    def parse_detail(self, response):
        item = CrawlerItem()
        item['telephone'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('[0-9]{4,}\s*[0-9]{4,}')
        item['website'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('(?:http://)?www.[a-z0-9\/?_\- ]+.[0-9a-z]+')
        yield item
Any suggestions on how I can merge all the data for each product into a single {}?
Update: 20 November 2015

I've modified the code as follows:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from maxine.items import CrawlItem

class Crawler1Spider(CrawlSpider):
    name = "test"
    allowed_domains = ["website.com"]
    start_urls = (
        'starturl.com',
    )

    rules = [
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="listnavpagenum"]')), callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        item = CrawlItem()
        sel = Selector(response)
        elements = sel.xpath('//div[@class="ez_listitem_wrapper"]')
        items = []
        n = 0
        for element in elements:
            n = n + 1
            # work out how to put images into image folder
            #item['image_urls'] = selector.xpath('//a[@class="exhib_status exhib_status_interiors"]/img/@src').extract()
            item['title'] = element.css('a.exhib_status.exhib_status_interiors').xpath('text()').extract_first()
            item['title_code'] = element.xpath('.//div[@class="ez_merge8"]/text()').extract_first()
            item['item_url'] = element.xpath('//div[@class="ez_merge4"]/a/@href').extract_first()
            item['count'] = n
            item_detail_url = item['item_url'] = element.xpath('//div[@class="ez_merge4"]/a/@href').extract_first()
            # crawl the item and pass the item to the following request with *meta*
            yield Request(url=item_detail_url, callback=self.parse_detail, meta=dict(item=item))

    def parse_detail(self, response):
        # get the item from the previous passed meta
        item = response.meta['item']
        # keep populating the item
        item['telephone'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('[0-9]{4,}\s*[0-9]{4,}')
        item['website'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('(?:http://)?www.[a-z0-9\/?_\- ]+.[0-9a-z]+')
        yield item
I'm now getting the data in the same {}, but the bot only extracts data from the last item on each page. Any further suggestions?
Answer 0 (score: 0)
I'm afraid you can't use rules in this case, because each request is independent by the time it reaches the site you're crawling. You need to define your own behaviour starting from start_requests:
def start_requests(self):
    yield Request(url=myinitialurl, callback=self.parse)

def parse(self, response):
    # crawl the initial page and then do something with that info
    yield Request(url=producturl, callback=self.parse_item)

def parse_item(self, response):
    item = CrawlerItem()
    # crawl the item and pass the item to the following request with *meta*
    yield Request(url=item_detail_url, callback=self.parse_detail, meta=dict(item=item))

def parse_detail(self, response):
    # get the item from the previous passed meta
    item = response.meta['item']
    # keep populating the item
    yield item
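The "do something with that info" step in parse usually means extracting each product URL (plus any pagination links) and yielding one request per product. A rough sketch of that step, reusing the XPaths from your question purely as placeholders:

def parse(self, response):
    # one request per product wrapper on the listing page
    for element in response.xpath('//div[@class="ez_listitem_wrapper"]'):
        producturl = element.xpath('.//div[@class="ez_merge4"]/a/@href').extract_first()
        if producturl:
            yield Request(url=response.urljoin(producturl), callback=self.parse_item)
    # and follow the pagination links back into parse itself
    for nexturl in response.xpath('//a[@class="listnavpagenum"]/@href').extract():
        yield Request(url=response.urljoin(nexturl), callback=self.parse)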
Answer 1 (score: 0)
Try instantiating item = CrawlItem() inside the for loop in parse_item.
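Concretely, the loop in your updated parse_item would become something like the sketch below. Moving item = CrawlItem() inside the loop gives each product its own {}; changing //div to .//div in the item_url XPath is an additional guess, since the absolute path matches the same node for every element:

def parse_item(self, response):
    n = 0
    for element in response.xpath('//div[@class="ez_listitem_wrapper"]'):
        item = CrawlItem()  # a fresh item for every product, inside the loop
        n = n + 1
        item['title'] = element.css('a.exhib_status.exhib_status_interiors').xpath('text()').extract_first()
        item['title_code'] = element.xpath('.//div[@class="ez_merge8"]/text()').extract_first()
        # relative .// so each element yields its own URL, not the page's first match
        item['item_url'] = element.xpath('.//div[@class="ez_merge4"]/a/@href').extract_first()
        item['count'] = n
        yield Request(url=item['item_url'], callback=self.parse_detail, meta=dict(item=item))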