同时抓取iframe和网页

时间:2017-07-11 04:05:46

标签: python iframe scrapy

我只是想知道是否可以抓取网站上的网页并同时从此网页和此页面的iframe中提取数据?

我在python中使用scrapy,我已经知道如何从iframe中提取数据......

感谢您的帮助!!

感谢你的回答,我照做了……但是我不知道在请求里应该填什么 URL,你能再帮我一次吗?

# -*- coding: utf-8 -*-
import scrapy
import re
import numbers
from fnac.items import FnacItem
from urllib.request import urlopen
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup

class Fnac(scrapy.Spider):
    """Scrape a fnac.com seller page together with the iframe it embeds.

    The seller data is split across two documents: the outer page holds
    the sales count and country, while the embedded iframe holds the
    contact details (name, address, phone, ...).  ``parse`` extracts what
    the outer page provides, then forwards the partially filled item to
    ``parse2`` through ``Request.meta`` so one single item is assembled
    from both responses.
    """
    name = 'FnacCom'
    allowed_domains = ['fnac.com']
    start_urls = ['http://www.fnac.com/MORMANE/srefA5533119-3387-5EC4-82B6-AA61216BF599']

    def parse(self, response):
        """Extract outer-page fields, then follow every iframe.

        Uses the scrapy response directly instead of re-downloading the
        page with urlopen/BeautifulSoup (which blocked the reactor and
        fetched the URL a second time).
        """
        item = FnacItem()

        nb_sales = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').extract()
        country = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').extract()
        item['nb_sales'] = ''.join(nb_sales).strip()
        item['country'] = ''.join(country).strip()

        # Follow each iframe, carrying the partial item along in meta so
        # parse2 can keep filling the SAME item.  urljoin handles
        # relative src attributes.
        for src in response.xpath('//iframe/@src').extract():
            yield scrapy.Request(response.urljoin(src),
                                 meta={'item': item},
                                 callback=self.parse2)

    def parse2(self, response):
        """Finish populating the item started in ``parse`` from the iframe."""
        item = response.meta['item']

        address = response.xpath('//div/p/text()').re(r'.*Adresse \: (.*)\n?.*')
        email = response.xpath('//div/ul/li[contains(text(),"@")]/text()').extract()
        name = response.xpath('//div/p[@class="customer-policy-label"]/text()').re(r'Infos sur la boutique \: ([a-zA-Z0-9]*)')
        phone = response.xpath('//div/p/text()').re(r'.*Tél \: ([\d]*)\n?.*')
        siret = response.xpath('//div/p/text()').re(r'.*Siret \: ([\d]*)\n?.*')
        vat = response.xpath('//div/text()').re(r'.*TVA \: (.*)')

        # Only emit the item when the iframe actually contained seller
        # info (some iframes on the page are unrelated widgets).
        if name:
            item['name'] = ''.join(name).strip()
            item['address'] = ''.join(address).strip()
            item['phone'] = ''.join(phone).strip()
            item['email'] = ''.join(email).strip()
            item['vat'] = ''.join(vat).strip()
            item['siret'] = ''.join(siret).strip()
            yield item

1 个答案:

答案 0 :(得分:0)

要将来自不同请求的信息合并到同一个 item 中,您必须使用请求的 meta 参数:

def parse1(self, response):
    # Build a partially-filled item from the first response; the actual
    # fields are elided here ("...") — fill in whatever this page provides.
    item1 = {
        ...
    }
    # Hand the partial item to the next callback through Request.meta so
    # that parse2 keeps populating the SAME item.
    yield Request(url='another_url.com', meta={'item': item1}, callback=self.parse2)

def parse2(self, response):
    # Recover the partially-filled item forwarded by parse1 via meta.
    same_item = response.meta['item']
    # keep populating the item with the second response
    ...
    # Emit the now-complete item built from both responses.
    yield same_item