
时间:2017-07-11 04:05:46

标签: python iframe scrapy





# -*- coding: utf-8 -*-
import scrapy
import re
import numbers
from fnac.items import FnacItem
from urllib.request import urlopen
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup

class Fnac(CrawlSpider): #scrapy.Spider
    name = 'FnacCom'
    allowed_domains = ['fnac.com']
    start_urls = ['http://www.fnac.com/MORMANE/srefA5533119-3387-5EC4-82B6-AA61216BF599']

##### To extract links in order to run the spider in them
    # rules = (
    #     Rule(LinkExtractor(allow=()), callback='parse'),
    # )

    def parse(self, response):
        soup = BeautifulSoup(urlopen(response.url), "lxml")
        iframexx = soup.find_all('iframe')
        for iframe in iframexx:
            yield scrapy.Request(iframe.attrs['src'],callback=self.parse2)

##### Main function
    def parse1(self, response):
        item1 = FnacItem()

        nb_sales = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').extract()
        country = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').extract()

        yield scrapy.Request(url, meta={'item': item1}) #I don't know what to put instead of URL...

    def parse2(self, response):
        same_item = response.meta['item']

        address = response.xpath('//div/p/text()').re(r'.*Adresse \: (.*)\n?.*')
        email = response.xpath('//div/ul/li[contains(text(),"@")]/text()').extract()
        name = response.xpath('//div/p[@class="customer-policy-label"]/text()').re(r'Infos sur la boutique \: ([a-zA-Z0-9]*)')
        phone = response.xpath('//div/p/text()').re(r'.*Tél \: ([\d]*)\n?.*')
        siret = response.xpath('//div/p/text()').re(r'.*Siret \: ([\d]*)\n?.*')
        vat = response.xpath('//div/text()').re(r'.*TVA \: (.*)')

        if (len(name) != 0):
            item['name'] = ''.join(name).strip()
            item['address'] = ''.join(address).strip()
            item['phone'] = ''.join(phone).strip()
            item['email'] = ''.join(email).strip()
            item['nb_sales'] = ''.join(nb_sales).strip()
            item['country'] = ''.join(country).strip()
            item['vat'] = ''.join(vat).strip()
            item['siret'] = ''.join(siret).strip()
            return item

1 个答案:

答案 0 :(得分:0)


def parse1(self, response):
    item1 = {
    yield Request(url='another_url.com', meta={'item': item1}, callback=self.parse2)

def parse2(self, response):
    same_item = response.meta['item']
    # keep populating the item with the second response
    yield same_item