使用Scrapy检索隐藏的电话号码

时间:2017-08-21 01:33:07

标签: python web-scraping scrapy scrapy-spider

这是我的蜘蛛:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ExampleSpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['www.example.com']
    start_urls = ['http://www.example.com']
    rules = (
        Rule(LinkExtractor(allow=('/items/.'),deny=('sendMessage')), follow=True),
        Rule(LinkExtractor(allow=('/item/[a-z\+]+\-[0-9]+') ,deny=('sendMessage')), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        name = response.xpath('//*[@id="main"]/h1/text()').extract_first()
        locations = response.xpath("//*[@id='mylocation']//div[@class='location']")
        list_location = []
        for (i, location) in enumerate(locations):
            location_tab = {}
            phoneitem = []
                data = [
                  ('somedata', ''),
                  ('somedata', ''),
                  ('somedata', ''),
                ]
                request = scrapy.FormRequest(
                        'http://www.example.com/ajax/my-phone',
                        formdata= data,
                        callback=self.parse_for_number
                    )
                request.meta['phoneitem'] = phoneitem
                yield request
                location_tab['phone'] = phoneitem

            location_title = location.xpath("string(.//h3)").extract()
            location_title[0] = ' '.join(location_title[0].split())
            location_address = location.xpath(".//p[@class='address']").extract()
            #if not location_title:
            #    location_title = location.xpath("h3/a/text()").extract()
            location_tab['location_title'] = location_title[0].encode('utf-8')
            location_tab['location_address'] = location_address[0].encode('utf-8')
            list_location.append(location_tab)
            location_title = ''
            location_address = ''
            location_tab = {}
        if name :
            yield { 'name' : name,
                    'url' : url,
                    'locations' : list_location
            }

    def parse_for_number(self, response):
        phoneitem = request.meta['phoneitem']
        phones = response.xpath("//li[@class='phone']/text()").extract()
        for phone in phones:
            phoneitem.append(' '.join(phone.split()))
        yield phoneitem

我的蜘蛛包括检索人名(姓名)以及他们的地址和每个地址中的电话号码(电话号码通过带变量的ajax请求显示)。

手机列表(phoneitem)始终为空。错误在哪里?

编辑:
如何将parse_for_number函数的结果存储在变量中,并在之后使用它? 由于我在FOR循环中调用FormRequest,因此无法为每次执行函数生成电话变量。

1 个答案:

答案 0 :(得分:0)

已解决!

我使用请求库

import requests
request = requests.post('http://www.example.com/ajax/my-phone', cookies=cookies, data=data)

phones = Selector(text=request.content).xpath("//li[@class='phone']/text()").extract()