Extracting data with Scrapy, looping through subpages

Date: 2015-12-17 14:21:57

Tags: python loops web-scraping scrapy scrapy-spider

There is a page on my site that contains a list of staff members. Each staff member's name links to their own page.

I want to output a CSV file listing each staff member's name and titles, so the spider needs to follow each link on the staff-list page and extract the name and titles from the linked page.
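
(For the CSV itself, Scrapy's built-in feed exports are usually enough: running the spider with "scrapy crawl scrape -o staff.csv" turns each yielded item into a row, with no CSV-writing code needed in the spider.)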

So far, this code only extracts the names and titles from the list page itself. The problem I'm having is going through each person's page to get the complete list.

How can I make this loop work?

import scrapy

from myproject.items import scrapeItem  # hypothetical module path; adjust to your project


class scrapeSpider(scrapy.Spider):
    name = "scrape"
    allowed_domains = ["example.com", "example.co.uk"]
    start_urls = [
        'http://example.com/stafflist/',
    ]

    def parse(self, response):
        # follow every link inside the "span8" container to each person's page
        for href in response.xpath('//div[contains(concat(" ", normalize-space(@class), " "), " span8 ")]//a/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_SCRAPE)

    def parse_SCRAPE(self, response):
        # on the person's page: name is in <h1>, titles are in <h2>
        for sel in response.xpath('//div[contains(concat(" ", normalize-space(@class), " "), " span9 ")]'):
            item = scrapeItem()
            item['name'] = sel.xpath('h1/text()').extract()
            item['titles'] = sel.xpath('h2/text()').extract()
            yield item
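
Both snippets on this page assume an Item class along these lines. A minimal sketch (field names taken from the code; the module path is whatever your project uses):

import scrapy

class scrapeItem(scrapy.Item):
    name = scrapy.Field()
    titles = scrapy.Field()

class PersonItem(scrapy.Item):  # used by the answer below
    name = scrapy.Field()
    titles = scrapy.Field()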

1 Answer:

Answer 0 (score: 0)

Use a CrawlSpider, e.g.:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from myspider.items import PersonItem

from pyquery import PyQuery as pq  # PyQuery is awesome!


class MySpider(CrawlSpider):
    name = 'myspider'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/stafflist/']

    rules = (
        # if you have a paginator, this Rule follows its links
        Rule(LinkExtractor(
             restrict_xpaths='//div[@class="paging"]//a[last()]'),
             follow=True),
        # restrict the crawler to links found inside restrict_xpaths
        # and process those links with 'parse_item'
        # (note: restrict_xpaths must select elements, not @href attributes)
        Rule(LinkExtractor(
             restrict_xpaths='//div[contains(concat(" ", normalize-space(@class), " "), " span8 ")]'),
             callback='parse_item',
             follow=False),
    )

    def parse_item(self, response):
        """
        process a person's page
        """
        # keep state local: the spider handles many responses concurrently,
        # so storing the response or document on self would be unsafe
        doc = pq(response.body)

        i = PersonItem()
        i["name"] = doc("h1").text()
        i["titles"] = doc("h2").text()
        # ...

        return i
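
If you'd rather not add the pyquery dependency, the same extraction works with Scrapy's built-in selectors. A sketch under the same assumptions (PersonItem as above, name in <h1>, titles in <h2>):

def parse_item(self, response):
    """process a person's page with Scrapy's own selectors"""
    i = PersonItem()
    i["name"] = response.xpath('//h1/text()').extract_first()
    i["titles"] = response.xpath('//h2/text()').extract_first()
    return i

Either way, the CSV the question asks for falls out of Scrapy's feed exports: running "scrapy crawl myspider -o staff.csv" writes one row per returned item, with name and titles as columns.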