python web递归抓取错误

时间:2016-06-30 14:43:24

标签: python recursion scrapy scrapy-spider

我正在尝试刮掉多个页面。其结构如下:

- >第1页 - 刮取链接

------->第2页 - 删除更多链接(某些页面包含分页)和数据

------------>第3页 - 抓取数据

返回18项,但有127页(第2步)和18项/页。而不是在item中返回author和author_link。

import scrapy

from scrapy.spiders         import CrawlSpider, Rule
from scrapy.linkextractors  import LinkExtractor as lext
from scrapy.selector        import Selector
from scrapy.http            import Request

from ror.items              import RorItem

class RorSpiderSpider(CrawlSpider):
    name = "ror_spider"
    allowed_domains = ["example.com"]
    start_urls = (
        'http://www.example.com/',
    )
    rules = [
        Rule(lext(allow=("http://www.example.com/$"), restrict_xpaths=('//a[@class="nextpostslink"]',)), callback='parse', follow=True),
    ]

    def parse(self, response):
        links = Selector(response).xpath('//ul[@id="nav"]/li')
        for link in links:
            item = RorItem()
            item['menu_link'] = link.xpath('a/@href').extract()[0]
            item['menu_title'] = link.xpath('a/text()').extract()[0]
            if "http" not in item['menu_link']:
                item['menu_link'] = "http://www.reviewofreligions.org" + ''.join(item['menu_link'])
                yield Request(url=item['menu_link'], meta={'item': item}, callback=self.parse_articles)
            else:
                yield Request(url=item['menu_link'], meta={'item': item}, callback=self.parse_articles)

    def parse_articles(self, response):
        sel = Selector(response)
        item = response.meta['item']

        if "articles" in item['menu_link']:
            item['link_cat'] = item['menu_title']
            pg = 1
            maxPgs = 124
            while pg <= 124:
                item['article_pg_link'] = item['menu_link'] + "page/" + str(pg) + "/"
                article_links = sel.xpath('//div[@id="rightcol"]/div[@class="articlebox"]')
                for art_link in article_links:
                    item['article_link'] = art_link.xpath('a[@class="title "]/@href').extract()[0]
                    item['article_title'] = art_link.xpath('a[@class="title "]/text()').extract()[0].replace('\n\t\t\t\t', '').replace('\t\t\t\t', '')
                    # article_txt_1 = art_link.xpath('text()').extract()[1].replace('\n       \n\t\t\t\t', '').replace('\t\t\t\t', '').replace('\n \n', '')
                    # article_txt_2 = art_link.xpath('text()').extract()[2].replace('\n       \n\t\t\t\t', '') if art_link.xpath('text()').extract()[2] else ''
                    # item['article_txt'] = article_txt_1 + '\n'.join(article_txt_2).replace('\n\n\n \n\n\n \n \n \n \n\n\n\t\n\t\n\t', '')
                    yield Request(url=item['article_link'], meta={'item': item}, callback=self.article_page)
                pg += 1

    def article_page(self, response):
        select = Selector(response)
        item = response.meta['item']
        item['author'] = select.xpath('//div[@id="author"]/a/text()').extract()
        item['author_link'] = select.xpath('//div[@id="author"]/a/@href').extract()
        return item

代码有什么问题?

0 个答案:

没有答案