scrapy.Request not working in recursive callback

Asked: 2017-09-09 07:50:07

Tags: python recursion scrapy

I am scraping the web with Scrapy. The spider handles each sub-site in two stages: it first checks whether the page is a PDF. If it is, it yields the PDF item and stops there. If it is an HTML page, it goes on to check, just once, whether there are any links inside this sub-page. If there are, the current method is called recursively to yield items of either format (HTML or PDF). However, the line "yield scrapy.Request(url, callback=self.parse_article, meta=new_meta)" does not work. Can anyone point out what is wrong with my logic/code?

    def parse_article(self, response):
        item = response.meta
        item['url'] = response.url
        _, ext = os.path.splitext(urlparse(response.url).path)

        is_binary_document = response.meta.pop('is_binary_document', False)

        if is_binary_document:  # binary files
            item['html_content'] = None
            item['content'] = response.body_as_unicode()

        else:  # html files
            item['content'] = extract_text(response.xpath('//div[@id="content"]//text()'))
            item['mime_type'] = 'text/html'
            item['html_content'] = response.body_as_unicode()

            category = response.meta.pop('category', None)
            follow_links = response.meta.pop('follow_links', True)

            if follow_links:
                if '/research/' in item['url']:
                    for year_url in response.xpath('//select[@class="dropdownYear"]/option/@value').extract():
                        yield scrapy.Request(response.urljoin(year_url), self.parse_research_year)
                else:
                    # Follow links on page for PDF, XLS files, etc that are in the same sub category as referer
                    for a in response.xpath('//div[@id="content"]//a[@href]'):
                        href = a.xpath('@href').extract_first()
                        _, ext = os.path.splitext(href)

                        url = response.urljoin(href)
                        if category is None or '/{}/'.format(category) in url:
                            new_meta = response.meta.copy()
                            new_meta['follow_links'] = False  # only follow for one level
                            link_text = extract_text(a.xpath('.//text()'))
                            yield scrapy.Request(url, callback=self.parse_article, meta=new_meta)
                    #end for
                #end if
            #end if
        #end if
        yield item
    #end def
    #end class
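
In Scrapy, a yielded Request that never reaches its callback is usually being dropped silently, most often by the duplicate-request filter (the URL was already crawled) or because the download failed. Below is a minimal sketch, not the original spider, of the same one-level recursion with two diagnostic additions, dont_filter=True and an errback; the spider name, the start URL handling, the on_error helper and the simplified link extraction are assumptions made purely for illustration.

    import scrapy

    class RecursiveDebugSpider(scrapy.Spider):
        # Minimal debugging sketch -- not the original HKMASpider.
        # Only the keyword arguments on the recursive Request matter here.
        name = 'recursive_debug'
        start_urls = ['http://www.hkma.gov.hk/eng/']

        def parse(self, response):
            return self.parse_article(response)

        def parse_article(self, response):
            follow_links = response.meta.get('follow_links', True)
            yield {'url': response.url}

            if follow_links:
                for href in response.xpath('//div[@id="content"]//a/@href').extract():
                    yield scrapy.Request(
                        response.urljoin(href),
                        callback=self.parse_article,
                        meta={'follow_links': False},  # recurse one level only
                        dont_filter=True,              # bypass the duplicate-request filter while debugging
                        errback=self.on_error,         # surface download failures in the log
                    )

        def on_error(self, failure):
            # Called when a request fails; logs the URL and the underlying error.
            self.logger.error('Request to %s failed: %r', failure.request.url, failure.value)

Running with the log level at DEBUG also makes the dupefilter and offsite-filter messages visible, which usually reveals why a request never reached its callback.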

Here is the code for the entire class:

from __future__ import unicode_literals

import mimetypes
import os
import re
from urlparse import urlparse

from dateutil import parser as dateparser

import scrapy

from ..middlewares.binary_document import SUPPORTED_EXTENSIONS
from ..utils import extract_text

__author__ = 'Yuge Chen'


class HKMASpider(scrapy.Spider):
    name = 'hkma'

    jurisdiction = 'HK'
    source_type = 'primary'
    type = 'legislation/reg'
    tier = 1
    start_urls = [
        'http://www.hkma.gov.hk/eng/'
    ]
    URL_BLACKLIST = [
        'http://apps.hkma.gov.hk/eng/index.php',
        'http://vpr.hkma.gov.hk/cgi-bin/vpr/index.pl',
        'https://www.cmu.org.hk/cmupbb_ws/eng/page/wmp0100/wmp010001.aspx',
        'http://www.hkma.gov.hk/eng/other-information/photo-gallery/',
        'http://www.hkimr.org/working_papers',
        'http://www.hkimr.org/'
    ]

    def parse(self, response):
        for a in response.xpath('//div[@id="seo"]//a'):
            title = extract_text(a.xpath('.//text()'))
            url = response.urljoin(a.xpath('@href').extract_first())

            # if '/key-functions' in url:
            #     yield scrapy.Request(url, self.parse_key_functions, meta=dict(category='key-functions'))
            if '/publications-and-research/quarterly-bulletin' in url:
                yield scrapy.Request('http://www.hkma.gov.hk/eng/publications-and-research/quarterly-bulletin/', self.parse_publications_research)
                break
            # elif '/key-information' in url:
            #     yield scrapy.Request(url, self.parse_key_information)
            # elif 'about-the-hkma' in url:
            #     pass
            # else:
            #     yield scrapy.Request(url, self.parse_article, meta=dict(title=title))
        #end for

        #yield scrapy.Request('http://www.hkma.gov.hk/eng/key-information/guidelines-and-circulars/guidelines/', self.parse_article, meta=dict(title='Guidelines'))
    #end def

    def parse_key_information(self, response):
        if response.xpath('//select[@class="dropdownYear"]'):
            for year_url in response.xpath('//select[@class="dropdownYear"]/option/@value').extract():
                yield scrapy.Request(response.urljoin(year_url), self.parse_key_information_year)
            #end for
        #end if

        for x in self.parse_key_information_year(response):
            yield x
    #end def

    def parse_key_information_year(self, response):
        for a in response.xpath('//*[@id="content"]//a'):
            title = extract_text(a.xpath('.//text()'))
            url = response.urljoin(a.xpath('@href').extract_first())
            try:
                date_posted = dateparser.parse(extract_text(a.xpath('../../td[1]/text()')))
                yield scrapy.Request(url, self.parse_article, meta=dict(title=title, date_posted=date_posted))
            except ValueError: pass
        #end for
    #end def

    def parse_publications_research(self, response):
        for a in response.xpath('//*[@id="content"]//a'):
            url = response.urljoin(a.xpath('@href').extract_first())
            if ('/half-yearly-monetary' in url or '/quarterly-bulletin' in url) and '/research' not in response.url:
                date_text = extract_text(a.xpath('.//text()')) + ' 1 ' + extract_text(a.xpath('../../td[1]/text()'))
                date_posted = dateparser.parse(date_text)
                title = None
                if '/half-yearly-monetary' in url:
                    title = 'Hong Kong Monetary Authority Half-Yearly Monetary & Financial Stability Report - ' + date_text
                yield scrapy.Request(url, self.parse_article, meta=dict(title=title, date_posted=date_posted, date_text=date_text))
            else:
                title = extract_text(a.xpath('.//text()'))
                yield scrapy.Request(url, self.parse_article, meta=dict(title=title))

    def parse_key_functions(self, response):
        for a in response.xpath('//*[@id="key-functionsLeftNav"]//a'):
            title = extract_text(a.xpath('.//text()'))
            url = response.urljoin(a.xpath('@href').extract_first())
            yield scrapy.Request(url, self.parse_article, meta=dict(title=title, category='key-functions'))
        #end for
    #end def

    def parse_research_year(self, response):
        parent_url = response.url
        print ("parent_url::::" + parent_url)
        #print (extract_text(response.xpath('//table[@class="colorTable researchTable"]')) + '***')
        for a in response.xpath('//div[@class="prContent"]//a[@href]'):
            url = response.urljoin(a.xpath('@href').extract_first())
            if not not response.xpath('//table[@class="colorTable researchTable"]'):
                print ('++++++++')
                print ('))))))' + extract_text(a.xpath('../../td[1]/text()')))
                date_posted = extract_text(a.xpath('../../td[1]/text()'))
                print ('))))))' + re.sub('<[^<]+?>', '', extract_text(a.xpath('../../td[2]/strong'))))
                title = re.sub('<[^<]+?>', '', extract_text(a.xpath('../../td[2]/strong')))
            elif not not response.xpath('//table[@class="formTable"]'):
                print('____________')
                print ('((((((' + url)
                print ('((((((((' + extract_text(a.xpath('../../p[1]/text()')))
                title = extract_text(a.xpath('../../p[1]/text()'))
                print ('(((((((((' + extract_text(a.xpath('../text()[1]')))
                date_posted = dateparser.parse(extract_text(a.xpath('../text()[1]')))
            yield scrapy.Request(url, self.parse_article, meta=dict(title=title, date_posted=date_posted))

    def parse_article(self, response):
        print ('here????')
        item = response.meta
        item['url'] = response.url
        _, ext = os.path.splitext(urlparse(response.url).path)

        is_binary_document = response.meta.pop('is_binary_document', False)

        print ('url!!!' + item['url'])
        if is_binary_document:  # binary files
            print ('binary!!')
            item['html_content'] = None
            #item['content'] = response.body_as_unicode()
            if '/quarterly-bulletin' in item['url']:
                if item.get('date_text'):
                    item['title'] = 'Hong Kong Monetary Authority Quarterly Bulletin - ' + item['date_text'] + ' - ' + item['title']
                else:
                    item['title'] = 'Hong Kong Monetary Authority Quarterly Bulletin - ' + item['title']

        else:  # html files
            # item['content'] = extract_text(response.xpath('//div[@id="content"]//text()'))
            item['mime_type'] = 'text/html'
            # item['html_content'] = response.body_as_unicode()
            if not item.get('date_posted'):
                item['date_posted'] = dateparser.parse(extract_text(response.xpath("//*[@id='lastUpdate']/text()")), fuzzy=True)

            category = response.meta.pop('category', None)
            follow_links = response.meta.pop('follow_links', True)

            if follow_links:
                if '/research/' in item['url']:
                    for year_url in response.xpath('//select[@class="dropdownYear"]/option/@value').extract():
                        yield scrapy.Request(response.urljoin(year_url), self.parse_research_year)
                else:
                    # Follow links on page for PDF, XLS files, etc that are in the same sub category as referer
                    for a in response.xpath('//div[@id="content"]//a[@href]'):
                        href = a.xpath('@href').extract_first()
                        _, ext = os.path.splitext(href)

                        url = response.urljoin(href)
                        if category is None or '/{}/'.format(category) in url:
                            new_meta = response.meta.copy()
                            new_meta['follow_links'] = False  # only follow for one level
                            link_text = extract_text(a.xpath('.//text()'))
                            if '/annual-report' in url:
                                new_meta['title'] = '{} {} - {}'.format('Hong Kong Monetary Authority', item['title'], link_text)
                                new_meta['date_posted'] = dateparser.parse('June 1 ' + item['title'][-4:])
                            elif item['title'] is not None:
                                new_meta['title'] = '{} - {}'.format(item['title'], link_text)
                            else:
                                new_meta['title'] = link_text
                            print ('url:######' + url)
                            print ('title:######' + new_meta['title'])
                            yield scrapy.Request(url, callback=self.parse_article, meta=new_meta)
                    #end for
                #end if
            #end if
        #end if
        yield item
    #end def

#end class

0 Answers:

There are no answers yet.