I am doing web scraping with Scrapy. The spider goes into each sub-site and handles it in two steps: first it checks whether the page is a PDF. If it is, the PDF item is yielded and that branch of the crawl ends. If it is an HTML page, it goes on to check, one level deep only, whether there are any links inside this sub-site. If there are, the current method is called recursively to yield items of either format (HTML or PDF). However, the "yield scrapy.Request(url, callback=self.parse_article, meta=new_meta)" line never seems to fire. Can anyone point out what is wrong with my logic/code?
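One debugging sketch to frame the question: when a yielded Request silently disappears, Scrapy's duplicate filter or a failed download are the usual suspects. A minimal, hypothetical way to surface both (`dont_filter` and `errback` are standard `scrapy.Request` arguments; `DebugMixin` and `follow_article` are made-up names, not part of my spider):

import scrapy

class DebugMixin(object):
    def follow_article(self, url, new_meta):
        # dont_filter=True bypasses Scrapy's duplicate filter, which silently
        # drops already-seen URLs -- one common reason a yielded Request
        # appears to do nothing.
        return scrapy.Request(
            url,
            callback=self.parse_article,
            errback=self.on_request_error,  # download/middleware failures end up here
            dont_filter=True,
            meta=new_meta,
        )

    def on_request_error(self, failure):
        self.logger.error('request failed: %r', failure)

Here is the method in question: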
def parse_article(self, response):
    item = response.meta
    item['url'] = response.url
    _, ext = os.path.splitext(urlparse(response.url).path)
    is_binary_document = response.meta.pop('is_binary_document', False)
    if is_binary_document:  # binary files
        item['html_content'] = None
        item['content'] = response.body_as_unicode()
    else:  # html files
        item['content'] = extract_text(response.xpath('//div[@id="content"]//text()'))
        item['mime_type'] = 'text/html'
        item['html_content'] = response.body_as_unicode()

        category = response.meta.pop('category', None)
        follow_links = response.meta.pop('follow_links', True)
        if follow_links:
            if '/research/' in item['url']:
                for year_url in response.xpath('//select[@class="dropdownYear"]/option/@value').extract():
                    yield scrapy.Request(response.urljoin(year_url), self.parse_research_year)
            else:
                # Follow links on the page for PDF, XLS files, etc. that are
                # in the same sub-category as the referer
                for a in response.xpath('//div[@id="content"]//a[@href]'):
                    href = a.xpath('@href').extract_first()
                    _, ext = os.path.splitext(href)
                    url = response.urljoin(href)
                    if category is None or '/{}/'.format(category) in url:
                        new_meta = response.meta.copy()
                        new_meta['follow_links'] = False  # only follow for one level
                        link_text = extract_text(a.xpath('.//text()'))
                        yield scrapy.Request(url, callback=self.parse_article, meta=new_meta)
                #end for
            #end if
        #end if
    #end if
    yield item
#end def
#end class
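One thing worth noting while tracing this: `item = response.meta` and `response.meta.copy()` also pick up Scrapy's internal meta keys (`depth`, `download_timeout`, `download_slot`), so those end up inside the yielded item and in every child request. A small sketch of passing only the spider's own keys instead (the key list and the `build_meta` name are my assumptions, not part of the spider):

OWN_META_KEYS = ('title', 'date_posted', 'date_text', 'category', 'follow_links')

def build_meta(response_meta, **overrides):
    # Copy only the spider's own keys, not Scrapy's internal ones.
    meta = dict((k, response_meta[k]) for k in OWN_META_KEYS if k in response_meta)
    meta.update(overrides)
    return meta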
Here is the entire class code:
from __future__ import unicode_literals

import mimetypes
import os
import re
from urlparse import urlparse

from dateutil import parser as dateparser
import scrapy

from ..middlewares.binary_document import SUPPORTED_EXTENSIONS
from ..utils import extract_text

__author__ = 'Yuge Chen'


class HKMASpider(scrapy.Spider):
    name = 'hkma'
    jurisdiction = 'HK'
    source_type = 'primary'
    type = 'legislation/reg'
    tier = 1

    start_urls = [
        'http://www.hkma.gov.hk/eng/'
    ]

    URL_BLACKLIST = [
        'http://apps.hkma.gov.hk/eng/index.php',
        'http://vpr.hkma.gov.hk/cgi-bin/vpr/index.pl',
        'https://www.cmu.org.hk/cmupbb_ws/eng/page/wmp0100/wmp010001.aspx',
        'http://www.hkma.gov.hk/eng/other-information/photo-gallery/',
        'http://www.hkimr.org/working_papers',
        'http://www.hkimr.org/',
    ]
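    # Hypothetical helper (my assumption -- the spider as posted never
    # consults URL_BLACKLIST anywhere below):
    def is_blacklisted(self, url):
        return any(url.startswith(prefix) for prefix in self.URL_BLACKLIST)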
    def parse(self, response):
        for a in response.xpath('//div[@id="seo"]//a'):
            title = extract_text(a.xpath('.//text()'))
            url = response.urljoin(a.xpath('@href').extract_first())
            # if '/key-functions' in url:
            #     yield scrapy.Request(url, self.parse_key_functions, meta=dict(category='key-functions'))
            if '/publications-and-research/quarterly-bulletin' in url:
                yield scrapy.Request('http://www.hkma.gov.hk/eng/publications-and-research/quarterly-bulletin/', self.parse_publications_research)
                break
            # elif '/key-information' in url:
            #     yield scrapy.Request(url, self.parse_key_information)
            # elif 'about-the-hkma' in url:
            #     pass
            # else:
            #     yield scrapy.Request(url, self.parse_article, meta=dict(title=title))
        #end for
        #yield scrapy.Request('http://www.hkma.gov.hk/eng/key-information/guidelines-and-circulars/guidelines/', self.parse_article, meta=dict(title='Guidelines'))
    #end def
    def parse_key_information(self, response):
        if response.xpath('//select[@class="dropdownYear"]'):
            for year_url in response.xpath('//select[@class="dropdownYear"]/option/@value').extract():
                yield scrapy.Request(response.urljoin(year_url), self.parse_key_information_year)
            #end for
        #end if
        # Python 2 has no `yield from`, so re-yield the current page's results by hand.
        for x in self.parse_key_information_year(response):
            yield x
    #end def
    def parse_key_information_year(self, response):
        for a in response.xpath('//*[@id="content"]//a'):
            title = extract_text(a.xpath('.//text()'))
            url = response.urljoin(a.xpath('@href').extract_first())
            try:
                date_posted = dateparser.parse(extract_text(a.xpath('../../td[1]/text()')))
                yield scrapy.Request(url, self.parse_article, meta=dict(title=title, date_posted=date_posted))
            except ValueError:
                # Skip rows whose first cell is not a parseable date.
                pass
        #end for
    #end def
    def parse_publications_research(self, response):
        for a in response.xpath('//*[@id="content"]//a'):
            url = response.urljoin(a.xpath('@href').extract_first())
            if ('/half-yearly-monetary' in url or '/quarterly-bulletin' in url) and '/research' not in response.url:
                # Insert a literal day ("1") between the link text and the
                # year cell so dateparser gets a complete date.
                date_text = extract_text(a.xpath('.//text()')) + ' 1 ' + extract_text(a.xpath('../../td[1]/text()'))
                date_posted = dateparser.parse(date_text)
                title = None
                if '/half-yearly-monetary' in url:
                    title = 'Hong Kong Monetary Authority Half-Yearly Monetary & Financial Stability Report - ' + date_text
                yield scrapy.Request(url, self.parse_article, meta=dict(title=title, date_posted=date_posted, date_text=date_text))
            else:
                title = extract_text(a.xpath('.//text()'))
                yield scrapy.Request(url, self.parse_article, meta=dict(title=title))
        #end for
    #end def
    def parse_key_functions(self, response):
        for a in response.xpath('//*[@id="key-functionsLeftNav"]//a'):
            title = extract_text(a.xpath('.//text()'))
            url = response.urljoin(a.xpath('@href').extract_first())
            yield scrapy.Request(url, self.parse_article, meta=dict(title=title, category='key-functions'))
        #end for
    #end def
    def parse_research_year(self, response):
        parent_url = response.url
        print('parent_url::::' + parent_url)
        #print(extract_text(response.xpath('//table[@class="colorTable researchTable"]')) + '***')
        for a in response.xpath('//div[@class="prContent"]//a[@href]'):
            url = response.urljoin(a.xpath('@href').extract_first())
            if response.xpath('//table[@class="colorTable researchTable"]'):
                print('++++++++')
                print('))))))' + extract_text(a.xpath('../../td[1]/text()')))
                date_posted = extract_text(a.xpath('../../td[1]/text()'))  # NB: stays a string in this branch
                print('))))))' + re.sub('<[^<]+?>', '', extract_text(a.xpath('../../td[2]/strong'))))
                title = re.sub('<[^<]+?>', '', extract_text(a.xpath('../../td[2]/strong')))
            elif response.xpath('//table[@class="formTable"]'):
                print('____________')
                print('((((((' + url)
                print('((((((((' + extract_text(a.xpath('../../p[1]/text()')))
                title = extract_text(a.xpath('../../p[1]/text()'))
                print('(((((((((' + extract_text(a.xpath('../text()[1]')))
                date_posted = dateparser.parse(extract_text(a.xpath('../text()[1]')))
            #end if
            yield scrapy.Request(url, self.parse_article, meta=dict(title=title, date_posted=date_posted))
        #end for
    #end def
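    # The '../../td[1]' XPaths above climb from each <a> up to its table row
    # and read the first cell. A hypothetical row they assume, checkable in
    # isolation with scrapy.selector.Selector:
    #
    #   row = Selector(text='<table><tr><td>3 Jan 2017</td>'
    #                       '<td><strong>Title</strong> <a href="/x.pdf">PDF</a></td></tr></table>')
    #   a = row.xpath('//a')[0]
    #   a.xpath('../../td[1]/text()').extract_first()  # -> '3 Jan 2017'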
    def parse_article(self, response):
        print('here????')
        item = response.meta
        item['url'] = response.url
        _, ext = os.path.splitext(urlparse(response.url).path)
        is_binary_document = response.meta.pop('is_binary_document', False)
        print('url!!!' + item['url'])
        if is_binary_document:  # binary files
            print('binary!!')
            item['html_content'] = None
            #item['content'] = response.body_as_unicode()
            if '/quarterly-bulletin' in item['url']:
                if item.get('date_text'):
                    item['title'] = 'Hong Kong Monetary Authority Quarterly Bulletin - ' + item['date_text'] + ' - ' + item['title']
                else:
                    item['title'] = 'Hong Kong Monetary Authority Quarterly Bulletin - ' + item['title']
        else:  # html files
            # item['content'] = extract_text(response.xpath('//div[@id="content"]//text()'))
            item['mime_type'] = 'text/html'
            # item['html_content'] = response.body_as_unicode()
            if not item.get('date_posted'):
                item['date_posted'] = dateparser.parse(extract_text(response.xpath("//*[@id='lastUpdate']/text()")), fuzzy=True)

            category = response.meta.pop('category', None)
            follow_links = response.meta.pop('follow_links', True)
            if follow_links:
                if '/research/' in item['url']:
                    for year_url in response.xpath('//select[@class="dropdownYear"]/option/@value').extract():
                        yield scrapy.Request(response.urljoin(year_url), self.parse_research_year)
                else:
                    # Follow links on the page for PDF, XLS files, etc. that are
                    # in the same sub-category as the referer
                    for a in response.xpath('//div[@id="content"]//a[@href]'):
                        href = a.xpath('@href').extract_first()
                        _, ext = os.path.splitext(href)
                        url = response.urljoin(href)
                        if category is None or '/{}/'.format(category) in url:
                            new_meta = response.meta.copy()
                            new_meta['follow_links'] = False  # only follow for one level
                            link_text = extract_text(a.xpath('.//text()'))
                            if '/annual-report' in url:
                                new_meta['title'] = '{} {} - {}'.format('Hong Kong Monetary Authority', item['title'], link_text)
                                new_meta['date_posted'] = dateparser.parse('June 1 ' + item['title'][-4:])
                            elif item['title'] is not None:
                                new_meta['title'] = '{} - {}'.format(item['title'], link_text)
                            else:
                                new_meta['title'] = link_text
                            print('url:######' + url)
                            print('title:######' + new_meta['title'])
                            yield scrapy.Request(url, callback=self.parse_article, meta=new_meta)
                    #end for
                #end if
            #end if
        #end if
        yield item
    #end def
#end class
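Boiled down, this is the flow I intend parse_article to have (a simplified sketch, assuming the binary_document middleware sets meta['is_binary_document'] for extensions in SUPPORTED_EXTENSIONS):

def parse_article(self, response):
    item = dict(response.meta, url=response.url)
    if item.pop('is_binary_document', False):
        yield item  # PDF/XLS leaf: emit the item and stop recursing
        return
    if item.pop('follow_links', True):
        # HTML page: follow its links exactly one level deeper.
        for href in response.xpath('//div[@id="content"]//a/@href').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_article,
                                 meta={'follow_links': False})
    yield item

If even this skeleton never reaches parse_article for the child URLs, the problem is presumably outside the method itself, e.g. the duplicate filter or the binary_document middleware swallowing the requests.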