I am trying to scrape a table that spans multiple pages. With the following code I can print the data from the first page:
import scrapy
from scrapy.http.request import Request
from indicators.items import EducationIndicators

class mySpider(scrapy.Spider):
    name = "education2"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    def parse(self, response):
        return Request(
            url='http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
            callback=self.parse_table
        )

    def parse_table(self, response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract()
            print(item)
            yield item
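For reference, a minimal sketch of what the EducationIndicators item in indicators/items.py might look like, inferred from the two fields the spider fills in (an assumption, since the actual items.py is not shown):

import scrapy

class EducationIndicators(scrapy.Item):
    # Fields inferred from the spider code above; the real file may define more.
    country = scrapy.Field()
    years = scrapy.Field()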
To download all of the pages, I have written the following code, based on other posts I have read:
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)),
                  callback="parse_table", follow=True),)

    def parse_table(self, response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract()
            print(item)
            yield item
When I try to print all the pages, I get nothing at all. Can anyone help me figure out what the mistake is?
Answer 0 (score: 1)
By default, Scrapy dispatches the responses for start_urls to a parse callback, so the spider needs one first (see the Scrapy docs):
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)),
                  callback="parse", follow=True),)

    def parse(self, response):
        for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
Or simply override the start_requests method with a different callback:
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)),
                  callback="parse_table", follow=True),)

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.parse_table)

    def parse_table(self, response):
        for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
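For quick testing, either variant can also be launched from a plain Python script with Scrapy's CrawlerProcess; a minimal sketch, assuming it runs from the project root so that the project settings and the indicators.items import resolve (running scrapy crawl education3 from the command line is equivalent):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# mySpider is the CrawlSpider class defined above.
process = CrawlerProcess(get_project_settings())
process.crawl(mySpider)
process.start()  # blocks until the crawl finishes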
And here is code that crawls all of the pages:
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html
from w3lib.url import add_or_replace_parameter

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    api_url = 'http://data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=series:NER_1&DataMartId=UNESCO'

    def parse(self, response):
        max_page = int(response.xpath('//*[@id="spanPageCountB"]/text()').re_first(r'\d+', '0'))
        for page in range(1, max_page + 1):
            yield Request(
                url=add_or_replace_parameter(self.api_url, 'Page', page),
                callback=self.parse_table)

    def parse_table(self, response):
        for tr in response.xpath('//table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
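A note on add_or_replace_parameter from w3lib: it swaps the value of a single query-string parameter, which is what lets the loop above request each page of the DataHandler endpoint directly; that handler appears to return only the table fragment, hence the simpler //table/tr XPath. A standalone illustration:

from w3lib.url import add_or_replace_parameter

api_url = ('http://data.un.org/Handlers/DataHandler.ashx'
           '?Service=page&Page=3&DataFilter=series:NER_1&DataMartId=UNESCO')

# Swaps Page=3 for Page=7 in the query string, keeping the other parameters.
print(add_or_replace_parameter(api_url, 'Page', 7))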