I'm trying to scrape projecteuler.net with Python's scrapy library, just to practice with it. I've looked at more than one existing scraper implementation online, but they all seem over-engineered for my purpose: I simply want to save each problem (title, id, content) as JSON and then load it with AJAX into a local web page on my machine.
I'm working on my own solution and I'll finish it in any case, but since I'd like to discover smarter ways of using the library, I'm asking you to propose the cleverest program you can to do this job with scrapy (and if you'd rather skip the JSON step and save straight to HTML... that might be even better for me).
This is my first approach (it doesn't work):
# -*- coding: utf-8 -*-
import httplib2
import requests

import scrapy
from eulerscraper.items import Problem
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule


def start_urls_detection():
    # su = ['https://projecteuler.net/archives', 'https://projecteuler.net/archives;page=2']
    # i = 1
    #
    # while True:
    #     request = requests.get(su[i])
    #
    #     if request.status_code != 200:
    #         break
    #
    #     i += 1
    #     su.append('https://projecteuler.net/archives;page=' + str(i + 1))
    return ["https://projecteuler.net/"]


class EulerSpider(CrawlSpider):
    name = 'euler'
    allowed_domains = ['projecteuler.net']
    start_urls = start_urls_detection()

    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        # Rule(LinkExtractor(allow=('category\.php',), deny=('subsection\.php',))),
        Rule(LinkExtractor(allow=('problem=\d*',)), callback="parse_problems"),
        Rule(LinkExtractor(allow=('archives;page=\d*',), unique=True), follow=True)
    )

    def start_requests(self):
        # su = ['https://projecteuler.net/archives', 'https://projecteuler.net/archives;page=2']
        # i = 1
        #
        # while True:
        #     request = requests.get(su[i])
        #
        #     if request.status_code != 200:
        #         break
        #
        #     i += 1
        #     su.append('https://projecteuler.net/archives;page=' + str(i + 1))
        return [scrapy.Request("https://projecteuler.net/archives", self.parse)]

    def parse_problems(self, response):
        l = ItemLoader(item=Problem(), response=response)
        l.add_css("title", "h2")
        l.add_css("id", "#problem_info")
        l.add_css("content", ".problem_content")
        yield l.load_item()

    # def parse_content(self, response):
    #     # return response.css("div.problem_content::text").extract()
    #     next_page = "https://projecteuler.net/archives;page=2"
    #     n = 3
    #
    #     while n < 14:
    #         next_page = response.urljoin(next_page)
    #         yield scrapy.Request(next_page, callback=self.parse)
    #         next_page = next_page[0:len(next_page) - 1] + str(n)
    #         n += 1
Now I'm going to try some combination of LinkExtractor + manual requests (a rough sketch below). In the meantime, I'm waiting for your solutions...
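For reference, here is a rough, untested sketch of what I mean by that combination (the spider name and the plain-dict output are just placeholders; the selectors are the same ones used above):

# Untested sketch: a plain Spider that applies LinkExtractor by hand,
# reusing the selectors from the CrawlSpider attempt above.
import scrapy
from scrapy.linkextractors import LinkExtractor


class EulerManualSpider(scrapy.Spider):
    name = 'euler_manual'  # placeholder name
    allowed_domains = ['projecteuler.net']
    start_urls = ['https://projecteuler.net/archives']

    problem_links = LinkExtractor(allow=(r'problem=\d+',))
    page_links = LinkExtractor(allow=(r'archives;page=\d+',))

    def parse(self, response):
        # Follow every problem link found on the current archive page.
        for link in self.problem_links.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_problem)
        # Follow the pagination links back into parse(); duplicate requests
        # are dropped by scrapy's default dupefilter.
        for link in self.page_links.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)

    def parse_problem(self, response):
        # Raw HTML snippets, much like what the ItemLoader above collects
        # before its processors run.
        yield {
            'title': response.css('h2').extract_first(),
            'id': response.css('#problem_info').extract_first(),
            'content': response.css('.problem_content').extract_first(),
        }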
Answer 0 (score: 0)
I think I've found the simplest and yet most fitting solution (at least for my purpose), compared with the existing code written to scrape projecteuler:
# -*- coding: utf-8 -*-
import scrapy
from eulerscraper.items import Problem
from scrapy.loader import ItemLoader


class EulerSpider(scrapy.Spider):
    name = 'euler'
    allowed_domains = ['projecteuler.net']
    start_urls = ["https://projecteuler.net/archives"]

    def parse(self, response):
        # read the highest page number from the pagination links
        numpag = response.css("div.pagination a[href]::text").extract()
        maxpag = int(numpag[len(numpag) - 1])

        # follow every problem link on the first archive page
        for href in response.css("table#problems_table a::attr(href)").extract():
            next_page = "https://projecteuler.net/" + href
            yield response.follow(next_page, self.parse_problems)

        # request the remaining archive pages
        for i in range(2, maxpag + 1):
            next_page = "https://projecteuler.net/archives;page=" + str(i)
            yield response.follow(next_page, self.parse_next)

    def parse_next(self, response):
        for href in response.css("table#problems_table a::attr(href)").extract():
            next_page = "https://projecteuler.net/" + href
            yield response.follow(next_page, self.parse_problems)

    def parse_problems(self, response):
        l = ItemLoader(item=Problem(), response=response)
        l.add_css("title", "h2")
        l.add_css("id", "#problem_info")
        l.add_css("content", ".problem_content")
        yield l.load_item()
From the start page (the archive) I follow every link to a problem and scrape the data I need with parse_problems. Then I launch the scraper on the other pages of the site, with the same procedure for each list of links.
The Item definition, with its pre- and post-processors, is also very concise:
import re

import scrapy
from scrapy.loader.processors import MapCompose, Compose
from w3lib.html import remove_tags


def extract_first_number(text):
    i = re.search(r'\d+', text)
    return int(text[i.start():i.end()])


def array_to_value(element):
    return element[0]


class Problem(scrapy.Item):
    id = scrapy.Field(
        input_processor=MapCompose(remove_tags, extract_first_number),
        output_processor=Compose(array_to_value)
    )
    title = scrapy.Field(input_processor=MapCompose(remove_tags))
    content = scrapy.Field()
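To make the processor chain concrete: the raw value collected for id by add_css("id", "#problem_info") is a list of HTML snippets; MapCompose runs remove_tags and then extract_first_number on each element, and Compose(array_to_value) collapses the resulting one-element list into a plain integer. A quick standalone check with an invented input:

# Invented input, only meant to resemble the scraped #problem_info markup.
raw = ['<h3>Problem 12</h3>']
cleaned = [extract_first_number(remove_tags(s)) for s in raw]  # what MapCompose does
print(array_to_value(cleaned))  # -> 12, what Compose(array_to_value) returns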
I launch it with the command scrapy crawl euler -o euler.json and it outputs an unordered array of JSON objects, one per problem. That's fine for me, because I'll process the file with JavaScript, even though I think solving the ordering problem through scrapy itself could be really simple.
EDIT: in fact it is simple, using this pipeline:
import json


class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.list_items = []
        self.file = open('euler.json', 'w')

    def close_spider(self, spider):
        # Place every item at the index given by its problem id, so the
        # output is ordered even though the crawl order is not.
        ordered_list = [None] * len(self.list_items)
        for i in self.list_items:
            ordered_list[int(i['id']) - 1] = json.dumps(dict(i))

        # Join the items with commas (no trailing comma) so the file is valid JSON.
        self.file.write("[\n")
        self.file.write(",\n".join(ordered_list))
        self.file.write("\n]\n")
        self.file.close()

    def process_item(self, item, spider):
        self.list_items.append(item)
        return item
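For this pipeline to run at all, it still has to be enabled in settings.py; assuming the class lives in eulerscraper/pipelines.py, something like:

ITEM_PIPELINES = {
    # path assumes the pipeline class is defined in eulerscraper/pipelines.py
    'eulerscraper.pipelines.JsonWriterPipeline': 300,
}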
Although the best solution is probably to create a custom exporter:
from scrapy.exporters import JsonItemExporter
from scrapy.utils.python import to_bytes


class OrderedJsonItemExporter(JsonItemExporter):

    def __init__(self, file, **kwargs):
        # To initialize the object we use JsonItemExporter's constructor,
        # passing through any feed export options it accepts.
        super().__init__(file, **kwargs)
        self.list_items = []

    def export_item(self, item):
        # Just collect the items; the actual writing happens in finish_exporting().
        self.list_items.append(item)

    def finish_exporting(self):
        # Reorder the collected items by problem id before serializing them.
        ordered_list = [None for i in range(len(self.list_items))]
        for i in self.list_items:
            ordered_list[int(i['id']) - 1] = i

        for i in ordered_list:
            if self.first_item:
                self.first_item = False
            else:
                self.file.write(b',')
                self._beautify_newline()
            itemdict = dict(self._get_serialized_fields(i))
            data = self.encoder.encode(itemdict)
            self.file.write(to_bytes(data, self.encoding))
            self._beautify_newline()
        self.file.write(b"]")
and configure it in the settings so that it is used for the json format:
FEED_EXPORTERS = {
    'json': 'eulerscraper.exporters.OrderedJsonItemExporter',
}
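With this in place, the same scrapy crawl euler -o euler.json command should produce the file already sorted by problem id, since Scrapy looks up the exporter for the requested feed format (json here) in FEED_EXPORTERS.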