Scraping the Project Euler website with scrapy

Date: 2018-02-13 16:55:38

Tags: python html ajax web-scraping scrapy

I'm trying to scrape projecteuler.net with Python's scrapy library, just to get some practice with it. I've seen more than one existing scraper implementation online, but they seem over-engineered for my purposes. I simply want to save the problems (title, ID, content) as JSON and then load them with ajax into a local web page on my computer.

I'm working on my own solution and will finish it in any case, but since I'd like to discover smarter ways of using the library, I'm asking you to propose the cleverest program to do this job with scrapy (and if you'd rather skip the JSON step and save straight to HTML... that might be even better for me).

Here is my first approach (it doesn't work):

# -*- coding: utf-8 -*-
import httplib2
import requests
import scrapy
from eulerscraper.items import Problem
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule


def start_urls_detection():
    # su = ['https://projecteuler.net/archives', 'https://projecteuler.net/archives;page=2']
    # i = 1
    #
    # while True:
    #     request = requests.get(su[i])
    #
    #     if request.status_code != 200:
    #         break
    #
    #     i += 1
    #     su.append('https://projecteuler.net/archives;page=' + str(i + 1))

    return ["https://projecteuler.net/"]


class EulerSpider(CrawlSpider):
    name = 'euler'
    allowed_domains = ['projecteuler.net']
    start_urls = start_urls_detection()

    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        # Rule(LinkExtractor(allow=('category\.php',), deny=('subsection\.php',))),
        Rule(LinkExtractor(allow=('problem=\d*',)), callback="parse_problems"),
        Rule(LinkExtractor(allow=('archives;page=\d*',), unique=True), follow=True)
    )

    def start_requests(self):
        # su = ['https://projecteuler.net/archives', 'https://projecteuler.net/archives;page=2']
        # i = 1
        #
        # while True:
        #     request = requests.get(su[i])
        #
        #     if request.status_code != 200:
        #         break
        #
        #     i += 1
        #     su.append('https://projecteuler.net/archives;page=' + str(i + 1))

        return [scrapy.Request("https://projecteuler.net/archives", self.parse)]

    def parse_problems(self, response):
        l = ItemLoader(item=Problem(), response=response)
        l.add_css("title", "h2")
        l.add_css("id", "#problem_info")
        l.add_css("content", ".problem_content")

        yield l.load_item()

    # def parse_content(self, response):
    #     #return response.css("div.problem_content::text").extract()
    #     next_page = "https://projecteuler.net/archives;page=2"
    #     n = 3
    #
    #     while n < 14:
    #         next_page = response.urljoin(next_page)
    #         yield scrapy.Request(next_page, callback=self.parse)
    #         next_page = next_page[0:len(next_page) - 1] + str(n)
    #         n += 1
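
For reference, the Problem item imported above lives in eulerscraper/items.py; a minimal version with just the three fields mentioned earlier (no processors) would look something like this:

# eulerscraper/items.py - minimal sketch; the fields follow the title/ID/content goal above
import scrapy


class Problem(scrapy.Item):
    id = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()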

For now I'm going to try some combination of LinkExtractor + manual requests. In the meantime, I'll be waiting for your solutions...

1 Answer:

Answer 0 (score: 0):

I think I've found the simplest yet most suitable solution (at least for my purpose), compared with the existing code written to scrape projecteuler:

# -*- coding: utf-8 -*-
import scrapy
from eulerscraper.items import Problem
from scrapy.loader import ItemLoader


class EulerSpider(scrapy.Spider):
    name = 'euler'
    allowed_domains = ['projecteuler.net']
    start_urls = ["https://projecteuler.net/archives"]

    def parse(self, response):
        numpag = response.css("div.pagination a[href]::text").extract()
        maxpag = int(numpag[len(numpag) - 1])

        for href in response.css("table#problems_table a::attr(href)").extract():
            next_page = "https://projecteuler.net/" + href
            yield response.follow(next_page, self.parse_problems)

        for i in range(2, maxpag + 1):
            next_page = "https://projecteuler.net/archives;page=" + str(i)
            yield response.follow(next_page, self.parse_next)

    def parse_next(self, response):
        for href in response.css("table#problems_table a::attr(href)").extract():
            next_page = "https://projecteuler.net/" + href
            yield response.follow(next_page, self.parse_problems)

    def parse_problems(self, response):
        l = ItemLoader(item=Problem(), response=response)
        l.add_css("title", "h2")
        l.add_css("id", "#problem_info")
        l.add_css("content", ".problem_content")

        yield l.load_item()

From the start page (the archives) I follow every link to a problem, scraping the data I need with parse_problems. Then I launch the scraper on the other listing pages of the site, following the same procedure for each list of links. The Item definition, with its pre- and post-processors, is also very concise:

import re

import scrapy
from scrapy.loader.processors import MapCompose, Compose
from w3lib.html import remove_tags


def extract_first_number(text):
    i = re.search('\d+', text)
    return int(text[i.start():i.end()])


def array_to_value(element):
    return element[0]


class Problem(scrapy.Item):
    id = scrapy.Field(
        input_processor=MapCompose(remove_tags, extract_first_number),
        output_processor=Compose(array_to_value)
    )
    title = scrapy.Field(input_processor=MapCompose(remove_tags))
    content = scrapy.Field()
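
As a quick sanity check of how that processor chain behaves, here is a self-contained sketch (the HTML snippet is made up rather than taken from the live page, and it assumes the two helper functions above are importable from eulerscraper.items):

from scrapy.loader.processors import MapCompose, Compose
from w3lib.html import remove_tags

from eulerscraper.items import array_to_value, extract_first_number

id_in = MapCompose(remove_tags, extract_first_number)  # input processor for "id"
id_out = Compose(array_to_value)                       # output processor for "id"

raw = ['<h3>Problem 1</h3>']  # roughly what the CSS selector hands to the loader
print(id_out(id_in(raw)))     # -> 1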

I launch it with the command scrapy crawl euler -o euler.json and it outputs an unordered set of JSON objects, each corresponding to one problem: that's fine for me, since I'll process it with JavaScript, even though I think solving the ordering problem within scrapy would be quite simple.

EDIT: in fact it is simple, using a pipeline like this one:

import json

class JsonWriterPipeline(object):

    def open_spider(self, spider):
        self.list_items = []
        self.file = open('euler.json', 'w')

    def close_spider(self, spider):
        # Re-order the collected items by problem id, then write them out as a JSON array.
        ordered_list = [None] * len(self.list_items)

        for item in self.list_items:
            ordered_list[int(item['id']) - 1] = json.dumps(dict(item))

        # Joining with commas avoids the trailing comma that would make the file invalid JSON.
        self.file.write("[\n" + ",\n".join(ordered_list) + "\n]\n")
        self.file.close()

    def process_item(self, item, spider):
        self.list_items.append(item)
        return item
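
For the pipeline to actually run it has to be enabled in the project settings; the dotted path below is an assumption, so point it at whichever module holds JsonWriterPipeline:

# settings.py - the dotted path is assumed for illustration
ITEM_PIPELINES = {
    'eulerscraper.pipelines.JsonWriterPipeline': 300,
}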

The best solution, though, is probably to create a custom exporter:

from scrapy.exporters import JsonItemExporter
from scrapy.utils.python import to_bytes


class OrderedJsonItemExporter(JsonItemExporter):

    def __init__(self, file, **kwargs):
        # Initialize via JsonItemExporter's constructor, forwarding any export options.
        super().__init__(file, **kwargs)
        self.list_items = []

    def export_item(self, item):
        self.list_items.append(item)

    def finish_exporting(self):
        ordered_list = [None for i in range(len(self.list_items))]

        for i in self.list_items:
            ordered_list[int(i['id'] - 1)] = i

        for i in ordered_list:
            if self.first_item:
                self.first_item = False
            else:
                self.file.write(b',')
                self._beautify_newline()
            itemdict = dict(self._get_serialized_fields(i))
            data = self.encoder.encode(itemdict)
            self.file.write(to_bytes(data, self.encoding))

        self._beautify_newline()
        self.file.write(b"]")

and to configure it in the settings so that it gets called for JSON output:

FEED_EXPORTERS = {
    'json': 'eulerscraper.exporters.OrderedJsonItemExporter',
}
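
With the exporter registered for the json format, the same scrapy crawl euler -o euler.json command from before should now write the problems already sorted by id, without needing the custom pipeline at all.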