Scrapy - 递归抓取到第三层页面

时间:2014-04-06 10:13:59

标签: python recursion web-scraping scrapy

我希望我的请求对于那些经验丰富的Scrapy用户来说非常简单明了。

实质上,以下代码可以根据第一页中的链接抓取第二页。我想利用第二页中的链接把代码扩展到第3页。在下面的代码中,def parse_items 处理着陆页(第1级),该页包含50个列表项,代码会对这50个链接逐一进行递归抓取。def parse_listing_page 指定要从“列表页面”中抓取哪些字段。在每个列表页面中,我希望脚本先沿着其中的链接进入另一个页面并抓取一两个字段,然后再返回“列表页面”,最后返回着陆页。

以下代码适用于2级递归抓取。如何基于下面的代码将其扩展为3级?

from scrapy import log
from scrapy.log import ScrapyFileLogObserver
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from firstproject.items import exampleItem
from scrapy.http import Request
import urlparse

# Send Scrapy's log output to two files opened in append mode: one observer
# is created with level INFO, the other with level ERROR. The file handles
# are kept at module level and are not closed here.
logfile_info = open('example_INFOlog.txt', mode='a')
logfile_error = open('example_ERRlog.txt', mode='a')

log_observer_info = ScrapyFileLogObserver(logfile_info, level=log.INFO)
log_observer_error = ScrapyFileLogObserver(logfile_error, level=log.ERROR)

log_observer_info.start()
log_observer_error.start()

class MySpider(CrawlSpider):
    """Two-level spider: index pages -> individual listing pages.

    Level 1 (``parse_items``) reads every ``<h2>`` heading on an index page
    and requests the page its link points to. Level 2
    (``parse_listing_page``) fills in the remaining fields of the item that
    was started on level 1 and returns it.
    """

    name = "example"

    allowed_domains = ["example.com.au"]

    # Follow the links found inside <li class="nextLink"> and hand every page
    # reached that way to parse_items. The empty ``allow`` pattern does not
    # restrict URLs; the XPath does the filtering.
    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=("",),
                restrict_xpaths=('//li[@class="nextLink"]',),
            ),
            callback="parse_items",
            follow=True,
        ),
    )

    def start_requests(self):
        """Return one Request per start URL, in reverse of the listed order."""
        start_urls = reversed([
            "http://www.example.com.au/1?new=true&list=10-to-100",
            "http://www.example.com.au/2?new=true&list=10-to-100",
            "http://www.example.com.au/2?new=true&list=100-to-200",
        ])

        return [Request(url=start_url) for start_url in start_urls]

    def parse_start_url(self, response):
        """Treat the start pages like any other index page.

        Per the Scrapy CrawlSpider documentation this hook receives the
        responses of the start requests, which the ``rules`` callback does
        not see.
        """
        return self.parse_items(response)

    def parse_items(self, response):
        """Yield one listing-page Request per linked ``<h2>`` heading.

        Each request carries a partially filled ``exampleItem`` (``title``
        and ``link``) in ``meta['item']`` for ``parse_listing_page`` to
        complete.
        """
        hxs = HtmlXPathSelector(response)
        for heading in hxs.select("//h2"):
            titles = heading.select("a/text()").extract()
            links = heading.select("a/@href").extract()
            if not titles or not links:
                # A heading without a linked title is not a listing; skip it
                # instead of raising IndexError and losing the whole page.
                continue

            item = exampleItem()
            item["title"] = titles[0]
            item["link"] = links[0]

            # urljoin yields the same URL as plain concatenation for
            # root-relative hrefs and also copes with relative/absolute ones.
            url = urlparse.urljoin("http://example.com.au", item["link"])
            yield Request(url=url, meta={'item': item},
                          callback=self.parse_listing_page)

    def parse_listing_page(self, response):
        """Complete the item passed in ``response.meta['item']`` and return it."""
        hxs = HtmlXPathSelector(response)

        item = response.meta['item']

        item["item_1"] = hxs.select('#censored Xpath').extract()
        item["item_2"] = hxs.select('#censored Xpath').extract()
        item["item_3"] = hxs.select('#censored Xpath').extract()
        item["item_4"] = hxs.select('#censored Xpath').extract()

        return item

非常感谢

2 个答案:

答案 0 :(得分:1)

这就是代码流程的工作方式。

首先调用 MySpider 类中的 Rule 构造函数。Rule 构造函数的回调设置为 parse_items。parse_items 末尾有一个 yield,它让抓取递归进入 parse_listing_page。如果您希望从 parse_listing_page 继续递归到第三级,就必须在 parse_listing_page 中 yield 一个 Request。

答案 1 :(得分:1)

这是我更新后的代码。下面的代码能够以正确的格式(已测试)提取 counter_link,但似乎总是执行 else 分支,因此不会进入 parse_listing_counter。如果我删除 if/else 子句并强制代码回调 parse_listing_counter,则不会产出任何项目(甚至连来自 parse_items 或列表页面的项目也没有)。

我的代码中出错了什么?我也检查了XPath - 一切似乎都没问题。

from scrapy import log
from scrapy.log import ScrapyFileLogObserver
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from firstproject.items import exampleItem
from scrapy.http import Request
import urlparse

# Send Scrapy's log output to two files opened in append mode: one observer
# is created with level INFO, the other with level ERROR. The file handles
# are kept at module level and are not closed here.
logfile_info = open('example_INFOlog.txt', mode='a')
logfile_error = open('example_ERRlog.txt', mode='a')

log_observer_info = ScrapyFileLogObserver(logfile_info, level=log.INFO)
log_observer_error = ScrapyFileLogObserver(logfile_error, level=log.ERROR)

log_observer_info.start()
log_observer_error.start()

class MySpider(CrawlSpider):
    """Three-level spider: index pages -> listing pages -> counter pages.

    Level 1 (``parse_items``) starts an item for every linked ``<h2>``
    heading on an index page. Level 2 (``parse_listing_page``) adds the
    listing fields and, when the listing links to a counter page, requests
    it. Level 3 (``parse_listing_counter``) adds ``counter`` and returns the
    finished item. The same item object travels between levels in
    ``Request.meta['item']``.
    """

    name = "example"

    allowed_domains = ["example.com.au"]

    # Follow the links found inside <li class="nextLink"> and hand every page
    # reached that way to parse_items. The empty ``allow`` pattern does not
    # restrict URLs; the XPath does the filtering.
    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=("",),
                restrict_xpaths=('//li[@class="nextLink"]',),
            ),
            callback="parse_items",
            follow=True,
        ),
    )

    def start_requests(self):
        """Return one Request per start URL, in reverse of the listed order."""
        start_urls = reversed([
            "http://www.example.com.au/1?new=true&list=10-to-100",
            "http://www.example.com.au/2?new=true&list=10-to-100",
            "http://www.example.com.au/2?new=true&list=100-to-200",
        ])

        return [Request(url=start_url) for start_url in start_urls]

    def parse_start_url(self, response):
        """Treat the start pages like any other index page.

        Per the Scrapy CrawlSpider documentation this hook receives the
        responses of the start requests, which the ``rules`` callback does
        not see.
        """
        return self.parse_items(response)

    def parse_items(self, response):
        """Yield one listing-page Request per linked ``<h2>`` heading.

        Each request carries a partially filled ``exampleItem`` (``title``
        and ``link``) in ``meta['item']`` for ``parse_listing_page`` to
        complete.
        """
        hxs = HtmlXPathSelector(response)
        for heading in hxs.select("//h2"):
            titles = heading.select("a/text()").extract()
            links = heading.select("a/@href").extract()
            if not titles or not links:
                # A heading without a linked title is not a listing; skip it
                # instead of raising IndexError and losing the whole page.
                continue

            item = exampleItem()
            item["title"] = titles[0]
            item["link"] = links[0]

            # urljoin yields the same URL as plain concatenation for
            # root-relative hrefs and also copes with relative/absolute ones.
            url = urlparse.urljoin("http://example.com.au", item["link"])
            yield Request(url=url, meta={'item': item},
                          callback=self.parse_listing_page)

    def parse_listing_page(self, response):
        """Fill the listing fields, then continue to the counter page if any.

        Yields a Request for the counter page (carrying the item in
        ``meta['item']``) when a counter link is found on this page;
        otherwise yields the item as it stands.
        """
        hxs = HtmlXPathSelector(response)

        item = response.meta['item']

        item["item_1"] = hxs.select('#censored Xpath').extract()
        item["item_2"] = hxs.select('#censored Xpath').extract()
        item["item_3"] = hxs.select('#censored Xpath').extract()
        item["item_4"] = hxs.select('#censored Xpath').extract()

        # The link has to come from this page's markup. response.meta only
        # holds 'item', so looking up 'counter_link' there is always None and
        # the third level would never be requested.
        counter_links = hxs.select('#censored Xpath').extract()
        if counter_links:
            item["counter_link"] = counter_links[0]
            url2 = urlparse.urljoin("http://example.com.au",
                                    item["counter_link"])
            yield Request(url=url2, meta={'item': item},
                          callback=self.parse_listing_counter)
        else:
            # No counter page for this listing: emit what we have.
            yield item

    def parse_listing_counter(self, response):
        """Add ``counter`` to the item from ``response.meta`` and return it."""
        hxs = HtmlXPathSelector(response)

        item = response.meta['item']

        item["counter"] = hxs.select('#censored Xpath').extract()

        return item