I integrated scrapy-splash into my CrawlSpider, but it only renders the start_urls. I want to know how to make scrapy-splash crawl the internal links as well. I have been searching the internet for a solution, and none of the ones I found seem to work.
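For context, the project-level plumbing I have in mind is the one described in the scrapy-splash README; I am only showing it here as a sketch, assuming Splash runs locally on port 8050, which matches the render.html URL used in process_links below:

# Sketch of the settings.py plumbing from the scrapy-splash README
# (assumed setup, not taken from the spider code below).
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'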
Here is my code:
import scrapy
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from scrapy.item import Item, Field
from scrapy import Request
from urllib.parse import urlencode  # used by process_links below
from scrapy_splash import SplashRequest


class Website(scrapy.Item):
    url = Field()
    response = Field()


class houzzspider(CrawlSpider):
    handle_httpstatus_list = [404, 500]
    name = "example"
    allowed_domains = ["localhost", "www.example.com"]
    start_urls = ["https://www.example.com/"]

    rules = (
        Rule(
            LinkExtractor(allow=(), deny=()),
            callback="parse_items",
            process_links="process_links",
            follow=True,
        ),
        Rule(
            LinkExtractor(allow=(), deny=()),
            follow=True,
        ),
    )

    def process_links(self, links):
        # Rewrite each extracted link so it points at the Splash render.html endpoint.
        for link in links:
            if "http://localhost:8050/render.html?&" not in link.url:
                link.url = "http://localhost:8050/render.html?&" + urlencode(
                    {'url': link.url, 'wait': 2.0})
        return links

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse_items,
                                endpoint='render.html',
                                args={'wait': 0.5})

    def parse_items(self, response):
        hxs = Selector(response)
        sites = response.selector.xpath('//html')
        items = []
        for site in sites:
            item = Website()
            item['url'] = response.url
            item['response'] = response.status
            items.append(item)
        return items
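As far as I can tell, the requests that CrawlSpider builds for the rule-extracted links are plain scrapy.Request objects, so they never go through Splash. One direction I am considering, shown only as an unverified sketch, is to override the private CrawlSpider hooks _requests_to_follow and _build_request; the method bodies below mirror the Scrapy 1.x source, so they may need adjusting for other Scrapy versions, and SplashTextResponse comes from scrapy_splash:

# Sketch only: send rule-extracted links through Splash by overriding the
# private CrawlSpider hooks (based on the Scrapy 1.x CrawlSpider source).
from scrapy.http import HtmlResponse
from scrapy_splash import SplashRequest, SplashTextResponse


class SplashCrawlSpiderSketch(CrawlSpider):

    def _requests_to_follow(self, response):
        # Stock CrawlSpider returns early unless the response is an HtmlResponse;
        # pages rendered via the render.html endpoint come back as
        # SplashTextResponse, so the rules never fire on them. Accept both.
        if not isinstance(response, (HtmlResponse, SplashTextResponse)):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def _build_request(self, rule, link):
        # Fetch every followed link through Splash instead of a plain Request.
        r = SplashRequest(url=link.url, callback=self._response_downloaded,
                          endpoint='render.html', args={'wait': 0.5})
        r.meta.update(rule=rule, link_text=link.text)
        return r

I also suspect that handing the start responses straight to parse_items in start_requests bypasses CrawlSpider's built-in parse callback, which is where the rules are applied, so the start pages would probably need to be requested with self.parse as the callback for link following to kick in at all, but I have not confirmed this.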