I integrated scrapy-splash into my CrawlSpider, but it only renders the start_urls. I want to know how to make scrapy-splash crawl the internal links as well. I have been searching the internet for a solution, and none of the ones I found seem to work.
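For context, the project-level plumbing I have in mind is the one described in the scrapy-splash README; I am only showing it here as a sketch, assuming Splash runs locally on port 8050, which matches the render.html URL used in process_links below:

# Sketch of the settings.py plumbing from the scrapy-splash README
# (assumed setup, not taken from the spider code below).
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'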
Here is my code:
import scrapy
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from scrapy.item import Item, Field
from scrapy import Request
from urllib.parse import urlencode  # used by process_links below
from scrapy_splash import SplashRequest


class Website(scrapy.Item):
    url = Field()
    response = Field()


class houzzspider(CrawlSpider):
    handle_httpstatus_list = [404, 500]
    name = "example"
    allowed_domains = ["localhost", "www.example.com"]
    start_urls = ["https://www.example.com/"]

    rules = (
        Rule(
            LinkExtractor(allow=(), deny=()),
            callback="parse_items",
            process_links="process_links",
            follow=True,
        ),
        Rule(
            LinkExtractor(allow=(), deny=()),
            follow=True,
        ),
    )

    def process_links(self, links):
        # Rewrite each extracted link so it points at the Splash render.html endpoint.
        for link in links:
            if "http://localhost:8050/render.html?&" not in link.url:
                link.url = "http://localhost:8050/render.html?&" + urlencode(
                    {'url': link.url, 'wait': 2.0})
        return links

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse_items,
                                endpoint='render.html',
                                args={'wait': 0.5})

    def parse_items(self, response):
        hxs = Selector(response)
        sites = response.selector.xpath('//html')
        items = []
        for site in sites:
            item = Website()
            item['url'] = response.url
            item['response'] = response.status
            items.append(item)
        return items
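As far as I can tell, the requests that CrawlSpider builds for the rule-extracted links are plain scrapy.Request objects, so they never go through Splash. One direction I am considering, shown only as an unverified sketch, is to override the private CrawlSpider hooks _requests_to_follow and _build_request; the method bodies below mirror the Scrapy 1.x source, so they may need adjusting for other Scrapy versions, and SplashTextResponse comes from scrapy_splash:

# Sketch only: send rule-extracted links through Splash by overriding the
# private CrawlSpider hooks (based on the Scrapy 1.x CrawlSpider source).
from scrapy.http import HtmlResponse
from scrapy_splash import SplashRequest, SplashTextResponse


class SplashCrawlSpiderSketch(CrawlSpider):

    def _requests_to_follow(self, response):
        # Stock CrawlSpider returns early unless the response is an HtmlResponse;
        # pages rendered via the render.html endpoint come back as
        # SplashTextResponse, so the rules never fire on them. Accept both.
        if not isinstance(response, (HtmlResponse, SplashTextResponse)):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def _build_request(self, rule, link):
        # Fetch every followed link through Splash instead of a plain Request.
        r = SplashRequest(url=link.url, callback=self._response_downloaded,
                          endpoint='render.html', args={'wait': 0.5})
        r.meta.update(rule=rule, link_text=link.text)
        return r

I also suspect that handing the start responses straight to parse_items in start_requests bypasses CrawlSpider's built-in parse callback, which is where the rules are applied, so the start pages would probably need to be requested with self.parse as the callback for link following to kick in at all, but I have not confirmed this.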