PhantomJS cannot be used in Scrapy when SSR is enabled

Time: 2018-02-03 08:47:09

Tags: python scrapy

I am running Windows 10 with Python 3.6 and Scrapy 1.5. When I turn on the SSR client's global proxy, the spider throws an error; when I turn SSR off, everything works normally. Why is that?

I also get an error when I point PhantomJS at the SSR proxy on 127.0.0.1:1083. The main error comes from inside Selenium's webdriver.py and from Twisted.
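To narrow things down, here is a minimal standalone sketch of the same setup outside Scrapy, passing the proxy through PhantomJS' own command-line switches rather than through capabilities. The binary path and port are the ones used in the middleware below; whether the SSR listener on 1083 is an HTTP or SOCKS5 proxy is an assumption, and the script name is only illustrative.

# standalone_check.py (illustrative name) -- minimal sketch, outside Scrapy
from selenium import webdriver

# --proxy-type may need to be 'socks5' depending on what the SSR client exposes on 1083
browser = webdriver.PhantomJS(
    executable_path='D:/phantomjs/bin/phantomjs.exe',
    service_args=['--proxy=127.0.0.1:1083', '--proxy-type=http'],
)
browser.get('https://www.134uu.com/htm/vodlist3/1.htm')
print(browser.current_url)
browser.quit()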

spider.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from uu134.items import Uu134Item
from scrapy.selector import Selector

class DowmloadSpider(CrawlSpider):
    name = 'dowmload'
    allowed_domains = ['www.134uu.com']
    start_urls = ['https://www.134uu.com/htm/vodlist3/1.htm']

    link_extractor = {
        'one_lever_link': LinkExtractor(allow=r'/htm/vodlist3/\d+.htm'),
        'two_lever_link': LinkExtractor(allow=r'/htm/vod3/\d+.htm'),
        # 'content': LinkExtractor(allow='bbscon,board,\w+,file,M\.\d+\.A\.html$'),
    }

    def parse(self, response):
        for link in self.link_extractor['one_lever_link'].extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_page)

    def parse_page(self, response):
        for link in self.link_extractor['two_lever_link'].extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_page)

    def parse_handle(self, response):
        print(response.url)

middlewares.py

import random
import time

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from scrapy.http import HtmlResponse


class JSPageMiddleware(object):
    def __init__(self):
        # user_agent_list is defined elsewhere in the class (not shown here)
        ua = random.choice(self.user_agent_list)  # pick a random User-Agent
        dcap = dict(DesiredCapabilities.PHANTOMJS)  # start from PhantomJS' default capabilities
        dcap["phantomjs.page.settings.userAgent"] = ua  # set the random User-Agent for PhantomJS
        dcap["phantomjs.page.settings.loadImages"] = False  # disable image loading
        dcap['proxy'] = '127.0.0.1:1083'

        self.browser = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe',
                                           desired_capabilities=dcap, port=1083)
        super(JSPageMiddleware, self).__init__()

    # Fetch dynamic pages through PhantomJS instead of Scrapy's downloader
    def process_request(self, request, spider):
        self.browser.get(request.url)
        time.sleep(1)
        print("visiting: {0}".format(request.url))

        # Return the response to the spider directly, instead of passing it on to the downloader
        return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source,
                            encoding="utf-8", request=request)
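settings.py itself is not shown above; judging from the traceback below, the middleware is registered in DOWNLOADER_MIDDLEWARES, presumably something like the sketch here (the priority value 543 is only an assumed placeholder):

settings.py

# Enable the PhantomJS middleware (sketch -- the priority 543 is an assumption)
DOWNLOADER_MIDDLEWARES = {
    'uu134.middlewares.JSPageMiddleware': 543,
}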

error info:

2018-02-03 16:25:54 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: uu134)
2018-02-03 16:25:54 [scrapy.utils.log] INFO: Versions: lxml 4.1.1.0, libxml2 2.9.7, cssselect 1.0.1, parsel 1.3.1, w3lib 1.18.0, Twisted 17.9.0, Python 3.6.3 (v3.6.3:2c5fed8, Oct  3 2017, 17:26:49) [MSC v.1900 32 bit (Intel)], pyOpenSSL 17.5.0 (OpenSSL 1.1.0g  2 Nov 2017), cryptography 2.1.4, Platform Windows-10-10.0.15063-SP0
2018-02-03 16:25:54 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'uu134', 'COOKIES_ENABLED': False, 'LOG_FILE': 'Spider.log', 'LOG_LEVEL': 'INFO', 'NEWSPIDER_MODULE': 'uu134.spiders', 'SPIDER_MODULES': ['uu134.spiders']}
2018-02-03 16:25:54 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2018-02-03 16:25:55 [twisted] CRITICAL: Unhandled error in Deferred:
2018-02-03 16:25:55 [twisted] CRITICAL: 
Traceback (most recent call last):
  File "d:\python\lib\site-packages\twisted\internet\defer.py", line 1386, in _inlineCallbacks
    result = g.send(result)
  File "d:\python\lib\site-packages\scrapy\crawler.py", line 80, in crawl
    self.engine = self._create_engine()
  File "d:\python\lib\site-packages\scrapy\crawler.py", line 105, in _create_engine
    return ExecutionEngine(self, lambda _: self.stop())
  File "d:\python\lib\site-packages\scrapy\core\engine.py", line 69, in __init__
    self.downloader = downloader_cls(crawler)
  File "d:\python\lib\site-packages\scrapy\core\downloader\__init__.py", line 88, in __init__
    self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
  File "d:\python\lib\site-packages\scrapy\middleware.py", line 58, in from_crawler
    return cls.from_settings(crawler.settings, crawler)
  File "d:\python\lib\site-packages\scrapy\middleware.py", line 40, in from_settings
    mw = mwcls()
  File "E:\xampp\htdocs\study_spider\uu134\uu134\middlewares.py", line 173, in __init__
    1083)
  File "d:\python\lib\site-packages\selenium\webdriver\phantomjs\webdriver.py", line 56, in __init__
    desired_capabilities=desired_capabilities)
  File "d:\python\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 87, in __init__
    self.start_session(desired_capabilities, browser_profile)
  File "d:\python\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 141, in start_session
    'desiredCapabilities': desired_capabilities,
  File "d:\python\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 201, in execute
    self.error_handler.check_response(response)
  File "d:\python\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 151, in check_response
    raise exception_class(value)

0 Answers:

No answers yet