Python:如何有效地运行多个PhantomJS实例?

时间:2017-03-20 07:49:09

标签: python selenium phantomjs

我正在使用 `from multiprocessing import Pool` 这个库来做多进程处理。

目前我使用的是 requests,但我想改用 selenium,因为有些数据是在弹出窗口中加载的。怎样使用 PhantomJS 才能避免出现内存泄漏?

1 个答案:

答案 0 :(得分:1)

基本思路的粗略示意大致如下:

from __future__ import unicode_literals

import logging
from time import sleep

from selenium.common.exceptions import WebDriverException
from selenium.webdriver import DesiredCapabilities, Firefox, FirefoxProfile
from selenium.webdriver.common.proxy import Proxy, ProxyType
from werkzeug.exceptions import HTTPException
from werkzeug.routing import Map, Rule
from werkzeug.wrappers import Request, Response


class WebApp(object):
    """Minimal WSGI application that dispatches requests through ``url_map``.

    Subclasses replace ``url_map`` with their own rules and define one
    ``endpoint_<name>`` method per endpoint. Each such method is called as
    ``method(adapter, request, **values)`` and its return value is then
    invoked as a WSGI callable.
    """

    # Routing table; empty here, so the base class itself matches no URL.
    url_map = Map([])

    def __init__(self, **kw):
        # Keyword arguments are accepted but not used by the base class.
        self.log = logging.getLogger(__name__)

    def __call__(self, environ, start_response):
        """WSGI entry point; delegates to ``wsgi_app``."""
        return self.wsgi_app(environ, start_response)

    def wsgi_app(self, environ, start_response):
        """Wrap ``environ`` in a request, dispatch it and run the response."""
        request = Request(environ)
        response = self.dispatch_request(request)
        return response(environ, start_response)

    def dispatch_request(self, request):
        """Match the request against ``url_map`` and call its endpoint method.

        Returns:
            The endpoint method's return value, or the ``HTTPException``
            raised during matching/handling (it is returned, not re-raised,
            so ``wsgi_app`` can invoke it as the response).
        """
        adapter = self.url_map.bind_to_environ(request.environ)
        try:
            endpoint, values = adapter.match()
            method = getattr(self, 'endpoint_{}'.format(endpoint))
            return method(adapter, request, **values)
        # ``except X, e`` is Python 2-only syntax; ``as`` works on 2.6+ and 3.
        except HTTPException as e:
            return e


from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from subprocess import Popen, PIPE
import multiprocessing
# Create and start the display before the Firefox instances below are built.
# NOTE(review): presumably a virtual/headless display (visible=0) so the
# browsers can run without a physical screen - confirm against the
# pyvirtualdisplay documentation.
display = Display(visible=0, size=(800, 600))
display.start()

def get_proxy_obj(proxy='123.456.789.012'):
    """Return Firefox desired capabilities routed through ``proxy``.

    Args:
        proxy: Proxy address used for HTTP, FTP and SSL traffic. The default
            is the placeholder from the original snippet (its octets exceed
            255), so pass a real address.

    Returns:
        A fresh capabilities dict with ``acceptSslCerts`` enabled and the
        manual proxy settings added by ``Proxy.add_to_capabilities``.
    """
    proxyobj = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': proxy,
        'ftpProxy': proxy,
        'sslProxy': proxy,
        'noProxy': '',  # set this value as desired
    })
    # DesiredCapabilities.FIREFOX is a dict shared at class level; work on a
    # copy so the mutations below do not leak into every other user of it.
    capabilities = DesiredCapabilities.FIREFOX.copy()
    capabilities['acceptSslCerts'] = True
    proxyobj.add_to_capabilities(capabilities)
    return capabilities





# Pool of pre-started Firefox instances; request handlers pop a driver from
# this list and append it back when they are done.
FIREFOX_PROFILE_DIR = '/etc/firefox/u2vgyy61.Proxied_User/'
DRIVER_POOL_SIZE = 3

# The original called an undefined get_capabilities(); the capabilities
# factory defined in this file is get_proxy_obj().
drivers = [
    Firefox(FirefoxProfile(FIREFOX_PROFILE_DIR),
            capabilities=get_proxy_obj())
    for _ in range(DRIVER_POOL_SIZE)
]

class Routes(WebApp):
    """Serve ``GET /get_response`` by loading a URL in a pooled browser."""

    url_map = Map([
        Rule('/get_response', endpoint='get_response', methods=['GET']),
    ])

    def endpoint_get_response(self, adapter, request, **values):
        """Load the URL from ``query_param_here`` and return its page source.

        Args:
            adapter: URL adapter bound to the current request (unused here).
            request: Incoming request; the target URL is read from its
                ``query_param_here`` value.
            **values: URL rule variables (none are defined for this rule).

        Returns:
            A ``Response`` holding the page source, a 400 response when the
            parameter is missing or empty, or a 502 response when the browser
            fails to load the page.
        """
        url = request.values.get("query_param_here", "")
        if not url:
            return Response("Not", status=400)

        driver = self._borrow_driver()
        try:
            # WebDriver.get() returns None; the loaded markup is exposed
            # through page_source afterwards.
            driver.get(url)
            return Response(driver.page_source)
        except WebDriverException:
            self.log.exception("failed to load %s", url)
            return Response("Bad Gateway", status=502)
        finally:
            # Always hand the driver back, otherwise each failed request
            # would shrink the pool until every caller waits forever.
            drivers.append(driver)

    def _borrow_driver(self):
        """Pop a driver from the module-level pool, waiting while it is empty."""
        # TODO: something better than polling once per second.
        while True:
            try:
                return drivers.pop()
            except IndexError:
                sleep(1)

用法示例:

curl http://node1/get_response?query_param_here=http://stackoverflow.com
curl http://node2/get_response?query_param_here=http://stackoverflow.com
curl http://node3/get_response?query_param_here=http://stackoverflow.com
curl http://node4/get_response?query_param_here=http://stackoverflow.com
...
and so on

负载均衡器如下: