我正在使用 `from multiprocessing import Pool` 这个库进行多进程处理。
目前我用的是 requests,但我想改用 selenium,因为有些数据是在弹出窗口中加载的。怎样使用 PhantomJS 才能最好地避免内存泄漏?
答案 0 :(得分:1)
基本思路大致如下:
from __future__ import unicode_literals

import logging
import time

from werkzeug.exceptions import HTTPException
from werkzeug.routing import Map
from werkzeug.routing import Rule
from werkzeug.wrappers import Request
from werkzeug.wrappers import Response
class WebApp(object):
    """Minimal WSGI application that dispatches requests through a URL map.

    A matched endpoint named ``foo`` is served by the method
    ``endpoint_foo(adapter, request, **values)`` on the instance.
    """

    # Routing table consulted by dispatch_request(); empty by default.
    url_map = Map([])

    def __init__(self, **kw):
        """Create the application; keyword arguments are accepted but unused."""
        self.log = logging.getLogger(__name__)

    def __call__(self, environ, start_response):
        """WSGI entry point; delegates to :meth:`wsgi_app`."""
        return self.wsgi_app(environ, start_response)

    def wsgi_app(self, environ, start_response):
        """Wrap the WSGI environ in a Request, dispatch it, and send the result."""
        request = Request(environ)
        response = self.dispatch_request(request)
        return response(environ, start_response)

    def dispatch_request(self, request):
        """Match the request against ``url_map`` and call the endpoint method.

        Returns whatever the ``endpoint_<name>`` method returns, or the
        ``HTTPException`` raised during matching/handling (it is returned,
        not re-raised, so ``wsgi_app`` can call it like a response).
        """
        adapter = self.url_map.bind_to_environ(request.environ)
        try:
            endpoint, values = adapter.match()
            method = getattr(self, 'endpoint_{}'.format(endpoint))
            return method(adapter, request, **values)
        # "except X as e" is valid on Python 2.6+ and Python 3; the old
        # "except X, e" form is a SyntaxError on Python 3.
        except HTTPException as e:
            return e
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from subprocess import Popen, PIPE
import multiprocessing
# Start an invisible 800x600 pyvirtualdisplay screen at import time, before
# any browser driver below is created (presumably so Firefox can run on a
# machine without a physical display -- confirm for your deployment).
display = Display(visible=0, size=(800, 600))
display.start()
def get_proxy_obj():
    """Build Firefox desired capabilities that route traffic through a proxy.

    Returns:
        dict: A private copy of the Firefox capabilities with
        ``acceptSslCerts`` enabled and the manual proxy settings applied.
    """
    # Imported here because the module header does not bring these names
    # into scope.
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    from selenium.webdriver.common.proxy import Proxy, ProxyType

    proxy = '123.456.789.012'  # placeholder address; replace with a real proxy
    proxyobj = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': proxy,
        'ftpProxy': proxy,
        'sslProxy': proxy,
        'noProxy': '',  # set this value as desired
    })
    # DesiredCapabilities.FIREFOX is a shared class-level dict; work on a
    # copy so the settings below do not leak into other users of it.
    capabilities = DesiredCapabilities.FIREFOX.copy()
    capabilities['acceptSslCerts'] = True
    proxyobj.add_to_capabilities(capabilities)
    return capabilities
# Pool of three pre-started Firefox drivers. Each one loads the same proxied
# profile directory and gets its own capabilities dict from get_proxy_obj().
# Firefox/FirefoxProfile are reached through the imported `webdriver`
# namespace because they are not imported by name in this file.
drivers = [
    webdriver.Firefox(
        webdriver.FirefoxProfile('/etc/firefox/u2vgyy61.Proxied_User/'),
        capabilities=get_proxy_obj(),
    )
    for _ in range(3)
]
class Routes(WebApp):
    """Routes that fetch a page with a driver borrowed from the ``drivers`` pool."""

    def endpoint_get_response(self, adapter, request, **values):
        """Load the URL given in ``query_param_here`` and return its page source.

        Args:
            adapter: The URL-map adapter that matched this request (unused).
            request: The incoming request; the target URL is read from its
                ``query_param_here`` value.
            **values: Values extracted from the matched rule (unused).

        Returns:
            A ``Response`` with the rendered page source, or a 400 response
            with body ``"Not"`` when no URL was supplied.
        """
        url = request.values.get("query_param_here", "")
        if not url:
            return Response("Not", status=400)

        # something better here
        while True:
            try:
                driver = drivers.pop()
            except IndexError:
                # Pool is empty: every driver is in use. Wait and try again.
                time.sleep(1)
                continue

            try:
                # driver.get() returns None; the loaded document is exposed
                # through page_source.
                driver.get(url)
                response_text = driver.page_source
            except Exception:
                # Keep the original retry-on-failure behaviour, but log the
                # error instead of silently swallowing it.
                self.log.exception("Fetching %s failed; retrying", url)
                time.sleep(1)
                continue
            finally:
                # Always hand the driver back, even when the fetch failed.
                drivers.append(driver)
            # Alternative from the original sample: run the fetch in a
            # container via
            # Popen(['docker', "exec", "-it", "selenium_phantom", url])
            return Response(response_text)

    url_map = Map([
        Rule('/get_response', endpoint='get_response', methods=['GET']),
    ])
用法示例:
curl http://node1/get_response?query_param_here=http://stackoverflow.com
curl http://node2/get_response?query_param_here=http://stackoverflow.com
curl http://node3/get_response?query_param_here=http://stackoverflow.com
curl http://node4/get_response?query_param_here=http://stackoverflow.com
...
and so on
负载均衡器如下: