我正在尝试使用 aiohttp 通过多个 SOCKS 代理发出异步 HTTP 请求。具体来说,我创建了一个由多个 Tor 客户端组成的池,每个客户端拥有不同的 IP 地址,并希望用 aiohttp 将 HTTP 请求经由这些客户端路由出去。
根据这里和这里的建议,我一直在尝试使用 aiosocks,但这些帖子中的示例无法运行(如果它们曾经可以运行的话),因为它们基于旧版本 aiosocks 的另一套 API。网上关于 aiosocks 的文档和示例非常稀少(它似乎没有被广泛使用)。但是我找不到任何其他能让 aiohttp 配合 SOCKS 代理使用的解决方案。
下面是我到目前为止的代码(代码较多,抱歉——我已经尽量精简示例了!)。首先,我使用 stem 初始化 Tor 客户端:
from datetime import datetime
import stem.process
from TorUtils import printCircuits, cleanShutdown
NUM_TOR_CLIENTS = 3
# One (socks_port, control_port) pair of port strings per client: SOCKS ports
# count up from 9050, and each control port sits NUM_TOR_CLIENTS above its
# SOCKS port so the two ranges never overlap.
tor_ports = [
    (str(socks_port), str(socks_port + NUM_TOR_CLIENTS))
    for socks_port in range(9050, 9050 + NUM_TOR_CLIENTS)
]
# Comma-separated list of {CC} country selectors, used below as the value of
# the 'ExcludeNodes' option. Intended to contain every ISO 3166 country code
# except {US} and {CA} (NOTE(review): completeness of the list not verified).
country_codes = '{AF}, {AX}, {AL}, {DZ}, {AS}, {AD}, {AO}, {AI}, {AQ}, {AG}, {AR}, {AM}, {AW}, {AU}, {AT}, {AZ}, {BS}, {BH}, {BD}, {BB}, {BY}, {BE}, {BZ}, {BJ}, {BM}, {BT}, {BO}, {BQ}, {BA}, {BW}, {BV}, {BR}, {IO}, {BN}, {BG}, {BF}, {BI}, {KH}, {CM}, {CV}, {KY}, {CF}, {TD}, {CL}, {CN}, {CX}, {CC}, {CO}, {KM}, {CG}, {CD}, {CK}, {CR}, {CI}, {HR}, {CU}, {CW}, {CY}, {CZ}, {DK}, {DJ}, {DM}, {DO}, {EC}, {EG}, {SV}, {GQ}, {ER}, {EE}, {ET}, {FK}, {FO}, {FJ}, {FI}, {FR}, {GF}, {PF}, {TF}, {GA}, {GM}, {GE}, {DE}, {GH}, {GI}, {GR}, {GL}, {GD}, {GP}, {GU}, {GT}, {GG}, {GN}, {GW}, {GY}, {HT}, {HM}, {VA}, {HN}, {HK}, {HU}, {IS}, {IN}, {ID}, {IR}, {IQ}, {IE}, {IM}, {IL}, {IT}, {JM}, {JP}, {JE}, {JO}, {KZ}, {KE}, {KI}, {KP}, {KR}, {KW}, {KG}, {LA}, {LV}, {LB}, {LS}, {LR}, {LY}, {LI}, {LT}, {LU}, {MO}, {MK}, {MG}, {MW}, {MY}, {MV}, {ML}, {MT}, {MH}, {MQ}, {MR}, {MU}, {YT}, {MX}, {FM}, {MD}, {MC}, {MN}, {ME}, {MS}, {MA}, {MZ}, {MM}, {NA}, {NR}, {NP}, {NL}, {NC}, {NZ}, {NI}, {NE}, {NG}, {NU}, {NF}, {MP}, {NO}, {OM}, {PK}, {PW}, {PS}, {PA}, {PG}, {PY}, {PE}, {PH}, {PN}, {PL}, {PT}, {PR}, {QA}, {RE}, {RO}, {RU}, {RW}, {BL}, {SH}, {KN}, {LC}, {MF}, {PM}, {VC}, {WS}, {SM}, {ST}, {SA}, {SN}, {RS}, {SC}, {SL}, {SG}, {SX}, {SK}, {SI}, {SB}, {SO}, {ZA}, {GS}, {SS}, {ES}, {LK}, {SD}, {SR}, {SJ}, {SZ}, {SE}, {CH}, {SY}, {TW}, {TJ}, {TZ}, {TH}, {TL}, {TG}, {TK}, {TO}, {TT}, {TN}, {TR}, {TM}, {TC}, {TV}, {UG}, {UA}, {AE}, {GB}, {UM}, {UY}, {UZ}, {VU}, {VE}, {VN}, {VG}, {VI}, {WF}, {EH}, {YE}, {ZM}, {ZW}'
# One option dict per tor client. Each client gets its own SOCKS port, control
# port and data directory (the directory name is suffixed with the SOCKS port),
# while the node-selection options are shared by all clients.
tor_configs = [
    {
        'SOCKSPort': socks_port,
        'ControlPort': control_port,
        'DataDirectory': './.tordata' + socks_port,
        'CookieAuthentication': '1',
        'MaxCircuitDirtiness': '3600',
        'ExcludeNodes': country_codes,
        'EntryNodes': '{us}, {ca}',
        'ExitNodes': '{us}, {ca}',
        'StrictNodes': '1',
        'GeoIPExcludeUnknown': '1',
        'EnforceDistinctSubnets': '0',
    }
    for socks_port, control_port in tor_ports
]
print(f"Spawning {NUM_TOR_CLIENTS} tor clients ...")
start_time = datetime.now()
# Each entry pairs a client's config dict with the process handle returned by stem.
tor_clients = []
try:
    for cfg in tor_configs:
        tor_clients.append({'config': cfg, 'process': stem.process.launch_tor_with_config(config=cfg)})
except BaseException:
    # A later launch failed or was interrupted: kill the tor processes that
    # were already started so they are not left running as orphans, then
    # re-raise so the caller still sees the original failure.
    for client in tor_clients:
        client['process'].kill()
    raise
……然后我尝试使用以下代码通过 aiohttp 发出 HTTP 请求:
from collections import defaultdict, deque
from datetime import datetime, timedelta
import asyncio
import aiohttp
import aiosocks
from aiosocks.connector import ProxyConnector, ProxyClientRequest
import async_timeout
# Per-request time limit passed to async_timeout.timeout() in _get.
TIMEOUT = 10
async def _get(url, session, proxy, request_limiter):
    """Fetch ``url`` through ``proxy`` and summarise the outcome as a dict.

    Args:
        url: Address to request.
        session: Client session whose ``get()`` performs the request.
        proxy: Proxy URL forwarded to ``session.get``.
        request_limiter: Async context manager held for the whole request;
            used to cap the number of concurrent requests.

    Returns:
        A dict with the keys ``url``, ``status``, ``headers``, ``text`` and
        ``errors``. On success ``errors`` is ``None``. On timeout the three
        response fields are ``None`` and ``errors`` is a non-empty string.

    Only ``asyncio.TimeoutError`` is handled here; any other exception
    propagates to the caller.
    """
    try:
        async with request_limiter:  # semaphore to limit number of concurrent requests
            # The timeout starts only once the limiter has been acquired, so
            # time spent waiting for a free slot does not count against it.
            async with async_timeout.timeout(TIMEOUT):
                async with session.get(url, proxy=proxy, proxy_auth=None) as resp:
                    status = int(resp.status)
                    headers = dict(resp.headers)
                    text = await resp.text()
                    return {'url': url, 'status': status, 'headers': headers, 'text': str(text), 'errors': None}
    except asyncio.TimeoutError as e:
        # NOTE(review): `queue` is not defined in this function; it is assumed
        # to be a module-level object exposing a `visited_urls` mapping.
        queue.visited_urls[url] = datetime.now()
        # str() of a bare TimeoutError is '', which would make a failed request
        # indistinguishable from "no error text"; fall back to the class name.
        return {'url': url, 'status': None, 'headers': None, 'text': None, 'errors': str(e) or type(e).__name__}
async def _getPagesTasks(url_list, tor_clients, request_limiter, loop):
    """Launch requests for all web pages, spread round-robin over the tor clients.

    Args:
        url_list: Iterable of URLs to fetch.
        tor_clients: Sequence of dicts; each must carry the client's SOCKS
            port string under ``['config']['SOCKSPort']``.
        request_limiter: Concurrency limiter handed through to ``_get``.
        loop: Event loop on which the per-URL tasks are created.

    Returns:
        The list of ``_get`` results, in the same order as ``url_list``.

    Raises:
        IndexError: if ``url_list`` is non-empty but ``tor_clients`` is empty.
    """
    # deque rotates continuously through SOCKS sessions for each tor client ...
    sessions = deque()
    try:
        for tor_client in tor_clients:
            conn = ProxyConnector()
            session = aiohttp.ClientSession(connector=conn, request_class=ProxyClientRequest)
            sessions.append({
                # The tor SOCKSPort speaks SOCKS, not HTTP, so the proxy URL
                # must use a socks5:// scheme rather than http://.
                'proxy': 'socks5://127.0.0.1:' + tor_client['config']['SOCKSPort'],
                'session': session,
            })

        tasks = []
        for url in url_list:
            # Take the session at the front, then rotate it to the back so
            # consecutive URLs go to consecutive tor clients.
            s = sessions[0]
            sessions.rotate(-1)
            tasks.append(loop.create_task(_get(url, s['session'], s['proxy'], request_limiter)))

        return await asyncio.gather(*tasks)
    finally:
        # Close every session that was opened, even if a request failed.
        # The deque holds dicts, so the session has to be pulled out first.
        for s in sessions:
            await s['session'].close()
def getPages(url_list, tor_clients):
    """Given a URL list, dispatch pool of tor clients to concurrently fetch URLs.

    Args:
        url_list: Iterable of URLs to fetch.
        tor_clients: Sequence of tor client dicts, passed to ``_getPagesTasks``.

    Returns:
        Whatever ``_getPagesTasks`` returns for the given URLs.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        # Create the semaphore only after the new loop is current: on older
        # Python versions asyncio primitives bind to the current loop when
        # they are constructed, and must be used on that same loop.
        request_limiter = asyncio.Semaphore(len(tor_clients))  # limit to one request per client at a time
        return loop.run_until_complete(_getPagesTasks(url_list, tor_clients, request_limiter, loop))
    finally:
        # Release the loop's resources even when the run raised.
        loop.close()
但是,这段代码无法运行。当我尝试运行它时,会收到以下错误。我想知道是我哪里做错了,还是 aiosocks 本身的问题(它似乎已经有一段时间没有维护了,可能是针对较旧版本的 aiohttp 编写的,或者有其他原因……):
~/Code/gis project/code/TorGetQueue.py in _getPagesTasks(url_list, tor_clients, request_limiter, loop)
50 sessions = deque()
51 for client in tor_clients:
---> 52 conn = ProxyConnector()
53 session = aiohttp.ClientSession(connector=conn, request_class=ProxyClientRequest)
54 sessions.append({'proxy': 'http://127.0.0.1:' + client['config']['SOCKSPort'], 'session': session})
~/.local/share/virtualenvs/code-pIyQci_2/lib/python3.6/site-packages/aiosocks/connector.py in __init__(self, verify_ssl, fingerprint, resolve, use_dns_cache, family, ssl_context, local_addr, resolver, keepalive_timeout, force_close, limit, limit_per_host, enable_cleanup_closed, loop, remote_resolve)
54 force_close=force_close, limit=limit, loop=loop,
55 limit_per_host=limit_per_host, use_dns_cache=use_dns_cache,
---> 56 enable_cleanup_closed=enable_cleanup_closed)
57
58 self._remote_resolve = remote_resolve
TypeError: __init__() got an unexpected keyword argument 'resolve'
我在这里做错了什么?有没有更简单的方法让 aiohttp 使用 SOCKS 代理?我需要修改什么才能让这段代码与 aiosocks 一起工作?
谢谢!
答案 0 :(得分:2)
我曾尝试在自己的项目中使用 aiosocks,也遇到了与您相同的错误,后来发现 aiosocks 已经无人维护。
您可以改用 aiosocksy。
import asyncio
import aiohttp
from aiosocksy import Socks5Auth
from aiosocksy.connector import ProxyConnector, ProxyClientRequest
async def fetch(url):
    """Request ``url`` via the SOCKS5 proxy at 127.0.0.1:9050 and print the body."""
    proxy_url = 'socks5://127.0.0.1:9050'
    session = aiohttp.ClientSession(connector=ProxyConnector(), request_class=ProxyClientRequest)
    # Entering the session as a context manager closes it on exit.
    async with session:
        async with session.get(url, proxy=proxy_url) as response:
            body = await response.text()
            print(body)
# Drive the fetch coroutine to completion on the event loop; the requested
# URL is http://httpbin.org/ip.
loop = asyncio.get_event_loop()
loop.run_until_complete(fetch('http://httpbin.org/ip'))