我正在尝试使用请求来检查巨大的代理列表。为了做到这一点,我正在使用线程。我真的需要这些线程,因为后面我使用相同的代码结构,以便在我正在做1个线程= 1 IP的网站上发出许多请求。
所以我的排序代码是:
def proxyList(proxies, nbThread):
with open('proxyList.txt', 'w') as f:
f.write('')
f.close()
proxies = list(set(proxies))
prox = []
lenS = len(proxies)
pas = int(lenS/nbThread)
subSeq = [proxies[i*pas:(i+1)*pas] for i in range(nbThread)]
subSeq[nbThread-1]+=proxies[nbThread*pas:]
threads = [0 for i in range(nbThread)]
for i in range(nbThread):
threads[i] = proxy(subSeq[i], )
for i in range(nbThread):
threads[i].start();
for i in range(nbThread):
threads[i].join();
return list(set(prox))
class proxy(Thread):
def __init__(self, proxies):
Thread.__init__(self)
self.proxies = proxies
def run(self):
k=0
prox = []
for proxy in self.proxies:
k+=1
try:
requests.get("https://api.ipify.org/?format=json", timeout=15, proxies={"https":str(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', proxy)[0])})
try:
requests.get("https://api.ipify.org/?format=json", timeout=15, proxies={"https":str(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', proxy)[0])})
prox+=[str(proxy)]
print("Bon proxy : " + str(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', proxy)[0]))
with open('proxyList.txt', 'a') as f:
f.writelines(str(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', proxy)[0])+'\n');
f.close()
except:
t = "a"
except:
print("Mauvais proxy : "+ str(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', proxy)[0]))
print(sys.exc_info()[0])
print("Terminé: "+str(k), prox)
它可以工作,但我并不总是有相同的输出结果,它与我设置的线程数高度相关。
你们有没有想法,我已经看到可能请求不是这里的最佳选择,但我真的需要我的代理人的线程。
谢谢, Djokx
答案 0 :(得分:1)
我很确定请求是最好的方式。以下是讨论内容;
https://gist.github.com/kennethreitz/973705
但是我试图对你的代码做一些改进,减少了一直在做同样工作的循环,并阻止两次调用“get”方法。
希望有所帮助
def proxyList(proxies, nbThread):
with open('proxyList.txt', 'w') as f:
f.write('')
f.close()
proxies = list(set(proxies))
prox = []
lenS = len(proxies)
pas = int(lenS/nbThread)
subSeq = [proxies[i*pas:(i+1)*pas] for i in range(nbThread)]
subSeq[nbThread-1]+=proxies[nbThread*pas:]
threads = [0 for i in range(nbThread)]
for i in range(nbThread):
threads[i] = proxy(subSeq[i], )
threads[i].start()
threads[i].join()
return list(set(prox))
class proxy(Thread):
def __init__(self, proxies):
Thread.__init__(self)
self.proxies = proxies
def run(self):
k=0
prox = []
for proxy in self.proxies:
k+=1
try:
s = requests.Session()
try:
s.get("https://api.ipify.org/?format=json", timeout=15, proxies={"https":str(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', proxy)[0])})
prox+=[str(proxy)]
print("Bon proxy : " + str(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', proxy)[0]))
with open('proxyList.txt', 'a') as f:
f.writelines(str(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', proxy)[0])+'\n');
f.close()
except:
t = "a"
except:
print("Mauvais proxy : "+ str(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', proxy)[0]))
print(sys.exc_info()[0])
print("Terminé: "+str(k), prox)