我根据使用urllib从Siteadvisor获取的结果将100k域名处理成CSV(我不知道最好的方法)。但是,我当前的脚本创建了太多线程,Python遇到错误。有没有办法我可以“块”这个脚本一次做X个域(比如10-20)来防止这些错误?提前谢谢。
import threading
import urllib
class Resolver(threading.Thread):
def __init__(self, address, result_dict):
threading.Thread.__init__(self)
self.address = address
self.result_dict = result_dict
def run(self):
try:
content = urllib.urlopen("http://www.siteadvisor.com/sites/" + self.address).read(12000)
search1 = content.find("didn't find any significant problems.")
search2 = content.find('yellow')
search3 = content.find('web reputation analysis found potential security')
search4 = content.find("don't have the results yet.")
if search1 != -1:
result = "safe"
elif search2 != -1:
result = "caution"
elif search3 != -1:
result = "warning"
elif search4 != -1:
result = "unknown"
else:
result = ""
self.result_dict[self.address] = result
except:
pass
def main():
infile = open("domainslist", "r")
intext = infile.readlines()
threads = []
results = {}
for address in [address.strip() for address in intext if address.strip()]:
resolver_thread = Resolver(address, results)
threads.append(resolver_thread)
resolver_thread.start()
for thread in threads:
thread.join()
outfile = open('final.csv', 'w')
outfile.write("\n".join("%s,%s" % (address, ip) for address, ip in results.iteritems()))
outfile.close()
if __name__ == '__main__':
main()
修改:新版本,基于andyortlieb的建议。
import threading
import urllib
import time
class Resolver(threading.Thread):
def __init__(self, address, result_dict, threads):
threading.Thread.__init__(self)
self.address = address
self.result_dict = result_dict
self.threads = threads
def run(self):
try:
content = urllib.urlopen("http://www.siteadvisor.com/sites/" + self.address).read(12000)
search1 = content.find("didn't find any significant problems.")
search2 = content.find('yellow')
search3 = content.find('web reputation analysis found potential security')
search4 = content.find("don't have the results yet.")
if search1 != -1:
result = "safe"
elif search2 != -1:
result = "caution"
elif search3 != -1:
result = "warning"
elif search4 != -1:
result = "unknown"
else:
result = ""
self.result_dict[self.address] = result
outfile = open('final.csv', 'a')
outfile.write(self.address + "," + result + "\n")
outfile.close()
print self.address + result
threads.remove(self)
except:
pass
def main():
infile = open("domainslist", "r")
intext = infile.readlines()
threads = []
results = {}
for address in [address.strip() for address in intext if address.strip()]:
loop=True
while loop:
if len(threads) < 20:
resolver_thread = Resolver(address, results, threads)
threads.append(resolver_thread)
resolver_thread.start()
loop=False
else:
time.sleep(.25)
for thread in threads:
thread.join()
# removed so I can track the progress of the script
# outfile = open('final.csv', 'w')
# outfile.write("\n".join("%s,%s" % (address, ip) for address, ip in results.iteritems()))
# outfile.close()
if __name__ == '__main__':
main()
答案 0 :(得分:2)
您现有的代码可以很好地工作 - 只需修改__init__
内的Resolver
方法,即可一次接收一个地址而不是一个地址,因此不是每个地址都有一个线程,每10个你有一个线程(例如)。这样你就不会重载线程。
您显然必须稍微修改run
,以便它遍历地址数组而不是self.address
。
如果您愿意,我可以提供一个快速示例,但从代码质量来看,我觉得您可以很轻松地处理它。
希望这有帮助!
编辑以下示例请求。请注意,您必须修改main以发送Resolver
实例地址列表而不是单个地址 - 如果您不了解有关文件格式以及地址存储方式的更多信息,我无法为您处理此问题。注意 - 您可以使用辅助函数执行run
方法,但我认为这可能更容易理解为
class Resolver(threading.Thread):
def __init__(self, addresses, result_dict):
threading.Thread.__init__(self)
self.addresses = addresses # Now takes in a list of multiple addresses
self.result_dict = result_dict
def run(self):
for address in self.addresses: # do your existing code for every address in the list
try:
content = urllib.urlopen("http://www.siteadvisor.com/sites/" + address).read(12000)
search1 = content.find("didn't find any significant problems.")
search2 = content.find('yellow')
search3 = content.find('web reputation analysis found potential security')
search4 = content.find("don't have the results yet.")
if search1 != -1:
result = "safe"
elif search2 != -1:
result = "caution"
elif search3 != -1:
result = "warning"
elif search4 != -1:
result = "unknown"
else:
result = ""
self.result_dict[address] = result
except:
pass
答案 1 :(得分:2)
这可能有点僵硬,但你可以将线程传递给Resolver,这样当Resolver.run完成时,它可以调用threads.remove(self)
然后你可以嵌套一些条件,这样只有在有空间的情况下才能创建线程,如果没有空间,它们会等到有空间。
for address in [address.strip() for address in intext if address.strip()]:
loop=True
while loop:
if len(threads)<20:
resolver_thread = Resolver(address, results, threads)
threads.append(resolver_thread)
resolver_thread.start()
loop=False
else:
time.sleep(.25)