The code reads URLs from a file, pushes them onto a queue that worker threads consume, and makes a third-party web API call for each one, collecting the results into a global list. Sometimes when I run this program it finishes and the process completes (prints done), and sometimes it gets stuck and the process never finishes.
It looks like once a certain exception occurs ("We failed to reach a server") the process hangs and never completes. I believe this is a threading problem.
Can anybody figure out what the problem is? Thanks in advance.
Here is the code:
import threading
import Queue
import hmac
import hashlib
import base64
import urllib2
from urllib2 import Request, urlopen, URLError, HTTPError
import sys
import httplib, urllib, time, random, os
import json
from urlparse import urlparse
import time
#Number of threads
n_thread = 50
#Create queue
queue = Queue.Queue()
domainBlacklistDomain=[]
urlList=[]
def checkBlackList(domain, line):
    testUrl = 'https://test.net'
    apiToken = 'aaaaa'
    secretKey = 'bbbb'
    signature_data = 'GET\n/v1/blacklist/lookup\nurl='+domain+'\n\n\n'
    digest = hmac.new(secretKey, signature_data, hashlib.sha1).digest()
    digest_base64 = base64.encodestring(digest)
    req = urllib2.Request('https://test.net/v1/blacklist/lookup?url='+domain)
    req.add_header('Authorization', 'Test' + apiToken + ':' + digest_base64)
    req.add_header('Connection', 'Keep-Alive')
    try:
        page = urlopen(req)
        length = str(page.info())
        if length.find("Content-Length: 0") != -1:
            url = str(line.strip())
            urlList.append(url)
        else:
            json_data = json.load(page)
            domainBlacklistDomain.append(json_data['url'])
            if int(json_data['score']) > 10:
                print json_data['url']
    except HTTPError, e:
        print 'The server couldn\'t fulfill the request.'
    except URLError, e:
        print 'We failed to reach a server.'
class ThreadClass(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        #Assign the queue this thread works from
        self.queue = queue
    def run(self):
        while True:
            #Get a job from the queue
            host = self.queue.get()
            parsed_uri = urlparse(host)
            domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            if "\n" in domain:
                domain = domain.replace('\n', '').replace('\r', '')
                if domain not in domainBlacklistDomain:
                    checkBlackList(domain, host)
            else:
                if domain not in domainBlacklistDomain:
                    checkBlackList(domain, host)
            #Signal to the queue that the job is done
            self.queue.task_done()
#Create the worker threads
for i in range(n_thread):
    t = ThreadClass(queue)
    t.setDaemon(True)
    #Start thread
    t.start()
#Read the file line by line
hostfile = open("result_url.txt", "r")
for line in hostfile:
    #Put line into the queue
    queue.put(line)
#Wait on the queue until everything has been processed
queue.join()
fo = open("final_result.txt", "w+b")
for item in urlList:
    fo.write("%s\n" % item)
fo.close()
print "done??"
Answer 0 (score: 0)
Without reading your code in detail, the problem is almost certainly related to attempts to open connections to unresponsive IP addresses. Those connection attempts can have very long timeouts.
Try using the socket.setdefaulttimeout()
function to establish a global socket timeout.
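For example, a minimal sketch of that suggestion (the 10-second value is only an assumption; tune it to your API) would set the timeout once, near the top of the script, before any worker threads are started:

import socket

# Any socket that takes longer than this to connect or to return data will
# raise an exception instead of blocking its worker thread forever.
socket.setdefaulttimeout(10)  # seconds; assumed value, adjust as needed

With a default timeout in place a stalled urlopen() call eventually fails; depending on where it stalls, the error surfaces as URLError or socket.timeout, so you may want an extra except socket.timeout handler next to the existing except URLError one. Alternatively, urlopen() in Python 2.6+ accepts a per-call timeout argument, e.g. urlopen(req, timeout=10).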