我有 1,500,000,000 行数据,保存在多个 txt 文件中。每行数据的格式如下:
key1 key2
其中key1是url,key2是mysql记录row_id。
我编写了以下 Python 代码来解析数据,但速度很慢:
import Queue
import threading
class CheckThread(threading.Thread):
    """Daemon worker that takes file names from a queue and checks each one.

    ``_prepare_check`` is invoked for every file name but is not defined in
    this snippet; it has to be provided elsewhere.
    """

    def __init__(self, queue, src_folder, dest_folder='check_result'):
        """Keep a reference to the work queue and mark the thread as daemon.

        ``src_folder`` and ``dest_folder`` are accepted but neither stored
        nor used.
        """
        super(CheckThread, self).__init__()
        self.daemon = True
        self._queue = queue

    def run(self):
        """Handle queued file names forever, acknowledging every item."""
        while True:
            file_name = self._queue.get()
            try:
                self._prepare_check(file_name)
            except:
                # Bare except: any failure is dropped silently so the item
                # is still acknowledged below.
                pass
            self._queue.task_done()
def Check(src_folder, workers=12, dest_folder='check_result'):
    """Queue every non-hidden file under ``src_folder`` and check them all.

    Starts ``workers`` CheckThread instances and blocks until the queue
    reports that every queued path has been processed.
    """
    queue = Queue.Queue()
    for dirpath, dirnames, filelist in os.walk(src_folder):
        for name in filelist:
            # File names that begin with a dot are not queued.
            if name[0] != '.':
                queue.put(os.path.join(dirpath, name))

    for worker in xrange(workers):
        worker = str(worker + 1)  # computed but never read afterwards
        CheckThread(queue, src_folder, dest_folder).start()

    queue.join()
def main(folder, worker=12, out='check_result'):
    """Run Check() and translate the outcome into an exit status.

    Returns 0 when Check() completes and 1 when it raises anything at all
    (the handler is a bare ``except``).
    """
    try:
        Check(folder, worker, out)
    except:
        return 1
    else:
        return 0
每个线程解析队列中的一个文件。
如何提高每个文件的解析速度?
答案 0 :(得分:0)
几点说明:
- 不要使用裸的 `except:`,始终指定要捕获的异常。
- `try: ... except: ...` 不是此处合适的结构;应使用 `try: ... finally: ...`,这样即使 `try:` 部分引发异常,`finally:` 部分也会被执行。
- `CheckThread.__init__()` 的部分参数未被使用。
- `CheckThread._prepare_check()` 做什么?
- `Check()` 的循环中,`worker` 变量被重新赋值后没有再被使用。

更改后的代码为:
import os
import Queue
import sys
import threading
import traceback
class CheckThread(threading.Thread):
    """Daemon worker that takes file names from a queue and checks each one.

    ``_prepare_check(file_name)`` is called for every queued name. It is not
    defined in this snippet and must be provided elsewhere (for example by a
    subclass).
    """

    def __init__(self, queue, src_folder, dest_folder='check_result'):
        """Store the work queue and the folders this worker operates on.

        Args:
            queue: queue object yielding file names; must offer ``get()``
                and ``task_done()``.
            src_folder: folder the queued files were collected from.
            dest_folder: folder name intended for results.
        """
        super(CheckThread, self).__init__()
        self._queue = queue
        # Keep the folders so _prepare_check can use them instead of the
        # arguments being silently discarded.
        self._src_folder = src_folder
        self._dest_folder = dest_folder
        self.daemon = True

    def run(self):
        """Process queued file names forever.

        A failure in ``_prepare_check`` is reported on stderr together with
        the offending file name, and the worker carries on with the next
        item. Letting the exception escape would end this thread; once every
        worker had ended that way, ``queue.join()`` in the caller would never
        return.
        """
        while True:
            file_name = self._queue.get()
            try:
                self._prepare_check(file_name)
            except Exception:
                # One write call so reports from several workers are less
                # likely to interleave.
                sys.stderr.write('check failed for %r:\n%s'
                                 % (file_name, traceback.format_exc()))
            finally:
                # Always acknowledge the item, otherwise join() blocks.
                self._queue.task_done()
def Check(src_folder, workers=12, dest_folder='check_result'):
    """Check every non-hidden file below ``src_folder`` with worker threads.

    All matching paths are queued first, then ``workers`` CheckThread
    instances are started and the call blocks until the queue reports that
    every path has been processed.

    Args:
        src_folder: root folder that is walked recursively.
        workers: number of CheckThread instances to start; at least 1.
        dest_folder: passed through to every CheckThread.

    Raises:
        ValueError: if ``workers`` is smaller than 1. Without any worker
            nothing would drain the queue and ``queue.join()`` would block
            forever as soon as one file had been queued.
    """
    if workers < 1:
        raise ValueError('workers must be at least 1, got %r' % (workers,))

    queue = Queue.Queue()
    for dirpath, _dirnames, filelist in os.walk(src_folder):
        for name in filelist:
            # Only the file name is tested; files inside dot-directories
            # are still queued.
            if name.startswith('.'):
                continue
            queue.put(os.path.join(dirpath, name))

    for _ in xrange(workers):
        CheckThread(queue, src_folder, dest_folder).start()

    queue.join()
def main(folder, worker=12, out='check_result'):
    """Entry point: check every file below ``folder``.

    Forwards its arguments to Check(). Nothing is returned, and any
    exception raised by Check() propagates to the caller.
    """
    Check(src_folder=folder, workers=worker, dest_folder=out)