我正在从API端点获取数据,这是我应用程序的瓶颈。
我的问题是: 为什么添加.join()会大大降低程序速度? 我知道,为了确保主节点在子进程“正常”退出后退出,是安全的。我只是不明白为什么它比运行此单线程的速度还要慢。
在我要馈入多进程队列的文件中,包含n行文件名,然后将其提取到API。 然后将该请求的输出插入到另一个队列,该队列在mp.queue进程准备就绪(写入文件)之后运行。
from multiprocessing import Process, Queue
import time
import sys
import requests
import json
import xlrd
import openpyxl
def reader_proc (queue):
## Read from the queue; this will be spawned as a separate Process
while True:
msg = queue.get() # Read from the queue and do nothing
if (msg == 'DONE'):
break
r = search('filename:' + msg[1], token)
try:
if r.json()['totalHits'] > 0:
ad = r.json()['hits'][0]
entryQueue.put((msg[0], msg[1], 'x',
ad.get('metadata', {}).get('cf_eanCode', ''),
ad.get('metadata', {}).get('cf_assetPriority', ''),
ad.get('id', ''),
ad.get('metadata', {}).get('assetPath', ''),
r.json()['totalHits']
))
else:
entryQueue.put((msg[0], msg[1]))
except KeyError:
pass
def writer (filename, queue):
queue.put(filename) # Write to queue
queue.put('DONE')
missing_xcodes = '/missing_xcodes.txt'
xlxsf = 'xeancodes_mp.xlsx'
xfile = openpyxl.load_workbook(xlxsf)
sheet = xfile.get_sheet_by_name('xcodes')
# Multiprocess will throw data to this queue
entryQueue = Queue()
if __name__ == '__main__':
_start = time.time()
pqueue = Queue() # Processing queue for MP
with open(missing_xcodes) as f:
for i, filename in enumerate(f):
filename = filename.replace("\n", "")
filename = filename.strip()
reader_p = Process(target=reader_proc, args=((pqueue),))
reader_p.daemon = True
reader_p.start()
# reader_p.join() # I've kept reader here SUPER SLOW.
writer((i, filename), pqueue) # Send to queue.
# reader_p.join() # Tried here, too. Will skip some of the queue
elements that are not ready, so its not working as should..
time.sleep(2) # Need to use sleep here to make sure all child processes are
# done.
entryQueue.put('DONE') # This will add STOP to the queue as a last in FIFO
# queue.
while True:
msg = entryQueue.get() # Read from the queue
if (msg == 'DONE'):
break
nro = msg[0] + 1 # In excel things start @1
sheet['A' + str(nro)] = str(msg) # filename
xfile.save(xlxsf)
print(time.time() - _start)