I have a database recordset (around 1000 rows) and I am iterating over it, running an extra database query per record to enrich it with more data.
Doing that pushes the total processing time up to about 100 seconds.
What I would like to do is split the work across 2-4 processes.
I am using Python 2.7 for AWS Lambda compatibility.
def handler(event, context):
    try:
        records = connection.get_users()
        mandrill_client = open_mandrill_connection()
        mandrill_messages = get_mandrill_messages()
        mandrill_template = 'POINTS weekly-report-to-user'

        start_time = time.time()

        messages = build_messages(mandrill_messages, records)

        print("OVERALL: %s seconds ---" % (time.time() - start_time))

        send_mandrill_message(mandrill_client, mandrill_template, messages)
        connection.close_database_connection()
        return "Process Completed"
    except Exception as e:
        print(e)
Here is the function that I would like to run in threads:
def build_messages(messages, records):
    for record in records:
        record = dict(record)
        stream = get_user_stream(record)
        data = compile_loyalty_stream(stream)

        messages['to'].append({
            'email': record['email'],
            'type': 'to'
        })

        messages['merge_vars'].append({
            'rcpt': record['email'],
            'vars': [
                {
                    'name': 'total_points',
                    'content': record['total_points']
                },
                {
                    'name': 'total_week',
                    'content': record['week_points']
                },
                {
                    'name': 'stream_greek',
                    'content': data['el']
                },
                {
                    'name': 'stream_english',
                    'content': data['en']
                }
            ]
        })

    return messages
I tried importing the multiprocessing library:
from multiprocessing.pool import ThreadPool
I created a pool inside the try block and mapped the function over that pool:
pool = ThreadPool(4)

messages = pool.map(build_messages_in, itertools.izip(itertools.repeat(mandrill_messages), records))


def build_messages_in(a_b):
    build_msg(*a_b)


def build_msg(a, b):
    return build_messages(a, b)
def get_user_stream(record):
    response = []
    i = 0
    for mod, mod_id, act, p, act_created in izip(record['models'], record['model_ids'], record['actions'],
                                                 record['points'], record['action_creation']):
        information = get_reference(mod, mod_id)
        if information:
            response.append({
                'action': act,
                'points': p,
                'created': act_created,
                'info': information
            })
            if (act == 'invite_friend') \
                    or (act == 'donate') \
                    or (act == 'bonus_500_general') \
                    or (act == 'bonus_1000_general') \
                    or (act == 'bonus_500_cancel') \
                    or (act == 'bonus_1000_cancel'):
                response[i]['info']['date_ref'] = act_created
                response[i]['info']['slug'] = 'attiki'
            if (act == 'bonus_500_general') \
                    or (act == 'bonus_1000_general') \
                    or (act == 'bonus_500_cancel') \
                    or (act == 'bonus_1000_cancel'):
                response[i]['info']['title'] = ''
            i += 1
    return response
Finally, I removed the for loop from the build_messages function.
What I get is: 'NoneType' object is not iterable.
Is this the correct way of doing it?
Answer (score: 2)
Your code seems to go quite deep, so you cannot be sure that multithreading, applied at a high level, will bring any performance gain. It is therefore worth digging down to the part that is giving you the greatest latency and considering how to tackle that specific bottleneck. See here for more discussion on threading limitations.
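If that bottleneck turns out to be the extra database query made for each record, threads can still help there, since the interpreter lock is released while a thread waits on I/O. As an illustration only, here is a minimal sketch of a thread pool applied at the per-record level; the enrich helper is hypothetical and simply mirrors the calls made in the question's build_messages:

from multiprocessing.pool import ThreadPool

def enrich(record):
    # Hypothetical per-record worker: does only the I/O-bound lookups
    record = dict(record)
    stream = get_user_stream(record)  # the extra DB query from the question
    record['stream_data'] = compile_loyalty_stream(stream)
    return record

pool = ThreadPool(4)
try:
    # map returns a list with one enriched record per input record
    enriched_records = pool.map(enrich, records)
finally:
    pool.close()
    pool.join()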
If, for example, as we discussed in the comments, you can pinpoint a single task that is taking a long time, then you could try parallelising it with multiprocessing instead, to make the most of your CPU power. Here is a generic example that is hopefully simple enough to follow; it mirrors your Postgres queries without going into your own code base, which I think would be an unfeasible amount of effort.
import multiprocessing as mp
import time
import random
import datetime as dt

MAILCHIMP_RESPONSE = [x for x in range(1000)]


def chunks(l, n):
    n = max(1, n)
    return [l[i:i + n] for i in range(0, len(l), n)]


def db_query():
    ''' Delayed response from the database '''
    time.sleep(0.01)
    return random.random()


def do_queries(query_list):
    ''' The function that takes all your query ids and executes them
    sequentially for each id '''
    results = []
    for item in query_list:
        query = db_query()
        # Your super-quick processing of the Postgres response
        processing_result = query * 2
        results.append([item, processing_result])
    return results


def single_processing():
    ''' As you do now - equivalent to get_reference '''
    result_of_process = do_queries(MAILCHIMP_RESPONSE)
    return result_of_process


def multi_process(chunked_data, queue):
    ''' Same as single_processing, except we put our results in the queue
    rather than returning them '''
    result_of_process = do_queries(chunked_data)
    queue.put(result_of_process)


def multiprocess_handler():
    ''' Divide and conquer our db requests. We split the mailchimp response
    into a series of chunks and fire our queries simultaneously. Thus, each
    concurrent process has a smaller number of queries to make '''
    num_processes = 4  # depending on cores/resources
    size_chunk = len(MAILCHIMP_RESPONSE) / num_processes
    chunked_queries = chunks(MAILCHIMP_RESPONSE, size_chunk)

    queue = mp.Queue()  # This is going to combine all the results

    processes = [mp.Process(target=multi_process,
                            args=(chunked_queries[x], queue)) for x in range(num_processes)]

    for p in processes:
        p.start()

    divide_and_conquer_result = []
    for p in processes:
        divide_and_conquer_result.extend(queue.get())

    return divide_and_conquer_result


if __name__ == '__main__':
    start_single = dt.datetime.now()
    single_process = single_processing()
    print "Single process took {}".format(dt.datetime.now() - start_single)
    print "Number of records processed = {}".format(len(single_process))

    start_multi = dt.datetime.now()
    multi = multiprocess_handler()
    print "Multi process took {}".format(dt.datetime.now() - start_multi)
    print "Number of records processed = {}".format(len(multi))