如何高效地合并 concurrent.futures 并行执行的结果?

时间:2018-10-30 15:44:55

标签: python pandas concurrency parallel-processing

我有一个约 1 亿行的 pandas 数据框。在多核计算机上并行处理效果很好,每个核的利用率都达到 100%。但是,executor.map() 的结果是一个生成器,因此为了实际收集处理后的结果,我需要迭代该生成器。这非常非常慢(需要几个小时),部分是因为它只在单核上运行,部分是因为循环本身。实际上,它比 my_function() 中的实际处理要慢得多。

是否有更好的方法(可能是并发和/或矢量化的)?

编辑:将pandas 0.23.4(当前最新)与Python 3.7.0一起使用

my_function()

1 个答案:

答案 0 :(得分:1)

以下是与您的案例有关的基准:https://stackoverflow.com/a/31713471/5588279

如您所见,多次调用 concat(或 append)效率很低。您应该只调用一次 pd.concat(gen)。我相信底层实现会预先分配所需的全部内存。

而在您目前的做法中,每次拼接都会重新进行一次内存分配。