I have a pandas DataFrame with roughly 100 million rows, over which I map my_function() in parallel. This works well on a multi-core machine, with every core running at 100% utilization. However,
the result of executor.map() is a generator, so to actually collect the processed results I iterate over that generator. That is very, very slow (hours), partly because it runs on a single core and partly because of the loop itself; in fact, it is even slower than the executor.map() step.

Is there a better way (possibly concurrent and/or vectorized)?

Edit: using pandas 0.23.4 (current latest) with Python 3.7.0.
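For reference, what I am doing looks roughly like the sketch below (simplified: the chunking, the worker count, and the body of my_function are placeholders, not my real code):

import concurrent.futures

import numpy as np
import pandas as pd

def my_function(chunk):
    # placeholder for the real per-chunk processing
    return chunk * 2

def process(df, n_workers=8):
    chunks = np.array_split(df, n_workers * 4)  # illustrative chunking
    out = pd.DataFrame()
    with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor:
        # executor.map() submits the chunks to worker processes and
        # returns a generator over the processed results
        for result in executor.map(my_function, chunks):
            # collecting the results one by one is the slow, single-core part
            out = out.append(result)
    return out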
Answer 0 (score: 1)
Here is a benchmark related to your case: https://stackoverflow.com/a/31713471/5588279

As you can see, concatenating (appending) many times is inefficient. You should just do pd.concat(gen); I believe the underlying implementation will pre-allocate all the memory it needs. In your case, memory is allocated anew on every append.
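A minimal sketch of what I mean, reusing the placeholder setup from the sketch in the question:

import concurrent.futures

import numpy as np
import pandas as pd

def my_function(chunk):
    # placeholder for the real per-chunk processing
    return chunk * 2

def process_concat(df, n_workers=8):
    chunks = np.array_split(df, n_workers * 4)  # illustrative chunking
    with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor:
        gen = executor.map(my_function, chunks)  # generator of processed chunks
        # hand the generator straight to pd.concat: the final frame is assembled
        # in one pass instead of re-allocating and copying on every append
        return pd.concat(gen)

The explicit collection loop disappears entirely; pd.concat consumes the generator and builds the result in a single concatenation.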