通过自我加入计算移动平均值

时间:2019-04-16 17:28:38

标签: mongodb mongodb-query aggregation-framework

我有以下查询,用于计算MySQL表上的移动平均值:

-- Centered moving average computed with a self-join:
--   m1 = the rows to report (x between 5000 and 15000),
--   m2 = the neighbouring rows averaged for each m1 row
--        (x within 5000 of m1.x, and between 0 and 20000).
SELECT
    m1.x      AS x,
    m1.y      AS y,
    AVG(m2.y) AS average
FROM measured_signal AS m1
INNER JOIN measured_signal AS m2
    ON m2.x BETWEEN m1.x - 5000 AND m1.x + 5000
WHERE m1.x BETWEEN 5000 AND 15000
  AND m2.x BETWEEN 0 AND 20000
GROUP BY m1.x

它工作正常,但是现在我正在迁移到MongoDB,我需要执行相同的操作。

我读过this question,内容非常相似,但没有涵盖我的具体情况。

到目前为止,我已经写下了以下管道:

// Attempted moving-average pipeline from the question (reported as not working).
// Idea: gather every row into one array, then derive a windowed average per row.
db.getCollection("measured_signal").aggregate([
  // Keep only documents with 0 < x <= 20000.
  { $match: {x: { $gt: 0, $lte: 20000 } } },
  { $sort: { x: 1 } },
  // `_id: null` puts every remaining document into a single group whose
  // `rows` array holds one { x, y } pair per document.
  { 
    $group:{
      _id: null,
      rows: { 
        $push: { x: "$x", y: "$y" }
      } 
    } 
  },
  {
    $addFields: {
      rows: {
        $map: {
          input: {
            $filter: {
              input: "$rows",
              // NOTE(review): both operands reference the same element
              // (`$$this.x`), so each comparison tests x against its own
              // x -/+ 5000 instead of against the x of a window's centre row.
              // The object also carries two operator keys ($gte and $lte);
              // an expression object is presumably expected to hold exactly
              // one operator — confirm against the MongoDB documentation.
              cond: {
                $gte: ["$$this.x", {$subtract: ["$$this.x", 5000]}],
                $lte: ["$$this.x", {$add: ["$$this.x", 5000]}]
              }
            }
          },
          in: {
            x: "$$this.x",
            y: "$$this.y",
            // NOTE(review): $avg receives the x of one element here, not the
            // y values of the rows inside a window.
            average: { $avg: "$$this.x" },
          }
        }
      }
    }
  },
  // Turn the `rows` array back into one document per element.
  { $unwind:  "$rows" },
  // NOTE(review): after $group/$unwind the values sit under `rows`
  // (i.e. `rows.x`); a filter on the top-level `x` presumably matches
  // nothing — confirm.
  { $match: {x: { $gt: 5000, $lte: 15000 } } }
],{allowDiskUse: true});

但它不起作用。

我应该尝试完全不同的东西吗?还是我该改变什么?感谢您的帮助。

编辑

为了更好地理解问题,我添加了一个输入数据示例

{x:3628, y: 0.1452},
{x:7256, y: 0.1358},
{x:10884, y: 0.1327},
{x:14512, y: 0.1285},
{x:18140, y: 0.1256},
{x:21768, y: 0.1268},
{x:25396, y: 0.1272},
{x:29024, y: 0.1301},
...

和所需的输出,考虑到窗口大小为5000:

{x:7256, y: 0.1358, average: 0.1379}, // average computed on rows between 2256 and 12256
{x:10884, y: 0.1327, average: 0.1323}, // average computed on rows between 5884 and 15884
{x:14512, y: 0.1285, average: 0.1289}, // average computed on rows between 9512 and 19512
{x:18140, y: 0.1256, average: 0.1270}, // average computed on rows between 13140 and 23140
{x:21768, y: 0.1268, average: 0.1265}, // average computed on rows between 16768 and 26768
{x:25396, y: 0.1272, average: 0.1280}, // average computed on rows between 20396 and 30396
...

1 个答案:

答案 0 :(得分:1)

按照您的SQL,以及我认为对它所作的“字面翻译”得到的MongoDB语句,对于问题中给出的八个文档,我实际上只得到三个结果。

在我看来,等价的语句实际上是:

// Moving average via a self-join, mirroring the SQL statement stage by stage:
//   outer $match   -> WHERE m1.x BETWEEN 5000 AND 15000
//   $lookup        -> JOIN measured_signal AS m2
//                       ON m2.x BETWEEN m1.x - 5000 AND m1.x + 5000
//   inner $match   -> ... AND m2.x BETWEEN 0 AND 20000
//   $unwind/$group -> GROUP BY m1.x with AVG(m2.y)
// SQL BETWEEN includes both end points, so every range uses $gte/$lte;
// $gt/$lt would silently drop rows that sit exactly on a boundary.
db.measured_signal.aggregate([
  { "$match": { "x": { "$gte": 5000, "$lte": 15000 } } },
  { "$lookup": {
    "from": "measured_signal",
    // Expose the outer document's x to the inner pipeline as $$x.
    "let": { "x": "$x" },
    "pipeline": [
      { "$match": {
        "x": { "$gte": 0, "$lte": 20000 },
        // $expr is required to compare a field of the joined document ($x)
        // with a variable taken from the outer document ($$x).
        "$expr": {
          "$and": [
            { "$gte": [ "$x", { "$subtract": [ "$$x", 5000 ] }] },
            { "$lte": [ "$x", { "$add": [ "$$x", 5000 ] }] }
          ]
        }
      }}
    ],
    "as": "results"
  }},
  // One output document per (outer row, matched row) pair, like a SQL JOIN.
  { "$unwind": "$results" },
  { "$group": {
     "_id": "$x",
     "y": { "$first": "$y" },
     "average": { "$avg": "$results.y" }
  }},
  // Rename the grouping key from _id back to x.
  { "$addFields": {
     "_id": "$$REMOVE",
     "x": "$_id"
  }},
  // $group output has no guaranteed order, so sort explicitly.
  { "$sort": { "x": 1 } }
]).map(({ x, y, average }) => ({ x, y, average }))

结果:

    {
            "x" : 7256,
            "y" : 0.1358,
            "average" : 0.1379
    },
    {
            "x" : 10884,
            "y" : 0.1327,
            "average" : 0.13233333333333333
    },
    {
            "x" : 14512,
            "y" : 0.1285,
            "average" : 0.12893333333333334
    }

如果您逐步梳理一遍,就会发现这是相当合乎逻辑的。

MongoDB中的聚合管道通常应该以$match条件开始。它基本上相当于声明式SQL语句中的WHERE子句,但在聚合管道中,这个“过滤”条件最先执行。需要注意的是,此时JOIN尚未执行,因此最初的$match只作用于集合/表的初始(即m1)视图。

接下来是JOIN。它通过$lookup完成,在这里我们可以构造一个用于“连接”的表达式,使其与SQL中给出的条件等价。WHERE的第二部分则包含在$lookup的pipeline参数内的$match中。这实际上意味着对外部(foreign)文档又应用了一个“过滤器”(在本例中外部集合就是集合自身,即“自连接”)。

另一点需要注意的是$lookup中的let参数,以及内部管道的$match中的$expr。它们使初始集合(即m1)中的值可以与外部(foreign)集合(即m2)中的值进行比较。如您所见,$expr内的表达式写法略有不同,因为它们是比较运算符(例如$gt和$lt)对应的“聚合表达式”版本,这些版本会针对被比较的值返回布尔值。简而言之,我们定义变量来引用原始文档中的值,并将其与外部集合中的值进行比较,以确定“连接”条件的一部分。

$lookup的输出始终是一个“数组”,它被添加到初始文档中,包含匹配到的外部(foreign)结果。即使只有一个结果,它也始终是数组。初始文档中存放该数组的新字段由as参数命名。按照SQL的字面语义,JOIN会产生非规范化(denormalized)的输出,即父文档会被复制到每一个匹配的外部子文档(child)上。与之对应的字面翻译是$unwind,但您也可以跳过该步骤,只需把后面使用$avg的那一行改为:

 "average": { "$avg": { "$avg": "$results.y" } }

接下来当然是$group。与SQL中一样,您要按初始集合文档中的x值(仍然称为x)进行GROUP BY。当然,MongoDB在这方面比SQL更“字面”(literal),因此对于$group语句(即GROUP BY)中_id之外的任何字段,都必须使用累加器。这意味着要使用$first运算符作为y值的适当“累加器”。

“平均值”当然由$avg得到:要么直接作用于$unwind产生的单个非规范化(denormalized)值,要么先作用于“数组”内容,再按“分组后的文档”计算。因此,在第二个示例中,$avg为这两个目的被指定了两次。

由于$group按惯例要求将其GROUP BY键命名为_id,因此如果要重命名,就需要一个$addFields阶段。这就是让MongoDB从聚合管道返回所需名称的方式,但我个人可能会在返回的结果中保留_id,只通过.map()或类似的操作进行重命名。上面的清单中也演示了这一点,因为$addFields和其他$project操作实际上会保留$group输出中已定义字段的顺序,这基本上意味着x将是输出文档中的最后一个字段,而不是第一个字段。

所以最后这几部分其实只是装饰性的(cosmetic),仅仅为了看到所需的结果并不一定要做。当然,$group的输出不像GROUP BY那样具有默认顺序,因此您需要在实际管道执行的末尾使用$sort;或者,如果结果足够小,也可以在转换为数组之后再对所得文档排序。

  

注意:由于$lookup中的pipeline表达式实际上是一个完整的管道,因此您可以(而且实际上可能应该)在结果数组通过as返回之前就执行$avg操作。不过,这并不会改变它仍然必须返回数组这一事实;但在“大型连接”结果的情况下,返回的数据量会大大减少,也安全得多,因为您只返回一个数字。

     

由于它“仍然”是一个数组,因此并不会改变如上所示对$unwind或“双重$avg”语句的需求。只是不返回大量最终并不需要的结果会更好一些。


为了说明两者实际上是同一回事,我把您的SQL代码放在一个独立的清单中运行,并在另一个清单中对MongoDB运行上述语句。如您所见,两者产生的结果相同。

NodeJS代码只是为了方便作者在两个引擎上运行。

SQL列表:

// Bind the module to a declared constant first. The previous chained
// assignment (`= Sequelize = require(...)`) created an implicit global,
// which throws in strict mode and in ES modules.
const Sequelize = require('sequelize');
const { Op, DOUBLE, SMALLINT } = Sequelize;

// Print a value as JSON indented by two spaces. The same function is handed
// to Sequelize as its `logging` callback.
const log = data => console.log(JSON.stringify(data, undefined, 2));
const logging = log;
const sequelize = new Sequelize('sqlite:dbname.db', { logging });

// Column definitions for one sample: an integer primary key plus x and y.
const measuredSignalAttributes = {
  id: { primaryKey: true, type: SMALLINT },
  x: DOUBLE,
  y: DOUBLE
};

// `freezeTableName` is set so that the table keeps the name given below.
const measuredSignalOptions = { freezeTableName: true };

const MeasuredSignal = sequelize.define(
  'measured_signal',
  measuredSignalAttributes,
  measuredSignalOptions
);

// Entry point of the SQL listing: recreate the table, insert the sample rows
// and print the result of the moving-average query.
(async function() {

  try {

    await sequelize.authenticate();
    // `force: true` recreates the table so that every run starts empty.
    await MeasuredSignal.sync({ force: true });

    // Insert all sample rows inside one transaction.
    await sequelize.transaction(transaction =>
      Promise.all(
        [
          {x:3628, y: 0.1452},
          {x:7256, y: 0.1358},
          {x:10884, y: 0.1327},
          {x:14512, y: 0.1285},
          {x:18140, y: 0.1256},
          {x:21768, y: 0.1268},
          {x:25396, y: 0.1272},
          {x:29024, y: 0.1301}
        ].map(d => MeasuredSignal.create(d, { transaction }))
      )
    );

    const output = await sequelize.query(
      `
        SELECT m1.x AS x, m1.y AS y, AVG(m2.y) as average
        FROM measured_signal as m1
        JOIN measured_signal as m2
        ON ( m2.x BETWEEN m1.x - 5000 AND m1.x + 5000)
        WHERE m1.x BETWEEN 5000 AND 15000 AND m2.x BETWEEN 0 AND 20000
        GROUP BY m1.x
      `, { type: sequelize.QueryTypes.SELECT });

    log(output);

  } catch (e) {
    console.error(e);
    // Signal the failure through the exit status instead of always exiting 0.
    process.exitCode = 1;
  } finally {
    // Close the connection and let Node exit on its own; an explicit
    // process.exit() here could cut off output that is still being written.
    await sequelize.close();
  }

})();

输出:

"Executing (default): SELECT 1+1 AS result"
"Executing (default): DROP TABLE IF EXISTS `measured_signal`;"
"Executing (default): CREATE TABLE IF NOT EXISTS `measured_signal` (`id` INTEGER PRIMARY KEY, `x` DOUBLE PRECISION, `y` DOUBLE PRECISION, `createdAt` DATETIME NOT NULL, `updatedAt` DATETIME NOT NULL);"
"Executing (default): PRAGMA INDEX_LIST(`measured_signal`)"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): BEGIN DEFERRED TRANSACTION;"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): COMMIT;"
"Executing (default): SELECT m1.x AS x, m1.y AS y, AVG(m2.y) as average\n        FROM measured_signal as m1\n        JOIN measured_signal as m2\n        ON ( m2.x BETWEEN m1.x - 5000 AND m1.x + 5000)\n        WHERE m1.x BETWEEN 5000 AND 15000 AND m2.x BETWEEN 0 AND 20000\n        GROUP BY m1.x"
[
  {
    "x": 7256,
    "y": 0.1358,
    "average": 0.13790000000000002
  },
  {
    "x": 10884,
    "y": 0.1327,
    "average": 0.13233333333333333
  },
  {
    "x": 14512,
    "y": 0.1285,
    "average": 0.12893333333333332
  }
]

MongoDB列表:

// Bind the module to a declared constant first. The previous chained
// assignment (`= mongoose = require(...)`) created an implicit global,
// which throws in strict mode and in ES modules.
const mongoose = require('mongoose');
const { Schema } = mongoose;

// Connection target and options passed to mongoose.connect().
const uri = 'mongodb://localhost:27017/test';
const opts = { useNewUrlParser: true };

// Global mongoose settings used by this listing.
mongoose.set('useFindAndModify', false);
mongoose.set('useCreateIndex', true);
mongoose.set('debug', true);

// One sample of the measured signal: numeric fields x and y.
const signalSchema = new Schema({ x: Number, y: Number });

// The third argument names the collection explicitly.
const MeasuredSignal = mongoose.model(
  'MeasuredSignal',
  signalSchema,
  'measured_signal'
);

// Print a value as JSON indented by two spaces.
const log = (data) => {
  const text = JSON.stringify(data, undefined, 2);
  console.log(text);
};

// Entry point of the MongoDB listing: reset the data, insert the sample
// documents and print the moving average computed by the aggregation.
(async function() {

  try {

    const conn = await mongoose.connect(uri, opts);

    // Start clean: empty the collection of every registered model.
    await Promise.all(
      Object.values(conn.models).map(m => m.deleteMany())
    );

    await MeasuredSignal.insertMany([
      {x:3628, y: 0.1452},
      {x:7256, y: 0.1358},
      {x:10884, y: 0.1327},
      {x:14512, y: 0.1285},
      {x:18140, y: 0.1256},
      {x:21768, y: 0.1268},
      {x:25396, y: 0.1272},
      {x:29024, y: 0.1301}
    ]);

    // SQL BETWEEN includes both end points, so the ranges use $gte/$lte;
    // $gt/$lt would drop rows that sit exactly on a boundary.
    const rows = await MeasuredSignal.aggregate([
      { "$match": { "x": { "$gte": 5000, "$lte": 15000 } } },
      { "$lookup": {
        "from": MeasuredSignal.collection.name,
        // Expose the outer document's x to the inner pipeline as $$x.
        "let": { "x": "$x" },
        "pipeline": [
          { "$match": {
            "x": { "$gte": 0, "$lte": 20000 },
            "$expr": {
              "$and": [
                { "$gte": [ "$x", { "$subtract": [ "$$x", 5000 ] } ] },
                { "$lte": [ "$x", { "$add": [ "$$x", 5000 ] } ] }
              ]
            }
          }}
        ],
        "as": "results"
      }},
      // No $unwind here: the inner $avg averages the `results` array of each
      // document, the outer $avg is the $group accumulator.
      { "$group": {
        "_id": "$x",
        "y": { "$first": "$y" },
        "average": { "$avg": { "$avg": "$results.y" } }
      }},
      { "$sort": { "_id": 1 } }
    ]);

    // Rename the grouping key _id to x and put it first in each document.
    const result = rows.map(({ _id: x, y, average }) => ({ x, y, average }));

    log(result);
  } catch(e) {
    console.error(e);
  } finally {
    // Await the disconnect so that a failure is not left as a floating promise.
    await mongoose.disconnect();
  }

})();