Question

我有以下查询，用于计算MySQL表上的移动平均值：

-- Centered moving average: for every m1 row, average the y values of all m2
-- rows whose x lies within 5000 of m1.x (BETWEEN includes both end points).
SELECT m1.x AS x, m1.y AS y, AVG(m2.y) AS average
FROM measured_signal AS m1
-- Self-join: pair each m1 row with the m2 rows that fall inside its window.
JOIN measured_signal AS m2 ON (m2.x BETWEEN m1.x - 5000 AND m1.x + 5000)
-- Output rows are limited to 5000 <= m1.x <= 15000; window members to 0 <= m2.x <= 20000.
WHERE m1.x BETWEEN 5000 AND 15000 AND m2.x BETWEEN 0 AND 20000
GROUP BY m1.x

它工作正常，但是现在我正在迁移到MongoDB，我需要执行相同的操作。

我读过this question，内容非常相似，但没有涵盖我的具体情况。

到目前为止，我已经写下了以下管道：

// Asker's attempt at the moving average: gather every row into one array,
// then filter/map that array. Reported as not working; reviewer notes below
// point at the parts of the pipeline that look responsible.
db.getCollection("measured_signal").aggregate([
  // Keep rows with 0 < x <= 20000.
  { $match: {x: { $gt: 0, $lte: 20000 } } },
  { $sort: { x: 1 } },
  // Collapse the whole result into a single document holding an array `rows`.
  { 
    $group:{
      _id: null,
      rows: { 
        $push: { x: "$x", y: "$y" }
      } 
    } 
  },
  {
    $addFields: {
      rows: {
        $map: {
          input: {
            $filter: {
              input: "$rows",
              // NOTE(review): this object carries two operators ($gte and $lte);
              // an expression object normally holds one, so these presumably
              // need wrapping in $and.
              // NOTE(review): both comparisons test "$$this.x" against itself
              // minus/plus 5000, so they do not relate one row to another row's window.
              cond: {
                $gte: ["$$this.x", {$subtract: ["$$this.x", 5000]}],
                $lte: ["$$this.x", {$add: ["$$this.x", 5000]}]
              }
            }
          },
          in: {
            x: "$$this.x",
            y: "$$this.y",
            // NOTE(review): averages the single x of the current element, not
            // the y values of the rows inside the window.
            average: { $avg: "$$this.x" },
          }
        }
      }
    }
  },
  // One output document per element of `rows`.
  { $unwind:  "$rows" },
  // NOTE(review): after the $unwind the values sit under `rows.x`; this stage
  // filters on a top-level `x`, which the preceding $group did not emit.
  { $match: {x: { $gt: 5000, $lte: 15000 } } }
],{allowDiskUse: true});

但它不起作用。

我应该尝试完全不同的东西吗？还是我该改变什么？感谢您的帮助。

编辑

为了更好地理解问题，我添加了一个输入数据示例

{x:3628, y: 0.1452},
{x:7256, y: 0.1358},
{x:10884, y: 0.1327},
{x:14512, y: 0.1285},
{x:18140, y: 0.1256},
{x:21768, y: 0.1268},
{x:25396, y: 0.1272},
{x:29024, y: 0.1301},
...

和所需的输出，考虑到窗口大小为5000：

{x:7256, y: 0.1358, average: 0.1379}, // average computed on rows between 2256 and 12256
{x:10884, y: 0.1327, average: 0.1323}, // average computed on rows between 5884 and 15884
{x:14512, y: 0.1285, average: 0.1289}, // average computed on rows between 9512 and 19512
{x:18140, y: 0.1256, average: 0.1270}, // average computed on rows between 13140 and 23140
{x:21768, y: 0.1268, average: 0.1265}, // average computed on rows between 16768 and 26768
{x:25396, y: 0.1272, average: 0.1280}, // average computed on rows between 20396 and 30396
...

Answer 1

根据您的SQL，以及我认为是其“字面翻译”的MongoDB语句，对于问题中发布的八个文档，我实际上只得到了三个结果。

我认为等价的语句实际上是：

// MongoDB counterpart of the SQL self-join: for each "m1" document, look up
// the "m2" documents inside its +/-5000 window and average their y values.
// NOTE(review): $gt/$lt are strict comparisons, whereas the SQL BETWEEN is
// inclusive; results differ only when an x lands exactly on a bound.
db.measured_signal.aggregate([
  // WHERE m1.x ... : restrict the driving ("m1") documents first.
  { "$match": { "x": { "$gt": 5000, "$lt": 15000 } } },
  // JOIN: self-lookup into the same collection.
  { "$lookup": {
    "from": "measured_signal",
    // Expose the outer document's fields to the inner pipeline as $$x / $$y.
    // NOTE(review): $$y is declared but never referenced below.
    "let": { "x": "$x", "y": "$y" },
    "pipeline": [
      { "$match": {
        // WHERE m2.x ... : range filter on the joined ("m2") documents.
        "x": { "$gt": 0, "$lt": 20000 },
        // ON condition: inner "$x" must lie within 5000 of the outer "$$x".
        "$expr": {
          "$and": [
            { "$gt": [ "$x", { "$subtract": [ "$$x", 5000 ] }] },
            { "$lt": [ "$x", { "$add": [ "$$x", 5000 ] }] }
          ]
        }
      }},
    ],
    "as": "results"
  }},
  // Denormalize: one document per (m1, m2) pair, as a SQL JOIN would yield.
  { "$unwind": "$results" },
  // GROUP BY m1.x: keep the outer y and average the joined y values.
  { "$group": {
     "_id": "$x",
     "y": { "$first": "$y" },
     "average": { "$avg": "$results.y" }
  }},
  // Copy the grouping key into `x` and drop `_id`.
  { "$addFields": {
     "_id": "$$REMOVE",
     "x": "$_id"
  }},
  // $group output has no guaranteed order, so sort explicitly.
  { "$sort": { "x": 1 } }
// Rebuild each document so the keys appear in x, y, average order.
]).map(({ x, y, average }) => ({ x, y, average }))

结果：

    {
            "x" : 7256,
            "y" : 0.1358,
            "average" : 0.1379
    },
    {
            "x" : 10884,
            "y" : 0.1327,
            "average" : 0.13233333333333333
    },
    {
            "x" : 14512,
            "y" : 0.1285,
            "average" : 0.12893333333333334
    }

如果您按逻辑逐步推演一遍，这个结果是合理的。

MongoDB中的聚合管道通常应该以$match条件开始。它基本上相当于声明式SQL语句中的WHERE子句，只不过在聚合管道中，这个“过滤器”条件是最先执行的。值得注意的是，此时JOIN尚未进行，因此初始的$match只作用于集合/表的初始（即m1）视图。

接下来是JOIN。这通过$lookup完成，在这里我们可以创建一个表达式，使得只有满足SQL中所给条件的文档才会被“联接”。WHERE的第二部分放在$lookup的pipeline参数内的$match中。这实际上意味着对外部（foreign）文档又应用了一个“过滤器”（在本例中是“自联接”）。

要注意的另一件事是$lookup中的let参数，以及内部管道的$match中的$expr。这使得初始集合（即m1）中的值可以与外部（foreign）集合（即m2）中的值进行比较。如您所见，$expr内的表达式略有不同，因为这里的$gt和$lt是比较运算符的“聚合表达式”版本，它们对被比较的值返回布尔值。简而言之，我们用变量引用初始文档中的值，并将其与外部集合中的值进行比较，以确定“联接”条件的一部分。

$lookup的输出始终是一个“数组”，它被添加到初始文档中，包含匹配到的外部（foreign）结果。即使只有一个结果，它也始终是数组。初始文档中保存该数组的新字段由as参数命名。按照SQL的字面含义，JOIN会产生非规范化（denormalized）的输出，即父文档会被复制多份，分别附到每一个外部子文档（child）上。其字面翻译就是$unwind，但您也可以跳过该步骤，只需把后面含$avg的那一行改为：

 "average": { "$avg": { "$avg": "$results.y" } }

接下来当然是$group。就像在SQL中一样，您要按初始集合文档中的x值进行GROUP BY（在MongoDB中它仍然叫x）。当然，MongoDB在这方面比SQL更“字面化”（literal），因此在$group语句中，对于_id（即GROUP BY键）之外的任何字段都必须使用“累加器”。这意味着要使用$first运算符作为y值的合适“累加器”。

“平均值”当然是由$avg获得的：要么直接作用于$unwind产生的单个非规范化（denormalized）值，要么先作用于“数组”内容，再“按分组文档”计算。因此，在第二个示例中，$avg出于这两个目的被指定了两次。

由于$group按惯例要求其GROUP BY键命名为_id，因此如果要重命名，就需要一个$addFields阶段。这就是让MongoDB从聚合管道返回所需名称的方式，但我个人可能会在返回结果中保留_id，仅通过.map()或类似操作进行重命名。上面的清单也演示了这一点，因为$addFields和其他$project操作实际上会保留$group输出中已定义字段的顺序，这基本上意味着x将是输出文档中的最后一个（last）字段，而不是第一个字段。

所以最后几步其实只是装饰性的（cosmetic），您不必仅仅为了看到所需结果而执行它们。当然，$group的输出不像GROUP BY那样有默认顺序，因此您需要在实际管道执行的末尾使用$sort；或者，如果结果足够小，也可以在转换为数组之后再对生成的文档排序。

注意，由于$lookup中的pipeline表达式实际上是一个完整的管道，因此您可以、而且实际上或许应该在把结果数组返回到as字段之前，就在其中执行$avg操作。这并不会改变它仍然必须返回数组的事实，但在“大型联接”结果的情况下，返回的结果会大大减少，也安全得多，因为您只返回了一个数字。

由于它“仍然”是数组，因此这并不改变如上所示对$unwind或“双重”$avg语句的需求。只是不返回大量最终并不需要的结果会更好（nicer）。

为了说明这两者实际上是同一回事，我在一个独立的清单中运行了您的SQL代码，并在另一个清单中对MongoDB运行了上述语句。如您所见，两者产生相同的结果。

NodeJS代码只是为了方便作者在两个引擎上运行。

SQL列表

$avg

输出：

// SQL listing: runs the question's moving-average query through Sequelize
// against a local SQLite file, so its output can be compared with MongoDB's.
//
// Changes from the pasted one-liner: `Sequelize` and `log` are declared with
// `const` instead of being created as implicit globals by chained assignment
// (which throws in strict mode), and the unused `Op` import and unused
// `result` variable are dropped. The queries issued are otherwise unchanged.
const Sequelize = require('sequelize');

const { DOUBLE, SMALLINT } = Sequelize;

// Prints any value as indented JSON; also handed to Sequelize as its query logger.
const log = data => console.log(JSON.stringify(data, undefined, 2));

const sequelize = new Sequelize('sqlite:dbname.db', { logging: log });

// freezeTableName keeps the table name exactly as 'measured_signal'.
const MeasuredSignal = sequelize.define('measured_signal', {
  id: { type: SMALLINT, primaryKey: true },
  x: DOUBLE,
  y: DOUBLE
}, { freezeTableName: true });

(async function() {
  try {
    await sequelize.authenticate();
    // force: true drops and recreates the table on every run.
    await MeasuredSignal.sync({ force: true });

    // Insert the eight sample rows from the question inside one transaction.
    await sequelize.transaction(transaction =>
      Promise.all(
        [
          {x:3628, y: 0.1452},
          {x:7256, y: 0.1358},
          {x:10884, y: 0.1327},
          {x:14512, y: 0.1285},
          {x:18140, y: 0.1256},
          {x:21768, y: 0.1268},
          {x:25396, y: 0.1272},
          {x:29024, y: 0.1301}
        ].map(d => MeasuredSignal.create(d, { transaction }))
      )
    );

    // The query from the question, run as raw SQL.
    const output = await sequelize.query(
      `
      SELECT m1.x AS x, m1.y AS y, AVG(m2.y) as average
      FROM measured_signal as m1
      JOIN measured_signal as m2
      ON ( m2.x BETWEEN m1.x - 5000 AND m1.x + 5000)
      WHERE m1.x BETWEEN 5000 AND 15000 AND m2.x BETWEEN 0 AND 20000
      GROUP BY m1.x
      `,
      { type: sequelize.QueryTypes.SELECT }
    );

    log(output);
  } catch (e) {
    console.error(e);
  } finally {
    process.exit();
  }
})();

MongoDB列表

"Executing (default): SELECT 1+1 AS result" "Executing (default): DROP TABLE IF EXISTS `measured_signal`;" "Executing (default): CREATE TABLE IF NOT EXISTS `measured_signal` (`id` INTEGER PRIMARY KEY, `x` DOUBLE PRECISION, `y` DOUBLE PRECISION, `createdAt` DATETIME NOT NULL, `updatedAt` DATETIME NOT NULL);" "Executing (default): PRAGMA INDEX_LIST(`measured_signal`)" "Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): BEGIN DEFERRED TRANSACTION;" "Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);" "Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);" "Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);" "Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);" "Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);" "Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);" "Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);" "Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);" "Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): COMMIT;" "Executing (default): SELECT m1.x AS x, m1.y AS y, AVG(m2.y) as average\n FROM measured_signal as m1\n JOIN measured_signal as m2\n ON ( m2.x BETWEEN m1.x - 5000 AND m1.x + 5000)\n WHERE m1.x BETWEEN 5000 AND 15000 AND m2.x BETWEEN 0 AND 20000\n GROUP BY m1.x" [ { "x": 7256, "y": 0.1358, "average": 0.13790000000000002 }, 
{ "x": 10884, "y": 0.1327, "average": 0.13233333333333333 }, { "x": 14512, "y": 0.1285, "average": 0.12893333333333332 } ]

输出：

// MongoDB listing: loads the question's sample data with Mongoose and runs the
// $lookup-based moving-average pipeline. This variant skips $unwind and
// applies $avg twice (once over the looked-up array, once as the $group
// accumulator), then renames _id to x on the client.
//
// Changes from the pasted one-liner: `mongoose` is declared with `const`
// instead of being created as an implicit global by chained assignment, and
// the disconnect in `finally` is awaited rather than left as a floating promise.
const mongoose = require('mongoose');

const { Schema } = mongoose;

const uri = 'mongodb://localhost:27017/test';
const opts = { useNewUrlParser: true };

mongoose.set('useFindAndModify', false);
mongoose.set('useCreateIndex', true);
mongoose.set('debug', true);

const signalSchema = new Schema({
  x: Number,
  y: Number
});

// Third argument pins the collection name to 'measured_signal'.
const MeasuredSignal = mongoose.model('MeasuredSignal', signalSchema, 'measured_signal');

// Prints any value as indented JSON.
const log = data => console.log(JSON.stringify(data, undefined, 2));

(async function() {
  try {
    const conn = await mongoose.connect(uri, opts);

    // Start clean: empty every registered model's collection.
    await Promise.all(
      Object.entries(conn.models).map(([k,m]) => m.deleteMany())
    );

    await MeasuredSignal.insertMany([
      {x:3628, y: 0.1452},
      {x:7256, y: 0.1358},
      {x:10884, y: 0.1327},
      {x:14512, y: 0.1285},
      {x:18140, y: 0.1256},
      {x:21768, y: 0.1268},
      {x:25396, y: 0.1272},
      {x:29024, y: 0.1301}
    ]);

    let result = await MeasuredSignal.aggregate([
      // Restrict the driving ("m1") documents.
      { "$match": { "x": { "$gt": 5000, "$lt": 15000 } } },
      // Self-lookup: collect the "m2" documents inside each document's window.
      { "$lookup": {
        "from": MeasuredSignal.collection.name,
        "let": { "x": "$x", "y": "$y" },
        "pipeline": [
          { "$match": {
            "x": { "$gt": 0, "$lt": 20000 },
            "$expr": {
              "$and": [
                { "$gt": [ "$x", { "$subtract": [ "$$x", 5000 ] } ] },
                { "$lt": [ "$x", { "$add": [ "$$x", 5000 ] } ] }
              ]
            }
          }}
        ],
        "as": "results"
      }},
      // No $unwind: the inner $avg reduces the array, the outer one is the accumulator.
      { "$group": {
        "_id": "$x",
        "y": { "$first": "$y" },
        "average": { "$avg": { "$avg": "$results.y" } }
      }},
      { "$sort": { "_id": 1 } }
    ]);

    // Rename _id to x and fix the key order on the client.
    result = result.map(({ _id: x, y, average }) => ({ x, y, average }));

    log(result);
  } catch(e) {
    console.error(e);
  } finally {
    await mongoose.disconnect();
  }
})();

通过自我加入计算移动平均值

1 个答案: