我有以下查询,用于计算MySQL表上的移动平均值:
-- Centered moving average: for each row m1 with x in [5000, 15000], average the
-- y of every row m2 whose x lies within +/- 5000 of m1.x and within [0, 20000].
SELECT m1.x AS x, m1.y AS y, AVG(m2.y) AS average
FROM measured_signal AS m1
-- Self-join: pair each m1 row with the m2 rows that fall inside its window.
JOIN measured_signal AS m2 ON (m2.x BETWEEN m1.x - 5000 AND m1.x + 5000)
WHERE m1.x BETWEEN 5000 AND 15000 AND m2.x BETWEEN 0 AND 20000
GROUP BY m1.x
它工作正常,但是现在我正在迁移到MongoDB,我需要执行相同的操作。
我读过this question,内容非常相似,但没有涵盖我的具体情况。
到目前为止,我已经写下了以下管道:
// Asker's attempted pipeline (reported as not working). The NOTE(review)
// comments point at the stages that keep it from producing a moving average.
db.getCollection("measured_signal").aggregate([
// Keep only the rows with 0 < x <= 20000.
{ $match: {x: { $gt: 0, $lte: 20000 } } },
{ $sort: { x: 1 } },
// Collapse everything into a single document holding every {x, y} pair.
{
$group:{
_id: null,
rows: {
$push: { x: "$x", y: "$y" }
}
}
},
{
$addFields: {
rows: {
$map: {
input: {
$filter: {
input: "$rows",
// NOTE(review): `cond` puts two operator keys in one object, and both compare
// "$$this.x" with itself +/- 5000, so the test never looks at any other row.
cond: {
$gte: ["$$this.x", {$subtract: ["$$this.x", 5000]}],
$lte: ["$$this.x", {$add: ["$$this.x", 5000]}]
}
}
},
in: {
x: "$$this.x",
y: "$$this.y",
// NOTE(review): this averages the single x of the current element, not the y
// values of the neighbouring rows.
average: { $avg: "$$this.x" },
}
}
}
}
},
{ $unwind: "$rows" },
// NOTE(review): after $unwind the value lives at "rows.x"; this filter tests a
// top-level "x" field instead.
{ $match: {x: { $gt: 5000, $lte: 15000 } } }
],{allowDiskUse: true});
但它不起作用。
我应该尝试完全不同的东西吗?还是我该改变什么?感谢您的帮助。
编辑
为了更好地理解问题,我添加了一个输入数据示例
{x:3628, y: 0.1452},
{x:7256, y: 0.1358},
{x:10884, y: 0.1327},
{x:14512, y: 0.1285},
{x:18140, y: 0.1256},
{x:21768, y: 0.1268},
{x:25396, y: 0.1272},
{x:29024, y: 0.1301},
...
和所需的输出,考虑到窗口大小为5000:
{x:7256, y: 0.1358, average: 0.1379}, // average computed on rows between 2256 and 12256
{x:10884, y: 0.1327, average: 0.1323}, // average computed on rows between 5884 and 15884
{x:14512, y: 0.1285, average: 0.1289}, // average computed on rows between 9512 and 19512
{x:18140, y: 0.1256, average: 0.1270}, // average computed on rows between 13140 and 23140
{x:21768, y: 0.1268, average: 0.1265}, // average computed on rows between 16768 and 26768
{x:25396, y: 0.1272, average: 0.1280}, // average computed on rows between 20396 and 30396
...
答案 0 :(得分:1)
根据您的SQL,以及我认为是对它“逐字翻译”而成的MongoDB语句,对于问题中发布的八个文档,我实际上只得到三个结果。
我认为等价的语句实际上是:
// Literal translation of the SQL self-join into an aggregation pipeline.
// NOTE: $gt/$lt are strict comparisons, whereas SQL BETWEEN is inclusive on
// both ends.
db.measured_signal.aggregate([
  // WHERE on the "m1" side: applied before the join takes place.
  { $match: { x: { $gt: 5000, $lt: 15000 } } },

  // JOIN measured_signal AS m2: self-lookup driven by a correlated sub-pipeline.
  {
    $lookup: {
      from: "measured_signal",
      // Expose the outer ("m1") document's fields to the sub-pipeline as $$x / $$y.
      let: { x: "$x", y: "$y" },
      pipeline: [
        {
          $match: {
            // WHERE on the "m2" side.
            x: { $gt: 0, $lt: 20000 },
            // ON condition: m2.x must fall inside the window centred on m1.x.
            $expr: {
              $and: [
                { $gt: ["$x", { $subtract: ["$$x", 5000] }] },
                { $lt: ["$x", { $add: ["$$x", 5000] }] },
              ],
            },
          },
        },
      ],
      as: "results",
    },
  },

  // One output document per (m1, m2) pair, like the denormalized rows of a JOIN.
  { $unwind: "$results" },

  // GROUP BY m1.x; y is the same within a group, so $first simply carries it over.
  {
    $group: {
      _id: "$x",
      y: { $first: "$y" },
      average: { $avg: "$results.y" },
    },
  },

  // Cosmetic: expose the grouping key as `x` and drop `_id`.
  { $addFields: { _id: "$$REMOVE", x: "$_id" } },

  // $group output has no guaranteed order.
  { $sort: { x: 1 } },
]).map(({ x, y, average }) => ({ x, y, average }));
结果:
{
"x" : 7256,
"y" : 0.1358,
"average" : 0.1379
},
{
"x" : 10884,
"y" : 0.1327,
"average" : 0.13233333333333333
},
{
"x" : 14512,
"y" : 0.1285,
"average" : 0.12893333333333334
}
如果您一步一步推演一遍,这个结果是合乎逻辑的。
MongoDB中的聚合管道通常应该以$match条件开始。这基本上相当于声明式SQL语句中的WHERE子句,但在聚合管道中,这个“过滤器”条件是最先执行的。值得注意的是,此时JOIN尚未完成,因此初始的$match只看到集合/表的初始(即m1)视图。
接下来是JOIN。这是通过$lookup完成的,在这里我们实际上可以构造一个用于“联接”的表达式,只要它等价于SQL中给出的条件即可。这里WHERE的第二部分包含在$lookup的pipeline参数内的$match中。这实际上意味着对外部(foreign)文档又应用了一个“过滤器”(在本例中是“自联接”)。
要注意的另一件事是$lookup的let参数,以及内部管道的$match中的$expr。这允许将初始集合(即m1)中的值与外部(foreign)集合(即m2)中的值进行比较。如您所见,$expr内的表达式略有不同,因为这里的$gt和$lt是比较运算符的“聚合表达式”版本,它们对被比较的值返回Boolean值。简而言之,我们声明变量来引用原始文档中的值,并将其与外部集合中的值进行比较,以确定“联接”条件的一部分。
$lookup的输出始终是一个“数组”,它被添加到初始文档中,包含匹配到的外部(foreign)结果。即使只有一个结果,它也始终是一个数组。初始文档中包含此数组的新字段由as参数命名。按照SQL的字面含义,JOIN会产生非规范化(denormalized)的输出,其中父文档会为每个外部子文档(child)各复制一份。其字面翻译就是$unwind,但您也可以跳过该步骤,稍后只需将$avg那一行改为:
"average": { "$avg": { "$avg": "$results.y" } }
接下来当然是$group,就像在SQL中一样,您要按初始集合文档中的x值进行GROUP BY(在MongoDB中它仍然叫x)。当然,MongoDB在这方面比SQL更为“字面”(literal),因此对于$group语句中_id(即GROUP BY键)之外的任何字段,都必须使用累加器。这意味着要使用$first运算符作为y值的合适“累加器”。
“平均值”当然是由$avg获得的:要么直接基于$unwind产生的单个非规范化(denormalized)值,要么先基于“数组”内容、再按“每个分组文档”计算。因此,在第二个示例中,出于这两个目的,$avg被写了两次。
由于$group要求其GROUP BY键按惯例命名为_id,因此,如果要重命名,则需要一个$addFields阶段。这就是让MongoDB从聚合管道返回所需名称的方式,但是我个人可能会在返回的结果中保留_id,并仅通过.map()或类似的操作进行重命名。上面的清单中也演示了这一点,因为$addFields和其他$project操作实际上会保留$group输出中已定义字段的顺序。这基本上意味着x将是输出文档中的最后一个(last)字段,而不是第一个字段。
所以最后这几部分其实只是“装饰性”(cosmetic)的,您不必仅仅为了看到所需的结果而去做它们。当然,$group的输出不像GROUP BY那样有默认顺序,因此您需要在实际管道执行的末尾使用$sort;或者,如果结果足够小,也可以在把生成的文档转换为数组之后再对其排序。
注意:由于$lookup中的pipeline表达式实际上是一个完整的管道,因此您可以(而且实际上可能应该)在把结果数组返回到as之前先执行$avg操作。不过,这并不会改变它仍然必须返回数组的事实;但在“大型联接”结果的情况下,返回的结果会大大减少并且安全得多,因为您始终只返回一个数字。由于这“仍然”是一个数组,因此它不会改变对$unwind或“双重$avg”语句的需求,如上所示。这样做只是更好(nicer)一些,不必返回大量最终并不需要的结果。
为了说明这两者实际上是一回事,我用一个独立的清单运行您的SQL代码,用另一个清单在MongoDB上运行对应的语句。如您所见,两者产生的结果相同。
这里使用NodeJS代码只是为了方便作者在两个引擎上运行。
SQL列表(代码):
// Self-contained reference run: loads the sample rows into SQLite through
// Sequelize and executes the original SQL moving-average query, so that its
// output can be compared with the MongoDB pipeline.
const Sequelize = require('sequelize');
const { DOUBLE, SMALLINT } = Sequelize;

// Pretty-prints any value as JSON. It is also handed to Sequelize as the
// `logging` callback, which is why every executed statement is echoed as a
// quoted JSON string.
const log = data => console.log(JSON.stringify(data, undefined, 2));

const sequelize = new Sequelize('sqlite:dbname.db', { logging: log });

const MeasuredSignal = sequelize.define('measured_signal', {
  id: { type: SMALLINT, primaryKey: true },
  x: DOUBLE,
  y: DOUBLE
}, { freezeTableName: true });

// Sample rows taken from the question.
const samples = [
  { x: 3628, y: 0.1452 },
  { x: 7256, y: 0.1358 },
  { x: 10884, y: 0.1327 },
  { x: 14512, y: 0.1285 },
  { x: 18140, y: 0.1256 },
  { x: 21768, y: 0.1268 },
  { x: 25396, y: 0.1272 },
  { x: 29024, y: 0.1301 }
];

(async function() {
  try {
    await sequelize.authenticate();

    // Drop and recreate the table so every run starts from the same data.
    await MeasuredSignal.sync({ force: true });

    // Insert all samples inside a single transaction.
    await sequelize.transaction(transaction =>
      Promise.all(
        samples.map(d => MeasuredSignal.create(d, { transaction }))
      )
    );

    const output = await sequelize.query(
      `
      SELECT m1.x AS x, m1.y AS y, AVG(m2.y) as average
      FROM measured_signal as m1
      JOIN measured_signal as m2
      ON ( m2.x BETWEEN m1.x - 5000 AND m1.x + 5000)
      WHERE m1.x BETWEEN 5000 AND 15000 AND m2.x BETWEEN 0 AND 20000
      GROUP BY m1.x
      `, { type: sequelize.QueryTypes.SELECT });

    log(output);
  } catch (e) {
    console.error(e);
    // Surface the failure in the exit status instead of always exiting with 0.
    process.exitCode = 1;
  } finally {
    // Close the connection and let Node exit on its own, so pending output is
    // flushed rather than cut short by a forced process.exit().
    await sequelize.close();
  }
})();
SQL列表的输出:
"Executing (default): SELECT 1+1 AS result"
"Executing (default): DROP TABLE IF EXISTS `measured_signal`;"
"Executing (default): CREATE TABLE IF NOT EXISTS `measured_signal` (`id` INTEGER PRIMARY KEY, `x` DOUBLE PRECISION, `y` DOUBLE PRECISION, `createdAt` DATETIME NOT NULL, `updatedAt` DATETIME NOT NULL);"
"Executing (default): PRAGMA INDEX_LIST(`measured_signal`)"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): BEGIN DEFERRED TRANSACTION;"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): INSERT INTO `measured_signal` (`id`,`x`,`y`,`createdAt`,`updatedAt`) VALUES ($1,$2,$3,$4,$5);"
"Executing (7c7d0f4d-719a-4b4c-ad6a-5d5c209b8fa1): COMMIT;"
"Executing (default): SELECT m1.x AS x, m1.y AS y, AVG(m2.y) as average\n FROM measured_signal as m1\n JOIN measured_signal as m2\n ON ( m2.x BETWEEN m1.x - 5000 AND m1.x + 5000)\n WHERE m1.x BETWEEN 5000 AND 15000 AND m2.x BETWEEN 0 AND 20000\n GROUP BY m1.x"
[
{
"x": 7256,
"y": 0.1358,
"average": 0.13790000000000002
},
{
"x": 10884,
"y": 0.1327,
"average": 0.13233333333333333
},
{
"x": 14512,
"y": 0.1285,
"average": 0.12893333333333332
}
]
MongoDB列表(代码):
// Self-contained reference run: loads the sample documents into MongoDB through
// Mongoose and runs the aggregation-pipeline equivalent of the SQL query.
const mongoose = require('mongoose');
const { Schema } = mongoose;

const uri = 'mongodb://localhost:27017/test';
const opts = { useNewUrlParser: true };

mongoose.set('useFindAndModify', false);
mongoose.set('useCreateIndex', true);
mongoose.set('debug', true);

const signalSchema = new Schema({
  x: Number,
  y: Number
});

// The third argument fixes the collection name; the $lookup below reads it
// back through MeasuredSignal.collection.name to join the collection to itself.
const MeasuredSignal = mongoose.model('MeasuredSignal', signalSchema, 'measured_signal');

// Pretty-prints any value as JSON.
const log = data => console.log(JSON.stringify(data, undefined, 2));

(async function() {
  try {
    const conn = await mongoose.connect(uri, opts);

    // Empty the collection of every registered model so each run starts clean.
    await Promise.all(
      Object.values(conn.models).map(m => m.deleteMany())
    );

    await MeasuredSignal.insertMany([
      { x: 3628, y: 0.1452 },
      { x: 7256, y: 0.1358 },
      { x: 10884, y: 0.1327 },
      { x: 14512, y: 0.1285 },
      { x: 18140, y: 0.1256 },
      { x: 21768, y: 0.1268 },
      { x: 25396, y: 0.1272 },
      { x: 29024, y: 0.1301 }
    ]);

    const docs = await MeasuredSignal.aggregate([
      // WHERE on the "m1" side.
      { "$match": { "x": { "$gt": 5000, "$lt": 15000 } } },
      // Self-join with a correlated sub-pipeline.
      { "$lookup": {
        "from": MeasuredSignal.collection.name,
        "let": { "x": "$x", "y": "$y" },
        "pipeline": [
          { "$match": {
            "x": { "$gt": 0, "$lt": 20000 },
            "$expr": {
              "$and": [
                { "$gt": [ "$x", { "$subtract": [ "$$x", 5000 ] } ] },
                { "$lt": [ "$x", { "$add": [ "$$x", 5000 ] } ] }
              ]
            }
          }}
        ],
        "as": "results"
      }},
      // No $unwind here: the inner $avg averages each document's `results`
      // array, the outer $avg is the $group accumulator.
      { "$group": {
        "_id": "$x",
        "y": { "$first": "$y" },
        "average": { "$avg": { "$avg": "$results.y" } }
      }},
      { "$sort": { "_id": 1 } }
    ]);

    // Rename the grouping key `_id` to `x` on the client side.
    const result = docs.map(({ _id: x, y, average }) => ({ x, y, average }));
    log(result);
  } catch (e) {
    console.error(e);
    // Surface the failure in the exit status.
    process.exitCode = 1;
  } finally {
    // Await the disconnect so the promise is not left floating.
    await mongoose.disconnect();
  }
})();