将ObjectId与$ graphLookup

时间:2018-05-18 01:18:51

标签: mongodb mongodb-query aggregation-framework

我正在尝试运行$graphLookup,如下面的版本所示:

$graphLookup sample

目标是,给定一个特定的记录(在那里注释$match),通过immediateAncestors属性检索它的完整“路径”。如你所见,它没有发生。

我在此处介绍了$convert来处理来自_id的集合string,认为可以与来自_id记录的immediateAncestors“匹配”列表(string)。

所以,我确实使用不同的数据运行了另一个测试(没有涉及ObjectId):

db.nodos.insert({"id":5,"name":"cinco","children":[{"id":4}]})
db.nodos.insert({"id":4,"name":"quatro","ancestors":[{"id":5}],"children":[{"id":3}]})
db.nodos.insert({"id":6,"name":"seis","children":[{"id":3}]})
db.nodos.insert({"id":1,"name":"um","children":[{"id":2}]})
db.nodos.insert({"id":2,"name":"dois","ancestors":[{"id":1}],"children":[{"id":3}]})
db.nodos.insert({"id":3,"name":"três","ancestors":[{"id":2},{"id":4},{"id":6}]})
db.nodos.insert({"id":7,"name":"sete","children":[{"id":5}]})

查询:

db.nodos.aggregate( [
  { $match: { "id": 3 } },
  { $graphLookup: {
      from: "nodos",
      startWith: "$ancestors.id",
      connectFromField: "ancestors.id",
      connectToField: "id",
      as: "ANCESTORS_FROM_BEGINNING"
    }
  },
  { $project: {
      "name": 1,
      "id": 1,
      "ANCESTORS_FROM_BEGINNING": "$ANCESTORS_FROM_BEGINNING.id"
    }
  }
] )

...输出我期望的内容(直接和间接连接到id 3的那五个记录):

{
    "_id" : ObjectId("5afe270fb4719112b613f1b4"),
    "id" : 3.0,
    "name" : "três",
    "ANCESTORS_FROM_BEGINNING" : [ 
        1.0, 
        4.0, 
        6.0, 
        5.0, 
        2.0
    ]
}

问题是:有一种方法可以实现我在开始时提到的目标吗?

我正在运行Mongo 3.7.9(来自官方Docker)

提前致谢!

1 个答案:

答案 0 :(得分:5)

您目前正在使用MongoDB的开发版本,该版本具有一些预期将与MongoDB 4.0一起发布的功能作为正式版本。请注意,某些功能可能会在最终版本发布之前发生变化,因此生产代码在您提交之前应该知道这一点。

为什么$ convert在这里失败

解释此问题的最佳方法可能是查看您更改过的示例,但替换为_id和"字符串"的ObjectId值。对于阵列下的人:

{
  "_id" : ObjectId("5afe5763419503c46544e272"),
   "name" : "cinco",
   "children" : [ { "_id" : "5afe5763419503c46544e273" } ]
},
{
  "_id" : ObjectId("5afe5763419503c46544e273"),
  "name" : "quatro",
  "ancestors" : [ { "_id" : "5afe5763419503c46544e272" } ],
  "children" : [ { "_id" : "5afe5763419503c46544e277" } ]
},
{ 
  "_id" : ObjectId("5afe5763419503c46544e274"),
  "name" : "seis",
  "children" : [ { "_id" : "5afe5763419503c46544e277" } ]
},
{ 
  "_id" : ObjectId("5afe5763419503c46544e275"),
  "name" : "um",
  "children" : [ { "_id" : "5afe5763419503c46544e276" } ]
}
{
  "_id" : ObjectId("5afe5763419503c46544e276"),
  "name" : "dois",
  "ancestors" : [ { "_id" : "5afe5763419503c46544e275" } ],
  "children" : [ { "_id" : "5afe5763419503c46544e277" } ]
},
{ 
  "_id" : ObjectId("5afe5763419503c46544e277"),
  "name" : "três",
  "ancestors" : [
    { "_id" : "5afe5763419503c46544e273" },
    { "_id" : "5afe5763419503c46544e274" },
    { "_id" : "5afe5763419503c46544e276" }
  ]
},
{ 
  "_id" : ObjectId("5afe5764419503c46544e278"),
  "name" : "sete",
  "children" : [ { "_id" : "5afe5763419503c46544e272" } ]
}

这应该对您尝试使用的内容进行一般模拟。

您尝试将_id值转换为"字符串"在进入$project阶段之前通过$graphLookup。失败的原因是你在"内完成了$project"这个问题是,"from"选项中$graphLookup的来源仍然是未经更改的集合,因此您无法在后续"查找&#34上获得正确的详细信息;迭代。

db.strcoll.aggregate([
  { "$match": { "name": "três" } },
  { "$addFields": {
    "_id": { "$toString": "$_id" }
  }},
  { "$graphLookup": {
    "from": "strcoll",
    "startWith": "$ancestors._id",
    "connectFromField": "ancestors._id",
    "connectToField": "_id",
    "as": "ANCESTORS_FROM_BEGINNING"
  }},
  { "$project": {
    "name": 1,
    "ANCESTORS_FROM_BEGINNING": "$ANCESTORS_FROM_BEGINNING._id"
  }}
])

"查找"不匹配因此:

{
        "_id" : "5afe5763419503c46544e277",
        "name" : "três",
        "ANCESTORS_FROM_BEGINNING" : [ ]
}

"修补"问题

然而,这是核心问题,而不是$convert或它本身的别名失败。为了使这个实际工作,我们可以创建一个"view",它为了输入而将自己呈现为一个集合。

我会以相反的方式执行此操作并转换"字符串"通过ObjectId$toObjectId

db.createView("idview","strcoll",[
  { "$addFields": {
    "ancestors": {
      "$ifNull": [ 
        { "$map": {
          "input": "$ancestors",
          "in": { "_id": { "$toObjectId": "$$this._id" } }
        }},
        "$$REMOVE"
      ]
    },
    "children": {
      "$ifNull": [
        { "$map": {
          "input": "$children",
          "in": { "_id": { "$toObjectId": "$$this._id" } }
        }},
        "$$REMOVE"
      ]
    }
  }}
])

然而,使用"view"意味着数据始终与转换后的值一致。所以使用视图进行以下聚合:

db.idview.aggregate([
  { "$match": { "name": "três" } },
  { "$graphLookup": {
    "from": "idview",
    "startWith": "$ancestors._id",
    "connectFromField": "ancestors._id",
    "connectToField": "_id",
    "as": "ANCESTORS_FROM_BEGINNING"
  }},
  { "$project": {
    "name": 1,
    "ANCESTORS_FROM_BEGINNING": "$ANCESTORS_FROM_BEGINNING._id"
  }}
])

返回预期的输出:

{
    "_id" : ObjectId("5afe5763419503c46544e277"),
    "name" : "três",
    "ANCESTORS_FROM_BEGINNING" : [
        ObjectId("5afe5763419503c46544e275"),
        ObjectId("5afe5763419503c46544e273"),
        ObjectId("5afe5763419503c46544e274"),
        ObjectId("5afe5763419503c46544e276"),
        ObjectId("5afe5763419503c46544e272")
    ]
}

解决问题

所有这些都说明了,这里真正的问题是你有一些"看起来像"一个ObjectId值,实际上有效ObjectId,但它已被记录为"字符串"。所有工作的基本问题应该是两个"类型"是不一样的,这会导致平等不匹配,因为"加入"正在尝试。

因此真正的修复方法仍然与以往一样,而是通过数据并修复它以便"字符串"实际上也是ObjectId值。然后,它们将与它们要引用的_id键相匹配,并且您节省了大量的存储空间,因为ObjectId占用的存储空间比它要少得多。以十六进制字符表示的字符串表示。

使用MongoDB 4.0方法,"" 实际上可以使用"$toObjectId"来编写新的集合,就像我们创建的一样"视图"早期:

db.strcoll.aggregate([
  { "$addFields": {
    "ancestors": {
      "$ifNull": [ 
        { "$map": {
          "input": "$ancestors",
          "in": { "_id": { "$toObjectId": "$$this._id" } }
        }},
        "$$REMOVE"
      ]
    },
    "children": {
      "$ifNull": [
        { "$map": {
          "input": "$children",
          "in": { "_id": { "$toObjectId": "$$this._id" } }
        }},
        "$$REMOVE"
      ]
    }
  }}
  { "$out": "fixedcol" }
])

当然,或者你需要"保持相同的集合,然后传统的"循环和更新"与一直需要的一样:

var updates = [];

db.strcoll.find().forEach(doc => {
  var update = { '$set': {} };

  if ( doc.hasOwnProperty('children') )
    update.$set.children = doc.children.map(e => ({ _id: new ObjectId(e._id) }));
  if ( doc.hasOwnProperty('ancestors') )
    update.$set.ancestors = doc.ancestors.map(e => ({ _id: new ObjectId(e._id) }));

  updates.push({
    "updateOne": {
      "filter": { "_id": doc._id },
      update
    }
  });

  if ( updates.length > 1000 ) {
    db.strcoll.bulkWrite(updates);
    updates = [];
  }

})

if ( updates.length > 0 ) {
  db.strcoll.bulkWrite(updates);
  updates = [];
}

这实际上是一个" sledgehammer"由于实际上一次覆盖了整个阵列。对于生产环境来说不是一个好主意,但足以作为本练习目的的演示。

结论

因此,虽然MongoDB 4.0将添加这些" cast"这些功能确实非常有用,它们的实际意图并不适用于这种情况。事实上它们更有用,如转换"转换"使用聚合管道的新集合,而不是大多数其他可能的用途。

虽然我们"可以" 创建"视图"它改变了数据类型,使$lookup$graphLookup之类的东西适用于实际收集数据不同的地方,这实际上只是一个" band-aid" 关于真正的问题,因为数据类型确实不应该有所不同,实际上应该永久转换。

使用"视图"实际上意味着构建的聚合管道需要有效地运行每个时间"集合" (实际上是"视图")被访问,这会产生真正的开销。

避免开销通常是设计目标,因此纠正此类数据存储错误对于从应用程序中获取真正的性能是必不可少的,而不仅仅是使用"蛮力"这只会让事情变慢。

更安全的转换"应用"匹配"的脚本更新每个数组元素。这里的代码需要NodeJS v10.x和最新版本的MongoDB节点驱动程序3.1.x:

const { MongoClient, ObjectID: ObjectId } = require('mongodb');
const EJSON = require('mongodb-extended-json');

const uri = 'mongodb://localhost/';

const log = data => console.log(EJSON.stringify(data, undefined, 2));

(async function() {

  try {

    const client = await MongoClient.connect(uri);
    let db = client.db('test');
    let coll = db.collection('strcoll');

    let fields = ["ancestors", "children"];

    let cursor = coll.find({
      $or: fields.map(f => ({ [`${f}._id`]: { "$type": "string" } }))
    }).project(fields.reduce((o,f) => ({ ...o, [f]: 1 }),{}));

    let batch = [];

    for await ( let { _id, ...doc } of cursor ) {

      let $set = {};
      let arrayFilters = [];

      for ( const f of fields ) {
        if ( doc.hasOwnProperty(f) ) {
          $set = { ...$set,
            ...doc[f].reduce((o,{ _id },i) =>
              ({ ...o, [`${f}.$[${f.substr(0,1)}${i}]._id`]: ObjectId(_id) }),
              {})
          };

          arrayFilters = [ ...arrayFilters,
            ...doc[f].map(({ _id },i) =>
              ({ [`${f.substr(0,1)}${i}._id`]: _id }))
          ];
        }
      }

      if (arrayFilters.length > 0)
        batch = [ ...batch,
          { updateOne: { filter: { _id }, update: { $set }, arrayFilters } }
        ];

      if ( batch.length > 1000 ) {
        let result = await coll.bulkWrite(batch);
        batch = [];
      }

    }

    if ( batch.length > 0 ) {
      log({ batch });
      let result = await coll.bulkWrite(batch);
      log({ result });
    }

    await client.close();

  } catch(e) {
    console.error(e)
  } finally {
    process.exit()
  }

})()

为七个文档生成并执行这些批量操作:

{
  "updateOne": {
    "filter": {
      "_id": {
        "$oid": "5afe5763419503c46544e272"
      }
    },
    "update": {
      "$set": {
        "children.$[c0]._id": {
          "$oid": "5afe5763419503c46544e273"
        }
      }
    },
    "arrayFilters": [
      {
        "c0._id": "5afe5763419503c46544e273"
      }
    ]
  }
},
{
  "updateOne": {
    "filter": {
      "_id": {
        "$oid": "5afe5763419503c46544e273"
      }
    },
    "update": {
      "$set": {
        "ancestors.$[a0]._id": {
          "$oid": "5afe5763419503c46544e272"
        },
        "children.$[c0]._id": {
          "$oid": "5afe5763419503c46544e277"
        }
      }
    },
    "arrayFilters": [
      {
        "a0._id": "5afe5763419503c46544e272"
      },
      {
        "c0._id": "5afe5763419503c46544e277"
      }
    ]
  }
},
{
  "updateOne": {
    "filter": {
      "_id": {
        "$oid": "5afe5763419503c46544e274"
      }
    },
    "update": {
      "$set": {
        "children.$[c0]._id": {
          "$oid": "5afe5763419503c46544e277"
        }
      }
    },
    "arrayFilters": [
      {
        "c0._id": "5afe5763419503c46544e277"
      }
    ]
  }
},
{
  "updateOne": {
    "filter": {
      "_id": {
        "$oid": "5afe5763419503c46544e275"
      }
    },
    "update": {
      "$set": {
        "children.$[c0]._id": {
          "$oid": "5afe5763419503c46544e276"
        }
      }
    },
    "arrayFilters": [
      {
        "c0._id": "5afe5763419503c46544e276"
      }
    ]
  }
},
{
  "updateOne": {
    "filter": {
      "_id": {
        "$oid": "5afe5763419503c46544e276"
      }
    },
    "update": {
      "$set": {
        "ancestors.$[a0]._id": {
          "$oid": "5afe5763419503c46544e275"
        },
        "children.$[c0]._id": {
          "$oid": "5afe5763419503c46544e277"
        }
      }
    },
    "arrayFilters": [
      {
        "a0._id": "5afe5763419503c46544e275"
      },
      {
        "c0._id": "5afe5763419503c46544e277"
      }
    ]
  }
},
{
  "updateOne": {
    "filter": {
      "_id": {
        "$oid": "5afe5763419503c46544e277"
      }
    },
    "update": {
      "$set": {
        "ancestors.$[a0]._id": {
          "$oid": "5afe5763419503c46544e273"
        },
        "ancestors.$[a1]._id": {
          "$oid": "5afe5763419503c46544e274"
        },
        "ancestors.$[a2]._id": {
          "$oid": "5afe5763419503c46544e276"
        }
      }
    },
    "arrayFilters": [
      {
        "a0._id": "5afe5763419503c46544e273"
      },
      {
        "a1._id": "5afe5763419503c46544e274"
      },
      {
        "a2._id": "5afe5763419503c46544e276"
      }
    ]
  }
},
{
  "updateOne": {
    "filter": {
      "_id": {
        "$oid": "5afe5764419503c46544e278"
      }
    },
    "update": {
      "$set": {
        "children.$[c0]._id": {
          "$oid": "5afe5763419503c46544e272"
        }
      }
    },
    "arrayFilters": [
      {
        "c0._id": "5afe5763419503c46544e272"
      }
    ]
  }
}