如何在mongo查询中展开两个数组

时间:2015-01-05 17:41:58

标签: java mongodb mongodb-query aggregation-framework

我在mongodb中有这样的集合:

{
"_id" : ObjectId("5490a00879dc6a138dcefb0f"),
"Date" : 20141012,
"Type" : "Twitter",
"Entities" : [ 
    {
        "ID" : 2,
        "Name" : "test1",
        "Sentiment" : {
            "Value" : 0.1,
            "Neutral" : 12
        }
     }
],
"Topics" : [ 
    {
        "ID" : 1,
        "Name" : "Test2",
        "Sentiment" : {
            "Value" : 0.5,
            "Neutral" : 1
        }
    }
]
}

现在我需要展开(unwind)Topics 和 Entities 这两个数组,然后按 Date 分组并对所有情绪值(Sentiment.Value)求和,所以我写了下面的代码:

    // First attempt from the question: build an $unwind stage, group by Date and
    // sum the Entities sentiment values.
    DBObject unwind = new BasicDBObject("$unwind", "$Entities"); 
    // NOTE(review): this writes to the same "$unwind" key of the same object, so it
    // presumably replaces "$Entities" with "$Topics" rather than adding a second
    // stage -- confirm against BasicDBObject.put. The question reports a sum of 0
    // while this line is present.
    unwind.put("$unwind", "$Topics");
    collectionG = db.getCollection("GraphDataCollection");
    // Group key is the Date field; "value" accumulates Entities.Sentiment.Value.
    DBObject groupFields = new BasicDBObject( "_id", "$Date");
    groupFields.put("value", new BasicDBObject( "$sum", "$Entities.Sentiment.Value"));
    DBObject groupBy = new BasicDBObject("$group", groupFields );
    // `where` is defined outside this snippet (presumably a $match stage -- TODO confirm).
    AggregationOutput output = collectionG.aggregate(where,unwind, groupBy);

现在的问题是,情绪值的总和始终返回 0;但如果我删除下面这一行:

    unwind.put("$unwind", "$Topics");

它就能正常工作。所以我的问题是:如何在一次聚合中展开两个数组?

更新:

我改变了我的代码如下:

// Second attempt from the question: one $unwind stage object per array.
DBObject unwind = new BasicDBObject("$unwind", "$Entities"); // $unwind stage for the Entities array
    DBObject unwindT = new BasicDBObject("$unwind", "$Topics"); // $unwind stage for the Topics array
    collectionG = db.getCollection("GraphDataCollection");
    // Group key is the Date field; "value" sums the Entities sentiment values and
    // "value1" sums the Topics sentiment values.
    DBObject groupFields = new BasicDBObject( "_id", "$Date");
   groupFields.put("value", new BasicDBObject( "$sum", "$Entities.Sentiment.Value"));
    groupFields.put("value1", new BasicDBObject( "$sum", "$Topics.Sentiment.Value"));
    DBObject groupBy = new BasicDBObject("$group", groupFields );
    // NOTE: this list is built but never passed to aggregate() below.
    List<DBObject> pipeline = Arrays.asList(unwind, unwindT);
    // Sort the grouped results by _id (the Date value) ascending.
    DBObject sort = new BasicDBObject("$sort", new BasicDBObject("_id", 1));
    // `where` is defined outside this snippet (presumably a $match stage -- TODO confirm).
    AggregationOutput output = collectionG.aggregate(where,unwind,unwindT, groupBy,sort);

但问题是,在添加下面这一行之后:

groupFields.put("value1", new BasicDBObject( "$sum", "$Topics.Sentiment.Value"));

返回的value1和value的数字不正确,我认为我没有正确解开。有人可以帮忙吗?

3 个答案:

答案 0 :(得分:1)

这是mongo查询(不是java):

// Combined total: per Date, add the Entities and Topics sentiment values of every
// unwound document and sum the results.
// Caveat: two consecutive $unwind stages emit one document per (entity, topic)
// pair, so the total is inflated once either array holds more than one element.
db.test.aggregate([
    { $unwind: '$Entities' },
    { $unwind: '$Topics' },
    {
        $group: {
            _id: { 'Date': '$Date' },
            sum: {
                $sum: { $add: ['$Entities.Sentiment.Value', '$Topics.Sentiment.Value'] }
            }
        }
    }
])

// Separate totals: per Date, value1 sums the Entities sentiment values and
// value2 sums the Topics sentiment values.
// Caveat: two consecutive $unwind stages emit one document per (entity, topic)
// pair, so each total is inflated once the other array holds more than one element.
db.test.aggregate([
    { $unwind: '$Entities' },
    { $unwind: '$Topics' },
    {
        $group: {
            _id: { 'Date': '$Date' },
            value1: { $sum: '$Entities.Sentiment.Value' },
            value2: { $sum: '$Topics.Sentiment.Value' }
        }
    }
])

Java:

/**
 * Prints, per {@code Date}, the sum of the Entities and Topics sentiment values added together.
 *
 * <p>Both arrays are unwound before grouping, so the group stage sees one document per
 * (entity, topic) pair; the total is therefore inflated once either array holds more
 * than one element.
 *
 * @param coll collection to run the aggregation on
 */
private static void sumOfTopicsAndEntities(DBCollection coll) {
    DBObject unwind1 = new BasicDBObject("$unwind", "$Entities");
    DBObject unwind2 = new BasicDBObject("$unwind", "$Topics");

    // Operands of $add: the sentiment value taken from each unwound array element.
    ArrayList<String> fields = new ArrayList<String>();
    fields.add("$Entities.Sentiment.Value");
    fields.add("$Topics.Sentiment.Value");

    // $group stage: key on Date, accumulate the per-document $add result.
    DBObject groupFields = new BasicDBObject("_id", "$Date");
    BasicDBObject add = new BasicDBObject("$add", fields);

    groupFields.put("sum", new BasicDBObject("$sum", add));
    DBObject group = new BasicDBObject("$group", groupFields);

    // run aggregation
    AggregationOutput output = coll.aggregate(unwind1, unwind2, group);

    // result: { "serverUsed" : "/127.0.0.1:27017" , "result" : [ { "_id" : 2.0141012E7 , "sum" : 0.6}] , "ok" : 1.0}
    System.out.println(output);
}

/**
 * Prints, per {@code Date}, two separate totals: {@code value1} for the Entities sentiment
 * values and {@code value2} for the Topics sentiment values.
 *
 * <p>Both arrays are unwound before grouping, so each total is inflated once the other
 * array holds more than one element.
 *
 * @param coll collection to run the aggregation on
 */
private static void seperatedValues(DBCollection coll) {
    DBObject entitiesStage = new BasicDBObject("$unwind", "$Entities");
    DBObject topicsStage = new BasicDBObject("$unwind", "$Topics");

    // $group stage: key on Date, one $sum accumulator per source array.
    DBObject accumulators = new BasicDBObject("_id", "$Date")
            .append("value1", new BasicDBObject("$sum", "$Entities.Sentiment.Value"))
            .append("value2", new BasicDBObject("$sum", "$Topics.Sentiment.Value"));
    DBObject groupStage = new BasicDBObject("$group", accumulators);

    AggregationOutput output = coll.aggregate(entitiesStage, topicsStage, groupStage);

    // result: { "serverUsed" : "/127.0.0.1:27017" , "result" : [ { "_id" : 2.0141012E7 , "value1" : 0.1 , "value2" : 0.5}] , "ok" : 1.0}
    System.out.println(output);
}

答案 1 :(得分:1)

这类问题很容易出错,因为关键往往藏在细节里,所以应该彻底测试。好的测试用例来自覆盖各种不同情况的数据;这里明显的疏漏是,示例中的每个数组都只有一个元素。

在现实世界中,这些字段之所以是数组,是因为您打算在其中存放多个条目。因此,简单地连续使用两个$unwind管道阶段是行不通的:对每个文档而言,第一个数组中的每个元素都会按第二个数组的元素个数被重复,求和结果也会随之被放大。

因此,需要考虑更好的测试数据表示如下:

{
    "_id" : ObjectId("5490a00879dc6a138dcefb0f"),
    "Date" : 20141012,
    "Type" : "Twitter",
    "Entities" : [
            {
                    "ID" : 2,
                    "Name" : "test1",
                    "Sentiment" : {
                            "Value" : 0.1,
                            "Neutral" : 12
                    }
            }
    ],
    "Topics" : [
            {
                    "ID" : 1,
                    "Name" : "Test2",
                    "Sentiment" : {
                            "Value" : 0.5,
                            "Neutral" : 1
                    }
            },
            {
                    "ID" : 3,
                    "Name" : "Test3",
                    "Sentiment" : {
                            "Value" : 0.4,
                            "Neutral" : 1
                    }
            }
    ]
}

要对含有两个数组的文档正确地完成这一操作,您需要按类型区分条目,并且每个成员只累加一次。下面先给出带注释的JSON序列化形式,以便阅读:

[
    // Unwind both arrays; this yields one document per (Entities, Topics) pair,
    // so values are duplicated at this point
    { "$unwind": "$Entities" },
    { "$unwind": "$Topics" },

    // Add a two-element marker array used to tell the two source arrays apart
    { "$project": {
        "Date": 1,
        "Entities": 1,
        "Topics": 1,
        "select": { "$literal": [ "E", "T" ] }
    }},

    // Unwind the marker array as well, so every document exists once per marker
    { "$unwind": "$select" },


    // Group by document, Date and the ID of the array member picked by the marker;
    // this makes every array member unique again
    { "$group": {
        "_id": {
            "_id": "$_id",
            "Date": "$Date",
            "innerId": {
               "$cond": [
                   { "$eq": [ "$select", "E" ] },
                   "$Entities.ID",
                   "$Topics.ID"
               ]
            }
        },
        "value": {
            "$first": {
                "$cond": [
                   { "$eq": [ "$select", "E" ] },
                   "$Entities.Sentiment.Value",
                   "$Topics.Sentiment.Value"
                ]
            }
        }
    }},

    // Now just sum the values per date grouping
    { "$group": {
        "_id": "$_id.Date",
        "value": { "$sum": "$value" }
    }}
]

还有另一种稍长一些的做法,但这里我假定内部数组的“ID”字段值至少在同一文档内是唯一的,这样上面的写法就没有问题。整个过程实际上是把两个独立的文档属性合并成一个单一的字段,同时处理它们都是数组这一事实。

因此,您先把两个数组展开,再给每个文档加上一个表示类型的标记数组并将其展开,使文档再复制一遍。接下来,对每个文档、每个数组成员,根据类型标记从相应的数组中取值。此时,每个数组成员恰好对应一个文档和一个单独的“value”字段,其中保存着按所选类型取出的*.Sentiment.Value;关键在于,现在所有的值都不再重复。最后只需对结果中的value字段求和即可。

事实上,这里要学习的主要教训是,应该首先将其记录为单个数组,其结构如下:

{
    "_id" : ObjectId("5490a00879dc6a138dcefb0f"),
    "Date" : 20141012,
    "Type" : "Twitter",
    "Data" : [
            {
                    "ID" : 2,
                    "Name" : "test1",
                    "Sentiment" : {
                            "Value" : 0.1,
                            "Neutral" : 12
                    },
                    "Class": "Entity"
            },
            {
                    "ID" : 1,
                    "Name" : "Test2",
                    "Sentiment" : {
                            "Value" : 0.5,
                            "Neutral" : 1
                    },
                    "Class": "Topic"
            },
            {
                    "ID" : 3,
                    "Name" : "Test3",
                    "Sentiment" : {
                            "Value" : 0.4,
                            "Neutral" : 1
                    },
                    "Class": "Topic"
            }
    ]
}

在单个数组上处理$unwind一次并且只是对所有值进行求和就是一个简单的问题。如果您想单独使用数据“类”,那么您可以过滤它或使用条件。但是,大多数操作更容易以这种方式简单地构建。

把它翻译成Java很简单,不过以防您在转换过程中卡住,代码如下:

    // Unwind both arrays; this yields one document per (Entities, Topics) pair.
    DBObject unwind1 = new BasicDBObject("$unwind", "$Entities");
    DBObject unwind2 = new BasicDBObject("$unwind", "$Topics");

    // Add a two-element marker array ("E" / "T") used to tell the source arrays apart.
    DBObject project = new BasicDBObject("$project",
        new BasicDBObject( "Date", 1 )
            .append( "Entities", 1)
            .append( "Topics", 1)
            .append( "select",
                new BasicDBObject( "$literal", new String[]{ "E", "T" })
            )
        );

    // Unwind the marker array as well. The field path needs the "$" prefix,
    // matching { "$unwind": "$select" } in the JSON form of this pipeline.
    DBObject unwind3 = new BasicDBObject("$unwind", "$select");

    // Group by document, Date and the ID of the array member picked by the marker,
    // keeping the first sentiment value seen for each such member.
    DBObject group1 = new BasicDBObject("$group",
        new BasicDBObject("_id",
           new BasicDBObject("_id","$_id")
                .append("Date", "$Date")
                .append("innerId",
                    new BasicDBObject("$cond",
                        new Object[]{
                            new BasicDBObject("$eq", new String[]{"$select", "E"}),
                            "$Entities.ID",
                            "$Topics.ID"
                        }
                    )
                )
        )
        .append("value",
            new BasicDBObject("$first",
                new BasicDBObject("$cond",
                    new Object[]{
                        new BasicDBObject("$eq", new String[]{"$select", "E"}),
                        "$Entities.Sentiment.Value",
                        "$Topics.Sentiment.Value"
                    }
                )
            )
        )
    );

    // Sum the de-duplicated values per Date.
    DBObject group2 = new BasicDBObject("$group",
        new BasicDBObject("_id", "$_id.Date")
            .append("value", new BasicDBObject("$sum","$value"))
    );

    AggregationOutput output = coll.aggregate(unwind1,unwind2,project,unwind3,group1,group2);

还有一点需要注意:$literal运算符是在MongoDB 2.6中引入的(虽然您现在很可能已经在使用该版本或更高版本)。对于更早的服务器版本,有一个未写入文档的 $const 运算符,作用实际上相同。如果必须在更早的MongoDB服务器版本上运行,请把代码中的$literal换成$const。

答案 2 :(得分:1)

另一种方法,

  • Unwind(展开)Entities数组。
  • 按_id进行Group,得到Entities的sum。
  • Unwind(展开)Topics数组。
  • 按_id进行Group,得到Topics的总和。
  • Project一个字段,用于显示topics与entities情绪值的总和。
  • 按Date进行Group,得到净总和。

这样,每个管道中的文档数量最少,不涉及太多自连接。

聚合代码:

db.collection.aggregate([
    // 1. One document per Entities element.
    { $unwind: "$Entities" },

    // 2. Back to one document per _id: keep Date and the Topics array,
    //    and sum the Entities sentiment values.
    { $group: {
        _id: "$_id",
        Date: { $first: "$Date" },
        Topics: { $first: "$Topics" },
        EntitiesSum: { $sum: "$Entities.Sentiment.Value" }
    }},

    // 3. One document per Topics element.
    { $unwind: "$Topics" },

    // 4. Back to one document per _id: carry EntitiesSum along and
    //    sum the Topics sentiment values.
    { $group: {
        _id: "$_id",
        Date: { $first: "$Date" },
        EntitiesSum: { $first: "$EntitiesSum" },
        TopicsSum: { $sum: "$Topics.Sentiment.Value" }
    }},

    // 5. Per-document combined sum of both totals.
    { $project: {
        _id: 0,
        Date: 1,
        EntitiesSum: 1,
        TopicsSum: 1,
        indSum: { $add: ["$EntitiesSum", "$TopicsSum"] }
    }},

    // 6. Final totals per Date.
    { $group: {
        _id: "$Date",
        EntitiesSentimentSum: { $sum: "$EntitiesSum" },
        TopicsSentimentSum: { $sum: "$TopicsSum" },
        netSentimentSum: { $sum: "$indSum" }
    }}
])

Java等效:

     DBObject unwindEntities = new BasicDBObject("$unwind","$Entities");

     DBObject groupSameIdEntities = new BasicDBObject("_id","$_id");
     groupSameIdEntities.put("Date", new BasicDBObject("$first","$Date"));
     groupSameIdEntities.put("Topics", new BasicDBObject("$first","$Topics"));
     groupSameIdEntities.put("EntitiesSum", 
                    new BasicDBObject("$sum","$Entities.Sentiment.Value"));


     DBObject unwindTopics = new BasicDBObject("$unwind","$Topics");

     DBObject groupSameIdTopics = new BasicDBObject("_id","$_id");
     groupSameIdTopics.put("Date", new BasicDBObject("$first","$Date"));
     groupSameIdTopics.put("EntitiesSum", 
                         new BasicDBObject("$first","$EntitiesSum"));
     groupSameIdTopics.put("TopicsSum",
                        new BasicDBObject("$sum","$Topics.Sentiment.Value"));

     DBObject project = new BasicDBObject("_id",0);
     project.put("Date",1);
     project.put("EntitiesSum",1);
     project.put("TopicsSum",1);
     project.put("netSumPerId",
             new BasicDBObject("$add",
                   new String[]{"$EntitiesSum","$TopicsSum"}));

     DBObject groupByDate = new BasicDBObject("_id","$Date");
     groupByDate.put("EntitiesSentimentSum", 
                     new BasicDBObject("$sum","$EntitiesSum"));
     groupByDate.put("TopicsSentimentSum", 
                     new BasicDBObject("$sum","$TopicsSum"));
     groupByDate.put("netSentimentSum", 
                      new BasicDBObject("$sum","$netSumPerId"));

     AggregationOutput output = col.aggregate(unwindEntities,
                                new BasicDBObject("$group",
                                             groupSameIdEntities),
                                unwindTopics,
                                new BasicDBObject("$group",groupSameIdTopics),
                                new BasicDBObject("$project",project),
                                new BasicDBObject("$group",groupByDate));

样本输出(有两个文档):

{ "_id" : 2.0141012E7, "EntitiesSentimentSum" : 0.30000000000000004 , "TopicsSentimentSum" : 1.2 , "netSentimentSum" : 1.5}

(此处原有一句“将日期字段保存为……”的说明,其所指的具体内容已缺失。)