MongoDB聚合操作中的$group和$match与分组结果

4
我有一个如下所示的集合
{

    "_id" : ObjectId("553b2c740f12bb30f85bd41c"),
    "symbol" : "EUR/GBP",
    "order_id" : "PW_BarclaysTrades60530",
    "ticket_id" : "PW_BarclaysTrades.60530",
    "basketid" : "TESTBASKET-1428483828043",
    "date_sent" : ISODate("2015-04-07T18:30:00.000Z"),
    "destination" : "BarclaysTrades",
    "order_price" : 0.0000000000000000,
    "order_quantity" : 4000000.0000000000000000,
    "order_type" : 1.0000000000000000,
    "parent_quantity" : 250000000.0000000000000000,
    "time_sent" : "09:03:48",
    "side" : 1,
    "tif" : "0",
    "execution_id" : 88939,
    "date_recvd" : ISODate("2015-04-07T18:30:00.000Z"),
    "exe_quantity" : 50000.0000000000000000,
    "time_recvd" : "09:03:48",
    "execution_price" : 2.5000000000000000,
    "execution_type" : 1
}

我希望获取收货地为每个目的地平均执行价格大于平均(执行价格)的文档。

聚合如下:

db.orders_by_symbol.aggregate( [
{ $limit:300000 },
{ $match:{ destination: "PAPER" } },
{ $group:{_id:{Destination:"$destination"},avg_exec_price:  
              {$avg:"$execution_price"} ,"data":{"$push": "$$ROOT"}}},
{$unwind:"$data"},
{$match:{execution_price:{$ne: "$avg_exec_price"}}},
{$project:{_id:0,symbol:"$data.symbol",destination:"$data.destination",
          execution_id:"$data.execution_id",
          exec_price:"$data.execution_price",
         avg_ex_price:"$avg_exec_price"}}], 
{allowDiskUse:true})

获得以下结果
{

    "result" : [ 
        {
            "symbol" : "EUR/GBP",

            "destination" : "PAPER",
            "execution_id" : 89109,
            "exec_price" : 6.5000000000000000,
            "avg_ex_price" : 95.0747920857049140
        }, 
        {
            "symbol" : "EUR/GBP",
            "destination" : "PAPER",
            "execution_id" : 89110,
            "exec_price" : 6.0000000000000000,
            "avg_ex_price" : 95.0747920857049140
        }, 
        {
            "symbol" : "EUR/GBP",
            "destination" : "PAPER",
            "execution_id" : 89111,
            "exec_price" : 6.5000000000000000,
            "avg_ex_price" : 95.0747920857049140
        }

但是当我将'$ne'运算符更改为'$gt'时,没有产生任何结果。exec_price和avg_ex_price都是双精度数据类型。不确定为什么它不能按预期工作。


我认为你的条件 {$match:{execution_price:{$ne: "$avg_exec_price"}}} 不起作用,也许你应该写成 {$match:{data.execution_price:{$ne: "$avg_exec_price"}}} - Neo-coder
是的,那是一个类型错误。 - sasidharan
如果你使用这样的查询,会让你的服务器崩溃。如果你想要你的应用程序扩展,你应该像瘟疫一样避免使用聚合框架。 - Rick Houlihan
2个回答

11

使用 MongoDB Server 3.6 及更高版本:

var pipeline = [
    { "$match": { "destination": "PAPER" } },
    { "$facet": {
        "average": [
            { "$group": {
                "_id": null,
                "avg_exec_price": { "$avg": "$execution_price" }
            } }
        ],
        "data": [
            { "$project": { 
                "_id": 0,
                "symbol": 1,
                "destination": 1,
                "execution_id": 1,
                "execution_price": 1
            } }                   
        ]
    } },
    { "$addFields": {
        "average": { "$arrayElemAt": ["$average", 0] }
    } },
    { "$addFields": {
        "data": { 
            "$filter" : {
                "input": {
                    "$map": {
                        "input": "$data",
                        "as": "el",
                        "in": {
                            "symbol": "$$el.symbol",
                            "destination": "$$el.symbol",
                            "execution_id": "$$el.symbol",
                            "exec_price": "$$el.execution_price",
                            "avg_exec_price": "$average.avg_exec_price"
                        }
                    }
                },
                "as": "doc",
                "cond": {
                    "$gt" : [ 
                        "$$doc.exec_price", 
                        "$$doc.avg_exec_price"
                    ]
                }
            }
        }
    } },  
    { "$unwind": "$data" },  
    { "$replaceRoot": {  "newRoot": "$data" } }
];
对于不支持上述操作符和管道的MongoDB版本,可以使用$project运算符创建一个额外的字段,该字段通过$gt聚合操作符存储两个字段的比较结果。
var pipeline = [
    { "$match": {
        "destination": "PAPER"
    } },
    { "$group": {
        "_id": null,
        "avg_exec_price": { "$avg": "$execution_price" },
        "data": { "$addToSet": "$$ROOT" }
    } },
    { "$unwind": "$data" },
    { "$project": { 
        "_id": 0,
        "data": 1,
        "avg_exec_price": 1,            
        "isGreaterThanAverage": { 
            "$gt" : [ "$data.execution_price", "$avg_exec_price" ] 
        }
    } },    
    { "$match": {            
        "isGreaterThanAverage": true
    } },
    { "$project": { 
        "_id": 0,
        "symbol": "$data.symbol",
        "destination": "$data.destination",
        "execution_id": "$data.execution_id",
        "exec_price": "$data.execution_price",
        "avg_ex_price": "$avg_exec_price"
    } }
];

现在为了测试上述聚合,假设您拥有以下最小测试用例集合:

db.test.insert([{
    "symbol" : "EUR/GBP",    
    "destination" : "PAPER",    
    "execution_id" : 88939,    
    "execution_price" : 1.8
},
{
    "symbol" : "EUR/GBP",    
    "destination" : "PAPER",    
    "execution_id" : 88921,    
    "execution_price" : 6.8
},
{
    "symbol" : "USD/GBP",    
    "destination" : "foo",    
    "execution_id" : 88955,    
    "execution_price" : 3.1
},
{
    "symbol" : "AUD/GBP",    
    "destination" : "PAPER",    
    "execution_id" : 88941,    
    "execution_price" : 1.1
},
{
    "symbol" : "EUR/GBP",    
    "destination" : "PAPER",    
    "execution_id" : 88907,    
    "execution_price" : 9.4
}]);

运行上述聚合操作

db.test.aggregate(pipeline);

将会产生以下结果:

/* 0 */
{
    "result" : [ 
        {
            "symbol" : "EUR/GBP",
            "destination" : "PAPER",
            "execution_id" : 88907,
            "exec_price" : 9.4,
            "avg_ex_price" : 4.775
        }, 
        {
            "symbol" : "EUR/GBP",
            "destination" : "PAPER",
            "execution_id" : 88921,
            "exec_price" : 6.8,
            "avg_ex_price" : 4.775
        }
    ],
    "ok" : 1
}

1
非常感谢您,chridam。 我想知道为什么要用这种复杂的方式进行比较。是否有其他方法可以实现它? - sasidharan
@sasidharan 不用担心 :-) 恐怕这是 MongoDB 聚合框架中的众多复杂方法之一,您将不得不使用不同的聚合管道运算符来实现所需的结果。 - chridam
我们计划使用Java进行应用程序开发,并计划在MongoDB集合上进行更多的聚合操作。对于如何实现聚合操作,我们感到困惑,因为简单的平均值和比较看起来很复杂。非常感谢您的想法和建议。PS:以上查询需要大约30秒才能处理250万个文档,这是不可取的。感谢您宝贵的时间。 - sasidharan
在规划过程中,您应该对应用程序的聚合查询有深入的了解。在构建聚合管道之前,先绘制出您将运行的查询类型,以便构建最佳和适当的聚合管道来完成任务。此外,还要考虑文档大小对聚合性能的影响,文档越大,即使在硬盘上分配是连续的,加载它所需的IO也会更多。 - chridam
嗨,Chirdam,上述聚合查询需要2分钟才能返回结果。目前我们的集合中有1000万个文档。我在目标列上有索引。缓慢的原因是什么,有没有办法减少吞吐时间?如果您能提出一些想法,将不胜感激。 - sasidharan

2

阅读您的问题后,您应该按如下方式在聚合操作中使用$ cond

db.collectionName.aggregate({
  "$match": {
    "destination": "PAPER"
  }
}, {
  "$group": {
    "_id": "$destination",
    "avg_exec_price": {
      "$avg": "$execution_price"
    },
    "data": {
      "$push": "$$ROOT"
    }
  }
}, {
  "$unwind": "$data"
}, {
  "$group": {
    "_id": "$_id",
    "data": {
      "$push": {
        "check": {
          "$cond": [{
              "$gt": ["$data.execution_price", "$avg_exec_price"] // check in $cond if execution_price gt avg_exec_price
            }, "$data", ""] //push data if true else blank 
        }
      }
    }
  }
}, {
  "$unwind": "$data"
}, {
  "$match": {
    "data.check": {
      "$exists": true, // check data.check not empty or blank
      "$ne": ""
    }
  }
}, {
  "$project": {
    "_id": "$_id",
    "data": "$data.check"
  }
}).pretty()

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接