使用MongoDB聚合框架有没有计算中位数的方法?
由于计算中位数需要对整个数据集进行排序,或使用递归,并且递归深度也与数据集大小成比例,因此在一般情况下计算中位数有些棘手。这可能是为什么许多数据库没有开箱即用的中位数运算符的原因(MySQL 也没有)。
计算中位数最简单的方法是使用以下两个语句(假设要计算中位数的属性名为 a
,想要在集合 coll
的所有文档中计算它):
count = db.coll.count();
db.coll.find().sort( {"a":1} ).skip(count / 2 - 1).limit(1);
这相当于人们建议MySQL的方法。
通过聚合框架可以一次性完成此操作。
排序 => 将排序后的值放入数组 => 获取数组大小 => 将大小除以二 => 获取除法的整数值(中位数左侧) => 在左侧加1(右侧) => 获取左侧和右侧的数组元素 => 两个元素的平均值
以下是使用Spring java mongoTemplate的示例:
该模型是一个作者登录名(“所有者”)列表,目标是通过用户获取书籍中位数:
GroupOperation countByBookOwner = group("owner").count().as("nbBooks");
SortOperation sortByCount = sort(Direction.ASC, "nbBooks");
GroupOperation putInArray = group().push("nbBooks").as("nbBooksArray");
ProjectionOperation getSizeOfArray = project("nbBooksArray").and("nbBooksArray").size().as("size");
ProjectionOperation divideSizeByTwo = project("nbBooksArray").and("size").divide(2).as("middleFloat");
ProjectionOperation getIntValueOfDivisionForBornLeft = project("middleFloat", "nbBooksArray").and("middleFloat")
.project("trunc").as("beginMiddle");
ProjectionOperation add1ToBornLeftToGetBornRight = project("beginMiddle", "middleFloat", "nbBooksArray")
.and("beginMiddle").project("add", 1).as("endMiddle");
ProjectionOperation arrayElementAt = project("beginMiddle", "endMiddle", "middleFloat", "nbBooksArray")
.and("nbBooksArray").project("arrayElemAt", "$beginMiddle").as("beginValue").and("nbBooksArray")
.project("arrayElemAt", "$endMiddle").as("endValue");
ProjectionOperation averageForMedian = project("beginMiddle", "endMiddle", "middleFloat", "nbBooksArray",
"beginValue", "endValue").and("beginValue").project("avg", "$endValue").as("median");
Aggregation aggregation = newAggregation(countByBookOwner, sortByCount, putInArray, getSizeOfArray,
divideSizeByTwo, getIntValueOfDivisionForBornLeft, add1ToBornLeftToGetBornRight, arrayElementAt,
averageForMedian);
long time = System.currentTimeMillis();
AggregationResults<MedianContainer> groupResults = mongoTemplate.aggregate(aggregation, "book",
MedianContainer.class);
这里是聚合的结果:
{
"aggregate": "book" ,
"pipeline": [
{
"$group": {
"_id": "$owner" ,
"nbBooks": {
"$sum": 1
}
}
} , {
"$sort": {
"nbBooks": 1
}
} , {
"$group": {
"_id": null ,
"nbBooksArray": {
"$push": "$nbBooks"
}
}
} , {
"$project": {
"nbBooksArray": 1 ,
"size": {
"$size": ["$nbBooksArray"]
}
}
} , {
"$project": {
"nbBooksArray": 1 ,
"middleFloat": {
"$divide": ["$size" , 2]
}
}
} , {
"$project": {
"middleFloat": 1 ,
"nbBooksArray": 1 ,
"beginMiddle": {
"$trunc": ["$middleFloat"]
}
}
} , {
"$project": {
"beginMiddle": 1 ,
"middleFloat": 1 ,
"nbBooksArray": 1 ,
"endMiddle": {
"$add": ["$beginMiddle" , 1]
}
}
} , {
"$project": {
"beginMiddle": 1 ,
"endMiddle": 1 ,
"middleFloat": 1 ,
"nbBooksArray": 1 ,
"beginValue": {
"$arrayElemAt": ["$nbBooksArray" , "$beginMiddle"]
} ,
"endValue": {
"$arrayElemAt": ["$nbBooksArray" , "$endMiddle"]
}
}
} , {
"$project": {
"beginMiddle": 1 ,
"endMiddle": 1 ,
"middleFloat": 1 ,
"nbBooksArray": 1 ,
"beginValue": 1 ,
"endValue": 1 ,
"median": {
"$avg": ["$beginValue" , "$endValue"]
}
}
}
]
// { "a" : 25, "b" : 12 }
// { "a" : 89, "b" : 7 }
// { "a" : 25, "b" : 17 }
// { "a" : 25, "b" : 24 }
// { "a" : 89, "b" : 15 }
db.collection.aggregate([
{ $group: {
_id: "$a",
median: {
$accumulator: {
accumulateArgs: ["$b"],
init: function() { return []; },
accumulate: function(bs, b) { return bs.concat(b); },
merge: function(bs1, bs2) { return bs1.concat(bs2); },
finalize: function(bs) {
bs.sort(function(a, b) { return a - b });
var mid = bs.length / 2;
return mid % 1 ? bs[mid - 0.5] : (bs[mid - 1] + bs[mid]) / 2;
},
lang: "js"
}
}
}}
])
// { "_id" : 25, "median" : 17 }
// { "_id" : 89, "median" : 11 }
累加器:
b
上进行累加(accumulateArgs
)init
)b
个项累加到数组中(accumulate
和merge
)b
个项执行中位数计算(finalize
)尽管maxiplay的答案并不准确,但它确实指导了我正确的方向。给定解决方案的问题在于它仅适用于记录数为偶数的情况。因为对于记录数为奇数的情况,只需要取中点处的值而无需计算平均值。
这是我让它正常运行的方法。
db.collection.aggregate([
{ "$match": { "processingStatus": "Completed" } },
{ "$sort": { "value": 1 } },
{
"$group": {
"_id": "$userId",
"valueArray": {
"$push": "$value"
}
}
},
{
"$project": {
"_id": 0,
"userId": "$_id",
"valueArray": 1,
"size": { "$size": ["$valueArray"] }
}
},
{
"$project": {
"userId": 1,
"valueArray": 1,
"isEvenLength": { "$eq": [{ "$mod": ["$size", 2] }, 0 ] },
"middlePoint": { "$trunc": { "$divide": ["$size", 2] } }
}
},
{
"$project": {
"userId": 1,
"valueArray": 1,
"isEvenLength": 1,
"middlePoint": 1,
"beginMiddle": { "$subtract": [ "$middlePoint", 1] },
"endMiddle": "$middlePoint"
}
},
{
"$project": {
"userId": 1,
"valueArray": 1,
"middlePoint": 1,
"beginMiddle": 1,
"beginValue": { "$arrayElemAt": ["$valueArray", "$beginMiddle"] },
"endValue": { "$arrayElemAt": ["$valueArray", "$endMiddle"] },
"isEvenLength": 1
}
},
{
"$project": {
"userId": 1,
"valueArray": 1,
"middlePoint": 1,
"beginMiddle": 1,
"beginValue": 1,
"endValue": 1,
"middleSum": { "$add": ["$beginValue", "$endValue"] },
"isEvenLength": 1
}
},
{
"$project": {
"userId": 1,
"valueArray": 1,
"median": {
"$cond": {
if: "$isEvenLength",
then: { "$divide": ["$middleSum", 2] },
else: { "$arrayElemAt": ["$valueArray", "$middlePoint"] }
}
}
}
}
])
$stepsArray
改为 $valueArray
,这个查询就能用了。 - jesse_english$let
可以使得使用单个 $project
成为可能。 - Amit Beckenstein我对这个问题的解决方案与Taher's answer相似,但使用了较少的$project
阶段。
// { "value" : 1 }
// { "value" : 2 }
// { "value" : 4 }
// { "value" : 5 }
db.median_values.aggregate([
// Sort the values
{ $sort: { value: 1 } },
// Get an array of all the values
{ $group: { _id: null, valuesArray: { $push: "$value" } } },
// Get if the array has an even or odd number of elements
{
$project: {
_id: 0,
valuesArray: 1,
isEven: { $eq: [{ $mod: [{ $size: "$valuesArray" }, 2] }, 0] },
dividedByTwoIndex: { $divide: [{ $size: "$valuesArray" }, 2] },
},
},
// Get the left value and right value if the array has an even or odd number of elements
{
$project: {
_id: 0,
left: {
$cond: {
if: "$isEven",
then: {
$arrayElemAt: [
"$valuesArray",
{ $subtract: ["$dividedByTwoIndex", 1] },
],
},
else: {
$arrayElemAt: ["$valuesArray", { $floor: "$dividedByTwoIndex" }],
},
},
},
right: {
$cond: {
if: "$isEven",
then: {
$arrayElemAt: ["$valuesArray", "$dividedByTwoIndex"],
},
else: {
$arrayElemAt: ["$valuesArray", { $floor: "$dividedByTwoIndex" }],
},
},
},
},
},
// Compute the median value
{ $project: { median: { $avg: ["$left", "$right"] } } },
]);
// Output:
// { "median" : 3 }
db.collection.aggregate([
{$group: {
_id: null,
median: {
$median: {
input: "$rating",
method: "approximate"
}
}
}
}
])
看看它在7.0版本以来的游乐场中是如何工作的
$median
这样的东西,因此您可能需要使用map-reduce来实现这一点。 - hgoebl$median
聚合器的支持。请在 MongoDB 问题跟踪器中点赞/关注 SERVER-4929。 - Stennie