合并两个MongoDB集合

19

我一直在尝试在MongoDB中使用MapReduce执行我认为是一个简单的操作。我不知道这是否是正确的方法,或者我是否应该使用MapReduce。我搜索了我想到的关键词,并尝试在我认为成功率最高的文档中查找 - 但是没有发现什么有用的信息。也许我想得太复杂了?

我有两个集合:details 和 gpas

details由大量文档(3+百万)组成。每个year都会重复出现一次studentid元素,如下所示:

{ "_id" : ObjectId("4d49b7yah5b6d8372v640100"), "classes" : [1,17,19,21], "studentid" : "12345a", "year" : 1}
{ "_id" : ObjectId("4d76b7oij7s2d8372v640100"), "classes" : [2,12,19,22], "studentid" : "98765a", "year" : 1}
{ "_id" : ObjectId("4d49b7oij7s2d8372v640100"), "classes" : [32,91,101,217], "studentid" : "12345a", "year" : 2}
{ "_id" : ObjectId("4d76b7rty7s2d8372v640100"), "classes" : [1,11,18,22], "studentid" : "24680a", "year" : 1}
{ "_id" : ObjectId("4d49b7oij7s2d8856v640100"), "classes" : [32,99,110,215], "studentid" : "98765a", "year" : 2}
...

gpas中包含与details相同的studentid元素。每个studentid在gpas中只有一个条目,如下所示:

{ "_id" : ObjectId("4d49b7yah5b6d8372v640111"), "studentid" : "12345a", "overall" : 97, "subscore": 1}
{ "_id" : ObjectId("4f76b7oij7s2d8372v640213"), "studentid" : "98765a", "overall" : 85, "subscore": 5}
{ "_id" : ObjectId("4j49b7oij7s2d8372v640871"), "studentid" : "24680a", "overall" : 76, "subscore": 2}
...

最终我希望得到一个集合,其中每个学生都有一行数据,格式如下:

{ "_id" : ObjectId("4d49b7yah5b6d8372v640111"), "studentid" : "12345a", "classes_1": [1,17,19,21], "classes_2": [32,91,101,217], "overall" : 97, "subscore": 1}
{ "_id" : ObjectId("4f76b7oij7s2d8372v640213"), "studentid" : "98765a", "classes_1": [2,12,19,22], "classes_2": [32,99,110,215], "overall" : 85, "subscore": 5}
{ "_id" : ObjectId("4j49b7oij7s2d8372v640871"), "studentid" : "24680a", "classes_1": [1,11,18,22], "classes_2": [], "overall" : 76, "subscore": 2}
...

我原本打算通过以下方式来运行MapReduce:

// Map step for the `details` collection: emits one value per document, keyed
// by studentid, carrying the document's classes and year with zeroed GPA fields.
var mapDetails = function() {
    var record = {
        studentid: this.studentid,
        classes: this.classes,
        year: this.year,
        overall: 0,
        subscore: 0
    };
    emit(this.studentid, record);
};

// Map step for the `gpas` collection: emits one value per document, keyed by
// studentid, carrying the GPA fields with an empty classes array and year 0.
var mapGpas = function() {
    var record = {
        studentid: this.studentid,
        classes: [],
        year: 0,
        overall: this.overall,
        subscore: this.subscore
    };
    emit(this.studentid, record);
};

// Reduce step: folds all values emitted for one studentid into a single
// object. Values with year 0 supply overall/subscore; any other value supplies
// studentid and, for year 1 or 2, replaces classes_1 or classes_2 with its
// `classes` array.
var reduce = function(key, values) {
    var merged = { studentid: "0", classes_1: [], classes_2: [], overall: 0, subscore: 0};

    values.forEach(function(entry) {
        // Year 0 marks a GPA value: copy the scores and move on.
        if (entry.year == 0) {
            merged.overall = entry.overall;
            merged.subscore = entry.subscore;
            return;
        }

        merged.studentid = entry.studentid;
        if (entry.year == 1) {
            merged.classes_1 = entry.classes;
        }
        if (entry.year == 2) {
            merged.classes_2 = entry.classes;
        }
    });

    return merged;

};

// Run one map-reduce job per collection, both with the same reduce function
// and both using the `reduce` output action on the 'joined' collection.
res = db.details.mapReduce(mapDetails, reduce, {out: {reduce: 'joined'}})
res = db.gpas.mapReduce(mapGpas, reduce, {out: {reduce: 'joined'}})

但是当我运行它时,这就是我的结果集:

{ "_id" : "12345a", "value" : { "studentid" : "12345a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 97, "subscore" : 1 } }
{ "_id" : "98765a", "value" : { "studentid" : "98765a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 85, "subscore" : 5 } }
{ "_id" : "24680a", "value" : { "studentid" : "24680a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 76, "subscore" : 2 } }

我缺少了 classes 数组。

另外,顺便问一下,如何访问生成的 MapReduce 的 value 元素中的元素?MapReduce 总是输出到 value 或其他你命名的地方吗?

2个回答

44

这与在MongoDB用户Google群组上提出的一个问题相似。
https://groups.google.com/group/mongodb-user/browse_thread/thread/60a8b683e2626ada?pli=1

答案引用了一份在线教程,该教程与你的示例看起来很相似: http://tebros.com/2011/07/using-mongodb-mapreduce-to-join-2-collections/

有关MongoDB中MapReduce的更多信息,请参阅文档: http://www.mongodb.org/display/DOCS/MapReduce

此外,MongoDB Cookbook中题为“使用版本文档查找最大值和最小值”的文章,在其“Extras”章节里有一个分步讲解,详细说明了MapReduce操作的工作原理: http://cookbook.mongodb.org/patterns/finding_max_and_min/

如果您已经阅读了某些参考文献,请原谅我。我包括它们是为了其他可能正在阅读此帖子并尝试在MongoDB中使用MapReduce的用户受益。

重要的是,Map函数中'emit'语句输出的值的格式必须与Reduce函数返回值的格式一致。如果Map函数对某个键只输出了一个文档,则该键可能根本不会运行Reduce函数,这样您的输出集合中就会出现格式不一致的文档。

我稍微修改了你的Map语句以按你期望的输出格式发出文档,其中有两个单独的“classes”数组。
我还重新设计了你的Reduce语句,仅当某个课程尚未出现在classes_1或classes_2数组中时才将其加入。

// Map step for the `details` collection: emits a value already shaped like
// the reduce output, with this document's classes placed in classes_1 (year 1)
// or classes_2 (year 2) and zeroed GPA fields.
var mapDetails = function(){
    emit(this.studentid, {
        studentid: this.studentid,
        classes_1: this.year == 1 ? this.classes : [],
        classes_2: this.year == 2 ? this.classes : [],
        year: this.year,
        overall: 0,
        subscore: 0
    });
};

// Map step for the `gpas` collection: emits a value shaped like the reduce
// output, with empty class arrays, year 0, and this document's GPA fields.
var mapGpas = function() {
    var gpaRecord = {
        studentid: this.studentid,
        classes_1: [],
        classes_2: [],
        year: 0,
        overall: this.overall,
        subscore: this.subscore
    };
    emit(this.studentid, gpaRecord);
};

/**
 * Reduce step shared by both map-reduce jobs: merges every value seen for one
 * studentid into a single { studentid, classes_1, classes_2, overall, subscore }
 * object.
 *
 * @param {string} key - The studentid the values were emitted under.
 * @param {Array<Object>} values - Values produced by mapDetails / mapGpas, or
 *     objects previously returned by this function.
 * @returns {Object} The merged record for this student.
 */
var r = function(key, values) {
    var outs = { studentid: "0", classes_1: [], classes_2: [], overall: 0, subscore: 0};

    // Append each id from `source` to `target` unless it is already there.
    // The callback parameter is `classId` because `class` is a reserved word
    // and is rejected as an identifier by current JavaScript engines.
    var addMissing = function(target, source) {
        source.forEach(function(classId) {
            if (target.indexOf(classId) === -1) {
                target.push(classId);
            }
        });
    };

    values.forEach(function(v) {
        outs.studentid = v.studentid;
        addMissing(outs.classes_1, v.classes_1);
        addMissing(outs.classes_2, v.classes_2);

        // GPA values are marked with year 0. An object returned by an earlier
        // call of this function has no `year` at all; carry its scores over
        // when they are set so that reducing an already-reduced value does
        // not reset them to 0.
        var isGpaRecord = v.year === 0;
        var isReducedWithScores = v.year == null && (v.overall !== 0 || v.subscore !== 0);
        if (isGpaRecord || isReducedWithScores) {
            outs.overall = v.overall;
            outs.subscore = v.subscore;
        }
    });
    return outs;
};

// Run one map-reduce job per collection, both with the reduce function `r`
// and both using the `reduce` output action on the 'joined' collection.
res = db.details.mapReduce(mapDetails, r, {out: {reduce: 'joined'}})
res = db.gpas.mapReduce(mapGpas, r, {out: {reduce: 'joined'}})

运行这两个MapReduce操作会得到下面的集合,这个集合与您想要的格式相匹配:

> db.joined.find()
{ "_id" : "12345a", "value" : { "studentid" : "12345a", "classes_1" : [ 1, 17, 19, 21 ], "classes_2" : [ 32, 91, 101, 217 ], "overall" : 97, "subscore" : 1 } }
{ "_id" : "24680a", "value" : { "studentid" : "24680a", "classes_1" : [ 1, 11, 18, 22 ], "classes_2" : [ ], "overall" : 76, "subscore" : 2 } }
{ "_id" : "98765a", "value" : { "studentid" : "98765a", "classes_1" : [ 2, 12, 19, 22 ], "classes_2" : [ 32, 99, 110, 215 ], "overall" : 85, "subscore" : 5 } }
>

MapReduce总是以{_id:"id", value:"value"}的形式输出文档。

关于在文档中使用子文档的更多信息可在题为“点符号(访问对象)”的文档中获取:

http://www.mongodb.org/display/DOCS/Dot+Notation+%28Reaching+into+Objects%29

如果您想让MapReduce的输出以不同的格式出现,您需要在应用程序中编程实现它。

希望这能提高您对MapReduce的理解,并使您离生成所需的输出集合更近一步。祝您好运!


这非常有帮助。我感谢您在这篇文章中花费的所有时间。再次感谢! - TFX
很荣幸能够帮到你!我很高兴能提供帮助!诚挚地,Marc。 - Marc

1
你不能在这里使用m/r,因为它只能应用于一个集合。从多个集合中读取数据将会破坏分片兼容性,所以是不允许的。你可以通过新的聚合框架(2.1+)或者在应用程序内部完成你想要的操作。

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接