将SQL查询转换为MongoDB mapreduce

时间:2012-10-09 09:06:31

标签: sql mongodb mapreduce

我在mongo中有以下集合:

{
    "_id" : ObjectId("506217890b50f300d020d237"),
    "o_orderkey" : NumberLong(1),
    "o_orderstatus" : "O",
    "o_totalprice" : 173665.47,
    "o_orderdate" : ISODate("1996-01-02T02:00:00Z"),
    "o_orderpriority" : "5-LOW",
    "o_clerk" : "Clerk#000000951",
    "o_shippriority" : 0,
    "o_comment" : "blithely final dolphins solve-- blithely blithe packages nag blith",
    "customer" : {
            "c_custkey" : NumberLong(36901),
            "c_name" : "Customer#000036901",
            "c_address" : "TBb1yDZcf 8Zepk7apFJ",
            "c_phone" : "23-644-998-4944",
            "c_acctbal" : 4809.84,
            "c_mktsegment" : "AUTOMOBILE",
            "c_comment" : "regular accounts after the blithely pending dependencies play blith",
            "c_nationkey" : {
                    "n_nationkey" : NumberLong(13),
                    "n_name" : "JORDAN",
                    "n_comment" : "blithe, express deposits boost carefully busy accounts. furiously pending depos",
                    "n_regioin" : {
                            "r_regionkey" : NumberLong(4),
                            "r_name" : "MIDDLE EAST",
                            "r_comment" : "furiously unusual packages use carefully above the unusual, exp"
                    }
            }
    },
    "o_lineitem" : [
        {
            "l_linenumber" : 1,
            "l_quantity" : 17,
            "l_extendedprice" : 21168.23,
            "l_discount" : 0.04,
            "l_tax" : 0.02,
            "l_returnflag" : "N",
            "l_linestatus" : "O",
            "l_shipdate" : ISODate("1996-03-13T03:00:00Z"),
            "l_commitdate" : ISODate("1996-02-12T03:00:00Z"),
            "l_receiptdate" : ISODate("1996-03-22T03:00:00Z"),
            "l_shipinstruct" : "DELIVER IN PERSON",
            "l_shipmode" : "TRUCK",
            "l_comment" : "blithely regular ideas caj",
            "partsupp" : {
                "ps_availqty" : 6157,
                "ps_supplycost" : 719.17,
                "ps_comment" : "blithely ironic packages haggle quickly silent platelets. silent packages must have to nod. slyly special theodolites along the blithely ironic packages nag above the furiously pending acc",
                "ps_partkey" : {
                    "p_partkey" : NumberLong(155190),
                    "p_name" : "slate lavender tan lime lawn",
                    "p_mfgr" : "Manufacturer#4",
                    "p_brand" : "Brand#44",
                    "p_type" : "PROMO BRUSHED NICKEL",
                    "p_size" : 9,
                    "p_container" : "JUMBO JAR",
                    "p_retailprice" : 1245.19,
                    "p_comment" : "regular, final dol"
                },
                "ps_suppkey" : {
                    "s_suppkey" : NumberLong(7706),
                    "s_name" : "Supplier#000007706",
                    "s_address" : "BlHq75VoMNCoU380SGiS9fTWbGpeI",
                    "s_phone" : "33-481-218-6643",
                    "s_acctbal" : -379.71,
                    "s_comment" : "carefully pending ideas after the instructions are alongside of the dolphins. slyly pe",
                    "s_nationkey" : {
                        "n_nationkey" : NumberLong(23),
                        "n_name" : "UNITED KINGDOM",
                        "n_comment" : "fluffily regular pinto beans breach according to the ironic dolph",
                        "n_regioin" : {
                            "r_regionkey" : NumberLong(3),
                            "r_name" : "EUROPE",
                            "r_comment" : "special, bold deposits haggle foxes. platelet"
                        }
                    }
                }
            }
        },
        .
        .
        .
    ]
}

我试图翻译以下sql查询:

-- For every part of size 15 whose type ends in 'BRASS', list the supplier(s)
-- located in the 'EUROPE' region that offer that part at the lowest supply
-- cost among European suppliers of the same part.
select
    s_acctbal, 
    s_name, 
    n_name, 
    p_partkey, 
    p_mfgr, 
    s_address, 
    s_phone, 
    s_comment
from 
    part, 
    supplier, 
    partsupp, 
    nation, 
    region
where 
    -- Join conditions: part <-> partsupp <-> supplier <-> nation <-> region
    p_partkey = ps_partkey
    and s_suppkey = ps_suppkey
    -- Part filters
    and p_size = 15
    and p_type like '%BRASS'
    and s_nationkey = n_nationkey
    and n_regionkey = r_regionkey
    -- Supplier must be in the EUROPE region
    and r_name = 'EUROPE'
    -- Correlated subquery: p_partkey below refers to the outer query's part,
    -- so the minimum is computed per part over European suppliers only.
    and ps_supplycost = (
        select 
            min(ps_supplycost)
        from 
            partsupp, supplier, 
            nation, region
        where 
            p_partkey = ps_partkey
            and s_suppkey = ps_suppkey
            and s_nationkey = n_nationkey
            and n_regionkey = r_regionkey
            and r_name = 'EUROPE'
    )
-- Highest account balance first; ties broken by nation, supplier name, part key
order by 
    s_acctbal desc, 
    n_name, 
    s_name, 
    p_partkey;

我正在尝试的功能:

// Runs a map-reduce job over the "ordersfull" collection and writes the
// result to the "query002" collection.
db.runCommand({
    mapreduce: "ordersfull",
    // Empty query: every document in the collection is passed to map()
    query: {
    },
    // map(): called once per order document (bound to `this`); emits one value
    // for each line item whose part has size 15, a type ending in "BRASS",
    // and a supplier in the "EUROPE" region.
    map: function Map() {
        // Anchored at end of string; used with String.match(), which returns
        // null when there is no match
        var pattern = /BRASS$/g;

        // for..in enumerates the property names of o_lineitem, so `i` is a
        // key used to index back into the array
        for(var i in this.o_lineitem){
            var p_size = this.o_lineitem[i].partsupp.ps_partkey.p_size;
            var p_type = this.o_lineitem[i].partsupp.ps_partkey.p_type;
            var region = this.o_lineitem[i].partsupp.ps_suppkey.s_nationkey.n_regioin.r_name;

            if(p_size==15 && p_type.match(pattern)!=null && region == "EUROPE"){
                // Every match is emitted under the same key (""), so all
                // emitted values end up in a single group
                emit("",{
                    s_acctbal: this.o_lineitem[i].partsupp.ps_suppkey.s_acctbal,
                    s_name: this.o_lineitem[i].partsupp.ps_suppkey.s_name,
                    n_name: this.o_lineitem[i].partsupp.ps_suppkey.s_nationkey.n_name,
                    p_partkey: this.o_lineitem[i].partsupp.ps_partkey.p_partkey,
                    p_mfgr: this.o_lineitem[i].partsupp.ps_partkey.p_mfgr,
                    s_address: this.o_lineitem[i].partsupp.ps_suppkey.s_address,
                    s_phone: this.o_lineitem[i].partsupp.ps_suppkey.s_phone,
                    s_comment: this.o_lineitem[i].partsupp.ps_suppkey.s_comment
                } );
            }

        }
    },
    // reduce(): the body is empty, so it has no return statement and yields
    // undefined for the grouped values
    reduce: function(key, values) {
    },
    // Name of the output collection
    out: 'query002'
});

在我的结果中,所有条目得到的都是空值,这是怎么回事?

1 个答案:

答案 0 :(得分:0)

您可以通过在JavaScript函数中加入print()和printjson()语句来调试MapReduce的输出。这些打印输出会保存在MongoDB日志中。

MapReduce有几个问题:

  • for .. in循环不会按您预期的方式工作,您应该改用array.forEach(..)
  • 如果您正在迭代一个数组,那么您已经有了对数组项的引用,不应该使用array[index]
  • 如果您不想进行意外分组,则emit()时应使用唯一的键(key)
  • 您的reduce()应返回与发出数据结构相匹配的值
  • 理想情况下,您应该使用query参数来限制需要检查的文档

鉴于您似乎只是在不进行任何分组或reduce()的情况下迭代文档,您可能会发现在应用程序代码中获取文档并执行相同的匹配更容易。

在任何情况下,map()函数实际上应该更像:

// Map function for the MapReduce job. Called with `this` bound to one order
// document. For each line item whose part has p_size 15, a p_type ending in
// "BRASS" and a supplier in the "EUROPE" region, emits the supplier/part
// fields keyed by the supplier name.
//
// Documents or line items that lack the expected nested structure are skipped
// instead of aborting the whole job with a TypeError.
var map = function () {
    // No "g" flag: test() on a global regex keeps state in lastIndex between calls
    var pattern = /BRASS$/;

    var lineItems = this.o_lineitem;
    // Feature-test forEach rather than relying on Array.isArray, which older
    // server-side JavaScript engines may not provide
    if (!lineItems || typeof lineItems.forEach !== "function") {
        return;
    }

    lineItems.forEach(function (item) {
        // Walk the nested sub-documents defensively; any missing level
        // short-circuits to a falsy value
        var partsupp = item && item.partsupp;
        var partKey = partsupp && partsupp.ps_partkey;
        var suppKey = partsupp && partsupp.ps_suppkey;
        var nation = suppKey && suppKey.s_nationkey;
        // "n_regioin" (sic) is the field name actually used in the collection
        var regionDoc = nation && nation.n_regioin;

        if (!partKey || !regionDoc) {
            return;
        }

        if (partKey.p_size == 15 && pattern.test(partKey.p_type) && regionDoc.r_name == "EUROPE") {
            emit(suppKey.s_name,
                {
                    s_acctbal: suppKey.s_acctbal,
                    s_name:    suppKey.s_name,
                    n_name:    nation.n_name,
                    p_partkey: partKey.p_partkey,
                    p_mfgr:    partKey.p_mfgr,
                    s_address: suppKey.s_address,
                    s_phone:   suppKey.s_phone,
                    s_comment: suppKey.s_comment
                }
            );
        }
    });
};

根据您的数据结构和所需的多重匹配和排序,将此查询转换为MongoDB 2.2中的新Aggregation Framework会更容易。

有一些当前的限制需要注意(例如聚合管道输出目前最大为16MB),但您可能会发现这种查询更容易编写和调试。

以下是使用聚合框架的注释示例,包括订单状态,日期和感兴趣的部件/供应商项目的初始匹配条件:

// Aggregation pipeline that lists supplier/part details for line items whose
// part has p_size 15, a p_type ending in "BRASS" and a supplier in the
// "EUROPE" region, restricted to open orders dated on or after 2012-10-01.
// The stages are passed as a single array.
db.ordersfull.aggregate([
    // Find matching documents first (can take advantage of index).
    // A single $elemMatch requires ONE line item to satisfy all three
    // conditions; separate $elemMatch clauses could each be satisfied by a
    // different line item and would let through orders that the later
    // per-item $match discards anyway.
    { $match: {
        o_orderstatus: 'O',
        o_orderdate: { $gte: new ISODate('2012-10-01') },
        o_lineitem: { $elemMatch: {
            'partsupp.ps_partkey.p_size': 15,
            'partsupp.ps_partkey.p_type': /BRASS$/,
            'partsupp.ps_suppkey.s_nationkey.n_regioin.r_name': 'EUROPE'
        }}
    }},

    // Filter to fields of interest
    { $project: {
        _id: 0,
        o_lineitem: 1
    }},

    // Convert line item arrays into document stream
    { $unwind: '$o_lineitem' },

    // Match desired line items (an order that passed the first $match may
    // still contain other line items that do not qualify)
    { $match: {
        'o_lineitem.partsupp.ps_partkey.p_size': 15,
        'o_lineitem.partsupp.ps_partkey.p_type': /BRASS$/,
        'o_lineitem.partsupp.ps_suppkey.s_nationkey.n_regioin.r_name': 'EUROPE'
    }},

    // Final field selection
    { $project: {
        s_acctbal: '$o_lineitem.partsupp.ps_suppkey.s_acctbal',
        s_name:    '$o_lineitem.partsupp.ps_suppkey.s_name',
        n_name:    '$o_lineitem.partsupp.ps_suppkey.s_nationkey.n_name',
        p_partkey: '$o_lineitem.partsupp.ps_partkey.p_partkey',
        p_mfgr:    '$o_lineitem.partsupp.ps_partkey.p_mfgr',
        s_address: '$o_lineitem.partsupp.ps_suppkey.s_address',
        s_phone:   '$o_lineitem.partsupp.ps_suppkey.s_phone',
        s_comment: '$o_lineitem.partsupp.ps_suppkey.s_comment'
    }},

    // Sort the output
    { $sort: {
        s_acctbal: -1,
        n_name: 1,
        s_name: 1,
        p_partkey: 1
    }}
])