


    
    
    
    
    







    
    
    



    
    

    


    


    


    


    
{
    "_id" : ObjectId("4fb9fb91d066d657de8d6f36"),
    "text" : "MongoDB uses Map/Reduce #epic #win",
    …
    "user" : {
         "friends_count" : 73,
         …
         "followers_count" : 102,
         "id" : 53507833,
    },
    …
}



    
    
    
           mongod   --rest --shardsvr --port 27017 --dbpath /tmp/shard1/ --smallfiles
           mongod   --rest --shardsvr --port 27017 --dbpath /tmp/shard1/ --smallfiles
           mongod   --configsvr --port 10000 --dbpath /tmp/config/ --smallfiles
           mongos   --port 22222 --configdb localhost:10000



    1. db.tweets.mapReduce()
    2. db.tweets.group()
    3. db.tweets.aggregate()
    4. MongoDB-Hadoop Adapter
    5. db.tweets.find()
/**
 * Invokes a callback once and reports how long the call took.
 *
 * @param {Function} c - Callback to run; it is invoked with no arguments.
 * @returns {{results: *, duration: number}} The callback's return value and
 *     the elapsed wall-clock time in milliseconds (from Date.now()).
 */
var measure = function (c) {
  var startedAt = Date.now();
  var outcome = c.apply();
  var elapsedMs = Date.now() - startedAt;

  return {
    results: outcome,
    duration: elapsedMs
  };
};
function() {
     if (this.user != null) {
          emit("user",
             {userName: this.user.name,
             followers: this.user.followers_count});
     }
}
function(key, values) {
      var result = null;

     values.forEach( function(value) {
           if (result == null ||
                 result.followers < value.followers) {
                             result = value;
           }
     })
     return result;
}
// Find the user with the most followers via group(). The key is empty, so no
// grouping fields are specified, and the accumulator starts as an empty name
// with a follower count of 0.
db.tweets.group({
  key: {},
  initial: { name: '', followers_count: 0 },
  // Called with each document and the running accumulator; the accumulator is
  // updated in place whenever the document's user has more followers.
  reduce: function(doc, best) {
    var user = doc.user;
    if (user == null) {
      return;
    }
    if (best.followers_count < user.followers_count) {
      best.name = user.name;
      best.followers_count = user.followers_count;
    }
  }
})
// Find the user with the most followers via the aggregation pipeline:
//   1. $group   - one group per user name, keeping the largest follower count
//   2. $sort    - order the groups by that count, descending
//   3. $limit   - keep only the first group
//   4. $project - drop _id and expose user_name / followers_count at top level
db.tweets.aggregate([
  {
    $group: {
      _id: { user_name: "$user.name" },
      followers_count: { $max: "$user.followers_count" }
    }
  },
  { $sort: { "followers_count": -1 } },
  { $limit: 1 },
  {
    $project: {
      _id: 0,
      user_name: "$_id.user_name",
      followers_count: "$followers_count"
    }
  }
])
#!/usr/bin/env python
# encoding: utf-8

import sys
sys.path.append(".")
from pymongo_hadoop import BSONMapper

def mapper(documents):
    """Yield one follower record per document that carries a user.

    Args:
        documents: iterable of dict-like documents; each may have a ``user``
            entry holding ``name`` and ``followers_count``.

    Yields:
        dict with ``_id`` set to the UTF-8 encoded ``user['name']`` and
        ``followers`` set to ``user['followers_count']``.
    """
    for doc in documents:
        # .get() covers both a missing 'user' key (a plain subscript would
        # raise KeyError) and an explicit None value.
        user = doc.get('user')
        if user is not None:
            yield {'_id': user['name'].encode('utf-8'),
                   'followers': user['followers_count']}

# Hand the mapper generator to pymongo_hadoop's BSONMapper, then log
# completion on stderr. sys.stderr.write() produces the same output as the
# Python-2-only `print >>` chevron and also works on Python 3.
BSONMapper(mapper)
sys.stderr.write("Done Mapping!\n")
#!/usr/bin/env python
# encoding: utf-8

import sys
sys.path.append('.')
from pymongo_hadoop import BSONReducer

def reducer(key, values):
    """Return the largest ``followers`` value among ``values`` for ``key``.

    Args:
        key: text string the values were grouped under.
        values: iterable of dicts, each with a ``followers`` entry.

    Returns:
        dict with ``_id`` set to the UTF-8 encoded key and ``count`` set to
        the maximum ``followers`` value seen (0 when ``values`` is empty or
        no value exceeds 0).
    """
    encoded_key = key.encode('utf-8')
    # Progress goes to stderr; write() works on both Python 2 and Python 3.
    sys.stderr.write("Processing key %s\n" % encoded_key)
    _count = 0
    for v in values:
        if _count < v['followers']:
            _count = v['followers']
    # Return only after the loop has seen every value; returning from inside
    # the loop would report just the first value.
    return {"_id": encoded_key, "count": _count}
# Hand the reducer to pymongo_hadoop's BSONReducer, then log completion on
# stderr. sys.stderr.write() produces the same output as the Python-2-only
# `print >>` chevron and also works on Python 3.
BSONReducer(reducer)
sys.stderr.write("Done Reducing!\n")
# Run the streaming job with mapper.py / reducer.py, reading twitter.tweets
# and writing twitter.top_user. The -files list is comma-separated with no
# spaces, so it reaches Hadoop as a single argument.
hadoop jar /usr/lib/hadoop/lib/mongo-hadoop-streaming-assembly-1.1.0-SNAPSHOT.jar \
    -files mapper.py,reducer.py \
    -inputURI mongodb://localhost:27017/twitter.tweets \
    -outputURI mongodb://localhost:27017/twitter.top_user \
    -mapper mapper.py \
    -reducer reducer.py
// Plain query: order all tweets by the embedded user's follower count, highest first, and keep one.
db.tweets.find({}).sort({ "user.followers_count": -1 }).limit(1)
db.tweets.mapReduce()

db.tweets.group()

db.tweets.aggregate()

MongoDB-Hadoop Adapter

db.tweets.find()
db.tweets.mapReduce()

db.tweets.group()

db.tweets.aggregate()

MongoDB-Hadoop Adapter

db.tweets.find()
db.tweets.mapReduce()

db.tweets.group()

db.tweets.aggregate()

MongoDB-Hadoop Adapter

db.tweets.find()
db.tweets.mapReduce()

db.tweets.group()

db.tweets.aggregate()

MongoDB-Hadoop Adapter

db.tweets.find()



db.tweets.mapReduce()

db.tweets.group()

db.tweets.aggregate()

MongoDB-Hadoop Adapter

db.tweets.find()

    
    



















Map/Confused? A practical approach to Map/Reduce with MongoDB