Mongodb Aggregation Pipeline

Zahid Mian
Part of the Brown-bag Series

Basic Aggregate functions available
Count, Distinct, Group
MongoDB doesn’t support SQL syntax
Aggregation requires building of “pipeline”
Essentially, one step/stage at a time, e.g.:
Step 1: Filter
Step 2: Projection
Step 3: Group

http://docs.mongodb.org/getting-started/shell/import-data/

db.restaurants.count();
> db.restaurants.distinct("borough");
[
"Brooklyn",
"Bronx",
"Manhattan",
"Queens",
"Staten Island",
"Missing"
]

> db.restaurants.group( {
... key: { borough: 1 },
... cond: { cuisine: "Bakery"},
... reduce: function(cur, result) { result.count += 1 },
... initial: { count: 0 }
... } );
[
{
"borough" : "Bronx",
"count" : 71
},
{
"borough" : "Manhattan",
"count" : 221
},
{
"borough" : "Brooklyn",
"count" : 173
},
{
"borough" : "Queens",
"count" : 204
},
{
"borough" : "Staten Island",
"count" : 20
},
{
"borough" : "Missing",
"count" : 2
}
]
>
key is equivalent to the group by clause
cond is equivalent to the where clause
reduce function is called for each document in the
collection that passes the condition
reduce function has two parameters: cur and result. cur
stores the current document and result stores the result so
far for that group
In this case result.count simply adds 1 for each document
initial sets the initial value for each group result

> db.restaurants.count();
25359
> db.restaurants.aggregate([{$group:{_id:'$cuisine', total: {$sum:1}}}]);
{ "_id" : "Chilean", "total" : 1 }
{ "_id" : "Californian", "total" : 1 }
{ "_id" : "Creole/Cajun", "total" : 1 }
{ "_id" : "Hawaiian", "total" : 3 }
{ "_id" : "Nuts/Confectionary", "total" : 6 }
{ "_id" : "Chinese/Japanese", "total" : 59 }
{ "_id" : "Soups", "total" : 4 }
{ "_id" : "Bagels/Pretzels", "total" : 168 }
{ "_id" : "Polynesian", "total" : 1 }
{ "_id" : "Delicatessen", "total" : 321 }
{ "_id" : "Eastern European", "total" : 65 }
{ "_id" : "Scandinavian", "total" : 7 }
{ "_id" : "Afghan", "total" : 14 }
{ "_id" : "Iranian", "total" : 2 }
{ "_id" : "Fruits/Vegetables", "total" : 7 }
{ "_id" : "German", "total" : 31 }
{ "_id" : "Creole", "total" : 24 }
{ "_id" : "Steak", "total" : 86 }
{ "_id" : "Czech", "total" : 6 }
{ "_id" : "Peruvian", "total" : 68 }
Type "it" for more

db.restaurants.aggregate(
[ // bracket indicates an array
{ // first "step" or stage
$group:{ // aggregate operator
_id:'$cuisine', // group by cuisine property
total: {$sum:1} // sum or count each “row”
}
}
]
);

> db.restaurants.aggregate(
... [
... {$group:{_id:'$cuisine', total: {$sum:1}}},
… {$sort: {total:-1}}
... ]
... );
{ "_id" : "American ", "total" : 6183 }
{ "_id" : "Chinese", "total" : 2418 }
{ "_id" : "Café/Coffee/Tea", "total" : 1214 }
{ "_id" : "Pizza", "total" : 1163 }
{ "_id" : "Italian", "total" : 1069 }
{ "_id" : "Other", "total" : 1011 }
{ "_id" : "Latin (Cuban, Dominican, Puerto Rican, South & Central American)", "total" : 850 }
{ "_id" : "Japanese", "total" : 760 }
{ "_id" : "Mexican", "total" : 754 }
{ "_id" : "Bakery", "total" : 691 }
{ "_id" : "Caribbean", "total" : 657 }
{ "_id" : "Spanish", "total" : 637 }
{ "_id" : "Donuts", "total" : 479 }
{ "_id" : "Pizza/Italian", "total" : 468 }
{ "_id" : "Sandwiches", "total" : 459 }
{ "_id" : "Hamburgers", "total" : 433 }
{ "_id" : "Chicken", "total" : 410 }
{ "_id" : "Ice Cream, Gelato,Yogurt, Ices", "total" : 348 }
{ "_id" : "French", "total" : 344 }
{ "_id" : "Delicatessen", "total" : 321 }
Type "it" for more

}
},
{ // second "step" or stage
$sort: { // sort operator
total:-1 // sort on total; -1 indicates DESC
}
}
]
);

... [
... {$match : {borough: "Bronx"}},
... {$sort: {total:-1}}
... ]
... );
{ "_id" : "Chinese", "total" : 323 }
{ "_id" : "Pizza", "total" : 197 }
{ "_id" : "Latin (Cuban, Dominican, Puerto Rican, South & Central American)", "total" : 187 }
{ "_id" : "Spanish", "total" : 127 }
{ "_id" : "Caribbean", "total" : 110 }
{ "_id" : "Chicken", "total" : 108 }
{ "_id" : "Mexican", "total" : 89 }
{ "_id" : "Other", "total" : 86 }
{ "_id" : "Hamburgers", "total" : 78 }
{ "_id" : "Bakery", "total" : 71 }
{ "_id" : "Donuts", "total" : 68 }
{ "_id" : "Pizza/Italian", "total" : 53 }
{ "_id" : "Italian", "total" : 52 }
{ "_id" : "Sandwiches", "total" : 49 }
{ "_id" : "Café/Coffee/Tea", "total" : 45 }
{ "_id" : "Juice, Smoothies, Fruit Salads", "total" : 35 }
{ "_id" : "African", "total" : 31 }
{ "_id" : "Ice Cream, Gelato,Yogurt, Ices", "total" : 27 }
{ "_id" : "Seafood", "total" : 26 }
Type "it" for more

$match : { // match operator
borough: "Bronx" // where borough = "Bronx"
}
},
{ // second "step" or stage
}
},
{ // third "step" or stage
$sort: {
total:-1 // sort on total; -1 indicates DESC
}
}
]
);

$sum
$avg
$first
$last
$max
$min
$push
$addToSet: similar to $push, but adds unique
values

Returns an array of all values that result from applying an expression to each document in a group
... [
... {
... $group:
... {
... _id: { cuisine: "$cuisine" },
... restaurantByStreet: { $push: { name: "$name" } }
... }
... },
... {$limit: 4},
... {$skip: 3}
... ]
... ).pretty();
{
"_id" : {
"cuisine" : "Hawaiian"
},
"restaurantByStreet" : [
{
"name" : "Makana"
},
{
"name" : "General Assembly"
},
{
"name" : "Onomea"
}
]
}
>

Sort by borough ASC, cuisine DESC
... [
... {$group:{_id:{borough: '$borough', cuisine:'$cuisine' }, total: {$sum:1}}},
... {$sort: {"_id.borough":1, "_id.cuisine":-1}}, // use dot notation
... {$limit: 5 }
... ]
... );
{ "_id" : { "borough" : "Bronx", "cuisine" : "Thai" }, "total" : 2 }
{ "_id" : { "borough" : "Bronx", "cuisine" : "Tex-Mex" }, "total" : 11 }
{ "_id" : { "borough" : "Bronx", "cuisine" : "Steak" }, "total" : 4 }
{ "_id" : { "borough" : "Bronx", "cuisine" : "Spanish" }, "total" : 127 }
{ "_id" : { "borough" : "Bronx", "cuisine" : "Soups & Sandwiches" }, "total" : 1 }
>

Controls which values are output
... [
... {$limit:1},
... {$project: {_id:0, // hide the _id value
… restaurant_id:1, // show restaurant_id
… "restaurant_name":"$name", // rename/alias name to restaurant_name
… "grades.grade":1}} // show grades.grade
... ]).pretty();
{
"grades" : [
{
"grade" : "A" // part of output
},
{
"grade" : "B" // part of output
},
{
},
{
}
],
"restaurant_name" : "Wendy'S", // part of output; renamed
"restaurant_id" : "30112340" // part of output
}
>

Saves the output of a pipeline to a collection
... [
... {$match : {borough: "Bronx"}},
... {$sort: {total:-1}},
... {$limit: 5 },
... {$out: "top5"} // output data to a collection called top5
... ]
... );
> db.top5.find({}); // retrieve all data from top5
{ "_id" : "Chinese", "total" : 323 }
{ "_id" : "Pizza", "total" : 197 }
{ "_id" : "Latin (Cuban, Dominican, Puerto Rican, South & Central
American)", "total" : 187 }
{ "_id" : "Spanish", "total" : 127 }
>

Motivation: How many A grades
did a restaurant get?
> db.restaurants.find({_id: ObjectId("5602b9200a67e499361c05ad")}).pretty();
{
"_id" : ObjectId("5602b9200a67e499361c05ad"),
"address" : {
"street" : "Flatbush Avenue",
"zipcode" : "11225",
"building" : "469",
"coord" : [
-73.961704,
40.662942
]
},
"borough" : "Brooklyn",
"cuisine" : "Hamburgers",
"grades" : [ // this is an array of objects
{
"date" : ISODate("2014-12-30T00:00:00Z"),
"grade" : "A", // A grade
"score" : 8
},
{
"grade" : "B", // B grade
"score" : 23,
"date" : ISODate("2014-07-01T00:00:00Z")
},
{
"score" : 12,
"date" : ISODate("2013-04-30T00:00:00Z"),
"grade" : "A"
},
{
"date" : ISODate("2012-05-08T00:00:00Z"),
"grade" : "A",
"score" : 12
}
],
"name" : "Wendy'S",
"restaurant_id" : "30112340"
}
>
Basic pipeline
Stage 1: unwind grades
Stage 2: match grade of
“A”
Stage 3: group by / sum
Stage 4: project (alias)

There is only one document for that restaurant_id, but since there were 4 elements in
grades, the unwind operator created 4 documents, one for each grade
Notice the result of the following is four documents with the same restaurant_id
... [
... {$unwind: "$grades"}, // unwind the grades array
... {$limit:4}, // limit the output to 4 documents
... {$project: {_id:0, restaurant_id:1, "grades.date":1, "grades.grade":1, "grades.score":1}}
... ]).pretty();
{
"grades" : {
"date" : ISODate("2014-12-30T00:00:00Z"),
"grade" : "A",
"score" : 8
},
"restaurant_id": "30112340"
}
{
"grades" : {
"grade" : "B",
"score" : 23,
"date" : ISODate("2014-07-01T00:00:00Z")
},
}
{
"grades" : {
"score" : 12,
"date" : ISODate("2013-04-30T00:00:00Z"),
"grade" : "A"
},
}
{
"grades" : {
"date" : ISODate("2012-05-08T00:00:00Z"),
"grade" : "A",
"score" : 12
},
}

... [
... {$unwind: "$grades"},
... {$project: {_id:0, restaurant_id:1, name:1, "grades.grade":1}},
... {$match: {"grades.grade":"A"} }, // only count A grades
... {$group: {_id:{restaurant_id:'$restaurant_id', name:'$name' }, total: {$sum:1}}},
... {$sort: {total: -1}},
... {$limit: 5},
… // alias output to get nicer printout
... {$project: {_id:0, "rid":"$_id.restaurant_id", "rname":"$_id.name", total:1}}
... ]).pretty();
{ "total" : 8, "rid" : "41382858", "rname" : "TacoVeloz" }
{ "total" : 7, "rid" : "41587378", "rname" : "Lobster Joint" }
{"total" : 7, "rid" : "41611381", "rname" : "Burger King, Popeye'S Chicken & Biscuits"}
{ "total" : 7, "rid" : "41572121", "rname" : "Luke'S Pizza" }
{ "total" : 7, "rid" : "41578481", "rname" : "Top Hot Bagels & Grill" }
>

Mongodb Aggregation Pipeline

Recommended

Recommended

More Related Content

What's hot

What's hot (20)

Viewers also liked

Viewers also liked (11)

Similar to Mongodb Aggregation Pipeline

Similar to Mongodb Aggregation Pipeline (20)

More from zahid-mian

More from zahid-mian (9)

Recently uploaded

Recently uploaded (20)

Mongodb Aggregation Pipeline