MongoDB With Style

with
style
gabriele lana
gabriele.lana@cleancode.it
twitter: @gabrielelana
http://joind.in/2943

Style

query &
design

scale

mongo console

$ ~/Work/opt/mongodb-1.6.5/bin/mongod
--dbpath=~/Work/src/nosqlday/db/mongodb.01
--logpath=~/Work/src/nosqlday/log/mongodb.01
--fork --port 30001

$ ~/Work/opt/mongodb-1.6.5/bin/mongo localhost:30001
MongoDB shell version: 1.6.5
connecting to: localhost:30001/test

> use nosqlday
switched to db nosqlday

> db.getCollectionNames()
[ "system.indexes", "users" ]

> db.users.find({ "name": "Gabriele" })
{ "_id" : ObjectId("4d8706767bb037a8a8f98db2"), "name" : "Gabriele",
"surname" : "Lana", "job" : "softwarecraftsman" }

> exit
bye

ruby driver

require "mongo"

# Connect to the mongod listening on localhost:30001 and select the
# "nosqlday" database.
db = Mongo::Connection.new("localhost", 30001).db("nosqlday")

# Print the name of every collection, one per line, indented with a tab.
# ("\t" is the tab escape; a bare "t" would print a literal letter t.)
puts "Collections:"
db.collections.each do |collection|
  puts "\t#{collection.name}"
end

# Print the _id of every user whose name is "Gabriele", indented with a tab.
puts "Gabriele:"
db["users"].find(:name => "Gabriele").each do |user|
  puts "\t#{user["_id"]}"
end

# Close the underlying connection once the script is done.
db.connection.close

ruby driver

require "mongo"

db = Mongo::Connection.new("localhost", 30001).db("nosqlday")

puts "Collections:"
db.collections.each do |collection|
puts "t#{collection.name}"
$ ruby src/connect.rb
Collections:
end users
system.indexes
Gabriele:
puts "Gabriele:" 4d8706767bb037a8a8f98db2
end

db.connection.close

Style
know your
driver

mongo

smart driver

document object mapper

puts "Gabriele:"
end

puts "Gabriele:"
db["users"].select{|user| user["name"] == "Gabriele"}.each do |user|
end

mongo

smart driver

puts "Gabriele:"
end

puts "Gabriele:"
$ ruby src/find_vs_select.rb
db["users"].select{|user| user["name"] == "Gabriele"}.each do |user|
Gabriele:
puts "t#{user["_id"]}" 4d8706767bb037a8a8f98db2

Gabriele:
end 4d8706767bb037a8a8f98db2

mongo

smart driver

Style
incremental
design
based on
application
behavior

the best design is
the one where needed
data can be easily
extracted
the way you need
to query your data
should influence
your design

Style
incremental
design
based on
application
monitoring

monitoring and adapting is better
than doing it right the first time
...actually the first time
is the worst time :-)

monitoring & adapting

> db.setProfilingLevel(1, 5)

{ "was" : 1, "slowms" : 100, "ok" : 1 }

// after product usage find problematic queries

> db.system.profile.find().sort({millis:-1})

{ "ts": "Mon Mar 21 2011 14:30:56 GMT+0100 (CET)",
"info": "
query pomodorist.pomodori
reslen:202
nscanned:26950
query:
{ $query: { task_id: ObjectId('4d6f1d3931f2386e9c089796') }}
nreturned:1
",
"millis":17
}


> db.pomodori.find({
$query: { task_id: ObjectId('4d6f1d3931f2386e9c089796') },
$explain: true
})

{ "cursor": "BasicCursor",
"nscanned": 26950,
"nscannedObjects": 26950,
"n": 1,
"millis": 17,
"indexBounds": { },
"allPlans": [
{ "cursor" : "BasicCursor", "indexBounds" : { } }
]
}


> db.pomodori.ensureIndex({"task_id": 1})

> db.pomodori.find({
$query: { task_id: ObjectId('4d6f1d3931f2386e9c089796') },
$explain: true
})

{ "cursor": "BtreeCursor task_id_1",
"nscanned": 1,
"nscannedObjects": 1,
"n": 1,
"millis": 0,
"indexBounds": {
"task_id": [
[
ObjectId("4d6f1d3931f2386e9c089796"),
ObjectId("4d6f1d3931f2386e9c089796")
]
]}, "allPlans": [...]
}

query &
design
use $in
operator
for batch
query

retrieve all objects with $in

users = [
{:name => "Gabriele", :surname => "Lana", :job => "softwarecraftsman"},
{:name => "Federico", :surname => "Galassi", :job => "softwarecraftsman"},
{:name => "Giordano", :surname => "Scalzo", :job => "softwarecraftsman"}
]

ids = users.map{|user| db["users"].insert(user)}

puts ids.map{|id| db["users"].find_one(:_id => id)}


users = [
]

$ ruby src/find_by_all_ids.rb
{"_id"=>BSON::ObjectId('4d87605731f23824a0000001'), ...}



users = [
]

ids = db["users"].insert(users)

puts db["users"].find(:_id => {:$in => ids}).all


users = [
]

$ ruby src/find_by_all_ids.rb
ids = db["users"].insert(users)

puts db["users"].find(:_id => {:$in => ids}).all

query &
design use
conventions to
build smart
object
identifiers

conventions are fun to play with

> db.user_scores.find({}, {"_id": 1})

{ "_id" : "4d873ce631f238241d00000d-day-20091106" }
{ "_id" : "4d873ce631f238241d00000d-week-200944" }
{ "_id" : "4d873ce631f238241d00000d-month-200911" }
{ "_id" : "4d873ce631f238241d00000d-year-2009" }
{ "_id" : "4d873ce631f238241d00000d-user" }
{ "_id" : "4d873ce631f238241d00000d-day-20091106-advertising" }
{ "_id" : "4d873ce631f238241d00000d-week-200944-advertising" }
{ "_id" : "4d873ce631f238241d00000d-day-20091106-art" }
{ "_id" : "4d873ce631f238241d00000d-week-200944-art" }
{ "_id" : "4d873ce631f238241d00000d-day-20091106-artist" }
{ "_id" : "4d873ce631f238241d00000d-week-200944-artist" }
{ "_id" : "4d873ce631f238241d00000d-day-20091106-information" }


> db.user_scores.findOne(
{"_id": "4d873ce631f238241d00000d-‐day-‐20091106"}
)

{
"_id" : "4d873ce631f238241d00000d-‐day-‐20091106",
"pomodori" : 15,
"pomodori_squashed" : 3,
"breaks" : 7,
"tasks_created" : 8,
"tasks_done" : 6,
"estimation_accuracy" : 0,
"seconds_of_focused_time" : 22500,
"seconds_of_wasted_time" : 1999,
"seconds_of_breaks" : 8820
}

(user scores in day per tag)

> db.user_scores.find(
{"_id": /^4d873ce631f238241d00000d-‐day-‐20091106-‐/}, {"_id": 1}
)

{ "_id" : "4d873ce631f238241d00000d-‐day-‐20091106-‐advertising" }
{ "_id" : "4d873ce631f238241d00000d-‐day-‐20091106-‐art" }
{ "_id" : "4d873ce631f238241d00000d-‐day-‐20091106-‐artist" }
{ "_id" : "4d873ce631f238241d00000d-‐day-‐20091106-‐blogging" }
{ "_id" : "4d873ce631f238241d00000d-‐day-‐20091106-‐culture" }
{ "_id" : "4d873ce631f238241d00000d-‐day-‐20091106-‐html" }
{ "_id" : "4d873ce631f238241d00000d-‐day-‐20091106-‐illustration" }
{ "_id" : "4d873ce631f238241d00000d-‐day-‐20091106-‐information" }
{ "_id" : "4d873ce631f238241d00000d-‐day-‐20091106-‐inspiration" }
{ "_id" : "4d873ce631f238241d00000d-‐day-‐20091106-‐marketing" }
{ "_id" : "4d873ce631f238241d00000d-‐day-‐20091106-‐movies" }
{ "_id" : "4d873ce631f238241d00000d-‐day-‐20091106-‐resources" }
{ "_id" : "4d873ce631f238241d00000d-‐day-‐20091106-‐technology" }
{ "_id" : "4d873ce631f238241d00000d-‐day-‐20091106-‐tool" }
{ "_id" : "4d873ce631f238241d00000d-‐day-‐20091106-‐tutorials" }

(list of tags per day)
{"_id": /^4d873ce631f238241d00000d-‐day-‐20091106-‐/}, {"_id": 1}

).map(function(document) {
return document._id.replace(
"4d873ce631f238241d00000d-‐day-‐20091106-‐", ""
)
})

[
"advertising",
"art",
"artist",
"blogging",
"culture",
"html",
"illustration",
"information",
...
]

(anchored regexp uses indexes)
{"_id": /^4d873ce631f238241d00000d-‐day-‐20091106-‐/}, {"_id": 1}
).explain()

{
"cursor" : "BtreeCursor _id_ multi",
"nscanned" : 15,
"nscannedObjects" : 15,
"n" : 15,
"millis" : 0,
"indexBounds" : {
"_id" : [
[
"4d873ce631f238241d00000d-‐day-‐20091106-‐",
"4d873ce631f238241d00000d-‐day-‐20091106."
],
[
/^4d873ce631f238241d00000d-‐day-‐20091106-‐/,
/^4d873ce631f238241d00000d-‐day-‐20091106-‐/
]
]

(a non-anchored regexp does not use indexes efficiently)
{"_id": /4d873ce631f238241d00000d-‐day-‐20091106-‐/}, {"_id": 1}
).explain()

{
"nscanned" : 109349,
"n" : 15,
"millis" : 217,
"indexBounds" : {
"_id" : [
...
]
}
}

query & use “group”
design method to
do small
computations
without
fetching
related
documents

group to compute data in mongo
(inject client side)

# Sum, on the client side, the "pomodori" field of the per-day score
# documents of one user for the listed days.
days = [20091110, 20091111, 20091112]

# One anchored regexp matches the _id of every requested day.
scores_id = %r{^4d87d00931f2380c7700000d-day-(#{days.join("|")})$}

scores = db["user_scores"].find(:_id => scores_id)

# Accumulate the total while walking the cursor once.
pomodori = 0
scores.each { |score| pomodori += score["pomodori"] }

puts "Pomodori in days #{days.join(",")}: #{pomodori}"

(inject client side)

days = [ 20091110, 20091111, 20091112 ]

scores = db["user_scores"].find(:_id => scores_id)

pomodori = scores.inject(0) do |pomodori, scores|
$ ruby src/inject_for_reduce.rb
pomodori + scores["pomodori"]
Pomodori in days 20091110,20091111,20091112: 36
end

puts "Pomodori in days #{days.join(",")}: #{pomodori}"

(group server side)

days = [ 20091110, 20091111, 20091112 ]

result = db["user_scores"].group(
:cond => { :_id => scores_id },
:initial => { :pomodori => 0 },
:reduce => <<-EOF
function(document, result) {
result.pomodori += document.pomodori
}
EOF
)

puts "Pomodori in days #{days.join(",")}: #{result.first["pomodori"]}"

(group server side)

days = [ 20091110, 20091111, 20091112 ]

:cond => { :_id => scores_id },
:initial => { :pomodori => 0 },
:reduce => <<-EOF $ ruby src/group_for_reduce.rb
Pomodori in days 20091110,20091111,20091112: 36
}
EOF
)

puts "Pomodori in days #{days.join(",")}: #{result.first["pomodori"]}"

(ex. sum pomodori by tag “ruby”)

:cond => {
:_id => /^4d87d00931f2380c7700000d-day-\d{8}-ruby$/
},
:initial => { :pomodori => 0, :days => 0 },
:reduce => <<-EOF
result.days += 1
}
EOF
).first

puts "In #{result["days"]} days, #{result["pomodori"]} done for ruby"


:cond => {
:_id => /^4d87d00931f2380c7700000d-day-d{8}-ruby$/
},
:initial => { :pomodori => 0, :days => 0 },
:reduce => <<-EOF
$ ruby src/group_for_ruby_tag.rb
In 43 days, 45 pomodori
result.days += 1
}
EOF
).first

puts "In #{result["days"]} days, #{result["pomodori"]} pomodori"

> db.user_scores.find({
"_id": /^4d87d00931f2380c7700000d-day-\d{8}-ruby$/
}).explain()

{
"nscanned" : 43,
"n" : 43,
"millis" : 3,
"indexBounds" : {
"_id" : [...]
}
}

query &
design create indexes
on arrays to
create local
reverse
indexes in
documents

reverse index in place
(an array could be indexed)

> db.tasks.find({ "tags": { $in: [ "nosqlday" ] } })

{ "_id" : ObjectId("4d7de446175ca8243d000004"),
"tags" : [ "nosqlday" ],
"description" : "#nosqlday keynote",
"is_recurrent" : false,
"estimated" : 0,
"worked_in" : [
"Mon Mar 14 2011 00:00:00 GMT+0100 (CET)",
"Tue Mar 15 2011 00:00:00 GMT+0100 (CET)"
],
"done_at" : "Tue Mar 15 2011 13:05:03 GMT+0100 (CET)",
"todo_at" : null,
"created_at" : "Mon Mar 14 2011 10:47:50 GMT+0100 (CET)",
"updated_at" : "Tue Mar 15 2011 13:05:03 GMT+0100 (CET)",
"keywords": [ "nosqldai", "keynot" ],
"user_id": ObjectId("4d53996c137ce423ff000001"),
"annotations" : [ ]
}

(an array could be indexed)
> db.tasks.getIndexes()
[
{
"name" : "_id_",
"ns" : "app435386.tasks",
"key" : {
"_id" : 1
}
},
{
"name" : "tags_1",
"ns" : "app435386.tasks",
"key" : {
"tags" : 1
},
"unique" : false
},
...
]

(container for deduced data, array)

# Build the order document first, then store it in the "orders" collection.
order = {
  # The same instant at four granularities:
  # year ("2011"), month ("201103"), week ("2011w11") and day ("20110316").
  :placed_at => ["%Y", "%Y%m", "%Yw%U", "%Y%m%d"].map { |format| now.strftime(format) },
  :user_id => user,
  # Only the item ids are kept in the order.
  :items => items_in_order.map { |item| item[:id] },
  # Order total: the sum of the item prices.
  :total => items_in_order.inject(0) { |sum, item| sum + item[:price] }
}

db["orders"].insert(order)

# ...

db["orders"].ensure_index([["placed_at", Mongo::DESCENDING]])

> db.orders.findOne()

{ "_id" : ObjectId("4d88bf1f31f23812de0003fd"),
"placed_at" : [ "2011", "201103", "2011w11", "20110316" ],
"user_id" : ObjectId("4d88bf1f31f23812de0003e9"),
"items" : [
ObjectId("4d88bf1f31f23812de0003da"),
ObjectId("4d88bf1f31f23812de000047"),
ObjectId("4d88bf1f31f23812de000288")
],
"total" : 3502
}

> db.orders.find({ "placed_at": "20110310" }).count()
77

> db.orders.find({ "placed_at": "20110310" }).explain()
{
"cursor" : "BtreeCursor placed_at_-‐1",
"nscanned" : 77,
"n" : 77,
"millis" : 0,
"indexBounds" : {
"placed_at" : [
[
"20110310",
"20110310"
]
]
}
}

(container for deduced data, hash)

:placed_at => [
{ :year => now.strftime("%Y") },
{ :month => now.strftime("%Y%m") },
{ :week => now.strftime("%Y%U") },
{ :day => now.strftime("%Y%m%d") }
],
:user_id => user,
})

# ...




{ "_id" : ObjectId("4d88c31531f23812fe0003ea"),
"placed_at" : [
{ "year" : "2009" },
{ "month" : "200911" },
{ "week" : "200945" },
{ "day" : "20091109" }
],
"user_id" : ObjectId("4d88c31531f23812fe0003e9"),
"items" : [
ObjectId("4d88c31531f23812fe00013f"),
ObjectId("4d88c31531f23812fe000176"),
ObjectId("4d88c31531f23812fe0003e2"),
ObjectId("4d88c31531f23812fe0003d1"),
ObjectId("4d88c31531f23812fe0001c1"),
ObjectId("4d88c31531f23812fe000118"),
ObjectId("4d88c31531f23812fe00031d")
],
"total" : 10149
}


> db.orders.find({ "placed_at.week": "201101" }).count()
331

> db.orders.find({ "placed_at.week": "201101" }).explain()
{
"cursor" : "BasicCursor",
"nscanned" : 22374,
"n" : 331,
"millis" : 23,
"indexBounds" : {

}
}


> db.orders.find({ "placed_at": { "week": "201101" }}).count()
331

> db.orders.find({ "placed_at": { "week": "201101" }}).explain()
{
"nscanned" : 331,
"n" : 331,
"millis" : 0,
"indexBounds" : {
"placed_at" : [
[
{ "week" : "2011w01" },
{ "week" : "2011w01" }
]
]
}
}

query &
design
use dates but
be aware of
some pitfalls

plain dates are good too

:placed_at => now,
:user_id => user,
})

# ...




{
"_id" : ObjectId("4d88d1f931f23813a10003ea"),
"placed_at" : "Mon Nov 09 2009 08:00:00 GMT+0100 (CET)",
"user_id" : ObjectId("4d88d1f931f23813a10003e9"),
"items" : [
ObjectId("4d88d1f931f23813a100016d"),
ObjectId("4d88d1f931f23813a1000346"),
ObjectId("4d88d1f931f23813a10001e7"),
ObjectId("4d88d1f931f23813a10000db"),
ObjectId("4d88d1f931f23813a1000091"),
ObjectId("4d88d1f931f23813a10001c1"),
ObjectId("4d88d1f931f23813a10001d3"),
ObjectId("4d88d1f931f23813a100031b"),
ObjectId("4d88d1f931f23813a1000130")
],
"total" : 5871
}


> db.orders.find({
"placed_at": {
$gte: new Date(2011,2,10),
$lt: new Date(2011,2,11)
}
}).explain()

{
"nscanned" : 53,
"n" : 53,
"millis" : 0,
"indexBounds" : {
"placed_at" : [
[
"Fri Mar 11 2011 00:00:00 GMT+0100 (CET)",
"Thu Mar 10 2011 00:00:00 GMT+0100 (CET)"
]
]
}

plain dates are good too, but...
(total sold on this year’s mondays)

# find all mondays of the year
# find all mondays of the year
now = Time.now.beginning_of_year
current_year = now.year

# advance to the first monday of the year
now += 1.day until now.monday?

# Collect a monday only while it still falls inside the current year.
# Appending *after* advancing (`mondays << now += 7.days while ...`) also
# pushed the first monday of the following year into the list.
mondays = []
while now.year == current_year
  mondays << now
  now += 7.days
end

# find all orders placed on mondays: one $or clause per monday, each
# covering that whole day
query = {
  :$or => mondays.map do |day|
    { :placed_at => {
        :$gte => day.beginning_of_day,
        :$lte => day.end_of_day
      }
    }
  end
}

puts query

(total sold on this year’s mondays)

# find all mondays of the year

mondays = [ now ]
$ ruby src/orders_on_mondays.rb
# find all orders placed on mondays
{:$or=>[
query = { {:placed_at=>{
:$gte=>2011-‐01-‐03 00:00:00 +0100,
{ :placed_at => { :$lte=>2011-‐01-‐03 23:59:59 +0100
}},
{:placed_at=>{
:$gte=>2011-‐01-‐10 00:00:00 +0100,
:$lte=>2011-‐01-‐10 23:59:59 +0100
} }},
} {:placed_at=>{
:$gte=>2011-‐01-‐17 00:00:00 +0100,
end :$lte=>2011-‐01-‐17 23:59:59 +0100
} }},
...
]}
puts query

(it works but it’s too slooow)

db["orders"].find({
{ :placed_at => {
}
}
end
})

(why it’s too slow)
> db.orders.find({
$or: [
{ "placed_at": { $gte: new Date(2011,2,3), $lt: new Date(2011,2,4) } },
{ "placed_at": { $gte: new Date(2011,2,10), $lt: new Date(2011,2,11) } }
]
}).explain()

{
"clauses" : [{
"indexBounds" : {
"placed_at" : [[
"Tue Mar 3 2011 00:00:00 GMT+0100 (CET)",
"Wed Mar 4 2011 00:00:00 GMT+0100 (CET)"
]]}
}, {
"indexBounds" : {
"placed_at" : [[
"Tue Mar 10 2011 00:00:00 GMT+0100 (CET)",
"Wed Mar 11 2011 00:00:00 GMT+0100 (CET)"

with destructured dates
(total sold on mondays this year)

{ "_id" : ObjectId("4d88bf1f31f23812de0003fd"),
"placed_at" : [ "2011", "201103", "2011w11", "20110316" ],
"user_id" : ObjectId("4d88bf1f31f23812de0003e9"),
"items" : [
ObjectId("4d88bf1f31f23812de0003da"),
ObjectId("4d88bf1f31f23812de000288")
],
"total" : 3502
}



mondays = [ now ]

orders = db["orders"].find({
:placed_at => {
:$in => mondays.map {|day| day.strftime("%Y%m%d")}
}
})

puts orders.explain



mondays = [ now ]

orders = db["orders"].find({
$ ruby src/orders_on_mondays.rb
:placed_at => {
{ "cursor"=>"BtreeCursor placed_at_-‐1 multi",
:$in => mondays.map "nscanned"=>744,
{|day| day.strftime("%Y%m%d")}
} "nscannedObjects"=>744,
"n"=>744,
}) "millis"=>1,
"indexBounds"=>{
"placed_at"=>[
puts orders.explain ["20120102", "20120102"], ["20111226", "20111226"],
["20111219", "20111219"], ["20111212", "20111212"],
["20111205", "20111205"], ["20111128", "20111128"],
["20111121", "20111121"], ...
]
}
}

query &
design
full query
power with
$where
operator

pomodori
(find who is ticking)

> db.pomodori.findOne()
{
"_id" : ObjectId("4d8916ed31f2381480000021"),
"duration" : 1500,
"interruptions" : 0,
"after_break_of" : 0,
"started_at" : "Mon Mar 14 2011 08:05:00 GMT+0100 (CET)",
"squashed_at" : "Mon Mar 14 2011 08:07:31 GMT+0100 (CET)",
"in_day" : {
"position" : 1,
"is_last" : false
},
"task_id" : ObjectId("4d8916ec31f2381480000014"),
"user_id" : ObjectId("4d8916ec31f2381480000010"),
"annotations" : [ ]
}

pomodori

# Reference instant: yesterday at 10:00, converted to an integer timestamp.
now = Time.now.yesterday.beginning_of_day + 10.hours
timestamp_of_now = now.to_i

# A pomodoro is ticking at the reference instant when it started before it
# and started_at + duration falls after it.
# The condition has to begin on the same line as `return`: JavaScript's
# automatic semicolon insertion turns a `return` followed by a line break
# into `return;`, so the predicate would always yield undefined.
ticking = db["pomodori"].find(
  :$where => <<-EOF
    var startedAt = this.started_at.getTime()/1000
    return ((startedAt + this.duration) > #{timestamp_of_now}) &&
      (startedAt < #{timestamp_of_now})
  EOF
)

puts ticking.map{|pomodoro| pomodoro["_id"]}

pomodori


:$where => <<-EOF
return $ ruby src/find_who_is_ticking.rb
4d8916ef31f238148000011d
4d8916f231f2381480000271
4d8916f931f23814800004dd
4d8916f931f23814800004e0
EOF
)


pomodori
(find who is ticking for a user)

user_id = BSON::ObjectId.from_string("4d8916ec31f2381480000010")

:user_id => user_id,
:$where => <<-EOF
return
EOF
)


pomodori
(find who is ticking for a user)

user_id = BSON::ObjectId.from_string("4d8916ec31f2381480000010")

:user_id => user_id,
:$where => <<-EOF $ ruby src/find_who_is_ticking_for_an_user.rb
4d8916ef31f238148000011d
return
EOF
)


pomodori
(related to tasks tagged with “maps”)

related_to_maps = db["pomodori"].find(
:$where => <<-EOF
db.tasks.findOne({ "_id": this.task_id }).tags.indexOf("maps") >= 0
EOF
)

puts related_to_maps.map{|pomodoro| pomodoro["_id"]}

pomodori
(related to tasks tagged with “maps”)

:$where => <<-EOF
EOF
)
$ ruby src/related_to_maps.rb
4d8916fa31f2381480000579
4d8916fa31f238148000057b
4d8916fa31f238148000057d
4d8916fa31f2381480000580

pomodori
(don’t be carried away :-))

:$where => <<-EOF
EOF
)

puts related_to_maps.explain
{ "cursor"=>"BasicCursor",
"nscanned"=>461,
"nscannedObjects"=>461,
"n"=>4,
"millis"=>52,
"indexBounds"=>{},
"allPlans"=>[...]
}

pomodori
(related to... a better solution)

related_to_maps = db["pomodori"].find(:task_id => {
:$in => db["tasks"].find(
{:tags => "maps"}, :fields => {:_id => 1}
).map{|task| task["_id"]}
})
4d8916fa31f2381480000579
4d8916fa31f238148000057b
4d8916fa31f238148000057d
4d8916fa31f2381480000580

pomodori
(related to... a better solution)

related_to_maps = db["pomodori"].find(:task_id => {
:$in => db["tasks"].find(
{:tags => "maps"}, :fields => {:_id => 1}
).map{|task| task["_id"]}
})
{ "cursor"=>"BtreeCursor tags_1",
"nscanned"=>3,
"n"=>3,
"millis"=>0,
...
}

{ "cursor"=>"BtreeCursor task_id_1 multi",
"nscanned"=>4,
"n"=>4,
"millis"=>0,
...
}

query &
design
real time
analytics with
increments

keep track of url’s visits
(upsert with custom id)

# The MD5 digest of the url is the _id of its visit counter document.
visit_id = Digest::MD5.hexdigest(url)

# Increment the hit counter; :upsert creates the document when it is missing
# and :safe asks the driver for the outcome of the operation.
selector = { :_id => visit_id }
increment = { :$inc => { :hits => 1 } }
result = db["visits"].update(selector, increment, :upsert => true, :safe => true)

puts "Update: #{result.inspect}"

# Read the counter back to show its current value.
puts db["visits"].find_one(:_id => visit_id)

keep track of url’s visits
(upsert with custom id)

{ :_id => Digest::MD5.hexdigest(url) },
{ :$inc => { :hits => 1 } },
:upsert => true,
:safe => true
)
$ ruby src/realtime_analytics.rb
Update: {
puts "Update: #{result.inspect}"
"err"=>nil,
"updatedExisting"=>false,
"n"=>1,
puts db["visits"].find_one(:_id => Digest::MD5.hexdigest(url))
"ok"=>1.0
}
{"_id"=>"2d86a774beffe90e715a8028c7bd177b", "hits"=>1}

$ ruby src/realtime_analytics.rb
Update: {
"err"=>nil,
"updatedExisting"=>true,
"n"=>1,
"ok"=>1.0
}
{"_id"=>"2d86a774beffe90e715a8028c7bd177b", "hits"=>2}

url’s visits aggregated by time
(upsert with multiple documents)

# Ids of the counters to increment for one visit: per day, per month,
# per year and per user, all prefixed by the digest of the url.
url_digest = Digest::MD5.hexdigest(url)

# Read the clock once so the day, month and year ids always describe the
# same instant, even when the script runs across a date boundary.
today = Time.now
ids = [
  [ url_digest, today.strftime("%Y%m%d") ].join("-"),
  [ url_digest, today.strftime("%Y%m") ].join("-"),
  [ url_digest, today.strftime("%Y") ].join("-"),
  [ url_digest, user_id ].join("-")
]

# "\n" is the newline escape; a bare "n" would print a literal letter n.
puts "Expect to upsert: \n#{ids}"

{ :_id => { :$in => ids } },
{ :$inc => { :hits => 1 } },
:multi => true,
:upsert => true,
:safe => true
)
puts result.inspect
puts db["visits"].all


ids = [
] $ ruby src/realtime_analytics_with_aggregation.rb
Expect to upsert:[
puts "Expect to upsert: "2d86a774beffe90e715a8028c7bd177b-‐20110323",
n#{ids}"
"2d86a774beffe90e715a8028c7bd177b-‐201103",
"2d86a774beffe90e715a8028c7bd177b-‐2011",
"2d86a774beffe90e715a8028c7bd177b-‐4d899fab31f238165c000001"
{ :_id => { :$in => ids } },
]
{ :$inc => { :hits => { "err"=>nil,
1 } },
:multi => true, "updatedExisting"=>false,
"upserted"=>BSON::ObjectId('4d899fabe23bd37e768ae76d'),
:upsert => true, "n"=>1,
:safe => true "ok"=>1.0
}
)
puts result.inspect {"_id"=>BSON::ObjectId('4d899fabe23bd37e768ae76d'), "hits"=>1}
puts db["visits"].all


ids = [
] $ ruby src/realtime_analytics_with_aggregation.rb
Expect to upsert:[
puts "Expect to upsert: "2d86a774beffe90e715a8028c7bd177b-‐20110323",
n#{ids}"
"2d86a774beffe90e715a8028c7bd177b-‐201103",
"2d86a774beffe90e715a8028c7bd177b-‐2011",
"2d86a774beffe90e715a8028c7bd177b-‐4d899fab31f238165c000001"
{ :_id => { :$in => ids } },
]
{ :$inc => { :hits => { "err"=>nil,
1 } },
:multi => true, "updatedExisting"=>false,
"upserted"=>BSON::ObjectId('4d899fabe23bd37e768ae76e'),
:upsert => true, "n"=>1,
:safe => true "ok"=>1.0
}
)
puts result.inspect {"_id"=>BSON::ObjectId('4d899fabe23bd37e768ae76d'), "hits"=>1}
puts db["visits"].all {"_id"=>BSON::ObjectId('4d899fabe23bd37e768ae76e'), "hits"=>1}

(look before you leap)
{ :_id => { :$in => ids } },
{ :$inc => { :hits => 1 } },
:multi => true,
:upsert => true,
:safe => true
)

# The multi-upsert did not touch one document per id: create the missing
# counters by hand and drop the document the upsert created on its own.
unless result["n"] == ids.size
  visits = db["visits"]

  # Ids that already have a counter document.
  updated_ids = visits.find(
    { :_id => { :$in => ids } }, :fields => { :_id => true }
  ).map { |document| document["_id"] }

  # Every id without a counter starts at one hit.
  new_counters = (ids - updated_ids).map { |id| { :_id => id, :hits => 1 } }
  visits.insert(new_counters)

  # Remove the document reported as "upserted", when there is one.
  upserted_id = result["upserted"]
  visits.remove(:_id => upserted_id) if upserted_id
end

{ :_id => { :$in => ids } },
{ :$inc => { :hits => 1 } },
:multi => true,
:upsert => true,
:safe => true
) $ ruby src/realtime_analytics_with_aggregation.rb
{ "err"=>nil,
"updatedExisting"=>false,
"upserted"=>BSON::ObjectId('4d89a5ebe23bd37e768ae76f'),
"n"=>1,
"ok"=>1.0
}

{"_id"=>"<url_digest>-‐20110323", "hits"=>1}
{ :_id => id, :hits {"_id"=>"<url_digest>-‐4d89a43b31f238167a000001", "hits"=>1}
=> 1 }
end)

end

{ :_id => { :$in => ids } },
{ :$inc => { :hits => 1 } },
:multi => true,
:upsert => true,
:safe => true
) $ ruby src/realtime_analytics_with_aggregation.rb
{ "err"=>nil,
"updatedExisting"=>true,
"n"=>3,
"ok"=>1.0
}
{"_id"=>"<url_digest>-‐4d89a43b31f238167a000001", "hits"=>1}
{"_id"=>"<url_digest>-‐4d89a44231f238167e000001", "hits"=>1}
{ :_id => id, :hits => 1 }
end)

end

query &
design
incremental
map/reduce

map/reduce hits per day
(we have raw events)

> db.visit_events.findOne()
{
"_id" : ObjectId("4d89fc6531f2381d2c00000b"),
"url" : "8aa8b68e0b849f70df6dbb3031c6182b",
"user_id" : ObjectId("4d89fc6531f2381d2c000005"),
"at" : "Thu Jan 13 2011 08:00:06 GMT+0100 (CET)"
}

(generate data WITH something like)

# Inserts +visits+ documents into db["visit_events"].
#
# Before each insert +now+ is advanced by a random element of BETWEEN_VISITS
# (converted with #seconds); the document stores the MD5 digest of a random
# URLS element, the :id of a random USERS element and the advanced time.
#
# visits - number of events to insert
# db     - object returning the "visit_events" collection through #[]
# now    - starting time; only the local copy is advanced
#
# Returns whatever visits.times returns.
def generate_events(visits, db, now)
  visits.times do
    gap = BETWEEN_VISITS.sample
    now += gap.seconds

    url_digest = Digest::MD5.hexdigest(URLS.sample)
    visitor_id = USERS.sample[:id]

    db["visit_events"].insert(:url => url_digest, :user_id => visitor_id, :at => now)
  end
end

generate_events(10_000, db, now)

(simple map/reduce)
MAP = <<-EOF
function() {
emit([ this.url, this.at.format("Ymd") ].join("-"), { "hits": 1 })
}
EOF

REDUCE = <<-EOF
function(key, values) {
var hits = 0
for(var index in values) hits += values[index]["hits"]
return { "hits": hits }
}
EOF

result = db["visit_events"].map_reduce(
MAP, REDUCE, :out => "visits", :raw => true, :verbose => true
)

puts result.inspect

(Date.prototype.format doesn’t exist)
MAP = <<-EOF
function() {
}
EOF

REDUCE = <<-EOF
var hits = 0
}
EOF

MAP, REDUCE, :out => "visits", :raw => true, :verbose => true
)

puts result.inspect

(implement format in place)

MAP = <<-EOF
function() {
Date.prototype.format = function(format) {
...
}
}
EOF

REDUCE = <<-EOF
var hits = 0
}
EOF

(implement format only if needed)

MAP = <<-EOF
function() {
if (!Date.prototype.format) {
Date.prototype.format = function(format) {
...
}
}
}
EOF

REDUCE = <<-EOF
var hits = 0
}
EOF

(implement format once and for all)
db[Mongo::DB::SYSTEM_JS_COLLECTION].save(
:_id => "formatDate",
:value => BSON::Code.new(
<<-EOF
function(date, format) {
if (!Date.prototype.format) {
Date.prototype.format = function(format) { ... }
}
return date.format(format)
}
EOF
)
)

MAP = <<-EOF
function() {
emit([ this.url, formatDate(this.at, "Ymd") ].join("-"), {"hits":1})
}
EOF

(implement format once and for all)
db[Mongo::DB::SYSTEM_JS_COLLECTION].save(
:_id => "load",
:value => BSON::Code.new(
<<-EOF
function(module) {
if ((module === "date") && !Date.prototype.format) {
Date.prototype.format = function(format) { ... }
}
return true
}
EOF
)
)

MAP = <<-EOF
function() {
load("date") && emit(
[ this.url, this.at.format("Ymd") ].join("-"),
{ "hits": 1 }
)
}
EOF

(ok, but could be taking too long)
MAP = <<-EOF
function() {
}
EOF

REDUCE = <<-EOF $ ruby src/incremental_mr.rb
function(key, values)
{
{ "result"=>"visits",
var hits = 0 "timeMillis"=>4197,
"timing"=> {
"mapTime"=>3932,
"emitLoop"=>4170,
} "total"=>4197
EOF },
"counts"=> {
"input"=>10000,
"emit"=>10000,
"output"=>200
MAP, REDUCE, :out => "visits", :raw =>
}, true, :verbose => true
) "ok"=>1.0
}

puts result.inspect

(ok, every time we need to start over)
> db.visits.find()

{ "_id" : "019640ff7952425b1b8695605459d223-‐20110316",
"value" : { "hits" : 47 }
}

{ "_id" : "019640ff7952425b1b8695605459d223-‐20110317",
"value" : { "hits" : 49 }
}

{ "_id" : "019640ff7952425b1b8695605459d223-‐20110318",
"value" : { "hits" : 59 }
}

{ "_id" : "019640ff7952425b1b8695605459d223-‐20110319",
"value" : { "hits" : 37 }
}

(incremental with savepoints)

visit-elements visit
collection collection

map/reduce
on last changed upsert
documents

temporary
collection


db.create_collection("visit_events",
:capped => true,
visit-elements
:max => 50_000,
:size => 5_000_000 collection
)

map/reduce
on last changed
documents

temporary
collection


FINALIZE = <<-EOF
function(key, value) {
db.visits.update( visit
{ "_id": key }, collection
{ $inc: { "hits": value.hits } },
true
)
}
EOF
upsert

temporary
collection


generate_events(number_of_events, db, now)

from = from_last_updated(db)
to = to_last_inserted(db)

MAP, REDUCE,
:finalize => FINALIZE,
:query => { :_id => { :$gt => from, :$lte => to } },
:raw => true,
:verbose => true
)

db["visits"].save(:_id => "savepoint", :at => to)




MAP, REDUCE, $ ruby src/incremental_mr.rb -‐e 10000

:finalize => FINALIZE,{ "result"=>"tmp.mr.mapreduce_1300892393_60",
"timeMillis"=>4333,
"timing"=>{...},
:raw => true, "counts"=>{
:verbose => true "input"=>10000,
"emit"=>10000,
) "output"=>196
},
"ok"=>1.0
db["visits"].save(:_id => "savepoint",
} :at => to)

{ "_id"=>"05241f07d0e3ab6a227e67b33ea0b509-‐20110113",
"hits"=>26
}




MAP, REDUCE, $ ruby src/incremental_mr.rb -‐e 4999

:finalize => FINALIZE,{ "result"=>"tmp.mr.mapreduce_1300892399_61",
"timeMillis"=>2159,
"timing"=>{...},
:raw => true, "counts"=>{
:verbose => true "input"=>4999,
"emit"=>4999,
) "output"=>146
},
"ok"=>1.0
db["visits"].save(:_id => "savepoint",
} :at => to)

{ "_id"=>"05241f07d0e3ab6a227e67b33ea0b509-‐20110113",
"hits"=>64
}


# Returns the document stored in db["visits"] under the _id "savepoint".
# When no such document exists, returns a fallback hash whose "at" entry is
# an ObjectId generated from the time ten years ago.
def savepoint(db)
  stored = db["visits"].find_one(:_id => "savepoint")
  return stored if stored

  { "at" => BSON::ObjectId.from_time(10.years.ago) }
end

# Returns the "at" entry of the savepoint for +db+, used as the lower bound
# of the next incremental run.
#
# The database handle must be forwarded: savepoint takes +db+ as a required
# argument, so calling it bare raises ArgumentError.
def from_last_updated(db)
  savepoint(db)["at"]
end

# Returns the "_id" of the first document of db["visit_events"] when the
# collection is sorted by _id in descending order.
def to_last_inserted(db)
  newest_first = db["visit_events"].find.sort([:_id, Mongo::DESCENDING])
  newest = newest_first.first
  newest["_id"]
end

query &
design
external
map/reduce

use an external mongod process
to execute map/reduce jobs

master slave

replicate data


master slave

map/reduce
on last
replicated
data


master slave

push back results

look at the shell source:
it is more powerful than you think

query &
design documents
embedded
or
linked?

life cycle:
when the root document
is deleted, can it
stand on its own?

if yes if no
embedded linked

are always fetched
together?

if yes if no
embedded linked

are its attributes
used to find the root
document?

if yes if no
embedded linked

is it small?

if yes if no
embedded linked

is it unique, or are
there fewer than
hundreds?

if yes if no
embedded linked

scale
distributed
reads with
replica
sets

slave
replicate

read
master

read/write

slave
read

replicate

+ Durability
+ fault tolerance

scale
(seems stupid but...)

pump
your
hardware

scale
(seems stupid but...)

call 10gen
sure they can
help :-)

Questions?

gabriele lana
gabriele.lana@cleancode.it
twitter: @gabrielelana
http://joind.in/2943

MongoDB With Style

More Related Content

What's hot

Viewers also liked

Similar to MongoDB With Style

More from Gabriele Lana

Recently uploaded

MongoDB With Style