Like this presentation? Why not share!

by Hidekazu Tanaka, Programmer at None on Oct 24, 2011

• 2,611 views

Views

Total Views
2,611
Views on SlideShare
2,588
Embed Views
23

Likes
3
53
0

3 Embeds 23

 http://localhost 9 http://a0.twimg.com 7 http://holidayworking.org 7

Categories

Uploaded via SlideShare as Apple Keynote

Report content

• \n
• \n
• \n
• \n
• \n
• \n
• \n
• \n
• \n
• \n
• \n
• \n
• \n
• \n
• \n
• \n
• \n
• \n
• \n
• \n
• \n

• 1
• 2
• •• • • 3
• •••• 4
• •• • •• 5
• 6
• •• • • ••• 7
• R CMD INSTALL package filename 8
• 9
# Slide 10 -- two small map-reduce jobs run from the R console
# (uses to.dfs / mapreduce / keyval / from.dfs; presumably from the rmr
# package -- confirm against the session's loaded libraries).

# Job 1: map-only job that emits (k, k^2) for each input key.
small.ints <- to.dfs(1:10)
out <- mapreduce(
  input = small.ints,
  map = function(k, v) keyval(k, k^2)
)
res <- from.dfs(out)
# Take the second element of every returned pair and stack them row-wise.
colres <- do.call(rbind, lapply(res, function(pair) pair[[2]]))
t(colres)
# Output shown on the slide:
#      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# [1,]    1    4    9   16   25   36   49   64   81   100

# Job 2: reduce-only job that emits, for every key, the number of values
# grouped under it. Input is 50 binomial draws (size 32, prob 0.4).
groups <- to.dfs(rbinom(n = 50, size = 32, prob = 0.4))
out <- mapreduce(
  input = groups,
  reduce = function(k, vv) keyval(k, length(vv))
)
res <- from.dfs(out)
colres <- do.call(rbind, lapply(res, function(pair) pair[[2]]))
t(colres)
# Output shown on the slide (random input, so values vary between runs):
#      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
# [1,]    2    7    3    1    1   12    2    8    8     1     4     1
# Word-count example run from the R console (slide transcript).
#
# wordcount(input, output = NULL, pattern = " ")
#   input   : passed through to mapreduce() as its input.
#   output  : passed through to mapreduce() as its output (NULL by default).
#   pattern : split pattern handed to strsplit() to break the text into words.
#   Returns whatever mapreduce() returns.
#
# The map step splits the first element of the value `v` on `pattern` and
# emits one (word, 1) pair per piece; the reduce step sums the counts that
# arrive for each word. The same reducer is also requested as a combiner.
wordcount <- function(input, output = NULL, pattern = " ") {
  mapreduce(
    input = input,
    output = output,
    textinputformat = rawtextinputformat,
    map = function(k, v) {
      lapply(
        strsplit(x = v, split = pattern)[[1]],
        function(w) keyval(w, 1)
      )
    },
    reduce = function(k, vv) {
      keyval(k, sum(unlist(vv)))
    },
    # TRUE rather than T: T is an ordinary variable and can be reassigned.
    combine = TRUE
  )
}

out <- wordcount(input = "/user/hidekazu/the_social_network.txt")
results <- from.dfs(out)
# Flatten the list of (word, count) pairs into a two-column data frame.
results <- data.frame(
  word = unlist(lapply(results, "[[", 1)),
  count = unlist(lapply(results, "[[", 2))
)
# Most frequent words first. (The transcript had an escaped "\$" here,
# which is not valid R; the column accessor is a plain "$".)
results <- (results[order(results$count, decreasing = TRUE), ])
head(results)
# Output as it survives in the transcript (garbled, only three rows):
#   word count6313 the 11012381 a 700 1126 and 637
# Likely reading -- row labels uncertain, verify against the slide:
#      word count
# 6313  the  1101
# 2381    a   700
# 1126  and   637
• 12
# Slide 13 -- one k-means pass expressed as a single map-reduce job.
#
# kmeans.iter(points, distfun, ncenters = length(centers), centers = NULL)
#   points   : passed to mapreduce() as its input.
#   distfun  : function(center, point) whose result is compared across
#              centers with which.min().
#   ncenters : number of clusters; only used when `centers` is NULL.
#   centers  : list of current centers, or NULL for the initial pass.
#   Returns the result of from.dfs() applied to the job's output.
kmeans.iter <- function(points, distfun, ncenters = length(centers), centers = NULL) {
  if (is.null(centers)) {
    # No centers yet: key every point by a random integer from 1:ncenters.
    assign_cluster <- function(k, v) keyval(sample(1:ncenters, 1), v)
  } else {
    # Key every point by the center for which distfun() is smallest.
    assign_cluster <- function(k, v) {
      dists <- lapply(centers, function(ctr) distfun(ctr, v))
      keyval(centers[[which.min(dists)]], v)
    }
  }

  # Stack the values of one group row-wise and emit their column means
  # under a NULL key.
  average_cluster <- function(k, vv) {
    stacked <- do.call(rbind, vv)
    keyval(NULL, apply(stacked, 2, mean))
  }

  job <- mapreduce(
    input = points,
    map = assign_cluster,
    reduce = average_cluster
  )
  from.dfs(job)
}
# Slide 14 -- k-means driver built on kmeans.iter(), plus a sample run.
#
# kmeans(points, ncenters, iterations = 10, distfun = <Frobenius norm>)
#   points     : passed to kmeans.iter() as its input.
#   ncenters   : number of clusters for the initial pass.
#   iterations : number of refinement passes after the initial one.
#   distfun    : function(a, b) giving the distance between two points;
#                defaults to the Frobenius norm of a - b.
#   Returns the value of the last kmeans.iter() call.
#
# NOTE(review): this definition shares its name with stats::kmeans and will
# mask it in the session where it is defined -- confirm that is intended.
kmeans <- function(points, ncenters, iterations = 10,
                   distfun = function(a, b) {
                     # norm() needs a character `type`; the unquoted F seen
                     # in the transcript is the logical FALSE and fails.
                     norm(as.matrix(a - b), type = "F")
                   }) {
  # Initial pass: no centers yet, so kmeans.iter() uses `ncenters`.
  newCenters <- kmeans.iter(points, distfun = distfun, ncenters = ncenters)
  # seq_len() so that iterations = 0 performs no refinement pass
  # (1:0 would iterate twice).
  for (i in seq_len(iterations)) {
    # Unlist each value of the previous result to form the new centers.
    newCenters <- lapply(values(newCenters), unlist)
    newCenters <- kmeans.iter(points, distfun, centers = newCenters)
  }
  newCenters
}

# Sample data: 10000 keyed 2-d points, coordinates drawn around i %% 3 and
# i %% 4 with standard deviation 0.01.
clustdata <- lapply(1:10000, function(i) {
  keyval(i, c(rnorm(1, mean = i %% 3, sd = 0.01),
              rnorm(1, mean = i %% 4, sd = 0.01)))
})
to.dfs(clustdata, "/tmp/clustdata")
kmeans("/tmp/clustdata", 12)
• 15
• 15
• 16
# Slide 17 -- write a fitted model to a DFS file and read it back
# (uses hdfs.file / hdfs.write / hdfs.read / hdfs.close; presumably from
# the rhdfs package -- confirm against the session's loaded libraries).

# Fit a 3-cluster model on the first four columns of iris.
model <- kmeans(iris[, 1:4], 3, nstart = 10)

# Write the model object to a file opened in "w" mode.
modelfilename <- "my_smart_unique_name"
modelfile <- hdfs.file(modelfilename, "w")
hdfs.write(model, modelfile)
# [1] TRUE
hdfs.close(modelfile)
# [1] TRUE

# Reopen the same file in "r" mode and read the raw content back.
modelfile <- hdfs.file(modelfilename, "r")
m <- hdfs.read(modelfile)
# Warning message:
# In function (h) : Closed unused DFS stream: my_smart_unique_name

# Rebuild the R object from what was read, then close and print it.
model <- unserialize(m)
hdfs.close(modelfile)
# [1] TRUE
model
• 18
• • • 19
• • 20
• • • • •• 21