1
2
•
•
    •
    •




        3
•
•

•
•




    4
•
•
    •
    •
•




        5
6
•
•
    •
    •
    •
•
•



        7
R CMD INSTALL 'package filename'
                                   8
9
> small.ints = to.dfs(1:10)
> out = mapreduce(input = small.ints, map = function(k,v) keyval(k, k^2))
> res = from.dfs(out)
> colres <- do.call('rbind', lapply(res,"[[",2))
> t(colres)
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,]    1    4    9   16   25   36   49   64   81   100




> groups = to.dfs(rbinom(32, n = 50, prob = 0.4))
> out = mapreduce(input = groups, reduce = function(k,vv) keyval(k, length(vv)))
> res = from.dfs(out)
> colres <- do.call('rbind', lapply(res,"[[",2))
> t(colres)
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
[1,]    2    7    3    1    1   12    2    8    8     1     4     1



                                                                                   10
>  wordcount = function(input, output = NULL, pattern = " ") {
+    mapreduce(input = input,
+              output = output,
+              textinputformat = rawtextinputformat,
+              map = function(k ,v) {
+                 lapply(strsplit(x = v, split = pattern) [[1]],
+                        function(w) keyval(w,1))
+              },
+              reduce = function(k, vv) {
+                 keyval(k, sum(unlist(vv)))
+              },
+              combine = T)
+  }
>  out <- wordcount(input="/user/hidekazu/the_social_network.txt")
>  results <- from.dfs(out)
>  results <- data.frame(word=unlist(lapply(results,"[[",1)),
                          count=unlist(lapply(results,"[[",2)))
> results <- (results[order(results$count, decreasing=TRUE), ])
> head(results)
      word count
6313   the  1101
2381     a   700
26     and   637
                                                                     11
12
# Run one k-means pass as a single map/reduce job and return the reducer
# output fetched with from.dfs().
#
# Arguments:
#   points   - input handed to mapreduce() unchanged.
#   distfun  - function(center, point) returning a single numeric distance.
#   ncenters - number of clusters for the random initial assignment; only
#              read when `centers` is NULL. Defaults to length(centers).
#   centers  - list of current centers, or NULL for the very first pass.
#
# Map step: when `centers` is NULL every value is keyed by a cluster index
# drawn uniformly from 1..ncenters; otherwise it is keyed by the center for
# which distfun() is smallest.
# Reduce step: row-binds the values that share a key and emits
# keyval(NULL, <column means>).
#
# Stops with an error when neither a non-empty `centers` nor a positive
# `ncenters` is supplied.
kmeans.iter =
  function(points, distfun, ncenters = length(centers), centers = NULL) {
    if (is.null(centers)) {
      # The default ncenters is length(NULL) == 0; 1:0 expands to c(1, 0), so
      # sampling from it would silently assign points to clusters 0 and 1.
      if (!is.numeric(ncenters) || length(ncenters) != 1 ||
          is.na(ncenters) || ncenters < 1) {
        stop("kmeans.iter: supply 'centers' or a positive 'ncenters'",
             call. = FALSE)
      }
      assign.point = function(k, v) keyval(sample.int(ncenters, 1), v)
    } else {
      if (length(centers) < 1) {
        stop("kmeans.iter: 'centers' must contain at least one center",
             call. = FALSE)
      }
      assign.point = function(k, v) {
        # vapply() yields a plain numeric vector and fails loudly if distfun
        # does not return exactly one number per center.
        distances = vapply(centers, function(c) distfun(c, v), numeric(1))
        keyval(centers[[which.min(distances)]], v)
      }
    }
    from.dfs(
      mapreduce(input = points,
                map = assign.point,
                reduce = function(k, vv) {
                  keyval(NULL, colMeans(do.call(rbind, vv)))
                }))
  }




                                                                                   13
# Drive the map/reduce k-means: one random-assignment pass followed by
# `iterations` refinement passes, each delegated to kmeans.iter().
#
# Arguments:
#   points     - input forwarded to kmeans.iter() unchanged.
#   ncenters   - number of clusters for the initial random assignment.
#   iterations - number of refinement passes (default 10); 0 returns the
#                result of the initial pass.
#   distfun    - function(a, b) giving the distance between two points;
#                defaults to the Frobenius norm of a - b.
#
# Returns whatever the last kmeans.iter() call returned.
#
# NOTE: when defined in the global environment this name masks
# stats::kmeans; call stats::kmeans() explicitly if the base routine is needed.
kmeans =
    function(points, ncenters, iterations = 10,
             distfun =
             function(a,b) norm(as.matrix(a-b), type = 'F')) {
      newCenters = kmeans.iter(points, distfun = distfun, ncenters = ncenters)
      # seq_len() so that iterations = 0 skips the loop entirely; 1:0 would
      # expand to c(1, 0) and run two unwanted passes.
      for (i in seq_len(iterations)) {
        # Turn the previous pass's values into a plain list of center vectors.
        newCenters = lapply(values(newCenters), unlist)
        newCenters = kmeans.iter(points, distfun, centers = newCenters)
      }
      newCenters
    }




# Build 10,000 keyed 2-D sample points. The x coordinate is drawn around
# i %% 3 and the y coordinate around i %% 4 (sd 0.01 each), which gives 12
# distinct (i %% 3, i %% 4) mean combinations.
clustdata = lapply(seq_len(10000), function(idx) {
  x.coord = rnorm(1, mean = idx %% 3, sd = 0.01)
  y.coord = rnorm(1, mean = idx %% 4, sd = 0.01)
  keyval(idx, c(x.coord, y.coord))
})

# Write the sample to the DFS, then cluster it into 12 groups.
to.dfs(clustdata, "/tmp/clustdata")
kmeans("/tmp/clustdata", 12)


                                                                                 14
15
15
16
> model <- kmeans(iris[, 1:4], 3, nstart=10)
> modelfilename <- "my_smart_unique_name"
> modelfile <- hdfs.file(modelfilename, "w")
> hdfs.write(model, modelfile)
[1] TRUE
> hdfs.close(modelfile)
[1] TRUE




> modelfile = hdfs.file(modelfilename, "r")
> m <- hdfs.read(modelfile)
Warning message:
In function (h) : Closed unused DFS stream: my_smart_unique_name
> model <- unserialize(m)
> hdfs.close(modelfile)
[1] TRUE
> model


                                                                   17
18
•




    •




        19
•




    20
•
    •
    •
    •
•




        21

RHadoop の紹介

  • 1.
  • 2.
  • 3.
    • • • • 3
  • 4.
  • 5.
    • • • • • 5
  • 6.
  • 7.
    • • • • • • • 7
  • 8.
    R CMD INSTALL 'package filename' 8
  • 9.
  • 10.
    > small.ints =to.dfs(1:10) > out = mapreduce(input = small.ints, map = function(k,v) keyval(k, k^2)) > res = from.dfs(out) > colres <- do.call('rbind', lapply(res,"[[",2)) > t(colres) [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [1,] 1 4 9 16 25 36 49 64 81 100 > groups = to.dfs(rbinom(32, n = 50, prob = 0.4)) > out = mapreduce(input = groups, reduce = function(k,vv) keyval(k, length(vv))) > res = from.dfs(out) > colres <- do.call('rbind', lapply(res,"[[",2)) > t(colres) [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [1,] 2 7 3 1 1 12 2 8 8 1 4 1 10
  • 11.
    > wordcount= function(input, output = NULL, pattern = " ") { + mapreduce(input = input, + output = output, + textinputformat = rawtextinputformat, + map = function(k ,v) { + lapply(strsplit(x = v, split = pattern) [[1]], + function(w) keyval(w,1)) + }, + reduce = function(k, vv) { + keyval(k, sum(unlist(vv))) + }, + combine = T) + } > out <- wordcount(input="/user/hidekazu/the_social_network.txt") > results <- from.dfs(out) > results <- data.frame(word=unlist(lapply(results,"[[",1)), count=unlist(lapply(results,"[[",2))) > results <- (results[order(results$count, decreasing=TRUE), ]) > head(results) word count 6313 the 1101 2381 a 700 11 26 and 637
  • 12.
  • 13.
    kmeans.iter = function(points, distfun, ncenters = length(centers), centers = NULL) { from.dfs( mapreduce(input = points, map = if (is.null(centers)) { function(k,v)keyval(sample(1:ncenters,1),v) } else { function(k,v) { distances = lapply(centers, function(c) distfun(c,v)) keyval(centers[[which.min(distances)]],v) } }, reduce = function(k,vv) keyval(NULL, apply(do.call(rbind, vv), 2, mean))) ) } 13
  • 14.
    kmeans = function(points, ncenters, iterations = 10, distfun = function(a,b) norm(as.matrix(a-b), type = 'F')) { newCenters = kmeans.iter(points, distfun = distfun, ncenters = ncenters) for(i in 1:iterations) { newCenters = lapply(values(newCenters), unlist) newCenters = kmeans.iter(points, distfun, centers=newCenters) } newCenters } clustdata = lapply(1:10000, function(i) keyval(i, c(rnorm(1, mean = i%%3, sd = 0.01), rnorm(1, mean = i%%4, sd = 0.01)))) to.dfs(clustdata, "/tmp/clustdata") kmeans ("/tmp/clustdata", 12) 14
  • 15.
  • 16.
  • 17.
  • 18.
    > model <-kmeans(iris[, 1:4], 3, nstart=10) > modelfilename <- "my_smart_unique_name" > modelfile <- hdfs.file(modelfilename, "w") > hdfs.write(model, modelfile) [1] TRUE > hdfs.close(modelfile) [1] TRUE > modelfile = hdfs.file(modelfilename, "r") > m <- hdfs.read(modelfile) Warning message: In function (h) : Closed unused DFS stream: my_smart_unique_name > model <- unserialize(m) > hdfs.close(modelfile) [1] TRUE > model 17
  • 19.
  • 20.
    • 19
  • 21.
    20
  • 22.
    • • • • 21