RHadoop の紹介

4,320 views

Published on

Published in: Technology, Sports

RHadoop の紹介

  1. 1. 1
  2. 2. 2
  3. 3. •• • • 3
  4. 4. •••• 4
  5. 5. •• • •• 5
  6. 6. 6
  7. 7. •• • • ••• 7
  8. 8. R CMD INSTALL package filename 8
  9. 9. 9
  # Transcript item 10 (slide page 10): two minimal rmr MapReduce jobs.
  # Console prompts are removed; the output shown on the slide is kept as comments.

  # Job 1: only a map function is supplied; it emits k^2 for each key.
  small.ints <- to.dfs(1:10)
  out <- mapreduce(
    input = small.ints,
    map = function(k, v) keyval(k, k^2)
  )
  res <- from.dfs(out)
  # Take the second element of every result entry and bind them row-wise.
  colres <- do.call(rbind, lapply(res, "[[", 2))
  t(colres)
  #      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
  # [1,]    1    4    9   16   25   36   49   64   81   100

  # Job 2: only a reduce function is supplied; it emits the number of values
  # received for each key.
  groups <- to.dfs(rbinom(n = 50, size = 32, prob = 0.4))
  out <- mapreduce(
    input = groups,
    reduce = function(k, vv) keyval(k, length(vv))
  )
  res <- from.dfs(out)
  colres <- do.call(rbind, lapply(res, "[[", 2))
  t(colres)
  #      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
  # [1,]    2    7    3    1    1   12    2    8    8     1     4     1
  # Transcript item 11: word count with rmr.
  # Console prompts are removed; the output shown on the slide is kept as comments.
  #
  # wordcount(input, output = NULL, pattern = " ")
  #   input   - forwarded to mapreduce() as `input` (an HDFS path in the usage
  #             below).
  #   output  - forwarded to mapreduce() as `output`; defaults to NULL.
  #   pattern - `split` argument for strsplit(); defaults to a single space.
  # Returns the value of the mapreduce() call.
  wordcount <- function(input, output = NULL, pattern = " ") {
    mapreduce(
      input = input,
      output = output,
      textinputformat = rawtextinputformat,
      # Split the first element of v on `pattern` and emit (token, 1) per token.
      # NOTE(review): v is presumably one line of raw text - confirm against rmr.
      map = function(k, v) {
        lapply(
          strsplit(x = v, split = pattern)[[1]],
          function(w) keyval(w, 1)
        )
      },
      # Sum all values collected for a key.
      reduce = function(k, vv) {
        keyval(k, sum(unlist(vv)))
      },
      # Spelled out as TRUE: `T` is an ordinary variable and can be reassigned.
      combine = TRUE
    )
  }

  out <- wordcount(input = "/user/hidekazu/the_social_network.txt")
  results <- from.dfs(out)
  # First element of each entry becomes `word`, second becomes `count`.
  results <- data.frame(
    word = unlist(lapply(results, "[[", 1)),
    count = unlist(lapply(results, "[[", 2))
  )
  results <- results[order(results$count, decreasing = TRUE), ]
  head(results)
  # NOTE(review): the slide text ran the rows together; the split below assumes
  # counts are in decreasing order, and only three rows are visible.
  #      word count
  # 6313  the  1101
  # 2381    a   700
  # 1126  and   637
  12. 12. 12
  # Transcript item 13 (slide page 13): one k-means pass expressed as a
  # MapReduce job.
  #
  # kmeans.iter(points, distfun, ncenters = length(centers), centers = NULL)
  #   points   - forwarded to mapreduce() as `input`.
  #   distfun  - function(center, point) whose result is compared with
  #              which.min() to pick the nearest center.
  #   ncenters - number of groups used for the random assignment when `centers`
  #              is NULL; defaults to length(centers).
  #   centers  - list of current centers, or NULL for the initial random pass.
  # Returns from.dfs() applied to the job output.
  kmeans.iter <- function(points, distfun, ncenters = length(centers),
                          centers = NULL) {
    # Map step: choose the key under which each point is emitted.
    assign_point <- if (is.null(centers)) {
      # No centers yet: emit the point under a random key in 1..ncenters.
      function(k, v) keyval(sample(1:ncenters, 1), v)
    } else {
      # Emit the point keyed by the center that minimises distfun.
      function(k, v) {
        dists <- lapply(centers, function(ctr) distfun(ctr, v))
        keyval(centers[[which.min(dists)]], v)
      }
    }

    # Reduce step: stack the values of one key as rows and take column means;
    # the result is emitted with a NULL key.
    average_points <- function(k, vv) {
      keyval(NULL, apply(do.call(rbind, vv), 2, mean))
    }

    job <- mapreduce(
      input = points,
      map = assign_point,
      reduce = average_points
    )
    from.dfs(job)
  }
  # Transcript item 14 (slide page 14): k-means driver built on kmeans.iter(),
  # followed by a small synthetic-data run.
  #
  # NOTE: a global function named `kmeans` masks stats::kmeans for later code.
  #
  # kmeans(points, ncenters, iterations = 10, distfun = <Frobenius norm of a - b>)
  #   points     - forwarded to kmeans.iter() (an HDFS path in the usage below).
  #   ncenters   - number of groups for the initial random assignment.
  #   iterations - number of refinement passes after the initial one.
  #   distfun    - function(a, b) returning the distance between two points.
  # Returns the result of the last kmeans.iter() call.
  kmeans <- function(points, ncenters, iterations = 10,
                     # type must be the string "F" (Frobenius norm); a bare F
                     # is the logical FALSE, which norm() cannot use as a type.
                     distfun = function(a, b) {
                       norm(as.matrix(a - b), type = "F")
                     }) {
    # Initial pass: no centers yet, so kmeans.iter() assigns points at random.
    newCenters <- kmeans.iter(points, distfun = distfun, ncenters = ncenters)
    # seq_len() so that iterations = 0 runs no refinement pass at all
    # (1:0 would run two).
    for (i in seq_len(iterations)) {
      # Flatten each returned value into a plain vector before reusing it as
      # a center.
      newCenters <- lapply(values(newCenters), unlist)
      newCenters <- kmeans.iter(points, distfun, centers = newCenters)
    }
    newCenters
  }

  # 10000 two-dimensional points with low noise (sd = 0.01) around
  # (i %% 3, i %% 4); since 3 and 4 are coprime this gives 12 distinct means.
  clustdata <- lapply(1:10000, function(i) {
    keyval(
      i,
      c(
        rnorm(1, mean = i %% 3, sd = 0.01),
        rnorm(1, mean = i %% 4, sd = 0.01)
      )
    )
  })
  to.dfs(clustdata, "/tmp/clustdata")
  kmeans("/tmp/clustdata", 12)
  15. 15. 15
  16. 16. 15
  17. 17. 16
  # Transcript item 18 (slide page 17): store a fitted model in HDFS with rhdfs
  # and read it back.
  # Console prompts are removed; the output shown on the slide is kept as comments.

  # Fit a 3-cluster model on the four numeric iris columns.
  model <- kmeans(iris[, 1:4], 3, nstart = 10)
  modelfilename <- "my_smart_unique_name"

  # Write the model through a handle opened in "w" mode, then close it.
  modelfile <- hdfs.file(modelfilename, "w")
  hdfs.write(model, modelfile)
  # [1] TRUE
  hdfs.close(modelfile)
  # [1] TRUE

  # Reopen the same name in "r" mode and read the stored content back.
  modelfile <- hdfs.file(modelfilename, "r")
  m <- hdfs.read(modelfile)
  # Warning message:
  # In function (h) : Closed unused DFS stream: my_smart_unique_name

  # The content read back is passed through unserialize() to rebuild the object.
  model <- unserialize(m)
  hdfs.close(modelfile)
  # [1] TRUE
  model
  19. 19. 18
  20. 20. • • 19
  21. 21. • 20
  22. 22. • • • •• 21

×