RHadoop の紹介

•
•
•
•
•

5

•
•
•
•
•
•
•

7

R CMD INSTALL 'package filename'
8

> small.ints = to.dfs(1:10)  # hand the integers 1..10 to to.dfs(); presumably this stages them on the DFS - confirm against the rmr docs
> out = mapreduce(input = small.ints, map = function(k,v) keyval(k, k^2))  # map-only job: each record emits its key paired with the key squared
> res = from.dfs(out)  # presumably reads the job output back into the session - confirm against the rmr docs
> colres <- do.call('rbind', lapply(res,"[[",2))  # take element 2 of every result entry and stack them as rows
> t(colres)  # transpose so the values print on a single row
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 1 4 9 16 25 36 49 64 81 100

> groups = to.dfs(rbinom(32, n = 50, prob = 0.4))  # n is named, so 32 binds to size: 50 draws from Binomial(size = 32, prob = 0.4)
> out = mapreduce(input = groups, reduce = function(k,vv) keyval(k, length(vv)))  # reduce-only job: for each key, emit how many values were collected under it
> res = from.dfs(out)  # presumably reads the job output back into the session - confirm against the rmr docs
> colres <- do.call('rbind', lapply(res,"[[",2))  # take element 2 of every result entry and stack them as rows
> t(colres)  # transpose so the counts print on a single row
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
[1,] 2 7 3 1 1 12 2 8 8 1 4 1

10

> wordcount = function(input, output = NULL, pattern = " ") {  # count how often each token occurs in `input`
+ mapreduce(input = input,
+ output = output,  # NULL by default; passed straight through to mapreduce()
+ textinputformat = rawtextinputformat,
+ map = function(k ,v) {  # split the value on `pattern` and emit (word, 1) per token
+ lapply(strsplit(x = v, split = pattern) [[1]],
+ function(w) keyval(w,1))
+ },
+ reduce = function(k, vv) {  # add up the counts collected for one word
+ keyval(k, sum(unlist(vv)))
+ },
+ combine = TRUE)  # TRUE spelled out: T is an ordinary variable that can be reassigned
+ }
> out <- wordcount(input="/user/hidekazu/the_social_network.txt")  # run the word count defined above on this input path
> results <- from.dfs(out)  # presumably reads the job output back into the session - confirm against the rmr docs
> results <- data.frame(word=unlist(lapply(results,"[[",1)),  # element 1 of each entry becomes the word column
count=unlist(lapply(results,"[[",2)))  # element 2 of each entry becomes the count column
> results <- (results[order(results$count, decreasing=TRUE), ])  # sort rows by count, largest first
> head(results)  # show the first rows of the sorted table
word count
6313 the 1101
2381 a 700
26 and 637

11

# One k-means pass expressed as a single MapReduce job.
#
# points   : job input, handed to mapreduce() unchanged.
# distfun  : function(center, point) returning the distance between the two.
# ncenters : number of clusters; only read when `centers` is NULL.
# centers  : list of current centers, or NULL to assign points at random.
#
# Returns whatever from.dfs() yields for the job output: for every key, a
# NULL-keyed pair carrying the column-wise mean of that key's points.
kmeans.iter <-
  function(points, distfun, ncenters = length(centers), centers = NULL) {
    # No centers yet: give every point a random cluster id.
    # NOTE(review): if ncenters and centers are both left at their defaults,
    # ncenters is 0 and 1:ncenters is c(1, 0) - confirm callers always pass one.
    assign.random <- function(k, v) keyval(sample(1:ncenters, 1), v)

    # Centers known: key each point by the center closest to it.
    assign.nearest <- function(k, v) {
      dist.to.each <- lapply(centers, function(ctr) distfun(ctr, v))
      closest <- which.min(dist.to.each)
      keyval(centers[[closest]], v)
    }

    mapper <- if (is.null(centers)) assign.random else assign.nearest

    # Stack one cluster's points as rows and average each column.
    average.points <- function(k, vv) {
      stacked <- do.call(rbind, vv)
      keyval(NULL, apply(stacked, 2, mean))
    }

    job <- mapreduce(input = points, map = mapper, reduce = average.points)
    from.dfs(job)
  }

13

# k-means driver: one random-assignment pass followed by `iterations`
# refinement passes, each run through kmeans.iter().
#
# points     : input forwarded to kmeans.iter() on every pass.
# ncenters   : number of clusters for the initial random assignment.
# iterations : number of refinement passes (default 10); 0 returns the
#              result of the initial pass untouched.
# distfun    : function(a, b) giving the distance between two points;
#              defaults to the Frobenius norm of their difference.
#
# Returns the value of the last kmeans.iter() call, unprocessed.
#
# NOTE(review): defined at top level this name masks stats::kmeans.
kmeans <-
  function(points, ncenters, iterations = 10,
           distfun =
             function(a, b) norm(as.matrix(a - b), type = 'F')) {
    # Initial pass: no centers are supplied, so kmeans.iter() takes its
    # random-assignment branch.
    newCenters <- kmeans.iter(points, distfun = distfun, ncenters = ncenters)
    # seq_len() yields an empty loop for iterations = 0, whereas
    # 1:iterations would count 1, 0 and run two extra passes.
    for (i in seq_len(iterations)) {
      # Flatten the previous pass's values into a plain list of centers
      # (values() presumably extracts the value half of each pair - confirm).
      newCenters <- lapply(values(newCenters), unlist)
      newCenters <- kmeans.iter(points, distfun, centers = newCenters)
    }
    newCenters
  }

# Synthetic clustering input: 10000 two-dimensional points, point i drawn
# tightly (sd = 0.01) around (i %% 3, i %% 4) and keyed by i.
clustdata <- lapply(seq_len(10000), function(idx) {
  x <- rnorm(1, mean = idx %% 3, sd = 0.01)
  y <- rnorm(1, mean = idx %% 4, sd = 0.01)
  keyval(idx, c(x, y))
})
# Hand the points to to.dfs() with target /tmp/clustdata, then ask the
# kmeans() defined above for 12 clusters over that path.
to.dfs(clustdata, "/tmp/clustdata")
kmeans("/tmp/clustdata", 12)

14

> model <- kmeans(iris[, 1:4], 3, nstart=10)  # NOTE(review): presumably stats::kmeans - the MapReduce kmeans defined earlier in this file has no nstart argument
> modelfilename <- "my_smart_unique_name"
> modelfile <- hdfs.file(modelfilename, "w")  # open a handle on that name in mode "w"
> hdfs.write(model, modelfile)  # write the fitted model object through the open handle
[1] TRUE
> hdfs.close(modelfile)
[1] TRUE

> modelfile = hdfs.file(modelfilename, "r")  # reopen the same name in mode "r"
> m <- hdfs.read(modelfile)
Warning message:
In function (h) : Closed unused DFS stream: my_smart_unique_name
> model <- unserialize(m)  # the value from hdfs.read() is passed through unserialize() to rebuild the R object
> hdfs.close(modelfile)
[1] TRUE
> model

17

•
•
•
•
•

21

RHadoop の紹介

More Related Content

What's hot

Viewers also liked

Similar to RHadoop の紹介

More from Hidekazu Tanaka

Recently uploaded

RHadoop の紹介

Editor's Notes