R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework
 

Presentation Transcript

    • R + Hadoop = Big Data Analytics
    • library(rmr)
      mapreduce(…)
    • lapply(data, function)
      mapreduce(big.data, map = function)
    • Expose MR                                 Hide MR
                                                Hive, Pig
      Rmr, Rhipe, Dumbo, Pydoop, Hadoopy        Cascalog, Rmr, Scalding, Scrunch
      Java, Cascading, C++                      Crunch
    • mapreduce(input, output, map, reduce)
    • x = from.dfs(hdfs.object)
      hdfs.object = to.dfs(x)
      (a round-trip sketch follows the transcript)
    • small.ints = 1:1000
      lapply(small.ints, function(x) x^2)

      small.ints = to.dfs(1:1000)
      mapreduce(input = small.ints,
                map = function(k, v) keyval(v, v^2))

      groups = rbinom(32, n = 50, prob = 0.4)
      tapply(groups, groups, length)

      groups = to.dfs(groups)
      mapreduce(input = groups,
                map = function(k, v) keyval(v, 1),
                reduce = function(k, vv) keyval(k, length(vv)))
    • condition = function(x) x > 10
      out = mapreduce(input = input,
                      map = function(k, v) if (condition(v)) keyval(k, v))
    • # k-means in rmr: each iteration is one mapreduce job
      kmeans = function(points, ncenters, iterations = 10,
                        distfun = function(a, b) norm(as.matrix(a - b), type = "F")) {
        newCenters = kmeans.iter(points, distfun, ncenters = ncenters)
        for (i in 1:iterations) {
          newCenters = kmeans.iter(points, distfun, centers = newCenters)
        }
        newCenters
      }

      kmeans.iter = function(points, distfun, ncenters = dim(centers)[1], centers = NULL) {
        from.dfs(
          mapreduce(
            input = points,
            # first pass: assign points to random centers; later passes: nearest center
            map = if (is.null(centers)) {
              function(k, v) keyval(sample(1:ncenters, 1), v)
            } else {
              function(k, v) {
                distances = apply(centers, 1, function(c) distfun(c, v))
                keyval(centers[which.min(distances), ], v)
              }
            },
            # new center = column-wise mean of the points assigned to it
            reduce = function(k, vv) keyval(NULL, apply(do.call(rbind, vv), 2, mean))),
          to.data.frame = T)
      }
    • #!/usr/bin/python
      import sys
      from math import fabs
      from org.apache.pig.scripting import Pig

      filename = "student.txt"
      k = 4
      tolerance = 0.01
      MAX_SCORE = 4
      MIN_SCORE = 0
      MAX_ITERATION = 100

      # initial centroids: divide the score range evenly
      initial_centroids = ""
      last_centroids = [None] * k
      for i in range(k):
          last_centroids[i] = MIN_SCORE + float(i) / k * (MAX_SCORE - MIN_SCORE)
          initial_centroids = initial_centroids + str(last_centroids[i])
          if i != k - 1:
              initial_centroids = initial_centroids + ":"

      P = Pig.compile("""register udf.jar
          DEFINE find_centroid FindCentroid('$centroids');
          raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
          centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
          grouped = group centroided by centroid;
          result = foreach grouped generate group, AVG(centroided.gpa);
          store result into 'output';
      """)

      converged = False
      iter_num = 0
      while iter_num < MAX_ITERATION:
          Q = P.bind({'centroids': initial_centroids})
          results = Q.runSingle()
    • # (continues the driver loop from the previous slide)
          if not results.isSuccessful():
              raise Exception("Pig job failed")
          iter = results.result("result").iterator()
          centroids = [None] * k
          distance_move = 0
          # get the new centroids and the average distance moved since the last iteration
          for i in range(k):
              tuple = iter.next()
              centroids[i] = float(str(tuple.get(1)))
              distance_move = distance_move + fabs(last_centroids[i] - centroids[i])
          distance_move = distance_move / k
          Pig.fs("rmr output")
          print("iteration " + str(iter_num))
          print("average distance moved: " + str(distance_move))
          if distance_move < tolerance:
              sys.stdout.write("k-means converged at centroids: [")
              sys.stdout.write(",".join(str(v) for v in centroids))
              sys.stdout.write("]\n")
              converged = True
              break
          last_centroids = centroids[:]
          initial_centroids = ""
          for i in range(k):
              initial_centroids = initial_centroids + str(last_centroids[i])
              if i != k - 1:
                  initial_centroids = initial_centroids + ":"
          iter_num += 1

      if not converged:
          print("did not converge after " + str(iter_num) + " iterations")
          sys.stdout.write("last centroids: [")
          sys.stdout.write(",".join(str(v) for v in last_centroids))
          sys.stdout.write("]\n")
    • import java.io.IOException;
      import org.apache.pig.EvalFunc;
      import org.apache.pig.data.Tuple;

      public class FindCentroid extends EvalFunc<Double> {
          double[] centroids;

          public FindCentroid(String initialCentroid) {
              String[] centroidStrings = initialCentroid.split(":");
              centroids = new double[centroidStrings.length];
              for (int i = 0; i < centroidStrings.length; i++)
                  centroids[i] = Double.parseDouble(centroidStrings[i]);
          }

          @Override
          public Double exec(Tuple input) throws IOException {
              double min_distance = Double.MAX_VALUE;
              double closest_centroid = 0;
              for (double centroid : centroids) {
                  double distance = Math.abs(centroid - (Double) input.get(0));
                  if (distance < min_distance) {
                      min_distance = distance;
                      closest_centroid = centroid;
                  }
              }
              return closest_centroid;
          }
      }
    • mapreduce(mapreduce(…
      mapreduce(input = c(input1, input2), …)
      equijoin = function(left.input, right.input, input, output, outer,
                          map.left, map.right, reduce, reduce.all)
      (a composition sketch follows the transcript)
    • out1 = mapreduce(…)
      mapreduce(input = out1, <xyz>)
      mapreduce(input = out1, <abc>)

      abstract.job = function(input, output, …) {
        …
        result = mapreduce(input = input, output = output)
        …
        result
      }
    • input.format, output.format, format
      reduce.on.data.frame, to.data.frame
      local, hadoop backends
      profiling
      (a backend/format sketch follows the transcript)
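
A minimal round-trip sketch for the to.dfs/from.dfs slide above, assuming a working rmr installation with HDFS access; the object names and the sample data are illustrative only:

      library(rmr)
      m = matrix(rnorm(100), ncol = 10)   # a small in-memory R object
      hdfs.m = to.dfs(m)                  # write it to a (temporary) file on the DFS
      kv = from.dfs(hdfs.m)               # read it back into the R session as key-value pairs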
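
A small composition sketch for the mapreduce(mapreduce(… and c(input1, input2) slide, reusing only the rmr calls shown in the transcript; the filter threshold and variable names are illustrative:

      library(rmr)
      squares = mapreduce(input = to.dfs(1:1000),
                          map = function(k, v) keyval(v, v^2))
      # chaining: the first job's output object is the second job's input
      big.squares = mapreduce(input = squares,
                              map = function(k, v) if (v > 500000) keyval(k, v))
      # union: a single job can read several inputs at once
      union.out = mapreduce(input = c(squares, big.squares),
                            map = function(k, v) keyval(k, v))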
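
A hedged sketch of the backend and format switches listed on the last slide. The option and constructor names below follow the later rmr2 interface (rmr.options, make.input.format) and may differ in the rmr release the talk describes; the CSV path is hypothetical:

      library(rmr2)
      rmr.options(backend = "local")            # run map and reduce in-process, no cluster needed
      csv.format = make.input.format("csv", sep = ",")
      counts = mapreduce(input = "/tmp/points.csv",   # hypothetical input path
                         input.format = csv.format,
                         map = function(k, v) keyval(1, nrow(v)))
      rmr.options(backend = "hadoop")           # switch back to the Hadoop backend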