SlideShare a Scribd company logo
1 of 29
Download to read offline
R + Hadoop = Big Data Analytics
6
7
8
9
library(rmr)

mapreduce(…)
lapply(data, function)

mapreduce(big.data, map = function)
Expose MR   Hide MR
                             Hive, Pig




  Rmr, Rhipe,               Cascalog,
Rmr
Dumbo, Pydoop,              Scalding,
    Hadoopy                  Scrunch




       Java,                Cascading,
        C++                  Crunch
mapreduce(input, output, map, reduce)
x = from.dfs(hdfs.object)

hdfs.object = to.dfs(x)
small.ints = 1:1000
lapply(small.ints, function(x) x^2)

small.ints = to.dfs(1:1000)
mapreduce(input = small.ints,
        map = function(k,v) keyval(v, v^2))

groups = rbinom(32, n = 50, prob = 0.4)
tapply(groups, groups, length)

groups = to.dfs(groups)
mapreduce(input = groups,
     map = function(k, v) keyval(v,1),
     reduce = function(k,vv)
               keyval(k, length(vv)))
condition = function(x) x > 10


out = mapreduce(
     input = input,
     map = function(k,v)
          if (condition(v)) keyval(k,v))
kmeans =
 function(points, ncenters, iterations = 10,    distfun = function(a,b) norm(as.matrix(a-b), type = 'F')) {
newCenters = kmeans.iter(points, distfun, ncenters = ncenters) for(i in 1:iterations) {  newCenters =
kmeans.iter(points, distfun, centers = newCenters)} newCenters}




kmeans.iter = function(points, distfun, ncenters = dim(centers)[1], centers = NULL) { from.dfs(
   mapreduce(
    input = points,      map = if (is.null(centers)) {        function(k,v) keyval(sample(1:ncenters,1),v)}           else
{         function(k,v) {             distances = apply(centers, 1, function(c) distfun(c,v))
keyval(centers[which.min(distances),], v)}},      reduce = function(k,vv) keyval(NULL, apply(do.call(rbind, vv), 2,
mean))),    to.data.frame = T)}
#!/usr/bin/python
import sys
from math import fabs
from org.apache.pig.scripting import Pig

filename = "student.txt"
k=4
tolerance = 0.01

MAX_SCORE = 4
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroid, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
   last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
   initial_centroids = initial_centroids + str(last_centroids[i])
   if i!=k-1:
       initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar
            DEFINE find_centroid FindCentroid('$centroids');
            raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
            centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
            grouped = group centroided by centroid;
            result = foreach grouped generate group, AVG(centroided.gpa);
            store result into 'output';
          """)

converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
   Q = P.bind({'centroids':initial_centroids})
   results = Q.runSingle()
if results.isSuccessful() == "FAILED":
      raise "Pig job failed"
  iter = results.result("result").iterator()
  centroids = [None] * k
  distance_move = 0
  # get new centroid of this iteration, caculate the moving distance with last iteration
  for i in range(k):
      tuple = iter.next()
      centroids[i] = float(str(tuple.get(1)))
      distance_move = distance_move + fabs(last_centroids[i]-centroids[i])
  distance_move = distance_move / k;
  Pig.fs("rmr output")
  print("iteration " + str(iter_num))
  print("average distance moved: " + str(distance_move))
  if distance_move<tolerance:
      sys.stdout.write("k-means converged at centroids: [")
      sys.stdout.write(",".join(str(v) for v in centroids))
      sys.stdout.write("]n")
      converged = True
      break
  last_centroids = centroids[:]
  initial_centroids = ""
  for i in range(k):
      initial_centroids = initial_centroids + str(last_centroids[i])
      if i!=k-1:
          initial_centroids = initial_centroids + ":"
  iter_num += 1

if not converged:
    print("not converge after " + str(iter_num) + " iterations")
    sys.stdout.write("last centroids: [")
    sys.stdout.write(",".join(str(v) for v in last_centroids))
    sys.stdout.write("]n")
import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;


public class FindCentroid extends EvalFunc<Double> {
  double[] centroids;
  public FindCentroid(String initialCentroid) {
     String[] centroidStrings = initialCentroid.split(":");
     centroids = new double[centroidStrings.length];
     for (int i=0;i<centroidStrings.length;i++)
        centroids[i] = Double.parseDouble(centroidStrings[i]);
  }
  @Override
  public Double exec(Tuple input) throws IOException {
     double min_distance = Double.MAX_VALUE;
     double closest_centroid = 0;
     for (double centroid : centroids) {
        double distance = Math.abs(centroid - (Double)input.get(0));
        if (distance < min_distance) {
            min_distance = distance;
            closest_centroid = centroid;
        }
     }
     return closest_centroid;
  }

}
mapreduce(mapreduce(…

mapreduce(input = c(input1, input2), …)

equijoin = function(
  left.input, right.input, input,
  output,
  outer,
  map.left, map.right,
  reduce, reduce.all)
out1 = mapreduce(…)
mapreduce(input = out1, <xyz>)
mapreduce(input = out1, <abc>)

abstract.job = function(input, output, …) {
  …
  result = mapreduce(input = input,
              output = output)
  …
  result}
input.format, output.format, format
reduce.on.data.frame, to.data.frame
local, hadoop backends
profiling
R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework
R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework
R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework

More Related Content

What's hot

Data manipulation on r
Data manipulation on rData manipulation on r
Data manipulation on rAbhik Seal
 
How fast ist it really? Benchmarking in practice
How fast ist it really? Benchmarking in practiceHow fast ist it really? Benchmarking in practice
How fast ist it really? Benchmarking in practiceTobias Pfeiffer
 
Stata cheat sheet: data processing
Stata cheat sheet: data processingStata cheat sheet: data processing
Stata cheat sheet: data processingTim Essam
 
Hands on data science with r.pptx
Hands  on data science with r.pptxHands  on data science with r.pptx
Hands on data science with r.pptxNimrita Koul
 
Stata cheat sheet: data transformation
Stata  cheat sheet: data transformationStata  cheat sheet: data transformation
Stata cheat sheet: data transformationTim Essam
 
Stata cheatsheet transformation
Stata cheatsheet transformationStata cheatsheet transformation
Stata cheatsheet transformationLaura Hughes
 
Heaps
HeapsHeaps
HeapsIIUM
 
Stata Programming Cheat Sheet
Stata Programming Cheat SheetStata Programming Cheat Sheet
Stata Programming Cheat SheetLaura Hughes
 
The Death of Final Tagless
The Death of Final TaglessThe Death of Final Tagless
The Death of Final TaglessJohn De Goes
 
Clojure for Data Science
Clojure for Data ScienceClojure for Data Science
Clojure for Data Sciencehenrygarner
 
Groovy vs Boilerplate and Ceremony Code
Groovy vs Boilerplate and Ceremony CodeGroovy vs Boilerplate and Ceremony Code
Groovy vs Boilerplate and Ceremony Codestasimus
 
Cloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forestsCloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forestsDataconomy Media
 
深入浅出Jscex
深入浅出Jscex深入浅出Jscex
深入浅出Jscexjeffz
 
Probabilistic Programming in Scala
Probabilistic Programming in ScalaProbabilistic Programming in Scala
Probabilistic Programming in ScalaBeScala
 
Coffee Scriptでenchant.js
Coffee Scriptでenchant.jsCoffee Scriptでenchant.js
Coffee Scriptでenchant.jsNaoyuki Totani
 

What's hot (20)

Data manipulation on r
Data manipulation on rData manipulation on r
Data manipulation on r
 
How fast ist it really? Benchmarking in practice
How fast ist it really? Benchmarking in practiceHow fast ist it really? Benchmarking in practice
How fast ist it really? Benchmarking in practice
 
Stata cheat sheet: data processing
Stata cheat sheet: data processingStata cheat sheet: data processing
Stata cheat sheet: data processing
 
Hands on data science with r.pptx
Hands  on data science with r.pptxHands  on data science with r.pptx
Hands on data science with r.pptx
 
Stata cheat sheet: data transformation
Stata  cheat sheet: data transformationStata  cheat sheet: data transformation
Stata cheat sheet: data transformation
 
Stata cheatsheet transformation
Stata cheatsheet transformationStata cheatsheet transformation
Stata cheatsheet transformation
 
ScalaMeter 2012
ScalaMeter 2012ScalaMeter 2012
ScalaMeter 2012
 
Heaps
HeapsHeaps
Heaps
 
Hammurabi
HammurabiHammurabi
Hammurabi
 
Stata Programming Cheat Sheet
Stata Programming Cheat SheetStata Programming Cheat Sheet
Stata Programming Cheat Sheet
 
The Death of Final Tagless
The Death of Final TaglessThe Death of Final Tagless
The Death of Final Tagless
 
Academy PRO: ES2015
Academy PRO: ES2015Academy PRO: ES2015
Academy PRO: ES2015
 
Slides
SlidesSlides
Slides
 
Clojure for Data Science
Clojure for Data ScienceClojure for Data Science
Clojure for Data Science
 
Groovy vs Boilerplate and Ceremony Code
Groovy vs Boilerplate and Ceremony CodeGroovy vs Boilerplate and Ceremony Code
Groovy vs Boilerplate and Ceremony Code
 
Cloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forestsCloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forests
 
深入浅出Jscex
深入浅出Jscex深入浅出Jscex
深入浅出Jscex
 
Probabilistic Programming in Scala
Probabilistic Programming in ScalaProbabilistic Programming in Scala
Probabilistic Programming in Scala
 
Rug hogan-10-03-2012
Rug hogan-10-03-2012Rug hogan-10-03-2012
Rug hogan-10-03-2012
 
Coffee Scriptでenchant.js
Coffee Scriptでenchant.jsCoffee Scriptでenchant.js
Coffee Scriptでenchant.js
 

Viewers also liked

Entreprise mobility approach within digital transformation
Entreprise mobility approach within digital transformationEntreprise mobility approach within digital transformation
Entreprise mobility approach within digital transformationmoldovaictsummit2016
 
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...CA Technologies
 
Startup Success = Big Data + Analytics | Cairo innovates 2014
Startup Success = Big Data + Analytics | Cairo innovates 2014Startup Success = Big Data + Analytics | Cairo innovates 2014
Startup Success = Big Data + Analytics | Cairo innovates 2014TA Telecom
 
WSO2Con EU 2016: Understanding the WSO2 API Management Platform
WSO2Con EU 2016: Understanding the WSO2 API Management PlatformWSO2Con EU 2016: Understanding the WSO2 API Management Platform
WSO2Con EU 2016: Understanding the WSO2 API Management PlatformWSO2
 
Introduction to Big Data Analytics on Apache Hadoop
Introduction to Big Data Analytics on Apache HadoopIntroduction to Big Data Analytics on Apache Hadoop
Introduction to Big Data Analytics on Apache HadoopAvkash Chauhan
 
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...WSO2
 
Accelerating R analytics with Spark and Microsoft R Server for Hadoop
Accelerating R analytics with Spark and  Microsoft R Server  for HadoopAccelerating R analytics with Spark and  Microsoft R Server  for Hadoop
Accelerating R analytics with Spark and Microsoft R Server for HadoopWilly Marroquin (WillyDevNET)
 
UKOUG - Implementing Enterprise API Management in the Oracle Cloud
UKOUG - Implementing Enterprise API Management in the Oracle CloudUKOUG - Implementing Enterprise API Management in the Oracle Cloud
UKOUG - Implementing Enterprise API Management in the Oracle Cloudluisw19
 
WSO2 API Platform: Vision and Roadmap
WSO2 API Platform: Vision and RoadmapWSO2 API Platform: Vision and Roadmap
WSO2 API Platform: Vision and RoadmapWSO2
 
Data Hacking with RHadoop
Data Hacking with RHadoopData Hacking with RHadoop
Data Hacking with RHadoopEd Kohlwey
 
OBIEE Answers Vs Data Visualization: A Cage Match
OBIEE Answers Vs Data Visualization: A Cage MatchOBIEE Answers Vs Data Visualization: A Cage Match
OBIEE Answers Vs Data Visualization: A Cage MatchMichelle Kolbe
 
Accessing Databases from R
Accessing Databases from RAccessing Databases from R
Accessing Databases from RJeffrey Breen
 
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...CA Technologies
 
Hp distributed R User Guide
Hp distributed R User GuideHp distributed R User Guide
Hp distributed R User GuideAndrey Karpov
 
The Business of APIs: Your 100-Day API Business Plan
The Business of APIs: Your 100-Day API Business PlanThe Business of APIs: Your 100-Day API Business Plan
The Business of APIs: Your 100-Day API Business PlanApigee | Google Cloud
 
Tapping the Data Deluge with R
Tapping the Data Deluge with RTapping the Data Deluge with R
Tapping the Data Deluge with RJeffrey Breen
 

Viewers also liked (20)

Entreprise mobility approach within digital transformation
Entreprise mobility approach within digital transformationEntreprise mobility approach within digital transformation
Entreprise mobility approach within digital transformation
 
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
 
Startup Success = Big Data + Analytics | Cairo innovates 2014
Startup Success = Big Data + Analytics | Cairo innovates 2014Startup Success = Big Data + Analytics | Cairo innovates 2014
Startup Success = Big Data + Analytics | Cairo innovates 2014
 
WSO2Con EU 2016: Understanding the WSO2 API Management Platform
WSO2Con EU 2016: Understanding the WSO2 API Management PlatformWSO2Con EU 2016: Understanding the WSO2 API Management Platform
WSO2Con EU 2016: Understanding the WSO2 API Management Platform
 
Introduction to Big Data Analytics on Apache Hadoop
Introduction to Big Data Analytics on Apache HadoopIntroduction to Big Data Analytics on Apache Hadoop
Introduction to Big Data Analytics on Apache Hadoop
 
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
 
Accelerating R analytics with Spark and Microsoft R Server for Hadoop
Accelerating R analytics with Spark and  Microsoft R Server  for HadoopAccelerating R analytics with Spark and  Microsoft R Server  for Hadoop
Accelerating R analytics with Spark and Microsoft R Server for Hadoop
 
UKOUG - Implementing Enterprise API Management in the Oracle Cloud
UKOUG - Implementing Enterprise API Management in the Oracle CloudUKOUG - Implementing Enterprise API Management in the Oracle Cloud
UKOUG - Implementing Enterprise API Management in the Oracle Cloud
 
WSO2 API Platform: Vision and Roadmap
WSO2 API Platform: Vision and RoadmapWSO2 API Platform: Vision and Roadmap
WSO2 API Platform: Vision and Roadmap
 
API strategy with IBM API connect
API strategy with IBM API connectAPI strategy with IBM API connect
API strategy with IBM API connect
 
Data Hacking with RHadoop
Data Hacking with RHadoopData Hacking with RHadoop
Data Hacking with RHadoop
 
OBIEE Answers Vs Data Visualization: A Cage Match
OBIEE Answers Vs Data Visualization: A Cage MatchOBIEE Answers Vs Data Visualization: A Cage Match
OBIEE Answers Vs Data Visualization: A Cage Match
 
Excel/R
Excel/RExcel/R
Excel/R
 
Accessing Databases from R
Accessing Databases from RAccessing Databases from R
Accessing Databases from R
 
Applications of R (DataWeek 2014)
Applications of R (DataWeek 2014)Applications of R (DataWeek 2014)
Applications of R (DataWeek 2014)
 
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
 
Hp distributed R User Guide
Hp distributed R User GuideHp distributed R User Guide
Hp distributed R User Guide
 
The Business of APIs: Your 100-Day API Business Plan
The Business of APIs: Your 100-Day API Business PlanThe Business of APIs: Your 100-Day API Business Plan
The Business of APIs: Your 100-Day API Business Plan
 
R crash course
R crash courseR crash course
R crash course
 
Tapping the Data Deluge with R
Tapping the Data Deluge with RTapping the Data Deluge with R
Tapping the Data Deluge with R
 

Similar to R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework

Monadologie
MonadologieMonadologie
Monadologieleague
 
Algorithm Design and Analysis - Practical File
Algorithm Design and Analysis - Practical FileAlgorithm Design and Analysis - Practical File
Algorithm Design and Analysis - Practical FileKushagraChadha1
 
Go: It's Not Just For Google
Go: It's Not Just For GoogleGo: It's Not Just For Google
Go: It's Not Just For GoogleEleanor McHugh
 
Haskellで学ぶ関数型言語
Haskellで学ぶ関数型言語Haskellで学ぶ関数型言語
Haskellで学ぶ関数型言語ikdysfm
 
Implement the following sorting algorithms Bubble Sort Insertion S.pdf
Implement the following sorting algorithms  Bubble Sort  Insertion S.pdfImplement the following sorting algorithms  Bubble Sort  Insertion S.pdf
Implement the following sorting algorithms Bubble Sort Insertion S.pdfkesav24
 
Advanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part IIAdvanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part IIDr. Volkan OBAN
 
Groovy puzzlers jug-moscow-part 2
Groovy puzzlers jug-moscow-part 2Groovy puzzlers jug-moscow-part 2
Groovy puzzlers jug-moscow-part 2Evgeny Borisov
 
Super Advanced Python –act1
Super Advanced Python –act1Super Advanced Python –act1
Super Advanced Python –act1Ke Wei Louis
 
Retos de Programación en Python
Retos de Programación en PythonRetos de Programación en Python
Retos de Programación en PythonJavier Abadía
 
ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions Dr. Volkan OBAN
 
ECMAScript 6 major changes
ECMAScript 6 major changesECMAScript 6 major changes
ECMAScript 6 major changeshayato
 
Python 101 language features and functional programming
Python 101 language features and functional programmingPython 101 language features and functional programming
Python 101 language features and functional programmingLukasz Dynowski
 
How to extend map? Or why we need collections redesign? - Scalar 2017
How to extend map? Or why we need collections redesign? - Scalar 2017How to extend map? Or why we need collections redesign? - Scalar 2017
How to extend map? Or why we need collections redesign? - Scalar 2017Szymon Matejczyk
 

Similar to R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework (20)

RHadoop の紹介
RHadoop の紹介RHadoop の紹介
RHadoop の紹介
 
Jan 2012 HUG: RHadoop
Jan 2012 HUG: RHadoopJan 2012 HUG: RHadoop
Jan 2012 HUG: RHadoop
 
Monadologie
MonadologieMonadologie
Monadologie
 
CLUSTERGRAM
CLUSTERGRAMCLUSTERGRAM
CLUSTERGRAM
 
Algorithm Design and Analysis - Practical File
Algorithm Design and Analysis - Practical FileAlgorithm Design and Analysis - Practical File
Algorithm Design and Analysis - Practical File
 
Go: It's Not Just For Google
Go: It's Not Just For GoogleGo: It's Not Just For Google
Go: It's Not Just For Google
 
Oh Composable World!
Oh Composable World!Oh Composable World!
Oh Composable World!
 
Haskellで学ぶ関数型言語
Haskellで学ぶ関数型言語Haskellで学ぶ関数型言語
Haskellで学ぶ関数型言語
 
Implement the following sorting algorithms Bubble Sort Insertion S.pdf
Implement the following sorting algorithms  Bubble Sort  Insertion S.pdfImplement the following sorting algorithms  Bubble Sort  Insertion S.pdf
Implement the following sorting algorithms Bubble Sort Insertion S.pdf
 
R meets Hadoop
R meets HadoopR meets Hadoop
R meets Hadoop
 
Advanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part IIAdvanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part II
 
Groovy puzzlers jug-moscow-part 2
Groovy puzzlers jug-moscow-part 2Groovy puzzlers jug-moscow-part 2
Groovy puzzlers jug-moscow-part 2
 
Super Advanced Python –act1
Super Advanced Python –act1Super Advanced Python –act1
Super Advanced Python –act1
 
Retos de Programación en Python
Retos de Programación en PythonRetos de Programación en Python
Retos de Programación en Python
 
ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions
 
ECMAScript 6 major changes
ECMAScript 6 major changesECMAScript 6 major changes
ECMAScript 6 major changes
 
Python 101 language features and functional programming
Python 101 language features and functional programmingPython 101 language features and functional programming
Python 101 language features and functional programming
 
Scala by Luc Duponcheel
Scala by Luc DuponcheelScala by Luc Duponcheel
Scala by Luc Duponcheel
 
How to extend map? Or why we need collections redesign? - Scalar 2017
How to extend map? Or why we need collections redesign? - Scalar 2017How to extend map? Or why we need collections redesign? - Scalar 2017
How to extend map? Or why we need collections redesign? - Scalar 2017
 
Script jantung copy
Script jantung   copyScript jantung   copy
Script jantung copy
 

More from Revolution Analytics

Speeding up R with Parallel Programming in the Cloud
Speeding up R with Parallel Programming in the CloudSpeeding up R with Parallel Programming in the Cloud
Speeding up R with Parallel Programming in the CloudRevolution Analytics
 
Migrating Existing Open Source Machine Learning to Azure
Migrating Existing Open Source Machine Learning to AzureMigrating Existing Open Source Machine Learning to Azure
Migrating Existing Open Source Machine Learning to AzureRevolution Analytics
 
Speed up R with parallel programming in the Cloud
Speed up R with parallel programming in the CloudSpeed up R with parallel programming in the Cloud
Speed up R with parallel programming in the CloudRevolution Analytics
 
Predicting Loan Delinquency at One Million Transactions per Second
Predicting Loan Delinquency at One Million Transactions per SecondPredicting Loan Delinquency at One Million Transactions per Second
Predicting Loan Delinquency at One Million Transactions per SecondRevolution Analytics
 
The Value of Open Source Communities
The Value of Open Source CommunitiesThe Value of Open Source Communities
The Value of Open Source CommunitiesRevolution Analytics
 
Building a scalable data science platform with R
Building a scalable data science platform with RBuilding a scalable data science platform with R
Building a scalable data science platform with RRevolution Analytics
 
The Business Economics and Opportunity of Open Source Data Science
The Business Economics and Opportunity of Open Source Data ScienceThe Business Economics and Opportunity of Open Source Data Science
The Business Economics and Opportunity of Open Source Data ScienceRevolution Analytics
 
Taking R Analytics to SQL and the Cloud
Taking R Analytics to SQL and the CloudTaking R Analytics to SQL and the Cloud
Taking R Analytics to SQL and the CloudRevolution Analytics
 
The Network structure of R packages on CRAN & BioConductor
The Network structure of R packages on CRAN & BioConductorThe Network structure of R packages on CRAN & BioConductor
The Network structure of R packages on CRAN & BioConductorRevolution Analytics
 
The network structure of cran 2015 07-02 final
The network structure of cran 2015 07-02 finalThe network structure of cran 2015 07-02 final
The network structure of cran 2015 07-02 finalRevolution Analytics
 
Simple Reproducibility with the checkpoint package
Simple Reproducibilitywith the checkpoint packageSimple Reproducibilitywith the checkpoint package
Simple Reproducibility with the checkpoint packageRevolution Analytics
 

More from Revolution Analytics (20)

Speeding up R with Parallel Programming in the Cloud
Speeding up R with Parallel Programming in the CloudSpeeding up R with Parallel Programming in the Cloud
Speeding up R with Parallel Programming in the Cloud
 
Migrating Existing Open Source Machine Learning to Azure
Migrating Existing Open Source Machine Learning to AzureMigrating Existing Open Source Machine Learning to Azure
Migrating Existing Open Source Machine Learning to Azure
 
R in Minecraft
R in Minecraft R in Minecraft
R in Minecraft
 
The case for R for AI developers
The case for R for AI developersThe case for R for AI developers
The case for R for AI developers
 
Speed up R with parallel programming in the Cloud
Speed up R with parallel programming in the CloudSpeed up R with parallel programming in the Cloud
Speed up R with parallel programming in the Cloud
 
The R Ecosystem
The R EcosystemThe R Ecosystem
The R Ecosystem
 
R Then and Now
R Then and NowR Then and Now
R Then and Now
 
Predicting Loan Delinquency at One Million Transactions per Second
Predicting Loan Delinquency at One Million Transactions per SecondPredicting Loan Delinquency at One Million Transactions per Second
Predicting Loan Delinquency at One Million Transactions per Second
 
Reproducible Data Science with R
Reproducible Data Science with RReproducible Data Science with R
Reproducible Data Science with R
 
The Value of Open Source Communities
The Value of Open Source CommunitiesThe Value of Open Source Communities
The Value of Open Source Communities
 
The R Ecosystem
The R EcosystemThe R Ecosystem
The R Ecosystem
 
R at Microsoft (useR! 2016)
R at Microsoft (useR! 2016)R at Microsoft (useR! 2016)
R at Microsoft (useR! 2016)
 
Building a scalable data science platform with R
Building a scalable data science platform with RBuilding a scalable data science platform with R
Building a scalable data science platform with R
 
R at Microsoft
R at MicrosoftR at Microsoft
R at Microsoft
 
The Business Economics and Opportunity of Open Source Data Science
The Business Economics and Opportunity of Open Source Data ScienceThe Business Economics and Opportunity of Open Source Data Science
The Business Economics and Opportunity of Open Source Data Science
 
Taking R Analytics to SQL and the Cloud
Taking R Analytics to SQL and the CloudTaking R Analytics to SQL and the Cloud
Taking R Analytics to SQL and the Cloud
 
The Network structure of R packages on CRAN & BioConductor
The Network structure of R packages on CRAN & BioConductorThe Network structure of R packages on CRAN & BioConductor
The Network structure of R packages on CRAN & BioConductor
 
The network structure of cran 2015 07-02 final
The network structure of cran 2015 07-02 finalThe network structure of cran 2015 07-02 final
The network structure of cran 2015 07-02 final
 
Simple Reproducibility with the checkpoint package
Simple Reproducibilitywith the checkpoint packageSimple Reproducibilitywith the checkpoint package
Simple Reproducibility with the checkpoint package
 
R at Microsoft
R at MicrosoftR at Microsoft
R at Microsoft
 

Recently uploaded

Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Igalia
 
08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking Men08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking MenDelhi Call girls
 
Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...
Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...
Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...gurkirankumar98700
 
CNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of ServiceCNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of Servicegiselly40
 
Exploring the Future Potential of AI-Enabled Smartphone Processors
Exploring the Future Potential of AI-Enabled Smartphone ProcessorsExploring the Future Potential of AI-Enabled Smartphone Processors
Exploring the Future Potential of AI-Enabled Smartphone Processorsdebabhi2
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerThousandEyes
 
Top 5 Benefits OF Using Muvi Live Paywall For Live Streams
Top 5 Benefits OF Using Muvi Live Paywall For Live StreamsTop 5 Benefits OF Using Muvi Live Paywall For Live Streams
Top 5 Benefits OF Using Muvi Live Paywall For Live StreamsRoshan Dwivedi
 
Presentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreterPresentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreternaman860154
 
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...apidays
 
Salesforce Community Group Quito, Salesforce 101
Salesforce Community Group Quito, Salesforce 101Salesforce Community Group Quito, Salesforce 101
Salesforce Community Group Quito, Salesforce 101Paola De la Torre
 
Data Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonData Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonAnna Loughnan Colquhoun
 
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...
Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...Neo4j
 
08448380779 Call Girls In Diplomatic Enclave Women Seeking Men
08448380779 Call Girls In Diplomatic Enclave Women Seeking Men08448380779 Call Girls In Diplomatic Enclave Women Seeking Men
08448380779 Call Girls In Diplomatic Enclave Women Seeking MenDelhi Call girls
 
Factors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptxFactors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptxKatpro Technologies
 
IAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsIAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsEnterprise Knowledge
 
Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)Allon Mureinik
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsMaria Levchenko
 
2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...Martijn de Jong
 
Scaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationScaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationRadu Cotescu
 
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...Miguel Araújo
 

Recently uploaded (20)

Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
 
08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking Men08448380779 Call Girls In Greater Kailash - I Women Seeking Men
08448380779 Call Girls In Greater Kailash - I Women Seeking Men
 
Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...
Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...
Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...
 
CNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of ServiceCNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of Service
 
Exploring the Future Potential of AI-Enabled Smartphone Processors
Exploring the Future Potential of AI-Enabled Smartphone ProcessorsExploring the Future Potential of AI-Enabled Smartphone Processors
Exploring the Future Potential of AI-Enabled Smartphone Processors
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected Worker
 
Top 5 Benefits OF Using Muvi Live Paywall For Live Streams
Top 5 Benefits OF Using Muvi Live Paywall For Live StreamsTop 5 Benefits OF Using Muvi Live Paywall For Live Streams
Top 5 Benefits OF Using Muvi Live Paywall For Live Streams
 
Presentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreterPresentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreter
 
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
 
Salesforce Community Group Quito, Salesforce 101
Salesforce Community Group Quito, Salesforce 101Salesforce Community Group Quito, Salesforce 101
Salesforce Community Group Quito, Salesforce 101
 
Data Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonData Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt Robison
 
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...
Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...
 
08448380779 Call Girls In Diplomatic Enclave Women Seeking Men
08448380779 Call Girls In Diplomatic Enclave Women Seeking Men08448380779 Call Girls In Diplomatic Enclave Women Seeking Men
08448380779 Call Girls In Diplomatic Enclave Women Seeking Men
 
Factors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptxFactors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptx
 
IAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsIAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI Solutions
 
Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed texts
 
2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...
 
Scaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationScaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organization
 
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
 

R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework

  • 1. R + Hadoop = Big Data Analytics
  • 2.
  • 3.
  • 4.
  • 5.
  • 6. 6
  • 7. 7
  • 8. 8
  • 9. 9
  • 10.
  • 11.
  • 12.
  • 15. Expose MR Hide MR Hive, Pig Rmr, Rhipe, Cascalog, Rmr Dumbo, Pydoop, Scalding, Hadoopy Scrunch Java, Cascading, C++ Crunch
  • 18. small.ints = 1:1000 lapply(small.ints, function(x) x^2) small.ints = to.dfs(1:1000) mapreduce(input = small.ints, map = function(k,v) keyval(v, v^2)) groups = rbinom(32, n = 50, prob = 0.4) tapply(groups, groups, length) groups = to.dfs(groups) mapreduce(input = groups, map = function(k, v) keyval(v,1), reduce = function(k,vv) keyval(k, length(vv)))
  • 19. condition = function(x) x > 10 out = mapreduce( input = input, map = function(k,v) if (condition(v)) keyval(k,v))
  • 20. kmeans = function(points, ncenters, iterations = 10, distfun = function(a,b) norm(as.matrix(a-b), type = 'F')) { newCenters = kmeans.iter(points, distfun, ncenters = ncenters) for(i in 1:iterations) { newCenters = kmeans.iter(points, distfun, centers = newCenters)} newCenters} kmeans.iter = function(points, distfun, ncenters = dim(centers)[1], centers = NULL) { from.dfs( mapreduce( input = points, map = if (is.null(centers)) { function(k,v) keyval(sample(1:ncenters,1),v)} else { function(k,v) { distances = apply(centers, 1, function(c) distfun(c,v)) keyval(centers[which.min(distances),], v)}}, reduce = function(k,vv) keyval(NULL, apply(do.call(rbind, vv), 2, mean))), to.data.frame = T)}
  • 21. #!/usr/bin/python import sys from math import fabs from org.apache.pig.scripting import Pig filename = "student.txt" k=4 tolerance = 0.01 MAX_SCORE = 4 MIN_SCORE = 0 MAX_ITERATION = 100 # initial centroid, equally divide the space initial_centroids = "" last_centroids = [None] * k for i in range(k): last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE) initial_centroids = initial_centroids + str(last_centroids[i]) if i!=k-1: initial_centroids = initial_centroids + ":" P = Pig.compile("""register udf.jar DEFINE find_centroid FindCentroid('$centroids'); raw = load 'student.txt' as (name:chararray, age:int, gpa:double); centroided = foreach raw generate gpa, find_centroid(gpa) as centroid; grouped = group centroided by centroid; result = foreach grouped generate group, AVG(centroided.gpa); store result into 'output'; """) converged = False iter_num = 0 while iter_num<MAX_ITERATION: Q = P.bind({'centroids':initial_centroids}) results = Q.runSingle()
  • 22. if results.isSuccessful() == "FAILED": raise "Pig job failed" iter = results.result("result").iterator() centroids = [None] * k distance_move = 0 # get new centroid of this iteration, caculate the moving distance with last iteration for i in range(k): tuple = iter.next() centroids[i] = float(str(tuple.get(1))) distance_move = distance_move + fabs(last_centroids[i]-centroids[i]) distance_move = distance_move / k; Pig.fs("rmr output") print("iteration " + str(iter_num)) print("average distance moved: " + str(distance_move)) if distance_move<tolerance: sys.stdout.write("k-means converged at centroids: [") sys.stdout.write(",".join(str(v) for v in centroids)) sys.stdout.write("]n") converged = True break last_centroids = centroids[:] initial_centroids = "" for i in range(k): initial_centroids = initial_centroids + str(last_centroids[i]) if i!=k-1: initial_centroids = initial_centroids + ":" iter_num += 1 if not converged: print("not converge after " + str(iter_num) + " iterations") sys.stdout.write("last centroids: [") sys.stdout.write(",".join(str(v) for v in last_centroids)) sys.stdout.write("]n")
  • 23. import java.io.IOException; import org.apache.pig.EvalFunc; import org.apache.pig.data.Tuple; public class FindCentroid extends EvalFunc<Double> { double[] centroids; public FindCentroid(String initialCentroid) { String[] centroidStrings = initialCentroid.split(":"); centroids = new double[centroidStrings.length]; for (int i=0;i<centroidStrings.length;i++) centroids[i] = Double.parseDouble(centroidStrings[i]); } @Override public Double exec(Tuple input) throws IOException { double min_distance = Double.MAX_VALUE; double closest_centroid = 0; for (double centroid : centroids) { double distance = Math.abs(centroid - (Double)input.get(0)); if (distance < min_distance) { min_distance = distance; closest_centroid = centroid; } } return closest_centroid; } }
  • 24. mapreduce(mapreduce(… mapreduce(input = c(input1, input2), …) equijoin = function( left.input, right.input, input, output, outer, map.left, map.right, reduce, reduce.all)
  • 25. out1 = mapreduce(…) mapreduce(input = out1, <xyz>) mapreduce(input = out1, <abc>) abstract.job = function(input, output, …) { … result = mapreduce(input = input, output = output) … result}
  • 26. input.format, output.format, format reduce.on.data.frame, to.data.frame local, hadoop backends profiling