SlideShare a Scribd company logo
R + Hadoop = Big Data Analytics
6
7
8
9
library(rmr)

mapreduce(…)
lapply(data, function)

mapreduce(big.data, map = function)
Expose MR   Hide MR
                             Hive, Pig




  Rmr, Rhipe,               Cascalog,
Rmr
Dumbo, Pydoop,              Scalding,
    Hadoopy                  Scrunch




       Java,                Cascading,
        C++                  Crunch
mapreduce(input, output, map, reduce)
x = from.dfs(hdfs.object)

hdfs.object = to.dfs(x)
small.ints = 1:1000
lapply(small.ints, function(x) x^2)

small.ints = to.dfs(1:1000)
mapreduce(input = small.ints,
        map = function(k,v) keyval(v, v^2))

groups = rbinom(32, n = 50, prob = 0.4)
tapply(groups, groups, length)

groups = to.dfs(groups)
mapreduce(input = groups,
     map = function(k, v) keyval(v,1),
     reduce = function(k,vv)
               keyval(k, length(vv)))
condition = function(x) x > 10


out = mapreduce(
     input = input,
     map = function(k,v)
          if (condition(v)) keyval(k,v))
kmeans =
 function(points, ncenters, iterations = 10,    distfun = function(a,b) norm(as.matrix(a-b), type = 'F')) {
newCenters = kmeans.iter(points, distfun, ncenters = ncenters) for(i in 1:iterations) {  newCenters =
kmeans.iter(points, distfun, centers = newCenters)} newCenters}




kmeans.iter = function(points, distfun, ncenters = dim(centers)[1], centers = NULL) { from.dfs(
   mapreduce(
    input = points,      map = if (is.null(centers)) {        function(k,v) keyval(sample(1:ncenters,1),v)}           else
{         function(k,v) {             distances = apply(centers, 1, function(c) distfun(c,v))
keyval(centers[which.min(distances),], v)}},      reduce = function(k,vv) keyval(NULL, apply(do.call(rbind, vv), 2,
mean))),    to.data.frame = T)}
#!/usr/bin/python
import sys
from math import fabs
from org.apache.pig.scripting import Pig

filename = "student.txt"
k=4
tolerance = 0.01

MAX_SCORE = 4
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroid, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
   last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
   initial_centroids = initial_centroids + str(last_centroids[i])
   if i!=k-1:
       initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar
            DEFINE find_centroid FindCentroid('$centroids');
            raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
            centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
            grouped = group centroided by centroid;
            result = foreach grouped generate group, AVG(centroided.gpa);
            store result into 'output';
          """)

converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
   Q = P.bind({'centroids':initial_centroids})
   results = Q.runSingle()
if results.isSuccessful() == "FAILED":
      raise "Pig job failed"
  iter = results.result("result").iterator()
  centroids = [None] * k
  distance_move = 0
  # get new centroid of this iteration, caculate the moving distance with last iteration
  for i in range(k):
      tuple = iter.next()
      centroids[i] = float(str(tuple.get(1)))
      distance_move = distance_move + fabs(last_centroids[i]-centroids[i])
  distance_move = distance_move / k;
  Pig.fs("rmr output")
  print("iteration " + str(iter_num))
  print("average distance moved: " + str(distance_move))
  if distance_move<tolerance:
      sys.stdout.write("k-means converged at centroids: [")
      sys.stdout.write(",".join(str(v) for v in centroids))
      sys.stdout.write("]n")
      converged = True
      break
  last_centroids = centroids[:]
  initial_centroids = ""
  for i in range(k):
      initial_centroids = initial_centroids + str(last_centroids[i])
      if i!=k-1:
          initial_centroids = initial_centroids + ":"
  iter_num += 1

if not converged:
    print("not converge after " + str(iter_num) + " iterations")
    sys.stdout.write("last centroids: [")
    sys.stdout.write(",".join(str(v) for v in last_centroids))
    sys.stdout.write("]n")
import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;


public class FindCentroid extends EvalFunc<Double> {
  double[] centroids;
  public FindCentroid(String initialCentroid) {
     String[] centroidStrings = initialCentroid.split(":");
     centroids = new double[centroidStrings.length];
     for (int i=0;i<centroidStrings.length;i++)
        centroids[i] = Double.parseDouble(centroidStrings[i]);
  }
  @Override
  public Double exec(Tuple input) throws IOException {
     double min_distance = Double.MAX_VALUE;
     double closest_centroid = 0;
     for (double centroid : centroids) {
        double distance = Math.abs(centroid - (Double)input.get(0));
        if (distance < min_distance) {
            min_distance = distance;
            closest_centroid = centroid;
        }
     }
     return closest_centroid;
  }

}
mapreduce(mapreduce(…

mapreduce(input = c(input1, input2), …)

equijoin = function(
  left.input, right.input, input,
  output,
  outer,
  map.left, map.right,
  reduce, reduce.all)
out1 = mapreduce(…)
mapreduce(input = out1, <xyz>)
mapreduce(input = out1, <abc>)

abstract.job = function(input, output, …) {
  …
  result = mapreduce(input = input,
              output = output)
  …
  result}
input.format, output.format, format
reduce.on.data.frame, to.data.frame
local, hadoop backends
profiling
R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework
R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework
R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework

More Related Content

What's hot

Data manipulation on r
Data manipulation on rData manipulation on r
Data manipulation on r
Abhik Seal
 
How fast ist it really? Benchmarking in practice
How fast ist it really? Benchmarking in practiceHow fast ist it really? Benchmarking in practice
How fast ist it really? Benchmarking in practice
Tobias Pfeiffer
 
Stata cheat sheet: data processing
Stata cheat sheet: data processingStata cheat sheet: data processing
Stata cheat sheet: data processing
Tim Essam
 
Hands on data science with r.pptx
Hands  on data science with r.pptxHands  on data science with r.pptx
Hands on data science with r.pptx
Nimrita Koul
 
Stata cheat sheet: data transformation
Stata  cheat sheet: data transformationStata  cheat sheet: data transformation
Stata cheat sheet: data transformation
Tim Essam
 
Stata cheatsheet transformation
Stata cheatsheet transformationStata cheatsheet transformation
Stata cheatsheet transformation
Laura Hughes
 
ScalaMeter 2012
ScalaMeter 2012ScalaMeter 2012
ScalaMeter 2012
Aleksandar Prokopec
 
Heaps
HeapsHeaps
Heaps
IIUM
 
Hammurabi
HammurabiHammurabi
Hammurabi
Mario Fusco
 
Stata Programming Cheat Sheet
Stata Programming Cheat SheetStata Programming Cheat Sheet
Stata Programming Cheat Sheet
Laura Hughes
 
The Death of Final Tagless
The Death of Final TaglessThe Death of Final Tagless
The Death of Final Tagless
John De Goes
 
Academy PRO: ES2015
Academy PRO: ES2015Academy PRO: ES2015
Academy PRO: ES2015
Binary Studio
 
Clojure for Data Science
Clojure for Data ScienceClojure for Data Science
Clojure for Data Science
henrygarner
 
Groovy vs Boilerplate and Ceremony Code
Groovy vs Boilerplate and Ceremony CodeGroovy vs Boilerplate and Ceremony Code
Groovy vs Boilerplate and Ceremony Codestasimus
 
Cloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forestsCloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forests
Dataconomy Media
 
深入浅出Jscex
深入浅出Jscex深入浅出Jscex
深入浅出Jscexjeffz
 
Probabilistic Programming in Scala
Probabilistic Programming in ScalaProbabilistic Programming in Scala
Probabilistic Programming in ScalaBeScala
 
Rug hogan-10-03-2012
Rug hogan-10-03-2012Rug hogan-10-03-2012
Rug hogan-10-03-2012
designandanalytics
 
Coffee Scriptでenchant.js
Coffee Scriptでenchant.jsCoffee Scriptでenchant.js
Coffee Scriptでenchant.js
Naoyuki Totani
 

What's hot (20)

Data manipulation on r
Data manipulation on rData manipulation on r
Data manipulation on r
 
How fast ist it really? Benchmarking in practice
How fast ist it really? Benchmarking in practiceHow fast ist it really? Benchmarking in practice
How fast ist it really? Benchmarking in practice
 
Stata cheat sheet: data processing
Stata cheat sheet: data processingStata cheat sheet: data processing
Stata cheat sheet: data processing
 
Hands on data science with r.pptx
Hands  on data science with r.pptxHands  on data science with r.pptx
Hands on data science with r.pptx
 
Stata cheat sheet: data transformation
Stata  cheat sheet: data transformationStata  cheat sheet: data transformation
Stata cheat sheet: data transformation
 
Stata cheatsheet transformation
Stata cheatsheet transformationStata cheatsheet transformation
Stata cheatsheet transformation
 
ScalaMeter 2012
ScalaMeter 2012ScalaMeter 2012
ScalaMeter 2012
 
Heaps
HeapsHeaps
Heaps
 
Hammurabi
HammurabiHammurabi
Hammurabi
 
Stata Programming Cheat Sheet
Stata Programming Cheat SheetStata Programming Cheat Sheet
Stata Programming Cheat Sheet
 
The Death of Final Tagless
The Death of Final TaglessThe Death of Final Tagless
The Death of Final Tagless
 
Academy PRO: ES2015
Academy PRO: ES2015Academy PRO: ES2015
Academy PRO: ES2015
 
Slides
SlidesSlides
Slides
 
Clojure for Data Science
Clojure for Data ScienceClojure for Data Science
Clojure for Data Science
 
Groovy vs Boilerplate and Ceremony Code
Groovy vs Boilerplate and Ceremony CodeGroovy vs Boilerplate and Ceremony Code
Groovy vs Boilerplate and Ceremony Code
 
Cloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forestsCloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forests
 
深入浅出Jscex
深入浅出Jscex深入浅出Jscex
深入浅出Jscex
 
Probabilistic Programming in Scala
Probabilistic Programming in ScalaProbabilistic Programming in Scala
Probabilistic Programming in Scala
 
Rug hogan-10-03-2012
Rug hogan-10-03-2012Rug hogan-10-03-2012
Rug hogan-10-03-2012
 
Coffee Scriptでenchant.js
Coffee Scriptでenchant.jsCoffee Scriptでenchant.js
Coffee Scriptでenchant.js
 

Viewers also liked

Entreprise mobility approach within digital transformation
Entreprise mobility approach within digital transformationEntreprise mobility approach within digital transformation
Entreprise mobility approach within digital transformation
moldovaictsummit2016
 
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
CA Technologies
 
Startup Success = Big Data + Analytics | Cairo innovates 2014
Startup Success = Big Data + Analytics | Cairo innovates 2014Startup Success = Big Data + Analytics | Cairo innovates 2014
Startup Success = Big Data + Analytics | Cairo innovates 2014
TA Telecom
 
WSO2Con EU 2016: Understanding the WSO2 API Management Platform
WSO2Con EU 2016: Understanding the WSO2 API Management PlatformWSO2Con EU 2016: Understanding the WSO2 API Management Platform
WSO2Con EU 2016: Understanding the WSO2 API Management Platform
WSO2
 
Introduction to Big Data Analytics on Apache Hadoop
Introduction to Big Data Analytics on Apache HadoopIntroduction to Big Data Analytics on Apache Hadoop
Introduction to Big Data Analytics on Apache Hadoop
Avkash Chauhan
 
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
WSO2
 
Accelerating R analytics with Spark and Microsoft R Server for Hadoop
Accelerating R analytics with Spark and  Microsoft R Server  for HadoopAccelerating R analytics with Spark and  Microsoft R Server  for Hadoop
Accelerating R analytics with Spark and Microsoft R Server for Hadoop
Willy Marroquin (WillyDevNET)
 
UKOUG - Implementing Enterprise API Management in the Oracle Cloud
UKOUG - Implementing Enterprise API Management in the Oracle CloudUKOUG - Implementing Enterprise API Management in the Oracle Cloud
UKOUG - Implementing Enterprise API Management in the Oracle Cloud
luisw19
 
WSO2 API Platform: Vision and Roadmap
WSO2 API Platform: Vision and RoadmapWSO2 API Platform: Vision and Roadmap
WSO2 API Platform: Vision and Roadmap
WSO2
 
API strategy with IBM API connect
API strategy with IBM API connectAPI strategy with IBM API connect
API strategy with IBM API connect
Kellton Tech Solutions Ltd
 
Data Hacking with RHadoop
Data Hacking with RHadoopData Hacking with RHadoop
Data Hacking with RHadoop
Ed Kohlwey
 
OBIEE Answers Vs Data Visualization: A Cage Match
OBIEE Answers Vs Data Visualization: A Cage MatchOBIEE Answers Vs Data Visualization: A Cage Match
OBIEE Answers Vs Data Visualization: A Cage Match
Michelle Kolbe
 
Excel/R
Excel/RExcel/R
Accessing Databases from R
Accessing Databases from RAccessing Databases from R
Accessing Databases from R
Jeffrey Breen
 
Applications of R (DataWeek 2014)
Applications of R (DataWeek 2014)Applications of R (DataWeek 2014)
Applications of R (DataWeek 2014)
Revolution Analytics
 
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
CA Technologies
 
Hp distributed R User Guide
Hp distributed R User GuideHp distributed R User Guide
Hp distributed R User Guide
Andrey Karpov
 
The Business of APIs: Your 100-Day API Business Plan
The Business of APIs: Your 100-Day API Business PlanThe Business of APIs: Your 100-Day API Business Plan
The Business of APIs: Your 100-Day API Business Plan
Apigee | Google Cloud
 
R crash course
R crash courseR crash course
R crash course
Tomislav Hengl
 
Tapping the Data Deluge with R
Tapping the Data Deluge with RTapping the Data Deluge with R
Tapping the Data Deluge with R
Jeffrey Breen
 

Viewers also liked (20)

Entreprise mobility approach within digital transformation
Entreprise mobility approach within digital transformationEntreprise mobility approach within digital transformation
Entreprise mobility approach within digital transformation
 
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
 
Startup Success = Big Data + Analytics | Cairo innovates 2014
Startup Success = Big Data + Analytics | Cairo innovates 2014Startup Success = Big Data + Analytics | Cairo innovates 2014
Startup Success = Big Data + Analytics | Cairo innovates 2014
 
WSO2Con EU 2016: Understanding the WSO2 API Management Platform
WSO2Con EU 2016: Understanding the WSO2 API Management PlatformWSO2Con EU 2016: Understanding the WSO2 API Management Platform
WSO2Con EU 2016: Understanding the WSO2 API Management Platform
 
Introduction to Big Data Analytics on Apache Hadoop
Introduction to Big Data Analytics on Apache HadoopIntroduction to Big Data Analytics on Apache Hadoop
Introduction to Big Data Analytics on Apache Hadoop
 
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
 
Accelerating R analytics with Spark and Microsoft R Server for Hadoop
Accelerating R analytics with Spark and  Microsoft R Server  for HadoopAccelerating R analytics with Spark and  Microsoft R Server  for Hadoop
Accelerating R analytics with Spark and Microsoft R Server for Hadoop
 
UKOUG - Implementing Enterprise API Management in the Oracle Cloud
UKOUG - Implementing Enterprise API Management in the Oracle CloudUKOUG - Implementing Enterprise API Management in the Oracle Cloud
UKOUG - Implementing Enterprise API Management in the Oracle Cloud
 
WSO2 API Platform: Vision and Roadmap
WSO2 API Platform: Vision and RoadmapWSO2 API Platform: Vision and Roadmap
WSO2 API Platform: Vision and Roadmap
 
API strategy with IBM API connect
API strategy with IBM API connectAPI strategy with IBM API connect
API strategy with IBM API connect
 
Data Hacking with RHadoop
Data Hacking with RHadoopData Hacking with RHadoop
Data Hacking with RHadoop
 
OBIEE Answers Vs Data Visualization: A Cage Match
OBIEE Answers Vs Data Visualization: A Cage MatchOBIEE Answers Vs Data Visualization: A Cage Match
OBIEE Answers Vs Data Visualization: A Cage Match
 
Excel/R
Excel/RExcel/R
Excel/R
 
Accessing Databases from R
Accessing Databases from RAccessing Databases from R
Accessing Databases from R
 
Applications of R (DataWeek 2014)
Applications of R (DataWeek 2014)Applications of R (DataWeek 2014)
Applications of R (DataWeek 2014)
 
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
 
Hp distributed R User Guide
Hp distributed R User GuideHp distributed R User Guide
Hp distributed R User Guide
 
The Business of APIs: Your 100-Day API Business Plan
The Business of APIs: Your 100-Day API Business PlanThe Business of APIs: Your 100-Day API Business Plan
The Business of APIs: Your 100-Day API Business Plan
 
R crash course
R crash courseR crash course
R crash course
 
Tapping the Data Deluge with R
Tapping the Data Deluge with RTapping the Data Deluge with R
Tapping the Data Deluge with R
 

Similar to R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework

Jan 2012 HUG: RHadoop
Jan 2012 HUG: RHadoopJan 2012 HUG: RHadoop
Jan 2012 HUG: RHadoop
Yahoo Developer Network
 
Monadologie
MonadologieMonadologie
Monadologie
league
 
CLUSTERGRAM
CLUSTERGRAMCLUSTERGRAM
CLUSTERGRAM
Dr. Volkan OBAN
 
Algorithm Design and Analysis - Practical File
Algorithm Design and Analysis - Practical FileAlgorithm Design and Analysis - Practical File
Algorithm Design and Analysis - Practical File
KushagraChadha1
 
Go: It's Not Just For Google
Go: It's Not Just For GoogleGo: It's Not Just For Google
Go: It's Not Just For Google
Eleanor McHugh
 
Oh Composable World!
Oh Composable World!Oh Composable World!
Oh Composable World!
Brian Lonsdorf
 
Haskellで学ぶ関数型言語
Haskellで学ぶ関数型言語Haskellで学ぶ関数型言語
Haskellで学ぶ関数型言語
ikdysfm
 
Implement the following sorting algorithms Bubble Sort Insertion S.pdf
Implement the following sorting algorithms  Bubble Sort  Insertion S.pdfImplement the following sorting algorithms  Bubble Sort  Insertion S.pdf
Implement the following sorting algorithms Bubble Sort Insertion S.pdf
kesav24
 
Advanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part IIAdvanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part II
Dr. Volkan OBAN
 
Groovy puzzlers jug-moscow-part 2
Groovy puzzlers jug-moscow-part 2Groovy puzzlers jug-moscow-part 2
Groovy puzzlers jug-moscow-part 2
Evgeny Borisov
 
Super Advanced Python –act1
Super Advanced Python –act1Super Advanced Python –act1
Super Advanced Python –act1
Ke Wei Louis
 
Retos de Programación en Python
Retos de Programación en PythonRetos de Programación en Python
Retos de Programación en Python
Javier Abadía
 
ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions
Dr. Volkan OBAN
 
ECMAScript 6 major changes
ECMAScript 6 major changesECMAScript 6 major changes
ECMAScript 6 major changeshayato
 
Python 101 language features and functional programming
Python 101 language features and functional programmingPython 101 language features and functional programming
Python 101 language features and functional programming
Lukasz Dynowski
 
Scala by Luc Duponcheel
Scala by Luc DuponcheelScala by Luc Duponcheel
Scala by Luc Duponcheel
Stephan Janssen
 
How to extend map? Or why we need collections redesign? - Scalar 2017
How to extend map? Or why we need collections redesign? - Scalar 2017How to extend map? Or why we need collections redesign? - Scalar 2017
How to extend map? Or why we need collections redesign? - Scalar 2017
Szymon Matejczyk
 
Script jantung copy
Script jantung   copyScript jantung   copy
Script jantung copy
Nurwahidah Abidin
 

Similar to R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework (20)

RHadoop の紹介
RHadoop の紹介RHadoop の紹介
RHadoop の紹介
 
Jan 2012 HUG: RHadoop
Jan 2012 HUG: RHadoopJan 2012 HUG: RHadoop
Jan 2012 HUG: RHadoop
 
Monadologie
MonadologieMonadologie
Monadologie
 
CLUSTERGRAM
CLUSTERGRAMCLUSTERGRAM
CLUSTERGRAM
 
Algorithm Design and Analysis - Practical File
Algorithm Design and Analysis - Practical FileAlgorithm Design and Analysis - Practical File
Algorithm Design and Analysis - Practical File
 
Go: It's Not Just For Google
Go: It's Not Just For GoogleGo: It's Not Just For Google
Go: It's Not Just For Google
 
Oh Composable World!
Oh Composable World!Oh Composable World!
Oh Composable World!
 
Haskellで学ぶ関数型言語
Haskellで学ぶ関数型言語Haskellで学ぶ関数型言語
Haskellで学ぶ関数型言語
 
Implement the following sorting algorithms Bubble Sort Insertion S.pdf
Implement the following sorting algorithms  Bubble Sort  Insertion S.pdfImplement the following sorting algorithms  Bubble Sort  Insertion S.pdf
Implement the following sorting algorithms Bubble Sort Insertion S.pdf
 
R meets Hadoop
R meets HadoopR meets Hadoop
R meets Hadoop
 
Advanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part IIAdvanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part II
 
Groovy puzzlers jug-moscow-part 2
Groovy puzzlers jug-moscow-part 2Groovy puzzlers jug-moscow-part 2
Groovy puzzlers jug-moscow-part 2
 
Super Advanced Python –act1
Super Advanced Python –act1Super Advanced Python –act1
Super Advanced Python –act1
 
Retos de Programación en Python
Retos de Programación en PythonRetos de Programación en Python
Retos de Programación en Python
 
ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions
 
ECMAScript 6 major changes
ECMAScript 6 major changesECMAScript 6 major changes
ECMAScript 6 major changes
 
Python 101 language features and functional programming
Python 101 language features and functional programmingPython 101 language features and functional programming
Python 101 language features and functional programming
 
Scala by Luc Duponcheel
Scala by Luc DuponcheelScala by Luc Duponcheel
Scala by Luc Duponcheel
 
How to extend map? Or why we need collections redesign? - Scalar 2017
How to extend map? Or why we need collections redesign? - Scalar 2017How to extend map? Or why we need collections redesign? - Scalar 2017
How to extend map? Or why we need collections redesign? - Scalar 2017
 
Script jantung copy
Script jantung   copyScript jantung   copy
Script jantung copy
 

More from Revolution Analytics

Speeding up R with Parallel Programming in the Cloud
Speeding up R with Parallel Programming in the CloudSpeeding up R with Parallel Programming in the Cloud
Speeding up R with Parallel Programming in the Cloud
Revolution Analytics
 
Migrating Existing Open Source Machine Learning to Azure
Migrating Existing Open Source Machine Learning to AzureMigrating Existing Open Source Machine Learning to Azure
Migrating Existing Open Source Machine Learning to Azure
Revolution Analytics
 
R in Minecraft
R in Minecraft R in Minecraft
R in Minecraft
Revolution Analytics
 
The case for R for AI developers
The case for R for AI developersThe case for R for AI developers
The case for R for AI developers
Revolution Analytics
 
Speed up R with parallel programming in the Cloud
Speed up R with parallel programming in the CloudSpeed up R with parallel programming in the Cloud
Speed up R with parallel programming in the Cloud
Revolution Analytics
 
The R Ecosystem
The R EcosystemThe R Ecosystem
The R Ecosystem
Revolution Analytics
 
R Then and Now
R Then and NowR Then and Now
R Then and Now
Revolution Analytics
 
Predicting Loan Delinquency at One Million Transactions per Second
Predicting Loan Delinquency at One Million Transactions per SecondPredicting Loan Delinquency at One Million Transactions per Second
Predicting Loan Delinquency at One Million Transactions per Second
Revolution Analytics
 
Reproducible Data Science with R
Reproducible Data Science with RReproducible Data Science with R
Reproducible Data Science with R
Revolution Analytics
 
The Value of Open Source Communities
The Value of Open Source CommunitiesThe Value of Open Source Communities
The Value of Open Source Communities
Revolution Analytics
 
The R Ecosystem
The R EcosystemThe R Ecosystem
The R Ecosystem
Revolution Analytics
 
R at Microsoft (useR! 2016)
R at Microsoft (useR! 2016)R at Microsoft (useR! 2016)
R at Microsoft (useR! 2016)
Revolution Analytics
 
Building a scalable data science platform with R
Building a scalable data science platform with RBuilding a scalable data science platform with R
Building a scalable data science platform with R
Revolution Analytics
 
R at Microsoft
R at MicrosoftR at Microsoft
R at Microsoft
Revolution Analytics
 
The Business Economics and Opportunity of Open Source Data Science
The Business Economics and Opportunity of Open Source Data ScienceThe Business Economics and Opportunity of Open Source Data Science
The Business Economics and Opportunity of Open Source Data Science
Revolution Analytics
 
Taking R Analytics to SQL and the Cloud
Taking R Analytics to SQL and the CloudTaking R Analytics to SQL and the Cloud
Taking R Analytics to SQL and the Cloud
Revolution Analytics
 
The Network structure of R packages on CRAN & BioConductor
The Network structure of R packages on CRAN & BioConductorThe Network structure of R packages on CRAN & BioConductor
The Network structure of R packages on CRAN & BioConductor
Revolution Analytics
 
The network structure of cran 2015 07-02 final
The network structure of cran 2015 07-02 finalThe network structure of cran 2015 07-02 final
The network structure of cran 2015 07-02 finalRevolution Analytics
 
Simple Reproducibility with the checkpoint package
Simple Reproducibilitywith the checkpoint packageSimple Reproducibilitywith the checkpoint package
Simple Reproducibility with the checkpoint package
Revolution Analytics
 
R at Microsoft
R at MicrosoftR at Microsoft
R at Microsoft
Revolution Analytics
 

More from Revolution Analytics (20)

Speeding up R with Parallel Programming in the Cloud
Speeding up R with Parallel Programming in the CloudSpeeding up R with Parallel Programming in the Cloud
Speeding up R with Parallel Programming in the Cloud
 
Migrating Existing Open Source Machine Learning to Azure
Migrating Existing Open Source Machine Learning to AzureMigrating Existing Open Source Machine Learning to Azure
Migrating Existing Open Source Machine Learning to Azure
 
R in Minecraft
R in Minecraft R in Minecraft
R in Minecraft
 
The case for R for AI developers
The case for R for AI developersThe case for R for AI developers
The case for R for AI developers
 
Speed up R with parallel programming in the Cloud
Speed up R with parallel programming in the CloudSpeed up R with parallel programming in the Cloud
Speed up R with parallel programming in the Cloud
 
The R Ecosystem
The R EcosystemThe R Ecosystem
The R Ecosystem
 
R Then and Now
R Then and NowR Then and Now
R Then and Now
 
Predicting Loan Delinquency at One Million Transactions per Second
Predicting Loan Delinquency at One Million Transactions per SecondPredicting Loan Delinquency at One Million Transactions per Second
Predicting Loan Delinquency at One Million Transactions per Second
 
Reproducible Data Science with R
Reproducible Data Science with RReproducible Data Science with R
Reproducible Data Science with R
 
The Value of Open Source Communities
The Value of Open Source CommunitiesThe Value of Open Source Communities
The Value of Open Source Communities
 
The R Ecosystem
The R EcosystemThe R Ecosystem
The R Ecosystem
 
R at Microsoft (useR! 2016)
R at Microsoft (useR! 2016)R at Microsoft (useR! 2016)
R at Microsoft (useR! 2016)
 
Building a scalable data science platform with R
Building a scalable data science platform with RBuilding a scalable data science platform with R
Building a scalable data science platform with R
 
R at Microsoft
R at MicrosoftR at Microsoft
R at Microsoft
 
The Business Economics and Opportunity of Open Source Data Science
The Business Economics and Opportunity of Open Source Data ScienceThe Business Economics and Opportunity of Open Source Data Science
The Business Economics and Opportunity of Open Source Data Science
 
Taking R Analytics to SQL and the Cloud
Taking R Analytics to SQL and the CloudTaking R Analytics to SQL and the Cloud
Taking R Analytics to SQL and the Cloud
 
The Network structure of R packages on CRAN & BioConductor
The Network structure of R packages on CRAN & BioConductorThe Network structure of R packages on CRAN & BioConductor
The Network structure of R packages on CRAN & BioConductor
 
The network structure of cran 2015 07-02 final
The network structure of cran 2015 07-02 finalThe network structure of cran 2015 07-02 final
The network structure of cran 2015 07-02 final
 
Simple Reproducibility with the checkpoint package
Simple Reproducibilitywith the checkpoint packageSimple Reproducibilitywith the checkpoint package
Simple Reproducibility with the checkpoint package
 
R at Microsoft
R at MicrosoftR at Microsoft
R at Microsoft
 

Recently uploaded

Communications Mining Series - Zero to Hero - Session 1
Communications Mining Series - Zero to Hero - Session 1Communications Mining Series - Zero to Hero - Session 1
Communications Mining Series - Zero to Hero - Session 1
DianaGray10
 
GraphRAG is All You need? LLM & Knowledge Graph
GraphRAG is All You need? LLM & Knowledge GraphGraphRAG is All You need? LLM & Knowledge Graph
GraphRAG is All You need? LLM & Knowledge Graph
Guy Korland
 
Uni Systems Copilot event_05062024_C.Vlachos.pdf
Uni Systems Copilot event_05062024_C.Vlachos.pdfUni Systems Copilot event_05062024_C.Vlachos.pdf
Uni Systems Copilot event_05062024_C.Vlachos.pdf
Uni Systems S.M.S.A.
 
Pushing the limits of ePRTC: 100ns holdover for 100 days
Pushing the limits of ePRTC: 100ns holdover for 100 daysPushing the limits of ePRTC: 100ns holdover for 100 days
Pushing the limits of ePRTC: 100ns holdover for 100 days
Adtran
 
Observability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdf
Observability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdfObservability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdf
Observability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdf
Paige Cruz
 
GraphSummit Singapore | The Art of the Possible with Graph - Q2 2024
GraphSummit Singapore | The Art of the  Possible with Graph - Q2 2024GraphSummit Singapore | The Art of the  Possible with Graph - Q2 2024
GraphSummit Singapore | The Art of the Possible with Graph - Q2 2024
Neo4j
 
Elevating Tactical DDD Patterns Through Object Calisthenics
Elevating Tactical DDD Patterns Through Object CalisthenicsElevating Tactical DDD Patterns Through Object Calisthenics
Elevating Tactical DDD Patterns Through Object Calisthenics
Dorra BARTAGUIZ
 
GraphSummit Singapore | Enhancing Changi Airport Group's Passenger Experience...
GraphSummit Singapore | Enhancing Changi Airport Group's Passenger Experience...GraphSummit Singapore | Enhancing Changi Airport Group's Passenger Experience...
GraphSummit Singapore | Enhancing Changi Airport Group's Passenger Experience...
Neo4j
 
Removing Uninteresting Bytes in Software Fuzzing
Removing Uninteresting Bytes in Software FuzzingRemoving Uninteresting Bytes in Software Fuzzing
Removing Uninteresting Bytes in Software Fuzzing
Aftab Hussain
 
Epistemic Interaction - tuning interfaces to provide information for AI support
Epistemic Interaction - tuning interfaces to provide information for AI supportEpistemic Interaction - tuning interfaces to provide information for AI support
Epistemic Interaction - tuning interfaces to provide information for AI support
Alan Dix
 
Microsoft - Power Platform_G.Aspiotis.pdf
Microsoft - Power Platform_G.Aspiotis.pdfMicrosoft - Power Platform_G.Aspiotis.pdf
Microsoft - Power Platform_G.Aspiotis.pdf
Uni Systems S.M.S.A.
 
Free Complete Python - A step towards Data Science
Free Complete Python - A step towards Data ScienceFree Complete Python - A step towards Data Science
Free Complete Python - A step towards Data Science
RinaMondal9
 
National Security Agency - NSA mobile device best practices
National Security Agency - NSA mobile device best practicesNational Security Agency - NSA mobile device best practices
National Security Agency - NSA mobile device best practices
Quotidiano Piemontese
 
GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...
GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...
GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...
James Anderson
 
20240607 QFM018 Elixir Reading List May 2024
20240607 QFM018 Elixir Reading List May 202420240607 QFM018 Elixir Reading List May 2024
20240607 QFM018 Elixir Reading List May 2024
Matthew Sinclair
 
Climate Impact of Software Testing at Nordic Testing Days
Climate Impact of Software Testing at Nordic Testing DaysClimate Impact of Software Testing at Nordic Testing Days
Climate Impact of Software Testing at Nordic Testing Days
Kari Kakkonen
 
20240605 QFM017 Machine Intelligence Reading List May 2024
20240605 QFM017 Machine Intelligence Reading List May 202420240605 QFM017 Machine Intelligence Reading List May 2024
20240605 QFM017 Machine Intelligence Reading List May 2024
Matthew Sinclair
 
State of ICS and IoT Cyber Threat Landscape Report 2024 preview
State of ICS and IoT Cyber Threat Landscape Report 2024 previewState of ICS and IoT Cyber Threat Landscape Report 2024 preview
State of ICS and IoT Cyber Threat Landscape Report 2024 preview
Prayukth K V
 
DevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA ConnectDevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA Connect
Kari Kakkonen
 
Elizabeth Buie - Older adults: Are we really designing for our future selves?
Elizabeth Buie - Older adults: Are we really designing for our future selves?Elizabeth Buie - Older adults: Are we really designing for our future selves?
Elizabeth Buie - Older adults: Are we really designing for our future selves?
Nexer Digital
 

Recently uploaded (20)

Communications Mining Series - Zero to Hero - Session 1
Communications Mining Series - Zero to Hero - Session 1Communications Mining Series - Zero to Hero - Session 1
Communications Mining Series - Zero to Hero - Session 1
 
GraphRAG is All You need? LLM & Knowledge Graph
GraphRAG is All You need? LLM & Knowledge GraphGraphRAG is All You need? LLM & Knowledge Graph
GraphRAG is All You need? LLM & Knowledge Graph
 
Uni Systems Copilot event_05062024_C.Vlachos.pdf
Uni Systems Copilot event_05062024_C.Vlachos.pdfUni Systems Copilot event_05062024_C.Vlachos.pdf
Uni Systems Copilot event_05062024_C.Vlachos.pdf
 
Pushing the limits of ePRTC: 100ns holdover for 100 days
Pushing the limits of ePRTC: 100ns holdover for 100 daysPushing the limits of ePRTC: 100ns holdover for 100 days
Pushing the limits of ePRTC: 100ns holdover for 100 days
 
Observability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdf
Observability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdfObservability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdf
Observability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdf
 
GraphSummit Singapore | The Art of the Possible with Graph - Q2 2024
GraphSummit Singapore | The Art of the  Possible with Graph - Q2 2024GraphSummit Singapore | The Art of the  Possible with Graph - Q2 2024
GraphSummit Singapore | The Art of the Possible with Graph - Q2 2024
 
Elevating Tactical DDD Patterns Through Object Calisthenics
Elevating Tactical DDD Patterns Through Object CalisthenicsElevating Tactical DDD Patterns Through Object Calisthenics
Elevating Tactical DDD Patterns Through Object Calisthenics
 
GraphSummit Singapore | Enhancing Changi Airport Group's Passenger Experience...
GraphSummit Singapore | Enhancing Changi Airport Group's Passenger Experience...GraphSummit Singapore | Enhancing Changi Airport Group's Passenger Experience...
GraphSummit Singapore | Enhancing Changi Airport Group's Passenger Experience...
 
Removing Uninteresting Bytes in Software Fuzzing
Removing Uninteresting Bytes in Software FuzzingRemoving Uninteresting Bytes in Software Fuzzing
Removing Uninteresting Bytes in Software Fuzzing
 
Epistemic Interaction - tuning interfaces to provide information for AI support
Epistemic Interaction - tuning interfaces to provide information for AI supportEpistemic Interaction - tuning interfaces to provide information for AI support
Epistemic Interaction - tuning interfaces to provide information for AI support
 
Microsoft - Power Platform_G.Aspiotis.pdf
Microsoft - Power Platform_G.Aspiotis.pdfMicrosoft - Power Platform_G.Aspiotis.pdf
Microsoft - Power Platform_G.Aspiotis.pdf
 
Free Complete Python - A step towards Data Science
Free Complete Python - A step towards Data ScienceFree Complete Python - A step towards Data Science
Free Complete Python - A step towards Data Science
 
National Security Agency - NSA mobile device best practices
National Security Agency - NSA mobile device best practicesNational Security Agency - NSA mobile device best practices
National Security Agency - NSA mobile device best practices
 
GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...
GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...
GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using Deplo...
 
20240607 QFM018 Elixir Reading List May 2024
20240607 QFM018 Elixir Reading List May 202420240607 QFM018 Elixir Reading List May 2024
20240607 QFM018 Elixir Reading List May 2024
 
Climate Impact of Software Testing at Nordic Testing Days
Climate Impact of Software Testing at Nordic Testing DaysClimate Impact of Software Testing at Nordic Testing Days
Climate Impact of Software Testing at Nordic Testing Days
 
20240605 QFM017 Machine Intelligence Reading List May 2024
20240605 QFM017 Machine Intelligence Reading List May 202420240605 QFM017 Machine Intelligence Reading List May 2024
20240605 QFM017 Machine Intelligence Reading List May 2024
 
State of ICS and IoT Cyber Threat Landscape Report 2024 preview
State of ICS and IoT Cyber Threat Landscape Report 2024 previewState of ICS and IoT Cyber Threat Landscape Report 2024 preview
State of ICS and IoT Cyber Threat Landscape Report 2024 preview
 
DevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA ConnectDevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA Connect
 
Elizabeth Buie - Older adults: Are we really designing for our future selves?
Elizabeth Buie - Older adults: Are we really designing for our future selves?Elizabeth Buie - Older adults: Are we really designing for our future selves?
Elizabeth Buie - Older adults: Are we really designing for our future selves?
 

R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework

  • 1. R + Hadoop = Big Data Analytics
  • 2.
  • 3.
  • 4.
  • 5.
  • 6. 6
  • 7. 7
  • 8. 8
  • 9. 9
  • 10.
  • 11.
  • 12.
  • 15. Expose MR Hide MR Hive, Pig Rmr, Rhipe, Cascalog, Rmr Dumbo, Pydoop, Scalding, Hadoopy Scrunch Java, Cascading, C++ Crunch
  • 18. small.ints = 1:1000 lapply(small.ints, function(x) x^2) small.ints = to.dfs(1:1000) mapreduce(input = small.ints, map = function(k,v) keyval(v, v^2)) groups = rbinom(32, n = 50, prob = 0.4) tapply(groups, groups, length) groups = to.dfs(groups) mapreduce(input = groups, map = function(k, v) keyval(v,1), reduce = function(k,vv) keyval(k, length(vv)))
  • 19. condition = function(x) x > 10 out = mapreduce( input = input, map = function(k,v) if (condition(v)) keyval(k,v))
  • 20. kmeans = function(points, ncenters, iterations = 10, distfun = function(a,b) norm(as.matrix(a-b), type = 'F')) { newCenters = kmeans.iter(points, distfun, ncenters = ncenters) for(i in 1:iterations) { newCenters = kmeans.iter(points, distfun, centers = newCenters)} newCenters} kmeans.iter = function(points, distfun, ncenters = dim(centers)[1], centers = NULL) { from.dfs( mapreduce( input = points, map = if (is.null(centers)) { function(k,v) keyval(sample(1:ncenters,1),v)} else { function(k,v) { distances = apply(centers, 1, function(c) distfun(c,v)) keyval(centers[which.min(distances),], v)}}, reduce = function(k,vv) keyval(NULL, apply(do.call(rbind, vv), 2, mean))), to.data.frame = T)}
  • 21. #!/usr/bin/python import sys from math import fabs from org.apache.pig.scripting import Pig filename = "student.txt" k=4 tolerance = 0.01 MAX_SCORE = 4 MIN_SCORE = 0 MAX_ITERATION = 100 # initial centroid, equally divide the space initial_centroids = "" last_centroids = [None] * k for i in range(k): last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE) initial_centroids = initial_centroids + str(last_centroids[i]) if i!=k-1: initial_centroids = initial_centroids + ":" P = Pig.compile("""register udf.jar DEFINE find_centroid FindCentroid('$centroids'); raw = load 'student.txt' as (name:chararray, age:int, gpa:double); centroided = foreach raw generate gpa, find_centroid(gpa) as centroid; grouped = group centroided by centroid; result = foreach grouped generate group, AVG(centroided.gpa); store result into 'output'; """) converged = False iter_num = 0 while iter_num<MAX_ITERATION: Q = P.bind({'centroids':initial_centroids}) results = Q.runSingle()
  • 22. if results.isSuccessful() == "FAILED": raise "Pig job failed" iter = results.result("result").iterator() centroids = [None] * k distance_move = 0 # get new centroid of this iteration, caculate the moving distance with last iteration for i in range(k): tuple = iter.next() centroids[i] = float(str(tuple.get(1))) distance_move = distance_move + fabs(last_centroids[i]-centroids[i]) distance_move = distance_move / k; Pig.fs("rmr output") print("iteration " + str(iter_num)) print("average distance moved: " + str(distance_move)) if distance_move<tolerance: sys.stdout.write("k-means converged at centroids: [") sys.stdout.write(",".join(str(v) for v in centroids)) sys.stdout.write("]n") converged = True break last_centroids = centroids[:] initial_centroids = "" for i in range(k): initial_centroids = initial_centroids + str(last_centroids[i]) if i!=k-1: initial_centroids = initial_centroids + ":" iter_num += 1 if not converged: print("not converge after " + str(iter_num) + " iterations") sys.stdout.write("last centroids: [") sys.stdout.write(",".join(str(v) for v in last_centroids)) sys.stdout.write("]n")
  • 23. import java.io.IOException; import org.apache.pig.EvalFunc; import org.apache.pig.data.Tuple; public class FindCentroid extends EvalFunc<Double> { double[] centroids; public FindCentroid(String initialCentroid) { String[] centroidStrings = initialCentroid.split(":"); centroids = new double[centroidStrings.length]; for (int i=0;i<centroidStrings.length;i++) centroids[i] = Double.parseDouble(centroidStrings[i]); } @Override public Double exec(Tuple input) throws IOException { double min_distance = Double.MAX_VALUE; double closest_centroid = 0; for (double centroid : centroids) { double distance = Math.abs(centroid - (Double)input.get(0)); if (distance < min_distance) { min_distance = distance; closest_centroid = centroid; } } return closest_centroid; } }
  • 24. mapreduce(mapreduce(… mapreduce(input = c(input1, input2), …) equijoin = function( left.input, right.input, input, output, outer, map.left, map.right, reduce, reduce.all)
  • 25. out1 = mapreduce(…) mapreduce(input = out1, <xyz>) mapreduce(input = out1, <abc>) abstract.job = function(input, output, …) { … result = mapreduce(input = input, output = output) … result}
  • 26. input.format, output.format, format reduce.on.data.frame, to.data.frame local, hadoop backends profiling