SlideShare a Scribd company logo
1 of 74
Big Data Analytics
with
Spark
October 04, 2015









Source: Cisco, IDC, Wikibon report 2013
1980s 1990-2000s 2010 - beyond
Quick Review



–

–













–
–

–






–
–




01 import java.io.IOException;
02 import java.util.StringTokenizer;
03
04 import org.apache.hadoop.conf.Configuration;
05 import org.apache.hadoop.fs.Path;
06 import org.apache.hadoop.io.IntWritable;
07 import org.apache.hadoop.io.Text;
08 import org.apache.hadoop.mapreduce.Job;
09 import org.apache.hadoop.mapreduce.Mapper;
10 import org.apache.hadoop.mapreduce.Reducer;
11 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
12 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
13
14 public class WordCount {
15
16 public static class TokenizerMapper
17 extends Mapper<Object, Text, Text, IntWritable>{
18
19 private final static IntWritable one = new IntWritable(1);
20 private Text word = new Text();
21
22 public void map(Object key, Text value, Context context
23 ) throws IOException, InterruptedException {
24 StringTokenizer itr = new StringTokenizer(value.toString());
25 while (itr.hasMoreTokens()) {
26 word.set(itr.nextToken());
27 context.write(word, one);
28 }
29 }
30 }
31
32 public static class IntSumReducer
33 extends Reducer<Text,IntWritable,Text,IntWritable> {
34 private IntWritable result = new IntWritable();
35
36 public void reduce(Text key, Iterable<IntWritable> values,
37 Context context
38 ) throws IOException, InterruptedException {
39 int sum = 0;
40 for (IntWritable val : values) {
41 sum += val.get();
42 }
43 result.set(sum);
44 context.write(key, result);
45 }
46 }
47
48 public static void main(String[] args) throws Exception {
49 Configuration conf = new Configuration();
50 Job job = Job.getInstance(conf, "word count");
51 job.setJarByClass(WordCount.class);
52 job.setMapperClass(TokenizerMapper.class);
53 job.setCombinerClass(IntSumReducer.class);
54 job.setReducerClass(IntSumReducer.class);
55 job.setOutputKeyClass(Text.class);
56 job.setOutputValueClass(IntWritable.class);
57 FileInputFormat.addInputPath(job, new Path(args[0]));
58 FileOutputFormat.setOutputPath(job, new Path(args[1]));
59 System.exit(job.waitForCompletion(true) ? 0 : 1);
60 }
61 }
01 import org.apache.spark.SparkContext
02 import org.apache.spark.SparkContext._
03
04 object WordCount {
05 def main(args: Array[String]): Unit = {
06 val sc = new SparkContext()
07 val lines = sc.textFile(args(0))
08 val wordCounts = lines.flatMap {line => line.split(" ")}
09 .map(word => (word, 1))
10 .reduceByKey(_ + _)
11 wordCounts.saveAsTextFile(args(1))
12 }
13 }

–
–

–
–








–
–











–


–
–

–
–
–
–

–







–


















–



Big Data Analytics with Spark

More Related Content

What's hot

What Is Apache Spark? | Introduction To Apache Spark | Apache Spark Tutorial ...
What Is Apache Spark? | Introduction To Apache Spark | Apache Spark Tutorial ...What Is Apache Spark? | Introduction To Apache Spark | Apache Spark Tutorial ...
What Is Apache Spark? | Introduction To Apache Spark | Apache Spark Tutorial ...
Simplilearn
 
Hive Tutorial | Hive Architecture | Hive Tutorial For Beginners | Hive In Had...
Hive Tutorial | Hive Architecture | Hive Tutorial For Beginners | Hive In Had...Hive Tutorial | Hive Architecture | Hive Tutorial For Beginners | Hive In Had...
Hive Tutorial | Hive Architecture | Hive Tutorial For Beginners | Hive In Had...
Simplilearn
 

What's hot (20)

Big data and Hadoop
Big data and HadoopBig data and Hadoop
Big data and Hadoop
 
Spark SQL Deep Dive @ Melbourne Spark Meetup
Spark SQL Deep Dive @ Melbourne Spark MeetupSpark SQL Deep Dive @ Melbourne Spark Meetup
Spark SQL Deep Dive @ Melbourne Spark Meetup
 
What Is Apache Spark? | Introduction To Apache Spark | Apache Spark Tutorial ...
What Is Apache Spark? | Introduction To Apache Spark | Apache Spark Tutorial ...What Is Apache Spark? | Introduction To Apache Spark | Apache Spark Tutorial ...
What Is Apache Spark? | Introduction To Apache Spark | Apache Spark Tutorial ...
 
Introduction to Spark with Python
Introduction to Spark with PythonIntroduction to Spark with Python
Introduction to Spark with Python
 
Hadoop File system (HDFS)
Hadoop File system (HDFS)Hadoop File system (HDFS)
Hadoop File system (HDFS)
 
Apache Spark Introduction
Apache Spark IntroductionApache Spark Introduction
Apache Spark Introduction
 
Hive Tutorial | Hive Architecture | Hive Tutorial For Beginners | Hive In Had...
Hive Tutorial | Hive Architecture | Hive Tutorial For Beginners | Hive In Had...Hive Tutorial | Hive Architecture | Hive Tutorial For Beginners | Hive In Had...
Hive Tutorial | Hive Architecture | Hive Tutorial For Beginners | Hive In Had...
 
Sqoop
SqoopSqoop
Sqoop
 
Hadoop Architecture
Hadoop ArchitectureHadoop Architecture
Hadoop Architecture
 
Hadoop YARN
Hadoop YARNHadoop YARN
Hadoop YARN
 
Introduction to Big Data
Introduction to Big DataIntroduction to Big Data
Introduction to Big Data
 
Spark architecture
Spark architectureSpark architecture
Spark architecture
 
Apache spark
Apache sparkApache spark
Apache spark
 
Hadoop Oozie
Hadoop OozieHadoop Oozie
Hadoop Oozie
 
Introduction to Map Reduce
Introduction to Map ReduceIntroduction to Map Reduce
Introduction to Map Reduce
 
Introduction to Hadoop Administration
Introduction to Hadoop AdministrationIntroduction to Hadoop Administration
Introduction to Hadoop Administration
 
Hadoop And Their Ecosystem ppt
 Hadoop And Their Ecosystem ppt Hadoop And Their Ecosystem ppt
Hadoop And Their Ecosystem ppt
 
Hadoop
HadoopHadoop
Hadoop
 
Apache Spark Overview
Apache Spark OverviewApache Spark Overview
Apache Spark Overview
 
Mining Data Streams
Mining Data StreamsMining Data Streams
Mining Data Streams
 

Viewers also liked

Viewers also liked (20)

Introduction to Big Data
Introduction to Big DataIntroduction to Big Data
Introduction to Big Data
 
Big Data 2.0 - How Spark technologies are reshaping the world of big data ana...
Big Data 2.0 - How Spark technologies are reshaping the world of big data ana...Big Data 2.0 - How Spark technologies are reshaping the world of big data ana...
Big Data 2.0 - How Spark technologies are reshaping the world of big data ana...
 
Overview of Apache Fink: The 4G of Big Data Analytics Frameworks
Overview of Apache Fink: The 4G of Big Data Analytics FrameworksOverview of Apache Fink: The 4G of Big Data Analytics Frameworks
Overview of Apache Fink: The 4G of Big Data Analytics Frameworks
 
Building a High-Performance Database with Scala, Akka, and Spark
Building a High-Performance Database with Scala, Akka, and SparkBuilding a High-Performance Database with Scala, Akka, and Spark
Building a High-Performance Database with Scala, Akka, and Spark
 
Unified Batch and Real-Time Stream Processing Using Apache Flink
Unified Batch and Real-Time Stream Processing Using Apache FlinkUnified Batch and Real-Time Stream Processing Using Apache Flink
Unified Batch and Real-Time Stream Processing Using Apache Flink
 
Big Data Analytics
Big Data AnalyticsBig Data Analytics
Big Data Analytics
 
Fighting Fraud with Apache Spark
Fighting Fraud with Apache SparkFighting Fraud with Apache Spark
Fighting Fraud with Apache Spark
 
In pursuit of augmented intelligence
In pursuit of augmented intelligenceIn pursuit of augmented intelligence
In pursuit of augmented intelligence
 
Rise of the machine (learning algorithms)
Rise of the machine (learning algorithms)Rise of the machine (learning algorithms)
Rise of the machine (learning algorithms)
 
What's new with Apache Spark's Structured Streaming?
What's new with Apache Spark's Structured Streaming?What's new with Apache Spark's Structured Streaming?
What's new with Apache Spark's Structured Streaming?
 
Augmented Intelligence 2.0
Augmented Intelligence 2.0Augmented Intelligence 2.0
Augmented Intelligence 2.0
 
Machine Learning Meetup SOF: Intro to ML
Machine Learning Meetup SOF: Intro to MLMachine Learning Meetup SOF: Intro to ML
Machine Learning Meetup SOF: Intro to ML
 
Breakthrough OLAP performance with Cassandra and Spark
Breakthrough OLAP performance with Cassandra and SparkBreakthrough OLAP performance with Cassandra and Spark
Breakthrough OLAP performance with Cassandra and Spark
 
The How and Why of Fast Data Analytics with Apache Spark
The How and Why of Fast Data Analytics with Apache SparkThe How and Why of Fast Data Analytics with Apache Spark
The How and Why of Fast Data Analytics with Apache Spark
 
ETL to ML: Use Apache Spark as an end to end tool for Advanced Analytics
ETL to ML: Use Apache Spark as an end to end tool for Advanced AnalyticsETL to ML: Use Apache Spark as an end to end tool for Advanced Analytics
ETL to ML: Use Apache Spark as an end to end tool for Advanced Analytics
 
Intro au Big Data & Machine Learning
Intro au Big Data & Machine LearningIntro au Big Data & Machine Learning
Intro au Big Data & Machine Learning
 
That's like, so random! Monte Carlo for Data Science
That's like, so random! Monte Carlo for Data ScienceThat's like, so random! Monte Carlo for Data Science
That's like, so random! Monte Carlo for Data Science
 
An Intuitive Intro To Machine Learning
An Intuitive Intro To Machine LearningAn Intuitive Intro To Machine Learning
An Intuitive Intro To Machine Learning
 
Introduction to Spark SQL & Catalyst
Introduction to Spark SQL & CatalystIntroduction to Spark SQL & Catalyst
Introduction to Spark SQL & Catalyst
 
The Rise of the Machines - A Primer to Machine Learning and Predictive Analyt...
The Rise of the Machines - A Primer to Machine Learning and Predictive Analyt...The Rise of the Machines - A Primer to Machine Learning and Predictive Analyt...
The Rise of the Machines - A Primer to Machine Learning and Predictive Analyt...
 

Similar to Big Data Analytics with Spark

Case study ap log collector
Case study ap log collectorCase study ap log collector
Case study ap log collector
Jyun-Yao Huang
 
Wprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache HadoopWprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache Hadoop
Sages
 

Similar to Big Data Analytics with Spark (20)

Building Scalable Data Pipelines - 2016 DataPalooza Seattle
Building Scalable Data Pipelines - 2016 DataPalooza SeattleBuilding Scalable Data Pipelines - 2016 DataPalooza Seattle
Building Scalable Data Pipelines - 2016 DataPalooza Seattle
 
Writing Hadoop Jobs in Scala using Scalding
Writing Hadoop Jobs in Scala using ScaldingWriting Hadoop Jobs in Scala using Scalding
Writing Hadoop Jobs in Scala using Scalding
 
Scalable and Flexible Machine Learning With Scala @ LinkedIn
Scalable and Flexible Machine Learning With Scala @ LinkedInScalable and Flexible Machine Learning With Scala @ LinkedIn
Scalable and Flexible Machine Learning With Scala @ LinkedIn
 
Create & Execute First Hadoop MapReduce Project in.pptx
Create & Execute First Hadoop MapReduce Project in.pptxCreate & Execute First Hadoop MapReduce Project in.pptx
Create & Execute First Hadoop MapReduce Project in.pptx
 
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash courseCodepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
 
HBase based map reduce job unit testing
HBase based map reduce job unit testingHBase based map reduce job unit testing
HBase based map reduce job unit testing
 
2014 09 30_sparkling_water_hands_on
2014 09 30_sparkling_water_hands_on2014 09 30_sparkling_water_hands_on
2014 09 30_sparkling_water_hands_on
 
Spark Day 2017- Spark 의 과거, 현재, 미래
Spark Day 2017- Spark 의 과거, 현재, 미래Spark Day 2017- Spark 의 과거, 현재, 미래
Spark Day 2017- Spark 의 과거, 현재, 미래
 
Django deployment with PaaS
Django deployment with PaaSDjango deployment with PaaS
Django deployment with PaaS
 
SFScon 2020 - Peter Hopfgartner - Open Data de luxe
SFScon 2020 - Peter Hopfgartner - Open Data de luxeSFScon 2020 - Peter Hopfgartner - Open Data de luxe
SFScon 2020 - Peter Hopfgartner - Open Data de luxe
 
Open sourcing the store
Open sourcing the storeOpen sourcing the store
Open sourcing the store
 
High-level Programming Languages: Apache Pig and Pig Latin
High-level Programming Languages: Apache Pig and Pig LatinHigh-level Programming Languages: Apache Pig and Pig Latin
High-level Programming Languages: Apache Pig and Pig Latin
 
Case study ap log collector
Case study ap log collectorCase study ap log collector
Case study ap log collector
 
PigSPARQL: A SPARQL Query Processing Baseline for Big Data
PigSPARQL: A SPARQL Query Processing Baseline for Big DataPigSPARQL: A SPARQL Query Processing Baseline for Big Data
PigSPARQL: A SPARQL Query Processing Baseline for Big Data
 
ExSchema
ExSchemaExSchema
ExSchema
 
Harnessing the power of YARN with Apache Twill
Harnessing the power of YARN with Apache TwillHarnessing the power of YARN with Apache Twill
Harnessing the power of YARN with Apache Twill
 
Easing offline web application development with GWT
Easing offline web application development with GWTEasing offline web application development with GWT
Easing offline web application development with GWT
 
Scalding Presentation
Scalding PresentationScalding Presentation
Scalding Presentation
 
Wprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache HadoopWprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache Hadoop
 
Big-data-analysis-training-in-mumbai
Big-data-analysis-training-in-mumbaiBig-data-analysis-training-in-mumbai
Big-data-analysis-training-in-mumbai
 

Recently uploaded

Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...
Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...
Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...
gajnagarg
 
Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...
Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...
Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...
HyderabadDolls
 
Lecture_2_Deep_Learning_Overview-newone1
Lecture_2_Deep_Learning_Overview-newone1Lecture_2_Deep_Learning_Overview-newone1
Lecture_2_Deep_Learning_Overview-newone1
ranjankumarbehera14
 
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
nirzagarg
 
Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...
Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...
Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...
HyderabadDolls
 
如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样
如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样
如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样
wsppdmt
 
Sonagachi * best call girls in Kolkata | ₹,9500 Pay Cash 8005736733 Free Home...
Sonagachi * best call girls in Kolkata | ₹,9500 Pay Cash 8005736733 Free Home...Sonagachi * best call girls in Kolkata | ₹,9500 Pay Cash 8005736733 Free Home...
Sonagachi * best call girls in Kolkata | ₹,9500 Pay Cash 8005736733 Free Home...
HyderabadDolls
 

Recently uploaded (20)

High Profile Call Girls Service in Jalore { 9332606886 } VVIP NISHA Call Girl...
High Profile Call Girls Service in Jalore { 9332606886 } VVIP NISHA Call Girl...High Profile Call Girls Service in Jalore { 9332606886 } VVIP NISHA Call Girl...
High Profile Call Girls Service in Jalore { 9332606886 } VVIP NISHA Call Girl...
 
Statistics notes ,it includes mean to index numbers
Statistics notes ,it includes mean to index numbersStatistics notes ,it includes mean to index numbers
Statistics notes ,it includes mean to index numbers
 
SAC 25 Final National, Regional & Local Angel Group Investing Insights 2024 0...
SAC 25 Final National, Regional & Local Angel Group Investing Insights 2024 0...SAC 25 Final National, Regional & Local Angel Group Investing Insights 2024 0...
SAC 25 Final National, Regional & Local Angel Group Investing Insights 2024 0...
 
Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...
Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...
Top profile Call Girls In Vadodara [ 7014168258 ] Call Me For Genuine Models ...
 
Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...
Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...
Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...
 
Lecture_2_Deep_Learning_Overview-newone1
Lecture_2_Deep_Learning_Overview-newone1Lecture_2_Deep_Learning_Overview-newone1
Lecture_2_Deep_Learning_Overview-newone1
 
Dubai Call Girls Peeing O525547819 Call Girls Dubai
Dubai Call Girls Peeing O525547819 Call Girls DubaiDubai Call Girls Peeing O525547819 Call Girls Dubai
Dubai Call Girls Peeing O525547819 Call Girls Dubai
 
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
 
TrafficWave Generator Will Instantly drive targeted and engaging traffic back...
TrafficWave Generator Will Instantly drive targeted and engaging traffic back...TrafficWave Generator Will Instantly drive targeted and engaging traffic back...
TrafficWave Generator Will Instantly drive targeted and engaging traffic back...
 
Gulbai Tekra * Cheap Call Girls In Ahmedabad Phone No 8005736733 Elite Escort...
Gulbai Tekra * Cheap Call Girls In Ahmedabad Phone No 8005736733 Elite Escort...Gulbai Tekra * Cheap Call Girls In Ahmedabad Phone No 8005736733 Elite Escort...
Gulbai Tekra * Cheap Call Girls In Ahmedabad Phone No 8005736733 Elite Escort...
 
Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...
Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...
Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...
 
20240412-SmartCityIndex-2024-Full-Report.pdf
20240412-SmartCityIndex-2024-Full-Report.pdf20240412-SmartCityIndex-2024-Full-Report.pdf
20240412-SmartCityIndex-2024-Full-Report.pdf
 
Vadodara 💋 Call Girl 7737669865 Call Girls in Vadodara Escort service book now
Vadodara 💋 Call Girl 7737669865 Call Girls in Vadodara Escort service book nowVadodara 💋 Call Girl 7737669865 Call Girls in Vadodara Escort service book now
Vadodara 💋 Call Girl 7737669865 Call Girls in Vadodara Escort service book now
 
Top Call Girls in Balaghat 9332606886Call Girls Advance Cash On Delivery Ser...
Top Call Girls in Balaghat  9332606886Call Girls Advance Cash On Delivery Ser...Top Call Girls in Balaghat  9332606886Call Girls Advance Cash On Delivery Ser...
Top Call Girls in Balaghat 9332606886Call Girls Advance Cash On Delivery Ser...
 
RESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptx
RESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptxRESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptx
RESEARCH-FINAL-DEFENSE-PPT-TEMPLATE.pptx
 
如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样
如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样
如何办理英国诺森比亚大学毕业证(NU毕业证书)成绩单原件一模一样
 
Sonagachi * best call girls in Kolkata | ₹,9500 Pay Cash 8005736733 Free Home...
Sonagachi * best call girls in Kolkata | ₹,9500 Pay Cash 8005736733 Free Home...Sonagachi * best call girls in Kolkata | ₹,9500 Pay Cash 8005736733 Free Home...
Sonagachi * best call girls in Kolkata | ₹,9500 Pay Cash 8005736733 Free Home...
 
Ranking and Scoring Exercises for Research
Ranking and Scoring Exercises for ResearchRanking and Scoring Exercises for Research
Ranking and Scoring Exercises for Research
 
Kings of Saudi Arabia, information about them
Kings of Saudi Arabia, information about themKings of Saudi Arabia, information about them
Kings of Saudi Arabia, information about them
 
Digital Transformation Playbook by Graham Ware
Digital Transformation Playbook by Graham WareDigital Transformation Playbook by Graham Ware
Digital Transformation Playbook by Graham Ware
 

Big Data Analytics with Spark

  • 4.
  • 5.
  • 6. Source: Cisco, IDC, Wikibon report 2013 1980s 1990-2000s 2010 - beyond
  • 7.
  • 8.
  • 10.
  • 12.
  • 14.
  • 15.
  • 16.
  • 17.
  • 19.
  • 22.
  • 23.
  • 24.
  • 25.
  • 26.
  • 27.
  • 28.
  • 29.
  • 30.
  • 35. 01 import java.io.IOException; 02 import java.util.StringTokenizer; 03 04 import org.apache.hadoop.conf.Configuration; 05 import org.apache.hadoop.fs.Path; 06 import org.apache.hadoop.io.IntWritable; 07 import org.apache.hadoop.io.Text; 08 import org.apache.hadoop.mapreduce.Job; 09 import org.apache.hadoop.mapreduce.Mapper; 10 import org.apache.hadoop.mapreduce.Reducer; 11 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 14 public class WordCount { 15 16 public static class TokenizerMapper 17 extends Mapper<Object, Text, Text, IntWritable>{ 18 19 private final static IntWritable one = new IntWritable(1); 20 private Text word = new Text(); 21 22 public void map(Object key, Text value, Context context 23 ) throws IOException, InterruptedException { 24 StringTokenizer itr = new StringTokenizer(value.toString()); 25 while (itr.hasMoreTokens()) { 26 word.set(itr.nextToken()); 27 context.write(word, one); 28 } 29 } 30 } 31 32 public static class IntSumReducer 33 extends Reducer<Text,IntWritable,Text,IntWritable> { 34 private IntWritable result = new IntWritable(); 35 36 public void reduce(Text key, Iterable<IntWritable> values, 37 Context context 38 ) throws IOException, InterruptedException { 39 int sum = 0; 40 for (IntWritable val : values) { 41 sum += val.get(); 42 } 43 result.set(sum); 44 context.write(key, result); 45 } 46 } 47 48 public static void main(String[] args) throws Exception { 49 Configuration conf = new Configuration(); 50 Job job = Job.getInstance(conf, "word count"); 51 job.setJarByClass(WordCount.class); 52 job.setMapperClass(TokenizerMapper.class); 53 job.setCombinerClass(IntSumReducer.class); 54 job.setReducerClass(IntSumReducer.class); 55 job.setOutputKeyClass(Text.class); 56 job.setOutputValueClass(IntWritable.class); 57 FileInputFormat.addInputPath(job, new Path(args[0])); 58 FileOutputFormat.setOutputPath(job, new Path(args[1])); 59 System.exit(job.waitForCompletion(true) ? 0 : 1); 60 } 61 } 01 import org.apache.spark.SparkContext 02 import org.apache.spark.SparkContext._ 03 04 object WordCount { 05 def main(args: Array[String]): Unit = { 06 val sc = new SparkContext() 07 val lines = sc.textFile(args(0)) 08 val wordCounts = lines.flatMap {line => line.split(" ")} 09 .map(word => (word, 1)) 10 .reduceByKey(_ + _) 11 wordCounts.saveAsTextFile(args(1)) 12 } 13 }
  • 37.
  • 40.
  • 41.
  • 44.
  • 45.
  • 47.
  • 53.
  • 54.
  • 57.
  • 58.
  • 61.
  • 62.
  • 63.
  • 65.
  • 66.
  • 68.
  • 69.
  • 70.
  • 71.
  • 72.