Twitter's Scalding is built on top of Cascading, which is in turn built on top of Hadoop. In essence, it is an easy-to-read, easy-to-extend DSL for writing MapReduce jobs.
Why Scalding?
Word Count in Scala

type Word = String
type Count = Int

val text = "a a a b b"

def wordCount(text: String): Map[Word, Count] =
  text
    .split(" ")
    .map(a => (a, 1))
    .groupBy(_._1)
    .map { a => a._1 -> a._2.map(_._2).sum }

wordCount(text) should equal (Map("a" -> 3, "b" -> 2))
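Step by step, the intermediate values for "a a a b b" look like this (a sketch; the shapes follow from the standard library):

text.split(" ")       // Array("a", "a", "a", "b", "b")
  .map(a => (a, 1))   // Array(("a",1), ("a",1), ("a",1), ("b",1), ("b",1))
  .groupBy(_._1)      // Map("a" -> Array(("a",1), ("a",1), ("a",1)),
                      //     "b" -> Array(("b",1), ("b",1)))
  .map { a => a._1 -> a._2.map(_._2).sum } // Map("a" -> 3, "b" -> 2)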
Stuff > Memory
Scala collections... fun, but memory bound!

val text = "so many words... waaah! ..."
text                                    // in memory
  .split(" ")                           // in memory
  .map(a => (a, 1))                     // in memory
  .groupBy(_._1)                        // in memory
  .map(a => (a._1, a._2.map(_._2).sum)) // in memory
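A streaming formulation postpones the problem a little: folding over an Iterator keeps only the running counts in memory, not the whole corpus. A minimal sketch (the file name is a placeholder):

import scala.io.Source

val counts: Map[String, Int] =
  Source.fromFile("huge.txt") // hypothetical input
    .getLines()
    .flatMap(_.split(" "))
    .foldLeft(Map.empty[String, Int]) { (acc, word) =>
      acc.updated(word, acc.getOrElse(word, 0) + 1)
    }

This is still bounded by the number of distinct words, though, which is where Hadoop comes in.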
Why Scalding?
Word Count in Hadoop MR

package org.myorg;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

public class WordCount {

  public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
      String line = value.toString();
      StringTokenizer tokenizer = new StringTokenizer(line);
      while (tokenizer.hasMoreTokens()) {
        word.set(tokenizer.nextToken());
        output.collect(word, one);
      }
    }
  }

  public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
    public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
      int sum = 0;
      while (values.hasNext()) {
        sum += values.next().get();
      }
      output.collect(key, new IntWritable(sum));
    }
  }

  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordcount");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
    JobClient.runJob(conf);
  }
}
1: Distributed Copy

// source Tap
Tap inTap = new Hfs(new TextDelimited(true, "\t"), inPath);
// sink Tap
Tap outTap = new Hfs(new TextDelimited(true, "\t"), outPath);
// a Pipe, connects taps
Pipe copyPipe = new Pipe("copy");
// build the Flow
FlowDef flowDef = FlowDef.flowDef()
  .addSource(copyPipe, inTap)
  .addTailSink(copyPipe, outTap);
// run!
flowConnector.connect(flowDef).complete();
1. DCP - Full Code

public class Main {
  public static void main(String[] args) {
    String inPath = args[0];
    String outPath = args[1];
    Properties props = new Properties();
    AppProps.setApplicationJarClass(props, Main.class);
    HadoopFlowConnector flowConnector = new HadoopFlowConnector(props);
    Tap inTap = new Hfs(new TextDelimited(true, "\t"), inPath);
    Tap outTap = new Hfs(new TextDelimited(true, "\t"), outPath);
    Pipe copyPipe = new Pipe("copy");
    FlowDef flowDef = FlowDef.flowDef()
      .addSource(copyPipe, inTap)
      .addTailSink(copyPipe, outTap);
    flowConnector.connect(flowDef).complete();
  }
}
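Packaged into a job jar, this runs like any other Hadoop application, along the lines of hadoop jar target/app.jar <inPath> <outPath> (the jar name and paths here are placeholders).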
2: Word Count

public class Main {
  public static void main( String[] args ) {
    String docPath = args[ 0 ];
    String wcPath = args[ 1 ];
    Properties properties = new Properties();
    AppProps.setApplicationJarClass( properties, Main.class );
    HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );
    // create source and sink taps
    Tap docTap = new Hfs( new TextDelimited( true, "\t" ), docPath );
    Tap wcTap = new Hfs( new TextDelimited( true, "\t" ), wcPath );
    // specify a regex operation to split the "document" text lines into a token stream
    Fields token = new Fields( "token" );
    Fields text = new Fields( "text" );
    RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ \\[\\]\\(\\),.]" );
    // only returns "token"
    Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS );
    // determine the word counts
    Pipe wcPipe = new Pipe( "wc", docPipe );
    wcPipe = new GroupBy( wcPipe, token );
    wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL );
    // connect the taps, pipes, etc., into a flow
    FlowDef flowDef = FlowDef.flowDef()
      .setName( "wc" )
      .addSource( docPipe, docTap )
      .addTailSink( wcPipe, wcTap );
    // write a DOT file and run the flow
    Flow wcFlow = flowConnector.connect( flowDef );
    wcFlow.writeDOT( "dot/wc.dot" );
    wcFlow.complete();
  }
}
2: Word Count - How it's made

Graph representation of jobs!
http://www.cascading.org/2012/07/09/cascading-for-the-impatient-part-2/
How it's made

// pseudo code...
val flow = FlowDef
val jobs: List[MRJob] = flowConnector(flow)
HadoopCluster.execute(jobs)
Cascading tips

Pipe assembly = new Pipe( "assembly" );
assembly = new Each( assembly, DebugLevel.VERBOSE, new Debug() );
// ...
// head and tail have same name
FlowDef flowDef = new FlowDef()
  .setName( "debug" )
  .addSource( "assembly", source )
  .addSink( "assembly", sink )
  .addTail( assembly );

flowDef.setDebugLevel( DebugLevel.NONE );
With DebugLevel.NONE set, the flowConnector will NOT create the Debug pipe!
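For comparison, Scalding's fields API wraps the same Debug operation. A minimal sketch (inside a Job, with hypothetical taps):

Tsv(args("input"))
  .read
  .debug // RichPipe.debug inserts Cascading's Debug(), printing each tuple as it flows past
  .write(Tsv(args("output")))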
map

Scala:
val data = 1 :: 2 :: 3 :: Nil
val doubled = data map { _ * 2 } // Int => Int

Scalding:
IterableSource(data, 'number)
  .map('number -> 'doubled) { n: Int => n * 2 } // Int => Int
// 'number stays in the Pipe, 'doubled becomes available in the Pipe
// note: the argument's type must be declared explicitly!
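The resulting tuple stream, as a sketch; both fields travel together in the Pipe:

// fields: ('number, 'doubled)
// (1, 2), (2, 4), (3, 6)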
mapTo

Scala:
var data = 1 :: 2 :: 3 :: Nil
val doubled = data map { _ * 2 } // Int => Int
data = null // release reference

Scalding:
IterableSource(data, 'number)
  .mapTo('number -> 'doubled) { n: Int => n * 2 } // Int => Int
// 'number is removed; only 'doubled stays in the Pipe
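mapTo is equivalent to a map followed by a project; the same thing written long-hand:

IterableSource(data, 'number)
  .map('number -> 'doubled) { n: Int => n * 2 }
  .project('doubled) // drops 'number, keeping only 'doubled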
groupBy

Scala:
val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int]
val groups = data groupBy { _ < 10 }
groups // Map[Boolean, List[Int]]
groups(true) should equal (List(1, 2))
groups(false) should equal (List(30, 42))

Scalding:
IterableSource(List(1, 2, 30, 42), 'num)
  .map('num -> 'lessThanTen) { i: Int => i < 10 }
  .groupBy('lessThanTen) { _.size('size) } // groups all rows with == value => 'size
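The grouped output contains one tuple per distinct key (a sketch):

// fields: ('lessThanTen, 'size)
// (true,  2)   <- from 1, 2
// (false, 2)   <- from 30, 42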
Distributed Copy in Scalding

class WordCountJob(args: Args) extends Job(args) {
  val input = Tsv(args("input"))
  val output = Tsv(args("output"))

  input.read.write(output)
}

The End.
Main Class - "Runner"

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.util.ToolRunner
import com.twitter.scalding

object ScaldingJobRunner extends App {
  ToolRunner.run(new Configuration, new scalding.Tool, args) // args comes from App
}
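With this runner (or the stock com.twitter.scalding.Tool main class) on the classpath, a job is typically launched as hadoop jar my-assembly.jar com.example.WordCountJob --hdfs --input ... --output ..., where the first argument is the Job class name, --hdfs or --local selects the mode, and --input/--output become available via args("input") and args("output"). (The jar and class names here are placeholders.)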
Word Count in Scalding

class WordCountJob(args: Args) extends Job(args) {
  val inputFile = args("input")
  val outputFile = args("output")

  TextLine(inputFile)
    .flatMap('line -> 'word) { line: String => tokenize(line) }
    .groupBy('word) { _.size }
    .write(Tsv(outputFile))

  // the slide leaves this as "implemented"; one possible implementation:
  def tokenize(text: String): Array[String] =
    text.toLowerCase.replaceAll("[^a-zA-Z0-9\\s]", "").split("\\s+")
}
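Jobs like this can be unit-tested without a cluster via Scalding's JobTest; a sketch (file names are placeholders, and the sink's tuple type is an assumption about this job's output fields):

import com.twitter.scalding._

JobTest(new WordCountJob(_))
  .arg("input", "in.txt")
  .arg("output", "out.tsv")
  .source(TextLine("in.txt"), List((0, "a a a b b")))
  .sink[(String, Int)](Tsv("out.tsv")) { buffer =>
    // expect the counts computed above
    assert(buffer.toMap == Map("a" -> 3, "b" -> 2))
  }
  .run
  .finish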