Scalding
Hadoop Word Count
in < 70 lines of code

Konrad 'ktoso' Malawski
JARCamp #3 12.04.2013

Scalding
Hadoop Word Count
in 4 lines of code

Konrad 'ktoso' Malawski
JARCamp #3 12.04.2013
softwaremill.com / java.pl / sckrk.com / geecon.org / krakowscala.pl / gdgkrakow.pl
Agenda
 Why Scalding? (10%)
           +
 Hadoop Basics (20%)
           +
 Enter Cascading (40%)
           +
 Hello Scalding (30%)
           =
         100%
Why Scalding?
 Word Count in Types


type Word = String
type Count = Int

String => Map[Word, Count]
Why Scalding?
                Word Count in Scala

val text = "a a a b b"

def wordCount(text: String): Map[Word, Count] =
  text
    .split(" ")
    .map(a => (a, 1))
    .groupBy(_._1)
    .map { a => a._1 -> a._2.map(_._2).sum }



wordCount(text) should equal (Map("a" -> 3, "b" -> 2))
Stuff > Memory
Scala collections... fun, but memory bound!

val text = "so many words... waaah! ..."        // in Memory

  text                                          // in Memory
    .split(" ")                                 // in Memory
    .map(a => (a, 1))                           // in Memory
    .groupBy(_._1)
    .map(a => (a._1, a._2.map(_._2).sum))       // in Memory
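If the input really does not fit in memory, plain Scala can at least stream it instead of materializing every intermediate collection. A minimal sketch (the file name is an assumption), using scala.io.Source so that only the running counts are held in memory:

import scala.io.Source

val counts = scala.collection.mutable.Map.empty[String, Int].withDefaultValue(0)
for {
  line <- Source.fromFile("big-input.txt").getLines()   // streams the file line by line
  word <- line.split(" ")
} counts(word) += 1

Even then the counts map itself has to fit on a single machine, which is exactly why the next step is Hadoop.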
Apache Hadoop (HDFS + MR)
    http://hadoop.apache.org/
Why Scalding?
                             Word Count in Hadoop MR



package org.myorg;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

public class WordCount {

    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
            }
        }
    }

    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("wordcount");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}
Trivia: How old is Hadoop?
Cascading
www.cascading.org/

Cascading
is
Taps & Pipes
& Sinks
1: Distributed Copy

// source Tap
Tap inTap = new Hfs(new TextDelimited(true, "\t"), inPath);

// sink Tap
Tap outTap = new Hfs(new TextDelimited(true, "\t"), outPath);

// a Pipe, connects taps
Pipe copyPipe = new Pipe("copy");

// build the Flow
FlowDef flowDef = FlowDef.flowDef()
  .addSource(copyPipe, inTap)
  .addTailSink(copyPipe, outTap);

// run!
flowConnector.connect(flowDef).complete();
1. DCP - Full Code
public class Main {
  public static void main(String[] args) {
    String inPath = args[0];
    String outPath = args[1];

    Properties props = new Properties();
    AppProps.setApplicationJarClass(props, Main.class);
    HadoopFlowConnector flowConnector = new HadoopFlowConnector(props);

    Tap inTap = new Hfs(new TextDelimited(true, "\t"), inPath);

    Tap outTap = new Hfs(new TextDelimited(true, "\t"), outPath);

    Pipe copyPipe = new Pipe("copy");

    FlowDef flowDef = FlowDef.flowDef()
      .addSource(copyPipe, inTap)
      .addTailSink(copyPipe, outTap);

    flowConnector.connect(flowDef).complete();
  }
}
2: Word Count

String docPath = args[ 0 ];
String wcPath = args[ 1 ];

Properties properties = new Properties();
AppProps.setApplicationJarClass( properties, Main.class );
HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );

// create source and sink taps
Tap docTap = new Hfs( new TextDelimited( true, "\t" ), docPath );
Tap wcTap = new Hfs( new TextDelimited( true, "\t" ), wcPath );

// specify a regex operation to split the "document" text lines into a token stream
Fields token = new Fields( "token" );
Fields text = new Fields( "text" );
RegexSplitGenerator splitter =
            new RegexSplitGenerator( token, "[ \\[\\]\\(\\),.]" );

// only returns "token"
Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS );

// determine the word counts
Pipe wcPipe = new Pipe( "wc", docPipe );
wcPipe = new GroupBy( wcPipe, token );
wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL );

// connect the taps, pipes, etc., into a flow
FlowDef flowDef = FlowDef.flowDef()
 .setName( "wc" )
 .addSource( docPipe, docTap )
 .addTailSink( wcPipe, wcTap );

// write a DOT file and run the flow
Flow wcFlow = flowConnector.connect( flowDef );
wcFlow.writeDOT( "dot/wc.dot" );
wcFlow.complete();
2: Word Count
How it's made

Graph representation of jobs!

http://www.cascading.org/2012/07/09/cascading-for-the-impatient-part-2/
How it's made
val flow = FlowDef

// pseudo code...
val jobs: List[MRJob] = flowConnector(flow)

// pseudo code...
HadoopCluster.execute(jobs)
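In real code the two pseudo steps collapse into calls on the Cascading API: connect() plans the FlowDef into a chain of MapReduce jobs, and complete() submits them and blocks until they finish. A minimal sketch (Scala calling the same Java API, reusing the flowDef and flowConnector from the examples above):

val flow = flowConnector.connect(flowDef)   // plan: FlowDef => chained MR jobs
flow.writeDOT("dot/flow.dot")               // optional: dump the planned job graph
flow.complete()                             // execute the jobs on the cluster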
Cascading tips
Pipe assembly = new Pipe( "assembly" );
assembly = new Each( assembly, DebugLevel.VERBOSE, new Debug() );
// ...

// head and tail have same name
FlowDef flowDef = new FlowDef()
  .setName( "debug" )
  .addSource( "assembly", source )
  .addSink( "assembly", sink )
  .addTail( assembly );


flowDef.setDebugLevel( DebugLevel.NONE );

                     flowConnector will NOT create the Debug pipe!
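The Scalding counterpart of this tip is the debug operation from the API list later in the deck; a minimal sketch (inside a Job, field names are assumptions) of dropping it into a pipe while developing, and removing it again before a production run:

TextLine(args("input"))
  .flatMap('line -> 'word) { line: String => line.split(" ") }
  .debug                                   // prints every tuple flowing through this point
  .groupBy('word) { _.size('count) }
  .write(Tsv(args("output")))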
Scalding
= Cascading + Scala

Twitter Scalding
github.com/twitter/scalding
Scalding API
map
 Scala:
  val data = 1 :: 2 :: 3 :: Nil

  val doubled = data map { _ * 2 }                  // Int => Int


 Scalding:
  IterableSource(data)
    .map('number -> 'doubled) { n: Int => n * 2 }   // Int => Int

      'number stays in the Pipe, 'doubled becomes available in the Pipe,
      and you must choose the type (n: Int)!
mapTo
 Scala:
  var data = 1 :: 2 :: 3 :: Nil

  val doubled = data map { _ * 2 }         // Int => Int
  data = null                              // release reference


 Scalding:
  IterableSource(data)
    .mapTo('doubled) { n: Int => n * 2 }   // Int => Int

      'number is removed, only 'doubled stays in the Pipe
flatMap
 Scala:
 val data = "1" :: "2,2" :: "3,3,3" :: Nil   // List[String]

 val numbers = data flatMap { line =>   // String
   line.split(",")                      // Array[String]
 } map { _.toInt }                      // List[Int]

 numbers             // List[Int]
 numbers should equal (List(1, 2, 2, 3, 3, 3))



Scalding:
  TextLine(data)                                 // like List[String]
    .flatMap('line -> 'word) { _.split(",") }    // like List[String]
    .map('word -> 'number) { _.toInt }           // like List[Int]

                     MR map outside
flatMap
 Scala:
 val data = "1" :: "2,2" :: "3,3,3" :: Nil   // List[String]

 val numbers = data flatMap { line =>   // String
   line.split(",").map(_.toInt)         // Array[Int]
 }

 numbers             // List[Int]
 numbers should equal (List(1, 2, 2, 3, 3, 3))



Scalding:
  TextLine(data)                               // like List[String]
    .flatMap('line -> 'word) { _.split(",").map(_.toInt) }
                                                  // like List[Int]
                          map inside Scala
groupBy
 Scala:
 val data = 1 :: 2 :: 30 :: 42 :: Nil         // List[Int]

 val groups = data groupBy { _ < 10 }

 groups                // Map[Boolean, List[Int]]

 groups(true) should equal (List(1, 2))
 groups(false) should equal (List(30, 42))



 Scalding:
 IterableSource(List(1, 2, 30, 42), 'num)
     .map('num -> 'lessThanTen) { i: Int => i < 10 }
     .groupBy('lessThanTen) { _.size('size) }

 groups all rows with an equal 'lessThanTen value   => 'size
groupBy


Scalding:
 IterableSource(List(1, 2, 30, 42), 'num)
     .map('num -> 'lessThanTen) { i: Int => i < 10 }
     .groupBy('lessThanTen) { _.sum('total) }

                              'total = [3, 74]
Scalding API
             project / discard
               map / mapTo
            flatMap / flatMapTo
                 rename
                   filter
                  unique
groupBy / groupAll / groupRandom / shuffle
                   limit
                  debug

          Group operations

                 joins (see the join sketch below)
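Joins deserve a quick example of their own; a minimal sketch (inside a Job, sources and field names are made up) using joinWithSmaller, the variant to pick when the right-hand pipe is the smaller one (joinWithLarger and joinWithTiny follow the same shape):

val users  = Tsv("users.tsv",  ('userId, 'name))
val visits = Tsv("visits.tsv", ('visitorId, 'url))

visits.read
  .joinWithSmaller('visitorId -> 'userId, users.read)   // inner join on the two id fields
  .project('name, 'url)
  .write(Tsv("visited-by.tsv"))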
Distributed Copy in Scalding

class WordCountJob(args: Args) extends Job(args) {

    val input = Tsv(args("input"))
    val output = Tsv(args("output"))

    input.read.write(output)

}




                      The End.
Main Class - "Runner"

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.util.ToolRunner
import com.twitter.scalding

object ScaldingJobRunner extends App {   // args comes from the App trait

    ToolRunner.run(new Configuration, new scalding.Tool, args)

}
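With such a runner on the classpath the job is submitted like any other Hadoop tool. A hypothetical invocation (the jar name is made up, the job class is the one used later in the deck):

hadoop jar wordcount-assembly.jar ScaldingJobRunner pl.project13.scala.oculus.job.WordCountJob --hdfs --input books.txt --output counts.tsv

The first argument is the Job class to run, --hdfs (or --local) picks the execution mode, and --input / --output show up inside the Job as args("input") and args("output").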
Word Count in Scalding
class WordCountJob(args: Args) extends Job(args) {

    val inputFile = args("input")
    val outputFile = args("output")

    // the promised 4 lines:
    TextLine(inputFile)
      .flatMap('line -> 'word) { line: String => tokenize(line) }
      .groupBy('word) { _.size }
      .write(Tsv(outputFile))

    def tokenize(text: String): Array[String] = implemented
}
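The tokenize helper is only marked as implemented on the slide; a minimal sketch of what such a tokenizer usually looks like (the exact regexes are an assumption):

def tokenize(text: String): Array[String] =
  text.toLowerCase
    .replaceAll("[^a-zA-Z0-9\\s]", "")   // strip punctuation
    .split("\\s+")                       // split on whitespace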
Word Count in Scalding
run pl.project13.scala.oculus.job.WordCountJob --tool.graph

=> pl.project13.scala.oculus.job.WordCountJob0.dot

(the generated .dot graph shows the MAP and REDUCE steps of the planned job)
Word Count in Scalding
TextLine(inputFile)
  .flatMap('line -> 'word) { line: String => tokenize(line) }
  .groupBy('word) { _.size('count) }
  .write(Tsv(outputFile))
Why Scalding?


    Hadoop inside
Cascading abstractions
  Scala conciseness
Ask Stuff!

      Dzięki!
      Thanks!
     ありがとう!


Konrad Malawski @ java.pl
t: ktosopl / g: ktoso
b: blog.project13.pl

Open soucerers - jak zacząć swoją przygodę z open source
 
Android my Scala @ JFokus 2013
Android my Scala @ JFokus 2013Android my Scala @ JFokus 2013
Android my Scala @ JFokus 2013
 
HBase RowKey design for Akka Persistence
HBase RowKey design for Akka PersistenceHBase RowKey design for Akka Persistence
HBase RowKey design for Akka Persistence
 
Need for Async: Hot pursuit for scalable applications
Need for Async: Hot pursuit for scalable applicationsNeed for Async: Hot pursuit for scalable applications
Need for Async: Hot pursuit for scalable applications
 
Ebay legacy-code-retreat
Ebay legacy-code-retreatEbay legacy-code-retreat
Ebay legacy-code-retreat
 
Hadoop Summit 2012 | Optimizing MapReduce Job Performance
Hadoop Summit 2012 | Optimizing MapReduce Job PerformanceHadoop Summit 2012 | Optimizing MapReduce Job Performance
Hadoop Summit 2012 | Optimizing MapReduce Job Performance
 
KrakDroid: Scala on Android
KrakDroid: Scala on AndroidKrakDroid: Scala on Android
KrakDroid: Scala on Android
 
[Tokyo Scala User Group] Akka Streams & Reactive Streams (0.7)
[Tokyo Scala User Group] Akka Streams & Reactive Streams (0.7)[Tokyo Scala User Group] Akka Streams & Reactive Streams (0.7)
[Tokyo Scala User Group] Akka Streams & Reactive Streams (0.7)
 
100th SCKRK Meeting - best software engineering papers of 5 years of SCKRK
100th SCKRK Meeting - best software engineering papers of 5 years of SCKRK100th SCKRK Meeting - best software engineering papers of 5 years of SCKRK
100th SCKRK Meeting - best software engineering papers of 5 years of SCKRK
 
Fresh from the Oven (04.2015): Experimental Akka Typed and Akka Streams
Fresh from the Oven (04.2015): Experimental Akka Typed and Akka StreamsFresh from the Oven (04.2015): Experimental Akka Typed and Akka Streams
Fresh from the Oven (04.2015): Experimental Akka Typed and Akka Streams
 
Disrupt 2 Grow - Devoxx 2013
Disrupt 2 Grow - Devoxx 2013Disrupt 2 Grow - Devoxx 2013
Disrupt 2 Grow - Devoxx 2013
 

Similar to Scalding - Hadoop Word Count in LESS than 70 lines of code

Refactoring to Macros with Clojure
Refactoring to Macros with ClojureRefactoring to Macros with Clojure
Refactoring to Macros with ClojureDmitry Buzdin
 
Scalable and Flexible Machine Learning With Scala @ LinkedIn
Scalable and Flexible Machine Learning With Scala @ LinkedInScalable and Flexible Machine Learning With Scala @ LinkedIn
Scalable and Flexible Machine Learning With Scala @ LinkedInVitaly Gordon
 
Behm Shah Pagerank
Behm Shah PagerankBehm Shah Pagerank
Behm Shah Pagerankgothicane
 
Stream or not to Stream?

Stream or not to Stream?
Stream or not to Stream?

Stream or not to Stream?
Lukasz Byczynski
 
Neatly Hashing a Tree: FP tree-fold in Perl5 & Perl6
Neatly Hashing a Tree: FP tree-fold in Perl5 & Perl6Neatly Hashing a Tree: FP tree-fold in Perl5 & Perl6
Neatly Hashing a Tree: FP tree-fold in Perl5 & Perl6Workhorse Computing
 
Advance Map reduce - Apache hadoop Bigdata training by Design Pathshala
Advance Map reduce - Apache hadoop Bigdata training by Design PathshalaAdvance Map reduce - Apache hadoop Bigdata training by Design Pathshala
Advance Map reduce - Apache hadoop Bigdata training by Design PathshalaDesing Pathshala
 
Introduction to Spark with Scala
Introduction to Spark with ScalaIntroduction to Spark with Scala
Introduction to Spark with ScalaHimanshu Gupta
 
Open XKE - Big Data, Big Mess par Bertrand Dechoux
Open XKE - Big Data, Big Mess par Bertrand DechouxOpen XKE - Big Data, Big Mess par Bertrand Dechoux
Open XKE - Big Data, Big Mess par Bertrand DechouxPublicis Sapient Engineering
 
Apache Spark for Library Developers with William Benton and Erik Erlandson
 Apache Spark for Library Developers with William Benton and Erik Erlandson Apache Spark for Library Developers with William Benton and Erik Erlandson
Apache Spark for Library Developers with William Benton and Erik ErlandsonDatabricks
 
Scala @ TechMeetup Edinburgh
Scala @ TechMeetup EdinburghScala @ TechMeetup Edinburgh
Scala @ TechMeetup EdinburghStuart Roebuck
 
Scala and big data in ICM. Scoobie, Scalding, Spark, Stratosphere. Scalar 2014
Scala and big data in ICM. Scoobie, Scalding, Spark, Stratosphere. Scalar 2014Scala and big data in ICM. Scoobie, Scalding, Spark, Stratosphere. Scalar 2014
Scala and big data in ICM. Scoobie, Scalding, Spark, Stratosphere. Scalar 2014Michał Oniszczuk
 
Apache Spark on Apache HBase: Current and Future
Apache Spark on Apache HBase: Current and Future Apache Spark on Apache HBase: Current and Future
Apache Spark on Apache HBase: Current and Future HBaseCon
 
Hadoop Integration in Cassandra
Hadoop Integration in CassandraHadoop Integration in Cassandra
Hadoop Integration in CassandraJairam Chandar
 
JRubyKaigi2010 Hadoop Papyrus
JRubyKaigi2010 Hadoop PapyrusJRubyKaigi2010 Hadoop Papyrus
JRubyKaigi2010 Hadoop PapyrusKoichi Fujikawa
 

Similar to Scalding - Hadoop Word Count in LESS than 70 lines of code (20)

Refactoring to Macros with Clojure
Refactoring to Macros with ClojureRefactoring to Macros with Clojure
Refactoring to Macros with Clojure
 
Scalable and Flexible Machine Learning With Scala @ LinkedIn
Scalable and Flexible Machine Learning With Scala @ LinkedInScalable and Flexible Machine Learning With Scala @ LinkedIn
Scalable and Flexible Machine Learning With Scala @ LinkedIn
 
Behm Shah Pagerank
Behm Shah PagerankBehm Shah Pagerank
Behm Shah Pagerank
 
Stream or not to Stream?

Stream or not to Stream?
Stream or not to Stream?

Stream or not to Stream?

 
Osd ctw spark
Osd ctw sparkOsd ctw spark
Osd ctw spark
 
Neatly Hashing a Tree: FP tree-fold in Perl5 & Perl6
Neatly Hashing a Tree: FP tree-fold in Perl5 & Perl6Neatly Hashing a Tree: FP tree-fold in Perl5 & Perl6
Neatly Hashing a Tree: FP tree-fold in Perl5 & Perl6
 
Advance Map reduce - Apache hadoop Bigdata training by Design Pathshala
Advance Map reduce - Apache hadoop Bigdata training by Design PathshalaAdvance Map reduce - Apache hadoop Bigdata training by Design Pathshala
Advance Map reduce - Apache hadoop Bigdata training by Design Pathshala
 
Introduction to Spark with Scala
Introduction to Spark with ScalaIntroduction to Spark with Scala
Introduction to Spark with Scala
 
Open XKE - Big Data, Big Mess par Bertrand Dechoux
Open XKE - Big Data, Big Mess par Bertrand DechouxOpen XKE - Big Data, Big Mess par Bertrand Dechoux
Open XKE - Big Data, Big Mess par Bertrand Dechoux
 
Apache Spark - Aram Mkrtchyan
Apache Spark - Aram MkrtchyanApache Spark - Aram Mkrtchyan
Apache Spark - Aram Mkrtchyan
 
Hw09 Hadoop + Clojure
Hw09   Hadoop + ClojureHw09   Hadoop + Clojure
Hw09 Hadoop + Clojure
 
Hadoop
HadoopHadoop
Hadoop
 
Hadoop + Clojure
Hadoop + ClojureHadoop + Clojure
Hadoop + Clojure
 
Apache Spark for Library Developers with William Benton and Erik Erlandson
 Apache Spark for Library Developers with William Benton and Erik Erlandson Apache Spark for Library Developers with William Benton and Erik Erlandson
Apache Spark for Library Developers with William Benton and Erik Erlandson
 
Scala @ TechMeetup Edinburgh
Scala @ TechMeetup EdinburghScala @ TechMeetup Edinburgh
Scala @ TechMeetup Edinburgh
 
Tuples All the Way Down
Tuples All the Way DownTuples All the Way Down
Tuples All the Way Down
 
Scala and big data in ICM. Scoobie, Scalding, Spark, Stratosphere. Scalar 2014
Scala and big data in ICM. Scoobie, Scalding, Spark, Stratosphere. Scalar 2014Scala and big data in ICM. Scoobie, Scalding, Spark, Stratosphere. Scalar 2014
Scala and big data in ICM. Scoobie, Scalding, Spark, Stratosphere. Scalar 2014
 
Apache Spark on Apache HBase: Current and Future
Apache Spark on Apache HBase: Current and Future Apache Spark on Apache HBase: Current and Future
Apache Spark on Apache HBase: Current and Future
 
Hadoop Integration in Cassandra
Hadoop Integration in CassandraHadoop Integration in Cassandra
Hadoop Integration in Cassandra
 
JRubyKaigi2010 Hadoop Papyrus
JRubyKaigi2010 Hadoop PapyrusJRubyKaigi2010 Hadoop Papyrus
JRubyKaigi2010 Hadoop Papyrus
 

More from Konrad Malawski

Networks and Types - the Future of Akka @ ScalaDays NYC 2018
Networks and Types - the Future of Akka @ ScalaDays NYC 2018Networks and Types - the Future of Akka @ ScalaDays NYC 2018
Networks and Types - the Future of Akka @ ScalaDays NYC 2018Konrad Malawski
 
Akka Typed (quick talk) - JFokus 2018
Akka Typed (quick talk) - JFokus 2018Akka Typed (quick talk) - JFokus 2018
Akka Typed (quick talk) - JFokus 2018Konrad Malawski
 
ScalaSwarm 2017 Keynote: Tough this be madness yet theres method in't
ScalaSwarm 2017 Keynote: Tough this be madness yet theres method in'tScalaSwarm 2017 Keynote: Tough this be madness yet theres method in't
ScalaSwarm 2017 Keynote: Tough this be madness yet theres method in'tKonrad Malawski
 
State of Akka 2017 - The best is yet to come
State of Akka 2017 - The best is yet to comeState of Akka 2017 - The best is yet to come
State of Akka 2017 - The best is yet to comeKonrad Malawski
 
Building a Reactive System with Akka - Workshop @ O'Reilly SAConf NYC
Building a Reactive System with Akka - Workshop @ O'Reilly SAConf NYCBuilding a Reactive System with Akka - Workshop @ O'Reilly SAConf NYC
Building a Reactive System with Akka - Workshop @ O'Reilly SAConf NYCKonrad Malawski
 
Akka-chan's Survival Guide for the Streaming World
Akka-chan's Survival Guide for the Streaming WorldAkka-chan's Survival Guide for the Streaming World
Akka-chan's Survival Guide for the Streaming WorldKonrad Malawski
 
Reactive integrations with Akka Streams
Reactive integrations with Akka StreamsReactive integrations with Akka Streams
Reactive integrations with Akka StreamsKonrad Malawski
 
Not Only Streams for Akademia JLabs
Not Only Streams for Akademia JLabsNot Only Streams for Akademia JLabs
Not Only Streams for Akademia JLabsKonrad Malawski
 
Reactive Streams, j.u.concurrent & Beyond!
Reactive Streams, j.u.concurrent & Beyond!Reactive Streams, j.u.concurrent & Beyond!
Reactive Streams, j.u.concurrent & Beyond!Konrad Malawski
 
End to End Akka Streams / Reactive Streams - from Business to Socket
End to End Akka Streams / Reactive Streams - from Business to SocketEnd to End Akka Streams / Reactive Streams - from Business to Socket
End to End Akka Streams / Reactive Streams - from Business to SocketKonrad Malawski
 
The Cloud-natives are RESTless @ JavaOne
The Cloud-natives are RESTless @ JavaOneThe Cloud-natives are RESTless @ JavaOne
The Cloud-natives are RESTless @ JavaOneKonrad Malawski
 
Akka Streams in Action @ ScalaDays Berlin 2016
Akka Streams in Action @ ScalaDays Berlin 2016Akka Streams in Action @ ScalaDays Berlin 2016
Akka Streams in Action @ ScalaDays Berlin 2016Konrad Malawski
 
Krakow communities @ 2016
Krakow communities @ 2016Krakow communities @ 2016
Krakow communities @ 2016Konrad Malawski
 
The things we don't see – stories of Software, Scala and Akka
The things we don't see – stories of Software, Scala and AkkaThe things we don't see – stories of Software, Scala and Akka
The things we don't see – stories of Software, Scala and AkkaKonrad Malawski
 
[Japanese] How Reactive Streams and Akka Streams change the JVM Ecosystem @ R...
[Japanese] How Reactive Streams and Akka Streams change the JVM Ecosystem @ R...[Japanese] How Reactive Streams and Akka Streams change the JVM Ecosystem @ R...
[Japanese] How Reactive Streams and Akka Streams change the JVM Ecosystem @ R...Konrad Malawski
 
How Reactive Streams & Akka Streams change the JVM Ecosystem
How Reactive Streams & Akka Streams change the JVM EcosystemHow Reactive Streams & Akka Streams change the JVM Ecosystem
How Reactive Streams & Akka Streams change the JVM EcosystemKonrad Malawski
 
The Need for Async @ ScalaWorld
The Need for Async @ ScalaWorldThe Need for Async @ ScalaWorld
The Need for Async @ ScalaWorldKonrad Malawski
 
Reactive Stream Processing with Akka Streams
Reactive Stream Processing with Akka StreamsReactive Stream Processing with Akka Streams
Reactive Stream Processing with Akka StreamsKonrad Malawski
 
Reactive Streams / Akka Streams - GeeCON Prague 2014
Reactive Streams / Akka Streams - GeeCON Prague 2014Reactive Streams / Akka Streams - GeeCON Prague 2014
Reactive Streams / Akka Streams - GeeCON Prague 2014Konrad Malawski
 

More from Konrad Malawski (20)

Networks and Types - the Future of Akka @ ScalaDays NYC 2018
Networks and Types - the Future of Akka @ ScalaDays NYC 2018Networks and Types - the Future of Akka @ ScalaDays NYC 2018
Networks and Types - the Future of Akka @ ScalaDays NYC 2018
 
Akka Typed (quick talk) - JFokus 2018
Akka Typed (quick talk) - JFokus 2018Akka Typed (quick talk) - JFokus 2018
Akka Typed (quick talk) - JFokus 2018
 
ScalaSwarm 2017 Keynote: Tough this be madness yet theres method in't
ScalaSwarm 2017 Keynote: Tough this be madness yet theres method in'tScalaSwarm 2017 Keynote: Tough this be madness yet theres method in't
ScalaSwarm 2017 Keynote: Tough this be madness yet theres method in't
 
State of Akka 2017 - The best is yet to come
State of Akka 2017 - The best is yet to comeState of Akka 2017 - The best is yet to come
State of Akka 2017 - The best is yet to come
 
Building a Reactive System with Akka - Workshop @ O'Reilly SAConf NYC
Building a Reactive System with Akka - Workshop @ O'Reilly SAConf NYCBuilding a Reactive System with Akka - Workshop @ O'Reilly SAConf NYC
Building a Reactive System with Akka - Workshop @ O'Reilly SAConf NYC
 
Akka-chan's Survival Guide for the Streaming World
Akka-chan's Survival Guide for the Streaming WorldAkka-chan's Survival Guide for the Streaming World
Akka-chan's Survival Guide for the Streaming World
 
Reactive integrations with Akka Streams
Reactive integrations with Akka StreamsReactive integrations with Akka Streams
Reactive integrations with Akka Streams
 
Not Only Streams for Akademia JLabs
Not Only Streams for Akademia JLabsNot Only Streams for Akademia JLabs
Not Only Streams for Akademia JLabs
 
Reactive Streams, j.u.concurrent & Beyond!
Reactive Streams, j.u.concurrent & Beyond!Reactive Streams, j.u.concurrent & Beyond!
Reactive Streams, j.u.concurrent & Beyond!
 
End to End Akka Streams / Reactive Streams - from Business to Socket
End to End Akka Streams / Reactive Streams - from Business to SocketEnd to End Akka Streams / Reactive Streams - from Business to Socket
End to End Akka Streams / Reactive Streams - from Business to Socket
 
The Cloud-natives are RESTless @ JavaOne
The Cloud-natives are RESTless @ JavaOneThe Cloud-natives are RESTless @ JavaOne
The Cloud-natives are RESTless @ JavaOne
 
Akka Streams in Action @ ScalaDays Berlin 2016
Akka Streams in Action @ ScalaDays Berlin 2016Akka Streams in Action @ ScalaDays Berlin 2016
Akka Streams in Action @ ScalaDays Berlin 2016
 
Krakow communities @ 2016
Krakow communities @ 2016Krakow communities @ 2016
Krakow communities @ 2016
 
The things we don't see – stories of Software, Scala and Akka
The things we don't see – stories of Software, Scala and AkkaThe things we don't see – stories of Software, Scala and Akka
The things we don't see – stories of Software, Scala and Akka
 
[Japanese] How Reactive Streams and Akka Streams change the JVM Ecosystem @ R...
[Japanese] How Reactive Streams and Akka Streams change the JVM Ecosystem @ R...[Japanese] How Reactive Streams and Akka Streams change the JVM Ecosystem @ R...
[Japanese] How Reactive Streams and Akka Streams change the JVM Ecosystem @ R...
 
Zen of Akka
Zen of AkkaZen of Akka
Zen of Akka
 
How Reactive Streams & Akka Streams change the JVM Ecosystem
How Reactive Streams & Akka Streams change the JVM EcosystemHow Reactive Streams & Akka Streams change the JVM Ecosystem
How Reactive Streams & Akka Streams change the JVM Ecosystem
 
The Need for Async @ ScalaWorld
The Need for Async @ ScalaWorldThe Need for Async @ ScalaWorld
The Need for Async @ ScalaWorld
 
Reactive Stream Processing with Akka Streams
Reactive Stream Processing with Akka StreamsReactive Stream Processing with Akka Streams
Reactive Stream Processing with Akka Streams
 
Reactive Streams / Akka Streams - GeeCON Prague 2014
Reactive Streams / Akka Streams - GeeCON Prague 2014Reactive Streams / Akka Streams - GeeCON Prague 2014
Reactive Streams / Akka Streams - GeeCON Prague 2014
 

Recently uploaded

Vertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering TipsVertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering TipsMiki Katsuragi
 
Dev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebDev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebUiPathCommunity
 
Powerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time ClashPowerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time Clashcharlottematthew16
 
SAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxSAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxNavinnSomaal
 
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...Patryk Bandurski
 
Developer Data Modeling Mistakes: From Postgres to NoSQL
Developer Data Modeling Mistakes: From Postgres to NoSQLDeveloper Data Modeling Mistakes: From Postgres to NoSQL
Developer Data Modeling Mistakes: From Postgres to NoSQLScyllaDB
 
CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):comworks
 
Human Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsHuman Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsMark Billinghurst
 
costume and set research powerpoint presentation
costume and set research powerpoint presentationcostume and set research powerpoint presentation
costume and set research powerpoint presentationphoebematthew05
 
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)Mark Simos
 
Connect Wave/ connectwave Pitch Deck Presentation
Connect Wave/ connectwave Pitch Deck PresentationConnect Wave/ connectwave Pitch Deck Presentation
Connect Wave/ connectwave Pitch Deck PresentationSlibray Presentation
 
Artificial intelligence in cctv survelliance.pptx
Artificial intelligence in cctv survelliance.pptxArtificial intelligence in cctv survelliance.pptx
Artificial intelligence in cctv survelliance.pptxhariprasad279825
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek SchlawackFwdays
 
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr LapshynFwdays
 
DevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platformsDevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platformsSergiu Bodiu
 
WordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your BrandWordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your Brandgvaughan
 
Scanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL CertsScanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL CertsRizwan Syed
 
SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024Scott Keck-Warren
 
Designing IA for AI - Information Architecture Conference 2024
Designing IA for AI - Information Architecture Conference 2024Designing IA for AI - Information Architecture Conference 2024
Designing IA for AI - Information Architecture Conference 2024Enterprise Knowledge
 

Recently uploaded (20)

Vertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering TipsVertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering Tips
 
Dev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebDev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio Web
 
Powerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time ClashPowerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time Clash
 
SAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxSAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptx
 
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
 
Developer Data Modeling Mistakes: From Postgres to NoSQL
Developer Data Modeling Mistakes: From Postgres to NoSQLDeveloper Data Modeling Mistakes: From Postgres to NoSQL
Developer Data Modeling Mistakes: From Postgres to NoSQL
 
CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):
 
DMCC Future of Trade Web3 - Special Edition
DMCC Future of Trade Web3 - Special EditionDMCC Future of Trade Web3 - Special Edition
DMCC Future of Trade Web3 - Special Edition
 
Human Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsHuman Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR Systems
 
costume and set research powerpoint presentation
costume and set research powerpoint presentationcostume and set research powerpoint presentation
costume and set research powerpoint presentation
 
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
 
Connect Wave/ connectwave Pitch Deck Presentation
Connect Wave/ connectwave Pitch Deck PresentationConnect Wave/ connectwave Pitch Deck Presentation
Connect Wave/ connectwave Pitch Deck Presentation
 
Artificial intelligence in cctv survelliance.pptx
Artificial intelligence in cctv survelliance.pptxArtificial intelligence in cctv survelliance.pptx
Artificial intelligence in cctv survelliance.pptx
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
 
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
 
DevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platformsDevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platforms
 
WordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your BrandWordPress Websites for Engineers: Elevate Your Brand
WordPress Websites for Engineers: Elevate Your Brand
 
Scanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL CertsScanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL Certs
 
SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024
 
Designing IA for AI - Information Architecture Conference 2024
Designing IA for AI - Information Architecture Conference 2024Designing IA for AI - Information Architecture Conference 2024
Designing IA for AI - Information Architecture Conference 2024
 

Scalding - Hadoop Word Count in LESS than 70 lines of code

  • 1. Scalding Hadoop Word Count in < 70 lines of code Konrad 'ktoso' Malawski JARCamp #3 12.04.2013
  • 2. Scalding Hadoop Word Count in 4 lines of code Konrad 'ktoso' Malawski JARCamp #3 12.04.2013
  • 3. softwaremill.com / java.pl / sckrk.com / geecon.org / krakowscala.pl / gdgkrakow.pl
  • 7. Agenda Why Scalding? (10%) + Hadoop Basics (20%)
  • 8. Agenda Why Scalding? (10%) + Hadoop Basics (20%) +
  • 9. Agenda Why Scalding? (10%) + Hadoop Basics (20%) + Enter Cascading (40%)
  • 10. Agenda Why Scalding? (10%) + Hadoop Basics (20%) + Enter Cascading (40%) +
  • 11. Agenda Why Scalding? (10%) + Hadoop Basics (20%) + Enter Cascading (40%) + Hello Scalding (30%)
  • 12. Agenda Why Scalding? (10%) + Hadoop Basics (20%) + Enter Cascading (40%) + Hello Scalding (30%) =
  • 13. Agenda Why Scalding? (10%) + Hadoop Basics (20%) + Enter Cascading (40%) + Hello Scalding (30%) = 100%
  • 14. Why Scalding? Word Count in Types type Word = String type Count = Int String => Map[Word, Count]
  • 15. Why Scalding? Word Count in Scala
  • 16. Why Scalding? Word Count in Scala val text = "a a a b b"
  • 17. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] =
  • 18. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text
  • 19. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text .split(" ")
  • 20. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text .split(" ") .map(a => (a, 1))
  • 21. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text .split(" ") .map(a => (a, 1)) .groupBy(_._1)
  • 22. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map { a => a._1 -> a._2.map(_._2).sum }
  • 23. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map { a => a._1 -> a._2.map(_._2).sum } wordCount(text) should equal (Map("a" -> 3), ("b" -> 2)))
  • 24. Stuff > Memory Scala collections... fun but, memory bound! val text = "so many words... waaah! ..." text .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum))
  • 25. Stuff > Memory Scala collections... fun but, memory bound! in Memory val text = "so many words... waaah! ..." text .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum))
  • 26. Stuff > Memory Scala collections... fun but, memory bound! in Memory val text = "so many words... waaah! ..." in Memory text .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum))
  • 27. Stuff > Memory Scala collections... fun but, memory bound! in Memory val text = "so many words... waaah! ..." in Memory text in Memory .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum))
  • 28. Stuff > Memory Scala collections... fun but, memory bound! in Memory val text = "so many words... waaah! ..." in Memory text in Memory .split(" ") .map(a => (a, 1)) in Memory .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum))
  • 29. Stuff > Memory Scala collections... fun but, memory bound! in Memory val text = "so many words... waaah! ..." in Memory text in Memory .split(" ") .map(a => (a, 1)) in Memory .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum)) in Memory
  • 30. Apache Hadoop (HDFS + MR) http://hadoop.apache.org/
  • 31. Why Scalding? Word Count in Hadoop MR package org.myorg; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.*; import java.io.IOException; import java.util.Iterator; import java.util.StringTokenizer; public class WordCount { public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> { private final static IntWritable one = new IntWritable(1); private Text word = new Text(); public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { String line = value.toString(); StringTokenizer tokenizer = new StringTokenizer(line); while (tokenizer.hasMoreTokens()) { word.set(tokenizer.nextToken()); output.collect(word, one);
  • 32. private final static IntWritable one = new IntWritable(1); Why Scalding? private Text word = new Text(); public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { String line = value.toString(); StringTokenizer tokenizer = new StringTokenizer(line); while (tokenizer.hasMoreTokens()) { word.set(tokenizer.nextToken()); Word Count in Hadoop MR output.collect(word, one); } } } public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> { public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { int sum = 0; while (values.hasNext()) { sum += values.next().get(); } output.collect(key, new IntWritable(sum)); } } public static void main(String[] args) throws Exception { JobConf conf = new JobConf(WordCount.class); conf.setJobName("wordcount"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(Map.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf); } }
  • 33. Trivia: How old is Hadoop?
  • 34.
  • 35.
  • 36.
  • 37.
  • 38.
  • 39.
  • 40.
  • 41.
  • 44. Cascading is
  • 45. Cascading is Taps & Pipes
  • 46. Cascading is Taps & Pipes & Sinks
  • 49. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath);
  • 50. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath);
  • 51. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); // a Pipe, connects taps Pipe copyPipe = new Pipe("copy");
  • 52. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); // a Pipe, connects taps Pipe copyPipe = new Pipe("copy"); // build the Flow FlowDef flowDef = FlowDef.flowDef()
  • 53. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); // a Pipe, connects taps Pipe copyPipe = new Pipe("copy"); // build the Flow FlowDef flowDef = FlowDef.flowDef() .addSource( copyPipe, inTap )
  • 54. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); // a Pipe, connects taps Pipe copyPipe = new Pipe("copy"); // build the Flow FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap);
  • 55. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); // a Pipe, connects taps Pipe copyPipe = new Pipe("copy"); // build the Flow FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); // run! flowConnector.connect(flowDef).complete();
  • 56. 1. DCP - Full Code public class Main { public static void main(String[] args ) { String inPath = args[0]; String outPath = args[1]; Properties props = new Properties(); AppProps.setApplicationJarClass(properties, Main.class); HadoopFlowConnector flowConnector = new HadoopFlowConnector(props); Tap inTap = new Hfs( new TextDelimited(true, "t"), inPath); Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); Pipe copyPipe = new Pipe("copy"); FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); flowConnector.connect(flowDef).complete(); } }
  • 57. 1. DCP - Full Code public class Main { public static void main(String[] args ) { String inPath = args[0]; String outPath = args[1]; Properties props = new Properties(); AppProps.setApplicationJarClass(properties, Main.class); HadoopFlowConnector flowConnector = new HadoopFlowConnector(props); Tap inTap = new Hfs( new TextDelimited(true, "t"), inPath); Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); Pipe copyPipe = new Pipe("copy"); FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); flowConnector.connect(flowDef).complete(); } }
  • 58. 1. DCP - Full Code public class Main { public static void main(String[] args ) { String inPath = args[0]; String outPath = args[1]; Properties props = new Properties(); AppProps.setApplicationJarClass(properties, Main.class); HadoopFlowConnector flowConnector = new HadoopFlowConnector(props); Tap inTap = new Hfs( new TextDelimited(true, "t"), inPath); Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); Pipe copyPipe = new Pipe("copy"); FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); flowConnector.connect(flowDef).complete(); } }
  • 59. 1. DCP - Full Code public class Main { public static void main(String[] args ) { String inPath = args[0]; String outPath = args[1]; Properties props = new Properties(); AppProps.setApplicationJarClass(properties, Main.class); HadoopFlowConnector flowConnector = new HadoopFlowConnector(props); Tap inTap = new Hfs( new TextDelimited(true, "t"), inPath); Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); Pipe copyPipe = new Pipe("copy"); FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); flowConnector.connect(flowDef).complete(); } }
  • 60. 1. DCP - Full Code public class Main { public static void main(String[] args ) { String inPath = args[0]; String outPath = args[1]; Properties props = new Properties(); AppProps.setApplicationJarClass(properties, Main.class); HadoopFlowConnector flowConnector = new HadoopFlowConnector(props); Tap inTap = new Hfs( new TextDelimited(true, "t"), inPath); Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); Pipe copyPipe = new Pipe("copy"); FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); flowConnector.connect(flowDef).complete(); } }
  • 61. 1. DCP - Full Code public class Main { public static void main(String[] args ) { String inPath = args[0]; String outPath = args[1]; Properties props = new Properties(); AppProps.setApplicationJarClass(properties, Main.class); HadoopFlowConnector flowConnector = new HadoopFlowConnector(props); Tap inTap = new Hfs( new TextDelimited(true, "t"), inPath); Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); Pipe copyPipe = new Pipe("copy"); FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); flowConnector.connect(flowDef).complete(); } }
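The distributed copy above is exactly the kind of plumbing Scalding hides. A minimal sketch of the same copy in Scalding's fields-based API (the input/output arguments and the tab-delimited Tsv tap are assumptions, not taken from the deck):

import com.twitter.scalding._

class DistributedCopyJob(args: Args) extends Job(args) {
  // Tsv plays the role of Hfs + TextDelimited above: a tab-delimited source/sink tap
  Tsv(args("input")).read
    .write(Tsv(args("output")))
}

Run it like any Scalding job, e.g. with --hdfs --input <in> --output <out>.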
  • 62. 2: Word Count String docPath = args[ 0 ]; String wcPath = args[ 1 ]; Properties properties = new Properties(); AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath );
  • 63. 2: Word Count String docPath = args[ 0 ]; String wcPath = args[ 1 ]; Properties properties = new Properties(); AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); // specify a regex operation to split the "document" text lines into a token stream
  • 64. 2: Word Count String docPath = args[ 0 ]; String wcPath = args[ 1 ]; Properties properties = new Properties(); AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); // specify a regex operation to split the "document" text lines into a token stream Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" );
  • 65. 2: Word Count String docPath = args[ 0 ]; String wcPath = args[ 1 ]; Properties properties = new Properties(); AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL );
  • 66. String wcPath = args[ 1 ]; 2: Word Count 2: Word Count Properties properties = new Properties(); AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" )
  • 67. AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); 2: Word Count // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow
  • 68. AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); 2: Word Count // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow
  • 69. AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); 2: Word Count // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow
  • 70. AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); 2: Word Count // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow
  • 71. AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); 2: Word Count // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow
  • 72. AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); 2: Word Count // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow
  • 73. AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); 2: Word Count // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow
  • 74. Fields token = new Fields( "token" ); 2: Word Count Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow Flow wcFlow = flowConnector.connect( flowDef ); wcFlow.writeDOT( "dot/wc.dot" ); wcFlow.complete(); } }
  • 75. Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); 2: Word Count How it's made // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow Flow wcFlow = flowConnector.connect( flowDef ); wcFlow.writeDOT( "dot/wc.dot" ); wcFlow.complete(); } }
  • 76. Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); 2: Word Count How it's made // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow Flow wcFlow = flowConnector.connect( flowDef ); wcFlow.writeDOT( "dot/wc.dot" ); wcFlow.complete(); } Graph representation of jobs! }
  • 77. 2: Word Count How it's made http://www.cascading.org/2012/07/09/cascading-for-the-impatient-part-2/
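For contrast with the Cascading version above, roughly the same word count in Scalding's fields-based API; these are the handful of lines the talk's title alludes to (a sketch: the argument names and the whitespace tokenizer are assumptions):

import com.twitter.scalding._

class WordCountJob(args: Args) extends Job(args) {
  TextLine(args("input"))
    .flatMap('line -> 'word) { line: String => line.toLowerCase.split("\\s+") }
    .groupBy('word) { _.size('count) }
    .write(Tsv(args("output")))
}

The planner still builds the same kind of flow graph underneath; the Taps, Pipes and the GroupBy/Every plumbing are generated for you.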
  • 79. How it's made val flow = FlowDef
  • 80. How it's made val flow = FlowDef // pseudo code...
  • 81. How it's made val flow = FlowDef // pseudo code... val jobs: List[MRJob] = flowConnector(flow)
  • 82. How it's made val flow = FlowDef // pseudo code... val jobs: List[MRJob] = flowConnector(flow) // pseudo code...
  • 83. How it's made val flow = FlowDef // pseudo code... val jobs: List[MRJob] = flowConnector(flow) // pseudo code... HadoopCluster.execute(jobs)
  • 84. How it's made val flow = FlowDef // pseudo code... val jobs: List[MRJob] = flowConnector(flow) // pseudo code... HadoopCluster.execute(jobs)
  • 85. Cascading tips Pipe assembly = new Pipe( "assembly" ); assembly = new Each( assembly, DebugLevel.VERBOSE, new Debug() ); // ... // head and tail have same name FlowDef flowDef = new FlowDef() .setName( "debug" ) .addSource( "assembly", source ) .addSink( "assembly", sink ) .addTail( assembly );
  • 86. Cascading tips Pipe assembly = new Pipe( "assembly" ); assembly = new Each( assembly, DebugLevel.VERBOSE, new Debug() ); // ... // head and tail have same name FlowDef flowDef = new FlowDef() .setName( "debug" ) .addSource( "assembly", source ) .addSink( "assembly", sink ) .addTail( assembly ); flowDef.setDebugLevel( DebugLevel.NONE );
  • 87. Cascading tips Pipe assembly = new Pipe( "assembly" ); assembly = new Each( assembly, DebugLevel.VERBOSE, new Debug() ); // ... // head and tail have same name FlowDef flowDef = new FlowDef() .setName( "debug" ) .addSource( "assembly", source ) .addSink( "assembly", sink ) .addTail( assembly ); flowDef.setDebugLevel( DebugLevel.NONE ); flowConnector will NOT create the Debug pipe!
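A related convenience on the Scalding side: RichPipe exposes a debug helper, so the tuple-printing trick above does not require wiring a Debug pipe by hand (a sketch; somePipe stands for any Scalding pipe):

somePipe.debug  // inserts a Cascading Debug step that prints tuples as they flow; drop it before production runs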
  • 88. Scalding = + Twitter Scalding github.com/twitter/scalding
  • 90. map
  • 91. map Scala: val data = 1 :: 2 :: 3 :: Nil
  • 92. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 }
  • 93. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int
  • 94. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int
  • 95. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data)
  • 96. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data) .map('number -> 'doubled) { n: Int => n * 2 }
  • 97. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data) .map('number -> 'doubled) { n: Int => n * 2 } // Int => Int
  • 98. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data) .map('number -> 'doubled) { n: Int => n * 2 } available in Pipe // Int => Int
  • 99. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data) .map('number -> 'doubled) { n: Int => n * 2 } stays in Pipe available in Pipe // Int => Int
  • 100. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data) .map('number -> 'doubled) { n: Int => n * 2 } must choose type! // Int => Int
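The map example above, expanded into a complete job you could run locally (a sketch: the field names, the Tsv sink and the output argument are assumptions):

import com.twitter.scalding._

class DoubleJob(args: Args) extends Job(args) {
  val data = List(1, 2, 3)

  IterableSource(data, 'number)
    .map('number -> 'doubled) { n: Int => n * 2 } // 'number stays in the pipe, 'doubled is added
    .write(Tsv(args("output")))
}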
  • 101. mapTo
  • 102. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil
  • 103. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 }
  • 104. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null
  • 105. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int
  • 106. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference
  • 107. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference
• 108. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference Scalding: IterableSource(data, 'number)
• 109. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference Scalding: IterableSource(data, 'number) .mapTo('number -> 'doubled) { n: Int => n * 2 }
• 110. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference Scalding: IterableSource(data, 'number) .mapTo('number -> 'doubled) { n: Int => n * 2 } // Int => Int
• 111. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference Scalding: IterableSource(data, 'number) .mapTo('number -> 'doubled) { n: Int => n * 2 } doubled stays in Pipe // Int => Int
• 112. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference Scalding: IterableSource(data, 'number) .mapTo('number -> 'doubled) { n: Int => n * 2 } number is removed doubled stays in Pipe // Int => Int
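The mapTo variant as a complete sketch (job name and output path assumed): because mapTo replaces the input fields with the output fields, only 'doubled reaches the sink.

  import com.twitter.scalding._

  // Sketch: mapTo drops 'number and keeps only what the function produces.
  class DoubleAndDropJob(args: Args) extends Job(args) {
    IterableSource(1 :: 2 :: 3 :: Nil, 'number)
      .mapTo('number -> 'doubled) { n: Int => n * 2 }  // 'number is removed from the pipe
      .write(Tsv(args("output")))                      // sink sees only 'doubled
  }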
  • 114. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String]
  • 115. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String
  • 116. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String]
  • 117. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int]
  • 118. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int]
  • 119. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3))
  • 120. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3))
  • 121. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String]
• 122. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { line: String => line.split(",") } // like List[String]
• 123. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { line: String => line.split(",") } // like List[String] .map('word -> 'number) { word: String => word.toInt } // like List[Int]
• 124. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { line: String => line.split(",") } // like List[String] .map('word -> 'number) { word: String => word.toInt } // like List[Int] MR map outside
  • 126. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String]
  • 127. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String
  • 128. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int]
  • 129. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] }
  • 130. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int]
  • 131. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3))
  • 132. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3))
  • 133. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String]
• 134. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { line: String => line.split(",").map(_.toInt) }
• 135. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { line: String => line.split(",").map(_.toInt) } // like List[Int]
• 136. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { line: String => line.split(",").map(_.toInt) } // like List[Int] map inside Scala
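The two flatMap variants differ only in where the Int conversion happens: as an extra map operation on the pipe, or inside the Scala function passed to flatMap. A sketch showing both side by side (job name, field names and sink paths are assumptions; the input is a text file of comma-separated numbers):

  import com.twitter.scalding._

  class SplitNumbersJob(args: Args) extends Job(args) {
    // Variant 1: split in flatMap, convert in a separate pipe-level map.
    TextLine(args("input"))
      .flatMap('line -> 'word) { line: String => line.split(",") }
      .map('word -> 'number) { word: String => word.toInt }
      .write(Tsv(args("outputA")))

    // Variant 2: split and convert inside the closure; a single flatMap in the flow.
    TextLine(args("input"))
      .flatMap('line -> 'number) { line: String => line.split(",").map(_.toInt) }
      .write(Tsv(args("outputB")))
  }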
  • 138. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int]
  • 139. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 }
• 140. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]]
• 141. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2))
• 142. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42))
• 143. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42))
• 144. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42)) Scalding: IterableSource(List(1, 2, 30, 42), 'num)
• 145. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42)) Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 }
• 146. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42)) Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 } .groupBy('lessThanTen) { _.size('size) }
• 147. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42)) Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 } .groupBy('lessThanTen) { _.size('size) } groups all with == value
• 148. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42)) Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 } .groupBy('lessThanTen) { _.size('size) } groups all with == value => 'size
  • 151. groupBy Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 }
• 152. groupBy Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 } .groupBy('lessThanTen) { _.sum('num -> 'total) }
• 153. groupBy Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 } .groupBy('lessThanTen) { _.sum('num -> 'total) } 'total = [3, 72]
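Both aggregations combined into one complete sketch (job name and output path assumed; on newer Scalding versions sum takes a type parameter, e.g. sum[Long]('num -> 'total)):

  import com.twitter.scalding._

  // Sketch: group by a derived boolean key, then aggregate within each group.
  class GroupStatsJob(args: Args) extends Job(args) {
    IterableSource(List(1, 2, 30, 42), 'num)
      .map('num -> 'lessThanTen) { i: Int => i < 10 }
      .groupBy('lessThanTen) { _.size('size).sum('num -> 'total) }
      .write(Tsv(args("output")))
    // expected rows: (true, 2, 3) and (false, 2, 72)
  }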
  • 155. Scalding API project / discard
  • 156. Scalding API project / discard map / mapTo
  • 157. Scalding API project / discard map / mapTo flatMap / flatMapTo
  • 158. Scalding API project / discard map / mapTo flatMap / flatMapTo rename
  • 159. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter
  • 160. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique
  • 161. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique groupBy / groupAll / groupRandom / shuffle
  • 162. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique groupBy / groupAll / groupRandom / shuffle limit
  • 163. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique groupBy / groupAll / groupRandom / shuffle limit debug
  • 164. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique groupBy / groupAll / groupRandom / shuffle limit debug Group operations
  • 165. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique groupBy / groupAll / groupRandom / shuffle limit debug Group operations joins
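The joins at the end of that list deserve a quick illustration: joinWithSmaller joins the current pipe with a second, smaller pipe on a key. A sketch, where the pipe names, fields and paths are all made up for illustration:

  import com.twitter.scalding._

  // Sketch: a fields-based join of two Tsv sources on a user id.
  class JoinSketchJob(args: Args) extends Job(args) {
    val people = Tsv(args("people"), ('userId, 'name)).read    // assumed to be the smaller side
    val visits = Tsv(args("visits"), ('visitorId, 'url)).read

    visits
      .joinWithSmaller('visitorId -> 'userId, people)  // distinct key names keep the joined fields unambiguous
      .write(Tsv(args("output")))                      // joined tuples carry the fields of both sides
  }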
  • 166. Distributed Copy in Scalding class WordCountJob(args: Args) extends Job(args) {
  • 167. Distributed Copy in Scalding class WordCountJob(args: Args) extends Job(args) { val input = Tsv(args("input")) val output = Tsv(args("output"))
  • 168. Distributed Copy in Scalding class WordCountJob(args: Args) extends Job(args) { val input = Tsv(args("input")) val output = Tsv(args("output")) input.read.write(output) }
  • 169. Distributed Copy in Scalding class WordCountJob(args: Args) extends Job(args) { val input = Tsv(args("input")) val output = Tsv(args("output")) input.read.write(output) } The End.
• 170. Main Class - "Runner" import org.apache.hadoop.conf.Configuration import org.apache.hadoop.util.ToolRunner import com.twitter.scalding object ScaldingJobRunner extends App { ToolRunner.run(new Configuration, new scalding.Tool, args) }
• 171. Main Class - "Runner" import org.apache.hadoop.conf.Configuration import org.apache.hadoop.util.ToolRunner import com.twitter.scalding object ScaldingJobRunner extends App { from App ToolRunner.run(new Configuration, new scalding.Tool, args) }
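Once packaged (e.g. as an assembly jar), the runner is launched like any other Hadoop tool, passing the Job class and its arguments through; a sketch of the invocation, with the jar name and paths made up for illustration:

  hadoop jar my-scalding-assembly.jar ScaldingJobRunner \
    pl.project13.scala.oculus.job.WordCountJob --hdfs \
    --input /data/lines.txt --output /data/word-counts

scalding's Tool expects the Job class name as the first argument plus a mode flag (--hdfs or --local); the remaining --key value pairs end up in Args.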
  • 172. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { }
  • 173. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") }
  • 174. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) }
  • 175. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } def tokenize(text: String): Array[String] = implemented }
  • 176. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { group => group.size('count) } def tokenize(text: String): Array[String] = implemented }
  • 177. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { group => group.size } def tokenize(text: String): Array[String] = implemented }
  • 178. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size } def tokenize(text: String): Array[String] = implemented }
  • 179. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size } .write(Tsv(outputFile)) def tokenize(text: String): Array[String] = implemented }
• 180. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") 4 lines { TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size } .write(Tsv(outputFile)) def tokenize(text: String): Array[String] = implemented }
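tokenize is left as "implemented" on the slides; one plausible implementation (an assumption, not necessarily the author's version) lowercases, strips punctuation and splits on whitespace:

  // Hypothetical tokenize, a common choice for word count examples.
  def tokenize(text: String): Array[String] =
    text.toLowerCase
      .replaceAll("[^a-z0-9\\s]", "")  // drop punctuation
      .split("\\s+")                   // split on runs of whitespace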
  • 181. Word Count in Scalding
  • 182. Word Count in Scalding run pl.project13.scala.oculus.job.WordCountJob --tool.graph
  • 183. Word Count in Scalding run pl.project13.scala.oculus.job.WordCountJob --tool.graph => pl.project13.scala.oculus.job.WordCountJob0.dot
• 184. Word Count in Scalding run pl.project13.scala.oculus.job.WordCountJob --tool.graph => pl.project13.scala.oculus.job.WordCountJob0.dot (MAP phase)
• 185. Word Count in Scalding run pl.project13.scala.oculus.job.WordCountJob --tool.graph => pl.project13.scala.oculus.job.WordCountJob0.dot (MAP + REDUCE phases)
  • 186. Word Count in Scalding TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size('count) } .write(Tsv(outputFile))
  • 187. Word Count in Scalding TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size('count) } .write(Tsv(outputFile))
  • 188. Word Count in Scalding TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size('count) } .write(Tsv(outputFile))
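Before going anywhere near a cluster, a job like this can be exercised in-memory with Scalding's JobTest. A sketch, where the sample line and expected counts are made up and the asserts stand in for whatever test framework is in use:

  import com.twitter.scalding._

  // Sketch: feed WordCountJob a fake TextLine source and inspect the Tsv sink.
  JobTest("pl.project13.scala.oculus.job.WordCountJob")
    .arg("input", "in.txt")
    .arg("output", "out.tsv")
    .source(TextLine("in.txt"), List((0, "hello hello scalding")))
    .sink[(String, Int)](Tsv("out.tsv")) { buffer =>
      assert(buffer.toMap == Map("hello" -> 2, "scalding" -> 1))
    }
    .run
    .finish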
  • 191. Why Scalding? Hadoop inside Cascading abstractions
  • 192. Why Scalding? Hadoop inside Cascading abstractions Scala conciseness
  • 193. Ask Stuff! Dzięki! Thanks! ありがとう! Konrad Malawski @ java.pl t: ktosopl / g: ktoso b: blog.project13.pl