Introduction to Cascading

What is Cascading?


• abstraction over MapReduce
• API for data-processing workflows

Why use Cascading?

• MapReduce can be:
  • the wrong level of granularity
  • cumbersome to chain together

Why use Cascading?

• Cascading helps create:
  • higher-level data-processing abstractions
  • sophisticated data pipelines
  • reusable components

Credits

• Cascading was written by Chris Wensel
• this material is based on his user's guide: http://bit.ly/cascading

package cascadingtutorial.wordcount;

import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.operation.aggregator.Count;
import cascading.operation.regex.RegexSplitGenerator;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.scheme.Scheme;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.Lfs;
import cascading.tap.Tap;
import cascading.tuple.Fields;

import java.util.Properties;

/**
 * Wordcount example in Cascading
 */
public class Main
  {
  public static void main( String[] args )
    {
    String inputPath = args[0];
    String outputPath = args[1];

    Scheme inputScheme = new TextLine(new Fields("offset", "line"));
    Scheme outputScheme = new TextLine();

    // pick an Hfs tap for URI-style paths (e.g. hdfs://...), Lfs for local paths
    Tap sourceTap = inputPath.matches("^[^:]+://.*") ?
      new Hfs(inputScheme, inputPath) :
      new Lfs(inputScheme, inputPath);
    Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
      new Hfs(outputScheme, outputPath) :
      new Lfs(outputScheme, outputPath);

    // split each line into words ("\\s+" splits on whitespace)
    Pipe wcPipe = new Each("wordcount",
        new Fields("line"),
        new RegexSplitGenerator(new Fields("word"), "\\s+"),
        new Fields("word"));

    wcPipe = new GroupBy(wcPipe, new Fields("word"));
    wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));

    Properties properties = new Properties();
    FlowConnector.setApplicationJarClass(properties, Main.class);

    Flow parsedLogFlow = new FlowConnector(properties)
      .connect(sourceTap, sinkTap, wcPipe);
    parsedLogFlow.start();
    parsedLogFlow.complete();
    }
  }

Data Model

[diagram: a pipes-and-filters model — Tuples read from source files flow through Filters connected by Pipes and are written out to files]

Tuple / Pipe Assembly / Tuple Stream

[diagram: each record is a Tuple; a chain of connected Pipes forms a Pipe Assembly, which operates on a Tuple Stream]
Taps: sources & sinks
Taps: sources & sinks

        Taps
Taps: sources & sinks

        Taps
Taps: sources & sinks

        Taps
Taps: sources & sinks

         Taps




Source
Taps: sources & sinks

           Taps




Source   Sink

Pipe + Taps = Flow

n Flows = Cascade

[diagram: several interdependent Flows combine into a Cascade, modeling arbitrarily complex processing]
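
The deck shows no code for wiring Flows into a Cascade; here is a minimal sketch, assuming the Cascading 1.x cascading.cascade API (the two Flow variables are illustrative):

import cascading.cascade.Cascade;
import cascading.cascade.CascadeConnector;

// importFlow and reportFlow are Flows built with FlowConnector.connect(...)
// as in the wordcount example; CascadeConnector infers their execution
// order from shared source/sink Taps.
Cascade cascade = new CascadeConnector().connect(importFlow, reportFlow);
cascade.complete(); // runs every Flow, blocking until the Cascade finishes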
Example

(walking through the wordcount listing from the beginning of the deck)

Schemes

A Scheme describes how a Tap reads and writes data:

   Scheme inputScheme = new TextLine(new Fields("offset", "line"));
   Scheme outputScheme = new TextLine();

TextLine() reads and writes lines of raw text; SequenceFile() reads and writes Hadoop SequenceFiles.

Taps

A Tap binds a Scheme to a location; the example picks the Tap class based on the path:

   Tap sourceTap = inputPath.matches("^[^:]+://.*") ?
     new Hfs(inputScheme, inputPath) :
     new Lfs(inputScheme, inputPath);
   Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
     new Hfs(outputScheme, outputPath) :
     new Lfs(outputScheme, outputPath);

   new Hfs()   Hadoop filesystem, e.g. hdfs://master0:54310/user/nmurray/data.txt
   new Lfs()   local filesystem, e.g. data/sources/obama-inaugural-address.txt
   S3fs()      Amazon S3
   GlobHfs()   Hfs with path globbing

Pipe Assembly

The rest of main() is the Pipe Assembly that does the actual work:

   Pipe wcPipe = new Each("wordcount",
       new Fields("line"),
       new RegexSplitGenerator(new Fields("word"), "\\s+"),
       new Fields("word"));

   wcPipe = new GroupBy(wcPipe, new Fields("word"));
   wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));

Pipe Assemblies

• Define work against a Tuple Stream
• May have multiple sources and sinks:
  • Splits
  • Merges
  • Joins (see the sketch below)
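
A minimal sketch of a split and a join, assuming the Cascading 1.x CoGroup / InnerJoin API; the pipe and field names here are illustrative:

import cascading.operation.regex.RegexFilter;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.pipe.cogroup.InnerJoin;
import cascading.tuple.Fields;

// split: two pipes branching from the same upstream pipe
Pipe logs   = new Pipe("logs");
Pipe errors = new Each(new Pipe("errors", logs),
    new Fields("line"), new RegexFilter("ERROR")); // keep ERROR lines
Pipe warns  = new Each(new Pipe("warns", logs),
    new Fields("line"), new RegexFilter("WARN"));  // keep WARN lines

// join: CoGroup pairs tuples from two pipes on a common key;
// assume users has fields (user_id, name) and visits has (user_id, url),
// so four output fields must be declared with unique names
Pipe users  = new Pipe("users");
Pipe visits = new Pipe("visits");
Pipe joined = new CoGroup(users, new Fields("user_id"),
                          visits, new Fields("user_id"),
                          new Fields("user_id", "name", "user_id2", "url"),
                          new InnerJoin());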

Pipe Assemblies

• Pipe
  • Each — applies a Function or Filter Operation to each Tuple
  • GroupBy — groups (& merges)
  • CoGroup — joins (inner, outer, left, right)
  • Every — applies an Aggregator (count, sum) to every group of Tuples
  • SubAssembly — nesting
Each vs. Every

• Each() is for individual Tuples
• Every() is for groups of Tuples

Each

new Each(previousPipe,
         argumentSelector,
         operation,
         outputSelector)

Pipe wcPipe =
new Each("wordcount",
    new Fields("line"),
    new RegexSplitGenerator(new Fields("word"), "\\s+"),
    new Fields("word"));

Each

new Each(previousPipe,
         argumentSelector,   // a Fields() instance
         operation,
         outputSelector)     // a Fields() instance

Each

Given an input tuple:

   offset                 line
   0                      the lazy brown fox

Pipe wcPipe =
new Each("wordcount",
    new Fields("line"),
    new RegexSplitGenerator(new Fields("word"), "\\s+"),
    new Fields("word"));

the argument selector Fields("line") drops "offset" and hands only "line" to the Operation; RegexSplitGenerator then emits one Tuple per word:

   word    word    word     word
   the     lazy    brown    fox

Operations: Functions

• Insert()
• RegexParser() / RegexGenerator() / RegexReplace()
• DateParser() / DateFormatter()
• XPathGenerator()
• Identity()
• etc.
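
For instance, a hedged sketch applying Insert() inside an Each, assuming the Cascading 1.x constructor Insert(Fields fieldDeclaration, Object... values); the pipe and field names are illustrative:

import cascading.operation.Insert;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.tuple.Fields;

Pipe pipe = new Pipe("events");
// append a constant "source" field to every tuple, keeping all existing
// fields (Fields.ALL output selector)
pipe = new Each(pipe, new Insert(new Fields("source"), "weblog"), Fields.ALL);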

Operations: Filters

• RegexFilter()
• FilterNull()
• And() / Or() / Not() / Xor()
• ExpressionFilter()
• etc.
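
A hedged sketch of ExpressionFilter, assuming the Cascading 1.x constructor ExpressionFilter(String expression, Class parameterType); tuples for which the expression evaluates true are removed:

import cascading.operation.expression.ExpressionFilter;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.tuple.Fields;

// drop tuples whose "count" is below 10; the expression sees the
// argument fields as typed variables
Pipe pipe = new Pipe("counts");
pipe = new Each(pipe,
    new Fields("count"),
    new ExpressionFilter("count < 10", Integer.TYPE));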

Back in the wordcount assembly, the Each pipe feeds a GroupBy, and an Every then aggregates each group:

        Pipe wcPipe =
        new Each("wordcount",
            new Fields("line"),
            new RegexSplitGenerator(new Fields("word"), "\\s+"),
            new Fields("word"));

        wcPipe = new GroupBy(wcPipe, new Fields("word"));
        wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));

Every

new Every(previousPipe,
          argumentSelector,
          operation,
          outputSelector)

new Every(wcPipe, new Count(), new Fields("count", "word"));

Operations: Aggregator

• Average()
• Count()
• First() / Last()
• Min() / Max()
• Sum()
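
A hedged sketch using Sum() in an Every, assuming the Cascading 1.x cascading.operation.aggregator.Sum; the field names are illustrative:

import cascading.operation.aggregator.Sum;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.tuple.Fields;

// total per region: group on "region", then sum the "amount" field
Pipe pipe = new Pipe("sales");
pipe = new GroupBy(pipe, new Fields("region"));
pipe = new Every(pipe,
    new Fields("amount"),
    new Sum(new Fields("total")),
    new Fields("region", "total"));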

Every

new Every(previousPipe,
          argumentSelector,
          operation,
          outputSelector)

new Every(wcPipe, new Count(), new Fields("count", "word"));

// the same, with the default argumentSelector written out:
new Every(wcPipe, Fields.ALL, new Count(), new Fields("count", "word"));
Field Selection

Predefined Field Sets

   Fields.ALL       all available fields
   Fields.GROUP     fields used for the last grouping
   Fields.VALUES    fields not used for the last grouping
   Fields.ARGS      fields of the argument Tuple (for Operations)
   Fields.RESULTS   replaces input with the Operation result (for Pipes)
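
A hedged sketch of two of these selectors in use with DateParser (whose default output field name is, as I recall of Cascading 1.x, "ts"):

import cascading.operation.text.DateParser;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.tuple.Fields;

Pipe pipe = new Pipe("log");

// Fields.ALL: output keeps every original field plus the parsed "ts"
pipe = new Each(pipe,
    new Fields("timestamp"),
    new DateParser("yyyy-MM-dd HH:mm:ss"),
    Fields.ALL);

// Fields.RESULTS: the Operation's result fields replace the input tuple
// pipe = new Each(pipe, new Fields("timestamp"),
//     new DateParser("yyyy-MM-dd HH:mm:ss"), Fields.RESULTS);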

Field Selection

new Each(previousPipe,
         argumentSelector,
         operation,
         outputSelector)

Argument selection:

   Operation Input Tuple = Original Tuple → Argument Selector

Given an original tuple

   id   visitor_id   search_term   timestamp   page_number

and the call

   pipe = new Each(pipe,
              new Fields("timestamp"),
              new DateParser("yyyy-MM-dd HH:mm:ss"),
              new Fields("ts", "search_term", "visitor_id"));

the argument selector Fields("timestamp") means the input to DateParser will be just:

   timestamp

Output selection:

   Output Tuple = (Original Tuple ⊕ Operation Tuple) → Output Selector

The original tuple is appended (⊕) with the DateParser output (ts), and the output selector then keeps only the named fields, so the output of the Each will be:

   ts   search_term   visitor_id

Given a grouping of word tuples:

   word: hello
   word: hello
   word: world

new Every(wcPipe, new Count(), new Fields("count", "word"));

produces one tuple per group:

   count   word
   2       hello
   1       world

Finally, the FlowConnector wires the source Tap, sink Tap, and Pipe Assembly into a Flow and runs it:

        Properties properties = new Properties();
        FlowConnector.setApplicationJarClass(properties, Main.class);

        Flow parsedLogFlow = new FlowConnector(properties)
          .connect(sourceTap, sinkTap, wcPipe);
        parsedLogFlow.start();
        parsedLogFlow.complete();
Run

input:
   mary had a little lamb
   little lamb
   little lamb
   mary had a little lamb
   whose fleece was white as snow

command:
   hadoop jar ./target/cascading-tutorial-1.0.0.jar \
     data/sources/misc/mary.txt data/output
output:
 2    a
 1    as
 1    fleece
 2    had
 4    lamb
 4    little
 2    mary
 1    snow
 1    was
 1    white
 1    whose

What's next?

Apply operations to tuple streams. Repeat.

Cookbook

// remove all tuples where search_term is '-'
pipe = new Each(pipe,
           new Fields("search_term"),
           new RegexFilter("^(-)$", true));

// convert "timestamp" String field to "ts" long field
pipe = new Each(pipe,
           new Fields("timestamp"),
           new DateParser("yyyy-MM-dd HH:mm:ss"),
           Fields.ALL);

// td = timestamp truncated to day-level granularity
pipe = new Each(pipe,
           new Fields("ts"),
           new ExpressionFunction(
               new Fields("td"),
               "ts - (ts % (24 * 60 * 60 * 1000))",
               long.class),
           Fields.ALL);

// SELECT DISTINCT visitor_id:
// GROUP BY visitor_id ORDER BY ts
pipe = new GroupBy(pipe,
           new Fields("visitor_id"),
           new Fields("ts"));

// then take the First() tuple of every grouping
pipe = new Every(pipe,
           Fields.ALL,
           new First(),
           Fields.RESULTS);
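
One more recipe in the same spirit (my addition, not from the original deck), assuming the same Cascading 1.x operations used above:

// COUNT(*) ... GROUP BY visitor_id
pipe = new GroupBy(pipe, new Fields("visitor_id"));
pipe = new Every(pipe,
           new Count(new Fields("events")),
           new Fields("visitor_id", "events"));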
Custom Operations

Desired Operation

Given an input tuple:

   line: mary had a little lamb

Pipe pipe = new Each("ngram pipe",
    new Fields("line"),
    new NGramTokenizer(2),
    new Fields("ngram"));

should emit one Tuple per 2-gram:

   ngram: mary had
   ngram: had a
   ngram: a little
   ngram: little lamb
Custom Operations

• extend BaseOperation
• implement Function (or Filter)
• implement operate

public class NGramTokenizer extends BaseOperation implements Function
{
  private int size;

  public NGramTokenizer(int size)
  {
    // declare the single output field this Function produces
    super(new Fields("ngram"));
    this.size = size;
  }

  public void operate(FlowProcess flowProcess, FunctionCall functionCall)
  {
    String token;
    TupleEntry arguments = functionCall.getArguments();
    // wrap the raw tokenizer (an external helper class) around the first
    // argument field's value
    com.attinteractive.util.NGramTokenizer t =
      new com.attinteractive.util.NGramTokenizer(arguments.getString(0), this.size);

    // emit one output Tuple per n-gram
    while((token = t.next()) != null)
    {
      TupleEntry result = new TupleEntry(new Fields("ngram"), new Tuple(token));
      functionCall.getOutputCollector().add(result);
    }
  }
}
Tuple

  [ 123 | vis-abc | tacos | 2009-10-08 03:21:23 | 1 ]

  zero-indexed, ordered values
Fields

  Tuple    [ 123 | vis-abc    | tacos       | 2009-10-08 03:21:23 | 1           ]
  Fields   [ id  | visitor_id | search_term | timestamp           | page_number ]

  a list of strings representing field names
TupleEntry

  Fields   [ id  | visitor_id | search_term | timestamp           | page_number ]
  Tuple    [ 123 | vis-abc    | tacos       | 2009-10-08 03:21:23 | 1           ]
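Put together in code, the model above looks roughly like the sketch below (field and value literals are taken from the slides; getter names may vary slightly between Cascading versions):

import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;

public class TupleEntryDemo
{
  public static void main( String[] args )
  {
    // Fields: the list of field names
    Fields fields = new Fields("id", "visitor_id", "search_term", "timestamp", "page_number");
    // Tuple: the zero-indexed, ordered values
    Tuple values = new Tuple(123, "vis-abc", "tacos", "2009-10-08 03:21:23", 1);

    // a TupleEntry pairs the Fields with the Tuple
    TupleEntry entry = new TupleEntry(fields, values);

    // values can be read by position or by field name
    System.out.println(entry.getTuple().getString(2));  // tacos
    System.out.println(entry.getString("search_term")); // tacos
  }
}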
Success

  line
  mary had a little lamb

  Pipe pipe = new Each("ngram pipe",
      new Fields("line"),
      new NGramTokenizer(2),
      new Fields("ngram"));

  ngram      ngram    ngram       ngram
  mary had   had a    a little    little lamb
code examples
dot
Flow importFlow = flowConnector.connect(sourceTap, tmpIntermediateTap, importPipe);
importFlow.writeDOT( "import-flow.dot" );
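The .dot file describes the flow as a graph; with Graphviz installed it can be rendered to an image (for example, dot -Tpng import-flow.dot -o import-flow.png), which is handy for seeing how Cascading has wired a Flow together.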
What next?


• http://www.cascading.org/
What next?


• http://www.cascading.org/
• http://bit.ly/cascading
Questions?
 <nmurray@att.com>
Intro To Cascading

More Related Content

What's hot

Hadoop MapReduce Fundamentals
Hadoop MapReduce FundamentalsHadoop MapReduce Fundamentals
Hadoop MapReduce FundamentalsLynn Langit
 
PyCascading for Intuitive Flow Processing with Hadoop (gabor szabo)
PyCascading for Intuitive Flow Processing with Hadoop (gabor szabo)PyCascading for Intuitive Flow Processing with Hadoop (gabor szabo)
PyCascading for Intuitive Flow Processing with Hadoop (gabor szabo)PyData
 
Apache Drill @ PJUG, Jan 15, 2013
Apache Drill @ PJUG, Jan 15, 2013Apache Drill @ PJUG, Jan 15, 2013
Apache Drill @ PJUG, Jan 15, 2013Gera Shegalov
 
Map reduce paradigm explained
Map reduce paradigm explainedMap reduce paradigm explained
Map reduce paradigm explainedDmytro Sandu
 
Hadoop and HBase experiences in perf log project
Hadoop and HBase experiences in perf log projectHadoop and HBase experiences in perf log project
Hadoop and HBase experiences in perf log projectMao Geng
 
Spark ETL Techniques - Creating An Optimal Fantasy Baseball Roster
Spark ETL Techniques - Creating An Optimal Fantasy Baseball RosterSpark ETL Techniques - Creating An Optimal Fantasy Baseball Roster
Spark ETL Techniques - Creating An Optimal Fantasy Baseball RosterDon Drake
 
Practical Hadoop using Pig
Practical Hadoop using PigPractical Hadoop using Pig
Practical Hadoop using PigDavid Wellman
 
Introduction to Apache Hive | Big Data Hadoop Spark Tutorial | CloudxLab
Introduction to Apache Hive | Big Data Hadoop Spark Tutorial | CloudxLabIntroduction to Apache Hive | Big Data Hadoop Spark Tutorial | CloudxLab
Introduction to Apache Hive | Big Data Hadoop Spark Tutorial | CloudxLabCloudxLab
 
Data profiling in Apache Calcite
Data profiling in Apache CalciteData profiling in Apache Calcite
Data profiling in Apache CalciteDataWorks Summit
 
Apache Spark - Loading & Saving data | Big Data Hadoop Spark Tutorial | Cloud...
Apache Spark - Loading & Saving data | Big Data Hadoop Spark Tutorial | Cloud...Apache Spark - Loading & Saving data | Big Data Hadoop Spark Tutorial | Cloud...
Apache Spark - Loading & Saving data | Big Data Hadoop Spark Tutorial | Cloud...CloudxLab
 
Apache cassandra in 2016
Apache cassandra in 2016Apache cassandra in 2016
Apache cassandra in 2016Duyhai Doan
 
Hadoop/MapReduce/HDFS
Hadoop/MapReduce/HDFSHadoop/MapReduce/HDFS
Hadoop/MapReduce/HDFSpraveen bhat
 
Large scale, interactive ad-hoc queries over different datastores with Apache...
Large scale, interactive ad-hoc queries over different datastores with Apache...Large scale, interactive ad-hoc queries over different datastores with Apache...
Large scale, interactive ad-hoc queries over different datastores with Apache...jaxLondonConference
 
Modeling with Hadoop kdd2011
Modeling with Hadoop kdd2011Modeling with Hadoop kdd2011
Modeling with Hadoop kdd2011Milind Bhandarkar
 
Hadoop trainingin bangalore
Hadoop trainingin bangaloreHadoop trainingin bangalore
Hadoop trainingin bangaloreappaji intelhunt
 

What's hot (20)

Apache PIG
Apache PIGApache PIG
Apache PIG
 
Hadoop MapReduce Fundamentals
Hadoop MapReduce FundamentalsHadoop MapReduce Fundamentals
Hadoop MapReduce Fundamentals
 
Map reduce prashant
Map reduce prashantMap reduce prashant
Map reduce prashant
 
PyCascading for Intuitive Flow Processing with Hadoop (gabor szabo)
PyCascading for Intuitive Flow Processing with Hadoop (gabor szabo)PyCascading for Intuitive Flow Processing with Hadoop (gabor szabo)
PyCascading for Intuitive Flow Processing with Hadoop (gabor szabo)
 
Apache Drill @ PJUG, Jan 15, 2013
Apache Drill @ PJUG, Jan 15, 2013Apache Drill @ PJUG, Jan 15, 2013
Apache Drill @ PJUG, Jan 15, 2013
 
Map reduce paradigm explained
Map reduce paradigm explainedMap reduce paradigm explained
Map reduce paradigm explained
 
Hadoop and HBase experiences in perf log project
Hadoop and HBase experiences in perf log projectHadoop and HBase experiences in perf log project
Hadoop and HBase experiences in perf log project
 
Spark ETL Techniques - Creating An Optimal Fantasy Baseball Roster
Spark ETL Techniques - Creating An Optimal Fantasy Baseball RosterSpark ETL Techniques - Creating An Optimal Fantasy Baseball Roster
Spark ETL Techniques - Creating An Optimal Fantasy Baseball Roster
 
Practical Hadoop using Pig
Practical Hadoop using PigPractical Hadoop using Pig
Practical Hadoop using Pig
 
Cascalog internal dsl_preso
Cascalog internal dsl_presoCascalog internal dsl_preso
Cascalog internal dsl_preso
 
Introduction to Apache Hive | Big Data Hadoop Spark Tutorial | CloudxLab
Introduction to Apache Hive | Big Data Hadoop Spark Tutorial | CloudxLabIntroduction to Apache Hive | Big Data Hadoop Spark Tutorial | CloudxLab
Introduction to Apache Hive | Big Data Hadoop Spark Tutorial | CloudxLab
 
Lecture 2 part 3
Lecture 2 part 3Lecture 2 part 3
Lecture 2 part 3
 
Data profiling in Apache Calcite
Data profiling in Apache CalciteData profiling in Apache Calcite
Data profiling in Apache Calcite
 
Apache Spark - Loading & Saving data | Big Data Hadoop Spark Tutorial | Cloud...
Apache Spark - Loading & Saving data | Big Data Hadoop Spark Tutorial | Cloud...Apache Spark - Loading & Saving data | Big Data Hadoop Spark Tutorial | Cloud...
Apache Spark - Loading & Saving data | Big Data Hadoop Spark Tutorial | Cloud...
 
Apache cassandra in 2016
Apache cassandra in 2016Apache cassandra in 2016
Apache cassandra in 2016
 
Hadoop/MapReduce/HDFS
Hadoop/MapReduce/HDFSHadoop/MapReduce/HDFS
Hadoop/MapReduce/HDFS
 
Large scale, interactive ad-hoc queries over different datastores with Apache...
Large scale, interactive ad-hoc queries over different datastores with Apache...Large scale, interactive ad-hoc queries over different datastores with Apache...
Large scale, interactive ad-hoc queries over different datastores with Apache...
 
Modeling with Hadoop kdd2011
Modeling with Hadoop kdd2011Modeling with Hadoop kdd2011
Modeling with Hadoop kdd2011
 
Spark and shark
Spark and sharkSpark and shark
Spark and shark
 
Hadoop trainingin bangalore
Hadoop trainingin bangaloreHadoop trainingin bangalore
Hadoop trainingin bangalore
 

Viewers also liked

Cascading - A Java Developer’s Companion to the Hadoop World
Cascading - A Java Developer’s Companion to the Hadoop WorldCascading - A Java Developer’s Companion to the Hadoop World
Cascading - A Java Developer’s Companion to the Hadoop WorldCascading
 
Cascading User Group Meet
Cascading User Group MeetCascading User Group Meet
Cascading User Group MeetVinoth Kannan
 
Example team objective cascade
Example team objective cascadeExample team objective cascade
Example team objective cascadeEric Tachibana
 
Building Complex Data Workflows with Cascading on Hadoop
Building Complex Data Workflows with Cascading on HadoopBuilding Complex Data Workflows with Cascading on Hadoop
Building Complex Data Workflows with Cascading on HadoopGagan Agrawal
 
Data flow in the data center
Data flow in the data centerData flow in the data center
Data flow in the data centerAdam Cataldo
 
Deploying Hadoop-Based Bigdata Environments
Deploying Hadoop-Based Bigdata EnvironmentsDeploying Hadoop-Based Bigdata Environments
Deploying Hadoop-Based Bigdata EnvironmentsPuppet
 
Result Based KPIs
Result Based KPIsResult Based KPIs
Result Based KPIsHR Metrics
 
Balanced scorecard overview slides
Balanced scorecard overview slidesBalanced scorecard overview slides
Balanced scorecard overview slidesTools4management.com
 
How to grow your business with ease
How to grow your business with easeHow to grow your business with ease
How to grow your business with easeMartin Johnson
 
Overview of Cascading 3.0 on Apache Flink
Overview of Cascading 3.0 on Apache Flink Overview of Cascading 3.0 on Apache Flink
Overview of Cascading 3.0 on Apache Flink Cascading
 
Linking The Organizational Vision To The Individual By Creating A Strategic L...
Linking The Organizational Vision To The Individual By Creating A Strategic L...Linking The Organizational Vision To The Individual By Creating A Strategic L...
Linking The Organizational Vision To The Individual By Creating A Strategic L...Claudia Rubino
 
Bsc workshop
Bsc workshopBsc workshop
Bsc workshopparag11
 
Balance scorecard art of strategy execution
Balance scorecard  art of strategy executionBalance scorecard  art of strategy execution
Balance scorecard art of strategy executionDr Wilfred Monteiro
 
Scorecard 3
Scorecard 3Scorecard 3
Scorecard 3dmdk12
 

Viewers also liked (20)

Cascading - A Java Developer’s Companion to the Hadoop World
Cascading - A Java Developer’s Companion to the Hadoop WorldCascading - A Java Developer’s Companion to the Hadoop World
Cascading - A Java Developer’s Companion to the Hadoop World
 
Cascading User Group Meet
Cascading User Group MeetCascading User Group Meet
Cascading User Group Meet
 
Example team objective cascade
Example team objective cascadeExample team objective cascade
Example team objective cascade
 
Building Complex Data Workflows with Cascading on Hadoop
Building Complex Data Workflows with Cascading on HadoopBuilding Complex Data Workflows with Cascading on Hadoop
Building Complex Data Workflows with Cascading on Hadoop
 
Data flow in the data center
Data flow in the data centerData flow in the data center
Data flow in the data center
 
Deploying Hadoop-Based Bigdata Environments
Deploying Hadoop-Based Bigdata EnvironmentsDeploying Hadoop-Based Bigdata Environments
Deploying Hadoop-Based Bigdata Environments
 
Scalding for Hadoop
Scalding for HadoopScalding for Hadoop
Scalding for Hadoop
 
Result Based KPIs
Result Based KPIsResult Based KPIs
Result Based KPIs
 
Balanced scorecard overview slides
Balanced scorecard overview slidesBalanced scorecard overview slides
Balanced scorecard overview slides
 
How to grow your business with ease
How to grow your business with easeHow to grow your business with ease
How to grow your business with ease
 
Overview of Cascading 3.0 on Apache Flink
Overview of Cascading 3.0 on Apache Flink Overview of Cascading 3.0 on Apache Flink
Overview of Cascading 3.0 on Apache Flink
 
Linking The Organizational Vision To The Individual By Creating A Strategic L...
Linking The Organizational Vision To The Individual By Creating A Strategic L...Linking The Organizational Vision To The Individual By Creating A Strategic L...
Linking The Organizational Vision To The Individual By Creating A Strategic L...
 
Bsc workshop
Bsc workshopBsc workshop
Bsc workshop
 
Balance scorecard art of strategy execution
Balance scorecard  art of strategy executionBalance scorecard  art of strategy execution
Balance scorecard art of strategy execution
 
Countdown timer
Countdown timerCountdown timer
Countdown timer
 
Ten Steps to Enterprise Strategy Alignment
Ten Steps to Enterprise Strategy AlignmentTen Steps to Enterprise Strategy Alignment
Ten Steps to Enterprise Strategy Alignment
 
Ece221 Ch7 Part1
Ece221 Ch7 Part1Ece221 Ch7 Part1
Ece221 Ch7 Part1
 
Scorecard 3
Scorecard 3Scorecard 3
Scorecard 3
 
Effective Communication for Leaders
Effective Communication for LeadersEffective Communication for Leaders
Effective Communication for Leaders
 
digital Counter
digital Counterdigital Counter
digital Counter
 

Similar to Intro To Cascading

Cascading Through Hadoop for the Boulder JUG
Cascading Through Hadoop for the Boulder JUGCascading Through Hadoop for the Boulder JUG
Cascading Through Hadoop for the Boulder JUGMatthew McCullough
 
Data Processing with Cascading Java API on Apache Hadoop
Data Processing with Cascading Java API on Apache HadoopData Processing with Cascading Java API on Apache Hadoop
Data Processing with Cascading Java API on Apache HadoopHikmat Dhamee
 
Pattern - an open source project for migrating predictive models from SAS, et...
Pattern - an open source project for migrating predictive models from SAS, et...Pattern - an open source project for migrating predictive models from SAS, et...
Pattern - an open source project for migrating predictive models from SAS, et...DataWorks Summit
 
Hadoop Summit: Pattern – an open source project for migrating predictive mode...
Hadoop Summit: Pattern – an open source project for migrating predictive mode...Hadoop Summit: Pattern – an open source project for migrating predictive mode...
Hadoop Summit: Pattern – an open source project for migrating predictive mode...Paco Nathan
 
Introduction to Scalding and Monoids
Introduction to Scalding and MonoidsIntroduction to Scalding and Monoids
Introduction to Scalding and MonoidsHugo Gävert
 
The Cascading (big) data application framework
The Cascading (big) data application frameworkThe Cascading (big) data application framework
The Cascading (big) data application frameworkModern Data Stack France
 
The Cascading (big) data application framework - André Keple, Sr. Engineer, C...
The Cascading (big) data application framework - André Keple, Sr. Engineer, C...The Cascading (big) data application framework - André Keple, Sr. Engineer, C...
The Cascading (big) data application framework - André Keple, Sr. Engineer, C...Cascading
 
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash courseCodepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash courseSages
 
HBaseConEast2016: HBase and Spark, State of the Art
HBaseConEast2016: HBase and Spark, State of the ArtHBaseConEast2016: HBase and Spark, State of the Art
HBaseConEast2016: HBase and Spark, State of the ArtMichael Stack
 
Functional Programming in Java 8 - Lambdas and Streams
Functional Programming in Java 8 - Lambdas and StreamsFunctional Programming in Java 8 - Lambdas and Streams
Functional Programming in Java 8 - Lambdas and StreamsCodeOps Technologies LLP
 
Wprowadzenie do technologii Big Data / Intro to Big Data Ecosystem
Wprowadzenie do technologii Big Data / Intro to Big Data EcosystemWprowadzenie do technologii Big Data / Intro to Big Data Ecosystem
Wprowadzenie do technologii Big Data / Intro to Big Data EcosystemSages
 
Wprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache HadoopWprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache HadoopSages
 
Getting the most out of Java [Nordic Coding-2010]
Getting the most out of Java [Nordic Coding-2010]Getting the most out of Java [Nordic Coding-2010]
Getting the most out of Java [Nordic Coding-2010]Sven Efftinge
 
JavaOne 2016 - Learn Lambda and functional programming
JavaOne 2016 - Learn Lambda and functional programmingJavaOne 2016 - Learn Lambda and functional programming
JavaOne 2016 - Learn Lambda and functional programmingHenri Tremblay
 
AJUG April 2011 Cascading example
AJUG April 2011 Cascading exampleAJUG April 2011 Cascading example
AJUG April 2011 Cascading exampleChristopher Curtin
 
Hadoop User Group EU 2014
Hadoop User Group EU 2014Hadoop User Group EU 2014
Hadoop User Group EU 2014cwensel
 

Similar to Intro To Cascading (20)

Cascading Through Hadoop for the Boulder JUG
Cascading Through Hadoop for the Boulder JUGCascading Through Hadoop for the Boulder JUG
Cascading Through Hadoop for the Boulder JUG
 
Data Processing with Cascading Java API on Apache Hadoop
Data Processing with Cascading Java API on Apache HadoopData Processing with Cascading Java API on Apache Hadoop
Data Processing with Cascading Java API on Apache Hadoop
 
Pattern - an open source project for migrating predictive models from SAS, et...
Pattern - an open source project for migrating predictive models from SAS, et...Pattern - an open source project for migrating predictive models from SAS, et...
Pattern - an open source project for migrating predictive models from SAS, et...
 
What is new in Java 8
What is new in Java 8What is new in Java 8
What is new in Java 8
 
Hadoop Summit: Pattern – an open source project for migrating predictive mode...
Hadoop Summit: Pattern – an open source project for migrating predictive mode...Hadoop Summit: Pattern – an open source project for migrating predictive mode...
Hadoop Summit: Pattern – an open source project for migrating predictive mode...
 
V8
V8V8
V8
 
Introduction to Scalding and Monoids
Introduction to Scalding and MonoidsIntroduction to Scalding and Monoids
Introduction to Scalding and Monoids
 
Java 8 Workshop
Java 8 WorkshopJava 8 Workshop
Java 8 Workshop
 
The Cascading (big) data application framework
The Cascading (big) data application frameworkThe Cascading (big) data application framework
The Cascading (big) data application framework
 
The Cascading (big) data application framework - André Keple, Sr. Engineer, C...
The Cascading (big) data application framework - André Keple, Sr. Engineer, C...The Cascading (big) data application framework - André Keple, Sr. Engineer, C...
The Cascading (big) data application framework - André Keple, Sr. Engineer, C...
 
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash courseCodepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
 
HBaseConEast2016: HBase and Spark, State of the Art
HBaseConEast2016: HBase and Spark, State of the ArtHBaseConEast2016: HBase and Spark, State of the Art
HBaseConEast2016: HBase and Spark, State of the Art
 
Functional Programming in Java 8 - Lambdas and Streams
Functional Programming in Java 8 - Lambdas and StreamsFunctional Programming in Java 8 - Lambdas and Streams
Functional Programming in Java 8 - Lambdas and Streams
 
Wprowadzenie do technologii Big Data / Intro to Big Data Ecosystem
Wprowadzenie do technologii Big Data / Intro to Big Data EcosystemWprowadzenie do technologii Big Data / Intro to Big Data Ecosystem
Wprowadzenie do technologii Big Data / Intro to Big Data Ecosystem
 
Wprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache HadoopWprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache Hadoop
 
Getting the most out of Java [Nordic Coding-2010]
Getting the most out of Java [Nordic Coding-2010]Getting the most out of Java [Nordic Coding-2010]
Getting the most out of Java [Nordic Coding-2010]
 
JavaOne 2016 - Learn Lambda and functional programming
JavaOne 2016 - Learn Lambda and functional programmingJavaOne 2016 - Learn Lambda and functional programming
JavaOne 2016 - Learn Lambda and functional programming
 
AJUG April 2011 Cascading example
AJUG April 2011 Cascading exampleAJUG April 2011 Cascading example
AJUG April 2011 Cascading example
 
Hadoop User Group EU 2014
Hadoop User Group EU 2014Hadoop User Group EU 2014
Hadoop User Group EU 2014
 
Java gets a closure
Java gets a closureJava gets a closure
Java gets a closure
 

Recently uploaded

AI as an Interface for Commercial Buildings
AI as an Interface for Commercial BuildingsAI as an Interface for Commercial Buildings
AI as an Interface for Commercial BuildingsMemoori
 
"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii Soldatenko"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii SoldatenkoFwdays
 
Dev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebDev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebUiPathCommunity
 
Unleash Your Potential - Namagunga Girls Coding Club
Unleash Your Potential - Namagunga Girls Coding ClubUnleash Your Potential - Namagunga Girls Coding Club
Unleash Your Potential - Namagunga Girls Coding ClubKalema Edgar
 
costume and set research powerpoint presentation
costume and set research powerpoint presentationcostume and set research powerpoint presentation
costume and set research powerpoint presentationphoebematthew05
 
SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024Scott Keck-Warren
 
Human Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsHuman Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsMark Billinghurst
 
Key Features Of Token Development (1).pptx
Key  Features Of Token  Development (1).pptxKey  Features Of Token  Development (1).pptx
Key Features Of Token Development (1).pptxLBM Solutions
 
Pigging Solutions in Pet Food Manufacturing
Pigging Solutions in Pet Food ManufacturingPigging Solutions in Pet Food Manufacturing
Pigging Solutions in Pet Food ManufacturingPigging Solutions
 
Scanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL CertsScanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL CertsRizwan Syed
 
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)Mark Simos
 
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)Bun (KitWorks Team Study 노별마루 발표 2024.4.22)
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)Wonjun Hwang
 
Vertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering TipsVertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering TipsMiki Katsuragi
 
My INSURER PTE LTD - Insurtech Innovation Award 2024
My INSURER PTE LTD - Insurtech Innovation Award 2024My INSURER PTE LTD - Insurtech Innovation Award 2024
My INSURER PTE LTD - Insurtech Innovation Award 2024The Digital Insurer
 
Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Scott Keck-Warren
 
Pigging Solutions Piggable Sweeping Elbows
Pigging Solutions Piggable Sweeping ElbowsPigging Solutions Piggable Sweeping Elbows
Pigging Solutions Piggable Sweeping ElbowsPigging Solutions
 
Unblocking The Main Thread Solving ANRs and Frozen Frames
Unblocking The Main Thread Solving ANRs and Frozen FramesUnblocking The Main Thread Solving ANRs and Frozen Frames
Unblocking The Main Thread Solving ANRs and Frozen FramesSinan KOZAK
 
Science&tech:THE INFORMATION AGE STS.pdf
Science&tech:THE INFORMATION AGE STS.pdfScience&tech:THE INFORMATION AGE STS.pdf
Science&tech:THE INFORMATION AGE STS.pdfjimielynbastida
 

Recently uploaded (20)

AI as an Interface for Commercial Buildings
AI as an Interface for Commercial BuildingsAI as an Interface for Commercial Buildings
AI as an Interface for Commercial Buildings
 
"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii Soldatenko"Debugging python applications inside k8s environment", Andrii Soldatenko
"Debugging python applications inside k8s environment", Andrii Soldatenko
 
Dev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebDev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio Web
 
DMCC Future of Trade Web3 - Special Edition
DMCC Future of Trade Web3 - Special EditionDMCC Future of Trade Web3 - Special Edition
DMCC Future of Trade Web3 - Special Edition
 
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptxE-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
 
Unleash Your Potential - Namagunga Girls Coding Club
Unleash Your Potential - Namagunga Girls Coding ClubUnleash Your Potential - Namagunga Girls Coding Club
Unleash Your Potential - Namagunga Girls Coding Club
 
costume and set research powerpoint presentation
costume and set research powerpoint presentationcostume and set research powerpoint presentation
costume and set research powerpoint presentation
 
SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024
 
Human Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsHuman Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR Systems
 
Key Features Of Token Development (1).pptx
Key  Features Of Token  Development (1).pptxKey  Features Of Token  Development (1).pptx
Key Features Of Token Development (1).pptx
 
Pigging Solutions in Pet Food Manufacturing
Pigging Solutions in Pet Food ManufacturingPigging Solutions in Pet Food Manufacturing
Pigging Solutions in Pet Food Manufacturing
 
Scanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL CertsScanning the Internet for External Cloud Exposures via SSL Certs
Scanning the Internet for External Cloud Exposures via SSL Certs
 
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
 
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)Bun (KitWorks Team Study 노별마루 발표 2024.4.22)
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)
 
Vertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering TipsVertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering Tips
 
My INSURER PTE LTD - Insurtech Innovation Award 2024
My INSURER PTE LTD - Insurtech Innovation Award 2024My INSURER PTE LTD - Insurtech Innovation Award 2024
My INSURER PTE LTD - Insurtech Innovation Award 2024
 
Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024
 
Pigging Solutions Piggable Sweeping Elbows
Pigging Solutions Piggable Sweeping ElbowsPigging Solutions Piggable Sweeping Elbows
Pigging Solutions Piggable Sweeping Elbows
 
Unblocking The Main Thread Solving ANRs and Frozen Frames
Unblocking The Main Thread Solving ANRs and Frozen FramesUnblocking The Main Thread Solving ANRs and Frozen Frames
Unblocking The Main Thread Solving ANRs and Frozen Frames
 
Science&tech:THE INFORMATION AGE STS.pdf
Science&tech:THE INFORMATION AGE STS.pdfScience&tech:THE INFORMATION AGE STS.pdf
Science&tech:THE INFORMATION AGE STS.pdf
 

Intro To Cascading

  • 3. What is Cascading? • abstraction over MapReduce
  • 4. What is Cascading? • abstraction over MapReduce • API for data-processing workflows
  • 6. Why use Cascading? • MapReduce can be:
  • 7. Why use Cascading? • MapReduce can be: • the wrong level of granularity
  • 8. Why use Cascading? • MapReduce can be: • the wrong level of granularity • cumbersome to chain together
  • 10. Why use Cascading? • Cascading helps create
  • 11. Why use Cascading? • Cascading helps create • higher-level data processing abstractions
  • 12. Why use Cascading? • Cascading helps create • higher-level data processing abstractions • sophisticated data pipelines
  • 13. Why use Cascading? • Cascading helps create • higher-level data processing abstractions • sophisticated data pipelines • reusable components
  • 14. Credits • Cascading written by Chris Wensel • based on his users guide http://bit.ly/cascading
  • 15. package cascadingtutorial.wordcount; /** * Wordcount example in Cascading */ public class Main { public static void main( String[] args ) { String inputPath = args[0]; String outputPath = args[1]; Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); wcPipe = new GroupBy(wcPipe, new Fields("word")); wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word")); Properties properties = new Properties(); FlowConnector.setApplicationJarClass(properties, Main.class); Flow parsedLogFlow = new FlowConnector(properties) .connect(sourceTap, sinkTap, wcPipe); parsedLogFlow.start(); parsedLogFlow.complete(); } }
  • 16. Data Model Pipes Filters
  • 23. Data Model file file file
  • 24. Data Model file file file Filters
  • 25. Tuple
  • 29. Taps: sources & sinks Taps
  • 30. Taps: sources & sinks Taps
  • 31. Taps: sources & sinks Taps
  • 32. Taps: sources & sinks Taps Source
  • 33. Taps: sources & sinks Taps Source Sink
  • 34.
  • 35. Pipe + Taps = Flow Flow
  • 36. Flow Flow Flow Flow Flow
  • 37. n Flows Flow Flow Flow Flow Flow
  • 38. n Flows = Cascade Flow Flow Flow Flow Flow
  • 39. n Flows = Cascade Flow Flow Flow Flow Flow complex
  • 41. package cascadingtutorial.wordcount; /** * Wordcount example in Cascading */ public class Main { public static void main( String[] args ) { String inputPath = args[0]; String outputPath = args[1]; Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); wcPipe = new GroupBy(wcPipe, new Fields("word")); wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word")); Properties properties = new Properties(); FlowConnector.setApplicationJarClass(properties, Main.class); Flow parsedLogFlow = new FlowConnector(properties) .connect(sourceTap, sinkTap, wcPipe); parsedLogFlow.start(); parsedLogFlow.complete(); } }
  • 42. package cascadingtutorial.wordcount; /** * Wordcount example in Cascading */ public class Main { public static void main( String[] args ) { String inputPath = args[0]; String outputPath = args[1]; Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); wcPipe = new GroupBy(wcPipe, new Fields("word")); wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word")); Properties properties = new Properties(); FlowConnector.setApplicationJarClass(properties, Main.class); Flow parsedLogFlow = new FlowConnector(properties) .connect(sourceTap, sinkTap, wcPipe); parsedLogFlow.start(); parsedLogFlow.complete(); } }
  • 43. import cascading.tap.Tap; import cascading.tuple.Fields; import java.util.Properties; /** * Wordcount example in Cascading */ public class Main { public static void main( String[] args ) { Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 44. import cascading.tap.Tap; import cascading.tuple.Fields; import java.util.Properties; /** * Wordcount example in Cascading */ public class Main { public static void main( String[] args ) { Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 45. import cascading.tap.Tap; import cascading.tuple.Fields; import java.util.Properties; /** * Wordcount example in Cascading */ public class Main { public static void main( String[] args ) { Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); TextLine() Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 46. import cascading.tap.Tap; import cascading.tuple.Fields; import java.util.Properties; /** * Wordcount example in Cascading */ public class Main { public static void main( String[] args ) { Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); TextLine() Tap sinkTap SequenceFile() = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 47. import cascading.tap.Tap; import cascading.tuple.Fields; import java.util.Properties; /** * Wordcount example in Cascading */ public class Main { public static void main( String[] args ) { Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 48. /** * Wordcount example in Cascading */ public class Main { public static void main( String[] args ) { Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); wcPipe = new GroupBy(wcPipe, new Fields("word")); wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
  • 49. /** * Wordcount example in Cascading */ public class Main { hdfs://master0:54310/user/nmurray/data.txt public static void main( String[] args ) { Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); wcPipe = new GroupBy(wcPipe, new Fields("word")); wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
  • 50. /** * Wordcount example in Cascading */ public class Main { hdfs://master0:54310/user/nmurray/data.txt new Hfs() public static void main( String[] args ) { Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); wcPipe = new GroupBy(wcPipe, new Fields("word")); wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
  • 51. /** * Wordcount example in Cascading */ public class Main { hdfs://master0:54310/user/nmurray/data.txt new Hfs() data/sources/obama-inaugural-address.txt public static void main( String[] args ) { Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); wcPipe = new GroupBy(wcPipe, new Fields("word")); wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
  • 52. /** * Wordcount example in Cascading */ public class Main { hdfs://master0:54310/user/nmurray/data.txt new Hfs() data/sources/obama-inaugural-address.txt public static void main( String[] args ) new Lfs() { Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); wcPipe = new GroupBy(wcPipe, new Fields("word")); wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
  • 53. /** * Wordcount example in Cascading */ public class Main { hdfs://master0:54310/user/nmurray/data.txt new Hfs() data/sources/obama-inaugural-address.txt public static void main( String[] args ) new Lfs() { Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), S3fs() new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); wcPipe = new GroupBy(wcPipe, new Fields("word")); wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
  • 54. /** * Wordcount example in Cascading */ public class Main { hdfs://master0:54310/user/nmurray/data.txt new Hfs() data/sources/obama-inaugural-address.txt public static void main( String[] args ) new Lfs() { Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), S3fs() new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); GlobHfs() new Fields("count", "word")); wcPipe = new GroupBy(wcPipe, new Fields("word")); wcPipe = new Every(wcPipe, new Count(),
  • 55. /** * Wordcount example in Cascading */ public class Main { public static void main( String[] args ) { Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); wcPipe = new GroupBy(wcPipe, new Fields("word")); wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
  • 56. Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); wcPipe = new GroupBy(wcPipe, new Fields("word")); wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word")); Properties properties = new Properties(); FlowConnector.setApplicationJarClass(properties, Main.class); Flow parsedLogFlow = new FlowConnector(properties) .connect(sourceTap, sinkTap, wcPipe); parsedLogFlow.start(); parsedLogFlow.complete(); } }
  • 57. Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Pipe Assembly Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); wcPipe = new GroupBy(wcPipe, new Fields("word")); wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word")); Properties properties = new Properties(); FlowConnector.setApplicationJarClass(properties, Main.class); Flow parsedLogFlow = new FlowConnector(properties) .connect(sourceTap, sinkTap, wcPipe); parsedLogFlow.start(); parsedLogFlow.complete(); } }
  • 60. Pipe Assemblies • Define work against a Tuple Stream
  • 61. Pipe Assemblies • Define work against a Tuple Stream • May have multiple sources and sinks
  • 62. Pipe Assemblies • Define work against a Tuple Stream • May have multiple sources and sinks • Splits
  • 63. Pipe Assemblies • Define work against a Tuple Stream • May have multiple sources and sinks • Splits • Merges
  • 64. Pipe Assemblies • Define work against a Tuple Stream • May have multiple sources and sinks • Splits • Merges • Joins
  • 67. Pipe Assemblies • Pipe • Each • GroupBy • CoGroup • Every • SubAssembly
  • 68. Pipe Assemblies • Pipe • Each • GroupBy • CoGroup • Every • SubAssembly
  • 69. Pipe Assemblies • Pipe Applies a • Each Function or Filter • GroupBy Operation to each • CoGroup Tuple • Every • SubAssembly
  • 70. Pipe Assemblies • Pipe • Each • GroupBy • CoGroup • Every • SubAssembly
  • 71. Pipe Assemblies • Pipe • Each • GroupBy • CoGroup • Every • SubAssembly
  • 72. Pipe Assemblies • Pipe • Each Group • GroupBy (& merge) • CoGroup • Every • SubAssembly
  • 73. Pipe Assemblies • Pipe • Each • GroupBy Joins • CoGroup (inner, outer, left, right) • Every • SubAssembly
  • 74. Pipe Assemblies • Pipe • Each • GroupBy • CoGroup • Every • SubAssembly
  • 75. Pipe Assemblies • Pipe • Each • GroupBy • CoGroup • Every Applies an Aggregator (count, sum) to every group of Tuples. • SubAssembly
  • 76. Pipe Assemblies • Pipe • Each • GroupBy • CoGroup • Every • SubAssembly Nesting
  • 78. Each vs. Every • Each() is for individual Tuples
  • 79. Each vs. Every • Each() is for individual Tuples • Every() is for groups of Tuples
  • 80. Each new Each(previousPipe, argumentSelector, operation, outputSelector)
  • 81. Each new Each(previousPipe, argumentSelector, operation, outputSelector) Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 82. Each new Each(previousPipe, argumentSelector, operation, outputSelector) Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 83. Each new Each(previousPipe, argumentSelector, operation, outputSelector) Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 84. import cascading.tap.Tap; import cascading.tuple.Fields; import java.util.Properties; /** * Wordcount example in Cascading */ public class Main { public static void main( String[] args ) { Scheme inputScheme = new TextLine(new Fields("offset", "line")); Scheme outputScheme = new TextLine(); String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 85. Each new Each(previousPipe, argumentSelector, operation, outputSelector) Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 86. Each new Each(previousPipe, argumentSelector, operation, outputSelector) Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 87. Each new Each(previousPipe, argumentSelector, operation, outputSelector) Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 88. Each new Each(previousPipe, argumentSelector, operation, outputSelector) Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); Fields()
  • 89. Each offset line 0 the lazy brown fox Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 90. Each offset line 0 the lazy brown fox Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 91. Each X offset 0 line the lazy brown fox Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 92. Each X offset 0 line the lazy brown fox Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); word word word word the lazy brown fox
  • 95. Operations: Functions • Insert() • RegexParser() / RegexGenerator() / RegexReplace()
  • 96. Operations: Functions • Insert() • RegexParser() / RegexGenerator() / RegexReplace() • DateParser() / DateFormatter()
  • 97. Operations: Functions • Insert() • RegexParser() / RegexGenerator() / RegexReplace() • DateParser() / DateFormatter() • XPathGenerator()
  • 98. Operations: Functions • Insert() • RegexParser() / RegexGenerator() / RegexReplace() • DateParser() / DateFormatter() • XPathGenerator() • Identity()
  • 99. Operations: Functions • Insert() • RegexParser() / RegexGenerator() / RegexReplace() • DateParser() / DateFormatter() • XPathGenerator() • Identity() • etc...
  • 101. Operations: Filters • RegexFilter()
  • 102. Operations: Filters • RegexFilter() • FilterNull()
  • 103. Operations: Filters • RegexFilter() • FilterNull() • And() / Or() / Not() / Xor()
  • 104. Operations: Filters • RegexFilter() • FilterNull() • And() / Or() / Not() / Xor() • ExpressionFilter()
  • 105. Operations: Filters • RegexFilter() • FilterNull() • And() / Or() / Not() / Xor() • ExpressionFilter() • etc...
  • 106. Each new Each(previousPipe, argumentSelector, operation, outputSelector) Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 107. Each Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word"));
  • 108. String inputPath = args[0]; String outputPath = args[1]; Tap sourceTap = inputPath.matches( "^[^:]+://.*") ? new Hfs(inputScheme, inputPath) : new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "s+"), new Fields("word")); wcPipe = new GroupBy(wcPipe, new Fields("word")); wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word")); Properties properties = new Properties(); FlowConnector.setApplicationJarClass(properties, Main.class); Flow parsedLogFlow = new FlowConnector(properties) .connect(sourceTap, sinkTap, wcPipe); parsedLogFlow.start(); parsedLogFlow.complete(); } }
• 112. Every new Every(previousPipe, argumentSelector, operation, outputSelector) new Every(wcPipe, new Count(), new Fields("count", "word"));
  • 120. Operations: Aggregator • Average() • Count() • First() / Last() • Min() / Max() • Sum()
  • 123. Every new Every(previousPipe, argumentSelector, operation, outputSelector) new Every(wcPipe, new Count(), new Fields("count", "word")); new Every(wcPipe, Fields.ALL, new Count(), new Fields("count", "word"));
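Putting GroupBy and an Aggregator together, a minimal hedged sketch (the "visitor_id" and "bytes" fields are assumptions, not from this deck):

  // Hypothetical: total bytes per visitor
  pipe = new GroupBy(pipe, new Fields("visitor_id"));
  pipe = new Every(pipe,
      new Fields("bytes"),                 // argument selector
      new Sum(new Fields("total_bytes")),  // aggregator declares its output field
      Fields.ALL);                         // emit the grouping fields plus the sum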
• 126. Predefined Field Sets
  Fields.ALL: all available fields
  Fields.GROUP: fields used for last grouping
  Fields.VALUES: fields not used for last grouping
  Fields.ARGS: fields of the argument Tuple (for Operations)
  Fields.RESULTS: replaces input with the Operation result (for Pipes)
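One hedged illustration of Fields.ARGS and Fields.RESULTS together, parsing a field under its own name (a sketch against the Cascading 1.x API; the pipe is hypothetical):

  // Hypothetical: Fields.ARGS makes DateParser declare its result under the
  // argument's field name, and Fields.RESULTS keeps only the Operation output,
  // so the stream now carries just the parsed "timestamp" field.
  pipe = new Each(pipe,
      new Fields("timestamp"),
      new DateParser(Fields.ARGS, "yyyy-MM-dd HH:mm:ss"),
      Fields.RESULTS);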
  • 131. Field Selection new Each(previousPipe, argumentSelector, operation, outputSelector) pipe = new Each(pipe, new Fields("timestamp"), new DateParser("yyyy-MM-dd HH:mm:ss"), new Fields("timestamp"));
• 132. Field Selection new Each(previousPipe, argumentSelector, operation, outputSelector) Operation Input Tuple = Original Tuple narrowed by the Argument Selector
• 133. Field Selection pipe = new Each(pipe, new Fields("timestamp"), new DateParser("yyyy-MM-dd HH:mm:ss"), new Fields("ts", "search_term", "visitor_id"));
• 135. Original Tuple: id, visitor_id, search_term, timestamp, page_number. Argument Selector: timestamp
• 137. Input to DateParser will be: timestamp
• 138. Field Selection new Each(previousPipe, argumentSelector, operation, outputSelector) Output Tuple = (Original Tuple ⊕ Operation Tuple) as selected by the Output Selector
• 139. Field Selection pipe = new Each(pipe, new Fields("timestamp"), new DateParser("yyyy-MM-dd HH:mm:ss"), new Fields("ts", "search_term", "visitor_id"));
• 142. Original Tuple (id, visitor_id, search_term, timestamp, page_number) ⊕ DateParser Output (ts)
• 143. Output Selector: ts, search_term, visitor_id
• 150. Output of Each will be: ts, search_term, visitor_id (the unselected fields are discarded)
• 151. Input tuples: (word hello), (word hello), (word world) new Every(wcPipe, new Count(), new Fields("count", "word"));
• 152. Output tuples: (count 2, word hello), (count 1, word world)
• 153. new Lfs(inputScheme, inputPath); Tap sinkTap = outputPath.matches("^[^:]+://.*") ? new Hfs(outputScheme, outputPath) : new Lfs(outputScheme, outputPath); Pipe wcPipe = new Each("wordcount", new Fields("line"), new RegexSplitGenerator(new Fields("word"), "\\s+"), new Fields("word")); wcPipe = new GroupBy(wcPipe, new Fields("word")); wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word")); Properties properties = new Properties(); FlowConnector.setApplicationJarClass(properties, Main.class); Flow parsedLogFlow = new FlowConnector(properties) .connect(sourceTap, sinkTap, wcPipe); parsedLogFlow.start(); parsedLogFlow.complete(); } }
  • 158. Run
• 160. input: mary had a little lamb little lamb little lamb mary had a little lamb whose fleece was white as snow command: hadoop jar ./target/cascading-tutorial-1.0.0.jar data/sources/misc/mary.txt data/output
  • 161. output: 2 a 1 as 1 fleece 2 had 4 lamb 4 little 2 mary 1 snow 1 was 1 white 1 whose
• 164. apply operations to tuple streams repeat.
  • 166. Cookbook // remove all tuples where search_term is '-' pipe = new Each(pipe, new Fields("search_term"), new RegexFilter("^(-)$", true));
  • 167. Cookbook // convert "timestamp" String field to "ts" long field pipe = new Each(pipe, new Fields("timestamp"), new DateParser("yyyy-MM-dd HH:mm:ss"), Fields.ALL);
  • 168. Cookbook // td - timestamp of day-level granularity pipe = new Each(pipe, new Fields("ts"), new ExpressionFunction( new Fields("td"), "ts - (ts % (24 * 60 * 60 * 1000))", long.class), Fields.ALL);
  • 169. Cookbook // SELECT DISTINCT visitor_id // GROUP BY visitor_id ORDER BY ts pipe = new GroupBy(pipe, new Fields("visitor_id"), new Fields("ts")); // take the First() tuple of every grouping pipe = new Every(pipe, Fields.ALL, new First(), Fields.RESULTS);
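One more recipe in the same spirit, a hedged sketch rather than deck material (it assumes the "td" and "visitor_id" fields produced by the recipes above):

  // Sketch: count unique visitors per day
  pipe = new GroupBy(pipe, new Fields("td", "visitor_id"));
  pipe = new Every(pipe, Fields.GROUP, new First(), Fields.RESULTS);  // one tuple per (td, visitor_id)
  pipe = new GroupBy(pipe, new Fields("td"));
  pipe = new Every(pipe, new Count(new Fields("unique_visitors")), Fields.ALL);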
• 171. Desired Operation line: mary had a little lamb Pipe pipe = new Each("ngram pipe", new Fields("line"), new NGramTokenizer(2), new Fields("ngram"));
• 176. Resulting ngram tuples: mary had, had a, a little, little lamb
• 180. Custom Operations • extend BaseOperation • implement Function (or Filter) • implement operate
• 181. public class NGramTokenizer extends BaseOperation implements Function
  {
  private int size;

  public NGramTokenizer(int size)
    {
    super(new Fields("ngram"));  // declare the single output field "ngram"
    this.size = size;            // n-gram length, e.g. 2 for bigrams
    }

  public void operate(FlowProcess flowProcess, FunctionCall functionCall)
    {
    String token;
    // the incoming argument tuple; argument 0 is the selected "line" field
    TupleEntry arguments = functionCall.getArguments();
    com.attinteractive.util.NGramTokenizer t =
      new com.attinteractive.util.NGramTokenizer(arguments.getString(0), this.size);
    // emit one result tuple per n-gram found in the input string
    while ((token = t.next()) != null)
      {
      TupleEntry result = new TupleEntry(new Fields("ngram"), new Tuple(token));
      functionCall.getOutputCollector().add(result);
      }
    }
  }
• 190. Tuple: 123, vis-abc, tacos, 2009-10-08 03:21:23, 1 (zero-indexed, ordered values)
• 191. Fields: id, visitor_id, search_term, timestamp, page_number (a list of strings naming the fields)
• 193. TupleEntry: binds the Fields (id, visitor_id, search_term, timestamp, page_number) to the Tuple (123, vis-abc, tacos, 2009-10-08 03:21:23, 1)
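A minimal sketch tying the three together, using the values from the slide above:

  Fields fields = new Fields("id", "visitor_id", "search_term", "timestamp", "page_number");
  Tuple values = new Tuple(123, "vis-abc", "tacos", "2009-10-08 03:21:23", 1);
  TupleEntry entry = new TupleEntry(fields, values);
  entry.getString("search_term");  // "tacos" -- look up a value by field name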
• 197. Success line: mary had a little lamb Pipe pipe = new Each("ngram pipe", new Fields("line"), new NGramTokenizer(2), new Fields("ngram")); ngram tuples: mary had, had a, a little, little lamb
  • 199. dot
  • 200. dot Flow importFlow = flowConnector.connect(sourceTap, tmpIntermediateTap, importPipe); importFlow.writeDOT( "import-flow.dot" );
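The generated file can then be rendered with Graphviz, for example dot -Tpng import-flow.dot -o import-flow.png, which draws the Flow's step graph (assuming Graphviz is installed).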
• 201. [slides 201-203: renderings of the flow graph written by writeDOT]