package org.ajug;

import   cascading.cascade.Cascade;
import   cascading.cascade.CascadeConnector;
import   cascading.flow.Flow;
import   cascading.flow.FlowConnector;
import   cascading.pipe.Each;
import   cascading.pipe.Every;
import   cascading.pipe.GroupBy;
import   cascading.pipe.Pipe;
import   cascading.scheme.TextDelimited;
import   cascading.scheme.TextLine;
import   cascading.tap.Hfs;
import   cascading.tap.SinkMode;
import   cascading.tap.Tap;
import   cascading.tuple.Fields;

import java.util.Properties;

public class Main {

   public static void main(String[] args) {
       Properties properties = new Properties();
       FlowConnector.setApplicationJarClass(properties, Main.class);
       properties.put("mapred.reduce.tasks", 5);

        Pipe mainPipe = new Each("M&M", new Fields("line"), new Parser());
        mainPipe = new GroupBy(mainPipe, new Fields("COLOR"));
        mainPipe = new Every(mainPipe, Fields.ALL, new ColorAggregator(new
ColorData()));

         Tap sourceTap = new Hfs(new TextLine(), args[0]);

        TextDelimited scheme = new TextDelimited(new Fields("COLOR",
"AVG_WIDTH", "AVG_WEIGHT"), ",", """);
        scheme.setNumSinkParts(1); // make sure we only get one file


         Tap sinkTap = new Hfs(scheme, args[1], SinkMode.REPLACE);

         FlowConnector flowConnector = new FlowConnector(properties);
         CascadeConnector cascadeConnector = new CascadeConnector();

         Flow flow = flowConnector.connect(sourceTap, sinkTap, mainPipe);

         Cascade cascade = cascadeConnector.connect(flow);
         cascade.complete();     // Finally run everything

    }
}
================================================
package org.ajug;

import   cascading.cascade.Cascade;
import   cascading.cascade.CascadeConnector;
import   cascading.flow.Flow;
import   cascading.flow.FlowConnector;
import   cascading.pipe.*;
import   cascading.scheme.TextDelimited;
import   cascading.scheme.TextLine;
import   cascading.tap.Hfs;
import   cascading.tap.SinkMode;
import   cascading.tap.Tap;
import   cascading.tuple.Fields;

import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

public class MultiOutputMain {

   public static void main(String[] args) {
       Properties properties = new Properties();
       FlowConnector.setApplicationJarClass(properties, Main.class);
       properties.put("mapred.reduce.tasks", 5);

         Pipe sourcePipe = new Each("M&M", new Fields("line"), new Parser());

        Pipe totalPipe = new GroupBy("Total", sourcePipe, new Fields("ONE"));
        totalPipe = new Every(totalPipe, Fields.ALL, new TotalAggregator(new
TotalData()));

        Pipe mainPipe = new GroupBy("Color", sourcePipe, new Fields("COLOR"));
        mainPipe = new Every(mainPipe, Fields.ALL, new ColorAggregator(new
ColorData()));

         Tap sourceTap = new Hfs(new TextLine(), args[0]);

        TextDelimited scheme = new TextDelimited(new Fields("COLOR",
"AVG_WIDTH", "AVG_WEIGHT"), ",", """);
        scheme.setNumSinkParts(1); // make sure we only get one file
        Tap colorTap = new Hfs(scheme, args[1] + "/color", SinkMode.REPLACE);


        TextDelimited totalScheme = new TextDelimited(new
Fields("FINAL_WIDTH", "FINAL_WEIGHT"), ",", """);
        totalScheme.setNumSinkParts(1); // make sure we only get one file
        Tap totalTap = new Hfs(totalScheme, args[1] + "/total",
SinkMode.REPLACE);

         FlowConnector flowConnector = new FlowConnector(properties);
         CascadeConnector cascadeConnector = new CascadeConnector();

         Map<String, Tap> outputs = new HashMap<String, Tap>();
         outputs.put(totalPipe.getName(), totalTap);
         outputs.put(mainPipe.getName(), colorTap);

        Flow flow = flowConnector.connect(sourceTap, outputs, totalPipe,
mainPipe);

         Cascade cascade = cascadeConnector.connect(flow);
         cascade.complete();     // Finally run everything

    }
}
=======================================
package org.ajug;
import   cascading.flow.FlowProcess;
import   cascading.operation.Function;
import   cascading.operation.FunctionCall;
import   cascading.tuple.Fields;
import   cascading.tuple.Tuple;

import java.io.Serializable;


public class Parser extends cascading.operation.BaseOperation implements
Serializable, Function {

    public Parser() {
        super(new Fields("ONE","COLOR", "WIDTH", "WEIGHT"));   // should be
constants file ;)
    }


     public void operate(FlowProcess a_flow, FunctionCall a_call) {

         String sourceData = a_call.getArguments().getString(0);
         sourceData = sourceData.trim();
         if (sourceData == null || sourceData.length() == 0) {
             return;       // blank line read from the source file, so ignore
it
         }

         String values[] = sourceData.split(",");

         Tuple output = new Tuple();

         output.add("1");
         output.add(values[0]);
         output.add(values[1]);
         output.add(values[2]);
         a_call.getOutputCollector().add(output);
     }

}
==============================================
package org.ajug;


import cascading.tuple.Tuple;

import java.io.Serializable;

/**
 * Running sums of width and weight plus a sample count, used to produce the
 * average width and average weight of the samples added so far.
 */
public class ColorData implements Serializable {

    private long m_num = 0;
    private double m_width = 0;
    private double m_weight = 0;

    /** Clears the sample count and both running sums. */
    public void reset() {
        m_weight = 0;
        m_width = 0;
        m_num = 0;
    }

    /**
     * Adds one sample to the running sums.
     *
     * @param a_weight weight of the sample (note: weight comes first)
     * @param a_width  width of the sample
     */
    public void addData(double a_weight, double a_width) {
        m_weight = m_weight + a_weight;
        m_width = m_width + a_width;
        m_num += 1;
    }

    /**
     * Builds the result tuple for the samples added since the last reset.
     *
     * @return a tuple holding the average width followed by the average
     *         weight, or {@code null} when no sample has been added
     */
    public Tuple getTuple() {
        if (m_num == 0) {
            return null;
        }

        Tuple averages = new Tuple();
        averages.add(m_width / m_num);
        averages.add(m_weight / m_num);
        return averages;
    }
}
===============================================
package org.ajug;

import cascading.tuple.Tuple;

import java.io.Serializable;

/**
 * Running sums of width and weight plus a sample count for the whole data
 * set; the result tuple holds each sum divided by the sample count.
 */
public class TotalData implements Serializable {

    private long m_num = 0;
    private double m_width = 0;
    private double m_weight = 0;

    /** Clears the sample count and both running sums. */
    public void reset() {
        m_weight = 0;
        m_width = 0;
        m_num = 0;
    }

    /**
     * Adds one sample to the running sums.
     *
     * @param a_weight weight of the sample (note: weight comes first)
     * @param a_width  width of the sample
     */
    public void addData(double a_weight, double a_width) {
        m_weight = m_weight + a_weight;
        m_width = m_width + a_width;
        m_num += 1;
    }

    /**
     * Builds the result tuple for the samples added since the last reset.
     *
     * @return a tuple holding width-sum / count followed by weight-sum / count,
     *         or {@code null} when no sample has been added
     */
    public Tuple getTuple() {
        if (m_num == 0) {
            return null;
        }

        Tuple result = new Tuple();
        result.add(m_width / m_num);
        result.add(m_weight / m_num);
        return result;
    }
}
==================================================
package org.ajug;
import   cascading.flow.FlowProcess;
import   cascading.operation.Aggregator;
import   cascading.operation.AggregatorCall;
import   cascading.operation.BaseOperation;
import   cascading.tuple.Fields;
import   cascading.tuple.Tuple;
import   cascading.tuple.TupleEntry;
import   org.apache.log4j.Logger;

/**
 * Aggregator that feeds the WEIGHT and WIDTH of every tuple in a group into a
 * {@link TotalData} and, when the group ends, emits that object's result
 * tuple under the fields FINAL_WIDTH and FINAL_WEIGHT.
 */
public class TotalAggregator extends BaseOperation<TotalData>
        implements Aggregator<TotalData> {

    static Logger m_logger = Logger.getLogger(TotalAggregator.class.getName());

    /** Accumulator handed to every group as its context; reset in start(). */
    private TotalData m_row;

    /**
     * @param a_row accumulator instance reused as the context of each group
     */
    public TotalAggregator(TotalData a_row) {
        super(new Fields("FINAL_WIDTH", "FINAL_WEIGHT"));
        this.m_row = a_row;
    }

    /** Resets the accumulator and installs it as the context of the new group. */
    public void start(FlowProcess flowProcess,
                      AggregatorCall<TotalData> aggregatorCall) {
        m_row.reset();
        aggregatorCall.setContext(m_row);
    }

    /** Adds the WEIGHT and WIDTH of the current tuple to the group's accumulator. */
    public void aggregate(FlowProcess flowProcess,
                          AggregatorCall<TotalData> aggregatorCall) {
        TupleEntry entry = aggregatorCall.getArguments();
        aggregatorCall.getContext().addData(
                entry.getDouble("WEIGHT"),
                entry.getDouble("WIDTH"));
    }

    /** Emits the accumulator's result tuple, unless it has nothing to report. */
    public void complete(FlowProcess flowProcess,
                         AggregatorCall<TotalData> aggregatorCall) {
        Tuple summary = aggregatorCall.getContext().getTuple();
        if (summary != null) {
            aggregatorCall.getOutputCollector().add(summary);
        }
    }
}

=========================================
package org.ajug;

import cascading.flow.FlowProcess;
import cascading.operation.Aggregator;
import cascading.operation.AggregatorCall;
import   cascading.operation.BaseOperation;
import   cascading.tuple.Fields;
import   cascading.tuple.Tuple;
import   cascading.tuple.TupleEntry;
import   org.apache.log4j.Logger;


/**
 * Aggregator that feeds the WEIGHT and WIDTH of every tuple in a group into a
 * {@link ColorData} and, when the group ends, emits that object's result
 * tuple under the fields AVG_WIDTH and AVG_WEIGHT.
 */
public class ColorAggregator extends BaseOperation<ColorData>
        implements Aggregator<ColorData> {

    static Logger m_logger = Logger.getLogger(ColorAggregator.class.getName());

    /** Accumulator handed to every group as its context; reset in start(). */
    private ColorData m_row;

    /**
     * @param a_row accumulator instance reused as the context of each group
     */
    public ColorAggregator(ColorData a_row) {
        super(new Fields("AVG_WIDTH", "AVG_WEIGHT"));
        this.m_row = a_row;
    }

    /** Resets the accumulator and installs it as the context of the new group. */
    public void start(FlowProcess flowProcess,
                      AggregatorCall<ColorData> aggregatorCall) {
        m_row.reset();
        aggregatorCall.setContext(m_row);
    }

    /** Adds the WEIGHT and WIDTH of the current tuple to the group's accumulator. */
    public void aggregate(FlowProcess flowProcess,
                          AggregatorCall<ColorData> aggregatorCall) {
        TupleEntry entry = aggregatorCall.getArguments();
        aggregatorCall.getContext().addData(
                entry.getDouble("WEIGHT"),
                entry.getDouble("WIDTH"));
    }

    /** Emits the accumulator's result tuple, unless it has nothing to report. */
    public void complete(FlowProcess flowProcess,
                         AggregatorCall<ColorData> aggregatorCall) {
        Tuple averages = aggregatorCall.getContext().getTuple();
        if (averages != null) {
            aggregatorCall.getOutputCollector().add(averages);
        }
    }
}

AJUG April 2011 Cascading example

  • 1.
    package org.ajug; import cascading.cascade.Cascade; import cascading.cascade.CascadeConnector; import cascading.flow.Flow; import cascading.flow.FlowConnector; import cascading.pipe.Each; import cascading.pipe.Every; import cascading.pipe.GroupBy; import cascading.pipe.Pipe; import cascading.scheme.TextDelimited; import cascading.scheme.TextLine; import cascading.tap.Hfs; import cascading.tap.SinkMode; import cascading.tap.Tap; import cascading.tuple.Fields; import java.util.Properties; public class Main { public static void main(String[] args) { Properties properties = new Properties(); FlowConnector.setApplicationJarClass(properties, Main.class); properties.put("mapred.reduce.tasks", 5); Pipe mainPipe = new Each("M&M", new Fields("line"), new Parser()); mainPipe = new GroupBy(mainPipe, new Fields("COLOR")); mainPipe = new Every(mainPipe, Fields.ALL, new ColorAggregator(new ColorData())); Tap sourceTap = new Hfs(new TextLine(), args[0]); TextDelimited scheme = new TextDelimited(new Fields("COLOR", "AVG_WIDTH", "AVG_WEIGHT"), ",", """); scheme.setNumSinkParts(1); // make sure we only get one file Tap sinkTap = new Hfs(scheme, args[1], SinkMode.REPLACE); FlowConnector flowConnector = new FlowConnector(properties); CascadeConnector cascadeConnector = new CascadeConnector(); Flow flow = flowConnector.connect(sourceTap, sinkTap, mainPipe); Cascade cascade = cascadeConnector.connect(flow); cascade.complete(); // Finally run everything } } ================================================ package org.ajug; import cascading.cascade.Cascade; import cascading.cascade.CascadeConnector; import cascading.flow.Flow; import cascading.flow.FlowConnector; import cascading.pipe.*; import cascading.scheme.TextDelimited;
  • 2.
    import cascading.scheme.TextLine; import cascading.tap.Hfs; import cascading.tap.SinkMode; import cascading.tap.Tap; import cascading.tuple.Fields; import java.util.HashMap; import java.util.Map; import java.util.Properties; public class MultiOutputMain { public static void main(String[] args) { Properties properties = new Properties(); FlowConnector.setApplicationJarClass(properties, Main.class); properties.put("mapred.reduce.tasks", 5); Pipe sourcePipe = new Each("M&M", new Fields("line"), new Parser()); Pipe totalPipe = new GroupBy("Total", sourcePipe, new Fields("ONE")); totalPipe = new Every(totalPipe, Fields.ALL, new TotalAggregator(new TotalData())); Pipe mainPipe = new GroupBy("Color", sourcePipe, new Fields("COLOR")); mainPipe = new Every(mainPipe, Fields.ALL, new ColorAggregator(new ColorData())); Tap sourceTap = new Hfs(new TextLine(), args[0]); TextDelimited scheme = new TextDelimited(new Fields("COLOR", "AVG_WIDTH", "AVG_WEIGHT"), ",", """); scheme.setNumSinkParts(1); // make sure we only get one file Tap colorTap = new Hfs(scheme, args[1] + "/color", SinkMode.REPLACE); TextDelimited totalScheme = new TextDelimited(new Fields("FINAL_WIDTH", "FINAL_WEIGHT"), ",", """); totalScheme.setNumSinkParts(1); // make sure we only get one file Tap totalTap = new Hfs(totalScheme, args[1] + "/total", SinkMode.REPLACE); FlowConnector flowConnector = new FlowConnector(properties); CascadeConnector cascadeConnector = new CascadeConnector(); Map<String, Tap> outputs = new HashMap<String, Tap>(); outputs.put(totalPipe.getName(), totalTap); outputs.put(mainPipe.getName(), colorTap); Flow flow = flowConnector.connect(sourceTap, outputs, totalPipe, mainPipe); Cascade cascade = cascadeConnector.connect(flow); cascade.complete(); // Finally run everything } } ======================================= package org.ajug;
  • 3.
    import cascading.flow.FlowProcess; import cascading.operation.Function; import cascading.operation.FunctionCall; import cascading.tuple.Fields; import cascading.tuple.Tuple; import java.io.Serializable; public class Parser extends cascading.operation.BaseOperation implements Serializable, Function { public Parser() { super(new Fields("ONE","COLOR", "WIDTH", "WEIGHT")); // should be constants file ;) } public void operate(FlowProcess a_flow, FunctionCall a_call) { String sourceData = a_call.getArguments().getString(0); sourceData = sourceData.trim(); if (sourceData == null || sourceData.length() == 0) { return; // blank line read from the source file, so ignore it } String values[] = sourceData.split(","); Tuple output = new Tuple(); output.add("1"); output.add(values[0]); output.add(values[1]); output.add(values[2]); a_call.getOutputCollector().add(output); } } ============================================== package org.ajug; import cascading.tuple.Tuple; import java.io.Serializable; public class ColorData implements Serializable { private long m_num = 0; private double m_width = 0; private double m_weight = 0; public void reset(){ m_num = 0; m_width = 0; m_weight = 0; }
  • 4.
    public void addData(double a_weight, double a_width){ m_weight += a_weight; m_width+=a_width; m_num++; } public Tuple getTuple() { if (m_num == 0) return null; Tuple rtnValue = new Tuple(); rtnValue.add(m_width/m_num); rtnValue.add(m_weight/m_num); return rtnValue; } } =============================================== package org.ajug; import cascading.tuple.Tuple; import java.io.Serializable; public class TotalData implements Serializable { private long m_num = 0; private double m_width = 0; private double m_weight = 0; public void reset(){ m_num = 0; m_width = 0; m_weight = 0; } public void addData(double a_weight, double a_width){ m_weight += a_weight; m_width+=a_width; m_num++; } public Tuple getTuple() { if (m_num == 0) return null; Tuple rtnValue = new Tuple(); rtnValue.add(m_width/m_num); rtnValue.add(m_weight/m_num); return rtnValue; } } ================================================== package org.ajug;
  • 5.
    import cascading.flow.FlowProcess; import cascading.operation.Aggregator; import cascading.operation.AggregatorCall; import cascading.operation.BaseOperation; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; import org.apache.log4j.Logger; public class TotalAggregator extends BaseOperation<TotalData> implements Aggregator<TotalData> { static Logger m_logger = Logger.getLogger(TotalAggregator.class.getName()); private TotalData m_row; public TotalAggregator(TotalData a_row) { super(new Fields("FINAL_WIDTH", "FINAL_WEIGHT")); m_row = a_row; } public void start(FlowProcess flowProcess, AggregatorCall<TotalData> aggregatorCall) { // set the context object m_row.reset(); aggregatorCall.setContext(m_row); } public void complete(FlowProcess flowProcess, AggregatorCall<TotalData> aggregatorCall) { TotalData context = aggregatorCall.getContext(); Tuple results = context.getTuple(); if (results == null) return; // Nothing there to report aggregatorCall.getOutputCollector().add(results); } public void aggregate(FlowProcess flowProcess, AggregatorCall<TotalData> aggregatorCall) { TupleEntry arguments = aggregatorCall.getArguments(); TotalData context = aggregatorCall.getContext(); double weight = arguments.getDouble("WEIGHT"); double width = arguments.getDouble("WIDTH"); context.addData(weight, width); } } ========================================= package org.ajug; import cascading.flow.FlowProcess; import cascading.operation.Aggregator; import cascading.operation.AggregatorCall;
  • 6.
    import cascading.operation.BaseOperation; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; import org.apache.log4j.Logger; public class ColorAggregator extends BaseOperation<ColorData> implements Aggregator<ColorData> { static Logger m_logger = Logger.getLogger(ColorAggregator.class.getName()); private ColorData m_row; public ColorAggregator(ColorData a_row) { super(new Fields("AVG_WIDTH", "AVG_WEIGHT")); m_row = a_row; } public void start(FlowProcess flowProcess, AggregatorCall<ColorData> aggregatorCall) { // set the context object m_row.reset(); aggregatorCall.setContext(m_row); } public void complete(FlowProcess flowProcess, AggregatorCall<ColorData> aggregatorCall) { ColorData context = aggregatorCall.getContext(); Tuple results = context.getTuple(); if (results == null) return; // Nothing there to report aggregatorCall.getOutputCollector().add(results); } public void aggregate(FlowProcess flowProcess, AggregatorCall<ColorData> aggregatorCall) { TupleEntry arguments = aggregatorCall.getArguments(); ColorData context = aggregatorCall.getContext(); double weight = arguments.getDouble("WEIGHT"); double width = arguments.getDouble("WIDTH"); context.addData(weight, width); } }