AJUG April 2011 Cascading example

506 views

Published on

Source code for the examples using Cascading from my April 2011 Atlanta Java Users Group presentation.

Published in: Technology
0 Comments
1 Like
Statistics
Notes
  • Be the first to comment

No Downloads
Views
Total views
506
On SlideShare
0
From Embeds
0
Number of Embeds
4
Actions
Shares
0
Downloads
5
Comments
0
Likes
1
Embeds 0
No embeds

No notes for slide

AJUG April 2011 Cascading example

  // 1. 1. (slide marker from the original transcript)
package org.ajug;

import cascading.cascade.Cascade;
import cascading.cascade.CascadeConnector;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.scheme.TextDelimited;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import java.util.Properties;

/**
 * Single-output example: reads text lines from {@code args[0]}, parses each
 * line with {@link Parser}, groups the records by {@code COLOR}, aggregates
 * every group with {@link ColorAggregator}, and writes
 * {@code COLOR,AVG_WIDTH,AVG_WEIGHT} rows to {@code args[1]}.
 */
public class Main {

    /**
     * Builds and runs the flow.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the
     *             output path (replaced if it already exists, per
     *             {@code SinkMode.REPLACE})
     */
    public static void main(String[] args) {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: Main <input path> <output path>");
            System.exit(1);
        }

        Properties properties = new Properties();
        FlowConnector.setApplicationJarClass(properties, Main.class);
        // Properties values must be Strings: an Integer stored through put()
        // is invisible to Properties.getProperty().
        properties.setProperty("mapred.reduce.tasks", "5");

        // Assembly: parse each line, group by colour, average each group.
        Pipe mainPipe = new Each("M&M", new Fields("line"), new Parser());
        mainPipe = new GroupBy(mainPipe, new Fields("COLOR"));
        mainPipe = new Every(mainPipe, Fields.ALL, new ColorAggregator(new ColorData()));

        Tap sourceTap = new Hfs(new TextLine(), args[0]);

        // Comma-delimited output, with a double-quote as the quote character.
        TextDelimited scheme =
                new TextDelimited(new Fields("COLOR", "AVG_WIDTH", "AVG_WEIGHT"), ",", "\"");
        scheme.setNumSinkParts(1); // make sure we only get one file
        Tap sinkTap = new Hfs(scheme, args[1], SinkMode.REPLACE);

        FlowConnector flowConnector = new FlowConnector(properties);
        CascadeConnector cascadeConnector = new CascadeConnector();

        Flow flow = flowConnector.connect(sourceTap, sinkTap, mainPipe);
        Cascade cascade = cascadeConnector.connect(flow);
        cascade.complete(); // Finally run everything
    }
}
// ================================================
package org.ajug;

import cascading.cascade.Cascade;
import cascading.cascade.CascadeConnector;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.pipe.*;
import cascading.scheme.TextDelimited;
  2. 2. import cascading.scheme.TextLine;import cascading.tap.Hfs;import cascading.tap.SinkMode;import cascading.tap.Tap;import cascading.tuple.Fields;import java.util.HashMap;import java.util.Map;import java.util.Properties;public class MultiOutputMain { public static void main(String[] args) { Properties properties = new Properties(); FlowConnector.setApplicationJarClass(properties, Main.class); properties.put("mapred.reduce.tasks", 5); Pipe sourcePipe = new Each("M&M", new Fields("line"), new Parser()); Pipe totalPipe = new GroupBy("Total", sourcePipe, new Fields("ONE")); totalPipe = new Every(totalPipe, Fields.ALL, new TotalAggregator(newTotalData())); Pipe mainPipe = new GroupBy("Color", sourcePipe, new Fields("COLOR")); mainPipe = new Every(mainPipe, Fields.ALL, new ColorAggregator(newColorData())); Tap sourceTap = new Hfs(new TextLine(), args[0]); TextDelimited scheme = new TextDelimited(new Fields("COLOR","AVG_WIDTH", "AVG_WEIGHT"), ",", """); scheme.setNumSinkParts(1); // make sure we only get one file Tap colorTap = new Hfs(scheme, args[1] + "/color", SinkMode.REPLACE); TextDelimited totalScheme = new TextDelimited(newFields("FINAL_WIDTH", "FINAL_WEIGHT"), ",", """); totalScheme.setNumSinkParts(1); // make sure we only get one file Tap totalTap = new Hfs(totalScheme, args[1] + "/total",SinkMode.REPLACE); FlowConnector flowConnector = new FlowConnector(properties); CascadeConnector cascadeConnector = new CascadeConnector(); Map<String, Tap> outputs = new HashMap<String, Tap>(); outputs.put(totalPipe.getName(), totalTap); outputs.put(mainPipe.getName(), colorTap); Flow flow = flowConnector.connect(sourceTap, outputs, totalPipe,mainPipe); Cascade cascade = cascadeConnector.connect(flow); cascade.complete(); // Finally run everything }}=======================================package org.ajug;
  // 3. 3. (slide marker from the original transcript)
import cascading.flow.FlowProcess;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import java.io.Serializable;

/**
 * Function that turns one comma-separated input line into a tuple with the
 * fields {@code ONE, COLOR, WIDTH, WEIGHT}. {@code ONE} is always the string
 * {@code "1"}; the other three are the first three comma-separated values of
 * the line, in that order.
 */
public class Parser extends cascading.operation.BaseOperation implements Serializable, Function {

    public Parser() {
        super(new Fields("ONE", "COLOR", "WIDTH", "WEIGHT")); // should be constants file ;)
    }

    /**
     * Parses the first argument of the call and emits one output tuple.
     * Null, empty, and whitespace-only lines emit nothing.
     *
     * @param a_flow the current flow process (unused)
     * @param a_call supplies the input line and the output collector
     * @throws IllegalArgumentException if the line has fewer than three
     *         comma-separated values
     */
    public void operate(FlowProcess a_flow, FunctionCall a_call) {
        String sourceData = a_call.getArguments().getString(0);
        // Check for null BEFORE trimming; trimming first would throw an NPE.
        if (sourceData == null) {
            return;
        }
        sourceData = sourceData.trim();
        if (sourceData.length() == 0) {
            return; // blank line read from the source file, so ignore it
        }

        String[] values = sourceData.split(",");
        if (values.length < 3) {
            // Report the offending line rather than a bare array-index error.
            throw new IllegalArgumentException(
                    "Expected at least 3 comma-separated values (color,width,weight) but got: "
                            + sourceData);
        }

        Tuple output = new Tuple();
        output.add("1");
        output.add(values[0]);
        output.add(values[1]);
        output.add(values[2]);
        a_call.getOutputCollector().add(output);
    }
}
// ==============================================
package org.ajug;

import cascading.tuple.Tuple;
import java.io.Serializable;

/**
 * Running sums of width and weight plus a record count, used as the
 * aggregation context of {@code ColorAggregator}.
 */
public class ColorData implements Serializable {
    private long m_num = 0;
    private double m_width = 0;
    private double m_weight = 0;

    /** Clears the count and both sums. */
    public void reset() {
        m_num = 0;
        m_width = 0;
        m_weight = 0;
    }

    // 4. 4. (slide marker from the original transcript)

    /**
     * Adds one record to the running sums.
     * Note the parameter order: weight first, then width.
     *
     * @param a_weight weight of the record
     * @param a_width  width of the record
     */
    public void addData(double a_weight, double a_width) {
        m_weight += a_weight;
        m_width += a_width;
        m_num++;
    }

    /**
     * @return a tuple holding (average width, average weight), or
     *         {@code null} if no record has been added since the last reset
     */
    public Tuple getTuple() {
        if (m_num == 0) {
            return null;
        }
        Tuple rtnValue = new Tuple();
        rtnValue.add(m_width / m_num);
        rtnValue.add(m_weight / m_num);
        return rtnValue;
    }
}
// ===============================================
package org.ajug;

import cascading.tuple.Tuple;
import java.io.Serializable;

/**
 * Running sums of width and weight plus a record count, used as the
 * aggregation context of {@code TotalAggregator}.
 */
public class TotalData implements Serializable {
    private long m_num = 0;
    private double m_width = 0;
    private double m_weight = 0;

    /** Clears the count and both sums. */
    public void reset() {
        m_num = 0;
        m_width = 0;
        m_weight = 0;
    }

    /**
     * Adds one record to the running sums.
     * Note the parameter order: weight first, then width.
     *
     * @param a_weight weight of the record
     * @param a_width  width of the record
     */
    public void addData(double a_weight, double a_width) {
        m_weight += a_weight;
        m_width += a_width;
        m_num++;
    }

    /**
     * @return a tuple holding (average width, average weight), or
     *         {@code null} if no record has been added since the last reset
     */
    public Tuple getTuple() {
        if (m_num == 0) {
            return null;
        }
        Tuple rtnValue = new Tuple();
        rtnValue.add(m_width / m_num);
        rtnValue.add(m_weight / m_num);
        return rtnValue;
    }
}
// ==================================================
package org.ajug;
  // 5. 5. (slide marker from the original transcript)
import cascading.flow.FlowProcess;
import cascading.operation.Aggregator;
import cascading.operation.AggregatorCall;
import cascading.operation.BaseOperation;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.apache.log4j.Logger;

/**
 * Aggregator that feeds the {@code WEIGHT} and {@code WIDTH} of every tuple
 * in a group into a {@link TotalData} and, when the group ends, emits the
 * tuple produced by {@link TotalData#getTuple()} under the declared fields
 * {@code FINAL_WIDTH, FINAL_WEIGHT}.
 */
public class TotalAggregator extends BaseOperation<TotalData> implements Aggregator<TotalData> {

    static Logger m_logger = Logger.getLogger(TotalAggregator.class.getName());

    /** The single accumulator instance handed out as the context of every group. */
    private TotalData m_row;

    /**
     * @param a_row accumulator to use as the aggregation context
     */
    public TotalAggregator(TotalData a_row) {
        super(new Fields("FINAL_WIDTH", "FINAL_WEIGHT"));
        this.m_row = a_row;
    }

    /** Group start: clear the accumulator and install it as the call context. */
    public void start(FlowProcess flowProcess, AggregatorCall<TotalData> aggregatorCall) {
        m_row.reset();
        aggregatorCall.setContext(m_row);
    }

    /** Group end: emit the accumulator's result tuple, if it has one. */
    public void complete(FlowProcess flowProcess, AggregatorCall<TotalData> aggregatorCall) {
        Tuple averages = aggregatorCall.getContext().getTuple();
        if (averages != null) {
            aggregatorCall.getOutputCollector().add(averages);
        }
        // A null tuple means there is nothing to report for this group.
    }

    /** Per tuple: add this tuple's WEIGHT and WIDTH to the accumulator. */
    public void aggregate(FlowProcess flowProcess, AggregatorCall<TotalData> aggregatorCall) {
        TupleEntry entry = aggregatorCall.getArguments();
        TotalData running = aggregatorCall.getContext();
        // addData takes weight first, then width.
        running.addData(entry.getDouble("WEIGHT"), entry.getDouble("WIDTH"));
    }
}
// =========================================
package org.ajug;

import cascading.flow.FlowProcess;
import cascading.operation.Aggregator;
import cascading.operation.AggregatorCall;
  6. 6. import cascading.operation.BaseOperation;import cascading.tuple.Fields;import cascading.tuple.Tuple;import cascading.tuple.TupleEntry;import org.apache.log4j.Logger;public class ColorAggregator extends BaseOperation<ColorData> implements Aggregator<ColorData> { static Logger m_logger =Logger.getLogger(ColorAggregator.class.getName()); private ColorData m_row; public ColorAggregator(ColorData a_row) { super(new Fields("AVG_WIDTH", "AVG_WEIGHT")); m_row = a_row; } public void start(FlowProcess flowProcess, AggregatorCall<ColorData> aggregatorCall) { // set the context object m_row.reset(); aggregatorCall.setContext(m_row); } public void complete(FlowProcess flowProcess, AggregatorCall<ColorData> aggregatorCall) { ColorData context = aggregatorCall.getContext(); Tuple results = context.getTuple(); if (results == null) return; // Nothing there to report aggregatorCall.getOutputCollector().add(results); } public void aggregate(FlowProcess flowProcess, AggregatorCall<ColorData> aggregatorCall) { TupleEntry arguments = aggregatorCall.getArguments(); ColorData context = aggregatorCall.getContext(); double weight = arguments.getDouble("WEIGHT"); double width = arguments.getDouble("WIDTH"); context.addData(weight, width); }}

×