package cascadingtutorial.wordcount;

/**
 * Wordcount example in Cascading.
 *
 * <p>Reads lines from the input path, splits each line into words, counts the
 * occurrences of every word, and writes "count word" pairs to the output path.
 * Paths that look like URIs (e.g. {@code hdfs://...}) are accessed via
 * {@code Hfs}; bare paths use the local filesystem via {@code Lfs}.
 *
 * <p>Usage: {@code Main <inputPath> <outputPath>}
 */
public class Main
{
  public static void main( String[] args )
  {
    String inputPath = args[0];
    String outputPath = args[1];

    // TextLine yields one tuple per input line: ("offset", "line").
    Scheme inputScheme = new TextLine(new Fields("offset", "line"));
    Scheme outputScheme = new TextLine();

    // URI-style paths (scheme://...) go to HDFS; everything else is local.
    Tap sourceTap = inputPath.matches("^[^:]+://.*") ?
        new Hfs(inputScheme, inputPath) :
        new Lfs(inputScheme, inputPath);
    Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
        new Hfs(outputScheme, outputPath) :
        new Lfs(outputScheme, outputPath);

    // Split each line on whitespace into individual "word" tuples.
    // The regex must be "\\s+" (whitespace); a bare "s+" would split on
    // the letter 's'.
    Pipe wcPipe = new Each("wordcount",
                           new Fields("line"),
                           new RegexSplitGenerator(new Fields("word"), "\\s+"),
                           new Fields("word"));
    wcPipe = new GroupBy(wcPipe, new Fields("word"));
    wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));

    Properties properties = new Properties();
    // Tell Hadoop which jar to ship to the cluster for this job.
    FlowConnector.setApplicationJarClass(properties, Main.class);

    Flow parsedLogFlow = new FlowConnector(properties)
        .connect(sourceTap, sinkTap, wcPipe);
    parsedLogFlow.start();
    parsedLogFlow.complete();  // block until the flow finishes
  }
}
package cascadingtutorial.wordcount;

/**
 * Wordcount example in Cascading.
 *
 * <p>Reads lines from the input path, splits each line into words, counts the
 * occurrences of every word, and writes "count word" pairs to the output path.
 * Paths that look like URIs (e.g. {@code hdfs://...}) are accessed via
 * {@code Hfs}; bare paths use the local filesystem via {@code Lfs}.
 *
 * <p>Usage: {@code Main <inputPath> <outputPath>}
 */
public class Main
{
  public static void main( String[] args )
  {
    String inputPath = args[0];
    String outputPath = args[1];

    // TextLine yields one tuple per input line: ("offset", "line").
    Scheme inputScheme = new TextLine(new Fields("offset", "line"));
    Scheme outputScheme = new TextLine();

    // URI-style paths (scheme://...) go to HDFS; everything else is local.
    Tap sourceTap = inputPath.matches("^[^:]+://.*") ?
        new Hfs(inputScheme, inputPath) :
        new Lfs(inputScheme, inputPath);
    Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
        new Hfs(outputScheme, outputPath) :
        new Lfs(outputScheme, outputPath);

    // Split each line on whitespace into individual "word" tuples.
    // The regex must be "\\s+" (whitespace); a bare "s+" would split on
    // the letter 's'.
    Pipe wcPipe = new Each("wordcount",
                           new Fields("line"),
                           new RegexSplitGenerator(new Fields("word"), "\\s+"),
                           new Fields("word"));
    wcPipe = new GroupBy(wcPipe, new Fields("word"));
    wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));

    Properties properties = new Properties();
    // Tell Hadoop which jar to ship to the cluster for this job.
    FlowConnector.setApplicationJarClass(properties, Main.class);

    Flow parsedLogFlow = new FlowConnector(properties)
        .connect(sourceTap, sinkTap, wcPipe);
    parsedLogFlow.start();
    parsedLogFlow.complete();  // block until the flow finishes
  }
}
package cascadingtutorial.wordcount;

/**
 * Wordcount example in Cascading.
 *
 * <p>Reads lines from the input path, splits each line into words, counts the
 * occurrences of every word, and writes "count word" pairs to the output path.
 * Paths that look like URIs (e.g. {@code hdfs://...}) are accessed via
 * {@code Hfs}; bare paths use the local filesystem via {@code Lfs}.
 *
 * <p>Usage: {@code Main <inputPath> <outputPath>}
 */
public class Main
{
  public static void main( String[] args )
  {
    String inputPath = args[0];
    String outputPath = args[1];

    // TextLine yields one tuple per input line: ("offset", "line").
    Scheme inputScheme = new TextLine(new Fields("offset", "line"));
    Scheme outputScheme = new TextLine();

    // URI-style paths (scheme://...) go to HDFS; everything else is local.
    Tap sourceTap = inputPath.matches("^[^:]+://.*") ?
        new Hfs(inputScheme, inputPath) :
        new Lfs(inputScheme, inputPath);
    Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
        new Hfs(outputScheme, outputPath) :
        new Lfs(outputScheme, outputPath);

    // Split each line on whitespace into individual "word" tuples.
    // The regex must be "\\s+" (whitespace); a bare "s+" would split on
    // the letter 's'.
    Pipe wcPipe = new Each("wordcount",
                           new Fields("line"),
                           new RegexSplitGenerator(new Fields("word"), "\\s+"),
                           new Fields("word"));
    wcPipe = new GroupBy(wcPipe, new Fields("word"));
    wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));

    Properties properties = new Properties();
    // Tell Hadoop which jar to ship to the cluster for this job.
    FlowConnector.setApplicationJarClass(properties, Main.class);

    Flow parsedLogFlow = new FlowConnector(properties)
        .connect(sourceTap, sinkTap, wcPipe);
    parsedLogFlow.start();
    parsedLogFlow.complete();  // block until the flow finishes
  }
}
43. import cascading.tap.Tap;
import cascading.tuple.Fields;
import java.util.Properties;
/**
* Wordcount example in Cascading
*/
public class Main
{
public static void main( String[] args )
{
Scheme inputScheme = new TextLine(new Fields("offset", "line"));
Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
44. import cascading.tap.Tap;
import cascading.tuple.Fields;
import java.util.Properties;
/**
* Wordcount example in Cascading
*/
public class Main
{
public static void main( String[] args )
{
Scheme inputScheme = new TextLine(new Fields("offset", "line"));
Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
45. import cascading.tap.Tap;
import cascading.tuple.Fields;
import java.util.Properties;
/**
* Wordcount example in Cascading
*/
public class Main
{
public static void main( String[] args )
{
Scheme inputScheme = new TextLine(new Fields("offset", "line"));
Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
TextLine()
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
46. import cascading.tap.Tap;
import cascading.tuple.Fields;
import java.util.Properties;
/**
* Wordcount example in Cascading
*/
public class Main
{
public static void main( String[] args )
{
Scheme inputScheme = new TextLine(new Fields("offset", "line"));
Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
TextLine()
Tap sinkTap
SequenceFile()
= outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
47. import cascading.tap.Tap;
import cascading.tuple.Fields;
import java.util.Properties;
/**
* Wordcount example in Cascading
*/
public class Main
{
public static void main( String[] args )
{
Scheme inputScheme = new TextLine(new Fields("offset", "line"));
Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
48. /**
* Wordcount example in Cascading
*/
public class Main
{
public static void main( String[] args )
{
Scheme inputScheme = new TextLine(new Fields("offset", "line"));
Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
49. /**
* Wordcount example in Cascading
*/
public class Main
{  // slide callout — example HDFS input path: hdfs://master0:54310/user/nmurray/data.txt
public static void main( String[] args )
{
Scheme inputScheme = new TextLine(new Fields("offset", "line"));
Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
50. /**
* Wordcount example in Cascading
*/
public class Main
{  // slide callout — example HDFS input path (handled by new Hfs()): hdfs://master0:54310/user/nmurray/data.txt
public static void main( String[] args )
{
Scheme inputScheme = new TextLine(new Fields("offset", "line"));
Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
51. /**
* Wordcount example in Cascading
*/
public class Main
{ hdfs://master0:54310/user/nmurray/data.txt new Hfs()
data/sources/obama-inaugural-address.txt
public static void main( String[] args )
{
Scheme inputScheme = new TextLine(new Fields("offset", "line"));
Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
52. /**
* Wordcount example in Cascading
*/
public class Main
{ hdfs://master0:54310/user/nmurray/data.txt new Hfs()
data/sources/obama-inaugural-address.txt
public static void main( String[] args )
new Lfs()
{
Scheme inputScheme = new TextLine(new Fields("offset", "line"));
Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
53. /**
* Wordcount example in Cascading
*/
public class Main
{ hdfs://master0:54310/user/nmurray/data.txt new Hfs()
data/sources/obama-inaugural-address.txt
public static void main( String[] args )
new Lfs()
{
Scheme inputScheme = new TextLine(new Fields("offset", "line"));
Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
S3fs()
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
54. /**
* Wordcount example in Cascading
*/
public class Main
{ hdfs://master0:54310/user/nmurray/data.txt new Hfs()
data/sources/obama-inaugural-address.txt
public static void main( String[] args )
new Lfs()
{
Scheme inputScheme = new TextLine(new Fields("offset", "line"));
Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
S3fs()
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
GlobHfs() new Fields("count", "word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(),
55. /**
* Wordcount example in Cascading
*/
public class Main
{
public static void main( String[] args )
{
Scheme inputScheme = new TextLine(new Fields("offset", "line"));
Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
56. Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
Properties properties = new Properties();
FlowConnector.setApplicationJarClass(properties, Main.class);
Flow parsedLogFlow = new FlowConnector(properties)
.connect(sourceTap, sinkTap, wcPipe);
parsedLogFlow.start();
parsedLogFlow.complete();
}
}
57. Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
// slide heading: Pipe Assembly
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
Properties properties = new Properties();
FlowConnector.setApplicationJarClass(properties, Main.class);
Flow parsedLogFlow = new FlowConnector(properties)
.connect(sourceTap, sinkTap, wcPipe);
parsedLogFlow.start();
parsedLogFlow.complete();
}
}
81. Each
new Each(previousPipe,
argumentSelector,
operation,
outputSelector)
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
82. Each
new Each(previousPipe,
argumentSelector,
operation,
outputSelector)
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
83. Each
new Each(previousPipe,
argumentSelector,
operation,
outputSelector)
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
84. import cascading.tap.Tap;
import cascading.tuple.Fields;
import java.util.Properties;
/**
* Wordcount example in Cascading
*/
public class Main
{
public static void main( String[] args )
{
Scheme inputScheme = new TextLine(new Fields("offset", "line"));
Scheme outputScheme = new TextLine();
String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe = new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
85. Each
new Each(previousPipe,
argumentSelector,
operation,
outputSelector)
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
86. Each
new Each(previousPipe,
argumentSelector,
operation,
outputSelector)
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
87. Each
new Each(previousPipe,
argumentSelector,
operation,
outputSelector)
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
88. Each
new Each(previousPipe,
argumentSelector,
operation,
outputSelector)
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
Fields()
89. Each
offset line
0 the lazy brown fox
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
90. Each
offset line
0 the lazy brown fox
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
91. Each
X
offset
0
line
the lazy brown fox
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
92. Each
X offset
0
line
the lazy brown fox
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
word word word word
the lazy brown fox
106. Each
new Each(previousPipe,
argumentSelector,
operation,
outputSelector)
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
107. Each
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
108. String inputPath = args[0];
String outputPath = args[1];
Tap sourceTap = inputPath.matches( "^[^:]+://.*") ?
new Hfs(inputScheme, inputPath) :
new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
Properties properties = new Properties();
FlowConnector.setApplicationJarClass(properties, Main.class);
Flow parsedLogFlow = new FlowConnector(properties)
.connect(sourceTap, sinkTap, wcPipe);
parsedLogFlow.start();
parsedLogFlow.complete();
}
}
109. new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
Properties properties = new Properties();
FlowConnector.setApplicationJarClass(properties, Main.class);
Flow parsedLogFlow = new FlowConnector(properties)
.connect(sourceTap, sinkTap, wcPipe);
parsedLogFlow.start();
parsedLogFlow.complete();
}
}
110. new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
Properties properties = new Properties();
FlowConnector.setApplicationJarClass(properties, Main.class);
Flow parsedLogFlow = new FlowConnector(properties)
.connect(sourceTap, sinkTap, wcPipe);
parsedLogFlow.start();
parsedLogFlow.complete();
}
}
111. new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
Properties properties = new Properties();
FlowConnector.setApplicationJarClass(properties, Main.class);
Flow parsedLogFlow = new FlowConnector(properties)
.connect(sourceTap, sinkTap, wcPipe);
parsedLogFlow.start();
parsedLogFlow.complete();
}
}
122. Every
new Every(previousPipe,
argumentSelector,
operation,
outputSelector)
new Every(wcPipe, new Count(), new Fields("count", "word"));
123. Every
new Every(previousPipe,
argumentSelector,
operation,
outputSelector)
new Every(wcPipe, new Count(), new Fields("count", "word"));
new Every(wcPipe, Fields.ALL, new Count(), new Fields("count", "word"));
126. Predefined Field Sets
Fields.ALL all available fields
Fields.GROUP fields used for last grouping
Fields.VALUES fields not used for last grouping
fields of argument Tuple
Fields.ARGS
(for Operations)
replaces input with Operation result
Fields.RESULTS
(for Pipes)
127. Predefined Field Sets
Fields.ALL all available fields
Fields.GROUP fields used for last grouping
Fields.VALUES fields not used for last grouping
fields of argument Tuple
Fields.ARGS
(for Operations)
replaces input with Operation result
Fields.RESULTS
(for Pipes)
128. Predefined Field Sets
Fields.ALL all available fields
Fields.GROUP fields used for last grouping
Fields.VALUES fields not used for last grouping
fields of argument Tuple
Fields.ARGS
(for Operations)
replaces input with Operation result
Fields.RESULTS
(for Pipes)
129. Predefined Field Sets
Fields.ALL all available fields
Fields.GROUP fields used for last grouping
Fields.VALUES fields not used for last grouping
fields of argument Tuple
Fields.ARGS
(for Operations)
replaces input with Operation result
Fields.RESULTS
(for Pipes)
130. Predefined Field Sets
Fields.ALL all available fields
Fields.GROUP fields used for last grouping
Fields.VALUES fields not used for last grouping
fields of argument Tuple
Fields.ARGS
(for Operations)
replaces input with Operation result
Fields.RESULTS
(for Pipes)
131. Field Selection
new Each(previousPipe,
argumentSelector,
operation,
outputSelector)
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("timestamp"));
133. Field Selection
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
134. Field Selection
Original Tuple
visitor_ search_ time page_
id
id term stamp number
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
135. Field Selection
Original Tuple Argument Selector
visitor_ search_ time page_
id timestamp
id term stamp number
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
136. Field Selection
Original Tuple Argument Selector
visitor_ search_ time page_
id timestamp
id term stamp number
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
137. Field Selection
Original Tuple Argument Selector
visitor_ search_ time page_
id timestamp
id term stamp number
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
Input to DateParser will be:
timestamp
139. Field Selection
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
140. Field Selection
Original Tuple
visitor_ search_ time page_
id
id term stamp number
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
141. Field Selection
Original Tuple
id
visitor_
id
search_
term
time
stamp
page_
number ⊕
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
142. Field Selection
Original Tuple DateParser Output
id
visitor_
id
search_
term
time
stamp
page_
number ⊕ ts
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
143. Field Selection
Original Tuple DateParser Output Output Selector
id
visitor_
id
search_
term
time
stamp
page_
number ⊕ ts ts
search_
term
visitor_
id
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
144. Field Selection
Original Tuple DateParser Output Output Selector
id
visitor_
id
search_
term
time
stamp
page_
number ⊕ ts ts
search_
term
visitor_
id
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
145. Field Selection
Original Tuple DateParser Output Output Selector
id
visitor_
id
search_
term
time
stamp
page_
number ⊕ ts ts
search_
term
visitor_
id
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
146. Field Selection
Original Tuple DateParser Output Output Selector
id
visitor_
id
search_
term
time
stamp
page_
number ⊕ ts ts
search_
term
visitor_
id
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
Output of Each will be:
search_ visitor_
ts
term id
147. Field Selection
Original Tuple DateParser Output Output Selector
id
visitor_
id
search_
term
time
stamp
page_
number ⊕ ts ts
search_
term
visitor_
id
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
Output of Each will be:
search_ visitor_
ts
term id
148. Field Selection
Original Tuple DateParser Output Output Selector
id
visitor_
id
search_
term
time
stamp
page_
number ⊕ ts ts
search_
term
visitor_
id
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
Output of Each will be:
search_ visitor_
ts
term id
149. Field Selection
Original Tuple DateParser Output Output Selector
id
visitor_
id
search_
term
time
stamp
page_
number ⊕ ts ts
search_
term
visitor_
id
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
Output of Each will be:
search_ visitor_
ts
term id
150. Field Selection
Original Tuple DateParser Output Output Selector
X
id
visitor_
id
search_
term
XX ⊕
time
stamp
page_
number ts ts
search_
term
visitor_
id
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
new Fields("ts", "search_term", "visitor_id"));
Output of Each will be:
search_ visitor_
ts
term id
151. word
hello
word
hello
word
world
new Every(wcPipe, new Count(), new Fields("count", "word"));
152. word
hello
word
hello
word
world
new Every(wcPipe, new Count(), new Fields("count", "word"));
count word
2 hello
count word
1 world
153. new Lfs(inputScheme, inputPath);
Tap sinkTap = outputPath.matches("^[^:]+://.*") ?
new Hfs(outputScheme, outputPath) :
new Lfs(outputScheme, outputPath);
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
Properties properties = new Properties();
FlowConnector.setApplicationJarClass(properties, Main.class);
Flow parsedLogFlow = new FlowConnector(properties)
.connect(sourceTap, sinkTap, wcPipe);
parsedLogFlow.start();
parsedLogFlow.complete();
}
}
154. new Lfs(outputScheme, outputPath);
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
Properties properties = new Properties();
FlowConnector.setApplicationJarClass(properties, Main.class);
Flow parsedLogFlow = new FlowConnector(properties)
.connect(sourceTap, sinkTap, wcPipe);
parsedLogFlow.start();
parsedLogFlow.complete();
}
}
155. new Lfs(outputScheme, outputPath);
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
Properties properties = new Properties();
FlowConnector.setApplicationJarClass(properties, Main.class);
Flow parsedLogFlow = new FlowConnector(properties)
.connect(sourceTap, sinkTap, wcPipe);
parsedLogFlow.start();
parsedLogFlow.complete();
}
}
156. new Lfs(outputScheme, outputPath);
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
Properties properties = new Properties();
FlowConnector.setApplicationJarClass(properties, Main.class);
Flow parsedLogFlow = new FlowConnector(properties)
.connect(sourceTap, sinkTap, wcPipe);
parsedLogFlow.start();
parsedLogFlow.complete();
}
}
157. new Lfs(outputScheme, outputPath);
Pipe wcPipe =
new Each("wordcount",
new Fields("line"),
new RegexSplitGenerator(new Fields("word"), "s+"),
new Fields("word"));
wcPipe = new GroupBy(wcPipe, new Fields("word"));
wcPipe = new Every(wcPipe, new Count(), new Fields("count", "word"));
Properties properties = new Properties();
FlowConnector.setApplicationJarClass(properties, Main.class);
Flow parsedLogFlow = new FlowConnector(properties)
.connect(sourceTap, sinkTap, wcPipe);
parsedLogFlow.start();
parsedLogFlow.complete();
}
}
159. input:
mary had a little lamb
little lamb
little lamb
mary had a little lamb
whose fleece was white as snow
160. input:
mary had a little lamb
little lamb
little lamb
mary had a little lamb
whose fleece was white as snow
command:
hadoop jar ./target/cascading-tutorial-1.0.0.jar
data/sources/misc/mary.txt data/output
161. output:
2 a
1 as
1 fleece
2 had
4 lamb
4 little
2 mary
1 snow
1 was
1 white
1 whose
166. Cookbook
// remove all tuples where search_term is '-'
pipe = new Each(pipe,
new Fields("search_term"),
new RegexFilter("^(-)$", true));
167. Cookbook
// convert "timestamp" String field to "ts" long field
pipe = new Each(pipe,
new Fields("timestamp"),
new DateParser("yyyy-MM-dd HH:mm:ss"),
Fields.ALL);
168. Cookbook
// td - timestamp of day-level granularity
pipe = new Each(pipe,
new Fields("ts"),
new ExpressionFunction(
new Fields("td"),
"ts - (ts % (24 * 60 * 60 * 1000))",
long.class),
Fields.ALL);
169. Cookbook
// SELECT DISTINCT visitor_id
// GROUP BY visitor_id ORDER BY ts
pipe = new GroupBy(pipe,
new Fields("visitor_id"),
new Fields("ts"));
// take the First() tuple of every grouping
pipe = new Every(pipe,
Fields.ALL,
new First(),
Fields.RESULTS);
171. Desired Operation
Pipe pipe = new Each("ngram pipe",
new Fields("line"),
new NGramTokenizer(2),
new Fields("ngram"));
172. Desired Operation
line
mary had a little lamb
Pipe pipe = new Each("ngram pipe",
new Fields("line"),
new NGramTokenizer(2),
new Fields("ngram"));
173. Desired Operation
line
mary had a little lamb
Pipe pipe = new Each("ngram pipe",
new Fields("line"),
new NGramTokenizer(2),
new Fields("ngram"));
ngram
mary had
174. Desired Operation
line
mary had a little lamb
Pipe pipe = new Each("ngram pipe",
new Fields("line"),
new NGramTokenizer(2),
new Fields("ngram"));
ngram ngram
mary had had a
175. Desired Operation
line
mary had a little lamb
Pipe pipe = new Each("ngram pipe",
new Fields("line"),
new NGramTokenizer(2),
new Fields("ngram"));
ngram ngram ngram
mary had had a a little
176. Desired Operation
line
mary had a little lamb
Pipe pipe = new Each("ngram pipe",
new Fields("line"),
new NGramTokenizer(2),
new Fields("ngram"));
ngram ngram ngram ngram
mary had had a a little little lamb
191. Tuple 123 vis-abc tacos 2009-10-08
03:21:23 1
Fields id
visitor_ search_
id term
time
stamp
page_
number
list of strings representing
field names
192. Tuple 123 vis-abc tacos 2009-10-08
03:21:23 1
Fields id
visitor_ search_
id term
time
stamp
page_
number
193. TupleEntry
visitor_ search_ time page_
id
id term stamp number
123 vis-abc tacos 2009-10-08
03:21:23 1
194. public class NGramTokenizer extends BaseOperation implements Function
{
// n-gram length (e.g. 2 for bigrams), fixed at construction time
private int size;
public NGramTokenizer(int size)
{
// declare that this Function emits a single output field named "ngram"
super(new Fields("ngram"));
this.size = size;
}
// Called once per input tuple: reads the first argument as a line of text
// and emits one "ngram" tuple for every n-gram produced by the tokenizer.
public void operate(FlowProcess flowProcess, FunctionCall functionCall)
{
String token;
TupleEntry arguments = functionCall.getArguments();
// delegate the actual tokenization to the reusable utility class
com.attinteractive.util.NGramTokenizer t =
new com.attinteractive.util.NGramTokenizer(arguments.getString(0), this.size);
while((token = t.next()) != null) {
TupleEntry result = new TupleEntry(new Fields("ngram"), new Tuple(token));
functionCall.getOutputCollector().add(result);
}
}
}
195. public class NGramTokenizer extends BaseOperation implements Function
{
// n-gram length (e.g. 2 for bigrams), fixed at construction time
private int size;
public NGramTokenizer(int size)
{
// declare that this Function emits a single output field named "ngram"
super(new Fields("ngram"));
this.size = size;
}
// Called once per input tuple: reads the first argument as a line of text
// and emits one "ngram" tuple for every n-gram produced by the tokenizer.
public void operate(FlowProcess flowProcess, FunctionCall functionCall)
{
String token;
TupleEntry arguments = functionCall.getArguments();
// delegate the actual tokenization to the reusable utility class
com.attinteractive.util.NGramTokenizer t =
new com.attinteractive.util.NGramTokenizer(arguments.getString(0), this.size);
while((token = t.next()) != null) {
TupleEntry result = new TupleEntry(new Fields("ngram"), new Tuple(token));
functionCall.getOutputCollector().add(result);
}
}
}
196. public class NGramTokenizer extends BaseOperation implements Function
{
// n-gram length (e.g. 2 for bigrams), fixed at construction time
private int size;
public NGramTokenizer(int size)
{
// declare that this Function emits a single output field named "ngram"
super(new Fields("ngram"));
this.size = size;
}
// Called once per input tuple: reads the first argument as a line of text
// and emits one "ngram" tuple for every n-gram produced by the tokenizer.
public void operate(FlowProcess flowProcess, FunctionCall functionCall)
{
String token;
TupleEntry arguments = functionCall.getArguments();
// delegate the actual tokenization to the reusable utility class
com.attinteractive.util.NGramTokenizer t =
new com.attinteractive.util.NGramTokenizer(arguments.getString(0), this.size);
while((token = t.next()) != null) {
TupleEntry result = new TupleEntry(new Fields("ngram"), new Tuple(token));
functionCall.getOutputCollector().add(result);
}
}
}
197. Success
line
mary had a little lamb
Pipe pipe = new Each("ngram pipe",
new Fields("line"),
new NGramTokenizer(2),
new Fields("ngram"));
ngram ngram ngram ngram
mary had had a a little little lamb