1. HackReduce
MapReduce Intro
Hopper.com (Greg Lu)
2. Project
github.com/hackreduce/Hackathon
Wiki
github.com/hackreduce/Hackathon/wiki
Download the Github project for some sample datasets
3. datasets/nasdaq/daily_prices/NASDAQ_daily_prices_subset.csv
}
NASDAQ,DELL,1997-08-26,83.87,84.75,82.50,82.81,48736000,10.35
NASDAQ,DITC,2002-10-24,1.56,1.69,1.53,1.60,133600,1.60
NASDAQ,DLIA,2008-01-28,1.91,2.31,1.91,2.23,760800,2.23 InputSplit 1
NASDAQ,DWCH,2002-07-10,3.09,3.14,3.09,3.14,2400,1.57
}
NASDAQ,DYNT,2008-12-29,0.31,0.31,0.29,0.30,26900,0.30
NASDAQ,DMLP,2003-10-21,17.65,17.94,17.58,17.59,4800,9.73
NASDAQ,DORM,1997-02-07,7.88,7.88,7.63,7.75,7400,3.87 InputSplit 2
NASDAQ,DXPE,2004-10-25,5.19,5.24,5.00,5.00,7600,2.50
}
NASDAQ,DEST,2009-03-17,4.55,5.03,4.55,5.03,6800,5.03
NASDAQ,DBRN,1992-01-02,8.88,9.25,8.75,8.88,84800,2.22
NASDAQ,DXYN,1998-11-25,6.38,6.44,6.19,6.25,211100,6.25 InputSplit 3
NASDAQ,DEAR,1998-12-08,10.50,11.50,10.50,10.50,5800,6.45
...
org.hackreduce.examples.stockexchange.MarketCapitalization (expanded version)
public int run(String[] args) throws Exception {
Configuration conf = getConf();
if (args.length != 2) {
System.err.println("Usage: " + getClass().getName() + " <input> <output>");
System.exit(2);
}
// Creating the MapReduce job (configuration) object
Job job = new Job(conf);
job.setJarByClass(getClass());
job.setJobName(getClass().getName());
// The Nasdaq/NYSE data dumps come in as a CSV file (text input), so we configure
// the job to use this format.
job.setInputFormatClass(TextInputFormat.class);
// Defines how the data is split and assigned to which mappers.
[...]
4. datasets/nasdaq/daily_prices/NASDAQ_daily_prices_subset.csv
datasets/nasdaq/daily_prices
}
NASDAQ,DELL,1997-08-26,83.87,84.75,82.50,82.81,48736000,10.35
NASDAQ,DITC,2002-10-24,1.56,1.69,1.53,1.60,133600,1.60
NASDAQ,DLIA,2008-01-28,1.91,2.31,1.91,2.23,760800,2.23
InputSplit 1
NASDAQ,DWCH,2002-07-10,3.09,3.14,3.09,3.14,2400,1.57
org.hackreduce.examples.stockexchange.MarketCapitalization (expanded version)
public int run(String[] args) throws Exception {
[...]
// Tell the job which Mapper and Reducer to use (classes defined above)
job.setMapperClass(MarketCapitalizationMapper.class);
job.setReducerClass(MarketCapitalizationReducer.class);
// Point the job to the custom classes that we created in order to process the data.
}
// This is what the Mapper will be outputting to the Reducer
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(DoubleWritable.class);
// This is what the Reducer will be outputting
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// Define the types of the (key, value) pairs that we'll be outputting from the
// mappers, and the types of the result of the job itself.
// Setting the input folder of the job
FileInputFormat.addInputPath(job, new Path(args[0]));
// Preparing the output folder by first deleting it if it exists
Path output = new Path(args[1]);
FileSystem.get(conf).delete(output, true);
FileOutputFormat.setOutputPath(job, output);
Now we’ll show the MarketCapitalizationMapper class
5. datasets/nasdaq/daily_prices/NASDAQ_daily_prices_subset.csv
datasets/nasdaq/daily_prices
}
NASDAQ,DELL,1997-08-26,83.87,84.75,82.50,82.81,48736000,10.35
NASDAQ,DITC,2002-10-24,1.56,1.69,1.53,1.60,133600,1.60
NASDAQ,DLIA,2008-01-28,1.91,2.31,1.91,2.23,760800,2.23
InputSplit 1
NASDAQ,DWCH,2002-07-10,3.09,3.14,3.09,3.14,2400,1.57
org.hackreduce.examples.stockexchange.MarketCapitalization (expanded version)
public static class MarketCapitalizationMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String inputString = value.toString();
String[] attributes = inputString.split(",");
if (attributes.length != 9)
throw new IllegalArgumentException("Input string given did not have 9 values in CSV format");
try {
String exchange = attributes[0];
String stockSymbol = attributes[1];
Date date = sdf.parse(attributes[2]);
double stockPriceOpen = Double.parseDouble(attributes[3]);
double stockPriceHigh = Double.parseDouble(attributes[4]);
double stockPriceLow = Double.parseDouble(attributes[5]);
double stockPriceClose = Double.parseDouble(attributes[6]);
int stockVolume = Integer.parseInt(attributes[7]);
double stockPriceAdjClose = Double.parseDouble(attributes[8]);
} catch (ParseException e) {
throw new IllegalArgumentException("Input string contained an unknown value that couldn't be parsed");
} catch (NumberFormatException e) {
throw new IllegalArgumentException("Input string contained an unknown number value that couldn't be parsed");
}
double marketCap = stockPriceClose * stockVolume;
context.write(new Text(stockSymbol), new DoubleWritable(marketCap)); } This job doesn’t do a whole lot,
}
but this is where the processing
} is occurring.
9. We can dynamically increase your clusters if
you need the processing power, but jobs are
typically bottlenecked by the code itself.
If your job takes longer than 10 minutes to
run, come see us.