The presentation of Apache Spark by Mylène Reiners during our first Eindhoven Java Meetup (see http://www.opencirclesolutions.nl/eindhoven-java-meetup/).
13. Hive example
// Import Spark SQL
import org.apache.spark.sql.hive.HiveContext;
// Or, if you can't include the Hive dependencies, fall back to the plain SQLContext
import org.apache.spark.sql.SQLContext;
// Import the JavaSchemaRDD
import org.apache.spark.sql.SchemaRDD;
import org.apache.spark.sql.Row;
(...)
// NOTE(review): "(...)" elides slide content — the SparkConf/master passed to
// JavaSparkContext is not shown here.
JavaSparkContext ctx = new JavaSparkContext(...);
// HiveContext extends SQLContext, so it can be held through the SQLContext type;
// this keeps the rest of the code working whether or not Hive support is compiled in.
SQLContext hiveCtx = new HiveContext(ctx);
14. Hive example (cont’d)
// Load the JSON input into a SchemaRDD (schema inferred from the JSON records).
SchemaRDD input = hiveCtx.jsonFile(inputFile);
// Register the input schema RDD as a temporary table so it can be queried by name
input.registerTempTable("tweets");
// Select the ten most-retweeted tweets based on retweetCount.
// Fix: the SQL string was split across two lines in the transcript, which is not
// a legal Java string literal — rejoined into a single literal here.
SchemaRDD topTweets = hiveCtx.sql(
    "SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10");
16. Example
// Create a StreamingContext with a 1-second batch size from a SparkConf
JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));
// Create a DStream from all the input on port 7777
JavaDStream<String> lines = jssc.socketTextStream("localhost", 7777);
// Keep only the batch lines that mention "error"
JavaDStream<String> errorLines = lines.filter(
    new Function<String, Boolean>() {
      @Override
      public Boolean call(String input) {
        return input.contains("error");
      }
    });
// Print out the lines with errors
errorLines.print();
17. Example
// Start our streaming context and wait for it
// to "finish"
jssc.start();
// Wait for the job to finish — awaitTermination() blocks the current thread
// until the streaming context is stopped (it does not return on its own).
jssc.awaitTermination();
19. Example
// Build the follower graph from an edge-list file
val graph = GraphLoader.edgeListFile(sc, "followers.txt")
// Run PageRank with a convergence tolerance of 0.0001 and keep the vertex ranks
val ranks = graph.pageRank(0.0001).vertices
// Turn each "id,name" line of users.txt into a (vertexId, username) pair
// so the ranks can be joined with readable usernames
val users = sc.textFile("users.txt")
  .map(_.split(","))
  .map(parts => (parts(0).toLong, parts(1)))