2. Demo
import spark.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql._
// Schema for the retail dataset, derived from a case class
case class Data(InvoiceNo: String, StockCode: String, Description: String, Quantity: Long, InvoiceDate: String, UnitPrice: Double, CustomerID: String, Country: String)
val schema = Encoders.product[Data].schema
// Load the CSV, drop rows without a CustomerID, then deduplicate
val df = spark.read.option("header", true).schema(schema).csv("./data.csv")
val clean = df.na.drop(Seq("CustomerID")).dropDuplicates()
// Per-line amounts and flags: StockCode "D" = discount line, "P" = postage line;
// a leading "C" in InvoiceNo marks a cancelled invoice
val data = clean
  .withColumn("total", when($"StockCode" =!= "D", $"UnitPrice" * $"Quantity").otherwise(0))
  .withColumn("Discount", when($"StockCode" === "D", $"UnitPrice" * $"Quantity").otherwise(0))
  .withColumn("Postage", when($"StockCode" === "P", 1).otherwise(0))
  .withColumn("Invoice", regexp_replace($"InvoiceNo", "^C", ""))
  .withColumn("Cancelled", when(substring($"InvoiceNo", 1, 1) === "C", 1).otherwise(0))
// Roll line items up to invoice level
val aggregated = data.groupBy($"Invoice", $"Country", $"CustomerID")
  .agg(sum($"Discount").as("Discount"), sum($"total").as("Total"), max($"Cancelled").as("Cancelled"))
// Roll invoices up to customer level
val customers = aggregated.groupBy($"CustomerID")
  .agg(sum($"Total").as("Total"), sum($"Discount").as("Discount"), sum($"Cancelled").as("Cancelled"), count($"Invoice").as("Invoices"))
import org.apache.spark.ml.feature.VectorAssembler
// Assemble the customer-level measures into one feature vector
val assembler = new VectorAssembler()
  .setInputCols(Array("Total", "Discount", "Cancelled", "Invoices"))
  .setOutputCol("features")
val features = assembler.transform(customers)
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator
// 70/30 train/test split
val Array(train, test) = features.randomSplit(Array(0.7, 0.3))
// Cluster customers into 12 groups on the assembled features
val kmeans = new KMeans().setK(12).setFeaturesCol("features").setPredictionCol("prediction")
val model = kmeans.fit(train)
model.clusterCenters.foreach(println)
val predictions = model.transform(test)
predictions.groupBy($"prediction").count().show()
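ClusteringEvaluator is imported above but never used; a minimal sketch of how it could score these clusters (silhouette with squared Euclidean distance is the evaluator's default metric) might look like this:
// Score the test-set clustering; a silhouette closer to 1 is better
val evaluator = new ClusteringEvaluator()
  .setFeaturesCol("features")
  .setPredictionCol("prediction")
val silhouette = evaluator.evaluate(predictions)
println(s"Silhouette = $silhouette")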
4. 2004
MapReduce: Simplified Data Processing on Large Clusters
Jeffrey Dean and Sanjay Ghemawat
Google, Inc.
https://research.google.com/archive/mapreduce-osdi04-slides/index.html
6. • Re-execute on fail
• Skip bad records
• Redundant execution (copies of tasks)
• Data locality optimization
• Combiners (map-side reduce, see the sketch after this list)
• Compression of data
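These bullets describe Hadoop MapReduce, but map-side combining has a direct analog in Spark's RDD API: reduceByKey merges values within each partition before shuffling, which is exactly what a combiner does. A minimal word-count sketch (the input path is a hypothetical assumption):
// Word count where reduceByKey acts as a map-side combiner:
// partial sums are computed per partition before the shuffle
val counts = spark.sparkContext
  .textFile("hdfs:///tmp/input.txt") // hypothetical path
  .flatMap(_.split("\\s+"))
  .map(word => (word, 1))
  .reduceByKey(_ + _) // combines locally, then across partitions
counts.take(10).foreach(println)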
19. Streaming challenges
Watermarks describe event-time progress.
Events earlier than the watermark are ignored.
Tuning is a trade-off: too slow a watermark adds delay, too fast loses more late events (sketch below).
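A minimal Structured Streaming sketch of a watermark; the rate source, column name, and thresholds are illustrative assumptions, not from the slides:
import org.apache.spark.sql.functions.window
// Illustrative source: a rate stream, treating its timestamp as event time
val events = spark.readStream.format("rate").load()
  .withColumnRenamed("timestamp", "eventTime")
// The watermark lets Spark drop state for windows more than
// 10 minutes behind the latest event time it has seen
val windowed = events
  .withWatermark("eventTime", "10 minutes")
  .groupBy(window($"eventTime", "5 minutes"))
  .count()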
27. Spark
• Lots of things out of the box
• Batch (RDD, DataFrames, DataSets)
• Streaming
• Structured Streaming (unifies batch and streaming, see the sketch after this list)
• Graph
• (“Classic”) ML
• Runs on Hadoop, Mesos, Kubernetes
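To make the unification point concrete, here is a sketch of the same transformation applied to a batch read and a streaming read. It reuses the demo's schema; the streaming input directory is a hypothetical assumption.
import org.apache.spark.sql.DataFrame
// One transformation, written once
def topCountries(df: DataFrame): DataFrame = df.groupBy($"Country").count()
// Batch: the CSV file from the demo
val batchCounts = topCountries(spark.read.option("header", true).schema(schema).csv("./data.csv"))
// Streaming: the same code over a (hypothetical) directory of incoming CSVs
val streamCounts = topCountries(spark.readStream.option("header", true).schema(schema).csv("./incoming/"))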
28. Lots of extensions
• Spark NLP - John Snow Labs
• Spark Deep Learning - Databricks, Intel (BigDL), Deeplearning4j, H2O
• Connectors to any DB that respects itself
• (Hades is WIP)
29. Multiple languages
• Scala, Java, R, Python, .NET (just released)
• Scala is currently the favorite
• Python is taking center stage
Editor's Notes
Higher abstraction
More like a database table than an array
Adds Optimizers
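These notes refer to DataFrames; a small sketch of what "adds optimizers" buys, reusing the demo's customers DataFrame (the filter here is illustrative): explain(true) prints the parsed, analyzed, optimized, and physical plans, where Catalyst's rewrites are visible.
// Catalyst optimizes the logical plan (e.g. pruning unused columns
// and applying the filter early) before picking a physical plan
customers
  .select($"CustomerID", $"Total")
  .filter($"Total" > 1000)
  .explain(true)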