The Rule of 10,000 Spark Jobs: Learning From Exceptions and Serializing Your Knowledge
// A custom TransmogrifAI estimator: computes min/max during fit, normalizes values at transform time.
import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.{UnaryEstimator, UnaryModel}
import org.apache.spark.sql.Dataset

class MinMaxNormEstimator(uid: String = UID[MinMaxNormEstimator])
  extends UnaryEstimator[Real, Real](operationName = "minMaxNorm", uid = uid) {

  def fitFn(dataset: Dataset[Real#Value]): UnaryModel[Real, Real] = {
    val grouped = dataset.groupBy()
    val max = grouped.max().first().getDouble(0)
    val min = grouped.min().first().getDouble(0)
    new MinMaxNormEstimatorModel(min = min, max = max, operationName = operationName, uid = uid)
  }
}

class MinMaxNormEstimatorModel(val min: Double, val max: Double, operationName: String, uid: String)
  extends UnaryModel[Real, Real](operationName = operationName, uid = uid) {

  def transformFn: Real => Real = _.value.map(v => (v - min) / (max - min)).toReal
}
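In a TransmogrifAI workflow the estimator is not called directly; it is wired to a feature and fitted when the workflow trains. A minimal wiring sketch, assuming the standard FeatureBuilder / setInput / getOutput stage API (the Record type and the income feature are illustrative, not from the slides):

// Hypothetical wiring sketch; only MinMaxNormEstimator itself comes from the code above.
import com.salesforce.op.features.FeatureBuilder
import com.salesforce.op.features.types._

case class Record(income: Option[Double])

// Define a Real feature extracted from the record (name is illustrative)
val income = FeatureBuilder.Real[Record].extract(_.income.toReal).asPredictor

// Attach the estimator; min and max are computed when the workflow is fitted
val incomeNormalized = new MinMaxNormEstimator().setInput(income).getOutput()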
// ScalaTest spec for the estimator. Note: the SparkSession is created but never stopped —
// fixed in the next version of the test below.
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{FlatSpec, Matchers}

@RunWith(classOf[JUnitRunner])
class MinMaxNormEstimatorTest extends FlatSpec with Matchers {
  val conf = new SparkConf().setMaster("local[*]")
  implicit val spark = SparkSession.builder().config(conf).getOrCreate()
  import spark.implicits._

  "MinMaxNormEstimator" should "fit a min/max model" in {
    val data = spark.createDataset(Seq(-1.0, 0.0, 5.0, 1.0).map(Option(_)))
    val model = new MinMaxNormEstimator().fitFn(data)
    model match {
      case m: MinMaxNormEstimatorModel =>
        m.min shouldBe -1.0
        m.max shouldBe 5.0
      case m => fail(s"Unexpected model type: ${m.getClass}")
    }
  }
}
// Same test, but now the SparkSession is stopped once all tests have run.
@RunWith(classOf[JUnitRunner])
class MinMaxNormEstimatorTest extends FlatSpec with Matchers with BeforeAndAfterAll {
  val conf = new SparkConf().setMaster("local[*]")
  implicit val spark = SparkSession.builder().config(conf).getOrCreate()
  import spark.implicits._

  override def afterAll(): Unit = {
    super.afterAll()
    spark.stop()
  }
  // …
}
// Reusable trait: shared SparkSession configuration and teardown for all Spark test suites.
trait TestSpark extends BeforeAndAfterAll { self: Suite =>
  val kryoClasses: Array[Class[_]] = Array()
  val conf: SparkConf = new SparkConf()
    .setMaster("local[*]")
    .registerKryoClasses(kryoClasses)
    .set("spark.serializer", classOf[org.apache.spark.serializer.KryoSerializer].getName)
    .set("spark.ui.enabled", false.toString) // disable Spark Application UI for faster tests

  implicit lazy val spark = SparkSession.builder().config(conf).getOrCreate()

  override def afterAll(): Unit = {
    super.afterAll()
    spark.stop()
  }
}

abstract class SparkFlatSpec extends FlatSpec with Matchers with TestSpark
@RunWith(classOf[JUnitRunner])
class MinMaxNormEstimatorTest extends SparkFlatSpec {
  import spark.implicits._

  val data = spark.createDataset(Seq(-1.0, 0.0, 5.0, 1.0).map(Option(_)))

  "MinMaxNormEstimator" should "fit a min/max model" in {
    new MinMaxNormEstimator().fitFn(data) match {
      case model: MinMaxNormEstimatorModel =>
        model.min shouldBe -1.0
        model.max shouldBe 5.0
      case model =>
        fail(s"Unexpected model type: ${model.getClass}")
    }
  }
}
More here: https://github.com/apache/spark/blob/master/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
More here: https://github.com/saurfang/sbt-spark-submit
For Gradle: https://github.com/salesforce/TransmogrifAI/blob/master/gradle/spark.gradle
// Excerpts from the Spark sources: the core Transformer API is untyped.
package org.apache.spark.ml

abstract class Transformer extends PipelineStage {
  def transform(dataset: Dataset[_]): DataFrame
}

// defined in the org.apache.spark.sql package object
type DataFrame = Dataset[Row]

// org.apache.spark.sql.Row
trait Row extends Serializable {
  def get(i: Int): Any
}
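To make the type-safety point above concrete: with the untyped API everything comes back as Any and mistakes surface only at runtime, while the typed Dataset API fails during analysis. A small sketch (function names are illustrative):

import org.apache.spark.sql.{DataFrame, Row, SparkSession}

def firstAsDoubleUntyped(df: DataFrame): Double = {
  val row: Row = df.first()
  row.get(0).asInstanceOf[Double] // compiles against any schema, may throw ClassCastException at runtime
}

def firstAsDoubleTyped(spark: SparkSession, df: DataFrame): Double = {
  import spark.implicits._
  df.as[Double].first() // fails at analysis time if the schema is not a single double column
}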
// TransmogrifAI ships a helper (placed in a Spark package to reach the private ClosureCleaner)
// that eagerly checks a closure for serializability:
package org.apache.spark.util

import scala.util.Try

case object ClosureUtils {
  def checkSerializable(closure: AnyRef): Try[Unit] = scala.util.Try {
    ClosureCleaner.clean(closure, checkSerializable = true, cleanTransitively = true)
  }
}

// Usage in application code:
package my.app

val x = new NonSerializable { def fun() = ??? }

ClosureUtils.checkSerializable(x.fun _) // make the check early, at application start
udf { d: Double => x.fun(d) }           // if we got here, it's safe to use x
Read more: https://docs.transmogrif.ai/en/stable/developer-guide/index.html#wrapping-a-non-serializable-external-library
// Alternative 1: construct the instance inside the UDF body (a new instance per invocation, may be costly)
udf { d: Double => new NonSerializable().fun(d) }

// Alternative 2: keep a single instance in an object; objects are initialized once per JVM
// and are never serialized into the closure
object NonSerializable {
  val instance = new NonSerializable
}
udf { d: Double => NonSerializable.instance.fun(d) }
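Putting the second pattern together into a runnable sketch (the class and object names here are stand-ins, not from any library):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.udf

class NonSerializable {                   // deliberately not Serializable
  def fun(d: Double): Double = d * 2
}

object NonSerializableHolder {
  lazy val instance = new NonSerializable // one instance per JVM, never shipped inside a closure
}

object UdfExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("udf-example").getOrCreate()
    import spark.implicits._

    val doubled = udf { d: Double => NonSerializableHolder.instance.fun(d) }
    Seq(1.0, 2.0, 3.0).toDF("x").select(doubled($"x").as("y")).show()

    spark.stop()
  }
}

The closure captured by the UDF references only the holder object, which is resolved locally on each executor, so nothing non-serializable ever crosses the wire.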
val ds: Dataset[_] = ???
println("Dataset schema & plans:")
ds.printSchema()
ds.explain(extended = true)
https://databricks.com/blog/2015/04/13/deep-dive-into-spark-sqls-catalyst-optimizer.html
https://databricks.com/blog/2016/05/23/apache-spark-as-a-compiler-joining-a-billion-rows-per-second-on-a-laptop.html
https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala#L780
// Apply all the transformers one by one, as org.apache.spark.ml.Pipeline does, but persist every
// K-th intermediate result to keep Catalyst plans from growing unboundedly.
// (Excerpt: transformers, data, persistEveryKStages and spark come from the surrounding method.)
var lastPersisted: Option[RDD[Row]] = None // not in the original snippet; tracks the previously persisted RDD

transformers.zipWithIndex.foldLeft(data) { case (df, (transformer, i)) =>
  val persist = i > 0 && i % persistEveryKStages == 0
  val newDF = transformer.transform(df)
  if (!persist) newDF
  else {
    // Converting to RDD and back here to break up Catalyst [SPARK-13346]
    val persisted = newDF.rdd.persist()
    lastPersisted.foreach(_.unpersist())
    lastPersisted = Some(persisted)
    spark.createDataFrame(persisted, newDF.schema)
  }
}
https://github.com/salesforce/TransmogrifAI/blob/master/core/src/main/scala/com/salesforce/op/utils/stages/FitStagesUtil.scala#L150
More here: https://spark.apache.org/docs/latest/tuning.html
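A few of the knobs from that tuning guide collected into one sketch (the values are illustrative defaults, not recommendations):

import org.apache.spark.SparkConf

val tunedConf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") // faster than Java serialization
  .set("spark.sql.shuffle.partitions", "200")  // size this to your data volume; 200 is the default
  .set("spark.executor.memory", "4g")
  .set("spark.memory.fraction", "0.6")         // unified memory fraction; 0.6 is the default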
https://github.com/salesforce/TransmogrifAI/blob/master/utils/src/main/scala/com/salesforce/op/utils/spark/OpSparkListener.scala
https://github.com/LucaCanali/sparkMeasure
// Attach a custom listener (e.g. TransmogrifAI's OpSparkListener) to collect application metrics.
// MySparkListener, log, MyApp and AppMetrics are application classes, not Spark APIs.
val myListener = new MySparkListener(
  appName = spark.sparkContext.appName,
  appId = spark.sparkContext.applicationId
) {
  override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = {
    super.onApplicationEnd(applicationEnd)
    val m = super.metrics
    log.info("Total run time: {}", m.appDurationPretty)
    MyApp.onApplicationEnd(m)
  }
}
spark.sparkContext.addSparkListener(myListener) // add the listener

// Elsewhere, the application consumes the collected metrics:
def onApplicationEnd(metrics: AppMetrics): Unit = { /* consume metrics */ }
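If all you need is a couple of numbers and no extra library, a bare listener built only on Spark's public scheduler API looks roughly like this:

import java.util.concurrent.atomic.LongAdder
import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd, SparkListenerTaskEnd}

class SimpleMetricsListener extends SparkListener {
  private val tasks = new LongAdder
  private val executorRunTimeMs = new LongAdder

  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    tasks.increment()
    // taskMetrics can be null for failed tasks
    Option(taskEnd.taskMetrics).foreach(m => executorRunTimeMs.add(m.executorRunTime))
  }

  override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit =
    println(s"Tasks: ${tasks.sum()}, total executor run time: ${executorRunTimeMs.sum()} ms")
}

// spark.sparkContext.addSparkListener(new SimpleMetricsListener)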
[Slide: Salesforce Einstein Platform architecture, serving 6B+ predictions per day.
Platform components: Compute, Orchestration, Data Store, Model Lifecycle Management,
Data Science Experience, Configuration Services, Infrastructure Metrics, Health Monitoring,
ETL/GDPR/Data Processing. ML stack: TransmogrifAI, DL, Machine Learning.
Applications: Lead Scoring, Engagement Scoring, Case Classification, Prediction Builder, ...]