Apache Spark for library developers
William Benton
willb@redhat.com
@willb
Erik Erlandson
eje@redhat.com
@manyangled
About Will
The Silex and Isarn libraries
Reusable open-source code that works with Spark, factored from internal apps.
We’ve tracked Spark releases since Spark 1.3.0.
See https://silex.radanalytics.io and http://isarnproject.org
Forecast
Basic considerations for reusable Spark code
Generic functions for parallel collections
Extending data frames with custom aggregates
Exposing JVM libraries to Python
Sharing your work with the world
Basic considerations
Today’s main themes
Cross-building for Scala

in your SBT build definition:

scalaVersion := "2.11.11"
crossScalaVersions := Seq("2.10.6", "2.11.11")

in your shell:

$ sbt +compile # or test, package, publish, etc.
$ sbt "++ 2.11.11" compile
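For downstream users, the cross-built artifacts are picked up automatically; a sketch with illustrative coordinates (not a real library):

// %% appends the Scala binary suffix, so this resolves to
// mylib_2.10 or mylib_2.11 to match the consumer's scalaVersion
libraryDependencies += "org.example" %% "mylib" % "0.1.0"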
“Bring-your-own Spark”

in your SBT build definition:

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.3.0" % Provided,
  "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided,
  "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided,
  "org.scalatest" %% "scalatest" % "2.2.4" % Test)

other library dependencies (here, joda-time) are not marked Provided:

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.3.0" % Provided,
  "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided,
  "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided,
  "joda-time" % "joda-time" % "2.7",
  "org.scalatest" %% "scalatest" % "2.2.4" % Test)
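And a sketch of how a downstream application consumes such a library (coordinates are illustrative): it depends on the library directly and brings its own Spark, typically still Provided because spark-submit supplies Spark at runtime.

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided,
  "org.example" %% "my-spark-lib" % "0.1.0")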
Taking care with resources
Caching when necessary

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

def step(rdd: RDD[_]) = {
  val wasUncached = rdd.storageLevel == StorageLevel.NONE
  if (wasUncached) { rdd.cache() }
  val result = trainModel(rdd)
  if (wasUncached) { rdd.unpersist() }
  result
}
The same care applies to broadcast variables created inside an iterative loop:

var nextModel = initialModel

for (i <- 0 until iterations) {
  val current = sc.broadcast(nextModel)
  val newState = … // elided on the slide: compute the next state using current
  nextModel = modelFromState(newState)
  current.unpersist
}
Minding the JVM heap

val mat = Array(Array(1.0, 2.0), Array(3.0, 4.0))

[diagram: the nested array is three separate JVM objects, each with a class pointer, flags, size, and lock word; the outer array holds element pointers to the inner arrays — 32 bytes of data and 64 bytes of overhead]
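A minimal sketch (not from the deck) of the flat layout that avoids the per-row objects: keep the values in one primitive Array[Double] and index it by row and column.

val flat = Array(1.0, 2.0, 3.0, 4.0)   // one object header, no inner-array pointers
val cols = 2
def at(i: Int, j: Int): Double = flat(i * cols + j)

at(1, 0)  // 3.0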
Continuous integration for Spark libraries and apps
local[*]

[diagram: CPU and memory on a CI worker]

local[2]
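A hedged sketch of a CI-friendly ScalaTest fixture that pins the master to local[2] and trims Spark's footprint (names and settings are illustrative, not from the deck):

import org.apache.spark.sql.SparkSession
import org.scalatest.{BeforeAndAfterAll, Suite}

trait CISparkFixture extends BeforeAndAfterAll { this: Suite =>
  @transient var spark: SparkSession = _

  override def beforeAll(): Unit = {
    spark = SparkSession.builder
      .master("local[2]")                           // bounded parallelism
      .appName("ci-tests")
      .config("spark.ui.enabled", "false")          // no web UI on CI
      .config("spark.sql.shuffle.partitions", "4")  // small shuffles
      .getOrCreate()
  }

  override def afterAll(): Unit = { spark.stop() }
}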
Writing generic code for Spark’s parallel collections
The RDD is invariant

T <: U does not imply RDD[T] <: RDD[U]
(for example, Dog <: Animal does not make RDD[Dog] a subtype of RDD[Animal])
trait HasUserId { val userid: Int }

case class Transaction(override val userid: Int,
                       timestamp: Int,
                       amount: Double) extends HasUserId {}

def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x))
val xacts = spark.parallelize(Array(
  Transaction(1, 1, 1.0),
  Transaction(2, 2, 1.0)
))

badKeyByUserId(xacts)

<console>: error: type mismatch;
 found   : org.apache.spark.rdd.RDD[Transaction]
 required: org.apache.spark.rdd.RDD[HasUserId]
Note: Transaction <: HasUserId, but class RDD is invariant in type T.
You may wish to define T as +T instead. (SLS 4.5)
       badKeyByUserId(xacts)
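One fix, sketched here rather than taken from this excerpt: make the function generic with an upper type bound, so the RDD keeps its concrete element type and no variance is needed.

import org.apache.spark.rdd.RDD

def keyByUserId[T <: HasUserId](r: RDD[T]): RDD[(Int, T)] =
  r.map(x => (x.userid, x))

keyByUserId(xacts)  // RDD[(Int, Transaction)]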
An example: natural join

[diagram: a table with columns A B C D E natural-joined with a table with columns A B E X Y yields columns A B C D E X Y]
Ad-hoc natural join
df1.join(df2, df1("a") === df2("a") &&
              df1("b") === df2("b") &&
              df1("e") === df2("e"))
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
  // introspecting over column names
  val lcols = left.columns
  val rcols = right.columns
  val ccols = lcols.toSet intersect rcols.toSet

  if (ccols.isEmpty)
    left.limit(0).crossJoin(right.limit(0))
  else
    left
      // dynamically constructing expressions:
      //   [left.a === right.a, left.b === right.b, …]
      // reduced with && into
      //   left.a === right.a && left.b === right.b && …
      .join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
      // dynamically constructing column lists
      .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
              lcols.collect { case c if !ccols.contains(c) => left(c) } ++
              rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
}
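A quick usage sketch with made-up column names (id and key are the shared columns), assuming a SparkSession named spark as in spark-shell:

import spark.implicits._

val df1 = Seq((1, "a", 10.0), (2, "b", 20.0)).toDF("id", "key", "x")
val df2 = Seq((1, "a", true), (3, "c", false)).toDF("id", "key", "flag")

natjoin(df1, df2).show()  // columns: id, key, x, flag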
case class DFWithNatJoin(df: DataFrame) extends NaturalJoining {
  def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other)
}

object NaturalJoin extends NaturalJoining {
  object implicits {
    implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df)
  }
}

import NaturalJoin.implicits._
df.natjoin(otherdf)
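For comparison, and not from the deck: the same enrichment can be written as an implicit value class, which can avoid allocating a wrapper on each call.

object NaturalJoinSyntax {
  implicit class DFNatJoinOps(val df: DataFrame) extends AnyVal {
    def natjoin(other: DataFrame): DataFrame = NaturalJoin.natjoin(df, other)
  }
}

import NaturalJoinSyntax._
df.natjoin(otherdf)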
User-defined functions

{"a": 1, "b": "wilma", ..., "x": "club"}
{"a": 2, "b": "betty", ..., "x": "diamond"}
{"a": 3, "b": "fred", ..., "x": "heart"}
{"a": 4, "b": "barney", ..., "x": "spade"}

extracting just the b and x fields from each record:

wilma club
betty diamond
fred heart
barney spade
import json
from pyspark.sql.types import *
from pyspark.sql.functions import udf

def selectively_structure(fields):
    resultType = StructType([StructField(f, StringType(), nullable=True)
                             for f in fields])
    def impl(js):
        try:
            d = json.loads(js)
            return [str(d.get(f)) for f in fields]
        except:
            return [None] * len(fields)
    return udf(impl, resultType)

extract_bx = selectively_structure(["b", "x"])
structured_df = df.withColumn("result", extract_bx("json"))
Spark’s ML pipelines

estimator.fit(df)
model.transform(df)

[diagram: pipeline stages are configured with params such as inputCol, outputCol, seed, and epochs]
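A minimal fit/transform sketch (assumes a DataFrame df with a string column named label; not from the deck):

import org.apache.spark.ml.feature.StringIndexer

val indexer = new StringIndexer()   // an Estimator, configured via params
  .setInputCol("label")
  .setOutputCol("labelIndex")

val model = indexer.fit(df)         // Estimator.fit returns a Model (a Transformer)
val indexed = model.transform(df)   // Model.transform adds the output column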
Forecast
Basic considerations for reusable Spark code
Generic functions for parallel collections
Extending data frames with custom aggregates
Exposing JVM libraries to Python
Sharing your work with the world
About Erik
User-defined aggregates: the fundamentals

Three components

User-defined aggregates: the implementation
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
    (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
    extends UserDefinedAggregateFunction {

  def deterministic: Boolean = false

  def inputSchema: StructType =
    StructType(StructField("x", dataTpe.tpe) :: Nil)

  def bufferSchema: StructType =
    StructType(StructField("tdigest", TDigestUDT) :: Nil)

  def dataType: DataType = TDigestUDT
Four main functions: initialize

def initialize(buf: MutableAggregationBuffer): Unit = {
  buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV))
}

Four main functions: evaluate

def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
Four main functions: update

def update(buf: MutableAggregationBuffer, input: Row): Unit = {
  if (!input.isNullAt(0)) {
    buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
                        num.toDouble(input.getAs[N](0)))
  }
}

Four main functions: merge

[diagram: partial aggregation buffers 1 and 2 are merged into 1 + 2]

def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
  buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
                       buf2.getAs[TDigestSQL](0).tdigest)
}
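A hedged usage sketch (assuming the isarn UDAF above with an implicit TDigestUDAFDataType[Double] in scope, and a numeric column named x; not shown in this excerpt):

import org.apache.spark.sql.functions.col

val udaf = TDigestUDAF[Double](0.5, 0)                 // deltaV, maxDiscreteV
val sketched = df.agg(udaf(col("x")))                  // one row, one TDigestSQL cell
val td = sketched.first.getAs[TDigestSQL](0).tdigest   // the aggregated sketch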
User-defined aggregates: user-defined types
User-defined types

package org.apache.spark.isarnproject.sketches.udt  // note the org.apache.spark prefix

@SQLUserDefinedType(udt = classOf[TDigestUDT])
case class TDigestSQL(tdigest: TDigest)

class TDigestUDT extends UserDefinedType[TDigestSQL] {
  def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
  // ....
Implementing custom types

class TDigestUDT extends UserDefinedType[TDigestSQL] {
  def userClass: Class[TDigestSQL] = classOf[TDigestSQL]

  override def pyUDT: String =
    "isarnproject.sketches.udt.tdigest.TDigestUDT"

  override def typeName: String = "tdigest"

  def sqlType: DataType = StructType(
    StructField("delta", DoubleType, false) ::
    /* ... */
    StructField("clustM", ArrayType(DoubleType, false), false) ::
    Nil)
def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest)

private[sketches] def serializeTD(td: TDigest): InternalRow = {
  val TDigest(delta, maxDiscrete, nclusters, clusters) = td
  val row = new GenericInternalRow(5)
  row.setDouble(0, delta)
  row.setInt(1, maxDiscrete)
  row.setInt(2, nclusters)
  val clustX = clusters.keys.toArray
  val clustM = clusters.values.toArray
  row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX))
  row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM))
  row
}
def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td))

private[sketches] def deserializeTD(datum: Any): TDigest =
  datum match { case row: InternalRow =>
    val delta = row.getDouble(0)
    val maxDiscrete = row.getInt(1)
    val nclusters = row.getInt(2)
    val clustX = row.getArray(3).toDoubleArray()
    val clustM = row.getArray(4).toDoubleArray()
    val clusters = clustX.zip(clustM)
      .foldLeft(TDigestMap.empty) { case (td, e) => td + e }
    TDigest(delta, maxDiscrete, nclusters, clusters)
  }
Extending PySpark with your Scala library
# class to access the active Spark context for Python
from pyspark.context import SparkContext
# gateway to the JVM from py4j
sparkJVM = SparkContext._active_spark_context._jvm
# use the gateway to access JVM objects and classes
thisThing = sparkJVM.com.path.to.this.thing
#SAISDD6
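Any JVM class on the driver's classpath is reachable through that gateway. A quick, hypothetical smoke test using a plain JDK class, nothing isarn-specific:

from pyspark.context import SparkContext
sparkJVM = SparkContext._active_spark_context._jvm
# call an ordinary JDK method through py4j to confirm the gateway works
print(sparkJVM.java.lang.System.currentTimeMillis())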
A Python-friendly wrapper
package org.isarnproject.sketches.udaf
object pythonBindings {
def tdigestDoubleUDAF(delta: Double, maxDiscrete: Int) =
TDigestUDAF[Double](delta, maxDiscrete)
}
#SAISDD6
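The Python wrapper's comment ("one of these for each type parameter Double, Int, Long, etc") implies a small family of such bindings. A sketch of what a sibling binding might look like, assuming TDigestUDAF takes the same parameters for Int:

object pythonBindings {
  // ...
  // hypothetical sibling binding for integer columns
  def tdigestIntUDAF(delta: Double, maxDiscrete: Int) =
    TDigestUDAF[Int](delta, maxDiscrete)
}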
from pyspark.sql.column import Column, _to_java_column, _to_seq
from pyspark.context import SparkContext

# one of these for each type parameter Double, Int, Long, etc
def tdigestDoubleUDAF(col, delta=0.5, maxDiscrete=0):
    sc = SparkContext._active_spark_context
    pb = sc._jvm.org.isarnproject.sketches.udaf.pythonBindings
    tdapply = pb.tdigestDoubleUDAF(delta, maxDiscrete).apply
    return Column(tdapply(_to_seq(sc, [col], _to_java_column)))
#SAISDD6
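With the wrapper above in scope, the UDAF behaves like any other aggregate Column. A minimal usage sketch, assuming the isarn JAR and its bundled Python bindings are on the session's path:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.range(1000).selectExpr("cast(id as double) as x")
# collapse the column into a single t-digest sketch
sketch = df.agg(tdigestDoubleUDAF("x", delta=0.5, maxDiscrete=0)).first()[0]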
class TDigestUDT(UserDefinedType):
    @classmethod
    def sqlType(cls):
        return StructType([
            StructField("delta", DoubleType(), False),
            StructField("maxDiscrete", IntegerType(), False),
            StructField("nclusters", IntegerType(), False),
            StructField("clustX", ArrayType(DoubleType(), False), False),
            StructField("clustM", ArrayType(DoubleType(), False), False)])
# ...
#SAISDD6
class TDigestUDT(UserDefinedType):
    # ...
    @classmethod
    def module(cls):
        return "isarnproject.sketches.udt.tdigest"
    @classmethod
    def scalaUDT(cls):
        return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT"
    def simpleString(self):
        return "tdigest"
#SAISDD6
class TDigestUDT(UserDefinedType):
    # ...
    def serialize(self, obj):
        return (obj.delta, obj.maxDiscrete, obj.nclusters,
                [float(v) for v in obj.clustX],
                [float(v) for v in obj.clustM])
    def deserialize(self, datum):
        return TDigest(datum[0], datum[1], datum[2], datum[3], datum[4])
#SAISDD6
class TDigestUDT extends UserDefinedType[TDigestSQL] {
  // ...
  override def pyUDT: String =
    "isarnproject.sketches.udt.tdigest.TDigestUDT"
}
#SAISDD6
Python code in JAR files
mappings in (Compile, packageBin) ++= Seq(
(baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") ->
"isarnproject/__init__.pyc",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") ->
"isarnproject/sketches/__init__.pyc",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") ->
"isarnproject/sketches/udaf/__init__.pyc",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") ->
"isarnproject/sketches/udaf/tdigest.pyc",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") ->
"isarnproject/sketches/udt/__init__.pyc",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") ->
"isarnproject/sketches/udt/tdigest.pyc"
)
#SAISDD6
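One quick way to confirm the mappings took effect is to list the packaged JAR's contents; the artifact path below is illustrative, not the project's exact file name:

$ sbt package
$ jar tf target/scala-2.11/<your-artifact>_2.11-<version>.jar | grep '\.pyc$'
isarnproject/__init__.pyc
isarnproject/sketches/__init__.pyc
isarnproject/sketches/udaf/tdigest.pyc
...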
Cross-building for Python
lazy val compilePython = taskKey[Unit]("Compile python files")
compilePython := {
val s: TaskStreams = streams.value
s.log.info("compiling python...")
val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !)
if (stat != 0) {
throw new IllegalStateException("python compile failed")
}
}
(packageBin in Compile) <<=
(packageBin in Compile).dependsOn(compilePython)
#SAISDD6
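The task above references a pythonCMD value that the slides never define. One hypothetical way to declare it in the build (the real project may do this differently):

// pick up an interpreter from the environment, defaulting to "python"
lazy val pythonCMD = sys.env.getOrElse("PYTHON_CMD", "python")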
Using versioned JAR files
$ pyspark --packages \
    'org.isarnproject:isarn-sketches-spark_2.11:0.3.0-sp2.2-py2.7'
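Once a matching JAR is on the session's path, the bundled modules import along the same isarnproject/... layout packaged earlier (a hypothetical session):

from isarnproject.sketches.udaf.tdigest import tdigestDoubleUDAF
from isarnproject.sketches.udt.tdigest import TDigestUDT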
Show your work: publishing results
#SAISDD6
Developing with git-flow
$ brew install git-flow # macOS
$ dnf install git-flow # Fedora
$ yum install git-flow # CentOS
$ apt-get install git-flow # Debian and friends
(Search the internet for “git flow” to learn more!)
#SAISDD6
# Set up git-flow in this repository
$ git flow init
# Start work on my-awesome-feature; create
# and switch to a feature branch
$ git flow feature start my-awesome-feature
$ ...
# Finish work on my-awesome-feature; merge
# feature/my-awesome-feature to develop
$ git flow feature finish my-awesome-feature
#SAISDD6
# Start work on a release branch
$ git flow release start 0.1.0
# Hack and bump version numbers
$ ...
# Finish work on v0.1.0; merge
# release/0.1.0 to develop and master;
# tag v0.1.0
$ git flow release finish 0.1.0
#SAISDD6
                                        Maven Central    Bintray
easy to set up for library developers   not really       trivial
easy to set up for library users        trivial          mostly
easy to publish                         yes, via sbt     yes, via sbt + plugins
easy to resolve artifacts               yes              mostly
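For the Maven Central column, publishing from sbt typically means pointing publishTo at Sonatype's OSSRH repositories. A minimal sketch; credentials, artifact signing, and POM metadata are also required but not shown here:

publishTo := {
  val nexus = "https://oss.sonatype.org/"
  if (isSnapshot.value)
    Some("snapshots" at nexus + "content/repositories/snapshots")
  else
    Some("releases" at nexus + "service/local/staging/deploy/maven2")
}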
Conclusions and takeaways
#SAISDD6
https://radanalytics.io
eje@redhat.com • @manyangled
willb@redhat.com • @willb
KEEP IN TOUCH

Apache Spark for Library Developers with Erik Erlandson and William Benton

  • 1. Apache Spark for 
 library developers William Benton willb@redhat.com @willb Erik Erlandson eje@redhat.com @manyangled
  • 3. #SAISDD6 The Silex and Isarn libraries Reusable open-source code that works 
 with Spark, factored from internal apps. We’ve tracked Spark releases since Spark 1.3.0. See https://silex.radanalytics.io and 
 http://isarnproject.org
  • 4.
  • 5. #SAISDD6 Forecast Basic considerations for reusable Spark code Generic functions for parallel collections Extending data frames with custom aggregates Exposing JVM libraries to Python Sharing your work with the world
  • 17. #SAISDD6 in your SBT build definition: Cross-building for Scala scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11") in your shell: $ sbt +compile $ sbt "++ 2.11.11" compile scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11")
  • 18. #SAISDD6 in your SBT build definition: Cross-building for Scala scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11") in your shell: $ sbt +compile $ sbt "++ 2.11.11" compile $ sbt +compile # or test, package, publish, etc. $ sbt "++ 2.11.11" compile scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11")
  • 19. #SAISDD6 in your SBT build definition: Cross-building for Scala scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11") in your shell: $ sbt +compile $ sbt "++ 2.11.11" compile $ sbt +compile # or test, package, publish, etc. $ sbt "++ 2.11.11" compile scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11")
  • 20. #SAISDD6 in your SBT build definition: Bring-your-own Spark libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "org.scalatest" %% "scalatest" % "2.2.4" % Test) libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "org.scalatest" %% "scalatest" % "2.2.4" % Test)
  • 21. #SAISDD6 in your SBT build definition: Bring-your-own Spark libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "org.scalatest" %% "scalatest" % "2.2.4" % Test) libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "org.scalatest" %% "scalatest" % "2.2.4" % Test)
  • 22. #SAISDD6 in your SBT build definition: “Bring-your-own Spark” libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "joda-time" % "joda-time" % "2.7", "org.scalatest" %% "scalatest" % "2.2.4" % Test) libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "joda-time" % "joda-time" % "2.7", "org.scalatest" %% "scalatest" % "2.2.4" % Test)
  • 23. #SAISDD6 in your SBT build definition: “Bring-your-own Spark” libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "joda-time" % "joda-time" % "2.7", "org.scalatest" %% "scalatest" % "2.2.4" % Test) libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "joda-time" % "joda-time" % "2.7", "org.scalatest" %% "scalatest" % "2.2.4" % Test)
  • 27-31. #SAISDD6 Caching when necessary: cache the input RDD only if the caller has not already cached it, and unpersist only what you cached (highlighted on the slides: rdd.cache() and rdd.unpersist())
    def step(rdd: RDD[_]) = {
      val wasUncached = rdd.storageLevel == StorageLevel.NONE
      if (wasUncached) { rdd.cache() }
      val result = trainModel(rdd)
      if (wasUncached) { rdd.unpersist() }
      result
    }
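A minimal caller-side sketch (not from the deck; parseRecord and the input path are assumptions) of why step() checks the storage level first:

    // Hypothetical caller-side view of step(): the storageLevel check keeps the
    // function from clobbering caching decisions the caller has already made.
    val data = sc.textFile("events.txt").map(parseRecord)  // parseRecord is an assumed helper
    step(data)     // data was uncached, so step() caches it and unpersists it when done

    data.cache()
    step(data)     // data is already cached, so step() leaves the caller's cache alone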
  • 32-34. #SAISDD6 Broadcasting the current model on each iteration and unpersisting the stale broadcast (highlighted on the slides: sc.broadcast(nextModel) and current.unpersist)
    var nextModel = initialModel
    for (i <- 0 until iterations) {
      val current = sc.broadcast(nextModel)
      val newState = /* elided on the slide */
      nextModel = modelFromState(newState)
      current.unpersist
    }
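A self-contained sketch of the same loop with the elided step filled in by hypothetical helpers (data, contribution, and combine are assumptions, not from the deck); the point is that executor-side code reads the model through current.value:

    var nextModel = initialModel
    for (i <- 0 until iterations) {
      val current = sc.broadcast(nextModel)
      val newState = data                                     // data: RDD of training records (assumed)
        .map(record => contribution(current.value, record))   // hypothetical per-record statistic
        .reduce(combine)                                      // hypothetical combiner
      nextModel = modelFromState(newState)
      current.unpersist()
    }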
  • 35-38. #SAISDD6 Minding the JVM heap
    val mat = Array(Array(1.0, 2.0), Array(3.0, 4.0))
    [Diagram: the outer array is an object with a header (class pointer, flags, size, locks) and two element pointers, each pointing to an inner array object with its own header plus the doubles 1.0, 2.0 and 3.0, 4.0.]
    32 bytes of data… and 64 bytes of overhead!
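A minimal sketch (not from the deck) of one way to shrink that overhead: keep the matrix in a single flat Array[Double] and index it manually, so there is only one array object and one header.

    class FlatMatrix(val rows: Int, val cols: Int, val data: Array[Double]) {
      // row-major indexing into the single backing primitive array
      def apply(r: Int, c: Int): Double = data(r * cols + c)
    }

    val flat = new FlatMatrix(2, 2, Array(1.0, 2.0, 3.0, 4.0))
    flat(1, 0)  // 3.0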
  • 39. Continuous integration for Spark libraries and apps
  • 48. Writing generic code for Spark’s parallel collections
  • 49-50. #SAISDD6 The RDD is invariant: T <: U does not give you RDD[T] <: RDD[U] (even though, say, a dog is an animal).
  • 51-52. #SAISDD6 T <: U does not give you RDD[T] <: RDD[U]:
    trait HasUserId { val userid: Int }

    case class Transaction(override val userid: Int, timestamp: Int, amount: Double)
      extends HasUserId {}

    def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x))
  • 53-54. #SAISDD6
    val xacts = spark.parallelize(Array(
      Transaction(1, 1, 1.0),
      Transaction(2, 2, 1.0)
    ))

    badKeyByUserId(xacts)

    <console>: error: type mismatch;
      found   : org.apache.spark.rdd.RDD[Transaction]
      required: org.apache.spark.rdd.RDD[HasUserId]
    Note: Transaction <: HasUserID, but class RDD is invariant in type T.
    You may wish to define T as +T instead. (SLS 4.5)
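One standard fix is to make the function generic over the element type rather than fixing it to the supertype; a minimal sketch, assuming the same HasUserId trait:

    // Accepts RDD[Transaction], RDD[HasUserId], or any other RDD of a HasUserId subtype.
    def keyByUserId[T <: HasUserId](r: RDD[T]): RDD[(Int, T)] =
      r.map(x => (x.userid, x))

    keyByUserId(xacts)  // RDD[(Int, Transaction)]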
  • 56-58. #SAISDD6 An example: natural join
    [Diagram: a data frame with columns A B C D E and one with columns A B E X Y share the columns A, B, and E; their natural join keeps one copy of the shared columns and yields A B C D E X Y.]
  • 59. #SAISDD6 Ad-hoc natural join df1.join(df2, df1("a") === df2("a") && df1("b") === df2("b") && df1("e") === df2("e"))
  • 60-72. #SAISDD6 A general natural join, built by introspecting over column names, dynamically constructing expressions ([left.a === right.a, left.b === right.b, …], reduced to left.a === right.a && left.b === right.b && …), and dynamically constructing column lists
    def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
      val lcols = left.columns
      val rcols = right.columns
      val ccols = lcols.toSet intersect rcols.toSet

      if (ccols.isEmpty)
        left.limit(0).crossJoin(right.limit(0))
      else
        left
          .join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
          .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
                  lcols.collect { case c if !ccols.contains(c) => left(c) } ++
                  rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
    }
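A brief usage sketch (the data frames are assumed): because the join keys are discovered from the schemas, callers never spell them out.

    val joined = natjoin(df1, df2)  // joins on every column name df1 and df2 share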
  • 73-75. #SAISDD6
    case class DFWithNatJoin(df: DataFrame) extends NaturalJoining {
      def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other)
    }

    object NaturalJoin extends NaturalJoining {
      object implicits {
        implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df)
      }
    }

    import NaturalJoin.implicits._
    df.natjoin(otherdf)
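A sketch (not from the deck) of the same enrichment written with an implicit class, which folds the wrapper case class and the implicit conversion into one definition; it assumes the same NaturalJoining trait.

    object NaturalJoinOps extends NaturalJoining {
      implicit class DFNatJoinOps(val df: DataFrame) {
        def natjoin(other: DataFrame): DataFrame = NaturalJoinOps.natjoin(df, other)
      }
    }

    import NaturalJoinOps.DFNatJoinOps
    df.natjoin(otherdf)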
  • 76-77. #SAISDD6 User-defined functions: pulling selected fields out of JSON strings
    {"a": 1, "b": "wilma", ..., "x": "club"}
    {"a": 2, "b": "betty", ..., "x": "diamond"}
    {"a": 3, "b": "fred", ..., "x": "heart"}
    {"a": 4, "b": "barney", ..., "x": "spade"}
    yields: (wilma, club), (betty, diamond), (fred, heart), (barney, spade)
  • 78-83. #SAISDD6
    import json
    from pyspark.sql.types import *
    from pyspark.sql.functions import udf

    def selectively_structure(fields):
        resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields])
        def impl(js):
            try:
                d = json.loads(js)
                return [str(d.get(f)) for f in fields]
            except:
                return [None] * len(fields)
        return udf(impl, resultType)

    extract_bx = selectively_structure(["b", "x"])
    structured_df = df.withColumn("result", extract_bx("json"))
  • 88, 89, 93. #SAISDD6 Working with Spark’s ML pipelines
    [Diagram: an estimator, configured by params such as inputCol, outputCol, epochs, and seed, is fit on a data frame (estimator.fit(df)) to produce a model, which then transforms data frames (model.transform(df)).]
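A minimal sketch (not from the deck) of the fit/transform pattern in that diagram, using stock spark.ml stages; trainingDF and testDF are assumed to exist with a text column and a label column.

    import org.apache.spark.ml.Pipeline
    import org.apache.spark.ml.classification.LogisticRegression
    import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features")
    val lr        = new LogisticRegression().setMaxIter(10)

    val estimator = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
    val model     = estimator.fit(trainingDF)   // estimator.fit(df)
    val scored    = model.transform(testDF)     // model.transform(df)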
  • 95. #SAISDD6 Forecast
    Basic considerations for reusable Spark code
    Generic functions for parallel collections
    Extending data frames with custom aggregates
    Exposing JVM libraries to Python
    Sharing your work with the world
  • 104-111. #SAISDD6
    case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
      (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
        extends UserDefinedAggregateFunction {

      def deterministic: Boolean = false

      def inputSchema: StructType =
        StructType(StructField("x", dataTpe.tpe) :: Nil)

      def bufferSchema: StructType =
        StructType(StructField("tdigest", TDigestUDT) :: Nil)

      def dataType: DataType = TDigestUDT
  • 112-113. #SAISDD6 Four main functions: initialize
  • 114-115. #SAISDD6
    def initialize(buf: MutableAggregationBuffer): Unit = {
      buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV))
    }

    def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
  • 116-117. #SAISDD6 Four main functions: evaluate
  • 118-119. #SAISDD6 (repeats the initialize and evaluate code above)
  • 122-128. #SAISDD6
    def update(buf: MutableAggregationBuffer, input: Row): Unit = {
      if (!input.isNullAt(0)) {
        buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0)))
      }
    }

    def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
      buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest)
    }
  • 129-130. #SAISDD6 Four main functions: merge
    [Diagram: two partial aggregation buffers, 1 and 2, are merged into a single buffer, 1 + 2.]
  • 131-132. #SAISDD6 (repeats the update and merge code above)
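A hypothetical usage sketch (not from the deck): like any UserDefinedAggregateFunction, the UDAF is applied to a column inside agg(); it assumes the library's implicit TDigestUDAFDataType[Double] instance is in scope and that df has a numeric column x.

    val sketchUdaf = TDigestUDAF[Double](deltaV = 0.5, maxDiscreteV = 0)
    val sketched = df.agg(sketchUdaf(df("x")))  // one t-digest summarizing column x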
  • 134-136. #SAISDD6 User-defined types (highlighted: the org.apache.spark package prefix)
    package org.apache.spark.isarnproject.sketches.udt

    @SQLUserDefinedType(udt = classOf[TDigestUDT])
    case class TDigestSQL(tdigest: TDigest)

    class TDigestUDT extends UserDefinedType[TDigestSQL] {
      def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
      // ....
  • 137-140. #SAISDD6 Implementing custom types
    class TDigestUDT extends UserDefinedType[TDigestSQL] {
      def userClass: Class[TDigestSQL] = classOf[TDigestSQL]

      override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT"

      override def typeName: String = "tdigest"

      def sqlType: DataType = StructType(
        StructField("delta", DoubleType, false) ::
        /* ... */
        StructField("clustM", ArrayType(DoubleType, false), false) ::
        Nil)
  • 141-144. #SAISDD6
    def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest)

    private[sketches] def serializeTD(td: TDigest): InternalRow = {
      val TDigest(delta, maxDiscrete, nclusters, clusters) = td
      val row = new GenericInternalRow(5)
      row.setDouble(0, delta)
      row.setInt(1, maxDiscrete)
      row.setInt(2, nclusters)
      val clustX = clusters.keys.toArray
      val clustM = clusters.values.toArray
      row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX))
      row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM))
      row
    }
  • 145-147. #SAISDD6
    def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td))

    private[sketches] def deserializeTD(datum: Any): TDigest = datum match {
      case row: InternalRow =>
        val delta = row.getDouble(0)
        val maxDiscrete = row.getInt(1)
        val nclusters = row.getInt(2)
        val clustX = row.getArray(3).toDoubleArray()
        val clustM = row.getArray(4).toDoubleArray()
        val clusters = clustX.zip(clustM)
          .foldLeft(TDigestMap.empty) { case (td, e) => td + e }
        TDigest(delta, maxDiscrete, nclusters, clusters)
    }
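A hedged usage sketch (not from the deck): once the UDT is wired up via @SQLUserDefinedType, Spark invokes serialize and deserialize at the SQL boundary, so a collected aggregate comes back as the user class; sketchUdaf and df are the assumed names from the earlier sketch.

    val row = df.agg(sketchUdaf(df("x"))).first()
    val td = row.getAs[TDigestSQL](0)  // deserialize() has turned the internal row back into TDigestSQL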
  • 148. Extending PySpark with your Scala library
  • 153-155. #SAISDD6
    # class to access the active Spark context for Python
    from pyspark.context import SparkContext

    # gateway to the JVM from py4j
    sparkJVM = SparkContext._active_spark_context._jvm

    # use the gateway to access JVM objects and classes
    thisThing = sparkJVM.com.path.to.this.thing
  • 156-158. #SAISDD6 A Python-friendly wrapper (highlighted: tdigestDoubleUDAF and its Double type parameter)
    package org.isarnproject.sketches.udaf

    object pythonBindings {
      def tdigestDoubleUDAF(delta: Double, maxDiscrete: Int) =
        TDigestUDAF[Double](delta, maxDiscrete)
    }
  • 159-161. #SAISDD6
    from pyspark.sql.column import Column, _to_java_column, _to_seq
    from pyspark.context import SparkContext

    # one of these for each type parameter Double, Int, Long, etc
    def tdigestDoubleUDAF(col, delta=0.5, maxDiscrete=0):
        sc = SparkContext._active_spark_context
        pb = sc._jvm.org.isarnproject.sketches.udaf.pythonBindings
        tdapply = pb.tdigestDoubleUDAF(delta, maxDiscrete).apply
        return Column(tdapply(_to_seq(sc, [col], _to_java_column)))
  • 162-163. #SAISDD6
    class TDigestUDT(UserDefinedType):
        @classmethod
        def sqlType(cls):
            return StructType([
                StructField("delta", DoubleType(), False),
                StructField("maxDiscrete", IntegerType(), False),
                StructField("nclusters", IntegerType(), False),
                StructField("clustX", ArrayType(DoubleType(), False), False),
                StructField("clustM", ArrayType(DoubleType(), False), False)])
        # ...
  • 164. #SAISDD6 class TDigestUDT(UserDefinedType): # ... @classmethod def module(cls): return "isarnproject.sketches.udt.tdigest" @classmethod def scalaUDT(cls): return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT" def simpleString(self): return "tdigest" class TDigestUDT(UserDefinedType): # ... @classmethod def module(cls): return "isarnproject.sketches.udt.tdigest" @classmethod def scalaUDT(cls): return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT" def simpleString(self): return "tdigest"
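These three methods are what wire the Python UDT to its Scala twin: module and scalaUDT name the classes on each side, and simpleString controls how the type is rendered. A minimal sketch of inspecting that wiring (the import path is an assumption, taken from the module() value above):

from isarnproject.sketches.udt.tdigest import TDigestUDT

udt = TDigestUDT()
print(udt.module())        # "isarnproject.sketches.udt.tdigest"
print(udt.scalaUDT())      # "org.apache.spark.isarnproject.sketches.udt.TDigestUDT"
print(udt.simpleString())  # "tdigest"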
• 167. #SAISDD6
class TDigestUDT(UserDefinedType):
    # ...
    def serialize(self, obj):
        return (obj.delta, obj.maxDiscrete, obj.nclusters,
                [float(v) for v in obj.clustX],
                [float(v) for v in obj.clustM])

    def deserialize(self, datum):
        return TDigest(datum[0], datum[1], datum[2],
                       datum[3], datum[4])
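serialize and deserialize are exercised implicitly whenever a t-digest column crosses the language boundary. Continuing the earlier hypothetical aggregation, collecting a row should hand back a Python TDigest exposing the fields named in sqlType (treating the cluster arrays as sequences is an assumption):

row = sketched.first()   # "sketched" and its "td" column come from the earlier sketch
td = row["td"]           # deserialized into a Python TDigest by TDigestUDT
print(td.delta, td.maxDiscrete, td.nclusters)
print(list(td.clustX)[:5], list(td.clustM)[:5])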
• 169. #SAISDD6
class TDigestUDT extends UserDefinedType[TDigestSQL] {
  // ...
  override def pyUDT: String =
    "isarnproject.sketches.udt.tdigest.TDigestUDT"
}
• 170. #SAISDD6 Python code in JAR files
mappings in (Compile, packageBin) ++= Seq(
  (baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") ->
    "isarnproject/__init__.pyc",
  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") ->
    "isarnproject/sketches/__init__.pyc",
  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") ->
    "isarnproject/sketches/udaf/__init__.pyc",
  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") ->
    "isarnproject/sketches/udaf/tdigest.pyc",
  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") ->
    "isarnproject/sketches/udt/__init__.pyc",
  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") ->
    "isarnproject/sketches/udt/tdigest.pyc"
)
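A quick way to confirm the mappings did their job is to list the bundled Python files inside the built JAR. This is only a sanity-check sketch; the JAR path below is an assumption and will vary with your Scala and project versions.

import zipfile

jar = "target/scala-2.11/isarn-sketches-spark_2.11-0.3.0.jar"  # hypothetical path
with zipfile.ZipFile(jar) as z:
    bundled = [n for n in z.namelist() if n.startswith("isarnproject/")]
    print("\n".join(sorted(bundled)))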
• 174. #SAISDD6 Cross-building for Python
lazy val compilePython = taskKey[Unit]("Compile python files")

compilePython := {
  val s: TaskStreams = streams.value
  s.log.info("compiling python...")
  // pythonCMD (defined elsewhere in the build) selects the interpreter,
  // so the same sources can be byte-compiled for different Python versions
  val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !)
  if (stat != 0) {
    throw new IllegalStateException("python compile failed")
  }
}

(packageBin in Compile) <<= (packageBin in Compile).dependsOn(compilePython)
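For reference, the command the task shells out to corresponds to Python's standard compileall module; a minimal standalone equivalent, run against the same python/ source directory, looks like this:

import compileall
import sys

# byte-compile everything under python/, just as the sbt task's shell-out does
ok = compileall.compile_dir("python/", quiet=1)
if not ok:
    sys.exit("python compile failed")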
• 177. #SAISDD6 Using versioned JAR files
$ pyspark --packages 'org.isarnproject:isarn-sketches-spark_2.11:0.3.0-sp2.2-py2.7'
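The version string encodes the Scala build (2.11), the Spark release (sp2.2), and the Python version (py2.7), so users choose the coordinate that matches their cluster. Once the shell starts with the package resolved, the bundled Python modules should be importable directly. A hedged sketch, assuming a DataFrame df with a grouping column "label" and a Double column "x":

from isarnproject.sketches.udaf.tdigest import tdigestDoubleUDAF

# per-group sketches: one t-digest per distinct value of the hypothetical "label" column
df.groupBy("label").agg(tdigestDoubleUDAF("x").alias("td")).show()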
• 180. Show your work: publishing results
• 181. #SAISDD6 Developing with git-flow
$ brew install git-flow     # macOS
$ dnf install git-flow      # Fedora
$ yum install git-flow      # CentOS
$ apt-get install git-flow  # Debian and friends

(Search the internet for "git flow" to learn more!)
• 182. #SAISDD6
# Set up git-flow in this repository
$ git flow init

# Start work on my-awesome-feature; create
# and switch to a feature branch
$ git flow feature start my-awesome-feature
$ ...

# Finish work on my-awesome-feature; merge
# feature/my-awesome-feature to develop
$ git flow feature finish my-awesome-feature
• 183. #SAISDD6
# Start work on a release branch
$ git flow release start 0.1.0

# Hack and bump version numbers
$ ...

# Finish work on v0.1.0; merge
# release/0.1.0 to develop and master;
# tag v0.1.0
$ git flow release finish 0.1.0
• 188. #SAISDD6
                                        Maven Central   Bintray
easy to set up for library developers   not really      trivial
easy to set up for library users        trivial         mostly
easy to publish                         yes, via sbt    yes, via sbt + plugins
easy to resolve artifacts               yes             mostly