SlideShare a Scribd company logo
1 of 63
Download to read offline
TUPLES
ALL
THE WAY
DOWN
HI
ABOUT THIS TALK
Scala
Hadoop
Cascading
Scalding
SCALA
IMPLICIT
CONVERSIONS
scala> "blah and blah"
res4: java.lang.String = blah and blah
scala> "blah and blah".urlencode()
error: value urlencode is not a member of java.lang.String
class WrappedString(val x:String) {
def urlencode() = URLEncoder.encode(x)
}
implicit def stringToWrappedString(x:String): WrappedString =
new WrappedString(x)
scala> "blah and blah".urlencode()
res2: java.lang.String = blah+and+blah!
scala> stringToWrappedString("blah and blah").urlencode()
res2: java.lang.String = blah+and+blah!
IMPLICIT PARAMETERS
def emailSignature(name:String)(implicit organization:String) =
"Sincerely Yours,\n" + name + "\n" + organization
scala> implicit val organization = "Etsy, Inc."
organization: java.lang.String = Etsy, Inc.
scala> emailSignature("Giovanni Fernandez-Kincade")
res3: java.lang.String =
Sincerely Yours,
Giovanni Fernandez-Kincade
Etsy, Inc.
WORD COUNT
FIRST CHARACTER
COUNT
val words = List("one", "two", "three")
words
.map {
word =>
Map(word.substring(0,1) -> 1)
}
.reduce {
(a, b) =>
a ++ b.map {
mapTuple =>
val (key, count) = mapTuple
(key, a.getOrElse(key, 0) + count)
}
}
val words = List("one", "two", "three")
words
.map {
word =>
Map(word.substring(0,1) -> 1)
}
List(Map(o -> 1), Map(t -> 1), Map(t -> 1))
List(Map(o -> 1), Map(t -> 1), Map(t -> 1))
.reduce {
(a, b) =>
a ++ b.map {
mapTuple =>
val (key, count) = mapTuple
(key, a.getOrElse(key, 0) + count)
}
}
Map(o -> 1, t -> 2)
HADOOP
~> echo "one\ntwo\nthree" > ~/foo.txt
~> hadoop fs -put ~/foo.txt
SORT OF
void map(K1 key, V1 value, OutputCollector<K2,V2> output)
void reduce(K2 key, Iterator<V2> values,
OutputCollector<K3,V3> output)
CASCADING
cascading.tuple.Tuple
val tuple = new Tuple(
1.asInstanceOf[Object],
"gigi".asInstanceOf[Object],
"miami".asInstanceOf[Object]
)
val fields = new Fields(
"user_id",
"user_name",
"location"
)
val one = new TupleEntry(
fields,
tuple
)
> one.get("user_id")
res4: java.lang.Comparable[_] = 1
> one.get("user_name")
res5: java.lang.Comparable[_] = gigi
FIRST CHARACTER
COUNT 2.0
def cascadingTuple[T](fieldName:String, entry:T): TupleEntry =
new TupleEntry(
new Fields(fieldName),
new Tuple(entry.asInstanceOf[Object])
)
val words = List(
cascadingTuple("word", "one"),
cascadingTuple("word", "two"),
cascadingTuple("word", "three")
)
words
.map {
tuple =>
cascadingTuple(
"map",
Map(tuple.getString("word").substring(0,1) -> 1)
)
}
.reduce {
(tupleA, tupleB) =>
val (a,b) = (
tupleA.getObject("map").asInstanceOf[Map[String,Int]],
tupleB.getObject("map").asInstanceOf[Map[String,Int]],
)
val result = a ++ b.map {
mapTuple =>
val (key, count) = mapTuple
(key, a.getOrElse(key, 0) + count)
}
cascadingTuple("map", result)
}
> run
fields: ['map'] tuple: ['Map(o -> 1, t -> 2)']
FIRST CHARACTER
COUNT 1.2
List("one", "two", "three")
.groupBy(_.substring(0,1))
res1: Map(o -> List(one), t -> List(two, three))
List("one", "two", "three")
.groupBy(_.substring(0,1))
.map {
(tuple) =>
val (key, value) = tuple
(key, value.size)
}
res1: Map(o -> 1, t -> 2)
SCALDING
import com.twitter.scalding._
class ExampleJob(args: Args)
extends Job(args) {
TextLine("data/words.txt")
.map('line -> 'first_character) {
(line:String) =>
line.substring(0,1)
}
.groupBy('first_character) {
(g: GroupBuilder) =>
g.size('size)
}
.write(Tsv("data/output/characters.tsv"))
}
class ExampleJob(args: Args)
extends Job(args) {
TextLine("data/words.txt")
fields: ['line'] tuple: ['one']
fields: ['line'] tuple: ['two']
fields: ['line'] tuple: ['three']
.map('line -> 'first_character) {
(line:String) =>
line.substring(0,1)
}
fields: ['line', 'first_character'] tuple: ['one', 'o']
fields: ['line', 'first_character'] tuple: ['two', 't']
fields: ['line', 'first_character'] tuple: ['three', 't']
.groupBy('first_character) {
(g: GroupBuilder) =>
g.size('size)
}
fields: ['first_character', 'size'] tuple: ['o', 1]
fields: ['first_character', 'size'] tuple: ['t', 2]
.groupBy('first_character) {
(g: GroupBuilder) =>
g.size('size)
}
fields: ['first_character', 'size'] tuple: ['o', '1']
fields: ['first_character', 'size'] tuple: ['t', '2']
.write(Tsv("data/output/characters.tsv"))
o 1
t 2
BEHIND THE SCENES
TextLine("data/words.txt")
.map('line -> 'first_character) {
HUH?
class Job(val args: Args)
extends FieldConversions
with java.io.Serializable {
implicit def pipeToRichPipe(pipe: Pipe): RichPipe =
new RichPipe(pipe)
implicit def sourceToRichPipe(src: Source): RichPipe =
new RichPipe(src.read)
def map[A, T](fs: (Fields, Fields))
(fn: A => T)
(implicit conv: TupleConverter[A],
setter: TupleSetter[T]): Pipe
.map('line -> 'first_character) {
def map[A, T](fs: (Fields, Fields))
implicit def symbolToFields(x: Symbol) = {
if (x == '*) {
Fields.ALL
} else {
new Fields(x.name)
}
}
.map('line -> 'first_character) {
(line:String) =>
line.substring(0,1)
}
def map[A, T](fs: (Fields, Fields))
(fn: A => T)
implicit def tuple1Converter[A](implicit gA: TupleGetter[A]): TupleConverter[Tuple1[A]]
implicit object StringGetter extends TupleGetter[String] {
override def get(tup: CTuple, i: Int) = tup.getString(i)
}
.map('line -> 'first_character) {
(line:String) =>
line.substring(0,1)
}
def map[A, T](fs: (Fields, Fields))
(fn: A => T)
(implicit conv: TupleConverter[A],
setter: TupleSetter[T]): Pipe
implicit def singleSetter[A]: TupleSetter[A] = new TupleSetter[A] {
override def apply(arg: A) = {
val tup = CTuple.size(1)
tup.set(0, arg)
tup
}
WHAT HAPPENS WHEN
YOU RUN IT?
> runMain com.twitter.scalding.Tool
com.giokincade.scalding.ExampleJob --local
hadoop jar foo.jar com.giokincade.scalding.ExampleJob --hdfs
COMPILE TIME
COMPOSITION TIME
JOB RUN TIME
TYPED PIPE
TypedPipe.from(
TextLine("data/words.txt")
)
.map(line => line.substring(0,1))
.groupBy(character => character)
.size
.write(
TypedTsv[(String, Long)]("data/output/typed-characters.tsv")
)
RESOURCES
github.com/giokincade/scalding-talk-examples
github.com/twitter/scalding
MEME
gio@etsy.com
@giokincade
FIN

More Related Content

What's hot

JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...
JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...
JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...
PROIDEA
 

What's hot (18)

学生向けScalaハンズオンテキスト part2
学生向けScalaハンズオンテキスト part2学生向けScalaハンズオンテキスト part2
学生向けScalaハンズオンテキスト part2
 
学生向けScalaハンズオンテキスト
学生向けScalaハンズオンテキスト学生向けScalaハンズオンテキスト
学生向けScalaハンズオンテキスト
 
Meet scala
Meet scalaMeet scala
Meet scala
 
Stata cheatsheet transformation
Stata cheatsheet transformationStata cheatsheet transformation
Stata cheatsheet transformation
 
Stata cheat sheet: data transformation
Stata  cheat sheet: data transformationStata  cheat sheet: data transformation
Stata cheat sheet: data transformation
 
The Ring programming language version 1.5.1 book - Part 34 of 180
The Ring programming language version 1.5.1 book - Part 34 of 180The Ring programming language version 1.5.1 book - Part 34 of 180
The Ring programming language version 1.5.1 book - Part 34 of 180
 
Stata cheat sheet: data processing
Stata cheat sheet: data processingStata cheat sheet: data processing
Stata cheat sheet: data processing
 
JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...
JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...
JDD2015: Functional programing and Event Sourcing - a pair made in heaven - e...
 
Patterns in Terraform 12+13: Data, Transformations and Resources
Patterns in Terraform 12+13: Data, Transformations and ResourcesPatterns in Terraform 12+13: Data, Transformations and Resources
Patterns in Terraform 12+13: Data, Transformations and Resources
 
Purely Functional Data Structures in Scala
Purely Functional Data Structures in ScalaPurely Functional Data Structures in Scala
Purely Functional Data Structures in Scala
 
The Ring programming language version 1.10 book - Part 30 of 212
The Ring programming language version 1.10 book - Part 30 of 212The Ring programming language version 1.10 book - Part 30 of 212
The Ring programming language version 1.10 book - Part 30 of 212
 
Functional programming with_scala
Functional programming with_scalaFunctional programming with_scala
Functional programming with_scala
 
Switching from java to groovy
Switching from java to groovySwitching from java to groovy
Switching from java to groovy
 
PHP and MySQL Tips and tricks, DC 2007
PHP and MySQL Tips and tricks, DC 2007PHP and MySQL Tips and tricks, DC 2007
PHP and MySQL Tips and tricks, DC 2007
 
Python programming : List and tuples
Python programming : List and tuplesPython programming : List and tuples
Python programming : List and tuples
 
The Ring programming language version 1.9 book - Part 44 of 210
The Ring programming language version 1.9 book - Part 44 of 210The Ring programming language version 1.9 book - Part 44 of 210
The Ring programming language version 1.9 book - Part 44 of 210
 
Dictionary in python
Dictionary in pythonDictionary in python
Dictionary in python
 
Data manipulation on r
Data manipulation on rData manipulation on r
Data manipulation on r
 

Similar to Tuples All the Way Down

(How) can we benefit from adopting scala?
(How) can we benefit from adopting scala?(How) can we benefit from adopting scala?
(How) can we benefit from adopting scala?
Tomasz Wrobel
 
Scala presentation by Aleksandar Prokopec
Scala presentation by Aleksandar ProkopecScala presentation by Aleksandar Prokopec
Scala presentation by Aleksandar Prokopec
Loïc Descotte
 
Pragmatic Real-World Scala
Pragmatic Real-World ScalaPragmatic Real-World Scala
Pragmatic Real-World Scala
parag978978
 
Refactoring to Macros with Clojure
Refactoring to Macros with ClojureRefactoring to Macros with Clojure
Refactoring to Macros with Clojure
Dmitry Buzdin
 

Similar to Tuples All the Way Down (20)

A bit about Scala
A bit about ScalaA bit about Scala
A bit about Scala
 
(How) can we benefit from adopting scala?
(How) can we benefit from adopting scala?(How) can we benefit from adopting scala?
(How) can we benefit from adopting scala?
 
Spark_Documentation_Template1
Spark_Documentation_Template1Spark_Documentation_Template1
Spark_Documentation_Template1
 
Spark workshop
Spark workshopSpark workshop
Spark workshop
 
Scala presentation by Aleksandar Prokopec
Scala presentation by Aleksandar ProkopecScala presentation by Aleksandar Prokopec
Scala presentation by Aleksandar Prokopec
 
Scala
ScalaScala
Scala
 
Scala in Places API
Scala in Places APIScala in Places API
Scala in Places API
 
Real Time Big Data Management
Real Time Big Data ManagementReal Time Big Data Management
Real Time Big Data Management
 
Modern technologies in data science
Modern technologies in data science Modern technologies in data science
Modern technologies in data science
 
Introduction to Scalding and Monoids
Introduction to Scalding and MonoidsIntroduction to Scalding and Monoids
Introduction to Scalding and Monoids
 
Pragmatic Real-World Scala
Pragmatic Real-World ScalaPragmatic Real-World Scala
Pragmatic Real-World Scala
 
Scala training workshop 02
Scala training workshop 02Scala training workshop 02
Scala training workshop 02
 
SparkSQLの構文解析
SparkSQLの構文解析SparkSQLの構文解析
SparkSQLの構文解析
 
Scalding - Hadoop Word Count in LESS than 70 lines of code
Scalding - Hadoop Word Count in LESS than 70 lines of codeScalding - Hadoop Word Count in LESS than 70 lines of code
Scalding - Hadoop Word Count in LESS than 70 lines of code
 
Introduction to parallel and distributed computation with spark
Introduction to parallel and distributed computation with sparkIntroduction to parallel and distributed computation with spark
Introduction to parallel and distributed computation with spark
 
Introduction to Scala
Introduction to ScalaIntroduction to Scala
Introduction to Scala
 
Hello scala
Hello scalaHello scala
Hello scala
 
Refactoring to Macros with Clojure
Refactoring to Macros with ClojureRefactoring to Macros with Clojure
Refactoring to Macros with Clojure
 
User Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love StoryUser Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love Story
 
User Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love StoryUser Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love Story
 

Recently uploaded

Why Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire businessWhy Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire business
panagenda
 

Recently uploaded (20)

Exploring Multimodal Embeddings with Milvus
Exploring Multimodal Embeddings with MilvusExploring Multimodal Embeddings with Milvus
Exploring Multimodal Embeddings with Milvus
 
TrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
TrustArc Webinar - Unlock the Power of AI-Driven Data DiscoveryTrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
TrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
 
"I see eyes in my soup": How Delivery Hero implemented the safety system for ...
"I see eyes in my soup": How Delivery Hero implemented the safety system for ..."I see eyes in my soup": How Delivery Hero implemented the safety system for ...
"I see eyes in my soup": How Delivery Hero implemented the safety system for ...
 
Corporate and higher education May webinar.pptx
Corporate and higher education May webinar.pptxCorporate and higher education May webinar.pptx
Corporate and higher education May webinar.pptx
 
Navigating Identity and Access Management in the Modern Enterprise
Navigating Identity and Access Management in the Modern EnterpriseNavigating Identity and Access Management in the Modern Enterprise
Navigating Identity and Access Management in the Modern Enterprise
 
How to Check CNIC Information Online with Pakdata cf
How to Check CNIC Information Online with Pakdata cfHow to Check CNIC Information Online with Pakdata cf
How to Check CNIC Information Online with Pakdata cf
 
Navigating the Deluge_ Dubai Floods and the Resilience of Dubai International...
Navigating the Deluge_ Dubai Floods and the Resilience of Dubai International...Navigating the Deluge_ Dubai Floods and the Resilience of Dubai International...
Navigating the Deluge_ Dubai Floods and the Resilience of Dubai International...
 
Design and Development of a Provenance Capture Platform for Data Science
Design and Development of a Provenance Capture Platform for Data ScienceDesign and Development of a Provenance Capture Platform for Data Science
Design and Development of a Provenance Capture Platform for Data Science
 
Understanding the FAA Part 107 License ..
Understanding the FAA Part 107 License ..Understanding the FAA Part 107 License ..
Understanding the FAA Part 107 License ..
 
Choreo: Empowering the Future of Enterprise Software Engineering
Choreo: Empowering the Future of Enterprise Software EngineeringChoreo: Empowering the Future of Enterprise Software Engineering
Choreo: Empowering the Future of Enterprise Software Engineering
 
Web Form Automation for Bonterra Impact Management (fka Social Solutions Apri...
Web Form Automation for Bonterra Impact Management (fka Social Solutions Apri...Web Form Automation for Bonterra Impact Management (fka Social Solutions Apri...
Web Form Automation for Bonterra Impact Management (fka Social Solutions Apri...
 
Simplifying Mobile A11y Presentation.pptx
Simplifying Mobile A11y Presentation.pptxSimplifying Mobile A11y Presentation.pptx
Simplifying Mobile A11y Presentation.pptx
 
AI+A11Y 11MAY2024 HYDERBAD GAAD 2024 - HelloA11Y (11 May 2024)
AI+A11Y 11MAY2024 HYDERBAD GAAD 2024 - HelloA11Y (11 May 2024)AI+A11Y 11MAY2024 HYDERBAD GAAD 2024 - HelloA11Y (11 May 2024)
AI+A11Y 11MAY2024 HYDERBAD GAAD 2024 - HelloA11Y (11 May 2024)
 
Quantum Leap in Next-Generation Computing
Quantum Leap in Next-Generation ComputingQuantum Leap in Next-Generation Computing
Quantum Leap in Next-Generation Computing
 
DBX First Quarter 2024 Investor Presentation
DBX First Quarter 2024 Investor PresentationDBX First Quarter 2024 Investor Presentation
DBX First Quarter 2024 Investor Presentation
 
[BuildWithAI] Introduction to Gemini.pdf
[BuildWithAI] Introduction to Gemini.pdf[BuildWithAI] Introduction to Gemini.pdf
[BuildWithAI] Introduction to Gemini.pdf
 
Polkadot JAM Slides - Token2049 - By Dr. Gavin Wood
Polkadot JAM Slides - Token2049 - By Dr. Gavin WoodPolkadot JAM Slides - Token2049 - By Dr. Gavin Wood
Polkadot JAM Slides - Token2049 - By Dr. Gavin Wood
 
Less Is More: Utilizing Ballerina to Architect a Cloud Data Platform
Less Is More: Utilizing Ballerina to Architect a Cloud Data PlatformLess Is More: Utilizing Ballerina to Architect a Cloud Data Platform
Less Is More: Utilizing Ballerina to Architect a Cloud Data Platform
 
Why Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire businessWhy Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire business
 
DEV meet-up UiPath Document Understanding May 7 2024 Amsterdam
DEV meet-up UiPath Document Understanding May 7 2024 AmsterdamDEV meet-up UiPath Document Understanding May 7 2024 Amsterdam
DEV meet-up UiPath Document Understanding May 7 2024 Amsterdam
 

Tuples All the Way Down