Successfully reported this slideshow.
We use your LinkedIn profile and activity data to personalize ads and to show you more relevant ads. You can change your ad preferences anytime.
TUPLES
ALL
THE WAY
DOWN
HI
ABOUT THIS TALK
Scala
Hadoop
Cascading
Scalding
SCALA
IMPLICIT
CONVERSIONS
scala> "blah and blah"
res4: java.lang.String = blah and blah
scala> "blah and blah".urlencode()
error: value urlencode is...
class WrappedString(val x:String) {
def urlencode() = URLEncoder.encode(x)
}
implicit def stringToWrappedString(x:String): WrappedString =
new WrappedString(x)
scala> "blah and blah".urlencode()
res2: java.lang.String = blah+and+blah!
scala> stringToWrappedString("blah and blah").u...
IMPLICIT PARAMETERS
def emailSignature(name:String)(implicit organization:String) =
"Sincerely Yours,\n" + name + "\n" + organization
scala> imp...
WORD COUNT
FIRST CHARACTER
COUNT
val words = List("one", "two", "three")
words
.map {
word =>
Map(word.substring(0,1) -> 1)
}
.reduce {
(a, b) =>
a ++ b.ma...
val words = List("one", "two", "three")
words
.map {
word =>
Map(word.substring(0,1) -> 1)
}
List(Map(o -> 1), Map(t -> 1)...
List(Map(o -> 1), Map(t -> 1), Map(t -> 1))
.reduce {
(a, b) =>
a ++ b.map {
mapTuple =>
val (key, count) = mapTuple
(key,...
HADOOP
~> echo "one\ntwo\nthree" > ~/foo.txt
~> hadoop fs -put ~/foo.txt
SORT OF
void map(K1 key, V1 value, OutputCollector<K2,V2> output)
void reduce(K2 key, Iterator<V2> values,
OutputCo...
CASCADING
cascading.tuple.Tuple
val tuple = new Tuple(
1.asInstanceOf[Object],
"gigi".asInstanceOf[Object],
"miami".asInstanceOf[Object]
)
val fields = ne...
> one.get("user_id")
res4: java.lang.Comparable[_] = 1
> one.get("user_name")
res5: java.lang.Comparable[_] = gigi
FIRST CHARACTER
COUNT 2.0
def cascadingTuple[T](fieldName:String, entry:T): TupleEntry =
new TupleEntry(
new Fields(fieldName),
new Tuple(entry.asIn...
val words = List(
cascadingTuple("word", "one"),
cascadingTuple("word", "two"),
cascadingTuple("word", "three")
)
words
.map {
tuple =>
cascadingTuple(
"map",
Map(tuple.getString("word").substring(0,1) -> 1)
)
}
.reduce {
(tupleA, tupleB) =>
val (a,b) = (
tupleA.getObject("map").asInstanceOf[Map[String,Int]],
tupleB.getObject("map")...
> run
fields: ['map'] tuple: ['Map(o -> 1, t -> 2)']
FIRST CHARACTER
COUNT 1.2
List("one", "two", "three")
.groupBy(_.substring(0,1))
res1: Map(o -> List(one), t -> List(two, three))
List("one", "two", "three")
.groupBy(_.substring(0,1))
.map {
(tuple) =>
val (key, value) = tuple
(key, value.size)
}
res1...
SCALDING
import com.twitter.scalding._
class ExampleJob(args: Args)
extends Job(args) {
TextLine("data/words.txt")
.map('line -> 'f...
class ExampleJob(args: Args)
extends Job(args) {
TextLine("data/words.txt")
fields: ['line'] tuple: ['one']
fields: ['line'] tuple: ['two']
fields: ['line'] tuple: ['three...
.map('line -> 'first_character) {
(line:String) =>
line.substring(0,1)
}
fields: ['line', 'first_character'] tuple: ['one'...
.groupBy('first_character) {
(g: GroupBuilder) =>
g.size('size)
}
fields: ['first_character', 'size'] tuple: ['o', 1]
fiel...
.groupBy('first_character) {
(g: GroupBuilder) =>
g.size('size)
}
fields: ['first_character', 'size'] tuple: ['o', '1']
fi...
BEHIND THE SCENES
TextLine("data/words.txt")
.map('line -> 'first_character) {
HUH?
class Job(val args: Args)
extends FieldConversions
with java.io.Serializable {
implicit def pipeToRichPipe(pipe: Pipe): Ri...
def map[A, T](fs: (Fields, Fields))
(fn: A => T)
(implicit conv: TupleConverter[A],
setter: TupleSetter[T]): Pipe
.map('line -> 'first_character) {
def map[A, T](fs: (Fields, Fields))
implicit def symbolToFields(x: Symbol) = {
if (x == ...
.map('line -> 'first_character) {
(line:String) =>
line.substring(0,1)
}
def map[A, T](fs: (Fields, Fields))
(fn: A => T)
implicit def tuple1Converter[A](implicit gA: TupleGetter[A]): TupleCo
implicit object StringGetter extends TupleGetter[Str...
WHAT HAPPENS WHEN
YOU RUN IT?
> runMain com.twitter.scalding.Tool
com.giokincade.scalding.Exam...
COMPILE TIME
COMPOSITION TIME
JOB RUN TIME
TYPED PIPE
TypedPipe.from(
TextLine("data/words.txt")
)
.map(line => line.substring(0,1))
.groupBy(character => character)
.size
.wri...
RESOURCES
github.com/giokincade/scalding-talk-examples
github.com/twitter/scalding
MEME
gio@etsy.com
@giokincade
FIN
Tuples All the Way Down
Tuples All the Way Down
Tuples All the Way Down
Tuples All the Way Down
Tuples All the Way Down
Tuples All the Way Down
Tuples All the Way Down
Tuples All the Way Down
Upcoming SlideShare
Loading in …5
×

Tuples All the Way Down

29 views

Published on

An introductory course in Scalding, which I gave at a Scala meetup.

Published in: Technology
  • Be the first to comment

  • Be the first to like this

Tuples All the Way Down

  1. 1. TUPLES ALL THE WAY DOWN
  2. 2. HI
  3. 3. ABOUT THIS TALK Scala Hadoop Cascading Scalding
  4. 4. SCALA
  5. 5. IMPLICIT CONVERSIONS
  6. 6. scala> "blah and blah" res4: java.lang.String = blah and blah scala> "blah and blah".urlencode() error: value urlencode is not a member of java.lang.String
  7. 7. class WrappedString(val x:String) { def urlencode() = URLEncoder.encode(x) }
  8. 8. implicit def stringToWrappedString(x:String): WrappedString = new WrappedString(x)
  9. 9. scala> "blah and blah".urlencode() res2: java.lang.String = blah+and+blah! scala> stringToWrappedString("blah and blah").urlencode() res2: java.lang.String = blah+and+blah!
  10. 10. IMPLICIT PARAMETERS
  11. 11. def emailSignature(name:String)(implicit organization:String) = "Sincerely Yours,\n" + name + "\n" + organization scala> implicit val organization = "Etsy, Inc." organization: java.lang.String = Etsy, Inc. scala> emailSignature("Giovanni Fernandez-Kincade") res3: java.lang.String = Sincerely Yours, Giovanni Fernandez-Kincade Etsy, Inc.
  12. 12. WORD COUNT FIRST CHARACTER COUNT
  13. 13. val words = List("one", "two", "three") words .map { word => Map(word.substring(0,1) -> 1) } .reduce { (a, b) => a ++ b.map { mapTuple => val (key, count) = mapTuple (key, a.getOrElse(key, 0) + count) } }
  14. 14. val words = List("one", "two", "three") words .map { word => Map(word.substring(0,1) -> 1) } List(Map(o -> 1), Map(t -> 1), Map(t -> 1))
  15. 15. List(Map(o -> 1), Map(t -> 1), Map(t -> 1)) .reduce { (a, b) => a ++ b.map { mapTuple => val (key, count) = mapTuple (key, a.getOrElse(key, 0) + count) } } Map(o -> 1, t -> 2)
  16. 16. HADOOP
  17. 17. ~> echo "one\ntwo\nthree" > ~/foo.txt ~> hadoop fs -put ~/foo.txt
  18. 18. SORT OF void map(K1 key, V1 value, OutputCollector<K2,V2> output) void reduce(K2 key, Iterator<V2> values, OutputCollector<K3,V3> output)
  19. 19. CASCADING
  20. 20. cascading.tuple.Tuple
  21. 21. val tuple = new Tuple( 1.asInstanceOf[Object], "gigi".asInstanceOf[Object], "miami".asInstanceOf[Object] ) val fields = new Fields( "user_id", "user_name", "location" ) val one = new TupleEntry( fields, tuple )
  22. 22. > one.get("user_id") res4: java.lang.Comparable[_] = 1 > one.get("user_name") res5: java.lang.Comparable[_] = gigi
  23. 23. FIRST CHARACTER COUNT 2.0
  24. 24. def cascadingTuple[T](fieldName:String, entry:T): TupleEntry = new TupleEntry( new Fields(fieldName), new Tuple(entry.asInstanceOf[Object]) )
  25. 25. val words = List( cascadingTuple("word", "one"), cascadingTuple("word", "two"), cascadingTuple("word", "three") )
  26. 26. words .map { tuple => cascadingTuple( "map", Map(tuple.getString("word").substring(0,1) -> 1) ) }
  27. 27. .reduce { (tupleA, tupleB) => val (a,b) = ( tupleA.getObject("map").asInstanceOf[Map[String,Int]], tupleB.getObject("map").asInstanceOf[Map[String,Int]] ) val result = a ++ b.map { mapTuple => val (key, count) = mapTuple (key, a.getOrElse(key, 0) + count) } cascadingTuple("map", result) }
  28. 28. > run fields: ['map'] tuple: ['Map(o -> 1, t -> 2)']
  29. 29. FIRST CHARACTER COUNT 1.2
  30. 30. List("one", "two", "three") .groupBy(_.substring(0,1)) res1: Map(o -> List(one), t -> List(two, three))
  31. 31. List("one", "two", "three") .groupBy(_.substring(0,1)) .map { (tuple) => val (key, value) = tuple (key, value.size) } res1: Map(o -> 1, t -> 2)
  32. 32. SCALDING
  33. 33. import com.twitter.scalding._ class ExampleJob(args: Args) extends Job(args) { TextLine("data/words.txt") .map('line -> 'first_character) { (line:String) => line.substring(0,1) } .groupBy('first_character) { (g: GroupBuilder) => g.size('size) } .write(Tsv("data/output/characters.tsv")) }
  34. 34. class ExampleJob(args: Args) extends Job(args) {
  35. 35. TextLine("data/words.txt") fields: ['line'] tuple: ['one'] fields: ['line'] tuple: ['two'] fields: ['line'] tuple: ['three']
  36. 36. .map('line -> 'first_character) { (line:String) => line.substring(0,1) } fields: ['line', 'first_character'] tuple: ['one', 'o'] fields: ['line', 'first_character'] tuple: ['two', 't'] fields: ['line', 'first_character'] tuple: ['three', 't']
  37. 37. .groupBy('first_character) { (g: GroupBuilder) => g.size('size) } fields: ['first_character', 'size'] tuple: ['o', 1] fields: ['first_character', 'size'] tuple: ['t', 2]
  38. 38. .groupBy('first_character) { (g: GroupBuilder) => g.size('size) } fields: ['first_character', 'size'] tuple: ['o', '1'] fields: ['first_character', 'size'] tuple: ['t', '2'] .write(Tsv("data/output/characters.tsv")) o 1 t 2
  39. 39. BEHIND THE SCENES
  40. 40. TextLine("data/words.txt") .map('line -> 'first_character) {
  41. 41. HUH?
  42. 42. class Job(val args: Args) extends FieldConversions with java.io.Serializable { implicit def pipeToRichPipe(pipe: Pipe): RichPipe = new RichPipe(pipe) implicit def sourceToRichPipe(src: Source): RichPipe = new RichPipe(src.read)
  43. 43. def map[A, T](fs: (Fields, Fields)) (fn: A => T) (implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe
  44. 44. .map('line -> 'first_character) { def map[A, T](fs: (Fields, Fields)) implicit def symbolToFields(x: Symbol) = { if (x == '*) { Fields.ALL } else { new Fields(x.name) } }
  45. 45. .map('line -> 'first_character) { (line:String) => line.substring(0,1) } def map[A, T](fs: (Fields, Fields)) (fn: A => T)
  46. 46. implicit def tuple1Converter[A](implicit gA: TupleGetter[A]): TupleCo implicit object StringGetter extends TupleGetter[String] { override def get(tup: CTuple, i: Int) = tup.getString(i) } .map('line -> 'first_character) { (line:String) => line.substring(0,1) } def map[A, T](fs: (Fields, Fields)) (fn: A => T) (implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe implicit def singleSetter[A]: TupleSetter[A] = new TupleSetter[A] { override def apply(arg: A) = { val tup = CTuple.size(1) tup.set(0, arg) tup }
  47. 47. WHAT HAPPENS WHEN YOU RUN IT? > runMain com.twitter.scalding.Tool com.giokincade.scalding.ExampleJob --local hadoop jar foo.jar com.giokincade.scalding.ExampleJob --hdfs
  48. 48. COMPILE TIME
  49. 49. COMPOSITION TIME
  50. 50. JOB RUN TIME
  51. 51. TYPED PIPE
  52. 52. TypedPipe.from( TextLine("data/words.txt") ) .map(line => line.substring(0,1)) .groupBy(character => character) .size .write( TypedTsv[(String, Long)]("data/output/typed-characters.tsv") )
  53. 53. RESOURCES github.com/giokincade/scalding-talk-examples github.com/twitter/scalding
  54. 54. MEME gio@etsy.com @giokincade
  55. 55. FIN

×