Successfully reported this slideshow.
We use your LinkedIn profile and activity data to personalize ads and to show you more relevant ads. You can change your ad preferences anytime.

Tuples All the Way Down

24 views

Published on

An introductory course in Scalding, which I gave at a Scala meetup.

Published in: Technology
  • Be the first to comment

  • Be the first to like this

Tuples All the Way Down

  1. 1. TUPLES ALL THE WAY DOWN
  2. 2. HI
  3. 3. ABOUT THIS TALK Scala Hadoop Cascading Scalding
  4. 4. SCALA
  5. 5. IMPLICIT CONVERSIONS
  6. 6. scala> "blah and blah" res4: java.lang.String = blah and blah scala> "blah and blah".urlencode() error: value urlencode is not a member of java.lang.String
  7. 7. class WrappedString(val x:String) { def urlencode() = URLEncoder.encode(x) }
  8. 8. implicit def stringToWrappedString(x:String): WrappedString = new WrappedString(x)
  9. 9. scala> "blah and blah".urlencode() res2: java.lang.String = blah+and+blah! scala> stringToWrappedString("blah and blah").urlencode() res2: java.lang.String = blah+and+blah!
  10. 10. IMPLICIT PARAMETERS
  11. 11. def emailSignature(name:String)(implicit organization:String) = "Sincerely Yours,\n" + name + "\n" + organization scala> implicit val organization = "Etsy, Inc." organization: java.lang.String = Etsy, Inc. scala> emailSignature("Giovanni Fernandez-Kincade") res3: java.lang.String = Sincerely Yours, Giovanni Fernandez-Kincade Etsy, Inc.
  12. 12. WORD COUNT FIRST CHARACTER COUNT
  13. 13. val words = List("one", "two", "three") words .map { word => Map(word.substring(0,1) -> 1) } .reduce { (a, b) => a ++ b.map { mapTuple => val (key, count) = mapTuple (key, a.getOrElse(key, 0) + count) } }
  14. 14. val words = List("one", "two", "three") words .map { word => Map(word.substring(0,1) -> 1) } List(Map(o -> 1), Map(t -> 1), Map(t -> 1))
  15. 15. List(Map(o -> 1), Map(t -> 1), Map(t -> 1)) .reduce { (a, b) => a ++ b.map { mapTuple => val (key, count) = mapTuple (key, a.getOrElse(key, 0) + count) } } Map(o -> 1, t -> 2)
  16. 16. HADOOP
  17. 17. ~> echo "one\ntwo\nthree" > ~/foo.txt ~> hadoop fs -put ~/foo.txt
  18. 18. SORT OF void map(K1 key, V1 value, OutputCollector<K2,V2> output) void reduce(K2 key, Iterator<V2> values, OutputCollector<K3,V3> output)
  19. 19. CASCADING
  20. 20. cascading.tuple.Tuple
  21. 21. val tuple = new Tuple( 1.asInstanceOf[Object], "gigi".asInstanceOf[Object], "miami".asInstanceOf[Object] ) val fields = new Fields( "user_id", "user_name", "location" ) val one = new TupleEntry( fields, tuple )
  22. 22. > one.get("user_id") res4: java.lang.Comparable[_] = 1 > one.get("user_name") res5: java.lang.Comparable[_] = gigi
  23. 23. FIRST CHARACTER COUNT 2.0
  24. 24. def cascadingTuple[T](fieldName:String, entry:T): TupleEntry = new TupleEntry( new Fields(fieldName), new Tuple(entry.asInstanceOf[Object]) )
  25. 25. val words = List( cascadingTuple("word", "one"), cascadingTuple("word", "two"), cascadingTuple("word", "three") )
  26. 26. words .map { tuple => cascadingTuple( "map", Map(tuple.getString("word").substring(0,1) -> 1) ) }
  27. 27. .reduce { (tupleA, tupleB) => val (a,b) = ( tupleA.getObject("map").asInstanceOf[Map[String,Int]], tupleB.getObject("map").asInstanceOf[Map[String,Int]], ) val result = a ++ b.map { mapTuple => val (key, count) = mapTuple (key, a.getOrElse(key, 0) + count) } cascadingTuple("map", result) }
  28. 28. > run fields: ['map'] tuple: ['Map(o -> 1, t -> 2)']
  29. 29. FIRST CHARACTER COUNT 1.2
  30. 30. List("one", "two", "three") .groupBy(_.substring(0,1)) res1: Map(o -> List(one), t -> List(two, three))
  31. 31. List("one", "two", "three") .groupBy(_.substring(0,1)) .map { (tuple) => val (key, value) = tuple (key, value.size) } res1: Map(o -> 1, t -> 2)
  32. 32. SCALDING
  33. 33. import com.twitter.scalding._ class ExampleJob(args: Args) extends Job(args) { TextLine("data/words.txt") .map('line -> 'first_character) { (line:String) => line.substring(0,1) } .groupBy('first_character) { (g: GroupBuilder) => g.size('size) } .write(Tsv("data/output/characters.tsv")) }
  34. 34. class ExampleJob(args: Args) extends Job(args) {
  35. 35. TextLine("data/words.txt") fields: ['line'] tuple: ['one'] fields: ['line'] tuple: ['two'] fields: ['line'] tuple: ['three']
  36. 36. .map('line -> 'first_character) { (line:String) => line.substring(0,1) } fields: ['line', 'first_character'] tuple: ['one', 'o'] fields: ['line', 'first_character'] tuple: ['two', 't'] fields: ['line', 'first_character'] tuple: ['three', 't']
  37. 37. .groupBy('first_character) { (g: GroupBuilder) => g.size('size) } fields: ['first_character', 'size'] tuple: ['o', 1] fields: ['first_character', 'size'] tuple: ['t', 2]
  38. 38. .groupBy('first_character) { (g: GroupBuilder) => g.size('size) } fields: ['first_character', 'size'] tuple: ['o', 1] fields: ['first_character', 'size'] tuple: ['t', 2] .write(Tsv("data/output/characters.tsv")) o 1 t 2
  39. 39. BEHIND THE SCENES
  40. 40. TextLine("data/words.txt") .map('line -> 'first_character) {
  41. 41. HUH?
  42. 42. class Job(val args: Args) extends FieldConversions with java.io.Serializable { implicit def pipeToRichPipe(pipe: Pipe): RichPipe = new RichPipe(pipe) implicit def sourceToRichPipe(src: Source): RichPipe = new RichPipe(src.read)
  43. 43. def map[A, T](fs: (Fields, Fields)) (fn: A => T) (implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe
  44. 44. .map('line -> 'first_character) { def map[A, T](fs: (Fields, Fields)) implicit def symbolToFields(x: Symbol) = { if (x == '*) { Fields.ALL } else { new Fields(x.name) } }
  45. 45. .map('line -> 'first_character) { (line:String) => line.substring(0,1) } def map[A, T](fs: (Fields, Fields)) (fn: A => T)
  46. 46. implicit def tuple1Converter[A](implicit gA: TupleGetter[A]): TupleCo implicit object StringGetter extends TupleGetter[String] { override def get(tup: CTuple, i: Int) = tup.getString(i) } .map('line -> 'first_character) { (line:String) => line.substring(0,1) } def map[A, T](fs: (Fields, Fields)) (fn: A => T) (implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe implicit def singleSetter[A]: TupleSetter[A] = new TupleSetter[A] { override def apply(arg: A) = { val tup = CTuple.size(1) tup.set(0, arg) tup }
  47. 47. WHAT HAPPENS WHEN YOU RUN IT? > runMain com.twitter.scalding.Tool com.giokincade.scalding.ExampleJob --local hadoop jar foo.jar com.giokincade.scalding.ExampleJob --hdfs
  48. 48. COMPILE TIME
  49. 49. COMPOSITION TIME
  50. 50. JOB RUN TIME
  51. 51. TYPED PIPE
  52. 52. TypedPipe.from( TextLine("data/words.txt") ) .map(line => line.substring(0,1)) .groupBy(character => character) .size .write( TypedTsv[(String, Long)]("data/output/typed-characters.tsv") )
  53. 53. RESOURCES github.com/giokincade/scalding-talk-examples github.com/twitter/scalding
  54. 54. ME gio@etsy.com @giokincade
  55. 55. FIN

×