7




Aggregate Data Analysis
Data                Data                Data



mapper              mapper              mapper

   mapper              mapper              mapper

           mapper              mapper              mapper
#$%&'()*'                          -'.            #$%   0
           1"23                45)667'
      &'()*'           0      1"23        0    1"        3
                   "
"
Welcome to My HomePage.
      Thank you.
 Where is your house? ....




                                                                  "
                  " !+/"-'.                         "
"
mapper

Big Data   mapper

           mapper
map: (k1, v1) ! [(k2, v2)] // []

//word count
// Word-count mapper: for each term occurrence in a document, emit the
// intermediate pair (term, 1); the shuffle groups these by term.
class Mapper
   method Map(docid a, doc d)
      for all term t ∈ doc d do
         Emit(term t, count 1)
> require 'msgpack'
> msg = [1,2,3].to_msgpack 
                      #=> "\x93\x01\x02\x03"
> MessagePack.unpack(msg)  #=> [1,2,3]
// word count
// Word-count combiner: pre-aggregates the partial counts produced on one
// mapper node, shrinking the intermediate data shuffled to the reducers.
class Combiner
   method Combine(string t, counts [c1, c2, . . .])
      sum ← 0
      for all count c ∈ counts [c1, c2, . . .] do
         sum ← sum + c
      Emit(string t, count sum)
reduce: (k2, [v2]) ! [(k3, v3)]

//word count
// Word-count reducer: sums all partial counts received for a term and
// emits the final (term, total) pair.
class Reducer
   method Reduce(term t, counts [c1, c2, . . .])
      sum ← 0
      for all count c ∈ counts [c1,c2,...] do
         sum ← sum + c
      Emit(term t, count sum)
30   CHAPTER 2. MAPREDUCE BASICS

                          !       "       #       $       %         &




                 '())*+
                   ))              '())*+
                                     ))                 '())*+
                                                          ))                 '())*+
                                                                               ))


                ( -   , .         / 0     / 1         ( 2     / .           , 3     / 4

                 /5',67*+          /5',67*+             /5',67*+             /5',67*+


                ( -   , .               / 8           ( 2     / .           , 3     / 4

                )
                )(+969657*+       )
                                  )(+969657*+         )
                                                      )(+969657*+           )
                                                                            )(+969657*+

                         :;<==>*?(7@?:5+9A (BB+*B(9*?C(><*D?,E?F*ED

                              (   - 2            ,    . 3               /   . 8 4



                        +*@</*+               +*@</*+            +*@</*+


                            G 2                 H 3                 I 8
Hadoop Tutorial Series, Issue #2: Getting Started With (Customized) Partitioning
Hadoop Tutorial Series, Issue #2:
Getting Started With (Customized) Partitioning
Hadoop Tutorial Series, Issue #2: Getting Started With (Customized) Partitioning
package com.philippeadjiman.hadooptraining; 
package com.philippeadjiman.hadooptraining;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;
 
public class MyPartitioner implements Partitioner<IntWritable,Text> {
" @Override
" public int getPartition(IntWritable key, Text value, int numPartitions) {
" " /* Pretty ugly hard coded partitioning function. Don't do that in practice,
it is just for the sake of understanding. */
" " int nbOccurences = key.get();
 
" " if( nbOccurences < 3 )
" " " return 0;
" " else
" " " return 1;
" }
 
" @Override
" public void configure(JobConf arg0) {
 
" }
}                      Hadoop Tutorial Series, Issue #2: Getting Started With (Customized) Partitioning
x + y = y + x
x    y = y     x

(x + y) + z = x + (y + z)
(x    y)     z = x   (y   z)
// In-mapper combining: accumulate partial sums in a per-task hash map and
// emit aggregated pairs, instead of emitting one pair per input element.
class Mapper {
   buffer
   init() {
       buffer = HashMap.new
   }
   map(id, data) {
       elements = process(data)
       for each element {
            ....
            check_and_put(buffer, k2, v2)
       }
   } //                           Designing algorithms for Map Reduce
// NOTE(review): when the buffer is full this flushes its contents but never
// clears the buffer, and the incoming (k2, v2) is dropped — the slide
// presumably omits "clear the buffer, then record (k2, v2)". Confirm
// against the original source before reusing this pattern.
check_and_put(buffer, k2, v2) {
        if buffer.full {
            for each k2 in buffer.keys {
                emit(k2, buffer[k2])
            }
        } else {
            buffer.incrby(k2, v2) // H[k2]+=v2
        }
    }
    // Flush whatever remains buffered when the map task finishes.
    close() {
        for each k2 in buffer.keys {
            emit(k2, buffer[k2])
        }
    }
}                                  Designing algorithms for Map Reduce
!           !        !         !"#                                 !             !             !                       2,%




                                                             6"#
                                                                                "#            "#            "#                      !"#$%&'
             "           "        "         $%&'(%)*'+




                                                                                                                                                    5,%
                                                                                '(            '(            '(                      ()*+*,-./0$1/

  !         ! !#!                           !+(,+       !         !             !
                                                                                $             $ 2,%$                                :*8+"*6$+/
                             !#!"#



                                              6"#
                                                                                                    !"#$%&'
  "         "   "                $%&'(%)*'+ -(.*#/0 "#           "#             "#




                                                                                                                      5,%
                     #       #




                                                             (+2*3+
                                                    '(            '(            '(                  ()*+*,-./0$1/




                                                                                                                                                    ,33"/3,+*#)9+"//
                                                                                                    !"           !"                 2/"3/
                     $       $ !+(,+        1+2*3+      $         $             $                   :*8+"*6$+/
      !#       !#                                                                                   "%           "%                 !"#$%&'
                             % -(.*#/0      4.5&*6+(
      #          #%                                                                                     &        &                  4#56*)/
                                              (+2*3+




                                                                                                                      ,33"/3,+*#)9+"//
                                                                           !"            !"         2/"3/
                             1+2*3+
       $ Figure 1: Distributed execution plan for MapReduce
              $
           when reduce cannot be decomposed to perform partial             "%            "%         !"#$%&'
       %   aggregation.
                 %            4.5&*6+(                                           !"            !"                                   2/"3/
                                                                            &            &          4#56*)/




                                                                                                                                                    "/0$1/
Figure 1: Distributed execution plan for function, and merge and
              With this user-defined MapReduce                                    "%                "%                               !"#$%&'
when reduce cannot beoperators provided by partial
           grouping decomposed to perform the system, it is pos-
aggregation.
            sible to execute a simple distributed computation as
                                                         !"           !"             )             )2/"3/                           7*),-./0$1/
            shown in Figure 1. The computation has exactly




                                                                                                                      "/0$1/
                                                          "%
   With this user-definedthe first phase merge anda Map function
            two phases: function, and executes                         "%            *             *!"#$%&'                         4#)8$5/"
grouping operatorsinputs to by the system, and pos-
            on the provided extract keys it is records, then per-
                                                           )
sible to execute a simple distributed computation as based on the
            forms a partitioning of these outputs
                                                                       )                            7*),-./0$1/
                                                                            Figure 2: Distributed execution plan for MapReduce
shown in Figureof the records. The second phase collects and
            keys 1. The computation has exactly                             when reduce supports partial aggregation. The imple-
two phases: the first phase executes a Map function         *           *                    4#)8$5/"
                                                                            mentation of GroupBy in the first stage may be different to
Def. 1
  x: data items, x1 ⊕ x2: concatenation of x1, x2.
             H        decomposable         2    I   C

                 :

1) ∀x1, x2 : H(x1 ⊕ x2) = C(I(x1 ⊕ x2)) = C(I(x1) ⊕ I(x2))
2) ∀x1, x2 : I(x1 ⊕ x2) = I(x2 ⊕ x1)
3) ∀x1, x2 : C(x1 ⊕ x2) = C(x2 ⊕ x1)

Def. 2
         H           associative-decomposable       Def.1

1-3                      C

4) ∀x1, x2, x3 : C(C(x1 ⊕ x2) ⊕ x3) = C(x1 ⊕ C(x2 ⊕ x3))
( i.e. C is associative )
// Combiner that accumulates partial sums into an external shared store
// (share_space) rather than emitting values directly; only the store's
// connection info is forwarded to the reducer.
class Combiner {
   share_space
   init(share_space_info) {
       share_space = conn(share_space_info)
   }
   combine(key, elements) {
       sum = 0
       for each element {
              ...
              sum += v
       } //
// Atomically add this task's partial sum for the key to the shared store.
share_space.incrby(key, sum)
        emit(key, share_space_info)
    } // end combine()
}
// Reducer that pulls the partially-aggregated values back out of the shared
// external store(s): each share_space_info identifies one store to read from.
class Reducer {
    reduce(key, list_of_share_space_info) {
        for each share_space_info {
            share_space = conn(share_space_info)
            sum = 0
            elements = share_space.hget(key)
            // NOTE(review): "elemnt" is a typo in the slide ("element"), and
            // the loop body plus the final emit appear truncated in this
            // transcript — the full version presumably sums and emits.
            for each elemnt {
            ...
        }
    }
}
// Range partitioner: maps keys in [KEY_MIN, KEY_MAX] onto reducers so each
// reducer receives one contiguous key range (enables globally sorted output).
partition(key) {
   range = (KEY_MAX - KEY_MIN) / NUM_OF_REDUCERS
   // NOTE(review): key == KEY_MAX yields reducer_no == NUM_OF_REDUCERS, one
   // past the last reducer — a real implementation should clamp the result.
   reducer_no = (key - KEY_MIN) / range
   return reducer_no
}                                    Designing algorithms for Map Reduce
(t1, m1, r80521), (t1, m2, r14209), (t1, m3, r76042),
(t2, m1, r21823), (t2, m2, r66508), (t2, m3, r98347),...




 map: m1 ! (t1, r80521) //

 // t1,t2,t3,...
 (m1) ! [(t1, r80521), (t3, r146925), (t2, r21823)]
 (m2) ! [(t2, r66508), (t1, r14209), (t3, r14720)]
map: (m1, t1) ! r80521




(m1, t1) ! [(r80521)] // t1,t2,t3,...
(m1, t2) ! [(r21823)]
(m1, t3) ! [(r146925)]
// Mapper for a global-maximum computation: buffers incoming numbers and,
// whenever the buffer fills, emits the local maximum under the single key 1
// so all partial maxima meet at one reducer.
class Mapper {
          buffer
          map(id, number) {
             buffer.append(number)
             if (buffer.is_full) {
                   max = compute_max(buffer)
                   emit(1, max)
             }
       }
       // NOTE(review): the buffer is never cleared after a flush and no
       // close() hook is shown, so numbers left in a partly-filled buffer
       // would never be emitted — presumably simplified for the slide.
}                                        Designing algorithms for Map Reduce
// Reducer for the global maximum: every local maximum arrives under key 1,
// and the largest of them is the global result.
class Reducer {
    reduce(key, list_of_local_max) {
        // NOTE(review): initializing to 0 assumes non-negative values; an
        // empty or all-negative input would wrongly yield 0.
        global_max = 0
        for local_max in list_of_local_max {
            if local_max > global_max {
                global_max = local_max
            }
        }
        emit(1, global_max)
    }
}                                  Designing algorithms for Map Reduce
// Combiner for the global maximum: max is associative and commutative, so
// local maxima can safely be pre-reduced before the shuffle.
class Combiner {
    combine(key, list_of_local_max) {
       local_max = maximum(list_of_local_max)
       emit(1, local_max)
    } // Max()

}                               Designing algorithms for Map Reduce
// Sampling mapper: forwards roughly 10% of the input records, chosen
// uniformly at random; the rest are silently dropped.
class Mapper {
    map(id, data) {
        key, value = process(data)
        if rand() < 0.1 {   //rand() ∈ [0.0, 1.0)
            emit(key, value)
        }
    }
}
Map Reduce and Stream Processing
# Streaming map, called once per incoming hit record: counts one hit for the
# record's site within that site's current time slice.
 map(k1, hitRecord) {
     site = hitRecord.site
     # Look up the current time slice for this key (= site).

     slice = lookupSlice(site)
     # NOTE(review): "slice.time - now" is positive only if the slice lies in
     # the future; an age check would normally read "now - slice.time".
     # Confirm against the original article's definition of slice.time.
     if (slice.time - now > 60.minutes) {
         # Notify reducer whole slice of site is sent
         advance(site, slice)
         slice = lookupSlice(site)
     }
     emitIntermediate(site, slice, 1)
 }                                      Map Reduce and Stream Processing
# Streaming combiner: sums the per-record counts for one (site, slice)
# locally before forwarding a single aggregated count downstream.
combine(site, slice, countList) {
    hitCount = 0
    for count in countList {
        hitCount += count
    }
    # Send the message to the downstream node
    emitIntermediate(site, slice, hitCount)
}                                     Map Reduce and Stream Processing
# Streaming reduce for one (site, slice): totals the combined counts received
# from all upstream mappers for that slice and wraps the result in a SliceValue.

reduce(site, slice, countList) {
    hitCount = 0
    for count in countList {
        hitCount += count
    }
    sv = SliceValue.new
    sv.hitCount = hitCount
    return sv
}                                  Map Reduce and Stream Processing
# Sliding-window aggregation callbacks.
# init: create an empty aggregate for a new window range.
init(slice) {
    rangeValue = RangeValue.new
    rangeValue.hitCount = 0
    return rangeValue
}
# merge: fold one slice's aggregate into the window total (Reduce step).
merge(rangeValue, slice, sliceValue) {
    rangeValue.hitCount += sliceValue.hitCount
}
# unmerge: subtract a slice that has slid out of the window, so the window
# total can be maintained incrementally as the window advances.
unmerge(rangeValue, slice, sliceValue) {
    rangeValue.hitCount -= sliceValue.hitCount
}                                 Map Reduce and Stream Processing
5&4.)1*,!,);3-00+*0-1*,!&/*+!*-58!.-$*9!-$%!@+&22,!).A!18*!          -!:2*=#;2*!'-<!1&!4&$#1&+!,1+*-4#$0!%-1-6!!
.-$*3-00+*0-1*,! 1&! 5&4.)1*! '#$%&'3-00+*0-1*,6! >)+! *=3           R)++*$1!.+&.&,-2,!:&+!*/-2)-1#$0!,2#%#$03'#$%&'!-00+*0-1*!
.*+#4*$1-2! ,1)%<! ,8&',! 18-1! ),#$0! .-$*,! 8-,! ,#0$#:#5-$1!      ()*+#*,!;)::*+!*-58!#$.)1!1).2*!)$1#2!#1!#,!$&!2&$0*+!$**%*%!
.*+:&+4-$5*!;*$*:#1,6!!                                              INP6! D#$5*! *-58! #$.)1! 1).2*! ;*2&$0,! 1&! 4)21#.2*! '#$%&',9!
                                                                     ,)58!-..+&-58*,!;)::*+!-!1).2*!)$1#2!#1!#,!.+&5*,,*%!:&+!18*!
'(# )*+,-./0+1-*2                                                    -00+*0-1*! &/*+! 18*! 2-,1! '#$%&'! 1&! '8#58! #1! ;*2&$0,6! -58!
B-$<! -..2#5-1#&$,! $**%! 1&! .+&5*,,! ,1+*-4,9! :&+! *=-4.2*9!      #$.)1! 1).2*! #,! -55*,,*%! 4)21#.2*! 1#4*,9! &$5*! :&+! *-58! '#$3
:#$-$5#-2! %-1-! -$-2<,#,9! $*1'&+C! 1+-::#5! 4&$#1&+#$09! -$%!      %&'!18-1!#1!.-+1#5#.-1*,!#$6!!!
1*2*5&44)$#5-1#&$! 4&$#1&+#$06! D*/*+-2! %-1-;-,*! +*,*-+58!
0+&).,! -+*! ;)#2%#$0! --1-! .1+*-4! /-$-0*4*$1! .<,1*4,!            "*! ,**! 1'&! .+&;2*4,! '#18! ,)58! -..+&-58*,6! W#+,1! 18*!
EFDBDG!,&!18-1!-..2#5-1#&$,!5-$!#,,)*!()*+#*,!1&!0*1!1#4*2<!         ;)::*+!,#H*!+*()#+*%!#,!)$;&)$%*%T!Q1!-$<!1#4*!#$,1-$19!-22!
#$:&+4-1#&$! :+&4! ,1+*-4,6! B-$-0#$0! -$%! .+&5*,,#$0!              1).2*,! 5&$1-#$*%! #$! 18*! 5)++*$1! '#$%&'! -+*! #$! 18*! ;)::*+9!
,1+*-4,!0#/*,!+#,*!1&!58-22*$0*,!18-1!8-/*!;**$!*=1*$,#/*2<!         -$%!,&!18*!,#H*!&:!18*!+*()#+*%!;)::*+,!#,!%*1*+4#$*%!;<!18*!
%#,5),,*%!-$%!+*5&0$#H*%!IJ9!K9!L9!M9!NOP6!!                         '#$%&'!+-$0*!-$%!18*!%-1-!-++#/-2!+-1*6!D*5&$%9!.+&5*,,#$0!
                                                                     *-58!#$.)1!1).2*!4)21#.2*!1#4*,!2*-%,!1&!-!8#08!5&4.)1-1#&$!
Q$!#4.&+1-$1!52-,,!&:!()*+#*,!&/*+!%-1-!,1+*-4,!#,!,2#%#$03          5&,16!W&+!*=-4.2*!#$!X)*+<!N9!*-58!#$.)1!1).2*!#,!.+&5*,,*%!
'#$%&'!-00+*0-1*!()*+#*,6!R&$,#%*+!-$!&$2#$*!-)51#&$!,<,3            :&)+!1#4*,6!Q,!18*!+-1#&!&:!YQZ[!&/*+!D]7F!#$5+*-,*,9!
1*4!#$!'8#58!;#%,!&$!-)51#&$!#1*4,!-+*!,1+*-4*%!#$1&!-!5*$3          ,&!%&*,!18*!$)4;*+!&:!1#4*,!*-58!1).2*!#,!.+&5*,,*%6!R&$3
1+-2!-)51#&$!.+&5*,,#$0!,<,1*46!S8*!,58*4-!&:!*-58!;#%!#,T!          ,#%*+#$0!18*!2-+0*!/&2)4*!-$%!:-,1!-++#/-2!+-1*!&:!,1+*-4#$0!
U#1*43#%9! ;#%3.+#5*9! 1#4*,1-4.V6! W&+! *-,*! &:! .+*,*$1-1#&$9!    %-1-9!+*%)5#$0!18*!-4&)$1!&:!+*()#+*%!;)::*+!,.-5*!E#%*-22<!
'*!-,,)4*!18-1!;#%,!-++#/*!#$!&+%*+!&$!18*#+!1#4*,1-4.!-13           1&!-!5&$,1-$1!;&)$%G!-$%!5&4.)1-1#&$!1#4*!#,!-$!#4.&+1-$1!
1+#;)1*6! E"*! -+*! -51#/*2<! #$/*,1#0-1#$0! .+&5*,,#$0! %#,&+3
%*+*%!%-1-!,1+*-4,G!X)*+<!N!,8&',!-$!*=-4.2*!&:!-!,2#%#$03
'#$%&'!-00+*0-1*!()*+<6!
3/4,52'T!@W#$%!18*!4-=#4)4!;#%!.+#5*!:&+!18*!.-,1!K!4#$3
)1*,!-$%!).%-1*!18*!+*,)21!*/*+<!N!4#$)1*6A!
!"#"$%&'()*+,-./0,123&
4567&+,-89:;%%5&<,'28<('/&
&&&&&&&&&&5;=>"&?&',@A<28&
&&&&&&&&&&!#BC"&D&',@A<2E&
7$! 18*! ()*+<! -;&/*9! '*! #$1+&%)5*! -! '#$%&'! ,.*5#:#5-1#&$!
'#18!18+**!.-+-4*1*+,T!YQZ[!,.*5#:#*,!18*!'#$%&'!,#H*9!
D]7F! ,.*5#:#*,! 8&'! 18*! '#$%&'! 4&/*,9! -$%! "QSSY!
,.*5#:#*,! 18*! '#$%&'#$0! -11+#;)1*! &$! '8#58! 18-1! 18*!
YQZ[! -$%! D]7F! .-+-4*1*+,! -+*! %*:#$*%6! S8*! '#$%&'!
,.*5#:#5-1#&$! &:! X)*+<! N! ;+*-C,! 18*! ;#%! ,1+*-4! #$1&! &/*+3
2-..#$0!K34#$)1*!,);3,1+*-4,!18-1!,1-+1!*/*+<!4#$)1*9!'#18!
+*,.*51! 1&! 18*! 1#4*,1-4.! -11+#;)1*6! S8*,*! &/*+2-..#$0! ,);3
,1+*-4,!-+*!5-22*%!!"#$#%&0(#%$)(!6!X)*+<!N!5-25)2-1*,!18*!                      617/,42'8291*.-:;2&-<=-;4.2->26-/,2?@*4;2
                                                              No Pane, No Gain: Efficient Evaluation of Sliding-Window
                                                                          Aggregates over Data Streams
K-Means Clustering in Map Reduce
Figure 2: MapReduce Classifier Training and Evaluation Procedure




                                A Comparison of Approaches for Large-Scale Data Mining
Google Pregel Graph Processing
Google Pregel Graph Processing
Map Reduce 〜入門編:仕組みの理解とアルゴリズムデザイン〜

Map Reduce 〜入門編:仕組みの理解とアルゴリズムデザイン〜

  • 12.
  • 14.
    Data Data Data mapper mapper mapper mapper mapper mapper mapper mapper mapper
  • 19.
    #$%&'()*' -'. #$% 0 1"23 45)667' &'()*' 0 1"23 0 1" 3 " " Welcome to My HomePage. Thank you. Where is your house? .... " " !+/"-'. " "
  • 21.
    mapper Big Data mapper mapper
  • 22.
    map: (k1, v1)! [(k2, v2)] // [] //word count class Mapper method Map(docid a, doc d) for all term t ∈ doc d do Emit(term t, count 1)
  • 23.
    > require 'msgpack' > msg = [1,2,3].to_msgpack  #=> "\x93\x01\x02\x03" > MessagePack.unpack(msg)  #=> [1,2,3]
  • 24.
    // word count classCombiner method Combine(string t, counts [c1, c2, . . .]) sum ← 0 for all count c ∈ counts [c1, c2, . . .] do sum ← sum + c Emit(string t, count sum)
  • 27.
    reduce: (k2, [v2])! [(k3, v3)] //word count class Reducer method Reduce(term t, counts [c1, c2, . . .]) sum ← 0 for all count c ∈ counts [c1,c2,...] do sum ← sum + c Emit(term t, count sum)
  • 28.
    30 CHAPTER 2. MAPREDUCE BASICS ! " # $ % & '())*+ )) '())*+ )) '())*+ )) '())*+ )) ( - , . / 0 / 1 ( 2 / . , 3 / 4 /5',67*+ /5',67*+ /5',67*+ /5',67*+ ( - , . / 8 ( 2 / . , 3 / 4 ) )(+969657*+ ) )(+969657*+ ) )(+969657*+ ) )(+969657*+ :;<==>*?(7@?:5+9A (BB+*B(9*?C(><*D?,E?F*ED ( - 2 , . 3 / . 8 4 +*@</*+ +*@</*+ +*@</*+ G 2 H 3 I 8
  • 33.
    Hadoop Tutorial Series,Issue #2: Getting Started With (Customized) Partitioning
  • 34.
    Hadoop Tutorial Series,Issue #2: Getting Started With (Customized) Partitioning
  • 35.
    Hadoop Tutorial Series,Issue #2: Getting Started With (Customized) Partitioning
  • 36.
    package com.philippeadjiman.hadooptraining;  package com.philippeadjiman.hadooptraining; importorg.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Partitioner;   public class MyPartitioner implements Partitioner<IntWritable,Text> { " @Override " public int getPartition(IntWritable key, Text value, int numPartitions) { " " /* Pretty ugly hard coded partitioning function. Don't do that in practice, it is just for the sake of understanding. */ " " int nbOccurences = key.get();   " " if( nbOccurences < 3 ) " " " return 0; " " else " " " return 1; " }   " @Override " public void configure(JobConf arg0) {   " } } Hadoop Tutorial Series, Issue #2: Getting Started With (Customized) Partitioning
  • 37.
    x + y= y + x x y = y x (x + y) + z = x + (y + z) (x y) z = x (y z)
  • 42.
    class Mapper { buffer init() { buffer = HashMap.new } map(id, data) { elements = process(data) for each element { .... check_and_put(buffer, k2, v2) } } // Designing algorithms for Map Reduce
  • 43.
    check_and_put(buffer, k2, v2){ if buffer.full { for each k2 in buffer.keys { emit(k2, buffer[k2]) } } else { buffer.incrby(k2, v2) // H[k2]+=v2 } } close() { for each k2 in buffer.keys { emit(k2, buffer[k2]) } } } Designing algorithms for Map Reduce
  • 44.
    ! ! ! !"# ! ! ! 2,% 6"# "# "# "# !"#$%&' " " " $%&'(%)*'+ 5,% '( '( '( ()*+*,-./0$1/ ! ! !#! !+(,+ ! ! ! $ $ 2,%$ :*8+"*6$+/ !#!"# 6"# !"#$%&' " " " $%&'(%)*'+ -(.*#/0 "# "# "# 5,% # # (+2*3+ '( '( '( ()*+*,-./0$1/ ,33"/3,+*#)9+"// !" !" 2/"3/ $ $ !+(,+ 1+2*3+ $ $ $ :*8+"*6$+/ !# !# "% "% !"#$%&' % -(.*#/0 4.5&*6+( # #% & & 4#56*)/ (+2*3+ ,33"/3,+*#)9+"// !" !" 2/"3/ 1+2*3+ $ Figure 1: Distributed execution plan for MapReduce $ when reduce cannot be decomposed to perform partial "% "% !"#$%&' % aggregation. % 4.5&*6+( !" !" 2/"3/ & & 4#56*)/ "/0$1/ Figure 1: Distributed execution plan for function, and merge and With this user-defined MapReduce "% "% !"#$%&' when reduce cannot beoperators provided by partial grouping decomposed to perform the system, it is pos- aggregation. sible to execute a simple distributed computation as !" !" ) )2/"3/ 7*),-./0$1/ shown in Figure 1. The computation has exactly "/0$1/ "% With this user-definedthe first phase merge anda Map function two phases: function, and executes "% * *!"#$%&' 4#)8$5/" grouping operatorsinputs to by the system, and pos- on the provided extract keys it is records, then per- ) sible to execute a simple distributed computation as based on the forms a partitioning of these outputs ) 7*),-./0$1/ Figure 2: Distributed execution plan for MapReduce shown in Figureof the records. The second phase collects and keys 1. The computation has exactly when reduce supports partial aggregation. The imple- two phases: the first phase executes a Map function * * 4#)8$5/" mentation of GroupBy in the first stage may be different to
  • 46.
    Def. 1 x: data items, x1 ⊕ x2: concatenation of x1, x2. H decomposable 2 I C : 1) ∀x1, x2 : H(x1 ⊕ x2) = C(I(x1 ⊕ x2)) = C(I(x1) ⊕ I(x2)) 2) ∀x1, x2 : I(x1 ⊕ x2) = I(x2 ⊕ x1) 3) ∀x1, x2 : C(x1 ⊕ x2) = C(x2 ⊕ x1) Def. 2 H associative-decomposable Def.1 1-3 C 4) ∀x1, x2, x3 : C(C(x1 ⊕ x2) ⊕ x3) = C(x1 ⊕ C(x2 ⊕ x3)) ( i.e. C is associative )
  • 49.
    class Combiner { share_space init(share_space_info) { share_space = conn(share_space_info) } combine(key, elements) { sum = 0 for each element { ... sum += v } //
  • 50.
    share_space.incrby(key, sum) emit(key, share_space_info) } // end combine() } class Reducer { reduce(key, list_of_share_space_info) { for each share_space_info { share_space = conn(share_space_info) sum = 0 elements = share_space.hget(key) for each elemnt { ... } } }
  • 51.
    partition(key) { range = (KEY_MAX - KEY_MIN) / NUM_OF_REDUCERS reducer_no = (key - KEY_MIN) / range return reducer_no } Designing algorithms for Map Reduce
  • 52.
    (t1, m1, r80521),(t1, m2, r14209), (t1, m3, r76042), (t2, m1, r21823), (t2, m2, r66508), (t2, m3, r98347),... map: m1 ! (t1, r80521) // // t1,t2,t3,... (m1) ! [(t1, r80521), (t3, r146925), (t2, r21823)] (m2) ! [(t2, r66508), (t1, r14209), (t3, r14720)]
  • 53.
    map: (m1, t1)! r80521 (m1, t1) ! [(r80521)] // t1,t2,t3,... (m1, t2) ! [(r21823)] (m1, t3) ! [(r146925)]
  • 54.
    class Mapper { buffer map(id, number) { buffer.append(number) if (buffer.is_full) { max = compute_max(buffer) emit(1, max) } } } Designing algorithms for Map Reduce
  • 55.
    class Reducer { reduce(key, list_of_local_max) { global_max = 0 for local_max in list_of_local_max { if local_max > global_max { global_max = local_max } } emit(1, global_max) } } Designing algorithms for Map Reduce
  • 56.
    class Combiner { combine(key, list_of_local_max) { local_max = maximum(list_of_local_max) emit(1, local_max) } // Max() } Designing algorithms for Map Reduce
  • 57.
    class Mapper { map(id, data) { key, value = process(data) if rand() < 0.1 { //rand() ∈ [0.0, 1.0) emit(key, value) } } }
  • 63.
    Map Reduce andStream Processing
  • 66.
    # Call ateach hit record map(k1, hitRecord) { site = hitRecord.site # key(=site) slice slice = lookupSlice(site) if (slice.time - now > 60.minutes) { # Notify reducer whole slice of site is sent advance(site, slice) slice = lookupSlice(site) } emitIntermediate(site, slice, 1) } Map Reduce and Stream Processing
  • 67.
    combine(site, slice, countList){ hitCount = 0 for count in countList { hitCount += count } # Send the message to the downstream node emitIntermediate(site, slice, hitCount) } Map Reduce and Stream Processing
  • 68.
    # mapper slice reduce(site, slice, countList) { hitCount = 0 for count in countList { hitCount += count } sv = SliceValue.new sv.hitCount = hitCount return sv } Map Reduce and Stream Processing
  • 69.
    # Window init(slice) { rangeValue = RangeValue.new rangeValue.hitCount = 0 return rangeValue } # Reduce merge(rangeValue, slice, sliceValue) { rangeValue.hitCount += sliceValue.hitCount } # slice slicing window unmerge(rangeValue, slice, sliceValue) { rangeValue.hitCount -= sliceValue.hitCount } Map Reduce and Stream Processing
  • 70.
    5&4.)1*,!,);3-00+*0-1*,!&/*+!*-58!.-$*9!-$%!@+&22,!).A!18*! -!:2*=#;2*!'-<!1&!4&$#1&+!,1+*-4#$0!%-1-6!! .-$*3-00+*0-1*,! 1&! 5&4.)1*! '#$%&'3-00+*0-1*,6! >)+! *=3 R)++*$1!.+&.&,-2,!:&+!*/-2)-1#$0!,2#%#$03'#$%&'!-00+*0-1*! .*+#4*$1-2! ,1)%<! ,8&',! 18-1! ),#$0! .-$*,! 8-,! ,#0$#:#5-$1! ()*+#*,!;)::*+!*-58!#$.)1!1).2*!)$1#2!#1!#,!$&!2&$0*+!$**%*%! .*+:&+4-$5*!;*$*:#1,6!! INP6! D#$5*! *-58! #$.)1! 1).2*! ;*2&$0,! 1&! 4)21#.2*! '#$%&',9! ,)58!-..+&-58*,!;)::*+!-!1).2*!)$1#2!#1!#,!.+&5*,,*%!:&+!18*! '(# )*+,-./0+1-*2 -00+*0-1*! &/*+! 18*! 2-,1! '#$%&'! 1&! '8#58! #1! ;*2&$0,6! -58! B-$<! -..2#5-1#&$,! $**%! 1&! .+&5*,,! ,1+*-4,9! :&+! *=-4.2*9! #$.)1! 1).2*! #,! -55*,,*%! 4)21#.2*! 1#4*,9! &$5*! :&+! *-58! '#$3 :#$-$5#-2! %-1-! -$-2<,#,9! $*1'&+C! 1+-::#5! 4&$#1&+#$09! -$%! %&'!18-1!#1!.-+1#5#.-1*,!#$6!!! 1*2*5&44)$#5-1#&$! 4&$#1&+#$06! D*/*+-2! %-1-;-,*! +*,*-+58! 0+&).,! -+*! ;)#2%#$0! --1-! .1+*-4! /-$-0*4*$1! .<,1*4,! "*! ,**! 1'&! .+&;2*4,! '#18! ,)58! -..+&-58*,6! W#+,1! 18*! EFDBDG!,&!18-1!-..2#5-1#&$,!5-$!#,,)*!()*+#*,!1&!0*1!1#4*2<! ;)::*+!,#H*!+*()#+*%!#,!)$;&)$%*%T!Q1!-$<!1#4*!#$,1-$19!-22! #$:&+4-1#&$! :+&4! ,1+*-4,6! B-$-0#$0! -$%! .+&5*,,#$0! 1).2*,! 5&$1-#$*%! #$! 18*! 5)++*$1! '#$%&'! -+*! #$! 18*! ;)::*+9! ,1+*-4,!0#/*,!+#,*!1&!58-22*$0*,!18-1!8-/*!;**$!*=1*$,#/*2<! -$%!,&!18*!,#H*!&:!18*!+*()#+*%!;)::*+,!#,!%*1*+4#$*%!;<!18*! %#,5),,*%!-$%!+*5&0$#H*%!IJ9!K9!L9!M9!NOP6!! '#$%&'!+-$0*!-$%!18*!%-1-!-++#/-2!+-1*6!D*5&$%9!.+&5*,,#$0! *-58!#$.)1!1).2*!4)21#.2*!1#4*,!2*-%,!1&!-!8#08!5&4.)1-1#&$! Q$!#4.&+1-$1!52-,,!&:!()*+#*,!&/*+!%-1-!,1+*-4,!#,!,2#%#$03 5&,16!W&+!*=-4.2*!#$!X)*+<!N9!*-58!#$.)1!1).2*!#,!.+&5*,,*%! '#$%&'!-00+*0-1*!()*+#*,6!R&$,#%*+!-$!&$2#$*!-)51#&$!,<,3 :&)+!1#4*,6!Q,!18*!+-1#&!&:!YQZ[!&/*+!D]7F!#$5+*-,*,9! 1*4!#$!'8#58!;#%,!&$!-)51#&$!#1*4,!-+*!,1+*-4*%!#$1&!-!5*$3 ,&!%&*,!18*!$)4;*+!&:!1#4*,!*-58!1).2*!#,!.+&5*,,*%6!R&$3 1+-2!-)51#&$!.+&5*,,#$0!,<,1*46!S8*!,58*4-!&:!*-58!;#%!#,T! 
,#%*+#$0!18*!2-+0*!/&2)4*!-$%!:-,1!-++#/-2!+-1*!&:!,1+*-4#$0! U#1*43#%9! ;#%3.+#5*9! 1#4*,1-4.V6! W&+! *-,*! &:! .+*,*$1-1#&$9! %-1-9!+*%)5#$0!18*!-4&)$1!&:!+*()#+*%!;)::*+!,.-5*!E#%*-22<! '*!-,,)4*!18-1!;#%,!-++#/*!#$!&+%*+!&$!18*#+!1#4*,1-4.!-13 1&!-!5&$,1-$1!;&)$%G!-$%!5&4.)1-1#&$!1#4*!#,!-$!#4.&+1-$1! 1+#;)1*6! E"*! -+*! -51#/*2<! #$/*,1#0-1#$0! .+&5*,,#$0! %#,&+3 %*+*%!%-1-!,1+*-4,G!X)*+<!N!,8&',!-$!*=-4.2*!&:!-!,2#%#$03 '#$%&'!-00+*0-1*!()*+<6! 3/4,52'T!@W#$%!18*!4-=#4)4!;#%!.+#5*!:&+!18*!.-,1!K!4#$3 )1*,!-$%!).%-1*!18*!+*,)21!*/*+<!N!4#$)1*6A! !"#"$%&'()*+,-./0,123& 4567&+,-89:;%%5&<,'28<('/& &&&&&&&&&&5;=>"&?&',@A<28& &&&&&&&&&&!#BC"&D&',@A<2E& 7$! 18*! ()*+<! -;&/*9! '*! #$1+&%)5*! -! '#$%&'! ,.*5#:#5-1#&$! '#18!18+**!.-+-4*1*+,T!YQZ[!,.*5#:#*,!18*!'#$%&'!,#H*9! D]7F! ,.*5#:#*,! 8&'! 18*! '#$%&'! 4&/*,9! -$%! "QSSY! ,.*5#:#*,! 18*! '#$%&'#$0! -11+#;)1*! &$! '8#58! 18-1! 18*! YQZ[! -$%! D]7F! .-+-4*1*+,! -+*! %*:#$*%6! S8*! '#$%&'! ,.*5#:#5-1#&$! &:! X)*+<! N! ;+*-C,! 18*! ;#%! ,1+*-4! #$1&! &/*+3 2-..#$0!K34#$)1*!,);3,1+*-4,!18-1!,1-+1!*/*+<!4#$)1*9!'#18! +*,.*51! 1&! 18*! 1#4*,1-4.! -11+#;)1*6! S8*,*! &/*+2-..#$0! ,);3 ,1+*-4,!-+*!5-22*%!!"#$#%&0(#%$)(!6!X)*+<!N!5-25)2-1*,!18*! 617/,42'8291*.-:;2&-<=-;4.2->26-/,2?@*4;2 No Pane, No Gain: Efficient Evaluation of Sliding-Window Aggregates over Data Streams
  • 74.
  • 75.
    Figure 2: MapReduceClassifier Training and Evaluation Procedure A Comparison of Approaches for Large-Scale Data Mining
  • 76.
  • 77.