• Save
Compiled Python UDFs for Impala

Like this? Share it with your network

Share
  • Full Name Full Name Comment goes here.
    Are you sure you want to
    Your message goes here
    Be the first to comment
No Downloads

Views

Total Views
1,875
On Slideshare
1,685
From Embeds
190
Number of Embeds
7

Actions

Shares
Downloads
0
Comments
0
Likes
4

Embeds 190

http://bigdata.braccialli.net 133
https://twitter.com 36
http://www.slideee.com 13
https://www.linkedin.com 3
http://feedly.com 2
http://www.slidesearchengine.com 2
http://5424981385442961652_41e25bf6e2019e86d8bd05b27d82a356c026b356.blogspot.com.br 1

Report content

Flagged as inappropriate Flag as inappropriate
Flag as inappropriate

Select your reason for flagging this presentation as inappropriate.

Cancel
    No notes for slide

Transcript

  • 1. 1 Compiled  Python  UDFs  for  Impala   Uri  Laserson   20  May  2014  
  • 2. Impala  User-defined  Functions  (UDFs)   •  Tuple  =>  Scalar  value   •  Substring   •  sin,  cos,  pow,  …   •  Machine-learning  models   •  Supports  Hive  UDFs  (Java)   •  Relatively  unpleasurable   •  Slower   •  Impala  (native)  UDFs   •  C++  interface  designed  for  efficiency   •  Similar  to  Postgres  UDFs   •  Runs  any  LLVM-compiled  code   2
  • 3. LLVM  compiler  infrastructure   3
  • 4. LLVM:  C++  example   4 bool StringEq(FunctionContext* context,! const StringVal& arg1,! const StringVal& arg2) {! if (arg1.is_null != arg2.is_null)! return false;! if (arg1.is_null)! return true;! if (arg1.len != arg2.len)! return false;! return (arg1.ptr == arg2.ptr) || ! memcmp(arg1.ptr, arg2.ptr, arg1.len) == 0;! }!
  • 5. LLVM:  IR  output   5 ; ModuleID = '<stdin>'! target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64- f80:128:128-n8:16:32:64-S128"! target triple = "x86_64-apple-macosx10.7.0"! ! %"class.impala_udf::FunctionContext" = type { %"class.impala::FunctionContextImpl"* }! %"class.impala::FunctionContextImpl" = type opaque! %"struct.impala_udf::StringVal" = type { %"struct.impala_udf::AnyVal", i32, i8* }! %"struct.impala_udf::AnyVal" = type { i8 }! ! ; Function Attrs: nounwind readonly ssp uwtable! define zeroext i1 @_Z8StringEqPN10impala_udf15FunctionContextERKNS_9StringValES4_(%"class.impala_udf::FunctionContext"* nocapture %context, %"struct.impala_udf::StringVal"* nocapture %arg1, %"struct.impala_udf::StringVal"* nocapture %arg2) #0 {! entry:! %is_null = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg1, i64 0, i32 0, i32 0! %0 = load i8* %is_null, align 1, !tbaa !0, !range !3! %is_null1 = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg2, i64 0, i32 0, i32 0! %1 = load i8* %is_null1, align 1, !tbaa !0, !range !3! %cmp = icmp eq i8 %0, %1! br i1 %cmp, label %if.end, label %return! ! if.end: ; preds = %entry! %tobool = icmp eq i8 %0, 0! br i1 %tobool, label %if.end7, label %return! ! if.end7: ; preds = %if.end! %len = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg1, i64 0, i32 1! %2 = load i32* %len, align 4, !tbaa !4! %len8 = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg2, i64 0, i32 1! %3 = load i32* %len8, align 4, !tbaa !4! %cmp9 = icmp eq i32 %2, %3! br i1 %cmp9, label %if.end11, label %return! ! if.end11: ; preds = %if.end7! %ptr = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg1, i64 0, i32 2! %4 = load i8** %ptr, align 8, !tbaa !5! %ptr12 = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg2, i64 0, i32 2! %5 = load i8** %ptr12, align 8, !tbaa !5! %cmp13 = icmp eq i8* %4, %5! 
br i1 %cmp13, label %return, label %lor.rhs! ! lor.rhs: ; preds = %if.end11! %conv17 = sext i32 %2 to i64! %call = tail call i32 @memcmp(i8* %4, i8* %5, i64 %conv17)! %cmp18 = icmp eq i32 %call, 0! br label %return! ! return: ; preds = %lor.rhs, %if.end11, %if.end7, %if.end, %entry! %retval.0 = phi i1 [ false, %entry ], [ true, %if.end ], [ false, %if.end7 ], [ true, %if.end11 ], [ %cmp18, %lor.rhs ]!
  • 6. LLVM:  IR  output   6 ; ModuleID = '<stdin>'! target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64- f80:128:128-n8:16:32:64-S128"! target triple = "x86_64-apple-macosx10.7.0"! ! %"class.impala_udf::FunctionContext" = type { %"class.impala::FunctionContextImpl"* }! %"class.impala::FunctionContextImpl" = type opaque! %"struct.impala_udf::StringVal" = type { %"struct.impala_udf::AnyVal", i32, i8* }! %"struct.impala_udf::AnyVal" = type { i8 }! ! ; Function Attrs: nounwind readonly ssp uwtable! define zeroext i1 @_Z8StringEqPN10impala_udf15FunctionContextERKNS_9StringValES4_(%"class.impala_udf::FunctionContext"* nocapture %context, %"struct.impala_udf::StringVal"* nocapture %arg1, %"struct.impala_udf::StringVal"* nocapture %arg2) #0 {! entry:! %is_null = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg1, i64 0, i32 0, i32 0! %0 = load i8* %is_null, align 1, !tbaa !0, !range !3! %is_null1 = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg2, i64 0, i32 0, i32 0! %1 = load i8* %is_null1, align 1, !tbaa !0, !range !3! %cmp = icmp eq i8 %0, %1! br i1 %cmp, label %if.end, label %return! ! if.end: ; preds = %entry! %tobool = icmp eq i8 %0, 0! br i1 %tobool, label %if.end7, label %return! ! if.end7: ; preds = %if.end! %len = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg1, i64 0, i32 1! %2 = load i32* %len, align 4, !tbaa !4! %len8 = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg2, i64 0, i32 1! %3 = load i32* %len8, align 4, !tbaa !4! %cmp9 = icmp eq i32 %2, %3! br i1 %cmp9, label %if.end11, label %return! ! if.end11: ; preds = %if.end7! %ptr = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg1, i64 0, i32 2! %4 = load i8** %ptr, align 8, !tbaa !5! %ptr12 = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg2, i64 0, i32 2! %5 = load i8** %ptr12, align 8, !tbaa !5! %cmp13 = icmp eq i8* %4, %5! 
br i1 %cmp13, label %return, label %lor.rhs! ! lor.rhs: ; preds = %if.end11! %conv17 = sext i32 %2 to i64! %call = tail call i32 @memcmp(i8* %4, i8* %5, i64 %conv17)! %cmp18 = icmp eq i32 %call, 0! br label %return! ! return: ; preds = %lor.rhs, %if.end11, %if.end7, %if.end, %entry! %retval.0 = phi i1 [ false, %entry ], [ true, %if.end ], [ false, %if.end7 ], [ true, %if.end11 ], [ %cmp18, %lor.rhs ]!
  • 7. Data  type  compatibility   7 struct AnyVal {! bool is_null;! };! ! struct StringVal : public AnyVal {! int len;! uint8_t* ptr;! };! %AnyVal = type { i8 }! %StringVal = type { %AnyVal, i32, i8* }! ! ; or! ! %StringVal = type { { i8 }, i32, i8* }! C++  LLVM  IR  
  • 8. Register  and  execute  the  function   8 CREATE FUNCTION StringEq(STRING, STRING)! RETURNS BOOLEAN! LOCATION '/path/to/bitcode.ll'! SYMBOL='StringEq';! SELECT StringEq(a, b) FROM mytable;!
  • 9. Numba  compiler   9 NumbaPython
  • 10. Impyla:  Python  Library  for  Impala   •  pip  install  impyla   •  DB  API  v2.0  (PEP  249)  compatible   •  Prototype  sklearn  API  for  Impala  ML   •  Numba  integration  (described  here)   •  See  blog  post:  http://blog.cloudera.com/blog/2014/04/a-new-python-client-for-impala/   10
  • 11. LLVM:  Python  example   11 @udf(IntVal(FunctionContext, StringVal)) def hour_from_weird_date_format(context, date):! return int(split(date, '-')[1])! ! ! ship_udf(cursor, hour_from_weird_date_format,! '/path/to/store/udf.ll', 'my.impala.host')! ! ! cur.execute('SELECT hour_from_weird_date_format(date) ' +! 'AS hour FROM mytable LIMIT 100')!
  • 12. Model  Scoring:  BigML  on  Census  Data   12 MLaaS  
  • 13. Model  Scoring:  BigML  on  Census  Data   13
  • 14. Example:  100  Node  Decision  Tree   14 def predict_income(impala_function_context, age, workclass, final_weight, education, education_num, marital_status, occupation, relationship,! race, sex, hours_per_week, native_country, income):! if (marital_status is None):! return '<=50K'! if (marital_status == 'Married-civ-spouse'):! if (education_num is None):! return '<=50K'! if (education_num > 12):! if (hours_per_week is None):! return '>50K'! if (hours_per_week > 31):! if (age is None):! return '>50K'! if (age > 28):! if (education_num > 13):! if (age > 58):! return '>50K'! if (age <= 58):! return '>50K'! if (education_num <= 13):! if (occupation is None):! return '>50K'! if (occupation == 'Exec-managerial'):! return '>50K'! if (occupation != 'Exec-managerial'):! return '>50K'! if (age <= 28):! if (age > 24):! if (occupation is None):! return '<=50K'! if (occupation == 'Tech-support'):! return '>50K'! if (occupation != 'Tech-support'):! return '<=50K'! if (age <= 24):! if (final_weight is None):! return '<=50K'! if (final_weight > 492053):! return '>50K'! if (final_weight <= 492053):! return '<=50K'! if (hours_per_week <= 31):! if (sex is None):! return '<=50K'! if (sex == 'Male'):! if (age is None):! return '<=50K'! if (age > 29):! if (age > 62):!
  • 15. Batch  Scoring  with  PySpark   15 # parse the text data! observations = sc.textFile('/path/to/census_data').map(parse_obs)! ! # perform batch scoring! predictions = observations.map(lambda tup: predict_income(*tup))! ! # trigger computation! distinct = predictions.distinct().collect()! !
  • 16. Batch  Scoring  with  Impala   16 # compile the scoring function! predict_income = udf(signature)(predict_income)! ! ship_udf(cursor, predict_income, ...)! ! # perform batch scoring! cursor.execute('SELECT DISTINCT predict_income(age, ... ) ' +! 'FROM census_text')! distinct = cursor.fetchall()! !
  • 17. Execution  Time   17 execution_time =! ! per_job_overhead +! ! N * ( per_record_exec + memcmp_exec )!
  • 18. PySpark  vs.  Impala  Performance   18 Tree  size   (nodes)   Spark   execution   time  (s)   Impala   execution   time  (s)   Fold   difference   Impala   compilation  time   (s)   Bytecode   size   (bytes)   Percent   memcmp   nodes   0   160   9   17x   0   4   100   175   22   8x   1   2254   22%   500   178   27   7x   4   9803   35%   1000   184   32   6x   16   23495   34%   1500   188   35   5x   18   28301   34%   2000   196   37   5x   31   42442   33%  
  • 19. Execution  Time   19 execution_time =! ! per_job_overhead +! ! N * ( per_record_exec + memcmp_exec )! Spark:  24  threads  /  node   [   ]   Impala:  1  thread  /  node  
  • 20. PySpark  vs.  Impala  Performance   20 Tree  size   (nodes)   Spark   execution   time  (s)   Impala   execution   time  (s)   Fold   difference   Impala   compilation  time   (s)   Bytecode   size   (bytes)   Percent   memcmp   nodes   0   160   9   17x   0   4   100   175   22   8x   1   2254   22%   500   178   27   7x   4   9803   35%   1000   184   32   6x   16   23495   34%   1500   188   35   5x   18   28301   34%   2000   196   37   5x   31   42442   33%  
  • 21. Current  Status   •  Support  for  all  Impala  UDF  data  types  (e.g.,  IntVal,   StringVal,  etc.)   •  Support  for  casts  to/from  primitive  types:   •  Any  operations  on  primitives  should  work  on  Impala  types   •  Support  for  NULL  types  as  Python  None! •  Proof-of-principle  support  for  Python  string  module   •  len! •  split! •  Concatenation   •  Call  out  to  any  extern C  functions   •  Proposed  directions   •  Array  handling   •  Numpy  support   •  What  else?   21
  • 22. UDFs  with  Impala  +  Numba   •  Simplicity  of  Python  interface/syntax   •  Performance  of  compiled  language  like  C++   •  Developed  at:  https://github.com/cloudera/impyla   •  Please  try  it  and  tell  us  what  features  would  be  useful   •  Please  contribute!   22 pip install impyla!
  • 23. 23