SlideShare a Scribd company logo
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Nested	
  Types	
  in	
  Impala
Alex	
  Behm	
  //	
  Cloudera,	
  Inc.	
  
Marcel	
  Kornacker	
  //	
  Cloudera,	
  Inc.	
  
Skye	
  Wanderman-Milne	
  //	
  Cloudera,	
  Inc.
Impala	
  Meetup	
  03/24/2015
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Design	
  Goals
• Goals	
  
• Support	
  for	
  nested	
  data	
  types:	
  struct,	
  map,	
  array	
  
• Full	
  expressiveness	
  of	
  SQL	
  with	
  nested	
  structures	
  
• v1	
  Prioritization	
  
• Focus	
  on	
  SELECT	
  queries	
  (INSERT	
  in	
  later	
  releases)	
  
• Focus	
  on	
  native	
  Parquet	
  and	
  Avro	
  formats	
  (XML,	
  JSON,	
  etc	
  in	
  later	
  releases)	
  
• Focus	
  on	
  built-in	
  language	
  expressiveness	
  (UDTF	
  extensibility	
  in	
  later	
  releases)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Example	
  Schema
CREATE TABLE Customers (
id BIGINT,
address STRUCT<
city: STRING,
zip: INT
>,
orders ARRAY<STRUCT<
txn_time: TIMESTAMP,
cc: BIGINT,
items: ARRAY<STRUCT<
item_no: STRING,
price: DECIMAL(9,2)
>>
>>,
preferred_cc BIGINT
)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Syntax	
  Extensions
• Path	
  expressions	
  extend	
  column	
  references	
  to	
  scalars	
  (nested	
  structs)	
  
• Can	
  appear	
  anywhere	
  a	
  conventional	
  column	
  reference	
  is	
  used	
  
• Collections	
  (maps	
  and	
  arrays)	
  are	
  exposed	
  like	
  sub-tables	
  
• Use	
  FROM	
  clause	
  to	
  specify	
  which	
  collections	
  to	
  read	
  like	
  conventional	
  tables	
  
• Can	
  use	
  JOIN	
  conditions	
  to	
  express	
  join	
  relationship	
  (default	
  is	
  INNER	
  JOIN)
Find	
  the	
  ids	
  and	
  city	
  of	
  customers	
  who	
  live	
  in	
  the	
  zip	
  code	
  94305:	
  
SELECT	
  id,	
  address.city	
  FROM	
  customers	
  WHERE	
  address.zip	
  =	
  94305
Find	
  all	
  orders	
  that	
  were	
  paid	
  for	
  with	
  a	
  customer’s	
  preferred	
  credit	
  card:	
  
SELECT	
  o.txn_id	
  FROM	
  customers	
  c,	
  c.orders	
  o	
  WHERE	
  o.cc	
  =	
  c.preferred_cc	
  	
  
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Referencing	
  Arrays	
  &	
  Maps
• Basic	
  idea:	
  Flatten	
  nested	
  collections	
  referenced	
  in	
  the	
  FROM	
  clause	
  
• Can	
  be	
  thought	
  of	
  as	
  an	
  implicit	
  join	
  on	
  the	
  parent/child	
  relationship
SELECT	
  c.id,	
  o.txn_id	
  FROM	
  customers	
  c,	
  c.orders	
  o
c.id o.txn_id
100 203
100 305
100 507
… …
101 10056
101 10
… …
id	
  of	
  a	
  customer	
  repeated	
  
for	
  every	
  order
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Referencing	
  Arrays	
  &	
  Maps
SELECT	
  c.id,	
  o.txn_id	
  FROM	
  customers	
  c,	
  c.orders	
  o	
  
SELECT	
  c.id,	
  o.txn_id	
  FROM	
  customers	
  c	
  INNER	
  JOIN	
  c.orders	
  o
Returns	
  customer/order	
  data	
  for	
  customers	
  that	
  have	
  at	
  least	
  one	
  order
SELECT	
  c.id,	
  o.txn_id	
  FROM	
  customers	
  c	
  LEFT	
  OUTER	
  JOIN	
  c.orders	
  o
Also	
  returns	
  customers	
  with	
  no	
  orders	
  (with	
  order	
  fields	
  NULL)
SELECT	
  c.id,	
  o.txn_id	
  FROM	
  customers	
  c	
  LEFT	
  ANTI	
  JOIN	
  c.orders	
  o
Find	
  all	
  customers	
  that	
  have	
  no	
  orders
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Motivation	
  for	
  Advanced	
  Querying	
  Capabilities
• Count	
  the	
  number	
  of	
  orders	
  per	
  customer	
  
• Count	
  the	
  number	
  of	
  items	
  per	
  order	
  
• Impractical	
  → Requires	
  unique	
  id	
  at	
  every	
  nesting	
  level	
  
• Information	
  is	
  already	
  expressed	
  in	
  nesting	
  relationship!	
  
• What	
  about	
  even	
  more	
  interesting	
  queries?	
  
• Get	
  the	
  number	
  of	
  orders	
  and	
  the	
  average	
  item	
  price	
  per	
  customer?	
  
• “Group	
  by”	
  multiple	
  nesting	
  levels
SELECT	
  COUNT(*)	
  FROM	
  customers	
  c,	
  c.orders	
  o	
  GROUP	
  BY	
  c.id
SELECT	
  COUNT(*)	
  FROM	
  customers.orders	
  o,	
  o.items	
  GROUP	
  BY	
  ???
Must	
  be	
  unique
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Advanced	
  Querying:	
  Correlated	
  Table	
  References
• Count	
  the	
  number	
  of	
  orders	
  per	
  customer	
  
• Count	
  the	
  number	
  of	
  items	
  per	
  order	
  
• Get	
  the	
  number	
  of	
  orders	
  and	
  the	
  average	
  item	
  price	
  per	
  customer
SELECT	
  cnt	
  FROM	
  customers	
  c,	
  (SELECT	
  COUNT(*)	
  cnt	
  FROM	
  c.orders)	
  v
SELECT	
  cnt	
  FROM	
  customers.orders	
  o,	
  (SELECT	
  COUNT(*)	
  cnt	
  FROM	
  o.items)	
  v
SELECT	
  c.id,	
  cnt,	
  avgp	
  
FROM	
  customer	
  c,	
  
	
  	
  (SELECT	
  count(1)	
  cnt	
  FROM	
  c.orders)	
  v1,	
  
	
  	
  (SELECT	
  avg(price)	
  avgp	
  FROM	
  c.orders.items)	
  v2
Correlated	
  reference	
  to	
  “c”	
  
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Advanced	
  Querying:	
  Correlated	
  Table	
  References
• Full	
  expressibility	
  
• Arbitrary	
  SQL	
  allowed	
  in	
  inline	
  views	
  and	
  subqueries	
  with	
  correlated	
  table	
  refs	
  
• Correlated	
  subqueries	
  transformed	
  into	
  joins	
  if	
  possible	
  
• Exploits	
  nesting	
  relationship	
  
• No	
  need	
  for	
  stored	
  unique	
  ids	
  at	
  various	
  levels	
  
• Similar	
  to	
  standard	
  SQL	
  ‘LATERAL’,	
  ‘CROSS	
  APPLY’,	
  ‘OUTER	
  CROSS	
  APPLY’	
  
• Goes	
  beyond	
  standard	
  in	
  some	
  aspects	
  (semi/anti	
  variants)	
  
• Not	
  as	
  general	
  as	
  SQL	
  standard	
  (limited	
  correlations)
SELECT	
  id	
  FROM	
  customers	
  c	
  WHERE	
  EXISTS	
  
	
  	
  (SELECT	
  1	
  FROM	
  c.orders	
  o,	
  o.items	
  i	
  where	
  o.cc	
  =	
  c.preferred_cc	
  and	
  i.price	
  >	
  100)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Execution	
  Model
Design	
  sweet	
  spot:	
  small/medium	
  sized	
  collections	
  (max	
  few	
  hundreds	
  of	
  MB)	
  
• Built-­‐in	
  limitation	
  on	
  the	
  size	
  of	
  nested	
  collections	
  (TBD)	
  
• Huge	
  collections	
  not	
  expected	
  to	
  be	
  performant	
  and	
  rare	
  
Execution	
  Overview	
  
• Scans	
  materialize	
  minimal	
  nested	
  structure	
  in	
  memory	
  
• Three	
  new	
  exec	
  nodes	
  
• Subplan:	
  Executes	
  its	
  subplan	
  tree	
  for	
  each	
  input	
  row	
  and	
  returns	
  the	
  rows	
  
produced	
  by	
  the	
  subplan	
  
• Nested	
  Row	
  Src:	
  Returns	
  the	
  current	
  input	
  row	
  of	
  its	
  parent	
  Subplan	
  node	
  
• Unnest:	
  Scans	
  an	
  array	
  slot	
  of	
  the	
  current	
  input	
  row	
  of	
  its	
  parent	
  Subplan	
  node	
  or	
  
of	
  an	
  output	
  row	
  from	
  its	
  child	
  plan	
  node,	
  returning	
  one	
  row	
  per	
  array	
  element.	
  
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Example	
  A
SELECT	
  count(*)	
  FROM	
  customer.orders.items
Scan	
  
customer.orders.items
Aggregate	
  
count(*)
Count	
  the	
  total	
  number	
  of	
  items
Exchange	
  
(to	
  client)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Example	
  B
SELECT	
  c.id,	
  cnt	
  
FROM	
  customer	
  c	
  
JOIN	
  (SELECT	
  count(*)	
  cnt	
  FROM	
  c.orders)	
  v
Scan	
  c
Subplan
materialize	
  
c.orders
Unnest	
  
c.orders
Aggregate	
  
count(*)
Subplan	
  Pseudo-Code	
  
result	
  =	
  {}	
  
for	
  each	
  c	
  in	
  input	
  
	
  	
  set	
  c	
  in	
  dependent	
  nodes	
  
	
  	
  result	
  +=	
  rhsPlan.exec()	
  
return	
  result
Nested	
  
Row	
  Src
Cross	
  Join
set	
  c	
  in	
  dependent	
  nodes
Count	
  the	
  number	
  of	
  orders	
  per	
  customer
Exchange	
  
(to	
  client)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Example	
  C
SELECT	
  c.id,	
  cnt,	
  avgp	
  
FROM	
  customer	
  c,	
  
	
  	
  (SELECT	
  count(*)	
  cnt	
  FROM	
  c.orders)	
  v1,	
  
	
  	
  (SELECT	
  avg(price)	
  avgp	
  FROM	
  c.orders.items)	
  v2
Scan	
  c
Subplan
Unnest	
  
c.orders
Aggregate	
  
count(*)
Unnest	
  
c.orders.items
Aggregate	
  
avg(price)
Cross	
  Join
Return	
  the	
  number	
  of	
  orders	
  and	
  
the	
  average	
  item	
  price	
  per	
  customer
Nested	
  Row	
  
Src
Cross	
  Join
Exchange	
  
(to	
  client)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Example	
  D
SELECT	
  c.id,	
  cnt	
  
FROM	
  customer	
  c,	
  
	
  	
  (SELECT	
  count(*)	
  cnt	
  FROM	
  c.orders	
  o	
  
	
  	
  	
  WHERE	
  (SELECT	
  sum(price)	
  FROM	
  o.items)	
  >	
  100)	
  v
Scan	
  c
Subplan
Unnest	
  
c.orders
Aggregate	
  
count(*)
Unnest	
  
o.items
Aggregate	
  
sum(price)	
  
sum(price)>100
Cross	
  Join
For	
  each	
  customer,	
  return	
  the	
  number	
  of	
  orders	
  
whose	
  total	
  item	
  price	
  exceeds	
  100
Subplan
Nested	
  Row	
  
Src:	
  c
Nested	
  Row	
  
Src:	
  o
Cross	
  Join
Exchange	
  
(to	
  client)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Future	
  Work
• Syntax	
  extensions	
  
• Performance	
  improvements	
  
• Parquet:	
  Scan	
  columns	
  directly	
  in	
  “unnest”,	
  avoid	
  collection	
  materialization	
  
• Codegen	
  subplans	
  
• INSERT	
  queries,	
  e.g.,	
  convert	
  flat	
  data	
  into	
  nested	
  data	
  
• UDTF	
  support	
  
• More	
  formats:	
  JSON,	
  XML,	
  etc.
SELECT	
   c.id,	
  
	
  	
   	
   count(c.orders),	
  
	
   	
   avg(c.orders.items.price)	
  
FROM	
  customer	
  c
SELECT	
  c.id,	
  cnt,	
  avgp	
  
FROM	
  customer	
  c,	
  
	
  	
  (SELECT	
  count(1)	
  cnt	
  FROM	
  c.orders)	
  v1,	
  
	
  	
  (SELECT	
  avg(price)	
  avgp	
  FROM	
  c.orders.items)	
  v2
Rewrite
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Thank	
  you
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Appendix
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Comparison	
  of	
  Impala	
  and	
  HiveQL
• Impala’s	
  syntax	
  provides	
  a	
  superset	
  of	
  Hive’s	
  functionality	
  
• HiveQL	
  has	
  similar	
  path	
  expressions	
  but	
  with	
  restrictions	
  
• Must	
  use	
  LATERAL	
  VIEW	
  in	
  FROM	
  clause;	
  more	
  verbose	
  syntax	
  
• LATERAL	
  VIEWs	
  themselves	
  have	
  many	
  restrictions,	
  no	
  arbitrary	
  SQL	
  
• Requires	
  complex	
  joins	
  or	
  unique	
  ids	
  at	
  various	
  nesting	
  levels	
  for	
  expressing	
  even	
  simple	
  queries	
  (e.g.,	
  
find	
  number	
  of	
  orders	
  per	
  customer)	
  
• Does	
  not	
  provide	
  similar	
  inline	
  view	
  and	
  subquery	
  capabilities
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Syntax	
  vs.	
  Hive	
  Syntax
SELECT	
  …	
  FROM	
  customer	
  c,	
  c.orders	
  o,	
  o.items	
  i
SELECT	
  …	
  FROM	
  customer	
  c	
  
LATERAL	
  VIEW	
  explode(c.orders)	
  as	
  (c1,	
  c2…)	
  
LATERAL	
  VIEW	
  explode(c.orders.items)	
  as	
  (c1,	
  c2…)
SELECT	
  …	
  FROM	
  customer	
  c	
  
LEFT	
  JOIN	
  c.orders	
  LEFT	
  JOIN	
  c.orders.items
SELECT	
  …	
  FROM	
  customer	
  c	
  
LATERAL	
  VIEW	
  OUTER	
  explode(c.orders)	
  as	
  (c1,	
  …)	
  
LATERAL	
  VIEW	
  OUTER	
  explode(c.orders.items)	
  as	
  (c1,	
  …)
SELECT	
  c.id	
  FROM	
  customer	
  c	
  
LEFT	
  ANTI	
  JOIN	
  (SELECT	
  oid	
  FROM	
  c.orders)	
  v	
  
ON	
  c.preferred_cc	
  =	
  v.cc
SELECT	
  c.id	
  FROM	
  customer	
  c	
  
WHERE	
  NOT	
  EXISTS	
  (SELECT	
  oid	
  FROM	
  c.orders	
  WHERE	
  
c.preferred_cc	
  =	
  orders.cc)
No	
  convenient/performant	
  equivalent	
  in	
  Hive.
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Syntax	
  vs.	
  Hive	
  Syntax
SELECT	
  …	
  
FROM	
  customer	
  c	
  
LATERAL	
  VIEW	
  MY_UDTF(c.orders)	
  as	
  (c1,	
  c2…)
Impala	
  will	
  not	
  support	
  Hive’s	
  builtin	
  or	
  user-defined	
  
table	
  generating	
  functions	
  for	
  now.
SELECT	
  …	
  
FROM	
  customer	
  c	
  
LATERAL	
  VIEW	
  json_tuple(c.json_str,	
  …)	
  as	
  (c1,	
  c2…)
SELECT	
  count(c.orders)	
  FROM	
  customer	
  c
SELECT	
  cnt	
  FROM	
  customer	
  c	
  
JOIN	
  (SELECT	
  count(1)	
  cnt	
  FROM	
  c.orders)	
  v1
SELECT	
  count(1)	
  
FROM	
  customer	
  c	
  
LATERAL	
  VIEW	
  explode(c.orders)	
  as	
  (c1,	
  c2…)	
  
GROUP	
  BY	
  c.orders	
  
(in	
  absence	
  of	
  a	
  unique	
  key	
  in	
  ‘customer’)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Syntax	
  vs.	
  Hive	
  Syntax
SELECT	
   c.id,	
  
	
  	
   	
   count(c.orders),	
  
	
   	
   avg(c.orders.items.price)	
  
FROM	
  customer	
  c
SELECT	
  c.id,	
  cnt,	
  avgp	
  
FROM	
  customer	
  c	
  
JOIN	
  (SELECT	
  count(1)	
  cnt	
  FROM	
  c.orders)	
  v1	
  
JOIN	
  (SELECT	
  avg(price)	
  avgp	
  FROM	
  c.orders.items)	
  v2
No	
  convenient/performant	
  equivalent	
  in	
  Hive.
• Impala	
  is	
  more	
  expressive,	
  but	
  less	
  extensible	
  until	
  UDTFs	
  are	
  supported	
  
• More	
  join	
  types:	
  inner/outer/semi/anti	
  
• Full	
  SQL	
  block	
  inside	
  correlated	
  inline	
  view	
  
• Hive	
  more	
  extensible	
  (UDTFs),	
  has	
  builtin	
  UDTFs	
  
• Hive	
  Lateral	
  View	
  very	
  rigid,	
  no	
  arbitrary	
  SQL	
  inside
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Nested	
  Types	
  in	
  Action
Josh	
  Wills’	
  Blog	
  post	
  on	
  analyzing	
  misspelled	
  queries	
  
http://blog.cloudera.com/blog/2014/08/how-to-count-events-like-a-data-scientist/	
  
• Goal:	
  Rudimentary	
  spell	
  checker	
  based	
  on	
  counting	
  query/click	
  events	
  
• Problem:	
  Cross	
  referencing	
  items	
  in	
  multiple	
  nested	
  collections	
  
• Representative	
  of	
  many	
  machine	
  learning	
  tasks	
  (Josh	
  tells	
  me)	
  
• Goal	
  was	
  not	
  naturally	
  achievable	
  with	
  Hive	
  	
  
• Josh	
  implemented	
  a	
  custom	
  Hive	
  extension	
  “WITHIN”	
  
• How	
  can	
  Impala	
  serve	
  this	
  use	
  case?
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Nested	
  Types	
  in	
  Action
account_id:	
  bigint	
  
search_events:	
  array<struct<	
  
	
  	
  event_id:	
  bigint	
  
	
  	
  query:	
  string	
  
	
  	
  tstamp_sec:	
  bigint	
  
	
  	
  ...	
  
>>	
  
install_events:	
  array<struct<	
  
	
  	
  event_id:	
  bigint	
  
	
  	
  search_event_id:	
  bigint	
  
	
  	
  app_id:	
  bigint	
  
	
  	
  ...	
  
>>
SELECT	
  a.qw,	
  a.qr,	
  count(*)	
  as	
  cnt	
  
FROM	
  sessions	
  
LATERAL	
  VIEW	
  WITHIN(	
  
	
  	
  "SELECT	
  bad.query	
  qw,	
  good.query	
  qr	
  
	
  	
  FROM	
  t1	
  as	
  bad,	
  t1	
  as	
  good	
  
	
  	
  WHERE	
  bad.tstamp_sec	
  <	
  good.tstamp_sec	
  
	
  	
  AND	
  good.tstamp_sec	
  -	
  bad.tstamp_sec	
  <	
  30	
  
	
  	
  AND	
  bad.event_id	
  NOT	
  IN	
  (select	
  search_event_id	
  FROM	
  t2)	
  
	
  	
  AND	
  good.event_id	
  IN	
  (select	
  search_event_id	
  FROM	
  t2)",	
  
search_events,	
  install_events)	
  a	
  
GROUP	
  BY	
  a.qw,	
  a.qr;
SELECT	
  bad.query,	
  good.query,	
  count(*)	
  as	
  cnt	
  
FROM	
  sessions	
  s,	
  
	
  	
  s.search_events	
  bad,	
  
	
  	
  s.search_events	
  good	
  
WHERE	
  bad.tstamp_sec	
  <	
  good.tstamp_sec	
  
	
  	
  AND	
  good.tstamp_sec	
  -	
  bad.tstamp_sec	
  <	
  30	
  
	
  	
  AND	
  bad.event_id	
  NOT	
  IN	
  (select	
  search_event_id	
  FROM	
  s.install_events)	
  
	
  	
  AND	
  good.event_id	
  IN	
  (select	
  search_event_id	
  FROM	
  s.install_events)	
  
GROUP	
  BY	
  bad.query,	
  good.query;
Josh’s	
  HiveQL	
  extension
Impala	
  SQL
Schema
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Nested	
  Types	
  in	
  Action
SELECT	
  bad.query,	
  good.query,	
  count(*)	
  as	
  cnt	
  
FROM	
  sessions	
  s,	
  
	
  	
  s.search_events	
  bad,	
  
	
  	
  s.search_events	
  good	
  
WHERE	
  bad.tstamp_sec	
  <	
  good.tstamp_sec	
  
	
  	
  AND	
  good.tstamp_sec	
  -	
  bad.tstamp_sec	
  <	
  30	
  
	
  	
  AND	
  bad.event_id	
  NOT	
  IN	
  (select	
  search_event_id	
  FROM	
  s.install_events)	
  
	
  	
  AND	
  good.event_id	
  IN	
  (select	
  search_event_id	
  FROM	
  s.install_events)	
  
GROUP	
  BY	
  bad.query,	
  good.query;
SELECT	
  bad.query,	
  good.query,	
  count(*)	
  as	
  cnt	
  
FROM	
  sessions	
  s	
  
JOIN	
  (SELECT	
  *	
  FROM	
  s.search_events)	
  bad	
  
JOIN	
  (SELECT	
  *	
  FROM	
  s.search_events)	
  good	
  
LEFT	
  ANTI	
  JOIN	
  (SELECT	
  search_event_id	
  FROM	
  s.install_events)	
  v1	
  
ON	
  (bad.event_id	
  =	
  v1.search_event_id)	
  
LEFT	
  SEMI	
  JOIN	
  (SELECT	
  search_event_id	
  FROM	
  s.install_events)	
  v2	
  
ON	
  (good.event_id	
  =	
  v2.search_event_id)	
  
WHERE	
  bad.tstamp_sec	
  <	
  good.tstamp_sec	
  
	
  	
  AND	
  good.tstamp_sec	
  -	
  bad.tstamp_sec	
  <	
  30	
  
GROUP	
  BY	
  bad.query,	
  good.query;
Rewrite
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Nested	
  Types	
  in	
  Action
Scan	
  s
Subplan
Cross	
  Join	
  
bad.ts	
  <	
  good.ts	
  AND	
  
good.ts	
  –	
  bad.ts	
  <	
  30
Unnest	
  
s.search_events	
  bad
Left	
  Anti	
  Join	
  
bad.event_id	
  =	
  
v1.search_event_id
Unnest	
  
s.install_events	
  v1
Left	
  Semi	
  Join	
  
good.event_id	
  =	
  
v2.search_event_id
Unnest	
  
s.install_events	
  v2
Aggregate	
  
count(*)	
  group	
  by	
  
bad.query,	
  good.query
Unnest	
  
s.search_events	
  good
SELECT	
  bad.query,	
  good.query,	
  count(*)	
  as	
  cnt	
  
FROM	
  sessions	
  s	
  
JOIN	
  (SELECT	
  *	
  FROM	
  s.search_events)	
  bad	
  
JOIN	
  (SELECT	
  *	
  FROM	
  s.search_events)	
  good	
  
LEFT	
  ANTI	
  JOIN	
  (SELECT	
  search_event_id	
  FROM	
  s.install_events)	
  v1	
  
ON	
  (bad.event_id	
  =	
  v1.search_event_id)	
  
LEFT	
  SEMI	
  JOIN	
  (SELECT	
  search_event_id	
  FROM	
  s.install_events)	
  v2	
  
ON	
  (good.event_id	
  =	
  v2.search_event_id)	
  
WHERE	
  bad.tstamp_sec	
  <	
  good.tstamp_sec	
  
	
  	
  AND	
  good.tstamp_sec	
  -	
  bad.tstamp_sec	
  <	
  30	
  
GROUP	
  BY	
  bad.query,	
  good.query;

More Related Content

What's hot

PostgreSQL
PostgreSQL PostgreSQL
PostgreSQL
Amazon Web Services
 
SQL
SQLSQL
Unions and joins in mysql
Unions and joins in mysqlUnions and joins in mysql
Sql queries
Sql queriesSql queries
Sql queries
Preeti Lakhani
 
Plsql task
Plsql taskPlsql task
Plsql task
Nawaz Sk
 
Moq Presentation
Moq PresentationMoq Presentation
Moq Presentation
LynxStar
 
Ejercicio sql tienda informatica
Ejercicio sql tienda informaticaEjercicio sql tienda informatica
Ejercicio sql tienda informatica
Ashley Stronghold Witwicky
 
Typescript in 30mins
Typescript in 30mins Typescript in 30mins
Typescript in 30mins
Udaya Kumar
 
Why Scala?
Why Scala?Why Scala?
Why Scala?
Alex Payne
 
Introducing type script
Introducing type scriptIntroducing type script
Introducing type script
Remo Jansen
 
Oracle - Program with PL/SQL - Lession 01
Oracle - Program with PL/SQL - Lession 01Oracle - Program with PL/SQL - Lession 01
Oracle - Program with PL/SQL - Lession 01
Thuan Nguyen
 
Re-Engineering PostgreSQL as a Time-Series Database
Re-Engineering PostgreSQL as a Time-Series DatabaseRe-Engineering PostgreSQL as a Time-Series Database
Re-Engineering PostgreSQL as a Time-Series Database
All Things Open
 
Complex queries in sql
Complex queries in sqlComplex queries in sql
Complex queries in sql
Charan Reddy
 
Introduction to triggers
Introduction to triggersIntroduction to triggers
Introduction to triggers
Syed Awais Mazhar Bukhari
 
MySQL Optimizer Cost Model
MySQL Optimizer Cost ModelMySQL Optimizer Cost Model
MySQL Optimizer Cost Model
Olav Sandstå
 
A philosophy of software design
A philosophy of software designA philosophy of software design
A philosophy of software design
Diego Pacheco
 
SQL window functions for MySQL
SQL window functions for MySQLSQL window functions for MySQL
SQL window functions for MySQL
Dag H. Wanvik
 

What's hot (20)

PostgreSQL
PostgreSQL PostgreSQL
PostgreSQL
 
SQL
SQLSQL
SQL
 
Unions and joins in mysql
Unions and joins in mysqlUnions and joins in mysql
Unions and joins in mysql
 
Sql queries
Sql queriesSql queries
Sql queries
 
Trigger
TriggerTrigger
Trigger
 
Plsql task
Plsql taskPlsql task
Plsql task
 
Moq Presentation
Moq PresentationMoq Presentation
Moq Presentation
 
Ejercicio sql tienda informatica
Ejercicio sql tienda informaticaEjercicio sql tienda informatica
Ejercicio sql tienda informatica
 
Typescript in 30mins
Typescript in 30mins Typescript in 30mins
Typescript in 30mins
 
Sql Queries
Sql QueriesSql Queries
Sql Queries
 
Why Scala?
Why Scala?Why Scala?
Why Scala?
 
SQL BASIC QUERIES SOLUTION ~hmftj
SQL BASIC QUERIES SOLUTION ~hmftjSQL BASIC QUERIES SOLUTION ~hmftj
SQL BASIC QUERIES SOLUTION ~hmftj
 
Introducing type script
Introducing type scriptIntroducing type script
Introducing type script
 
Oracle - Program with PL/SQL - Lession 01
Oracle - Program with PL/SQL - Lession 01Oracle - Program with PL/SQL - Lession 01
Oracle - Program with PL/SQL - Lession 01
 
Re-Engineering PostgreSQL as a Time-Series Database
Re-Engineering PostgreSQL as a Time-Series DatabaseRe-Engineering PostgreSQL as a Time-Series Database
Re-Engineering PostgreSQL as a Time-Series Database
 
Complex queries in sql
Complex queries in sqlComplex queries in sql
Complex queries in sql
 
Introduction to triggers
Introduction to triggersIntroduction to triggers
Introduction to triggers
 
MySQL Optimizer Cost Model
MySQL Optimizer Cost ModelMySQL Optimizer Cost Model
MySQL Optimizer Cost Model
 
A philosophy of software design
A philosophy of software designA philosophy of software design
A philosophy of software design
 
SQL window functions for MySQL
SQL window functions for MySQLSQL window functions for MySQL
SQL window functions for MySQL
 

Similar to Nested Types in Impala

Marcel Kornacker, Software Enginner at Cloudera - "Data modeling for data sci...
Marcel Kornacker, Software Enginner at Cloudera - "Data modeling for data sci...Marcel Kornacker, Software Enginner at Cloudera - "Data modeling for data sci...
Marcel Kornacker, Software Enginner at Cloudera - "Data modeling for data sci...
Dataconomy Media
 
Data Modeling for Data Science: Simplify Your Workload with Complex Types in ...
Data Modeling for Data Science: Simplify Your Workload with Complex Types in ...Data Modeling for Data Science: Simplify Your Workload with Complex Types in ...
Data Modeling for Data Science: Simplify Your Workload with Complex Types in ...
Cloudera, Inc.
 
U-SQL - Azure Data Lake Analytics for Developers
U-SQL - Azure Data Lake Analytics for DevelopersU-SQL - Azure Data Lake Analytics for Developers
U-SQL - Azure Data Lake Analytics for Developers
Michael Rys
 
Friction-free ETL: Automating data transformation with Impala | Strata + Hado...
Friction-free ETL: Automating data transformation with Impala | Strata + Hado...Friction-free ETL: Automating data transformation with Impala | Strata + Hado...
Friction-free ETL: Automating data transformation with Impala | Strata + Hado...
Cloudera, Inc.
 
Cassandra Basics, Counters and Time Series Modeling
Cassandra Basics, Counters and Time Series ModelingCassandra Basics, Counters and Time Series Modeling
Cassandra Basics, Counters and Time Series Modeling
Vassilis Bekiaris
 
ADL/U-SQL Introduction (SQLBits 2016)
ADL/U-SQL Introduction (SQLBits 2016)ADL/U-SQL Introduction (SQLBits 2016)
ADL/U-SQL Introduction (SQLBits 2016)
Michael Rys
 
AWS Summit Seoul 2015 - AWS 최신 서비스 살펴보기 - Aurora, Lambda, EFS, Machine Learn...
AWS Summit Seoul 2015 -  AWS 최신 서비스 살펴보기 - Aurora, Lambda, EFS, Machine Learn...AWS Summit Seoul 2015 -  AWS 최신 서비스 살펴보기 - Aurora, Lambda, EFS, Machine Learn...
AWS Summit Seoul 2015 - AWS 최신 서비스 살펴보기 - Aurora, Lambda, EFS, Machine Learn...
Amazon Web Services Korea
 
Old code doesn't stink
Old code doesn't stinkOld code doesn't stink
Old code doesn't stink
Martin Gutenbrunner
 
Beg sql
Beg sqlBeg sql
3 CityNetConf - sql+c#=u-sql
3 CityNetConf - sql+c#=u-sql3 CityNetConf - sql+c#=u-sql
3 CityNetConf - sql+c#=u-sql
Łukasz Grala
 
Sql Server - Apresentação
Sql Server - ApresentaçãoSql Server - Apresentação
Sql Server - Apresentação
Professora Maria Lúcia Etec
 
Dbms lab Manual
Dbms lab ManualDbms lab Manual
Dbms lab Manual
Vivek Kumar Sinha
 
Dublin Ireland Spark Meetup October 15, 2015
Dublin Ireland Spark Meetup October 15, 2015Dublin Ireland Spark Meetup October 15, 2015
Dublin Ireland Spark Meetup October 15, 2015
eddiebaggott
 
Sql server T-sql basics ppt-3
Sql server T-sql basics  ppt-3Sql server T-sql basics  ppt-3
Sql server T-sql basics ppt-3
Vibrant Technologies & Computers
 
Building a Real-time Streaming ETL Framework Using ksqlDB and NoSQL
Building a Real-time Streaming ETL Framework Using ksqlDB and NoSQLBuilding a Real-time Streaming ETL Framework Using ksqlDB and NoSQL
Building a Real-time Streaming ETL Framework Using ksqlDB and NoSQL
ScyllaDB
 
Tricks every ClickHouse designer should know, by Robert Hodges, Altinity CEO
Tricks every ClickHouse designer should know, by Robert Hodges, Altinity CEOTricks every ClickHouse designer should know, by Robert Hodges, Altinity CEO
Tricks every ClickHouse designer should know, by Robert Hodges, Altinity CEO
Altinity Ltd
 
Event streaming webinar feb 2020
Event streaming webinar feb 2020Event streaming webinar feb 2020
Event streaming webinar feb 2020
Maheedhar Gunturu
 
Anything SQL: Lightning Talks
Anything SQL: Lightning TalksAnything SQL: Lightning Talks
Anything SQL: Lightning Talks
SQL Server Sri Lanka User Group
 
DN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project A
DN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project ADN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project A
DN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project A
Dataconomy Media
 

Similar to Nested Types in Impala (20)

Marcel Kornacker, Software Enginner at Cloudera - "Data modeling for data sci...
Marcel Kornacker, Software Enginner at Cloudera - "Data modeling for data sci...Marcel Kornacker, Software Enginner at Cloudera - "Data modeling for data sci...
Marcel Kornacker, Software Enginner at Cloudera - "Data modeling for data sci...
 
Data Modeling for Data Science: Simplify Your Workload with Complex Types in ...
Data Modeling for Data Science: Simplify Your Workload with Complex Types in ...Data Modeling for Data Science: Simplify Your Workload with Complex Types in ...
Data Modeling for Data Science: Simplify Your Workload with Complex Types in ...
 
U-SQL - Azure Data Lake Analytics for Developers
U-SQL - Azure Data Lake Analytics for DevelopersU-SQL - Azure Data Lake Analytics for Developers
U-SQL - Azure Data Lake Analytics for Developers
 
Friction-free ETL: Automating data transformation with Impala | Strata + Hado...
Friction-free ETL: Automating data transformation with Impala | Strata + Hado...Friction-free ETL: Automating data transformation with Impala | Strata + Hado...
Friction-free ETL: Automating data transformation with Impala | Strata + Hado...
 
Cassandra Basics, Counters and Time Series Modeling
Cassandra Basics, Counters and Time Series ModelingCassandra Basics, Counters and Time Series Modeling
Cassandra Basics, Counters and Time Series Modeling
 
ADL/U-SQL Introduction (SQLBits 2016)
ADL/U-SQL Introduction (SQLBits 2016)ADL/U-SQL Introduction (SQLBits 2016)
ADL/U-SQL Introduction (SQLBits 2016)
 
AWS Summit Seoul 2015 - AWS 최신 서비스 살펴보기 - Aurora, Lambda, EFS, Machine Learn...
AWS Summit Seoul 2015 -  AWS 최신 서비스 살펴보기 - Aurora, Lambda, EFS, Machine Learn...AWS Summit Seoul 2015 -  AWS 최신 서비스 살펴보기 - Aurora, Lambda, EFS, Machine Learn...
AWS Summit Seoul 2015 - AWS 최신 서비스 살펴보기 - Aurora, Lambda, EFS, Machine Learn...
 
Old code doesn't stink
Old code doesn't stinkOld code doesn't stink
Old code doesn't stink
 
Beg sql
Beg sqlBeg sql
Beg sql
 
Beg sql
Beg sqlBeg sql
Beg sql
 
3 CityNetConf - sql+c#=u-sql
3 CityNetConf - sql+c#=u-sql3 CityNetConf - sql+c#=u-sql
3 CityNetConf - sql+c#=u-sql
 
Sql Server - Apresentação
Sql Server - ApresentaçãoSql Server - Apresentação
Sql Server - Apresentação
 
Dbms lab Manual
Dbms lab ManualDbms lab Manual
Dbms lab Manual
 
Dublin Ireland Spark Meetup October 15, 2015
Dublin Ireland Spark Meetup October 15, 2015Dublin Ireland Spark Meetup October 15, 2015
Dublin Ireland Spark Meetup October 15, 2015
 
Sql server T-sql basics ppt-3
Sql server T-sql basics  ppt-3Sql server T-sql basics  ppt-3
Sql server T-sql basics ppt-3
 
Building a Real-time Streaming ETL Framework Using ksqlDB and NoSQL
Building a Real-time Streaming ETL Framework Using ksqlDB and NoSQLBuilding a Real-time Streaming ETL Framework Using ksqlDB and NoSQL
Building a Real-time Streaming ETL Framework Using ksqlDB and NoSQL
 
Tricks every ClickHouse designer should know, by Robert Hodges, Altinity CEO
Tricks every ClickHouse designer should know, by Robert Hodges, Altinity CEOTricks every ClickHouse designer should know, by Robert Hodges, Altinity CEO
Tricks every ClickHouse designer should know, by Robert Hodges, Altinity CEO
 
Event streaming webinar feb 2020
Event streaming webinar feb 2020Event streaming webinar feb 2020
Event streaming webinar feb 2020
 
Anything SQL: Lightning Talks
Anything SQL: Lightning TalksAnything SQL: Lightning Talks
Anything SQL: Lightning Talks
 
DN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project A
DN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project ADN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project A
DN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project A
 

More from Cloudera, Inc.

Partner Briefing_January 25 (FINAL).pptx
Partner Briefing_January 25 (FINAL).pptxPartner Briefing_January 25 (FINAL).pptx
Partner Briefing_January 25 (FINAL).pptx
Cloudera, Inc.
 
Cloudera Data Impact Awards 2021 - Finalists
Cloudera Data Impact Awards 2021 - Finalists Cloudera Data Impact Awards 2021 - Finalists
Cloudera Data Impact Awards 2021 - Finalists
Cloudera, Inc.
 
2020 Cloudera Data Impact Awards Finalists
2020 Cloudera Data Impact Awards Finalists2020 Cloudera Data Impact Awards Finalists
2020 Cloudera Data Impact Awards Finalists
Cloudera, Inc.
 
Edc event vienna presentation 1 oct 2019
Edc event vienna presentation 1 oct 2019Edc event vienna presentation 1 oct 2019
Edc event vienna presentation 1 oct 2019
Cloudera, Inc.
 
Machine Learning with Limited Labeled Data 4/3/19
Machine Learning with Limited Labeled Data 4/3/19Machine Learning with Limited Labeled Data 4/3/19
Machine Learning with Limited Labeled Data 4/3/19
Cloudera, Inc.
 
Data Driven With the Cloudera Modern Data Warehouse 3.19.19
Data Driven With the Cloudera Modern Data Warehouse 3.19.19Data Driven With the Cloudera Modern Data Warehouse 3.19.19
Data Driven With the Cloudera Modern Data Warehouse 3.19.19
Cloudera, Inc.
 
Introducing Cloudera DataFlow (CDF) 2.13.19
Introducing Cloudera DataFlow (CDF) 2.13.19Introducing Cloudera DataFlow (CDF) 2.13.19
Introducing Cloudera DataFlow (CDF) 2.13.19
Cloudera, Inc.
 
Introducing Cloudera Data Science Workbench for HDP 2.12.19
Introducing Cloudera Data Science Workbench for HDP 2.12.19Introducing Cloudera Data Science Workbench for HDP 2.12.19
Introducing Cloudera Data Science Workbench for HDP 2.12.19
Cloudera, Inc.
 
Shortening the Sales Cycle with a Modern Data Warehouse 1.30.19
Shortening the Sales Cycle with a Modern Data Warehouse 1.30.19Shortening the Sales Cycle with a Modern Data Warehouse 1.30.19
Shortening the Sales Cycle with a Modern Data Warehouse 1.30.19
Cloudera, Inc.
 
Leveraging the cloud for analytics and machine learning 1.29.19
Leveraging the cloud for analytics and machine learning 1.29.19Leveraging the cloud for analytics and machine learning 1.29.19
Leveraging the cloud for analytics and machine learning 1.29.19
Cloudera, Inc.
 
Modernizing the Legacy Data Warehouse – What, Why, and How 1.23.19
Modernizing the Legacy Data Warehouse – What, Why, and How 1.23.19Modernizing the Legacy Data Warehouse – What, Why, and How 1.23.19
Modernizing the Legacy Data Warehouse – What, Why, and How 1.23.19
Cloudera, Inc.
 
Leveraging the Cloud for Big Data Analytics 12.11.18
Leveraging the Cloud for Big Data Analytics 12.11.18Leveraging the Cloud for Big Data Analytics 12.11.18
Leveraging the Cloud for Big Data Analytics 12.11.18
Cloudera, Inc.
 
Modern Data Warehouse Fundamentals Part 3
Modern Data Warehouse Fundamentals Part 3Modern Data Warehouse Fundamentals Part 3
Modern Data Warehouse Fundamentals Part 3
Cloudera, Inc.
 
Modern Data Warehouse Fundamentals Part 2
Modern Data Warehouse Fundamentals Part 2Modern Data Warehouse Fundamentals Part 2
Modern Data Warehouse Fundamentals Part 2
Cloudera, Inc.
 
Modern Data Warehouse Fundamentals Part 1
Modern Data Warehouse Fundamentals Part 1Modern Data Warehouse Fundamentals Part 1
Modern Data Warehouse Fundamentals Part 1
Cloudera, Inc.
 
Extending Cloudera SDX beyond the Platform
Extending Cloudera SDX beyond the PlatformExtending Cloudera SDX beyond the Platform
Extending Cloudera SDX beyond the Platform
Cloudera, Inc.
 
Federated Learning: ML with Privacy on the Edge 11.15.18
Federated Learning: ML with Privacy on the Edge 11.15.18Federated Learning: ML with Privacy on the Edge 11.15.18
Federated Learning: ML with Privacy on the Edge 11.15.18
Cloudera, Inc.
 
Analyst Webinar: Doing a 180 on Customer 360
Analyst Webinar: Doing a 180 on Customer 360Analyst Webinar: Doing a 180 on Customer 360
Analyst Webinar: Doing a 180 on Customer 360
Cloudera, Inc.
 
Build a modern platform for anti-money laundering 9.19.18
Build a modern platform for anti-money laundering 9.19.18Build a modern platform for anti-money laundering 9.19.18
Build a modern platform for anti-money laundering 9.19.18
Cloudera, Inc.
 
Introducing the data science sandbox as a service 8.30.18
Introducing the data science sandbox as a service 8.30.18Introducing the data science sandbox as a service 8.30.18
Introducing the data science sandbox as a service 8.30.18
Cloudera, Inc.
 

More from Cloudera, Inc. (20)

Partner Briefing_January 25 (FINAL).pptx
Partner Briefing_January 25 (FINAL).pptxPartner Briefing_January 25 (FINAL).pptx
Partner Briefing_January 25 (FINAL).pptx
 
Cloudera Data Impact Awards 2021 - Finalists
Cloudera Data Impact Awards 2021 - Finalists Cloudera Data Impact Awards 2021 - Finalists
Cloudera Data Impact Awards 2021 - Finalists
 
2020 Cloudera Data Impact Awards Finalists
2020 Cloudera Data Impact Awards Finalists2020 Cloudera Data Impact Awards Finalists
2020 Cloudera Data Impact Awards Finalists
 
Edc event vienna presentation 1 oct 2019
Edc event vienna presentation 1 oct 2019Edc event vienna presentation 1 oct 2019
Edc event vienna presentation 1 oct 2019
 
Machine Learning with Limited Labeled Data 4/3/19
Machine Learning with Limited Labeled Data 4/3/19Machine Learning with Limited Labeled Data 4/3/19
Machine Learning with Limited Labeled Data 4/3/19
 
Data Driven With the Cloudera Modern Data Warehouse 3.19.19
Data Driven With the Cloudera Modern Data Warehouse 3.19.19Data Driven With the Cloudera Modern Data Warehouse 3.19.19
Data Driven With the Cloudera Modern Data Warehouse 3.19.19
 
Introducing Cloudera DataFlow (CDF) 2.13.19
Introducing Cloudera DataFlow (CDF) 2.13.19Introducing Cloudera DataFlow (CDF) 2.13.19
Introducing Cloudera DataFlow (CDF) 2.13.19
 
Introducing Cloudera Data Science Workbench for HDP 2.12.19
Introducing Cloudera Data Science Workbench for HDP 2.12.19Introducing Cloudera Data Science Workbench for HDP 2.12.19
Introducing Cloudera Data Science Workbench for HDP 2.12.19
 
Shortening the Sales Cycle with a Modern Data Warehouse 1.30.19
Shortening the Sales Cycle with a Modern Data Warehouse 1.30.19Shortening the Sales Cycle with a Modern Data Warehouse 1.30.19
Shortening the Sales Cycle with a Modern Data Warehouse 1.30.19
 
Leveraging the cloud for analytics and machine learning 1.29.19
Leveraging the cloud for analytics and machine learning 1.29.19Leveraging the cloud for analytics and machine learning 1.29.19
Leveraging the cloud for analytics and machine learning 1.29.19
 
Modernizing the Legacy Data Warehouse – What, Why, and How 1.23.19
Modernizing the Legacy Data Warehouse – What, Why, and How 1.23.19Modernizing the Legacy Data Warehouse – What, Why, and How 1.23.19
Modernizing the Legacy Data Warehouse – What, Why, and How 1.23.19
 
Leveraging the Cloud for Big Data Analytics 12.11.18
Leveraging the Cloud for Big Data Analytics 12.11.18Leveraging the Cloud for Big Data Analytics 12.11.18
Leveraging the Cloud for Big Data Analytics 12.11.18
 
Modern Data Warehouse Fundamentals Part 3
Modern Data Warehouse Fundamentals Part 3Modern Data Warehouse Fundamentals Part 3
Modern Data Warehouse Fundamentals Part 3
 
Modern Data Warehouse Fundamentals Part 2
Modern Data Warehouse Fundamentals Part 2Modern Data Warehouse Fundamentals Part 2
Modern Data Warehouse Fundamentals Part 2
 
Modern Data Warehouse Fundamentals Part 1
Modern Data Warehouse Fundamentals Part 1Modern Data Warehouse Fundamentals Part 1
Modern Data Warehouse Fundamentals Part 1
 
Extending Cloudera SDX beyond the Platform
Extending Cloudera SDX beyond the PlatformExtending Cloudera SDX beyond the Platform
Extending Cloudera SDX beyond the Platform
 
Federated Learning: ML with Privacy on the Edge 11.15.18
Federated Learning: ML with Privacy on the Edge 11.15.18Federated Learning: ML with Privacy on the Edge 11.15.18
Federated Learning: ML with Privacy on the Edge 11.15.18
 
Analyst Webinar: Doing a 180 on Customer 360
Analyst Webinar: Doing a 180 on Customer 360Analyst Webinar: Doing a 180 on Customer 360
Analyst Webinar: Doing a 180 on Customer 360
 
Build a modern platform for anti-money laundering 9.19.18
Build a modern platform for anti-money laundering 9.19.18Build a modern platform for anti-money laundering 9.19.18
Build a modern platform for anti-money laundering 9.19.18
 
Introducing the data science sandbox as a service 8.30.18
Introducing the data science sandbox as a service 8.30.18Introducing the data science sandbox as a service 8.30.18
Introducing the data science sandbox as a service 8.30.18
 

Recently uploaded

E-commerce Application Development Company.pdf
E-commerce Application Development Company.pdfE-commerce Application Development Company.pdf
E-commerce Application Development Company.pdf
Hornet Dynamics
 
Empowering Growth with Best Software Development Company in Noida - Deuglo
Empowering Growth with Best Software  Development Company in Noida - DeugloEmpowering Growth with Best Software  Development Company in Noida - Deuglo
Empowering Growth with Best Software Development Company in Noida - Deuglo
Deuglo Infosystem Pvt Ltd
 
Pro Unity Game Development with C-sharp Book
Pro Unity Game Development with C-sharp BookPro Unity Game Development with C-sharp Book
Pro Unity Game Development with C-sharp Book
abdulrafaychaudhry
 
Utilocate provides Smarter, Better, Faster, Safer Locate Ticket Management
Utilocate provides Smarter, Better, Faster, Safer Locate Ticket ManagementUtilocate provides Smarter, Better, Faster, Safer Locate Ticket Management
Utilocate provides Smarter, Better, Faster, Safer Locate Ticket Management
Utilocate
 
AI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI App
AI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI AppAI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI App
AI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI App
Google
 
Orion Context Broker introduction 20240604
Orion Context Broker introduction 20240604Orion Context Broker introduction 20240604
Orion Context Broker introduction 20240604
Fermin Galan
 
Nidhi Software Price. Fact , Costs, Tips
Nidhi Software Price. Fact , Costs, TipsNidhi Software Price. Fact , Costs, Tips
Nidhi Software Price. Fact , Costs, Tips
vrstrong314
 
Introduction to Pygame (Lecture 7 Python Game Development)
Introduction to Pygame (Lecture 7 Python Game Development)Introduction to Pygame (Lecture 7 Python Game Development)
Introduction to Pygame (Lecture 7 Python Game Development)
abdulrafaychaudhry
 
OpenMetadata Community Meeting - 5th June 2024
OpenMetadata Community Meeting - 5th June 2024OpenMetadata Community Meeting - 5th June 2024
OpenMetadata Community Meeting - 5th June 2024
OpenMetadata
 
openEuler Case Study - The Journey to Supply Chain Security
openEuler Case Study - The Journey to Supply Chain SecurityopenEuler Case Study - The Journey to Supply Chain Security
openEuler Case Study - The Journey to Supply Chain Security
Shane Coughlan
 
Lecture 1 Introduction to games development
Lecture 1 Introduction to games developmentLecture 1 Introduction to games development
Lecture 1 Introduction to games development
abdulrafaychaudhry
 
Innovating Inference - Remote Triggering of Large Language Models on HPC Clus...
Innovating Inference - Remote Triggering of Large Language Models on HPC Clus...Innovating Inference - Remote Triggering of Large Language Models on HPC Clus...
Innovating Inference - Remote Triggering of Large Language Models on HPC Clus...
Globus
 
Graspan: A Big Data System for Big Code Analysis
Graspan: A Big Data System for Big Code AnalysisGraspan: A Big Data System for Big Code Analysis
Graspan: A Big Data System for Big Code Analysis
Aftab Hussain
 
Atelier - Innover avec l’IA Générative et les graphes de connaissances
Atelier - Innover avec l’IA Générative et les graphes de connaissancesAtelier - Innover avec l’IA Générative et les graphes de connaissances
Atelier - Innover avec l’IA Générative et les graphes de connaissances
Neo4j
 
AI Genie Review: World’s First Open AI WordPress Website Creator
AI Genie Review: World’s First Open AI WordPress Website CreatorAI Genie Review: World’s First Open AI WordPress Website Creator
AI Genie Review: World’s First Open AI WordPress Website Creator
Google
 
In 2015, I used to write extensions for Joomla, WordPress, phpBB3, etc and I ...
In 2015, I used to write extensions for Joomla, WordPress, phpBB3, etc and I ...In 2015, I used to write extensions for Joomla, WordPress, phpBB3, etc and I ...
In 2015, I used to write extensions for Joomla, WordPress, phpBB3, etc and I ...
Juraj Vysvader
 
Globus Connect Server Deep Dive - GlobusWorld 2024
Globus Connect Server Deep Dive - GlobusWorld 2024Globus Connect Server Deep Dive - GlobusWorld 2024
Globus Connect Server Deep Dive - GlobusWorld 2024
Globus
 
Need for Speed: Removing speed bumps from your Symfony projects ⚡️
Need for Speed: Removing speed bumps from your Symfony projects ⚡️Need for Speed: Removing speed bumps from your Symfony projects ⚡️
Need for Speed: Removing speed bumps from your Symfony projects ⚡️
Łukasz Chruściel
 
Essentials of Automations: The Art of Triggers and Actions in FME
Essentials of Automations: The Art of Triggers and Actions in FMEEssentials of Automations: The Art of Triggers and Actions in FME
Essentials of Automations: The Art of Triggers and Actions in FME
Safe Software
 
A Study of Variable-Role-based Feature Enrichment in Neural Models of Code
A Study of Variable-Role-based Feature Enrichment in Neural Models of CodeA Study of Variable-Role-based Feature Enrichment in Neural Models of Code
A Study of Variable-Role-based Feature Enrichment in Neural Models of Code
Aftab Hussain
 

Recently uploaded (20)

E-commerce Application Development Company.pdf
E-commerce Application Development Company.pdfE-commerce Application Development Company.pdf
E-commerce Application Development Company.pdf
 
Empowering Growth with Best Software Development Company in Noida - Deuglo
Empowering Growth with Best Software  Development Company in Noida - DeugloEmpowering Growth with Best Software  Development Company in Noida - Deuglo
Empowering Growth with Best Software Development Company in Noida - Deuglo
 
Pro Unity Game Development with C-sharp Book
Pro Unity Game Development with C-sharp BookPro Unity Game Development with C-sharp Book
Pro Unity Game Development with C-sharp Book
 
Utilocate provides Smarter, Better, Faster, Safer Locate Ticket Management
Utilocate provides Smarter, Better, Faster, Safer Locate Ticket ManagementUtilocate provides Smarter, Better, Faster, Safer Locate Ticket Management
Utilocate provides Smarter, Better, Faster, Safer Locate Ticket Management
 
AI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI App
AI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI AppAI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI App
AI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI App
 
Orion Context Broker introduction 20240604
Orion Context Broker introduction 20240604Orion Context Broker introduction 20240604
Orion Context Broker introduction 20240604
 
Nidhi Software Price. Fact , Costs, Tips
Nidhi Software Price. Fact , Costs, TipsNidhi Software Price. Fact , Costs, Tips
Nidhi Software Price. Fact , Costs, Tips
 
Introduction to Pygame (Lecture 7 Python Game Development)
Introduction to Pygame (Lecture 7 Python Game Development)Introduction to Pygame (Lecture 7 Python Game Development)
Introduction to Pygame (Lecture 7 Python Game Development)
 
OpenMetadata Community Meeting - 5th June 2024
OpenMetadata Community Meeting - 5th June 2024OpenMetadata Community Meeting - 5th June 2024
OpenMetadata Community Meeting - 5th June 2024
 
openEuler Case Study - The Journey to Supply Chain Security
openEuler Case Study - The Journey to Supply Chain SecurityopenEuler Case Study - The Journey to Supply Chain Security
openEuler Case Study - The Journey to Supply Chain Security
 
Lecture 1 Introduction to games development
Lecture 1 Introduction to games developmentLecture 1 Introduction to games development
Lecture 1 Introduction to games development
 
Innovating Inference - Remote Triggering of Large Language Models on HPC Clus...
Innovating Inference - Remote Triggering of Large Language Models on HPC Clus...Innovating Inference - Remote Triggering of Large Language Models on HPC Clus...
Innovating Inference - Remote Triggering of Large Language Models on HPC Clus...
 
Graspan: A Big Data System for Big Code Analysis
Graspan: A Big Data System for Big Code AnalysisGraspan: A Big Data System for Big Code Analysis
Graspan: A Big Data System for Big Code Analysis
 
Atelier - Innover avec l’IA Générative et les graphes de connaissances
Atelier - Innover avec l’IA Générative et les graphes de connaissancesAtelier - Innover avec l’IA Générative et les graphes de connaissances
Atelier - Innover avec l’IA Générative et les graphes de connaissances
 
AI Genie Review: World’s First Open AI WordPress Website Creator
AI Genie Review: World’s First Open AI WordPress Website CreatorAI Genie Review: World’s First Open AI WordPress Website Creator
AI Genie Review: World’s First Open AI WordPress Website Creator
 
In 2015, I used to write extensions for Joomla, WordPress, phpBB3, etc and I ...
In 2015, I used to write extensions for Joomla, WordPress, phpBB3, etc and I ...In 2015, I used to write extensions for Joomla, WordPress, phpBB3, etc and I ...
In 2015, I used to write extensions for Joomla, WordPress, phpBB3, etc and I ...
 
Globus Connect Server Deep Dive - GlobusWorld 2024
Globus Connect Server Deep Dive - GlobusWorld 2024Globus Connect Server Deep Dive - GlobusWorld 2024
Globus Connect Server Deep Dive - GlobusWorld 2024
 
Need for Speed: Removing speed bumps from your Symfony projects ⚡️
Need for Speed: Removing speed bumps from your Symfony projects ⚡️Need for Speed: Removing speed bumps from your Symfony projects ⚡️
Need for Speed: Removing speed bumps from your Symfony projects ⚡️
 
Essentials of Automations: The Art of Triggers and Actions in FME
Essentials of Automations: The Art of Triggers and Actions in FMEEssentials of Automations: The Art of Triggers and Actions in FME
Essentials of Automations: The Art of Triggers and Actions in FME
 
A Study of Variable-Role-based Feature Enrichment in Neural Models of Code
A Study of Variable-Role-based Feature Enrichment in Neural Models of CodeA Study of Variable-Role-based Feature Enrichment in Neural Models of Code
A Study of Variable-Role-based Feature Enrichment in Neural Models of Code
 

Nested Types in Impala

  • 1. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Nested  Types  in  Impala Alex  Behm  //  Cloudera,  Inc.   Marcel  Kornacker  //  Cloudera,  Inc.   Skye  Wanderman-­‐Milne  //  Cloudera,  Inc. Impala  Meetup  03/24/2015
  • 2. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Design  Goals • Goals   • Support  for  nested  data  types:  struct,  map,  array   • Full  expressiveness  of  SQL  with  nested  structures   • v1  Prioritization   • Focus  on  SELECT  queries  (INSERT  in  later  releases)   • Focus  on  native  Parquet  and  Avro  formats  (XML,  JSON,  etc  in  later  releases)   • Focus  on  built-­‐in  language  expressiveness  (UDTF  extensibility  in  later  releases)
  • 3. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Example  Schema CREATE TABLE Customers { id BIGINT, address STRUCT< city: STRING, zip: INT > orders ARRAY<STRUCT< txn_time: TIMESTAMP, cc: BIGINT, items: ARRAY<STRUCT< item_no: STRING, price: DECIMAL(9,2) >> >> preferred_cc BIGINT }
  • 4. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Syntax  Extensions • Path  expressions  extend  column  references  to  scalars  (nested  structs)   • Can  appear  anywhere  a  conventional  column  reference  is  used   • Collections  (maps  and  arrays)  are  exposed  like  sub-­‐tables   • Use  FROM  clause  to  specify  which  collections  to  read  like  conventional  tables   • Can  use  JOIN  conditions  to  express  join  relationship  (default  is  INNER  JOIN) Find  the  ids  and  city  of  customers  who  live  in  the  zip  code  94305:   SELECT  id,  address.city  FROM  customers  WHERE  address.zip  =  94305 Find  all  orders  that  were  paid  for  with  a  customer’s  preferred  credit  card:   SELECT  o.txn_id  FROM  customers  c,  c.orders  o  WHERE  o.cc  =  c.preferred_cc    
  • 5. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Referencing  Arrays  &  Maps • Basic  idea:  Flatten  nested  collections  referenced  in  the  FROM  clause   • Can  be  thought  of  as  an  implicit  join  on  the  parent/child  relationship SELECT  c.id,  o.txn_id  FROM  customers  c,  c.orders  o c.id o.txn_id 100 203 100 305 100 507 … … 101 10056 101 10 … … id  of  a  customer  repeated   for  every  order
  • 6. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Referencing  Arrays  &  Maps SELECT  c.id,  o.txn_id  FROM  customers  c,  c.orders  o   SELECT  c.id,  o.txn_id  FROM  customer  c  INNER  JOIN  c.orders  o Returns  customer/order  data  for  customers  that  have  at  least  one  order SELECT  c.id,  o.txn_id  FROM  customers  c  LEFT  OUTER  JOIN  c.orders  o Also  returns  customers  with  no  orders  (with  order  fields  NULL) SELECT  c.id,  o.txn_id  FROM  customers  c  LEFT  ANTI  JOIN  c.orders  o Find  all  customers  that  have  no  orders
  • 7. ‹#›© Cloudera, Inc. All rights reserved. Motivation for Advanced Querying Capabilities • Count the number of orders per customer • Count the number of items per order • Impractical → Requires unique id at every nesting level • Information is already expressed in nesting relationship! • What about even more interesting queries? • Get the number of orders and the average item price per customer? • “Group by” multiple nesting levels SELECT COUNT(*) FROM customers c, c.orders o GROUP BY c.id SELECT COUNT(*) FROM customers.orders o, o.items GROUP BY ??? Must be unique
  • 8. ‹#›© Cloudera, Inc. All rights reserved. Advanced Querying: Correlated Table References • Count the number of orders per customer • Count the number of items per order • Get the number of orders and the average item price per customer SELECT cnt FROM customers c, (SELECT COUNT(*) cnt FROM c.orders) v SELECT cnt FROM customers.orders o, (SELECT COUNT(*) cnt FROM o.items) v SELECT c.id, cnt, avgp FROM customer c, (SELECT count(1) cnt FROM c.orders) v1, (SELECT avg(price) avgp FROM c.orders.items) v2 Correlated reference to “c”
  • 9. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Advanced  Querying:  Correlated  Table  References • Full  expressibility   • Arbitrary  SQL  allowed  in  inline  views  and  subqueries  with  correlated  table  refs   • Correlated  subqueries  transformed  into  joins  if  possible   • Exploits  nesting  relationship   • No  need  for  stored  unique  ids  at  various  levels   • Similar  to  standard  SQL  ‘LATERAL’,  ‘CROSS  APPLY’,  ‘OUTER  CROSS  APPLY’   • Goes  beyond  standard  in  some  aspects  (semi/anti  variants)   • Not  as  general  as  SQL  standard  (limited  correlations) SELECT  id  FROM  customers  c  WHERE  EXISTS      (SELECT  1  FROM  c.orders  o,  o.items  i  where  o.cc  =  c.preferred_cc  and  i.price  >  100)
  • 10. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Execution  Model Design  sweet  spot:  small/medium  sized  collections  (max  few  hundreds  of  MB)   • Built-­‐in  limitation  on  the  size  of  nested  collections  (TBD)   • Huge  collections  not  expected  to  be  performant  and  rare   Execution  Overview   • Scans  materialize  minimal  nested  structure  in  memory   • Three  new  exec  nodes   • Subplan:  Executes  its  subplan  tree  for  each  input  row  and  returns  the  rows   produced  by  the  subplan   • Nested  Row  Src:  Returns  the  current  input  row  of  its  parent  Subplan  node   • Unnest:  Scans  an  array  slot  of  the  current  input  row  of  its  parent  Subplan  node  or   of  an  output  row  from  its  child  plan  node,  returning  one  row  per  array  element.  
  • 11. ‹#›© Cloudera, Inc. All rights reserved. Example A SELECT count(*) FROM customer.orders.items Scan customer.orders.items Aggregate count(*) Count the total number of items Exchange (to client)
  • 12. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Example  B SELECT  c.id,  cnt   FROM  customer  c,   JOIN  (SELECT  count(*)  cnt  FROM  c.orders)  v Scan  c Subplan materialize   c.orders Unnest   c.orders Aggregate   count(*) Subplan  Pseudo-­‐Code   result  =  {}   for  each  c  in  input      set  c  in  dependent  nodes      result  +=  rhsPlan.exec()   return  result Nested   Row  Src Cross  Join set  c  in  dependent  nodes Count  the  number  of  orders  per  customer Exchange   (to  client)
  • 13. ‹#›© Cloudera, Inc. All rights reserved. Example C SELECT c.id, cnt, avgp FROM customer c, (SELECT count(*) cnt FROM c.orders) v1, (SELECT avg(price) avgp FROM c.orders.items) v2 Scan c Subplan Unnest c.orders Aggregate count(*) Unnest c.orders.items Aggregate avg(price) Cross Join Return the number of orders and the average item price per customer Nested Row Src Cross Join Exchange (to client)
  • 14. ‹#›© Cloudera, Inc. All rights reserved. Example D SELECT c.id, cnt, avgp FROM customer c, (SELECT count(*) cnt FROM c.orders o WHERE (SELECT sum(price) FROM o.items) > 100) v Scan c Subplan Unnest c.orders Aggregate count(*) Unnest o.items Aggregate sum(price) sum(price)>100 Cross Join For each customer, return the number of orders whose total item price exceeds 100 Subplan Nested Row Src: c Nested Row Src: o Cross Join Exchange (to client)
  • 15. ‹#›© Cloudera, Inc. All rights reserved. Future Work • Syntax extensions • Performance improvements • Parquet: Scan columns directly in “unnest”, avoid collection materialization • Codegen subplans • INSERT queries, e.g., convert flat data into nested data • UDTF support • More formats: JSON, XML, etc. SELECT c.id, count(c.orders), avg(c.orders.items.price) FROM customer c SELECT c.id, cnt, avgp FROM customer c, (SELECT count(1) cnt FROM c.orders) v1, (SELECT avg(price) avgp FROM c.orders.items) v2 Rewrite
  • 16. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Thank  you
  • 17. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Appendix
  • 18. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Comparison  of  Impala  and  HiveQL • Impala’s  syntax  provides  a  superset  of  Hive’s  functionality   • HiveQL  has  similar  path  expressions  but  with  restrictions   • Must  use  LATERAL  VIEW  in  FROM  clause;  more  verbose  syntax   • LATERAL  VIEWs  themselves  have  many  restrictions,  no  arbitrary  SQL   • Requires  complex  joins  or  unique  ids  at  various  nesting  levels  for  expressing  even  simple  queries  (e.g.,   find  number  of  orders  per  customer)   • Does  not  provide  similar  inline  view  and  subquery  capabilities
  • 19. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Syntax  vs.  Hive  Syntax SELECT  …  FROM  customer  c,  c.orders  o,  o.items  i SELECT  …  FROM  customer  c   LATERAL  VIEW  explode(c.orders)  as  (c1,  c2…)   LATERAL  VIEW  explode(c.orders.items)  as  (c1,  c2…) SELECT  …  FROM  customer  c   LEFT  JOIN  c.orders  LEFT  JOIN  c.orders.items SELECT  …  FROM  customer  c   OUTER  LATERAL  VIEW  explode(c.orders)  as  (c1,  …)   OUTER  LATERAL  VIEW  explode(c.orders.items)  as  (c1,  …) SELECT  c.id  FROM  customer  c,   LEFT  ANTI  JOIN  (SELECT  oid  FROM  c.orders)  v   ON  c.preferred_cc  =  v.cc SELECT  c.id  FROM  customer  c   WHERE  NOT  EXISTS  (SELECT  oid  FROM  c.orders  WHERE   c.preferred_cc  =  orders.cc) No  convenient/performant  equivalent  in  Hive.
  • 20. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Syntax  vs.  Hive  Syntax SELECT  …   FROM  customer  c   LATERAL  VIEW  MY_UDTF(c.orders)  as  (c1,  c2…) Impala  will  not  support  Hive’s  builtin  or  user-­‐defined   table  generating  functions  for  now. SELECT  …   FROM  customer  c   LATERAL  VIEW  json_tuple(c.json_str,  …)  as  (c1,  c2…) SELECT  count(c.orders)  FROM  customer  c SELECT  cnt  FROM  customer  c,   JOIN  (SELECT  count(1)  cnt  FROM  c.orders)  v1 SELECT  count(1)   FROM  customer  c   LATERAL  VIEW  explode(c.orders)  as  (c1,  c2…)   GROUP  BY  c.orders   (in  absence  of  a  unique  key  in  ‘customer’)
  • 21. ‹#›© Cloudera, Inc. All rights reserved. Impala Syntax vs. Hive Syntax SELECT c.id, count(c.orders), avg(c.orders.items.price) FROM customer c SELECT c.id, cnt, avgp FROM customer c JOIN (SELECT count(1) cnt FROM c.orders) v1 JOIN (SELECT avg(price) avgp FROM c.orders.items) v2 No convenient/performant equivalent in Hive. • Impala is more expressive, but less extensible until UDTFs are supported • More join types: inner/outer/semi/anti • Full SQL block inside correlated inline view • Hive more extensible (UDTFs), has builtin UDTFs • Hive Lateral View very rigid, no arbitrary SQL inside
  • 22. ‹#›© Cloudera, Inc. All rights reserved. Impala Nested Types in Action Josh Wills’s blog post on analyzing misspelled queries http://blog.cloudera.com/blog/2014/08/how-to-count-events-like-a-data-scientist/ • Goal: Rudimentary spell checker based on counting query/click events • Problem: Cross referencing items in multiple nested collections • Representative of many machine learning tasks (Josh tells me) • Goal was not naturally achievable with Hive • Josh implemented a custom Hive extension “WITHIN” • How can Impala serve this use case?
  • 23. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Nested  Types  in  Action account_id:  bigint   search_events:  array<struct<      event_id:  bigint      query:  string      tstamp_sec:  bigint      ...   >>   install_events:  array<struct<      event_id:  bigint      search_event_id:  bigint      app_id:  bigint      ...   >> SELECT  a.qw,  a.qr,  count(*)  as  cnt   FROM  sessions   LATERAL  VIEW  WITHIN(      "SELECT  bad.query  qw,  good.query  qr      FROM  t1  as  bad,  t1  as  good      WHERE  bad.tstamp_sec  <  good.tstamp_sec      AND  good.tstamp_sec  -­‐  bad.tstamp_sec  <  30      AND  bad.event_id  NOT  IN  (select  search_event_id  FROM  t2)      AND  good.event_id  IN  (select  search_event_id  FROM  t2)",   search_events,  install_events)  a   GROUP  BY  a.qw,  a.qr; SELECT  bad.query,  good.query,  count(*)  as  cnt   FROM  sessions  s,      s.search_events  bad,      s.search_events  good,   WHERE  bad.tstamp_sec  <  good.tstamp_sec      AND  good.tstamp_sec  -­‐  bad.tstamp_sec  <  30      AND  bad.event_id  NOT  IN  (select  search_event_id  FROM  s.install_events)      AND  good.event_id  IN  (select  search_event_id  FROM  s.install_events),   GROUP  BY  bad.query,  good.query; Josh’s  HiveQL  extension Impala  SQL Schema
  • 24. ‹#›© Cloudera, Inc. All rights reserved. Impala Nested Types in Action SELECT bad.query, good.query, count(*) as cnt FROM sessions s, s.search_events bad, s.search_events good WHERE bad.tstamp_sec < good.tstamp_sec AND good.tstamp_sec - bad.tstamp_sec < 30 AND bad.event_id NOT IN (select search_event_id FROM s.install_events) AND good.event_id IN (select search_event_id FROM s.install_events) GROUP BY bad.query, good.query; SELECT bad.query, good.query, count(*) as cnt FROM sessions s JOIN (SELECT * FROM s.search_events) bad JOIN (SELECT * FROM s.search_events) good LEFT ANTI JOIN (SELECT search_event_id FROM s.install_events) v1 ON (bad.event_id = v1.search_event_id) LEFT SEMI JOIN (SELECT search_event_id FROM s.install_events) v2 ON (good.event_id = v2.search_event_id) WHERE bad.tstamp_sec < good.tstamp_sec AND good.tstamp_sec - bad.tstamp_sec < 30 GROUP BY bad.query, good.query; Rewrite
  • 25. ‹#›© Cloudera, Inc. All rights reserved. Impala Nested Types in Action Scan s Subplan Cross Join bad.ts < good.ts AND good.ts – bad.ts < 30 Unnest s.search_events good Left Anti Join bad.event_id = v1.search_event_id Unnest s.install_events v1 Left Semi Join good.event_id = v2.search_event_id Unnest s.install_events v2 Aggregate count(*) group by bad.query, good.query Unnest s.search_events bad SELECT bad.query, good.query, count(*) as cnt FROM sessions s JOIN (SELECT * FROM s.search_events) bad JOIN (SELECT * FROM s.search_events) good LEFT ANTI JOIN (SELECT search_event_id FROM s.install_events) v1 ON (bad.event_id = v1.search_event_id) LEFT SEMI JOIN (SELECT search_event_id FROM s.install_events) v2 ON (good.event_id = v2.search_event_id) WHERE bad.tstamp_sec < good.tstamp_sec AND good.tstamp_sec - bad.tstamp_sec < 30 GROUP BY bad.query, good.query;