SlideShare a Scribd company logo
1 of 23
Download to read offline
ES-Hadoop: Bridging the world of Hadoop and
Elasticsearch
Bala Venkatrao (bala@elastic.co)
June 2015
https://github.com/elastic/elasticsearch-hadoop
  
	
  
www.elastic.co
Elasticsearch for Apache Hadoop™
3	
  
www.elastic.co
Certified to work
4	
  
Partition-to-Partition architecture
Node1
2P
1R
Node2
1P
3R
Node3
2R
3P
Dynamic runtime matching
Node1	
  
	
  
	
  
	
  
	
  
	
  
2P	
  
1R	
  
Node2	
  
	
  
	
  
	
  
	
  
	
  
1P	
  
3R	
  
Node3	
  
	
  
	
  
	
  
	
  
	
  
2R	
  
3P	
  
Failure handling
Node1	
  
	
  
	
  
	
  
	
  
	
  
2P	
  
1R	
  
Node2	
  
	
  
	
  
	
  
	
  
	
  
1P	
  
3R	
  
Node3	
  
	
  
	
  
	
  
	
  
	
  
2R	
  
3P	
  
Co-location
Node1	
  
	
  
	
  
	
  
	
  
	
  
2P	
  
1R	
  
Node2	
  
	
  
	
  
	
  
	
  
	
  
1P	
  
3R	
  
Node3	
  
	
  
	
  
	
  
	
  
	
  
2R	
  
3P	
  
www.elastic.co
Native integration - Map / Reduce
JobConf	
  conf	
  =	
  new	
  JobConf();	
  	
  
conf.setInputFormat(EsInputFormat.class);	
  	
  
conf.set("es.resource",	
  "radio/artists");	
  	
  
conf.set("es.query",	
  "?q=me*");	
  	
  
	
  
JobClient.runJob(conf);	
  
JobConf	
  conf	
  =	
  new	
  JobConf();	
  	
  
conf.setOutputFormat(EsOutputFormat.class);	
  	
  
conf.set("es.resource",	
  "radio/artists");	
  
	
  
JobClient.runJob(conf);	
  
9	
  
www.elastic.co
Native integration - Cascading
Tap	
  in	
  =	
  new	
  EsTap("radio/artists","?q=me*");	
  
Tap	
  out	
  =	
  new	
  StdOut(new	
  TextLine());	
  
new	
  LocalFlowConnector().	
  
	
  	
  	
  	
  	
  	
  connect(in,	
  out,	
  new	
  Pipe("pipe")).complete();
  
	
  
JobClient.runJob(conf);	
  
Tap	
  in	
  =	
  Lfs(new	
  TextDelimited(	
  
	
  	
  	
  new	
  Fields("id",	
  "name",	
  "url",	
  "picture")),	
  "artists.dat");	
  
Tap	
  out	
  =	
  new	
  EsTap("radio/artists",	
  	
  
	
  	
  	
  new	
  Fields("name",	
  "url",	
  "picture"));	
  
new	
  HadoopFlowConnector().	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  connect(in,	
  out,	
  new	
  Pipe("pipe")).complete();
  
10	
  
www.elastic.co
Native integration - Apache Pig
A	
  =	
  LOAD	
  'radio/artists'	
  USING	
  
	
  	
  	
  	
  org.elasticsearch.hadoop.pig.EsStorage('es.query=?q=me*');	
  
DUMP	
  A;	
  
A	
  =	
  LOAD	
  'src/artists.dat'	
  USING	
  PigStorage()	
  AS	
  	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  (id:long,	
  name,	
  url:chararray,	
  picture:	
  chararray);	
  
B	
  =	
  FOREACH	
  A	
  GENERATE	
  name,	
  TOTUPLE(url,	
  picture)	
  AS	
  links;	
  
	
  
STORE	
  B	
  INTO	
  'radio/artists'	
  USING	
  	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  org.elasticsearch.hadoop.pig.EsStorage();	
  
11	
  
www.elastic.co
Native integration - Apache Hive
CREATE	
  EXTERNAL	
  TABLE	
  artists	
  (	
  
	
  	
  id	
  BIGINT,name	
  STRING,	
  links	
  STRUCT<url:STRING,	
  picture:STRING>)	
  
STORED	
  BY	
  'org.elasticsearch.hadoop.hive.EsStorageHandler'	
  
TBLPROPERTIES('es.resource'='radio/artists','es.query'='?q=me*');	
  
	
  
SELECT * FROM artists;
  
CREATE	
  EXTERNAL	
  TABLE	
  artists	
  (	
  
	
  	
  id	
  BIGINT,name	
  STRING,	
  links	
  STRUCT<url:STRING,	
  picture:STRING>)	
  
STORED	
  BY	
  'org.elasticsearch.hadoop.hive.EsStorageHandler'	
  
TBLPROPERTIES('es.resource'='radio/artists');	
  
	
  
INSERT	
  OVERWRITE	
  TABLE	
  artists	
  SELECT	
  	
  
	
  s.name,	
  named_struct('url',	
  s.url,	
  'picture',	
  s.pic)	
  FROM	
  source	
  
s;	
  
	
   12	
  
www.elastic.co
Native integration - Apache Spark
import	
  org.elasticsearch.spark._	
  
	
  
val	
  sc	
  =	
  new	
  SparkContext(new	
  SparkConf())	
  
val rdd = sc.esRDD("radio/artists", "?q=me*")
  
import	
  org.elasticsearch.spark._	
  	
  	
  	
  	
  	
  	
  	
  	
  
	
  
case	
  class	
  Artist(name:	
  String,	
  albums:	
  Int)	
  
	
  
val	
  u2	
  =	
  Artist("U2",	
  12)	
  
val bh = Map("name" -> "Buckethead", "albums" -> 95, "age" -> 45)
  
	
  
sc.makeRDD(Seq(u2, bh)).saveToEs("radio/artists")
  
13	
  
www.elastic.co
Native integration - Spark SQL
val	
  sql	
  =	
  new	
  SQLContext...	
  
val	
  df	
  =	
  sql.load("radio/artists",	
  "org.elasticsearch.spark.sql")	
  
df.filter(df("age")	
  >	
  40)	
  
val	
  sql	
  =	
  new	
  SQLContext...	
  
val	
  table	
  =	
  sql.sql("CREATE	
  TEMPORARY	
  TABLE	
  artists	
  "	
  +	
  
	
   	
   	
   	
  	
  	
  	
  "USING	
  org.elasticsearch.spark.sql	
  "	
  +	
  
	
   	
   	
   	
  	
  	
  	
  "OPTIONS(resource=`radio/artists`)	
  ")	
  
	
  
val	
  names	
  =	
  sql.sql("SELECT	
  name	
  FROM	
  artists")	
  
14	
  
www.elastic.co
Native integration - Apache Storm
TopologyBuilder	
  builder	
  =	
  new	
  TopologyBuilder();	
  
builder.setBolt("esBolt",	
  new	
  EsBolt("twitter/tweets"));	
  
TopologyBuilder	
  builder	
  =	
  new	
  TopologyBuilder();	
  
builder.setSpout("esSpout", new EsSpout("twitter/tweets", "?q=nfl*"), 5);
  
builder.setBolt("bolt", new PrinterBolt()).shuffleGrouping("esSpout");
  
15	
  
www.elastic.co
Resource Management
www.elastic.co
YARN support – In Beta
•  Run Elasticsearch on YARN
•  But YARN doesn’t support long-lived services (yet):
•  No provisioning
•  No ip/network guarantees
•  Data/node affinity
•  Next YARN releases plan to address this
•  Tracking projects like Apache Slider
17	
  
www.elastic.co
Storage	
  
www.elastic.co
HDFS integration
•  Snapshot/Restore
•  Use HDFS as a shared storage
•  Backup and recover data
•  Works great with snapshot immutable data
•  HDFS as a File-System – not recommended / tread carefully
•  Incomplete FS semantics (last-delete-on-close, fsync)
•  NFSv3 (metadata issues)
•  See Elasticsearch issue #9072
	
  	
   19	
  
www.elastic.co
20	
  
•  Support for Spark, Spark SQL, Storm
•  Includes support for Spark (core and SQL) 1.2, 1.3 and 1.4
•  Support for all Spark SQL filters and relationship traits
•  Certification with Hadoop distributions
•  Currently certified with CDH5.x, HDP2.x, MapR 4.x and Databricks Spark
•  Security enhancements
•  Basic HTTP authentication allowing Hadoop jobs running against a restricted Elasticsearch cluster to identify themselves
accordingly
•  SSL/TLS support for cryptographic connections between Elasticsearch and Hadoop cluster, enabling data-sensitive
environments to transparently encrypt the data at transport level and thus prevent snooping and preserve data
confidentiality.
•  Support for Shield-enabled Elasticsearch clusters
•  Several enhancements and performance improvements, including
•  Client node routing
•  Return raw JSON and metadata while reading documents from ES
•  Inclusion / Exclusion of fields to be written to ES
What’s New in ES-Hadoop 2.1
www.elastic.co
•  Support for ES aggregations
•  Marvel integration
•  Integration with Machine Learning libraries, e.g. MLlib
•  Others? (Suggestions)
Roadmap
21	
  
www.elastic.co
22	
  
Documentation –
https://www.elastic.co/guide/en/elasticsearch/hadoop/index.html
Project home page/ Source repository -
https://github.com/elastic/elasticsearch-hadoop
Issue tracker - https://github.com/elastic/elasticsearch-hadoop/issues
Mailing list / forum - https://discuss.elastic.co/c/elasticsearch-and-hadoop
More Questions?
www.elastic.co
Thank you!

More Related Content

What's hot

Solr vs. Elasticsearch - Case by Case
Solr vs. Elasticsearch - Case by CaseSolr vs. Elasticsearch - Case by Case
Solr vs. Elasticsearch - Case by CaseAlexandre Rafalovitch
 
Elasticsearch for Data Analytics
Elasticsearch for Data AnalyticsElasticsearch for Data Analytics
Elasticsearch for Data AnalyticsFelipe
 
Introduction to Elasticsearch
Introduction to ElasticsearchIntroduction to Elasticsearch
Introduction to ElasticsearchRuslan Zavacky
 
Solr and Elasticsearch, a performance study
Solr and Elasticsearch, a performance studySolr and Elasticsearch, a performance study
Solr and Elasticsearch, a performance studyCharlie Hull
 
Introduction to elasticsearch
Introduction to elasticsearchIntroduction to elasticsearch
Introduction to elasticsearchpmanvi
 
Elasticsearch and Spark
Elasticsearch and SparkElasticsearch and Spark
Elasticsearch and SparkAudible, Inc.
 
ElasticSearch in action
ElasticSearch in actionElasticSearch in action
ElasticSearch in actionCodemotion
 
You know, for search. Querying 24 Billion Documents in 900ms
You know, for search. Querying 24 Billion Documents in 900msYou know, for search. Querying 24 Billion Documents in 900ms
You know, for search. Querying 24 Billion Documents in 900msJodok Batlogg
 
Side by Side with Elasticsearch and Solr
Side by Side with Elasticsearch and SolrSide by Side with Elasticsearch and Solr
Side by Side with Elasticsearch and SolrSematext Group, Inc.
 
Building a CRM on top of ElasticSearch
Building a CRM on top of ElasticSearchBuilding a CRM on top of ElasticSearch
Building a CRM on top of ElasticSearchMark Greene
 
2014 spark with elastic search
2014   spark with elastic search2014   spark with elastic search
2014 spark with elastic searchHenry Saputra
 
Elasticsearch first-steps
Elasticsearch first-stepsElasticsearch first-steps
Elasticsearch first-stepsMatteo Moci
 
아파트 정보를 이용한 ELK stack 활용 - 오근문
아파트 정보를 이용한 ELK stack 활용 - 오근문아파트 정보를 이용한 ELK stack 활용 - 오근문
아파트 정보를 이용한 ELK stack 활용 - 오근문NAVER D2
 
Cool bonsai cool - an introduction to ElasticSearch
Cool bonsai cool - an introduction to ElasticSearchCool bonsai cool - an introduction to ElasticSearch
Cool bonsai cool - an introduction to ElasticSearchclintongormley
 
An Introduction to Elastic Search.
An Introduction to Elastic Search.An Introduction to Elastic Search.
An Introduction to Elastic Search.Jurriaan Persyn
 
Elasticsearch - Devoxx France 2012 - English version
Elasticsearch - Devoxx France 2012 - English versionElasticsearch - Devoxx France 2012 - English version
Elasticsearch - Devoxx France 2012 - English versionDavid Pilato
 
Introduction to Elasticsearch with basics of Lucene
Introduction to Elasticsearch with basics of LuceneIntroduction to Elasticsearch with basics of Lucene
Introduction to Elasticsearch with basics of LuceneRahul Jain
 
Use Cases for Elastic Search Percolator
Use Cases for Elastic Search PercolatorUse Cases for Elastic Search Percolator
Use Cases for Elastic Search PercolatorMaxim Shelest
 
What I learnt: Elastic search & Kibana : introduction, installtion & configur...
What I learnt: Elastic search & Kibana : introduction, installtion & configur...What I learnt: Elastic search & Kibana : introduction, installtion & configur...
What I learnt: Elastic search & Kibana : introduction, installtion & configur...Rahul K Chauhan
 

What's hot (20)

Solr vs. Elasticsearch - Case by Case
Solr vs. Elasticsearch - Case by CaseSolr vs. Elasticsearch - Case by Case
Solr vs. Elasticsearch - Case by Case
 
Elasticsearch for Data Analytics
Elasticsearch for Data AnalyticsElasticsearch for Data Analytics
Elasticsearch for Data Analytics
 
Introduction to Elasticsearch
Introduction to ElasticsearchIntroduction to Elasticsearch
Introduction to Elasticsearch
 
Solr and Elasticsearch, a performance study
Solr and Elasticsearch, a performance studySolr and Elasticsearch, a performance study
Solr and Elasticsearch, a performance study
 
Introduction to elasticsearch
Introduction to elasticsearchIntroduction to elasticsearch
Introduction to elasticsearch
 
Elasticsearch and Spark
Elasticsearch and SparkElasticsearch and Spark
Elasticsearch and Spark
 
ElasticSearch in action
ElasticSearch in actionElasticSearch in action
ElasticSearch in action
 
You know, for search. Querying 24 Billion Documents in 900ms
You know, for search. Querying 24 Billion Documents in 900msYou know, for search. Querying 24 Billion Documents in 900ms
You know, for search. Querying 24 Billion Documents in 900ms
 
Side by Side with Elasticsearch and Solr
Side by Side with Elasticsearch and SolrSide by Side with Elasticsearch and Solr
Side by Side with Elasticsearch and Solr
 
Building a CRM on top of ElasticSearch
Building a CRM on top of ElasticSearchBuilding a CRM on top of ElasticSearch
Building a CRM on top of ElasticSearch
 
2014 spark with elastic search
2014   spark with elastic search2014   spark with elastic search
2014 spark with elastic search
 
Elasticsearch first-steps
Elasticsearch first-stepsElasticsearch first-steps
Elasticsearch first-steps
 
아파트 정보를 이용한 ELK stack 활용 - 오근문
아파트 정보를 이용한 ELK stack 활용 - 오근문아파트 정보를 이용한 ELK stack 활용 - 오근문
아파트 정보를 이용한 ELK stack 활용 - 오근문
 
Cool bonsai cool - an introduction to ElasticSearch
Cool bonsai cool - an introduction to ElasticSearchCool bonsai cool - an introduction to ElasticSearch
Cool bonsai cool - an introduction to ElasticSearch
 
An Introduction to Elastic Search.
An Introduction to Elastic Search.An Introduction to Elastic Search.
An Introduction to Elastic Search.
 
Elasticsearch - Devoxx France 2012 - English version
Elasticsearch - Devoxx France 2012 - English versionElasticsearch - Devoxx France 2012 - English version
Elasticsearch - Devoxx France 2012 - English version
 
Introduction to Elasticsearch with basics of Lucene
Introduction to Elasticsearch with basics of LuceneIntroduction to Elasticsearch with basics of Lucene
Introduction to Elasticsearch with basics of Lucene
 
Use Cases for Elastic Search Percolator
Use Cases for Elastic Search PercolatorUse Cases for Elastic Search Percolator
Use Cases for Elastic Search Percolator
 
What I learnt: Elastic search & Kibana : introduction, installtion & configur...
What I learnt: Elastic search & Kibana : introduction, installtion & configur...What I learnt: Elastic search & Kibana : introduction, installtion & configur...
What I learnt: Elastic search & Kibana : introduction, installtion & configur...
 
Elasticsearch
ElasticsearchElasticsearch
Elasticsearch
 

Similar to ElasticES-Hadoop: Bridging the world of Hadoop and Elasticsearch

Hands On Spring Data
Hands On Spring DataHands On Spring Data
Hands On Spring DataEric Bottard
 
Elasticsearch, Logstash, Kibana. Cool search, analytics, data mining and more...
Elasticsearch, Logstash, Kibana. Cool search, analytics, data mining and more...Elasticsearch, Logstash, Kibana. Cool search, analytics, data mining and more...
Elasticsearch, Logstash, Kibana. Cool search, analytics, data mining and more...Oleksiy Panchenko
 
Unlocking the Power of Lakehouse Architectures with Apache Pulsar and Apache ...
Unlocking the Power of Lakehouse Architectures with Apache Pulsar and Apache ...Unlocking the Power of Lakehouse Architectures with Apache Pulsar and Apache ...
Unlocking the Power of Lakehouse Architectures with Apache Pulsar and Apache ...StreamNative
 
Apache Gobblin: Bridging Batch and Streaming Data Integration. Big Data Meetu...
Apache Gobblin: Bridging Batch and Streaming Data Integration. Big Data Meetu...Apache Gobblin: Bridging Batch and Streaming Data Integration. Big Data Meetu...
Apache Gobblin: Bridging Batch and Streaming Data Integration. Big Data Meetu...Shirshanka Das
 
Very Large Data Files, Object Stores, and Deep Learning—Lessons Learned While...
Very Large Data Files, Object Stores, and Deep Learning—Lessons Learned While...Very Large Data Files, Object Stores, and Deep Learning—Lessons Learned While...
Very Large Data Files, Object Stores, and Deep Learning—Lessons Learned While...Databricks
 
Just one-shade-of-openstack
Just one-shade-of-openstackJust one-shade-of-openstack
Just one-shade-of-openstackRoberto Polli
 
Managing Your Content with Elasticsearch
Managing Your Content with ElasticsearchManaging Your Content with Elasticsearch
Managing Your Content with ElasticsearchSamantha Quiñones
 
Lambda Architecture with Spark, Spark Streaming, Kafka, Cassandra, Akka and S...
Lambda Architecture with Spark, Spark Streaming, Kafka, Cassandra, Akka and S...Lambda Architecture with Spark, Spark Streaming, Kafka, Cassandra, Akka and S...
Lambda Architecture with Spark, Spark Streaming, Kafka, Cassandra, Akka and S...Helena Edelson
 
Using ElasticSearch as a fast, flexible, and scalable solution to search occu...
Using ElasticSearch as a fast, flexible, and scalable solution to search occu...Using ElasticSearch as a fast, flexible, and scalable solution to search occu...
Using ElasticSearch as a fast, flexible, and scalable solution to search occu...kristgen
 
Change Data Capture (CDC) With Kafka Connect® and the Debezium PostgreSQL Sou...
Change Data Capture (CDC) With Kafka Connect® and the Debezium PostgreSQL Sou...Change Data Capture (CDC) With Kafka Connect® and the Debezium PostgreSQL Sou...
Change Data Capture (CDC) With Kafka Connect® and the Debezium PostgreSQL Sou...Paul Brebner
 
Neural Search Comes to Apache Solr
Neural Search Comes to Apache SolrNeural Search Comes to Apache Solr
Neural Search Comes to Apache SolrSease
 
How to develop Big Data Pipelines for Hadoop, by Costin Leau
How to develop Big Data Pipelines for Hadoop, by Costin LeauHow to develop Big Data Pipelines for Hadoop, by Costin Leau
How to develop Big Data Pipelines for Hadoop, by Costin LeauCodemotion
 
Structured-Streaming-as-a-Service with Kafka, YARN, and Tooling with Jim Dowling
Structured-Streaming-as-a-Service with Kafka, YARN, and Tooling with Jim DowlingStructured-Streaming-as-a-Service with Kafka, YARN, and Tooling with Jim Dowling
Structured-Streaming-as-a-Service with Kafka, YARN, and Tooling with Jim DowlingDatabricks
 
Neural Search Comes to Apache Solr_ Approximate Nearest Neighbor, BERT and Mo...
Neural Search Comes to Apache Solr_ Approximate Nearest Neighbor, BERT and Mo...Neural Search Comes to Apache Solr_ Approximate Nearest Neighbor, BERT and Mo...
Neural Search Comes to Apache Solr_ Approximate Nearest Neighbor, BERT and Mo...Sease
 
Environment for training models
Environment for training modelsEnvironment for training models
Environment for training modelsFlyElephant
 
Null Bachaav - May 07 Attack Monitoring workshop.
Null Bachaav - May 07 Attack Monitoring workshop.Null Bachaav - May 07 Attack Monitoring workshop.
Null Bachaav - May 07 Attack Monitoring workshop.Prajal Kulkarni
 
Attack on graph
Attack on graphAttack on graph
Attack on graphScott Miao
 
Dense Retrieval with Apache Solr Neural Search.pdf
Dense Retrieval with Apache Solr Neural Search.pdfDense Retrieval with Apache Solr Neural Search.pdf
Dense Retrieval with Apache Solr Neural Search.pdfSease
 

Similar to ElasticES-Hadoop: Bridging the world of Hadoop and Elasticsearch (20)

Hands On Spring Data
Hands On Spring DataHands On Spring Data
Hands On Spring Data
 
Elasticsearch, Logstash, Kibana. Cool search, analytics, data mining and more...
Elasticsearch, Logstash, Kibana. Cool search, analytics, data mining and more...Elasticsearch, Logstash, Kibana. Cool search, analytics, data mining and more...
Elasticsearch, Logstash, Kibana. Cool search, analytics, data mining and more...
 
Unlocking the Power of Lakehouse Architectures with Apache Pulsar and Apache ...
Unlocking the Power of Lakehouse Architectures with Apache Pulsar and Apache ...Unlocking the Power of Lakehouse Architectures with Apache Pulsar and Apache ...
Unlocking the Power of Lakehouse Architectures with Apache Pulsar and Apache ...
 
Apache Gobblin: Bridging Batch and Streaming Data Integration. Big Data Meetu...
Apache Gobblin: Bridging Batch and Streaming Data Integration. Big Data Meetu...Apache Gobblin: Bridging Batch and Streaming Data Integration. Big Data Meetu...
Apache Gobblin: Bridging Batch and Streaming Data Integration. Big Data Meetu...
 
Play framework
Play frameworkPlay framework
Play framework
 
Very Large Data Files, Object Stores, and Deep Learning—Lessons Learned While...
Very Large Data Files, Object Stores, and Deep Learning—Lessons Learned While...Very Large Data Files, Object Stores, and Deep Learning—Lessons Learned While...
Very Large Data Files, Object Stores, and Deep Learning—Lessons Learned While...
 
Just one-shade-of-openstack
Just one-shade-of-openstackJust one-shade-of-openstack
Just one-shade-of-openstack
 
Managing Your Content with Elasticsearch
Managing Your Content with ElasticsearchManaging Your Content with Elasticsearch
Managing Your Content with Elasticsearch
 
Lambda Architecture with Spark, Spark Streaming, Kafka, Cassandra, Akka and S...
Lambda Architecture with Spark, Spark Streaming, Kafka, Cassandra, Akka and S...Lambda Architecture with Spark, Spark Streaming, Kafka, Cassandra, Akka and S...
Lambda Architecture with Spark, Spark Streaming, Kafka, Cassandra, Akka and S...
 
Using ElasticSearch as a fast, flexible, and scalable solution to search occu...
Using ElasticSearch as a fast, flexible, and scalable solution to search occu...Using ElasticSearch as a fast, flexible, and scalable solution to search occu...
Using ElasticSearch as a fast, flexible, and scalable solution to search occu...
 
Change Data Capture (CDC) With Kafka Connect® and the Debezium PostgreSQL Sou...
Change Data Capture (CDC) With Kafka Connect® and the Debezium PostgreSQL Sou...Change Data Capture (CDC) With Kafka Connect® and the Debezium PostgreSQL Sou...
Change Data Capture (CDC) With Kafka Connect® and the Debezium PostgreSQL Sou...
 
Neural Search Comes to Apache Solr
Neural Search Comes to Apache SolrNeural Search Comes to Apache Solr
Neural Search Comes to Apache Solr
 
How to develop Big Data Pipelines for Hadoop, by Costin Leau
How to develop Big Data Pipelines for Hadoop, by Costin LeauHow to develop Big Data Pipelines for Hadoop, by Costin Leau
How to develop Big Data Pipelines for Hadoop, by Costin Leau
 
Structured-Streaming-as-a-Service with Kafka, YARN, and Tooling with Jim Dowling
Structured-Streaming-as-a-Service with Kafka, YARN, and Tooling with Jim DowlingStructured-Streaming-as-a-Service with Kafka, YARN, and Tooling with Jim Dowling
Structured-Streaming-as-a-Service with Kafka, YARN, and Tooling with Jim Dowling
 
Elasticsearch Introduction
Elasticsearch IntroductionElasticsearch Introduction
Elasticsearch Introduction
 
Neural Search Comes to Apache Solr_ Approximate Nearest Neighbor, BERT and Mo...
Neural Search Comes to Apache Solr_ Approximate Nearest Neighbor, BERT and Mo...Neural Search Comes to Apache Solr_ Approximate Nearest Neighbor, BERT and Mo...
Neural Search Comes to Apache Solr_ Approximate Nearest Neighbor, BERT and Mo...
 
Environment for training models
Environment for training modelsEnvironment for training models
Environment for training models
 
Null Bachaav - May 07 Attack Monitoring workshop.
Null Bachaav - May 07 Attack Monitoring workshop.Null Bachaav - May 07 Attack Monitoring workshop.
Null Bachaav - May 07 Attack Monitoring workshop.
 
Attack on graph
Attack on graphAttack on graph
Attack on graph
 
Dense Retrieval with Apache Solr Neural Search.pdf
Dense Retrieval with Apache Solr Neural Search.pdfDense Retrieval with Apache Solr Neural Search.pdf
Dense Retrieval with Apache Solr Neural Search.pdf
 

More from MapR Technologies

Converging your data landscape
Converging your data landscapeConverging your data landscape
Converging your data landscapeMapR Technologies
 
ML Workshop 2: Machine Learning Model Comparison & Evaluation
ML Workshop 2: Machine Learning Model Comparison & EvaluationML Workshop 2: Machine Learning Model Comparison & Evaluation
ML Workshop 2: Machine Learning Model Comparison & EvaluationMapR Technologies
 
Self-Service Data Science for Leveraging ML & AI on All of Your Data
Self-Service Data Science for Leveraging ML & AI on All of Your DataSelf-Service Data Science for Leveraging ML & AI on All of Your Data
Self-Service Data Science for Leveraging ML & AI on All of Your DataMapR Technologies
 
Enabling Real-Time Business with Change Data Capture
Enabling Real-Time Business with Change Data CaptureEnabling Real-Time Business with Change Data Capture
Enabling Real-Time Business with Change Data CaptureMapR Technologies
 
Machine Learning for Chickens, Autonomous Driving and a 3-year-old Who Won’t ...
Machine Learning for Chickens, Autonomous Driving and a 3-year-old Who Won’t ...Machine Learning for Chickens, Autonomous Driving and a 3-year-old Who Won’t ...
Machine Learning for Chickens, Autonomous Driving and a 3-year-old Who Won’t ...MapR Technologies
 
ML Workshop 1: A New Architecture for Machine Learning Logistics
ML Workshop 1: A New Architecture for Machine Learning LogisticsML Workshop 1: A New Architecture for Machine Learning Logistics
ML Workshop 1: A New Architecture for Machine Learning LogisticsMapR Technologies
 
Machine Learning Success: The Key to Easier Model Management
Machine Learning Success: The Key to Easier Model ManagementMachine Learning Success: The Key to Easier Model Management
Machine Learning Success: The Key to Easier Model ManagementMapR Technologies
 
Data Warehouse Modernization: Accelerating Time-To-Action
Data Warehouse Modernization: Accelerating Time-To-Action Data Warehouse Modernization: Accelerating Time-To-Action
Data Warehouse Modernization: Accelerating Time-To-Action MapR Technologies
 
Live Tutorial – Streaming Real-Time Events Using Apache APIs
Live Tutorial – Streaming Real-Time Events Using Apache APIsLive Tutorial – Streaming Real-Time Events Using Apache APIs
Live Tutorial – Streaming Real-Time Events Using Apache APIsMapR Technologies
 
Bringing Structure, Scalability, and Services to Cloud-Scale Storage
Bringing Structure, Scalability, and Services to Cloud-Scale StorageBringing Structure, Scalability, and Services to Cloud-Scale Storage
Bringing Structure, Scalability, and Services to Cloud-Scale StorageMapR Technologies
 
Live Machine Learning Tutorial: Churn Prediction
Live Machine Learning Tutorial: Churn PredictionLive Machine Learning Tutorial: Churn Prediction
Live Machine Learning Tutorial: Churn PredictionMapR Technologies
 
An Introduction to the MapR Converged Data Platform
An Introduction to the MapR Converged Data PlatformAn Introduction to the MapR Converged Data Platform
An Introduction to the MapR Converged Data PlatformMapR Technologies
 
How to Leverage the Cloud for Business Solutions | Strata Data Conference Lon...
How to Leverage the Cloud for Business Solutions | Strata Data Conference Lon...How to Leverage the Cloud for Business Solutions | Strata Data Conference Lon...
How to Leverage the Cloud for Business Solutions | Strata Data Conference Lon...MapR Technologies
 
Best Practices for Data Convergence in Healthcare
Best Practices for Data Convergence in HealthcareBest Practices for Data Convergence in Healthcare
Best Practices for Data Convergence in HealthcareMapR Technologies
 
Geo-Distributed Big Data and Analytics
Geo-Distributed Big Data and AnalyticsGeo-Distributed Big Data and Analytics
Geo-Distributed Big Data and AnalyticsMapR Technologies
 
MapR Product Update - Spring 2017
MapR Product Update - Spring 2017MapR Product Update - Spring 2017
MapR Product Update - Spring 2017MapR Technologies
 
3 Benefits of Multi-Temperature Data Management for Data Analytics
3 Benefits of Multi-Temperature Data Management for Data Analytics3 Benefits of Multi-Temperature Data Management for Data Analytics
3 Benefits of Multi-Temperature Data Management for Data AnalyticsMapR Technologies
 
Cisco & MapR bring 3 Superpowers to SAP HANA Deployments
Cisco & MapR bring 3 Superpowers to SAP HANA DeploymentsCisco & MapR bring 3 Superpowers to SAP HANA Deployments
Cisco & MapR bring 3 Superpowers to SAP HANA DeploymentsMapR Technologies
 
MapR and Cisco Make IT Better
MapR and Cisco Make IT BetterMapR and Cisco Make IT Better
MapR and Cisco Make IT BetterMapR Technologies
 
Evolving from RDBMS to NoSQL + SQL
Evolving from RDBMS to NoSQL + SQLEvolving from RDBMS to NoSQL + SQL
Evolving from RDBMS to NoSQL + SQLMapR Technologies
 

More from MapR Technologies (20)

Converging your data landscape
Converging your data landscapeConverging your data landscape
Converging your data landscape
 
ML Workshop 2: Machine Learning Model Comparison & Evaluation
ML Workshop 2: Machine Learning Model Comparison & EvaluationML Workshop 2: Machine Learning Model Comparison & Evaluation
ML Workshop 2: Machine Learning Model Comparison & Evaluation
 
Self-Service Data Science for Leveraging ML & AI on All of Your Data
Self-Service Data Science for Leveraging ML & AI on All of Your DataSelf-Service Data Science for Leveraging ML & AI on All of Your Data
Self-Service Data Science for Leveraging ML & AI on All of Your Data
 
Enabling Real-Time Business with Change Data Capture
Enabling Real-Time Business with Change Data CaptureEnabling Real-Time Business with Change Data Capture
Enabling Real-Time Business with Change Data Capture
 
Machine Learning for Chickens, Autonomous Driving and a 3-year-old Who Won’t ...
Machine Learning for Chickens, Autonomous Driving and a 3-year-old Who Won’t ...Machine Learning for Chickens, Autonomous Driving and a 3-year-old Who Won’t ...
Machine Learning for Chickens, Autonomous Driving and a 3-year-old Who Won’t ...
 
ML Workshop 1: A New Architecture for Machine Learning Logistics
ML Workshop 1: A New Architecture for Machine Learning LogisticsML Workshop 1: A New Architecture for Machine Learning Logistics
ML Workshop 1: A New Architecture for Machine Learning Logistics
 
Machine Learning Success: The Key to Easier Model Management
Machine Learning Success: The Key to Easier Model ManagementMachine Learning Success: The Key to Easier Model Management
Machine Learning Success: The Key to Easier Model Management
 
Data Warehouse Modernization: Accelerating Time-To-Action
Data Warehouse Modernization: Accelerating Time-To-Action Data Warehouse Modernization: Accelerating Time-To-Action
Data Warehouse Modernization: Accelerating Time-To-Action
 
Live Tutorial – Streaming Real-Time Events Using Apache APIs
Live Tutorial – Streaming Real-Time Events Using Apache APIsLive Tutorial – Streaming Real-Time Events Using Apache APIs
Live Tutorial – Streaming Real-Time Events Using Apache APIs
 
Bringing Structure, Scalability, and Services to Cloud-Scale Storage
Bringing Structure, Scalability, and Services to Cloud-Scale StorageBringing Structure, Scalability, and Services to Cloud-Scale Storage
Bringing Structure, Scalability, and Services to Cloud-Scale Storage
 
Live Machine Learning Tutorial: Churn Prediction
Live Machine Learning Tutorial: Churn PredictionLive Machine Learning Tutorial: Churn Prediction
Live Machine Learning Tutorial: Churn Prediction
 
An Introduction to the MapR Converged Data Platform
An Introduction to the MapR Converged Data PlatformAn Introduction to the MapR Converged Data Platform
An Introduction to the MapR Converged Data Platform
 
How to Leverage the Cloud for Business Solutions | Strata Data Conference Lon...
How to Leverage the Cloud for Business Solutions | Strata Data Conference Lon...How to Leverage the Cloud for Business Solutions | Strata Data Conference Lon...
How to Leverage the Cloud for Business Solutions | Strata Data Conference Lon...
 
Best Practices for Data Convergence in Healthcare
Best Practices for Data Convergence in HealthcareBest Practices for Data Convergence in Healthcare
Best Practices for Data Convergence in Healthcare
 
Geo-Distributed Big Data and Analytics
Geo-Distributed Big Data and AnalyticsGeo-Distributed Big Data and Analytics
Geo-Distributed Big Data and Analytics
 
MapR Product Update - Spring 2017
MapR Product Update - Spring 2017MapR Product Update - Spring 2017
MapR Product Update - Spring 2017
 
3 Benefits of Multi-Temperature Data Management for Data Analytics
3 Benefits of Multi-Temperature Data Management for Data Analytics3 Benefits of Multi-Temperature Data Management for Data Analytics
3 Benefits of Multi-Temperature Data Management for Data Analytics
 
Cisco & MapR bring 3 Superpowers to SAP HANA Deployments
Cisco & MapR bring 3 Superpowers to SAP HANA DeploymentsCisco & MapR bring 3 Superpowers to SAP HANA Deployments
Cisco & MapR bring 3 Superpowers to SAP HANA Deployments
 
MapR and Cisco Make IT Better
MapR and Cisco Make IT BetterMapR and Cisco Make IT Better
MapR and Cisco Make IT Better
 
Evolving from RDBMS to NoSQL + SQL
Evolving from RDBMS to NoSQL + SQLEvolving from RDBMS to NoSQL + SQL
Evolving from RDBMS to NoSQL + SQL
 

ElasticES-Hadoop: Bridging the world of Hadoop and Elasticsearch

  • 1. ES-Hadoop: Bridging the world of Hadoop and Elasticsearch Bala Venkatrao (bala@elastic.co) June 2015
  • 6. Dynamic runtime matching Node1             2P   1R   Node2             1P   3R   Node3             2R   3P  
  • 7. Failure handling Node1             2P   1R   Node2             1P   3R   Node3             2R   3P  
  • 8. Co-location Node1             2P   1R   Node2             1P   3R   Node3             2R   3P  
  • 9. www.elastic.co Native integration - Map / Reduce JobConf  conf  =  new  JobConf();     conf.setInputFormat(EsInputFormat.class);     conf.set("es.resource",  "radio/artists");     conf.set("es.query",  "?q=me*");       JobClient.runJob(conf);   JobConf  conf  =  new  JobConf();     conf.setOutputFormat(EsOutputFormat.class);     conf.set("es.resource",  "radio/artists");     JobClient.runJob(conf);   9  
  • 10. www.elastic.co Native integration - Cascading Tap in = new EsTap("radio/artists","?q=me*"); Tap out = new StdOut(new TextLine()); new LocalFlowConnector(). connect(in, out, new Pipe("pipe")).complete(); JobClient.runJob(conf); Tap in = Lfs(new TextDelimited( new Fields("id", "name", "url", "picture")), "artists.dat"); Tap out = new EsTap("radio/artists", new Fields("name", "url", "picture")); new HadoopFlowConnector(). connect(in, out, new Pipe("pipe")).complete(); 10
  • 11. www.elastic.co Native integration - Apache Pig A  =  LOAD  'radio/artists'  USING          org.elasticsearch.hadoop.pig.EsStorage('es.query=?q=me*');   DUMP  A;   A  =  LOAD  'src/artists.dat'  USING  PigStorage()  AS                      (id:long,  name,  url:chararray,  picture:  chararray);   B  =  FOREACH  A  GENERATE  name,  TOTUPLE(url,  picture)  AS  links;     STORE  B  INTO  'radio/artists'  USING                              org.elasticsearch.hadoop.pig.EsStorage();   11  
  • 12. www.elastic.co Native integration - Apache Hive CREATE EXTERNAL TABLE artists ( id BIGINT, name STRING, links STRUCT<url:STRING, picture:STRING>) STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler' TBLPROPERTIES('es.resource'='radio/artists','es.query'='?q=me*'); SELECT * FROM artists; CREATE EXTERNAL TABLE artists ( id BIGINT, name STRING, links STRUCT<url:STRING, picture:STRING>) STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler' TBLPROPERTIES('es.resource'='radio/artists'); INSERT OVERWRITE TABLE artists SELECT s.name, named_struct('url', s.url, 'picture', s.pic) FROM source s; 12
  • 13. www.elastic.co Native integration - Apache Spark import org.elasticsearch.spark._ val sc = new SparkContext(new SparkConf()) val rdd = sc.esRDD("radio/artists", "?q=me*") import org.elasticsearch.spark._ case class Artist(name: String, albums: Int) val u2 = Artist("U2", 12) val bh = Map("name" -> "Buckethead", "albums" -> 95, "age" -> 45) sc.makeRDD(Seq(u2, bh)).saveToEs("radio/artists") 13
  • 14. www.elastic.co Native integration - Spark SQL val  sql  =  new  SQLContext...   val  df  =  sql.load("radio/artists",  "org.elasticsearch.spark.sql")   df.filter(df("age")  >  40)   val  sql  =  new  SQLContext...   val  table  =  sql.sql("CREATE  TEMPORARY  TABLE  artists  "  +                "USING  org.elasticsearch.spark.sql  "  +                "OPTIONS(resource=`radio/artists`)  ")     val  names  =  sql.sql("SELECT  name  FROM  artists")   14  
  • 15. www.elastic.co Native integration - Apache Storm TopologyBuilder builder = new TopologyBuilder(); builder.setBolt("esBolt", new EsBolt("twitter/tweets")); TopologyBuilder builder = new TopologyBuilder(); builder.setSpout("esSpout", new EsSpout("twitter/tweets", "?q=nfl*"), 5); builder.setBolt("bolt", new PrinterBolt()).shuffleGrouping("esSpout"); 15
  • 17. www.elastic.co YARN support – In Beta •  Run Elasticsearch on YARN •  But YARN doesn’t support long-lived services (yet): •  No provisioning •  No ip/network guarantees •  Data/node affinity •  Next YARN releases plan to address this •  Tracking projects like Apache Slider 17  
  • 19. www.elastic.co HDFS integration •  Snapshot/Restore •  Use HDFS as a shared storage •  Backup and recover data •  Works great with snapshot immutable data •  HDFS as a File-System – not recommended / tread carefully •  Incomplete FS semantics (last-delete-on-close, fsync) •  NFSv3 (metadata issues) •  See Elasticsearch issue #9072     19  
  • 20. www.elastic.co 20   •  Support for Spark, Spark SQL, Storm •  Includes support for Spark (core and SQL) 1.2, 1.3 and 1.4 •  Support for all Spark SQL filters and relationship traits •  Certification with Hadoop distributions •  Currently certified with CDH5.x, HDP2.x, MapR 4.x and Databricks Spark •  Security enhancements •  Basic HTTP authentication allowing Hadoop jobs running against a restricted Elasticsearch cluster to identify themselves accordingly •  SSL/TLS support for cryptographic connections between Elasticsearch and Hadoop cluster, enabling data-sensitive environments to transparently encrypt the data at transport level and thus prevent snooping and preserve data confidentiality. •  Support for Shield-enabled Elasticsearch clusters •  Several enhancements and performance improvements, including •  Client node routing •  Return raw JSON and metadata while reading documents from ES •  Inclusion / Exclusion of fields to be written to ES What’s New in ES-Hadoop 2.1
  • 21. www.elastic.co •  Support for ES aggregations •  Marvel integration •  Integration with Machine Learning libraries, e.g. MLlib •  Others? (Suggestions) Roadmap 21
  • 22. www.elastic.co 22   Documentation – https://www.elastic.co/guide/en/elasticsearch/hadoop/index.html Project home page/ Source repository - https://github.com/elastic/elasticsearch-hadoop Issue tracker - https://github.com/elastic/elasticsearch-hadoop/issues Mailing list / forum - https://discuss.elastic.co/c/elasticsearch-and-hadoop More Questions?