SlideShare a Scribd company logo
1 of 39
An analysis and comparison from a developer’s perspective
   Report Buyer product catalogue:

    • Text fields: title, subtitle, summary, toc
    • Product code and ISBN
    • Supplier, category, type and availability
    • Publication date and price
Enterprise class search engine
Scalable and based on Apache Lucene
REST-ful API or PECL extension
Fast, transactional full-text indexing
Faceted and geospatial search
Rich document indexing
Comes with simple web interface
Built-in caching of queries and responses
Numerous plug-ins
   Available as system packages
   Uses Tomcat or Jetty
   Requires a restart on configuration change
   Packages install as a service
   Specify database location
   Memory settings
   Query caching options
   Request handler setup
   Search components and plug-ins
   Spell checker configuration
<!-- Report Buyer fields -->
<!--
  Field definitions for the Report Buyer product catalogue.
  item_guid is the document key (declared as uniqueKey below).
  NOTE(review): the name, subtitle, summary, toc, isbn and product_code fields
  carry a boost attribute. Confirm that the Solr version in use honours boost
  in schema.xml; boosts are normally supplied per document at index time or
  per field at query time.
-->
<field name="item_guid" type="string" indexed="true" stored="true" required="true" />
<field name="name" type="text" indexed="true" stored="true" required="true" boost="75"
     omitNorms="false" />
<field name="subtitle" type="text" indexed="true" stored="true" required="false" boost="25"
     omitNorms="false" />
<!-- summary, toc and isbn are indexed but not stored: searchable, not returned in results. -->
<field name="summary" type="text" indexed="true" stored="false" boost="1" omitNorms="false" />
<field name="toc" type="text" indexed="true" stored="false" boost="1" omitNorms="false" />
<field name="isbn" type="string" indexed="true" stored="false" boost="200" omitNorms="false" />
<field name="product_code" type="string" indexed="true" stored="true" boost="200" omitNorms="false" />
<field name="publish_date" type="tdate" indexed="true" stored="true" />
<field name="price" type="tfloat" indexed="true" stored="true" />
<field name="availability" type="boolean" indexed="true" stored="true" />
<!-- link is stored but not indexed: returned with results, never searched. -->
<field name="link" type="string" indexed="false" stored="true" />
<!-- Catch-all field; multiValued because several copyField sources feed it. -->
<field name="text" type="text" indexed="true" stored="false" multiValued="true"/>

<!-- Copy every free-text field into the catch-all "text" field. -->
<copyField source="name" dest="text"/>
<copyField source="subtitle" dest="text"/>
<copyField source="summary" dest="text"/>
<copyField source="toc" dest="text"/>

<uniqueKey>item_guid</uniqueKey>
<!-- Queries that name no field are run against the catch-all field. -->
<defaultSearchField>text</defaultSearchField>
   Data Import Handler
   REST-ful API
   PHP PECL Extension
   Third-party libraries, like Solarium
<?php
/**
 * Bulk-import catalogue rows into Solr through the PECL Solr extension.
 *
 * Expects $result to already hold a MySQL result resource; the query that
 * produces it is not part of this snippet. Each row becomes one Solr
 * document whose field names are the row's column names.
 */
$solr_options = array('secure' => false, 'hostname' => 'localhost', 'port' => 8080);
$solr         = new SolrClient($solr_options);

while ($row = mysql_fetch_array($result, MYSQL_ASSOC))
{
    // A fresh document per row, so no field values leak from one row to the next.
    $doc = new SolrInputDocument();

    // Reformat the date as an ISO 8601 "Z" timestamp with a fixed 00:00:01 time part.
    $row['publish_date'] = strftime('%Y-%m-%dT00:00:01Z', strtotime($row['publish_date']));

    foreach ($row as $key => $value) {
        $doc->addField($key, $value);
    }

    $updateResponse = $solr->addDocument($doc);
    $response       = $updateResponse->getResponse();

    // A non-zero status means Solr rejected the document: report it and carry on.
    if ($response->responseHeader->status != 0) {
        print "Error importing into Solr: ";
        print_r($response);
    }
}

// Commit once after the whole batch rather than once per document.
$solr->commit();
?>
POST to http://localhost:8080/solr/update?commit=true

<add>
   <doc>
          <field name="item_guid">a34bbff9e17ada79658c72fde90c7369</field>
          <field name="name">Research Report on China's Corn Industry</field>
          <field name="price">1265</field>
          etc
    </doc>
</add>
// Run a faceted full-text query against Solr and print one line per hit.
$solr_options = array('secure' => false, 'hostname' => 'localhost', 'port' => 8080);
$solr = new SolrClient($solr_options);

// Keep the search text in a variable so the summary line below can echo it.
$query_string = "research in china";

$query = new SolrQuery();
$query->setQuery($query_string);
$query->setFacet(true);
$query->addFacetField('availability');

// Restrict the returned fields to the ones printed below.
$query->addField('item_guid')->addField('name')->addField('publish_date')->addField('subtitle')
      ->addField('product_code')->addField('availability')->addField('price');

$query->addSortField('publish_date', SolrQuery::ORDER_DESC);

$query_response = $solr->query($query);
$response = $query_response->getResponse();

print "Found ".$response->response->numFound." results, for {$query_string} in "
     .$response->responseHeader->QTime." ms:\n\n";
foreach ($response->response->docs as $position=>$doc_data) {
     $download = ($doc_data['availability'] == '1') ? 'Yes' : 'No';
     // publish_date is indexed as "YYYY-MM-DDT00:00:01Z"; show only the date part.
     $pub_date = substr($doc_data['publish_date'], 0, 10);
     print "{$position} - Date:{$pub_date} - {$doc_data['product_code']} - D/L:{$download} £".sprintf("%5d",
     $doc_data['price'])." - {$doc_data['name']}\n";
}
// Facet count for documents whose availability value is "false".
print "Facets for instant ".$response->facet_counts->facet_fields->availability->false;
http://localhost:8080/solr/select/?q=research%20in%20china&indent=on&hl=true&hl.fl=item_guid,name,
    publish_date,subtitle,product_code,availability,price&facet=true&facet.field=availability&wt=json

{
 "responseHeader":{
  "status":0, "QTime":20,
  "params":{
      "facet":"true",      "indent":"on",               "q":"research in china",
      "hl.fl":"item_guid,name,publish_date,subtitle,product_code,availability,price",
      "facet.field":"availability", "wt":"json", "hl":"true"}},
 "response":{"numFound":197481,"start":0,"docs":[
      {
       "item_guid":"e68cf64921a02e926137d78d2c52da35",
       "name":"Market Research Report on China Civil Aero Industry",
       "product_code":"SFC00076",
       "price":190.0, "availability":false,
       "type":10,
      "link":
      "/industry_manufacturing/plant_heavy_equipment/market_research_report_china_civil_aero_industry.
      html",
       "publish_date":"2008-07-22T00:00:01Z"
      }
}
   More features than other products
   Responsive, busy mailing list
   Large team of developers
   Good PHP libraries for integration
   Several books available
   Fairly heavy footprint
   Also built on Apache Lucene
   JSON-based
   Distributed, scalable server model
   Easy to configure, or configuration free
   Faceting and highlight support
   Auto type detection
   Multiple indexes
   CouchDB integration
   Download and unpack zip file
   Run elasticsearch/bin/elasticsearch
   No schema is required - almost
   No configuration is required - almost
GET http://localhost:9200/ HTTP/1.0
{
     "ok" : true,
     "name" : "Test",
     "version" : {
       "number" : "0.18.7",
       "snapshot_build" : false
     },
     "tagline" : "You Know, for Search",
     "cover" : "DON'T PANIC",
     "quote" : {
       "book" : "The Hitchhiker's Guide to the Galaxy",
       "chapter" : "Chapter 27",
       "text1" : "\"Forty-two,\" said Deep Thought, with infinite majesty and calm.",
       "text2" : "\"The Answer to the Great Question, of Life, the Universe and Everything\""
     }
   }
curl -XPUT http://localhost:9200/reports/ -d '
{
     "index": {
           "analysis": {
                         "analyzer": {
                                       "my_analyzer": {
                                                  "tokenizer": "standard",
                                                  "filter": ["standard", "lowercase", "my_stemmer"]
                                       }
                         },
                         "filter": {
                                       "my_stemmer": {
                                                  "type": "stemmer",
                                                  "name": "english"
                                       }
                         }
           }
     }
}'
<?php
require_once("ElasticSearch.php");

// Index and document type the mapping is registered for.
$type      = 'report';
$es        = new ElasticSearch();
$es->index = 'reports';

// Per-field mapping. The identifier-like fields (item_guid, product_code, isbn)
// are marked not_analyzed; name, subtitle, summary and the two code fields
// carry explicit boosts.
$properties = array();
$properties['_id']          = array('type' => 'string', 'path' => 'item_guid');
$properties['item_guid']    = array('type' => 'string', 'store' => 'yes', 'index' => 'not_analyzed');
$properties['name']         = array('type' => 'string', 'store' => 'no', 'boost' => 75);
$properties['subtitle']     = array('type' => 'string', 'store' => 'yes', 'boost' => 25);
$properties['summary']      = array('type' => 'string', 'store' => 'yes', 'boost' => 10);
$properties['toc']          = array('type' => 'string', 'store' => 'no');
$properties['product_code'] = array('type' => 'string', 'store' => 'yes', 'boost' => 200, 'index' => 'not_analyzed');
$properties['isbn']         = array('type' => 'string', 'store' => 'yes', 'boost' => 200, 'index' => 'not_analyzed');

// Wrap the properties under the type name, then submit the mapping as JSON.
$mappings = array(
    $type => array('properties' => $properties),
);

$json = json_encode($mappings);

$es->map($type, $json);
?>
<?php
require_once("ElasticSearch.php");

// Index and document type the rows are written to.
$type      = 'report';
$es        = new ElasticSearch();
$es->index = 'reports';

// Select every catalogue column; the misspelt source column `availibility`
// is aliased so the indexed documents use the key `availability`.
$sql = "SELECT `item_guid`, `name`, `subtitle`, `summary`, `toc`, `supplier`,
           `product_code`, `isbn`, `category`, `price`, `availibility` as `availability`,
           `type`, `link`, `publish_date`
           FROM `rb_search`";

$result = read_query($sql);

// Send each row to Elasticsearch as a JSON document keyed by its item_guid.
while (true) {
    $row = mysql_fetch_assoc($result);
    if (!$row) {
        break;
    }

    $document = json_encode($row);
    $es->add($type, $row['item_guid'], $document);
}
?>
GET http://localhost:9200/reports/report/_count/

{"count":260349,"_shards":{"total":1,"successful":1,"failed":0}}
<?php
require_once("ElasticSearch.php");

// Index and document type to search.
$type      = 'report';
$es        = new ElasticSearch();
$es->index = 'reports';

// Term query on the name field.
$term_query = array(
    'term' => array('name' => 'research'),
);

// Terms facet over the availability field.
$facets = array(
    'availability' => array(
        'terms' => array('field' => 'availability'),
    ),
);

// Assemble the request body: returned fields, the query, then the facets.
$query = array(
    'fields' => array('item_guid', 'name', 'subtitle'),
    'query'  => $term_query,
    'facets' => $facets,
);

$result = $es->query($type, json_encode($query));
?>
   Nicolas Ruflin's elastica
   Raymond Julin's elasticsearch
   Niranjan Uma Shankar's elasticsearch-php
   Very fast indexing
   Auto-scaling architecture
   Elegant REST approach
   Flexible zero configuration model
   Poor documentation
   No feature list, conceptual model or
    introduction
   All data is stored, meaning large indices
   Indexes MySQL, MSSQL, XML or ODBC
   Querying through Sphinx PHP API
   Searching through SQL queries or API
   Scalable to index 6TB of data in 16bn
    documents and 2000 queries/sec
   Used by Craigslist, Boardreader
   Runs as a storage engine in MySQL
   Install from system packages or source
   Source tarball is needed to get PHP
    SphinxAPI
   No other software needed
   Runs as a service in Ubuntu
   Plain index - fast search, slow update
   Real-time index - fast update, less efficient
   Distributed - combination of both methods
# Real-time (rt) Sphinx index definition for the report catalogue.
# NOTE(review): the PHP import script and the SphinxQL session shown with this
# config use an index called `rb_search`, a `title` column and `availability` /
# `type` values, none of which are declared here (this block declares `rb_test`
# and a `name` field) - confirm which definition is current.
index rb_test
{
     # index type
     type = rt
     path = /mnt/data_indexed/sphinx/rb_test
     # define the fields we're indexing
     rt_field = name
     rt_field = subtitle
     rt_field = summary
     rt_field = toc

    #define the fields we want to get back out
    rt_attr_string = item_guid
    rt_attr_string = supplier
    rt_attr_string = product_code
    rt_attr_string = isbn
    rt_attr_string = category
    rt_attr_uint = price
    rt_attr_string = link
    rt_attr_timestamp = publish_date

    # morphology preprocessors to apply
    morphology                          = stem_en
    # strip HTML markup before indexing, keeping the listed attribute values
    # and dropping the contents of style and script elements
    html_strip                          =1
    html_index_attrs    = img=alt,title; a=title;
    html_remove_elements                = style, script
}
<?php
require_once("mysql.inc.php");

/*
 * Copy the catalogue from MySQL into a Sphinx real-time index over SphinxQL
 * (Sphinx speaking the MySQL wire protocol on port 9306).
 *
 * Sphinx document IDs must be unsigned integers, so a numeric `id` is derived
 * from the first 16 hex digits of md5(item_guid). publish_date is converted
 * to a Unix timestamp by the SELECT.
 */
$sql = "SELECT conv(mid(md5(`item_guid`), 1, 16), 16, 10) AS `id`, `item_guid`, `name`,
            `subtitle`, `summary`, `toc`, `supplier`, `product_code`, `isbn`, `category`,
            `price`, `availibility` as `availability`, `type`, `link`, UNIX_TIMESTAMP(`publish_date`) AS
     `publish_date` FROM `rb_search`";
$result = read_query($sql);

// Fourth argument forces a new link, so the existing MySQL connection is not reused.
$sphinx = mysql_connect("127.0.0.1:9306", "", "", true);
if ($sphinx === false) {
     // Without a connection every REPLACE below would fail; stop with a clear message.
     die("Could not connect to Sphinx: " . mysql_error());
}

while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) {
     foreach ($row as $key=>$value) {
            // Escape against the Sphinx link; mysql_escape_string() is deprecated
            // and ignores the connection character set.
            $row[$key] = mysql_real_escape_string($value, $sphinx);
     }
     // The source column `name` is written to the index column `title`.
     $sql = "REPLACE INTO `rb_search` (`id`, `title`, `subtitle`,`availability`, `type`, `price`, `publish_date`,
     `item_guid`, `supplier`, `product_code`, `isbn`, `category`, `link`, `summary`, `toc`)
VALUES
            ('{$row['id']}', '{$row['name']}', '{$row['subtitle']}', '{$row['availability']}',
     '{$row['type']}','{$row['price']}', '{$row['publish_date']}', '{$row['item_guid']}', '{$row['supplier']}',
     '{$row['product_code']}', '{$row['isbn']}', '{$row['category']}', '{$row['link']}','{$row['summary']}',
     '{$row['toc']}')";
     // Report rejected rows instead of dropping them silently, then carry on.
     if (mysql_query($sql, $sphinx) === false) {
            print "Error importing into Sphinx: " . mysql_error($sphinx) . "\n";
     }
}
?>
mysql --host=127.0.0.1 --port=9306

Welcome to the MySQL monitor. Commands end with ; or \g.
Your MySQL connection id is 1
Server version: 2.0.3-id64-release (r3043)

mysql> select item_guid, title, subtitle, price from rb_search where match('china pharmaceutical') and price
     > 100 and price < 300 limit 2\G
************************** 1. row ***************************
    id: 5228810066049016302
  weight: 6671
  price: 220
item_guid: cc74cb075aa37696198e87850f033398
  title: North China Pharmaceutical Group Corp-Therapeutic Competitors Report
 subtitle:
*************************** 2. row ***************************
    id: 3548867347418583847
  weight: 6662
  price: 190
item_guid: 6ce04df0fb277aa3ff596c2ca00c81a9
  title: China Pharmaceutical Industry Report
 subtitle: 2006-2007
2 rows in set (0.01 sec)
   Fastest indexing of all engines
   Really simple interface via SQL
   Document IDs must be unsigned integers
   No faceting support
   Good support in forums
   Deployed as a C++ library
   Bindings provided to connect to PHP
   Available in most package repositories
   Bindings need to be compiled separately
   Query Parser, similar to other engines
   Stemming and faceted search
   Server replication
   Install from system packages
   Compile PHP bindings from source
   No other software needed
   Runs on demand
   No configuration required
   Define-and-go schema
   Documents
   Terms
   Values
   Document data
<?php
/*
 * Build a Xapian database from the catalogue rows.
 *
 * Expects these to be defined before this snippet (not shown here):
 *   $xapian               - database path
 *   $result               - MySQL result resource with the catalogue rows
 *   $xapian_term_weights  - map of column name => within-document weight
 *   $xapian_value_slots   - map of value name => value slot number
 */
$xapian_db = new XapianWritableDatabase($xapian, Xapian::DB_CREATE_OR_OVERWRITE);
$xapian_term_generator = new XapianTermGenerator();
$xapian_term_generator->set_stemmer(new XapianStem("english"));

while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) {
    $doc = new XapianDocument();
    $xapian_term_generator->set_document($doc);

    // Free-text terms for every weighted column.
    foreach ($xapian_term_weights as $field => $weight) {
        $xapian_term_generator->index_text($row[$field], $weight);
    }

    // The name is indexed again under the S: prefix so it can be searched as a field.
    $xapian_term_generator->index_text($row['name'], 75, 'S:');
    $doc->add_boolean_term('CODE:' . $row['product_code']);

    // Values: price is serialised so it sorts numerically; the date is stored as YYYYMMDD.
    $doc->add_value($xapian_value_slots['price'], Xapian::sortable_serialise($row['price']));
    $doc->add_value($xapian_value_slots['publish_date'], strftime("%Y%m%d",
    strtotime($row['publish_date'])));

    // add in additional values that we're going to use for facets
    $doc->add_value($xapian_value_slots['availability'], $row['availability']);

    // Store the row as the document data. The previous code serialised an
    // undefined $doc_data, which stored a serialised NULL.
    // NOTE(review): trim this to the display fields if index size matters.
    $doc->set_data(serialize($row));

    // replace_document() looks documents up by this unique term but does not add
    // it to the document itself, so it is added explicitly first.
    $docid = 'Q'.$row['item_guid'];
    $doc->add_boolean_term($docid);
    $xapian_db->replace_document($docid, $doc);
}
?>
<?php
/*
 * Parse a user-style query and walk the matching documents.
 *
 * Expects $xapian (database path), $xapian_value_slots (value name => slot
 * number), $offset and $pagesize to be defined before this snippet.
 */
$xapian_db    = new XapianDatabase($xapian);
$query_parser = new XapianQueryParser();
$query_parser->set_stemmer(new XapianStem("english"));
// Older Xapian releases default to STEM_NONE, in which case set_stemmer() alone has no effect.
$query_parser->set_stemming_strategy(XapianQueryParser::STEM_SOME);
$query_parser->set_default_op(XapianQuery::OP_AND);

// Range searches: date:FROM..TO on the publish_date slot ...
$dvrProcessor = new XapianDateValueRangeProcessor($xapian_value_slots['publish_date'], 'date:');
$query_parser->add_valuerangeprocessor($dvrProcessor);
// ... and price:LOW..HIGH on the price slot, which holds sortable_serialise()d
// numbers. Without this processor the price range in the query is not parsed as a range.
$nvrProcessor = new XapianNumberValueRangeProcessor($xapian_value_slots['price'], 'price:');
$query_parser->add_valuerangeprocessor($nvrProcessor);

// NOTE(review): the indexing script adds CODE: as a boolean term and adds no
// CATEGORY: terms at all - confirm whether add_boolean_prefix() is intended
// for these two and that category terms are actually indexed.
$query_parser->add_prefix("code", "CODE:");
$query_parser->add_prefix("category", "CATEGORY:");
$query_parser->add_prefix("title", "S:");
// Phrase delimiters are plain ASCII double quotes (the slide had typographic quotes).
$query = $query_parser->parse_query('"Medical devices" NEAR china NOT russian price:10..150 category:medical');

$enquire = new XapianEnquire($xapian_db);
$enquire->set_query($query);
$matches = $enquire->get_mset($offset, $pagesize);

// Iterate the match set from its begin() iterator up to its end() iterator.
$start = $matches->begin();
$end   = $matches->end();
while (!($start->equals($end))) {
     $doc   = $start->get_document();
     $price = Xapian::sortable_unserialise($doc->get_value($xapian_value_slots['price']));
     $start->next();
}
?>
   Only one option available from Xapian
   Requires additional compilation due to
    licensing
   Not very well documented API
   Reasonably fast indexing
   Very flexible implementation
   Faceting and range searching
   Good Quick Start guide
   Responsive mailing list
   Third-party paid support
   Every project has different needs
   Not one search product fits all
   Fastest to index was Sphinx
   Most feature-rich: Solr
   The next steps are up to you

More Related Content

What's hot

Jquery presentation
Jquery presentationJquery presentation
Jquery presentation
guest5d87aa6
 
Php code for online quiz
Php code for online quizPhp code for online quiz
Php code for online quiz
hnyb1002
 

What's hot (18)

Jquery presentation
Jquery presentationJquery presentation
Jquery presentation
 
Php 101: PDO
Php 101: PDOPhp 101: PDO
Php 101: PDO
 
Gta v savegame
Gta v savegameGta v savegame
Gta v savegame
 
Your code sucks, let's fix it
Your code sucks, let's fix itYour code sucks, let's fix it
Your code sucks, let's fix it
 
PHP for Adults: Clean Code and Object Calisthenics
PHP for Adults: Clean Code and Object CalisthenicsPHP for Adults: Clean Code and Object Calisthenics
PHP for Adults: Clean Code and Object Calisthenics
 
Your code sucks, let's fix it - PHP Master Series 2012
Your code sucks, let's fix it - PHP Master Series 2012Your code sucks, let's fix it - PHP Master Series 2012
Your code sucks, let's fix it - PHP Master Series 2012
 
Drupal - dbtng 25th Anniversary Edition
Drupal - dbtng 25th Anniversary EditionDrupal - dbtng 25th Anniversary Edition
Drupal - dbtng 25th Anniversary Edition
 
Lithium: The Framework for People Who Hate Frameworks, Tokyo Edition
Lithium: The Framework for People Who Hate Frameworks, Tokyo EditionLithium: The Framework for People Who Hate Frameworks, Tokyo Edition
Lithium: The Framework for People Who Hate Frameworks, Tokyo Edition
 
Your code sucks, let's fix it (CakeFest2012)
Your code sucks, let's fix it (CakeFest2012)Your code sucks, let's fix it (CakeFest2012)
Your code sucks, let's fix it (CakeFest2012)
 
Doctrine 2
Doctrine 2Doctrine 2
Doctrine 2
 
PHP tips and tricks
PHP tips and tricks PHP tips and tricks
PHP tips and tricks
 
Php code for online quiz
Php code for online quizPhp code for online quiz
Php code for online quiz
 
Drupal7 dbtng
Drupal7  dbtngDrupal7  dbtng
Drupal7 dbtng
 
Drupal II: The SQL
Drupal II: The SQLDrupal II: The SQL
Drupal II: The SQL
 
PHP Data Objects
PHP Data ObjectsPHP Data Objects
PHP Data Objects
 
Erlang for data ops
Erlang for data opsErlang for data ops
Erlang for data ops
 
Drupal 8 database api
Drupal 8 database apiDrupal 8 database api
Drupal 8 database api
 
Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014
Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014
Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014
 

Viewers also liked

Oxalide Workshop #4 - Docker, des tours dans le petit bassin
Oxalide Workshop #4 - Docker, des tours dans le petit bassinOxalide Workshop #4 - Docker, des tours dans le petit bassin
Oxalide Workshop #4 - Docker, des tours dans le petit bassin
Oxalide
 

Viewers also liked (11)

Search search search
Search search searchSearch search search
Search search search
 
Poitou charentes JUG - Elasticsearch
Poitou charentes JUG - ElasticsearchPoitou charentes JUG - Elasticsearch
Poitou charentes JUG - Elasticsearch
 
Introducing ElasticSearch - Ashish
Introducing ElasticSearch - AshishIntroducing ElasticSearch - Ashish
Introducing ElasticSearch - Ashish
 
The original vision of Nutch, 14 years later: Building an open source search ...
The original vision of Nutch, 14 years later: Building an open source search ...The original vision of Nutch, 14 years later: Building an open source search ...
The original vision of Nutch, 14 years later: Building an open source search ...
 
Elasticsearch
ElasticsearchElasticsearch
Elasticsearch
 
Comparing open source search engines
Comparing open source search enginesComparing open source search engines
Comparing open source search engines
 
Elastic search
Elastic searchElastic search
Elastic search
 
Elasticsearch Arcihtecture & What's New in Version 5
Elasticsearch Arcihtecture & What's New in Version 5Elasticsearch Arcihtecture & What's New in Version 5
Elasticsearch Arcihtecture & What's New in Version 5
 
Oxalide Workshop #3 - Elasticearch, an overview
Oxalide Workshop #3 - Elasticearch, an overviewOxalide Workshop #3 - Elasticearch, an overview
Oxalide Workshop #3 - Elasticearch, an overview
 
Oxalide Workshop #4 - Docker, des tours dans le petit bassin
Oxalide Workshop #4 - Docker, des tours dans le petit bassinOxalide Workshop #4 - Docker, des tours dans le petit bassin
Oxalide Workshop #4 - Docker, des tours dans le petit bassin
 
(Elastic)search in big data
(Elastic)search in big data(Elastic)search in big data
(Elastic)search in big data
 

Similar to Open Source Search: An Analysis

Propel sfugmd
Propel sfugmdPropel sfugmd
Propel sfugmd
iKlaus
 
第49回Php勉強会@関東 Datasource
第49回Php勉強会@関東 Datasource第49回Php勉強会@関東 Datasource
第49回Php勉強会@関東 Datasource
Kaz Watanabe
 
From mysql to MongoDB(MongoDB2011北京交流会)
From mysql to MongoDB(MongoDB2011北京交流会)From mysql to MongoDB(MongoDB2011北京交流会)
From mysql to MongoDB(MongoDB2011北京交流会)
Night Sailer
 

Similar to Open Source Search: An Analysis (20)

Propel sfugmd
Propel sfugmdPropel sfugmd
Propel sfugmd
 
The Zen of Lithium
The Zen of LithiumThe Zen of Lithium
The Zen of Lithium
 
第49回Php勉強会@関東 Datasource
第49回Php勉強会@関東 Datasource第49回Php勉強会@関東 Datasource
第49回Php勉強会@関東 Datasource
 
The State of Lithium
The State of LithiumThe State of Lithium
The State of Lithium
 
Broadleaf Presents Thymeleaf
Broadleaf Presents ThymeleafBroadleaf Presents Thymeleaf
Broadleaf Presents Thymeleaf
 
From mysql to MongoDB(MongoDB2011北京交流会)
From mysql to MongoDB(MongoDB2011北京交流会)From mysql to MongoDB(MongoDB2011北京交流会)
From mysql to MongoDB(MongoDB2011北京交流会)
 
Unit testing with zend framework tek11
Unit testing with zend framework tek11Unit testing with zend framework tek11
Unit testing with zend framework tek11
 
PostgreSQL's Secret NoSQL Superpowers
PostgreSQL's Secret NoSQL SuperpowersPostgreSQL's Secret NoSQL Superpowers
PostgreSQL's Secret NoSQL Superpowers
 
Hidden treasures of Ruby
Hidden treasures of RubyHidden treasures of Ruby
Hidden treasures of Ruby
 
Unit testing with zend framework PHPBenelux
Unit testing with zend framework PHPBeneluxUnit testing with zend framework PHPBenelux
Unit testing with zend framework PHPBenelux
 
Bag Of Tricks From Iusethis
Bag Of Tricks From IusethisBag Of Tricks From Iusethis
Bag Of Tricks From Iusethis
 
WordCamp Portland 2018: PHP for WordPress
WordCamp Portland 2018: PHP for WordPressWordCamp Portland 2018: PHP for WordPress
WordCamp Portland 2018: PHP for WordPress
 
Database api
Database apiDatabase api
Database api
 
PHP Experience 2016 - [Workshop] Elastic Search: Turbinando sua aplicação PHP
PHP Experience 2016 - [Workshop] Elastic Search: Turbinando sua aplicação PHPPHP Experience 2016 - [Workshop] Elastic Search: Turbinando sua aplicação PHP
PHP Experience 2016 - [Workshop] Elastic Search: Turbinando sua aplicação PHP
 
Solr's Search Relevancy (Understand Solr's query debug)
Solr's Search Relevancy (Understand Solr's query debug)Solr's Search Relevancy (Understand Solr's query debug)
Solr's Search Relevancy (Understand Solr's query debug)
 
Advanced Php - Macq Electronique 2010
Advanced Php - Macq Electronique 2010Advanced Php - Macq Electronique 2010
Advanced Php - Macq Electronique 2010
 
laravel tricks in 50minutes
laravel tricks in 50minuteslaravel tricks in 50minutes
laravel tricks in 50minutes
 
50 Laravel Tricks in 50 Minutes
50 Laravel Tricks in 50 Minutes50 Laravel Tricks in 50 Minutes
50 Laravel Tricks in 50 Minutes
 
Let's write secure Drupal code! - DrupalCamp Oslo, 2018
Let's write secure Drupal code! - DrupalCamp Oslo, 2018Let's write secure Drupal code! - DrupalCamp Oslo, 2018
Let's write secure Drupal code! - DrupalCamp Oslo, 2018
 
Php 102: Out with the Bad, In with the Good
Php 102: Out with the Bad, In with the GoodPhp 102: Out with the Bad, In with the Good
Php 102: Out with the Bad, In with the Good
 

Recently uploaded

Modular Monolith - a Practical Alternative to Microservices @ Devoxx UK 2024
Modular Monolith - a Practical Alternative to Microservices @ Devoxx UK 2024Modular Monolith - a Practical Alternative to Microservices @ Devoxx UK 2024
Modular Monolith - a Practical Alternative to Microservices @ Devoxx UK 2024
Victor Rentea
 
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers:  A Deep Dive into Serverless Spatial Data and FMECloud Frontiers:  A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Safe Software
 

Recently uploaded (20)

DBX First Quarter 2024 Investor Presentation
DBX First Quarter 2024 Investor PresentationDBX First Quarter 2024 Investor Presentation
DBX First Quarter 2024 Investor Presentation
 
Vector Search -An Introduction in Oracle Database 23ai.pptx
Vector Search -An Introduction in Oracle Database 23ai.pptxVector Search -An Introduction in Oracle Database 23ai.pptx
Vector Search -An Introduction in Oracle Database 23ai.pptx
 
Stronger Together: Developing an Organizational Strategy for Accessible Desig...
Stronger Together: Developing an Organizational Strategy for Accessible Desig...Stronger Together: Developing an Organizational Strategy for Accessible Desig...
Stronger Together: Developing an Organizational Strategy for Accessible Desig...
 
Exploring Multimodal Embeddings with Milvus
Exploring Multimodal Embeddings with MilvusExploring Multimodal Embeddings with Milvus
Exploring Multimodal Embeddings with Milvus
 
API Governance and Monetization - The evolution of API governance
API Governance and Monetization -  The evolution of API governanceAPI Governance and Monetization -  The evolution of API governance
API Governance and Monetization - The evolution of API governance
 
AI in Action: Real World Use Cases by Anitaraj
AI in Action: Real World Use Cases by AnitarajAI in Action: Real World Use Cases by Anitaraj
AI in Action: Real World Use Cases by Anitaraj
 
Rising Above_ Dubai Floods and the Fortitude of Dubai International Airport.pdf
Rising Above_ Dubai Floods and the Fortitude of Dubai International Airport.pdfRising Above_ Dubai Floods and the Fortitude of Dubai International Airport.pdf
Rising Above_ Dubai Floods and the Fortitude of Dubai International Airport.pdf
 
DEV meet-up UiPath Document Understanding May 7 2024 Amsterdam
DEV meet-up UiPath Document Understanding May 7 2024 AmsterdamDEV meet-up UiPath Document Understanding May 7 2024 Amsterdam
DEV meet-up UiPath Document Understanding May 7 2024 Amsterdam
 
Modernizing Legacy Systems Using Ballerina
Modernizing Legacy Systems Using BallerinaModernizing Legacy Systems Using Ballerina
Modernizing Legacy Systems Using Ballerina
 
Modular Monolith - a Practical Alternative to Microservices @ Devoxx UK 2024
Modular Monolith - a Practical Alternative to Microservices @ Devoxx UK 2024Modular Monolith - a Practical Alternative to Microservices @ Devoxx UK 2024
Modular Monolith - a Practical Alternative to Microservices @ Devoxx UK 2024
 
Decarbonising Commercial Real Estate: The Role of Operational Performance
Decarbonising Commercial Real Estate: The Role of Operational PerformanceDecarbonising Commercial Real Estate: The Role of Operational Performance
Decarbonising Commercial Real Estate: The Role of Operational Performance
 
Polkadot JAM Slides - Token2049 - By Dr. Gavin Wood
Polkadot JAM Slides - Token2049 - By Dr. Gavin WoodPolkadot JAM Slides - Token2049 - By Dr. Gavin Wood
Polkadot JAM Slides - Token2049 - By Dr. Gavin Wood
 
[BuildWithAI] Introduction to Gemini.pdf
[BuildWithAI] Introduction to Gemini.pdf[BuildWithAI] Introduction to Gemini.pdf
[BuildWithAI] Introduction to Gemini.pdf
 
Simplifying Mobile A11y Presentation.pptx
Simplifying Mobile A11y Presentation.pptxSimplifying Mobile A11y Presentation.pptx
Simplifying Mobile A11y Presentation.pptx
 
Corporate and higher education May webinar.pptx
Corporate and higher education May webinar.pptxCorporate and higher education May webinar.pptx
Corporate and higher education May webinar.pptx
 
MINDCTI Revenue Release Quarter One 2024
MINDCTI Revenue Release Quarter One 2024MINDCTI Revenue Release Quarter One 2024
MINDCTI Revenue Release Quarter One 2024
 
Platformless Horizons for Digital Adaptability
Platformless Horizons for Digital AdaptabilityPlatformless Horizons for Digital Adaptability
Platformless Horizons for Digital Adaptability
 
Web Form Automation for Bonterra Impact Management (fka Social Solutions Apri...
Web Form Automation for Bonterra Impact Management (fka Social Solutions Apri...Web Form Automation for Bonterra Impact Management (fka Social Solutions Apri...
Web Form Automation for Bonterra Impact Management (fka Social Solutions Apri...
 
How to Check CNIC Information Online with Pakdata cf
How to Check CNIC Information Online with Pakdata cfHow to Check CNIC Information Online with Pakdata cf
How to Check CNIC Information Online with Pakdata cf
 
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers:  A Deep Dive into Serverless Spatial Data and FMECloud Frontiers:  A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
 

Open Source Search: An Analysis

  • 1. An analysis and comparison from a developer’s perspective
  • 2.
  • 3. Report Buyer product catalogue: • Text fields: title, subtitle, summary, toc • Product code and ISBN • Supplier, category, type and availability • Publication date and price
  • 4. Enterprise class search engine Scalable and based on Apache Lucene REST-ful API or PECL extension Fast, transactional full-text indexing Faceted and geospatial search Rich document indexing Comes with simple web interface Built-in caching of queries and responses Numerous plug-ins
  • 5. Available as system packages  Uses Tomcat or Jetty  Requires a restart on configuration change  Packages install as a service
  • 6. Specify database location  Memory settings  Query caching options  Request handler setup  Search components and plug-ins  Spell checker configuration
  • 7. <!-- Report Buyer fields --> <field name="item_guid" type="string" indexed="true" stored="true" required="true" /> <field name="name" type="text" indexed="true" stored="true" required="true" boost="75" omitNorms="false" /> <field name="subtitle" type="text" indexed="true" stored="true" required="false" boost="25" omitNorms="false" /> <field name="summary" type="text" indexed="true" stored="false" boost="1" omitNorms="false" /> <field name="toc" type="text" indexed="true" stored="false" boost="1" omitNorms="false" /> <field name="isbn" type="string" indexed="true" stored="false" boost="200" omitNorms="false" /> <field name="product_code" type="string" indexed="true" stored="true" boost="200" omitNorms="false" /> <field name="publish_date" type="tdate" indexed="true" stored="true" /> <field name="price" type="tfloat" indexed="true" stored="true" /> <field name="availability" type="boolean" indexed="true" stored="true" /> <field name="link" type="string" indexed="false" stored="true" /> <field name="text" type="text" indexed="true" stored="false" multiValued="true"/> <copyField source="name" dest="text"/> <copyField source="subtitle" dest="text"/> <copyField source="summary" dest="text"/> <copyField source="toc" dest="text"/> <uniqueKey>item_guid</uniqueKey> <defaultSearchField>text</defaultSearchField>
  • 8. Data Import Handler  REST-ful API  PHP PECL Extension  Third-party libraries, like Solarium
  • 9. <?php $solr_options = array('secure' => false, 'hostname' => 'localhost', 'port' => 8080); $solr = new SolrClient($solr_options); $doc = new SolrInputDocument(); while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) { $doc = new SolrInputDocument(); $row['publish_date'] = strftime('%Y-%m-%dT00:00:01Z', strtotime($row['publish_date'])); foreach ($row as $key => $value) { $doc->addField($key, $value); } $updateResponse = $solr->addDocument($doc); $response = $updateResponse->getResponse(); if ($response->responseHeader->status != 0) { print "Error importing into Solr: "; print_r($response); } } $solr->commit(); ?>
  • 10. POST to http://localhost:8080/solr/update?commit=true <add> <doc> <field name="item_guid">a34bbff9e17ada79658c72fde90c7369</field> <field name="name">Research Report on China's Corn Industry</field> <field name="price">1265</field> etc </doc> </add>
  • 11. $solr_options = array('secure' => false, 'hostname' => 'localhost', 'port' => 8080); $solr = new SolrClient($solr_options); $query = new SolrQuery(); $query->setQuery("research in china"); $query->setFacet(true); $query->addFacetField('availability'); $query->addField('item_guid')->addField('name')->addField('publish_date')->addField('subtitle')->addField('product_code')->addField('availability')->addField('price'); $query->addSortField('publish_date', SolrQuery::ORDER_DESC); $query_response = $solr->query($query); $response = $query_response->getResponse(); print "Found ".$response->response->numFound." results, for {$query_string} in ".$response->responseHeader->QTime." ms:\n\n"; foreach ($response->response->docs as $position=>$doc_data) { $download = ($doc_data['availability'] == '1') ? 'Yes' : 'No'; print "{$position} - Date:{$pub_date} - {$doc_data['product_code']} - D/L:{$download} £".sprintf("%5d", $doc_data['price'])." - {$doc_data['name']}\n"; } print "Facets for instant ".$response->facet_counts->facet_fields->availability->false;
  • 12. http://localhost:8080/solr/select/?q=research%20in%20china&indent=on&hl=true&hl.fl=item_guid,name,publish_date,subtitle,product_code,availability,price&facet=true&facet.field=availability&wt=json { "responseHeader":{ "status":0, "QTime":20, "params":{ "facet":"true", "indent":"on", "q":"research in china", "hl.fl":"item_guid,name,publish_date,subtitle,product_code,availability,price", "facet.field":"availability", "wt":"json", "hl":"true"}}, "response":{"numFound":197481,"start":0,"docs":[ { "item_guid":"e68cf64921a02e926137d78d2c52da35", "name":"Market Research Report on China Civil Aero Industry", "product_code":"SFC00076", "price":190.0, "availability":false, "type":10, "link": "/industry_manufacturing/plant_heavy_equipment/market_research_report_china_civil_aero_industry.html", "publish_date":"2008-07-22T00:00:01Z" } }
  • 13. More features than other products  Responsive, busy mailing list  Large team of developers  Good PHP libraries for integration  Several books available  Fairly heavy footprint
  • 14. Also built on Apache Lucene  JSON-based  Distributed, scalable server model  Easy to configure, or configuration free  Faceting and highlight support  Auto type detection  Multiple indexes  CouchDB integration
  • 15. Download and unpack zip file  Run elasticsearch/bin/elasticsearch
  • 16. No schema is required - almost  No configuration is required - almost
  • 17. GET http://localhost:9200/ HTTP/1.0 { "ok" : true, "name" : "Test", "version" : { "number" : "0.18.7", "snapshot_build" : false }, "tagline" : "You Know, for Search", "cover" : "DON'T PANIC", "quote" : { "book" : "The Hitchhiker's Guide to the Galaxy", "chapter" : "Chapter 27", "text1" : "\"Forty-two,\" said Deep Thought, with infinite majesty and calm.", "text2" : "\"The Answer to the Great Question, of Life, the Universe and Everything\"" } }
  • 18. curl -XPUT http://localhost:9200/reports/ -d ' { "index": { "analysis": { "analyzer": { "my_analyzer": { "tokenizer": "standard", "filter": ["standard", "lowercase", "my_stemmer"] } }, "filter": { "my_stemmer": { "type": "stemmer", "name": "english" } } } } }'
  • 19. <?php require_once("ElasticSearch.php"); $es = new ElasticSearch; $es->index = 'reports'; $type = 'report'; $mappings = array($type => array('properties' => array( '_id' => array('type' => 'string', 'path' => 'item_guid'), 'item_guid' => array('type' => 'string', 'store' => 'yes', 'index' => 'not_analyzed'), 'name' => array('type' => 'string', 'store' => 'no', 'boost' => 75), 'subtitle' => array('type' => 'string', 'store' => 'yes', 'boost' => 25), 'summary' => array('type' => 'string', 'store' => 'yes', 'boost' => 10), 'toc' => array('type' => 'string', 'store' => 'no'), 'product_code' => array('type' => 'string', 'store' => 'yes', 'boost' => 200, 'index' => 'not_analyzed'), 'isbn' => array('type' => 'string', 'store' => 'yes', 'boost' => 200, 'index' => 'not_analyzed'), ))); $json = json_encode($mappings); $es->map($type, $json); ?>
  • 20. <?php require_once("ElasticSearch.php"); $es = new ElasticSearch; $es->index = 'reports'; $type = 'report'; $sql = "SELECT `item_guid`, `name`, `subtitle`, `summary`, `toc`, `supplier`, `product_code`, `isbn`, `category`, `price`, `availibility` as `availability`, `type`, `link`, `publish_date` FROM `rb_search`"; $result = read_query($sql); while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) { $es->add($type, $row['item_guid'], json_encode($row)); } ?>
  • 22. <?php require_once("ElasticSearch.php"); $es = new ElasticSearch; $es->index = 'reports'; $type = 'report'; $query = array( 'fields' => array('item_guid', 'name', 'subtitle'), 'query' => array( 'term' => array('name' => 'research'), ), 'facets' => array( 'availability' => array( 'terms' => array('field' => 'availability') ) ) ); $result = $es->query($type, json_encode($query)); ?>
  • 23. Nicolas Ruflin's Elastica  Raymond Julin's elasticsearch  Niranjan Uma Shankar's elasticsearch-php
  • 24. Very fast indexing  Auto-scaling architecture  Elegant REST approach  Flexible zero configuration model  Poor documentation  No feature list, conceptual model or introduction  All data is stored, meaning large indices
  • 25. Indexes MySQL, MSSQL, XML or ODBC  Querying through Sphinx PHP API  Searching through SQL queries or API  Scalable to index 6TB of data in 16bn documents and 2000 queries/sec  Used by Craigslist, Boardreader  Runs as a storage engine in MySQL
  • 26. Install from system packages or source  Source tarball is needed to get PHP SphinxAPI  No other software needed  Runs as a service in Ubuntu
  • 27. Plain index - fast search, slow update  Real-time index - fast update, less efficient  Distributed - combination of both methods
  • 28. index rb_test { # index type type = rt path = /mnt/data_indexed/sphinx/rb_test # define the fields we're indexing rt_field = name rt_field = subtitle rt_field = summary rt_field = toc #define the fields we want to get back out rt_attr_string = item_guid rt_attr_string = supplier rt_attr_string = product_code rt_attr_string = isbn rt_attr_string = category rt_attr_uint = price rt_attr_string = link rt_attr_timestamp = publish_date # morphology preprocessors to apply morphology = stem_en html_strip =1 html_index_attrs = img=alt,title; a=title; html_remove_elements = style, script }
  • 29. <?php require_once("mysql.inc.php"); $sql = "SELECT conv(mid(md5(`item_guid`), 1, 16), 16, 10) AS `id`, `item_guid`, `name`, `subtitle`, `summary`, `toc`, `supplier`, `product_code`, `isbn`, `category`, `price`, `availibility` as `availability`, `type`, `link`, UNIX_TIMESTAMP(`publish_date`) AS `publish_date` FROM `rb_search`"; $result = read_query($sql); $sphinx = mysql_connect("127.0.0.1:9306", "", "", true); while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) { foreach ($row as $key=>$value) { $row[$key] = mysql_escape_string($value); } $sql = "REPLACE INTO `rb_search` (`id`, `title`, `subtitle`,`availability`, `type`, `price`, `publish_date`, `item_guid`, `supplier`, `product_code`, `isbn`, `category`, `link`, `summary`, `toc`) VALUES ('{$row['id']}', '{$row['name']}', '{$row['subtitle']}', '{$row['availability']}', '{$row['type']}','{$row['price']}', '{$row['publish_date']}', '{$row['item_guid']}', '{$row['supplier']}', '{$row['product_code']}', '{$row['isbn']}', '{$row['category']}', '{$row['link']}','{$row['summary']}', '{$row['toc']}')"; mysql_query($sql, $sphinx); } ?>
  • 30. mysql --host=127.0.0.1 --port=9306 Welcome to the MySQL monitor. Commands end with ; or \g. Your MySQL connection id is 1 Server version: 2.0.3-id64-release (r3043) mysql> select item_guid, title, subtitle, price from rb_search where match('china pharmaceutical') and price > 100 and price < 300 limit 2\G *************************** 1. row *************************** id: 5228810066049016302 weight: 6671 price: 220 item_guid: cc74cb075aa37696198e87850f033398 title: North China Pharmaceutical Group Corp-Therapeutic Competitors Report subtitle: *************************** 2. row *************************** id: 3548867347418583847 weight: 6662 price: 190 item_guid: 6ce04df0fb277aa3ff596c2ca00c81a9 title: China Pharmaceutical Industry Report subtitle: 2006-2007 2 rows in set (0.01 sec)
  • 31. Fastest indexing of all engines  Really simple interface via SQL  Document IDs must be unsigned integers  No faceting support  Good support in forums
  • 32. Deployed as a C++ library  Bindings provided to connect to PHP  Available in most package repositories  Bindings need to be compiled separately  Query Parser, similar to other engines  Stemming and faceted search  Server replication
  • 33. Install from system packages  Compile PHP bindings from source  No other software needed  Runs on demand
  • 34. No configuration required  Define-and-go schema  Documents  Terms  Values  Document data
  • 35. <?php $xapian_db = new XapianWritableDatabase($xapian, Xapian::DB_CREATE_OR_OVERWRITE); $xapian_term_generator = new XapianTermGenerator(); $xapian_term_generator->set_stemmer(new XapianStem("english")); while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) { $doc = new XapianDocument(); $xapian_term_generator->set_document($doc); foreach ($xapian_term_weights as $field => $weight) { $xapian_term_generator->index_text($row[$field], $weight); } $xapian_term_generator->index_text($row['name'], 75, 'S:'); $doc->add_boolean_term('CODE:' . $row['product_code']); $doc->add_value($xapian_value_slots['price'], Xapian::sortable_serialise($row['price'])); $doc->add_value($xapian_value_slots['publish_date'], strftime("%Y%m%d", strtotime($row['publish_date']))); // add in additional values that we're going to use for facets $doc->add_value($xapian_value_slots['availability'], $row['availability']); $doc->set_data(serialize($doc_data)); $docid = 'Q'.$row['item_guid']; $xapian_db->replace_document($docid, $doc); } ?>
  • 36. <?php $xapian_db = new XapianDatabase($xapian); $query_parser = new XapianQueryParser(); $query_parser->set_stemmer(new XapianStem("english")); $query_parser->set_default_op(XapianQuery::OP_AND); $dvrProcessor = new XapianDateValueRangeProcessor($xapian_value_slots['publish_date'], 'date:'); $query_parser->add_valuerangeprocessor($dvrProcessor); $query_parser->add_prefix("code", "CODE:"); $query_parser->add_prefix("category", "CATEGORY:"); $query_parser->add_prefix("title", "S:"); $query = $query_parser->parse_query('"Medical devices" NEAR china NOT russian price:10..150 category:medical'); $enquire = new XapianEnquire($xapian_db); $enquire->set_query($query); $matches = $enquire->get_mset($offset, $pagesize); while (!($start->equals($end))) { $doc = $start->get_document(); $price = Xapian::sortable_unserialise($doc->get_value($xapian_value_slots['price'])); $start->next(); }?>
  • 37. Only one option available from Xapian  Requires additional compilation due to licensing  Not very well documented API
  • 38. Reasonably fast indexing  Very flexible implementation  Faceting and range searching  Good Quick Start guide  Responsive mailing list  Third-party paid support
  • 39. Every project has different needs  Not one search product fits all  Fastest to index was Sphinx  Most feature-rich: Solr  The next steps are up to you