Lucene

3,586 views

Published on

Brief introduction to Lucene, including a Java for Perl guys overview.

Published in: Technology, Education

Lucene

  1. 1. Open source indexing and search engine
  2. 2. Web scale
  3. 3. Lucene Inverted index
  4. 4. Lucene Inverted index Results
  5. 5. Lucene Inverted index Servlet container J2EE application server
  6. 6. WARNING Java approaching!
  7. 7. Java is strongly object orientated
  8. 8. my @gene_names = (); push(@gene_names, $gene); print @gene_names; Perl Java Array gene_names = new Array(); gene_names.add(gene); System.out.println(gene_names.toString)
  9. 9. my $gene = Gene->new(‘ENS12345’); $gene->set_name(‘BRCA2’); Perl Java Gene gene = new Gene(‘ENS12345’); gene.set_name(‘BRCA2’);
  10. 10. Java is strongly typed
  11. 11. my $number = “100”; $number = $number + 400 print $number; Perl Java Integer number = new Integer(100); number = number + 400; System.out.println(number + 400);
  12. 12. Java is good at error handling
  13. 13. eval ($gene->transform); warn $@ if $@; Perl Java try { gene->transform } catch (IOException e) { e.printStackTrace; }
  14. 14. Java is surprisingly easy to learn
  15. 15. Conditionals and loops Variables have scope Extras from CPAN Performance is important Perl Java Conditionals and loops Variables have scope Extras available as JAR files Performance is important
  16. 16. Recipe 1: Indexing a collection of documents
  17. 17. org.ensembl.lucene.Writer
  18. 18. public static void main(String[] args) {         HashMap<String, String> arguments = new HashMap<String, String>();         String key = null;         for (String s: args) {            if (key == null) {                key = s;            } else {                arguments.put(key, s);                key = null;            }         }         Writer writer = new Writer();         writer.setIndexLocation(arguments.get("-index"));         writer.setInputLocation(arguments.get("-input"));         if (arguments.get("-mergefactor") != null) { writer.setMergeFactor(Integer.valueOf(arguments.get("-mergefactor")));     }         if (arguments.get("-maxmergedocs") != null) { writer.setMaxMergeDocs(Integer.valueOf(arguments.get("-maxmergedocs")));     }         try {             writer.index();         } catch (IOException e) {             e.printStackTrace();         }         System.out.println("Indexing complete");   }
  19. 19. public static void main(String[] args) {         HashMap<String, String> arguments = new HashMap<String, String>();         String key = null;         for (String s: args) {            if (key == null) {                key = s;            } else {                arguments.put(key, s);                key = null;            }         }         Writer writer = new Writer();         writer.setIndexLocation(arguments.get("-index"));         writer.setInputLocation(arguments.get("-input"));         if (arguments.get("-mergefactor") != null) { writer.setMergeFactor(Integer.valueOf(arguments.get("-mergefactor")));     }         if (arguments.get("-maxmergedocs") != null) { writer.setMaxMergeDocs(Integer.valueOf(arguments.get("-maxmergedocs")));     }         try {             writer.index();         } catch (IOException e) {             e.printStackTrace();         }         System.out.println("Indexing complete");   }
  20. 20. public static void main(String[] args) {         HashMap<String, String> arguments = new HashMap<String, String>();         String key = null;         for (String s: args) {            if (key == null) {                key = s;            } else {                arguments.put(key, s);                key = null;            }         }         Writer writer = new Writer();         writer.setIndexLocation(arguments.get("-index"));         writer.setInputLocation(arguments.get("-input"));         if (arguments.get("-mergefactor") != null) { writer.setMergeFactor(Integer.valueOf(arguments.get("-mergefactor")));     }         if (arguments.get("-maxmergedocs") != null) { writer.setMaxMergeDocs(Integer.valueOf(arguments.get("-maxmergedocs")));     }         try {             writer.index();         } catch (IOException e) {             e.printStackTrace();         }         System.out.println("Indexing complete");   }
  21. 21. Max-merge-docs how many documents are added to a segment
  22. 22. Merge-factor how often Lucene merges index segments when adding documents
  23. 23. public static void main(String[] args) {         HashMap<String, String> arguments = new HashMap<String, String>();         String key = null;         for (String s: args) {            if (key == null) {                key = s;            } else {                arguments.put(key, s);                key = null;            }         }         Writer writer = new Writer();         writer.setIndexLocation(arguments.get("-index"));         writer.setInputLocation(arguments.get("-input"));         if (arguments.get("-mergefactor") != null) { writer.setMergeFactor(Integer.valueOf(arguments.get("-mergefactor")));     }         if (arguments.get("-maxmergedocs") != null) { writer.setMaxMergeDocs(Integer.valueOf(arguments.get("-maxmergedocs")));     }         try {             writer.index();         } catch (IOException e) {             e.printStackTrace();         }         System.out.println("Indexing complete");   }
  24. 24. public void index() throws IOException {         File index = new File(getIndexLocation());         File location = new File(getInputLocation());         IndexWriter writer = new IndexWriter(index, new StandardAnalyzer(), true);         writer.setMergeFactor(getMergeFactor());         writer.setMaxMergeDocs(getMaxMergeDocs());         indexDocuments(writer, location);         writer.optimize();         writer.close();   }     private static void indexDocuments(IndexWriter writer, File location) throws IOException {         if (location.canRead()) {           if (location.isDirectory()) {             String[] files = location.list();             if (files != null) {               for (int i = 0; i < files.length; i++) {                 indexDocuments(writer, new File(location, files[i]));            }           }           } else {             System.out.println("Indexing  " + location);             try {                 GeneFileDocument.index(writer, location);           }             catch (FileNotFoundException e) {               System.out.println("Caught exception: " + e);           }          }         }     }
  25. 25. public void index() throws IOException {         File index = new File(getIndexLocation());         File location = new File(getInputLocation());         IndexWriter writer = new IndexWriter(index, new StandardAnalyzer(), true);         writer.setMergeFactor(getMergeFactor());         writer.setMaxMergeDocs(getMaxMergeDocs());         indexDocuments(writer, location);         writer.optimize();         writer.close();   }     private static void indexDocuments(IndexWriter writer, File location) throws IOException {         if (location.canRead()) {           if (location.isDirectory()) {             String[] files = location.list();             if (files != null) {               for (int i = 0; i < files.length; i++) {                 indexDocuments(writer, new File(location, files[i]));            }           }           } else {             System.out.println("Indexing  " + location);             try {                 GeneFileDocument.index(writer, location);           }             catch (FileNotFoundException e) {               System.out.println("Caught exception: " + e);           }          }         }     }
  26. 26. public void index() throws IOException {         File index = new File(getIndexLocation());         File location = new File(getInputLocation());         IndexWriter writer = new IndexWriter(index, new StandardAnalyzer(), true);         writer.setMergeFactor(getMergeFactor());         writer.setMaxMergeDocs(getMaxMergeDocs());         indexDocuments(writer, location);         writer.optimize();         writer.close();   }     private static void indexDocuments(IndexWriter writer, File location) throws IOException {         if (location.canRead()) {           if (location.isDirectory()) {             String[] files = location.list();             if (files != null) {               for (int i = 0; i < files.length; i++) {                 indexDocuments(writer, new File(location, files[i]));            }           }           } else {             System.out.println("Indexing  " + location);             try {                 GeneFileDocument.index(writer, location);           }             catch (FileNotFoundException e) {               System.out.println("Caught exception: " + e);           }          }         }     }
  27. 27. public void index() throws IOException {         File index = new File(getIndexLocation());         File location = new File(getInputLocation());         IndexWriter writer = new IndexWriter(index, new StandardAnalyzer(), true);         writer.setMergeFactor(getMergeFactor());         writer.setMaxMergeDocs(getMaxMergeDocs());         indexDocuments(writer, location);         writer.optimize();         writer.close();   }     private static void indexDocuments(IndexWriter writer, File location) throws IOException {         if (location.canRead()) {           if (location.isDirectory()) {             String[] files = location.list();             if (files != null) {               for (int i = 0; i < files.length; i++) {                 indexDocuments(writer, new File(location, files[i]));            }           }           } else {             System.out.println("Indexing  " + location);             try {                 GeneFileDocument.index(writer, location);           }             catch (FileNotFoundException e) {               System.out.println("Caught exception: " + e);           }          }         }     }
  28. 28. org.ensembl.lucene. GeneFileDocument
  29. 29.     public static void index(IndexWriter writer, File f) throws IOException {         String fields[] = {"subtype", "id", "url", "keywords", "description"};         FileReader input = new FileReader(f);         BufferedReader bufRead = new BufferedReader(input);         String line;         line = bufRead.readLine();         while (line != null){              Document doc = new Document();              int count = 0;              String terms[] = line.split("\t");              while (count < terms.length) {                  String field = fields[count];                  String item = terms[count];                  doc.add(new Field(field, item, Field.Store.YES, Field.Index.TOKENIZED));                  count++;              }              writer.addDocument(doc);              line = bufRead.readLine();          }   }
  30. 30.     public static void index(IndexWriter writer, File f) throws IOException {         String fields[] = {"subtype", "id", "url", "keywords", "description"};         FileReader input = new FileReader(f);         BufferedReader bufRead = new BufferedReader(input);         String line;         line = bufRead.readLine();         while (line != null){              Document doc = new Document();              int count = 0;              String terms[] = line.split("\t");              while (count < terms.length) {                  String field = fields[count];                  String item = terms[count];                  doc.add(new Field(field, item, Field.Store.YES, Field.Index.TOKENIZED));                  count++;              }              writer.addDocument(doc);              line = bufRead.readLine();          }   }
  31. 31.     public static void index(IndexWriter writer, File f) throws IOException {         String fields[] = {"subtype", "id", "url", "keywords", "description"};         FileReader input = new FileReader(f);         BufferedReader bufRead = new BufferedReader(input);         String line;         line = bufRead.readLine();         while (line != null){              Document doc = new Document();              int count = 0;              String terms[] = line.split("\t");              while (count < terms.length) {                  String field = fields[count];                  String item = terms[count];                  doc.add(new Field(field, item, Field.Store.YES, Field.Index.TOKENIZED));                  count++;              }              writer.addDocument(doc);              line = bufRead.readLine();          }   }
  32. 32. Quite a lot of memory ~1.5Gb
  33. 33. Creates index
  34. 34. Merge indices to form master search index
  35. 35. Recipe 2: Finding documents containing a search term
  36. 36. Easy
  37. 37. org.ensembl.lucene.Search
  38. 38. public static void main(String args[]) {         Timer timer = new Timer();         String index = "index";         try {             timer.start();             Searcher searcher = new IndexSearcher(index);             timer.stop();             System.out.println("Loaded " + searcher.maxDoc() + " documents in " + timer.elapsed() + "ms");            search(searcher, "subtype", "Vega_havana processed_pseudogene Gene");             search(searcher, "id", "OTTHUMG00000000423");             searcher.close();         } catch (Exception e) {             e.printStackTrace();     }   }
  39. 39. public static void main(String args[]) {         Timer timer = new Timer();         String index = "index";         try {             timer.start();             Searcher searcher = new IndexSearcher(index);             timer.stop();             System.out.println("Loaded " + searcher.maxDoc() + " documents in " + timer.elapsed() + "ms");            search(searcher, "subtype", "Vega_havana processed_pseudogene Gene");             search(searcher, "id", "OTTHUMG00000000423");             searcher.close();         } catch (Exception e) {             e.printStackTrace();     }   }
  40. 40. public static void main(String args[]) {         Timer timer = new Timer();         String index = "index";         try {             timer.start();             Searcher searcher = new IndexSearcher(index);             timer.stop();             System.out.println("Loaded " + searcher.maxDoc() + " documents in " + timer.elapsed() + "ms");            search(searcher, "subtype", "Vega_havana processed_pseudogene Gene");             search(searcher, "id", "OTTHUMG00000000423");             searcher.close();         } catch (Exception e) {             e.printStackTrace();     }   }
  41. 41. public static void main(String args[]) {         Timer timer = new Timer();         String index = "index";         try {             timer.start();             Searcher searcher = new IndexSearcher(index);             timer.stop();             System.out.println("Loaded " + searcher.maxDoc() + " documents in " + timer.elapsed() + "ms");            search(searcher, "subtype", "Vega_havana processed_pseudogene Gene");             search(searcher, "id", "OTTHUMG00000000423");             searcher.close();         } catch (Exception e) {             e.printStackTrace();     }   }
  42. 42.     private static void search(Searcher searcher, String field, String queryString) throws ParseException, IOException {         Timer timer = new Timer();         timer.start();         System.out.println("Search (" + field + "): " + queryString);         QueryParser parser = new QueryParser(field, new StandardAnalyzer());         Query query = parser.parse(queryString);         Hits hits = searcher.search(query);         Integer count = 1;         Iterator<Hit> hiterator = hits.iterator();         while (hiterator.hasNext()) {             Hit hit = hiterator.next();             Document document = hit.getDocument();             System.out.println(count + ": ID: " + document.get("id"));             System.out.println(count + ": Subtype: " + document.get("subtype"));             count++;         }         int hitCount = hits.length();         timer.stop();         System.out.println("Hits: " + hitCount);         System.out.println("Completed in " + timer.elapsed() + "ms");
  43. 43.     private static void search(Searcher searcher, String field, String queryString) throws ParseException, IOException {         Timer timer = new Timer();         timer.start();         System.out.println("Search (" + field + "): " + queryString);         QueryParser parser = new QueryParser(field, new StandardAnalyzer());         Query query = parser.parse(queryString);         Hits hits = searcher.search(query);         Integer count = 1;         Iterator<Hit> hiterator = hits.iterator();         while (hiterator.hasNext()) {             Hit hit = hiterator.next();             Document document = hit.getDocument();             System.out.println(count + ": ID: " + document.get("id"));             System.out.println(count + ": Subtype: " + document.get("subtype"));             count++;         }         int hitCount = hits.length();         timer.stop();         System.out.println("Hits: " + hitCount);         System.out.println("Completed in " + timer.elapsed() + "ms");
  44. 44.     private static void search(Searcher searcher, String field, String queryString) throws ParseException, IOException {         Timer timer = new Timer();         timer.start();         System.out.println("Search (" + field + "): " + queryString);         QueryParser parser = new QueryParser(field, new StandardAnalyzer());         Query query = parser.parse(queryString);         Hits hits = searcher.search(query);         Integer count = 1;         Iterator<Hit> hiterator = hits.iterator();         while (hiterator.hasNext()) {             Hit hit = hiterator.next();             Document document = hit.getDocument();             System.out.println(count + ": ID: " + document.get("id"));             System.out.println(count + ": Subtype: " + document.get("subtype"));             count++;         }         int hitCount = hits.length();         timer.stop();         System.out.println("Hits: " + hitCount);         System.out.println("Completed in " + timer.elapsed() + "ms");
  45. 45.     private static void search(Searcher searcher, String field, String queryString) throws ParseException, IOException {         Timer timer = new Timer();         timer.start();         System.out.println("Search (" + field + "): " + queryString);         QueryParser parser = new QueryParser(field, new StandardAnalyzer());         Query query = parser.parse(queryString);         Hits hits = searcher.search(query);         Integer count = 1;         Iterator<Hit> hiterator = hits.iterator();         while (hiterator.hasNext()) {             Hit hit = hiterator.next();             Document document = hit.getDocument();             System.out.println(count + ": ID: " + document.get("id"));             System.out.println(count + ": Subtype: " + document.get("subtype"));             count++;         }         int hitCount = hits.length();         timer.stop();         System.out.println("Hits: " + hitCount);         System.out.println("Completed in " + timer.elapsed() + "ms");
  46. 46.     private static void search(Searcher searcher, String field, String queryString) throws ParseException, IOException {         Timer timer = new Timer();         timer.start();         System.out.println("Search (" + field + "): " + queryString);         QueryParser parser = new QueryParser(field, new StandardAnalyzer());         Query query = parser.parse(queryString);         Hits hits = searcher.search(query);         Integer count = 1;         Iterator<Hit> hiterator = hits.iterator();         while (hiterator.hasNext()) {             Hit hit = hiterator.next();             Document document = hit.getDocument();             System.out.println(count + ": ID: " + document.get("id"));             System.out.println(count + ": Subtype: " + document.get("subtype"));             count++;         }         int hitCount = hits.length();         timer.stop();         System.out.println("Hits: " + hitCount);         System.out.println("Completed in " + timer.elapsed() + "ms");
  47. 47.     private static void search(Searcher searcher, String field, String queryString) throws ParseException, IOException {         Timer timer = new Timer();         timer.start();         System.out.println("Search (" + field + "): " + queryString);         QueryParser parser = new QueryParser(field, new StandardAnalyzer());         Query query = parser.parse(queryString);         Hits hits = searcher.search(query);         Integer count = 1;         Iterator<Hit> hiterator = hits.iterator();         while (hiterator.hasNext()) {             Hit hit = hiterator.next();             Document document = hit.getDocument();             System.out.println(count + ": ID: " + document.get("id"));             System.out.println(count + ": Subtype: " + document.get("subtype"));             count++;         }         int hitCount = hits.length();         timer.stop();         System.out.println("Hits: " + hitCount);         System.out.println("Completed in " + timer.elapsed() + "ms");
  48. 48. Recipe 3: Querying a remote document index
  49. 49. Wrap everything into a single file
  50. 50. Copy that file to an application server
  51. 51. Restart the application server
  52. 52. Voilà!
  53. 53. (almost never that easy)
  54. 54. You will need...
  55. 55. Bonus recipe! Automate tasks with Ant
  56. 56. XML based configuration
  57. 57. Automated compiles
  58. 58. Automated test runner
  59. 59. Automated deployment
  60. 60. Platform independent
  61. 61. Flexible (but complex)
  62. 62. ant deploy
  63. 63. clean code clean index compile build index build jar build war deploy
  64. 64. Could this work for Ensembl?
  65. 65. lucene.apache.org
  66. 66. Java IDEs rock: get stuck in
  67. 67. Thank you

×