Lucene2 4 Demo

1,837 views

Published on

0 Comments
0 Likes
Statistics
Notes
  • Be the first to comment

  • Be the first to like this

No Downloads
Views
Total views
1,837
On SlideShare
0
From Embeds
0
Number of Embeds
4
Actions
Shares
0
Downloads
12
Comments
0
Likes
0
Embeds 0
No embeds

No notes for slide

Lucene2 4 Demo

  1. 1. lucene2.4 gong on 我把 lucene2.4 发行包的一个例子改的更简单些,仅供参考,其中在 eclipse 中运行遇到中文乱码问题,这些在代码中会有体现。 完成这个例子后,我顺便试了一下 qieqie 的庖丁解牛中文分词器,很好用, 而且分发包中有个简单的说明文档。 Java 代码 1. package tutorial; 2. 3. import java.io.File; 4. import java.io.FileInputStream; 5. import java.io.FileNotFoundException; 6. import java.io.IOException; 7. import java.io.InputStreamReader; 8. 9. import org.apache.lucene.analysis.standard.StandardAnalyzer; 10.import org.apache.lucene.document.Document; 11.import org.apache.lucene.document.Field; 12.import org.apache.lucene.index.IndexWriter; 13. 14.public class IndexFiles { 15. 16. public static void main(String[] args) { 17. long start = System.currentTimeMillis(); 18. try { 19. IndexWriter writer = new IndexWriter("index", 20. new StandardAnalyzer(), true, 21. IndexWriter.MaxFieldLength.LIMITED); 22. indexDocs(writer, new File("data")); 23. writer.optimize(); 24. writer.close(); 25. System.out.println("用时:" + (System.currentTimeMi llis() - start) 26. + " 毫秒"); 27. } catch (IOException e) { 28. e.printStackTrace();
  2. 2. 29. } 30. } 31. 32. static void indexDocs(IndexWriter writer, File file) throws IOException { 33. if (file.canRead()) { 34. if (file.isDirectory()) { 35. String[] files = file.list(); 36. if (files != null) { 37. for (int i = 0; i < files.length; i++) { 38. indexDocs(writer, new File(file, files[ i])); 39. } 40. } 41. } else { 42. System.out.println("添加 " + file); 43. try { 44. //针对参数文件建立索引文档 45. Document doc = new Document(); 46. //Field.Index.NOT_ANALYZED 文件名称 建立索 引,但不分词 47. doc.add(new Field("filename", file.getCanon icalPath(), 48. Field.Store.YES, Field.Index.NOT_AN ALYZED)); 49. doc.add(new Field("contents", 50. new InputStreamReader(new FileInput Stream(file.getCanonicalPath()), "utf-8"))); 51. //在 writer 中加入此文档 52. writer.addDocument(doc); 53. //抛弃了原来 demo 中的如下做法,因为这样不能 设定字符编码,当出现因为编码为题查不到 54. //中文字符时,便束手无策。 55. //writer.addDocument(FileDocument.Document( file)); 56. } catch (FileNotFoundException fnfe) { 57. ; 58. } 59. } 60. } 61. } 62. 63.}
  3. 3. Java 代码 1. package tutorial; 2. 3. import java.io.BufferedReader; 4. import java.io.IOException; 5. import java.io.InputStreamReader; 6. 7. import org.apache.lucene.analysis.Analyzer; 8. import org.apache.lucene.analysis.standard.StandardAnalyzer; 9. import org.apache.lucene.document.Document; 10.import org.apache.lucene.index.IndexReader; 11.import org.apache.lucene.queryParser.QueryParser; 12.import org.apache.lucene.search.IndexSearcher; 13.import org.apache.lucene.search.Query; 14.import org.apache.lucene.search.ScoreDoc; 15.import org.apache.lucene.search.Searcher; 16.import org.apache.lucene.search.TopDocCollector; 17. 18.public class SearchFiles { 19. 20. public static void main(String[] args) throws Exception { 21. 22. IndexReader reader = IndexReader.open("index"); 23. 24. Searcher searcher = new IndexSearcher(reader); 25. Analyzer analyzer = new StandardAnalyzer(); 26. 27. BufferedReader in = new BufferedReader(new InputStreamR eader(System.in)); 28. 29. String field = "contents"; 30. QueryParser parser = new QueryParser(field, analyzer); 31. 32. String queries = null; 33. Query query = null; 34. while (true) { 35. if (queries == null) { 36. System.out.print("输入查询词(quit or exit 退出): ");
  4. 4. 37. } 38. 39. String line = in.readLine(); 40. if (line == null || line.length() == -1) { 41. continue; 42. } else if (line.equals("quit") || line.equals("exit ")) { 43. break; 44. } 45. line = line.trim(); 46. 47. query = parser.parse(line); 48. System.out.println("查询: " + query.toString(field) ); 49. 50. long start = System.currentTimeMillis(); 51. doPagingSearch(searcher, query, 5); 52. System.out.println("用时:" + (System.currentTimeMi llis() - start) 53. + " 毫秒"); 54. } 55. 56. reader.close(); 57. } 58. 59. public static void doPagingSearch(Searcher searcher, Query query, 60. int hitsPerPage) throws IOException { 61. 62. TopDocCollector collector = new TopDocCollector(5 * hit sPerPage); 63. searcher.search(query, collector); 64. ScoreDoc[] hits = collector.topDocs().scoreDocs; 65. 66. int numTotalHits = collector.getTotalHits(); 67. System.out.println("符合查询词的文件数:" + numTotalHit s); 68. 69. int start = 0; 70. int end = Math.min(numTotalHits, hitsPerPage); 71. 72. for (int i = start; i < end; i++) { 73. Document doc = searcher.doc(hits[i].doc); 74. String path = doc.get("filename");
  5. 5. 75. if (path != null) { 76. System.out.println((i + 1) + ". " + path); 77. } else { 78. System.out.println((i + 1) + ". " + "沒有这个目 录指定的文件"); 79. } 80. } 81. } 82. 83.} 也许我的进度有点缓慢,因为这两天有点事,且总是心神不宁的。今天去火车站 买票了,给我的感觉是跟我平常买票没什么区别,我很容易就买到票了,而且 还有座位。也许是因为家近的原因吧。或者是我买的日期不是一个高峰点。 2009-01-12 下面是从庖丁文档里粘出来的,并为了适应版本 2.4 做了部分修改的代码: Java 代码 1. package tutorial; 2. 3. import net.paoding.analysis.analyzer.PaodingAnalyzer; 4. 5. import org.apache.lucene.analysis.Analyzer; 6. import org.apache.lucene.analysis.TokenStream; 7. import org.apache.lucene.document.Document; 8. import org.apache.lucene.document.Field; 9. import org.apache.lucene.index.IndexReader; 10.import org.apache.lucene.index.IndexWriter; 11.import org.apache.lucene.index.TermPositionVector; 12.import org.apache.lucene.queryParser.QueryParser; 13.import org.apache.lucene.search.IndexSearcher; 14.import org.apache.lucene.search.Query; 15.import org.apache.lucene.search.ScoreDoc; 16.import org.apache.lucene.search.Searcher; 17.import org.apache.lucene.search.TopDocCollector; 18.import org.apache.lucene.search.highlight.Formatter; 19.import org.apache.lucene.search.highlight.Highlighter;
  6. 6. 20.import org.apache.lucene.search.highlight.QueryScorer; 21. import org.apache.lucene.search.highlight.TokenGroup; 22.import org.apache.lucene.search.highlight.TokenSources; 23. 24.public class PaodingExample { 25. 26. public static void main(String[] args) throws Exception { 27. String IDNEX_PATH = "index"; 28. 29. // 获取 Paoding 中文分词器 30. Analyzer analyzer = new PaodingAnalyzer(); 31. 32. // 建立索引 33. IndexWriter writer = new IndexWriter(IDNEX_PATH, analyz er, true, IndexWriter.MaxFieldLength.LIMITED); 34. Document doc = new Document(); 35. Field field = new Field("content", "你好,世界!", Field .Store.YES, 36. Field.Index.ANALYZED, Field.TermVector.WITH_POS ITIONS_OFFSETS); 37. doc.add(field); 38. writer.addDocument(doc); 39. writer.close(); 40. 41. System.out.println("Indexed success!"); 42. 43. // 检索 44. IndexReader reader = IndexReader.open(IDNEX_PATH); 45. QueryParser parser = new QueryParser("content", analyze r); 46. Query query = parser.parse("你好"); 47. Searcher searcher = new IndexSearcher(reader); 48. TopDocCollector collector = new TopDocCollector(5); 49. searcher.search(query, collector); 50. ScoreDoc[] hits = collector.topDocs().scoreDocs; 51. 52. if (collector.getTotalHits() == 0) { 53. System.out.println("hits.length=0"); 54. System.exit(0); 55. } 56. 57. Document doc2 = searcher.doc(hits[0].doc); 58. // 高亮处理
  7. 7. 59. String text = doc2.get("content"); 60. TermPositionVector tpv = (TermPositionVector) reader.ge tTermFreqVector( 61. 0, "content"); 62. TokenStream ts = TokenSources.getTokenStream(tpv); 63. Formatter formatter = new Formatter() { 64. public String highlightTerm(String srcText, TokenGr oup g) { 65. if (g.getTotalScore() <= 0) { 66. return srcText; 67. } 68. return "<b>" + srcText + "</b>"; 69. } 70. }; 71. 72. Highlighter highlighter = new Highlighter(formatter, ne w QueryScorer( 73. query)); 74. String result = highlighter.getBestFragments(ts, text, 5, "…"); 75. System.out.println("result:nt" + result); 76. reader.close(); 77. 78. } 79. 80.} • 02:44 • 浏览 (931) • 评论 (5) • 分类: 检索/P2P/分布式 • 收藏 • 相关推荐 评论
  8. 8. 5 楼 每天看看 2009-04-29 引用 //Field.Index.NOT_ANALYZED 文件名称 建立索引,但不分词 doc.add(new Field("filename", file.getCanonicalPath(),Field.Store.YES, Field.Index.NOT_ANALYZED)); 这样建立索引,是不是我在搜索时可以 String field = "filename"; QueryParser parser = new QueryParser(field, analyzer); query = parser.parse(filePath); 这样搜索,结果应该有一条记录。 4 楼 zhyt710 2009-02-07 引用 当然不是,你完全可以逐个遍历 int numTotalHits = collector.getTotalHits(); 个符合条件的结果。但是对于大数据量的系统,一 般出现的结果数量非常的巨大,所以采取分页显示的方式。 就是说把打印结果放 在一个 while 循环里,每次循环都让 start 加一页数量的大小,只要 start 的 值小于 numTotalHits。 从而实现分页显示。 我在这里是为了尽可能的简化程序, 使读者更容易入门 xuganggogo 写道 Deprecated. Hits will be removed in Lucene 3.0.Instead e. g. TopDocCollector and TopDocs can be used:&nbsp;&nbsp; TopDocCollector collector = new TopDocCollector(hitsPerPage);&nbsp;&nbsp; searcher.search(query, collector);&nbsp;&nbsp; ScoreDoc[] hits = collector.topDocs().scoreDocs;&nbsp;&nbsp; for (int i = 0; i &lt; hits.length; i++) {&nbsp;&nbsp;&nbsp;&nbsp; int docId = hits[i].doc;&nbsp;&nbsp;&nbsp;&nbsp; Document d = searcher.doc(docId);&nbsp;&nbsp;&nbsp;&nbsp; // do something with current hit 这里代替 hits 的方法,只能对 hitsPerPage 个结果进行操作。 难道 hitsPerPage 是写死的吗?如果要遍历所有结果集,该如何做? 3 楼 xuganggogo 2009-02-05 引用 Deprecated. Hits will be removed in Lucene 3.0. Instead e. g. TopDocCollector and TopDocs can be used: TopDocCollector collector = new TopDocCollector(hitsPerPage); searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; for (int i = 0; i < hits.length; i++) { int docId = hits[i].doc; Document d = searcher.doc(docId);
  9. 9. // do something with current hit 这里代替 hits 的方法,只能对 hitsPerPage 个结果进行操作。 难道 hitsPerPage 是写死的吗? 如果要遍历所有结果集,该如何做? 2 楼 zhyt710 2009-02-04 引用 并为了适应版 xuganggogo 写道 这个能用作索引 HTML 吗? 当然可以,只要是文本文件都可以。 对于不是纯文本文件的如 pdf,excel 等,只 要进行相应的解析成文本后,便可建立索引 1 楼 xuganggogo 2009-02-04 引用 这个能用作索引 HTML 吗?

×