LuceneIndexer.java /* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */package cn.huawei.com.CompressedSeacher.u
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package cn.huawei.com.CompressedSeacher.util.lucene; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.nio.file.FileVisitResult; import java.nio.file.Files; import java.nio.file.LinkOption; import java.nio.file.OpenOption; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.SimpleFileVisitor; import java.nio.file.attribute.BasicFileAttributes; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; /** * ** * * @author l00358914 */ public class LuceneIndexer { public static AtomicInteger index = new AtomicInteger(); private static final String indexPath = "D:\\Lucene-600-index-data_directory"; private static final String docsPath = "D:\\decompileFolder"; private static final LuceneIndexer instance = new LuceneIndexer(); private static final StringBuffer sb = new StringBuffer(); public static volatile boolean finished = false; private LuceneIndexer() { } public static LuceneIndexer getInstance() { return instance; } public static void main(String[] args) { LuceneIndexer.getInstance().prepareIndexDocs(docsPath, false, indexPath); } public static void prepareIndexDocs(String docsPath, boolean create, String indexPath) { if (docsPath == null || docsPath.length() <= 0) { return; } Path docDir = Paths.get(docsPath, new String[0]); if (!Files.isReadable(docDir)) { System.out.println("Document directory '" + docDir.toAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } try { System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(Paths.get(indexPath, new String[0]));//Lucene在文件系统中存储索引的最基本的抽象实现类是BaseDirectory,在文件系统上存储索引文件 //通常情况下,如果索引文件存储在文件系统之上,我们无需自己选择使用FSDirectory的某个实现子类 //只要使用FSDirectory中的open(Path path)方法即可 Analyzer analyzer = new StandardAnalyzer();//标准分析器 IndexWriterConfig iwc = new IndexWriterConfig(analyzer); if (create) { iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); } else { iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); } IndexWriter writer = new IndexWriter(dir, iwc); System.out.println("准备更新Lucene索引数据,请稍后................."); indexDocs(writer, docDir); writer.commit(); writer.close(); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } } /** * ** * * @param writer * @param file * @throws IOException */ private static void indexDocs(final IndexWriter writer, Path file) throws IOException { if (Files.isDirectory(file, new LinkOption[0])) {//如果是目录,那么需要迭代 FindFileVisitor findJavaVisitor = new FindFileVisitor(writer); Files.walkFileTree(file, findJavaVisitor); } else {//如果是单一文件,直接索引 indexDoc(writer, file); } finished = true; System.out.println("Total file indexed:" + index.intValue()); } /** * ** * * @param writer * @param file * @throws IOException */ private static void indexDoc(IndexWriter writer, Path file) throws IOException { if (file.toString().endsWith(".class")) { return; } index.getAndIncrement(); InputStream stream = Files.newInputStream(file, new OpenOption[0]);//读取原始文件数据 Throwable localThrowable3 = null; try { Document doc = new Document();//针对每一个文档 //注意,对于需要分词的内容我们使用TextField,对于像id这样不需要分词的内容我们使用StringField doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); StringField sf = new StringField("path", file.toString(), Store.YES); String[] indexes = file.toString().split("\\\\");//转义斜杠\ StringField jar = new StringField("title", indexes[2].replaceAll(".src", ".jar"), Store.YES);//只索引但不分词,所有的字符串会作为一个整体进行索引,例如通常用于country或id等 doc.add(sf); doc.add(jar);//每一个filed相当于一个文档的一个属性,索引的时候可以按照不同的filed进行搜索 //--------------------------------------------------------- if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {//adding new docs System.out.println("Adding...." + file.toString()); writer.addDocument(doc);//addDoc()方法把文档(译者注:这里的文档是Lucene中的Document类的实例)添加到索引中 } else {//update docs // System.out.println("Updating...."+file.toString()); writer.updateDocument(new Term("path", file.toString()), doc); } } catch (Throwable localThrowable1) { localThrowable3 = localThrowable1; try { throw localThrowable1; } catch (Throwable ex) { Logger.getLogger(LuceneIndexer.class.getName()).log(Level.SEVERE, null, ex); } } finally { if (stream != null) { if (localThrowable3 != null) { try { stream.close(); } catch (Throwable localThrowable2) { localThrowable3.addSuppressed(localThrowable2); } } else { stream.close(); } } } } private static class FindFileVisitor extends SimpleFileVisitor{ IndexWriter writer; public FindFileVisitor(IndexWriter writer) { this.writer = writer; } @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) { try { indexDoc(writer, file); } catch (IOException ex) { Logger.getLogger(LuceneIndexer.class.getName()).log(Level.SEVERE, null, ex); } return FileVisitResult.CONTINUE; } } }