// LuceneIndexer.java
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package cn.huawei.com.CompressedSeacher.util.lucene;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Builds or updates a Lucene index over a file or a directory tree of files.
 *
 * <p>Every indexed file becomes one Lucene document with three fields:
 * <ul>
 *   <li>{@code contents} - the file text, decoded as UTF-8 and tokenized;</li>
 *   <li>{@code path} - the file path as a single, stored, untokenized term;</li>
 *   <li>{@code title} - a path segment with {@code .src} replaced by {@code .jar},
 *       stored as a single untokenized term.</li>
 * </ul>
 * Files whose path ends with {@code .class} are skipped.
 *
 * <p>Progress is published through the static {@link #index} counter and the
 * {@link #finished} flag.
 *
 * @author l00358914
 */
public class LuceneIndexer {

    /** Number of files that have been written to the index so far. */
    public static AtomicInteger index = new AtomicInteger();

    /** Becomes {@code true} once an indexing pass has walked its input without throwing. */
    public static volatile boolean finished = false;

    private static final Logger LOGGER = Logger.getLogger(LuceneIndexer.class.getName());

    /** Default index location used by {@link #main(String[])}. */
    private static final String INDEX_PATH = "D:\\Lucene-600-index-data_directory";

    /** Default document root used by {@link #main(String[])}. */
    private static final String DOCS_PATH = "D:\\decompileFolder";

    /**
     * Position, among the backslash-separated segments of a file path, of the segment
     * that the {@code title} field is derived from.
     */
    private static final int TITLE_SEGMENT = 2;

    private static final LuceneIndexer instance = new LuceneIndexer();

    private LuceneIndexer() {
    }

    /**
     * Returns the shared instance of this class.
     *
     * @return the singleton instance
     */
    public static LuceneIndexer getInstance() {
        return instance;
    }

    /**
     * Updates the index at the default location from the default document root.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        prepareIndexDocs(DOCS_PATH, false, INDEX_PATH);
    }

    /**
     * Indexes the file or directory tree at {@code docsPath} into the index at {@code indexPath}.
     *
     * <p>Does nothing when {@code docsPath} is {@code null} or empty. Terminates the JVM with
     * exit status 1 when the document path is not readable. An {@link IOException} raised while
     * opening the index or walking the tree is reported on standard output and not rethrown.
     *
     * @param docsPath  file or directory to index
     * @param create    {@code true} to open the index in {@code CREATE} mode, {@code false} to
     *                  open it in {@code CREATE_OR_APPEND} mode
     * @param indexPath directory that holds the Lucene index
     */
    public static void prepareIndexDocs(String docsPath, boolean create, String indexPath) {
        if (docsPath == null || docsPath.isEmpty()) {
            return;
        }
        Path docDir = Paths.get(docsPath);
        if (!Files.isReadable(docDir)) {
            System.out.println("Document directory '" + docDir.toAbsolutePath() + "' does not exist or is not readable, please check the path");
            System.exit(1);
        }

        System.out.println("Indexing to directory '" + indexPath + "'...");
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        iwc.setOpenMode(create
                ? IndexWriterConfig.OpenMode.CREATE
                : IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

        // FSDirectory.open picks a suitable file-system implementation for the platform.
        // try-with-resources closes the writer and the directory even when indexing fails,
        // so the index write lock is not left behind. NOTE(review): with Lucene's default
        // commit-on-close setting, closing after a failure commits whatever was indexed
        // up to that point - confirm this is acceptable for CREATE mode.
        try (Directory dir = FSDirectory.open(Paths.get(indexPath));
                IndexWriter writer = new IndexWriter(dir, iwc)) {
            System.out.println("准备更新Lucene索引数据,请稍后.................");
            indexDocs(writer, docDir);
            writer.commit();
        } catch (IOException e) {
            System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
        }
    }

    /**
     * Indexes {@code file} if it is a regular file, or every file below it if it is a
     * directory, then sets {@link #finished} and prints the running total.
     *
     * @param writer index writer that receives the documents
     * @param file   file or directory to index
     * @throws IOException if walking the directory tree fails
     */
    private static void indexDocs(final IndexWriter writer, Path file) throws IOException {
        if (Files.isDirectory(file)) {
            // Directory: visit every file below it.
            Files.walkFileTree(file, new FindFileVisitor(writer));
        } else {
            // Single file: index it directly.
            indexDoc(writer, file);
        }
        finished = true;
        System.out.println("Total file indexed:" + index.intValue());
    }

    /**
     * Adds one file to the index, or replaces its existing entry.
     *
     * <p>Files whose path ends with {@code .class} are ignored. A failure to read or index the
     * file is logged at {@code SEVERE} level and does not propagate, so one bad file does not
     * abort a directory walk. {@link #index} is incremented only after the document was written.
     *
     * @param writer index writer that receives the document
     * @param file   file to index
     */
    private static void indexDoc(IndexWriter writer, Path file) {
        String path = file.toString();
        if (path.endsWith(".class")) {
            return;
        }
        // The stream has to stay open until the writer has consumed the reader-backed
        // "contents" field, so the write happens inside the try-with-resources body.
        try (InputStream stream = Files.newInputStream(file)) {
            Document doc = new Document();
            // TextField is tokenized; StringField indexes the whole value as a single term.
            doc.add(new TextField("contents",
                    new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));
            doc.add(new StringField("path", path, Store.YES));
            doc.add(new StringField("title", titleOf(file), Store.YES));

            if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
                // Fresh index: no earlier entry can exist, so just add.
                System.out.println("Adding...." + path);
                writer.addDocument(doc);
            } else {
                // Existing index: replace any document previously stored for this path.
                writer.updateDocument(new Term("path", path), doc);
            }
            index.getAndIncrement();
        } catch (IOException | RuntimeException ex) {
            LOGGER.log(Level.SEVERE, "Failed to index " + path, ex);
        }
    }

    /**
     * Derives the {@code title} field value for a file.
     *
     * <p>Uses the path segment at position {@link #TITLE_SEGMENT} when the path is split on
     * backslashes, falling back to the file name when the path has fewer segments (for
     * example on systems that do not use a backslash separator). The literal text
     * {@code .src} in that segment is replaced by {@code .jar}.
     *
     * @param file file being indexed
     * @return the title to store for {@code file}
     */
    private static String titleOf(Path file) {
        String[] segments = file.toString().split("\\\\");
        String segment;
        if (segments.length > TITLE_SEGMENT) {
            segment = segments[TITLE_SEGMENT];
        } else {
            Path name = file.getFileName();
            segment = (name != null) ? name.toString() : file.toString();
        }
        // Literal replace: with replaceAll the '.' would be a regex wildcard.
        return segment.replace(".src", ".jar");
    }

    /** File visitor that hands every visited file to {@link #indexDoc(IndexWriter, Path)}. */
    private static final class FindFileVisitor extends SimpleFileVisitor<Path> {

        private final IndexWriter writer;

        FindFileVisitor(IndexWriter writer) {
            this.writer = writer;
        }

        @Override
        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
            indexDoc(writer, file);
            return FileVisitResult.CONTINUE;
        }
    }
}
