当前位置 : 主页 > 编程语言 > java >

CS-LuceneIndexer

来源:互联网 收集:自由互联 发布时间:2021-06-28
LuceneIndexer.java:使用 Lucene 为指定目录下的(反编译)源码文件建立或更新全文索引的工具类,完整代码见下。
LuceneIndexer.java
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package cn.huawei.com.CompressedSeacher.util.lucene;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Builds or updates a Lucene index over the files found under a document
 * directory (or over a single file).
 *
 * <p>Each indexed file becomes one Lucene document with three fields:
 * <ul>
 *   <li>{@code contents} - the tokenized file text, read as UTF-8 (not stored);</li>
 *   <li>{@code path} - the file path as a single un-tokenized term (stored);</li>
 *   <li>{@code title} - a jar name derived from the file path (stored).</li>
 * </ul>
 * Files whose name ends with {@code .class} are skipped.
 *
 * <p>Progress is exposed through the public static {@link #index} counter and
 * the {@link #finished} flag.
 *
 * @author l00358914
 */
public class LuceneIndexer {
    private static final Logger LOGGER = Logger.getLogger(LuceneIndexer.class.getName());

    /** Number of files that have been handed to the index writer successfully. */
    public static AtomicInteger index = new AtomicInteger();

    /** Index location used by {@link #main(String[])}. */
    private static final String DEFAULT_INDEX_PATH = "D:\\Lucene-600-index-data_directory";

    /** Document root used by {@link #main(String[])}. */
    private static final String DEFAULT_DOCS_PATH = "D:\\decompileFolder";

    /**
     * Position, in the backslash-separated file path, of the component used as
     * the document title (for the default document root this is the first
     * directory below it).
     */
    private static final int TITLE_COMPONENT_INDEX = 2;

    private static final LuceneIndexer instance = new LuceneIndexer();

    /** Set to {@code true} once an indexing run has ended, successfully or not. */
    public static volatile boolean finished = false;

    private LuceneIndexer() {
    }

    /**
     * Returns the shared instance. All functionality is static; this accessor
     * is kept for existing callers.
     *
     * @return the singleton instance
     */
    public static LuceneIndexer getInstance() {
        return instance;
    }

    /**
     * Updates the default index from the default document root.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        prepareIndexDocs(DEFAULT_DOCS_PATH, false, DEFAULT_INDEX_PATH);
    }

    /**
     * Indexes everything under {@code docsPath} into the index at {@code indexPath}.
     *
     * <p>Does nothing when {@code docsPath} is null or empty. Terminates the JVM
     * with status 1 when {@code docsPath} is not readable. I/O failures while
     * opening, writing or committing the index are reported on standard output
     * and not rethrown.
     *
     * @param docsPath  file or directory to index
     * @param create    {@code true} to replace any existing index, {@code false}
     *                  to create it if missing and otherwise update it
     * @param indexPath directory that holds the Lucene index
     */
    public static void prepareIndexDocs(String docsPath, boolean create, String indexPath) {
        if (docsPath == null || docsPath.isEmpty()) {
            return;
        }
        Path docDir = Paths.get(docsPath);
        if (!Files.isReadable(docDir)) {
            System.out.println("Document directory '" + docDir.toAbsolutePath() + "' does not exist or is not readable, please check the path");
            // NOTE(review): exiting the JVM from a utility method is kept only for
            // backward compatibility with existing callers.
            System.exit(1);
        }

        System.out.println("Indexing to directory '" + indexPath + "'...");
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        iwc.setOpenMode(create
                ? IndexWriterConfig.OpenMode.CREATE
                : IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

        // FSDirectory.open picks a suitable file-system implementation itself.
        // try-with-resources releases the writer (and its write lock) and the
        // directory even when indexing fails part-way.
        try (Directory dir = FSDirectory.open(Paths.get(indexPath));
                IndexWriter writer = new IndexWriter(dir, iwc)) {
            System.out.println("准备更新Lucene索引数据,请稍后.................");
            indexDocs(writer, docDir);
            writer.commit();
        } catch (IOException e) {
            System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
        } finally {
            // Signal completion only after the writer has been closed, and also
            // on failure so that code polling this flag does not wait forever.
            finished = true;
        }
    }

    /**
     * Indexes {@code file}: walks the tree when it is a directory, otherwise
     * indexes the single file. Prints the running total afterwards.
     *
     * @param writer open index writer
     * @param file   file or directory to index
     * @throws IOException if walking the directory tree fails
     */
    private static void indexDocs(final IndexWriter writer, Path file)
            throws IOException {
        if (Files.isDirectory(file)) {
            Files.walkFileTree(file, new FindFileVisitor(writer));
        } else {
            indexDoc(writer, file);
        }
        System.out.println("Total file indexed:" + index.intValue());
    }

    /**
     * Adds or updates the Lucene document for one file. Indexing is best-effort:
     * a failure on this file is logged and does not stop the caller.
     *
     * @param writer open index writer
     * @param file   file to index; ignored when its name ends with {@code .class}
     * @throws IOException declared for callers; failures are currently logged
     *                     rather than propagated
     */
    private static void indexDoc(IndexWriter writer, Path file)
            throws IOException {
        String path = file.toString();
        if (path.endsWith(".class")) {
            return;
        }
        // The reader is consumed lazily while the writer analyzes the document,
        // so it must stay open until addDocument/updateDocument has returned.
        try (InputStream stream = Files.newInputStream(file);
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(stream, StandardCharsets.UTF_8))) {
            Document doc = new Document();
            // TextField is tokenized; StringField indexes the value as one term.
            doc.add(new TextField("contents", reader));
            doc.add(new StringField("path", path, Store.YES));
            doc.add(new StringField("title", jarTitle(file), Store.YES));

            if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
                // Fresh index: no earlier version of this document can exist.
                System.out.println("Adding...." + path);
                writer.addDocument(doc);
            } else {
                // Replace any document previously indexed for the same path.
                writer.updateDocument(new Term("path", path), doc);
            }
            // Count only files that actually reached the index.
            index.getAndIncrement();
        } catch (IOException | RuntimeException ex) {
            LOGGER.log(Level.SEVERE, "Failed to index " + path, ex);
        }
    }

    /**
     * Derives the {@code title} field for a file: the path component at
     * {@link #TITLE_COMPONENT_INDEX} of the backslash-separated path, with the
     * literal text {@code .src} replaced by {@code .jar}. Falls back to the last
     * component when the path is shallower than that.
     *
     * @param file file being indexed
     * @return the title value
     */
    private static String jarTitle(Path file) {
        String path = file.toString();
        String[] parts = path.split("\\\\");
        String component;
        if (parts.length > TITLE_COMPONENT_INDEX) {
            component = parts[TITLE_COMPONENT_INDEX];
        } else if (parts.length > 0) {
            component = parts[parts.length - 1];
        } else {
            component = path;
        }
        // Literal replacement: in a regex, ".src" would also match e.g. "mysrc".
        return component.replace(".src", ".jar");
    }

    /** File visitor that indexes every regular file it encounters. */
    private static class FindFileVisitor extends SimpleFileVisitor<Path> {
        private final IndexWriter writer;

        public FindFileVisitor(IndexWriter writer) {
            this.writer = writer;
        }

        @Override
        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
            try {
                indexDoc(writer, file);
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, "Failed to index " + file, ex);
            }
            return FileVisitResult.CONTINUE;
        }

        @Override
        public FileVisitResult visitFileFailed(Path file, IOException exc) {
            // An entry that cannot be read is skipped instead of aborting the walk,
            // in line with the per-file best-effort handling in visitFile.
            LOGGER.log(Level.SEVERE, "Cannot access " + file, exc);
            return FileVisitResult.CONTINUE;
        }
    }
}
 
上一篇:java web 二维码生成
下一篇:Md5util
网友评论