首页 > 其他 > 详细

lucene对文件做简单的索引

时间:2014-06-15 17:49:52      阅读:628      评论:0      收藏:0      [点我收藏+]

标签:lucene建立文件索引

package com.mylucene;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.LucenePackage;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Small Lucene 4.7 demo: reads every regular file in a data directory, indexes
 * each one as an {@link Item}, then runs a keyword search over the new index.
 *
 * <p>{@link Item} properties are mapped to Lucene fields by reflection: each
 * declared instance field {@code foo} is read through {@code getFoo()} when
 * indexing and written back through {@code setFoo(String)} when searching.
 */
public class MyLuceneTest {

    /**
     * Rebuilds the index in {@code directory} from {@code items}.
     *
     * <p>All existing documents are deleted first. Every item becomes one
     * document with one stored text field per non-null property.
     *
     * @param analyzer  analyzer used to tokenize the field values
     * @param directory index location; it is not closed by this method
     * @param items     items to index
     * @return {@code true} if every item was added and the writer closed
     *         cleanly, {@code false} otherwise (the error is printed)
     */
    private boolean buildIndexer(Analyzer analyzer, Directory directory, List<Item> items) {
        IndexWriter iwriter = null;
        boolean success = false;
        try {
            // Configure the index writer.
            iwriter = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_47, analyzer));
            // Start from an empty index so stale documents never survive a rebuild.
            iwriter.deleteAll();
            for (Item item : items) {
                Document doc = new Document();
                for (String fieldName : indexableFieldNames(item.getClass())) {
                    String getMethodName = "get" + toFirstLetterUpperCase(fieldName);
                    Object value = item.getClass().getMethod(getMethodName).invoke(item);
                    if (value == null) {
                        // Lucene rejects null field values; an absent field is read
                        // back as null by searchIndexer, so skipping is lossless.
                        continue;
                    }
                    doc.add(new Field(fieldName, (String) value, TextField.TYPE_STORED));
                }
                iwriter.addDocument(doc);
            }
            success = true;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // The writer is null when its constructor threw.
            if (iwriter != null) {
                try {
                    iwriter.close();
                } catch (IOException e) {
                    // close() is what commits the changes, so a failure here
                    // means the index was not written.
                    e.printStackTrace();
                    success = false;
                }
            }
        }
        return success;
    }

    /**
     * Searches all {@link Item} fields of the index for {@code keyword} and
     * returns at most 10 hits.
     *
     * <p>Note: this method closes {@code directory} before returning, so the
     * caller must not use it afterwards.
     *
     * @param analyzer  analyzer used to parse the query
     * @param directory index location; closed by this method
     * @param keyword   query text in Lucene classic query-parser syntax
     * @return the matching items, or {@code null} if the search failed (the
     *         error is printed)
     */
    public List<Item> searchIndexer(Analyzer analyzer, Directory directory, String keyword) {
        DirectoryReader ireader = null;
        List<Item> result = new ArrayList<Item>();
        try {
            // Open the index to search.
            ireader = DirectoryReader.open(directory);
            IndexSearcher isearcher = new IndexSearcher(ireader);

            // Search across every Item property.
            String[] multiFields = indexableFieldNames(Item.class);
            MultiFieldQueryParser parser = new MultiFieldQueryParser(
                    Version.LUCENE_47, multiFields, analyzer);

            // Parse the keyword and run the query.
            Query query = parser.parse(keyword);
            ScoreDoc[] hits = isearcher.search(query, null, 10).scoreDocs;

            for (ScoreDoc hit : hits) {
                Document hitDoc = isearcher.doc(hit.doc);
                Item item = new Item();
                for (String field : multiFields) {
                    String setMethodName = "set" + toFirstLetterUpperCase(field);
                    item.getClass().getMethod(setMethodName, String.class)
                            .invoke(item, hitDoc.get(field));
                }
                result.add(item);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            // The reader is null when DirectoryReader.open threw.
            if (ireader != null) {
                try {
                    ireader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Closed separately so a reader failure cannot leak the directory.
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return result;
    }

    /**
     * Returns the names of the declared instance fields of {@code type} that
     * take part in indexing. Synthetic and static fields are skipped because
     * they have no bean-style getter/setter pair.
     *
     * @param type class whose declared fields are inspected
     * @return the field names, in the order reported by reflection
     */
    private static String[] indexableFieldNames(Class<?> type) {
        List<String> names = new ArrayList<String>();
        for (java.lang.reflect.Field field : type.getDeclaredFields()) {
            if (field.isSynthetic()
                    || java.lang.reflect.Modifier.isStatic(field.getModifiers())) {
                continue;
            }
            names.add(field.getName());
        }
        return names.toArray(new String[names.size()]);
    }

    /**
     * Upper-cases the first character of {@code str}.
     *
     * <p>The conversion is locale-independent, so accessor names derived from
     * field names are the same on every platform (for example {@code "id"}
     * always becomes {@code "Id"}, even under a Turkish default locale).
     *
     * @param str text to capitalize; may be {@code null}
     * @return {@code str} with its first character upper-cased, or
     *         {@code str} itself when it is {@code null} or empty
     */
    public static String toFirstLetterUpperCase(String str) {
        if (str == null || str.isEmpty()) {
            return str;
        }
        return Character.toUpperCase(str.charAt(0)) + str.substring(1);
    }

    /**
     * Reads the whole content of {@code file} using the platform default
     * charset and always closes the underlying reader.
     *
     * @param file file to read
     * @return the complete text of the file
     * @throws IOException if the file cannot be opened or read
     */
    private static String readFully(File file) throws IOException {
        Reader reader = new FileReader(file);
        try {
            StringBuilder text = new StringBuilder();
            char[] buffer = new char[8192];
            int count;
            // read() may return fewer characters than requested; loop until EOF
            // and append only what was actually read.
            while ((count = reader.read(buffer)) != -1) {
                text.append(buffer, 0, count);
            }
            return text.toString();
        } finally {
            reader.close();
        }
    }

    /**
     * Indexes every regular file under {@code C:/mylucene} into an on-disk
     * index at {@code c:/lucene}, then prints the items matching a fixed
     * keyword.
     *
     * @param args unused
     * @throws Exception if the data directory cannot be listed or a file
     *                   cannot be read
     */
    public static void main(String[] args) throws Exception {
        System.out.println(LucenePackage.get());
        MyLuceneTest demo = new MyLuceneTest();
        // SmartChineseAnalyzer is an alternative for better Chinese word segmentation.
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);

        List<Item> items = new ArrayList<Item>();
        File dataFile = new File("C:/mylucene");
        File[] dataFiles = dataFile.listFiles();
        if (dataFiles == null) {
            // listFiles() returns null when the path is missing or not a directory.
            throw new IOException("Cannot list data directory: " + dataFile);
        }
        for (File dataEntry : dataFiles) {
            if (!dataEntry.isFile()) {
                // Sub-directories cannot be opened with a FileReader.
                continue;
            }
            String str = readFully(dataEntry);
            System.out.println(str);
            items.add(new Item(dataEntry.getCanonicalPath(), dataEntry.getName(), str));
        }

        // Store the index on disk (a RAMDirectory would keep it in memory instead).
        File file = new File("c:/lucene");
        Directory directory = FSDirectory.open(file);
        if (!demo.buildIndexer(analyzer, directory, items)) {
            System.err.println("Failed to build the index; search skipped.");
            directory.close();
            return;
        }
        // searchIndexer closes the directory itself.
        List<Item> result = demo.searchIndexer(analyzer, directory, "中国");
        if (result == null) {
            System.err.println("Search failed.");
            return;
        }

        for (Item item : result) {
            System.out.println(item.toString());
        }
    }
}

package com.mylucene;
/**
 * Mutable bean with three String properties: {@code id}, {@code title} and
 * {@code content}.
 *
 * <p>{@code MyLuceneTest} reflects over the declared fields of this class and
 * calls the matching {@code getXxx()} / {@code setXxx(String)} accessors, so
 * the field names and accessor signatures must stay in sync.
 */
public class Item {

    private String id;
    private String title;
    private String content;

    /** Creates an item whose three properties are all {@code null}. */
    public Item() {
        this(null, null, null);
    }

    /**
     * Creates a fully populated item.
     *
     * @param id      value for the {@code id} property
     * @param title   value for the {@code title} property
     * @param content value for the {@code content} property
     */
    public Item(String id, String title, String content) {
        setId(id);
        setTitle(title);
        setContent(content);
    }

    /** @return the current {@code id} value, possibly {@code null} */
    public final String getId() {
        return this.id;
    }

    /** @param id new {@code id} value */
    public final void setId(String id) {
        this.id = id;
    }

    /** @return the current {@code title} value, possibly {@code null} */
    public final String getTitle() {
        return this.title;
    }

    /** @param title new {@code title} value */
    public final void setTitle(String title) {
        this.title = title;
    }

    /** @return the current {@code content} value, possibly {@code null} */
    public final String getContent() {
        return this.content;
    }

    /** @param content new {@code content} value */
    public final void setContent(String content) {
        this.content = content;
    }

    /**
     * Renders the item as {@code [id=...,title=...,content=...]}; a
     * {@code null} property is printed as the text {@code null}.
     */
    @Override
    public String toString() {
        return "[id=" + id + ",title=" + title + ",content=" + content + "]";
    }
}

这里将文件的三个属性(路径、文件名、内容)抽象成了另一个类 Item 来表示。在以前的版本中,可以用 Reader 读取文件,并在添加索引时直接把 Reader 对象交给索引,不需要先把文件内容全部读出来再进行封装。而这里的做法在文件非常大时,内存可能存不下,从而导致内存不足或数组越界。其实这里应该也可以像以前的版本一样直接对文件建立索引,我相信只是我还没有找到好的解决办法,所以应该多研究一下 4.8 的 API。



lucene对文件做简单的索引,布布扣,bubuko.com

lucene对文件做简单的索引

标签:lucene建立文件索引

原文:http://blog.csdn.net/winnerspring/article/details/30262383

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 bubuko.com 版权所有 鲁ICP备09046678号-4
打开技术之扣,分享程序人生!
             

鲁公网安备 37021202000002号