首页 > 数据库技术 > 详细

利用whoosh对mongoDB的中文文档建立全文检索

时间:2017-01-24 09:51:23      阅读:727      评论:0      收藏:0      [点我收藏+]

1、建立索引

技术分享
#coding=utf-8
from __future__ import unicode_literals
# Module author tag (the quotes were lost when the page was extracted;
# without them this line raises NameError at import time).
__author__ = "zh"
import sys,os
from whoosh.index import create_in,open_dir
from whoosh.fields import *
from jieba.analyse import ChineseAnalyzer
import pymongo
import json
from pymongo.collection import Collection
from pymongo import database

class CreatIndex:
    """Build a Whoosh full-text index from the ``pages`` collection in MongoDB.

    Every document that has no ``indexed`` field is added to the index and
    then flagged with ``indexed: True`` so a later run skips it.
    """

    def __init__(self, host="192.168.229.128", port=27017,
                 index_dir="../whoosh_index"):
        """Connect to MongoDB and remember where the index lives.

        Args:
            host: MongoDB host name or address.
            port: MongoDB port.
            index_dir: Directory holding the Whoosh index files.
        """
        self.index_dir = index_dir
        self.mongoClient = pymongo.MongoClient(host, port)
        self.websdb = pymongo.database.Database(self.mongoClient, "webdb")
        self.pagesCollection = Collection(self.websdb, "pages")

    @staticmethod
    def _clean_publish_time(value):
        """Return *value*, or None when it is missing, empty or a "0" placeholder."""
        if value is None or str(value) in ("", "0"):
            return None
        return value

    def _open_index(self, schema):
        """Open the index in ``self.index_dir``, creating it on first use."""
        from whoosh.filedb.filestore import FileStorage
        storage = FileStorage(self.index_dir)
        if not os.path.exists(self.index_dir):
            os.mkdir(self.index_dir)
            ix = storage.create_index(schema)
            print("建立索引文件!")
            return ix
        return storage.open_index()

    def _add_row(self, writer, row):
        """Add one MongoDB document to *writer*.

        Raises KeyError when the document lacks ``name`` or ``information``.
        """
        publish_time = self._clean_publish_time(row.get("publish_time"))
        location = ""
        if "location" in row:
            location = json.dumps(row["location"])
        writer.add_document(
            # "".join(...) turns a Python 2 byte string into unicode when
            # unicode_literals is active; Whoosh rejects byte strings.
            U_id="".join(str(row["_id"])),
            title=row["name"],
            location="".join(location),
            publish_time=publish_time,
            content=row["information"],
        )

    def BuiltIndex(self):
        """Index all pending documents, then commit and print a summary.

        Processing stops at the first document that fails: it stays
        un-flagged in MongoDB, so skipping it would make ``find_one`` return
        it again forever. Documents added before the failure are still
        committed, because they have already been flagged as indexed.
        """
        analyzer = ChineseAnalyzer()
        # Index schema: title and content are tokenized with jieba.
        schema = Schema(
            U_id=ID(stored=True),
            title=TEXT(stored=True, analyzer=analyzer),
            location=TEXT(stored=True),
            publish_time=DATETIME(stored=True, sortable=True),
            content=TEXT(stored=False, analyzer=analyzer),
        )
        ix = self._open_index(schema)
        writer = ix.writer()
        num = 0
        try:
            while True:
                row = self.pagesCollection.find_one(
                    {"indexed": {"$exists": False}})
                if row is None:
                    print("全部处理完毕")
                    break
                try:
                    self._add_row(writer, row)
                    self.pagesCollection.update_one(
                        {"_id": row["_id"]}, {"$set": {"indexed": True}})
                    num += 1
                    print("%s 已建立索引!" % row["_id"])
                except Exception:
                    print("%s 异常" % row["_id"])
                    break
        except Exception:
            # Failure outside per-row handling (e.g. the MongoDB query itself).
            print("异常")
        finally:
            # Always commit so the index matches the flags written to MongoDB
            # and the writer lock is released.
            writer.commit()
            # NOTE(review): Cursor.count() is deprecated in PyMongo 3.7 and
            # removed in 4.0; switch to count_documents({}) when upgrading.
            print("已处理 %d 共计 %d"
                  % (num, self.pagesCollection.find().count()))

# Script entry point: connect to MongoDB and index every page that has not
# been indexed yet. These statements run at module level, so the indexing
# starts as soon as the file is executed (or imported).
creatindext = CreatIndex()
creatindext.BuiltIndex()
View Code

注:注意编码——源文件需声明 #coding=utf-8,并通过 unicode_literals 保证写入索引的字符串均为 unicode。

2、检索

技术分享
from __future__ import unicode_literals
#coding=utf-8
# Module author tag (the quotes were lost when the page was extracted;
# without them this line raises NameError at import time).
__author__ = "zh"
# from whoosh.qparser import QueryParser
from whoosh import qparser,sorting
# from jieba.analyse import ChineseAnalyzer
from whoosh.index import open_dir
from whoosh.query import *
# import pymongo
import datetime
# from pymongo.collection import Collection
# from pymongo import database

class FullText:
    """Keyword search over a Whoosh index directory."""

    def __init__(self, index_home="whoosh_index"):
        """Open the index in *index_home* and keep one searcher for reuse."""
        self.index_home = index_home
        self.ix = open_dir(self.index_home)
        self.searcher = self.ix.searcher()

    def close(self):
        """Release the searcher's file handles once no more queries are needed."""
        self.searcher.close()

    def Query(self, parameter):
        """Run a keyword search described by the *parameter* dict.

        Recognized keys:
            keys: list of field names to search (required). One name uses a
                single-field parser, several use a multi-field parser.
            keywords: the query string (required).
            page, pagesize: optional; when both are positive the result is
                limited to the first ``page * pagesize`` hits, so the caller
                slices out the requested page.
            includeFields: optional; when it contains ``"location"`` only
                documents whose location field has the term ``coordinates``
                are returned.

        Returns:
            The Whoosh results, sorted by score and then by ``publish_time``
            (newest first), or None when ``keys`` is empty.

        Raises:
            KeyError: if ``keys`` or ``keywords`` is missing.
        """
        fields = parameter["keys"]
        if not fields:
            # No field to search: there is no parser to build.
            return None
        if len(fields) == 1:
            parser = qparser.QueryParser(fields[0], schema=self.ix.schema)
        else:
            parser = qparser.MultifieldParser(fields, schema=self.ix.schema)
        q = parser.parse(parameter["keywords"])

        scores = sorting.ScoreFacet()
        date = sorting.FieldFacet("publish_time", reverse=True)

        # Paged or full result set; the default is everything.
        _limit = None
        if "page" in parameter and "pagesize" in parameter:
            page = parameter["page"]
            pagesize = parameter["pagesize"]
            if page > 0 and pagesize > 0:
                _limit = page * pagesize

        # Optional filter on the location field; the default is no filter.
        allow_q = None
        if ("includeFields" in parameter
                and "location" in parameter["includeFields"]):
            allow_q = qparser.query.Term("location", u"coordinates")

        return self.searcher.search(
            q, limit=_limit, filter=allow_q, sortedby=[scores, date])
# Usage example: build one FullText instance over the default index directory.
# NOTE(review): `fulltext` is not defined or imported in the visible code;
# presumably this line belongs in a caller that does `import fulltext`. Confirm.
fulltext_query = fulltext.FullText()
View Code

注:支持多字段检索、分类、排序等

whoosh参考

利用whoosh对mongoDB的中文文档建立全文检索

原文:http://www.cnblogs.com/Micang/p/6346437.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!