# coding=gbk
"""Read-only reader for XDB database files.

File layout as read by this module (integers use the native byte order and
alignment of the ``struct`` module):

* 32-byte header: tag (3s), version (B), hash base (I), hash prime (I),
  file size (I), check value (f), 12 reserved bytes.
* Hash table starting at offset 32: one 8-byte slot ``(off, len)`` per hash
  bucket, pointing at the root record of that bucket's binary tree.
* Records: a 17-byte head ``(loff, llen, roff, rlen, klen)`` followed by
  ``klen`` key bytes and then the value bytes.
"""
import os
import struct
import sys

if sys.version_info[0] < 3:
    # Legacy Python 2 side effect kept for existing callers: implicit
    # str <-> unicode conversions use GBK.  Python 3 has no equivalent;
    # there, text keys are encoded explicitly in XDB_R.Get().
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('gbk')

XDB_VERSION = 34        # 0x01 ~ 0xff
XDB_TAGNAME = b'XDB'    # first bytes of every file (bytes on Python 2 and 3)
XDB_MAXKLEN = 0xf0      # maximum key length in bytes, must stay < 255

_HEADER_FMT = '3s B I I I f 12s'
_HEADER_SIZE = 32
_SLOT_FMT = 'I I'
_SLOT_SIZE = 8
_NODE_FMT = 'I I I I B'
_NODE_SIZE = 17
_TEXT_TYPE = type(u'')  # unicode on Python 2, str on Python 3


class XDB_R(object):
    """Look up values by key in an XDB file.

    With ``mem=True`` the whole file is loaded into memory by ``Open`` and
    all lookups are served from that buffer; otherwise every lookup seeks
    and reads in the open file.

    Example::

        db = XDB_R(True)
        db.Open('./dict.xdb')
        value = db.Get(u'\\u4e0a\\u6d77')   # text keys are encoded first
        db.Close()
    """

    fd = False          # file object while a database is open, else False
    hash_base = 0       # hash seed taken from the file header
    hash_prime = 0      # number of hash buckets taken from the file header
    memread = None      # file content when memory mode is enabled
    mem = False         # whether memory mode is enabled
    off = 0             # current read offset inside ``memread``
    len = 0             # length of ``memread`` in bytes
    encoding = 'gbk'    # codec applied to text keys passed to Get()

    def __init__(self, mem=False):
        """Create a closed reader; ``mem`` selects memory mode."""
        self.mem = mem

    def __del__(self):
        self.Close()

    def Open(self, fpath):
        """Open and validate the XDB file at ``fpath``.

        Any previously opened database is closed first.

        Returns:
            True on success.

        Raises:
            Exception: if the file cannot be opened or its header is not a
                valid XDB header.  The reader stays closed in that case and
                the file handle is released.
        """
        self.Close()
        try:
            fd = open(fpath, 'rb')
        except IOError:
            raise Exception('XDB::Open("' + os.path.basename(fpath)
                            + '"),invalid xdb failed.')
        try:
            if self._check_header(fd) is False:
                raise Exception('XDB::Open("' + os.path.basename(fpath)
                                + '"),invalid xdb format.')
            if self.mem:
                fd.seek(0, os.SEEK_SET)
                self.memread = fd.read()
                self.len = len(self.memread)
                self.off = 0
                # Everything is in memory now, so the descriptor is not
                # needed any more.  The closed file object is still stored
                # in self.fd below because Get()/Close() use
                # "self.fd is False" as the "not open" marker.
                fd.close()
        except Exception:
            fd.close()
            self.memread = None
            self.len = 0
            raise
        # Only publish the handle once the file is known to be usable.
        self.fd = fd
        return True

    def _read(self, size):
        """Read up to ``size`` bytes at the current position and advance."""
        if self.mem:
            chunk = self.memread[self.off:self.off + size]
            # Advance like a real file read would.
            self.off += len(chunk)
            return chunk
        return self.fd.read(size)

    def _seek(self, seek, flag=os.SEEK_SET):
        """Move the read position to ``seek``.

        In memory mode ``seek`` is always an absolute offset and ``flag`` is
        ignored; in file mode both are passed to ``file.seek``.

        Raises:
            Exception: in memory mode, if ``seek`` lies outside the buffer.
        """
        if self.mem:
            # Validate the requested offset, not the previous one.
            if seek < 0 or seek > self.len:
                raise Exception('Mem offset !')
            self.off = seek
        else:
            self.fd.seek(seek, flag)

    def _close(self):
        """Release the file handle and the memory buffer."""
        fd = self.fd
        self.fd = False
        self.memread = None
        self.off = 0
        self.len = 0
        # Closing an already closed file object is a no-op.
        if hasattr(fd, 'close'):
            fd.close()

    def Get(self, key):
        """Return the value stored under ``key``.

        Args:
            key: the key as bytes, or as text which is encoded with
                ``self.encoding`` before the lookup.

        Returns:
            The value bytes, or False when the key is empty, longer than
            XDB_MAXKLEN bytes, not encodable, missing, or has an empty value.

        Raises:
            Exception: if no database is open.
        """
        if self.fd is False:
            raise Exception('XDB:Get(), null db handler.')
        if isinstance(key, _TEXT_TYPE):
            try:
                key = key.encode(self.encoding)
            except UnicodeEncodeError:
                # A key that cannot be expressed in the lookup encoding
                # cannot match any stored key.
                return False
        klen = len(key)
        if klen == 0 or klen > XDB_MAXKLEN:
            return False
        rec = self._get_record(key)
        if 'vlen' not in rec or rec['vlen'] == 0:
            return False
        return rec['value']

    def Close(self):
        """Close the database; does nothing when it is already closed."""
        if self.fd is False:
            return
        self._close()

    def _get_index(self, key):
        """Return the hash bucket index of the byte string ``key``."""
        h = self.hash_base
        # bytearray yields integer byte values on both Python 2 and 3;
        # the hash consumes the key from its last byte to its first.
        for byte in reversed(bytearray(key)):
            h += (h << 5)
            h ^= byte
            h &= 0x7fffffff
        return h % self.hash_prime

    def _check_header(self, fd):
        """Validate the 32-byte header of ``fd`` and load the hash settings.

        Returns:
            True when the header is valid, False otherwise.  On success
            ``hash_base``, ``hash_prime``, ``version`` and ``fsize`` are set.
        """
        fd.seek(0, os.SEEK_SET)
        buf = fd.read(_HEADER_SIZE)
        if len(buf) != _HEADER_SIZE:
            return False
        tag, ver, base, prime, fsize, _check, _reserved = struct.unpack(
            _HEADER_FMT, buf)
        if tag != XDB_TAGNAME:
            return False
        # The header records the total file size; a mismatch means the file
        # is truncated or otherwise not what the header describes.
        if os.fstat(fd.fileno()).st_size != fsize:
            return False
        self.hash_base = base
        self.hash_prime = prime
        self.version = ver
        self.fsize = fsize
        return True

    def _get_record(self, key):
        """Locate the record for the byte string ``key``.

        Returns the dict produced by ``_tree_get_record``.
        """
        self._io_times = 1
        index = self._get_index(key) if self.hash_prime > 1 else 0
        poff = index * _SLOT_SIZE + _HEADER_SIZE
        self._seek(poff, os.SEEK_SET)
        buf = self._read(_SLOT_SIZE)
        if len(buf) == _SLOT_SIZE:
            root_off, root_len = struct.unpack(_SLOT_FMT, buf)
        else:
            root_off, root_len = 0, 0
        return self._tree_get_record(root_off, root_len, poff, key)

    def _tree_get_record(self, off, len, poff=0, key=''):
        """Search the binary tree rooted at the record ``(off, len)``.

        Args:
            off: file offset of the record to inspect.
            len: total length of that record; 0 means "no record".
            poff: file offset of the pointer that referenced this record.
            key: key bytes to look for; an empty key matches the first
                record inspected.

        Returns:
            ``{'poff': poff}`` when the key is absent, otherwise a dict with
            the record fields ``loff, llen, roff, rlen, klen, poff, off,
            len, voff, vlen, key, value``.
        """
        if len == 0:
            return {'poff': poff}
        self._io_times += 1
        self._seek(off, os.SEEK_SET)
        # Head plus the longest possible key is enough to compare keys.
        buf = self._read(min(XDB_MAXKLEN + _NODE_SIZE, len))
        loff, llen, roff, rlen, klen = struct.unpack(
            _NODE_FMT, buf[0:_NODE_SIZE])
        fkey = buf[_NODE_SIZE:_NODE_SIZE + klen]
        if key and key > fkey:
            # The right-child pointer is stored 8 bytes into the record.
            return self._tree_get_record(roff, rlen, off + 8, key)
        if key and key < fkey:
            return self._tree_get_record(loff, llen, off, key)
        rec = {
            'loff': loff,
            'llen': llen,
            'roff': roff,
            'rlen': rlen,
            'klen': klen,
            'poff': poff,
            'off': off,
            'len': len,
            'voff': off + _NODE_SIZE + klen,
            'vlen': len - _NODE_SIZE - klen,
            'key': fkey,
        }
        self._seek(rec['voff'], os.SEEK_SET)
        rec['value'] = self._read(rec['vlen'])
        return rec
本文章来自源码世界 http://www.ymsky.net/views/65091.shtml
pyscws4 是一个python的分词程序,布布扣,bubuko.com
原文:http://blog.csdn.net/long7181226/article/details/22730273