在我们的广告引擎中,第一步主要用到了倒排索引(Inverted Index),第二步主要用到了正排索引(在我们的系统中这部分叫做Profile)。本文主要对广告引擎中的倒排索引进行介绍,正排索引的介绍将留到下一篇文章。
广告引擎的索引结构由两部分组成:索引元数据信息(Index Metadata)和索引字段信息(Index Field)。
/**
** @brief Complete set of inverted-index configuration: one configuration
**        pointer per index field plus the number of fields.
*/
struct SIndexInfo {
SIndexFieldInfo *_pFieldInfos[MAX_INDEX_FIELD_NUM]; // array of per-field configuration pointers
int32_t _nFieldNum; // number of fields
// NOTE(review): undocumented in the original; presumably records whether any
// field uses a "classic" index type -- confirm against the code that sets it.
int32_t _nHasClassicField;
};
/**
** @brief Configuration of a single inverted-index field.
*/
struct SIndexFieldInfo {
char _szFieldName[MAX_FIELD_NAME_LEN]; // field name
char _szCompress[MAX_FIELD_NAME_LEN]; // name of the compression method
INDEX_TYPE _eIndexType; // inverted-index type
uint32_t _nMaxDocCount; // maximum posting-list length; 0 means unlimited
// NOTE(review): undocumented in the original; presumably the position of this
// field among the index fields -- confirm.
int32_t _nIndexFieldIdx;
FIELD_SPREAD_TYPE _nSpreadType; // spread (flattening) type
char _szSourceTableName[MAX_TABLE_NAME_LEN]; // name of the source table
struct { // location of the source field used for spreading
int32_t _nSourceTableIdx;
int32_t _nSourcePackageIdx;
int32_t _nSourceFieldIdx;
};
// Payload field configurations attached to this index field, and how many of
// the array slots are in use.
SPayloadFieldInfo *_pPayloadFieldInfos[MAX_PAYLOAD_FIELD_NUM];
int32_t _nPayloadFieldNum;
};
/**
** @brief Configuration of a single payload field.
**
** NOTE(review): the members are undocumented in the original; the remarks
** below are inferred from the names and types only -- confirm before relying
** on them.
*/
struct SPayloadFieldInfo {
char _szFieldName[MAX_FIELD_NAME_LEN]; // payload field name
FIELD_STORE_TYPE _eStoreType; // presumably how the payload value is stored
int32_t _nFieldBitCount; // presumably the width of the stored value in bits
int32_t _nFieldIdx; // presumably this payload field's own index
int32_t _nIndexFieldIdx; // presumably the index of the owning index field
bool _bKeyField; // presumably marks the field as a key field
};
/**
** @brief Index resource information: one resource pointer per index field.
*/
struct SIndexResource {
SIndexFieldResource *_pFieldResource[MAX_INDEX_FIELD_NUM]; // per-field resource pointers
int32_t _count; // NOTE(review): presumably the number of slots in use -- confirm
};
/**
** @brief index field资源信息
*/
struct SIndexFieldResource {
util::CMMapMempoolInterface *_pIdxFile; // 一级索引文件句柄
util::CMMapMempoolInterface *_pInvertListFile; // 二级索引文件句柄
util::CMMapMempoolInterface
*_pHashFilePtr; // payload string字段的去重表(hash表部分)
util::CMMapMempoolInterface
*_pDataFilePtr; // payload string字段的去重表(data部分)
SIndexFieldInfo *_pFieldInfo; //字段倒排配置信息
SGlobalInfo *_pGlobalConf; //全局配置信息
SIndexFieldResource() {
_pIdxFile = NULL;
_pInvertListFile = NULL;
_pHashFilePtr = NULL;
_pDataFilePtr = NULL;
_pFieldInfo = NULL;
_pGlobalConf = NULL;
}
~SIndexFieldResource() {}
};
// First-level inverted-index entry: one record per token, holding the token
// signature and the size/location of its second-level posting list.
struct SIdx1Unit {
uint64_t sign; // token signature
// The two bit-fields below add up to 26 + 38 = 64 bits and are overlaid with
// numOffset so that both can be read or written as one 64-bit value.
union {
struct {
// Number of docs; a 26-bit unsigned field holds at most 2^26 - 1 = 67,108,863.
// NOTE(review): the original comment claimed "at most 130 million", which
// does not match a 26-bit width -- confirm the intended limit.
uint64_t num:26;
// Start offset into the second-level index; declared as a signed 38-bit
// field, i.e. at most 2^37 - 1 when non-negative.
// NOTE(review): the original comment claimed "at most 64G"; the unit of the
// offset is not visible here -- confirm.
int64_t beginOffset:38;
};
uint64_t numOffset;
};
};
// Basic second-level inverted-index entry: a bare document id.
struct SIdx2Unit {
  uint32_t docId;  // document id
};
// Second-level inverted-index entry that also carries an occurrence position.
struct SIdx2UnitOcc : public SIdx2Unit {
  uint16_t occ;  // position (occ) of the token inside the document
};
// Pairs an index field with a document count.
// NOTE(review): undocumented in the original; judging by the name it is used
// while merging index fields -- confirm how nDocCount is computed and used.
struct SIndexFieldMergeInfo {
uint64_t nDocCount; // document count associated with pIndexField
CIndexField *pIndexField; // the index field this entry refers to
};
// Per-field inverted-index manager; internal base class.
class CIndexField {
protected:
// Delete-map manager.
CDeleteMap *_pDelMap;
// First-level index: a hash table of SIdx1Unit entries and its file handle.
util::CHashTable<SIdx1Unit> *_pIdx;
util::CMMapMempoolInterface *_pIdxFile;
// Second-level index file handle.
util::CMMapMempoolInterface *_pInvertListFile;
};
// Abstract interface for reading the posting list of one term.
class CIndexTerm {
 public:
  // Virtual so that a concrete term object can be destroyed safely through a
  // CIndexTerm pointer.
  virtual ~CIndexTerm() {}

  // Returns the length of the posting list.
  virtual uint32_t getDocNum() = 0;

  // Fetches the posting list into pDocList / pPayloadList, filtered through
  // the delete map.
  virtual int32_t getDocList(uint32_t *pDocList, char **pPayloadList) = 0;

  // Returns the address of the posting list.
  virtual const void *getDocList() = 0;

  // Returns the first doc id that is equal to or greater than nDocId; used by
  // the outer merge. This default implementation always returns INVALID_DOCID
  // and leaves pPayload untouched.
  virtual uint32_t seek(uint32_t nDocId, char *&pPayload) { return INVALID_DOCID; }

  // Sets the delete-map manager.
  void setDeleteMap(CDeleteMap *pDelMap);

 protected:
  // NOTE(review): the members below are undocumented in the original; the
  // remarks are inferred from names and types -- confirm.
  CDeleteMap *_pDelMap;      // delete-map manager (see setDeleteMap)
  uint32_t _nDocNum;         // presumably the posting-list length
  const char *_pInvertList;  // presumably the start of the posting list
  int32_t _nDocUnitLength;   // presumably the byte size of one posting entry
  uint32_t _pos;             // presumably the current read position
};
<doc>
adgroupId=100083699
catId=11,110207
defPrice=30
expire=2114380800
goodsPrice=120.00
goodsId=8248681432
campaignId=3481560
custId=1103154803
price=0
modTime=1334573076
catStatus=1
nonsearchMaxPrice=0
propertyId=21511,21943,120173,21940,32999,65235,65262,65266,21517,65256,65268,21385,21456
adgExtension=ordinaryPostFee:1200;isCommend:1;transitFee:10.00;vipDiscountRate:goldCard~100$platinaCard~100$diamondCard~100;location:北京;isNew:1;isPostFree:0;isSupportVip:0;spuId:44086;skuPrice:
adgTitle=全新库存希捷硬盘120G台式机IDE接口1年包换 元送3件礼品
adgTags=0
sell=0
sell1=0
sell7=0
score=0
adgroupStatus=1
doPrice=30
rankScore=0
postage=10.00
location=北京
locationId=19
catIdIdx=11 110207
catPropIds=110207
keywords=价格便宜11907686613,0,4,319163397一年质保11907686617,0,4,252054788硬盘11907686623,0,4,1645298193120G11907686625,0,4,1594966545全新 库存11907686620,0,4,1611743761台式机并口硬盘11907686615,0,4,1074869777希捷11907686628,0,4,1662074897
templateStatus=0
</doc>
<doc>
adgroupId=110083699
catId=11,110207
defPrice=30
expire=2114380800
goodsPrice=120.00
goodsId=8248681432
campaignId=3481560
custId=1103154803
price=0
modTime=1334573076
catStatus=1
nonsearchMaxPrice=0
propertyId=21511,21943,120173,21940,32999,65235,65262,65266,21517,65256,65268,21385,21456
adgExtension=ordinaryPostFee:1200;isCommend:1;transitFee:10.00;vipDiscountRate:goldCard~100$platinaCard~100$diamondCard~100;location:北京;isNew:1;isPostFree:0;isSupportVip:0;spuId:44086;skuPrice:
adgTitle=全新库存希捷硬盘120G台式机IDE接口1年包换 元送3件礼品
adgTags=0
sell=0
sell1=0
sell7=0
score=0
adgroupStatus=1
doPrice=30
rankScore=0
postage=10.00
location=北京
locationId=19
catIdIdx=11 110207
catPropIds=110207
keywords=价格便宜11907686613,0,4,319163397一年质保11907686617,0,4,252054788硬盘11907686623,0,4,1645298193120G11907686625,0,4,1594966545全新 库存11907686620,0,4,1611743761台式机并口硬盘11907686615,0,4,1074869777希捷11907686628,0,4,1662074897
templateStatus=0
</doc>
// Inverted index: for every configured index field, append its payload fields
// and then the index field itself to _pFields. Each name must be present in
// `record`; a missing name or a failed value conversion aborts with -1.
if (_pIndexInfo) {
SIndexFieldInfo *pIndexFieldInfo = NULL;
SPayloadFieldInfo *pPayloadFieldInfo = NULL;
for (nIndexFieldIdx = 0; nIndexFieldIdx < _pIndexInfo->_nFieldNum;
++nIndexFieldIdx) {
pIndexFieldInfo = _pIndexInfo->_pFieldInfos[nIndexFieldIdx];
// Payload fields of this index field come first.
for (nPayloadFieldIdx = 0;
nPayloadFieldIdx < pIndexFieldInfo->_nPayloadFieldNum;
++nPayloadFieldIdx) {
pPayloadFieldInfo =
pIndexFieldInfo->_pPayloadFieldInfos[nPayloadFieldIdx];
// Look the payload field up by name in the input record.
it = record.find(string(pPayloadFieldInfo->_szFieldName));
if (it == record.end()) {
LOG_ERROR("Field %s is required.", pPayloadFieldInfo->_szFieldName);
return -1;
}
_pFields[nFieldPtr].szName = pPayloadFieldInfo->_szFieldName;
// Convert the raw record text into the field's value list; the call also
// fills in nValueCount and is handed pPool.
_pFields[nFieldPtr].szValues = getIndexFieldValue(it->second.c_str(),
&_pFields[nFieldPtr].nValueCount,
pPool);
if (!_pFields[nFieldPtr].szValues) {
LOG_ERROR("getIndexFieldValue() failed.");
return -1;
}
++nFieldPtr;
}
// Then the index field itself, handled the same way.
it = record.find(string(pIndexFieldInfo->_szFieldName));
if (it == record.end()) {
LOG_ERROR("Field %s is required.", pIndexFieldInfo->_szFieldName);
return -1;
}
_pFields[nFieldPtr].szName = pIndexFieldInfo->_szFieldName;
_pFields[nFieldPtr].szValues = getIndexFieldValue(it->second.c_str(),
&_pFields[nFieldPtr].nValueCount,
pPool);
if (!_pFields[nFieldPtr].szValues) {
LOG_ERROR("getIndexFieldValue() failed.");
return -1;
}
++nFieldPtr;
}
}
// Check that the number of collected fields matches the expected total
// (_nTableFieldNum + _nIndexFieldNum); abort with -1 otherwise.
if (nFieldPtr != _nTableFieldNum + _nIndexFieldNum) {
LOG_ERROR("%d fields parsed, but %d fields required.",
nFieldPtr,
_nTableFieldNum + _nIndexFieldNum);
return -1;
}
// De-duplicate: sort the collected fields with fieldCmp (presumably ordering
// by szName, so that entries with the same name become adjacent -- confirm),
// then drop every entry whose name equals that of its predecessor.
qsort(_pFields, nFieldPtr, sizeof(update::SField), fieldCmp);
for (int i = 1; i < nFieldPtr;) {
  if (strcmp(_pFields[i].szName, _pFields[i - 1].szName) == 0) {
    // Same field name as the previous entry: close the gap by shifting the
    // tail down one slot. One memmove (the ranges overlap) produces the same
    // array contents as copying the elements one at a time.
    const int nTail = nFieldPtr - 1 - i;
    if (nTail > 0) {
      memmove(&_pFields[i], &_pFields[i + 1],
              static_cast<size_t>(nTail) * sizeof(update::SField));
    }
    --nFieldPtr;
  } else {
    ++i;
  }
}
// Hand the de-duplicated field list over to the index updater; any non-zero
// return value is treated as failure.
ret = _indexUpdate.add(_pTableInfo->_szTableName, _pFields, nFieldPtr, pPool);
if (ret != 0) {
LOG_ERROR("IndexUpdate::add() failed.");
return -1;
}
原文:https://www.cnblogs.com/diegodu/p/9073123.html