
Recommendation Algorithms: CIKM-2019-AnalytiCup Champion Source Code Walkthrough, Part 2


Lately I have been hunting for optimization methods and data sources that combine machine learning with recommendation algorithms, so, in the spirit of learning, I am continuing to read through the CIKM-2019 AnalytiCup champion source code.

Part 1, covering itemcf, is here: https://www.cnblogs.com/missouter/p/12701875.html

The second and third parts of the code deal mainly with feature extraction and ranking, which this post walks through.

1. generate_static_features.ipynb, a title that says it plainly: extracting static features

import pandas as pd
import numpy as np

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2  # bytes -> MB
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # downcast integers to the narrowest type that holds the observed range
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # same idea for floats
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

def load_data(path):
    user = reduce_mem_usage(pd.read_csv(path + 'user.csv', header=None))
    item = reduce_mem_usage(pd.read_csv(path + 'item.csv', header=None))
    data = pd.read_csv(path + 'user_behavior.csv', header=None)

    data.columns = ['userID', 'itemID', 'behavior', 'timestamp']
    data['day'] = data['timestamp'] // 86400
    data['hour'] = data['timestamp'] // 3600 % 24

    ## one-hot encode the behavior types
    for i in ['pv', 'fav', 'cart', 'buy']:
        data[i] = 0
        data.loc[data['behavior'] == i, i] = 1

    ## weight the behavior types, then decay by recency
    data['day_hour'] = data['day'] + data['hour'] / float(24)
    data.loc[data['behavior'] == 'pv', 'behavior'] = 1
    data.loc[data['behavior'] == 'fav', 'behavior'] = 2
    data.loc[data['behavior'] == 'cart', 'behavior'] = 3
    data.loc[data['behavior'] == 'buy', 'behavior'] = 1
    max_day = max(data['day'])
    min_day = min(data['day'])
    data['behavior'] = (1 - (max_day - data['day_hour'] + 2) / (max_day - min_day + 2)) * data['behavior']

    item.columns = ['itemID', 'category', 'shop', 'brand']
    user.columns = ['userID', 'sex', 'age', 'ability']

    data = reduce_mem_usage(data)

    data = pd.merge(left=data, right=item, on='itemID', how='left')
    data = pd.merge(left=data, right=user, on='userID', how='left')

    return user, item, data
    

Memory-friendly loading is well-trodden ground by now. Along the way, load_data() converts each behavior type into a weight: pv, fav and cart get 1, 2 and 3, while buy is deliberately assigned only 1, the point being not to keep recommending items a customer has already bought. The weight is then decayed according to how long before the end of the log the event happened.
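
To make the decay concrete, here is a minimal sketch with made-up numbers (not from the original repo) of the weight a 'cart' event ends up with depending on when it happened:

# assume a 16-day log (min_day = 0, max_day = 15)
max_day, min_day = 15, 0
base_weight = 3                # 'cart'
day_hour = 14.5                # event at noon on day 14
decay = 1 - (max_day - day_hour + 2) / (max_day - min_day + 2)
print(decay * base_weight)     # ~2.56; the same event at noon on day 0 would score ~0.09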

The main part of the notebook:

path = '../ECommAI_EUIR_round2_train_20190816/'

user, item, data = load_data(path=path)

for count_feature in ['itemID', 'shop', 'category', 'brand']:
    data[['behavior', count_feature]].groupby(count_feature, as_index=False).agg(
        {'behavior': 'count'}).rename(columns={'behavior': count_feature + '_count'}).to_csv(str(count_feature) + '_count.csv', index=False)

for count_feature in ['itemID', 'shop', 'category', 'brand']:
    data[['behavior', count_feature]].groupby(count_feature, as_index=False).agg(
        {'behavior': 'sum'}).rename(columns={'behavior': count_feature + '_sum'}).to_csv(str(count_feature) + '_sum.csv', index=False)

With the path set, features are extracted for itemID, shop, category and brand: groupby().agg() computes, per entity, the number of behavior events (agg parameter 'count') and the sum of their behavior weights (agg parameter 'sum'), and each result is written to its own csv file.
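
A toy illustration with hypothetical data of the difference: 'count' is raw popularity in events, while 'sum' is popularity weighted by behavior type and recency.

import pandas as pd

toy = pd.DataFrame({'itemID': [1, 1, 2], 'behavior': [0.8, 2.4, 0.3]})
print(toy.groupby('itemID', as_index=False).agg({'behavior': 'count'}))  # item 1: 2 events, item 2: 1
print(toy.groupby('itemID', as_index=False).agg({'behavior': 'sum'}))    # item 1: 3.2, item 2: 0.3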

temp = data[['behavior', 'category']].groupby('category', as_index=False).agg({'behavior': ['median', 'std', 'skew']})
temp.columns = ['category', 'category_median', 'category_std', 'category_skew']

temp.to_csv('category_higher.csv', index=False)

temp = data[['behavior', 'itemID']].groupby('itemID', as_index=False).agg({'behavior': ['median', 'std', 'skew']})
temp.columns = ['itemID', 'itemID_median', 'itemID_std', 'itemID_skew']

temp.to_csv('itemID_higher.csv', index=False)

The code above uses groupby().agg() to extract the median, standard deviation and skew of the behavior weights for each individual category and each individual itemID.
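
A small pandas aside: agg() with a list of functions returns MultiIndex columns such as ('behavior', 'median'), which the assignments above flatten by hand. A generic alternative (it would produce names like behavior_median rather than the author's category_median) is:

temp.columns = ['_'.join(col).strip('_') for col in temp.columns]  # apply right after agg()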

data['age'] = data['age'] // 10
train = data[data['day'] < 15]

for count_feature in ['sex', 'ability', 'age']:
    data[['behavior', 'itemID', count_feature]].groupby(['itemID', count_feature], as_index=False).agg(
        {'behavior': 'count'}).rename(columns={'behavior': 'user_to_'
                                               + count_feature + '_count'}).to_csv('item_to_' + str(count_feature) + '_count_online.csv', index=False)

This block buckets age into decades, then groups by (itemID, user attribute), where the attributes are each user's profile fields (sex, ability, meaning their influence on the recommender, and age), and counts the behavior events each demographic slice produced for each item.

itemcount = pd.read_csv('itemID_count.csv')

temp = pd.merge(left=item, right=itemcount, how='left', on='itemID')

item_rank = []
for eachcat in temp.groupby('category'):
    each_df = eachcat[1].sort_values('itemID_count', ascending=False).reset_index(drop=True)
    each_df['rank'] = each_df.index + 1
    length = each_df.shape[0]
    each_df['rank_percent'] = (each_df.index + 1) / length
    item_rank.append(each_df[['itemID', 'rank', 'rank_percent']])

merge() joins each item to its behavior count, and groupby() splits the items by category. Within each category, items are sorted by behavior count, which yields each item's within-category rank and rank percentile.

item_rank = pd.concat(item_rank, sort=False)

item_rank.to_csv('item_rank.csv', index=False)

concat() stitches the per-category rankings back into a single DataFrame, which is written to file.
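
A toy run with hypothetical counts shows what rank and rank_percent mean: the hottest item in a category gets rank 1 and the smallest percentile.

import pandas as pd

toy = pd.DataFrame({'itemID': [10, 11, 12], 'category': [1, 1, 1],
                    'itemID_count': [50, 200, 125]})
toy = toy.sort_values('itemID_count', ascending=False).reset_index(drop=True)
toy['rank'] = toy.index + 1
toy['rank_percent'] = (toy.index + 1) / toy.shape[0]
print(toy)  # item 11: rank 1, rank_percent 0.33 ... item 10: rank 3, rank_percent 1.0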

def unique_count(x):
    return len(set(x))

cat1 = item.groupby('category', as_index=False).agg({'itemID': unique_count}).rename(columns={'itemID': 'itemnum_undercat'})

cat2 = item.groupby('category', as_index=False).agg({'brand': unique_count}).rename(columns={'brand': 'brandnum_undercat'})

cat3 = item.groupby('category', as_index=False).agg({'shop': unique_count}).rename(columns={'shop': 'shopnum_undercat'})

pd.concat([cat1, cat2[['brandnum_undercat']], cat3[['shopnum_undercat']]], axis=1).to_csv('category_lower.csv', index=False)

Here a helper that counts the distinct elements of a collection is defined and passed to agg() as the aggregation; grouping by category then gives the number of distinct items, brands and shops under each category, written to a csv file.
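
For what it's worth, pandas has an equivalent built-in, so the helper could be swapped for the 'nunique' aggregation (a drop-in alternative, not what the author wrote):

cat1 = item.groupby('category', as_index=False).agg({'itemID': 'nunique'}).rename(columns={'itemID': 'itemnum_undercat'})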

2. generate_dynamic_feature.ipynb: extracting dynamic features

The imports, reduce_mem_usage() and load_data() in this notebook are identical to the static-feature notebook above, so they are not repeated here.

The main part of the notebook:

# path = '..\\data\\'
path = '../ECommAI_EUIR_round2_train_20190816/'
user, item, data = load_data(path=path)

train = data[data['day'] < 15]

online_features = []
for count_feature in ['category', 'shop', 'brand']:
    train[['behavior', 'userID', count_feature]].groupby(['userID', count_feature], as_index=False).agg(
        {'behavior': 'count'}).rename(columns={'behavior': 'user_to_'
                                               + count_feature + '_count'}).to_csv('user_to_' + str(count_feature) + '_count.csv', index=False)
for count_feature in ['category', 'shop', 'brand']:
    train[['behavior', 'userID', count_feature]].groupby(['userID', count_feature], as_index=False).agg(
        {'behavior': 'sum'}).rename(columns={'behavior': 'user_to_'
                                             + count_feature + '_sum'}).to_csv('user_to_' + str(count_feature) + '_sum.csv', index=False)

for count_feature in ['category', 'shop', 'brand']:
    for behavior_type in ['pv', 'fav', 'cart', 'buy']:
        train[[behavior_type, 'userID', count_feature]].groupby(['userID', count_feature], as_index=False).agg(
            {behavior_type: 'sum'}).rename(columns={behavior_type: 'user_to_'
                                                    + count_feature + '_count_' + behavior_type}).to_csv('user_to_' + str(count_feature) + '_count_' + behavior_type + '.csv', index=False)

Features are extracted from the first fifteen days of user data, with the same groupby().agg() approach as the first notebook, except the subject is now the user. For each user and each category, shop and brand they touched, the code extracts event counts and summed behavior weights, and then a per-behavior-type sum. (The prose says "count" while the agg parameter is 'sum', but since pv, fav, cart and buy are 0/1 one-hot columns, summing them is exactly the per-type count; see the small demo below.) Everything is written to csv.
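
A tiny demonstration with hypothetical rows that summing a one-hot column counts that behavior type:

import pandas as pd

toy = pd.DataFrame({'userID': [7, 7, 7], 'category': [3, 3, 3],
                    'pv': [1, 0, 1], 'buy': [0, 1, 0]})
print(toy.groupby(['userID', 'category'], as_index=False).agg({'pv': 'sum'}))  # pv == 2: two page views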

yestday = data[data['day'] == 14]

for count_feature in ['category', 'shop', 'brand']:
    yestday[['behavior', 'userID', count_feature]].groupby(['userID', count_feature], as_index=False).agg(
        {'behavior': 'count'}).rename(columns={'behavior': 'user_to_'
                                               + count_feature + '_count_yestday'}).to_csv('user_to_' + str(count_feature) + '_count_yestday.csv', index=False)

for count_feature in ['category', 'shop', 'brand']:
    for behavior_type in ['pv', 'fav', 'cart', 'buy']:
        yestday[[behavior_type, 'userID', count_feature]].groupby(['userID', count_feature], as_index=False).agg(
            {behavior_type: 'sum'}).rename(columns={behavior_type: 'user_to_'
                                                    + count_feature + '_count_' + behavior_type + '_yestday'}).to_csv('user_to_' + str(count_feature) + '_count_' + behavior_type + '_yestday.csv', index=False)

The same counts are then extracted over yesterday's data alone (day 14) and written to csv.

a5days = data[(data['day'] > 15 - 5) & (data['day'] < 15)]

for count_feature in ['category', 'shop', 'brand']:
    a5days[['behavior', 'userID', count_feature]].groupby(['userID', count_feature], as_index=False).agg(
        {'behavior': 'count'}).rename(columns={'behavior': 'user_to_'
                                               + count_feature + '_count_5days'}).to_csv('user_to_' + str(count_feature) + '_count_5days.csv', index=False)

for count_feature in ['category', 'shop', 'brand']:
    for behavior_type in ['pv', 'fav', 'cart', 'buy']:
        a5days[[behavior_type, 'userID', count_feature]].groupby(['userID', count_feature], as_index=False).agg(
            {behavior_type: 'sum'}).rename(columns={behavior_type: 'user_to_'
                                                    + count_feature + '_count_' + behavior_type + '_5days'}).to_csv('user_to_' + str(count_feature) + '_count_' + behavior_type + '_5days.csv', index=False)

And once more over the last five days (days 11 through 14), written to csv.

start_timestamp = max(data[data['day'] < 15]['timestamp'])

time_features = []
test = data[data['day'] < 15].copy()
test['last_time'] = start_timestamp - test['timestamp']  # assumed: last_time has to be derived before use, as in the time-feature notebook below
for time_feature in ['shop', 'category', 'brand']:
    time_features.append(test[['last_time', 'userID', time_feature, 'day']].groupby(
        ['userID', time_feature], as_index=False).agg({'last_time': 'min', 'day': 'max'}).rename(
        columns={'last_time': 'user_to_' + time_feature + '_lasttime', 'day': 'user_to_' + time_feature + '_lastday'}))

for f in time_features:
    f.to_csv(str(f.columns[2]) + '.csv', index=False)

for f in time_features:
    print(str(f.columns[2]) + '.csv')

This extracts, for each user, the most recent time they interacted with each shop, category and brand, and writes it to csv: last_time is the number of seconds between each event and the training cutoff, so agg 'min' on last_time picks the latest visit, while agg 'max' on day picks the last active day (see the sanity check below).
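
A quick sanity check with hypothetical timestamps that min(last_time) picks the newest event:

import pandas as pd

start_timestamp = 1000
toy = pd.DataFrame({'userID': [1, 1], 'shop': [5, 5], 'timestamp': [200, 900]})
toy['last_time'] = start_timestamp - toy['timestamp']  # 800 and 100
print(toy.groupby(['userID', 'shop'])['last_time'].min())  # 100, i.e. the event at t=900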

for count_feature in ['sex', 'ability', 'age']:
    train[['behavior', 'itemID', count_feature]].groupby(['itemID', count_feature], as_index=False).agg(
        {'behavior': 'count'}).rename(columns={'behavior': 'user_to_' + count_feature + '_count'}).to_csv('item_to_' + str(count_feature) + '_count.csv', index=False)

Finally, as in the first notebook, behavior counts are extracted per (item, user attribute) pair (sex, ability, age), producing the offline counterpart of the '_online' feature file from step 1.

3. generate_time_feature.ipynb: extracting time features

The loading steps (imports, reduce_mem_usage() and load_data()) are once again identical to the notebooks above.

path = '../ECommAI_EUIR_round2_train_20190816/'
user, item, data = load_data(path=path)

train = data[data['day'] < 15].copy()  # .copy() avoids pandas' SettingWithCopyWarning on the assignment below

start_timestamp = max(train['timestamp'])

train['last_time'] = start_timestamp - train['timestamp']  # seconds between each event and the cutoff

timefeatures = []

for time_feature in ['itemID', 'shop', 'category', 'brand']:
    name = time_feature + '_last_time_underline.csv'
    tf = train[['last_time', time_feature]].groupby(
        time_feature, as_index=False).agg({'last_time': 'min'}).rename(columns={'last_time': time_feature + 'last_time'})
    tf[time_feature + 'last_time_hour_ed'] = tf[time_feature + 'last_time'] // 3600 % 24
    timefeatures.append((name, tf))

for f in timefeatures:
    f[1].to_csv(f[0], index=False)

Here the author demonstrates how to extract, for each item, shop, category and brand, the time since its last click before the day-15/16 prediction window. last_time is the maximum training timestamp minus each event's timestamp, so grouping and taking agg 'min' over last_time yields the most recent click per entity; the extra column then converts that gap into the hour of day it falls in (a worked example follows).
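
A worked example of the hour conversion, with a made-up number:

last_time = 90000              # entity last clicked 90,000 seconds before the cutoff
print(last_time // 3600 % 24)  # 90000 s = 25 h -> hour-of-day bucket 1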

That wraps up the source-code analysis of the feature extraction. What this code leaves me with is how fluent and seasoned the groupby().agg() usage is; the construction of the feature engineering has plenty worth learning from.

Running the source directly turns up some unexpected bugs; many thanks to the original author, 薛传雨, for the help he provided.

 

Original post: https://www.cnblogs.com/missouter/p/12859953.html