20200820

Reading the NRT code; reading PyTorch

def run(data_type, existing_model_name = None):
    modules, consts, options = init_modules(data_type)

init_modules

def init_modules(data_type):
    options = {}
   
    options["is_predicting"] = False #Traing时候填false,predicting填true

    print ("is_predicting: ", options["is_predicting"], ", data = " + data_type)

    options["is_unicode"] = True
    
    print ("loading...") # 读取准备好的数据文件
    all_data = pickle.load(open("./data/data_"+ data_type + ".pkl", "rb"))
    
    '''
     x_raw_train/dev/test: [[0, 0, 5, ['nice', 'app'], ['nice', 'app', ...]], [...]]
     old_dic: {'nice': 1569, ...} len=24177
     dic: {'nice': 1569, ...} len=6468 (filtered by a frequency threshold)
     hfw: ['<unk>', '<eos>', 'the', ...]
     i2w: {0: 'nice', 1: 'app', ...}  w2i: {'nice': 0, 'app': 1, ...}
     w2w: {'nice': 'nice', 'sy': '<unk>', ...}
     w2idf: {'nice': 1.1639902059516225, ...}
     i2user: {0: 16672, ...}  user2i: {16672: 0, ...} (user name)
     i2item / item2i (item name)
     user2w: {0: {0, 1, 2, 3, 4, 6467}, 1: {...}, ...}  all review words of a given user
     item2w: {0: {0, 1, 2, ..., 1333, 1902, 6467}, 1: {...}, ...}  all review words of a given item
    '''
    [x_raw_train, x_raw_dev, x_raw_test, old_dic, dic, hfw, i2w, w2i, w2w, w2idf, i2user, user2i, i2item, item2i, user2w, item2w] = all_data

    consts = {}
    consts["num_user"] = len(user2i)
    consts["num_item"] = len(item2i)
    consts["dict_size"] = len(dic)
    consts["hfw_dict_size"] = len(hfw)
    consts["lvt_dict_size"] = LVT_DICT_SIZE #6468
    consts["len_y"] = LEN_Y # 35 + 1

    consts["dim_x"] = DIM_X # 300 根据论文上写的
    consts["dim_w"] = DIM_W # 300
    consts["dim_h"] = DIM_H # 400
    consts["dim_r"] = DIM_RATING # 6
    consts["max_len_predict"] = consts["len_y"]
    consts["min_len_predict"] = 1

    consts["batch_size"] = 200
    
    consts["test_batch_size"] = 1
    consts["test_samples"] = 2000


    consts["beam_size"] = 5

    consts["lr"] = 1.

    consts["print_size"] = 2 #400 2 for demo

    print ("#train = ", len(x_raw_train), ", #dev = ", len(x_raw_dev), ", #test = ", len(x_raw_test))
    print ("#user = ", consts["num_user"], ", #item = ", consts["num_item"])
    print ("#dic = ", consts["dict_size"], ", #hfw = ", consts["hfw_dict_size"])

    modules = {}
    modules["all_data"] = all_data
    modules["hfw"] = hfw
    modules["w2i"] = w2i
    modules["i2w"] = i2w
    modules["w2w"] = w2w
    modules["u2ws"] = user2w
    modules["i2ws"] = item2w
    modules["w2idf"] = w2idf

    modules["eos_emb"] = modules["w2i"][W_EOS]
    modules["unk_emb"] = modules["w2i"][W_UNK]
    
    modules["optimizer"] = "adadelta"
    
    modules["stopwords"] = datar.load_stopwords("./data/stopwords_en.txt")

    return modules, consts, options

Building the model

    print ("compiling...")
    model = NeuralRecsys(modules, consts, options)

Training

    if training_model:
        print ("training......")
        [x_raw_train, x_raw_dev, x_raw_test, old_dic, dic, hfw, i2w, w2i, w2w, w2idf, i2user, user2i, i2item, item2i, user2w, item2w] = modules["all_data"]
        for epoch in range(0, 50):
            
            if epoch > 2:
                consts["lr"] = 0.1

            start = time.time()
            # [[shuffled indices], [shuffled indices], ...]
            batch_list = datar.get_batch_index(len(x_raw_train), consts["batch_size"])
            error = 0
            error_i = 0
            error_rmse = 0
            e_nll = 0
            e_nllu = 0
            e_nlli = 0
            e_cce = 0
            e_mae = 0
            e_l2n = 0
            for batch_index in batch_list:
                local_batch_size = len(batch_index)
                # extract the training data needed for the current batch
                batch_raw = [x_raw_train[bxi] for bxi in batch_index] 
                load_time = time.time()
                XU, XI, Y_r, Y_r_vec, \
                        Dict_lvt, Y_rev_tf, Y_rev_mark, \
                        Y_sum_idx, Y_sum_mark, Y_sum_lvt, lvt_i2i = datar.get_batch_data(batch_raw, consts, options, modules)
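The helper datar.get_batch_index is not shown in this post; judging from the comment above, it shuffles the training indices and slices them into batches. A minimal sketch of what it might do (this implementation is an assumption, not the original code):

import random

def get_batch_index(n_samples, batch_size):
    # Shuffle all sample indices, then cut them into
    # consecutive chunks of at most batch_size each.
    idx = list(range(n_samples))
    random.shuffle(idx)
    return [idx[i:i + batch_size] for i in range(0, n_samples, batch_size)]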

get_batch_data

def get_batch_data(raw_x, consts, options, modules):
    batch_size = len(raw_x)
    hfw = modules["hfw"]
    w2i = modules["w2i"]
    i2w = modules["i2w"]
    w2w = modules["w2w"]
    w2idf = modules["w2idf"]
    u2ws = modules["u2ws"]
    i2ws = modules["i2ws"]
    len_y = consts["len_y"]
    lvt_dict_size = consts["lvt_dict_size"]

    xu = np.zeros(batch_size, dtype = "int64") # user ids of this batch
    xi = np.zeros(batch_size, dtype = "int64") # item ids of this batch
    yr = np.zeros(batch_size, dtype = theano.config.floatX) # ratings of this batch
    
    # one-hot encode the rating; DIM_RATING = 6 covers ratings 0--5
    yr_vec = np.zeros((batch_size, DIM_RATING) , dtype = theano.config.floatX)
    
    y_review_mark = np.zeros((batch_size, lvt_dict_size) , dtype = theano.config.floatX)
    y_review_tf = np.zeros((batch_size, lvt_dict_size) , dtype = theano.config.floatX)
    y_summary_index = np.zeros((len_y, batch_size, 1) , dtype = "int64")
    y_summary_mark = np.zeros((len_y, batch_size) , dtype = theano.config.floatX)
    y_summary_lvt = np.zeros((len_y, batch_size, 1) , dtype = "int64")

    dic_review_words = {}
    w2i_review_words = {}
    lst_review_words = []
    lvt_user_item = {w2i[W_EOS]}
    lvt_i2i = {}
    for i in range(batch_size):
        uid, iid, r, summary, review = raw_x[i] 
        xu[i] = uid
        xi[i] = iid
        yr[i] = r
        yr_vec[i, int(r)] = 1.
        lvt_user_item |= u2ws[uid] | i2ws[iid] # union of all review words for this user and this item

        if len(summary) > len_y: # truncate to the maximum length
            summary = summary[0:len_y-1] + [W_EOS]

        for wi in range(len(summary)):
            w = summary[wi]
            w = w2w[w]
            if w in dic_review_words:
                dic_review_words[w] += 1 # count how many times each word occurs
            else:
                w2i_review_words[w] = len(dic_review_words) # build the word2id map local to this batch
                dic_review_words[w] = 1
                lst_review_words.append(w2i[w])
            y_summary_index[wi, i, 0] = w2i[w] # global vocabulary id of this word
            y_summary_lvt[wi, i, 0] = w2i_review_words[w] # batch-local (LVT) id of this word
            y_summary_mark[wi, i] = 1 # mark this position as a valid token

        for w in review:
            w = w2w[w]
            if w in dic_review_words:
                dic_review_words[w] += 1
            else:
                w2i_review_words[w] = len(dic_review_words)
                dic_review_words[w] = 1
                lst_review_words.append(w2i[w])
            y_review_mark[i, w2i_review_words[w]] = 1
            y_review_tf[i, w2i_review_words[w]] += 1 #w2idf[w]

    y_review_tf /= 10.

    # pad with high-frequency words not seen in this batch, with count 0
    if len(dic_review_words) < lvt_dict_size:
        for rd_hfw in hfw:
            if rd_hfw not in dic_review_words:
                w2i_review_words[rd_hfw] = len(dic_review_words)
                dic_review_words[rd_hfw] = 0
                lst_review_words.append(w2i[rd_hfw])
            if len(dic_review_words) == lvt_dict_size:
                break
    else:
        print ("!!!!!!!!!!!!")

    for i in range(len(lst_review_words)):
        lvt_i2i[i] = lst_review_words[i]
    assert len(dic_review_words) == lvt_dict_size
    assert len(lst_review_words) == lvt_dict_size

    if options["is_predicting"]:
        lvt_i2i = {}
        lst_review_words = list(lvt_user_item)
        for i in range(len(lst_review_words)):
            lvt_i2i[i] = lst_review_words[i]
        
    lst_review_words = np.asarray(lst_review_words)

    return xu, xi, yr, yr_vec, lst_review_words, y_review_tf, y_review_mark, y_summary_index, y_summary_mark, y_summary_lvt, lvt_i2i
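get_batch_data is essentially a large-vocabulary-trick (LVT) setup: each batch works with a small local vocabulary of lvt_dict_size words, and lvt_i2i maps batch-local ids back to global dictionary ids. A tiny illustration of that mapping (the toy data below is made up):

# Suppose this is the global vocabulary:
w2i = {"<eos>": 0, "nice": 1, "app": 2, "the": 3}
i2w = {v: k for k, v in w2i.items()}

# Words actually seen in this batch, in first-seen order:
batch_words = ["nice", "app"]

# Batch-local ids and the local-to-global map, as built in get_batch_data:
w2i_local = {w: j for j, w in enumerate(batch_words)}
lvt_i2i = {j: w2i[w] for j, w in enumerate(batch_words)}

assert lvt_i2i == {0: 1, 1: 2}
# A prediction over the small local softmax is decoded through lvt_i2i:
print(i2w[lvt_i2i[0]]) # -> "nice"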

train&cost

                cost, mse, nll, nllu, nlli, cce, mae, l2n, rat_pred, sum_pred = \
                        model.train(XU, XI, Y_r, Y_r_vec, \
                                    Dict_lvt, Y_rev_tf, Y_rev_mark, \
                                    Y_sum_idx, Y_sum_mark, Y_sum_lvt, \
                                    local_batch_size, consts["lr"])
                #print cost, mse, nll, cce, l2n
                error += cost
                error_rmse += math.sqrt(mse)
                e_nll += nll
                e_nllu += nllu
                e_nlli += nlli
                e_cce += cce
                e_mae += mae
                e_l2n += l2n
                error_i += 1
                if error_i % consts["print_size"] == 0:
                    print ("epoch=", epoch, ", Error now = ", error / error_i, ", ",)
                    print ("RMSE now = ", error_rmse / error_i, ", ",)
                    print ("MAE now = ", e_mae / error_i, ", ",)
                    print ("NLL now = ", e_nll / error_i, ", ",)
                    print ("NLLu now = ", e_nllu / error_i, ", ",)
                    print ("NLLi now = ", e_nlli / error_i, ", ",)
                    print ("CCE now = ", e_cce / error_i, ", ",)
                    print ("L2 now = ", e_l2n / error_i, ", ",)
                    print (error_i, "/", len(batch_list), "time=", time.time()-start)
                    save_model("./model/m_" + data_type + "." + str(epoch) + "." + str(int(error_i / consts["print_size"])), model)
                    print_sent_dec(sum_pred, Y_sum_idx, Y_sum_mark, modules, consts, options, lvt_i2i)
            
            save_model("./model/m_" + data_type + "." + str(epoch), model)
            #print_sent_dec(sum_pred, Y_sum_idx, Y_sum_mark, modules, consts, options, lvt_i2i)
            print ("epoch=", epoch, ", Error = ", error / len(batch_list),)
            print (", RMSE = ", error_rmse / len(batch_list), ", time=", time.time()-start)

Why is everything that comes out unk?

Model

Theano is a Python library that lets you define, optimize, and evaluate mathematical expressions, especially ones over multi-dimensional arrays (numpy.ndarray).

The Python package closest to Theano is sympy; Theano focuses more on tensor expressions than sympy does, and has more machinery for compilation.
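A minimal sketch of Theano's define/compile/evaluate workflow (a toy expression, not part of the NRT code):

import numpy as np
import theano
import theano.tensor as T

# Define a symbolic expression over matrices...
a = T.matrix("a")
b = T.matrix("b")
expr = T.dot(a, b) + 1.0

# ...compile it into a callable function...
f = theano.function(inputs = [a, b], outputs = expr)

# ...and evaluate it on concrete numpy arrays.
x = np.ones((2, 3), dtype = theano.config.floatX)
y = np.ones((3, 2), dtype = theano.config.floatX)
print(f(x, y)) # 2x2 matrix, every entry 4.0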

class NeuralRecsys(object):
    def __init__(self, modules, consts, options):
        self.is_predicting = options["is_predicting"]
        self.optimizer = modules["optimizer"]

        # import theano.tensor as T
        # declare all the symbolic input variables
        self.xu = T.vector("xu", dtype = "int64")
        self.xi = T.vector("xi", dtype = "int64")
        self.y_rating = T.vector("y_rating")
        self.lvt_dict = T.lvector("lvt_dict")
        self.y_rev_mark = T.matrix("y_rev_mark")
        self.y_rev_tf = T.matrix("y_rev_tf")
        self.y_rat_vec = T.matrix("y_rat_vec")
        self.y_sum_idx = T.tensor3("y_sum_idx", dtype = "int64")
        self.y_sum_mark = T.matrix("y_sum_mark")
        if not self.is_predicting:
            self.y_sum_lvt = T.tensor3("y_sum_lvt", dtype = "int64")

        self.lr = T.scalar("lr")
        self.batch_size = T.iscalar("batch_size")

        self.num_user = consts["num_user"]
        self.num_item = consts["num_item"]
        self.dict_size = consts["hfw_dict_size"]
        self.lvt_dict_size = consts["lvt_dict_size"] 
        self.dim_x = consts["dim_x"]
        self.dim_w = consts["dim_w"]
        self.dim_h = consts["dim_h"]
        self.dim_r = consts["dim_r"]
        self.len_y = consts["len_y"]

        self.params = []
        self.sub_params = []
        
        self.define_layers(modules, consts, options)
        if not self.is_predicting:
            self.define_train_funcs(modules, consts, options)

define_layers

    '''
    The number of layers for the rating regression model is 4, and for the tips generation model is 1.
    '''
    def define_layers(self, modules, consts, options):
        # init_weights comes from utils_pg.py
        self.emb_u = init_weights((self.num_user, self.dim_x), "emb_u", sample = "normal")
        self.emb_i = init_weights((self.num_item, self.dim_x), "emb_i", sample = "normal")
    
        self.params += [self.emb_u, self.emb_i]

        xu_flat = self.xu.flatten() # already a 1-D vector; flatten is a safeguard
        xi_flat = self.xi.flatten()
        xu_emb = self.emb_u[xu_flat, :] # embedding lookup: one row of emb_u per user in the batch
        xu_emb = T.reshape(xu_emb, (self.batch_size, self.dim_x))
        xi_emb = self.emb_i[xi_flat, :]
        xi_emb = T.reshape(xi_emb, (self.batch_size, self.dim_x))

        ## user-doc
        self.W_ud_uh = init_weights((self.dim_x, self.dim_h), "W_ud_uh")
        self.b_ud_uh = init_bias(self.dim_h, "b_ud_uh")
        self.W_ud_hd = init_weights((self.dim_h, self.dict_size), "W_ud_hd")
        self.b_ud_hd = init_bias(self.dict_size, "b_ud_hd")
        self.sub_W_ud_hd = self.W_ud_hd[:, self.lvt_dict]
        self.sub_b_ud_hd = self.b_ud_hd[self.lvt_dict]

        self.params += [self.W_ud_uh, self.b_ud_uh]
        self.sub_params = [(self.W_ud_hd, self.sub_W_ud_hd, (self.dim_h, self.lvt_dict_size)),
                           (self.b_ud_hd, self.sub_b_ud_hd, (self.lvt_dict_size,))]
       
        h_ru = T.nnet.sigmoid(T.dot(xu_emb, self.W_ud_uh) + self.b_ud_uh)
        
        self.W_ud_uh2 = init_weights((self.dim_h, self.dim_h), "W_ud_uh2")
        self.b_ud_uh2 = init_bias(self.dim_h, "b_ud_uh2")
        self.params += [self.W_ud_uh2, self.b_ud_uh2]
        h_ru2 = T.nnet.sigmoid(T.dot(h_ru, self.W_ud_uh2) + self.b_ud_uh2)
        self.W_ud_uh3 = init_weights((self.dim_h, self.dim_h), "W_ud_uh3")
        self.b_ud_uh3 = init_bias(self.dim_h, "b_ud_uh3")
        self.params += [self.W_ud_uh3, self.b_ud_uh3]
        h_ru3 = T.nnet.sigmoid(T.dot(h_ru2, self.W_ud_uh3) + self.b_ud_uh3)

        ## item-doc
        self.W_id_ih = init_weights((self.dim_x, self.dim_h), "W_id_ih")
        self.b_id_ih = init_bias(self.dim_h, "b_id_ih")
        self.W_id_hd = init_weights((self.dim_h, self.dict_size), "W_id_hd")
        self.b_id_hd = init_bias(self.dict_size, "b_id_hd")
        self.sub_W_id_hd = self.W_id_hd[:, self.lvt_dict]
        self.sub_b_id_hd = self.b_id_hd[self.lvt_dict]
        self.params += [self.W_id_ih, self.b_id_ih]
        self.sub_params += [(self.W_id_hd, self.sub_W_id_hd, (self.dim_h, self.lvt_dict_size)),
                           (self.b_id_hd, self.sub_b_id_hd, (self.lvt_dict_size,))]

        h_ri = T.nnet.sigmoid(T.dot(xi_emb, self.W_id_ih) + self.b_id_ih)
        
        self.W_id_ih2 = init_weights((self.dim_h, self.dim_h), "W_id_ih2")
        self.b_id_ih2 = init_bias(self.dim_h, "b_id_ih2")
        self.params += [self.W_id_ih2, self.b_id_ih2]
        h_ri2 = T.nnet.sigmoid(T.dot(h_ri, self.W_id_ih2) + self.b_id_ih2) ###
        self.W_id_ih3 = init_weights((self.dim_h, self.dim_h), "W_id_ih3")
        self.b_id_ih3 = init_bias(self.dim_h, "b_id_ih3")
        self.params += [self.W_id_ih3, self.b_id_ih3]
        h_ri3 = T.nnet.sigmoid(T.dot(h_ri2, self.W_id_ih3) + self.b_id_ih3) #####

        ## user-item-doc
        self.W_uid_uh = init_weights((self.dim_x, self.dim_h), "W_uid_uh")
        self.W_uid_ih = init_weights((self.dim_x, self.dim_h), "W_uid_ih")
        self.b_uid_uih = init_bias(self.dim_h, "b_uid_uih")
        self.W_uid_hd = init_weights((self.dim_h, self.dict_size), "W_uid_hd")
        self.b_uid_hd = init_bias(self.dict_size, "b_uid_hd")
        self.sub_W_uid_hd = self.W_uid_hd[:, self.lvt_dict]
        self.sub_b_uid_hd = self.b_uid_hd[self.lvt_dict]
        self.params += [self.W_uid_uh, self.W_uid_ih, self.b_uid_uih]
        self.sub_params += [(self.W_uid_hd, self.sub_W_uid_hd, (self.dim_h, self.lvt_dict_size)),
                           (self.b_uid_hd, self.sub_b_uid_hd, (self.lvt_dict_size,))]

        h_rui = T.nnet.sigmoid(T.dot(xu_emb, self.W_uid_uh) + T.dot(xi_emb, self.W_uid_ih) + self.b_uid_uih)
        
        self.W_uid_uih2 = init_weights((self.dim_h, self.dim_h), "W_uid_uih2")
        self.b_uid_uih2 = init_bias(self.dim_h, "b_uid_uih2")
        self.params += [self.W_uid_uih2, self.b_uid_uih2]
        h_rui2 = T.nnet.sigmoid(T.dot(h_rui, self.W_uid_uih2) + self.b_uid_uih2) ###
        self.W_uid_uih3 = init_weights((self.dim_h, self.dim_h), "W_uid_uih3")
        self.b_uid_uih3 = init_bias(self.dim_h, "b_uid_uih3")
        self.params += [self.W_uid_uih3, self.b_uid_uih3]
        h_rui3 = T.nnet.sigmoid(T.dot(h_rui2, self.W_uid_uih3) + self.b_uid_uih3) #####
        
        # -------------------- what is all of this above, anyway? -------------------------------
        
        # predict 1: rating
        self.W_r_uh = init_weights((self.dim_x, self.dim_h), "W_r_uh")
        self.W_r_ih = init_weights((self.dim_x, self.dim_h), "W_r_ih")
        self.W_r_uih = init_weights((self.dim_x, self.dim_h), "W_r_uih")
        self.b_r_uih = init_bias(self.dim_h, "b_r_uih")
        self.W_r_hr = init_weights((self.dim_h, 1), "W_r_hr")
        self.W_r_ur = init_weights((self.dim_x, 1), "W_r_ur")
        self.W_r_ir = init_weights((self.dim_x, 1), "W_r_ir")
        self.b_r_hr = init_bias(1, "b_r_hr")
        self.params += [self.W_r_uh, self.W_r_ih, self.W_r_uih, self.b_r_uih, self.W_r_hr, self.b_r_hr, self.W_r_ur, self.W_r_ir]

        # Equation (3)
        h_r = T.nnet.sigmoid(T.dot(xu_emb, self.W_r_uh) + T.dot(xi_emb, self.W_r_ih) + T.dot(xu_emb * xi_emb,  self.W_r_uih) +  self.b_r_uih)

        self.W_r_hh2 = init_weights((self.dim_h, self.dim_h), "W_r_hh2")
        self.b_r_hh2 = init_bias(self.dim_h, "b_r_hh2")
        self.params += [self.W_r_hh2, self.b_r_hh2]
        # Equation (5), stacked layers
        h_r2 = T.nnet.sigmoid(T.dot(h_r, self.W_r_hh2) + self.b_r_hh2) ###
        self.W_r_hh3 = init_weights((self.dim_h, self.dim_h), "W_r_hh3")
        self.b_r_hh3 = init_bias(self.dim_h, "b_r_hh3")
        self.params += [self.W_r_hh3, self.b_r_hh3]
        # Equation (5)
        h_r3 = T.nnet.sigmoid(T.dot(h_r2, self.W_r_hh3) + self.b_r_hh3) #####
        
        ## merge them
        self.review_i = T.nnet.softmax(T.dot(h_ri3, self.sub_W_id_hd) + self.sub_b_id_hd)
        self.review_u = T.nnet.softmax(T.dot(h_ru3, self.sub_W_ud_hd) + self.sub_b_ud_hd)
        self.review = T.nnet.softmax(T.dot(h_rui3, self.sub_W_uid_hd) + self.sub_b_uid_hd)
        
        self.W_r_uhh = init_weights((self.dim_h, self.dim_h), "W_r_uhh")
        self.W_r_ihh = init_weights((self.dim_h, self.dim_h), "W_r_ihh")
        self.W_r_uihh = init_weights((self.dim_h, self.dim_h), "W_r_uihh")
        self.W_r_rhh = init_weights((self.dim_h, self.dim_h), "W_r_rhh")
        self.b_r_uirh = init_bias(self.dim_h, "b_r_uirh")
        self.params += [self.W_r_uhh, self.W_r_ihh, self.W_r_uihh, self.W_r_rhh, self.b_r_uirh]

        # Equation (6), with some extra terms added
        h_r4 = T.nnet.sigmoid(T.dot(h_ru3, self.W_r_uhh) + T.dot(h_ri3, self.W_r_ihh) + T.dot(h_r3, self.W_r_rhh) + T.dot(h_rui3, self.W_r_uihh) + self.b_r_uirh)
        self.r_pred = T.dot(h_r4, self.W_r_hr) + T.dot(xu_emb, self.W_r_ur) + T.dot(xi_emb, self.W_r_ir) + self.b_r_hr
        
        # predict 3: summary----------------------------------------------------------
        # I can't follow anything from here on
        self.emb_w = init_weights((self.dict_size, self.dim_w), "emb_w", sample = "normal")
        
        if self.is_predicting:
            emb_r = T.zeros((self.batch_size, self.dim_r))
            r_idx = T.round(self.r_pred)
            r_idx = T.cast(r_idx, "int64")
            r_idx = T.clip(r_idx, 0, 5) 

            emb_r = T.inc_subtensor(emb_r[:, r_idx], T.cast(1.0, dtype = theano.config.floatX))
        else:
            emb_r = self.y_rat_vec
        
        self.W_s_uh = init_weights((self.dim_x, self.dim_h), "W_s_uh")
        self.W_s_ih = init_weights((self.dim_x, self.dim_h), "W_s_ih")
        self.b_s_uih = init_bias(self.dim_h, "b_s_uih")
        self.W_s_rh = init_weights((self.dim_r, self.dim_h), "W_s_rh")
        self.W_s_uch = init_weights((self.dim_h, self.dim_h), "W_s_uch")
        self.W_s_ich = init_weights((self.dim_h, self.dim_h), "W_s_ich")
        self.W_s_uich = init_weights((self.dim_h, self.dim_h), "W_s_uich")
        
        self.params += [self.emb_w, self.W_s_uh, self.W_s_ih, self.b_s_uih, self.W_s_rh, self.W_s_uch, self.W_s_ich, self.W_s_uich]
        
        ## initialize hidden state
        # Equation (17)
        h_s = T.tanh(T.dot(xu_emb, self.W_s_uh) + T.dot(xi_emb, self.W_s_ih) + T.dot(emb_r, self.W_s_rh) \
            + T.dot(h_ru3, self.W_s_uch) + T.dot(h_ri3, self.W_s_ich) + T.dot(h_rui3, self.W_s_uich) + self.b_s_uih)
        
        if self.is_predicting:
            inputs = [self.xu, self.xi, self.batch_size]
            self.encode = theano.function(inputs = inputs,
                    outputs = [self.r_pred, h_s, emb_r, xu_emb, xi_emb],
                    on_unused_input = 'ignore')

        y_flat = self.y_sum_idx.flatten()
        if self.is_predicting:
            y_emb = ifelse(T.lt(T.sum(y_flat), 0),
                    T.zeros((self.batch_size, self.dim_w)), self.emb_w[y_flat, :])
            y_emb = T.reshape(y_emb, (self.batch_size, self.dim_w))
        else:
            y_emb = self.emb_w[y_flat, :]
            y_emb = T.reshape(y_emb, (self.len_y, self.batch_size, self.dim_w))
            y_shifted = T.zeros_like(y_emb)
            y_shifted = T.set_subtensor(y_shifted[1:, :, :], y_emb[:-1, :, :])
            y_emb = y_shifted
        
        # gru decoder
        # where does this come from? the paper does not seem to mention it
        self.W_yh = init_weights((self.dim_w, self.dim_h), "W_yh", num_concatenate = 2, axis_concatenate = 1)
        self.b_yh = init_bias(self.dim_h, "b_yh", num_concatenate = 2)
        self.W_yhx = init_weights((self.dim_w, self.dim_h), "W_yhx")
        self.b_yhx = init_bias(self.dim_h, "b_yhx")
        self.W_hru = init_weights((self.dim_h, self.dim_h), "W_hru", "ortho", num_concatenate = 2, axis_concatenate = 1)
        self.W_hh = init_weights((self.dim_h, self.dim_h), "W_hh", "ortho")

        self.params += [self.W_yh, self.b_yh, self.W_yhx, self.b_yhx, self.W_hru, self.W_hh]

        def _slice(x, n):
            if x.ndim == 3:
                return x[:, :, n * self.dim_h : (n + 1) * self.dim_h]
            return x[:, n * self.dim_h : (n + 1) * self.dim_h]

        dec_in_x = T.dot(y_emb, self.W_yh) + self.b_yh
        dec_in_xx = T.dot(y_emb, self.W_yhx) + self.b_yhx

        def _active(x, xx, y_mask, pre_h, W_hru, W_hh):
            # standard GRU step: tmp1 packs the reset and update gates
            tmp1 = T.nnet.sigmoid(T.dot(pre_h, W_hru) + x)
            r1 = _slice(tmp1, 0) # reset gate
            u1 = _slice(tmp1, 1) # update gate
            h1 = T.tanh(T.dot(pre_h * r1, W_hh) + xx) # candidate state
            h1 = u1 * pre_h + (1.0 - u1) * h1 # interpolate previous and candidate states
            h1 = y_mask[:, None] * h1 #+ (1.0 - y_mask) * pre_h
            return h1

        if self.is_predicting:
            self.y_sum_mark = T.ones((self.batch_size, 1))
        sequences = [dec_in_x, dec_in_xx, self.y_sum_mark]
        non_sequences = [self.W_hru, self.W_hh]

        if self.is_predicting:
            self.dec_next_state = T.matrix("dec_next_state")
            print ("use one-step decoder")
            hs = _active(*(sequences + [self.dec_next_state] + non_sequences))
        else:
            hs, _ = theano.scan(_active, 
                    sequences = sequences,
                    outputs_info = [h_s], 
                    non_sequences = non_sequences,
                    allow_gc = False, strict = True)
        
        # output layer
        self.W_hy = init_weights((self.dim_h, self.dim_h), "W_hy")
        self.W_yy = init_weights((self.dim_w, self.dim_h), "W_yy")
        self.W_uy = init_weights((self.dim_x, self.dim_h), "W_uy")
        self.W_iy = init_weights((self.dim_x, self.dim_h), "W_iy")
        self.W_ry = init_weights((self.dim_r, self.dim_h), "W_ry")
        self.b_hy = init_bias(self.dim_h, "b_hy")
        
        self.W_logit = init_weights((self.dim_h, self.dict_size), "W_logit")
        self.b_logit = init_bias(self.dict_size, "b_logit")

        self.sub_W_logit = self.W_logit[:, self.lvt_dict]
        self.sub_b_logit = self.b_logit[self.lvt_dict]
            
        self.params += [self.W_hy, self.W_yy, self.W_uy, self.W_iy, self.W_ry, self.b_hy]
        self.sub_params += [(self.W_logit, self.sub_W_logit, (self.dim_h, self.lvt_dict_size)),
                           (self.b_logit, self.sub_b_logit, (self.lvt_dict_size,))]

        logit = T.tanh(T.dot(hs, self.W_hy) + T.dot(y_emb, self.W_yy) \
                     + T.dot(xu_emb, self.W_uy) + T.dot(xi_emb, self.W_iy) \
                     + T.dot(emb_r, self.W_ry) \
                     + self.b_hy)
        logit = T.dot(logit, self.sub_W_logit) + self.sub_b_logit
        
        old_shape = logit.shape
        if self.is_predicting:
            self.y_sum_pred = T.nnet.softmax(logit)
        else:
            self.y_sum_pred = T.nnet.softmax(logit.reshape((old_shape[0] * old_shape[1], old_shape[2]))).reshape(old_shape)

        if self.is_predicting:
            emb_r_enc = T.matrix("emb_r_enc")
            xu_emb_enc = T.matrix("xu_emb_enc")
            xi_emb_enc = T.matrix("xi_emb_enc")
            
            inputs = [self.y_sum_idx, self.dec_next_state, emb_r, xu_emb, xi_emb, self.batch_size, self.lvt_dict]
            self.decode_once = theano.function(inputs = inputs,
                    outputs = [self.y_sum_pred, hs],
                    on_unused_input = 'ignore')

define_train_funcs

    def define_train_funcs(self, modules, consts, options):
        mse = self.cost_mse(self.r_pred, self.y_rating.reshape((self.batch_size, 1)))
        nll = self.cost_nll(self.review, self.y_rev_tf, self.y_rev_mark)
        nllu = self.cost_nll(self.review_u, self.y_rev_tf, self.y_rev_mark)
        nlli = self.cost_nll(self.review_i, self.y_rev_tf, self.y_rev_mark)
        cce = self.categorical_crossentropy(modules)
        
        mae = self.cost_mae(self.r_pred, self.y_rating.reshape((self.batch_size, 1)))
        l2n = self.l2_norm()

        cost = mse + nll + nllu + nlli  + cce

cost_nll

    # Negative Log-Likelihood (NLL), Equation (16)
    def cost_nll(self, pred, label, mark):
        cost = -T.log(pred) * label * mark
        cost = T.mean(T.sum(cost, axis = 1))
        return cost
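A quick NumPy check of what cost_nll computes: per sample, sum over words of -log(pred) * label * mark, then the mean over the batch (toy numbers, just to make the reduction concrete):

import numpy as np

pred  = np.array([[0.5, 0.5], [0.9, 0.1]]) # predicted word distributions
label = np.array([[1.0, 2.0], [1.0, 0.0]]) # target counts (e.g. scaled tf)
mark  = np.array([[1.0, 1.0], [1.0, 0.0]]) # 1 where the word occurs

cost = (-np.log(pred) * label * mark).sum(axis = 1).mean()
# row 0: -log(0.5) * 1 + -log(0.5) * 2 = 2.0794; row 1: -log(0.9) = 0.1054
print(cost) # (2.0794 + 0.1054) / 2 = 1.0924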

theano.function

https://stackoverflow.com/questions/38931112/theano-function-how-does-it-work

Passing updates = [(old_w, new_w)] to theano.function() makes every call of the compiled function replace the value of the shared variable old_w with new_w.
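A minimal sketch of the updates mechanism, using a toy shared counter (not the NRT optimizer):

import theano
import theano.tensor as T

w = theano.shared(0.0, name = "w") # a shared variable holding state
x = T.scalar("x")

# Each call returns w * x, then replaces w with w + x.
f = theano.function(inputs = [x], outputs = w * x, updates = [(w, w + x)])

print(f(2.0))        # 0.0 (w was 0)
print(f(3.0))        # 6.0 (w is now 2)
print(w.get_value()) # 5.0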

        gparams = []
        for param in self.params:
            gparams.append(T.clip(T.grad(cost, param), -10, 10))
        sub_gparams = []
        for param in self.sub_params:
            sub_gparams.append(T.clip(T.grad(cost, param[1]), -10, 10))

        optimizer = eval(self.optimizer)
        updates = optimizer(self.params, gparams, self.sub_params, sub_gparams, self.lr)

        inputs = [self.xu, self.xi, self.y_rating, self.y_rat_vec, \
                  self.lvt_dict, self.y_rev_mark, self.y_rev_tf, \
                  self.y_sum_idx, self.y_sum_mark, self.y_sum_lvt, \
                  self.batch_size]
        
        # declare inputs and outputs; theano.function compiles the graph and does the computation automatically
        self.train = theano.function(
                inputs = inputs + [self.lr],
                outputs = [cost, mse, nll, nllu, nlli, cce, mae, l2n, self.r_pred, self.y_sum_pred],
                updates = updates,
                on_unused_input = 'ignore')


The weight w we create is a Tensor; it carries both w.data and w.grad.


When w is used in a computation, the tensor operations build a dynamic computation graph, so x * w is itself a Tensor.


The forward pass returns the loss; calling backward() computes the gradients automatically and stores them in the tensors. The computation graph is then freed, and the next loss computation builds a fresh graph.

When updating weights, do not operate on w.grad directly (it is itself a tensor); use w.grad.data. All tensors must be manipulated through .data for raw value operations, otherwise you keep building the computation graph.

Zero the gradient data manually, otherwise the gradients of the next pass accumulate on top of the previous ones.
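A minimal sketch of the manual training loop these notes describe, fitting y = 2x (the data and learning rate are made up):

import torch

x = torch.tensor([1.0, 2.0, 3.0])
y = torch.tensor([2.0, 4.0, 6.0])
w = torch.tensor(1.0, requires_grad = True) # a Tensor with .data and .grad

for epoch in range(100):
    loss = ((x * w - y) ** 2).mean() # tensor ops build a dynamic graph
    loss.backward()                  # fills w.grad; the graph is then freed
    w.data -= 0.01 * w.grad.data     # update through .data, off the graph
    w.grad.data.zero_()              # zero it, or the next backward accumulates

print(w.item()) # converges to about 2.0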


nn.Module: build your own neural network

loss: build the loss function

sgd


These two functions must be implemented, with exactly these names (__init__ and forward).

No backward is needed: a class built from Module components gets the backward pass automatically.

torch.nn.Linear(1, 1) is an object; it contains two tensors, w and b.


args和kwargs


__call__ invokes forward, so once the model is constructed, calling model(x) is equivalent to passing x straight into forward.

The MSE loss function and the SGD optimizer


Use model.parameters() to get the w and b the model already defines.

The training process


First compute the current prediction, then compute the loss, then call backward, and finally update the weights.
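Putting those pieces together, a minimal nn.Module version of the same linear fit, following the four steps above (a sketch, not code from the original post):

import torch

class LinearModel(torch.nn.Module):
    def __init__(self):                      # must exist, with this exact name
        super().__init__()
        self.linear = torch.nn.Linear(1, 1)  # holds the tensors w and b

    def forward(self, x):                    # must exist, with this exact name
        return self.linear(x)                # model(x) routes here via __call__

x = torch.tensor([[1.0], [2.0], [3.0]])
y = torch.tensor([[2.0], [4.0], [6.0]])

model = LinearModel()
criterion = torch.nn.MSELoss()                             # loss function
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01) # optimizer over w and b

for epoch in range(500):
    y_pred = model(x)           # 1. compute the current prediction
    loss = criterion(y_pred, y) # 2. compute the loss
    optimizer.zero_grad()       # zero the gradients
    loss.backward()             # 3. backward
    optimizer.step()            # 4. update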

Various optimizers



LogisticRegression for classification problems


The sigmoid function is defined in the functional package.


The same basic building blocks; the loss is computed with cross-entropy.
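A matching logistic-regression sketch: sigmoid squashes the linear output into (0, 1), and binary cross-entropy gives the loss (again a minimal assumed setup; the notes mention the sigmoid in the functional package, and torch.sigmoid is the modern equivalent):

import torch

class LogisticRegression(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(1, 1)

    def forward(self, x):
        return torch.sigmoid(self.linear(x)) # squash to a probability

x = torch.tensor([[1.0], [2.0], [3.0], [4.0]])
y = torch.tensor([[0.0], [0.0], [1.0], [1.0]])

model = LogisticRegression()
criterion = torch.nn.BCELoss() # binary cross-entropy on probabilities
optimizer = torch.optim.SGD(model.parameters(), lr = 0.05)

for epoch in range(1000):
    loss = criterion(model(x), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()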


Source: https://www.cnblogs.com/pppqq77799/p/13537923.html
