首页 > 其他 > 详细

训练word2vec

时间:2021-03-03 15:00:33      阅读:26      评论:0      收藏:0      [点我收藏+]

记录训练word2vec流程,以做备忘。代码如下:

#!/usr/bin/env python 
# -*- coding:utf-8 -*-
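"""
Train word2vec embeddings.
1. Special tokens such as [PAD] and [UNK] still need separate handling later.
2. Out-of-vocabulary words also need to be handled later.
"""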
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = "0"


def get_total_text_file(train_file, dev_file, test_file, output_file):
    """Build a character-level corpus file for word2vec training.

    The train, dev and test files are read in that order. Every non-empty
    line must consist of three tab-separated fields
    ``text1<TAB>text2<TAB>label``. For each such line the characters of
    ``text1 + text2`` are written to ``output_file`` separated by single
    spaces, one output line per input line; the label is ignored.

    Args:
        train_file: Path of the UTF-8 training TSV file.
        dev_file: Path of the UTF-8 development TSV file.
        test_file: Path of the UTF-8 test TSV file.
        output_file: Path of the corpus file to create (overwritten).

    Raises:
        ValueError: If a non-empty line does not have exactly three
            tab-separated fields.
    """
    # All inputs are opened before the output so that a missing input file
    # is reported before the output file is created or truncated.
    with open(train_file, 'r', encoding='utf-8') as f1, \
            open(dev_file, 'r', encoding='utf-8') as f2, \
            open(test_file, 'r', encoding='utf-8') as f3, \
            open(output_file, 'w', encoding='utf-8') as out:
        for src in (f1, f2, f3):
            # Iterate the handle lazily instead of loading it with readlines().
            for line_no, line in enumerate(src, start=1):
                if not line.rstrip('\n'):
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                fields = line.split('\t')
                if len(fields) != 3:
                    raise ValueError(
                        '%s, line %d: expected 3 tab-separated fields, got %d'
                        % (src.name, line_no, len(fields))
                    )
                text1, text2, _label = fields
                # Joining a string inserts a space between its characters.
                out.write(' '.join(text1 + text2) + '\n')


def Vectorize_training(input_file_path, output_model_path, output_w2v_path):
    """Train a CBOW word2vec model on a tokenised corpus and save it.

    The input is a UTF-8 text file with one sentence per line and tokens
    separated by spaces, for example::

        大 家 觉 得 她 好 看 吗
        求 秋 色 之 空 漫 画 全 集

    Two artefacts are written:

    * ``output_model_path`` -- the full gensim model (``model.save``),
      which can be reloaded with ``Word2Vec.load``.
    * ``output_w2v_path`` -- the vectors in word2vec format: a header
      line ``<vocab_size> <vector_dim>`` (e.g. ``2032 300``) followed by one
      line per token holding the token and its vector components.

    Args:
        input_file_path: Path of the space-tokenised corpus file.
        output_model_path: Destination path for the gensim model.
        output_w2v_path: Destination path for the word2vec-format vectors.
    """
    # Open the corpus in a ``with`` block so the handle is closed once
    # training has finished (it was previously left open).
    with open(input_file_path, 'r', encoding='utf-8') as corpus:
        # The first argument is the preprocessed training corpus.
        # sg=0 selects the CBOW model, size is the vector dimensionality,
        # min_count drops words occurring fewer than min_count times and
        # workers is the number of threads.
        # NOTE(review): ``size`` is the gensim 3.x keyword; gensim 4 renamed
        # it to ``vector_size`` -- adjust if the dependency is upgraded.
        model = Word2Vec(LineSentence(corpus), sg=0, size=300, window=5,
                         min_count=1, workers=2)

    model.save(output_model_path)  # save the full model
    model.wv.save_word2vec_format(output_w2v_path)  # save the vectors only


if __name__ == '__main__':
    # Optional first step: build the space-separated corpus from the raw
    # train/dev/test files.
    # get_total_text_file(train_file, dev_file, test_file, output_file)

    input_file_path = './data/比赛数据集/new_total_train_w2v.tsv'
    output_model_path = './data/比赛数据集/sim_text_comp.w2v_model'
    output_w2v_path = './data/比赛数据集/sim_text_comp_word2vec.bin'
    Vectorize_training(input_file_path, output_model_path, output_w2v_path)

    # Reload the model that was just saved as a sanity check.
    model = Word2Vec.load(output_model_path)
    # Look vectors up through ``model.wv``; indexing the model object itself
    # is deprecated in gensim 3.x and removed in gensim 4.
    print(model.wv['湖'].shape)  # e.g. (300,)
    print(model.wv.vectors.shape)  # e.g. (1874, 300)

    # To list every token with its vector (``index2word`` is a list
    # attribute in gensim 3.x, renamed ``index_to_key`` in gensim 4):
    # for word in model.wv.index2word:
    #     print(word, model.wv[word])

训练word2vec

原文:https://www.cnblogs.com/lyiheng/p/14472867.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!