import os
import sys
import math
import random
from collections import Counter

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import nltk

device = torch.device('cpu')
# nltk.download('punkt')


def load_data(in_file):
    cn = []
    en = []
    num_examples = 0
    with open(in_file, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip().split('\t')
            en.append(['BOS'] + nltk.word_tokenize(line[0].lower()) + ['EOS'])
            cn.append(['BOS'] + [c for c in line[1]] + ['EOS'])
    return en, cn


train_file = 'nmt/en-cn/train.txt'
dev_file = 'nmt/en-cn/dev.txt'
train_en, train_cn = load_data(train_file)
dev_en, dev_cn = load_data(dev_file)
# print(dev_en[:2])
# print(dev_cn[:2])

# Build the vocabularies
UNK_IDX = 0
PAD_IDX = 1
# print(len(dev_cn), len(dev_en))


def build_dict(sentences, max_words=50000):
    word_count = Counter()
    for sentence in sentences:
        for word in sentence:
            word_count[word] += 1
    ls = word_count.most_common(max_words)  # the max_words most frequent tokens, in descending order
    total_words = len(ls) + 2  # +2 accounts for 'UNK' and 'PAD'
    word_dict = {w[0]: index + 2 for index, w in enumerate(ls)}  # {token: index}; w[0] is the token, w[1] its count
    word_dict['UNK'] = UNK_IDX
    word_dict['PAD'] = PAD_IDX
    return word_dict, total_words  # total_words is the vocabulary size, at most 50002


en_dict, en_total_words = build_dict(train_en)
cn_dict, cn_total_words = build_dict(train_cn)
inv_en_dict = {v: k for k, v in en_dict.items()}  # English: {index: word}
inv_cn_dict = {v: k for k, v in cn_dict.items()}  # Chinese: {index: character}
print(inv_cn_dict)
print(inv_en_dict)


def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True):
    length = len(en_sentences)
    out_en_sentences = [[en_dict.get(w, 0) for w in sent] for sent in en_sentences]  # unknown words map to UNK (0)
    out_cn_sentences = [[cn_dict.get(w, 0) for w in sent] for sent in cn_sentences]

    # sort sentences by length
    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    # reorder the Chinese and English sentences with the same permutation
    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]
    return out_en_sentences, out_cn_sentences


train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)
dev_en, dev_cn = encode(dev_en, dev_cn, en_dict, cn_dict)
# print(train_cn[3])
# print([inv_cn_dict[i] for i in train_cn[3]])
# print([inv_en_dict[i] for i in train_en[3]])


# Returns one array of sentence indices per minibatch, e.g. [array([11, 4, 3, 5]), array([16, 7, 5, 7]), ...]
def get_minibatches(n, minibatch_size, shuffle=True):
    # n is the number of sentences
    idx_list = np.arange(0, n, minibatch_size)  # start offsets: 0, minibatch_size, 2 * minibatch_size, ...
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    for idx in idx_list:
        minibatches.append(np.arange(idx, min(idx + minibatch_size, n)))
    return minibatches


# seqs is one minibatch worth of index sequences (a nested list); here batch_size = 64
def prepare_data(seqs):
    lengths = [len(seq) for seq in seqs]
    n_samples = len(seqs)  # number of sentences in the batch
    max_len = np.max(lengths)  # length of the longest sentence in the batch

    x = np.zeros((n_samples, max_len)).astype('int32')
    x_lengths = np.array(lengths).astype('int32')  # the original (unpadded) sentence lengths
    for idx, seq in enumerate(seqs):
        x[idx, :lengths[idx]] = seq  # copy the real tokens; shorter sentences are right-padded with 0
    return x, x_lengths
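# Illustrative sanity check (not part of the original post): prepare_data right-pads every
# sequence in a minibatch to the length of the longest one and also returns the true lengths,
# which the encoder later needs for packing. The toy inputs below are assumed, not real data.
_toy = [[2, 5, 7, 3], [2, 3], [2, 9, 3]]
_x, _x_len = prepare_data(_toy)
print(_x)      # [[2 5 7 3] [2 3 0 0] [2 9 3 0]]
print(_x_len)  # [4 2 3]
print(get_minibatches(10, 4, shuffle=False))  # [array([0, 1, 2, 3]), array([4, 5, 6, 7]), array([8, 9])]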
def gen_examples(en_sentences, cn_sentences, batch_size):
    minibatches = get_minibatches(len(en_sentences), batch_size)
    all_ex = []
    for minibatch in minibatches:
        mb_en_sentences = [en_sentences[t] for t in minibatch]  # encoded English sentences of one batch, e.g. [[2, 982, 8], [14, 5, 6], ...]
        mb_cn_sentences = [cn_sentences[t] for t in minibatch]
        mb_x, mb_x_len = prepare_data(mb_en_sentences)  # padded index matrix (0-padded) and true length of each sentence
        mb_y, mb_y_len = prepare_data(mb_cn_sentences)
        all_ex.append((mb_x, mb_x_len, mb_y, mb_y_len))
    # Returns about n / batch_size tuples of
    # (padded English batch, English lengths, padded Chinese batch, Chinese lengths)
    return all_ex


batch_size = 64
train_data = gen_examples(train_en, train_cn, batch_size)
dev_data = gen_examples(dev_en, dev_cn, batch_size)


# masked cross-entropy loss
class LanguageModelCriterion(nn.Module):
    def __init__(self):
        super(LanguageModelCriterion, self).__init__()

    def forward(self, input, target, mask):
        # input: [64, 12, 3195]  target: [64, 12]  mask: [64, 12]
        # input: (batch_size * seq_len) * vocab_size
        input = input.contiguous().view(-1, input.size(2))
        # target: batch_size * seq_len
        target = target.contiguous().view(-1, 1)
        mask = mask.contiguous().view(-1, 1)
        output = -input.gather(1, target) * mask  # gather along dim 1, using target as the index
        # This is the cross-entropy loss; F.log_softmax has already been applied to input.
        # output.shape = torch.Size([768, 1])
        # gather also picks up values at padded positions: in the vocab, index 0 is a real token,
        # but the trailing 0s in target are padding, so the mask resets those positions to zero.
        output = torch.sum(output) / torch.sum(mask)  # mean loss; the minus sign above makes minimizing it correct
        return output
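# Illustrative sanity check (not part of the original post): with uniform log-probabilities over a
# vocabulary of size V, the masked mean negative log-likelihood should equal log(V) exactly,
# no matter how many positions the mask zeroes out. The tiny tensors below are assumed examples.
_V = 8
_logp = torch.full((2, 3, _V), -math.log(_V))       # uniform log-probs, shape [2, 3, 8]
_tgt = torch.zeros(2, 3, dtype=torch.long)          # arbitrary target indices
_mask = torch.tensor([[1., 1., 0.], [1., 0., 0.]])  # pretend the trailing positions are padding
print(LanguageModelCriterion()(_logp, _tgt, _mask))  # tensor(2.0794) == log(8)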
class PlainEncoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        # assume embedding_size == hidden_size
        super(PlainEncoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)  # batch_first=True: [batch_size, seq_len, hidden_size]
        self.dropout = nn.Dropout(dropout)

    # x: one batch of encoded sentences
    # lengths: the original length of each sentence (before padding)
    # The final hidden state serves as the context vector, which is why lengths are needed.
    def forward(self, x, lengths):
        # (sorted lengths, indices of the sorted elements)
        sorted_len, sorted_idx = lengths.sort(0, descending=True)  # sort the batch by length, descending
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted))

        # Sentences are padded to a common length (the real sentences are shorter).
        # pack_padded_sequence lets the RNN stop at each sequence's true last step,
        # so the final state really corresponds to the last real token.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(),
                                                            batch_first=True)
        # out: [batch, seq_len, hidden_size]
        # hidden: [num_layers=1, batch, hidden_size]
        packed_out, hidden = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)  # back to padded length

        _, original_idx = sorted_idx.sort(0, descending=False)  # restore the original order
        out = out[original_idx.long()].contiguous()  # [batch_size, seq_len, hidden_size]
        hidden = hidden[:, original_idx.long()].contiguous()  # [num_layers, batch_size, hidden_size]
        # print("out.shape: ", out.shape, 'hidden.shape: ', hidden.shape)
        return out, hidden[[-1]]  # hidden[[-1]] keeps the last layer's state, roughly the counterpart of out[:, -1]


class PlainDecoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super(PlainDecoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)  # [batch_size, seq_len, hidden_size]
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    # Much like PlainEncoder.forward, except the initial hidden state is not zero but passed in.
    # y: one batch of encoded Chinese sentences
    # hid: the hidden state, i.e. the context vectors from the encoder
    def forward(self, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]
        hid = hid[:, sorted_idx.long()]

        # [batch_size, y_lengths, embed_size=hidden_size]
        y_sorted = self.dropout(self.embed(y_sorted))

        packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(),
                                                       batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)

        _, original_idx = sorted_idx.sort(0, descending=False)  # restore the original (ascending) order
        output_seq = unpacked[original_idx.long()].contiguous()  # [batch_size, y_lengths, hidden_size]
        hid = hid[:, original_idx.long()].contiguous()  # [1, batch_size, hidden_size]

        output = F.log_softmax(self.fc(output_seq), -1)  # [batch_size, y_lengths, vocab_size]
        return output, hid


class PlainSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(PlainSeq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        output, hid = self.decoder(y, y_lengths, hid)
        return output, None

    def translate(self, x, x_lengths, y, max_length=10):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        # greedy decoding, one token per step
        for i in range(max_length):
            # output: [batch_size, y_lengths, vocab_size]
            # During training y is a whole sentence and the decoder processes it in one pass.
            # At test time the sentence is generated word by word, so y starts as the single
            # BOS token passed in, and y_lengths is likewise 1.
            output, hid = self.decoder(y=y,
                                       y_lengths=torch.ones(batch_size).long().to(device),
                                       hid=hid)
            # BOS is the model's first input; afterwards y is updated so that each step's
            # input is the previous step's predicted word.
            # output.shape = torch.Size([1, 1, 3195])
            # hid.shape = torch.Size([1, 1, 100])
            y = output.max(2)[1].view(batch_size, 1)
            # .max(2) takes the maximum over the vocab dimension and returns (values, indices); [1] keeps the indices
            preds.append(y)
            # preds = [tensor([[5]]), tensor([[24]]), ..., tensor([[4]])]
            # torch.cat(preds, 1) = tensor([[5, 24, 6, 22, 7, 4, 3, 4, 3, 4]])
        return torch.cat(preds, 1), None


dropout = 0.2
hidden_size = 100
encoder = PlainEncoder(vocab_size=en_total_words, hidden_size=hidden_size, dropout=dropout)
decoder = PlainDecoder(vocab_size=cn_total_words, hidden_size=hidden_size, dropout=dropout)
model = PlainSeq2Seq(encoder, decoder)
model = model.to(device)
loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())
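# Illustrative shape check (not part of the original post), assuming train_data is non-empty:
# the encoder should return a context hidden state of shape [1, batch, hidden_size], and the
# seq2seq forward pass log-probabilities of shape [batch, tgt_len, cn_total_words].
_mb_x, _mb_x_len, _mb_y, _mb_y_len = train_data[0]
_x = torch.from_numpy(_mb_x).long().to(device)
_x_len = torch.from_numpy(_mb_x_len).long().to(device)
_y_in = torch.from_numpy(_mb_y[:, :-1]).long().to(device)
_y_len = torch.from_numpy((_mb_y_len - 1).clip(min=1)).long().to(device)
with torch.no_grad():
    _enc_out, _hid = encoder(_x, _x_len)
    print(_enc_out.shape, _hid.shape)   # e.g. torch.Size([64, src_len, 100]) torch.Size([1, 64, 100])
    _pred, _ = model(_x, _x_len, _y_in, _y_len)
    print(_pred.shape)                  # e.g. torch.Size([64, tgt_len, cn_total_words])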
def train(model, data, num_epochs=20):
    for epoch in range(num_epochs):
        model.train()  # training mode
        total_num_words = total_loss = 0.
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()

            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()   # everything before EOS (decoder input)
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()   # everything after BOS (decoder target)
            mb_y_len = torch.from_numpy(mb_y_len - 1).to(device).long()
            mb_y_len[mb_y_len <= 0] = 1

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)

            # [mb_y_len.max()] -> [1, mb_y_len.max()]
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()

            # (prediction, target, mask); mb_output holds the indices of the target words
            loss = loss_fn(mb_pred, mb_output, mb_out_mask)

            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words

            # update the model
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            optimizer.step()

            if it % 100 == 0:
                print("Epoch: ", epoch, 'iteration', it, 'loss:', loss.item())

        print("Epoch", epoch, "Training loss", total_loss / total_num_words)
        if epoch % 5 == 0:
            evaluate(model, dev_data)
    torch.save(model.state_dict(), 'translate_model.pt')


def evaluate(model, data):
    model.eval()
    total_num_words = total_loss = 0.
    with torch.no_grad():
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(mb_y_len - 1).to(device).long()
            mb_y_len[mb_y_len <= 0] = 1

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)

            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()

            loss = loss_fn(mb_pred, mb_output, mb_out_mask)

            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
    print("Evaluation loss", total_loss / total_num_words)


def translate_dev(i):
    en_sent = " ".join([inv_en_dict[w] for w in dev_en[i]])  # the original English sentence
    print(en_sent)
    cn_sent = " ".join([inv_cn_dict[w] for w in dev_cn[i]])  # the reference Chinese sentence
    print("".join(cn_sent))

    # a single sentence
    mb_x = torch.from_numpy(np.array(dev_en[i]).reshape(1, -1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(dev_en[i])])).long().to(device)

    bos = torch.Tensor([[cn_dict["BOS"]]]).long().to(device)  # shape [1, 1], i.e. [[2]]; y_lengths is 1
    translation, attn = model.translate(mb_x, mb_x_len, bos)  # [1, 10]

    # map the indices back to Chinese characters
    translation = [inv_cn_dict[i] for i in translation.data.cpu().numpy().reshape(-1)]
    trans = []
    for word in translation:
        if word != "EOS":
            trans.append(word)
        else:
            break
    print("".join(trans))  # the model's Chinese translation


train(model, train_data, num_epochs=100)

# load the trained model and switch off dropout before decoding
model.load_state_dict(torch.load('translate_model.pt', map_location=device))
model.eval()

for i in range(100, 120):
    translate_dev(i)
    print()
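# Illustrative helper (not part of the original post): translate an arbitrary English sentence
# with the trained model, mirroring load_data, encode and translate_dev above. Unknown words
# fall back to UNK_IDX; the example sentence below is an assumed input.
def translate_sentence(sentence, max_length=10):
    tokens = ['BOS'] + nltk.word_tokenize(sentence.lower()) + ['EOS']
    ids = [en_dict.get(w, UNK_IDX) for w in tokens]
    mb_x = torch.from_numpy(np.array(ids).reshape(1, -1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(ids)])).long().to(device)
    bos = torch.Tensor([[cn_dict["BOS"]]]).long().to(device)
    translation, _ = model.translate(mb_x, mb_x_len, bos, max_length)
    chars = [inv_cn_dict[i] for i in translation.data.cpu().numpy().reshape(-1)]
    trans = []
    for ch in chars:
        if ch == "EOS":  # stop at the first EOS, as translate_dev does
            break
        trans.append(ch)
    return "".join(trans)


print(translate_sentence("How are you?"))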
Original post: https://www.cnblogs.com/Carraway-Space/p/13754277.html