import os
import sys
import math
import random
from collections import Counter

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import nltk

device = torch.device('cpu')
# nltk.download('punkt')


def load_data(in_file):
    cn = []
    en = []
    num_examples = 0
    with open(in_file, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip().split('\t')
            en.append(['BOS'] + nltk.word_tokenize(line[0].lower()) + ['EOS'])
            cn.append(['BOS'] + [c for c in line[1]] + ['EOS'])
    return en, cn


train_file = 'nmt/en-cn/train.txt'
dev_file = 'nmt/en-cn/dev.txt'
train_en, train_cn = load_data(train_file)
dev_en, dev_cn = load_data(dev_file)
# print(dev_en[:2])
# print(dev_cn[:2])

# Build the vocabularies
UNK_IDX = 0
PAD_IDX = 1
# print(len(dev_cn), len(dev_en))


def build_dict(sentences, max_words=50000):
    word_count = Counter()
    for sentence in sentences:
        for word in sentence:
            word_count[word] += 1
    ls = word_count.most_common(max_words)  # the max_words most frequent tokens, in descending order
    total_words = len(ls) + 2  # +2 accounts for 'UNK' and 'PAD'
    word_dict = {w[0]: index + 2 for index, w in enumerate(ls)}  # {token: index}; w[0] is the token, w[1] its count
    word_dict['UNK'] = UNK_IDX
    word_dict['PAD'] = PAD_IDX
    return word_dict, total_words  # total_words is the vocabulary size, at most 50002


en_dict, en_total_words = build_dict(train_en)
cn_dict, cn_total_words = build_dict(train_cn)
inv_en_dict = {v: k for k, v in en_dict.items()}  # English: {index: word}
inv_cn_dict = {v: k for k, v in cn_dict.items()}  # Chinese: {index: character}
print(inv_cn_dict)
print(inv_en_dict)


def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True):
    length = len(en_sentences)
    out_en_sentences = [[en_dict.get(w, 0) for w in sent] for sent in en_sentences]  # unknown words map to UNK (0)
    out_cn_sentences = [[cn_dict.get(w, 0) for w in sent] for sent in cn_sentences]

    # sort sentences by length
    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    # reorder the Chinese and English sentences with the same permutation
    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]
    return out_en_sentences, out_cn_sentences


train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)
dev_en, dev_cn = encode(dev_en, dev_cn, en_dict, cn_dict)
# print(train_cn[3])
# print([inv_cn_dict[i] for i in train_cn[3]])
# print([inv_en_dict[i] for i in train_en[3]])


# Returns one array of sentence indices per minibatch, e.g. [array([11, 4, 3, 5]), array([16, 7, 5, 7]), ...]
def get_minibatches(n, minibatch_size, shuffle=True):
    # n is the number of sentences
    idx_list = np.arange(0, n, minibatch_size)  # start offsets: 0, minibatch_size, 2 * minibatch_size, ...
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    for idx in idx_list:
        minibatches.append(np.arange(idx, min(idx + minibatch_size, n)))
    return minibatches


# seqs is one minibatch worth of index sequences (a nested list); here batch_size = 64
def prepare_data(seqs):
    lengths = [len(seq) for seq in seqs]
    n_samples = len(seqs)  # number of sentences in the batch
    max_len = np.max(lengths)  # length of the longest sentence in the batch

    x = np.zeros((n_samples, max_len)).astype('int32')
    x_lengths = np.array(lengths).astype('int32')  # the original (unpadded) sentence lengths
    for idx, seq in enumerate(seqs):
        x[idx, :lengths[idx]] = seq  # copy the real tokens; shorter sentences are right-padded with 0
    return x, x_lengths
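# Illustrative sanity check (not part of the original post): prepare_data right-pads every
# sequence in a minibatch to the length of the longest one and also returns the true lengths,
# which the encoder later needs for packing. The toy inputs below are assumed, not real data.
_toy = [[2, 5, 7, 3], [2, 3], [2, 9, 3]]
_x, _x_len = prepare_data(_toy)
print(_x)      # [[2 5 7 3] [2 3 0 0] [2 9 3 0]]
print(_x_len)  # [4 2 3]
print(get_minibatches(10, 4, shuffle=False))  # [array([0, 1, 2, 3]), array([4, 5, 6, 7]), array([8, 9])]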
def gen_examples(en_sentences, cn_sentences, batch_size):
    minibatches = get_minibatches(len(en_sentences), batch_size)
    all_ex = []
    for minibatch in minibatches:
        mb_en_sentences = [en_sentences[t] for t in minibatch]  # encoded English sentences of one batch, e.g. [[2, 982, 8], [14, 5, 6], ...]
        mb_cn_sentences = [cn_sentences[t] for t in minibatch]
        mb_x, mb_x_len = prepare_data(mb_en_sentences)  # padded index matrix (0-padded) and true length of each sentence
        mb_y, mb_y_len = prepare_data(mb_cn_sentences)
        all_ex.append((mb_x, mb_x_len, mb_y, mb_y_len))
    # Returns about n / batch_size tuples of
    # (padded English batch, English lengths, padded Chinese batch, Chinese lengths)
    return all_ex


batch_size = 64
train_data = gen_examples(train_en, train_cn, batch_size)
dev_data = gen_examples(dev_en, dev_cn, batch_size)


# masked cross-entropy loss
class LanguageModelCriterion(nn.Module):
    def __init__(self):
        super(LanguageModelCriterion, self).__init__()

    def forward(self, input, target, mask):
        # input: [64, 12, 3195]  target: [64, 12]  mask: [64, 12]
        # input: (batch_size * seq_len) * vocab_size
        input = input.contiguous().view(-1, input.size(2))
        # target: batch_size * seq_len
        target = target.contiguous().view(-1, 1)
        mask = mask.contiguous().view(-1, 1)
        output = -input.gather(1, target) * mask  # gather along dim 1, using target as the index
        # This is the cross-entropy loss; F.log_softmax has already been applied to input.
        # output.shape = torch.Size([768, 1])
        # gather also picks up values at padded positions: in the vocab, index 0 is a real token,
        # but the trailing 0s in target are padding, so the mask resets those positions to zero.
        output = torch.sum(output) / torch.sum(mask)  # mean loss; the minus sign above makes minimizing it correct
        return output
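# Illustrative sanity check (not part of the original post): with uniform log-probabilities over a
# vocabulary of size V, the masked mean negative log-likelihood should equal log(V) exactly,
# no matter how many positions the mask zeroes out. The tiny tensors below are assumed examples.
_V = 8
_logp = torch.full((2, 3, _V), -math.log(_V))       # uniform log-probs, shape [2, 3, 8]
_tgt = torch.zeros(2, 3, dtype=torch.long)          # arbitrary target indices
_mask = torch.tensor([[1., 1., 0.], [1., 0., 0.]])  # pretend the trailing positions are padding
print(LanguageModelCriterion()(_logp, _tgt, _mask))  # tensor(2.0794) == log(8)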
class PlainEncoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        # assume embedding_size == hidden_size
        super(PlainEncoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)  # batch_first=True: [batch_size, seq_len, hidden_size]
        self.dropout = nn.Dropout(dropout)

    # x: one batch of encoded sentences
    # lengths: the original length of each sentence (before padding)
    # The final hidden state serves as the context vector, which is why lengths are needed.
    def forward(self, x, lengths):
        # (sorted lengths, indices of the sorted elements)
        sorted_len, sorted_idx = lengths.sort(0, descending=True)  # sort the batch by length, descending
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted))

        # Sentences are padded to a common length (the real sentences are shorter).
        # pack_padded_sequence lets the RNN stop at each sequence's true last step,
        # so the final state really corresponds to the last real token.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(),
                                                            batch_first=True)
        # out: [batch, seq_len, hidden_size]
        # hidden: [num_layers=1, batch, hidden_size]
        packed_out, hidden = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)  # back to padded length

        _, original_idx = sorted_idx.sort(0, descending=False)  # restore the original order
        out = out[original_idx.long()].contiguous()  # [batch_size, seq_len, hidden_size]
        hidden = hidden[:, original_idx.long()].contiguous()  # [num_layers, batch_size, hidden_size]
        # print("out.shape: ", out.shape, 'hidden.shape: ', hidden.shape)
        return out, hidden[[-1]]  # hidden[[-1]] keeps the last layer's state, roughly the counterpart of out[:, -1]


class PlainDecoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super(PlainDecoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)  # [batch_size, seq_len, hidden_size]
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    # Much like PlainEncoder.forward, except the initial hidden state is not zero but passed in.
    # y: one batch of encoded Chinese sentences
    # hid: the hidden state, i.e. the context vectors from the encoder
    def forward(self, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]
        hid = hid[:, sorted_idx.long()]

        # [batch_size, y_lengths, embed_size=hidden_size]
        y_sorted = self.dropout(self.embed(y_sorted))

        packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(),
                                                       batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)

        _, original_idx = sorted_idx.sort(0, descending=False)  # restore the original (ascending) order
        output_seq = unpacked[original_idx.long()].contiguous()  # [batch_size, y_lengths, hidden_size]
        hid = hid[:, original_idx.long()].contiguous()  # [1, batch_size, hidden_size]

        output = F.log_softmax(self.fc(output_seq), -1)  # [batch_size, y_lengths, vocab_size]
        return output, hid


class PlainSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(PlainSeq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        output, hid = self.decoder(y, y_lengths, hid)
        return output, None

    def translate(self, x, x_lengths, y, max_length=10):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        # greedy decoding, one token per step
        for i in range(max_length):
            # output: [batch_size, y_lengths, vocab_size]
            # During training y is a whole sentence and the decoder processes it in one pass.
            # At test time the sentence is generated word by word, so y starts as the single
            # BOS token passed in, and y_lengths is likewise 1.
            output, hid = self.decoder(y=y,
                                       y_lengths=torch.ones(batch_size).long().to(device),
                                       hid=hid)
            # BOS is the model's first input; afterwards y is updated so that each step's
            # input is the previous step's predicted word.
            # output.shape = torch.Size([1, 1, 3195])
            # hid.shape = torch.Size([1, 1, 100])
            y = output.max(2)[1].view(batch_size, 1)
            # .max(2) takes the maximum over the vocab dimension and returns (values, indices); [1] keeps the indices
            preds.append(y)
            # preds = [tensor([[5]]), tensor([[24]]), ..., tensor([[4]])]
            # torch.cat(preds, 1) = tensor([[5, 24, 6, 22, 7, 4, 3, 4, 3, 4]])
        return torch.cat(preds, 1), None


dropout = 0.2
hidden_size = 100
encoder = PlainEncoder(vocab_size=en_total_words, hidden_size=hidden_size, dropout=dropout)
decoder = PlainDecoder(vocab_size=cn_total_words, hidden_size=hidden_size, dropout=dropout)
model = PlainSeq2Seq(encoder, decoder)
model = model.to(device)
loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())
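# Illustrative shape check (not part of the original post), assuming train_data is non-empty:
# the encoder should return a context hidden state of shape [1, batch, hidden_size], and the
# seq2seq forward pass log-probabilities of shape [batch, tgt_len, cn_total_words].
_mb_x, _mb_x_len, _mb_y, _mb_y_len = train_data[0]
_x = torch.from_numpy(_mb_x).long().to(device)
_x_len = torch.from_numpy(_mb_x_len).long().to(device)
_y_in = torch.from_numpy(_mb_y[:, :-1]).long().to(device)
_y_len = torch.from_numpy((_mb_y_len - 1).clip(min=1)).long().to(device)
with torch.no_grad():
    _enc_out, _hid = encoder(_x, _x_len)
    print(_enc_out.shape, _hid.shape)   # e.g. torch.Size([64, src_len, 100]) torch.Size([1, 64, 100])
    _pred, _ = model(_x, _x_len, _y_in, _y_len)
    print(_pred.shape)                  # e.g. torch.Size([64, tgt_len, cn_total_words])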
def train(model, data, num_epochs=20):
    for epoch in range(num_epochs):
        model.train()  # training mode
        total_num_words = total_loss = 0.
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()

            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()   # everything before EOS (decoder input)
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()   # everything after BOS (decoder target)
            mb_y_len = torch.from_numpy(mb_y_len - 1).to(device).long()
            mb_y_len[mb_y_len <= 0] = 1

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)

            # [mb_y_len.max()] -> [1, mb_y_len.max()]
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()

            # (prediction, target, mask); mb_output holds the indices of the target words
            loss = loss_fn(mb_pred, mb_output, mb_out_mask)

            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words

            # update the model
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            optimizer.step()

            if it % 100 == 0:
                print("Epoch: ", epoch, 'iteration', it, 'loss:', loss.item())

        print("Epoch", epoch, "Training loss", total_loss / total_num_words)
        if epoch % 5 == 0:
            evaluate(model, dev_data)
    torch.save(model.state_dict(), 'translate_model.pt')


def evaluate(model, data):
    model.eval()
    total_num_words = total_loss = 0.
    with torch.no_grad():
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(mb_y_len - 1).to(device).long()
            mb_y_len[mb_y_len <= 0] = 1

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)

            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()

            loss = loss_fn(mb_pred, mb_output, mb_out_mask)

            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
    print("Evaluation loss", total_loss / total_num_words)


def translate_dev(i):
    en_sent = " ".join([inv_en_dict[w] for w in dev_en[i]])  # the original English sentence
    print(en_sent)
    cn_sent = " ".join([inv_cn_dict[w] for w in dev_cn[i]])  # the reference Chinese sentence
    print("".join(cn_sent))

    # a single sentence
    mb_x = torch.from_numpy(np.array(dev_en[i]).reshape(1, -1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(dev_en[i])])).long().to(device)

    bos = torch.Tensor([[cn_dict["BOS"]]]).long().to(device)  # shape [1, 1], i.e. [[2]]; y_lengths is 1
    translation, attn = model.translate(mb_x, mb_x_len, bos)  # [1, 10]

    # map the indices back to Chinese characters
    translation = [inv_cn_dict[i] for i in translation.data.cpu().numpy().reshape(-1)]
    trans = []
    for word in translation:
        if word != "EOS":
            trans.append(word)
        else:
            break
    print("".join(trans))  # the model's Chinese translation


train(model, train_data, num_epochs=100)

# load the trained model and switch off dropout before decoding
model.load_state_dict(torch.load('translate_model.pt', map_location=device))
model.eval()

for i in range(100, 120):
    translate_dev(i)
    print()
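# Illustrative helper (not part of the original post): translate an arbitrary English sentence
# with the trained model, mirroring load_data, encode and translate_dev above. Unknown words
# fall back to UNK_IDX; the example sentence below is an assumed input.
def translate_sentence(sentence, max_length=10):
    tokens = ['BOS'] + nltk.word_tokenize(sentence.lower()) + ['EOS']
    ids = [en_dict.get(w, UNK_IDX) for w in tokens]
    mb_x = torch.from_numpy(np.array(ids).reshape(1, -1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(ids)])).long().to(device)
    bos = torch.Tensor([[cn_dict["BOS"]]]).long().to(device)
    translation, _ = model.translate(mb_x, mb_x_len, bos, max_length)
    chars = [inv_cn_dict[i] for i in translation.data.cpu().numpy().reshape(-1)]
    trans = []
    for ch in chars:
        if ch == "EOS":  # stop at the first EOS, as translate_dev does
            break
        trans.append(ch)
    return "".join(trans)


print(translate_sentence("How are you?"))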
Original post: https://www.cnblogs.com/Carraway-Space/p/13754277.html