# Standard library
import os
import sys
import time

# Third-party
import matplotlib as mpl
import matplotlib.pyplot as plt
#%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from tensorflow import keras

# Report the interpreter and library versions used for this run.
print(tf.__version__)
print(sys.version_info)
for module in (mpl, np, pd, sklearn, tf, keras):
    print(module.__name__, module.__version__)
2.0.0 sys.version_info(major=3, minor=7, micro=4, releaselevel=‘final‘, serial=0) matplotlib 3.1.1 numpy 1.16.5 pandas 0.25.1 sklearn 0.21.3 tensorflow 2.0.0 tensorflow_core.keras 2.2.4-tf
# Load the whole training corpus into memory as a single string.
input_filepath = './shakespeare.txt'
# The context manager closes the file handle even if read() raises, and the
# explicit encoding keeps decoding independent of the platform default.
with open(input_filepath, 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print(len(text))
print(text[0:10])
First Citi
2、建立映射表 字符——>id id——>字符
3、定义输入与输出 eg. 输入abcd ——> 输出bcde
# The vocabulary is every distinct character of the corpus: set() removes
# duplicates and sorting gives each character a stable position.
vocab = list(set(text))
vocab.sort()
print(len(vocab))
print(vocab)
65 [‘\n‘, ‘ ‘, ‘!‘, ‘$‘, ‘&‘, "‘", ‘,‘, ‘-‘, ‘.‘, ‘3‘, ‘:‘, ‘;‘, ‘?‘, ‘A‘, ‘B‘, ‘C‘, ‘D‘, ‘E‘, ‘F‘, ‘G‘, ‘H‘,
‘I‘, ‘J‘, ‘K‘, ‘L‘, ‘M‘, ‘N‘, ‘O‘, ‘P‘, ‘Q‘, ‘R‘, ‘S‘, ‘T‘, ‘U‘, ‘V‘, ‘W‘, ‘X‘, ‘Y‘, ‘Z‘, ‘a‘, ‘b‘, ‘c‘, ‘d‘,
‘e‘, ‘f‘, ‘g‘, ‘h‘, ‘i‘, ‘j‘, ‘k‘, ‘l‘, ‘m‘, ‘n‘, ‘o‘, ‘p‘, ‘q‘, ‘r‘, ‘s‘, ‘t‘, ‘u‘, ‘v‘, ‘w‘, ‘x‘, ‘y‘, ‘z‘]
# Forward mapping: character -> integer id (the character's position in vocab).
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx)
{‘\n‘: 0, ‘ ‘: 1, ‘!‘: 2, ‘$‘: 3, ‘&‘: 4, "‘": 5, ‘,‘: 6, ‘-‘: 7, ‘.‘: 8, ‘3‘: 9, ‘:‘: 10, ‘;‘: 11, ‘?‘: 12,
‘A‘: 13, ‘B‘: 14, ‘C‘: 15, ‘D‘: 16, ‘E‘: 17, ‘F‘: 18, ‘G‘: 19, ‘H‘: 20, ‘I‘: 21, ‘J‘: 22, ‘K‘: 23, ‘L‘: 24,
‘M‘: 25, ‘N‘: 26, ‘O‘: 27, ‘P‘: 28, ‘Q‘: 29, ‘R‘: 30, ‘S‘: 31, ‘T‘: 32, ‘U‘: 33, ‘V‘: 34, ‘W‘: 35, ‘X‘: 36,
‘Y‘: 37, ‘Z‘: 38, ‘a‘: 39, ‘b‘: 40, ‘c‘: 41, ‘d‘: 42, ‘e‘: 43, ‘f‘: 44, ‘g‘: 45, ‘h‘: 46, ‘i‘: 47, ‘j‘: 48,
‘k‘: 49, ‘l‘: 50, ‘m‘: 51, ‘n‘: 52, ‘o‘: 53, ‘p‘: 54, ‘q‘: 55, ‘r‘: 56, ‘s‘: 57, ‘t‘: 58, ‘u‘: 59, ‘v‘: 60,
‘w‘: 61, ‘x‘: 62, ‘y‘: 63, ‘z‘: 64}
# Reverse mapping: integer id -> character. Holding vocab in a NumPy array
# lets a whole array of ids be decoded with a single indexing operation.
idx2char = np.asarray(vocab)
print(idx2char)
[‘\n‘ ‘ ‘ ‘!‘ ‘$‘ ‘&‘ "‘" ‘,‘ ‘-‘ ‘.‘ ‘3‘ ‘:‘ ‘;‘ ‘?‘ ‘A‘ ‘B‘ ‘C‘ ‘D‘ ‘E‘ ‘F‘ ‘G‘ ‘H‘ ‘I‘ ‘J‘ ‘K‘ ‘L‘ ‘M‘ ‘N‘ ‘O‘ ‘P‘ ‘Q‘ ‘R‘ ‘S‘ ‘T‘ ‘U‘ ‘V‘ ‘W‘ ‘X‘ ‘Y‘ ‘Z‘ ‘a‘ ‘b‘ ‘c‘ ‘d‘ ‘e‘ ‘f‘ ‘g‘ ‘h‘ ‘i‘ ‘j‘ ‘k‘ ‘l‘ ‘m‘ ‘n‘ ‘o‘ ‘p‘ ‘q‘ ‘r‘ ‘s‘ ‘t‘ ‘u‘ ‘v‘ ‘w‘ ‘x‘ ‘y‘ ‘z‘]
# Encode the corpus as an array of ids, one per character of text.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))
print(text_as_int[:10])
print(text[:10])
[18 47 56 57 58 1 15 47 58 47]
First Citi
# Build the (input, target) pair for one sequence.
def split_input_target(id_text):
    """Split a sequence into model input and next-character target.

    For the text "abcde" the input is "abcd" and the target is "bcde":
    every target element is the element that follows the corresponding
    input element.

    Args:
        id_text: A sliceable sequence (for example a list, string or tensor).

    Returns:
        A tuple ``(inputs, targets)`` where ``inputs`` drops the last element
        of ``id_text`` and ``targets`` drops the first.
    """
    inputs = id_text[:-1]
    targets = id_text[1:]
    return inputs, targets
# Build the tf.data datasets.
# Character-level dataset: yields one character id per element.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sequence-level dataset: groups consecutive ids into chunks of
# seg_length + 1, so that each chunk can later be split into a
# seg_length-long input and a seg_length-long target shifted by one.
# drop_remainder=True discards a final chunk that is too short.
seg_length = 100
seq_dataset = char_dataset.batch(seg_length + 1, drop_remainder=True)

# Preview the first two elements of each dataset.
for ch_id in char_dataset.take(2):
    print(ch_id, idx2char[ch_id.numpy()])

for seg_id in seq_dataset.take(2):
    print(seg_id)
    print(repr(''.join(idx2char[seg_id.numpy()])))
tf.Tensor(18, shape=(), dtype=int32) F tf.Tensor(47, shape=(), dtype=int32) i tf.Tensor( [18 47 56 57 58 1 15 47 58 47 64 43 52 10 0 14 43 44 53 56 43 1 61 43 1 54 56 53 41 43 43 42 1 39 52 63 1 44 59 56 58 46 43 56 6 1 46 43 39 56 1 51 43 1 57 54 43 39 49 8 0 0 13 50 50 10 0 31 54 43 39 49 6 1 57 54 43 39 49 8 0 0 18 47 56 57 58 1 15 47 58 47 64 43 52 10 0 37 53 59 1], shape=(101,), dtype=int32) ‘First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou ‘ tf.Tensor( [39 56 43 1 39 50 50 1 56 43 57 53 50 60 43 42 1 56 39 58 46 43 56 1 58 53 1 42 47 43 1 58 46 39 52 1 58 53 1 44 39 51 47 57 46 12 0 0 13 50 50 10 0 30 43 57 53 50 60 43 42 8 1 56 43 57 53 50 60 43 42 8 0 0 18 47 56 57 58 1 15 47 58 47 64 43 52 10 0 18 47 56 57 58 6 1 63 53 59 1 49], shape=(101,), dtype=int32) ‘are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k‘
# Turn every chunk into an (input, target) pair, then preview the first two.
seq_dataset = seq_dataset.map(split_input_target)

for item_input, item_output in seq_dataset.take(2):
    for part in (item_input, item_output):
        print(part.numpy())
[18 47 56 57 58 1 15 47 58 47 64 43 52 10 0 14 43 44 53 56 43 1 61 43 1 54 56 53 41 43 43 42 1 39 52 63 1 44 59 56 58 46 43 56 6 1 46 43 39 56 1 51 43 1 57 54 43 39 49 8 0 0 13 50 50 10 0 31 54 43 39 49 6 1 57 54 43 39 49 8 0 0 18 47 56 57 58 1 15 47 58 47 64 43 52 10 0 37 53 59] [47 56 57 58 1 15 47 58 47 64 43 52 10 0 14 43 44 53 56 43 1 61 43 1 54 56 53 41 43 43 42 1 39 52 63 1 44 59 56 58 46 43 56 6 1 46 43 39 56 1 51 43 1 57 54 43 39 49 8 0 0 13 50 50 10 0 31 54 43 39 49 6 1 57 54 43 39 49 8 0 0 18 47 56 57 58 1 15 47 58 47 64 43 52 10 0 37 53 59 1] [39 56 43 1 39 50 50 1 56 43 57 53 50 60 43 42 1 56 39 58 46 43 56 1 58 53 1 42 47 43 1 58 46 39 52 1 58 53 1 44 39 51 47 57 46 12 0 0 13 50 50 10 0 30 43 57 53 50 60 43 42 8 1 56 43 57 53 50 60 43 42 8 0 0 18 47 56 57 58 1 15 47 58 47 64 43 52 10 0 18 47 56 57 58 6 1 63 53 59 1] [56 43 1 39 50 50 1 56 43 57 53 50 60 43 42 1 56 39 58 46 43 56 1 58 53 1 42 47 43 1 58 46 39 52 1 58 53 1 44 39 51 47 57 46 12 0 0 13 50 50 10 0 30 43 57 53 50 60 43 42 8 1 56 43 57 53 50 60 43 42 8 0 0 18 47 56 57 58 1 15 47 58 47 64 43 52 10 0 18 47 56 57 58 6 1 63 53 59 1 49]
# Shuffle the (input, target) pairs and group them into fixed-size batches.
batch_size = 64
buffer_size = 10000

seq_dataset = seq_dataset.shuffle(buffer_size)
# drop_remainder=True keeps every batch at exactly batch_size sequences.
seq_dataset = seq_dataset.batch(batch_size, drop_remainder=True)
# Model hyper-parameters.
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024


def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level language model.

    The network is an Embedding layer, a stateful LSTM that returns the full
    output sequence, and a Dense layer with ``vocab_size`` units and no
    activation.

    Args:
        vocab_size: Number of distinct characters; used as the embedding
            input size and as the Dense layer width.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of LSTM units.
        batch_size: Fixed batch size baked into the input shape
            ``[batch_size, None]``; the sequence length is left open.

    Returns:
        An uncompiled ``keras.models.Sequential`` model.
    """
    embedding = keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = keras.layers.LSTM(
        units=rnn_units,
        stateful=True,
        recurrent_initializer="glorot_uniform",
        return_sequences=True,
    )
    projection = keras.layers.Dense(vocab_size)
    return keras.models.Sequential([embedding, recurrent, projection])


model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=batch_size,
)
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (64, None, 256) 16640 _________________________________________________________________ lstm (LSTM) (64, None, 1024) 5246976 _________________________________________________________________ dense (Dense) (64, None, 65) 66625 ================================================================= Total params: 5,330,241 Trainable params: 5,330,241 Non-trainable params: 0 _________________________________________________________________
在训练之前,我们先用这个尚未训练的模型查看一下输出是什么样子。
# Run one batch through the untrained model to inspect the output shape.
# The loop variables are reused by later cells, so their names are kept.
for input_example_batch, target_example_batch in seq_dataset.take(1):
    example_batch_prediction = model(input_example_batch)
    print(example_batch_prediction.shape)
(64, 100, 65)
# Random sampling: draw one id per row of logits for the first sequence of
# the example batch.
first_sequence_logits = example_batch_prediction[0]
sample_indices = tf.random.categorical(logits=first_sequence_logits, num_samples=1)
print(sample_indices)

# Drop the trailing axis of size 1 left by num_samples=1.
sample_indices = tf.squeeze(sample_indices, axis=-1)
print(sample_indices)
tf.random.categorical 用于按给定的 logits 进行随机采样。这里把 example_batch_prediction 的第一个样本(大小为 (100, 65))传入;num_samples=1 表示对每个位置都从其 65 个类别的概率分布中只采样一个类别 id 作为输出。
因为输入大小为 (100, 65),随机采样后的结果大小为 (100, 1)。然后我们再通过 tf.squeeze 将输出的最后一个维度(axis=-1)去掉,即最终输出结果大小为 (100,)。
tf.Tensor( [[59] [11] [34] [10] [35] [10] [ 0] [64] [39] [53] [52] [62] [22] [35] [37] [ 9] [58] [45] [12] [21] [24] [63] [20] [ 5] [ 7] [54] [34] [43] [35] [41] [41] [15] [33] [ 9] [12] [33] [42] [55] [30] [45] [52] [25] [ 6] [53] [55] [ 2] [48] [ 6] [47] [ 3] [17] [26] [18] [56] [59] [53] [ 2] [30] [14] [ 2] [18] [33] [53] [36] [41] [64] [16] [ 5] [50] [63] [31] [19] [27] [27] [ 9] [59] [62] [23] [41] [35] [56] [40] [30] [18] [36] [62] [54] [26] [37] [ 6] [47] [52] [57] [17] [52] [35] [62] [63] [23] [58]], shape=(100, 1), dtype=int64) tf.Tensor( [59 11 34 10 35 10 0 64 39 53 52 62 22 35 37 9 58 45 12 21 24 63 20 5 7 54 34 43 35 41 41 15 33 9 12 33 42 55 30 45 52 25 6 53 55 2 48 6 47 3 17 26 18 56 59 53 2 30 14 2 18 33 53 36 41 64 16 5 50 63 31 19 27 27 9 59 62 23 41 35 56 40 30 18 36 62 54 26 37 6 47 52 57 17 52 35 62 63 23 58], shape=(100,), dtype=int64)
# Decode and print the input, the target and the sampled prediction for the
# first sequence of the example batch. Tensors are converted with .numpy()
# before indexing idx2char, matching how the dataset preview decodes ids.
print("Input:", repr("".join(idx2char[input_example_batch[0].numpy()])))
print()
print("Output:", repr("".join(idx2char[target_example_batch[0].numpy()])))
print()
print("Predictions:", repr("".join(idx2char[sample_indices.numpy()])))
Input: "ed Richard‘s royal queen.\n\nQUEEN ELIZABETH:\nO, cut my lace in sunder, that my pent heart\nMay have so" Output: "d Richard‘s royal queen.\n\nQUEEN ELIZABETH:\nO, cut my lace in sunder, that my pent heart\nMay have som" Predictions: "u;V:W:\nzaonxJWY3tg?ILyH‘-pVeWccCU3?UdqRgnM,oq!j,i$ENFruo!RB!FUoXczD‘lySGOO3uxKcWrbRFXxpNY,insEnWxyKt"
# Custom loss function.
def loss(labels, logits):
    """Return the sparse categorical cross-entropy between labels and logits.

    Args:
        labels: Integer class ids.
        logits: Unnormalised scores; ``from_logits=True`` tells Keras that no
            softmax has been applied to them yet.

    Returns:
        The loss tensor produced by
        ``keras.losses.sparse_categorical_crossentropy``.
    """
    return keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True)


model.compile(optimizer='adam', loss=loss)

# Loss of the untrained model on the example batch inspected earlier.
example_loss = loss(target_example_batch, example_batch_prediction)
print(example_loss.shape)
print(example_loss.numpy().mean())
(64, 100)
# Checkpointing and training.
output_dir = "./text_generation_lstm_checkpoints"
# exist_ok=True makes this safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(output_dir, exist_ok=True)

# The {epoch} placeholder is filled in by the ModelCheckpoint callback, so a
# separate checkpoint is written for each epoch.
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

epochs = 10
history = model.fit(
    seq_dataset,
    epochs=epochs,
    callbacks=[checkpoint_callback],
)
Epoch 1/10 172/172 [==============================] - 775s 5s/step - loss: 2.5823 Epoch 2/10 172/172 [==============================] - 772s 4s/step - loss: 1.8873 Epoch 3/10 172/172 [==============================] - 1961s 11s/step - loss: 1.6376 Epoch 4/10 172/172 [==============================] - 777s 5s/step - loss: 1.5030 Epoch 5/10 172/172 [==============================] - 3035s 18s/step - loss: 1.4225 Epoch 6/10 172/172 [==============================] - 730s 4s/step - loss: 1.3668 Epoch 7/10 172/172 [==============================] - 662s 4s/step - loss: 1.3220 Epoch 8/10 172/172 [==============================] - 665s 4s/step - loss: 1.2838 Epoch 9/10 172/172 [==============================] - 677s 4s/step - loss: 1.2490 Epoch 10/10 172/172 [==============================] - 673s 4s/step - loss: 1.2137
# Look up the most recently saved checkpoint.
tf.train.latest_checkpoint(output_dir)

# Rebuild the same architecture with a batch size of 1 and restore the
# trained weights into it.
model2 = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=1,
)
model2.load_weights(tf.train.latest_checkpoint(output_dir))

# Declare model2's input shape: one sequence of unspecified length.
single_sequence_shape = tf.TensorShape([1, None])
model2.build(single_sequence_shape)
model2.summary()
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_1 (Embedding) (1, None, 256) 16640 _________________________________________________________________ lstm_1 (LSTM) (1, None, 1024) 5246976 _________________________________________________________________ dense_1 (Dense) (1, None, 65) 66625 ================================================================= Total params: 5,330,241 Trainable params: 5,330,241 Non-trainable params: 0 _________________________________________________________________
# Text generation.
# Default sampling temperature: the logits are divided by this value before
# sampling, so values below 1 sharpen the distribution and values above 1
# flatten it.
temperature = 0.5


def generate_text(model, start_string, num_generate=1000,
                  sampling_temperature=None):
    """Generate text by sampling one character at a time from ``model``.

    The whole ``start_string`` is fed to the model first; afterwards only the
    newly sampled character is fed back in, so the model is expected to carry
    its recurrent state from one call to the next.

    Args:
        model: Model called on a ``(1, length)`` batch of character ids; its
            output is indexed as ``[batch, position, class]``.
        start_string: Non-empty seed text. Every character must be a key of
            ``char2idx``.
        num_generate: Number of characters to generate.
        sampling_temperature: Value the logits are divided by before
            sampling. ``None`` (the default) uses the module-level
            ``temperature``.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or the temperature is not
            positive.
        KeyError: If ``start_string`` contains a character missing from
            ``char2idx``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    scale = temperature if sampling_temperature is None else sampling_temperature
    if scale <= 0:
        raise ValueError("sampling temperature must be positive")

    # Add the leading batch dimension: shape (1, len(start_string)).
    input_eval = tf.expand_dims([char2idx[ch] for ch in start_string], 0)

    text_generated = []
    model.reset_states()  # start from a clean recurrent state
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Only the final position predicts the next character, so sample
        # from that row alone instead of sampling every position.
        last_logits = predictions[:, -1, :] / scale
        predicted_id = tf.random.categorical(
            last_logits, num_samples=1)[0, 0].numpy()
        text_generated.append(idx2char[predicted_id])
        # Feed only the new character back in on the next step.
        input_eval = tf.expand_dims([predicted_id], 0)
    return start_string + ''.join(text_generated)


# Generate a sample with the restored batch-size-1 model.
new_text = generate_text(model2, "first: ")
print(new_text)
通过执行代码,我们可以得到当输入为“first: ”时的输出结果为:
first: the sun KING RICHARD II: Have we no disiness of my soul to this shame, That hath a stand and stone of monster than the bears, And so and heard the sea of death. LEONTES: What shall we here? BENVOLIO: Alas, that news, what says my love! CLARENCE: My lord, this is not so dishonour‘d him, That you shall have done, so we will be content: The shapes of love, that seal‘d the state with heavens of heart, To think it strangers as you that have been To her and loss of his accusers, and stumbled, How is it here, and what a cruel waters, That are you shall seek to them and warriared. TRANIO: And therefore stand upon the world‘s accusation that still have for some state of mine. HENRY BOLINGBROKE: The arm of my soul, that I may leave you. MERCUTIO: Here‘s some that will not honour than her mother, I am so med my shoulder, and then at him, Had comfort in the violent of your souls, That comforts of eather doth their souls, With leave before her brothers, and am I come to the torture, That the