首页 > 其他 > 详细

Tensorflow模型保存时程序意外被终止导致模型参数数据损坏且加载模型失败

时间:2019-12-13 10:32:17      阅读:97      评论:0      收藏:0      [点我收藏+]

Tensorflow模型保存时程序意外被终止导致模型参数数据损坏且加载模型失败

在采用Tensorflow训练并保存模型时,由于断电、系统死机等突发原因导致正在保存模型的程序被终止,在checkpoint保存的目录中会出现诸如xxx.tempstate的文件。
当加载模型准备恢复session时,会报错:checksum failed. 这是因为checkpoint中记录的md5校验码与受损的checkpoint文件(一共3个)实际计算出的md5码不符,此时模型数据已经损坏且无法恢复。
为了避免该情形的出现,需要使用global_step结合max_to_keep两个设置来设置模型备份,避免只保存一个模型导致的高风险。
其中,global_step必须是一个自增的变量,它是tensorflow构建的图中的一个全局的tensor,每次sess.run(opt)的时候都需要自增tf.assign_add(global_step, 1),初始化为initializer=0即可。
其中,max_to_keep是在创建tf.train.Saver时设置的,目的是避免保存过多的checkpoint文件,该值确保checkpoint保存目录下最多只有max_to_keep数目的模型文件。
示例代码如下:

# iterative_inference.py
# NN inference in an iterative manner, instead of a forward single shot.
import numpy as np
import os
import platform
import matplotlib.pyplot as plt
import dataset
import components.utils as utils
import tensorflow as tf


def get_conv_weights(w, h, chn_in, chn_out):
    """Create (or reuse, under AUTO_REUSE scopes) a conv kernel variable
    of shape [w, h, chn_in, chn_out]."""
    dim = [w, h, chn_in, chn_out]
    # NOTE(review): the second positional arg of tf.truncated_normal is the
    # *mean*, not the stddev; 0.02 here looks like it was meant to be the
    # stddev -- confirm before changing, as it alters initialization.
    init_op = tf.truncated_normal(dim, 0.02)
    return tf.get_variable(name='weights', initializer=init_op)


def get_fc_weights(chn_in, chn_out):
    """Create (or reuse) a fully-connected weight matrix [chn_in, chn_out]."""
    dim = [chn_in, chn_out]
    init_op = tf.truncated_normal(dim, 0.02)
    return tf.get_variable(name='weights', initializer=init_op)


def get_bias(filters):
    """Create (or reuse) a zero-initialized bias vector of length `filters`."""
    init_op = tf.zeros([filters], dtype=tf.float32)
    return tf.get_variable(name='bias', initializer=init_op)


def get_nonlinear_layer(inputs):
    """Leaky-ReLU activation shared by every layer in this model."""
    return tf.nn.leaky_relu(inputs, alpha=0.2)


def get_conv_layer(inputs, kernel_size, strides, filters):
    """SAME-padded conv2d + bias (no activation).

    `kernel_size` is (w, h); `strides` is the full 4-element NHWC stride.
    """
    w = kernel_size[0]
    h = kernel_size[1]
    chn_in = inputs.shape.as_list()[-1]
    chn_out = filters
    weights = get_conv_weights(w, h, chn_in, chn_out)
    bias = get_bias(chn_out)
    layer = tf.nn.conv2d(inputs, weights, strides, padding='SAME')
    layer = tf.nn.bias_add(layer, bias)
    return layer


def get_fc_layer(inputs, units):
    """Fully-connected layer: inputs @ W + b (no activation)."""
    chn_in = inputs.shape.as_list()[-1]
    chn_out = units
    weights = get_fc_weights(chn_in, chn_out)
    bias = get_bias(chn_out)
    layer = tf.matmul(inputs, weights)
    layer = tf.nn.bias_add(layer, bias)
    return layer


def get_controlled_layer(inputs, control):
    """Inject the attention control signal as extra per-channel biases."""
    # define your own control strategy
    return tf.nn.bias_add(inputs, control)


def get_loss(outputs, feedbacks):
    """Softmax cross-entropy between logits `outputs` and targets `feedbacks`.

    Keyword args avoid the legacy `_sentinel` positional slot of the TF1 API
    (the original passed `None` positionally for it).
    """
    return tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=feedbacks, logits=outputs)


def convert_tensor_conv2fc(tensor):
    """Bridge a conv feature map (NHWC) to an fc input by global average
    pooling over the spatial axes."""
    # issue: use max or mean for pooling?
    return tf.reduce_mean(tensor, axis=[1, 2])


class IINN(object):
    """Iterative Inference NN.

    A recognition CNN whose per-layer conv biases are modulated by control
    signals produced by an attention module that consumes a feedback vector.
    Two optimizers are built over the same loss: one updates only the
    recognition variables, the other only the attention variables.
    """

    def __init__(self, dim_x, dim_y, conv_config, fc_config, att_config):
        self.inputs = tf.placeholder(shape=dim_x, dtype=tf.float32)
        self.feedbacks = tf.placeholder(shape=dim_y, dtype=tf.float32)
        self.rec_layers = []
        self.rec_layers.append(self.inputs)
        self.att_layers = []
        self.att_layers.append(self.feedbacks)
        self.ctl_layers = []
        # the optimizer (attribute name keeps the original spelling so any
        # external users of `optimzer` keep working)
        # Learning rate stages: 1E-3, 1E-4, 1E-5.
        # On CIFAR-10, it converged on 0.4 (cross entropy).
        self.optimzer = tf.train.AdamOptimizer(learning_rate=1E-4)

        scope = 'attention'
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            # attention module: a stack of fc + leaky-ReLU layers fed by
            # the feedback placeholder
            sub_scope = 'fc_%d'
            for i in range(len(att_config)):
                with tf.variable_scope(sub_scope % i, reuse=tf.AUTO_REUSE):
                    fc_ = get_fc_layer(
                        self.att_layers[-1],
                        att_config[i]['units'])
                    fc_ = get_nonlinear_layer(fc_)
                    self.att_layers.append(fc_)
            # bridge tensor from attention to the biases of all conv layers
            num_biases = 0
            for i in range(len(conv_config)):
                num_biases += conv_config[i]['filters']
            with tf.variable_scope(sub_scope % len(att_config)):
                fc_ = get_fc_layer(
                    self.att_layers[-1],
                    num_biases)
                # batch size must be 1 so the control vector can be sliced
                assert fc_.shape.as_list()[0] == 1
                self.att_layers.append(fc_[0])

        scope = 'control'
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            # slice the flat attention output into one control vector per
            # conv layer
            # NOTE(review): the original comment said "back-prop killed",
            # but plain slicing does NOT stop gradients (no tf.stop_gradient
            # here); gradients do flow back into the attention module --
            # confirm which behavior is intended.
            offset = 0
            for i in range(len(conv_config)):
                ctl_slice = self.att_layers[-1][
                    offset:offset + conv_config[i]['filters']]
                self.ctl_layers.append(ctl_slice)
                assert conv_config[i]['filters'] == ctl_slice.shape.as_list()[0]
                offset += ctl_slice.shape.as_list()[0]

        scope = 'recognition'
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            # conv backbone; each conv is modulated by its control signal
            sub_scope = 'conv_%d'
            for i in range(len(conv_config)):
                with tf.variable_scope(sub_scope % i):
                    conv_ = get_conv_layer(
                        self.rec_layers[-1],
                        conv_config[i]['ksize'],
                        conv_config[i]['strides'],
                        conv_config[i]['filters'])
                    conv_ = get_controlled_layer(conv_, self.ctl_layers[i])
                    conv_ = get_nonlinear_layer(conv_)
                    self.rec_layers.append(conv_)
            # bridge tensor between conv and fc to let it flow through
            layer = convert_tensor_conv2fc(self.rec_layers[-1])
            self.rec_layers.append(layer)
            # classifier head built from fc layers
            sub_scope = 'fc_%d'
            for i in range(len(fc_config)):
                with tf.variable_scope(sub_scope % i):
                    fc_ = get_fc_layer(
                        self.rec_layers[-1],
                        fc_config[i]['units'])
                    fc_ = get_nonlinear_layer(fc_)
                    self.rec_layers.append(fc_)
            # the last classifier layer -- fc without nonlinearity (logits)
            with tf.variable_scope(sub_scope % len(fc_config)):
                self.outputs = get_fc_layer(self.rec_layers[-1], dim_y[1])
            self.rec_layers.append(self.outputs)

        # calculate the loss
        self.rec_loss = get_loss(self.outputs, self.feedbacks)

        # Creating minimizers for different training purposes:
        # group the variables by their namespace
        vars = tf.global_variables()
        rec_vars = []
        att_vars = []
        for i in range(len(vars)):
            if vars[i].name.find('recognition') != -1:
                rec_vars.append(vars[i])
            elif vars[i].name.find('attention') != -1:
                att_vars.append(vars[i])
            else:
                raise NameError('unknown variables: %s' % vars[i].name)
        self.minimizer_rec = self.optimzer.minimize(
            self.rec_loss, var_list=rec_vars, name='opt_rec')
        self.minimizer_att = self.optimzer.minimize(
            self.rec_loss, var_list=att_vars, name='opt_att')

        # network self check (variables and operators dump)
        print("================================ VARIABLES ===================================")
        vars = tf.global_variables()
        for i in range(len(vars)):
            print("var#%03d:%40s %16s %12s" % (
                i, vars[i].name[:-2], vars[i].shape,
                str(vars[i].dtype)[9:-6]))
        print("==============================================================================")
        print("\n")
        print("================================ OPERATORS ===================================")
        ops = self.rec_layers
        for i in range(len(ops)):
            print("opr#%03d:%40s %16s %12s" % (
                i, ops[i].name[:-2], ops[i].shape,
                str(ops[i].dtype)[9:-2]))
        ops = self.att_layers
        for i in range(len(ops)):
            print("opr#%03d:%40s %16s %12s" % (
                i, ops[i].name[:-2], ops[i].shape,
                str(ops[i].dtype)[9:-2]))
        ops = self.ctl_layers
        for i in range(len(ops)):
            print("opr#%03d:%40s %16s %12s" % (
                i, ops[i].name[:-2], ops[i].shape,
                str(ops[i].dtype)[9:-2]))
        print("==============================================================================")

    def attention(self, x, y):
        # placeholder for the iterative-inference attention step
        pass

    def inference(self, x, a):
        # placeholder for the iterative-inference recognition step
        pass

    def getInputPlaceHolder(self):
        return self.inputs

    def getFeedbackPlaceHolder(self):
        return self.feedbacks

    def getOutputTensor(self):
        return self.outputs

    def getControlTensors(self):
        return self.ctl_layers

    def getLoss(self):
        return self.rec_loss

    def getOptRec(self):
        return self.minimizer_rec

    def getOptAtt(self):
        return self.minimizer_att


def new_conv_config(k_w, k_h, s_w, s_h, filters):
    """Build a conv-layer config dict: kernel (k_w,k_h), NHWC strides, filters."""
    demo_config = dict()
    demo_config['ksize'] = (k_w, k_h)
    demo_config['strides'] = (1, s_w, s_h, 1)
    demo_config['filters'] = filters
    return demo_config


def new_fc_config(units):
    """Build an fc-layer config dict with the given number of units."""
    demo_config = dict()
    demo_config['units'] = units
    return demo_config


def Build_IINN(n_class):
    """Assemble an IINN for `n_class` classes with the demo topology."""
    dim_x = [1, None, None, 3]
    dim_y = [1, n_class]

    # configure the convolution layers: 4 layers, doubling filters (8..64)
    n_conv = 4
    conv_config = [None] * n_conv
    for i in range(n_conv):
        conv_config[i] = new_conv_config(3, 3, 2, 2, 8 << i)

    # configure the fully connected layers: 3 layers (16, 32, 64 units)
    n_fc = 3
    fc_config = [None] * n_fc
    for i in range(n_fc):
        fc_config[i] = new_fc_config(16 << i)

    # configure the special module: feedback attention (64, 32, 16 units)
    n_att = 3
    att_config = [None] * n_att
    for i in range(n_att):
        att_config[i] = new_fc_config(64 >> i)

    return IINN(dim_x, dim_y, conv_config, fc_config, att_config)


def Train_IINN(iinn_: IINN, data: dict, model_path: str) -> float:
    """Train the recognition branch of `iinn_` on `data` and checkpoint it.

    Returns the final mean batch loss. Checkpoints are written every 16
    batches with global_step appended to the filename, and at most 5 are
    kept (max_to_keep) so that one corrupted save cannot destroy the only
    copy of the model.
    """
    xx = data['input']
    yy = data['output']

    x_t = iinn_.getInputPlaceHolder()   # tensor of inputs
    y_t = iinn_.getOutputTensor()       # tensor of outputs
    c_t = iinn_.getControlTensors()     # tensor of all control signals
    f_t = iinn_.getFeedbackPlaceHolder()  # tensor of feedback
    loss_t = iinn_.getLoss()
    opt_rec = iinn_.getOptRec()
    opt_att = iinn_.getOptAtt()

    # stage 1: train without attention (a plain convolution classifier):
    # set all the control signals to 0
    ctl_sig = []
    for i in range(len(c_t)):
        ctl_sig.append(np.array([0] * c_t[i].shape.as_list()[0]))

    # batch size should always be 1 because of the control module limit
    BAT_NUM = 1024
    MAX_ITR = 100000 * BAT_NUM
    CVG_EPS = 1e-2
    itr = 0
    eps = 1E10

    # set up the global step counter.
    # trainable=False keeps it out of tf.trainable_variables(), so the
    # Saver below will NOT checkpoint it (with the default trainable=True
    # it would be saved, defeating the var_list filtering).
    global_step = tf.get_variable(
        name="global_step", initializer=0, trainable=False)
    step_next = tf.assign_add(global_step, 1, use_locking=True)

    # establish the training context
    sess = tf.Session()
    vars = tf.trainable_variables()
    saver = tf.train.Saver(var_list=vars, max_to_keep=5)

    # load the pretrained model if it exists
    if tf.train.checkpoint_exists(model_path):
        saver.restore(sess, model_path)
        # initialize whatever the checkpoint did not cover (e.g. global_step)
        utils.initialize_uninitialized(sess)
    else:
        sess.run(tf.global_variables_initializer())

    # training loop: stop on iteration budget or loss convergence
    loss = np.zeros([BAT_NUM], dtype=np.float32)
    while itr < MAX_ITR and eps > CVG_EPS:
        idx = np.random.randint(xx.shape[0])
        feed_in = dict()
        feed_in[x_t] = xx[idx:idx + 1, :, :, :]
        feed_in[f_t] = yy[idx:idx + 1, :]
        for i in range(len(c_t)):
            feed_in[c_t[i]] = ctl_sig[i]
        loss[itr % BAT_NUM], _, _ = sess.run(
            [loss_t, opt_rec, step_next], feed_dict=feed_in)
        itr += 1
        if itr % BAT_NUM == 0:
            eps = np.mean(loss)
            print("batch#%05d loss=%3.5f" % (itr // BAT_NUM, eps))
        if itr % (BAT_NUM * 16) == 0:
            # periodic checkpoint; global_step makes filenames unique so
            # max_to_keep retains several rolling backups
            saver.save(sess, model_path, global_step=global_step)
    return eps


def Test_IINN(iinn_: IINN, data: dict, model_path: str) -> float:
    """Evaluate the trained model on `data`; return top-1 accuracy."""
    xx = data['input']
    yy = data['output']

    x_t = iinn_.getInputPlaceHolder()  # tensor of inputs
    y_t = iinn_.getOutputTensor()      # tensor of outputs
    c_t = iinn_.getControlTensors()    # tensor of all control signals

    # set all the control signals to 0
    ctl_sig = []
    for i in range(len(c_t)):
        ctl_sig.append(np.array([0] * c_t[i].shape.as_list()[0]))

    sess = tf.Session()
    vars = tf.trainable_variables()
    saver = tf.train.Saver(var_list=vars)
    # load the pretrained model; without it there is nothing to test
    if tf.train.checkpoint_exists(model_path):
        saver.restore(sess, model_path)
        # utils.initialize_uninitialized(sess)
    else:
        raise NameError("failed to load checkpoint from path %s" % model_path)

    # inference
    labels_gt = np.argmax(yy, axis=-1)
    num_correct = 0
    for i in range(xx.shape[0]):
        feed_in = dict()
        feed_in[x_t] = xx[i:i + 1, :, :, :]
        # BUG FIX: the inner loop used to reuse `i`, clobbering the sample
        # index so labels_gt[i] compared against the wrong ground truth
        for j in range(len(c_t)):
            feed_in[c_t[j]] = ctl_sig[j]
        y = sess.run(y_t, feed_dict=feed_in)[0]
        label_out = np.argmax(y)
        if label_out == labels_gt[i]:
            num_correct += 1
    return float(num_correct) / float(len(labels_gt))


'''
# iterative inference demo
for i in range(xx.shape[0]):
    x = xx[i]
    y = yy[i]
    y_trivial = np.ones(n_class)  # start from a trivial solution
    a = iinn_.attention(x, y_trivial)
    y = iinn_.inference(x, a)
    a = iinn_.attention(x, y)
    y = iinn_.inference(x, a)
    # ... this procedure goes on and on until converged
    pass
'''

if __name__ == "__main__":
    n_class = 10
    iinn_ = Build_IINN(n_class)

    # training with CIFAR-10 dataset
    data_train, data_test = dataset.cifar10.Load_CIFAR10('../Datasets/CIFAR10/')
    model_path = '../Models/CIFAR10-IINN/ckpt_iinn_cifar10'
    Train_IINN(iinn_, data_train, model_path)
    # test the trained model with the test split of the same dataset
    acc = Test_IINN(iinn_, data_test, model_path)
    print("Accuracy = %6.5f" % acc)

其中Train_IINN函数就使用了该策略:注意不要将global_step变量引入到要保存的模型中,解决方法是在创建tf.train.Saver时将变量列表指定为tf.trainable_variables(),并保证global_step不在该列表中(例如以trainable=False创建global_step)。
这样就只会保留前向计算所需的变量,训练的临时变量都会舍弃。

Tensorflow模型保存时程序意外被终止导致模型参数数据损坏且加载模型失败

原文:https://www.cnblogs.com/thisisajoke/p/12033274.html

(0)
(0)
   
举报
评论 一句话评论(0
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!