TensorFlow: the program is terminated unexpectedly while saving a model, corrupting the parameter data so the model can no longer be loaded
When training and saving a model with TensorFlow, if the process is terminated in the middle of a save by a sudden event such as a power failure or a system freeze, files like xxx.tempstate are left behind in the checkpoint directory.
When you later load the model to restore the session, it fails with a "checksum failed" error: the checksum recorded when the checkpoint was written no longer matches the one recomputed from the damaged checkpoint files (three files in total), and a model corrupted this way cannot be recovered.
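In practice this surfaces as an exception from saver.restore, typically tf.errors.DataLossError carrying the checksum message. A minimal defensive-loading sketch, assuming a saver, a sess and a model_path already exist (the fallback behaviour is left to the caller):
try:
    saver.restore(sess, model_path)
except (tf.errors.DataLossError, tf.errors.NotFoundError) as e:
    # the checkpoint is damaged or missing -- fall back to an older backup or retrain
    print("checkpoint at %s is unusable: %s" % (model_path, e))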
To guard against this situation, use global_step together with max_to_keep to keep several backup checkpoints instead of a single saved model, which is far too risky.
Here, global_step must be a monotonically increasing variable, a global tensor in the TensorFlow graph; it is incremented with tf.assign_add(global_step, 1) every time sess.run(opt) is executed, and initializing it with initializer=0 is sufficient.
max_to_keep is set when the tf.train.Saver is created; it prevents an ever-growing pile of checkpoint files by ensuring that at most max_to_keep model files are kept in the checkpoint directory.
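Before the full example, here is a minimal sketch of the two settings working together, in the same TF 1.x style as the code below; the weights variable and the ckpt_demo path are placeholders for illustration, only the global_step / max_to_keep usage matters:
import os
import tensorflow as tf

# a toy trainable parameter, just so there is something to save
weights = tf.get_variable("weights", shape=[4, 4], initializer=tf.zeros_initializer())

# a non-trainable counter, incremented once per training step
global_step = tf.get_variable("global_step", initializer=0, trainable=False)
step_next = tf.assign_add(global_step, 1)

# keep at most 5 checkpoints on disk; older ones are deleted automatically
saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=5)

os.makedirs("ckpt_demo", exist_ok=True)  # hypothetical output directory
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for itr in range(1, 5001):
        sess.run(step_next)  # in real training this runs alongside the optimizer op
        if itr % 1000 == 0:
            # the step value is appended to the file prefix, e.g. ckpt_demo/model-1000
            saver.save(sess, "ckpt_demo/model", global_step=global_step)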
The full example code is as follows:
# iterative_inference.py
# NN inference in an iterative manner, instead of a forward single shot.
import numpy as np
import os
import platform
import matplotlib.pyplot as plt
import dataset
import components.utils as utils
import tensorflow as tf

def get_conv_weights(w, h, chn_in, chn_out):
    dim = [w, h, chn_in, chn_out]
    init_op = tf.truncated_normal(dim, 0.02)
    return tf.get_variable(
        name='weights',
        initializer=init_op)

def get_fc_weights(chn_in, chn_out):
    dim = [chn_in, chn_out]
    init_op = tf.truncated_normal(dim, 0.02)
    return tf.get_variable(
        name='weights',
        initializer=init_op)

def get_bias(filters):
    init_op = tf.zeros([filters], dtype=tf.float32)
    return tf.get_variable(
        name='bias',
        initializer=init_op)

def get_nonlinear_layer(inputs):
    return tf.nn.leaky_relu(inputs, alpha=0.2)

def get_conv_layer(inputs, kernel_size, strides, filters):
    w = kernel_size[0]
    h = kernel_size[1]
    chn_in = inputs.shape.as_list()[-1]
    chn_out = filters
    weights = get_conv_weights(w, h, chn_in, chn_out)
    bias = get_bias(chn_out)
    layer = tf.nn.conv2d(inputs, weights, strides, padding='SAME')
    layer = tf.nn.bias_add(layer, bias)
    return layer

def get_fc_layer(inputs, units):
    chn_in = inputs.shape.as_list()[-1]
    chn_out = units
    weights = get_fc_weights(chn_in, chn_out)
    bias = get_bias(chn_out)
    layer = tf.matmul(inputs, weights)
    layer = tf.nn.bias_add(layer, bias)
    return layer

def get_controlled_layer(inputs, control):  # define your own control strategy
    return tf.nn.bias_add(inputs, control)

def get_loss(outputs, feedbacks):
    return tf.nn.softmax_cross_entropy_with_logits_v2(labels=feedbacks, logits=outputs)

def convert_tensor_conv2fc(tensor):  # issue: use max or mean for pooling?
    return tf.reduce_mean(tensor, axis=[1, 2])

class IINN(object):
    def __init__(self, dim_x, dim_y,
                 conv_config, fc_config, att_config):
        self.inputs = tf.placeholder(shape=dim_x, dtype=tf.float32)
        self.feedbacks = tf.placeholder(shape=dim_y, dtype=tf.float32)
        self.rec_layers = []
        self.rec_layers.append(self.inputs)
        self.att_layers = []
        self.att_layers.append(self.feedbacks)
        self.ctl_layers = []

        # the optimizer
        # Learning rate stages: 1E-3, 1E-4, 1E-5.
        # On CIFAR-10, it converged at 0.4 (cross entropy).
        self.optimizer = tf.train.AdamOptimizer(learning_rate=1E-4)

        scope = 'attention'
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            # attention module
            sub_scope = 'fc_%d'
            for i in range(len(att_config)):
                with tf.variable_scope(sub_scope % i, reuse=tf.AUTO_REUSE):
                    fc_ = get_fc_layer(
                        self.att_layers[-1],
                        att_config[i]['units'])
                    fc_ = get_nonlinear_layer(fc_)
                    self.att_layers.append(fc_)
            # bridge tensor between attention and the biases of the conv layers
            num_biases = 0
            for i in range(len(conv_config)):
                num_biases += conv_config[i]['filters']
            with tf.variable_scope(sub_scope % len(att_config)):
                fc_ = get_fc_layer(
                    self.att_layers[-1],
                    num_biases)
                assert fc_.shape.as_list()[0] == 1
                self.att_layers.append(fc_[0])

        scope = 'control'
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            # creating sub operations with back-prop killed
            conv_bias_ctl = []
            offset = 0
            for i in range(len(conv_config)):
                ctl_grad_free = self.att_layers[-1][offset:offset + conv_config[i]['filters']]
                self.ctl_layers.append(ctl_grad_free)
                assert conv_config[i]['filters'] == ctl_grad_free.shape.as_list()[0]
                offset += ctl_grad_free.shape.as_list()[0]

        scope = 'recognition'
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            sub_scope = 'conv_%d'
            for i in range(len(conv_config)):
                with tf.variable_scope(sub_scope % i):
                    conv_ = get_conv_layer(
                        self.rec_layers[-1],
                        conv_config[i]['ksize'],
                        conv_config[i]['strides'],
                        conv_config[i]['filters'])
                    conv_ = get_controlled_layer(conv_, self.ctl_layers[i])
                    conv_ = get_nonlinear_layer(conv_)
                    self.rec_layers.append(conv_)
            # bridge tensor between conv and fc to let the features flow through
            layer = convert_tensor_conv2fc(self.rec_layers[-1])
            self.rec_layers.append(layer)
            # creating the classifier using fc layers
            sub_scope = 'fc_%d'
            for i in range(len(fc_config)):
                with tf.variable_scope(sub_scope % i):
                    fc_ = get_fc_layer(
                        self.rec_layers[-1],
                        fc_config[i]['units'])
                    fc_ = get_nonlinear_layer(fc_)
                    self.rec_layers.append(fc_)
            # the last classifier layer -- fc without nonlinearity
            with tf.variable_scope(sub_scope % len(fc_config)):
                self.outputs = get_fc_layer(self.rec_layers[-1], dim_y[1])
                self.rec_layers.append(self.outputs)

        # calculate the loss
        self.rec_loss = get_loss(self.outputs, self.feedbacks)

        # Creating minimizers for different training purposes:
        # group the variables by their name scope
        vars = tf.global_variables()
        rec_vars = []
        att_vars = []
        for i in range(len(vars)):
            if vars[i].name.find('recognition') != -1:
                rec_vars.append(vars[i])
            elif vars[i].name.find('attention') != -1:
                att_vars.append(vars[i])
            else:
                raise NameError('unknown variables: %s' % vars[i].name)
        self.minimizer_rec = self.optimizer.minimize(
            self.rec_loss, var_list=rec_vars, name='opt_rec')
        self.minimizer_att = self.optimizer.minimize(
            self.rec_loss, var_list=att_vars, name='opt_att')

        # network self check
        print("================================ VARIABLES ===================================")
        vars = tf.global_variables()
        for i in range(len(vars)):
            print("var#%03d:%40s %16s %12s" %
                  (i, vars[i].name[:-2], vars[i].shape, str(vars[i].dtype)[9:-6]))
        print("==============================================================================")
        print("\n")
        print("================================ OPERATORS ===================================")
        ops = self.rec_layers
        for i in range(len(ops)):
            print("opr#%03d:%40s %16s %12s" %
                  (i, ops[i].name[:-2], ops[i].shape, str(ops[i].dtype)[9:-2]))
        ops = self.att_layers
        for i in range(len(ops)):
            print("opr#%03d:%40s %16s %12s" %
                  (i, ops[i].name[:-2], ops[i].shape, str(ops[i].dtype)[9:-2]))
        ops = self.ctl_layers
        for i in range(len(ops)):
            print("opr#%03d:%40s %16s %12s" %
                  (i, ops[i].name[:-2], ops[i].shape, str(ops[i].dtype)[9:-2]))
        print("==============================================================================")

    def attention(self, x, y):
        pass

    def inference(self, x, a):
        pass

    def getInputPlaceHolder(self):
        return self.inputs

    def getFeedbackPlaceHolder(self):
        return self.feedbacks

    def getOutputTensor(self):
        return self.outputs

    def getControlTensors(self):
        return self.ctl_layers

    def getLoss(self):
        return self.rec_loss

    def getOptRec(self):
        return self.minimizer_rec

    def getOptAtt(self):
        return self.minimizer_att

def new_conv_config(k_w, k_h, s_w, s_h, filters):
    demo_config = dict()
    demo_config['ksize'] = (k_w, k_h)
    demo_config['strides'] = (1, s_w, s_h, 1)
    demo_config['filters'] = filters
    return demo_config

def new_fc_config(units):
    demo_config = dict()
    demo_config['units'] = units
    return demo_config

def Build_IINN(n_class):
    dim_x = [1, None, None, 3]
    dim_y = [1, n_class]
    # configure the convolution layers
    n_conv = 4
    conv_config = [None] * n_conv
    for i in range(n_conv):
        conv_config[i] = new_conv_config(3, 3, 2, 2, 8 << i)
    # configure the fully connected layers
    n_fc = 3
    fc_config = [None] * n_fc
    for i in range(n_fc):
        fc_config[i] = new_fc_config(16 << i)
    # configure the special module: feedback attention
    n_att = 3
    att_config = [None] * n_att
    for i in range(n_att):
        att_config[i] = new_fc_config(64 >> i)
    return IINN(dim_x, dim_y,
                conv_config,
                fc_config,
                att_config)

def Train_IINN(iinn_: IINN, data: dict, model_path: str) -> float:
    xx = data['input']
    yy = data['output']
    x_t = iinn_.getInputPlaceHolder()     # tensor of inputs
    y_t = iinn_.getOutputTensor()         # tensor of outputs
    c_t = iinn_.getControlTensors()       # tensors of all control signals
    f_t = iinn_.getFeedbackPlaceHolder()  # tensor of feedback
    loss_t = iinn_.getLoss()
    opt_rec = iinn_.getOptRec()
    opt_att = iinn_.getOptAtt()

    # stage 1: train without attention (a plain convolutional classifier)
    # set all the control signals to 0
    ctl_sig = []
    for i in range(len(c_t)):
        ctl_sig.append(np.array([0] * c_t[i].shape.as_list()[0]))

    # batch size must always be 1 because of the control-module limit
    BAT_NUM = 1024
    MAX_ITR = 100000 * BAT_NUM
    CVG_EPS = 1e-2
    itr = 0
    eps = 1E10

    # set up the global step counter; it is non-trainable so that the saver
    # below (var_list=tf.trainable_variables()) never writes it to disk
    global_step = tf.get_variable(name="global_step", initializer=0, trainable=False)
    step_next = tf.assign_add(global_step, 1, use_locking=True)

    # establish the training context
    sess = tf.Session()
    vars = tf.trainable_variables()
    saver = tf.train.Saver(var_list=vars, max_to_keep=5)

    # load the most recent checkpoint if one exists
    # (saves below append the global step to the file prefix)
    ckpt = tf.train.latest_checkpoint(os.path.dirname(model_path))
    if ckpt is not None:
        saver.restore(sess, ckpt)
        utils.initialize_uninitialized(sess)
    else:
        sess.run(tf.global_variables_initializer())

    # training loop
    loss = np.zeros([BAT_NUM], dtype=np.float32)
    while itr < MAX_ITR and eps > CVG_EPS:
        idx = np.random.randint(xx.shape[0])
        feed_in = dict()
        feed_in[x_t] = xx[idx:idx + 1, :, :, :]
        feed_in[f_t] = yy[idx:idx + 1, :]
        for i in range(len(c_t)):
            feed_in[c_t[i]] = ctl_sig[i]
        loss[itr % BAT_NUM], _, _ = sess.run([loss_t, opt_rec, step_next], feed_dict=feed_in)
        itr += 1
        if itr % BAT_NUM == 0:
            eps = np.mean(loss)
            print("batch#%05d loss=%3.5f" % (itr // BAT_NUM, eps))
        if itr % (BAT_NUM * 16) == 0:
            saver.save(sess, model_path, global_step=global_step)
    return eps

def Test_IINN(iinn_: IINN, data: dict, model_path: str) -> float:
    xx = data['input']
    yy = data['output']
    x_t = iinn_.getInputPlaceHolder()  # tensor of inputs
    y_t = iinn_.getOutputTensor()      # tensor of outputs
    c_t = iinn_.getControlTensors()    # tensors of all control signals

    # set all the control signals to 0
    ctl_sig = []
    for i in range(len(c_t)):
        ctl_sig.append(np.array([0] * c_t[i].shape.as_list()[0]))

    sess = tf.Session()
    vars = tf.trainable_variables()
    saver = tf.train.Saver(var_list=vars)

    # load the most recent pretrained checkpoint; fail loudly if none exists
    ckpt = tf.train.latest_checkpoint(os.path.dirname(model_path))
    if ckpt is not None:
        saver.restore(sess, ckpt)
        # utils.initialize_uninitialized(sess)
    else:
        raise NameError("failed to load checkpoint from path %s" % model_path)

    # inference
    labels_gt = np.argmax(yy, axis=-1)
    num_correct = 0
    for i in range(xx.shape[0]):
        feed_in = dict()
        feed_in[x_t] = xx[i:i + 1, :, :, :]
        for j in range(len(c_t)):  # separate index so the sample index i is not overwritten
            feed_in[c_t[j]] = ctl_sig[j]
        y = sess.run(y_t, feed_dict=feed_in)[0]
        label_out = np.argmax(y)
        if label_out == labels_gt[i]:
            num_correct += 1
    return float(num_correct) / float(len(labels_gt))


'''
# iterative inference demo
for i in range(xx.shape[0]):
    x = xx[i]
    y = yy[i]
    y_trivial = np.ones(n_class)  # start from a trivial solution
    a = iinn_.attention(x, y_trivial)
    y = iinn_.inference(x, a)
    a = iinn_.attention(x, y)
    y = iinn_.inference(x, a)
    # ... this procedure goes on and on until convergence
    pass
'''

if __name__ == "__main__":
    n_class = 10
    iinn_ = Build_IINN(n_class)

    # training with CIFAR-10 dataset
    data_train, data_test = dataset.cifar10.Load_CIFAR10('../Datasets/CIFAR10/')
    model_path = '../Models/CIFAR10-IINN/ckpt_iinn_cifar10'
    Train_IINN(iinn_, data_train, model_path)

    # test the trained model with the test split of the same dataset
    acc = Test_IINN(iinn_, data_test, model_path)
    print("Accuracy = %6.5f" % acc)
The Train_IINN function above uses exactly this strategy. Note that the global_step variable should not end up in the saved model; this is guaranteed by passing var_list=tf.trainable_variables() to the Saver and keeping global_step out of that set (declaring it with trainable=False, or creating the Saver before global_step, both work).
This way only the variables needed for forward computation are kept, and the temporary training state is discarded.
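Reduced to the relevant lines of Train_IINN:
# non-trainable, so tf.trainable_variables() never returns it
global_step = tf.get_variable(name="global_step", initializer=0, trainable=False)
step_next = tf.assign_add(global_step, 1, use_locking=True)

# only the forward-pass weights are written to the checkpoint; the step counter
# and Adam's slot variables (also non-trainable) are left out
saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=5)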
Original post: https://www.cnblogs.com/thisisajoke/p/12033274.html