1 """ 2 This part of code is the Deep Q Network (DQN) brain. 3 4 view the tensorboard picture about this DQN structure on: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/4-3-DQN3/#modification 5 6 View more on my tutorial page: https://morvanzhou.github.io/tutorials/ 7 8 Using: 9 Tensorflow: r1.2 10 """ 11 12 import numpy as np 13 import tensorflow as tf 14 15 np.random.seed(1) 16 tf.set_random_seed(1) 17 18 19 # Deep Q Network off-policy 20 class DeepQNetwork: 21 def __init__( 22 self, 23 n_actions, 24 n_features, 25 learning_rate=0.01, 26 reward_decay=0.9, 27 e_greedy=0.9, 28 replace_target_iter=300, 29 memory_size=500, 30 batch_size=32, 31 e_greedy_increment=None, 32 output_graph=False, 33 ): 34 self.n_actions = n_actions 35 self.n_features = n_features 36 self.lr = learning_rate 37 self.gamma = reward_decay 38 self.epsilon_max = e_greedy 39 self.replace_target_iter = replace_target_iter 40 self.memory_size = memory_size 41 self.batch_size = batch_size 42 self.epsilon_increment = e_greedy_increment 43 self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max 44 45 # total learning step 46 self.learn_step_counter = 0 47 48 # initialize zero memory [s, a, r, s_] 49 self.memory = np.zeros((self.memory_size, n_features * 2 + 2)) 50 51 # consist of [target_net, evaluate_net] 52 self._build_net() 53 54 t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=‘target_net‘) 55 e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=‘eval_net‘) 56 57 with tf.variable_scope(‘hard_replacement‘): 58 self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)] 59 60 self.sess = tf.Session() 61 62 if output_graph: 63 # $ tensorboard --logdir=logs 64 tf.summary.FileWriter("logs/", self.sess.graph) 65 66 self.sess.run(tf.global_variables_initializer()) 67 self.cost_his = [] 68 69 def _build_net(self): 70 # ------------------ all inputs ------------------------ 71 self.s = tf.placeholder(tf.float32, [None, self.n_features], name=‘s‘) # input State 72 self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name=‘s_‘) # input Next State 73 self.r = tf.placeholder(tf.float32, [None, ], name=‘r‘) # input Reward 74 self.a = tf.placeholder(tf.int32, [None, ], name=‘a‘) # input Action 75 76 w_initializer, b_initializer = tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1) 77 78 # ------------------ build evaluate_net ------------------ 79 with tf.variable_scope(‘eval_net‘): 80 e1 = tf.layers.dense(self.s, 20, tf.nn.relu, kernel_initializer=w_initializer, 81 bias_initializer=b_initializer, name=‘e1‘) 82 self.q_eval = tf.layers.dense(e1, self.n_actions, kernel_initializer=w_initializer, 83 bias_initializer=b_initializer, name=‘q‘) 84 85 # ------------------ build target_net ------------------ 86 with tf.variable_scope(‘target_net‘): 87 t1 = tf.layers.dense(self.s_, 20, tf.nn.relu, kernel_initializer=w_initializer, 88 bias_initializer=b_initializer, name=‘t1‘) 89 self.q_next = tf.layers.dense(t1, self.n_actions, kernel_initializer=w_initializer, 90 bias_initializer=b_initializer, name=‘t2‘) 91 92 with tf.variable_scope(‘q_target‘): 93 q_target = self.r + self.gamma * tf.reduce_max(self.q_next, axis=1, name=‘Qmax_s_‘) # shape=(None, ) 94 self.q_target = tf.stop_gradient(q_target) 95 with tf.variable_scope(‘q_eval‘): 96 a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1) 97 self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices) # shape=(None, ) 98 with 
tf.variable_scope(‘loss‘): 99 self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval_wrt_a, name=‘TD_error‘)) 100 with tf.variable_scope(‘train‘): 101 self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss) 102 103 def store_transition(self, s, a, r, s_): 104 if not hasattr(self, ‘memory_counter‘): 105 self.memory_counter = 0 106 transition = np.hstack((s, [a, r], s_)) 107 # replace the old memory with new memory 108 index = self.memory_counter % self.memory_size 109 self.memory[index, :] = transition 110 self.memory_counter += 1 111 112 def choose_action(self, observation): 113 # to have batch dimension when feed into tf placeholder 114 observation = observation[np.newaxis, :] 115 116 if np.random.uniform() < self.epsilon: 117 # forward feed the observation and get q value for every actions 118 actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation}) 119 action = np.argmax(actions_value) 120 else: 121 action = np.random.randint(0, self.n_actions) 122 return action 123 124 def learn(self): 125 # check to replace target parameters 126 if self.learn_step_counter % self.replace_target_iter == 0: 127 self.sess.run(self.target_replace_op) 128 print(‘\ntarget_params_replaced\n‘) 129 130 # sample batch memory from all memory 131 if self.memory_counter > self.memory_size: 132 sample_index = np.random.choice(self.memory_size, size=self.batch_size) 133 else: 134 sample_index = np.random.choice(self.memory_counter, size=self.batch_size) 135 batch_memory = self.memory[sample_index, :] 136 137 _, cost = self.sess.run( 138 [self._train_op, self.loss], 139 feed_dict={ 140 self.s: batch_memory[:, :self.n_features], 141 self.a: batch_memory[:, self.n_features], 142 self.r: batch_memory[:, self.n_features + 1], 143 self.s_: batch_memory[:, -self.n_features:], 144 }) 145 146 self.cost_his.append(cost) 147 148 # increasing epsilon 149 self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max 150 self.learn_step_counter += 1 151 152 def plot_cost(self): 153 import matplotlib.pyplot as plt 154 plt.plot(np.arange(len(self.cost_his)), self.cost_his) 155 plt.ylabel(‘Cost‘) 156 plt.xlabel(‘training steps‘) 157 plt.show() 158 159 if __name__ == ‘__main__‘: 160 DQN = DeepQNetwork(3,4, output_graph=True)
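The class above is only the learning "brain"; it is driven by an external loop that alternates choose_action, store_transition, and learn. Below is a minimal training-loop sketch (not part of the original post), assuming a hypothetical gym-style env object whose reset() returns a feature vector of length n_features, whose step(action) returns (observation_, reward, done), and which exposes n_actions / n_features attributes; adapt it to your own environment.

# Minimal training-loop sketch (assumption: not from the original post).
# `env` is assumed to be gym-style: env.reset() -> observation (np.ndarray),
# env.step(action) -> (observation_, reward, done).
def run_dqn(env, n_episodes=300):
    dqn = DeepQNetwork(n_actions=env.n_actions, n_features=env.n_features,
                       learning_rate=0.01, e_greedy_increment=0.001)
    step = 0
    for episode in range(n_episodes):
        observation = env.reset()
        while True:
            action = dqn.choose_action(observation)            # epsilon-greedy over q_eval
            observation_, reward, done = env.step(action)      # environment transition
            dqn.store_transition(observation, action, reward, observation_)
            if step > 200 and step % 5 == 0:                   # fill memory first, then learn every 5 steps
                dqn.learn()
            observation = observation_
            step += 1
            if done:
                break
    dqn.plot_cost()

The warm-up threshold and the learn-every-5-steps cadence are illustrative choices; the key point is that learn() is only meaningful once the replay memory contains enough transitions to sample a batch from.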
Original source: https://www.cnblogs.com/zle1992/p/10241794.html