
QLearning Python Implementation

Posted: 2019-01-08 23:16:03


 

  1 """
  2 This part of code is the Deep Q Network (DQN) brain.
  3 
  4 view the tensorboard picture about this DQN structure on: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/4-3-DQN3/#modification
  5 
  6 View more on my tutorial page: https://morvanzhou.github.io/tutorials/
  7 
  8 Using:
  9 Tensorflow: r1.2
 10 """
 11 
 12 import numpy as np
 13 import tensorflow as tf
 14 
 15 np.random.seed(1)
 16 tf.set_random_seed(1)
 17 
 18 
 19 # Deep Q Network off-policy
 20 class DeepQNetwork:
 21     def __init__(
 22             self,
 23             n_actions,
 24             n_features,
 25             learning_rate=0.01,
 26             reward_decay=0.9,
 27             e_greedy=0.9,
 28             replace_target_iter=300,
 29             memory_size=500,
 30             batch_size=32,
 31             e_greedy_increment=None,
 32             output_graph=False,
 33     ):
 34         self.n_actions = n_actions
 35         self.n_features = n_features
 36         self.lr = learning_rate
 37         self.gamma = reward_decay
 38         self.epsilon_max = e_greedy
 39         self.replace_target_iter = replace_target_iter
 40         self.memory_size = memory_size
 41         self.batch_size = batch_size
 42         self.epsilon_increment = e_greedy_increment
 43         self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
 44 
 45         # total learning step
 46         self.learn_step_counter = 0
 47 
 48         # initialize zero memory [s, a, r, s_]
 49         self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
 50 
 51         # consist of [target_net, evaluate_net]
 52         self._build_net()
 53 
 54         t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_net)
 55         e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=eval_net)
 56 
 57         with tf.variable_scope(hard_replacement):
 58             self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
 59 
 60         self.sess = tf.Session()
 61 
 62         if output_graph:
 63             # $ tensorboard --logdir=logs
 64             tf.summary.FileWriter("logs/", self.sess.graph)
 65 
 66         self.sess.run(tf.global_variables_initializer())
 67         self.cost_his = []
 68 
 69     def _build_net(self):
 70         # ------------------ all inputs ------------------------
 71         self.s = tf.placeholder(tf.float32, [None, self.n_features], name=s)  # input State
 72         self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name=s_)  # input Next State
 73         self.r = tf.placeholder(tf.float32, [None, ], name=r)  # input Reward
 74         self.a = tf.placeholder(tf.int32, [None, ], name=a)  # input Action
 75 
 76         w_initializer, b_initializer = tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)
 77 
 78         # ------------------ build evaluate_net ------------------
 79         with tf.variable_scope(eval_net):
 80             e1 = tf.layers.dense(self.s, 20, tf.nn.relu, kernel_initializer=w_initializer,
 81                                  bias_initializer=b_initializer, name=e1)
 82             self.q_eval = tf.layers.dense(e1, self.n_actions, kernel_initializer=w_initializer,
 83                                           bias_initializer=b_initializer, name=q)
 84 
 85         # ------------------ build target_net ------------------
 86         with tf.variable_scope(target_net):
 87             t1 = tf.layers.dense(self.s_, 20, tf.nn.relu, kernel_initializer=w_initializer,
 88                                  bias_initializer=b_initializer, name=t1)
 89             self.q_next = tf.layers.dense(t1, self.n_actions, kernel_initializer=w_initializer,
 90                                           bias_initializer=b_initializer, name=t2)
 91 
 92         with tf.variable_scope(q_target):
 93             q_target = self.r + self.gamma * tf.reduce_max(self.q_next, axis=1, name=Qmax_s_)    # shape=(None, )
 94             self.q_target = tf.stop_gradient(q_target)
 95         with tf.variable_scope(q_eval):
 96             a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
 97             self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices)    # shape=(None, )
 98         with tf.variable_scope(loss):
 99             self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval_wrt_a, name=TD_error))
100         with tf.variable_scope(train):
101             self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
102 
103     def store_transition(self, s, a, r, s_):
104         if not hasattr(self, memory_counter):
105             self.memory_counter = 0
106         transition = np.hstack((s, [a, r], s_))
107         # replace the old memory with new memory
108         index = self.memory_counter % self.memory_size
109         self.memory[index, :] = transition
110         self.memory_counter += 1
111 
112     def choose_action(self, observation):
113         # to have batch dimension when feed into tf placeholder
114         observation = observation[np.newaxis, :]
115 
116         if np.random.uniform() < self.epsilon:
117             # forward feed the observation and get q value for every actions
118             actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
119             action = np.argmax(actions_value)
120         else:
121             action = np.random.randint(0, self.n_actions)
122         return action
123 
124     def learn(self):
125         # check to replace target parameters
126         if self.learn_step_counter % self.replace_target_iter == 0:
127             self.sess.run(self.target_replace_op)
128             print(\ntarget_params_replaced\n)
129 
130         # sample batch memory from all memory
131         if self.memory_counter > self.memory_size:
132             sample_index = np.random.choice(self.memory_size, size=self.batch_size)
133         else:
134             sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
135         batch_memory = self.memory[sample_index, :]
136 
137         _, cost = self.sess.run(
138             [self._train_op, self.loss],
139             feed_dict={
140                 self.s: batch_memory[:, :self.n_features],
141                 self.a: batch_memory[:, self.n_features],
142                 self.r: batch_memory[:, self.n_features + 1],
143                 self.s_: batch_memory[:, -self.n_features:],
144             })
145 
146         self.cost_his.append(cost)
147 
148         # increasing epsilon
149         self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
150         self.learn_step_counter += 1
151 
152     def plot_cost(self):
153         import matplotlib.pyplot as plt
154         plt.plot(np.arange(len(self.cost_his)), self.cost_his)
155         plt.ylabel(Cost)
156         plt.xlabel(training steps)
157         plt.show()
158 
159 if __name__ == __main__:
160     DQN = DeepQNetwork(3,4, output_graph=True)
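
For context, this class is normally driven by an outer environment loop that feeds it transitions. Below is a minimal sketch of such a loop; the env object, its reset()/step() interface (returning next_observation, reward, done), and the warm-up/learning-interval numbers are assumptions for illustration only and are not part of the original post.

# Hypothetical training loop -- `env` and its reset()/step() interface are
# assumptions for illustration; adapt to whatever environment you use.
def run_training(env, RL, episodes=300):
    step = 0
    for episode in range(episodes):
        observation = env.reset()                          # start a new episode
        while True:
            action = RL.choose_action(observation)         # epsilon-greedy action from eval_net
            observation_, reward, done = env.step(action)  # assumed step() return values
            RL.store_transition(observation, action, reward, observation_)
            if (step > 200) and (step % 5 == 0):           # learn once the memory has warmed up
                RL.learn()
            observation = observation_
            step += 1
            if done:
                break

# Example usage (assuming 4 state features and 3 discrete actions):
# RL = DeepQNetwork(n_actions=3, n_features=4)
# run_training(env, RL)
# RL.plot_cost()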

 


Original post: https://www.cnblogs.com/zle1992/p/10241794.html
