I ran this code, but after about 10 seconds the session crashes because the code uses too much memory. What is causing this problem, and how can I fix it?
My hypothesis is that the PreProcess class is eating up all the RAM, or maybe it is the stochastic gradient descent algorithm? I honestly don't know.
[Note: I am using Google Colab with Keras and TensorFlow]
Here is the code:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import random
import gym
import time
import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib.animation import PillowWriter
#input_shape = (4,80,80)
class replay_buffer:
    def __init__(self, mem_size=25000, input_shape=(210,160,3)):
        self.mem_size = mem_size
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.state_memory = np.zeros((self.mem_size,*input_shape), dtype=np.float32)
        self.next_state_memory = np.zeros((self.mem_size,*input_shape), dtype=np.float32)
        self.terminal_state_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.mem = 0
    def store_transition(self, state, next_state, action, reward, terminal_state):
        index = self.mem % self.mem_size
        print("index: {}".format(index))
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.state_memory[index] = state
        self.next_state_memory[index] = next_state
        self.terminal_state_memory[index] = terminal_state
        self.mem += 1
    def sample_buffer(self):
        indexs = np.random.choice(range(len(self.terminal_state_memory)), size = 32)
        action_sample = np.array(self.action_memory[x] for x in indexs)
        reward_sample = np.array(self.reward_memory[x] for x in indexs)
        state_sample = np.array(self.state_memory[x] for x in indexs)
        next_state_sample = np.array(self.next_state_memory[x] for x in indexs)
        terminal_state_sample = np.array(self.terminal_state_memory[x] for x in indexs)
        return action_sample, reward_sample, state_sample, next_state_sample, terminal_state_sample
class dqn_network(tf.keras.Model):
    def __init__(self):
        super(dqn_network,self).__init__()
        self.input_layer = tf.keras.Input(shape=(84,84,4,))
        self.first_hidden_layer = tf.keras.layers.Conv2D(16,8, strides=4, activation="relu")
        self.second_hidden_layer = tf.keras.layers.Conv2D(32,4, strides=2, activation="relu")
        self.dense_layer = tf.keras.layers.Dense(256, activation="relu")
        self.output_layer = tf.keras.layers.Dense(4, activation="linear")
    def __call__(self):
        layer1 = self.first_hidden_layer(self.input_layer)
        layer2 = self.second_hidden_layer(layer1)
        layer3 = Flatten()(layer2)
        layer4 = self.dense_layer(layer3)
        layer5 = self.output_layer(layer4)
        model = tf.keras.Model(inputs=layer1, outputs=layer5)
        model.compile(optimizer= tf.keras.optimizers.Adam(lr=1e-3) , loss= tf.keras.losses.mean_squared_error)
        return model
class agent(object):
    def __init__(self, epsilon=1, max_epsilon=1, min_epsilon=0.1, update_target=10000,timestep=0, batch_size=32):
        super(agent, self).__init__()
        self.epsilon = epsilon
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.target_network = dqn_network()
        self.Q_network = dqn_network()
        self.update_target = update_target
        self.timestep = timestep
        self.experience_relay = replay_buffer()
        self.batch_size = batch_size
    def update_timestep(self, newtimestep):
        self.timestep = newtimestep
    def update_target_network(self):
        if self.timestep != 0 and self.update_target % self.timestep == 0:
            self.target_network.set_weights(self.Q_network.get_weights())
    def greedy_policy(self):
        if random.uniform(0,1) < self.epsilon:
            return np.random.choice(env.action_space.n)
        else:
            q_values = self.target_network.predict(state[np.newaxis])
            return np.argmax(q_values[0])
    def store_transition(self, state, next_state, action, reward, terminal_state):
        self.experience_relay.store_transition(state, next_state, action, reward, terminal_state)
    def annealing_epsilon(self):
        interval = self.max_epsilon - self.min_epsilon
        self.epsilon -= interval / 100000
        if self.epsilon < 0.1:
            self.epsilon = self.min_epsilon
    def training(self):
        if self.timestep % 4 == 0 and self.experience_relay.mem > self.batch_size:
            actions, rewards, states, next_states, dones = self.experience_relay.sample_buffer()
            print("next state: {}".format(next_states))
            np_states = np.expand_dims(next_states, axis=0)
            print("np_states: {}".format(np_states))
            #tf_states = tf.convert_to_tensor(next_states)
            self.update_target_network()
            next_q_value = self.Q_network.predict(next_states)
            q_targets = rewards + (1-dones)*gamma*np.max(next_q_value, axis = 1)
            mask = tf.one_hot(actions, env.action_space.n)
            with tf.GradientTape() as tape:
                total_q_value = dqn_network(states)
                q_values = tf.reduce_sum(mask*total_q_value, axis=1, keepdims=True)
                loss = tf.reduce_mean(tf.keras.losses.mean_squared_error(q_targets, q_values))
            grad = tape.gradient(loss, self.Q_network.trainable.variables)
            tf.keras.optimizers.Adam.apply_gradients(zip(grad, self.Q_network.trainable.variables))
class PreProcess(gym.ObservationWrapper):
    def __init__(self, env=None):
        super(PreProcess, self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0,high=255,shape=(84,84,1), dtype= np.uint8)
    def greyscale(self):
        return np.mean(self.observation_space, axis=2)
class model:
    def __init__(self):
        self.frame_buffer = []
    def add_img(self, img):
        self.frame_buffer.append(img)
    def create_gif(self, filepath=None): # here here here
        plt.figure(figsize=(self.frame_buffer[0].shape[1] / 72, self.frame_buffer[0].shape[0] / 72), dpi = 72)
        patch = plt.imshow(self.frame_buffer[0])
        plt.axis('off')
        def animate(i):
            patch.set_data(self.frame_buffer[i])
        ani = animation.FuncAnimation(plt.gcf(), animate, frames = len(self.frame_buffer))
        if filepath:
            writergif = animation.PillowWriter(fps=20)
            ani.save(filepath, writer = writergif)
            print("file saved")
if __name__ == "__main__":
    env = gym.make("BreakoutDeterministic-v4")
    PreProcess(env)
    dqn = agent()
    target_update = 10000
    MaxTimestep = 100000
    episode_num = 0
    frame_num = 0
    state = env.reset()
    while True:
        image_file = model()
        start = time.time()
        for timestep in range(MaxTimestep):
            frame_num +=1
            action = dqn.greedy_policy()
            dqn.annealing_epsilon()
            next_state, reward, done, info = env.step(action)
            dqn.update_timestep(timestep)
            dqn.store_transition(state, next_state, action, reward, done)
            img = env.render("rgb_array")
            image_file.add_img(img)
            if (target_update % frame_num) == 0:
                dqn.training()
            if done or (timestep == MaxTimestep-1):
                end = time.time()
                print("[episode: {}, time taken: {:.5f} sec, timestep: {}]".format(episode_num + 1 , end-start, timestep))
                if episode_num % 10 == 0:
                    image_file.create_gif(filepath= r"./drive/My Drive/GIF-breakout-v1/episode{}.gif")
                    print("[[episode: {}, time taken: {:.5f} sec, timestep: {}]]".format(episode_num + 1 , end-start, timestep))
                break
        break
        episode_num += 1
Thanks in advance :]
Answer 0 (score: 1)
You can try reducing the replay buffer size from 25000 to 250 to see whether that is where the problem lies.
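Some rough arithmetic on the defaults in your own code suggests why the buffer is the likely culprit: replay_buffer allocates state_memory and next_state_memory as float32 arrays of shape (25000, 210, 160, 3), i.e. about 25000 * 210 * 160 * 3 * 4 bytes ≈ 10 GB each, so roughly 20 GB for the pair, which is well beyond the roughly 12 GB of RAM a standard Colab runtime gives you. Besides shrinking mem_size, you could also store the preprocessed 84x84 grayscale frames (the shape your network and PreProcess wrapper already expect) as uint8 and only cast to float when you sample a mini-batch. Below is a minimal sketch of that idea, not your actual implementation; the class name CompactReplayBuffer, the single-frame (84, 84) layout, and the /255 scaling are assumptions on my part:

import numpy as np

class CompactReplayBuffer:
    """Hypothetical buffer storing uint8 84x84 frames: ~170 MB per state array
    for 25000 entries, versus ~10 GB per array of float32 210x160x3 frames."""
    def __init__(self, mem_size=25000, frame_shape=(84, 84)):
        self.mem_size = mem_size
        self.state_memory = np.zeros((mem_size, *frame_shape), dtype=np.uint8)
        self.next_state_memory = np.zeros((mem_size, *frame_shape), dtype=np.uint8)
        self.action_memory = np.zeros(mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(mem_size, dtype=np.float32)
        self.mem = 0

    def store_transition(self, state, next_state, action, reward, done):
        index = self.mem % self.mem_size  # overwrite the oldest slot once full
        self.state_memory[index] = state
        self.next_state_memory[index] = next_state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem += 1

    def sample_buffer(self, batch_size=32):
        # only sample from slots that have actually been filled
        max_index = min(self.mem, self.mem_size)
        idx = np.random.choice(max_index, size=batch_size)
        # cast just the sampled mini-batch to float32, never the whole buffer
        states = self.state_memory[idx].astype(np.float32) / 255.0
        next_states = self.next_state_memory[idx].astype(np.float32) / 255.0
        return (self.action_memory[idx], self.reward_memory[idx],
                states, next_states, self.terminal_memory[idx])

The key point is the dtype and frame size, not the exact class: keeping raw Atari RGB frames in float32 is what blows past Colab's RAM within seconds of allocating the arrays.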