Inside the Epoch function, the variable self.prev_pos is assigned the value of self.pos (self.prev_pos=self.pos). The value of self.prev_pos changes after the call to self.Movec, even though self.prev_pos is never updated in any function. Why does it change?
Here is the code -
import numpy as np
import random
import cv2
N=0
E=1
S=2
W=3
exit=1
dir=['N','E','S','W']
class RLMaze():
    def __init__(self,height=None,width=None,goal=None,nogoal=None,start=None,obstacles=None):
        #initialise coordinates and maze parameters
        self.height=height or 3
        self.width=width or 4
        self.goal=goal or [0,3]
        self.nogoal=nogoal or [1,3]
        self.obstacles=obstacles or [[1,1]]
        self.start=start or [2,0]
        self.pos=self.start
        self.maze=self.generate_maze()
        self.reward=self.maze
        self.prev_pos=None
        self.actions=[0,1,2,3]
        self.neighbours=[[-1,0],[0,1],[1,0],[0,-1]]
        #self.move=np.array([[0,0,1],[1,0,0],[0,1,1],[1,1,0],[1,1,2],[2,1,1],[1,2,2],[2,2,1],[2,2,3],[3,2,2],[2,3,3],[3,3,2],[3,3,0],[0,3,3],[3,0,0],[0,0,3]])
        #learning parameters
        self.movement_cost=-0.04
        self.alpha=0.8
        self.gamma=0.9
        self.epsilon=0.1
        e=5
        #learning variables
        self.policy=""
        self.maxpolicy=""
        self.total_reward=0
        self.iteration=1
        self.best=[""]
        self.move_reward=0
        self.exit=1
        self.reached=np.zeros([self.height,self.width])
        self.Q=np.zeros((self.height,self.width,len(self.actions)),dtype=np.float32)
        #image parameters
        self.pixelpercell=100
        self.pause=25
        self.img = np.zeros([self.pixelpercell*self.maze.shape[0],self.pixelpercell*self.maze.shape[1],3], np.uint8)
        self.img=self.generate_background()
    def generate_maze(self):
        maze=np.zeros([self.height,self.width])
        maze[self.goal[0],self.goal[1]]=1
        maze[self.nogoal[0],self.nogoal[1]]=-1
        for ob in self.obstacles:
            maze[ob[0],ob[1]]=-10
        return maze
    def generate_background(self):
        self.img[:,:,:]=255
        c=self.img.shape[0]
        for i in xrange(1,self.maze.shape[0]):
            a=c*i/self.maze.shape[0]
            cv2.line(self.img,(0,a),(self.img.shape[1],a),[0,0,0],2)
        c=self.img.shape[1]
        for i in xrange(1,self.maze.shape[1]):
            a=c*i/self.maze.shape[1]
            cv2.line(self.img,(a,0),(a,self.img.shape[0]),[0,0,0],2)
        for i in xrange(self.maze.shape[0]):
            for j in xrange(self.maze.shape[1]):
                spl=False
                if [i,j]==self.goal:
                    color=[0,255,0]
                    spl=True
                elif [i,j]==self.nogoal:
                    color=[0,0,255]
                    spl=True
                if [i,j] in self.obstacles:
                    color=[255,0,0]
                    spl=True
                if spl:
                    a=self.pixelpercell*i
                    c=self.pixelpercell*(i+1)
                    b=self.pixelpercell*j
                    d=self.pixelpercell*(j+1)
                    cv2.rectangle(self.img,(b,a),(d,c),color, thickness=-1)
                    #print(a,b,c,d)
        #cv2.imshow('Environment',self.img)
        #cv2.waitKey(0)
        return self.img
    def getQ(self,state,action):
        if state==self.goal or state==self.nogoal:
            return self.Q[state[0],state[1],0]
        else:
            return self.Q[state[0],state[1],action]
    def putQ(self,state,action,q):
        if state==self.goal or state==self.nogoal:
            self.Q[state[0],state[1],0]=q
        else:
            self.Q[state[0],state[1],action]=q
    def choose_action(self, state):
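        # epsilon-greedy selection: with probability epsilon take a random action,
        # otherwise take the action with the highest Q value (ties broken randomly)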
        if random.random() < self.epsilon:
            action = random.choice(self.actions)
            #print("randomaction",action)
        else:
            q = [self.getQ(state, a) for a in self.actions]
            maxQ = max(q)
            count = q.count(maxQ)
            if count > 1:
                best = [i for i in range(len(self.actions)) if q[i] == maxQ]
                i = random.choice(best)
            else:
                i = q.index(maxQ)
            action = self.actions[i]
            #print("best action",action)
        return action
    def render(self):
        self.env=self.generate_background()
        a=self.pixelpercell*self.pos[0]
        c=self.pixelpercell*(self.pos[0]+1)
        b=self.pixelpercell*self.pos[1]
        d=self.pixelpercell*(self.pos[1]+1)
        cv2.rectangle(self.env,(b+20,a+20),(d-20,c-20),[255,255,0], thickness=-1)
        for i in xrange(self.maze.shape[0]):
            for j in xrange(self.maze.shape[1]):
                c=self.pixelpercell*(i+0.54)
                d=self.pixelpercell*(j+0.28)
                for ac in self.actions:
                    a=c+self.pixelpercell*0.225*self.neighbours[ac][0]
                    b=d+self.pixelpercell*0.225*self.neighbours[ac][1]
                    cv2.putText( self.env,str(int(10000*self.Q[i][j][ac])/100.0)[0:-1],(int(b),int(a)), cv2.FONT_HERSHEY_PLAIN, 0.8,(0, 0, 0), 1 )
        cv2.imshow('Environment',self.env)
        cv2.waitKey(self.pause)
    def MoveDir(self,state,action):
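        # attempt to move one cell in the given direction; the move is ignored if it
        # would leave the grid or hit an obstacle. Returns an exit flag (0 once the
        # goal or nogoal cell is reached), the updated state and the step reward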
        h=state[0]+self.neighbours[action][0]
        w=state[1]+self.neighbours[action][1]
        if ((-1<h<self.height) and (-1<w<self.width)):
            if(self.maze[h][w]!=-10):
                state[0]=state[0]+self.neighbours[action][0]
                state[1]=state[1]+self.neighbours[action][1]
                #print(self.pos)
        if(state==self.goal) or (state==self.nogoal):
            exit=0
            print("exit")
        else:
            exit=1
        move_reward=(self.reward[state[0]][state[1]]+self.movement_cost)
        return exit,state,move_reward
    def Movec(self,state,act):
        c=""
        exit,state,move_reward=self.MoveDir(state,act)
        c+=str(dir[act])
        self.policy+=c
        self.policy+='|'
        self.reached[state[0]][state[1]]+=1
        return exit,state,move_reward
    def learn(self):
        maxr=self.total_reward
        while(1):
            self.Epoch()
            self.render()
            #print("total reward",self.total_reward)
            if ((self.total_reward) >= maxr):
                maxr=self.total_reward
                print("maxr",maxr)
                print("success with max reward and policy",self.policy)
    def Epoch(self):
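        # run one episode: choose an action, move, and update Q until the goal or nogoal cell is reached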
        self.reset()
        self.exit=1
        n=0
        self.policy=""
        while(self.exit==1):
            self.prev_pos=self.pos
            #print('assign',self.prev_pos)
            n+=1
            act=self.choose_action(self.pos)
            #print('chact',self.prev_pos)
            Q=self.getQ([self.prev_pos[0],self.prev_pos[1]],act)
            print('aftQ',self.prev_pos)
            self.exit,pos,self.move_reward=self.Movec(self.pos,act)
            print('aftmove',self.prev_pos)
            self.total_reward+=self.move_reward
            #print('end',self.prev_pos,self.pos)
            #print("move_reward",r)
            self.render()
            q = [self.getQ(pos, a) for a in self.actions]
            maxQ = max(q)
            #print(maxQ)
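            # Q-learning update: Q(s,a) <- Q(s,a) + alpha*(reward + gamma*max_a' Q(s',a') - Q(s,a))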
            Q=Q+self.alpha*(self.move_reward+(self.gamma*maxQ)-Q)
            self.putQ([self.prev_pos[0],self.prev_pos[1]],act,Q)
            self.pos=pos
            #print("total reward",self.total_reward)
        return exit
    def reset(self):
        self.total_reward=0
        #print(self.pos)
        self.pos=[self.start[0],self.start[1]]
        #print(self.policy)
        print("reset")
        self.reached=np.zeros([self.height,self.width])
        self.exit=1
rl=RLMaze()
rl.learn()
Output:
reset
('aftQ', [2, 0])
('aftmove', [1, 0])
('aftQ', [1, 0])
('aftmove', [2, 0])
('aftQ', [2, 0])
('aftmove', [2, 1])
('aftQ', [2, 1])
('aftmove', [2, 1])
('aftQ', [2, 1])
('aftmove', [2, 0])
('aftQ', [2, 0])
('aftmove', [1, 0])
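For reference, here is a minimal standalone sketch of the behaviour this may come down to (the variable names below are made up for illustration and are not part of the maze code): assigning one Python list to another name does not copy it, so both names point at the same list object, and an in-place change made through one name is visible through the other.

# hypothetical illustration, not part of the maze code
pos = [2, 0]
prev_pos = pos              # binds prev_pos to the SAME list object, no copy is made

def move_north(state):
    # mutates the list in place, similar to how MoveDir modifies its 'state' argument
    state[0] = state[0] - 1
    return state

move_north(pos)
print(prev_pos)             # [1, 0] -- changed although prev_pos was never reassigned

pos = [2, 0]
prev_pos = [pos[0], pos[1]] # an explicit copy keeps its own value
move_north(pos)
print(prev_pos)             # [2, 0]

Is this kind of aliasing between self.pos and self.prev_pos what produces the prints above, or is something else going on?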