我正在为一个自动驾驶出租车(smartcab)程序编写以下代码,但我的 choose_action 函数有问题。在执行下面这一步时,智能体应该从具有最高 Q 值的动作中随机选择一个:
"else: action = maxQaction"
但按照我现在的写法,每次都会选中同一个动作。有人能建议一下,如何在 Q 值并列最高的多个动作中随机选择一个吗?也许我可以使用列表。
Code:
import random
import math
from environment import Agent, Environment
from planner import RoutePlanner
from simulator import Simulator
import itertools
class LearningAgent(Agent):
    """An agent that learns to drive in the Smartcab world.

    The agent keeps a Q-table mapping each state tuple to a dictionary of
    ``{action: Q-value}`` and picks actions epsilon-greedily while learning.
    """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        """Set up the planner, learning parameters and a pre-filled Q-table.

        Args:
            env: The environment the agent is placed in.
            learning: Whether the agent is expected to learn.
            epsilon: Random exploration factor.
            alpha: Learning factor (also used as the epsilon decay rate in
                ``reset``).
        """
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Parameters of the learning agent
        self.learning = learning
        self.Q = dict()  # state tuple -> {action: Q-value}
        self.epsilon = epsilon
        self.alpha = alpha

        # Possible values of each state feature, in the same order as the
        # tuple produced by build_state().
        self.states = [
            ['red', 'green'],                     # light
            ['left', 'right', 'forward', None],   # vehicle on the left
            ['left', 'right', 'forward', None],   # vehicle on the right
            ['left', 'right', 'forward', None],   # oncoming vehicle
            ['left', 'right', 'forward'],         # waypoint
        ]
        self.x = 0  # Number of training trials started so far (see reset)
        random.seed(42)  # Fixed seed so runs are repeatable

        # Template row of the Q-table: every valid action starts at 0.0.
        self.q_maker = {action: 0.0 for action in self.valid_actions}
        # Pre-populate the Q-table with every combination of feature values.
        for prod_state in itertools.product(*self.states):
            self.Q[prod_state] = self.q_maker.copy()

    def reset(self, destination=None, testing=False):
        """Prepare the agent for a new trial.

        Called at the beginning of each trial. When ``testing`` is True,
        epsilon and alpha are set to 0; otherwise the trial counter is
        advanced and epsilon decays as ``exp(-alpha * trial_count)``.
        """
        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            self.x += 1
            self.epsilon = math.exp(-self.alpha * self.x)

    def build_state(self):
        """Build the state tuple from the environment's current data.

        Returns:
            A tuple ``(light, left, right, oncoming, waypoint)``.
        """
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)  # Intersection light and traffic
        # Queried as in the original code; not used in the state tuple below.
        deadline = self.env.get_deadline(self)  # noqa: F841

        return (inputs['light'], inputs['left'], inputs['right'],
                inputs['oncoming'], waypoint)

    def get_maxQ(self, state):
        """Return the maximum Q-value over all actions for ``state``.

        Raises:
            KeyError: If ``state`` is not in the Q-table.
        """
        return max(self.Q[state].values())

    def createQ(self, state):
        """Add ``state`` to the Q-table with all Q-values at 0.0 if missing.

        Does nothing when the agent is not learning.
        """
        if self.learning and state not in self.Q:
            self.Q[state] = self.q_maker.copy()

    def choose_action(self, state):
        """Choose the action to take in ``state``.

        When not learning, or with probability ``epsilon`` while learning, a
        random valid action is returned. Otherwise one of the actions whose
        Q-value equals the maximum for ``state`` is returned, chosen uniformly
        at random so ties are not always resolved the same way.
        """
        # Set the agent state
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        # Explore. The Q-table is deliberately not consulted here, so a state
        # missing from it cannot raise KeyError when the agent is not learning.
        if not self.learning or random.random() < self.epsilon:
            return random.choice(self.valid_actions)

        # Exploit: collect every action tied for the highest Q-value. The
        # maximum is one of the stored values, so comparing with == is exact.
        max_q = self.get_maxQ(state)
        best_actions = [act for act, q_value in self.Q[state].items()
                        if q_value == max_q]
        return random.choice(best_actions)
答案 0 :(得分:1)
# Body fragment for choose_action(self, state); the result is left in `action`.
# Explore: when not learning, or with probability epsilon, pick any valid action.
if not self.learning or random.random() < self.epsilon:
    action = random.choice(self.valid_actions)
else:
    # Exploit: find the highest Q-value, then collect every action that
    # reaches it so that ties are broken randomly instead of always
    # returning the same action.
    maxQ = self.get_maxQ(state)
    best_actions = [act for act, q_value in self.Q[state].items()
                    if q_value == maxQ]
    action = random.choice(best_actions)  # choose one randomly