Question

我基于python中的sugarscape模型实现了一个小型仿真。我在程序中有三个类，当我尝试运行Q学习算法时，我将模型收敛到单个状态，并且从未更改过，就像我在没有q学习算法的情况下运行模型时，状态总是不同的。

sugarscape_env.py

import gym
from gym import error, spaces, utils
from gym.utils import seeding
import logging
import numpy
import sys
import random
from six import StringIO
from agents import Agent
from IPython.display import Markdown, display
from pandas import *


numpy.set_printoptions(threshold=sys.maxsize)
logger = logging.getLogger(__name__)
ACTIONS = ["N", "E", "S", "W", "EAT"]
list_of_agents = []
list_of_agents_shuffled = {}
number_of_agents_in_list = 0
size_of_environment = 0
agents_dead = 0
initial_number_of_agents = 0
P = {state: {action: []
    for action in range(5)} for state in range(2500)}# 50 * 50 = 2500 positions on the map any agent can be in, then 5 actions that can occur so 2500 * 5 = 12,500 states/actions
state = None
new_state = None
new_row = None
new_col = None
reward = None
done = None
action_performed = None

random.seed(9001)

class SugarscapeEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self):
        super(SugarscapeEnv, self).__init__()
        self.action_space = spaces.Discrete(5) #Number of applicable actions
        self.observation_space = spaces.Discrete(50 * 50) # state space on 50 by 50 grid
        self.current_step = 0



    def step(self, action):
        global reward, done, state
        """

        Parameters
        ----------
        action :

        Returns
        -------
        ob, reward, episode_over, info : tuple
            ob (object) :
                an environment-specific object representing your observation of
                the environment.
            reward (float) :
                amount of reward achieved by the previous action. The scale
                varies between environments, but the goal is always to increase
                your total reward.
            episode_over (bool) :
                whether it's time to reset the environment again. Most (but not
                all) tasks are divided up into well-defined episodes, and done
                being True indicates the episode has terminated. (For example,
                perhaps the pole tipped too far, or you lost your last life.)
            info (dict) :
                 diagnostic information useful for debugging. It can sometimes
                 be useful for learning (for example, it might contain the raw
                 probabilities behind the environment's last state change).
                 However, official evaluations of your agent are not allowed to
                 use this for learning.
        """
        self._take_action(action) # Perform one action (N, E, S or W)
        #self._agent_s_wealth() # Return agents sugar wealth and information
        self._regeneration()
        # The agents_die method doesn't work properly due to list indexing, need to work on that.
        #self._agents_die() # Have any agents died? If so replace the dead ones with new ones.
        self.current_step += 1
        #self.status = self._get_status() # Are all agents still alive or have they all died?
        #episode_over = self.status == 'ALL AGENTS DEAD' # Have all the agents died?
        return state, reward, done, {} # Return the ob, reward, episode_over and {}


    def _regeneration(self):
        global size_of_environment
        random_sugar = random.randrange(0, 3)
        """
            1. Iterate over all 0 sugar cells of the environment
            2. change the 0 to a random number between 0 - 5 (so at least some sugar is created)
        """
        for x in range(size_of_environment):
            for y in range(size_of_environment):
                if(self.environment[x, y] == 0):
                    self.environment[x, y] = random_sugar


    def _get_P(self):
        global P
        return P


    def _take_action(self, action):
        """
        One action is performed if action is N then agent will consider moving North
        if the sugar in the north cell (distance measured by vision of agent) is greater
        than or equal all other moves W, E or S. If moving North is not lucractive enough
        then agent will randomly move to the next highest paying cell.
        """
        global list_of_agents, ACTIONS, list_of_agents_shuffled, number_of_agents_in_list, size_of_environment, P, state, new_row, new_col, reward, done, action_performed, new_state
        agents_iteration = 0


        #while (number_of_agents != 10): #CHANGE TO 250
        for x in range(size_of_environment):
            for y in range(size_of_environment):
                #while number_of_agents in range(10):
                # FOR EACH CELL, CHECK IF AN AGENT OUT OF THE 250 IS STANDING IN THAT CELL.

                if agents_iteration < number_of_agents_in_list:
                    if(self.environment[x, y] == "\033[1mX\033[0m" and list_of_agents_shuffled[agents_iteration].get_ID() == agents_iteration):

                        #print(f"agend ID: {list_of_agents_shuffled[agents_iteration].get_ID()} and iteration {agents_iteration}")
                        #current_cell_sugar = self.environment[x, y]

                        #DEFAULTS
                        state = self.encode(x, y)
                        new_row = x
                        new_col = y
                        self._agents_die()
                        reward = self._get_reward()
                        self.status = self._get_status()
                        done = self.status == 'ALL AGENTS DEAD'

                        # Once the agent has been identified in the environment we set the applicable moves and vision variables
                        vision_of_agent = list_of_agents_shuffled[agents_iteration].get_vision()
                        move_south = self.environment[(x - vision_of_agent) % size_of_environment, y]
                        move_north = self.environment[(x + vision_of_agent) % size_of_environment, y]
                        move_east = self.environment[x, (y + vision_of_agent) % size_of_environment]
                        move_west = self.environment[x, (y - vision_of_agent) % size_of_environment]

                        # If moving south, north, east or west means coming into contact with another agent
                        # Set that locations sugar to 0
                        if(isinstance(self.environment[(x - vision_of_agent) % size_of_environment, y], str)):
                            move_south = int(0)
                        if(isinstance(self.environment[(x + vision_of_agent) % size_of_environment, y], str)):
                            move_north = int(0)
                        if(isinstance(self.environment[x, (y + vision_of_agent) % size_of_environment], str)):
                            move_east = int(0)
                        if(isinstance(self.environment[x, (y - vision_of_agent) % size_of_environment], str)):
                            move_west = int(0)

                        #print(move_north, move_east, move_south, move_west)


                        # MOVE UP (N)
                        if(action == ACTIONS[0]):
                            if((move_north >= move_south) and
                                (move_north >= move_east) and
                                (move_north >= move_west)):
                                # AGENT COLLECTS SUGAR.
                                list_of_agents_shuffled[agents_iteration].collect_sugar(move_north)
                                # CALCULATE AGENT SUGAR HEALTH
                                list_of_agents_shuffled[agents_iteration].calculate_s_wealth()
                                # SUGAR AT LOCATION NOW SET TO 0
                                self.environment[(x + vision_of_agent) % size_of_environment, y] = 0
                                self.environment_duplicate[(x + vision_of_agent) % size_of_environment, y] = 0
                                #MOVE AGENT TO NEW LOCATION.
                                self.environment[(x + vision_of_agent) % size_of_environment, y] = list_of_agents_shuffled[agents_iteration].get_visual()
                                self.environment_duplicate[(x + vision_of_agent) % size_of_environment, y] = list_of_agents_shuffled[agents_iteration]
                                # SET PREVIOUS POSITION CELL TO 0 sugar
                                self.environment[x, y] = 0
                                self.environment_duplicate[x, y] = 0
                                # ADD ACTIONS TO ENV ACT
                                action_performed = 0
                                new_row = (x + vision_of_agent) % size_of_environment
                                new_col = y


                            else:
                                self._random_move(agents_iteration, move_south, move_east, move_north, move_west, x, y, vision_of_agent)


                        # MOVE DOWN (S)
                        if(action == ACTIONS[2]):
                            if((move_south >= move_north) and
                                (move_south >= move_east) and
                                (move_south >= move_west)):
                                # AGENT COLLECTS SUGAR.
                                list_of_agents_shuffled[agents_iteration].collect_sugar(move_south)
                                # CALCULATE AGENT SUGAR HEALTH
                                list_of_agents_shuffled[agents_iteration].calculate_s_wealth()
                                # SUGAR AT LOCATION NOW SET TO 0
                                self.environment[(x - vision_of_agent) % size_of_environment, y] = 0
                                self.environment_duplicate[(x - vision_of_agent) % size_of_environment, y] = 0
                                #MOVE AGENT TO NEW LOCATION.
                                self.environment[(x - vision_of_agent) % size_of_environment, y] = list_of_agents_shuffled[agents_iteration].get_visual()
                                self.environment_duplicate[(x - vision_of_agent) % size_of_environment, y] = list_of_agents_shuffled[agents_iteration]
                                # SET PREVIOUS POSITION CELL TO 0 sugar
                                self.environment[x, y] = 0
                                self.environment_duplicate[x, y] = 0
                                # ADD ACTIONS TO ENV ACT
                                action_performed = 2
                                new_row = (x - vision_of_agent) % size_of_environment
                                new_col = y
                            else:
                                self._random_move(agents_iteration, move_south, move_east, move_north, move_west, x, y, vision_of_agent)


                        # MOVE LEFT (W)
                        if(action == ACTIONS[3]):
                            if((move_west >= move_south) and
                                (move_west >= move_east) and
                                (move_west >= move_north)):
                                # AGENT COLLECTS SUGAR.
                                list_of_agents_shuffled[agents_iteration].collect_sugar(move_west)
                                # CALCULATE AGENT SUGAR HEALTH
                                list_of_agents_shuffled[agents_iteration].calculate_s_wealth()
                                # SUGAR AT LOCATION NOW SET TO 0
                                self.environment[x, (y - vision_of_agent) % size_of_environment] = 0
                                self.environment_duplicate[x, (y - vision_of_agent) % size_of_environment] = 0
                                #MOVE AGENT TO NEW LOCATION.
                                self.environment[x, (y - vision_of_agent) % size_of_environment] = list_of_agents_shuffled[agents_iteration].get_visual()
                                self.environment_duplicate[x, (y - vision_of_agent) % size_of_environment] = list_of_agents_shuffled[agents_iteration]
                                # SET PREVIOUS POSITION CELL TO 0 sugar
                                self.environment[x, y] = 0
                                self.environment_duplicate[x, y] = 0
                                # ADD ACTIONS TO ENV ACT
                                action_performed = 3
                                new_row = x
                                new_col = (y - vision_of_agent) % size_of_environment


                            else:
                                self._random_move(agents_iteration, move_south, move_east, move_north, move_west, x, y, vision_of_agent)


                        # MOVE RIGHT (E)
                        if(action == ACTIONS[1]):
                            if((move_east >= move_south) or
                                (move_east >= move_west) or
                                (move_east >= move_north)):
                                # AGENT COLLECTS SUGAR.
                                list_of_agents_shuffled[agents_iteration].collect_sugar(move_east)
                                # CALCULATE AGENT SUGAR HEALTH
                                list_of_agents_shuffled[agents_iteration].calculate_s_wealth()
                                # SUGAR AT LOCATION NOW SET TO 0
                                self.environment[x, (y + vision_of_agent) % size_of_environment] = 0
                                self.environment_duplicate[x, (y + vision_of_agent) % size_of_environment] = 0
                                #MOVE AGENT TO NEW LOCATION.
                                self.environment[x, (y + vision_of_agent) % size_of_environment] = list_of_agents_shuffled[agents_iteration].get_visual()
                                self.environment_duplicate[x, (y + vision_of_agent) % size_of_environment] = list_of_agents_shuffled[agents_iteration]
                                # SET PREVIOUS POSITION CELL TO 0 sugar
                                self.environment[x, y] = 0
                                self.environment_duplicate[x, y] = 0
                                # ADD ACTIONS TO ENV ACT
                                action_performed = 1
                                new_row = x
                                new_col = (y + vision_of_agent) % size_of_environment
                            else:
                                self._random_move(agents_iteration, move_south, move_east, move_north, move_west, x, y, vision_of_agent)

                        new_state = self.encode(new_row, new_col)
                        P[state][action_performed].append(
                            (1.0, new_state, reward, done))

                        agents_iteration = agents_iteration + 1


    # state = env.get_state()
    def get_state(self):
        global state
        return state


    def _random_move(self, agents_iteration, move_south, move_east, move_north, move_west, x, y, vision_of_agent):
        global list_of_agents, ACTIONS, list_of_agents_shuffled, size_of_environment, P, state, new_row, new_col, reward, done, action_performed, new_state
        random_move = random.randrange(0, 3)


        if random_move == 0:
            list_of_agents_shuffled[agents_iteration].collect_sugar(move_north)
            list_of_agents_shuffled[agents_iteration].calculate_s_wealth()
            self.environment[(x + vision_of_agent) % size_of_environment, y] = 0
            self.environment_duplicate[(x + vision_of_agent) % size_of_environment, y] = 0
            self.environment[(x + vision_of_agent) % size_of_environment, y] = list_of_agents_shuffled[agents_iteration].get_visual()
            self.environment_duplicate[(x + vision_of_agent) % size_of_environment, y] = list_of_agents_shuffled[agents_iteration]
            self.environment[x, y] = 0
            self.environment_duplicate[x, y] = 0
            action_performed = 0
            new_row = (x + vision_of_agent) % size_of_environment
            new_col = y


        elif random_move == 1:
            list_of_agents_shuffled[agents_iteration].collect_sugar(move_east)
            list_of_agents_shuffled[agents_iteration].calculate_s_wealth()
            self.environment[x, (y + vision_of_agent) % size_of_environment] = 0
            self.environment_duplicate[x, (y + vision_of_agent) % size_of_environment] = 0
            self.environment[x, (y + vision_of_agent) % size_of_environment] = list_of_agents_shuffled[agents_iteration].get_visual()
            self.environment_duplicate[x, (y + vision_of_agent) % size_of_environment] = list_of_agents_shuffled[agents_iteration]
            self.environment[x, y] = 0
            self.environment_duplicate[x, y] = 0
            action_performed = 1
            new_row = x
            new_col = (y + vision_of_agent) % size_of_environment


        elif random_move == 2:
            list_of_agents_shuffled[agents_iteration].collect_sugar(move_south)
            list_of_agents_shuffled[agents_iteration].calculate_s_wealth()
            self.environment[(x - vision_of_agent) % size_of_environment, y] = 0
            self.environment_duplicate[(x - vision_of_agent) % size_of_environment, y] = 0
            self.environment[(x - vision_of_agent) % size_of_environment, y] = list_of_agents_shuffled[agents_iteration].get_visual()
            self.environment_duplicate[(x - vision_of_agent) % size_of_environment, y] = list_of_agents_shuffled[agents_iteration]
            self.environment[x, y] = 0
            self.environment_duplicate[x, y] = 0
            action_performed = 2
            new_row = (x - vision_of_agent) % size_of_environment
            new_col = y


        else:
            list_of_agents_shuffled[agents_iteration].collect_sugar(move_west)
            list_of_agents_shuffled[agents_iteration].calculate_s_wealth()
            self.environment[x, (y - vision_of_agent) % size_of_environment] = 0
            self.environment_duplicate[x, (y - vision_of_agent) % size_of_environment] = 0
            self.environment[x, (y - vision_of_agent) % size_of_environment] = list_of_agents_shuffled[agents_iteration].get_visual()
            self.environment_duplicate[x, (y - vision_of_agent) % size_of_environment] = list_of_agents_shuffled[agents_iteration]
            self.environment[x, y] = 0
            self.environment_duplicate[x, y] = 0
            action_performed = 3
            new_row = x
            new_col = (y - vision_of_agent) % size_of_environment


        new_state = self.encode(new_row, new_col)
        P[state][action_performed].append(
            (1.0, new_state, reward, done))


    # 50 * 50
    def encode(self, agent_row, agent_column):
        i = agent_row
        i *= 50
        i = agent_column
        i *= 50
        return i


    def decode(self, i):
        out = []
        out.append(i % 50)
        i = i // 50
        out.append(i % 50)
        i = i // 50
        out.append(i)
        assert 0 <= i < 50
        return reversed(out)


    def _get_reward(self):
        """
        If all agents have positive s_wealth then reward 1 else 0
        therefore, the Q-learning algorithm will try learn how each agent can
        move to have positive s_wealth each iteration.
        """

        global agents_dead, number_of_agents_in_list

        if (agents_dead == 0):
            return 10
        elif(agents_dead < (number_of_agents_in_list / 2)):
            return 5
        else:
            return -1


    def reset(self, number_of_agents_in_list_local, size_of_environment_local):
        global number_of_agents_in_list, list_of_agents, list_of_agents_shuffled, size_of_environment, observation_space_calculated, initial_number_of_agents
        number_of_agents_in_list = number_of_agents_in_list_local
        size_of_environment = size_of_environment_local
        initial_number_of_agents = number_of_agents_in_list_local
        observation_space_calculated = size_of_environment_local
        number_of_agents = 0
        # Reset the state of the environment to an initial state
        self.growth_rate = 1
        self.environment = numpy.empty((size_of_environment,size_of_environment), dtype=numpy.object)
        self.environment_duplicate = numpy.empty((size_of_environment, size_of_environment), dtype=numpy.object)


        # Creating 250 agent objects and putting them into the list_of_agents array.
        for i in range(number_of_agents_in_list): #CHANGE TO 250
            list_of_agents.append(Agent(i))


        # Looping though the environment and adding random values between 0 and 4
        # This will be sugar levels.
        for i in range(size_of_environment):
            for j in range(size_of_environment):
                self.environment[i, j] = random.randrange(0, 4)


        # Looping 250 times over the environment and randomly placing agents on 0 sugar cells.
        while(number_of_agents != number_of_agents_in_list): #CHANGE TO 250
            x = random.randrange(size_of_environment)
            y = random.randrange(size_of_environment)
            if(self.environment[x, y] == 0):
                self.environment[x, y] = list_of_agents[number_of_agents].get_visual()
                self.environment_duplicate[x, y] = list_of_agents[number_of_agents]
                # Added the agent objects have been placed down randomly onto the environment from first to last.
                list_of_agents_shuffled[number_of_agents] = list_of_agents[number_of_agents]
                number_of_agents = number_of_agents + 1


    def _get_status(self):
        global size_of_environment
        """
            count the environment cells. If there are no X's on the environment
            then count these cells, if the total number of cells in the environment
            is the max size of the cells then all agents have died, else some agents
            are still alive.
        """
        counter = 0
        for i in range(size_of_environment):
            for j in range(size_of_environment):
                if(self.environment[i, j] != "\033[1mX\033[0m"):
                    counter = counter + 1

        if(counter == (size_of_environment * size_of_environment)):
            return 'ALL AGENTS DEAD'
        else:
            return 'SOME AGENTS STILL ALIVE'


    def render(self, mode='human', close=False):
        """
            Prints the state of the environment 2D grid
        """

        return('\n'.join([''.join(['{:1}'.format(item) for item in row]) for row in self.environment]))


    def _agent_s_wealth(self):
        """
            Returns the agents information each iteration of the simulation. ID, SUGAR WEALTH and AGE
        """
        for i in range(number_of_agents_in_list):
            print("Agent %s is of age %s and has sugar wealth %s" % (list_of_agents_shuffled[i].get_ID(),list_of_agents_shuffled[i].get_age(), list_of_agents_shuffled[i].get_s_wealth()))


    def _agents_die(self):
        """
            total_simulation_runs increments by 1 each iteration of the simulation
            when the total_simulation_runs == agents.age then agent dies and
            a new agent appears in a random location on the environment.
            number_of_agents_in_list: the number of agents created in the environment originally.
            agent_to_die = the agent whose age is == the frame number
            agent_dead = boolean if agent has died.
        """
        agent_to_die = None
        agent_dead = False
        global number_of_agents_in_list, size_of_environment, agents_dead


        # Remove the agents from the dictionary
        for i in range(number_of_agents_in_list):
            if (list_of_agents_shuffled[i].get_age() == self.current_step):

                """Remove the agent from the list of agents"""
                agent_to_die = list_of_agents_shuffled[i].get_ID()

                del list_of_agents_shuffled[i]
                key_value_of_agent_dead_in_dictionary = i
                # An agent is being deleted from the environment.
                agent_dead = True
                number_of_agents_in_list = number_of_agents_in_list - 1


        if(agent_dead == True):
            agents_dead += 1
            # Remove the agent from the list.
            for i in range(number_of_agents_in_list):
                if agent_to_die == list_of_agents[i].get_ID():
                    del list_of_agents[i]

            # Create a new agent and add it to the list_of_agents list.
            list_of_agents.append(Agent(key_value_of_agent_dead_in_dictionary))
            # Add new agent to dictionary.
            list_of_agents_shuffled[key_value_of_agent_dead_in_dictionary] = list_of_agents[len(list_of_agents) - 1]


            #print(f"AGENT AGE ADDED TO DICTIONARY: {list_of_agents_shuffled[key_value_of_agent_dead_in_dictionary].get_age()}")
            # Replace the agent in the Environment with the new agent.
            for x in range(size_of_environment):
                for y in range(size_of_environment):
                    if(self.environment[x, y] == "\033[1mX\033[0m" and self.environment_duplicate[x, y].get_ID() == agent_to_die):
                        # Add new agent to environment where old agent died.
                        self.environment[x, y] = list_of_agents[number_of_agents_in_list].get_visual()
                        self.environment_duplicate[x, y] = list_of_agents[number_of_agents_in_list]


            number_of_agents_in_list += 1

agents.py

import random
from IPython.display import Markdown, display

random.seed(9001)

class Agent:

    def __init__(self, ID):

        self.vision = random.randrange(1, 6)
        self.metabolic_rate = random.randrange(1, 4)
        self.age = random.randrange(500)
        self.s_wealth = random.randrange(5, 25)
        self.sugar_collected = 0
        self.ID = ID
        self.visual = "\033[1mX\033[0m"

    def get_vision(self):
        return self.vision

    def get_visual(self):
        return self.visual

    def get_metabolic_rate(self):
        return self.metabolic_rate

    def get_age(self):
        return self.age

    def get_s_wealth(self):
        return self.s_wealth

    def calculate_s_wealth(self):
        self.s_wealth = self.sugar_collected - self.metabolic_rate

    def collect_sugar(self, environment_cell_sugar):
        self.sugar_collected = self.sugar_collected + environment_cell_sugar

    def get_ID(self):
        return self.ID

最后是main.py

from sugarscape_env import SugarscapeEnv
import random
from IPython.display import clear_output
import numpy as np



alpha = 0.1
gamma = 0.6
epsilon = 0.1

all_epochs = []
all_penalties = []
ACTIONS = ['N', 'E', 'S', 'W']


"""
Example scenario run of model
"""
x = SugarscapeEnv()
#x = SugarscapeEnv()
#x.reset(10, 50) # 50 by 50 grid and 10 agents.
q_table = np.zeros([x.observation_space.n, x.action_space.n])

# For plotting metrics
all_epochs = []
all_penalties = []




for j in range(1, 100001):

    x.reset(10, 50) # 50 by 50 grid and 10 agents.
    state = x.get_state()
    print("OLD STATE: ", state)
    epochs, penalties, reward, = 0, 0, 0
    done = False

    for i in range(100):
        if random.uniform(0, 1) < epsilon:
            action = random.randrange(4)
        else:
            action = np.argmax(q_table[state])

        next_state, reward, done, info = x.step(ACTIONS[action])
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

        q_table[state, action] = new_value

        if reward == -1:
            penalties += 1

        state = next_state
        print("NEW STATE: ", state)
        epochs += 1


print("Training finished.\n")

如果实现上述内容并运行main.py，则模型将运行，但是，随着时间的流逝，将出现以下输出：新州：850 新州：1800 新州：300 新州：2300 旧州：2300 新州：2300 新州：2300 新州：2300 新州：2300 新州：2300 新州：2300 新州：2300 新州：2300 新州：2300 新州：2300 新州：2300 新州：2300 新州：2300 新州：2300 新州：2300 新州：2300

状态改变了，但是在某些时候它们停止了改变，我似乎无法弄清楚为什么。

Answer 1

问题多于答案。我不知道问题出在哪里，但这是代码中不清楚的地方，如果纠正，可能会把您带到期望的结果：

在_regeneration中是否打算用等量的糖播种整个田地？您随机选择一次值并将其分配给整个字段。看起来很奇怪，因为所有瓷砖现在对代理商都具有相同的吸引力。
奇怪的是动作可以被某些逻辑覆盖：不要执行动作，因为它不是最好的动作，所以请随机执行
state只是一列代理乘以50，这是什么意思？它看起来根本与糖分布无关。
我看不到q_table的新生成字段正在更新；看起来逻辑如下：以10％的机会随机移动，以90％的机会查看之前是否遇到过new_state，然后选择最佳的动作。这里有多个问题：状态不代表田地和糖，不考虑代理商的新职位。

实际上，我正在努力了解这件事是如何工作的。

PS 是受到Primer（https://www.youtube.com/channel/UCKzJFdi57J53Vr_BkTfN3uQ）视频的启发吗？

Python Q学习实施不起作用

1 个答案: