Java到Python代码不起作用

时间:2014-03-20 16:30:13

标签: python machine-learning q-learning

我正在尝试将Java代码转换为Python代码,到目前为止我已经完成了。 Java代码有效但Python代码不起作用。请帮帮我。

Python代码

import random


class QLearning():
    alpha = 0.1
    gamma = 0.9

    state_a = 0
    state_b = 1
    state_c = 2
    state_d = 3
    state_e = 4
    state_f = 5

    states_count = 6

    states = [state_a, state_b, state_c, state_d, state_e, state_f]

    R = [[0 for x in range(states_count)] for x in range(states_count)]
    Q = [[0 for x in range(states_count)] for x in range(states_count)]

    action_from_a = [state_b, state_d]
    action_from_b = [state_a, state_c, state_e]
    action_from_c = [state_c]
    action_from_d = [state_a, state_e]
    action_from_e = [state_b, state_d, state_f]
    action_from_f = [state_c, state_e]

    actions = [action_from_a, action_from_b, action_from_c, action_from_d, action_from_e, action_from_f]

    state_names = ["A","B","C","D","E","F"]

    def __init__(self):
        self.R[self.state_b][self.state_c] = 100
        self.R[self.state_f][self.state_c] = 100

    def run(self):
        for i in range(1000):
            state = random.randrange(self.states_count)
            while(state != self.state_c):
                actions_from_state = self.actions[state]
                index = random.randrange(len(actions_from_state))
                action = actions_from_state[index]
                next_state = action
                q = self.Q_Value(state, action)
                max_Q = self.max_q(next_state)
                r = self.R_Value(state, action)

                value = q + self.alpha * (r + self.gamma * max_Q - q)
                self.set_q(state, action, value)
                state = next_state

    def max_q(self, s):
        self.run().actions_from_state = self.actions[s]
        max_value = 5
        for i in range(len(self.run().actions_from_state)):
            self.run().next_state = self.run().actions_from_state[i]
            self.run().value = self.Q[s][self.run().next_state]

            if self.run().value > max_value:
                max_value = self.run().value
        return max_value

    def policy(self, state):
        self.run().actions_from_state = self.actions[state]
        max_value = 5
        policy_goto_state = state
        for i in range(len(self.run().actions_from_state)):
            self.run().next_state = self.run().actions_from_state[i]
            self.run().value = self.Q[state][self.run().next_state]

            if self.run().value > max_value:
                max_value = self.run().value
                policy_goto_state = self.run().next_state
        return policy_goto_state

    def Q_Value(self, s,a):
        return self.Q[s][a]

    def set_q(self, s, a, value):
        self.Q[s][a] = value

    def R_Value(self, s, a):
        return self.R[s][a]

    def print_result(self):
        print("Print Result")
        for i in range(len(self.Q)):
            print("Out From (0)".format(self.state_names[i]))
            for j in range(len(self.Q[i])):
                print(self.Q[i][j])

    def show_policy(self):
        print("Show Policy")
        for i in range(len(self.states)):
            fro = self.states[i]
            to = self.policy(fro)
            print("From {0} goto {1}".format(self.state_names[fro], self.state_names[to]))

obj = QLearning()
obj.run()
obj.print_result()
obj.show_policy()

Java代码

import java.text.DecimalFormat;
import java.util.Random;

public class Qlearning {
    final DecimalFormat df = new DecimalFormat("#.##");

    // path finding
    final double alpha = 0.1;
    final double gamma = 0.9;


// states A,B,C,D,E,F
// e.g. from A we can go to B or D
// from C we can only go to C
// C is goal state, reward 100 when B->C or F->C
//
// _______
// |A|B|C|
// |_____|
// |D|E|F|
// |_____|
//

    final int stateA = 0;
    final int stateB = 1;
    final int stateC = 2;
    final int stateD = 3;
    final int stateE = 4;
    final int stateF = 5;

    final int statesCount = 6;
    final int[] states = new int[]{stateA,stateB,stateC,stateD,stateE,stateF};

    // http://en.wikipedia.org/wiki/Q-learning
    // http://people.revoledu.com/kardi/tutorial/ReinforcementLearning/Q-Learning.htm

    // Q(s,a)= Q(s,a) + alpha * (R(s,a) + gamma * Max(next state, all actions) - Q(s,a))

    int[][] R = new int[statesCount][statesCount]; // reward lookup
    double[][] Q = new double[statesCount][statesCount]; // Q learning

    int[] actionsFromA = new int[] { stateB, stateD };
    int[] actionsFromB = new int[] { stateA, stateC, stateE };
    int[] actionsFromC = new int[] { stateC };
    int[] actionsFromD = new int[] { stateA, stateE };
    int[] actionsFromE = new int[] { stateB, stateD, stateF };
    int[] actionsFromF = new int[] { stateC, stateE };
    int[][] actions = new int[][] { actionsFromA, actionsFromB, actionsFromC,
            actionsFromD, actionsFromE, actionsFromF };

    String[] stateNames = new String[] { "A", "B", "C", "D", "E", "F" };

    public Qlearning() {
        init();
    }

    public void init() {
        R[stateB][stateC] = 100; // from b to c
        R[stateF][stateC] = 100; // from f to c
    }

    public static void main(String[] args) {
        long BEGIN = System.currentTimeMillis();

        Qlearning obj = new Qlearning();

        obj.run();
        obj.printResult();
        obj.showPolicy();

        long END = System.currentTimeMillis();
        System.out.println("Time: " + (END - BEGIN) / 1000.0 + " sec.");
    }

    void run() {
        /*
         1. Set parameter , and environment reward matrix R
         2. Initialize matrix Q as zero matrix
         3. For each episode: Select random initial state
            Do while not reach goal state o
                Select one among all possible actions for the current state o
                Using this possible action, consider to go to the next state o
                Get maximum Q value of this next state based on all possible actions o
                Compute o Set the next state as the current state
         */

        // For each episode
        Random rand = new Random();
        for (int i = 0; i < 1000; i++) { // train episodes
            // Select random initial state
            int state = rand.nextInt(statesCount);
            while (state != stateC) // goal state
            {
                // Select one among all possible actions for the current state
                int[] actionsFromState = actions[state];

                // Selection strategy is random in this example
                int index = rand.nextInt(actionsFromState.length);
                int action = actionsFromState[index];

                // Action outcome is set to deterministic in this example
                // Transition probability is 1
                int nextState = action; // data structure

                // Using this possible action, consider to go to the next state
                double q = Q(state, action);
                double maxQ = maxQ(nextState);
                int r = R(state, action);

                double value = q + alpha * (r + gamma * maxQ - q);
                setQ(state, action, value);

                // Set the next state as the current state
                state = nextState;
            }
        }
    }

    double maxQ(int s) {
        int[] actionsFromState = actions[s];
        double maxValue = Double.MIN_VALUE;
        for (int i = 0; i < actionsFromState.length; i++) {
            int nextState = actionsFromState[i];
            double value = Q[s][nextState];

            if (value > maxValue)
                maxValue = value;
        }
        return maxValue;
    }

    // get policy from state
    int policy(int state) {
        int[] actionsFromState = actions[state];
        double maxValue = Double.MIN_VALUE;
        int policyGotoState = state; // default goto self if not found
        for (int i = 0; i < actionsFromState.length; i++) {
            int nextState = actionsFromState[i];
            double value = Q[state][nextState];

            if (value > maxValue) {
                maxValue = value;
                policyGotoState = nextState;
            }
        }
        return policyGotoState;
    }

    double Q(int s, int a) {
        return Q[s][a];
    }

    void setQ(int s, int a, double value) {
        Q[s][a] = value;
    }

    int R(int s, int a) {
        return R[s][a];
    }

    void printResult() {
        System.out.println("Print result");
        for (int i = 0; i < Q.length; i++) {
            System.out.print("out from " + stateNames[i] + ":  ");
            for (int j = 0; j < Q[i].length; j++) {
                System.out.print(df.format(Q[i][j]) + " ");
            }
            System.out.println();
        }
    }

    // policy is maxQ(states)
    void showPolicy() {
        System.out.println("\nshowPolicy");
        for (int i = 0; i < states.length; i++) {
            int from = states[i];
            int to =  policy(from);
            System.out.println("from "+stateNames[from]+" goto "+stateNames[to]);
        }
    }
}

回溯

C:\Python33\python.exe "C:/Users/Ajay/Documents/Python Scripts/RL/QLearning.py"
Traceback (most recent call last):
  File "C:/Users/Ajay/Documents/Python Scripts/RL/QLearning.py", line 4, in <module>
    class QLearning():
  File "C:/Users/Ajay/Documents/Python Scripts/RL/QLearning.py", line 19, in QLearning
    R = [[0 for x in range(states_count)] for x in range(states_count)]
  File "C:/Users/Ajay/Documents/Python Scripts/RL/QLearning.py", line 19, in <listcomp>
    R = [[0 for x in range(states_count)] for x in range(states_count)]
NameError: global name 'states_count' is not defined

1 个答案:

答案 0 :(得分:2)

要访问您定义的所有类属性(即class QLearningdef __init__之间的所有内容),您需要使用self或类名:

self.states_count

QLearning.states_count

我不知道算法,但这些类属性可能是实例属性(即对于每个类的实例是分开的,而不是在所有实例之间共享),因此在{{1}中定义(或其他实例方法)无论如何都使用__init__