Summary of Reinforcement Learning (3) -- The Thinking and Decision-Making of Q-Learning

1, Decision-making framework

import numpy as np
import pandas as pd

class QLearningTable:
    # Initialization
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        ...

    # Choose an action for the given state
    def choose_action(self, observation):
        ...

    # Learn and update the parameters (the Q table)
    def learn(self, s, a, r, s_):
        ...

    # Check whether a state exists in the Q table
    def check_state_exist(self, state):
        ...
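
Before filling in the methods, here is a minimal sketch of how the class is typically driven. The environment object env with reset() and step(action) is a hypothetical stand-in, not part of this post:

RL = QLearningTable(actions=list(range(4)))   # four actions, chosen arbitrarily for illustration

for episode in range(100):
    observation = env.reset()                        # hypothetical env: start a new episode
    while True:
        action = RL.choose_action(str(observation))  # choose an action from the Q table
        observation_, reward, done = env.step(action)  # act and observe the outcome
        # learn() expects the string 'terminal' when the episode ends (see below)
        RL.learn(str(observation), action, reward,
                 str(observation_) if not done else 'terminal')
        observation = observation_
        if done:
            break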

2, Function implementation

1. Initialization

  • actions: the list of all available actions
  • epsilon: greedy rate (the probability of exploiting the best-known action)
  • lr: learning rate α
  • gamma: reward decay γ (the discount factor)
  • q_table: the Q table, with one column per action and one row per visited state
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions      # a list of actions
        self.lr = learning_rate     # learning rate
        self.gamma = reward_decay   # reward decay (discount factor)
        self.epsilon = e_greedy     # greediness
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)  # initialize an empty Q table
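
A small usage example (the four-action list is arbitrary, for illustration only): the table starts out empty, and rows are added lazily as states are visited.

RL = QLearningTable(actions=list(range(4)))
print(RL.q_table)   # empty DataFrame with columns 0, 1, 2, 3 and no rows yet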

2. Choose action

  • if: with probability epsilon, select the action with the highest Q value (ties between equal Q values are broken randomly, so the choice is not biased by column order)
  • else: select an action at random
    def choose_action(self, observation):
        self.check_state_exist(observation)  # make sure the state has a row in the Q table

        # Select an action
        if np.random.uniform() < self.epsilon:  # exploit: pick the action with the highest Q value
            state_action = self.q_table.loc[observation, :]
            # Several actions may share the maximal Q value, so break ties randomly
            action = np.random.choice(state_action[state_action == np.max(state_action)].index)
        else:   # explore: pick a random action
            action = np.random.choice(self.actions)

        return action
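
A quick check of the tie-breaking line, using a hand-built row (the action names and values are made up for illustration):

state_action = pd.Series([0.0, 0.5, 0.5], index=['left', 'right', 'up'])
ties = state_action[state_action == np.max(state_action)].index
print(list(ties))              # ['right', 'up']: both maxima are kept
print(np.random.choice(ties))  # picks one of the tied actions uniformly at random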

3. Learn and update the parameters (update the Q table)

    def learn(self, s, a, r, s_):
        self.check_state_exist(s_)  # make sure s_ has a row in the Q table

        q_predict = self.q_table.loc[s, a]  # current Q estimate for (s, a)
        if s_ != 'terminal':
            # Next state is not terminal: bootstrap from its best action value
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r  # next state is terminal: the target is just the reward

        # Update the Q table: move the state-action value toward the target
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)
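
This is the standard Q-learning update Q(s, a) <- Q(s, a) + α · (r + γ · max Q(s', a') - Q(s, a)). With illustrative numbers α = 0.01, γ = 0.9, Q(s, a) = 0.2, r = 1 and max Q(s', a') = 0.5, the target is q_target = 1 + 0.9 · 0.5 = 1.45, and the entry becomes 0.2 + 0.01 · (1.45 - 0.2) = 0.2125.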

4. Check whether the current state exists in the Q table

If the current state is not in the table yet, we insert a row of zeros as the initial value of every action for this state.

    def check_state_exist(self, state):
        if state not in self.q_table.index:
            # Append a new all-zero row for this state.
            # DataFrame.append was removed in pandas 2.0, so pd.concat is used here instead.
            new_row = pd.Series(
                [0.0] * len(self.actions),
                index=self.q_table.columns,
                name=state,
            )
            self.q_table = pd.concat([self.q_table, new_row.to_frame().T])
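
A quick sanity check of the lazy row creation (the state name 's0' is arbitrary):

RL = QLearningTable(actions=[0, 1])
RL.check_state_exist('s0')
RL.check_state_exist('s0')   # second call is a no-op: the row already exists
print(RL.q_table)            # a single all-zero row indexed by 's0'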

 
