# Q-LearningAgent

In [None]:
import random
import numpy as np

#package for defining abstract base classes 
from abc import ABC, abstractmethod

class AbstractQLearningAgent(ABC):
    """Base class for Q-learning agents that act epsilon-greedily.

    Subclasses provide the action-value estimate (``get_q_value``) and the
    learning rule (``update``); action selection is shared and implemented
    here on top of ``get_q_value``.

    Args:
        env: Environment exposing ``action_space.n`` (number of discrete
            actions).
        epsilon_start: Initial exploration probability.
        alpha: Learning rate.
        gamma: Discount factor.
    """

    def __init__(self, env, epsilon_start, alpha, gamma):
        self.env = env
        self.epsilon = epsilon_start  # exploration constant
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.actions = range(env.action_space.n)  # all action indices

    @abstractmethod
    def get_q_value(self, state, action):
        """Return the current estimate of Q(state, action)."""
        raise NotImplementedError("Not Implemented")

    def choose_action(self, state):
        """Select an action for ``state`` with an epsilon-greedy policy.

        With probability ``self.epsilon`` a uniformly random action is
        returned; otherwise an action with the highest Q-value is returned.

        Args:
            state: Current observation, passed unchanged to ``get_q_value``.

        Returns:
            One element of ``self.actions``.
        """
        if random.random() < self.epsilon:
            return random.choice(self.actions)

        q_values = [self.get_q_value(state, action) for action in self.actions]
        best_value = max(q_values)
        # Break ties at random: with equal initial estimates a plain argmax
        # would always pick the first action and never try the others.
        best_actions = [action for action, q_value in zip(self.actions, q_values)
                        if q_value == best_value]
        return random.choice(best_actions)

    @abstractmethod
    def update(self, state, action, reward, next_state):
        """Learn from the transition (state, action, reward, next_state)."""
        raise NotImplementedError("Not Implemented")

## Tabular method with discretized states

In [None]:
class QLearningAgentTabular(AbstractQLearningAgent):
    """Tabular Q-learning over a uniformly discretised observation space.

    Q-values are stored in a dict keyed by ``(discrete_state, action)``;
    entries that have never been updated are treated as 0.0.

    Args:
        env: Environment exposing ``action_space.n`` and
            ``observation_space.low`` / ``observation_space.high``.
        epsilon_start: Initial exploration probability.
        alpha: Learning rate.
        gamma: Discount factor.
        discretisations: Number of buckets per observation dimension.
    """

    def __init__(self, env, epsilon_start, alpha, gamma, discretisations=10):
        super().__init__(env, epsilon_start, alpha, gamma)
        self.q_table = {}
        self.discretisations = discretisations

    def _discretize(self, state):
        """Map a continuous ``state`` to a hashable tuple of bucket indices."""
        low = self.env.observation_space.low
        high = self.env.observation_space.high
        diff = (high - low) / self.discretisations  # bucket width per dimension
        discrete_state = (state - low) // diff
        return tuple(discrete_state.tolist())

    def get_q_value(self, state, action):
        """Return the stored Q-value for ``state``'s bucket and ``action``.

        Unseen (bucket, action) pairs default to 0.0.
        """
        discrete_state = self._discretize(state)
        return self.q_table.get((discrete_state, action), 0.0)

    def update(self, state, action, reward, next_state):
        """Apply one Q-learning update for the observed transition.

        Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))

        NOTE: no terminal flag is passed in, so the target always bootstraps
        from ``next_state``.
        """
        best_next_value = max(self.get_q_value(next_state, next_action)
                              for next_action in self.actions)
        td_target = reward + self.gamma * best_next_value

        key = (self._discretize(state), action)
        current_value = self.q_table.get(key, 0.0)
        self.q_table[key] = current_value + self.alpha * (td_target - current_value)

## Linear Approximation

In [None]:
class QLearningAgentApproximator(AbstractQLearningAgent):
    """Q-learning with a linear approximator over radial basis features.

    Q(s, a) is the dot product of a per-action weight vector with a feature
    vector of Gaussian bumps centred on a regular grid that spans the first
    two observation dimensions.

    Args:
        env: Environment exposing ``action_space.n`` and
            ``observation_space.low`` / ``observation_space.high``.
        epsilon: Initial exploration probability.
        alpha: Learning rate.
        gamma: Discount factor.
        basis_functions_per_dimension: Grid resolution per dimension; the
            feature vector has this number squared entries.
    """

    def __init__(self, env, epsilon, alpha, gamma, basis_functions_per_dimension=10):
        super().__init__(env, epsilon, alpha, gamma)

        low = env.observation_space.low
        high = env.observation_space.high

        # Basis-function centres: every grid point of a regular 2-D mesh,
        # flattened to one (x, y) row per centre.
        xx, yy = np.meshgrid(np.linspace(low[0], high[0], basis_functions_per_dimension),
                             np.linspace(low[1], high[1], basis_functions_per_dimension))
        self.radials = np.stack((xx, yy), axis=-1).reshape(-1, 2)
        # Inverse width per dimension, so the bumps shrink as the grid gets finer.
        self.sigma_inv = 1 / (high - low) * basis_functions_per_dimension

        # Small random initial weights, shape
        # (n_actions, basis_functions_per_dimension ** 2), e.g. (3, 100).
        self.weights = np.random.random((len(self.actions), basis_functions_per_dimension ** 2)) * 0.01

    def _feature_vector(self, state):
        """Return the activation of every basis function at ``state``."""
        r = self.sigma_inv * (self.radials - state)
        return np.exp(-0.5 * np.sum(r * r, axis=1))

    def get_q_value(self, state, action):
        """Return the linear estimate ``weights[action] . features(state)``."""
        return np.dot(self.weights[action], self._feature_vector(state))

    def update(self, state, action, reward, next_state):
        """Apply one semi-gradient Q-learning step to ``weights[action]``.

        The target ``r + gamma * max_a' Q(s', a')`` is treated as a constant,
        and the gradient of a linear Q with respect to the weights is the
        feature vector itself.

        NOTE: no terminal flag is passed in, so the target always bootstraps
        from ``next_state``.
        """
        features = self._feature_vector(state)
        next_q_values = np.dot(self.weights, self._feature_vector(next_state))
        td_target = reward + self.gamma * np.max(next_q_values)
        td_error = td_target - np.dot(self.weights[action], features)
        self.weights[action] += self.alpha * td_error * features

# Main

In [None]:

%matplotlib notebook

import gym
from mpl_toolkits.mplot3d import axes3d
from matplotlib import pyplot as plt
import numpy as np


def episode(env, agent, gamma, render=False):
    """Run one episode, letting the agent learn after every step.

    Args:
        env: Environment with ``reset()``, ``step(action)`` returning a
            4-tuple ``(next_state, reward, done, info)``, and ``render()``.
        agent: Object providing ``choose_action(state)`` and
            ``update(state, action, reward, next_state)``.
        gamma: Discount factor applied to the rewards.
        render: If true, ``env.render()`` is called after every step.

    Returns:
        The discounted return, i.e. the sum of ``reward * gamma ** t`` over
        the steps of the episode.
    """
    observation = env.reset()
    total = 0
    step_index = 0
    finished = False
    while not finished:
        chosen = agent.choose_action(observation)
        successor, reward, finished, _info = env.step(chosen)
        # Learn from the transition before moving on.
        agent.update(observation, chosen, reward, successor)
        if render:
            env.render()
        total += reward * (gamma ** step_index)
        step_index += 1
        observation = successor
    return total


def train(env, agent, gamma, nr_episodes, epsilon_start, fig, ax, fig2, ax2):
    """Train ``agent`` for ``nr_episodes`` episodes and plot its progress.

    Epsilon is decayed linearly by ``epsilon_start / nr_episodes`` per
    episode, with a floor of 0.1. About 20 times during training the agent
    is evaluated greedily (epsilon = 0) over 10 episodes and both plots are
    refreshed. The evaluation episodes go through ``episode`` as well, so the
    agent's ``update`` is still called during them.

    Args:
        env: Environment passed through to ``episode``.
        agent: Agent with a writable ``epsilon`` attribute.
        gamma: Discount factor used to compute the returns.
        nr_episodes: Number of training episodes.
        epsilon_start: Initial epsilon, used to size the decay step.
        fig, ax: Figure and axes for the per-episode training returns.
        fig2, ax2: Figure and axes for the averaged greedy test returns.
    """
    returns = []
    test_returns = []
    test_episodes = []  # training-episode index of each evaluation

    # nr_episodes // 20 is 0 for fewer than 20 episodes, which would make the
    # modulo below raise ZeroDivisionError.
    report_every = max(1, nr_episodes // 20)

    # One line per plot, updated in place, instead of stacking a new line on
    # the axes at every report.
    train_line, = ax.plot([], [])
    test_line, = ax2.plot([], [])

    for i in range(nr_episodes):
        agent.epsilon = max(0.1, agent.epsilon - epsilon_start / nr_episodes)
        episode_return = episode(env, agent, gamma)
        returns.append(episode_return)

        if i % report_every == 0:
            print("episode {:5d}, return {}, epsilon {:.2f}".format(i, episode_return, agent.epsilon))

            # Evaluate greedily, then restore the exploration rate even if an
            # evaluation episode raises.
            exploring_epsilon = agent.epsilon
            agent.epsilon = 0
            try:
                test_returns.append(sum(episode(env, agent, gamma) for _ in range(10)) / 10.)
            finally:
                agent.epsilon = exploring_epsilon
            test_episodes.append(i)

            train_line.set_data(list(range(len(returns))), returns)
            ax.relim()
            ax.autoscale_view()
            fig.canvas.draw()

            # Plot against the training-episode index so the x axis is in
            # episodes for both figures.
            test_line.set_data(test_episodes, test_returns)
            ax2.relim()
            ax2.autoscale_view()
            fig2.canvas.draw()
            

def plot_values(env, agent):
    """Draw a 3-D wireframe of the negated greedy state values.

    The first two observation dimensions are sampled on a 500 x 500 grid
    between ``env.observation_space.low`` and ``.high``; at every grid point
    the state value is the maximum of ``agent.get_q_value`` over all actions.

    Args:
        env: Environment providing the observation-space bounds.
        agent: Agent providing ``actions`` and ``get_q_value(state, action)``.
    """
    figure = plt.figure()
    axes = figure.add_subplot(111, projection='3d')

    obs_low = env.observation_space.low
    obs_high = env.observation_space.high
    X, Y = np.meshgrid(np.linspace(obs_low[0], obs_high[0], 500),
                       np.linspace(obs_low[1], obs_high[1], 500))

    # One (x, y) row per grid point, in the same row-major order as X and Y.
    grid_states = np.stack((X, Y), axis=-1).reshape(-1, 2)

    # State value = best action value at that state.
    state_values = np.array([
        max([agent.get_q_value(grid_state, action) for action in agent.actions])
        for grid_state in grid_states
    ])
    # The surface is the state value multiplied by -1.
    Z = -state_values.reshape(X.shape)

    axes.plot_wireframe(X, Y, Z, rstride=10, cstride=10)
    axes.set_xlabel('x')
    axes.set_ylabel('y')
    axes.set_zlabel("state value")
    plt.show()
  
    
# Environment and Q-learning hyperparameters.
env = gym.make('MountainCar-v0')
eps_start, alpha, gamma = 0.9, 0.01, 0.999
nr_episodes = 3000

# One figure for the per-episode training returns ...
fig, ax = plt.subplots()
ax.set_title("Training returns")
ax.set_xlabel('episode')
# ... and one for the greedy test returns.
fig2, ax2 = plt.subplots()
ax2.set_title("Test returns")
ax2.set_xlabel('episode')

# Swap the commented line in to train the tabular agent instead.
#agent = QLearningAgentTabular(env, eps_start, alpha, gamma)
agent = QLearningAgentApproximator(env, eps_start, alpha, gamma)

train(env, agent, gamma, nr_episodes, eps_start, fig, ax, fig2, ax2)

In [None]:
# Plot the learned state values (max over actions of the agent's Q-values,
# drawn negated) over the observation space.
plot_values(env, agent)

In [None]:
# Visualize the learned policy within the domain for 3 episodes.
agent.epsilon = 0  # act greedily: no exploration while demonstrating
for _ in range(3):
    # gamma is a required positional argument of episode(); leaving it out
    # raised a TypeError.
    episode(env, agent, gamma, render=True)