Assume that we have the following grid world:

The goal of the agent whose position is at the bottom left corner is to maximize the expected future reward. The top right (reward +1) and the middle right cell (reward -1) are terminal states. There are four different actions the agent can choose, namely go up, down, left and right. However, there is a chance that the agent will result in a wrong grid cell by accident. If the action is going up, there is a chance of 0.8 that the agent moves up, a chance of 0.1 that he moves left and a chance of 0.1 that he moves right. All other actions behave accordingly. If the agent would bounce on a wall it stays at the current position. Note that the agent cannot move through the grayed-out grid cell. Assume a gamma value of $\gamma=0.9$.
Assume that the following utility values are given:

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
class MDP:
    
    "0: up, 1: right, 2: left, 3: down"
    _transitions = np.array([
        [[4, 0.8], [-1, 0.1], [1, 0.1]],
        [[1, 0.8], [-4, 0.1], [4, 0.1]],
        [[-1, 0.8], [-4, 0.1], [4, 0.1]],
        [[-4, 0.8], [-1, 0.1], [1, 0.1]]
        ])
    
    def __init__(self, gamma=0.9):
        self.reward = np.zeros(3*4)
        self.reward[-1] = +1
        self.reward[5] = -1
        self.reward[7] = -1
        self.gamma = gamma
        
    def rewards(self, state):
        if state == 12:
            return 0
        return self.reward[state]
    
    def _bounces(self, state, s):
        return np.sum(
            np.abs(np.array(np.where(np.arange(12).reshape((3,4)) == state)) 
                   - np.array(np.where(np.arange(12).reshape((3,4)) == s)))
        ) > 1
    
    def transition(self, state, action):
        if state == 7 or state == 11 or state == 12:
            return np.array([[12, 1]])
        trans = MDP._transitions[action].copy()
        trans = trans.T
        trans[0,:] += state
        trans = trans.T
        for i, (s, prob) in enumerate(trans):
            if s < 0 or s >= 3*4 or s == 5 or self._bounces(state, s):
                trans[i,0] = state
        return trans
def print_utilities(utilities):
    plt.figure()
    reshaped = utilities[0:-1].reshape((3,4))
    reshaped[1,1] = -1
    ax = sns.heatmap(reshaped, annot=True)
    ax.invert_yaxis()
mdp = MDP()
utilities = np.array([0.5,0.4,0.3,0.,0.5,-1.,0.5,-1.,0.6,0.7,0.8,1,0])
print_utilities(utilities)

Implement a policy improvement step in order to compute the policy that is defined by the given utility values.
def policy_improvement(mdp, utilities):
    # TODO implement
    raise Exception("Not yet implemented!")
# Prints the computed policy
policy = policy_improvement(mdp, utilities).astype(int)
print_utilities(policy)

Implement a policy evaluation step in order evaluate the utility values of the improved policy.
def policy_evaluation(mdp, policy):
    # TODO implement
    raise Exception("Not yet implemented!")
# Prints the starting utilities and the utilities of the improved policy
print_utilities(utilities)
print_utilities(policy_evaluation(mdp, policy))


Implement the policy iteration algorithm by using the policy improvement and evaluation function defined above.
def policy_iteration(mdp):
    # TODO implement
    raise Exception("Not yet implemented!")
# prints the optimal policy and the optimal utility values
policy = policy_iteration(mdp)
print_utilities(policy)
print_utilities(policy_evaluation(mdp, policy))


Implement the value iteration algorithm.
def value_iteration(mdp, utilities):
    # TODO implement
    raise Exception("Not yet implemented!")
# prints the optimal utility from Value Iteration
optimal = value_iteration(mdp, utilities)
print_utilities(optimal)
