breakout_test_keras.py

import gym
import numpy as np
import tensorflow as tf
import random
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.models import load_model
from keras.layers import *
from keras.optimizers import *

# Uncomment line below to play the game as a human
# from gym.utils import play
# play.play(env, zoom=3)

# Agent and memory constants
PROBLEM = 'BreakoutDeterministic-v4'
FRAME_SKIP = 4
MEMORY_BATCH_SIZE = 32
REPLAY_START_SIZE = 50000
REPLAY_MEMORY_SIZE = 10000  # RMSProp train updates sampled from this number of recent frames
NUMBER_OF_EPISODES = 40  # TODO: save and restore model with infinite episodes
EXPLORATION_RATE = 1
MIN_EXPLORATION_RATE = 0.1
MAX_FRAMES_DECAYED = REPLAY_MEMORY_SIZE / FRAME_SKIP  # TODO: correct? 1 million in paper

# CNN Constants
IMAGE_INPUT_HEIGHT, IMAGE_INPUT_WIDTH, IMAGE_INPUT_CHANNELS = 84, 84, 1
CONV1_NUM_FILTERS, CONV1_FILTER_SIZE, CONV1_FILTER_STRIDES = 32, 8, 4
CONV2_NUM_FILTERS, CONV2_FILTER_SIZE, CONV2_FILTER_STRIDES = 64, 4, 2
CONV3_NUM_FILTERS, CONV3_FILTER_SIZE, CONV3_FILTER_STRIDES = 64, 3, 1
DENSE_NUM_UNITS, OUTPUT_NUM_UNITS = 512, 4  # TODO: GET Action count from constructor
LEARNING_RATE, GRADIENT_MOMENTUM, MIN_SQUARED_GRADIENT = 0.00025, 0.95, 0.01
HUBER_LOSS_DELTA, DISCOUNT_FACTOR = 2.0, 0.99  # TODO: is value 1 or 2 in paper for Huber?
RANDOM_WEIGHT_INITIALIZER = tf.initializers.RandomNormal()
HIDDEN_ACTIVATION, OUTPUT_ACTIVATION, PADDING = 'relu', 'linear', "SAME"  # TODO: remove?
LEAKY_RELU_ALPHA, DROPOUT_RATE = 0.2, 0.5  # TODO: remove or use to improve paper
optimizer = tf.optimizers.RMSprop(learning_rate=LEARNING_RATE, rho=GRADIENT_MOMENTUM, epsilon=MIN_SQUARED_GRADIENT)


class FramePreprocessor:
    """
    FramePreprocessor re-sizes, normalizes and converts RGB atari frames to gray scale frames.
    """

    def __init__(self, state_space):
        self.state_space = state_space

    def convert_rgb_to_grayscale(self, tf_frame):
        return tf.image.rgb_to_grayscale(tf_frame)
    
    def resize_frame(self, tf_frame, frame_height, frame_width):
        return tf.image.resize(tf_frame, [frame_height,frame_width], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)

    def normalize_frame(self, tf_frame):
        return tf_frame / 255

    def plot_frame_from_greyscale_values(self, image):
        height, width, _ = image.shape 
        grey_image = np.array([[(image[i, j].numpy()[0], image[i, j].numpy()[0], image[i, j].numpy()[0]) 
                                for i in range(height)]
                                for j in range(width)])
        grey_image = np.transpose(grey_image, (1, 0, 2))  # Switch height and width
        plt.imshow(grey_image) 
        plt.show()

    def preprocess_frame(self, frame):
        tf_frame = tf.Variable(frame, shape=self.state_space, dtype=tf.float32)  # TODO: uint8 does not work
        image = self.convert_rgb_to_grayscale(tf_frame)
        image = self.resize_frame(image, IMAGE_INPUT_HEIGHT, IMAGE_INPUT_WIDTH)
        image = self.normalize_frame(image)

        return image


class Memory:
    """
    Memory class holds a list of game plays stored as experiences (s,a,r,s')
    """
    samples = []
    
    def __init__(self, capacity):  # Initialize memory with given capacity
        self.capacity = capacity
    
    def add(self, sample):  # Add a sample to the memory, removing the earliest entry if memeory capacity is reached
        self.samples.append(sample)
        
        if len(self.samples) > self.capacity:
            self.samples.pop(0)
         
    def get_samples(self, sample_size):  # Return n samples from the memory
        sample_size = min(sample_size, len(self.samples))
  
      
        return random.sample(self.samples, sample_size)

    def get_size(self):
        return len(self.samples)

def huber_loss(y_true, y_pred):
    err = y_true - y_pred

    cond = K.abs(err) < HUBER_LOSS_DELTA
    L2 = 0.5 * K.square(err)
    L1 = HUBER_LOSS_DELTA * (K.abs(err) - 0.5 * HUBER_LOSS_DELTA)

    loss = tf.where(cond, L2, L1)   # Keras does not cover where function in tensorflow :-(

    return K.mean(loss)

class ConvolutionalNeuralNetwork:
    """
    CNN CLASS
    Architecture of DQN has 4 hidden layers:

    Input:  84 X 84 X 1 image (4 in paper due to frame skipping) (PREPROCESSED image), Game-score, Life count, Actions_count (4)
    1st Hidden layer: Convolves 32 filters of 8 X 8 with stride 4 (relu)
    2nd hidden layer: Convolves 64 filters of 4 X 4 with stride 2 (relu)
    3rd hidden layer: Convolves 64 filters of 3 X 3 with stride 1 (Relu)
    4th hidden layer: Fully connected, (512 relu units)
    Output: Fully connected linear layer, Separate output unit for each action, outputs are predicted Q-values
    """

    weights = {
        # Conv Layer 1: 8x8 conv, 1 input (preprocessed image has 1 color channel), 32 output filters
        'conv1_weights': tf.Variable(RANDOM_WEIGHT_INITIALIZER([CONV1_FILTER_SIZE,  # Filter width
                                                                CONV1_FILTER_SIZE,  # Filter height
                                                                IMAGE_INPUT_CHANNELS,  # In Channel
                                                                CONV1_NUM_FILTERS])),  # Out Channel
        # Conv Layer 2: 4x4 conv, 32 input filters, 64 output filters
        'conv2_weights': tf.Variable(RANDOM_WEIGHT_INITIALIZER([CONV2_FILTER_SIZE,
                                                                CONV2_FILTER_SIZE,
                                                                CONV1_NUM_FILTERS,
                                                                CONV2_NUM_FILTERS])),
        # Conv Layer 3: 3x3 conv, 64 input filters, 64 output filters
        'conv3_weights': tf.Variable(RANDOM_WEIGHT_INITIALIZER([CONV3_FILTER_SIZE,
                                                                CONV3_FILTER_SIZE,
                                                                CONV2_NUM_FILTERS,
                                                                CONV3_NUM_FILTERS])),
        # Fully Connected (Dense) Layer: 3x3x64 inputs (64 filters of size 3x3), 512 output units
        'dense_weights': tf.Variable(RANDOM_WEIGHT_INITIALIZER([CONV3_FILTER_SIZE * CONV3_FILTER_SIZE * CONV3_NUM_FILTERS, DENSE_NUM_UNITS])),

        # Output layer: 512 input units, 4 output units (actions)
        'output_weights': tf.Variable(RANDOM_WEIGHT_INITIALIZER([DENSE_NUM_UNITS, OUTPUT_NUM_UNITS]))
    }

    biases = {
        'conv1_biases': tf.Variable(tf.zeros([CONV1_NUM_FILTERS])),  # 32
        'conv2_biases': tf.Variable(tf.zeros([CONV2_NUM_FILTERS])),  # 64
        'conv3_biases': tf.Variable(tf.zeros([CONV3_NUM_FILTERS])),  # 64
        'dense_biases': tf.Variable(tf.zeros([DENSE_NUM_UNITS])),  # 512
        'output_biases': tf.Variable(tf.zeros([OUTPUT_NUM_UNITS]))  # 4
    }

    def __init__(self, number_of_states, number_of_actions,model=None):
        self.number_of_states = number_of_states
        self.number_of_actions = number_of_actions
        if model is None:
            self.model = self._createModel()
        else:
            self.model = load_model(model)
    
    
    def huber_error_loss(self, y_predictions, y_true):
        errors = y_true - y_predictions

        condition = tf.abs(errors) < HUBER_LOSS_DELTA  # 2.0

        l2_squared_loss = 0.5 * tf.square(errors)
        l1_absolute_loss = HUBER_LOSS_DELTA * (tf.abs(errors) - 0.5 * HUBER_LOSS_DELTA)

        loss = tf.where(condition, l2_squared_loss, l1_absolute_loss)

        return np.mean(loss)

    def _createModel(self):
        model = Sequential()   
        model.add(Conv2D(32, (8, 8), strides=(4,4), activation='relu', input_shape=(self.number_of_states)))
        model.add(Conv2D(64, (4, 4), strides=(2,2), activation='relu'))
        model.add(Conv2D(64, (3, 3), activation='relu'))
        model.add(Flatten())
        model.add(Dense(units=512, activation='relu'))

        model.add(Dense(units=number_of_actions, activation='linear'))

        opt = RMSprop(lr=LEARNING_RATE)
        model.compile(loss=huber_loss, optimizer=opt)

        return model
    
    def train(self, x, y, epochs=1, verbose=0):
        self.model.fit(x, y, batch_size=32, epochs=epochs, verbose=verbose)
    
    def predict(self, s, target=False):
        if target:
            pass
            #return self.model_.predict(s)
        else:
            return self.model.predict(s)

    def predictOne(self, s, target=False):
        return self.predict(s.reshape(1, 210, 160, 3), target).flatten()

    @tf.function
    def convolutional_2d_layer(self, inputs, filter_weights, biases, strides=1):
        output = tf.nn.conv2d(inputs, filter_weights, strides, padding=PADDING)  # TODO: padding in paper?
        output_with_bias = tf.nn.bias_add(output, biases)
        activation = tf.nn.relu(output_with_bias)  # non-linearity TODO: improve paper with leaky relu?
        return activation

    # TODO: consider removing since not used
    @tf.function
    def maxpool_layer(self, inputs, pools_dim, strides_dim):
        return tf.nn.max_pool2d(inputs, pools_dim, strides_dim, padding=PADDING)

    @tf.function
    def flatten_layer(self, layer, weights_name='dense_weights'):  # output shape: [-1, 3*3*64]
        dimensions = self.weights[weights_name].get_shape().as_list()[0]
        flattened_layer = tf.reshape(layer, shape=(-1, dimensions))  # -1 flattens into 1-D
        return flattened_layer

    @tf.function
    def dense_layer(self, inputs, weights, biases):
        output = tf.nn.bias_add(tf.matmul(inputs, weights), biases)
        dense_activation = tf.nn.leaky_relu(output, LEAKY_RELU_ALPHA)  # non-linearity
        dropout = tf.nn.dropout(dense_activation, rate=DROPOUT_RATE)  # TODO: does paper dropout?
        return dropout

    @tf.function
    def output_layer(self, input, weights, biases):
        linear_output = tf.nn.bias_add(tf.matmul(input, weights), biases)
        return linear_output

    
    @tf.function
    def train_(self, inputs, outputs):  # Optimization
        # Wrap computation inside a GradientTape for automatic differentiation
        with tf.GradientTape() as tape:
            predictions = self.predict(inputs)
            current_loss = self.huber_error_loss(predictions, outputs)

        # Trainable variables to update
        trainable_variables = list(self.weights.values()) + list(self.biases.values())

        gradients = tape.gradient(current_loss, trainable_variables)

        # Update weights and biases following gradients
        optimizer.apply_gradients(zip(gradients, trainable_variables))

        # tf.print(tf.reduce_mean(current_loss))

    @tf.function
    def predict_(self, inputs):

        # Input shape: [1, 84, 84, 1]. A batch of 84x84x1 (gray scale) images.
        inputs = tf.reshape(tf.cast(inputs, dtype=tf.float32), shape=[-1, IMAGE_INPUT_HEIGHT, IMAGE_INPUT_WIDTH, IMAGE_INPUT_CHANNELS])

        # Convolution Layer 1 with output shape [-1, 84, 84, 32]
        conv1 = self.convolutional_2d_layer(inputs, self.weights['conv1_weights'], self.biases['conv1_biases'])

        # Convolutional Layer 2 with output shape [-1, 84, 84, 64]
        conv2 = self.convolutional_2d_layer(conv1, self.weights['conv2_weights'], self.biases['conv2_biases'])

        # Flatten output of 2nd conv. layer to fit dense layer input, output shape [-1, 3x3x64]
        flattened_layer = self.flatten_layer(layer=conv2, weights_name='dense_weights')

        # Dense fully connected layer with output shape [-1, 512]
        dense_layer = self.dense_layer(flattened_layer, self.weights['dense_weights'], biases=self.biases['dense_biases'])

        # Fully connected output of shape [-1, 4]
        output_layer = self.output_layer(dense_layer, self.weights['output_weights'], biases=self.biases['output_biases'])

        return output_layer


class Agent:
    """
    Agent takes actions and saves them to its memory, which is initialized with a given capacity
    """
    steps = 0
    exploration_rate = EXPLORATION_RATE
     
    def decay_exploration_rate(self):
        decay_rate = (self.exploration_rate - MIN_EXPLORATION_RATE) / MAX_FRAMES_DECAYED
        return decay_rate

    # Initialize agent with a given memory capacity, and a state, and action space
    def __init__(self, number_of_states, number_of_actions, model=None):
        self.replay_memory_buffer = Memory(REPLAY_MEMORY_SIZE)
        if model is None:
            self.model = ConvolutionalNeuralNetwork(number_of_states, number_of_actions)  # TODO parameters
        else:
            self.model = ConvolutionalNeuralNetwork(number_of_states, number_of_actions,model)
        self.number_of_states = number_of_states
        self.number_of_actions = number_of_actions
        self.decay_rate = self.decay_exploration_rate()

    # The behaviour policy during training was e-greedy with e annealed linearly
    # from 1.0 to 0.1 over the first million frames, and fixed at 0.1 thereafter
    def e_greedy_policy(self, state):
        exploration_rate_threshold = random.uniform(0, 1)

        if exploration_rate_threshold > self.exploration_rate:

            next_q_values = self.model.predictOne(state)  # TODO: return only 4 actions not 784 x 4 !!!
            best_action = np.argmax(next_q_values)
        else:
            best_action = self.random_policy() 

        return best_action
    
    def random_policy(self): 
        return random.randint(0, self.number_of_actions-1)

    def choose_action(self, state):
        if self.replay_memory_buffer.get_size() <= REPLAY_START_SIZE:
            return self.random_policy()
        else:  
            return self.e_greedy_policy(state)

    def observe(self, sample):
        self.replay_memory_buffer.add(sample)
        self.steps += 1
        self.exploration_rate = (MIN_EXPLORATION_RATE
                                 if self.exploration_rate <= MIN_EXPLORATION_RATE
                                 else self.exploration_rate - self.decay_rate)

    def experience_replay(self):
        memory_batch = self.replay_memory_buffer.get_samples(MEMORY_BATCH_SIZE)
      
        batch_len = len(memory_batch)

        #make 0 array with size of statespace
        no_state = np.zeros(self.number_of_states)
        #make array of current states in batch
        states = np.array([experience[0] for experience in memory_batch])
        #make array of next states in batch. If next state is None then use 0 array
        next_states = np.array([no_state if experience[3] is None else experience[3] for experience in memory_batch])

        #predict current states
        predictions = self.model.predict(states)
        #predict next_states
        next_predictions = self.model.predict(next_states)
      
        c, w, h = 3, 160, 210
        input = np.zeros((batch_len, h, w, c))
        output = np.zeros((batch_len, self.number_of_actions))

        for i in range(batch_len):
            experience = memory_batch[i]
            
            state = experience[0]; action = experience[1]; reward = experience[2]; next_state = experience[3]

            #the 4 action values predicted in given experience: ex. [0.2, 0.1, -3, 4.2]
            target = predictions[i] 
            #Only set the value of the target for the action it actually performed
            if next_state is None:
                target[action] = reward #if the action led to terminal state, set value to the reward
            else:
                target[action] = reward + DISCOUNT_FACTOR*np.amax(next_predictions[i]) #else use Bellman picking the max_action of the next_prediction
            
            input[i] = state
            output[i] = target
        self.model.train(input,output)

    
        # for (state, action, reward, next_state, is_done) in memory_batch:
        #     if next_state is not None:  # TODO: should we replace next_state by value?
        #         target = self.model.predict(next_state)  # TODO: q network
        #         self.model.train(next_state, outputs=target)  # TODO: initial state not preprocessed

    def get_replay_memory(self):
        return self.replay_memory_buffer


REPLAY_START_SIZE = 2000  # TODO: remove normally


class Environment:
    """
    Creates a game environment which an agent can play using certain actions.
    Run takes an agent as argument that plays the game, until the agent 'dies' (no more lives)
    """
   
    def __init__(self, problem):
        self.gym = gym.make(problem)
        self.state_space = self.gym.observation_space.shape
        self.frame_preprocessor = FramePreprocessor(self.state_space)
        self.best_reward = 0

    # Clip positive rewards to 1 and negative rewards to -1
    def clip_reward(self, reward):
        return np.sign(reward)

    def runSavedModel(self, agent):
        state = self.gym.reset()
        #state = self.frame_preprocessor.preprocess_frame(self.gym.reset())
        total_reward = 0
        step = 0
        
        while True:
            self.gym.render()
            next_q_values = agent.model.predict_one(state)  # TODO: return only 4 actions not 784 x 4 !!!
            print(next_q_values)
            action = np.argmax(next_q_values)
           
            next_state, reward, is_done, _ = self.gym.step(action)
            
            if is_done:
                print(f"Finished game after {step} steps")
                # self.gym.render()
                next_state = None
    
            state = next_state
            total_reward += reward
            step += 1

            if is_done:
                break
        
        if total_reward > self.best_reward:
            self.best_reward = total_reward
        self.gym.close()
        print(f"Total reward: {total_reward} memory: {agent.replay_memory_buffer.get_size()} \n")


    def run(self, agent):
        state = self.gym.reset()
        #state = self.frame_preprocessor.preprocess_frame(self.gym.reset())
        total_reward = 0
        step = 0
        render = False
        train = False

        while True:
            if render is True:
                self.gym.render()
            action = agent.choose_action(state)
            next_state, reward, is_done, _ = self.gym.step(action)
            #next_state = self.frame_preprocessor.preprocess_frame(next_state)
            # self.frame_preprocessor.plot_frame_from_greyscale_values(next_state)
            #reward = self.clip_reward(reward)

            if is_done:
                print(f"Finished game after {step} steps")
                # self.gym.render()
                next_state = None
            
            experience = (state, action, reward, next_state, is_done)
            agent.observe(experience)
            
            if agent.get_replay_memory().get_size() > REPLAY_START_SIZE:
                train = True

            if step > 10:
                train = False
               # render = True

            if train is True:  # Learn after 50.000 random actions in memory
                render = False
                agent.experience_replay()

            # if agent.get_replay_memory().get_size() > REPLAY_START_SIZE:
            #     agent.experience_replay()

            state = next_state
            total_reward += reward
            step += 1

            if is_done:
                break
        
        if total_reward > self.best_reward:
            self.best_reward = total_reward
        self.gym.close()
        print(f"Total reward: {total_reward} memory: {agent.replay_memory_buffer.get_size()} exploration rate: {agent.exploration_rate} \n")

HAS_MODEL = False
environment = Environment(PROBLEM)
number_of_states = environment.gym.observation_space.shape
number_of_actions = environment.gym.action_space.n
if HAS_MODEL is True:
    dqn_agent = Agent(number_of_states, number_of_actions,model="Breakout-4.h5")
    for ep in range(NUMBER_OF_EPISODES):
        print(f"Episode: {ep+1}")
        environment.runSavedModel(dqn_agent)

else:
    dqn_agent = Agent(number_of_states, number_of_actions)

    for ep in range(NUMBER_OF_EPISODES):
        print(f"Episode: {ep+1}")
        environment.run(dqn_agent)
        dqn_agent.model.model.save("models\Breakout-4.h5")

    print(environment.best_reward)