Q Learning
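The example below is a one-dimensional treasure hunt: the agent `o` walks along a short road and receives a reward of 1 when it reaches the goal `$` at the right end. The table is trained with the standard tabular Q-learning update,

$$Q(s,a) \leftarrow Q(s,a) + \alpha\left[r + \gamma \max_{a'} Q(s',a') - Q(s,a)\right]$$

which is what `update_Q` computes, with `best_Q(next_state)` supplying the $\max_{a'} Q(s',a')$ term.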

import numpy as np
import time
from IPython.display import clear_output


length = 4    # length of the road
epsilon = 0.9 # greedy rate: probability of exploiting the currently best action
a = [0, 1]    # actions: 0 = move left, 1 = move right
Q_a = np.zeros([length + 1, len(a)]) # Q table, one row per state
alpha = 0.1   # learning rate
gamma = 0.9   # discount factor
game_over = False # whether the current episode has ended


def print_environment(state):
    """Render the environment: 'o' is the agent, '$' is the goal."""
    s = '-' * state
    s += 'o'
    s += '-' * (length - state - 1)
    if state != length:
        s += '$'
    print(s)

def best_Q(state):
    """Q value of the best action in this state."""
    return Q_a[state].max()

def best_action(state):
    """Best action; if several actions tie for the maximum, pick one of them at random."""
    max_arr = np.argsort(Q_a[state])[::-1] # action indices sorted by Q value, descending
    max_index = np.random.randint(0, np.sum(Q_a[state] == Q_a[state][max_arr[0]])) # random tie-break
    return max_arr[max_index]

def update_Q(state, next_state, action, reward):
    """Update the Q table with the Q-learning rule (bootstrap from the best next action)."""
    Q_a[state][action] += alpha * (reward + gamma * best_Q(next_state) - Q_a[state][action])

def greedy(state):
    """Epsilon-greedy policy."""
    if np.random.rand() < epsilon:
        return best_action(state)
    else:
        return np.random.randint(0, Q_a[state].shape[0]) # random action

def update_state(now_state, action):
    """Step the environment and return the next state and the reward."""
    global game_over
    if action == 0:
        # move left; stay put at the left wall
        next_state = now_state - 1 if now_state != 0 else now_state
        reward = 0
    else:
        next_state = now_state + 1
        if now_state == length - 1:
            reward = 1       # reward for reaching the goal
            game_over = True # episode ends
        else:
            reward = 0
    print_environment(next_state)
    return next_state, reward

def Q_learning(start, rounds, max_times):
    """start is the initial state, rounds is the number of episodes,
    max_times is the maximum number of steps per episode."""
    global game_over
    for i in range(rounds):
        state = start
        print_environment(state)
        time.sleep(1)
        clear_output() # clear the output
        for j in range(max_times):
            if game_over:
                game_over = False
                print('Game Over')
                time.sleep(1)
                clear_output() # clear the output
                break
            action = greedy(state)
            next_state, reward = update_state(state, action)
            update_Q(state, next_state, action, reward)
            state = next_state
            time.sleep(1)
            clear_output() # clear the output


Q_learning(0, 5, 100)
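After training, a quick sanity check (not part of the original listing) is to print the learned table and the greedy action in each state:

print(Q_a)                 # learned Q values, one row per state
print(Q_a.argmax(axis=1))  # greedy action per state: 0 = left, 1 = right; the last row is the terminal state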

Sarsa
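Same environment, learned on-policy with Sarsa. The only change is the update target: instead of the best next action, it uses the Q value of the action that is actually taken next,

$$Q(s,a) \leftarrow Q(s,a) + \alpha\left[r + \gamma\, Q(s',a') - Q(s,a)\right]$$

so `update_Q` now also receives `next_action`.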

import numpy as np
import time
from IPython.display import clear_output


length = 5    # length of the road (Sarsa is rather conservative, so if you make the road too long you may be waiting a while)
epsilon = 0.9 # greedy rate: probability of exploiting the currently best action
a = [0, 1]    # actions: 0 = move left, 1 = move right
Q_a = np.zeros([length + 1, len(a)]) # Q table, one row per state
alpha = 0.1   # learning rate
gamma = 0.9   # discount factor
game_over = False # whether the current episode has ended


def print_environment(state):
    """Render the environment: 'o' is the agent, '$' is the goal."""
    s = '-' * state
    s += 'o'
    s += '-' * (length - state - 1)
    if state != length:
        s += '$'
    print(s)

def best_Q(state):
    """Q value of the best action in this state."""
    return Q_a[state].max()

def best_action(state):
    """Best action; if several actions tie for the maximum, pick one of them at random."""
    max_arr = np.argsort(Q_a[state])[::-1] # action indices sorted by Q value, descending
    max_index = np.random.randint(0, np.sum(Q_a[state] == Q_a[state][max_arr[0]])) # random tie-break
    return max_arr[max_index]

def update_Q(state, next_state, action, next_action, reward):
    """Update the Q table. The difference from Q-learning is that the target uses the Q of
    the action actually taken next, not the Q of the best next action."""
    Q_a[state][action] += alpha * (reward + gamma * Q_a[next_state][next_action] - Q_a[state][action])

def greedy(state):
    """Epsilon-greedy policy."""
    if np.random.rand() < epsilon:
        return best_action(state)
    else:
        return np.random.randint(0, Q_a[state].shape[0]) # random action

def update_state(now_state, action):
    """Step the environment and return the next state and the reward."""
    global game_over
    if action == 0:
        # move left; stay put at the left wall
        next_state = now_state - 1 if now_state != 0 else now_state
        reward = 0
    else:
        next_state = now_state + 1
        if now_state == length - 1:
            reward = 1       # reward for reaching the goal
            game_over = True # episode ends
        else:
            reward = 0
    print_environment(next_state)
    return next_state, reward

def Sarsa(start, rounds, max_times):
    """start is the initial state, rounds is the number of episodes,
    max_times is the maximum number of steps per episode."""
    global game_over
    for i in range(rounds):
        state = start
        print_environment(state)
        time.sleep(1)
        clear_output() # clear the output
        action = greedy(state)
        for j in range(max_times):
            if game_over:
                game_over = False
                print('Game Over')
                time.sleep(1)
                clear_output() # clear the output
                break
            next_state, reward = update_state(state, action)
            next_action = greedy(next_state) # the action that will actually be taken from next_state
            update_Q(state, next_state, action, next_action, reward)
            state = next_state
            action = next_action
            time.sleep(1)
            clear_output() # clear the output


Sarsa(0, 10, 100)

Sarsa Lambda
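Sarsa(λ) keeps an eligibility-trace table `E` of the same shape as the Q table. Each step computes one TD error and applies it to all state-action pairs in proportion to their traces, which then decay by $\gamma\lambda$:

$$\delta = r + \gamma\, Q(s',a') - Q(s,a), \qquad Q \leftarrow Q + \alpha\,\delta\,E, \qquad E \leftarrow \gamma\lambda\, E$$

In the code, `trace_decay` plays the role of $\lambda$.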

import numpy as np
import time
from IPython.display import clear_output


length = 5    # length of the road
epsilon = 0.9 # greedy rate: probability of exploiting the currently best action
a = [0, 1]    # actions: 0 = move left, 1 = move right
Q_a = np.zeros([length + 1, len(a)]) # Q table, one row per state
E = Q_a.copy() # eligibility traces: how strongly each state-action pair is still "remembered"
               # (0 = fully forgotten, 1 = fully remembered); each pair shares in the update accordingly
alpha = 0.1   # learning rate
gamma = 0.9   # discount factor
trace_decay = 0.9 # the lambda value
game_over = False # whether the current episode has ended


def print_environment(state):
    """Render the environment: 'o' is the agent, '$' is the goal."""
    s = '-' * state
    s += 'o'
    s += '-' * (length - state - 1)
    if state != length:
        s += '$'
    print(s)

def best_Q(state):
    """Q value of the best action in this state."""
    return Q_a[state].max()

def best_action(state):
    """Best action; if several actions tie for the maximum, pick one of them at random."""
    max_arr = np.argsort(Q_a[state])[::-1] # action indices sorted by Q value, descending
    max_index = np.random.randint(0, np.sum(Q_a[state] == Q_a[state][max_arr[0]])) # random tie-break
    return max_arr[max_index]

def update_Q(state, next_state, action, next_action, reward):
    """Update the Q table. The difference from Sarsa is that the trace table E spreads the
    update back over previously visited state-action pairs."""
    global E
    global Q_a
    # Two kinds of traces are possible; try both.
    # Accumulating traces:
    #     E[state][action] += 1
    # Replacing traces (used here):
    E[state, :] = 0
    E[state][action] = 1
    Q_a += alpha * E * (reward + gamma * Q_a[next_state][next_action] - Q_a[state][action])
    E *= trace_decay * gamma

def greedy(state):
    """Epsilon-greedy policy."""
    if np.random.rand() < epsilon:
        return best_action(state)
    else:
        return np.random.randint(0, Q_a[state].shape[0]) # random action

def update_state(now_state, action):
    """Step the environment and return the next state and the reward."""
    global game_over
    if action == 0:
        # move left; stay put at the left wall
        next_state = now_state - 1 if now_state != 0 else now_state
        reward = 0
    else:
        next_state = now_state + 1
        if now_state == length - 1:
            reward = 1       # reward for reaching the goal
            game_over = True # episode ends
        else:
            reward = 0
    print_environment(next_state)
    return next_state, reward

def Sarsa_Lambda(start, rounds, max_times):
    """start is the initial state, rounds is the number of episodes,
    max_times is the maximum number of steps per episode."""
    global game_over
    for i in range(rounds):
        state = start
        print_environment(state)
        time.sleep(1)
        clear_output() # clear the output
        action = greedy(state)
        for j in range(max_times):
            if game_over:
                game_over = False
                print('Game Over')
                time.sleep(1)
                clear_output() # clear the output
                E[:] = 0 # reset the eligibility traces for the next episode
                break
            next_state, reward = update_state(state, action)
            next_action = greedy(next_state) # the action that will actually be taken from next_state
            update_Q(state, next_state, action, next_action, reward)
            state = next_state
            action = next_action
            time.sleep(1)
            clear_output() # clear the output


Sarsa_Lambda(0, 10, 100)
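The comment inside `update_Q` mentions a second way to maintain the traces. A minimal sketch of that accumulating-trace variant (the listing above uses replacing traces; the function name here is only for illustration):

def update_Q_accumulating(state, next_state, action, next_action, reward):
    """Accumulating traces: increment the visited pair's trace instead of resetting it to 1."""
    global E, Q_a
    E[state][action] += 1 # accumulate the trace of the visited pair
    Q_a += alpha * E * (reward + gamma * Q_a[next_state][next_action] - Q_a[state][action])
    E *= trace_decay * gamma

Accumulating traces give repeatedly visited pairs a larger share of each update, which can speed up credit assignment but also makes the updates noisier.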