In [1]:
import numpy as np
In [2]:
class Env:
    """Windy-gridworld environment (Sutton & Barto, Example 6.5) with
    stochastic wind: each column's wind strength pushes the agent toward
    row 0, plus uniform noise in {-1, 0, 1} on every step.
    """

    def __init__(self, r, c, begin, end, wind):
        # r, c: grid dimensions (rows, columns).
        self.r = r
        self.c = c
        # wind[j]: upward push (row decrease) applied when leaving column j.
        self.wind = wind.copy()
        self.end = end.copy()      # goal cell [row, col]
        self.begin = begin.copy()  # start cell [row, col]
        self.pos = begin.copy()    # current agent position [row, col]

    def move(self, act):
        """Apply action `act` (numpy array [drow, dcol]) plus wind and noise.

        Wind is read from the column the agent departs (self.pos[1] is
        evaluated before the in-place update takes effect), and a random
        row offset in {-1, 0, 1} is always added.

        Returns 0 and resets to `begin` when the goal is reached exactly,
        otherwise clips the position to the grid and returns -1.
        """
        self.pos += act + [-self.wind[self.pos[1]] + np.random.randint(-1, 2), 0]
        # BUG FIX: compare against self.end, not the module-level global `end`.
        if (self.pos == self.end).all():
            self.pos = self.begin.copy()
            return 0
        # Not at the goal: clip the position back into the grid.
        if self.pos[0] >= self.r:
            self.pos[0] = self.r - 1
        elif self.pos[0] < 0:
            self.pos[0] = 0
        if self.pos[1] >= self.c:
            self.pos[1] = self.c - 1
        elif self.pos[1] < 0:
            self.pos[1] = 0
        return -1
In [3]:
class Act:
    """Tabular SARSA agent over an r x c grid with King's-move actions
    (row/col deltas in {-1, 0, 1}).
    """

    def __init__(self, r, c):
        # Q[row, col, drow, dcol]: action values. A delta of -1 is stored at
        # index -1 (i.e. position 2), so negative deltas index Q directly.
        self.Q = np.zeros([r, c, 3, 3])
        # All 9 (drow, dcol) combinations; kept for external use.
        self.actSpace = np.array(np.meshgrid(np.arange(-1, 2), np.arange(-1, 2))).T.reshape(-1, 2)

    def act(self, pos, epsilon=0):
        """Return the greedy action [drow, dcol] for `pos`.

        With probability `epsilon` (default 0: original pure-greedy
        behaviour) a uniformly random action is returned instead.
        """
        if epsilon > 0 and np.random.rand() < epsilon:
            return np.random.randint(-1, 2, size=2)
        res = np.array(np.unravel_index(np.argmax(self.Q[tuple(pos)]), self.Q[tuple(pos)].shape))
        res[res == 2] = -1  # index 2 encodes a delta of -1
        return res

    def train(self, env, alpha=0.1, gamma=1, times=10000, deadline=10000, epsilon=0):
        """Run SARSA for `times` episodes of at most `deadline` steps each.

        env      -- environment exposing `pos` and `move(act)`; `move` returns
                    0 on reaching the goal (and resets itself), -1 otherwise.
        alpha    -- learning rate.
        gamma    -- discount factor.
        epsilon  -- exploration rate forwarded to `act` (default 0 preserves
                    the original greedy policy).
        """
        prePos = env.pos.copy()
        preAct = self.act(tuple(prePos), epsilon).copy()
        for _ in range(times):
            for _ in range(deadline):
                reward = env.move(preAct)
                folPos = env.pos.copy()
                # BUG FIX: the follow-up action must be selected at the NEW
                # position (folPos), not the old one (prePos).
                folAct = self.act(tuple(folPos), epsilon).copy()
                # BUG FIX: reward 0 marks the terminal transition (env has
                # already reset); the terminal target is the reward alone —
                # do not bootstrap from the next episode's start state.
                if reward == 0:
                    target = reward
                else:
                    target = reward + gamma * self.Q[tuple(folPos)][tuple(folAct)]
                self.Q[tuple(prePos)][tuple(preAct)] += alpha * (target - self.Q[tuple(prePos)][tuple(preAct)])
                prePos = folPos.copy()
                preAct = folAct.copy()
                if reward == 0:
                    break
In [4]:
# Windy-gridworld configuration (Sutton & Barto, Example 6.5).
r, c = 7, 10                                     # grid: 7 rows x 10 columns
wind = np.array([0, 0, 0, 1, 1, 1, 2, 2, 1, 0])  # per-column wind strength
begin = np.array([3, 0])                         # start cell [row, col]
end = np.array([3, 7])                           # goal cell [row, col]
In [5]:
# Build the environment and the SARSA agent for the grid configured above.
env = Env(r, c, begin, end, wind)
act = Act(r, c)
In [6]:
act.train(env, times=100)





