In [1]:
import numpy as np
In [2]:
class Env:
    """Windy-gridworld environment (Sutton & Barto, Example 6.5) with
    stochastic wind: each column's wind strength pushes the agent toward
    row 0, plus uniform noise in {-1, 0, 1} on every step.
    """

    def __init__(self, r, c, begin, end, wind):
        # r, c: grid dimensions (rows, columns).
        self.r = r
        self.c = c
        # wind[j]: upward push (row decrease) applied when leaving column j.
        self.wind = wind.copy()
        self.end = end.copy()      # goal cell [row, col]
        self.begin = begin.copy()  # start cell [row, col]
        self.pos = begin.copy()    # current agent position [row, col]

    def move(self, act):
        """Apply action `act` (numpy array [drow, dcol]) plus wind and noise.

        Wind is read from the column the agent departs (self.pos[1] is
        evaluated before the in-place update takes effect), and a random
        row offset in {-1, 0, 1} is always added.

        Returns 0 and resets to `begin` when the goal is reached exactly,
        otherwise clips the position to the grid and returns -1.
        """
        self.pos += act + [-self.wind[self.pos[1]] + np.random.randint(-1, 2), 0]
        # BUG FIX: compare against self.end, not the module-level global `end`.
        if (self.pos == self.end).all():
            self.pos = self.begin.copy()
            return 0
        # Not at the goal: clip the position back into the grid.
        if self.pos[0] >= self.r:
            self.pos[0] = self.r - 1
        elif self.pos[0] < 0:
            self.pos[0] = 0
        if self.pos[1] >= self.c:
            self.pos[1] = self.c - 1
        elif self.pos[1] < 0:
            self.pos[1] = 0
        return -1
In [3]:
class Act:
    """Tabular SARSA agent over an r x c grid with King's-move actions
    (row/col deltas in {-1, 0, 1}).
    """

    def __init__(self, r, c):
        # Q[row, col, drow, dcol]: action values. A delta of -1 is stored at
        # index -1 (i.e. position 2), so negative deltas index Q directly.
        self.Q = np.zeros([r, c, 3, 3])
        # All 9 (drow, dcol) combinations; kept for external use.
        self.actSpace = np.array(np.meshgrid(np.arange(-1, 2), np.arange(-1, 2))).T.reshape(-1, 2)

    def act(self, pos, epsilon=0):
        """Return the greedy action [drow, dcol] for `pos`.

        With probability `epsilon` (default 0: original pure-greedy
        behaviour) a uniformly random action is returned instead.
        """
        if epsilon > 0 and np.random.rand() < epsilon:
            return np.random.randint(-1, 2, size=2)
        res = np.array(np.unravel_index(np.argmax(self.Q[tuple(pos)]), self.Q[tuple(pos)].shape))
        res[res == 2] = -1  # index 2 encodes a delta of -1
        return res

    def train(self, env, alpha=0.1, gamma=1, times=10000, deadline=10000, epsilon=0):
        """Run SARSA for `times` episodes of at most `deadline` steps each.

        env      -- environment exposing `pos` and `move(act)`; `move` returns
                    0 on reaching the goal (and resets itself), -1 otherwise.
        alpha    -- learning rate.
        gamma    -- discount factor.
        epsilon  -- exploration rate forwarded to `act` (default 0 preserves
                    the original greedy policy).
        """
        prePos = env.pos.copy()
        preAct = self.act(tuple(prePos), epsilon).copy()
        for _ in range(times):
            for _ in range(deadline):
                reward = env.move(preAct)
                folPos = env.pos.copy()
                # BUG FIX: the follow-up action must be selected at the NEW
                # position (folPos), not the old one (prePos).
                folAct = self.act(tuple(folPos), epsilon).copy()
                # BUG FIX: reward 0 marks the terminal transition (env has
                # already reset); the terminal target is the reward alone —
                # do not bootstrap from the next episode's start state.
                if reward == 0:
                    target = reward
                else:
                    target = reward + gamma * self.Q[tuple(folPos)][tuple(folAct)]
                self.Q[tuple(prePos)][tuple(preAct)] += alpha * (target - self.Q[tuple(prePos)][tuple(preAct)])
                prePos = folPos.copy()
                preAct = folAct.copy()
                if reward == 0:
                    break
In [4]:
# Windy-gridworld configuration (Sutton & Barto, Example 6.5).
r, c = 7, 10                                     # grid: 7 rows x 10 columns
wind = np.array([0, 0, 0, 1, 1, 1, 2, 2, 1, 0])  # per-column wind strength
begin = np.array([3, 0])                         # start cell [row, col]
end = np.array([3, 7])                           # goal cell [row, col]
In [5]:
# Build the environment and the SARSA agent for the grid configured above.
env = Env(r, c, begin, end, wind)
act = Act(r, c)
In [6]:
act.train(env, times=100)





