In [ ]:
import numpy as np
# Discount factor applied to the value of the successor state.
gamma = 0.9

# Convergence threshold: the evaluation sweep stops once the largest
# per-state change in V during one sweep drops below this value.
theta = 50

# Print arrays in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
In [ ]:
class Env:
    """Simulator of two car parks that rent out and take back cars.

    Each park holds at most 20 cars. Every rented car earns a reward of 10
    and every car moved between the parks costs 2. The reward collected so
    far is accumulated in ``self.reward`` until it is read with ``get_rew``.
    """

    def __init__(self, park1=0, park2=0):
        """Start with ``park1`` / ``park2`` cars and no accumulated reward."""
        self.park1 = park1
        self.park2 = park2
        self.reward = 0

    def borrow(self):
        """Sample rental demand (Poisson 3 and 4) and rent out what is in stock.

        Demand beyond the cars available in a park is lost; only cars that
        are actually rented earn reward.
        """
        # Keep the sampling order (park 1 first) so seeded runs are unchanged.
        borrow1 = np.random.poisson(3)
        borrow2 = np.random.poisson(4)
        rented1 = min(borrow1, self.park1)
        rented2 = min(borrow2, self.park2)
        self.park1 -= rented1
        self.park2 -= rented2
        self.reward += (rented1 + rented2) * 10

    def back(self):
        """Sample returned cars (Poisson 3 and 4); each park is capped at 20."""
        back1 = np.random.poisson(3)
        back2 = np.random.poisson(4)
        self.park1 = min(20, self.park1 + back1)
        self.park2 = min(20, self.park2 + back2)

    def change(self, action, times):
        """Move up to ``times`` cars between the parks, paying 2 per car moved.

        Args:
            action: 1 moves cars from park 1 to park 2; 0 moves cars from
                park 2 to park 1. Any other value moves nothing.
            times: number of cars requested to be moved.

        Only cars that exist in the source park are moved and charged for.
        Cars arriving at a full park (more than 20) are discarded.
        """
        if action == 1:
            moved = min(self.park1, times)
            self.park1 -= moved
            self.park2 = min(20, self.park2 + moved)
        elif action == 0:
            moved = min(self.park2, times)
            self.park2 -= moved
            self.park1 = min(20, self.park1 + moved)
        else:
            return
        self.reward += -2 * moved

    def set_state(self, park1, park2):
        """Overwrite the number of cars in both parks."""
        self.park1 = park1
        self.park2 = park2

    def get_rew(self):
        """Return the accumulated reward and reset it to 0."""
        rew = self.reward
        self.reward = 0
        return rew

    def get_park(self):
        """Return the current stock as ``[park1, park2]``."""
        return [self.park1, self.park2]

    def act(self, action):
        """Run one step: move cars, rent, then take returns.

        Args:
            action: ``[direction, times]`` pair forwarded to ``change``.

        Returns:
            ``[reward, [park1, park2]]`` for this step.
        """
        self.change(*action)
        self.borrow()
        self.back()
        return [self.get_rew(), self.get_park()]
In [ ]:
# State-value table and policy table, one entry per (park1, park2) stock
# level from 0 to 20 cars; both start at zero.
V = np.zeros((21, 21))
pi = np.zeros((21, 21))

# Simulator instance with both parks empty.
park = Env()

# Every [direction, cars-to-move] action: direction 0 or 1, 0..5 cars.
A_s = [[action, times] for action in range(2) for times in range(6)]

# Every [park1, park2] state, enumerated row by row.
S = [[part1, part2] for part1 in range(21) for part2 in range(21)]
In [ ]:
# Iterative policy evaluation for the policy stored in `pi`.
#
# V is updated in place, state by state, until the largest change seen in one
# full sweep falls below `theta`.
#
# pi[n1][n2] is read as the net number of cars moved from park 1 to park 2
# before the day starts (a negative value moves cars the other way).

_CAPACITY = 20  # maximum number of cars per park; V and pi are 21 x 21


def _capped_poisson(lam, cap):
    """Return P(min(X, cap) = k) for k = 0..cap, where X ~ Poisson(lam).

    All probability mass of X >= cap is folded into the last entry, so the
    result always sums to 1.
    """
    probs = np.zeros(cap + 1)
    term = np.exp(-lam)  # P(X = 0)
    for k in range(cap):
        probs[k] = term
        term *= lam / (k + 1)  # P(X = k + 1) from P(X = k); no factorials
    probs[cap] = max(0.0, 1.0 - probs[:cap].sum())
    return probs


def _park_model(rent_rate, return_rate, capacity=_CAPACITY):
    """Build the one-day model of a single park.

    Returns:
        expected_rentals: expected_rentals[n] is the expected number of cars
            rented when the park opens with n cars.
        transition: transition[n, m] is the probability that a park opening
            with n cars closes with m cars (rentals first, then returns,
            stock capped at `capacity`).
    """
    expected_rentals = np.zeros(capacity + 1)
    transition = np.zeros((capacity + 1, capacity + 1))
    for cars in range(capacity + 1):
        # Demand above the stock is lost, so rentals are capped at `cars`.
        for rented, p_rent in enumerate(_capped_poisson(rent_rate, cars)):
            expected_rentals[cars] += p_rent * rented
            left = cars - rented
            # Returns beyond the free space are discarded.
            p_back = _capped_poisson(return_rate, capacity - left)
            transition[cars, left:] += p_rent * p_back
    return expected_rentals, transition


# Rental and return rates are 3 for park 1 and 4 for park 2, the same rates
# Env.borrow / Env.back sample from.
_rentals1, _transition1 = _park_model(3, 3)
_rentals2, _transition2 = _park_model(4, 4)

while True:
    delta = 0
    for s in S:
        n1, n2 = int(s[0]), int(s[1])
        v = V[n1][n2]
        # pi is a float array; the number of cars moved must be an integer.
        action = int(pi[n1][n2])
        # Only cars that exist in the source park can be moved.
        if action >= 0:
            moved = min(action, n1)
        else:
            moved = -min(-action, n2)
        # Stock after the move; cars arriving at a full park are discarded.
        s1 = min(_CAPACITY, n1 - moved)
        s2 = min(_CAPACITY, n2 + moved)
        # Expected one-step return: moving cost, rental income, and the
        # discounted expected value of the next state. The two parks evolve
        # independently, so the next-state expectation factorises.
        v_rew = (
            -2 * abs(moved)
            + 10 * (_rentals1[s1] + _rentals2[s2])
            + gamma * (_transition1[s1] @ V @ _transition2[s2])
        )
        V[n1][n2] = v_rew
        delta = max(delta, abs(v - v_rew))
    if delta < theta:
        break
In [ ]:
# Bare expression in the last notebook cell: shows the value table V as output.
V