import numpy as np

# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)
        
# Stopping threshold: sweeps end once the largest value change is below theta.
theta = 50
# Discount factor applied to the value of the successor state.
gamma = 0.9

class Env():
    """Sampling simulator of a car rental business with two parking lots.

    Each lot holds between 0 and 20 cars.  One step (see ``act``) moves cars
    between the lots, rents cars out and finally takes returned cars back.
    Rewards earned during a step are accumulated in ``self.reward`` until
    they are collected with ``get_rew``.
    """

    def __init__(self, park1=0, park2=0):
        """Start with ``park1`` / ``park2`` cars in the lots and no reward."""
        self.park1 = park1
        self.park2 = park2
        self.reward = 0

    def borrow(self):
        """Rent cars out; every rented car earns a reward of 10.

        Demand is Poisson(3) at park1 and Poisson(4) at park2.  A lot cannot
        rent out more cars than it currently holds.
        """
        demand1 = np.random.poisson(3)
        demand2 = np.random.poisson(4)
        rented1 = min(demand1, self.park1)
        rented2 = min(demand2, self.park2)
        self.park1 -= rented1
        self.park2 -= rented2
        self.reward += 10 * (rented1 + rented2)

    def back(self):
        """Take returned cars back: Poisson(3) at park1, Poisson(4) at park2.

        Cars that would push a lot above 20 are dropped.
        """
        returned1 = np.random.poisson(3)
        returned2 = np.random.poisson(4)
        self.park1 = min(20, self.park1 + returned1)
        self.park2 = min(20, self.park2 + returned2)

    def change(self, action, times):
        """Move up to ``times`` cars between the lots at a cost of 2 per car.

        action == 1 moves cars from park1 to park2,
        action == 0 moves cars from park2 to park1;
        any other action leaves the lots untouched.

        Only cars that really exist in the source lot are moved (and paid
        for).  The destination lot is capped at 20 cars.
        """
        if action == 1:
            # A negative request is treated as "move nothing".
            moved = max(0, min(self.park1, times))
            self.park1 -= moved
            self.park2 = min(20, self.park2 + moved)
        elif action == 0:
            moved = max(0, min(self.park2, times))
            self.park2 -= moved
            self.park1 = min(20, self.park1 + moved)
        else:
            return
        self.reward -= 2 * moved

    def set_state(self, park1, park2):
        """Overwrite the number of cars in both lots."""
        self.park1 = park1
        self.park2 = park2

    def get_rew(self):
        """Return the reward accumulated so far and reset it to zero."""
        rew = self.reward
        self.reward = 0
        return rew

    def get_park(self):
        """Return the current state as ``[park1, park2]``."""
        return [self.park1, self.park2]

    def act(self, action):
        """Run one full step and return ``[reward, [park1, park2]]``.

        ``action`` is an ``(action, times)`` pair forwarded to ``change``;
        the step then rents cars out and takes returns back, in that order.
        """
        self.change(*action)
        self.borrow()
        self.back()
        return [self.get_rew(), self.get_park()]

# Value table and policy table, one entry per (park1, park2) car count.
V = np.zeros((21, 21))
pi = np.zeros((21, 21))

# Simulator instance, starting with both lots empty.
park = Env()

# Every [action, times] pair: action in {0, 1}, times in 0..5.
A_s = []
for action, times in np.ndindex(2, 6):
    A_s.append([action, times])

# Every [park1, park2] state with 0..20 cars in each lot.
S = []
for part1, part2 in np.ndindex(21, 21):
    S.append([part1, part2])

def _capped_poisson(lam, cap):
    """Return the distribution of min(X, cap) for X ~ Poisson(lam).

    Entry k < cap is P(X = k); entry ``cap`` absorbs the whole tail
    P(X >= cap), so the returned vector always sums to 1.
    """
    probs = np.empty(cap + 1)
    probs[0] = np.exp(-lam)
    for k in range(1, cap + 1):
        # Recurrence P(k) = P(k - 1) * lam / k avoids explicit factorials.
        probs[k] = probs[k - 1] * lam / k
    probs[cap] = max(0.0, 1.0 - probs[:cap].sum())
    return probs


def _lot_model(rent_lam, back_lam, capacity):
    """Return ``(transition, expected_rentals)`` for a single lot.

    ``transition[n, m]`` is the probability that a lot holding ``n`` cars
    after the move ends the step with ``m`` cars (rentals first, returns
    second).  ``expected_rentals[n]`` is the expected number of cars rented
    out of ``n`` available ones.
    """
    transition = np.zeros((capacity + 1, capacity + 1))
    expected_rentals = np.zeros(capacity + 1)
    for cars in range(capacity + 1):
        # Rentals cannot exceed the cars on hand: excess demand is lumped
        # into the "rent everything" outcome.
        rent_probs = _capped_poisson(rent_lam, cars)
        expected_rentals[cars] = rent_probs @ np.arange(cars + 1)
        for rented, p_rent in enumerate(rent_probs):
            left = cars - rented
            # Returns beyond the free space are lost, hence the cap.
            back_probs = _capped_poisson(back_lam, capacity - left)
            transition[cars, left:] += p_rent * back_probs
    return transition, expected_rentals


def _evaluate_policy(values, policy, discount, tolerance):
    """Evaluate ``policy`` in place on ``values`` and return ``values``.

    ``policy[n1, n2]`` is the number of cars to move from lot 1 to lot 2
    (a negative entry moves cars the other way).  Moving costs 2 per car and
    every rental earns 10.  States are swept in row-major order, each update
    immediately reusing the freshest values, until the largest change seen
    in a sweep drops below ``tolerance``.
    """
    cap1 = values.shape[0] - 1
    cap2 = values.shape[1] - 1
    # The two lots evolve independently once the move is done, so the joint
    # expectation factorises into one small matrix per lot.
    trans1, rent1 = _lot_model(3, 3, cap1)
    trans2, rent2 = _lot_model(4, 4, cap2)
    while True:
        delta = 0.0
        for n1 in range(cap1 + 1):
            for n2 in range(cap2 + 1):
                old = values[n1, n2]
                wanted = int(policy[n1, n2])
                # Only cars that exist in the source lot can be moved.
                if wanted >= 0:
                    moved = min(wanted, n1)
                else:
                    moved = -min(-wanted, n2)
                after1 = min(cap1, n1 - moved)
                after2 = min(cap2, n2 + moved)
                new = (
                    -2 * abs(moved)
                    + 10 * (rent1[after1] + rent2[after2])
                    + discount * (trans1[after1] @ values @ trans2[after2])
                )
                values[n1, n2] = new
                delta = max(delta, abs(old - new))
        if delta < tolerance:
            return values


_evaluate_policy(V, pi, gamma, theta)

V