import numpy as np
import matplotlib.pyplot as plt
# Hyper-parameter values swept by the experiments in this file, each paired
# with the tick label that stands for it on the x-axis of the final plot.
_PARA_TABLE = [
    (1 / 128, '1/128'),
    (1 / 64, '1/64'),
    (1 / 32, '1/32'),
    (1 / 16, '1/16'),
    (1 / 8, '1/8'),
    (1 / 4, '1/4'),
    (1 / 2, '1/2'),
    (1, '1'),
    (2, '2'),
    (4, '4'),
]
para_set = [value for value, _ in _PARA_TABLE]
para_char = [label for _, label in _PARA_TABLE]
class Arm:
    """One bandit arm whose reward is normally distributed.

    Attributes:
        loc: Current mean of the reward distribution.
        scale: Standard deviation of the reward distribution.
    """

    def __init__(self, loc=0, scale=1):
        self.loc, self.scale = loc, scale

    def play(self):
        """Draw one reward from a normal distribution with the current
        ``loc`` and ``scale`` and return it."""
        reward = np.random.normal(self.loc, self.scale)
        return reward

    def change(self):
        """Shift ``loc`` by a step drawn from a normal distribution with
        mean 0 and standard deviation 0.01."""
        drift = np.random.normal(0, 0.01)
        self.loc += drift
class Env:
    """Bandit environment holding a list of identically initialised arms.

    Args:
        arm_num: Number of ``Arm`` objects to create.
        loc: Initial reward mean passed to every arm.
        scale: Reward standard deviation passed to every arm.
    """

    def __init__(self, arm_num=10, loc=0, scale=1):
        self.arm_list = [Arm(loc, scale) for _ in range(arm_num)]

    def act(self, action):
        """Pull the arm at index ``action`` and return its reward.

        The arm's ``change()`` is called before ``play()``, so only the arm
        being pulled is modified, and it is modified before it is sampled.
        """
        chosen = self.arm_list[action]
        chosen.change()
        return chosen.play()
class UCBAct:
    """Statistics of one action, ordered by upper confidence bound.

    Only ``>`` is defined; it is what lets a sequence of ``UCBAct`` objects
    be ranked to find the action with the highest bound.

    Args:
        c: Exploration coefficient that scales the confidence bonus.
        value: Current value estimate of the action.
        time: Number of times the action has been selected.

    Note:
        ``__gt__`` reads a module-level name ``t`` (the current step index).
        It must exist in this module's globals whenever two actions that
        have both been tried are compared; otherwise a ``NameError`` is
        raised.
    """

    # The defaults were previously spelled np.int(0) / np.float(0). Those
    # aliases of the builtins were removed from NumPy, which made this class
    # definition fail on import. The builtin literals keep the same types.
    def __init__(self, c, value=0, time=0.0):
        self.time = time
        self.value = value
        self.c = c

    def __gt__(self, o):
        """Return whether this action's upper confidence bound exceeds ``o``'s.

        An action that has never been selected (``time == 0``) beats any
        other action; this also avoids dividing by a zero count.
        """
        if self.time == 0:
            return True
        if o.time == 0:
            return False
        log_t = np.log(t)  # `t` is looked up in the module globals
        own_bound = self.value + self.c * np.sqrt(log_t / self.time)
        other_bound = o.value + o.c * np.sqrt(log_t / o.time)
        return own_bound > other_bound
class UCB:
    """Upper-confidence-bound agent with a constant step-size value update.

    Args:
        c: Exploration coefficient handed to every ``UCBAct``.
        act_num: Number of available actions.
        alpha: Step size used by ``update``.
    """

    def __init__(self, c, act_num=10, alpha=0.1):
        self.c = c
        self.act_num = act_num
        self.alpha = alpha
        self.act_map = [UCBAct(self.c) for _ in range(act_num)]

    def act(self):
        """Select an action and increment its selection count.

        The choice is ``np.argmax`` over the list of ``UCBAct`` objects, so
        the ranking is whatever ordering ``UCBAct`` defines.
        """
        chosen = np.argmax(self.act_map)
        self.act_map[chosen].time += 1
        return chosen

    def update(self, action, r):
        """Move the estimate of ``action`` towards reward ``r`` by ``alpha``."""
        entry = self.act_map[action]
        entry.value = entry.value + ((r - entry.value) * self.alpha)
# UCB sweep: for each exploration coefficient in para_set, run 200000 steps
# against a fresh environment and store the mean reward of the last 100000.
ubc_res = []
for para in para_set:
    env, ucb = Env(), UCB(para)
    res = 0
    # The step index has to remain the module-level name `t`:
    # UCBAct.__gt__ reads it when it compares confidence bounds.
    for t in range(200000):
        action = ucb.act()
        r = env.act(action)
        ucb.update(action, r)
        if t >= 100000:  # only the second half of the run is averaged
            res += r
    ubc_res.append(res / 100000)
ubc_res
class Eps:
    """Epsilon-greedy agent with a constant step-size value update.

    ``act_map`` is a record array with one record per action: field 0 is the
    value estimate (float, starts at 0) and field 1 is the number of times
    the action has been selected (int).

    Args:
        epsilon: Probability of picking an action uniformly at random.
        act_num: Number of available actions.
        alpha: Step size used by ``update``.
    """

    def __init__(self, epsilon=0.1, act_num=10, alpha=0.1):
        self.epsilon = epsilon
        self.alpha = alpha
        self.act_num = act_num
        estimates = np.zeros(act_num)
        counts = np.zeros(act_num).astype('int')
        self.act_map = np.rec.fromarrays([estimates, counts])

    def act(self):
        """Select an action, increment its selection count and return it."""
        explore = np.random.choice([0, 1], p=[1 - self.epsilon, self.epsilon])
        if not explore:
            # Greedy choice; f0 is the default name of the first field.
            action = np.argmax(self.act_map.f0)
        else:
            action = np.random.choice(range(self.act_num))
        self.act_map[action][1] += 1
        return action

    def update(self, action, r):
        """Move the estimate of ``action`` towards reward ``r`` by ``alpha``."""
        old = self.act_map[action][0]
        self.act_map[action][0] = old + ((r - old) * self.alpha)
# Epsilon-greedy sweep: for each epsilon in para_set, run 200000 steps on a
# fresh environment and store the mean reward of the last 100000.
eps_res = []
for para in para_set:
    # Epsilon is used as a probability, so the sweep ends at the first
    # parameter above 1.
    if para > 1:
        break
    env, eps = Env(), Eps(para)
    res = 0
    for t in range(200000):
        action = eps.act()
        r = env.act(action)
        eps.update(action, r)
        if t >= 100000:  # only the second half of the run is averaged
            res += r
    eps_res.append(res / 100000)
eps_res
class Opt:
    """Epsilon-greedy agent with optimistic initial value estimates.

    ``act_map`` is a record array with one record per action: field 0 is the
    value estimate (float) and field 1 is the number of times the action has
    been selected (int).

    Args:
        epsilon: Probability of picking an action uniformly at random.
        act_num: Number of available actions.
        alpha: Step size used by ``update``.
        init_value: Initial estimate given to every action (default 1).
    """

    def __init__(self, epsilon=0.1, act_num=10, alpha=0.1, init_value=1.0):
        self.alpha = alpha
        self.epsilon = epsilon
        self.act_num = act_num
        # The estimate field must be float. Building it from integer ones
        # produced an integer field, so every value written by update() was
        # silently truncated to a whole number.
        estimates = np.full(act_num, init_value, dtype=float)
        counts = np.zeros(act_num, dtype=int)
        self.act_map = np.rec.fromarrays([estimates, counts])

    def act(self):
        """Select an action, increment its selection count and return it."""
        explore = np.random.choice([0, 1], p=[1 - self.epsilon, self.epsilon])
        if explore:
            action = np.random.choice(range(self.act_num))
        else:
            # Greedy choice; f0 is the default name of the first field.
            action = np.argmax(self.act_map.f0)
        self.act_map[action][1] += 1
        return action

    def update(self, action, r):
        """Move the estimate of ``action`` towards reward ``r`` by ``alpha``."""
        old = self.act_map[action][0]
        self.act_map[action][0] = old + (r - old) * self.alpha
# Optimistic-initial-value sweep: for each epsilon in para_set, run 200000
# steps on a fresh environment and store the mean reward of the last 100000.
opt_res = []
for para in para_set:
    # Epsilon is used as a probability, so the sweep ends at the first
    # parameter above 1.
    if para > 1:
        break
    env, opt = Env(), Opt(para)
    res = 0
    for t in range(200000):
        action = opt.act()
        r = env.act(action)
        opt.update(action, r)
        if t >= 100000:  # only the second half of the run is averaged
            res += r
    opt_res.append(res / 100000)
opt_res
class Gra:
    """Gradient-bandit agent: softmax policy over per-action preferences.

    Attributes:
        H_map: Preference of each action (float array, starts at 0).
        act_num: Number of available actions.
        alpha: Step size of the preference update.
        d: Normaliser of the most recently computed softmax. It is taken
            over max-shifted preferences, so it is at least 1 once computed.
        r_avg: Running mean of all rewards passed to ``update``; used as
            the baseline.
        n: Number of ``update`` calls so far.
    """

    def __init__(self, alpha=0.1, act_num=10):
        self.H_map = np.zeros(act_num)
        self.act_num = act_num
        self.alpha = alpha
        self.d = 0
        self.r_avg = 0
        self.n = 0

    def _policy(self):
        """Return the softmax probabilities of the current preferences."""
        # Subtracting the maximum leaves the probabilities unchanged but
        # keeps exp() from overflowing to inf (inf / inf would give NaN
        # probabilities, which np.random.choice rejects).
        shifted = np.exp(self.H_map - np.max(self.H_map))
        self.d = np.sum(shifted)
        return shifted / self.d

    def act(self):
        """Sample an action index from the softmax policy."""
        return np.random.choice(np.arange(self.act_num), p=self._policy())

    def update(self, action, r):
        """Apply one gradient-bandit step for ``action`` and reward ``r``.

        The baseline is the mean of all rewards seen so far, including
        ``r``. The agent counts its own updates rather than reading a
        module-level step variable, and recomputes the policy itself, so it
        works outside the experiment loop and without a prior ``act`` call.
        """
        self.n += 1
        self.r_avg += (r - self.r_avg) / self.n
        temp_mu = self.alpha * (r - self.r_avg)
        # Lower every preference in proportion to its probability, then
        # raise the chosen one by the full amount.
        self.H_map = self.H_map - temp_mu * self._policy()
        self.H_map[action] += temp_mu
# Gradient-bandit sweep: for each step size in para_set, run 200000 steps on
# a fresh environment and store the mean reward of the last 100000.
gra_res = []
for para in para_set:
    env, gra = Env(), Gra(para)
    res = 0
    # Keep the step index as the module-level name `t`; agent code in this
    # file may read it as a global.
    for t in range(200000):
        action = gra.act()
        r = env.act(action)
        gra.update(action, r)
        if t >= 100000:  # only the second half of the run is averaged
            res += r
    gra_res.append(res / 100000)
gra_res
# Plot each method's mean late-run reward against the swept parameter.
# The keyword has to be lowercase `label`: capitalised property names are
# rejected by current matplotlib. The first legend entry previously read
# 'UBC', a typo for the UCB agent.
plt.plot(range(len(para_set)), ubc_res, label='UCB')
# The Eps and Opt sweeps stop once the parameter exceeds 1, so they have
# fewer points. Plot only the measured ones rather than padding with zeros,
# which would draw fabricated rewards at parameters that were never run.
plt.plot(range(len(eps_res)), eps_res, label='Eps')
plt.plot(range(len(opt_res)), opt_res, label='Opt')
plt.plot(range(len(para_set)), gra_res, label='Gra')
plt.legend(loc=0, ncol=2)
# Label the x positions with the readable fractions instead of indices.
_ = plt.xticks(range(len(para_set)), para_char)