"""Play the Gym ``KellyCoinflip-v0`` game with a one-step lookahead policy.

Running this module creates the environment, plays 500 games using the bets
chosen by ``VPplan`` and prints the mean total reward per game.
"""
import gym
import numpy as np
from repoze.lru import lru_cache
from scipy.stats import binom

# Wealth level at which V returns the cap and VPplan stops betting.
# NOTE(review): presumably this matches the environment's maximum wealth --
# confirm against the KellyCoinflip-v0 configuration.
MAX_WEALTH = 250

env = gym.make('KellyCoinflip-v0')


def V(w, b, m=MAX_WEALTH):
    """Return the closed-form value estimate for wealth ``w`` with ``b`` bets left.

    ``VPplan`` uses this as the value of the state reached after a bet.

    Args:
        w: Current wealth.
        b: Number of bets remaining.
        m: Wealth cap; any ``w >= m`` is worth exactly ``m``.

    Returns:
        ``m`` if ``w >= m``, ``0`` if ``w <= 0``, ``w`` if ``b == 0``;
        otherwise the value of the binomial closed-form expression below.
        If the binomial quantile cannot be converted to an integer (for
        example it is NaN because ``b`` is not a valid trial count), an error
        line is printed and ``None`` is returned.
    """
    if w >= m:
        return m
    if w <= 0:
        return 0
    if b == 0:
        return w

    # Quantile of Binomial(b, 0.5) at probability w/m.
    j = binom.ppf(float(w) / float(m), b, 0.5)
    try:
        # int() raises ValueError when the quantile is NaN.
        n_terms = int(j + 1)
    except ValueError:
        print("Error:", (w, b, m))
        return None

    # CDF evaluated at -1, 0, ..., n_terms - 2, paired element-wise with the
    # descending powers 1.5**(n_terms - 1), ..., 1.5**0.
    cdf_terms = binom.cdf([k - 1 for k in range(n_terms)], b, 0.5)
    weights = [1.5 ** k for k in reversed(range(n_terms))]
    # Built-in sum keeps the same left-to-right floating-point accumulation
    # regardless of NumPy's reduction strategy.
    weighted_sum = sum(np.multiply(cdf_terms, weights))
    return 1.2 ** b * 1.5 ** -j * (w + m / 2 * weighted_sum)


@lru_cache(None)
def VPplan(w, b):
    """Choose the bet for wealth ``w`` with ``b`` bets remaining.

    Every bet from 0 to ``w`` in steps of 0.01 is scored by
    ``0.6 * V(w + bet, b - 1) + 0.4 * V(w - bet, b - 1)`` and the
    best-scoring one is returned; ties go to the smallest bet. Results are
    memoized per ``(w, b)`` pair.

    Args:
        w: Current wealth.
        b: Number of bets remaining.

    Returns:
        ``0`` when ``w <= 0`` or ``w >= MAX_WEALTH`` (short-circuit, nothing
        left to decide), ``w`` when ``b == 0``, otherwise the chosen bet in
        the same units as ``w``.
    """
    # optimization: short-circuit
    if w <= 0 or w >= MAX_WEALTH:
        return 0
    if b == 0:
        return w

    # Candidate bets on a 0.01 grid: 0.00, 0.01, ..., up to w.
    possible_bets = [pb / 100.0 for pb in range(int((w * 100) + 1))]
    returns = [
        0.6 * V(w + pb, b - 1) + 0.4 * V(w - pb, b - 1)
        for pb in possible_bets
    ]
    # The grid index of the best score, divided by 100, is the bet itself.
    return float(returns.index(max(returns))) / 100.0


# play 500 games and calculate mean reward:
rewards = []
for n in range(500):
    done = False
    reward = 0
    while not done:
        obs = env._get_obs()
        w = obs[0][0]
        b = obs[1]
        bet = VPplan(w, b)
        # The bet is scaled by 100 before being handed to the environment
        # (presumably the action is expressed in cents -- confirm).
        results = env.step(bet * 100)
        print(n, w, b, bet, "results:", results)
        reward = reward + results[1]
        done = results[2]
    rewards.append(reward)
    env.reset()

print(sum(rewards) / len(rewards))
# Original article: https://www.cnblogs.com/dzqdzq/p/15008589.html