
Q-Learning Paper

# import packages
import numpy as np
import pandas as pd
import time

Define the parameters

np.random.seed(2)
N_STATES = 6                 # length of the 1-D world: how far the start is from the treasure
ACTIONS = ['left', 'right']  # available actions
EPSILON = 0.9                # greedy rate: pick the best action 90% of the time, a random one 10%
ALPHA = 0.1                  # learning rate
LAMBDA = 0.9                 # discount factor for future rewards
MAX_EPISODES = 13            # number of episodes
FRESH_TIME = 0.3             # seconds per move, so the animation is watchable
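
These parameters plug straight into the update performed in rl() below (LAMBDA here plays the role of the discount factor usually written as gamma): with probability EPSILON the agent takes the best-known action, otherwise a random one, and each step adjusts the table by

Q(S, A) <- Q(S, A) + ALPHA * (q_target - q_predict)

where q_predict = Q(S, A) is the current estimate and q_target = R + LAMBDA * max over a of Q(S_, a).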
# build the initial Q-table: one row per state, one column per action
def build_q_table(n_states, actions):
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),   # all Q-values start at zero
        columns=actions,
    )
    print(table)
    return table
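With the parameters above, build_q_table(N_STATES, ACTIONS) starts every value at zero; the printed table should look something like this (exact spacing depends on your pandas version):

   left  right
0   0.0    0.0
1   0.0    0.0
2   0.0    0.0
3   0.0    0.0
4   0.0    0.0
5   0.0    0.0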
# choose an action with an epsilon-greedy policy
def choice_action(state, q_table):
    # select this state's row of Q-values
    state_action = q_table.iloc[state, :]
    # explore (pick randomly) if a uniform draw exceeds EPSILON,
    # or if this state has never been updated (all Q-values still zero)
    if (np.random.uniform() > EPSILON) or (state_action == 0).all():
        action_name = np.random.choice(ACTIONS)
    else:
        # exploit: pick the action with the largest Q-value for this state
        action_name = state_action.idxmax()
    return action_name
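A quick, illustrative check of the epsilon-greedy split (not part of the original program; it assumes the definitions above have already been executed). Once state 0 has one nonzero entry, 'right' should be picked roughly 95% of the time: 90% greedy plus half of the 10% random draws:

demo_table = build_q_table(N_STATES, ACTIONS)
demo_table.loc[0, 'right'] = 0.1                      # pretend one update has happened
picks = [choice_action(0, demo_table) for _ in range(1000)]
print(picks.count('right') / 1000)                    # roughly 0.95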
# environment feedback: next state S_ and reward R for taking action A in state S
def get_env_feedback(S, A):
    if A == 'right':
        if S == N_STATES - 2:    # one step to the left of the treasure
            S_ = 'terminal'
            R = 1                # the only reward: reaching the treasure
        else:
            S_ = S + 1
            R = 0
    else:
        R = 0
        if S == 0:               # already at the left wall, stay put
            S_ = S
        else:
            S_ = S - 1
    return S_, R
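To see the whole environment at a glance, you can enumerate every non-terminal state and action (an illustrative snippet, assuming the function above is defined); only the move from state 4 into the treasure returns a reward of 1:

for S in range(N_STATES - 1):
    for A in ACTIONS:
        print(S, A, '->', get_env_feedback(S, A))    # e.g. 4 right -> ('terminal', 1)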
# render the one-dimensional world, e.g. "O----T", and report finished episodes
def update_env(S, episode, step_counter):
    env_list = ['-'] * (N_STATES - 1) + ['T']   # 'T' marks the treasure
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (str(episode + 1), step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r                       ', end='')  # clear the line
    else:
        env_list[S] = 'O'                           # 'O' marks the agent
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)
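For example, with N_STATES = 6 the call update_env(0, 0, 0) draws the six-character strip "O----T": the agent 'O' at the far left, the treasure 'T' at the far right, then pauses for FRESH_TIME seconds.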
# main Q-learning loop
def rl():
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0                       # every episode starts at the far left
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            A = choice_action(S, q_table)
            S_, R = get_env_feedback(S, A)
            # estimated value of the chosen action
            q_predict = q_table.loc[S, A]
            if S_ != 'terminal':
                # target value: immediate reward plus discounted best future value
                q_target = R + LAMBDA * q_table.iloc[S_, :].max()
            else:
                q_target = R        # terminal state has no future value
                is_terminated = True
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)   # TD update
            S = S_
            update_env(S, episode, step_counter + 1)
            step_counter += 1
    return q_table
if __name__ == '__main__':
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)
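
Because the world is deterministic and the only reward is the 1 at the treasure, the optimal value of moving right has a closed form, Q*(s, 'right') = LAMBDA ** (N_STATES - 2 - s), which makes a handy sanity check (illustrative, not part of the original script). Thirteen episodes with ALPHA = 0.1 will not reach these values, but the learned 'right' column should already increase toward the treasure:

for s in range(N_STATES - 1):
    print(s, LAMBDA ** (N_STATES - 2 - s))   # 0.6561, 0.729, 0.81, 0.9, 1.0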
