It is best to avoid global variables, so we rewrite the previous example in an object-oriented style and add a graphical display.
This is the environment class:
classdef one_dimensional_env
% One-dimensional treasure-hunt environment class
properties
len
actions
fresh_time
fig
agent
observation
reward
done
info
max_episodes
step_counter
end
% The properties above are publicly accessible and writable
% The properties below can only be set from inside the class (they keep the initial settings for reset)
properties (SetAccess = private)
len_
fresh_time_
fig_
end
% The methods in this block can be called from outside the class
methods
function self = one_dimensional_env(len,fresh_time)
% Initialization when the object is constructed
switch nargin
case 1 % only the length was given, use the default refresh time
self.len=len;
self.fresh_time=0.05;
case 2
self.len=len;
self.fresh_time=fresh_time;
otherwise
error('Missing input arguments');
end
self.max_episodes=13;
self.step_counter=0;
self.actions=[1 2]; % 1 = move left, 2 = move right
self.agent=1; % the agent starts in the leftmost cell
self.observation=self.agent;
self.reward=0;
self.done=0;
self.info=0;
h=figure;
self.fig = h.Number;
% keep private copies so reset() can restore the original settings
self.len_=self.len;
self.fresh_time_=self.fresh_time;
self.fig_=self.fig;
end
function render(self)
% Draw the current state of the environment
pause(self.fresh_time)
h=figure(self.fig);
clf(h); % clear the previous frame so axes objects do not pile up
ax = axes('Parent',h);
ax.YAxis.Visible = 'off';
ax.XAxis.Visible = 'off';
for i=1:self.len
draw.rect(i*10,50,10); % one square cell per state
if i==self.agent
draw.circle(i*10,50,2,'r'); % the red circle marks the agent
end
end
draw.circle(i*10,50,4,'b'); % the blue circle marks the treasure in the last cell
axis(ax, 'equal')
end
function self = step(self,action)
% Take one action, then update observation, reward and the done flag
self.done = 0;
self.step_counter = self.step_counter + 1;
if action == 1 % move left
temp = self.agent-1;
else % action == 2, move right
temp = self.agent+1;
end
if temp < 1 % clamp at the left wall
temp=1;
end
if temp >= self.len % the treasure sits in the last cell
temp=self.len;
self.done = 1;
end
self.observation=temp;
self.reward=self.done; % reward is 1 only when the treasure is found
self.info=0;
end
function self=reset(self)
% Reset the environment to its initial state using the saved settings
self.len=self.len_;
self.fresh_time=self.fresh_time_;
self.fig=self.fig_;
self.actions=[1 2];
self.step_counter=0;
self.agent=1;
self.observation=self.agent;
self.reward=0;
self.done=0;
self.info=0;
end
end
end
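Before wiring the environment into the training loop, it can be exercised on its own. The snippet below is a minimal sketch (it assumes the draw helper class from the previous example is on the MATLAB path) that creates a 6-cell environment and simply walks the agent to the right until it reaches the treasure:
% Manual smoke test of the environment class
env = one_dimensional_env(6,0.05); % 6 cells, refresh every 0.05 s
env = env.reset();
env.render();
while ~env.done
env = env.step(2); % action 2 = always move right
env.agent = env.observation; % accept the new state
env.render();
end
disp(env.reward) % prints 1 once the treasure has been reached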
This is the reinforcement-learning logic class:
classdef rl
% Q-learning logic: Q table, epsilon-greedy action selection and the update rule
properties
q_table
actions
epsilon
alpha
gamma
end
methods
function obj = rl(n_states,actions,epsilon,alpha,gamma)
% Initialize the hyper-parameters and the Q table
obj.actions=actions; % available actions
obj.epsilon=epsilon; % greediness: probability of exploiting the Q table
obj.alpha=alpha; % learning rate
obj.gamma=gamma; % discount factor
obj.q_table = zeros(n_states, length(actions)); % one row per state, one column per action
end
function obj=update_q_table(obj,agent, A,q_predict,q_target)
% Q-learning update: Q(s,a) = Q(s,a) + alpha * (q_target - q_predict)
obj.q_table(agent, A) = obj.q_table(agent, A) + obj.alpha * (q_target - q_predict);
end
function action_name = choose_action(obj,state)
% Epsilon-greedy action selection
state_actions = obj.q_table(state, :); % Q values of this state
if (rand() > obj.epsilon) || (all(state_actions == 0))
% explore: act randomly (also used while this state is still all zeros)
action_name = obj.actions(randi(numel(obj.actions)));
else % exploit
[~,I] = max(state_actions);
action_name = obj.actions(I); % pick the action with the largest Q value
end
end
end
end
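The rule implemented by update_q_table is the standard tabular Q-learning step, Q(s,a) = Q(s,a) + alpha*(r + gamma*max(Q(s',:)) - Q(s,a)), where the bracketed term is exactly q_target - q_predict in the code. As a minimal sketch of how the class is meant to be driven (the variable name brain, the state indices, the reward and the next state below are made up purely for illustration):
% Toy standalone use of the rl class
brain = rl(6,[1 2],0.9,0.1,0.9); % 6 states, actions 1 and 2
s = 2; % a made-up current state
A = brain.choose_action(s); % epsilon-greedy choice
q_predict = brain.q_table(s, A);
r = 0; s_next = 3; % made-up reward and next state
q_target = r + brain.gamma * max(brain.q_table(s_next, :));
brain = brain.update_q_table(s, A, q_predict, q_target);
disp(brain.q_table)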
Finally, the main program that uses the two classes above:
% Reinforcement learning main script
clear; clc; close all;
rng('default'); % reproducible random numbers
env=one_dimensional_env(6,0.05); % 6 cells, refresh every 0.05 s
one_dimensional_rl=rl(env.len,env.actions,0.9,0.1,0.9); % epsilon=0.9, alpha=0.1, gamma=0.9
pause(2)
for episode =1:env.max_episodes
is_terminated = 0;
env = env.reset();
env.render();
while ~is_terminated
A = one_dimensional_rl.choose_action(env.agent); % choose an action for the current state
env = env.step(A); % take the action; env.observation now holds the next state and env.reward the reward
q_predict = one_dimensional_rl.q_table(env.agent, A); % env.agent is still the current state here
if env.done ~= 1
q_target = env.reward + one_dimensional_rl.gamma * max(one_dimensional_rl.q_table(env.observation, :)); % episode not finished yet
else
q_target = env.reward; % the episode is over, so there is no future reward
is_terminated = 1; % update the termination flag
end
one_dimensional_rl=one_dimensional_rl.update_q_table(env.agent, A,q_predict,q_target); % update the Q table
% disp(one_dimensional_rl.q_table)
env.agent=env.observation; % move the agent to the next state
env.render();
end