强化学习入门:从Q-Learning到深度强化学习
强化学习入门:从Q-Learning到深度强化学习
什么是强化学习?
强化学习是机器学习的三大范式之一(监督学习、无监督学习、强化学习)。它研究的是智能体如何在环境中通过试错来学习最优策略。
核心概念:
- 智能体(Agent):学习和决策的主体
- 环境(Environment):智能体交互的外部世界
- 状态(State):环境的描述
- 动作(Action):智能体可以执行的操作
- 奖励(Reward):环境对动作的反馈
强化学习vs监督学习
| 特性 | 监督学习 | 强化学习 |
|---|---|---|
| 数据 | 标注数据 | 环境交互 |
| 反馈 | 即时 | 延迟 |
| 目标 | 最小化误差 | 最大化累积奖励 |
| 探索 | 不需要 | 需要探索 |
核心算法
1. Q-Learning
Q-Learning是一种无模型的强化学习算法,通过学习动作价值函数Q(s,a)来找到最优策略。
import numpy as np
class QLearning:
def __init__(self, state_size, action_size, alpha=0.1, gamma=0.99, epsilon=0.1):
self.q_table = np.zeros((state_size, action_size))
self.alpha = alpha # 学习率
self.gamma = gamma # 折扣因子
self.epsilon = epsilon # 探索率
def choose_action(self, state):
if np.random.random() < self.epsilon:
return np.random.randint(self.q_table.shape[1])
return np.argmax(self.q_table[state])
def update(self, state, action, reward, next_state, done):
if done:
target = reward
else:
target = reward + self.gamma * np.max(self.q_table[next_state])
self.q_table[state, action] += self.alpha * (target - self.q_table[state, action])
2. DQN(Deep Q-Network)
DQN使用神经网络来近似Q函数,解决了Q-Learning在高维状态空间中的问题。
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
class DQN(nn.Module):
def __init__(self, state_size, action_size):
super().__init__()
self.network = nn.Sequential(
nn.Linear(state_size, 128),
nn.ReLU(),
nn.Linear(128, 128),
nn.ReLU(),
nn.Linear(128, action_size)
)
def forward(self, x):
return self.network(x)
class DQNAgent:
def __init__(self, state_size, action_size):
self.state_size = state_size
self.action_size = action_size
self.memory = deque(maxlen=10000)
self.gamma = 0.99
self.epsilon = 1.0
self.epsilon_min = 0.01
self.epsilon_decay = 0.995
self.batch_size = 32
self.model = DQN(state_size, action_size)
self.target_model = DQN(state_size, action_size)
self.optimizer = optim.Adam(self.model.parameters())
self.loss_fn = nn.MSELoss()
def remember(self, state, action, reward, next_state, done):
self.memory.append((state, action, reward, next_state, done))
def act(self, state):
if random.random() < self.epsilon:
return random.randrange(self.action_size)
state_tensor = torch.FloatTensor(state).unsqueeze(0)
with torch.no_grad():
q_values = self.model(state_tensor)
return q_values.argmax().item()
def replay(self):
if len(self.memory) < self.batch_size:
return
batch = random.sample(self.memory, self.batch_size)
states, actions, rewards, next_states, dones = zip(*batch)
states = torch.FloatTensor(states)
actions = torch.LongTensor(actions)
rewards = torch.FloatTensor(rewards)
next_states = torch.FloatTensor(next_states)
dones = torch.FloatTensor(dones)
current_q = self.model(states).gather(1, actions.unsqueeze(1))
next_q = self.target_model(next_states).max(1)[0]
target_q = rewards + (1 - dones) * self.gamma * next_q
loss = self.loss_fn(current_q.squeeze(), target_q.detach())
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
def update_target(self):
self.target_model.load_state_dict(self.model.state_dict())
3. Policy Gradient
策略梯度直接优化策略函数,适合连续动作空间。
class PolicyNetwork(nn.Module):
def __init__(self, state_size, action_size):
super().__init__()
self.network = nn.Sequential(
nn.Linear(state_size, 128),
nn.ReLU(),
nn.Linear(128, 128),
nn.ReLU(),
nn.Linear(128, action_size),
nn.Softmax(dim=-1)
)
def forward(self, x):
return self.network(x)
class PolicyGradientAgent:
def __init__(self, state_size, action_size):
self.state_size = state_size
self.action_size = action_size
self.gamma = 0.99
self.policy = PolicyNetwork(state_size, action_size)
self.optimizer = optim.Adam(self.policy.parameters(), lr=0.001)
self.log_probs = []
self.rewards = []
def act(self, state):
state_tensor = torch.FloatTensor(state).unsqueeze(0)
action_probs = self.policy(state_tensor)
dist = torch.distributions.Categorical(action_probs)
action = dist.sample()
self.log_probs.append(dist.log_prob(action))
return action.item()
def remember(self, reward):
self.rewards.append(reward)
def train(self):
returns = []
G = 0
for reward in reversed(self.rewards):
G = reward + self.gamma * G
returns.insert(0, G)
returns = torch.FloatTensor(returns)
returns = (returns - returns.mean()) / (returns.std() + 1e-8)
loss = 0
for log_prob, G in zip(self.log_probs, returns):
loss -= log_prob * G
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
self.log_probs.clear()
self.rewards.clear()
4. Actor-Critic
Actor-Critic结合了策略梯度和价值函数的优点。
class ActorCritic(nn.Module):
def __init__(self, state_size, action_size):
super().__init__()
self.shared = nn.Sequential(
nn.Linear(state_size, 128),
nn.ReLU()
)
self.actor = nn.Linear(128, action_size)
self.critic = nn.Linear(128, 1)
def forward(self, x):
shared = self.shared(x)
action_probs = torch.softmax(self.actor(shared), dim=-1)
value = self.critic(shared)
return action_probs, value
class ActorCriticAgent:
def __init__(self, state_size, action_size):
self.gamma = 0.99
self.model = ActorCritic(state_size, action_size)
self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
def act(self, state):
state_tensor = torch.FloatTensor(state).unsqueeze(0)
action_probs, value = self.model(state_tensor)
dist = torch.distributions.Categorical(action_probs)
action = dist.sample()
return action.item(), dist.log_prob(action), value
def train(self, log_prob, value, reward, next_value, done):
if done:
target = reward
else:
target = reward + self.gamma * next_value
advantage = target - value.item()
actor_loss = -log_prob * advantage
critic_loss = (target - value) ** 2
loss = actor_loss + critic_loss
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
强化学习的应用
1. 游戏AI
AlphaGo、AlphaStar等使用强化学习战胜人类顶尖选手。
2. 机器人控制
让机器人学习走路、抓取物体等复杂动作。
3. 推荐系统
使用强化学习优化长期用户满意度,而非短期点击率。
4. 自动驾驶
在模拟环境中学习驾驶策略。
强化学习的挑战
- 样本效率:需要大量交互数据
- 奖励设计:奖励函数难以设计
- 探索与利用:如何平衡探索新策略和利用已知策略
- 稳定性:训练过程可能不稳定
总结
强化学习是实现通用人工智能的重要途径。从简单的Q-Learning到复杂的深度强化学习算法,强化学习正在各个领域展现强大的能力。掌握强化学习的基本原理对于理解现代AI系统至关重要。