Below is a detailed MATLAB project example showing how to combine a Deep Q-Network (DQN) with Ant Colony Optimization (ACO) for 3D UAV path planning.
⚠️ Note: since DQN and ACO belong to different paradigms (reinforcement learning vs. heuristic optimization), this project uses a hybrid strategy: either ACO supplies high-quality initial paths or experience-replay samples to assist DQN training, or DQN guides the pheromone update of ACO. Here we take the first route and let ACO generate prior paths that serve as expert demonstrations for the DQN, accelerating its convergence.
🎯 Project Goal
Plan a collision-free, smooth, and short path for a UAV from the start point to the goal in a 3D grid map.
📁 Project Structure Overview
UAV_DQN_ACO/
│
├── main.m % main program entry point
├── env_3d.m % 3D environment model (with obstacles)
├── dqn_agent.m % DQN agent class
├── aco_path_planner.m % ACO path planner
├── replay_buffer.m % experience replay buffer
├── utils.m % helper functions (coordinate conversion, visualization, etc.)
└── models/ % saved trained networks
1️⃣ Environment Modeling: env_3d.m
matlab
function env = env_3d(map_size, obstacle_ratio)
% Create a 3D occupancy grid map
if nargin < 2, obstacle_ratio = 0.2; end
map = false(map_size(1), map_size(2), map_size(3)); % false = free, true = obstacle
num_obstacles = floor(obstacle_ratio * prod(map_size));
for i = 1:num_obstacles
x = randi(map_size(1));
y = randi(map_size(2));
z = randi(map_size(3));
map(x,y,z) = true;
end
% Ensure the start and goal cells are obstacle-free
start = [1,1,1];
goal = map_size;
map(start(1),start(2),start(3)) = false;
map(goal(1),goal(2),goal(3)) = false;
env.map = map;
env.size = map_size;
env.start = start;
env.goal = goal;
end
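A quick sanity check of the generated map (the numbers here are just example values, not part of the project code):
matlab
env = env_3d([10,10,10], 0.2);   % 10x10x10 grid, ~20% obstacle cells
fprintf('Obstacles: %d of %d cells (start and goal kept free)\n', nnz(env.map), numel(env.map));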
2️⃣ ACO Path Planner: aco_path_planner.m
matlab
function [best_path, pheromone] = aco_path_planner(env, nAnts, maxIter)
[nx, ny, nz] = deal(env.size(1), env.size(2), env.size(3));
pheromone = ones(nx, ny, nz) * 0.1; % initial pheromone level
eta = zeros(nx, ny, nz); % heuristic factor (1 / distance to goal)
goal = env.goal;
for x = 1:nx
for y = 1:ny
for z = 1:nz
if ~env.map(x,y,z)
dist = norm([x,y,z] - goal);
eta(x,y,z) = 1 / (dist + eps);
end
end
end
end
best_len = inf;
best_path = [];
for iter = 1:maxIter
paths = cell(nAnts,1);
path_lens = zeros(nAnts,1);
for ant = 1:nAnts
pos = env.start;
path = pos;
visited = false(nx,ny,nz);
visited(pos(1),pos(2),pos(3)) = true;
while ~isequal(pos, env.goal) && size(path,1) < nx*ny*nz
moves = get_valid_moves(pos, env, visited);
if isempty(moves)
break;
end
probs = compute_probs(moves, pheromone, eta, 1, 2); % alpha = 1, beta = 2
next_idx = randsample(1:size(moves,1), 1, true, probs);
pos = moves(next_idx,:);
path(end+1,:) = pos;
visited(pos(1),pos(2),pos(3)) = true;
end
paths{ant} = path;
path_lens(ant) = size(path,1);
end
% Pheromone update
evap_rate = 0.9; % fraction of pheromone retained after evaporation
pheromone = pheromone * evap_rate;
for ant = 1:nAnts
path = paths{ant};
if isequal(path(end,:), env.goal)
delta = 1 / path_lens(ant);
for i = 1:size(path,1)
p = path(i,:);
pheromone(p(1),p(2),p(3)) = pheromone(p(1),p(2),p(3)) + delta;
end
if path_lens(ant) < best_len
best_len = path_lens(ant);
best_path = path;
end
end
end
end
end
function moves = get_valid_moves(pos, env, visited)
[x,y,z] = deal(pos(1),pos(2),pos(3));
candidates = [];
for dx = -1:1
for dy = -1:1
for dz = -1:1
if dx==0 && dy==0 && dz==0, continue; end
nx = x + dx; ny = y + dy; nz = z + dz;
if nx>=1 && nx<=env.size(1) && ...
ny>=1 && ny<=env.size(2) && ...
nz>=1 && nz<=env.size(3) && ...
~env.map(nx,ny,nz) && ~visited(nx,ny,nz)
candidates(end+1,:) = [nx,ny,nz];
end
end
end
end
moves = candidates;
end
function probs = compute_probs(moves, pher, eta, alpha, beta)
tau_eta = (pher(sub2ind(size(pher), moves(:,1), moves(:,2), moves(:,3))).^alpha) .* ...
(eta(sub2ind(size(eta), moves(:,1), moves(:,2), moves(:,3))).^beta);
probs = tau_eta / (sum(tau_eta) + eps);
end
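A minimal usage sketch (the parameter values are only examples); note that best_path stays empty if no ant ever reaches the goal:
matlab
env = env_3d([8,8,8], 0.15);
[aco_path, pher] = aco_path_planner(env, 20, 30);   % 20 ants, 30 iterations
if isempty(aco_path)
    warning('No ant reached the goal; increase nAnts/maxIter or lower obstacle_ratio.');
end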
3️⃣ DQN Agent: dqn_agent.m
matlab
classdef dqn_agent < handle
properties
stateSize
actionSize = 26; % 26-neighborhood moves in 3D (dx,dy,dz in {-1,0,1}, excluding (0,0,0))
discountFactor = 0.95;
epsilon = 1.0;
epsilonDecay = 0.995;
epsilonMin = 0.05;
replayBuffer
policyNet
targetNet
learnRate = 1e-3;
updateFreq = 100;
steps = 0;
end
methods
function obj = dqn_agent(stateSize)
obj.stateSize = stateSize;
obj.replayBuffer = replay_buffer(10000);
obj.buildNetworks();
end
function buildNetworks(obj)
layers = [
featureInputLayer(prod(obj.stateSize))
fullyConnectedLayer(256)
reluLayer
fullyConnectedLayer(256)
reluLayer
fullyConnectedLayer(obj.actionSize)
];
% A custom dlnetwork training loop is used, so no trainingOptions object is needed.
lgraph = layerGraph(layers);
obj.policyNet = dlnetwork(lgraph);
obj.targetNet = dlnetwork(lgraph);
end
function action = selectAction(obj, state)
if rand < obj.epsilon
action = randi(obj.actionSize);
else
state = dlarray(single(state(:)), 'CB');
qvals = predict(obj.policyNet, state);
[~, action] = max(extractdata(qvals));
end
end
function train(obj, batch)
states = cat(2, batch.states{:}); % assumes the buffer stores states as prod(stateSize)-by-1 column vectors
actions = cat(1, batch.actions{:});
rewards = cat(1, batch.rewards{:});
nextStates = cat(2, batch.nextStates{:});
dones = cat(1, batch.dones{:});
states = dlarray(single(states), 'CB');
nextStates = dlarray(single(nextStates), 'CB');
% Target Q-values (predict does not record gradients, so no extra guard is needed)
nextQ = predict(obj.targetNet, nextStates);
maxNextQ = max(nextQ, [], 1);
targetQ = rewards.' + obj.discountFactor * maxNextQ .* (1 - dones.');
% Current Q-values
currentQ = predict(obj.policyNet, states);
actionIdx = sub2ind([obj.actionSize, size(states,2)], actions.', 1:size(states,2));
predictedQ = currentQ(actionIdx);
loss = mse(predictedQ, targetQ); %#ok<NASGU>
% Backpropagation is omitted here for brevity; a full implementation needs a custom
% training loop with dlfeval/dlgradient/adamupdate (see the sketch after this class).
% Epsilon decay
obj.epsilon = max(obj.epsilonMin, obj.epsilon * obj.epsilonDecay);
obj.steps = obj.steps + 1;
if mod(obj.steps, obj.updateFreq) == 0
obj.targetNet = obj.policyNet;
end
end
end
end
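The gradient step omitted in train above could look roughly like the following. This is a minimal sketch: it assumes states, actionIdx, and targetQ are built exactly as in train, and that the agent stores two extra properties avgGrad and avgSqGrad (not declared in the class above) to hold the Adam optimizer state.
matlab
% Hypothetical loss helper for a custom dlnetwork training loop.
function [lossVal, grads] = dqn_loss(net, states, actionIdx, targetQ)
    q = forward(net, states);                     % actionSize x batchSize
    predictedQ = q(actionIdx);                    % Q-values of the actions actually taken
    lossVal = mse(predictedQ, targetQ);
    grads = dlgradient(lossVal, net.Learnables);  % gradients w.r.t. network weights
end

% Inside dqn_agent.train, replacing the "predict + mse" lines above:
% [lossVal, grads] = dlfeval(@dqn_loss, obj.policyNet, states, actionIdx, targetQ);
% [obj.policyNet, obj.avgGrad, obj.avgSqGrad] = adamupdate(obj.policyNet, grads, ...
%     obj.avgGrad, obj.avgSqGrad, obj.steps + 1, obj.learnRate);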
💡 Note: a complete DQN implementation in plain MATLAB is fairly involved; if the Reinforcement Learning Toolbox is available, it is recommended instead. The class above is a simplified illustration.
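With the Reinforcement Learning Toolbox, a default DQN agent can be created directly from observation and action specifications. This is only a rough sketch; the grid environment above would still have to be wrapped into a toolbox environment (e.g. via rlFunctionEnv), which is not shown here.
matlab
% Assumes Reinforcement Learning Toolbox is installed; map_size as defined in main.m.
obsInfo = rlNumericSpec([prod(map_size) 1]);   % flattened 3D occupancy state
actInfo = rlFiniteSetSpec(1:26);               % 26-neighborhood move indices
agentRL = rlDQNAgent(obsInfo, actInfo);        % agent with a default critic network
% train(agentRL, env_rl, rlTrainingOptions(...)) once env_rl wraps env_3d.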
4️⃣ Main Program: main.m
matlab
clear; clc; close all;
% Parameter settings
map_size = [10,10,10];
env = env_3d(map_size, 0.2);
% Step 1: ACO generates demonstration paths
fprintf('Running ACO to generate expert demonstrations...\n');
[aco_path, ~] = aco_path_planner(env, 20, 50); % nAnts = 20, maxIter = 50
% Step 2: Build the DQN agent
stateSize = map_size;
agent = dqn_agent(stateSize);
% Step 3: Convert the ACO path into experience-replay samples
rb = agent.replayBuffer;
for t = 1:size(aco_path,1)-1
s = encode_state(aco_path(t,:), env);
a = decode_action(aco_path(t,:), aco_path(t+1,:));
r = -norm(aco_path(t+1,:) - env.goal); % negative distance-to-goal reward
s_next = encode_state(aco_path(t+1,:), env);
done = isequal(aco_path(t+1,:), env.goal);
rb.add(s(:), a, r, s_next(:), done); % store flattened state vectors
end
% Step 4: DQN training (can be combined with online exploration)
maxEpisodes = 200;
maxSteps = 100;
for ep = 1:maxEpisodes
pos = env.start;
totalReward = 0;
for step = 1:maxSteps
s = encode_state(pos, env);
a = agent.selectAction(s);
next_pos = apply_action(pos, a, env.size);
if ~is_valid(next_pos, env)
r = -100; % collision penalty
done = true;
else
r = -1; % per-step penalty
if isequal(next_pos, env.goal)
r = 100;
done = true;
else
done = false;
end
end
s_next = encode_state(next_pos, env);
rb.add(s(:), a, r, s_next(:), done);
totalReward = totalReward + r;
pos = next_pos;
if done, break; end
% Sample a minibatch from the replay buffer and train
if rb.length > 64
batch = rb.sample(64);
agent.train(batch);
end
end
fprintf('Episode %d Reward: %.2f Epsilon: %.3f\n', ep, totalReward, agent.epsilon);
end
% Step 5: Test the learned DQN policy
path_dqn = test_policy(agent, env);
visualize_paths(env, aco_path, path_dqn);
5️⃣ Helper Functions (utils.m)
(Note: MATLAB only exposes the first function of a file to code outside that file, so in practice each helper below should be saved in its own .m file, or added as local functions at the end of main.m.)
matlab
function s = encode_state(pos, env)
s = zeros(env.size);
s(pos(1),pos(2),pos(3)) = 1;
% Additional channels (goal position, obstacle map, ...) could be added for a multi-channel state; see the sketch below.
end
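% A possible multi-channel variant of the encoding mentioned above (a sketch only,
% not used elsewhere in this project; the network input size would have to become
% 3*prod(env.size) to match).
function s = encode_state_mc(pos, env)
s = zeros([env.size, 3]);
s(pos(1), pos(2), pos(3), 1) = 1;                 % channel 1: UAV position
s(env.goal(1), env.goal(2), env.goal(3), 2) = 1;  % channel 2: goal position
s(:,:,:,3) = env.map;                             % channel 3: obstacle map
end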
function a = decode_action(from, to)
delta = to - from; % element of {-1,0,1}^3 excluding (0,0,0)
moves = [];
idx = 1;
for dx = -1:1
for dy = -1:1
for dz = -1:1
if dx==0 && dy==0 && dz==0, continue; end
moves(idx,:) = [dx,dy,dz];
idx = idx + 1;
end
end
end
[~, a] = ismember(delta, moves, 'rows');
end
function new_pos = apply_action(pos, a, map_size)
moves = [];
for dx = -1:1
for dy = -1:1
for dz = -1:1
if dx==0 && dy==0 && dz==0, continue; end
moves(end+1,:) = [dx,dy,dz];
end
end
end
delta = moves(a,:);
new_pos = pos + delta;
new_pos = min(max(new_pos, [1,1,1]), map_size);
end
function valid = is_valid(pos, env)
x = pos(1); y = pos(2); z = pos(3);
valid = x>=1 && x<=env.size(1) && ...
y>=1 && y<=env.size(2) && ...
z>=1 && z<=env.size(3) && ...
~env.map(x,y,z);
end
function path = test_policy(agent, env)
pos = env.start;
path = pos;
for step = 1:200
if isequal(pos, env.goal), break; end
s = encode_state(pos, env);
a = agent.selectAction(s);
pos = apply_action(pos, a, env.size);
if ~is_valid(pos, env)
warning('Collision!'); break;
end
path(end+1,:) = pos;
end
end
function visualize_paths(env, path1, path2)
figure;
hold on;
[x_obs,y_obs,z_obs] = ind2sub(env.size, find(env.map));
scatter3(x_obs,y_obs,z_obs, 50, 'k', 'filled'); % obstacles
plot3(path1(:,1), path1(:,2), path1(:,3), '-o', 'LineWidth',2, 'DisplayName','ACO');
plot3(path2(:,1), path2(:,2), path2(:,3), '-s', 'LineWidth',2, 'DisplayName','DQN');
scatter3(env.start(1),env.start(2),env.start(3), 100, 'g', 'filled');
scatter3(env.goal(1),env.goal(2),env.goal(3), 100, 'r', 'filled');
legend; grid on; xlabel('X'); ylabel('Y'); zlabel('Z');
title('3D UAV Path Planning: ACO vs DQN');
view(3);
end
✅ Advantages and Innovations
ACO provides high-quality prior knowledge, easing DQN's exploration problem under sparse rewards.
DQN generalizes: it can plan quickly in new environments, whereas ACO must be rerun from scratch.
The hybrid framework combines global optimization with online decision making.
🔧 Extension Suggestions
- Use Double DQN / Dueling DQN for more stable training (a Double DQN target sketch follows this list).
- Add Prioritized Experience Replay (PER).
- Replace the full-map state with a local perception window (e.g. 5×5×5).
- Restrict the action space to feasible flight directions (e.g. respecting a minimum turning radius).
- Add dynamic obstacles or a wind-field model.
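For the first suggestion, the only change is how targetQ is computed inside dqn_agent.train. A sketch using the same variable names as in the class above:
matlab
% Double DQN target: the policy net picks the greedy action, the target net scores it.
nextQPolicy = extractdata(predict(obj.policyNet, nextStates));   % actionSize x batch
[~, bestA] = max(nextQPolicy, [], 1);                            % 1 x batch greedy actions
nextQTarget = predict(obj.targetNet, nextStates);
idx = sub2ind(size(nextQTarget), bestA, 1:size(nextQTarget,2));
targetQ = rewards.' + obj.discountFactor * nextQTarget(idx) .* (1 - dones.');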