Training a Data-Prediction Model with a Transformer in Julia
1. Task Description
1534 process operating parameters are collected from the various monitoring devices.
One record is logged per minute.
The model uses the 60 records from the past hour to predict the 30 records of the next 30 minutes.
2. Model Description
Input size: 60×1534; output size: 30×1534 (time steps × features).
3. Main Parameters
- Input length: in_len = 60
- Output length: out_len = 30
- Feature dimension: feature_dim = 1534
- Model dimension: d_model = 64
- Attention heads: nhead = 4
- Encoder layers: num_enc = 2; decoder layers: num_dec = 2
- Training batch size: batch_size = 32
4. Code
1. Since the training data is stored across multiple CSV files, the mean and variance are computed incrementally, which makes it easy to standardize the data without loading everything into memory at once.
using CSV, DataFrames, Glob, Serialization, Statistics

# Incrementally update the per-feature mean and standard deviation
# (Welford-style), so the full dataset never has to fit in memory.
function online_mean_std!(μ::Vector{Float64}, σ::Vector{Float64}, n::Int, new_data::Matrix{Float64})
    m = size(new_data, 2)  # number of features
    if n == 0
        # First batch: initialize directly. mean(new_data, dims=1) is a 1×m
        # array, so convert it to a vector. corrected=true matches the
        # sample-variance recurrence used below.
        μ .= vec(mean(new_data, dims=1))
        σ .= vec(std(new_data, dims=1; corrected=true))
        return size(new_data, 1)
    end
    for i in 1:size(new_data, 1)
        n += 1
        for j in 1:m
            delta = new_data[i, j] - μ[j]
            μ[j] += delta / n
            # Recurrence for the sample (corrected) variance:
            # s_n^2 = (n-2)/(n-1) * s_{n-1}^2 + delta^2/n
            σ[j] = sqrt(((n - 2) / (n - 1)) * (σ[j]^2) + (delta^2 / n))
        end
    end
    return n
end

# Stream every CSV file once, accumulating the global mean and std.
function calc_mean_std(data_dir::String)
    μ = Float64[]
    σ = Float64[]
    n = 0
    csv_files = glob("data_*.csv", data_dir)
    for file in csv_files
        df = CSV.read(file, DataFrame)
        # Drop two columns that contain no data
        select!(df, Not([:pn_y17zsl8001a, :pn_cbkg1003gxtt]))
        data = Matrix(df)
        if n == 0
            μ = zeros(size(data, 2))
            σ = zeros(size(data, 2))
        end
        n = online_mean_std!(μ, σ, n, data)
    end
    # Persist the statistics so training and inference can reuse them
    open(joinpath(Common.get_runtime_dir(), "../config/data_stats.jls"), "w") do io
        serialize(io, Dict("mean" => μ, "std" => σ))
    end
end
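A quick self-check, not part of the original code: the incremental statistics should agree with a direct computation over the concatenated data.
# Illustrative self-check (not from the original): incremental vs. direct stats.
A = rand(100, 5); B = rand(50, 5)
μ = zeros(5); σ = zeros(5)
n = online_mean_std!(μ, σ, 0, A)
n = online_mean_std!(μ, σ, n, B)
full = vcat(A, B)
@assert isapprox(μ, vec(mean(full, dims=1)); atol=1e-8)
@assert isapprox(σ, vec(std(full, dims=1; corrected=true)); atol=1e-8)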
2. Build the model
using Flux

struct MyModel
    proj_in
    proj_out
    pos_enc_src
    pos_enc_tgt
    encoder
    decoder
end
Flux.@layer MyModel  # let Flux find the trainable parameters

function (m::MyModel)(x)
    src, tgt = x
    # src: (in_len, feature_dim, batch), tgt: (out_len, feature_dim, batch).
    # Dense and the attention layers expect features in the first dimension,
    # so move the feature axis to the front rather than slicing and re-stacking.
    src_proj = m.proj_in(permutedims(src, (2, 1, 3)))      # (d_model, in_len, batch)
    src_proj = src_proj .+ m.pos_enc_src[:, 1:size(src, 1)]
    memory = m.encoder(src_proj)
    # Note: the decoder consumes the target sequence directly
    # (teacher forcing, no causal mask).
    tgt_proj = m.proj_in(permutedims(tgt, (2, 1, 3)))      # (d_model, out_len, batch)
    tgt_proj = tgt_proj .+ m.pos_enc_tgt[:, 1:size(tgt, 1)]
    dec_out = m.decoder(tgt_proj, memory)                  # (d_model, out_len, batch)
    output = m.proj_out(dec_out)                           # (feature_dim, out_len, batch)
    return permutedims(output, (2, 1, 3))                  # (out_len, feature_dim, batch)
end

function build_model(in_len::Int, out_len::Int, feature_dim::Int;
                     d_model::Int=64, nhead::Int=4, num_enc::Int=2, num_dec::Int=2)
    proj_in = Dense(feature_dim, d_model)
    proj_out = Dense(d_model, feature_dim)
    pos_enc_src = positional_encoding(d_model, in_len)
    pos_enc_tgt = positional_encoding(d_model, out_len)
    enc_layers = [encoder_layer(d_model, nhead) for _ in 1:num_enc]
    encoder = Chain(enc_layers...)
    dec_layers = [decoder_layer(d_model, nhead) for _ in 1:num_dec]
    decoder = decode_chain(dec_layers)
    return MyModel(proj_in, proj_out, pos_enc_src, pos_enc_tgt, encoder, decoder)
end
# Positional encoding (sin on odd rows, cos on even rows)
function positional_encoding(d_model::Int, len::Int)
    # Float32, so the encoding matches the Float32 model weights and data
    pe = zeros(Float32, d_model, len)
    for pos in 1:len
        for i in 1:d_model
            if i % 2 == 1
                pe[i, pos] = sin(pos / 10000^(2 * (i - 1) / d_model))
            else
                pe[i, pos] = cos(pos / 10000^(2 * (i - 1) / d_model))
            end
        end
    end
    return pe
end
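A quick shape check (illustrative): the encoding keeps d_model in the first dimension, so it broadcasts over a (d_model, seq_len, batch) activation exactly as the model's forward pass requires.
# Illustrative: (d_model, seq_len), broadcastable over (d_model, seq_len, batch).
pe = positional_encoding(64, 60)
@assert size(pe) == (64, 60)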
# A single encoder layer (pre-norm residual blocks, mirroring the decoder layer below)
function encoder_layer(d_model::Int, nheads::Int, dropout=0.1)
    # Self-attention
    attention = MultiHeadAttention(d_model; nheads=nheads, dropout_prob=dropout)
    # Output projection, combining the heads back to d_model dimensions
    out_proj = Dense((d_model ÷ nheads) * nheads, d_model)
    # Feed-forward network
    feedforward = Chain(
        Dense(d_model, 4 * d_model, relu),  # expand
        Dense(4 * d_model, d_model)         # project back
    )
    norm1 = LayerNorm(d_model)
    norm2 = LayerNorm(d_model)
    # Layer norm plus residual connections around both sub-blocks
    return Chain(
        x -> begin
            h = norm1(x)
            # MultiHeadAttention returns (output, attention scores); keep the output
            out_proj(attention(h, h, h)[1]) .+ x   # residual connection
        end,
        x -> feedforward(norm2(x)) .+ x            # residual connection
    )
end
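A smoke test of the layer's shape contract (illustrative, not in the original listing): activations pass through as (d_model, seq_len, batch).
# Illustrative shape check: (d_model, seq_len, batch) in and out.
layer = encoder_layer(64, 4)
x = randn(Float32, 64, 60, 32)
@assert size(layer(x)) == (64, 60, 32)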
# A single decoder layer
function decoder_layer(d_model::Int, nheads::Int, dropout=0.1)
    # Self-attention
    self_attention = MultiHeadAttention(d_model; nheads=nheads, dropout_prob=dropout)
    # Encoder-decoder (cross) attention
    encoder_attention = MultiHeadAttention(d_model; nheads=nheads, dropout_prob=dropout)
    # Output projection, combining the heads back to d_model dimensions
    # (the same projection is applied after both attention blocks)
    out_proj = Dense((d_model ÷ nheads) * nheads, d_model)
    # Feed-forward network
    feedforward = Chain(
        Dense(d_model, 4 * d_model, relu),
        Dense(4 * d_model, d_model)
    )
    norm1 = LayerNorm(d_model)
    norm2 = LayerNorm(d_model)
    norm3 = LayerNorm(d_model)
    return (tgt, memory) -> begin
        # Self-attention with residual connection
        x = norm1(tgt)
        # MultiHeadAttention returns a tuple; the first element is the attention output
        attn_out = self_attention(x, x, x)[1]
        x = out_proj(attn_out) .+ tgt
        # Encoder-decoder attention with residual connection
        x_norm = norm2(x)
        enc_attn_out = encoder_attention(x_norm, memory, memory)[1]
        x = out_proj(enc_attn_out) .+ x
        # Feed-forward with residual connection
        x_norm = norm3(x)
        x = feedforward(x_norm) .+ x
        return x
    end
end
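build_model above calls decode_chain, which the original listing never defines. A minimal sketch that matches how it is used, threading the encoder memory through each (tgt, memory) closure returned by decoder_layer:
# Minimal sketch (not in the original listing): fold the target activations
# through each decoder layer, passing the encoder output (memory) to every one.
function decode_chain(layers)
    return (tgt, memory) -> foldl((x, layer) -> layer(x, memory), layers; init=tgt)
end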
3. Training
using Random
using Base.Iterators: partition

loss_fn(model, src, tgt) = Flux.Losses.mse(model((src, tgt)), tgt)

function train_model!(model, data; epochs::Int=10, batch_size::Int=32, lr::Float64=1e-3)
    src_all, tgt_all = data
    N = size(src_all, 3)
    opt = Flux.setup(Adam(lr), model)
    for epoch in 1:epochs
        epoch_loss::Float32 = 0.0
        idxs = randperm(N)
        for batch in partition(idxs, batch_size)
            src_batch = src_all[:, :, collect(batch)]
            tgt_batch = tgt_all[:, :, collect(batch)]
            # the gradient w.r.t. the model is the first element of the returned tuple
            grads = Flux.gradient(loss_fn, model, src_batch, tgt_batch)[1]
            Flux.update!(opt, model, grads)
            epoch_loss += loss_fn(model, src_batch, tgt_batch)
        end
        println("Loss: $(epoch_loss)")
        flush(stdout)
    end
end
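A minimal end-to-end smoke test with random data (illustrative; the small feature count just keeps it fast):
# Illustrative smoke test: random data, tiny feature dimension.
model = build_model(60, 30, 8)       # feature_dim = 8 instead of 1534
src = randn(Float32, 60, 8, 16)      # (in_len, feature_dim, batch)
tgt = randn(Float32, 30, 8, 16)      # (out_len, feature_dim, batch)
train_model!(model, (src, tgt); epochs=1, batch_size=4)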
# ================================
# 4. Train over multiple CSV files
# ================================
function train_multiple_csvs(data_dir::String; epochs::Int=200)
    csv_files = glob("data_*.csv", data_dir)
    # Load the previously computed mean and standard deviation
    stats = open(joinpath(Common.get_runtime_dir(), "../config/data_stats.jls"), "r") do io
        deserialize(io)
    end
    μ = stats["mean"]
    σ = stats["std"]
    # Build the model
    in_len = 60
    out_len = 30
    feature_dim = 1534
    model = build_model(in_len, out_len, feature_dim; d_model=64, nhead=4, num_enc=2, num_dec=2)
    for epoch in 1:epochs
        println("Epoch $epoch")
        flush(stdout)
        for filepath in csv_files
            println("Processing: $filepath")
            flush(stdout)
            df = load_csv_data(filepath)
            X = preprocess_data(df, μ, σ)
            src_data, tgt_data = create_windows(X, in_len, out_len)
            src_data = Float32.(src_data)
            tgt_data = Float32.(tgt_data)
            train_model!(model, (src_data, tgt_data), epochs=1, batch_size=32, lr=1e-3)
        end
    end
    save_model(model, joinpath(Common.get_runtime_dir(), "../config/model.bson"))
end
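The function above also calls load_csv_data, preprocess_data, create_windows, and save_model, none of which appear in the original listing. Minimal sketches consistent with how they are used (the dropped column names are taken from calc_mean_std; the BSON format is assumed from the .bson extension):
using BSON

# Read one CSV file and drop the same two empty columns as calc_mean_std.
function load_csv_data(filepath::String)
    df = CSV.read(filepath, DataFrame)
    select!(df, Not([:pn_y17zsl8001a, :pn_cbkg1003gxtt]))
    return df
end

# Standardize each feature column with the precomputed statistics.
function preprocess_data(df::DataFrame, μ::Vector{Float64}, σ::Vector{Float64})
    X = Matrix{Float64}(df)            # (n_rows, n_features)
    return (X .- μ') ./ σ'             # broadcast the stats over the rows
end

# Slide a window over the rows: each sample pairs in_len input steps with the
# out_len steps that follow. Returns (in_len, features, N) and
# (out_len, features, N) arrays, matching the indexing in train_model!.
function create_windows(X::Matrix{Float64}, in_len::Int, out_len::Int)
    n_rows, n_feat = size(X)
    N = n_rows - in_len - out_len + 1
    src = Array{Float64}(undef, in_len, n_feat, N)
    tgt = Array{Float64}(undef, out_len, n_feat, N)
    for k in 1:N
        src[:, :, k] = X[k:k+in_len-1, :]
        tgt[:, :, k] = X[k+in_len:k+in_len+out_len-1, :]
    end
    return src, tgt
end

# Persist the trained model; BSON is assumed from the .bson extension.
save_model(model, path) = BSON.bson(path, Dict(:model => model))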