【QWen1.5】使用AutoDL多卡对QWen1.5-7B模型进行lora微调

过程描述

按教程里的正常流程启动微调后，总会出现cuda out of memory的现象
于是不得不用两个gpu同时训练
这里的代码修改只涉及到设置多gpu进行微调，不涉及量化等

完整微调脚本

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer
)
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training
)
import os

print("=========================================")
print("Qwen1.5-7B LoRA微调 - 多GPU分布式训练")
print("=========================================")

# ====================
# 1. Load the tokenizer and the model
# ====================
model_path = './qwen/Qwen1.5-7B-Chat/'

print(f"1. 加载tokenizer: {model_path}")
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=False,
    trust_remote_code=True
)

# Padding a batch requires a pad token; reuse EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"  设置pad_token为eos_token: {tokenizer.pad_token}")

print("2. 检查GPU信息...")
gpu_count = torch.cuda.device_count()
print(f"  GPU数量: {gpu_count}")
for i in range(gpu_count):
    print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
    print(f"    显存总量: {torch.cuda.get_device_properties(i).total_memory / 1024 ** 3:.2f} GB")

print("3. 加载模型（半精度）...")
# Prefer bfloat16 when the GPU supports it, otherwise fall back to float16.
if torch.cuda.is_bf16_supported():
    torch_dtype = torch.bfloat16
    print("  GPU支持bfloat16，使用bfloat16精度")
else:
    torch_dtype = torch.float16
    print("  GPU不支持bfloat16，使用float16精度")

# Load the base model in the half-precision dtype selected above.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",  # let transformers place the model across the available GPUs
    torch_dtype=torch_dtype,
    trust_remote_code=True,
)

print(f"  模型以{torch_dtype}精度加载")
# memory_allocated() without an argument covers only the current device; with
# device_map="auto" the weights may sit on several GPUs, so add them all up.
allocated_bytes = sum(torch.cuda.memory_allocated(i) for i in range(gpu_count))
print(f"  当前显存占用: {allocated_bytes / 1024 ** 3:.2f} GB")

# ====================
# 2. Load the dataset (falls back to synthetic rows when loading fails)
# ====================
dataset_path = "/tmp/pycharm_project_514/dataset/huanhuan.json"
print(f"\n4. 加载数据集: {dataset_path}")

try:
    dataset = load_dataset('json', data_files=dataset_path, split='train')
    print(f"  数据集大小: {len(dataset)} 条")
except Exception as load_error:
    print(f"  数据集加载失败: {load_error}")
    # Build a small synthetic dataset so the rest of the pipeline can still run.
    from datasets import Dataset

    test_data = []
    for _ in range(100):
        test_data.append({
            "instruction": "你是谁？",
            "input": "",
            "output": "家父是大理寺少卿甄远道。",
        })
    dataset = Dataset.from_list(test_data)
    print(f"  使用测试数据，大小: {len(dataset)} 条")


def process_func(example, max_length=128):
    """Tokenize one instruction record into causal-LM training features.

    The record is rendered as a system / user / assistant conversation
    delimited by ``<|im_start|>`` and ``<|im_end|>`` markers. Prompt positions
    are labelled -100 so that only the assistant reply carries real labels
    (-100 is the label value Hugging Face causal-LM losses skip).

    Uses the module-level ``tokenizer``.

    Args:
        example: Mapping with "instruction" and "output" strings and an
            optional "input" string; a missing or None "input" is treated
            as empty instead of raising on string concatenation.
        max_length: Maximum number of tokens kept per sample. Longer
            sequences are truncated on the right. Kept small by default to
            save GPU memory.

    Returns:
        dict with equally long "input_ids", "attention_mask" and "labels"
        lists. Note that if the prompt alone reaches ``max_length``, every
        label in the sample is -100.
    """
    # Build the conversation prompt.
    system_msg = "现在你要扮演皇帝身边的女人--甄嬛"
    user_msg = example['instruction'] + (example.get('input') or "")

    prompt = (
        f"<|im_start|>system\n{system_msg}<|im_end|>\n"
        f"<|im_start|>user\n{user_msg}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    # The markers are written out by hand, so the tokenizer must not add its own.
    prompt_encoded = tokenizer(prompt, add_special_tokens=False)
    response = example['output'] + "<|im_end|>"
    response_encoded = tokenizer(response, add_special_tokens=False)

    prompt_ids = prompt_encoded["input_ids"]
    response_ids = response_encoded["input_ids"]

    input_ids = prompt_ids + response_ids
    attention_mask = prompt_encoded["attention_mask"] + response_encoded["attention_mask"]
    labels = [-100] * len(prompt_ids) + response_ids

    # Slicing is a no-op for sequences that are already short enough.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length]
    }


print("5. 处理数据集中...")
tokenized_dataset = dataset.map(
    process_func,
    remove_columns=dataset.column_names,
    desc="处理数据"
)
print("  数据预处理完成")

# ====================
# 3. Configure LoRA
# ====================
print("\n6. 配置LoRA...")
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # layers that receive adapters; adjustable
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# ====================
# 4. Wrap the model for training
# ====================
print("7. 准备模型训练...")
model = get_peft_model(model, lora_config)

# The base model is loaded in half precision. Keep the trainable adapter
# weights in float32: with fp16 mixed precision the gradient scaler refuses to
# unscale float16 gradients ("Attempting to unscale FP16 gradients") on PEFT
# versions that do not already upcast adapters. This is a no-op when the
# adapters are float32 already.
for param in model.parameters():
    if param.requires_grad and param.dtype != torch.float32:
        param.data = param.data.to(torch.float32)

# Report how many parameters are trainable.
model.print_trainable_parameters()

# ====================
# 5. Training arguments
# ====================
print("\n8. 配置训练参数（多GPU）...")

per_device_batch_size = 1  # kept small because the model is not quantized
gradient_accumulation_steps = 16  # accumulate gradients to emulate a larger batch
num_gpus = max(1, torch.cuda.device_count())  # number of visible GPUs

# The model is loaded with device_map="auto", which shards ONE copy of the
# model across the GPUs (model parallelism) inside a single process; Trainer
# then treats the run as one device. The number of samples per optimizer step
# therefore scales with the number of data-parallel processes, not with the
# GPU count. WORLD_SIZE is set by launchers such as torchrun / accelerate and
# is absent (-> 1) when the script is started with plain `python`.
data_parallel_size = max(1, int(os.environ.get("WORLD_SIZE", "1")))
effective_batch_size = per_device_batch_size * gradient_accumulation_steps * data_parallel_size

print(f"  可用GPU数量: {num_gpus}")
print(f"  每个GPU batch size: {per_device_batch_size}")
print(f"  梯度累积步数: {gradient_accumulation_steps}")
print(f"  有效总batch size: {effective_batch_size}")

training_args = TrainingArguments(
    output_dir="./output/Qwen1.5-7B-Chat-multi-gpu",
    per_device_train_batch_size=per_device_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_steps=10,
    num_train_epochs=3,
    save_steps=100,
    learning_rate=1e-4,
    bf16=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    optim="adamw_torch",  # standard PyTorch AdamW
    save_total_limit=3,
    report_to="none",
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    group_by_length=True,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,

    # Data loading / distributed settings
    dataloader_num_workers=4,  # background worker processes for batch preparation
    # Only consulted when Trainer wraps the model in DistributedDataParallel.
    ddp_find_unused_parameters=False if data_parallel_size > 1 else None,
    ddp_timeout=1800,  # timeout (seconds) for distributed operations

    # Regularisation
    max_grad_norm=0.3,
    weight_decay=0.01,
)

# ====================
# 6. Build the Trainer
# ====================
# Collator that pads every batch dynamically (length rounded up to a multiple of 8).
batch_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    pad_to_multiple_of=8,
    return_tensors="pt"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=batch_collator,
)

# ====================
# 7. Run training
# ====================
print("\n9. 开始训练...")
print("=" * 50)

# Hand unused cached blocks back to the driver before training starts.
torch.cuda.empty_cache()
print(f"训练前显存状态:")
for i in range(torch.cuda.device_count()):
    print(f"  GPU {i}: {torch.cuda.memory_allocated(i) / 1024 ** 3:.2f} GB / "
          f"{torch.cuda.get_device_properties(i).total_memory / 1024 ** 3:.2f} GB")

# Tracks the outcome so the script can report failure through its exit status.
training_succeeded = False

try:
    print(f"开始分布式训练，使用 {num_gpus} 个GPU...")
    trainer.train()
    print("\n训练完成！")

    # Save the final adapter weights.
    output_dir = "./output/Qwen1.5-7B-Chat-lora-multi"
    trainer.save_model(output_dir)
    print(f"模型已保存到: {output_dir}")
    training_succeeded = True

except torch.cuda.OutOfMemoryError as e:
    print(f"\n显存不足错误: {e}")
    print("尝试减小以下参数:")
    print(f"  1. per_device_train_batch_size (当前: {per_device_batch_size})")
    print(f"  2. MAX_LENGTH (当前: 128)")
    print("或尝试以下方法:")
    print("  3. 减小LoRA秩 (当前: r=8)")
    print("  4. 只训练更少的层 (当前: target_modules包含4个层)")
    # Gradient accumulation only changes how often the optimizer steps; each
    # micro-batch needs the same memory, so lowering it does not cure an OOM.
    print(f"注意: 调整gradient_accumulation_steps (当前: {gradient_accumulation_steps}) 不会降低显存峰值")

except Exception as e:
    print(f"\n训练出错: {e}")
    import traceback

    traceback.print_exc()

print("\n训练结束")
print("=" * 50)

if not training_succeeded:
    # Exit non-zero so shells and job schedulers can detect the failed run.
    raise SystemExit(1)

关于多gpu的部分：

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",  # let transformers place the model across the available GPUs automatically
    torch_dtype=torch_dtype,
    trust_remote_code=True,
)

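在这里设置device_map="auto"后，transformers（借助accelerate）会检测到两块GPU，并把模型的各层切分到不同的卡上。需要注意：这属于模型并行（每张卡只存放一部分层，整个训练仍是单进程），而不是每张卡各放一份完整模型的数据并行（DDP）。它解决的是单卡显存放不下模型的问题，并不会让训练吞吐成倍提升。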

点击查看代码

per_device_batch_size = 1  # kept small because the model is not quantized
gradient_accumulation_steps = 16  # accumulate gradients to emulate a larger batch
num_gpus = max(1, torch.cuda.device_count())  # number of visible GPUs

# device_map="auto" shards ONE copy of the model across the GPUs (model
# parallelism) inside a single process, so the samples per optimizer step
# scale with the number of data-parallel processes, not with the GPU count.
# WORLD_SIZE is set by launchers such as torchrun / accelerate and is absent
# (-> 1) when the script is started with plain `python`.
data_parallel_size = max(1, int(os.environ.get("WORLD_SIZE", "1")))
effective_batch_size = per_device_batch_size * gradient_accumulation_steps * data_parallel_size

print(f"  可用GPU数量: {num_gpus}")
print(f"  每个GPU batch size: {per_device_batch_size}")
print(f"  梯度累积步数: {gradient_accumulation_steps}")
print(f"  有效总batch size: {effective_batch_size}")

training_args = TrainingArguments(
    output_dir="./output/Qwen1.5-7B-Chat-multi-gpu",
    per_device_train_batch_size=per_device_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_steps=10,
    num_train_epochs=3,
    save_steps=100,
    learning_rate=1e-4,
    bf16=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    optim="adamw_torch",  # standard PyTorch AdamW
    save_total_limit=3,
    report_to="none",
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    group_by_length=True,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,

    # Data loading / distributed settings
    dataloader_num_workers=4,  # background worker processes for batch preparation
    # Only consulted when Trainer wraps the model in DistributedDataParallel.
    ddp_find_unused_parameters=False if data_parallel_size > 1 else None,
    ddp_timeout=1800,  # timeout (seconds) for distributed operations

    # Regularisation
    max_grad_norm=0.3,
    weight_decay=0.01,
)

## 解析：多GPU微调训练知识点总结

这段代码展示了在多GPU环境下进行LoRA微调的关键配置，以下是对其中多GPU微调知识点的详细总结：

1. 批量大小计算与配置

关键参数关系

# 核心计算公式
effective_batch_size = per_device_batch_size * gradient_accumulation_steps * num_gpus

参数说明：

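per_device_train_batch_size=1：每个训练进程（设备）每次处理的样本数量
- 由于不使用量化，模型占用显存较大，设置为1确保显存能容纳
- 在数据并行（DDP）中，这是每个GPU独立的batch size；在device_map="auto"的模型并行中，两张卡共同处理同一个batch
gradient_accumulation_steps=16：梯度累积步数
- 每累积16个批次的梯度才更新一次模型参数
- 作用：模拟更大的batch size，但不增加显存占用
effective_batch_size：有效总批量大小
- 计算方式：1 × 16 × 数据并行的进程数（公式中的num_gpus应理解为数据并行的副本数，而不是机器上的GPU总数）
- 用torchrun等启动2个进程做数据并行时：有效batch size = 1 × 16 × 2 = 32
- 单进程使用device_map="auto"时，两张卡承载的是同一份模型，Trainer会把设备数视为1，实际有效batch size = 1 × 16 = 16
- 这是参数更新时实际使用的batch size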

2. 多GPU特定配置

分布式数据并行参数

ddp_find_unused_parameters=False if num_gpus > 1 else None

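作用：控制DDP是否在每次前向传播后查找没有参与损失计算的参数
设置为False：跳过这一查找，省去额外开销；前提是所有可训练参数都参与了计算，否则DDP会报错（这种情况才需要设为True）
条件设置：只在多GPU（num_gpus > 1）时传入False，否则为None；该参数仅在以DDP方式（如torchrun启动多进程）训练时生效，单进程使用device_map="auto"时会被忽略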

ddp_timeout=1800

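作用：设置分布式进程组通信操作的超时时间（秒），对应torch.distributed.init_process_group的timeout参数
值1800：30分钟超时（也是Trainer的默认值）
重要性：某个进程因数据处理、保存检查点等原因长时间没有参与通信时，超过该时间训练会报错退出；此类操作耗时较长时应适当调大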

3. 数据加载优化

dataloader_num_workers=4

作用：设置数据加载的工作进程（子进程）数
推荐值：取决于CPU核数和预处理开销，常见做法是从每个训练进程2-4个开始调整
效果：在后台并行读取和整理批次数据，避免GPU等待CPU准备数据

dataloader_pin_memory=False

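作用：是否把DataLoader产出的批次放入锁页内存（pinned memory），以加快从主机内存到GPU的拷贝
设置为False：不使用锁页内存，可以少占用一些主机内存，但拷贝会略慢；Trainer的默认值是True，本例batch很小，差别不大
替代方案：设为False时批次数据由PyTorch按普通（可分页）内存管理；如果数据传输成为瓶颈，可改回True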

4. 训练过程优化

序列长度分组

group_by_length=True

作用：将长度相近的样本分组到同一个批次
优势：
1. 减少padding，提高计算效率
2. 同一批次内样本长度接近，显存占用更平稳
3. 提高GPU利用率
注意：per_device_train_batch_size=1时每个批次只有一条样本，本身几乎没有padding，该选项的收益有限

梯度相关设置

gradient_checkpointing=True
gradient_checkpointing_kwargs={"use_reentrant": False}
max_grad_norm=0.3

梯度检查点：用计算时间换显存空间
use_reentrant=False：使用非重入实现，这是PyTorch推荐的方式；在基座模型被冻结的LoRA训练中，也能避免因输入不需要梯度而导致梯度无法回传的问题
max_grad_norm=0.3：梯度裁剪阈值，防止梯度爆炸

5. 精度与优化器配置

混合精度训练

bf16=torch.cuda.is_bf16_supported()
fp16=not torch.cuda.is_bf16_supported()

自动检测：根据GPU能力选择最佳精度
bf16优先：如果GPU支持bfloat16则使用，否则使用float16
优势：减少显存占用，加速计算

优化器选择

optim="adamw_torch"

不使用8bit优化器：因为没有使用bitsandbytes量化
AdamW：PyTorch自带的标准优化器实现，单GPU和多GPU环境都适用
weight_decay=0.01：权重衰减，用于抑制过拟合（AdamW采用解耦的权重衰减，并不完全等同于在损失中加入L2正则项）

6. 学习率调度策略

lr_scheduler_type="cosine"
warmup_ratio=0.03

余弦退火：热身结束后，学习率从峰值按余弦曲线下降
热身阶段：前3%的训练步数线性增加学习率
分布式说明：学习率调度本身与GPU数量无关；数据并行时每个进程按相同的步数计算出相同的学习率

7. GPU资源管理与监控

资源分配逻辑

自动检测GPU数量：torch.cuda.device_count()
设备映射：通过device_map="auto"自动分配模型到各GPU
负载均衡：Transformers自动将模型层分配到可用GPU

显存使用策略

保守的batch size：从1开始，确保不OOM
梯度累积：用时间换空间
梯度检查点：进一步减少显存峰值

8. 分布式训练中的批次处理流程

单GPU处理流程：
输入样本 → 前向传播 → 计算损失 → 反向传播 → 累积梯度

多GPU处理流程（数据并行；需要用torchrun或accelerate launch启动多个进程，单进程使用device_map="auto"属于模型并行，不走这一流程）：
批次分割 → 各GPU独立处理 → 梯度同步 → 参数更新
    ↓           ↓           ↓         ↓
per_device  每个GPU    all_reduce   effective
batch_size  独立计算   通信汇总     batch_size

9. 多GPU训练性能权衡

优势：

更大有效batch size：数据并行时由多个GPU上的样本共同累积得到
更快训练速度：数据并行时各GPU并行处理不同的数据；device_map="auto"的模型并行下各GPU按层依次计算，速度基本不会提升
解决显存限制：模型可以分布在多个GPU上（即本文采用的做法）

挑战：

通信开销：梯度同步增加时间成本
负载不均衡：需要合理分配计算任务
调试复杂性：多进程环境更难调试

10. 关键配置原则总结

配置项	单GPU	多GPU	说明
batch_size	视显存而定	视显存而定	per_device_train_batch_size由单卡显存决定，与GPU数量没有直接关系
梯度累积	可选	可选	用于在不增加显存的前提下增大有效batch size
数据加载	1-2 workers	4+ workers	数据并行时每个训练进程各有自己的数据加载进程
设备映射	"auto"或"cuda"	"auto"（模型并行）	做数据并行（DDP）时不要使用device_map="auto"，每个进程应各自加载一份完整模型
DDP参数	无需	仅DDP时需要	ddp_*参数只在torchrun等多进程方式下生效

核心要点

有效批量大小是关键：通过per_device_batch_size × gradient_accumulation × 数据并行进程数计算
梯度同步是瓶颈：数据并行时，梯度累积可以减少同步次数，合理设置gradient_accumulation_steps可平衡通信开销
自动设备映射简化配置：device_map="auto"让框架自动把模型各层分布到多张卡上（模型并行）
数据加载优化不可忽视：足够的num_workers确保GPU不空闲
超时设置很重要：ddp_timeout控制分布式通信的等待上限，仅在多进程训练时起作用

通过上述配置，可以在多GPU环境中高效地进行LoRA微调，充分利用硬件资源，同时避免显存不足的问题。

posted @ 2026-01-05 20:26 SaTsuki26681534 阅读(16) 评论(0) 收藏举报

刷新页面返回顶部

Loading

satsuki26681534