本地部署Qwen-Image蒸馏模型

一，#本机环境检查

执行nvidia-smi，查看输出右上角的 CUDA Version：该值表示当前显卡驱动最高支持的 CUDA 版本，可据此确认显卡驱动已正确安装。

nvidia-smi

#在调试时，为了实时观察GPU利用率，一般新开一个命令窗口，执行以下命令，一秒刷新一次。

watch -n 1 nvidia-smi

执行nvcc -V验证cuda

nvcc -V

执行conda --version验证conda版本

conda --version

#列出所有已创建的Conda 环境：

conda env list
或
conda info --envs

#若存在，先删除已存在环境

conda env remove -n diffusers_qwen_image

#创建新环境

conda create -n diffusers_qwen_image python=3.10

#激活环境

conda activate diffusers_qwen_image

二，依赖库安装

#下载diffsynth

git clone https://github.com/modelscope/DiffSynth-Studio.git

#安装diffsynth

cd DiffSynth-Studio
pip install .

#验证diffsynth库是否安装成功

python3 -c "import diffsynth; print('diffsynth导入成功，版本:', diffsynth.__version__)"

根据CUDA版本安装PyTorch：
CUDA 12.1:

pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

CUDA 12.2（PyTorch 官方未提供 cu122 的安装源，直接使用 cu121 的安装包即可，12.x 驱动可兼容运行）:

pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

#验证PyTorch是否能正确识别GPU

python3 -c "import torch; print('PyTorch版本:', torch.__version__); print('CUDA可用:', torch.cuda.is_available()); print('CUDA版本:', torch.version.cuda); print('GPU设备:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'); print('GPU数量:', torch.cuda.device_count());"

#用AI生成python代码，将以下代码保存为1.py文件

from modelscope import DiffusionPipeline, FlowMatchEulerDiscreteScheduler, snapshot_download

import torch

import math

import os

import urllib.request

import urllib.error

from pathlib import Path

from datetime import datetime

# Point Hugging Face downloads at a mirror that is reachable from mainland China.
# NOTE(review): this is assigned after the `modelscope` / `torch` imports at the
# top of the script; a library that reads HF_ENDPOINT at import time would not
# see it — confirm whether the assignment must move above the imports.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

# To restrict the run to specific GPUs, uncomment and adjust to the real GPU count:
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"

# Reduce CUDA memory fragmentation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Disable autograd for the whole script to save memory during inference.
# A bare `torch.inference_mode()` call only constructs a context manager and
# throws it away, so it never took effect; `set_grad_enabled(False)` works as a
# plain function call.
torch.set_grad_enabled(False)

# Settings handed to FlowMatchEulerDiscreteScheduler.from_config().
# base_shift and max_shift share the same value, ln(3).
_LOG3_SHIFT = math.log(3)

scheduler_config = dict(
    base_image_seq_len=256,
    base_shift=_LOG3_SHIFT,
    invert_sigmas=False,
    max_image_seq_len=8192,
    max_shift=_LOG3_SHIFT,
    num_train_timesteps=1000,
    shift=1.0,
    shift_terminal=None,
    stochastic_sampling=False,
    time_shift_type='exponential',
    use_beta_sigmas=False,
    use_dynamic_shifting=True,
    use_exponential_sigmas=False,
    use_karras_sigmas=False,
)

scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)

# Count the CUDA devices visible to this process.
num_gpus = torch.cuda.device_count()
print(f"检测到 {num_gpus} 个GPU设备")

if num_gpus > 1:
    # Multi-GPU: build a per-device memory budget and let the pipeline loader
    # place components, instead of hand-writing a per-layer device map.
    max_memory = {}
    for i in range(num_gpus):
        free_mem, _ = torch.cuda.mem_get_info(i)
        # Budget 70% of the currently free memory, capped at 22 GiB.
        allocated_mem = min(int(free_mem * 0.7), 22 * 1024**3)
        max_memory[i] = allocated_mem
        print(f"GPU {i}: 分配 {(allocated_mem / 1024**3):.2f} GB 显存")

    # `max_memory` is only honoured together with `device_map`; without it the
    # budget is ignored and the pipeline is never moved off the CPU.
    # NOTE(review): "balanced" appears to place whole pipeline components per
    # device; a component larger than every per-GPU budget may still land on
    # the CPU — confirm on the target hardware.
    pipe = DiffusionPipeline.from_pretrained(
        'Qwen/Qwen-Image',
        scheduler=scheduler,
        torch_dtype=torch.bfloat16,
        device_map="balanced",
        max_memory=max_memory,
    )
else:
    # Single GPU: load normally, then move the whole pipeline to the device.
    # (Explicit `.to()` is kept out of the multi-GPU branch because placement
    # there is handled by `device_map`.)
    pipe = DiffusionPipeline.from_pretrained(
        'Qwen/Qwen-Image',
        scheduler=scheduler,
        torch_dtype=torch.bfloat16,
    )
    pipe = pipe.to("cuda")

print(f"模型已分配到{num_gpus}个GPU设备上")

# 提前下载LoRA权重

def download_lora_weights(model_dir=None, steps=8):
    """Return the path of a LoRA weight file from 'lightx2v/Qwen-Image-Lightning'.

    Args:
        model_dir: Directory that already contains the LoRA files. When None
            (the default) the repository is fetched with ModelScope's
            ``snapshot_download``.
        steps: Preferred step count. A file whose name contains
            ``"<steps>steps"`` is chosen first; the default of 8 matches the
            ``num_inference_steps=8`` used by this script.
            NOTE(review): assumes the repository's file names embed the step
            count in that form — verify against the repo listing.

    Returns:
        The chosen weight file path as a string.

    Raises:
        FileNotFoundError: If the directory has no ``.safetensors`` or ``.pt``
            file at its top level.
    """
    if model_dir is None:
        print("开始下载LoRA权重...")
        # Fetch the LoRA repository through ModelScope.
        model_dir = snapshot_download('lightx2v/Qwen-Image-Lightning')
        print(f"LoRA权重已下载到: {model_dir}")

    root = Path(model_dir)
    # glob() order is filesystem-dependent; sort so the choice is reproducible.
    lora_files = sorted(root.glob("*.safetensors")) + sorted(root.glob("*.pt"))
    if not lora_files:
        raise FileNotFoundError("在下载的LoRA权重目录中未找到.safetensors或.pt文件")

    # Prefer the variant that matches the requested step count, otherwise fall
    # back to the first file in sorted order.
    step_matches = [path for path in lora_files if f"{steps}steps" in path.name]
    lora_file_path = (step_matches or lora_files)[0]
    print(f"使用LoRA文件: {lora_file_path}")
    return str(lora_file_path)

# Resolve the local LoRA weight file, then attach it to the pipeline by file
# path rather than by model identifier.
lora_file_path = download_lora_weights()
pipe.load_lora_weights(lora_file_path)

# Text prompt. The West Lake landmark is spelled 三潭印月; the earlier 三谭印月
# was a typo that handed the model a misspelled place name.
prompt = '直升飞机准备降落在杭州西湖的三潭印月'
negative_prompt = ' '

# Generate one 1024x1024 image in 8 steps with a fixed seed for reproducibility.
# NOTE(review): with true_cfg_scale=1.0 the negative prompt presumably has no
# effect — confirm against the pipeline documentation.
image = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=1024,
    height=1024,
    num_inference_steps=8,
    true_cfg_scale=1.0,
    generator=torch.manual_seed(0),
).images[0]

# Write the image into the user's Pictures folder under a timestamped name.
pictures_dir = Path.home().joinpath('Pictures')
pictures_dir.mkdir(exist_ok=True)  # create the folder if it is missing

now = datetime.now()
timestamp = now.strftime("%Y%m%d-%H%M%S")
output_path = pictures_dir.joinpath(f'qwen_image_{timestamp}.png')

image.save(str(output_path))
print(f"图像已生成并保存为 {output_path}")

#运行

python3 1.py

生成结果：

qwen_image_20250921-122956

qwen_image_20250921-112439

#单张显卡运行时的报错。实际测试时，若仅指定一张 NVIDIA GeForce RTX 4090，会因显存不足报出如下错误；至少需要指定两张或两张以上的 NVIDIA GeForce RTX 4090。

torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 18.00 MiB. GPU 0 has a total capacity of 23.65 GiB of which 9.69 MiB is free. 
Including non-PyTorch memory, this process has 23.62 GiB memory in use. Of the allocated memory 23.24 GiB is allocated by PyTorch, and 8.82 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.

#指定多张显卡运行。

CUDA_VISIBLE_DEVICES=5,6 python3 1.py

posted on 2025-09-20 22:02 yi-sheng 阅读(68) 评论(0) 收藏举报