LoRA (Low-Rank Adaptation)
LoRA official documentation
Qwen2.5-0.5B fine-tuning notebook
Data preprocess
pip installs & imports
!pip config
!pip install modelscope==1.18.0
!pip install transformers==4.44.2
!pip install streamlit==1.24.0
!pip install sentencepiece==0.2.0
!pip install accelerate==0.34.2
!pip install datasets==2.20.0
!pip install peft==0.11.1
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, get_scheduler
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.quantization import quantize_dynamic, prepare_qat, convert
from modelscope import snapshot_download, AutoModel, AutoTokenizer
import os
Seed & environment setup
# Silence the HF tokenizers fork/parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Path to the local Qwen2.5-0.5B-Instruct snapshot (Kaggle input dataset).
os.environ["model_path"] = "/kaggle/input/qwen2.5/transformers/0.5b-instruct/1"
# Expose both GPUs to this process.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
# Constants
model_path = os.getenv('model_path')  # read back the path set just above
BATCH_SIZE = 8  # NOTE(review): unused below — TrainingArguments hard-codes 8
EPOCHS = 5  # NOTE(review): unused below — TrainingArguments uses num_train_epochs=2
LEARNING_RATE = 2e-5  # NOTE(review): unused below — TrainingArguments uses 1e-4
SEED = 42
MAX_LENGTH = 512  # NOTE(review): reassigned to 256 later, before tokenization
# Set seed for reproducibility
torch.manual_seed(SEED)
np.random.seed(SEED)
Load the data
# Load the WSDM Cup multilingual chatbot-arena competition data (Kaggle paths).
train_df = pd.read_parquet("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet")
test_df = pd.read_parquet("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet")
sample_submission = pd.read_csv("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/sample_submission.csv")
Prompt Templates
# Three Chinese evaluation-prompt templates. Placeholders {p}, {a}, {b} are
# filled with the user prompt and the two candidate responses by
# preprocess_data(); one template is drawn at random per row.
PROMPT_TEMPLATES = [
"""作为一名专业的语言模型评估师,请针对以下问题和回复,从逻辑性、情感表达以及实用性等角度选择更优的一方:
【问题】: {p}
【回复a】: {a}
【回复b】: {b}""",
"""你是一位对多语言回复质量有独到见解的评估师,请根据以下问题和回复,选择你认为更胜一筹的一方:
【问题】: {p}
【回复a】: {a}
【回复b】: {b}""",
"""请从专业性、用词准确性和情感价值的角度,公平评估以下两个回复,选择你认为更好的一个:
【问题】: {p}
【回复a】: {a}
【回复b】: {b}"""
]
Data Preprocessing
def preprocess_data(df, templates):
    """
    Add an ``enhanced_prompt`` column built from a randomly chosen template.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``prompt``, ``response_a`` and ``response_b`` columns.
    templates : sequence of str
        Format strings with ``{p}``, ``{a}`` and ``{b}`` placeholders.

    Returns
    -------
    pandas.DataFrame
        The same frame (mutated in place) with the new ``enhanced_prompt``
        column appended.
    """
    # Iterate the three columns with zip instead of DataFrame.iterrows():
    # iterrows materializes a Series per row and is the slowest way to scan
    # a frame; behavior (one random template draw per row) is unchanged.
    df["enhanced_prompt"] = [
        np.random.choice(templates).format(p=p, a=a, b=b)
        for p, a, b in zip(df["prompt"], df["response_a"], df["response_b"])
    ]
    return df
# Apply preprocessing to train and test datasets
train_df = preprocess_data(train_df, PROMPT_TEMPLATES)
test_df = preprocess_data(test_df, PROMPT_TEMPLATES)
# Drop rows without a label; "winner" is the training target.
train_df = train_df.dropna(subset=["winner"])
train_df.shape
from datasets import Dataset
# change Pandas DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
from functools import partial  # partial application for the preprocessing fn
from transformers import DataCollatorForSeq2Seq  # collator for seq2seq batches
from typing import Dict, List  # type-hint helpers
MAX_LENGTH = 256  # NOTE(review): overrides the 512 set earlier; 256 is the value tokenization actually uses
def preprocess_train_df(batch: Dict[str, List[str]], tokenizer, max_length=None):
    """
    Tokenize a batch for fine-tuning.

    - ``enhanced_prompt`` is the model input.
    - The winning response (per the ``winner`` column) is the target text.

    Parameters
    ----------
    batch : dict of lists (``Dataset.map(batched=True)`` format)
        Needs ``enhanced_prompt``, ``response_a``, ``response_b``, ``winner``.
    tokenizer : callable
        A HF-style tokenizer supporting ``text_target``.
    max_length : int, optional
        Truncation/padding length; defaults to the module-level MAX_LENGTH
        (resolved at call time, so the signature no longer freezes its value).

    Returns
    -------
    dict with ``input_ids``, ``attention_mask`` and ``labels``.
    """
    if max_length is None:
        max_length = MAX_LENGTH
    # Pick the winning response per row: response_a when winner == "model_a",
    # otherwise response_b.
    responses = [
        a if winner == "model_a" else b
        for a, b, winner in zip(batch["response_a"], batch["response_b"], batch["winner"])
    ]
    # Tokenize inputs and targets in one call; ``text_target`` makes the
    # tokenizer emit a ``labels`` field for the target texts.
    model_input = tokenizer(
        batch["enhanced_prompt"],
        text_target=responses,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    # BUG FIX: the original then overwrote ``labels`` with ``input_ids``,
    # discarding the winner-selected targets and training the model to echo
    # the prompt. ``text_target`` above already populated ``labels``.
    return model_input
Fine-tuning
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType
import transformers
from torch.utils.data import DataLoader
# Tokenizer: load the pretrained model's tokenizer.
pretrained_model_name = model_path  # local path of the pretrained snapshot
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name, trust_remote_code=True)  # load tokenizer
# Bind the tokenizer and max length into the preprocessing function.
_preprocess_function = partial(preprocess_train_df, tokenizer=tokenizer, max_length=MAX_LENGTH)
# Tokenize the whole dataset in batches and drop the raw text columns.
encoded_dataset = train_dataset.map(
    _preprocess_function,  # preprocessing function
    batched=True,  # process in batches
    remove_columns=['id', 'prompt', 'response_a', 'response_b', 'winner', 'model_a', 'model_b', 'language', 'enhanced_prompt']  # drop columns not needed for training
)
# Filter out sequences longer than MAX_LENGTH (everything was already
# truncated/padded to MAX_LENGTH above, so this is presumably a no-op
# safeguard — verify).
processed_dataset = encoded_dataset.filter(lambda rec: len(rec["input_ids"]) <= MAX_LENGTH)
# Split dataset: 80/20 train/test split with a fixed seed.
split_dataset = processed_dataset.train_test_split(test_size=0.2, seed=42)
# Load pre-trained model with a causal-LM head, sharded across visible GPUs.
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name, device_map="auto", trust_remote_code=True, num_labels=2)
# NOTE(review): num_labels is a classification-head option; it looks
# irrelevant for a causal-LM head — confirm and consider removing.
model.enable_input_require_grads()  # make input embeddings require grad (presumably needed for gradient checkpointing + PEFT — confirm)
# Data collator: pads each batch and aligns labels for seq2seq-style training.
data_collator = DataCollatorForSeq2Seq(
    model=model,  # model (used e.g. for decoder input preparation)
    tokenizer=tokenizer,  # tokenizer providing pad token
    max_length=MAX_LENGTH,  # maximum padded length
    pad_to_multiple_of=8,  # pad lengths to a multiple of 8 (tensor-core friendly)
    padding="max_length"  # always pad to max_length
)
# LoRA configuration: low-rank adapter hyperparameters.
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # autoregressive language modelling
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],  # attention + MLP projections
    inference_mode=False,  # training mode (adapters trainable)
    r=8,  # LoRA rank (dimension of the low-rank matrices)
    lora_alpha=16,  # LoRA scaling factor
    lora_dropout=0.1  # dropout on the LoRA branch
)
# Wrap the base model with LoRA adapters (PEFT: Parameter-Efficient Fine-Tuning).
model = get_peft_model(model, config)
# Training arguments: learning rate, batch sizes, epochs, checkpointing, etc.
training_args = TrainingArguments(
    output_dir="./output/Qwen2.5_instruct_lora",  # checkpoint/output directory
    run_name="qwen2.5_1.5b",  # NOTE(review): says 1.5b but model_path points at a 0.5b snapshot — confirm
    per_device_train_batch_size=8,  # per-device train batch size
    per_device_eval_batch_size=4,  # per-device eval batch size
    logging_strategy="epoch",  # log once per epoch
    evaluation_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # save once per epoch
    save_total_limit=2,  # keep at most 2 checkpoints
    gradient_accumulation_steps=4,  # effective batch = 8 * 4 per device
    logging_steps=10,  # NOTE(review): presumably ignored under logging_strategy="epoch"
    num_train_epochs=2,  # number of training epochs
    save_steps=100,  # NOTE(review): presumably ignored under save_strategy="epoch"
    learning_rate=1e-4,  # learning rate (the earlier LEARNING_RATE=2e-5 is unused)
    save_on_each_node=True,  # save on every node in multi-node setups
    gradient_checkpointing=True,  # trade compute for memory
    fp16=True,  # half-precision (FP16) training
    report_to="none",  # no external experiment tracker
)
# Print the number of trainable (LoRA) parameters.
model.print_trainable_parameters()
# Re-split dataset (same seed → identical split to the one made earlier).
split_dataset = processed_dataset.train_test_split(test_size=0.2, seed=42)
# Show split sizes.
print(split_dataset)
# Trainer: drives the training/evaluation loop.
trainer = Trainer(
    model=model,  # PEFT-wrapped model
    tokenizer=tokenizer,  # tokenizer
    args=training_args,  # training hyperparameters
    data_collator=data_collator,  # batch collation
    train_dataset=split_dataset["train"],  # training split
    eval_dataset=split_dataset["test"]  # evaluation split
)
# Disable the generation KV cache during training (presumably because it is
# incompatible with gradient checkpointing — confirm).
model.config.use_cache = False
# Train the model
trainer.train()
Zero-shot (submission) starter: Qwen2.5-0.5B
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.model_selection import train_test_split
import os
# Silence the HF tokenizers fork/parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# NOTE(review): this section points at the 1.5B-Instruct snapshot, unlike the
# 0.5B path used for fine-tuning above — confirm which checkpoint is intended.
os.environ["model_path"] = "/kaggle/input/qwen2.5/transformers/1.5b-instruct/1"
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
# Constants
model_path = os.getenv('model_path')  # read back the path set just above
BATCH_SIZE = 2  # forwarded to predict() below, where it is unused
EPOCHS = 5  # NOTE(review): unused in this zero-shot section
LEARNING_RATE = 2e-5  # NOTE(review): unused in this zero-shot section
SEED = 42
MAX_LENGTH = 512  # NOTE(review): unused — tokenization below truncates without max_length
# Set seed for reproducibility
torch.manual_seed(SEED)
np.random.seed(SEED)
# Load the data
train_df = pd.read_parquet("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet")
test_df = pd.read_parquet("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet")
sample_submission = pd.read_csv("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/sample_submission.csv")
#lora_path = '/kaggle/input/checkpoint/'
# Load the competition data.
def load_data(train_path, test_path):
    """Read the train/test parquet files and return them as two DataFrames."""
    frames = (pd.read_parquet(train_path), pd.read_parquet(test_path))
    return frames
# Prompt Templates
# Single Chinese evaluation-prompt template; {p}, {a}, {b} are filled by
# preprocess_data(). NOTE(review): __main__ below rebinds PROMPT_TEMPLATES to
# English templates before use — confirm which set is intended.
PROMPT_TEMPLATES = [
"""你是一位对多语言回复质量有独到见解的评估师,请根据以下问题和回复,从逻辑性、专业性、用词准确性和情感价值的情感表达以及实用性等角度选择你认为更胜一筹的一方:
【问题】: {p}
【回复a】: {a}
【回复b】: {b}""",
]
# Data Preprocessing
def preprocess_data(df, templates):
    """
    Add an ``enhanced_prompt`` column built from a randomly chosen template.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``prompt``, ``response_a`` and ``response_b`` columns.
    templates : sequence of str
        Format strings with ``{p}``, ``{a}`` and ``{b}`` placeholders.

    Returns
    -------
    pandas.DataFrame
        The same frame (mutated in place) with the new ``enhanced_prompt``
        column appended.
    """
    # zip over the columns instead of DataFrame.iterrows(): iterrows builds a
    # Series per row and is by far the slowest way to scan a frame; behavior
    # (one random template draw per row) is unchanged.
    df["enhanced_prompt"] = [
        np.random.choice(templates).format(p=p, a=a, b=b)
        for p, a, b in zip(df["prompt"], df["response_a"], df["response_b"])
    ]
    return df
# Apply preprocessing to train and test datasets
train_df = preprocess_data(train_df, PROMPT_TEMPLATES)
test_df = preprocess_data(test_df, PROMPT_TEMPLATES)
# Display the augmented frame (notebook cell output).
train_df
class PreferenceModel:
    """Zero-shot preference scorer built on a sequence-classification model."""

    def __init__(self, model_name=None):
        # Resolve the default at call time so the class can be imported even
        # where the module-level model_path is not yet defined; callers that
        # pass a name explicitly are unaffected.
        if model_name is None:
            model_name = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name,device_map="auto")

    def compute_similarity(self, prompt, response, batch_size=1):
        """Return the 'preferred' probability for ``prompt`` + ``response``.

        The score is ``softmax(logits)[0, 1]`` — presumably the label order is
        [not preferred, preferred]; confirm for this checkpoint.
        ``batch_size`` is accepted for interface compatibility but unused.
        """
        inputs = self.tokenizer(prompt + " " + response, return_tensors="pt", truncation=True, padding=True)
        # BUG FIX: with device_map="auto" the model may live on GPU while the
        # tokenizer returns CPU tensors; move the inputs onto the model's
        # device before the forward pass.
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        score = torch.softmax(outputs.logits, dim=1)[0, 1].item()
        return score

    def predict(self, prompt, response_a, response_b, batch_size=1):
        """Return "model_a" if response_a scores strictly higher, else "model_b"
        (ties go to "model_b", as in the original comparison)."""
        sim_a = self.compute_similarity(prompt, response_a, batch_size)
        sim_b = self.compute_similarity(prompt, response_b, batch_size)
        return "model_a" if sim_a > sim_b else "model_b"
# Main driver: optional train-set evaluation plus test-set prediction.
def train_and_evaluate(train_df, test_df, templates, batch_size=1, train_mode=True):
    """Score the train split (when ``train_mode``) and build the submission.

    Returns a DataFrame with ``id`` and predicted ``winner`` columns.
    """
    # Augment both frames with template-based prompts.
    train_df = preprocess_data(train_df, templates)
    test_df = preprocess_data(test_df, templates)

    # Zero-shot preference scorer.
    preference_model = PreferenceModel()

    if train_mode:
        # Accuracy of the scorer against the labelled train set.
        hits = sum(
            preference_model.predict(row["prompt"], row["response_a"], row["response_b"], batch_size) == row["winner"]
            for _, row in train_df.iterrows()
        )
        train_accuracy = hits / len(train_df)
        print(f"Training Accuracy: {train_accuracy:.4f}")

    # Predictions for every test row, in order.
    predictions = [
        preference_model.predict(row["prompt"], row["response_a"], row["response_b"], batch_size)
        for _, row in test_df.iterrows()
    ]

    # Submission frame in the competition's expected format.
    return pd.DataFrame({"id": test_df["id"], "winner": predictions})
# Main program: zero-shot prediction entry point
# Entry point: run zero-shot prediction and write the submission file.
if __name__ == "__main__":
    # File paths
    train_path = "/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet"
    test_path = "/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet"
    # Load the data
    train_df, test_df = load_data(train_path, test_path)
    # English prompt templates. NOTE(review): these shadow the Chinese
    # PROMPT_TEMPLATES defined earlier — confirm which set is intended.
    PROMPT_TEMPLATES = [
        "Given the prompt: '{p}', which response is better? Response A: '{a}' or Response B: '{b}'?",
        "Consider the following scenario: {p}. Between Response A: '{a}' and Response B: '{b}', which one aligns better?",
        "Prompt: {p}. Compare Response A ('{a}') and Response B ('{b}'). Which one would you prefer?"
    ]
    # Batch size (forwarded to predict(), where it is currently unused)
    batch_size = BATCH_SIZE
    # train_mode=False skips the train-set accuracy pass and only predicts
    train_mode = False
    # Run evaluation/prediction and build the submission frame
    submission = train_and_evaluate(train_df, test_df, PROMPT_TEMPLATES, batch_size=batch_size, train_mode=train_mode)
    # Save the submission file
    submission.to_csv("submission.csv", index=False)
    print("Submission file saved as submission.csv")