PyTorch Summary
Binary classification is used as the running example throughout.
Dataset
For a custom dataset, it is generally recommended to subclass Dataset, as in the simple example below. If you want a lighter-weight option, TensorDataset can be used directly (a sketch follows the example).
# BERT dataset: tokenize everything once up front, then index into the cached features
from torch.utils.data import Dataset

class BertDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.features = tokenizer(data.cleaned_text.values.tolist(),
                                  max_length=max_length,
                                  truncation=True,
                                  padding=True,
                                  return_tensors='pt')
        # columns 1-3 of the DataFrame hold the alternative label sets
        self.labels = data.iloc[:, [1, 2, 3]].values.tolist()
        self.idx = 0  # which label column is currently active

    def __getitem__(self, item):
        return {'attention_mask': self.features['attention_mask'][item],
                'input_ids': self.features['input_ids'][item],
                'labels': self.labels[item][self.idx],
                'token_type_ids': self.features['token_type_ids'][item]}

    def __len__(self):
        return len(self.labels)

    def set_label_type(self, idx):
        # switch the label column returned by __getitem__
        self.idx = idx
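When the features are already tensors, the TensorDataset mentioned above is the quick route. A minimal sketch; the tensors and shapes here are dummies for illustration only:
# minimal TensorDataset sketch with placeholder tensors
import torch
from torch.utils.data import TensorDataset, DataLoader

input_ids = torch.randint(0, 30522, (100, 128))  # fake token ids
labels = torch.randint(0, 2, (100,))             # fake binary labels
dataset = TensorDataset(input_ids, labels)
loader = DataLoader(dataset, batch_size=16, shuffle=True)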
For standard datasets that already ship with a wrapper, the dataset class and its loaders can be bundled together; below is an example for CIFAR-100.
# CV: CIFAR-100 dataset plus loaders bundled in one class
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

class Data:
    @staticmethod
    def get_statistics():
        # per-channel mean and std of the training set
        train_set = torchvision.datasets.CIFAR100(root="./data", train=True, download=True,
                                                  transform=transforms.ToTensor())
        data = torch.cat([data[0] for data in DataLoader(train_set)])
        return data.mean(dim=[0, 2, 3]), data.std(dim=[0, 2, 3])
    def __init__(self, batch_size, threads):
        mean, std = self.get_statistics()
        # data augmentation for the training set
        train_transform = transforms.Compose([
            transforms.RandomCrop(size=(32, 32), padding=4),  # random crop
            transforms.RandomHorizontalFlip(),                # random horizontal flip
            transforms.ToTensor(),                            # convert to tensor
            transforms.Normalize(mean, std),                  # normalize
            Cutout()  # simulate occlusion (user-defined transform, not shown here)
        ])
        # the test set is only tensorized and normalized
        test_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean, std)
        ])
        # training / test dataset objects
        train_set = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=train_transform)
        test_set = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=test_transform)
        # loaders for the training and test sets
        self.train = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=threads)
        self.test = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=threads)
        self.classes = train_set.classes
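Usage is then a one-liner; the batch_size and threads values below are arbitrary and purely illustrative:
# illustrative usage of the Data wrapper
data = Data(batch_size=128, threads=4)
for images, targets in data.train:
    ...  # training step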
Seed & Device
To make experiments reproducible, fix the random seeds.
# basic seeding: sufficient for most cases
import random
import numpy as np
import torch

def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # seed every GPU
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True  # this slows things down
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.benchmark = False
    # torch.backends.cudnn.benchmark = True  # enable to accelerate, at the cost of determinism
# stricter control over DataLoader workers; the basic seeding above usually suffices
def _init_fn(worker_id):
    random.seed(10 + worker_id)
    np.random.seed(10 + worker_id)
    torch.manual_seed(10 + worker_id)
    torch.cuda.manual_seed(10 + worker_id)
    torch.cuda.manual_seed_all(10 + worker_id)

dataloader = DataLoader(tensor_dataset,
                        batch_size=opt.batchSize,
                        shuffle=True,
                        num_workers=opt.workers,
                        worker_init_fn=_init_fn)
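The shuffling order can also be pinned down by handing DataLoader an explicitly seeded generator. A sketch reusing tensor_dataset, opt and _init_fn from above:
# reproducible shuffling via a seeded generator
g = torch.Generator()
g.manual_seed(10)
dataloader = DataLoader(tensor_dataset,
                        batch_size=opt.batchSize,
                        shuffle=True,
                        num_workers=opt.workers,
                        worker_init_fn=_init_fn,
                        generator=g)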
For single-GPU runs, set the CUDA device and move both the model and the input data onto it.
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
device = torch.device("cuda:1") if torch.cuda.is_available() else torch.device("cpu")
......
model.to(device)
......
for features, labels in dataloader:
    # Tensor.to() is not in-place, so the results must be reassigned
    features = features.to(device)
    labels = labels.to(device)
Training & Testing
The training and testing loop is simple and direct; here is an example.
# BERT training with fine-tuning
for epoch in range(num_epochs):
    train_dict = {"loss": 0, "length": 0, "correct": 0}
    model.train()
    with torch.enable_grad():
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss  # computed automatically as soon as labels are passed in
            train_dict['loss'] += loss.item()
            train_dict['length'] += len(outputs.logits)
            train_dict['correct'] += (torch.argmax(outputs.logits, dim=-1) ==
                                      batch['labels']).cpu().sum().item()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)  # e.g. a tqdm bar created before the loop
    print("\nTrain Epoch {0} Loss: {1:.2f}\tAccuracy: {2:.2f} %".
          format(epoch, train_dict['loss'] / len(train_loader), (train_dict["correct"] / train_dict['length']) * 100))
    model.eval()  # switch dropout/BN layers to inference mode
    with torch.no_grad():
        result_dict = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}
        for batch in test_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            predictions = torch.argmax(outputs.logits, dim=-1)
            # TP: prediction and label are both 1
            result_dict['TP'] += ((predictions == 1) & (batch['labels'] == 1)).cpu().sum().item()
            # TN: prediction and label are both 0
            result_dict['TN'] += ((predictions == 0) & (batch['labels'] == 0)).cpu().sum().item()
            # FN: prediction 0, label 1
            result_dict['FN'] += ((predictions == 0) & (batch['labels'] == 1)).cpu().sum().item()
            # FP: prediction 1, label 0
            result_dict['FP'] += ((predictions == 1) & (batch['labels'] == 0)).cpu().sum().item()
        acc, prec, recall, f1 = eval_indicators(result_dict)
        if (epoch + 1) % 5 == 0:
            torch.save(model, result_dir + "/NS-" + str(epoch) + ".pt")
        print("\nEval Epoch {0} Accuracy: {1:.2f} %\tPrecision: {2:.2f} %\tRecall: {3:.2f} %\tF1 Score: {4:.2f} %".
              format(epoch, acc * 100, prec * 100, recall * 100, f1 * 100))
The usual evaluation metrics are accuracy, precision, recall, and F1 score; simple code to compute them follows.
def eval_indicators(result_dict):
    acc = (result_dict['TP'] + result_dict['TN']) / (result_dict['TP'] + result_dict['TN'] +
                                                     result_dict['FP'] + result_dict['FN'])
    precision = result_dict['TP'] / (result_dict['TP'] + result_dict['FP'])
    recall = result_dict['TP'] / (result_dict['TP'] + result_dict['FN'])
    f1 = 2 * (precision * recall) / (precision + recall)
    return acc, precision, recall, f1
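A quick sanity check with a made-up confusion dict:
# made-up counts: 40 TP, 45 TN, 5 FP, 10 FN over 100 samples
stats = {'TP': 40, 'TN': 45, 'FP': 5, 'FN': 10}
acc, prec, recall, f1 = eval_indicators(stats)
print(acc, prec, recall, f1)  # 0.85, ~0.889, 0.80, ~0.842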
If the model contains BatchNorm layers, call the disable_running_status function below before evaluation to freeze the BN running statistics, and enable_running_status afterwards to restore them.
import torch.nn as nn

def disable_running_status(model):
    def _disable(module):
        if isinstance(module, nn.BatchNorm2d):
            # back up the current momentum so it can be restored later
            module.backup_momentum = module.momentum
            # momentum = 0 stops the running statistics from updating
            module.momentum = 0
    model.apply(_disable)

def enable_running_status(model):
    def _enable(module):
        if isinstance(module, nn.BatchNorm2d) and hasattr(module, "backup_momentum"):
            # check for the backed-up field and restore it
            module.momentum = module.backup_momentum
    model.apply(_enable)
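A usage sketch wrapping an evaluation pass; evaluate is a hypothetical routine standing in for the test loop shown earlier:
disable_running_status(model)
with torch.no_grad():
    evaluate(model, test_loader)  # hypothetical evaluation routine
enable_running_status(model)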
Transformers Fine Tuning
The datasets and loaders used during fine-tuning were covered above. With the transformers family of libraries, the usual NLP fine-tuning steps are: load the tokenizer, tokenize the (typically small) dataset, load the model with the output layer resized, then freeze selected layers and train the pre-trained model.
import transformers as trm

# load the pre-trained tokenizer
tokenizer = trm.AutoTokenizer.from_pretrained("bert-base-uncased")
# tokenize the data; the output generally contains input_ids, attention_mask and token_type_ids
tokens = tokenizer("Hello,world!", max_length=max_length, truncation=True, padding=True, return_tensors='pt')
# then load the model; for a binary classification task the output layer is set to 2 classes
model = trm.AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# set up everything else: optimizer, learning-rate schedule, and so on
......
# fine-tune
model.train()
# the transformers docs describe two routes, the Trainer class and a plain loop;
# the plain loop is shown here (a Trainer sketch follows below)
for epoch in range(num_epochs):
    with torch.enable_grad():
        ......
        outputs = model(**batch)
        loss = outputs.loss  # computed automatically as soon as labels are passed in
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # clear the accumulated gradients
    with torch.no_grad():
        # evaluate the model; this could also be turned into k-fold cross-validation
        ......
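For the other route, a minimal Trainer sketch; train_dataset and eval_dataset are assumed to be datasets prepared as in the Dataset section, and the argument values shown are illustrative defaults:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(output_dir="./results",  # where checkpoints go
                         num_train_epochs=3,
                         per_device_train_batch_size=16)
trainer = Trainer(model=model,
                  args=args,
                  train_dataset=train_dataset,
                  eval_dataset=eval_dataset)
trainer.train()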
Minimal code for freezing layers is shown below.
# names of the layers that should stay trainable
unfreeze_layers = [...]
# freeze everything, then re-enable the listed layers
for name, param in model.named_parameters():
    param.requires_grad = False
    for element in unfreeze_layers:
        if element in name:
            param.requires_grad = True
            break
# when using a built-in torch optimizer, filter out the frozen parameters so they are no longer updated
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)
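For instance, with bert-base-uncased one might keep only the pooler and the classification head trainable; the substrings below are illustrative of that model's parameter names, and the loop is a sanity check on what remains trainable:
# illustrative for bert-base-uncased: keep only the pooler and classifier trainable
unfreeze_layers = ['pooler', 'classifier']
# after running the freezing loop above, list what will still receive gradients
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)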
A set of reusable helpers worth borrowing:
from collections.abc import Iterable

def set_freeze_by_names(model, layer_names, freeze=True):
    # a bare string is technically Iterable, so wrap it explicitly
    if isinstance(layer_names, str) or not isinstance(layer_names, Iterable):
        layer_names = [layer_names]
    for name, child in model.named_children():
        if name not in layer_names:
            continue
        for param in child.parameters():
            param.requires_grad = not freeze

def freeze_by_names(model, layer_names):
    set_freeze_by_names(model, layer_names, True)

def unfreeze_by_names(model, layer_names):
    set_freeze_by_names(model, layer_names, False)

def set_freeze_by_idxs(model, idxs, freeze=True):
    if not isinstance(idxs, Iterable):
        idxs = [idxs]
    # allow negative indices, counted from the end
    num_child = len(list(model.children()))
    idxs = tuple(map(lambda idx: num_child + idx if idx < 0 else idx, idxs))
    for idx, child in enumerate(model.children()):
        if idx not in idxs:
            continue
        for param in child.parameters():
            param.requires_grad = not freeze

def freeze_by_idxs(model, idxs):
    set_freeze_by_idxs(model, idxs, True)

def unfreeze_by_idxs(model, idxs):
    set_freeze_by_idxs(model, idxs, False)
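A usage sketch on a toy model; the module names are made up for illustration:
import torch.nn as nn
from collections import OrderedDict

model = nn.Sequential(OrderedDict([
    ('backbone', nn.Linear(10, 10)),
    ('head', nn.Linear(10, 2)),
]))
freeze_by_names(model, 'backbone')  # freeze the backbone by name
unfreeze_by_idxs(model, -1)         # unfreeze the last child by index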
