Winter break GigaPath model optimization summary

Problems encountered before the winter break

The main problem before the winter break was a shape mismatch when processing the tensor in the last .pt stage: the matrices had different sizes and could not be multiplied.
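A minimal sketch of the kind of error this produces (the 1536-dimensional probe input and the 768-dimensional embedding width are assumptions for illustration, not values read from the actual files):

import torch
from torch import nn

# Hypothetical illustration: the linear probe expects 1536-d inputs,
# but the loaded embedding is only 768-d, so the matmul fails.
probe = nn.Linear(1536, 2)
embedding = torch.randn(1, 768)
try:
    probe(embedding)
except RuntimeError as err:
    # e.g. "mat1 and mat2 shapes cannot be multiplied (1x768 and 1536x2)"
    print(err)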

.pt file processing code

import torch
import os

def process_and_save_embeddings(input_dir, output_dir, target_dim=1536):
    """
    处理 .pt 文件中的嵌入,将其转换为指定的 target_dim 格式,并保存到新的目录。
    """
    os.makedirs(output_dir, exist_ok=True)
    
    for file_name in os.listdir(input_dir):
        if file_name.endswith('.pt'):
            input_path = os.path.join(input_dir, file_name)
            embeddings = torch.load(input_path)  # load the stored embedding
            
            # process the embedding; merge multiple layers if needed
            if isinstance(embeddings, dict):
                # prefer the last-layer embedding if present, otherwise merge all layers
                if 'last_layer_embed' in embeddings:
                    processed_embedding = embeddings['last_layer_embed']
                else:
                    # concatenate all layers into one long tensor
                    processed_embedding = torch.cat(
                        [embeddings[key] for key in embeddings.keys()], dim=-1
                    )
            elif isinstance(embeddings, torch.Tensor):
                processed_embedding = embeddings
            else:
                raise ValueError(f"Unsupported embedding format in {file_name}")
            
            # adjust the embedding dimension to target_dim
            processed_embedding = adjust_embedding_dim(processed_embedding, target_dim)
            
            # save the processed embedding
            output_path = os.path.join(output_dir, file_name)
            torch.save(processed_embedding, output_path)
            print(f"Processed and saved: {output_path}")

def adjust_embedding_dim(embedding, target_dim):
    """
    调整嵌入维度到指定的 target_dim。
    如果嵌入维度小于 target_dim,则进行拼接或重复。
    如果嵌入维度大于 target_dim,则进行截断。
    """
    current_dim = embedding.shape[-1]
    if current_dim == target_dim:
        return embedding
    elif current_dim < target_dim:
        # extend by repeating the embedding and appending a partial slice for the remainder
        repeat_times = target_dim // current_dim
        remainder = target_dim % current_dim
        extended_embedding = embedding.repeat(1, repeat_times)
        if remainder > 0:
            extended_embedding = torch.cat([extended_embedding, embedding[:, :remainder]], dim=-1)
        return extended_embedding
    else:
        # truncate to target_dim
        return embedding[:, :target_dim]

# Usage example
input_dir = "mayoo/1"  # directory with the original embeddings
output_dir = "mayoolast/1"  # directory for the processed embeddings
target_dim = 1536  # input dimension expected by the model
process_and_save_embeddings(input_dir, output_dir, target_dim)

 

Using the method above, I fused the last two layers stored in each .pt file into a single 1536-length vector, which preserves as much of the high-dimensional feature information as possible, and then used these vectors for model training. At this point a new problem appeared: although the embedding had been compressed into a single row, it still carried an extra empty dimension, as described below.
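A rough illustration of where that extra dimension comes from (the layer key names and the 768-dimensional width of each layer are assumptions, not values taken from the actual .pt files): fusing two layer embeddings of shape [1, 768] yields a [1, 1536] tensor, which is still two-dimensional.

import torch

# Hypothetical layer outputs; the real .pt files may use different keys and widths.
layer_embeds = {
    'second_last_layer_embed': torch.randn(1, 768),
    'last_layer_embed': torch.randn(1, 768),
}

# Concatenate the last two layers along the feature axis.
fused = torch.cat(
    [layer_embeds['second_last_layer_embed'], layer_embeds['last_layer_embed']],
    dim=-1,
)
print(fused.shape)  # torch.Size([1, 1536]) -- still 2-D, not a plain 1536-vector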

In the train function, target has shape [batch_size], while CrossEntropyLoss expects the target tensor to hold class indices (one integer label per sample, not a one-hot encoding). If target is passed in one-hot form, the following error appears:

RuntimeError: Expected target size [batch_size, num_classes], got [batch_size]

We need to make sure target contains class indices rather than one-hot vectors. The concrete change is:

In the __getitem__ method of the EmbeddingDataset class, make sure target is a class index (an integer label rather than a one-hot vector).

The modified code:

class EmbeddingDataset(Dataset):
    def __getitem__(self, index):
        sample, target = self.samples[index], self.labels[index]
        embed = self.embeds[sample]
        if self.z_score:
            embed = (embed - embed.mean()) / embed.std()
        target = self.label_dict[target]  # convert the label to a class index
        target = torch.tensor(target, dtype=torch.long)  # make sure target is an integer (long) tensor
        embed = torch.tensor(embed, dtype=torch.float32)  # make sure embed is a float tensor
        return embed, target
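A quick standalone check of the format CrossEntropyLoss expects (a minimal sketch, not tied to the real dataset): raw logits of shape [batch_size, num_classes] paired with long-typed class indices of shape [batch_size].

import torch
from torch import nn

# Minimal shape check, separate from the training script.
criterion = nn.CrossEntropyLoss()
logits = torch.randn(4, 2)            # model output: [batch_size, num_classes]
targets = torch.tensor([0, 1, 1, 0])  # class indices: shape [batch_size], dtype long
loss = criterion(logits, targets)
print(loss.item())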

 

Next, convert the [1, 1536] tensor into a plain 1536-length vector, since it carries an extra empty dimension.

The changes are as follows:

1. The load_embeddings_from_zip method of the Processor class

    • After loading each .pt file, use squeeze() to convert a tensor of shape [1, 1536] to [1536]

2. The __getitem__ method of the EmbeddingDataset class

    • Make sure the tensor taken from self.embeds has shape [1536]
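A tiny sketch of what squeeze() does here (shapes taken from the description above):

import torch

tensor = torch.randn(1, 1536)   # embedding as stored in the .pt file
print(tensor.shape)             # torch.Size([1, 1536])

flattened = tensor.squeeze(0)   # drop the size-1 leading dimension
print(flattened.shape)          # torch.Size([1536])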

The complete modified code

import os
import io
import argparse
import zipfile
import pandas as pd
import torch
import itertools
import numpy as np
from torch import nn
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import torch.utils.tensorboard as tensorboard
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_recall_fscore_support
def seed_torch(device, seed=11):
    import random
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if device.type == 'cuda':
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed) # if you are using multi-GPU.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
argparser = argparse.ArgumentParser(description='Linear Probe')
argparser.add_argument('--dataset_csv',            type=str, default='', help='The csv file indicating input samples and labels')
argparser.add_argument('--input_path',          type=str, default='', help='The input embedding files')
argparser.add_argument('--embed_dim',           type=int, default=1536, help='The dimension of the embeddings')
argparser.add_argument('--batch_size',          type=int, default=512, help='Batch size')
argparser.add_argument('--train_iters',         type=int, default=12500, help='Number of training iterations')
argparser.add_argument('--lr',                  type=float, default=0.01, help='Learning rate')
argparser.add_argument('--min_lr',              type=float, default=0.0, help='Minimum learning rate')
argparser.add_argument('--optim',               type=str, default='sgd', help='Optimizer')
argparser.add_argument('--momentum',            type=float, default=0.0, help='Momentum')
argparser.add_argument('--weight_decay',        type=float, default=0.0, help='Weight decay')
argparser.add_argument('--eval_interval',       type=int, default=10000, help='Evaluation interval')
argparser.add_argument('--model_select',        type=str, default='best', help='Model selection')
argparser.add_argument('--num_workers',         type=int, default=2, help='Number of workers')
argparser.add_argument('--seed',                type=int, default=420, help='Random seed')
argparser.add_argument('--z_score',             action='store_true', default=False, help='Whether use z-score normalization')
argparser.add_argument('--output_dir',          type=str, default='outputs', help='Output directory')
def to_onehot(labels: np.ndarray, num_classes: int) -> np.ndarray:
    '''Convert the labels to one-hot encoding'''
    onehot = np.zeros((labels.shape[0], num_classes))
    onehot[np.arange(labels.shape[0]), labels] = 1
    return onehot
def train(model,train_loader,val_loader,test_loader,train_iters,lr, min_lr,optim,weight_decay,output_dir,eval_interval,momentum,**kwargs):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    tensorboard_dir = os.path.join(output_dir, 'tensorboard')
    os.makedirs(tensorboard_dir, exist_ok=True)
    writer = tensorboard.SummaryWriter(tensorboard_dir)
    
    if optim == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
    elif optim == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    else:
        raise ValueError('Invalid optimizer')
    
    print('Set the optimizer as {}'.format(optim))
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=train_iters, eta_min=min_lr)
    criterion = nn.CrossEntropyLoss()
    infinite_train_loader = itertools.cycle(train_loader)
    best_f1 = 0
    print('Start training')
    
    for i, (embed, target) in enumerate(infinite_train_loader):
        if i >= train_iters:
            break
        #print(embed)
        embed, target = embed.to(device), target.to(device)
        output = model(embed)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        
        if (i + 1) % 10 == 0:
            lr = optimizer.param_groups[0]['lr']
            print(f'Iteration [{i}/{train_iters}]\tLoss: {loss.item()}\tLR: {lr}')
            writer.add_scalar('Train Loss', loss.item(), i)
            writer.add_scalar('Learning Rate', lr, i)
        if (i + 1) % eval_interval == 0 or (i + 1) == train_iters:
            print(f'Start evaluating ...')
            accuracy, f1, precision, recall, auroc, auprc = evaluate(model, criterion, val_loader, device)
            print(f'Val [{i}/{train_iters}] Accuracy: {accuracy} f1: {f1} Precision: {precision} Recall: {recall} AUROC: {auroc} AUPRC: {auprc}')
            writer.add_scalar('Val Accuracy', accuracy, i)
            writer.add_scalar('Val f1', f1, i)
            writer.add_scalar('Val AUROC', auroc, i)
            writer.add_scalar('Val AUPRC', auprc, i)
            writer.add_scalar('Val Precision', precision, i)
            writer.add_scalar('Val Recall', recall, i)
            writer.add_scalar('Best f1', best_f1, i)
            if f1 > best_f1:
                print('Best f1 increase from {} to {}'.format(best_f1, f1))
                best_f1 = f1
                torch.save(model.state_dict(), f'{output_dir}/best_model.pth')
    torch.save(model.state_dict(), f'{output_dir}/model.pth')
    if kwargs.get('model_select') == 'best':
        val_f1 = best_f1
        model.load_state_dict(torch.load(f'{output_dir}/best_model.pth'))
    else:
        val_f1 = f1
        model.load_state_dict(torch.load(f'{output_dir}/model.pth'))
    accuracy, f1, precision, recall, auroc, auprc = evaluate(model, criterion, test_loader, device)
    print(f'Test Accuracy: {accuracy} f1: {f1} Precision: {precision} Recall: {recall} AUROC: {auroc} AUPRC: {auprc}')
    writer.add_scalar('Test Accuracy', accuracy, i)
    writer.add_scalar('Test f1', f1, i)
    writer.add_scalar('Test AUROC', auroc, i)
    writer.add_scalar('Test AUPRC', auprc, i)
    writer.add_scalar('Test Precision', precision, i)
    writer.add_scalar('Test Recall', recall, i)
    f = open(f'{output_dir}/results.txt', 'w')
    f.write(f'Val f1: {val_f1}\n')
    f.write(f'Test f1: {f1} Test AUROC: {auroc} Test AUPRC: {auprc}\n')
    f.close()
def evaluate(model, criterion, val_loader, device):
    model.eval()
    with torch.no_grad():
        total_loss = 0
        pred_gather, target_gather = [], []
        for _, (embed, target) in enumerate(val_loader):
            embed, target = embed.to(device), target.to(device)
            output = model(embed)
            loss = criterion(output, target)
            total_loss += loss.item()
            pred_gather.append(output.cpu().numpy())
            target_gather.append(target.cpu().numpy())
    pred_gather = np.concatenate(pred_gather)
    target_gather = np.concatenate(target_gather)
    accuracy = (pred_gather.argmax(1) == target_gather).mean()
    f1 = f1_score(target_gather, pred_gather.argmax(1), average='weighted')
    precision, recall, _, _ = precision_recall_fscore_support(target_gather, pred_gather.argmax(1), average='macro')
    auroc = roc_auc_score(to_onehot(target_gather, pred_gather.shape[1]), pred_gather, average='macro')
    auprc = average_precision_score(to_onehot(target_gather, pred_gather.shape[1]), pred_gather, average='macro')
    return accuracy, f1, precision, recall, auroc, auprc
def main():
    args = argparser.parse_args()
    print(args)
    seed_torch(torch.device('cuda'), args.seed)
    processor = Processor()
    splits = ['train', 'val', 'test']
    train_dataset, val_dataset, test_dataset = [EmbeddingDataset(args.dataset_csv, args.input_path, \
                    split=split, z_score=args.z_score, processor=processor) for split in splits]
    args.num_classes = len(train_dataset.label_dict)
    print(f'Train: {len(train_dataset)}\tVal: {len(val_dataset)}\tTest: {len(test_dataset)}')
    train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset, replacement=True)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers, sampler=train_sampler, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, pin_memory=True)
    model = LinearProbe(args.embed_dim, args.num_classes)
    train(model, train_loader, val_loader, test_loader, **vars(args))
class LinearProbe(nn.Module):
    def __init__(self, embed_dim: int = 1536, num_classes: int = 2):
        super(LinearProbe, self).__init__()
        self.fc = nn.Linear(embed_dim, num_classes)  # use the constructor arguments rather than hard-coded sizes
    def forward(self, x):
        return self.fc(x)
class EmbeddingDataset(Dataset):
    def __init__(self, dataset_csv: str, zip_path: str, split: str = 'train', z_score=False, processor=None):
        df = pd.read_csv(dataset_csv)
        split_df = df[df['split'] == split]
        self.samples = split_df['input'].tolist()
        self.labels = split_df['label'].tolist()
        self.processor = processor
        self.embeds = processor.load_embeddings_from_zip(zip_path, split)
        label_set = list(set(self.labels))
        label_set.sort()
        self.label_dict = {label: i for i, label in enumerate(label_set)}
        self.z_score = z_score
    def __len__(self):
        return len(self.samples)
    def __getitem__(self, index):
        sample, target = self.samples[index], self.labels[index]
        embed = self.embeds[sample]
        if embed.dim() == 2 and embed.shape[0] == 1:
            embed = embed.squeeze(0)
        if self.z_score:
            embed = (embed - embed.mean()) / embed.std()
        target = self.label_dict[target]
        target = torch.tensor(target, dtype=torch.long)  # make sure target is an integer (long) tensor
        embed = torch.tensor(embed, dtype=torch.float32)  # make sure embed is a float tensor
        return embed, target

class Processor:
    def get_sample_name(self, path):
        return os.path.basename(path).replace('.pt', '')
    def load_embeddings_from_zip(self, zip_path, split):
        loaded_tensors = {}
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            print(len(zip_ref.infolist()))
            for file_info in tqdm(zip_ref.infolist()):       
                if file_info.filename.endswith('.pt') and split in file_info.filename:
                    file_bytes = zip_ref.read(file_info.filename)
                    byte_stream = io.BytesIO(file_bytes)
                    tensor = torch.load(byte_stream)
                    if tensor.dim() == 2 and tensor.shape[0] == 1:
                        tensor = tensor.squeeze(0)  # drop the leading size-1 dimension
                    sample_name = self.get_sample_name(file_info.filename)
                    loaded_tensors[sample_name] = tensor
        return loaded_tensors
if __name__ == '__main__':
    main()

Training with these settings gave extremely low accuracy, so the hyperparameters still needed tuning.

#!/bin/bash
#PBS -N linear_probe
#PBS -o linear_probe_$PBS_JOBID.log
#PBS -e linear_probe_$PBS_JOBID.err
#PBS -l nodes=1:ppn=12
#PBS -q gpu

cd $PBS_O_WORKDIR

module add gcc/11.2.0
source /home/data/software/python/3.12.7/gigapath2/bin/activate

# Default value for INPUTPATH if not provided as an argument
INPUTPATH=${1:-/public/liujx/prov-gigapath2/mayoolast.zip}
DATASETCSV=/public/liujx/mayo.csv
OUTPUT=outputs3

echo "Running linear probe with input path: $INPUTPATH"

python linear_probe/main.py --input_path $INPUTPATH \
                            --dataset_csv $DATASETCSV \
                            --output_dir $OUTPUT \
                            --batch_size 256 \
                            --embed_dim 1536 \
                            --num_workers 2 \
                            --lr 0.001 \
                            --min_lr 1e-5 \
                            --train_iters 10000 \
                            --eval_interval 500 \
                            --optim adam \
                            --weight_decay 0.001

Final results

Val [9999/10000] Accuracy: 0.5 f1: 0.5 Precision: 0.5 Recall: 0.5 AUROC: 0.5 AUPRC: 0.6428571428571428
Test Accuracy: 0.6907894736842105 f1: 0.6670480549199085 Precision: 0.5797037037037037 Recall: 0.5582251082251082 AUROC: 0.5418831168831169 AUPRC: 0.5350509608657055
The val accuracy here is only 0.5 because the validation set contains just two .pt files, and exactly one of them was misclassified.
On the test set the accuracy is 0.69 and the other metrics are fairly mediocre, possibly because the dataset is too small or the hyperparameters need further tuning; this is left for follow-up work.