DNNRegression(pytorch)

最近开始学习 PyTorch，照着 Kaggle 上的示例简单实现并优化了一个 DNN

需要搭建网络、把数据放入dataset，然后定义前向传播

搭建net
手写 l2正则 loss
手写 early_stop

import matplotlib.pyplot as plt
import torch
from matplotlib.pyplot import figure
from torch.utils.data import DataLoader

from kaggle.covid.data_loader import CovidDataLoader


class CovidTaskUtil:
    """Static helpers for training, evaluating and plotting a regression model.

    The model passed to these helpers must be callable on a batch of features
    and expose ``cal_loss_l2(prediction, target, alpha)``.
    """

    # L2 penalty coefficient used when neither the caller nor the config
    # supplies one.
    DEFAULT_L2_ALPHA = 0.000075

    @staticmethod
    def get_dataset_loss(validation_set, model, device, alpha=DEFAULT_L2_ALPHA):
        """Return the sample-weighted average loss of ``model`` over a loader.

        Args:
            validation_set: DataLoader yielding ``(x, y)`` batches.
            model: Model exposing ``cal_loss_l2``.
            device: Device the batches are moved to.
            alpha: L2 coefficient forwarded to ``model.cal_loss_l2``.

        Returns:
            The per-sample average of ``cal_loss_l2`` (which includes the L2
            term) as a Python float.
        """
        model.eval()
        total_loss = 0.0
        with torch.no_grad():
            for x, y in validation_set:
                x, y = x.to(device), y.to(device)
                batch_loss = model.cal_loss_l2(model(x), y, alpha)
                # Weight by batch size so a smaller last batch is not over-counted.
                total_loss += batch_loss.item() * len(x)
        return total_loss / len(validation_set.dataset)

    @staticmethod
    def train(train_set, validation_set, model, config, device):
        """Train ``model`` with early stopping and checkpoint the best weights.

        Args:
            train_set: DataLoader yielding ``(x, y)`` training batches.
            validation_set: DataLoader used to score the model after each epoch.
            model: Model exposing ``cal_loss_l2``.
            config: Dict with keys ``n_epochs``, ``optimizer`` (name of a class
                in ``torch.optim``), ``optim_hparas`` (kwargs for it),
                ``early_stop`` (allowed epochs without improvement) and
                ``save_path``. An optional ``l2_alpha`` overrides the default
                L2 coefficient.
            device: Device the batches are moved to.

        Returns:
            ``(best_validation_loss, loss_record)`` where ``loss_record`` has a
            per-step ``'train'`` list and a per-epoch ``'validation'`` list.
        """
        n_epochs = config['n_epochs']  # Maximum number of epochs
        optimizer = getattr(torch.optim, config['optimizer'])(model.parameters(), **config['optim_hparas'])
        alpha = config.get('l2_alpha', CovidTaskUtil.DEFAULT_L2_ALPHA)
        # Start from +inf so the first finite validation loss is always
        # checkpointed, however large it is.
        min_mse = float('inf')
        loss_record = {'train': [], 'validation': []}  # for recording training loss
        early_stop_cnt = 0
        epochs_run = 0
        for epoch in range(n_epochs):
            model.train()  # set model to training mode
            for x, y in train_set:  # iterate through the dataloader
                optimizer.zero_grad()  # set gradient to zero
                x, y = x.to(device), y.to(device)  # move data to device
                prediction = model(x)  # forward pass (compute output)
                mse_loss = model.cal_loss_l2(prediction, y, alpha)  # compute loss
                mse_loss.backward()  # compute gradient (backpropagation)
                optimizer.step()  # update parameters
                loss_record['train'].append(mse_loss.detach().item())

            # After each epoch, score the model on the validation set.
            validation_loss = CovidTaskUtil.get_dataset_loss(validation_set, model, device, alpha)
            if validation_loss < min_mse:
                min_mse = validation_loss
                print(f'Saving model (epoch = {epoch + 1}, loss = {min_mse})')
                torch.save(model.state_dict(), config['save_path'])  # Save model to specified path
                early_stop_cnt = 0
            else:
                early_stop_cnt += 1

            epochs_run = epoch + 1
            loss_record['validation'].append(validation_loss)
            if early_stop_cnt > config['early_stop']:
                break

        print('Finished training after {} epochs'.format(epochs_run))
        return min_mse, loss_record

    @staticmethod
    def get_predictions(tt_set, model, device):
        """Run ``model`` over a loader of feature-only batches.

        Returns:
            All predictions concatenated along dim 0 as a NumPy array.
        """
        model.eval()
        predictions = []
        with torch.no_grad():
            for x in tt_set:
                predictions.append(model(x.to(device)).detach().cpu())
        return torch.cat(predictions, dim=0).numpy()

    @staticmethod
    def get_device():
        """Return the best available device: CUDA, then Apple MPS, then CPU."""
        if torch.cuda.is_available():
            return torch.device("cuda")
        # Older torch builds have no ``torch.backends.mps`` attribute at all.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            return torch.device("mps")
        return torch.device("cpu")

    @staticmethod
    def plot_learning_curve(loss_record, title=''):
        """Plot per-step training loss and per-epoch validation loss.

        Validation points are spread over the training-step axis, one every
        ``len(train) // len(validation)`` steps.
        """
        n_train = len(loss_record['train'])
        n_val = len(loss_record['validation'])
        x_1 = range(n_train)
        figure(figsize=(6, 4))
        plt.plot(x_1, loss_record['train'], c='tab:red', label='train')
        if n_val:
            # Guard against a zero stride / division by zero on short records.
            stride = max(1, n_train // n_val)
            x_2 = x_1[::stride][:n_val]
            plt.plot(x_2, loss_record['validation'][:len(x_2)], c='tab:cyan', label='validation')
        plt.ylim(0.0, 5.)
        plt.xlabel('Training steps')
        plt.ylabel('MSE loss')
        plt.title('Learning curve of {}'.format(title))
        plt.legend()
        plt.show()

    @staticmethod
    def plot_prediction(data_set, model, device, lim=35., predictions=None, targets=None):
        """Scatter-plot predictions against ground truth with a y = x reference line.

        If either ``predictions`` or ``targets`` is missing, both are computed
        by running ``model`` over ``data_set`` (a loader of ``(x, y)`` batches).
        """
        if predictions is None or targets is None:
            model.eval()
            predictions, targets = [], []
            with torch.no_grad():
                for x, y in data_set:
                    x, y = x.to(device), y.to(device)
                    predictions.append(model(x).detach().cpu())
                    targets.append(y.detach().cpu())
            predictions = torch.cat(predictions, dim=0).numpy()
            targets = torch.cat(targets, dim=0).numpy()

        figure(figsize=(5, 5))
        plt.scatter(targets, predictions, c='r', alpha=0.5)
        plt.plot([-0.2, lim], [-0.2, lim], c='b')
        plt.xlim(-0.2, lim)
        plt.ylim(-0.2, lim)
        plt.xlabel('ground truth value')
        plt.ylabel('predicted value')
        plt.title('Ground Truth v.s. Prediction')
        plt.show()

    @staticmethod
    def prep_dataloader(path, mode, batch_size, n_jobs=0, indexes=None):
        """Build a ``CovidDataLoader`` dataset from ``path`` and wrap it in a DataLoader.

        Batches are shuffled only when ``mode == 'train'``.
        """
        dataset = CovidDataLoader(path, mode=mode, indexes=indexes)
        # Pinned memory only helps host-to-CUDA copies; requesting it without
        # CUDA just triggers a warning.
        dataloader = DataLoader(dataset, batch_size, shuffle=(mode == 'train'), drop_last=False,
                                num_workers=n_jobs, pin_memory=torch.cuda.is_available())
        return dataloader

import csv

import numpy as np
import torch
from torch.utils.data import Dataset


class CovidDataLoader(Dataset):
    """Dataset backed by a CSV file whose first row is a header and first column an id.

    Args:
        path: Path of the CSV file.
        mode: ``'train'`` or ``'dev'`` make ``__getitem__`` return
            ``(features, target)``; any other value returns features only.
        indexes: Column positions (counted after the id column is dropped) to
            use as features. ``None`` selects every column except the last for
            ``'train'``/``'dev'``, and every column otherwise.
    """

    def __init__(self, path: str, mode: str, indexes=None):
        self.mode = mode
        # newline='' is what the csv module requires for correct quoted-newline handling.
        with open(path, 'r', newline='') as fp:
            rows = list(csv.reader(fp))
        # drop first column and title row
        data = np.array(rows[1:])[:, 1:].astype(float)

        if indexes is None:
            # Indexing with None would insert a new axis instead of selecting
            # columns, so expand it to an explicit column list.
            n_cols = data.shape[1]
            indexes = list(range(n_cols - 1 if mode in ['train', 'dev'] else n_cols))

        features = data[:, indexes]
        # The last column is stored as the target in every mode; it is only
        # returned by __getitem__ for 'train' / 'dev'.
        target = data[:, -1]

        self.features = torch.FloatTensor(features)
        self.target = torch.FloatTensor(target)

        # Standardise feature columns from position 40 onwards (position within
        # the SELECTED features). With fewer than 41 selected columns this
        # slice is empty and nothing is normalised.
        # NOTE(review): presumably meant for the continuous columns of the full
        # feature set - confirm it is intended to be a no-op after selection.
        tail = self.features[:, 40:]
        self.features[:, 40:] = (tail - tail.mean(dim=0, keepdim=True)) / tail.std(dim=0, keepdim=True)
        self.dim = self.features.shape[1]

        print(
            f'Finished reading the {mode} set of COVID19 Dataset ({len(self.features)} samples found, each dim = {self.dim})')

    def __getitem__(self, index):
        """Return one sample: ``(features, target)`` for train/dev, else features only."""
        if self.mode in ['train', 'dev']:
            return self.features[index], self.target[index]
        return self.features[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

from dataclasses import dataclass, field

import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression


@dataclass
class FeatureSelector:
    """Pick the ``top_selected`` columns of a CSV by univariate ``f_regression`` score.

    Column positions (``target_index``, ``feature_start_index``,
    ``feature_end_index``) refer to the columns of the CSV as read by
    ``pandas.read_csv``; the feature range is half-open.
    """

    path: str = ""
    target_index: int = 0
    feature_start_index: int = 0
    feature_end_index: int = 0
    top_selected: int = 0
    # Filled in by init(): the whole file, the feature columns, and the target column.
    data: pd.DataFrame = field(default_factory=pd.DataFrame)
    x: pd.DataFrame = field(default_factory=pd.DataFrame)
    y: pd.DataFrame = field(default_factory=pd.DataFrame)

    def init(self):
        """Read the CSV and split it into feature columns ``x`` and target column ``y``."""
        self.data = pd.read_csv(self.path)
        feature_columns = self.data.columns[self.feature_start_index:self.feature_end_index]
        self.x = self.data[feature_columns]
        self.y = self.data[self.data.columns[self.target_index]]

    def min_max_scaling(self):
        """Rescale every feature column to ``[0, 1]``; constant columns become all zeros."""
        col_min = self.x.min()
        col_range = self.x.max() - col_min
        # A constant column has zero range; dividing by it would yield NaN and
        # break the selector fit, so divide by 1 instead.
        col_range = col_range.where(col_range != 0, 1)
        self.x = (self.x - col_min) / col_range

    def run(self):
        """Load, scale and score the features.

        Returns:
            Positions of the ``top_selected`` best-scoring features, counted
            from ``feature_start_index`` (i.e. positions within ``x``), ordered
            by descending score.
        """
        self.init()
        self.min_max_scaling()
        selector = SelectKBest(score_func=f_regression, k=self.top_selected)
        fit = selector.fit(self.x, self.y)
        scores = pd.DataFrame(fit.scores_)
        columns = pd.DataFrame(self.x.columns)
        feature_scores = pd.concat([columns, scores], axis=1)
        feature_scores.columns = ['Specs', 'Score']  # naming the dataframe columns
        top_features = feature_scores.nlargest(self.top_selected, 'Score')
        print(top_features)  # print the top_selected best features
        selected_indexes = top_features.index.tolist()
        return selected_indexes

import torch
import torch.nn as nn
from typing import List


class NeuralNet(nn.Module):
    """Fully connected regression network with a single scalar output.

    Each hidden layer is ``Linear -> BatchNorm1d -> Dropout(0.2) -> LeakyReLU``,
    followed by a final ``Linear`` to one output unit.

    Args:
        input_dim: Number of input features.
        hidden_layers: Width of each hidden layer, in order. An empty list
            yields a plain linear model.
    """

    def __init__(self, input_dim: int, hidden_layers: List[int]):
        super().__init__()
        layers = []
        in_features = input_dim
        # One identical stage per hidden width; the module order is fixed so
        # that state_dict keys (net.0, net.1, ...) stay stable.
        for width in hidden_layers:
            layers.extend([
                nn.Linear(in_features, width),
                nn.BatchNorm1d(width),
                nn.Dropout(p=0.2),
                nn.LeakyReLU(),
            ])
            in_features = width

        layers.append(nn.Linear(in_features, 1))
        self.net = nn.Sequential(*layers)
        self.criterion = nn.MSELoss(reduction='mean')

    def forward(self, x):
        """Return predictions with the trailing size-1 output dimension removed."""
        return self.net(x).squeeze(1)

    def cal_loss(self, prediction, target):
        """Return the mean squared error between ``prediction`` and ``target``."""
        return self.criterion(prediction, target)

    def cal_loss_l2(self, prediction, target, alpha):
        """Return MSE plus ``alpha`` times the sum of squares of every parameter.

        The penalty covers all tensors in ``self.parameters()``.
        """
        main_loss = self.criterion(prediction, target)
        l2_reg = torch.zeros((), device=prediction.device)
        for param in self.parameters():
            # Out-of-place add keeps the autograd graph free of in-place edits.
            l2_reg = l2_reg + param.pow(2).sum()
        return main_loss + alpha * l2_reg

import torch

from kaggle.covid.covid_taks_util import CovidTaskUtil
from kaggle.covid.feature_seletor import FeatureSelector
from kaggle.covid.models.covid_nn import NeuralNet

# Driver script: select features, train the network with early stopping,
# plot the learning curve, then reload the best checkpoint and plot its
# predictions on the training data.
# NOTE: the paths below are absolute and machine-specific.
train_path = "/Users/wunan/Desktop/kaggledata/covid/train.csv"
test_path = "/Users/wunan/Desktop/kaggledata/covid/test.csv"

# Score CSV columns 1..93 against column 94 and keep the 15 best.
feature_selector = FeatureSelector(path=train_path, target_index=94, feature_end_index=94, feature_start_index=1,
                                   top_selected=15)
selected_indexes = feature_selector.run()
print("Selected indexes from run():", selected_indexes)

device = CovidTaskUtil.get_device()

config = {
    'n_epochs': 10000,  # early stopping is in place, so a large value is harmless
    'batch_size': 200,  # tuned batch size
    'optimizer': 'Adam',  # use the Adam optimizer
    'optim_hparas': {  # empty: rely entirely on the optimizer's defaults
        # 'lr': 0.0001,
        # 'momentum': 0.9,
        # 'weight_decay': 5e-4,
    },
    'early_stop': 500,  # the final run trains on all the data, so a large patience matters little
    'save_path': '/Users/wunan/PycharmProjects/mlstudy/kaggle/covid/models/model.pth'
}

# NOTE: train_set and validation_set are built from the same file with the
# same mode and indexes, so the validation loss is measured on the training
# data itself (consistent with the "trains on all the data" remark above).
train_set = CovidTaskUtil.prep_dataloader(train_path, 'train', config['batch_size'], indexes=selected_indexes)
validation_set = CovidTaskUtil.prep_dataloader(train_path, 'train', config['batch_size'], indexes=selected_indexes)
test_set = CovidTaskUtil.prep_dataloader(test_path, 'test', config['batch_size'], indexes=selected_indexes)

model = NeuralNet(validation_set.dataset.dim, [518, 256, 128, 64]).to(device)

model_loss, model_loss_record = CovidTaskUtil.train(train_set, validation_set, model, config, device)
CovidTaskUtil.plot_learning_curve(model_loss_record, title='deep model')


# Discard the last-epoch weights and rebuild the model from the best checkpoint.
del model
model = NeuralNet(train_set.dataset.dim, [518, 256, 128, 64]).to(device)
ckpt = torch.load(config['save_path'], map_location='cpu')  # Load your best model
model.load_state_dict(ckpt)
CovidTaskUtil.plot_prediction(train_set, model, device)

https://www.kaggle.com/code/lemontreeyc/hw1-public-strong-baseline/notebook

posted @ 2025-11-28 16:47 ylxn 阅读(3) 评论(0) 收藏举报

刷新页面返回顶部

ylxn

时光旅客～

DNNRegression(pytorch)

公告