DNN Regression (PyTorch)
最近开始学习 PyTorch,参照 Kaggle 上的示例,简单实现并优化了一个 DNN
需要搭建网络、把数据放入dataset,然后定义前向传播
- 搭建net
- 手写 l2正则 loss
- 手写 early_stop
import matplotlib.pyplot as plt
import torch
from matplotlib.pyplot import figure
from torch.utils.data import DataLoader

from kaggle.covid.data_loader import CovidDataLoader


class CovidTaskUtil:
    """Static helpers for training, evaluating and plotting a regression model.

    Every model passed in must expose ``cal_loss_l2(prediction, target, alpha)``
    because that is the loss used for both training and evaluation.
    """

    # L2 penalty weight used when neither the caller nor the config overrides it.
    DEFAULT_L2_ALPHA = 0.000075

    @staticmethod
    def get_dataset_loss(validation_set, model, device, alpha=DEFAULT_L2_ALPHA):
        """Return the per-sample average of ``model.cal_loss_l2`` over a loader.

        Args:
            validation_set: DataLoader yielding ``(x, y)`` batches.
            model: Network to evaluate; it is switched to eval mode.
            device: Device the batches are moved to.
            alpha: Weight of the L2 penalty passed to ``cal_loss_l2``.

        Returns:
            The batch losses weighted by batch size and divided by
            ``len(validation_set.dataset)``.
        """
        model.eval()
        total_loss = 0
        with torch.no_grad():
            for x, y in validation_set:
                x, y = x.to(device), y.to(device)
                prediction = model(x)
                batch_loss = model.cal_loss_l2(prediction, y, alpha)
                # Weight by batch size so a short final batch is not over-counted.
                total_loss += batch_loss.item() * len(x)
        return total_loss / len(validation_set.dataset)

    @staticmethod
    def train(train_set, validation_set, model, config, device):
        """Train ``model`` with early stopping and checkpoint the best epoch.

        Args:
            train_set: DataLoader yielding ``(x, y)`` training batches.
            validation_set: DataLoader used after every epoch to score the model.
            model: Network to optimise.
            config: Dict with the keys ``n_epochs``, ``optimizer`` (name of a
                class in ``torch.optim``), ``optim_hparas`` (its keyword
                arguments), ``early_stop`` and ``save_path``. An optional
                ``l2_alpha`` overrides the default L2 penalty weight.
            device: Device the batches are moved to.

        Returns:
            ``(min_mse, loss_record)``: the best validation loss seen and a dict
            holding the per-step training losses (``'train'``) and per-epoch
            validation losses (``'validation'``).
        """
        n_epochs = config['n_epochs']
        alpha = config.get('l2_alpha', CovidTaskUtil.DEFAULT_L2_ALPHA)
        optimizer = getattr(torch.optim, config['optimizer'])(
            model.parameters(), **config['optim_hparas'])

        # Start from +inf so the first epoch always produces a checkpoint; a
        # finite sentinel would leave no saved model if the loss stayed above it.
        min_mse = float('inf')
        loss_record = {'train': [], 'validation': []}
        early_stop_cnt = 0
        epochs_done = 0

        for epoch in range(n_epochs):
            model.train()
            for x, y in train_set:
                optimizer.zero_grad()
                x, y = x.to(device), y.to(device)
                prediction = model(x)
                batch_loss = model.cal_loss_l2(prediction, y, alpha)
                batch_loss.backward()
                optimizer.step()
                loss_record['train'].append(batch_loss.detach().item())

            # Score the model on the validation loader after every epoch.
            validation_loss = CovidTaskUtil.get_dataset_loss(
                validation_set, model, device, alpha)
            if validation_loss < min_mse:
                min_mse = validation_loss
                print(f'Saving model (epoch = {epoch + 1}, loss = {min_mse})')
                torch.save(model.state_dict(), config['save_path'])
                early_stop_cnt = 0
            else:
                early_stop_cnt += 1

            epochs_done = epoch + 1
            loss_record['validation'].append(validation_loss)
            # Stop once the loss has failed to improve for more than
            # config['early_stop'] consecutive epochs.
            if early_stop_cnt > config['early_stop']:
                break

        print('Finished training after {} epochs'.format(epochs_done))
        return min_mse, loss_record

    @staticmethod
    def get_predictions(tt_set, model, device):
        """Run ``model`` over a loader that yields inputs only.

        Returns:
            All batch outputs concatenated along dim 0 as a NumPy array.
        """
        model.eval()
        predictions = []
        with torch.no_grad():
            for x in tt_set:
                x = x.to(device)
                predictions.append(model(x).detach().cpu())
        return torch.cat(predictions, dim=0).numpy()

    @staticmethod
    def get_device():
        """Pick the best available device: CUDA, then Apple MPS, then CPU."""
        if torch.cuda.is_available():
            return torch.device("cuda")
        # Older torch builds have no torch.backends.mps attribute at all.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            return torch.device("mps")
        return torch.device("cpu")

    @staticmethod
    def plot_learning_curve(loss_record, title=''):
        """Plot the training and validation losses recorded by ``train``.

        Validation points are spread over the training-step axis using a
        stride of ``len(train) // len(validation)``.
        """
        train_losses = loss_record['train']
        validation_losses = loss_record['validation']
        total_steps = len(train_losses)
        x_1 = range(total_steps)

        figure(figsize=(6, 4))
        plt.plot(x_1, train_losses, c='tab:red', label='train')
        if validation_losses:
            # Clamp the stride to 1 and trim both sequences to a common length
            # so slicing and plotting cannot fail on uneven record sizes.
            stride = max(1, total_steps // len(validation_losses))
            x_2 = x_1[::stride][:len(validation_losses)]
            plt.plot(x_2, validation_losses[:len(x_2)], c='tab:cyan', label='validation')
        plt.ylim(0.0, 5.)
        plt.xlabel('Training steps')
        plt.ylabel('MSE loss')
        plt.title('Learning curve of {}'.format(title))
        plt.legend()
        plt.show()

    @staticmethod
    def plot_prediction(data_set, model, device, lim=35., predictions=None, targets=None):
        """Scatter predicted values against ground truth.

        If either ``predictions`` or ``targets`` is missing, both are
        recomputed by running ``model`` over ``data_set``.
        """
        if predictions is None or targets is None:
            model.eval()
            predictions, targets = [], []
            with torch.no_grad():
                for x, y in data_set:
                    x, y = x.to(device), y.to(device)
                    predictions.append(model(x).detach().cpu())
                    targets.append(y.detach().cpu())
            predictions = torch.cat(predictions, dim=0).numpy()
            targets = torch.cat(targets, dim=0).numpy()

        figure(figsize=(5, 5))
        plt.scatter(targets, predictions, c='r', alpha=0.5)
        # Diagonal reference line: points on it are perfect predictions.
        plt.plot([-0.2, lim], [-0.2, lim], c='b')
        plt.xlim(-0.2, lim)
        plt.ylim(-0.2, lim)
        plt.xlabel('ground truth value')
        plt.ylabel('predicted value')
        plt.title('Ground Truth v.s. Prediction')
        plt.show()

    @staticmethod
    def prep_dataloader(path, mode, batch_size, n_jobs=0, indexes=None):
        """Build a ``CovidDataLoader`` dataset and wrap it in a DataLoader.

        Batches are shuffled only when ``mode == 'train'``.
        """
        dataset = CovidDataLoader(path, mode=mode, indexes=indexes)
        return DataLoader(dataset, batch_size,
                          shuffle=(mode == 'train'), drop_last=False,
                          num_workers=n_jobs, pin_memory=True)
import csv

import numpy as np
import torch
from torch.utils.data import Dataset


class CovidDataLoader(Dataset):
    """Dataset backed by a CSV file with a header row and a leading id column.

    Despite its name this is a ``torch.utils.data.Dataset``, not a DataLoader.
    """

    def __init__(self, path: str, mode: str, indexes=None):
        """Read the CSV at ``path`` and keep the selected feature columns.

        Args:
            path: CSV file to read.
            mode: ``'train'`` or ``'dev'`` makes items ``(features, target)``
                pairs; any other value makes items features only.
            indexes: Column positions to keep, counted after the first CSV
                column has been dropped. ``None`` keeps every feature column:
                all but the last in train/dev mode, all columns otherwise.
        """
        self.mode = mode
        # newline='' is what the csv module recommends for files it reads.
        with open(path, 'r', newline='') as fp:
            rows = list(csv.reader(fp))
        # Drop the header row and the first column.
        data = np.array(rows[1:])[:, 1:].astype(float)

        if indexes is None:
            # Indexing with None would insert a new axis instead of selecting
            # columns, so expand it to an explicit list of positions.
            has_target = mode in ('train', 'dev')
            n_features = data.shape[1] - 1 if has_target else data.shape[1]
            indexes = list(range(n_features))

        features = data[:, indexes]
        # The last column is always stored as the target, but __getitem__ only
        # returns it in train/dev mode.
        target = data[:, -1]
        self.features = torch.tensor(features, dtype=torch.float32)
        self.target = torch.tensor(target, dtype=torch.float32)

        # Standardise the columns from position 40 onward. The position refers
        # to the selected-feature matrix, not the CSV; with 40 or fewer columns
        # there is nothing to normalise.
        # NOTE(review): presumably the first 40 columns are meant to be left
        # unscaled indicator features; confirm against the dataset.
        if self.features.shape[1] > 40:
            tail = self.features[:, 40:]
            self.features[:, 40:] = (tail - tail.mean(dim=0, keepdim=True)) / \
                tail.std(dim=0, keepdim=True)

        self.dim = self.features.shape[1]
        print(
            f'Finished reading the {mode} set of COVID19 Dataset ({len(self.features)} samples found, each dim = {self.dim})')

    def __getitem__(self, index):
        """Return one sample: ``(features, target)`` in train/dev mode, else features."""
        if self.mode in ['train', 'dev']:
            return self.features[index], self.target[index]
        return self.features[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)
from dataclasses import dataclass, field

import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression


@dataclass
class FeatureSelector:
    """Rank the feature columns of a CSV by their univariate F-score.

    ``feature_start_index`` / ``feature_end_index`` delimit the feature columns
    as a half-open positional slice of the CSV columns; ``target_index`` is the
    position of the target column; ``top_selected`` is how many features to keep.
    """

    path: str = ""
    target_index: int = 0
    feature_start_index: int = 0
    feature_end_index: int = 0
    top_selected: int = 0
    data: pd.DataFrame = field(default_factory=pd.DataFrame)
    x: pd.DataFrame = field(default_factory=pd.DataFrame)
    # Annotated as a DataFrame, but init() stores a single column (a Series).
    y: pd.DataFrame = field(default_factory=pd.DataFrame)

    def init(self):
        """Load the CSV and split it into the feature frame and target column."""
        self.data = pd.read_csv(self.path)
        columns = self.data.columns
        self.x = self.data[columns[self.feature_start_index:self.feature_end_index]]
        self.y = self.data[columns[self.target_index]]

    def min_max_scaling(self):
        """Rescale every feature column to the [0, 1] range."""
        col_min = self.x.min()
        col_range = self.x.max() - col_min
        # A constant column has zero range; dividing by it would produce NaN,
        # which f_regression rejects. Dividing by 1 maps such columns to 0.
        col_range = col_range.replace(0, 1)
        self.x = (self.x - col_min) / col_range

    def run(self):
        """Score all features and return the positions of the best ones.

        Returns:
            Positions of the ``top_selected`` highest-scoring features, ordered
            by descending score. Positions are relative to the feature slice
            (``feature_start_index`` is position 0), not to the CSV columns.
        """
        self.init()
        self.min_max_scaling()
        selector = SelectKBest(score_func=f_regression, k=self.top_selected)
        fit = selector.fit(self.x, self.y)

        feature_scores = pd.DataFrame({
            'Specs': list(self.x.columns),
            'Score': fit.scores_,
        })
        top_features = feature_scores.nlargest(self.top_selected, 'Score')
        print(top_features)  # show the top_selected best features
        return top_features.index.tolist()
import torch
import torch.nn as nn
from typing import List


class NeuralNet(nn.Module):
    """Fully connected regression network producing one scalar per sample.

    Each hidden layer is Linear -> BatchNorm1d -> Dropout(0.2) -> LeakyReLU,
    followed by a final Linear layer with a single output unit.
    """

    def __init__(self, input_dim: int, hidden_layers: List[int]):
        """Build the network.

        Args:
            input_dim: Number of input features.
            hidden_layers: Width of each hidden layer, in order. An empty list
                yields a plain linear model.
        """
        super().__init__()

        layers = []
        in_features = input_dim
        # One loop covers both the input layer and the hidden-to-hidden layers;
        # the module order, and therefore the state_dict keys, stay the same.
        for width in hidden_layers:
            layers.extend([
                nn.Linear(in_features, width),
                nn.BatchNorm1d(width),
                nn.Dropout(p=0.2),
                nn.LeakyReLU(),
            ])
            in_features = width
        layers.append(nn.Linear(in_features, 1))

        self.net = nn.Sequential(*layers)
        self.criterion = nn.MSELoss(reduction='mean')

    def forward(self, x):
        """Return the network output with the trailing unit dimension removed."""
        return self.net(x).squeeze(1)

    def cal_loss(self, prediction, target):
        """Return the mean squared error between ``prediction`` and ``target``."""
        return self.criterion(prediction, target)

    def cal_loss_l2(self, prediction, target, alpha):
        """Return the MSE plus ``alpha`` times the sum of squared parameters.

        The penalty covers every tensor yielded by ``self.parameters()``, so
        biases and batch-norm parameters are included as well as weights.
        """
        main_loss = self.criterion(prediction, target)
        l2_reg = torch.tensor(0., device=prediction.device)
        for param in self.parameters():
            # Out-of-place accumulation keeps the autograd graph simple.
            l2_reg = l2_reg + torch.sum(param ** 2)
        return main_loss + alpha * l2_reg
import torch

from kaggle.covid.covid_taks_util import CovidTaskUtil
from kaggle.covid.feature_seletor import FeatureSelector
from kaggle.covid.models.covid_nn import NeuralNet

# Input data locations.
train_path = "/Users/wunan/Desktop/kaggledata/covid/train.csv"
test_path = "/Users/wunan/Desktop/kaggledata/covid/test.csv"

# Step 1: rank the features and keep the 15 best ones.
feature_selector = FeatureSelector(
    path=train_path,
    target_index=94,
    feature_end_index=94,
    feature_start_index=1,
    top_selected=15,
)
selected_indexes = feature_selector.run()
print("Selected indexes from run():", selected_indexes)

device = CovidTaskUtil.get_device()

# Step 2: training configuration.
config = {
    # Early stopping ends training first, so a large cap is harmless.
    'n_epochs': 10000,
    # Hand-tuned batch size.
    'batch_size': 200,
    # Name of the torch.optim class to use.
    'optimizer': 'Adam',
    # Left empty to use the optimizer's defaults; options such as lr,
    # momentum or weight_decay would be set here.
    'optim_hparas': {},
    # Generous patience, since the final run trains on all of the data.
    'early_stop': 500,
    'save_path': '/Users/wunan/PycharmProjects/mlstudy/kaggle/covid/models/model.pth'
}

# Step 3: data loaders. The validation loader reads the same file in the same
# mode as the training loader, so validation is performed on the training data.
batch_size = config['batch_size']
train_set = CovidTaskUtil.prep_dataloader(
    train_path, 'train', batch_size, indexes=selected_indexes)
validation_set = CovidTaskUtil.prep_dataloader(
    train_path, 'train', batch_size, indexes=selected_indexes)
test_set = CovidTaskUtil.prep_dataloader(
    test_path, 'test', batch_size, indexes=selected_indexes)

# Step 4: build and train the model, then plot the learning curve.
model = NeuralNet(validation_set.dataset.dim, [518, 256, 128, 64]).to(device)
model_loss, model_loss_record = CovidTaskUtil.train(
    train_set, validation_set, model, config, device)
CovidTaskUtil.plot_learning_curve(model_loss_record, title='deep model')

# Step 5: discard the last-epoch weights, reload the best checkpoint and
# plot its predictions against the ground truth.
del model
model = NeuralNet(train_set.dataset.dim, [518, 256, 128, 64]).to(device)
ckpt = torch.load(config['save_path'], map_location='cpu')
model.load_state_dict(ckpt)
CovidTaskUtil.plot_prediction(train_set, model, device)
https://www.kaggle.com/code/lemontreeyc/hw1-public-strong-baseline/notebook
谢谢!

浙公网安备 33010602011771号