Wide & Deep on the Adult dataset


1 Data loading and cleaning

train_df = pd.read_csv(...);  test_df = pd.read_csv(...)   # CSV → DataFrame
train_df = _clean_data(train_df)                           # drop "?" rows, strip whitespace (toy illustration below)
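
A toy illustration of the cleaning step (not the project's _clean_data itself; the values are made up):

import numpy as np
import pandas as pd

df = pd.DataFrame({"workclass": [" Private", "?", " State-gov"], "age": [39, 50, 38]})
df = df.replace("?", np.nan).dropna()          # rows whose value is the "?" marker are removed
df["workclass"] = df["workclass"].str.strip()  # leading/trailing spaces from the raw file are removed
print(df)                                      # two rows remain, values stripped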

2 Feature engineering (AdultDataProcessor)

Step | Code snippet | Notes
Continuous columns | continuous_columns = [...] | fed to the Deep part as raw numeric input
Categorical columns | categorical_columns = [...] | later encoded with LabelEncoder
Age bucketing | _create_age_buckets() | continuous → discrete, prepared for Wide crosses
Cross list | crossed_columns | produces hashed cross features
LabelEncoder fit | le.fit(combined_values) | yields the vocabulary size vocab_size per field
StandardScaler fit | scaler.fit(train_df[continuous_columns]) | stores the mean μ / variance σ² (a minimal fitting sketch follows the table)
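
A minimal self-contained sketch (toy data, not the project code) of what the two fits yield:

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

df = pd.DataFrame({"education": ["Bachelors", "HS-grad", "Bachelors", "Masters"],
                   "age": [25.0, 38.0, 28.0, 44.0]})

le = LabelEncoder().fit(df["education"])
print(len(le.classes_))                 # vocab_size = 3 distinct categories
scaler = StandardScaler().fit(df[["age"]])
print(scaler.mean_, scaler.scale_)      # per-column mean μ and std σ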

3 _transform_single(): turning the DataFrame into three kinds of tensors

Output key | Meaning | How it is produced
deep_sparse_features | dict of integer word indices | label_enc.transform(col) + 1, reserving 0 for padding
deep_dense_features | standardized continuous values | (x-μ)/σ via scaler.transform
wide_features | concatenated dense one-hot vectors | (1) one-hot encode each categorical column; (2) hash each cross into 10k buckets, then one-hot

The final wide_features.shape is n_samples × wide_dim; a rough size check follows.
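
A rough size check (the vocabulary sizes below are illustrative, taken from the smoke-test configuration further down plus 11 age buckets; the real values depend on the data):

vocab_sizes = [8, 16, 7, 14, 6, 5, 2, 41, 11]   # 8 raw categorical columns + age_buckets
n_crosses, hash_bucket_size = 3, 10000
wide_dim = sum(vocab_sizes) + n_crosses * hash_bucket_size
print(wide_dim)                                  # 110 + 30000 = 30110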


4 Dataset & DataLoader

AdultDataset.__getitem__ packages the three feature blocks above plus the label into a dict for the training loop; PyTorch's default collate then stacks these dicts into batches, as the demo below shows.
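
PyTorch's default collate function stacks dict-shaped samples (including the nested sparse-feature dict) field by field. A tiny self-contained demo with made-up shapes (ToyDataset is illustrative, not part of the project):

import torch
from torch.utils.data import DataLoader, Dataset

class ToyDataset(Dataset):
    """Mimics AdultDataset.__getitem__: one dict per sample."""
    def __len__(self):
        return 8
    def __getitem__(self, i):
        return {
            "wide_input": torch.zeros(5),
            "deep_sparse_inputs": {"education": torch.tensor(i, dtype=torch.long)},
            "deep_dense_inputs": torch.zeros(3),
            "labels": torch.tensor(float(i % 2)),
        }

batch = next(iter(DataLoader(ToyDataset(), batch_size=4)))
print(batch["wide_input"].shape)                       # torch.Size([4, 5])
print(batch["deep_sparse_inputs"]["education"].shape)  # torch.Size([4])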


5 Model structure: key WideDeep modules

Branch | Key layer | Input | Role
Wide | wide_linear = nn.Linear(wide_dim, 1) | dense one-hot vector | explicit memorization weights
Deep | one nn.Embedding per field | integer word indices (0 reserved for padding) | table lookup into dense vectors
Deep | deep_mlp = [Linear+ReLU+Dropout]×k + output layer | concat(embeddings + continuous features) | implicit non-linear generalization (embedding widths illustrated below)
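
The model code below picks each field's embedding width adaptively as emb_dim = min(64, int(vocab_size ** 0.25 * 6)); a quick check of what that yields for a few vocabulary sizes:

for vocab_size in (3, 17, 42, 10000):
    emb_dim = min(64, int(vocab_size ** 0.25 * 6))
    print(vocab_size, emb_dim)   # 3 -> 7, 17 -> 12, 42 -> 15, 10000 -> 60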

6 Forward pass

  1. Wide: wide_logit = wide_linear(wide_input)
  2. Deep: embedding lookup → concatenate with the dense features → MLP → deep_logit
  3. Fusion: logits = wide_logit + deep_logit (a toy numeric check follows)
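
A toy numeric check of the fusion with made-up logits (the model returns raw logits; probabilities come from a sigmoid, which is why training later uses BCEWithLogitsLoss):

import torch

wide_logit = torch.tensor([[0.7]])
deep_logit = torch.tensor([[-0.2]])
logits = wide_logit + deep_logit
print(logits, torch.sigmoid(logits))   # 0.5 -> probability ≈ 0.6225 of income > 50K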

7 How do the gradients update?

Parameter | Gradient source | Notes
Wide weights W_wide | only the active one-hot dimensions; gradient = ∂Loss/∂logit × 1 | equivalent to a plain logistic regression, no embeddings involved
Embedding tables | only the rows hit by the current batch receive gradient | with padding_idx=0 the padding row would stay zero and never update (the code below passes padding_idx=None, so index 0 is trainable); see the small check after this table
MLP weights | ordinary backprop through the fully connected layers | Xavier initialization
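
A small self-contained check (not part of the project code) that only the embedding rows indexed by a batch receive gradient, and that a padding_idx row stays zero:

import torch
import torch.nn as nn

emb = nn.Embedding(num_embeddings=5, embedding_dim=3, padding_idx=0)
idx = torch.tensor([1, 1, 3])          # the batch hits rows 1 and 3 only
emb(idx).sum().backward()
print(emb.weight.grad)                  # rows 1 and 3 are non-zero; rows 0, 2, 4 remain zero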

8 Loss and optimizer

The loss is BCEWithLogitsLoss and the optimizer is Adam; once loss.backward() is called, every trainable parameter (Wide weights, embedding tables, MLP) receives its gradient and optimizer.step() updates them jointly, as sketched below.
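
In isolation, the pattern looks like this (a dummy linear model and random tensors, not the project code; the real loop is in run_adult.py below):

import torch
import torch.nn as nn

model = nn.Linear(4, 1)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

x = torch.randn(8, 4)
y = torch.randint(0, 2, (8,)).float()          # labels must be float for BCEWithLogitsLoss
loss = criterion(model(x).squeeze(1), y)       # raw logits; the sigmoid is applied inside the loss
optimizer.zero_grad()
loss.backward()
optimizer.step()
print(loss.item())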


9 Flow chart (text version)

CSV → DataFrame
   │
   ├─ cleaning, age bucketing
   ├─ LabelEncoder.fit  → index 1…V
   ├─ StandardScaler.fit→ μ,σ
   │
_transform_single
   ├─ deep_sparse_features (Long idx +1)
   ├─ deep_dense_features  (Float z-score)
   └─ wide_features        (dense one-hot, hash)
   │
Dataset / DataLoader → batch dict
   │
WideDeep.forward
   ├─ wide_logit = Linear(wide_features)
   ├─ emb = Embedding(idx); concat dense
   ├─ deep_logit = MLP(concat)
   └─ logits = wide_logit + deep_logit
   │
loss = BCEWithLogits
optimizer.step()  → update Wide weights + Embeddings + MLP

10 Wide & Deep model code

# D:\\科研\\搜广推\\实习项目\\代码库\\wide_deep\\model_widedeep.py

from typing import Dict, List, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F


class WideDeep(nn.Module):
    """
    Wide & Deep 推荐模型实现
    WideDeep 类继承自 nn.Module(PyTorch的基类)
    Architecture:
    - Wide部分: 线性变换处理稀疏特征交叉
    - Deep部分: 深度神经网络处理稠密嵌入特征
    - 联合训练: wide_logit + deep_logit
    """

    def __init__(
        self,
        wide_dim: int,
        deep_field_dims: Dict[str, int],
        deep_dense_dim: int = 0,
        deep_embedding_dim: int = None,
        deep_hidden_dims: List[int] = None,
        dropout_rate: float = 0.2,
        activation: str = "relu",
    ):
        """
        Args:
            wide_dim: Wide部分输入维度(稀疏特征维度)
            deep_field_dims: Deep部分各离散字段的词汇表大小 {"field_name": vocab_size}
            deep_embedding_dim: 嵌入维度计算函数,若为None则使用自适应策略
            deep_hidden_dims: MLP隐层维度列表,默认[256, 128]
            deep_dense_dim: 连续特征维度
            dropout_rate: Dropout比例
            activation: 激活函数类型
        """
        super(WideDeep, self).__init__()
        # super().__init__() makes sure nn.Module's constructor runs so parameters and submodules are registered

        # === Wide part ===
        self.wide_dim = wide_dim
        self.wide_linear = nn.Linear(wide_dim, 1, bias=True)

        # === Deep part ===
        self.deep_field_dims = deep_field_dims
        self.deep_dense_dim = deep_dense_dim
        self.dropout_rate = dropout_rate

        # default hidden-layer sizes
        if deep_hidden_dims is None:
            deep_hidden_dims = [256, 128]
        self.deep_hidden_dims = deep_hidden_dims

        # build one embedding table per categorical field
        self.embeddings = nn.ModuleDict()
        total_embedding_dim = 0

        for field_name, vocab_size in deep_field_dims.items():
            # adaptive embedding dimension: emb_dim = min(64, int(vocab_size ** 0.25 * 6))
            if deep_embedding_dim is None:
                emb_dim = min(64, int(vocab_size**0.25 * 6))
            else:
                emb_dim = deep_embedding_dim

            self.embeddings[field_name] = nn.Embedding(
                num_embeddings=vocab_size,
                embedding_dim=emb_dim,
                padding_idx=None,  # padding_idx=None here; with padding_idx=0, the index-0 row would stay all-zero and receive no gradient
            )
            total_embedding_dim += emb_dim

        # MLP input dim = concatenated embeddings + continuous features
        mlp_input_dim = total_embedding_dim + deep_dense_dim

        # build the MLP
        mlp_layers = []
        prev_dim = mlp_input_dim

        for hidden_dim in deep_hidden_dims:
            mlp_layers.extend(
                [
                    nn.Linear(prev_dim, hidden_dim),
                    self._get_activation(activation),
                    nn.Dropout(dropout_rate),
                ]
            )
            prev_dim = hidden_dim
        # mlp_layers now holds the hidden stack; append the output layer so the full MLP maps
        # the concatenated (embedding + continuous) vector to a single logit
        mlp_layers.append(nn.Linear(prev_dim, 1))

        self.deep_mlp = nn.Sequential(*mlp_layers)

        # weight initialization
        self._init_weights()

    def _get_activation(self, activation: str) -> nn.Module:
        """获取激活函数"""
        if activation.lower() == "relu":
            return nn.ReLU()
        elif activation.lower() == "gelu":
            return nn.GELU()
        elif activation.lower() == "tanh":
            return nn.Tanh()
        else:
            return nn.ReLU()

    def _init_weights(self):
        """Weight initialization. .modules() iterates over all submodules recursively (including those inside nn.Sequential and nn.ModuleDict)."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                """xavier_uniform_: uniform in [-a, a] with mean 0, chosen so each layer's output variance matches its input variance"""
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    """zero bias so no offset is introduced at the start, which is more stable"""
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                nn.init.xavier_uniform_(module.weight)

    def forward(
        self,
        wide_input: torch.Tensor,
        deep_sparse_inputs: Dict[str, torch.Tensor],
        deep_dense_inputs: torch.Tensor = None,
    ) -> torch.Tensor:
        """
        前向传播

        Args:
            wide_input: Wide部分输入 [batch_size, wide_dim]
            deep_sparse_inputs: Deep部分离散输入 {"field_name": [batch_size]}
            deep_dense_inputs: Deep部分连续输入 [batch_size, dense_dim]

        Returns:
            logits: 预测logits [batch_size, 1]
        """
        batch_size = wide_input.size(0)

        # === Wide part forward ===
        wide_logit = self.wide_linear(wide_input)  # [batch_size, 1]

        # === Deep part forward ===
        # 1. embed the sparse features
        deep_embeddings = []
        for field_name, field_input in deep_sparse_inputs.items():
            if field_name in self.embeddings:
                emb = self.embeddings[field_name](field_input)  # [batch_size, emb_dim]
                deep_embeddings.append(emb)

        # 2. concatenate the per-field embeddings
        if deep_embeddings:
            deep_sparse_concat = torch.cat(
                deep_embeddings, dim=1
            )  # [batch_size, total_emb_dim]
        else:
            deep_sparse_concat = torch.empty(batch_size, 0, device=wide_input.device)

        # 3. append the continuous features
        if deep_dense_inputs is not None and deep_dense_inputs.numel() > 0:
            deep_input = torch.cat([deep_sparse_concat, deep_dense_inputs], dim=1)
        else:
            deep_input = deep_sparse_concat

        # 4. run the MLP
        deep_logit = self.deep_mlp(deep_input)  # [batch_size, 1]

        # === fuse the two branches ===
        logits = wide_logit + deep_logit  # [batch_size, 1]

        return logits

    def get_embeddings_weight(self, field_name: str) -> torch.Tensor:
        """获取指定字段的嵌入权重"""
        if field_name in self.embeddings:
            return self.embeddings[field_name].weight
        else:
            raise KeyError(f"字段 '{field_name}' 不存在于嵌入层中")

    def get_model_size(self) -> Dict[str, int]:
        """获取模型参数统计"""
        total_params = sum(p.numel() for p in self.parameters())
        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)

        wide_params = sum(p.numel() for p in self.wide_linear.parameters())
        deep_params = total_params - wide_params

        return {
            "total_parameters": total_params,
            "trainable_parameters": trainable_params,
            "wide_parameters": wide_params,
            "deep_parameters": deep_params,
        }


def build_feature_sizes(
    categorical_columns: List[str],
    feature_vocab_sizes: Dict[str, int],
    crossed_columns: List[Tuple[str, ...]] = None,
    hash_bucket_size: int = 10000,
) -> Tuple[int, Dict[str, int]]:
    """
    构建特征维度信息,供主程序初始化模型

    Args:
        categorical_columns: 离散特征列名列表
        feature_vocab_sizes: 各特征的词汇表大小 {"feature_name": vocab_size}
        crossed_columns: 交叉特征组合列表 [("feat1", "feat2"), ...]
        hash_bucket_size: 哈希桶大小(用于交叉特征)

    Returns:
        wide_dim: Wide部分特征维度
        deep_field_dims: Deep部分各字段维度字典
    """
    # Deep part: use the raw categorical features directly
    deep_field_dims = {}
    for col in categorical_columns:
        if col in feature_vocab_sizes:
            # reserve index 0 for padding, hence vocab_size + 1
            deep_field_dims[col] = feature_vocab_sizes[col] + 1

    # Wide part: raw one-hot features + crossed features
    wide_dim = sum(feature_vocab_sizes.get(col, 0) for col in categorical_columns)

    # add the crossed-feature dimensions
    if crossed_columns:
        wide_dim += len(crossed_columns) * hash_bucket_size

    return wide_dim, deep_field_dims


if __name__ == "__main__":
    # smoke test for model construction
    print("=== Wide & Deep 模型测试 ===")

    # mock feature configuration
    feature_vocab_sizes = {
        "workclass": 8,
        "education": 16,
        "marital_status": 7,
        "occupation": 14,
        "relationship": 6,
        "race": 5,
        "sex": 2,
        "native_country": 41,
    }

    categorical_columns = list(feature_vocab_sizes.keys())
    crossed_columns = [("education", "occupation"), ("age_buckets", "workclass")]

    # build the model dimensions
    wide_dim, deep_field_dims = build_feature_sizes(
        categorical_columns=categorical_columns,
        feature_vocab_sizes=feature_vocab_sizes,
        crossed_columns=crossed_columns,
        hash_bucket_size=1000,
    )

    print(f"Wide维度: {wide_dim}")
    print(f"Deep字段维度: {deep_field_dims}")

    # create the model
    model = WideDeep(
        wide_dim=wide_dim,
        deep_field_dims=deep_field_dims,
        deep_dense_dim=6,  # 6 continuous features
        deep_hidden_dims=[256, 128],
        dropout_rate=0.2,
    )

    print(f"模型参数统计: {model.get_model_size()}")

    # test the forward pass
    batch_size = 32
    device = torch.device("cpu")
    model = model.to(device)

    # build random test inputs
    wide_input = torch.randn(batch_size, wide_dim).to(device)
    deep_sparse_inputs = {
        field: torch.randint(0, vocab_size, (batch_size,)).to(device)
        for field, vocab_size in deep_field_dims.items()
    }
    deep_dense_inputs = torch.randn(batch_size, 6).to(device)

    # forward pass (no gradients needed for the smoke test)
    with torch.no_grad():
        logits = model(wide_input, deep_sparse_inputs, deep_dense_inputs)
        print(f"输出logits形状: {logits.shape}")
        print(f"输出范围: [{logits.min():.4f}, {logits.max():.4f}]")

    print("模型测试完成!")

11 run_adult.py code


# D:\\科研\\搜广推\\实习项目\\代码库\\wide_deep\\run_adult.py

import argparse
import json
import os
import time
import warnings
from typing import Any, Dict, List, Tuple

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from model_widedeep import WideDeep, build_feature_sizes
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import DataLoader, Dataset
from torchmetrics import AUROC
from tqdm import tqdm


class AdultDataset(Dataset):
    """Adult数据集PyTorch Dataset封装"""

    def __init__(
        self,
        wide_features: np.ndarray,
        deep_sparse_features: Dict[
            str, np.ndarray
        ],  # 键是特征名(字符串),值是一维 np.ndarray
        deep_dense_features: np.ndarray,
        labels: np.ndarray,
    ):
        self.wide_features = torch.FloatTensor(wide_features)
        self.deep_sparse_features = {
            k: torch.LongTensor(v) for k, v in deep_sparse_features.items()
        }
        self.deep_dense_features = torch.FloatTensor(deep_dense_features)
        self.labels = torch.FloatTensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "wide_input": self.wide_features[idx],
            "deep_sparse_inputs": {
                k: v[idx] for k, v in self.deep_sparse_features.items()
            },
            "deep_dense_inputs": self.deep_dense_features[idx],
            "labels": self.labels[idx],
        }


class AdultDataProcessor:
    """Adult数据集预处理器"""

    def __init__(self, hash_bucket_size: int = 10000):
        self.hash_bucket_size = hash_bucket_size
        self.categorical_columns = [
            "workclass",
            "education",
            "marital_status",
            "occupation",
            "relationship",
            "race",
            "sex",
            "native_country",
            "age_buckets",
        ]
        self.continuous_columns = [
            "age",
            "fnlwgt",
            "education_num",
            "capital_gain",
            "capital_loss",
            "hours_per_week",
        ]
        self.crossed_columns = [
            ("education", "occupation"),
            ("age_buckets", "workclass"),
            ("native_country", "occupation"),
        ]

        # fitted encoders
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.feature_vocab_sizes = {}

        # age bucket boundaries
        self.age_buckets = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]

    def _clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """数据清洗"""
        # 去除缺失值标记“ ,”
        df = df.replace("?", np.nan)

        # 删除含有缺失值的行
        df = df.dropna()

        # 去除掉类别特征数据的空格,也就是.dtype= "object" 的列
        # 注意:这里假设所有类别特征都是字符串类型
        for col in df.columns:
            if df[col].dtype == "object":
                df[col] = df[col].str.strip()

        return df.reset_index(drop=True)

    def _create_age_buckets(self, ages: pd.Series) -> pd.Series:
        """创建年龄分桶特征,pd.cut最终返回每一一样本age对应的age_bucket,age_0, age_1"""
        return pd.cut(
            ages,
            bins=[0] + self.age_buckets + [100],
            labels=[f"age_{i}" for i in range(len(self.age_buckets) + 1)],
        )

    def _create_crossed_features(self, df: pd.DataFrame) -> Dict[str, np.ndarray]:
        """创建交叉特征"""
        crossed_features = {}

        for i, (col1, col2) in enumerate(self.crossed_columns):
            # build the cross string "value1_x_value2"; cross_values is a pandas Series aligned with df
            if col1 == "age_buckets":
                cross_values = (
                    df["age_buckets"].astype(str) + "_x_" + df[col2].astype(str)
                )
            else:
                cross_values = df[col1].astype(str) + "_x_" + df[col2].astype(str)

            # hash each cross string into a fixed number of buckets
            # caveat: Python's hash() on strings is salted per process, so bucket assignments
            # are only stable within one run; see the note after this method
            cross_hashed = cross_values.apply(lambda x: hash(x) % self.hash_bucket_size)
            # .values turns the Series into a numpy array; crossed_features maps cross names to
            # hashed codes, e.g. {'cross_0': array([1234, 5678, ...]), 'cross_1': array([2345, ...])}
            crossed_features[f"cross_{i}"] = cross_hashed.values

        return crossed_features
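
    # Caveat: because hash() for strings is salted per Python process (PYTHONHASHSEED), a model
    # trained in one process and used for prediction in another would see the same crosses land
    # in different buckets. A stable alternative could use a deterministic digest, for example
    # (sketch only, not wired into this class):
    #
    #     import hashlib
    #
    #     def _stable_hash(s: str, buckets: int) -> int:
    #         return int(hashlib.md5(s.encode("utf-8")).hexdigest(), 16) % buckets
    #
    #     cross_hashed = cross_values.apply(lambda x: _stable_hash(x, self.hash_bucket_size))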

    def fit_transform(
        self, train_df: pd.DataFrame, test_df: pd.DataFrame
    ) -> Tuple[Any, Any]:
        """拟合并转换训练和测试数据"""
        print("开始数据预处理...")

        # 数据清洗
        train_df = self._clean_data(train_df)
        test_df = self._clean_data(test_df)

        print(f"训练集大小: {len(train_df)}, 测试集大小: {len(test_df)}")

        # create the age-bucket column
        train_df["age_buckets"] = self._create_age_buckets(train_df["age"])
        test_df["age_buckets"] = self._create_age_buckets(test_df["age"])

        # build binary labels (1 if income > 50K)
        train_labels = (train_df["income"] == ">50K").astype(int).values
        test_labels = (test_df["income"] == ">50K").astype(int).values

        # fit one LabelEncoder per categorical column
        for col in self.categorical_columns:
            # LabelEncoder only cares about the set of categories (category ↔ integer)
            le = LabelEncoder()
            # fit on the union of train+test values so categories unseen in training are still covered
            combined_values = pd.concat([train_df[col], test_df[col]]).unique()
            le.fit(combined_values)
            self.label_encoders[col] = le
            self.feature_vocab_sizes[col] = len(le.classes_)

        # fit the scaler on the continuous features; StandardScaler cares about the value distribution (mean, variance)
        self.scaler.fit(train_df[self.continuous_columns])

        # transform both splits
        train_data = self._transform_single(train_df)
        test_data = self._transform_single(test_df)

        # attach the labels
        train_data["labels"] = train_labels
        test_data["labels"] = test_labels

        print("数据预处理完成!")
        print(f"Wide特征维度: {train_data['wide_features'].shape[1]}")
        print(f"Deep稀疏特征字段: {list(train_data['deep_sparse_features'].keys())}")
        print(f"Deep稠密特征维度: {train_data['deep_dense_features'].shape[1]}")

        return train_data, test_data

    def _transform_single(self, df: pd.DataFrame) -> Dict[str, np.ndarray]:
        """转换单个数据集"""
        n_samples = len(df)

        # === 处理Deep部分特征 ===
        # 1. 稀疏特征(用于Embedding,词索引)
        deep_sparse_features = {}
        for col in self.categorical_columns:
            # +1 为padding预留0位置?,此时deep_sparse_features对应的label_encoder为各自特征中的词表索引(int)
            deep_sparse_features[col] = self.label_encoders[col].transform(df[col]) + 1

        # 2. dense features: standardize the continuous columns with the fitted scaler
        deep_dense_features = self.scaler.transform(df[self.continuous_columns])

        # === Wide-part features (encoded as one-hot vectors) ===
        # the Wide part is essentially a linear model and works best on 0/1 sparse keys
        # (categorical and crossed features); continuous features either go to the Deep part
        # for non-linearity, or are bucketized first so Wide can learn "interval + cross" rules
        wide_features_list = []

        # 1. one-hot encode the raw categorical features
        for col in self.categorical_columns:
            encoded = self.label_encoders[col].transform(df[col])
            vocab_size = self.feature_vocab_sizes[col]
            # build an n_samples x vocab_size one-hot matrix for this feature
            onehot = np.zeros((n_samples, vocab_size))
            # set the encoded index of each sample to 1.0
            onehot[np.arange(n_samples), encoded] = 1.0
            wide_features_list.append(onehot)

        # 2. hash-encoded crossed features
        crossed_features = self._create_crossed_features(df)
        for cross_name, cross_values in crossed_features.items():
            # expand into a one-hot over the hash buckets (hash_bucket_size = 10000 columns)
            cross_onehot = np.zeros((n_samples, self.hash_bucket_size))
            cross_onehot[np.arange(n_samples), cross_values] = 1.0
            wide_features_list.append(cross_onehot)

        # concatenate all one-hot blocks (per-column one-hots + crossed one-hots) into one wide vector per sample
        wide_features = np.concatenate(wide_features_list, axis=1)

        return {
            "wide_features": wide_features,
            "deep_sparse_features": deep_sparse_features,
            "deep_dense_features": deep_dense_features,
        }


def load_adult_data(
    train_path: str, test_path: str
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """加载Adult数据集"""
    column_names = [
        "age",
        "workclass",
        "fnlwgt",
        "education",
        "education_num",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "capital_gain",
        "capital_loss",
        "hours_per_week",
        "native_country",
        "income",
    ]

    # load the training data; train_df is a 32561 x 15 DataFrame
    train_df = pd.read_csv(train_path, names=column_names, skipinitialspace=True)

    # load the test data (the adult.test file may start with a marker line "|1x3 Cross validator" instead of data)
    with open(test_path, "r") as f:
        first_line = f.readline().strip()
        skip_rows = 1 if "|1x3 Cross validator" in first_line else 0

    # test_df is a 16281 x 15 DataFrame
    test_df = pd.read_csv(
        test_path, names=column_names, skiprows=skip_rows, skipinitialspace=True
    )

    # strip the trailing period from the test-set labels (">50K." → ">50K")
    test_df["income"] = test_df["income"].str.rstrip(".")

    return train_df, test_df


def train_model(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    device: torch.device,
    epochs: int,
    learning_rate: float,
) -> Dict[str, List[float]]:
    """训练模型"""
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
    auroc = AUROC(task="binary")

    history = {"train_loss": [], "train_auc": [], "val_loss": [], "val_auc": []}

    for epoch in range(epochs):
        # === training phase ===
        model.train()
        train_loss = 0.0
        train_preds, train_targets = [], []

        train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]")
        for batch in train_pbar:
            # move the batch to the device
            wide_input = batch["wide_input"].to(device)
            deep_sparse_inputs = {
                k: v.to(device) for k, v in batch["deep_sparse_inputs"].items()
            }
            deep_dense_inputs = batch["deep_dense_inputs"].to(device)
            labels = batch["labels"].to(device)

            # forward pass
            optimizer.zero_grad()
            logits = model(wide_input, deep_sparse_inputs, deep_dense_inputs)
            loss = criterion(logits.squeeze(), labels)

            # backward pass
            loss.backward()
            optimizer.step()

            # running statistics
            train_loss += loss.item()
            train_preds.extend(torch.sigmoid(logits.squeeze()).cpu().detach().numpy())
            train_targets.extend(labels.cpu().numpy())

            train_pbar.set_postfix({"loss": f"{loss.item():.4f}"})

        # === validation phase ===
        model.eval()
        val_loss = 0.0
        val_preds, val_targets = [], []

        with torch.no_grad():
            val_pbar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} [Val]")
            for batch in val_pbar:
                wide_input = batch["wide_input"].to(device)
                deep_sparse_inputs = {
                    k: v.to(device) for k, v in batch["deep_sparse_inputs"].items()
                }
                deep_dense_inputs = batch["deep_dense_inputs"].to(device)
                labels = batch["labels"].to(device)

                logits = model(wide_input, deep_sparse_inputs, deep_dense_inputs)
                loss = criterion(logits.squeeze(), labels)

                val_loss += loss.item()
                val_preds.extend(torch.sigmoid(logits.squeeze()).cpu().numpy())
                val_targets.extend(labels.cpu().numpy())

                val_pbar.set_postfix({"loss": f"{loss.item():.4f}"})

        # epoch-level metrics
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)

        train_auc = auroc(
            torch.tensor(train_preds), torch.tensor(train_targets).long()
        ).item()
        val_auc = auroc(
            torch.tensor(val_preds), torch.tensor(val_targets).long()
        ).item()

        # record history
        history["train_loss"].append(train_loss)
        history["train_auc"].append(train_auc)
        history["val_loss"].append(val_loss)
        history["val_auc"].append(val_auc)

        # print results
        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"  Train - Loss: {train_loss:.4f}, AUC: {train_auc:.4f}")
        print(f"  Val   - Loss: {val_loss:.4f}, AUC: {val_auc:.4f}")
        print("-" * 50)

    return history


def predict_csv(
    model: nn.Module, csv_path: str, processor: AdultDataProcessor, device: torch.device
):
    """对CSV文件进行预测"""
    print(f"开始预测文件: {csv_path}")

    # 加载数据
    df = pd.read_csv(csv_path)

    # 数据预处理(假设格式与训练数据一致)
    processed_data = processor._transform_single(df)

    # 创建数据集
    dataset = AdultDataset(
        wide_features=processed_data["wide_features"],
        deep_sparse_features=processed_data["deep_sparse_features"],
        deep_dense_features=processed_data["deep_dense_features"],
        labels=np.zeros(len(df)),  # placeholder labels
    )

    dataloader = DataLoader(dataset, batch_size=1024, shuffle=False)

    # predict
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="预测中"):
            wide_input = batch["wide_input"].to(device)
            deep_sparse_inputs = {
                k: v.to(device) for k, v in batch["deep_sparse_inputs"].items()
            }
            deep_dense_inputs = batch["deep_dense_inputs"].to(device)

            logits = model(wide_input, deep_sparse_inputs, deep_dense_inputs)
            probs = torch.sigmoid(logits.squeeze()).cpu().numpy()
            predictions.extend(probs)

    # save the results
    df["pred"] = predictions
    output_path = csv_path.replace(".csv", "_with_pred.csv")
    df.to_csv(output_path, index=False)

    print(f"预测完成,结果保存至: {output_path}")


def main():
    parser = argparse.ArgumentParser(description="Wide & Deep 模型训练")
    parser.add_argument("--epochs", type=int, default=5, help="训练轮数")
    parser.add_argument("--batch_size", type=int, default=1024, help="批次大小")
    parser.add_argument("--lr", type=float, default=1e-3, help="学习率")
    parser.add_argument("--device", type=str, default="cuda", help="设备 (cuda/cpu)")
    parser.add_argument(
        "--save_dir", type=str, default="./checkpoints/", help="模型保存目录"
    )
    parser.add_argument("--predict_csv", type=str, default=None, help="预测CSV文件路径")

    args = parser.parse_args()

    # device configuration
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    print(f"使用设备: {device}")

    # dataset paths
    train_path = "D:\\科研\\搜广推\\实习项目\\数据集库\\adult\\adult.data"
    test_path = "D:\\科研\\搜广推\\实习项目\\数据集库\\adult\\adult.test"

    # create the checkpoint directory
    os.makedirs(args.save_dir, exist_ok=True)

    # === data loading and preprocessing ===
    print("=== 加载Adult数据集 ===")
    train_df, test_df = load_adult_data(train_path, test_path)

    processor = AdultDataProcessor(hash_bucket_size=10000)

    train_data, test_data = processor.fit_transform(train_df, test_df)

    # === build the model ===
    print("\n=== 构建Wide & Deep模型 ===")
    wide_dim, deep_field_dims = build_feature_sizes(
        categorical_columns=processor.categorical_columns,
        feature_vocab_sizes=processor.feature_vocab_sizes,
        crossed_columns=processor.crossed_columns,
        hash_bucket_size=processor.hash_bucket_size,
    )

    model = WideDeep(
        wide_dim=wide_dim,
        deep_field_dims=deep_field_dims,
        deep_dense_dim=len(processor.continuous_columns),
        deep_hidden_dims=[1024, 512, 256],
        dropout_rate=0.2,
    ).to(device)

    print(f"模型参数统计: {model.get_model_size()}")

    # === prepare the data loaders ===
    # split the training data further into train/validation
    train_size = int(0.8 * len(train_data["labels"]))
    indices = np.random.permutation(len(train_data["labels"]))
    train_indices, val_indices = indices[:train_size], indices[train_size:]

    # build the training set
    train_dataset = AdultDataset(
        wide_features=train_data["wide_features"][train_indices],
        deep_sparse_features={
            k: v[train_indices] for k, v in train_data["deep_sparse_features"].items()
        },
        deep_dense_features=train_data["deep_dense_features"][train_indices],
        labels=train_data["labels"][train_indices],
    )

    # build the validation set
    val_dataset = AdultDataset(
        wide_features=train_data["wide_features"][val_indices],
        deep_sparse_features={
            k: v[val_indices] for k, v in train_data["deep_sparse_features"].items()
        },
        deep_dense_features=train_data["deep_dense_features"][val_indices],
        labels=train_data["labels"][val_indices],
    )

    # build the test set
    test_dataset = AdultDataset(
        wide_features=test_data["wide_features"],
        deep_sparse_features=test_data["deep_sparse_features"],
        deep_dense_features=test_data["deep_dense_features"],
        labels=test_data["labels"],
    )

    # data loaders
    train_loader = DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0
    )
    val_loader = DataLoader(
        val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=0
    )
    test_loader = DataLoader(
        test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=0
    )

    print(f"训练集大小: {len(train_dataset)}")
    print(f"验证集大小: {len(val_dataset)}")
    print(f"测试集大小: {len(test_dataset)}")

    # === model training ===
    print(f"\n=== 开始训练 (Epochs: {args.epochs}, LR: {args.lr}) ===")
    start_time = time.time()

    history = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        device=device,
        epochs=args.epochs,
        learning_rate=args.lr,
    )

    training_time = time.time() - start_time
    print(f"训练完成,用时: {training_time:.2f}秒")

    # === evaluation on the test set ===
    print("\n=== 测试集评估 ===")
    model.eval()
    test_loss = 0.0
    test_preds, test_targets = [], []
    criterion = nn.BCEWithLogitsLoss()
    auroc = AUROC(task="binary")

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="测试评估"):
            wide_input = batch["wide_input"].to(device)
            deep_sparse_inputs = {
                k: v.to(device) for k, v in batch["deep_sparse_inputs"].items()
            }
            deep_dense_inputs = batch["deep_dense_inputs"].to(device)
            labels = batch["labels"].to(device)

            logits = model(wide_input, deep_sparse_inputs, deep_dense_inputs)
            loss = criterion(logits.squeeze(), labels)

            test_loss += loss.item()
            test_preds.extend(torch.sigmoid(logits.squeeze()).cpu().numpy())
            test_targets.extend(labels.cpu().numpy())

    test_loss /= len(test_loader)
    test_auc = auroc(torch.tensor(test_preds), torch.tensor(test_targets).long()).item()

    print(f"测试集结果 - Loss: {test_loss:.4f}, AUC: {test_auc:.4f}")

    # === save the model and metrics ===
    model_path = os.path.join(args.save_dir, "model.pt")
    metrics_path = os.path.join(args.save_dir, "metrics.json")

    # save the model checkpoint
    torch.save(
        {
            "model_state_dict": model.state_dict(),
            "model_config": {
                "wide_dim": wide_dim,
                "deep_field_dims": deep_field_dims,
                "deep_dense_dim": len(processor.continuous_columns),
                "deep_hidden_dims": [256, 128],
                "dropout_rate": 0.2,
            },
            "processor_config": {
                "feature_vocab_sizes": processor.feature_vocab_sizes,
                "categorical_columns": processor.categorical_columns,
                "continuous_columns": processor.continuous_columns,
                "crossed_columns": processor.crossed_columns,
                "hash_bucket_size": processor.hash_bucket_size,
            },
        },
        model_path,
    )

    # save the metrics
    metrics = {
        "training_history": history,
        "test_metrics": {"loss": test_loss, "auc": test_auc},
        "training_time": training_time,
        "model_parameters": model.get_model_size(),
        "hyperparameters": {
            "epochs": args.epochs,
            "batch_size": args.batch_size,
            "learning_rate": args.lr,
            "device": str(device),
        },
    }

    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2, ensure_ascii=False)

    print(f"模型已保存至: {model_path}")
    print(f"指标已保存至: {metrics_path}")

    # === CSV prediction mode ===
    if args.predict_csv:
        print(f"\n=== CSV预测模式 ===")
        predict_csv(model, args.predict_csv, processor, device)

    # === training summary ===
    print("\n" + "=" * 60)
    print("训练总结:")
    print(f"  最佳验证AUC: {max(history['val_auc']):.4f}")
    print(f"  最终测试AUC: {test_auc:.4f}")
    print(f"  训练时长: {training_time:.2f}秒")
    print(f"  模型参数量: {model.get_model_size()['total_parameters']:,}")
    print("=" * 60)


if __name__ == "__main__":
    main()


posted on 2025-07-27 14:24 by GRITJW