房价预测-California House Prices

使用K折交叉验证,最终 Private Score 为 0.23218,对于这种简单模型来说算是不错的成绩。

读取训练和测试数据,并将其特征合并,统一用于数据清洗。

%matplotlib inline
import pandas as pd
import torch
import time
from torch import nn
from d2l import torch as d2l
import numpy as np
def read_csv_and_integration(train_csv_name, test_csv_name, labels=None):
    """Load the train/test CSVs and stack their feature columns for joint cleaning.

    The training row holding the largest 'Lot' value is dropped before the
    label column is split off. The first column of each file is excluded from
    the returned features.

    Args:
        train_csv_name: Path of the training CSV.
        test_csv_name: Path of the test CSV.
        labels: Name of the target column in the training CSV. When ``None``
            nothing is split off and the returned labels are ``None``.

    Returns:
        A tuple ``(all_features, labels_predict)``: the row-wise concatenation
        of training and test features, and the target column of the training
        set (or ``None`` when ``labels`` is ``None``).

    Raises:
        FileNotFoundError: If either path does not exist.
        KeyError: If 'Lot' or ``labels`` is not a column of the training CSV.
    """
    train_file = pd.read_csv(train_csv_name)
    test_file = pd.read_csv(test_csv_name)
    # Remove the single training row with the largest lot size.
    train_file = train_file.drop(train_file['Lot'].idxmax())
    labels_predict = None
    if labels is not None:
        # Only touch the label column when one was actually requested;
        # indexing with None would raise KeyError.
        labels_predict = train_file[labels]
        train_file = train_file.drop(labels, axis=1)
    all_features = pd.concat((train_file.iloc[:, 1:], test_file.iloc[:, 1:]))
    return all_features, labels_predict
    
# Locations of the Kaggle CSV files and the name of the column to predict.
train_csv_name = '../../ycy_data/train.csv'
test_csv_name = '../../ycy_data/test.csv'
labels_name = 'Sold Price'
def try_gpu(i=0):  #@save
    """Return the device ``cuda:i`` if that GPU exists, otherwise the CPU device."""
    gpu_available = i + 1 <= torch.cuda.device_count()
    return torch.device(f'cuda:{i}' if gpu_available else 'cpu')

def try_all_gpus():  #@save
    """Return a list of all visible GPU devices, or ``[cpu]`` when there is none."""
    gpu_count = torch.cuda.device_count()
    if not gpu_count:
        return [torch.device('cpu')]
    return [torch.device(f'cuda:{index}') for index in range(gpu_count)]

# Probe device selection: the first GPU, index 10 (CPU unless at least 11 GPUs
# are visible), and the list of every visible GPU.
try_gpu(), try_gpu(10), try_all_gpus()
(device(type='cuda', index=0),
 device(type='cpu'),
 [device(type='cuda', index=0),
  device(type='cuda', index=1),
  device(type='cuda', index=2),
  device(type='cuda', index=3),
  device(type='cuda', index=4),
  device(type='cuda', index=5),
  device(type='cuda', index=6),
  device(type='cuda', index=7)])
# Raw copies of both CSV files.
train_file = pd.read_csv(train_csv_name)
test_file = pd.read_csv(test_csv_name)
# Combined feature table plus the training targets, used for all cleaning below.
all_features, labels_predict = read_csv_and_integration(
    train_csv_name, test_csv_name, labels_name)
# Separate copy of the test CSV; its 'Id' column is needed for the submission.
test_data = pd.read_csv(test_csv_name)

找出无法处理或者价值不高的列,将其drop。

# Free-text or very high-cardinality columns that are not used by the model.
drop_features = [
    'Address',
    'Summary',
    'Region',
    'Elementary School',
    'Middle School',
    'High School',
    'State',
    'City',
    'Parking features',
    'Appliances included',
]
all_features.drop(columns=drop_features, inplace=True)

将可用数据清洗。

# Clamp implausible construction years: anything before 1800 becomes 1800 and
# anything from 2022 onwards becomes 2020. Comparisons with NaN are False, so
# missing years stay missing here and are filled with the (truncated) mean
# afterwards. The previous element-wise lambda turned NaN into 2020, which
# made the mean fill a no-op.
year_built = all_features['Year built']
year_built = year_built.mask(year_built < 1800, 1800)
year_built = year_built.mask(year_built >= 2022, 2020)
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection, which does not update the frame under pandas Copy-on-Write.
all_features['Year built'] = year_built.fillna(int(year_built.mean()))
# Categorical text columns mapped to the number of leading characters kept
# when they are shortened by cut_features_str_normal.
features_list_dict = {
    'Heating': 5,
    'Cooling': 5,
    'Parking': 7,
    'Flooring': 8,
    'Heating features': 9,
    'Cooling features': 7,
    'Laundry features': 7,
    'Type': 7,
}
def cut_features_str_normal(features_list_dict,all_features):
    """Shorten and lower-case the given text columns in place.

    Args:
        features_list_dict: Mapping of column name to the number of leading
            characters to keep.
        all_features: DataFrame holding those columns; it is modified in place.

    Returns:
        The same ``all_features`` object.
    """
    for column, width in features_list_dict.items():
        # Missing entries become the placeholder '-' before truncation.
        text_column = all_features[column].fillna('-')
        all_features[column] = text_column.apply(
            lambda text, width=width: text[:width].lower())
    return all_features
# Shorten the categorical text columns so near-duplicate spellings collapse.
all_features = cut_features_str_normal(features_list_dict,all_features)

# Raise lot sizes below 435 to 435 (NaN is left untouched by the comparison).
all_features['Lot'] = all_features['Lot'].apply(lambda x: 435 if  x<435 else x)
# String entries in 'Bedrooms' are replaced by the number of comma-separated items.
all_features['Bedrooms']=all_features['Bedrooms'].apply(lambda x: x.count(',')+1 if isinstance (x,str) else x)
# Turn 'YYYY-MM-DD' date strings into a single number such as 20210829.0.
all_features['Listed On'] = all_features['Listed On'].apply(lambda x: float(x.replace('-',"")))
# Assign the filled column back: fillna(inplace=True) on a column selection does
# not update the frame under pandas Copy-on-Write, which would leave NaN values
# that crash the string replace below.
all_features['Last Sold On'] = all_features['Last Sold On'].fillna('1970-01-01')
all_features['Last Sold On'] = all_features['Last Sold On'].apply(lambda x: float(x.replace('-',"")))

# Numeric columns whose missing values and zeros are replaced by the column mean.
to_avg_features = [
    'Lot',
    'Bedrooms',
    'Bathrooms',
    'Full bathrooms',
    'Total interior livable area',
    'Total spaces',
    'Garage spaces',
    'Elementary School Score',
    'Elementary School Distance',
    'Middle School Score',
    'Middle School Distance',
    'High School Score',
    'High School Distance',
    'Tax assessed value',
    'Annual tax amount',
    'Listed Price',
    'Last Sold Price',
    'Zip',
]
def avg_data_na(features_list,all_features):
    """Replace missing values and zeros in the given columns with the column mean.

    For each column, NaN is first filled with the rounded column mean; every
    value equal to 0 is then replaced by the mean of the filled column.

    Args:
        features_list: Names of the columns to process.
        all_features: DataFrame holding those columns; it is modified in place.

    Returns:
        The same ``all_features`` object.
    """
    for name in features_list:
        filled = all_features[name].fillna(round(all_features[name].mean()))
        # Compute the mean once per column; evaluating it inside the lambda
        # would rescan the whole column for every single element.
        column_mean = filled.mean()
        all_features[name] = filled.apply(
            lambda value: column_mean if value == 0 else value)
    return all_features
# Fill missing values and zeros in every column listed in to_avg_features.
all_features = avg_data_na(to_avg_features,all_features)


数据清洗完毕之后,对其进行normalization。$$x \leftarrow \frac{x - \mu}{\sigma}.$$
`dummy_na=True` 将“na”(缺失值)视为有效的特征值,并为其创建指示符特征。
进行独热向量化。

# Columns standardized to zero mean and unit standard deviation.
normalization_features = to_avg_features + ['Last Sold On','Listed On','Year built']
all_features[normalization_features] = all_features[normalization_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# One-hot encode the remaining text columns, with an extra indicator column for
# missing values. dtype=float keeps the whole frame numeric: pandas >= 2.0
# emits bool dummies by default, which makes `.values` an object array that
# torch.tensor cannot convert.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)

至此,所有特征已清洗完毕,并已对其中的离散(字符串)特征进行了独热向量处理。


将数据 tensor化,进行下面的学习。

# Number of training rows. len() is used instead of Series.count() because
# count() skips NaN labels, which would put the train/test boundary in the
# wrong place.
n_train = len(labels_predict)
# The first n_train rows of the combined table are training data, the rest test data.
train_features = torch.tensor(all_features[:n_train].values,dtype=torch.float32,device=try_gpu())
test_features = torch.tensor(all_features[n_train:].values,dtype=torch.float32,device=try_gpu())
# Labels as a column vector so they match the (N, 1) network output.
train_labels = torch.tensor(labels_predict.values.reshape(-1,1),dtype=torch.float32,device=try_gpu())

损失函数使用均方误差损失:$$\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
l_n = \left( x_n - y_n \right)^2$$
`nn.MSELoss()` 默认(`reduction='mean'`)返回所有 \(l_n\) 的平均值。

# Mean squared error, averaged over all elements (the default reduction).
loss = nn.MSELoss(reduction='mean')
# Width of the network input layer = number of feature columns.
in_features = train_features.size(1)
def get_net():
    """Build a fresh MLP (in_features -> 128 -> 64 -> 8 -> 1, ReLU between layers).

    Returns:
        The network, moved to the device chosen by ``try_gpu()``.
    """
    widths = [in_features, 128, 64, 8]
    layers = []
    for fan_in, fan_out in zip(widths, widths[1:]):
        layers.append(nn.Linear(fan_in, fan_out))
        layers.append(nn.ReLU())
    # Final regression head producing one value per sample.
    layers.append(nn.Linear(widths[-1], 1))
    return nn.Sequential(*layers).to(device=try_gpu())

对于房价,就像股票价格一样,我们关心的是相对数量,而不是绝对数量。因此,我们更关心相对误差\(\frac{y - \hat{y}}{y}\),而不是绝对误差\(y - \hat{y}\)。例如,如果我们在俄亥俄州农村地区估计一栋房子的价格时,我们的预测偏差了10万美元,在那里一栋典型的房子的价值是12.5万美元,那么我们可能做得很糟糕。另一方面,如果我们在加州豪宅区的预测出现了这个数字的偏差,这可能是一个惊人的准确预测(在那里,房价均值超过400万美元)。

解决这个问题的一种方法是用价格预测的对数来衡量差异。事实上,这也是比赛中官方用来评价提交质量的误差指标。对于较小的 \(\delta\),\(|\log y - \log \hat{y}| \leq \delta\) 等价于 \(e^{-\delta} \leq \frac{\hat{y}}{y} \leq e^\delta\)。这使得预测价格的对数与真实标签价格的对数之间出现以下均方根误差:

\[\sqrt{\frac{1}{n}\sum_{i=1}^n\left(\log y_i -\log \hat{y}_i\right)^2}. \]

def log_rmse(net,features,labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Args:
        net: Model called as ``net(features)``.
        features: Input tensor for the model.
        labels: Target tensor compared against the predictions.

    Returns:
        The metric as a Python float.
    """
    # This is evaluation only, so skip autograd bookkeeping; otherwise a graph
    # over the whole dataset is built and thrown away on every call.
    with torch.no_grad():
        # Clamp predictions to at least 1 so the logarithm stays finite and >= 0.
        clipped_preds = torch.clamp(net(features), 1, float('inf'))
        rmse = torch.sqrt(loss(torch.log(clipped_preds),torch.log(labels)))
    return rmse.item()
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Train ``net`` with Adam on the MSE loss and record log-RMSE after each epoch.

    Args:
        net: Model to optimize in place.
        train_features: Training inputs.
        train_labels: Training targets.
        test_features: Held-out inputs (only used when ``test_labels`` is given).
        test_labels: Held-out targets, or ``None`` to skip held-out evaluation.
        num_epochs: Number of passes over the training data.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight-decay coefficient.
        batch_size: Minibatch size passed to ``d2l.load_array``.

    Returns:
        ``(train_ls, test_ls)``: per-epoch log-RMSE on the training data and on
        the held-out data (empty list when ``test_labels`` is ``None``).
    """
    # NOTE(review): the original author suspected an argument was missing in
    # this call; confirm against the d2l.load_array signature.
    batches = d2l.load_array((train_features, train_labels), batch_size,
                             is_train=True)
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate,
                                 weight_decay=weight_decay)
    train_history, test_history = [], []
    evaluate_holdout = test_labels is not None
    for _ in range(num_epochs):
        for batch_X, batch_y in batches:
            optimizer.zero_grad()
            batch_loss = loss(net(batch_X), batch_y)
            batch_loss.backward()
            optimizer.step()
        # Metrics are computed on the full sets once per epoch.
        train_history.append(log_rmse(net, train_features, train_labels))
        if evaluate_holdout:
            test_history.append(log_rmse(net, test_features, test_labels))
    return train_history, test_history
def get_k_fold_data(k, i, X, y):
    """Split ``X``/``y`` into ``k`` equal folds and return fold ``i`` as validation data.

    Each fold holds ``X.shape[0] // k`` consecutive rows; rows beyond
    ``k * fold_size`` are not part of any fold.

    Args:
        k: Number of folds (must be greater than 1).
        i: Index of the fold used for validation.
        X: Feature tensor, sliced along dimension 0.
        y: Label tensor, sliced along dimension 0.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``.
    """
    assert k > 1
    fold_size = X.shape[0] // k
    train_X_parts, train_y_parts = [], []
    for fold in range(k):
        window = slice(fold * fold_size, (fold + 1) * fold_size)
        if fold == i:
            X_valid, y_valid = X[window, :], y[window]
            continue
        train_X_parts.append(X[window, :])
        train_y_parts.append(y[window])
    # Join the remaining k - 1 folds, in order, into the training split.
    X_train = torch.cat(train_X_parts, 0)
    y_train = torch.cat(train_y_parts, 0)
    return X_train, y_train, X_valid, y_valid
# net_dict maps each fold's final validation log-RMSE (rounded to 4 decimals)
# to the network trained on that fold, so the best one can be picked later.
net_dict,dict_kets_to_list = {},[]
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    """Run k-fold cross-validation, checkpointing and recording every fold's model.

    For each fold a fresh network is trained, saved under ``./tem_pth/`` and
    stored in the module-level ``net_dict`` keyed by its rounded final
    validation log-RMSE. The learning curves of the first fold are plotted.

    Args:
        k: Number of folds.
        X_train: Full training feature tensor.
        y_train: Full training label tensor.
        num_epochs: Epochs per fold.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight-decay coefficient.
        batch_size: Minibatch size.

    Returns:
        ``(mean_train_log_rmse, mean_valid_log_rmse)`` over the ``k`` folds,
        using each fold's value after the last epoch.
    """
    import os

    checkpoint_dir = './tem_pth/'
    # torch.save does not create missing directories.
    os.makedirs(checkpoint_dir, exist_ok=True)
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)

        final_train = round(train_ls[-1], 4)
        final_valid = round(valid_ls[-1], 4)
        time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # NOTE(review): the file name contains ':' and spaces, which some file
        # systems (e.g. Windows) reject; kept unchanged for compatibility with
        # existing checkpoints.
        torch.save(net, checkpoint_dir + time_str + '----' + 'K-Flod:' + str(i + 1)
                   + '----' + 'train_ls:' + str(final_train)
                   + '----' + 'valid_ls:' + str(final_valid) + '.pth')

        # Folds with an identical rounded loss overwrite each other here.
        net_dict[final_valid] = net

        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'fold {i + 1}, train log rmse {float(train_ls[-1]):f}, '
              f'valid log rmse {float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k

进行K折交叉验证。

# Hyper-parameters for the cross-validation run.
k = 10
num_epochs = 500
lr = 0.01
weight_decay = 0.001
batch_size = 64
train_l, valid_l = k_fold(
    k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
      f'平均验证log rmse: {float(valid_l):f}')

# Hyper-parameters handed to train_and_pred below (only lr differs).
k = 10
num_epochs = 500
lr = 0.1
weight_decay = 0.001
batch_size = 64

def train_and_pred(train_features, test_feature, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Predict with the best cross-validated network and write ``submission.csv``.

    No training happens here: the network with the smallest recorded
    validation log-RMSE is taken from the module-level ``net_dict`` (filled by
    ``k_fold``). The training-related parameters are kept only for interface
    compatibility.

    Args:
        train_features: Unused.
        test_feature: Feature tensor of the test set to predict on.
        train_labels: Unused.
        test_data: DataFrame of the test CSV; gains a 'Sold Price' column and
            must contain an 'Id' column.
        num_epochs: Unused.
        lr: Unused.
        weight_decay: Unused.
        batch_size: Unused.

    Raises:
        ValueError: If ``net_dict`` is empty, i.e. ``k_fold`` has not run yet.
    """
    if not net_dict:
        raise ValueError('net_dict is empty; run k_fold before train_and_pred')
    best_valid_loss = min(net_dict)
    net = net_dict[best_valid_loss]
    print(best_valid_loss)

    # Apply the network to the test set passed in by the caller (the original
    # silently read the global `test_features` instead of this parameter).
    with torch.no_grad():
        preds = net(test_feature).detach().cpu().numpy()
    # Flatten to one value per row and export in the Kaggle submission format.
    test_data['Sold Price'] = preds.reshape(-1)
    submission = pd.concat([test_data['Id'], test_data['Sold Price']], axis=1)
    submission.to_csv('submission.csv', index=False)
    
# Predict on the test set with the best fold's network and write submission.csv.
train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size)
posted @ 2021-08-29 08:22  X-POWER  阅读(247)  评论(0编辑  收藏  举报