11.24每日总结
今天完成了大数据的测试。这一次没有使用 Hadoop,
而是直接用 Python 完成数据的导入、清洗、分析和可视化展示等操作,下面是所有的代码。
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
class LinearRegressionModel(nn.Module):
    """Linear regression expressed as a single ``nn.Linear`` layer.

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        # One affine layer mapping ``input_size`` features to one output value.
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)
def dump_load_demo(
    csv_path: str = 'C:\\Users\\admin\\Desktop\\大数据竞赛练习题\\MathorCup大数据竞赛练习题1\\data\\data\\tmdb_1000_predict.csv',
    model_path: str = "save/test.pth",
    num_epochs: int = 100,
) -> float:
    """Train, save, reload and evaluate a linear regression model.

    Reads the CSV, predicts ``vote_count`` from ``budget``, ``popularity``,
    ``revenue`` and ``runtime``, trains with mini-batch SGD, stores the
    model's ``state_dict`` on disk, reloads it into a fresh model and prints
    the mean squared error on a held-out 20% split.

    Args:
        csv_path: Path of the CSV file to read.
        model_path: File the trained ``state_dict`` is written to and then
            reloaded from. Its parent directory is created if missing.
        num_epochs: Number of passes over the training data.

    Returns:
        The mean squared error on the test split.

    Raises:
        ValueError: If no finite prediction is available for evaluation.
    """
    # Imported locally: ``os`` is not part of the file's top-level imports.
    import os

    feature_cols = ['budget', 'popularity', 'revenue', 'runtime']
    target_col = 'vote_count'

    # 1. Load the data.
    df = pd.read_csv(csv_path, sep=',', engine='python', header=0)
    # Drop rows with missing values up front: a single NaN in a batch would
    # turn the loss, and then every weight, into NaN during training.
    df = df.dropna(subset=feature_cols + [target_col])
    feature_arr = df[feature_cols].values
    tag_arr = df[target_col].values
    print(len(tag_arr))

    # 2. Split into train / test sets (target reshaped to a column vector).
    x_train, x_test, y_train, y_test = train_test_split(
        feature_arr, tag_arr[:, np.newaxis], random_state=350, test_size=0.2
    )

    # 3. Feature engineering: standardization.
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    # Reuse the statistics fitted on the training set; refitting on the test
    # set would scale it differently from the data the model was trained on.
    x_test = transfer.transform(x_test)

    # 4. Machine learning (linear regression).
    # 4.1 Model training.
    input_size = x_train.shape[1]
    model = LinearRegressionModel(input_size)
    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    # Explicit float32 conversion, independent of the NumPy arrays' dtypes.
    x_train_tensor = torch.as_tensor(x_train, dtype=torch.float32)
    y_train_tensor = torch.as_tensor(y_train, dtype=torch.float32)
    x_test_tensor = torch.as_tensor(x_test, dtype=torch.float32)

    train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    for _ in range(num_epochs):
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
    print("这个模型的偏置是:\n", model.linear.bias.item())

    # 4.2 Save the model (once), creating the target directory if needed.
    save_dir = os.path.dirname(model_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    torch.save(model.state_dict(), model_path)

    # 4.3 Load the saved weights into a fresh model instance.
    loaded_model = LinearRegressionModel(input_size)
    loaded_model.load_state_dict(torch.load(model_path))
    loaded_model.eval()

    # 5. Model evaluation.
    # 5.1 Predictions and error.
    with torch.no_grad():
        y_pre = loaded_model(x_test_tensor).numpy()
    print("预测值是:\n", y_pre)

    # y_test and y_pre are NumPy arrays (no ``dropna``); keep only the rows
    # where both the target and the prediction are finite.
    valid = np.isfinite(y_test).ravel() & np.isfinite(y_pre).ravel()
    if not valid.any():
        raise ValueError("no finite predictions available to evaluate")
    ret = mean_squared_error(y_test[valid], y_pre[valid])
    print("均方误差是:\n", ret)
    return ret
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    dump_load_demo()

浙公网安备 33010602011771号