import numpy as np
import struct
import pandas as pd
class Dataset:
def __init__(self, images, labels):
self.images = images
self.labels = labels
    # Fetch a single sample: dataset = Dataset(images, labels); dataset[index]
def __getitem__(self, index):
return self.images[index], self.labels[index]
    # Number of samples in the dataset
def __len__(self):
return len(self.images)
class DataLoaderIterator:
def __init__(self, dataloader):
self.dataloader = dataloader
self.cursor = 0
        self.indexs = list(range(self.dataloader.count_data))  # indices 0 .. count_data - 1
        if self.dataloader.shuffle:
            # shuffle the sample order once per epoch
            np.random.shuffle(self.indexs)
def __next__(self):
if self.cursor >= self.dataloader.count_data:
raise StopIteration()
batch_data = []
        remain = min(self.dataloader.batch_size, self.dataloader.count_data - self.cursor)  # last batch may be smaller
for n in range(remain):
index = self.indexs[self.cursor]
data = self.dataloader.dataset[index]
            # On the first sample, create one empty list per field (e.g. images, labels)
            if len(batch_data) == 0:
                batch_data = [[] for i in range(len(data))]
            # append each field of this sample to its corresponding list
            for field, item in enumerate(data):
                batch_data[field].append(item)
self.cursor += 1
        # merge each field once with np.vstack instead of concatenating per sample
for index in range(len(batch_data)):
batch_data[index] = np.vstack(batch_data[index])
return batch_data
class DataLoader:
    # shuffle: whether to randomize the sample order each epoch
def __init__(self, dataset, batch_size, shuffle):
self.dataset = dataset
self.shuffle = shuffle
self.count_data = len(dataset)
self.batch_size = batch_size
def __iter__(self):
return DataLoaderIterator(self)
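# Illustrative sketch (a hypothetical helper, not called anywhere below):
# how Dataset and DataLoader compose. The shapes and sizes are made up.
def _demo_dataloader():
    images = np.random.rand(10, 4)  # 10 samples, 4 features each
    labels = np.random.rand(10, 2)  # 10 label rows
    loader = DataLoader(Dataset(images, labels), batch_size=4, shuffle=True)
    for batch_images, batch_labels in loader:
        # prints (4, 4) (4, 2) twice, then (2, 4) (2, 2) for the remainder
        print(batch_images.shape, batch_labels.shape)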
class Module:
def __init__(self, name):
self.name = name
self.train_mode = False
def __call__(self, *args):
return self.forward(*args)
def train(self):
self.train_mode = True
for m in self.modules():
m.train()
def eval(self):
self.train_mode = False
for m in self.modules():
m.eval()
def modules(self):
ms = []
for attr in self.__dict__:
m = self.__dict__[attr]
if isinstance(m, Module):
ms.append(m)
return ms
def params(self):
ps = []
for attr in self.__dict__:
p = self.__dict__[attr]
if isinstance(p, Parameter):
ps.append(p)
ms = self.modules()
for m in ms:
ps.extend(m.params())
return ps
def info(self, n):
ms = self.modules()
output = f"{self.name}\n"
for m in ms:
output += (' ' * (n + 1)) + f"{m.info(n + 1)}\n"
return output[:-1]
def __repr__(self):
return self.info(0)
class Initializer:
def __init__(self, name):
self.name = name
def __call__(self, *args):
return self.apply(*args)
class GaussInitializer(Initializer):
    # mu is the mean and sigma the standard deviation of the normal
    # distribution; sigma ** 2 is the variance.
def __init__(self, mu, sigma):
self.mu = mu
self.sigma = sigma
def apply(self, value):
value[...] = np.random.normal(self.mu, self.sigma, value.shape)
class Parameter:
def __init__(self, value):
self.value = value
self.delta = np.zeros(value.shape)
def zero_grad(self):
self.delta[...] = 0
class Linear(Module):
def __init__(self, input_feature, output_feature):
super().__init__("Linear")
self.input_feature = input_feature
self.output_feature = output_feature
self.weights = Parameter(np.zeros((input_feature, output_feature)))
self.bias = Parameter(np.zeros((1, output_feature)))
        # Kaiming-style weight init for ReLU nets: zero mean, std = sqrt(2 / fan_in)
        initer = GaussInitializer(0, np.sqrt(2 / input_feature))
initer.apply(self.weights.value)
def forward(self, x):
self.x_save = x.copy()
return x @ self.weights.value + self.bias.value
    # For C = A @ B with upstream gradient G:
    #   dB = A.T @ G
    #   dA = G @ B.T
def backward(self, G):
self.weights.delta += self.x_save.T @ G
        self.bias.delta += np.sum(G, 0)  # bias is broadcast in forward, so sum its gradient over the batch
return G @ self.weights.value.T
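# Illustrative gradient check (a hypothetical helper, not part of the training
# flow): compare Linear.backward against a central-difference estimate of
# dL/dW[0, 0] for the scalar loss L = sum(layer(x)).
def _check_linear_grad(eps=1e-5):
    layer = Linear(3, 2)
    x = np.random.randn(5, 3)
    layer.backward(np.ones_like(layer(x)))  # dL/dout = 1 everywhere
    analytic = layer.weights.delta[0, 0]
    layer.weights.value[0, 0] += eps
    loss_pos = np.sum(layer(x))
    layer.weights.value[0, 0] -= 2 * eps
    loss_neg = np.sum(layer(x))
    numeric = (loss_pos - loss_neg) / (2 * eps)
    print(abs(analytic - numeric))  # should be close to 0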
class ReLU(Module):
def __init__(self, inplace=True):
super().__init__("ReLU")
self.inplace = inplace
def forward(self, x):
self.negative_position = x < 0
if not self.inplace:
x = x.copy()
x[self.negative_position] = 0
return x
def backward(self, G):
if not self.inplace:
G = G.copy()
G[self.negative_position] = 0
return G
def sigmoid(x):
p0 = x < 0
p1 = ~p0
x = x.copy()
    # copy so the caller's array is not mutated; an integer dtype here would lose precision
x[p0] = np.exp(x[p0]) / (1 + np.exp(x[p0]))
x[p1] = 1 / (1 + np.exp(-x[p1]))
return x
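# Why the branch on the sign of x (illustrative): a naive 1 / (1 + np.exp(-x))
# overflows for very negative x, e.g. x = -1000 makes np.exp(1000) = inf and
# raises a RuntimeWarning. Evaluating np.exp(x) / (1 + np.exp(x)) on the
# negative half instead keeps every np.exp argument <= 0, so
# sigmoid(np.array([-1000., 0., 1000.])) cleanly yields [0., 0.5, 1.].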
class SWish(Module):
def __init__(self):
super().__init__("SWish")
def forward(self, x):
self.x_save = x.copy()
self.sx = sigmoid(x)
return x * self.sx
def backward(self, G):
return G * (self.sx + self.x_save * self.sx * (1 - self.sx))
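# Derivation note for SWish.backward: with s(x) = sigmoid(x) and
# s'(x) = s(x) * (1 - s(x)),
#   d/dx [x * s(x)] = s(x) + x * s(x) * (1 - s(x)),
# which is exactly the factor applied to the upstream gradient G above.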
class Dropout(Module):
def __init__(self, prob_keep=0.5, inplace=True):
super().__init__("Dropout")
self.prob_keep = prob_keep
self.inplace = inplace
def forward(self, x):
if not self.train_mode:
return x
        # boolean mask of units to drop: True with probability (1 - prob_keep)
        self.mask = np.random.binomial(size=x.shape, p=1 - self.prob_keep, n=1).astype(bool)
if not self.inplace:
x = x.copy()
x[self.mask] = 0
        x *= 1 / self.prob_keep  # inverted dropout: rescale so the expected value is preserved
return x
def backward(self, G):
if not self.inplace:
G = G.copy()
G[self.mask] = 0
G *= 1 / self.prob_keep
return G
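# Illustrative check of the inverted-dropout scaling (a hypothetical helper):
# surviving activations are multiplied by 1 / prob_keep in train mode, so the
# output keeps the input's expected value and eval() needs no rescaling.
def _check_dropout_scaling():
    drop = Dropout(prob_keep=0.5, inplace=False)
    drop.train()
    y = drop(np.ones((100000, 1)))
    print(y.mean())  # close to 1.0: half the units zeroed, survivors doubled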
class ModuleList(Module):
def __init__(self, *args):
super().__init__("ModuleList")
self.ms = list(args)
def modules(self):
return self.ms
def forward(self, x):
for m in self.ms:
x = m(x)
return x
def backward(self, G):
for i in range(len(self.ms) - 1, -1, -1):
G = self.ms[i].backward(G)
return G
class SigmoidCrossEntropy(Module):
def __init__(self, params, weight_decay=1e-5):
super().__init__("CrossEntropyLoss")
self.params = params
self.weight_decay = weight_decay
def sigmoid(self, x):
# return 1 / (1 + np.exp(-x))
p0 = x < 0
p1 = ~p0
x = x.copy()
x[p0] = np.exp(x[p0]) / (1 + np.exp(x[p0]))
x[p1] = 1 / (1 + np.exp(-x[p1]))
return x
    def decay_loss(self):
        # weight decay term: weight_decay * ||p||_2 / (2 * p.size) per parameter
        loss = 0
        for p in self.params:
            loss += np.sqrt(np.sum(p.value ** 2)) / (2 * p.value.size) * self.weight_decay
        return loss
    def decay_backward(self):
        # gradient of decay_loss: weight_decay * p / (p.size * 2 * ||p||_2)
        eps = 1e-8
        for p in self.params:
            norm = np.sqrt(np.sum(p.value ** 2))
            p.delta += self.weight_decay * p.value / (p.value.size * (2 * norm + eps))
def forward(self, x, label_onehot):
eps = 1e-6
self.label_onehot = label_onehot
self.predict = self.sigmoid(x)
        self.predict = np.clip(self.predict, a_max=1 - eps, a_min=eps)  # clip to avoid log(0)
self.batch_size = self.predict.shape[0]
return -np.sum(label_onehot * np.log(self.predict) + (1 - label_onehot) *
np.log(1 - self.predict)) / self.batch_size + self.decay_loss()
def backward(self):
self.decay_backward()
return (self.predict - self.label_onehot) / self.batch_size
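# Derivation note: for binary cross-entropy on logits,
#   L = -[y * log(s(x)) + (1 - y) * log(1 - s(x))],  s = sigmoid,
# dL/dx simplifies to s(x) - y, which is why backward returns
# (predict - label_onehot) / batch_size with no explicit sigmoid derivative.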
class SoftmaxCrossEntropy(Module):
def __init__(self):
super().__init__("SoftmaxCrossEntropy")
def softmax(self, x):
# return np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True)
max_x = np.max(x, axis=1, keepdims=True)
exp_x = np.exp(x - max_x)
return exp_x / np.sum(exp_x, axis=1, keepdims=True)
def forward(self, x, label_onehot):
eps = 1e-6
self.label_onehot = label_onehot
self.predict = self.softmax(x)
        self.predict = np.clip(self.predict, a_max=1 - eps, a_min=eps)  # clip to avoid log(0)
self.batch_size = self.predict.shape[0]
return -np.sum(label_onehot * np.log(self.predict)) / self.batch_size
def backward(self):
return (self.predict - self.label_onehot) / self.batch_size
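# Illustrative sanity check (a hypothetical helper): the max-subtraction in
# softmax keeps the loss finite even for extreme logits, and backward is
# simply (softmax(x) - label_onehot) / batch_size.
def _check_softmax_ce():
    crit = SoftmaxCrossEntropy()
    logits = np.array([[1000.0, 0.0], [0.0, 1000.0]])
    onehot = np.array([[1.0, 0.0], [0.0, 1.0]])
    print(crit(logits, onehot), crit.backward())  # tiny loss, tiny gradients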
class Optimizer:
def __init__(self, name, model, lr):
self.name = name
self.model = model
self.lr = lr
self.params = model.params()
def zero_grad(self):
for param in self.params:
param.zero_grad()
def set_lr(self, lr):
self.lr = lr
class SGD(Optimizer):
def __init__(self, model, lr=1e-3):
super().__init__("SGD", model, lr)
def step(self):
for param in self.params:
param.value -= self.lr * param.delta
class SGDMomentum(Optimizer):
def __init__(self, model, lr=1e-3, momentum=0.9):
super().__init__("SGDMomentum", model, lr)
self.momentum = momentum
for param in self.params:
param.v = 0
    # velocity: an exponential moving average of past gradients
def step(self):
for param in self.params:
param.v = self.momentum * param.v - self.lr * param.delta
param.value += param.v
class Adam(Optimizer):
def __init__(self, model, lr=1e-3, beta1=0.9, beta2=0.999, l2_regularization=0):
super().__init__("Adam", model, lr)
self.beta1 = beta1
self.beta2 = beta2
self.l2_regularization = l2_regularization
self.t = 0
for param in self.params:
param.m = 0
param.v = 0
    # exponential moving averages of the gradient (m) and squared gradient (v)
def step(self):
eps = 1e-8
self.t += 1
for param in self.params:
g = param.delta
param.m = self.beta1 * param.m + (1 - self.beta1) * g
param.v = self.beta2 * param.v + (1 - self.beta2) * g ** 2
mt_ = param.m / (1 - self.beta1 ** self.t)
vt_ = param.v / (1 - self.beta2 ** self.t)
param.value -= self.lr * mt_ / (np.sqrt(vt_) + eps) + self.l2_regularization * param.value
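# Illustrative first-step arithmetic for Adam (made-up numbers): at t = 1 the
# bias correction gives mt_ = g and vt_ = g ** 2, so the first update is about
# lr * g / |g| = lr * sign(g) regardless of the gradient's scale. E.g. with
# lr = 1e-3 and g = 0.01: m = 0.001, v = 1e-7, mt_ = 0.01, vt_ = 1e-4, and the
# step is 1e-3 * 0.01 / (0.01 + 1e-8) ≈ 1e-3.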
class Model(Module):
def __init__(self, num_feature, num_hidden, num_classes):
super().__init__("Model")
self.backbone = ModuleList(
Linear(num_feature, num_hidden),
ReLU(),
Dropout(),
Linear(num_hidden, num_classes)
)
def forward(self, x):
return self.backbone(x)
def backward(self, G):
return self.backbone.backward(G)
def estimate_val(predict, gt_labels, classes, loss_func):
plabel = predict.argmax(1)
positive = plabel == gt_labels
total_images = predict.shape[0]
    accuracy = np.sum(positive) / total_images
return accuracy, loss_func(predict, one_hot(gt_labels, classes))
def lr_schedule_cosine(lr_min, lr_max, per_epochs):
def compute(epoch):
return lr_min + 0.5 * (lr_max - lr_min) * (1 + np.cos(epoch / per_epochs * np.pi))
return compute
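# Illustrative usage (not wired into the loop below, which uses the step
# schedule in lr_schedule instead):
#   compute = lr_schedule_cosine(lr_min=1e-5, lr_max=1e-2, per_epochs=20)
#   compute(0)  -> 1e-2  (cos(0) = 1, start at lr_max)
#   compute(20) -> 1e-5  (cos(pi) = -1, anneal down to lr_min)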
def load_images(file):
    with open(file, "rb") as f:
        data = f.read()
    # IDX image-file header: magic, count, rows, cols as big-endian int32
    magic_number, num_samples, image_width, image_height = struct.unpack(">iiii", data[:16])
    if magic_number != 2051:  # 0x00000803
        print(f"magic number mismatch {magic_number} != 2051")
        return None
    # flatten each image into one row of image_width * image_height bytes
    image_data = np.frombuffer(data[16:], dtype=np.uint8).reshape(num_samples, -1)
    return image_data
def one_hot(labels, classes, label_smoothing=0):
n = len(labels)
eoff = label_smoothing / classes
output = np.ones((n, classes), dtype=np.float32) * eoff
for row, label in enumerate(labels):
output[row, label] = 1 - label_smoothing + eoff
return output
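# Illustrative example with label smoothing (made-up labels):
#   one_hot([2], classes=4, label_smoothing=0.1)
# fills off-target entries with 0.1 / 4 = 0.025 and the target with
# 1 - 0.1 + 0.025 = 0.925, so each row still sums to 1.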
def load_labels(file):
    with open(file, "rb") as f:
        data = f.read()
    # IDX label-file header: magic number and sample count as big-endian int32
    magic_number, num_samples = struct.unpack(">ii", data[:8])
    if magic_number != 2049:  # 0x00000801
        print(f"magic number mismatch {magic_number} != 2049")
        return None
    labels = np.frombuffer(data[8:], dtype=np.uint8)
    return labels
val_labels = load_labels("E:/杜老师课程/dataset/t10k-labels-idx1-ubyte")  # (10000,)
val_images = load_images("E:/杜老师课程/dataset/t10k-images-idx3-ubyte")  # (10000, 784)
numdata = val_images.shape[0]  # 10000
# normalize to [-0.5, 0.5] and append a constant-1 column as a bias feature
val_images = np.hstack((val_images / 255 - 0.5, np.ones((numdata, 1))))  # (10000, 785)
val_pd = pd.DataFrame(val_labels, columns=["label"])
train_labels = load_labels("E:/杜老师课程/dataset/train-labels-idx1-ubyte")  # (60000,)
train_images = load_images("E:/杜老师课程/dataset/train-images-idx3-ubyte")  # (60000, 784)
numdata = train_images.shape[0]  # 60000
train_images = np.hstack((train_images / 255 - 0.5, np.ones((numdata, 1))))  # (60000, 785)
train_pd = pd.DataFrame(train_labels, columns=["label"])
np.random.seed(3)
classes = 10     # number of classes
batch_size = 64  # samples per batch
epochs = 20      # stopping criterion: at most 20 full passes over the data
lr = 1e-2
numdata, data_dims = train_images.shape  # 60000, 785 (784 pixels + bias column)
# dataset and dataloader for batched data fetching
train_data = DataLoader(Dataset(train_images, one_hot(train_labels, classes)), batch_size, shuffle=True)
model = Model(data_dims, 1024, classes)
# loss_func = SoftmaxCrossEntropy()
loss_func = SigmoidCrossEntropy(model.params(), 0)
optim = Adam(model, lr)
iters = 0  # iteration counter; it would serve as the x-axis of a loss curve
lr_schedule = {
5: 1e-3,
15: 1e-4,
18: 1e-5
}
# main training loop over the epochs
for epoch in range(epochs):
if epoch in lr_schedule:
lr = lr_schedule[epoch]
optim.set_lr(lr)
model.train()
    # iterate over the training data one batch (batch_size samples) at a time
    for index, (images, labels) in enumerate(train_data):
        x = model(images)
        # compute the loss for this batch
        loss = loss_func(x, labels)
        optim.zero_grad()
        G = loss_func.backward()
        model.backward(G)
        optim.step()  # apply the gradients to update the parameters
iters += 1
print(f"Iter {iters}, {epoch} / {epochs}, Loss {loss:.3f}, LR {lr:g}")
model.eval()
val_accuracy, val_loss = estimate_val(model(val_images), val_labels, classes, loss_func)
print(f"Val set, Accuracy: {val_accuracy:.6f}, Loss: {val_loss:.3f}")