CNN4 参数优化

之前在《Caffe学习(一):网络参数和自定义网络》中介绍过一些参数的设置项。

momentum是梯度下降法中一种常用的加速技术。对于一般的SGD,其表达式为

w(t+1)=w(t)-lr*grad

w沿负梯度方向下降。

而带momentum项的SGD则写成如下形式:

v(t+1)=momentum*v(t)-lr*grad ; w(t+1)=w(t)+v(t+1)

即如果上次的v与这次的负梯度方向相同,那么本次下降的幅度就会加大,以达到加速收敛的目的。

learning rate下降有多种方式,本例中学习率每经过2000个训练样本衰减为原来的0.9倍。

对500个测试数据,正确率可以达到98.6%,即有7张没有正确分辨。

 

# coding:utf8
import cPickle
import numpy as np


class ConvPoolLayer(object):
    """Convolution + ReLU + pooling layer implemented via im2col/col2im.

    image_shape:  (batch, in_channels, height, width)
    filter_shape: (out_channels, in_channels, filter_h, filter_w)
    poolsize:     (pool_h, pool_w) for the pooling stage.

    NOTE(review): im2col/col2im index with filter_h only and col2im builds an
    (n, m, h, h) output, so square images (h == w) and square filters appear
    to be assumed throughout — confirm before reusing on non-square inputs.
    Written for Python 2: `/` below is integer division on ints.
    """
    def __init__(self, image_shape,filter_shape,poolsize=(2,2)):
        self.filter_shape = filter_shape
        self.image_shape = image_shape
        # Weights stored already in im2col layout: (kh*kw*in_ch, out_ch),
        # scaled by 1/sqrt(fan_in) of a single filter.
        self.weights = np.random.normal(loc=0, scale=np.sqrt(1.0/np.prod(filter_shape[1:])),
                                  size=(np.prod(filter_shape[1:]),filter_shape[0]))
        # One bias per output channel, broadcastable over (n, ch, h, w).
        self.b = np.random.normal(loc=0, scale=0.2, size=(1,filter_shape[0],1,1))
        self.poolsize = poolsize
        # Spatial size after the 'valid' convolution...
        self.samp_shape=(image_shape[-2] - filter_shape[-2] + 1,image_shape[-1] - filter_shape[-1] + 1)
        # ...and after pooling (Python 2 integer division).
        self.out_shape=(self.samp_shape[0]/poolsize[0],self.samp_shape[1]/poolsize[1])
        # Momentum velocity for the weights (scalar 0 until first update).
        self.v=0

    def im2col(self,a):
        """Unfold image patches into rows so convolution becomes a matmul.

        Returns shape (n*kh, kw*m) where kh = number of patch positions and
        kw = elements per (square) patch.
        """
        n, m, h, w = self.image_shape
        vn, vm, vh, vw = self.filter_shape
        t = h - vh + 1
        kh = t ** 2
        kw = vh * vh
        z = np.zeros((n, kh, kw * m))
        for i in range(n):
            for j in range(m):
                kt = []
                for it in range(t):
                    for jt in range(t):
                        kt.append((a[i, j, it:it + vh, jt:jt + vh]).flatten())
                z[i, :, j * kw:j * kw + kw] = kt
        z = np.reshape(z, (n * kh, kw * m))
        return z

    def col2im(self,a):
        """Approximate inverse of im2col used to route deltas back to images.

        NOTE(review): each output pixel is taken from a single contributing
        patch rather than summing all overlaps — verify this matches the
        intended gradient; it assumes square images (builds (n, m, h, h)).
        """
        n, m, h, w = self.image_shape
        vn, vm, vh, vw = self.filter_shape
        t = h - vh + 1
        kh = t ** 2
        kw = vh * vh
        a = np.reshape(a, (n, kh, kw * m))
        z = np.zeros((n, m, h, h))
        for i in range(n):
            for k in range(m):
                for j in range(h):
                    rh = vh * min(j, vh - 1)
                    b = max(j + 1 - vh, 0) * t
                    z[i, k, j] = np.append(a[i, b, rh:rh + vh], a[i, b + 1:b + t, rh + vh - 1])
        return z

    def fw_shape(self,a):
        """Reshape matmul output (n*kh, out_ch) to (n, out_ch, sh, sw)."""
        res = np.reshape(a, (self.image_shape[0], -1, self.filter_shape[0]))
        res = np.rollaxis(res, 2, 1)
        res = np.reshape(res, (self.image_shape[0], -1, self.samp_shape[0], self.samp_shape[1]))
        return res

    def bp_shape(self,a):
        """Inverse of fw_shape: (n, out_ch, sh, sw) back to (n*kh, out_ch)."""
        res = np.reshape(a, (self.image_shape[0], self.filter_shape[0], -1))
        res = np.rollaxis(res, 1, 3)
        res = np.reshape(res, (-1, self.filter_shape[0]))
        return res

    def feedforward(self, a):
        """conv (as im2col matmul) -> +bias -> ReLU -> pool; caches pre-pool out."""
        z = self.im2col(a)
        res = np.dot(z, self.weights)
        res = self.fw_shape(res)
        # self.out (pre-pooling activation) is reused by backprop.
        self.out = self.relu(res+self.b)
        return np.array(self.pool2d(self.out))

    def backprop(self, x, dnext,eta=0.001,weight_decay=0,momentum=0.9):
        """Backprop through pool+ReLU+conv; updates weights/bias, returns input delta."""
        if dnext.ndim<3:
            # Delta arriving from a fully connected layer: restore 4-D shape.
            dnext = np.reshape(dnext,(self.image_shape[0],self.filter_shape[0], self.out_shape[0], -1))
        # relu_prime mutates self.out in place; self.out is not used again
        # before the next feedforward, so this is safe here.
        u = self.relu_prime(self.out)
        # Upsample the pooled delta back to pre-pool resolution.
        dnext = np.multiply(u,self.up(dnext,self.poolsize[0]))
        delta = self.bp_shape(dnext)/self.image_shape[0]
        x=self.im2col(x)
        out_delta = np.dot(delta, self.weights.T)
        out_delta=self.col2im(out_delta)
        w = np.dot(x.T, delta)
        # NOTE(review): weight_decay*W**2 looks wrong — the L2 gradient is
        # weight_decay*W. Harmless here since callers pass weight_decay=0.
        w = eta * w+weight_decay*self.weights**2
        # Momentum update: v <- momentum*v - lr*grad ; W <- W + v.
        self.v = momentum * self.v - w
        self.weights += self.v
        self.b -= eta * np.reshape(np.sum(delta,0),(self.b.shape))
        return out_delta

    def pool2d(self,input, ds=(2, 2), mode='max'):
        """Pool each (ds x ds) window with max/sum/average."""
        fun = np.max
        if mode == 'sum':
            fun = np.sum
        elif mode == 'average':
            fun = np.average
        n, m, h, w = np.shape(input)
        d, s = ds
        # NOTE(review): h/d + h%d equals ceil(h/d) only when h % d <= 1;
        # e.g. h=8, d=3 gives 4 instead of 3. Fine for the 2x2 pools used here.
        zh = h / d + h % d
        zw = w / s + w % s
        z = np.zeros((n, m, zh, zw))
        for k in range(n):
            for o in range(m):
                for i in range(zh):
                    for j in range(zw):
                        z[k, o, i, j] = fun(input[k, o, d * i:min(d * i + d, h), s * j:min(s * j + s, w)])
        return z

    def up(self,a,n):
        """Upsample by repeating each element into an n x n block (Kronecker)."""
        b=np.ones((n,n))
        return np.kron(a,b)

    def relu(self,z):
        return np.maximum(z, 0.0)

    def relu_prime(self,z):
        # WARNING: mutates z in place. Correct only when z is a ReLU output
        # (entries are >= 0, so untouched entries are exactly 0).
        z[z>0]=1
        return z

class SoftmaxLayer(object):
    """Fully connected output layer with softmax activation (no bias term).

    Intended for use with cross-entropy loss: the backprop delta is simply
    ``output - target``.
    """
    def __init__(self, in_num=100,out_num=10):
        self.weights = np.random.randn(in_num, out_num)/np.sqrt(out_num)
        # Momentum velocity for the weights (scalar 0 until first update).
        self.v=0

    def feedforward(self, input):
        """Return softmax(input . W); caches the output for backprop."""
        self.out=self.softmax(np.dot(input, self.weights))
        return self.out

    def backprop(self, input, y,eta=0.001,weight_decay=0,momentum=0.9):
        """Update weights with momentum SGD; return delta for the layer below."""
        o=self.out
        # Softmax + cross-entropy gradient w.r.t. the logits.
        delta =o-y
        out_delta=np.dot(delta,self.weights.T)
        w = np.dot(input.T,delta)
        # L2 weight decay: d/dW of (wd/2)*||W||^2 is wd*W.
        # (Fixed from wd*W**2, which pushed negative weights away from zero.)
        w=eta*w+weight_decay*self.weights
        self.v = momentum * self.v - w
        self.weights += self.v
        return out_delta

    def softmax(self,a):
        """Row-wise softmax, max-shifted so large logits cannot overflow exp."""
        m = np.exp(a - np.max(a, axis=1, keepdims=True))
        return m / np.sum(m, axis=1, keepdims=True)

class FullLayer(object):
    """Fully connected hidden layer with sigmoid activation."""
    def __init__(self, in_num=720,out_num=100):
        self.in_num=in_num
        self.out_num=out_num
        self.biases = np.random.randn(out_num)
        self.weights = np.random.randn(in_num, out_num)/np.sqrt(out_num)
        # Momentum velocity for the weights (scalar 0 until first update).
        self.v=0

    def feedforward(self, x):
        """Return sigmoid(x . W + b); flattens conv feature maps if needed."""
        if x.ndim>2:
            # (n, ch, h, w) -> (n, in_num)
            x = np.reshape(x, (len(x), self.in_num))
        self.out = self.sigmoid(np.dot(x, self.weights)+self.biases)
        return self.out

    def backprop(self, x,delta,eta=0.001,weight_decay=0,momentum=0.9):
        """Update weights/biases with momentum SGD; return delta for the layer below."""
        if x.ndim>2:
            x = np.reshape(x, (len(x), self.in_num))
        sp=self.sigmoid_prime(self.out)
        delta = delta * sp
        out_delta = np.dot(delta, self.weights.T)
        w = np.dot( x.T,delta)
        # L2 weight decay: d/dW of (wd/2)*||W||^2 is wd*W.
        # (Fixed from wd*W**2, which pushed negative weights away from zero.)
        w=eta*w+weight_decay*self.weights
        self.v=momentum*self.v-w
        self.weights +=self.v
        self.biases -= eta*np.sum(delta,0)
        return out_delta

    def sigmoid(self,z):
        return 1.0/(1.0+np.exp(-z))

    def sigmoid_prime(self,z):
        # Expects z to already be sigmoid(x) (i.e. self.out), hence z*(1-z).
        return z*(1-z)

class Network(object):
    """Sequential network driver: chains layer feedforward/backprop calls.

    Each layer must expose ``feedforward(x)`` and
    ``backprop(input, delta, eta=..., weight_decay=..., momentum=...)``.
    """
    def __init__(self, layers):
        self.layers=layers
        self.num_layers = len(layers)
        # Activations of the most recent forward pass: input + each layer output.
        self.a=[]

    def feedforward(self, x):
        """Run x through all layers, caching every activation in self.a."""
        # Reset the cache each pass; the original appended forever, so self.a
        # grew without bound across minibatches.
        self.a = [x]
        for layer in self.layers:
            x=layer.feedforward(x)
            self.a.append(x)
        return x

    def SGD(self, training_data, test_data,epochs, mini_batch_size, lr=0.001,weight_decay=0.0005,momentum=0.9):
        """Mini-batch SGD; lr shrinks to 0.9x of itself every 2000 samples."""
        self.n = len(training_data[0])
        self.mini_batch_size=mini_batch_size
        self.weight_decay = weight_decay
        self.momentum=momentum
        self.lr = lr
        # Per-batch multiplicative decay equivalent to 0.9x per 2000 samples.
        rate=np.exp(np.log(0.9)/2000)**mini_batch_size
        for j in range(epochs):
            for k in xrange(0, self.n , mini_batch_size):
                self.lr=self.lr*rate
                batch_x = np.array(training_data[0][k:k + mini_batch_size])
                batch_y = training_data[1][k:k + mini_batch_size]
                self.backprop(batch_x,batch_y)
                if k%500==0:
                    # Parenthesized single-argument print behaves identically
                    # under Python 2 and 3.
                    print("Epoch {0}:{1},test:{2},cost={3},lr={4}".format(
                        j, k, self.evaluate([test_data[0], test_data[1]]),
                        self.cost, self.lr))

    def backprop(self, x_in, y):
        """One forward pass, then backprop deltas from the top layer down.

        The top layer receives the target y; every lower layer receives the
        delta returned by the layer above.
        """
        self.feedforward(x_in)
        for i in range(self.num_layers):
            delta=self.layers[-i-1].backprop(self.a[-i-2],y,eta=self.lr,
                            weight_decay=self.weight_decay,momentum=self.momentum)
            y=delta

    def evaluate(self, test_data):
        """Return accuracy (%) on test_data=[x, y]; also sets self.cost."""
        x,y=test_data
        num=len(x)
        # Use the configured batch size (the original read a module-level
        # global `size` from the __main__ script).
        bs = self.mini_batch_size
        x=[self.feedforward(np.array(x[bs*i:bs*i+bs])) for i in range(num//bs)]
        x=np.reshape(x,(num,np.shape(x)[-1]))
        xp = np.argmax(x, axis=1)
        # Accept either one-hot or integer labels.
        yp= np.argmax(y, axis=1) if y[0].ndim else y
        # Mean cross-entropy of the true class.
        self.cost = -np.mean(np.log(x)[np.arange(num),yp])
        return np.mean(yp == xp)*100


if __name__ == '__main__':
        def get_data(data):
            """Reshape each flat 784-pixel MNIST image to (1, 28, 28)."""
            return [np.reshape(x, (1,28,28)) for x in data[0]]

        def get_label(i):
            """One-hot encode digit label i into a length-10 vector."""
            c = np.zeros((10))
            c[i] = 1
            return c

        # `with` guarantees the pickle file is closed (the original leaked
        # the file handle; it also built an unused `zip` of the test set).
        with open('data/mnist.pkl', 'rb') as f:
            training_data, validation_data, test_data = cPickle.load(f)
        training_inputs = get_data(training_data)
        training_label=[get_label(y_) for y_ in training_data[1]]
        test_inputs = get_data(test_data)
        # Mini-batch size; Network.evaluate may also read this module global.
        size=50
        # 28x28 -conv5-> 24x24 -pool2-> 12x12 -conv5-> 8x8 -pool2-> 4x4,
        # hence the 40*4*4 input to the fully connected layer.
        net = Network([ConvPoolLayer(image_shape=[size,1,28,28],filter_shape=[20,1,5,5],poolsize=(2,2)),
                       ConvPoolLayer(image_shape=[size,20,12,12],filter_shape=[40,20,5,5], poolsize=(2,2)),
                       FullLayer(in_num=40*4*4,out_num=100),
                       SoftmaxLayer(in_num=100,out_num=10)])
        net.SGD([training_inputs,training_label],[test_inputs[:500],test_data[1][:500]],
                epochs=3,mini_batch_size=size, lr=0.005,weight_decay=0,momentum=0.9)

        # Epoch 0:23500,test:98.6,cost=0.0728153919619,lr=0.00144602417273
        # Epoch 0:33000,test:98.6,cost=0.053761519132,lr=0.000876652494295
        # Epoch 0:33500,test:98.6,cost=0.0537392805265,lr=0.000853862813757
        # Epoch 0:34000,test:98.6,cost=0.0546350815828,lr=0.000831665579532

 

posted on 2017-02-06 20:08  1357  阅读(370)  评论(0编辑  收藏  举报

导航