Recognizing MNIST

Let's take the NN implemented in the previous posts and try it on the MNIST dataset.

Two changes this time: the activation function is now ReLU, and the cost function is now the softmax (cross-entropy) cost.
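For reference, in my own notation (not from the original post): with logits $x$ and label vector $y$, the softmax cross-entropy cost and its gradient with respect to the logits are

$$C(x,y) = -\sum_j y_j \log\frac{e^{x_j}}{\sum_k e^{x_k}}, \qquad \frac{\partial C}{\partial x_i} = \Big(\sum_j y_j\Big)\frac{e^{x_i}}{\sum_k e^{x_k}} - y_i,$$

which for a one-hot $y$ reduces to the familiar $\mathrm{softmax}(x)_i - y_i$. The general $\sum_j y_j$ factor is kept because the code below does not assume one-hot labels. ReLU is simply $\max(0,v)$, with derivative 1 for $v \ge 0$ and 0 otherwise.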

The final run took 100 epochs and 8 minutes 10 seconds (on my Mac laptop), ending at 93+% accuracy on the training set and 92+% on the test set.

This is where a GPU would really pay off: I estimate a GPU could reach the same result in under a minute. I'll try a GPU version later, CUDA only.

The code is as follows:

#include <iostream>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <cassert>
#include <ctime>
#include <memory>
#include <string>
#include <fstream>
#include <vector>
using namespace std;

// http://www.cnblogs.com/yeahgis/archive/2012/07/13/2590485.html
// Gaussian random numbers with mean 0 and variance 1 (Marsaglia polar method)
double gaussrand()
{
    static double V1, V2, S;
    static int phase = 0;
    double X;

    if ( phase == 0 ) {
        do {
            double U1 = (double)rand() / RAND_MAX;
            double U2 = (double)rand() / RAND_MAX;

            V1 = 2 * U1 - 1;
            V2 = 2 * U2 - 1;
            S = V1 * V1 + V2 * V2;
        } while(S >= 1 || S == 0);

        X = V1 * sqrt(-2 * log(S) / S);
    } else
        X = V2 * sqrt(-2 * log(S) / S);

    phase = 1 - phase;

    return X;
}

// Keep memory management simple: arrays are owned by a shared_ptr and freed automatically
typedef shared_ptr<double> DoublePtr;
inline DoublePtr newDoubleArray(int size)
{
    double *p = new double[size];
    return DoublePtr(p, default_delete<double[]>());
}

// A simple matrix (copying copies only the metadata; the data is shared)
struct Matrix
{
    int row, col, size;
    DoublePtr data;

    Matrix(int _row=1, int _col=1) : row(_row), col(_col)
    {
        size = row * col;
        data = newDoubleArray(size);
        memset(data.get(), 0, sizeof(double) * size);
    }

    inline double* operator[](int i) {
        assert(i < row);
        return data.get() + i * col;
    }
};

// Print a matrix
ostream& operator<<(ostream& out, Matrix w)
{
    out << "[ (" << w.row << " x " << w.col << ")" << endl;
    for(int i = 0;i < w.row;i++) {
        out << "\t[";
        for(int j = 0;j < w.col;j++) {
            if(j > 0) out << ",";
            out << w[i][j];
        }
        out << "]" << endl;
    }
    out << "]";
    return out;
}

// A simple vector (copying copies only the metadata; the data is shared)
struct Vector
{
    int size;
    DoublePtr data;

    Vector(int _size=1) : size(_size)
    {
        data = newDoubleArray(size);
        memset(data.get(), 0, sizeof(double) * size);
    }

    inline double &operator[](int x)
    {
        assert(x < size);
        return data.get()[x];
    }
};

// Print a vector
ostream& operator<<(ostream& out, Vector v)
{
    out << "[ (" << v.size << ") ";
    for(int i = 0;i < v.size;i++) {
        if(i > 0) out << ",";
        out << v[i];
    }
    out << "]";
    return out;
}

// Matrix-vector product
Vector operator*(Matrix w, Vector v)
{
    Vector ret(w.row);
    for(int i = 0;i < w.row;i++) {
        for(int j = 0;j < w.col;j++) {
            ret[i] += w[i][j] * v[j];
        }
    }
    return ret;
}

// Element-wise (Hadamard) product
Vector operator*(Vector x, Vector y)
{
    Vector ret(x.size);
    for(int i = 0;i < x.size;i++) {
        ret[i] = x[i] * y[i];
    }
    return ret;
}

// Multiply the transpose of w by v
Vector TandMul(Matrix w, Vector v)
{
    Vector ret(w.col);
    for(int i = 0;i < w.col;i++) {
        for(int j = 0;j < w.row;j++) {
            ret[i] += w[j][i] * v[j];
        }
    }
    return ret;
}

Vector operator+(Vector x, Vector y)
{
    Vector ret(x.size);
    for(int i = 0;i < x.size;i++) {
        ret[i] = x[i] + y[i];
    }
    return ret;
}

Vector operator-(Vector x, Vector y)
{
    Vector ret(x.size);
    for(int i = 0;i < x.size;i++) {
        ret[i] = x[i] - y[i];
    }
    return ret;
}

Vector operator*(double x, Vector y)
{
    Vector ret(y.size);
    for(int i = 0;i < y.size;i++) {
        ret[i] = x * y[i];
    }
    return ret;
}

Vector operator*(Vector x, double y)
{
    return y * x;
}

// Cost function (base class)
struct CostFun
{
    virtual double calc(Vector x, Vector y)
    {
        return 0;
    }

    virtual double operator()(Vector x, Vector y)
    {
        return calc(x,y);
    }

    virtual Vector propagateDelta(Vector output, Vector y)
    {
        return Vector(output.size);
    }
};

// Squared-error cost function
struct SqrCostFun: CostFun
{
    // \sum (x_i-y_i)^2/2
    virtual double calc(Vector x, Vector y)
    {
        double ret = 0;
        for(int i = 0;i < x.size;i++) {
            double t = x[i] - y[i];
            ret += t * t;
        }
        return ret / 2;
    }

    virtual Vector propagateDelta(Vector output, Vector y)
    {
        /*
          d/dx_i \sum_j (x_j-y_j)^2/2
        = x_i - y_i
        i.e. the vector x - y
        */
        return output - y;
    }
};

// Softmax cross-entropy cost function
struct SoftmaxCostFun: CostFun
{
    // - \sum y_j * log( exp(x_j) / \sum exp(x_k) )
    virtual double calc(Vector x, Vector y)
    {
        /*
          log( exp(x_j) / \sum exp(x_k) )
        = x_j - log \sum exp(x_k)
        = x_j - (max + log \sum exp(x_k - max))   // subtract max for numerical stability
        */
        double maxX = x[0];
        for(int i = 0;i < x.size;i++) {
            if(x[i] > maxX) {
                maxX = x[i];
            }
        }

        double xSum = 0;
        for(int i = 0;i < x.size;i++) {
            xSum += exp(x[i] - maxX);
        }

        double ret = 0;
        for(int i = 0;i < x.size;i++) {
            ret += y[i] * (x[i] - (maxX + log(xSum)));
        }

        return -ret;
    }

    virtual Vector propagateDelta(Vector output, Vector y)
    {
        Vector x = output;
        /*
          d/dx_i [ - \sum y_j * log( exp(x_j) / \sum exp(x_k) ) ]
        = d/dx_i [ - \sum y_j * x_j + (\sum y_j) * log \sum exp(x_k) ]
        = - y_i + (\sum y_j) * exp(x_i) / \sum exp(x_k)
        */

        double maxX = x[0];
        for(int i = 0;i < x.size;i++) {
            if(x[i] > maxX) {
                maxX = x[i];
            }
        }

        // exp(x_i) * sum_of_y / sum_of_exp(x) - y_i
        double sumOfY = 0;
        double sumOfX = 0;
        Vector tmp(x.size);
        for(int i = 0;i < x.size;i++) {
            tmp[i] = exp(x[i] - maxX);
            sumOfY += y[i];
            sumOfX += tmp[i];
        }

        return sumOfY/sumOfX * tmp - y;
    }
};

// Singletons
SqrCostFun SqrCostFunSingleton;
SoftmaxCostFun SoftmaxCostFunSingleton;

// Activation function (base class: identity)
struct Activator
{
    // forward
    virtual double forward(double v)
    {
        return v;
    }

    virtual double operator()(double v)
    {
        return forward(v);
    }

    virtual Vector operator()(Vector v)
    {
        Vector ret(v.size);
        for(int i = 0;i < v.size;i++) {
            ret[i] = forward(v[i]);
        }
        return ret;
    }

    // Derivative
    virtual double derive(double v)
    {
        return 1;
    }

    virtual Vector derive(Vector v)
    {
        Vector ret(v.size);
        for(int i = 0;i < ret.size;i++) {
            ret[i] = derive(v[i]);
        }
        return ret;
    }
};

// Sigmoid activation
struct SigmoidActivator : Activator
{
    virtual double forward(double v)
    {
        return 1 / (1 + exp(-v));
    }

    virtual double derive(double v)
    {
        double t = exp(-v);
        return t / ( (1 + t) * (1 + t) );
    }
};

// ReLU activation
struct ReluActivator: Activator
{
    virtual double forward(double v)
    {
        return v >= 0 ? v : 0;
    }

    virtual double derive(double v)
    {
        return v >= 0 ? 1 : 0;
    }
};

// Singletons
SigmoidActivator SigmoidActivatorSingleton;
ReluActivator ReluActivatorSingleton;

// One layer of the NN
// 1. The input does not count as a layer
// 2. A layer's w matrix maps the previous layer's output to this layer
//    (slightly different from Andrew Ng's convention)
// 3. Likewise, the layer's b belongs to the edge from the previous layer to this one
struct Layer
{
    // Number of outputs of the previous layer, excluding the bias
    int inSize;
    // Number of outputs of this layer
    int outSize;

    Activator &activator;
    Matrix w;
    Vector b;

    void initWeights(double *p, int size)
    {
        // Initialize from a normal distribution with mean 0 and stddev 0.01
        for(int i = 0;i < size;i++) {
            p[i] = gaussrand() * 0.01;
        }
    }

    Layer(int _inSize=1, int _outSize=1, Activator &_activator= SigmoidActivatorSingleton):
        inSize(_inSize),
        outSize(_outSize),
        activator(_activator),
        w(_outSize, _inSize),
        b(_outSize)
    {
        initWeights(w.data.get(), w.size);
        initWeights(b.data.get(), b.size);
    }

    // Activations saved by the most recent forward pass
    Vector a;
    Vector z;
    // in is the previous layer's output
    Vector operator()(Vector in)
    {
        z = w * in + b;
        return a = activator(z);
    }

    // delta saved by the most recent backward pass
    Vector delta;
    Vector propagateDelta()
    {
        return TandMul(w, delta);
    }

    // alpha is the learning rate
    // prevA is the previous layer's output
    void updateParameters(double alpha, Vector prevA)
    {
        b = b + (-alpha) * delta;
        Matrix nw(w.row, w.col);
        for(int i = 0;i < w.row;i++) {
            for(int j = 0;j < w.col;j++) {
                nw[i][j] = w[i][j] - alpha * prevA[j] * delta[i];
            }
        }
        w = nw;
    }

    // Check backprop correctness with Andrew Ng's numerical-gradient method
    void checkBp(Layer layerList[], int nLayer, Vector input, Vector y, CostFun& costFun, double alpha, Vector prevA)
    {
        // forward declaration of the global forward()
        Vector forward(Layer layerList[], int nLayer, Vector input);

        // Gradients computed by backprop
        Vector db = delta;
        Matrix dw(w.row, w.col);
        for(int i = 0;i < w.row;i++) {
            for(int j = 0;j < w.col;j++) {
                dw[i][j] = prevA[j] * delta[i];
            }
        }

        // Numerical estimates
        // 1. Perturb each b[i] by +/- eps
        double EPS = 0.0001;
        for(int i = 0;i < b.size;i++) {
            double tmp = b[i];
            // +eps
            b[i] = tmp + EPS;
            Vector output1 = forward(layerList, nLayer, input);
            double err1 = costFun(output1, y);
            // -eps
            b[i] = tmp - EPS;
            Vector output2 = forward(layerList, nLayer, input);
            double err2 = costFun(output2, y);

            // Restore the original value
            b[i] = tmp;

            // Numerical estimate of the gradient
            double v = (err1 - err2) / (2 * EPS);
            if(fabs(v) > 0) {   // guard against dividing by zero below
                double x = fabs( (v-db[i]) / v);
                if(x > 0.0001) {
                    cerr << "backprop gradient differs too much from numerical estimate" << endl;
                }
            }
        }

        // 2. Perturb each w[i][j] by +/- eps
        for(int i = 0;i < w.row;i++) {
            for(int j = 0;j < w.col;j++) {
                double tmp = w[i][j];
                // +eps
                w[i][j] = tmp + EPS;
                Vector output1 = forward(layerList, nLayer, input);
                double err1 = costFun(output1, y);
                // -eps
                w[i][j] = tmp - EPS;
                Vector output2 = forward(layerList, nLayer, input);
                double err2 = costFun(output2, y);

                // Restore the original value
                w[i][j] = tmp;

                // Numerical estimate of the gradient
                double v = (err1 - err2) / (2 * EPS);
                if(fabs(v) > 0) {   // guard against dividing by zero below
                    double x = fabs( (v-dw[i][j]) / v);
                    if(x > 0.0001) {
                        cerr << "backprop gradient differs too much from numerical estimate" << endl;
                    }
                }
            }
        }
    }
};

ostream& operator<<(ostream& out, Layer& layer)
{
    out << "Layer {" << endl;
    out << "w = " << layer.w << endl;
    out << "b = " << layer.b << endl;
    out << "z = " << layer.z << endl;
    out << "a = " << layer.a << endl;
    out << "delta = " << layer.delta << endl;
    out << "}" << endl;
    return out;
}

Vector forward(Layer layerList[], int nLayer, Vector input)
{
    Vector tmp = input;
    for(int i = 0;i < nLayer;i++) {
        tmp = layerList[i](tmp);
    }
    return tmp;
}

void backward(Layer layerList[], int nLayer, Vector input, Vector y, CostFun& costFun, double alpha)
{
    // Propagate delta backwards
    Layer &lastLayer = layerList[nLayer - 1];
    // For the squared-error cost, for example, this is: -(y - a) * f'(z)
    lastLayer.delta = costFun.propagateDelta(lastLayer.a, y) * lastLayer.activator.derive(lastLayer.z);

    for(int i = nLayer - 2;i >= 0;i--) {
        Layer &layer = layerList[i];
        Layer &nextLayer = layerList[i + 1];
        layer.delta = nextLayer.propagateDelta() * layer.activator.derive(layer.z);
    }

    /*
    // Check backprop correctness (only once)
    static bool hasDoneBpChecking = false;
    if(!hasDoneBpChecking) {
        hasDoneBpChecking = true;
        for(int i = 0;i < nLayer;i++) {
            layerList[i].checkBp(layerList, nLayer, input, y, costFun, alpha, i == 0 ? input : layerList[i - 1].a);
        }
    }
    //*/

    // Update all w and b
    for(int i = 0;i < nLayer;i++) {
        layerList[i].updateParameters(alpha, i == 0 ? input : layerList[i - 1].a);
    }
}

// Read a big-endian (MSB-first) integer from a byte buffer
int MsbInt(char buf[], int len=4)
{
    int base = 1;
    int ret = 0;
    for(int i = len - 1;i >= 0;i--) {
        ret += (unsigned char)buf[i] * base;
        base *= 256;
    }
    return ret;
}
vector<int> ReadMnistLabels(string fileName)
{
    vector<int> ret;
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    if(magic != 0x00000801) {
        cerr << "incorrect label file magic number" << endl;
    }

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);

    while(nImages--) {
        ret.push_back(ifs.get());
    }

    return ret;
}
vector<Vector> ReadMnistData(string fileName)
{
    vector<Vector> ret;
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    if(magic != 0x00000803) {
        cerr << "incorrect data file magic number" << endl;
    }

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);

    int row, col;
    ifs.read(buf, 4);
    row = MsbInt(buf);
    ifs.read(buf, 4);
    col = MsbInt(buf);
    if(row != 28 || col != 28) {
        cerr << "incorrect image size" << endl;
    }

    while(nImages--) {
        Vector image(row * col);
        for(int i = 0;i < row * col;i++) {
            image[i] = ifs.get() / 256.0; // normalize to [0, 1)
        }
        ret.push_back(image);
    }

    return ret;
}

// One-hot encode the labels
vector<Vector> translateLabels(vector<int> labels, int k=10)
{
    vector<Vector> ret;
    for(int i = 0;i < (int)labels.size();i++) {
        Vector tmp(k);
        assert(labels[i] >= 0 && labels[i] < k);
        tmp[labels[i]] = 1;
        ret.push_back(tmp);
    }
    return ret;
}

// Index of the largest element
int getMaxIdx(Vector x)
{
    int maxIdx = 0;
    double maxV = x[0];
    for(int i = 0;i < x.size;i++) {
        if(x[i] > maxV) {
            maxV = x[i];
            maxIdx = i;
        }
    }
    return maxIdx;
}

int main()
{
    srand(1000);

    cout << "Loading data" << endl;
    // Load the data
    vector<int> trainLabels = ReadMnistLabels("mnist/train-labels-idx1-ubyte");
    vector<Vector> trainData = ReadMnistData("mnist/train-images-idx3-ubyte");
    vector<Vector> trainLabels2 = translateLabels(trainLabels);

    vector<int> testLabels = ReadMnistLabels("mnist/t10k-labels-idx1-ubyte");
    vector<Vector> testData = ReadMnistData("mnist/t10k-images-idx3-ubyte");

    // Input is 28 x 28
    // Output is 10 classes
    int nInput = 28 * 28;
    int nOutput = 10;

    // Network structure
    Layer layerList[] = {
        Layer(nInput, nOutput, ReluActivatorSingleton),
        //Layer(100, nOutput, ReluActivatorSingleton)
    };

    // Cost function
    CostFun &costFun = SoftmaxCostFunSingleton;

    // Number of layers, not counting the input layer
    int nLayer = sizeof(layerList) / sizeof(layerList[0]);

    int M = trainData.size();
    int T = testData.size();

    // Start training
    cout << "Start training" << endl;
    time_t fullStartedAt = time(NULL);
    for(int step = 0;step < 100000;step++) {
        time_t startedAt = time(NULL);

        double avgError = 0;

        for(int i = 0;i < M;i++) {
            Vector x = trainData[i];
            Vector y = trainLabels2[i];

            Vector output = forward(layerList, nLayer, x);
            double error = costFun(output, y);
            //cout << output << " " << y << " " << error << endl;
            avgError += error;

            backward(layerList, nLayer, x, y, costFun, 0.001);
        }
        avgError /= M;

        time_t endedAt = time(NULL);

        cout << "step=" << step << " time_cost=" << (endedAt - startedAt) << " avgErr=" << avgError << " ";

        // validate: accuracy on the training set
        int nTotal = 0;
        int nGood = 0;
        for(int i = 0;i < M;i++) {
            Vector x = trainData[i];
            Vector output = forward(layerList, nLayer, x);
            int maxIdx = getMaxIdx(output);
            if(maxIdx == trainLabels[i]) {
                nGood++;
            }
            nTotal++;
        }
        cout << "train_accuracy " << nGood << "/" << nTotal << "=" << nGood*1.0/nTotal << " ";
        bool doBreak = false;
        if(nGood * 1.0 / nTotal > 0.95) {
            doBreak = true;
        }

        // check: accuracy on the test set
        nTotal = 0;
        nGood = 0;
        for(int i = 0;i < T;i++) {
            Vector x = testData[i];
            Vector output = forward(layerList, nLayer, x);
            int maxIdx = getMaxIdx(output);
            if(maxIdx == testLabels[i]) {
                nGood++;
            }
            nTotal++;
        }
        cout << "test_accuracy " << nGood << "/" << nTotal << "=" << nGood*1.0/nTotal;
        cout << "\n";

        if(doBreak) {
            break;
        }
    }

    time_t fullEndedAt = time(NULL);
    cout << "Total cost " << (fullEndedAt - fullStartedAt) << " seconds" << endl;

    return 0;
}
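
As a quick way to test the cost function on its own, here is a minimal sketch (my addition, not from the original program; it assumes the definitions above are in scope and would temporarily replace main()) that checks SoftmaxCostFun::propagateDelta against a central difference, the same idea checkBp uses:

// Numerically verify SoftmaxCostFun::propagateDelta on a small random input.
// Hypothetical test harness: swap it in for main() above when building the test.
int main()
{
    srand(42);
    const int N = 5;
    Vector x(N), y(N);           // both zero-initialized by the Vector constructor
    for(int i = 0;i < N;i++) x[i] = gaussrand();
    y[2] = 1;                    // one-hot label for class 2

    SoftmaxCostFun cost;
    Vector grad = cost.propagateDelta(x, y);   // analytic gradient

    const double EPS = 1e-5;
    for(int i = 0;i < N;i++) {
        double tmp = x[i];
        x[i] = tmp + EPS; double e1 = cost.calc(x, y);
        x[i] = tmp - EPS; double e2 = cost.calc(x, y);
        x[i] = tmp;
        double numeric = (e1 - e2) / (2 * EPS); // central difference
        cout << "i=" << i << " analytic=" << grad[i]
             << " numeric=" << numeric << endl;
    }
    return 0;
}

The two columns should agree to several decimal places; that is how I convinced myself the sign fixes in the derivation comments above are consistent with what the code computes.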

 
