Recognizing MNIST
Let's take the NN implemented in the previous posts and try it on the MNIST dataset.
This time it switches to the ReLU activation function and the Softmax cost function.
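For reference, the delta that the Softmax cost sends back has a closed form. The cost is

    C = - \sum_j y_j \log( \exp(x_j) / \sum_k \exp(x_k) )

and differentiating with respect to x_i gives

    \partial C / \partial x_i = (\sum_j y_j) \cdot \exp(x_i) / \sum_k \exp(x_k) - y_i

which for a one-hot y (so \sum_j y_j = 1) is simply softmax(x)_i - y_i. SoftmaxCostFun::propagateDelta in the code below computes exactly this, subtracting max(x) inside the exponentials for numerical stability.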
The final result: 100 epochs took 8 minutes 10 seconds on my Mac laptop, reaching 93+% accuracy on the train set and 92+% on the test set.
This is where a GPU would pay off: I estimate a GPU could reach the same accuracy in under a minute. I plan to try the GPU route next, CUDA only.
The code is as follows:
#include <iostream>
#include <cmath>
#include <cstdlib>
#include <cassert>
#include <cstring>  // memset
#include <ctime>    // time
#include <memory>   // shared_ptr, default_delete
#include <string>
#include <fstream>
#include <vector>
using namespace std;

// http://www.cnblogs.com/yeahgis/archive/2012/07/13/2590485.html
// Gaussian random number, mean 0, variance 1
double gaussrand()
{
    static double V1, V2, S;
    static int phase = 0;
    double X;

    if ( phase == 0 ) {
        do {
            double U1 = (double)rand() / RAND_MAX;
            double U2 = (double)rand() / RAND_MAX;

            V1 = 2 * U1 - 1;
            V2 = 2 * U2 - 1;
            S = V1 * V1 + V2 * V2;
        } while(S >= 1 || S == 0);

        X = V1 * sqrt(-2 * log(S) / S);
    } else
        X = V2 * sqrt(-2 * log(S) / S);

    phase = 1 - phase;

    return X;
}

// no manual memory management, to keep things simple
typedef shared_ptr<double> DoublePtr;
inline DoublePtr newDoubleArray(int size)
{
    double *p = new double[size];
    return DoublePtr(p, default_delete<double[]>());
}

// simple matrix (copying copies only the metadata; the data is shared)
struct Matrix
{
    int row, col, size;
    DoublePtr data;

    Matrix(int _row=1, int _col=1) : row(_row), col(_col)
    {
        size = row * col;
        data = newDoubleArray(size);
        memset(data.get(), 0, sizeof(double) * size);
    }

    inline double* operator[](int i)
    {
        assert(i < row);
        return data.get() + i * col;
    }
};

// print a matrix
ostream& operator<<(ostream& out, Matrix w)
{
    out << "[ (" << w.row << " x " << w.col << ")" << endl;
    for(int i = 0;i < w.row;i++) {
        out << "\t[";
        for(int j = 0;j < w.col;j++) {
            if(j > 0) out << ",";
            out << w[i][j];
        }
        out << "]" << endl;
    }
    out << "]";
    return out;
}

// simple vector (copying copies only the metadata; the data is shared)
struct Vector
{
    int size;
    DoublePtr data;

    Vector(int _size=1) : size(_size)
    {
        data = newDoubleArray(size);
        memset(data.get(), 0, sizeof(double) * size);
    }

    inline double &operator[](int x)
    {
        assert(x < size);
        return data.get()[x];
    }
};

// print a vector
ostream& operator<<(ostream& out, Vector v)
{
    out << "[ (" << v.size << ") ";
    for(int i = 0;i < v.size;i++) {
        if(i > 0) out << ",";
        out << v[i];
    }
    out << "]";
    return out;
}

Vector operator*(Matrix w, Vector v)
{
    Vector ret(w.row);
    for(int i = 0;i < w.row;i++) {
        for(int j = 0;j < w.col;j++) {
            ret[i] += w[i][j] * v[j];
        }
    }
    return ret;
}

// element-wise (Hadamard) product
Vector operator*(Vector x, Vector y)
{
    Vector ret(x.size);
    for(int i = 0;i < x.size;i++) {
        ret[i] = x[i] * y[i];
    }
    return ret;
}

// multiply the transpose of w by v
Vector TandMul(Matrix w, Vector v)
{
    Vector ret(w.col);
    for(int i = 0;i < w.col;i++) {
        for(int j = 0;j < w.row;j++) {
            ret[i] += w[j][i] * v[j];
        }
    }
    return ret;
}

Vector operator+(Vector x, Vector y)
{
    Vector ret(x.size);
    for(int i = 0;i < x.size;i++) {
        ret[i] = x[i] + y[i];
    }
    return ret;
}

Vector operator-(Vector x, Vector y)
{
    Vector ret(x.size);
    for(int i = 0;i < x.size;i++) {
        ret[i] = x[i] - y[i];
    }
    return ret;
}

Vector operator*(double x, Vector y)
{
    Vector ret(y.size);
    for(int i = 0;i < y.size;i++) {
        ret[i] = x * y[i];
    }
    return ret;
}

Vector operator*(Vector x, double y)
{
    return y * x;
}

// cost function interface
struct CostFun
{
    virtual double calc(Vector x, Vector y)
    {
        return 0;
    }

    virtual double operator()(Vector x, Vector y)
    {
        return calc(x, y);
    }

    virtual Vector propagateDelta(Vector output, Vector y)
    {
        return Vector(output.size);
    }
};

// quadratic (squared-error) cost function
struct SqrCostFun: CostFun
{
    // \sum (x_i-y_i)^2/2
    virtual double calc(Vector x, Vector y)
    {
        double ret = 0;
        for(int i = 0;i < x.size;i++) {
            double t = x[i] - y[i];
            ret += t * t;
        }
        return ret / 2;
    }

    virtual Vector propagateDelta(Vector output, Vector y)
    {
        /*
           d/dx_i \sum (x_j-y_j)^2/2 = x_i - y_i
           i.e. x - y
        */
        return output - y;
    }
};

struct SoftmaxCostFun: CostFun
{
    // - \sum y_j * log( exp(x_j) / \sum exp(x_k) )
    virtual double calc(Vector x, Vector y)
    {
        /*
           log( exp(x_j) / \sum exp(x_k) )
           = x_j - log \sum exp(x_k)
           = x_j - (max + log \sum exp(x_k - max))
        */
        double maxX = x[0];
        for(int i = 0;i < x.size;i++) {
            if(x[i] > maxX) {
                maxX = x[i];
            }
        }

        double xSum = 0;
        for(int i = 0;i < x.size;i++) {
            xSum += exp(x[i] - maxX);
        }

        double ret = 0;
        for(int i = 0;i < x.size;i++) {
            ret += y[i] * (x[i] - (maxX + log(xSum)));
        }

        return -ret;
    }

    virtual Vector propagateDelta(Vector output, Vector y)
    {
        Vector x = output;
        /*
           d/dx_i [ - \sum y_j * log( exp(x_j) / \sum exp(x_k) ) ]
           = d/dx_i [ - \sum y_j * x_j + \sum y_j * log( \sum exp(x_k) ) ]
           = - y_i + (\sum y_j) * exp(x_i) / (\sum exp(x_k))
        */

        double maxX = x[0];
        for(int i = 0;i < x.size;i++) {
            if(x[i] > maxX) {
                maxX = x[i];
            }
        }

        // exp(x) * sum_of_y / sum_of_exp(x) - y
        double sumOfY = 0;
        double sumOfX = 0;
        Vector tmp(x.size);
        for(int i = 0;i < x.size;i++) {
            tmp[i] = exp(x[i] - maxX);
            sumOfY += y[i];
            sumOfX += tmp[i];
        }

        return sumOfY/sumOfX * tmp - y;
    }
};

// singletons
SqrCostFun SqrCostFunSingleton;
SoftmaxCostFun SoftmaxCostFunSingleton;

// activation function interface (identity by default)
struct Activator
{
    // forward
    virtual double forward(double v)
    {
        return v;
    }

    virtual double operator()(double v)
    {
        return forward(v);
    }

    virtual Vector operator()(Vector v)
    {
        Vector ret(v.size);
        for(int i = 0;i < v.size;i++) {
            ret[i] = forward(v[i]);
        }
        return ret;
    }

    // derivative
    virtual double derive(double v)
    {
        return 1;
    }

    virtual Vector derive(Vector v)
    {
        Vector ret(v.size);
        for(int i = 0;i < ret.size;i++) {
            ret[i] = derive(v[i]);
        }
        return ret;
    }
};

// Sigmoid activation
struct SigmoidActivator : Activator
{
    virtual double forward(double v)
    {
        return 1 / (1 + exp(-v));
    }

    virtual double derive(double v)
    {
        double t = exp(-v);
        return t / ( (1 + t) * (1 + t) );
    }
};

struct ReluActivator: Activator
{
    virtual double forward(double v)
    {
        return v >= 0 ? v : 0;
    }

    virtual double derive(double v)
    {
        return v >= 0 ? 1 : 0;
    }
};

// singletons
SigmoidActivator SigmoidActivatorSingleton;
ReluActivator ReluActivatorSingleton;

// one layer of the NN
// 1. the input does not count as a layer
// 2. a layer's w is the weight matrix from the previous layer to this layer,
//    which differs slightly from NG's definition
// 3. a layer's b is likewise the bias from the previous layer to this layer
struct Layer
{
    // number of outputs of the previous layer, excluding the bias
    int inSize;
    // number of outputs of this layer
    int outSize;

    Activator &activator;
    Matrix w;
    Vector b;

    void initWeights(double *p, int size)
    {
        // initialize with Gaussian noise, mean 0, standard deviation 0.01
        for(int i = 0;i < size;i++) {
            p[i] = gaussrand() * 0.01;
        }
    }

    Layer(int _inSize=1, int _outSize=1, Activator &_activator=SigmoidActivatorSingleton):
        inSize(_inSize),
        outSize(_outSize),
        activator(_activator),
        w(_outSize, _inSize),
        b(_outSize)
    {
        initWeights(w.data.get(), w.size);
        initWeights(b.data.get(), b.size);
    }

    // activations saved from the most recent forward pass
    Vector a;
    Vector z;
    // in is the previous layer's output
    Vector operator()(Vector in)
    {
        z = w * in + b;
        return a = activator(z);
    }

    // delta saved from the most recent backward pass
    Vector delta;
    Vector propagateDelta()
    {
        return TandMul(w, delta);
    }

    // alpha is the learning rate
    // prevA is the previous layer's output
    void updateParameters(double alpha, Vector prevA)
    {
        b = b + (-alpha) * delta;
        Matrix nw(w.row, w.col);
        for(int i = 0;i < w.row;i++) {
            for(int j = 0;j < w.col;j++) {
                nw[i][j] = w[i][j] - alpha * prevA[j] * delta[i];
            }
        }
        w = nw;
    }

    // check backprop correctness with NG's numerical-gradient method
    void checkBp(Layer layerList[], int nLayer, Vector input, Vector y, CostFun& costFun, double alpha, Vector prevA)
    {
        Vector forward(Layer layerList[], int nLayer, Vector input);

        // gradients computed by backprop
        Vector db = delta;
        Matrix dw(w.row, w.col);
        for(int i = 0;i < w.row;i++) {
            for(int j = 0;j < w.col;j++) {
                dw[i][j] = prevA[j] * delta[i];
            }
        }

        // numerical estimates, NG style
        // 1. perturb each b[i] by +/- eps
        double EPS = 0.0001;
        for(int i = 0;i < b.size;i++) {
            double tmp = b[i];
            // +eps
            b[i] = tmp + EPS;
            Vector output1 = forward(layerList, nLayer, input);
            double err1 = costFun(output1, y);
            // -eps
            b[i] = tmp - EPS;
            Vector output2 = forward(layerList, nLayer, input);
            double err2 = costFun(output2, y);

            // restore the original value
            b[i] = tmp;

            // numerical estimate of the gradient
            double v = (err1 - err2) / (2 * EPS);
            if(fabs(v) > 0) {
                double x = fabs( (v - db[i]) / v );
                if(x > 0.0001) {
                    cerr << "backprop gradient deviates too much from the numerical estimate" << endl;
                }
            }
        }

        // 2. perturb each w[i][j] by +/- eps
        for(int i = 0;i < w.row;i++) {
            for(int j = 0;j < w.col;j++) {
                double tmp = w[i][j];
                // +eps
                w[i][j] = tmp + EPS;
                Vector output1 = forward(layerList, nLayer, input);
                double err1 = costFun(output1, y);
                // -eps
                w[i][j] = tmp - EPS;
                Vector output2 = forward(layerList, nLayer, input);
                double err2 = costFun(output2, y);

                // restore the original value
                w[i][j] = tmp;

                // numerical estimate of the gradient
                double v = (err1 - err2) / (2 * EPS);
                if(fabs(v) > 0) {
                    double x = fabs( (v - dw[i][j]) / v );
                    if(x > 0.0001) {
                        cerr << "backprop gradient deviates too much from the numerical estimate" << endl;
                    }
                }
            }
        }
    }
};

ostream& operator<<(ostream& out, Layer& layer)
{
    out << "Layer {" << endl;
    out << "w = " << layer.w << endl;
    out << "b = " << layer.b << endl;
    out << "z = " << layer.z << endl;
    out << "a = " << layer.a << endl;
    out << "delta = " << layer.delta << endl;
    out << "}" << endl;
    return out;
}

Vector forward(Layer layerList[], int nLayer, Vector input)
{
    Vector tmp = input;
    for(int i = 0;i < nLayer;i++) {
        tmp = layerList[i](tmp);
    }
    return tmp;
}

void backward(Layer layerList[], int nLayer, Vector input, Vector y, CostFun& costFun, double alpha)
{
    // propagate delta backwards
    Layer &lastLayer = layerList[nLayer - 1];
    // for the quadratic cost, for example, this is -(y - a) * f'(z)
    lastLayer.delta = costFun.propagateDelta(lastLayer.a, y) * lastLayer.activator.derive(lastLayer.z);

    for(int i = nLayer - 2;i >= 0;i--) {
        Layer &layer = layerList[i];
        Layer &nextLayer = layerList[i + 1];
        layer.delta = nextLayer.propagateDelta() * layer.activator.derive(layer.z);
    }

    /*
    // check backprop correctness (only once)
    static bool hasDoneBpChecking = false;
    if(!hasDoneBpChecking) {
        hasDoneBpChecking = true;
        for(int i = 0;i < nLayer;i++) {
            layerList[i].checkBp(layerList, nLayer, input, y, costFun, alpha, i == 0 ? input : layerList[i - 1].a);
        }
    }
    //*/

    // update all w and b
    for(int i = 0;i < nLayer;i++) {
        layerList[i].updateParameters(alpha, i == 0 ? input : layerList[i - 1].a);
    }
}

// read a big-endian (MSB-first) integer
int MsbInt(char buf[], int len=4)
{
    int base = 1;
    int ret = 0;
    for(int i = len - 1;i >= 0;i--) {
        ret += (unsigned char)buf[i] * base;
        base *= 256;
    }
    return ret;
}

vector<int> ReadMnistLabels(string fileName)
{
    vector<int> ret;
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    if(magic != 0x00000801) {
        cerr << "incorrect label file magic number" << endl;
    }

    // num of labels
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);

    while(nImages--) {
        ret.push_back(ifs.get());
    }

    return ret;
}

vector<Vector> ReadMnistData(string fileName)
{
    vector<Vector> ret;
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    if(magic != 0x00000803) {
        cerr << "incorrect data file magic number" << endl;
    }

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);

    int row, col;
    ifs.read(buf, 4);
    row = MsbInt(buf);
    ifs.read(buf, 4);
    col = MsbInt(buf);
    if(row != 28 || col != 28) {
        cerr << "incorrect image size" << endl;
    }

    while(nImages--) {
        Vector image(row * col);
        for(int i = 0;i < row * col;i++) {
            image[i] = ifs.get() / 256.0; // normalize to [0, 1)
        }
        ret.push_back(image);
    }

    return ret;
}

// turn digit labels into one-hot vectors of length k
vector<Vector> translateLabels(vector<int> labels, int k=10)
{
    vector<Vector> ret;
    for(int i = 0;i < (int)labels.size();i++) {
        Vector tmp(k);
        assert(labels[i] >= 0 && labels[i] < k);
        tmp[labels[i]] = 1;
        ret.push_back(tmp);
    }
    return ret;
}

int getMaxIdx(Vector x)
{
    int maxIdx = 0;
    double maxV = x[0];
    for(int i = 0;i < x.size;i++) {
        if(x[i] > maxV) {
            maxV = x[i];
            maxIdx = i;
        }
    }
    return maxIdx;
}

int main()
{
    srand(1000);

    cout << "Loading data" << endl;
    // load the data
    vector<int> trainLabels = ReadMnistLabels("mnist/train-labels-idx1-ubyte");
    vector<Vector> trainData = ReadMnistData("mnist/train-images-idx3-ubyte");
    vector<Vector> trainLabels2 = translateLabels(trainLabels);

    vector<int> testLabels = ReadMnistLabels("mnist/t10k-labels-idx1-ubyte");
    vector<Vector> testData = ReadMnistData("mnist/t10k-images-idx3-ubyte");

    // input is 28 x 28, output is 10 classes
    int nInput = 28 * 28;
    int nOutput = 10;

    // NN architecture
    Layer layerList[] = {
        Layer(nInput, nOutput, ReluActivatorSingleton),
        //Layer(100, nOutput, ReluActivatorSingleton)
    };

    // cost function
    CostFun &costFun = SoftmaxCostFunSingleton;

    // number of layers, excluding the input layer
    int nLayer = sizeof(layerList) / sizeof(layerList[0]);

    int M = trainData.size();
    int T = testData.size();

    // start training
    cout << "Start training" << endl;
    time_t fullStartedAt = time(NULL);
    for(int step = 0;step < 100000;step++) {
        time_t startedAt = time(NULL);

        double avgError = 0;

        for(int i = 0;i < M;i++) {
            Vector x = trainData[i];
            Vector y = trainLabels2[i];

            Vector output = forward(layerList, nLayer, x);
            double error = costFun(output, y);
            //cout << output << " " << y << " " << error << endl;
            avgError += error;

            backward(layerList, nLayer, x, y, costFun, 0.001);
        }
        avgError /= M;

        time_t endedAt = time(NULL);

        cout << "step=" << step << " time_cost=" << (endedAt - startedAt) << " avgErr=" << avgError << " ";

        // validate on the train set
        int nTotal = 0;
        int nGood = 0;
        for(int i = 0;i < M;i++) {
            Vector x = trainData[i];
            Vector output = forward(layerList, nLayer, x);
            int maxIdx = getMaxIdx(output);
            if(maxIdx == trainLabels[i]) {
                nGood++;
            }
            nTotal++;
        }
        cout << "train_accuracy " << nGood << "/" << nTotal << "=" << nGood*1.0/nTotal << " ";
        bool doBreak = false;
        if(nGood * 1.0 / nTotal > 0.95) {
            doBreak = true;
        }

        // evaluate on the test set
        nTotal = 0;
        nGood = 0;
        for(int i = 0;i < T;i++) {
            Vector x = testData[i];
            Vector output = forward(layerList, nLayer, x);
            int maxIdx = getMaxIdx(output);
            if(maxIdx == testLabels[i]) {
                nGood++;
            }
            nTotal++;
        }
        cout << "test_accuracy " << nGood << "/" << nTotal << "=" << nGood*1.0/nTotal;
        cout << "\n";

        if(doBreak) {
            break;
        }
    }

    time_t fullEndedAt = time(NULL);
    cout << "Total cost " << (fullEndedAt - fullStartedAt) << " seconds" << endl;

    return 0;
}
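The listing should build with something like g++ -std=c++11 -O2 (shared_ptr with an array deleter needs C++11), with the four MNIST files unpacked into ./mnist/.

As a teaser for the CUDA plan above, here is a minimal sketch of what the hot loop, the w * in + b inside Layer::operator(), might look like as a GPU kernel with one thread per output neuron. This is only my guess at a first port, not tested code; matvec_kernel and the d_* device pointers are illustrative names, and memory transfers and error checking are omitted.

// Hypothetical sketch of the w * in + b step from Layer::operator() as a CUDA
// kernel, one thread per output neuron. matvec_kernel and the d_* pointers
// are made-up names for illustration.
__global__ void matvec_kernel(const double *w, const double *in, const double *b,
                              double *z, int row, int col)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= row) return;

    double acc = b[i];
    for (int j = 0; j < col; j++) {
        acc += w[i * col + j] * in[j];   // same row-major layout as Matrix
    }
    z[i] = acc;
}

// Host side, assuming d_w, d_in, d_b, d_z were allocated with cudaMalloc and
// filled with cudaMemcpy:
//   int threads = 256;
//   int blocks = (row + threads - 1) / threads;
//   matvec_kernel<<<blocks, threads>>>(d_w, d_in, d_b, d_z, row, col);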