(原)测试微软PPL的并行计算parfor(parallel_for)

转载请注明出处:

http://www.cnblogs.com/darkknightzh/p/4988264.html

参考网址:

关于mt19937:http://www.cnblogs.com/egmkang/archive/2012/09/06/2673253.html

 

代码如下:

  1 #include "stdafx.h"
  2 #include <iostream>
  3 #include <random>     // mt19937的头文件
  4 #include <ppl.h>      // parfor的头文件
  5 #include <windows.h>  // QueryPerformanceFrequency等函数的头文件
  6 
  7 using namespace concurrency; // parfor使用
  8 using namespace std;
  9 
 10 
 11 // 分配内存
 12 void AllocMatrix(double** m, size_t n)
 13 {
 14     *m = new double[n*n];
 15     memset(*m, 0, sizeof(double)*n*n);
 16 }
 17 
 18 
 19 // 初始化矩阵内容
 20 template <class Gen>
 21 void IniMatrix(double* m, size_t n, Gen& gen)
 22 {
 23     for (size_t i = 0; i < n; ++i)
 24     {
 25         for (size_t j = 0; j < n; ++j)
 26         {
 27             m[i*n + j] = static_cast<double>(gen());
 28         }
 29     }
 30 }
 31 
 32 
 33 // 释放内存
 34 void FreeMatrix(double** m)
 35 {
 36     if (nullptr != *m)
 37     {
 38         delete[](*m);
 39         (*m) = nullptr;
 40     }
 41 }
 42 
 43 
 44 // 矩阵相乘,使用for
 45 void matrixMultiplyFor(double* res, const double* m1, const double* m2, size_t n)
 46 {
 47     for (size_t i = 0; i < n; i++)
 48     {
 49         for (size_t j = i; j < n; j++)
 50         {
 51             double temp = 0;
 52             for (size_t k = 0; k < n; k++)
 53             {
 54                 temp += m1[i * n + k] * m2[k * n + j];
 55             }
 56             res[i*n + j] = temp;
 57         }
 58     }
 59 }
 60 
 61 
 62 // 矩阵相乘,外层使用parfor
 63 void matrixMultiplyParForOuter(double* res, const double* m1, const double* m2, size_t n)
 64 {
 65     parallel_for(size_t(0), n, [&](size_t i)
 66     {
 67         for (size_t j = i; j < n; j++)
 68         {
 69             double temp = 0;
 70             for (size_t k = 0; k < n; k++)
 71             {
 72                 temp += m1[i * n + k] * m2[k * n + j];
 73             }
 74             res[i*n + j] = temp;
 75         }
 76     });
 77 }
 78 
 79 
 80 // 矩阵相乘,内层使用parfor
 81 void matrixMultiplyParForInner(double* res, const double* m1, const double* m2, size_t n)
 82 {
 83     for (size_t i = 0; i < n; i++)
 84     {
 85         parallel_for(size_t(i), n, [&](size_t j)
 86         {
 87             double temp = 0;
 88             for (size_t k = 0; k < n; k++)
 89             {
 90                 temp += m1[i * n + k] * m2[k * n + j];
 91             }
 92             res[i*n + j] = temp;
 93         });
 94     }
 95 }
 96 
 97 
 98 // 测试矩阵相乘,使用for的时间
 99 double testmatrixMultiplyFor(double* res, const double* m1, const double* m2, size_t n)
100 {
101     LARGE_INTEGER nFreq, nBeginTime, nEndTime;
102     QueryPerformanceFrequency(&nFreq);
103     QueryPerformanceCounter(&nBeginTime);
104 
105     matrixMultiplyFor(res, m1, m2, n);
106 
107     QueryPerformanceCounter(&nEndTime);
108     return (double)(nEndTime.QuadPart - nBeginTime.QuadPart) * 1000 / (double)nFreq.QuadPart;
109 }
110 
111 
112 // 测试矩阵相乘,外层使用parfor的时间
113 double testmatrixMultiplyParForOuter(double* res, const double* m1, const double* m2, size_t n)
114 {
115     LARGE_INTEGER nFreq, nBeginTime, nEndTime;
116     QueryPerformanceFrequency(&nFreq);
117     QueryPerformanceCounter(&nBeginTime);
118 
119     matrixMultiplyParForOuter(res, m1, m2, n);
120 
121     QueryPerformanceCounter(&nEndTime);
122     return (double)(nEndTime.QuadPart - nBeginTime.QuadPart) * 1000 / (double)nFreq.QuadPart;
123 }
124 
125 
126 // 测试矩阵相乘,内层使用parfor的时间
127 double testmatrixMultiplyParForInner(double* res, const double* m1, const double* m2, size_t n)
128 {
129     LARGE_INTEGER nFreq, nBeginTime, nEndTime;
130     QueryPerformanceFrequency(&nFreq);
131     QueryPerformanceCounter(&nBeginTime);
132 
133     matrixMultiplyParForInner(res, m1, m2, n);
134 
135     QueryPerformanceCounter(&nEndTime);
136     return (double)(nEndTime.QuadPart - nBeginTime.QuadPart) * 1000 / (double)nFreq.QuadPart;
137 }
138 
139 
140 // 主函数
141 int _tmain(int argc, _TCHAR* argv[])
142 {
143     const size_t n = 1024;
144     double* dM1 = NULL;
145     double* dM2 = NULL;
146     double* dRes1 = NULL;
147     double* dRes2 = NULL;
148     double* dRes3 = NULL;
149 
150     random_device rd;
151     mt19937 gen(rd());
152 
153     AllocMatrix(&dM1, n);
154     AllocMatrix(&dM2, n);
155     IniMatrix(dM1, n, gen);
156     IniMatrix(dM2, n, gen);
157 
158     AllocMatrix(&dRes1, n);
159     AllocMatrix(&dRes2, n);
160     AllocMatrix(&dRes3, n);
161     
162     double dTimeFor = testmatrixMultiplyFor(dRes1, dM1, dM2, n);
163     double dTimeParForOuter = testmatrixMultiplyParForOuter(dRes2, dM1, dM2, n);
164     double dTimeParForInner = testmatrixMultiplyParForInner(dRes3, dM1, dM2, n);
165 
166     printf("time(ms)\nfor: %f \nparforOunter: %f \nparforInner: %f\n", dTimeFor, dTimeParForOuter, dTimeParForInner);
167 
168     FreeMatrix(&dM1);
169     FreeMatrix(&dM2);
170     FreeMatrix(&dRes1);
171     FreeMatrix(&dRes2);
172     FreeMatrix(&dRes3);
173 
174     return 0;
175 }

 

debug

time(ms)

for: 7761.769099

parforOunter: 3416.670736

parforInner: 3423.701265

 

release

time(ms)

for: 3884.167485

parforOunter: 1062.581817

parforInner: 1083.642302

说明:此处测试outer和inner是因为,matlab里面,使用outer形式的并行计算,使用parfor后,如果循环比对类似这种三角形式,最终有些核先跑完结果,有些核后跑完结果,导致出现一个核累死累活地跑程序、另外N-1个核围观的状态,使最终的计算时间变长(不过在matlab中未测试outer和inner使用parfor的时间对比)。

但是,在C++里面,不知道是否是优化的原因,outer使用parfor比inner使用parfor要快。此处还测试了n=2048,结果也是outer比inner的形式要快。

posted on 2015-11-23 13:56  darkknightzh  阅读(795)  评论(0)  编辑  收藏  举报

导航