openMP的多线程开销
2011-04-19 18:53 blou 阅读(2037) 评论(0) 收藏 举报
许多人都在关注openMP,原因在于它比多线程API好用,优势就在于多平台通用,随时调用,快速见效,也不需要过多的线程管理。但是openMP也不是万能的,如果用得不好,速度还会下降,这就需要在使用前做好耗时测算:对于耗时不多的代码,用了openMP,开关线程的开销反而会让程序耗时更多。本文主要讨论一下作者使用中的一点心得,欢迎讨论。这里使用VS(Visual Studio)作为开发环境。
1, openMP用法
网上有详细讲解,参考网址http://topic.csdn.net/u/20080512/16/cce9fb90-2e5b-443b-9c1b-d531673049e9.html
这里以#pragma omp parallel for为例,简单用法是,
1),在属性->C/C++->Language->openMP support选项,选择/openmp
2),在for循环前添加#pragma omp parallel for,注意,后面必须紧跟for循环语句,因为#pragma omp parallel for的作用域仅限于后面的第一个for循环;
这样就完成了调用。
2, 耗时测算
测试发现parallel for多线程的开销与编译选项O2、O0无关,但是与编译器关系很大。
以附件的程序为例,作者首先测试了微软编译器对parallel for多线程的开启和关闭开销,平均值在2.5ms,实际值随计算机状态有波动。另外测试到thread1的计算耗时比thread0多大约10%。
接下来以intel 编译器11.1版做测试,发现parallel for多线程的开启和关闭开销,平均值下降到0.25ms,大约是微软编译器的1/10!intel厉害啊!250us就完成了一个线程的开辟和关闭!有兴趣的朋友可以测试一下Windows API开关线程的耗时,差距更大。
3, 小结一下
1, 选择O2优化(即速度优先),intel编译器中openMP开辟、关闭线程的时间约250us,VC编译器中openMP开辟、关闭线程的时间约2500us;因此在使用中需要注意划分线程的粒度,计算好原单线程运行的耗时,看是否值得使用openMP;
2, 选择intel编译器,不论O2还是O0, openMP开辟和关闭线程的时间基本都是250us,只是O2会把与最终结果无关的运算跳过不算;
3, 使用openMP,需注意主线程以外的线程运算时会多花费10%左右的时间。因此使用中要注意把这个额外的开销算入openMP的影响;
4, 选择粒度的四条准则:
1), 如果循环次数较多而总耗时不多,则粒度不能太小;
2), 如果循环次数不多而总耗时不多,则不能用openMP;
3), 如果循环次数不多而总耗时较多,则粒度可以尽量大;
4), 如果循环次数较多而总耗时较多,则最合适使用openMP。
附件代码
1 // testopenmp.cpp : Defines the entry point for the console application.
2 //
3
4 #include "stdafx.h"
5 #include "omp.h"
6 #include <time.h>
7 #include <conio.h>
8 #include <Windows.h>
9
10 #define TEST_LOOP 200000
11 #define LOOP_NUM 2
12
13 void test()
14 {
15 ////计时器
16 LARGE_INTEGER lpFrequency;
17 __int64 qt1 = 0;
18 __int64 qt2 = 0;
19 __int64 subqt = 0;
20 double dft = 0.0;
21 double dff = 0.0;
22 double dfm = 0.0;
23 time_t rawtime;
24 struct tm *timeinfo;
25 int PRtime = 0;
26 bool bret = FALSE;
27
28 int a = 0;
29 bret = QueryPerformanceFrequency(&lpFrequency);
30 dff = (double)lpFrequency.QuadPart;
31 QueryPerformanceCounter(&lpFrequency);//获得初始值
32 qt1 = lpFrequency.QuadPart;
33
34 for (int i = 0; i < TEST_LOOP; i++)
35 {
36 a = i+1;
37 }
38 ////计时终止
39 QueryPerformanceCounter( &lpFrequency );//获得终止值
40 qt2 = lpFrequency.QuadPart;
41 subqt = qt2 - qt1;
42 dfm = (double)subqt;
43 dft = dfm / dff;//获得对应的时间值
44
45 PRtime = (int)(dft*1000000);///计算牌识时间,微秒
46
47 // printf("Time = %d, Thdnum=%d us\n", PRtime, omp_get_thread_num());
48 }
49
50 int _tmain(int argc, _TCHAR* argv[])
51 {
52 ////计时器
53 LARGE_INTEGER lpFrequency;
54 __int64 qt1 = 0;
55 __int64 qt2 = 0;
56 __int64 subqt = 0;
57 double dft = 0.0;
58 double dff = 0.0;
59 double dfm = 0.0;
60 time_t rawtime;
61 struct tm *timeinfo;
62 int PRtime = 0;
63 bool bret = FALSE;
64 int itime1 = 0;
65 int itime2 = 0;
66
67 int i = 0;
68 int j = 0;
69 //多线程运行,测试时间
70 //开始计时
71 bret = QueryPerformanceFrequency(&lpFrequency);
72 dff = (double)lpFrequency.QuadPart;
73 QueryPerformanceCounter(&lpFrequency);//获得初始值
74 qt1 = lpFrequency.QuadPart;
75
76 //#pragma omp parallel num_threads(8)
77 #pragma omp parallel for
78 for ( int j = 0; j < LOOP_NUM; j++ )
79 {
80 test();
81 }
82 ////计时终止
83 QueryPerformanceCounter( &lpFrequency );//获得终止值
84 qt2 = lpFrequency.QuadPart;
85 subqt = qt2 - qt1;
86 dfm = (double)subqt;
87 dft = dfm / dff;//获得对应的时间值
88
89 itime1 = (int)(dft*1000000);///计算牌识时间,微秒
90 // printf("********并行计算终止*******\n");
91
92
93 //不做优化,测试单线程耗时
94 //开始计时
95 bret = QueryPerformanceFrequency(&lpFrequency);
96 dff = (double)lpFrequency.QuadPart;
97 QueryPerformanceCounter(&lpFrequency);//获得初始值
98 qt1 = lpFrequency.QuadPart;
99
100 for ( int j = 0; j < LOOP_NUM; j++ )
101 {
102 test();
103 }
104 ////计时终止
105 QueryPerformanceCounter( &lpFrequency );//获得终止值
106 qt2 = lpFrequency.QuadPart;
107 subqt = qt2 - qt1;
108 dfm = (double)subqt;
109 dft = dfm / dff;//获得对应的时间值
110
111 itime2 = (int)(dft*1000000);///计算牌识时间,微秒
112 // printf("********串行计算终止*******\n");
113
114 //单次运算耗时
115 //开始计时,使用上次的终止时间
116 //开始计时
117 bret = QueryPerformanceFrequency(&lpFrequency);
118 dff = (double)lpFrequency.QuadPart;
119 QueryPerformanceCounter(&lpFrequency);//获得初始值
120 qt1 = lpFrequency.QuadPart;
121
122 test();
123 ////计时终止
124 QueryPerformanceCounter( &lpFrequency );//获得终止值
125 qt2 = lpFrequency.QuadPart;
126 subqt = qt2 - qt1;
127 dfm = (double)subqt;
128 dft = dfm / dff;//获得对应的时间值
129
130 PRtime = (int)(dft*1000000);///计算牌识时间,微秒
131
132 // printf("********单次计算终止*******\n");
133 printf("并行计算总耗时 = %8d us\n", itime1);
134 printf("单线程运算总耗时 = %8d us\n", itime2);
135 printf("单次循环耗时 = %8d us\n", PRtime);
136 printf("循环%4d次耗时 = %8d us\n", LOOP_NUM, PRtime*LOOP_NUM);
137
138 getch();
139 return 0;
140 }
浙公网安备 33010602011771号