(原創) 如何加速for loop? (C/C++) (OpenMP) (template) (TMP) (C2H)
Absract
據同學說是聯發科的面試考題,以下是我野人獻曝的解法,若有大俠有任何更好的方,歡迎華山論劍。
Introduction
原題如下
有一個for迴圈,從0加到100,可是我覺得它不夠快,要怎樣才能讓她更快呢?(不可以用數學公式)
for(i = 0; i <= 100; i++)
s = s + i;
我能想到的解法
1.調compiler參數,vc++和gcc都有-o可以調,可以optimize for speed,是最懶的方式。
2.善用多核心:
1 /*
2 (C) OOMusou 2008 http://oomusou.cnblogs.com
3
4 Filename : parallel_for_100.c
5 Compiler : Visual C++ 8.0
6 Description : Demo how to parallel sum 0 to 100
7 Release : 03/14/2008 1.0
8 */
9 #include <stdio.h>
10 #include "omp.h"
11
12 int main() {
13 int i;
14 int s = 0;
15
16 printf("Number of core : %d\n", omp_get_num_procs());
17 printf("Number of threads : %d\n", omp_get_num_threads());
18
19 #pragma omp parallel for
20 for(i = 0; i <= 100; ++i) {
21 printf("thread %d : %d\n", omp_get_thread_num(), s = s + i);
22 }
23 printf("%d\n", s);
24 }
2 (C) OOMusou 2008 http://oomusou.cnblogs.com
3
4 Filename : parallel_for_100.c
5 Compiler : Visual C++ 8.0
6 Description : Demo how to parallel sum 0 to 100
7 Release : 03/14/2008 1.0
8 */
9 #include <stdio.h>
10 #include "omp.h"
11
12 int main() {
13 int i;
14 int s = 0;
15
16 printf("Number of core : %d\n", omp_get_num_procs());
17 printf("Number of threads : %d\n", omp_get_num_threads());
18
19 #pragma omp parallel for
20 for(i = 0; i <= 100; ++i) {
21 printf("thread %d : %d\n", omp_get_thread_num(), s = s + i);
22 }
23 printf("%d\n", s);
24 }
執行結果
Number of core : 2
Number of threads : 1
thread 0 : 0
thread 1 : 51
thread 0 : 52
thread 0 : 106
thread 1 : 104
thread 0 : 109
thread 1 : 162
thread 0 : 166
thread 1 : 220
thread 0 : 225
thread 1 : 280
thread 0 : 286
thread 1 : 342
thread 0 : 349
thread 1 : 406
thread 0 : 414
thread 1 : 472
thread 0 : 481
thread 1 : 540
thread 0 : 550
thread 1 : 610
thread 0 : 621
thread 1 : 682
thread 0 : 694
thread 1 : 756
thread 0 : 769
thread 1 : 832
thread 0 : 846
thread 1 : 910
thread 0 : 925
thread 1 : 990
thread 0 : 1006
thread 1 : 1072
thread 0 : 1089
thread 1 : 1156
thread 0 : 1174
thread 1 : 1242
thread 0 : 1261
thread 1 : 1330
thread 0 : 1350
thread 1 : 1420
thread 0 : 1441
thread 1 : 1512
thread 0 : 1534
thread 1 : 1606
thread 0 : 1629
thread 1 : 1702
thread 0 : 1726
thread 1 : 1800
thread 0 : 1825
thread 1 : 1900
thread 0 : 1926
thread 1 : 2002
thread 0 : 2029
thread 1 : 2106
thread 0 : 2134
thread 1 : 2212
thread 0 : 2241
thread 1 : 2320
thread 0 : 2350
thread 1 : 2430
thread 0 : 2461
thread 1 : 2542
thread 0 : 2574
thread 1 : 2656
thread 0 : 2689
thread 1 : 2772
thread 0 : 2806
thread 1 : 2890
thread 0 : 2925
thread 1 : 3010
thread 0 : 3046
thread 1 : 3132
thread 0 : 3169
thread 1 : 3256
thread 0 : 3294
thread 1 : 3382
thread 0 : 3421
thread 1 : 3510
thread 0 : 3550
thread 1 : 3640
thread 0 : 3681
thread 1 : 3772
thread 0 : 3814
thread 1 : 3906
thread 0 : 3949
thread 1 : 4042
thread 0 : 4086
thread 1 : 4180
thread 0 : 4225
thread 1 : 4320
thread 0 : 4366
thread 1 : 4462
thread 0 : 4509
thread 1 : 4606
thread 0 : 4654
thread 1 : 4752
thread 0 : 4801
thread 1 : 4900
thread 0 : 4950
thread 1 : 5050
5050
Number of threads : 1
thread 0 : 0
thread 1 : 51
thread 0 : 52
thread 0 : 106
thread 1 : 104
thread 0 : 109
thread 1 : 162
thread 0 : 166
thread 1 : 220
thread 0 : 225
thread 1 : 280
thread 0 : 286
thread 1 : 342
thread 0 : 349
thread 1 : 406
thread 0 : 414
thread 1 : 472
thread 0 : 481
thread 1 : 540
thread 0 : 550
thread 1 : 610
thread 0 : 621
thread 1 : 682
thread 0 : 694
thread 1 : 756
thread 0 : 769
thread 1 : 832
thread 0 : 846
thread 1 : 910
thread 0 : 925
thread 1 : 990
thread 0 : 1006
thread 1 : 1072
thread 0 : 1089
thread 1 : 1156
thread 0 : 1174
thread 1 : 1242
thread 0 : 1261
thread 1 : 1330
thread 0 : 1350
thread 1 : 1420
thread 0 : 1441
thread 1 : 1512
thread 0 : 1534
thread 1 : 1606
thread 0 : 1629
thread 1 : 1702
thread 0 : 1726
thread 1 : 1800
thread 0 : 1825
thread 1 : 1900
thread 0 : 1926
thread 1 : 2002
thread 0 : 2029
thread 1 : 2106
thread 0 : 2134
thread 1 : 2212
thread 0 : 2241
thread 1 : 2320
thread 0 : 2350
thread 1 : 2430
thread 0 : 2461
thread 1 : 2542
thread 0 : 2574
thread 1 : 2656
thread 0 : 2689
thread 1 : 2772
thread 0 : 2806
thread 1 : 2890
thread 0 : 2925
thread 1 : 3010
thread 0 : 3046
thread 1 : 3132
thread 0 : 3169
thread 1 : 3256
thread 0 : 3294
thread 1 : 3382
thread 0 : 3421
thread 1 : 3510
thread 0 : 3550
thread 1 : 3640
thread 0 : 3681
thread 1 : 3772
thread 0 : 3814
thread 1 : 3906
thread 0 : 3949
thread 1 : 4042
thread 0 : 4086
thread 1 : 4180
thread 0 : 4225
thread 1 : 4320
thread 0 : 4366
thread 1 : 4462
thread 0 : 4509
thread 1 : 4606
thread 0 : 4654
thread 1 : 4752
thread 0 : 4801
thread 1 : 4900
thread 0 : 4950
thread 1 : 5050
5050
由以上結果可以發現,thread 0由0開始跑,thread 1由51開始跑,但之後就隨機分配到兩個核心,所以充分利用兩個核心來運算。
3.inline assembly:我asm忘了差不多了, 一時也寫不出來XD
4.改用unsigned int,而不用int,這種寫法經我測試可以快一倍速度,主要是int須動用到有號數運算,速度較慢,詳細原理請參考(原創) 無號數及有號數的乘加運算電路設計 (初級) (IC Design) (Verilog) (Linux)
1
/*
2
(C) OOMusou 2007 http://oomusou.cnblogs.com
3
4
Filename : for_loop_original.c
5
Compiler : Visual C++ 8.0
6
Description : Demo how to optimize for loop
7
Release : 01/05/2008 1.0
8
*/
9
10
#include <stdio.h>
11
12
int foo(int n) {
13
unsigned int i, s = 0;
14
for(i = 0; i <= n; i++)
15
s = s + i;
16
17
return s;
18
}
19
20
int main() {
21
int s;
22
s = foo(100000000000);
23
printf("%d\n", s);
24
}
/* 2
(C) OOMusou 2007 http://oomusou.cnblogs.com3

4
Filename : for_loop_original.c5
Compiler : Visual C++ 8.06
Description : Demo how to optimize for loop7
Release : 01/05/2008 1.08
*/9

10
#include <stdio.h>11

12
int foo(int n) {13
unsigned int i, s = 0;14
for(i = 0; i <= n; i++)15
s = s + i;16
17
return s;18
}19

20
int main() {21
int s;22
s = foo(100000000000);23
printf("%d\n", s);24
}5.使用Loop unwinding,請參考Wiki Loop unwinding
1
/*
2
(C) OOMusou 2007 http://oomusou.cnblogs.com
3
4
Filename : for_loop_unwiding.c
5
Compiler : Visual C++ 8.0
6
Description : Demo how to optimize for loop
7
Release : 01/11/2008 1.0
8
*/
9
10
#include <stdio.h>
11
12
int foo(int n) {
13
unsigned int i, s = 0;
14
for(i = 1; i <= n; i+=10) {
15
s += i;
16
s += i+1;
17
s += i+2;
18
s += i+3;
19
s += i+4;
20
s += i+5;
21
s += i+6;
22
s += i+7;
23
s += i+8;
24
s += i+9;
25
}
26
27
return s;
28
}
29
30
int main() {
31
int s;
32
s = foo(100);
33
printf("%d\n", s);
34
}
/* 2
(C) OOMusou 2007 http://oomusou.cnblogs.com3

4
Filename : for_loop_unwiding.c5
Compiler : Visual C++ 8.06
Description : Demo how to optimize for loop7
Release : 01/11/2008 1.08
*/9

10
#include <stdio.h>11

12
int foo(int n) {13
unsigned int i, s = 0;14
for(i = 1; i <= n; i+=10) {15
s += i;16
s += i+1;17
s += i+2;18
s += i+3;19
s += i+4;20
s += i+5;21
s += i+6;22
s += i+7;23
s += i+8;24
s += i+9;25
}26
27
return s;28
}29

30
int main() {31
int s;32
s = foo(100);33
printf("%d\n", s);34
}6.使用C++的Template Meta Programming方式,由於這是一種在compile time的運算,而不是在run time計算,所以在run time速度很快。
1
/*
2
(C) OOMusou 2007 http://oomusou.cnblogs.com
3
4
Filename : for_loop_tmp_cpp.cpp
5
Compiler : Visual C++ 8.0
6
Description : Demo how to optimize for loop by TMP
7
Release : 01/05/2008 1.0
8
*/
9
10
#include <iostream>
11
12
using namespace std;
13
14
template <unsigned n>
15
struct foo {
16
enum { value = n + foo<n-1>::value };
17
};
18
19
template<>
20
struct foo<0> {
21
enum { value = 0 };
22
};
23
24
int main() {
25
cout << foo<100>::value << endl;
26
}
/* 2
(C) OOMusou 2007 http://oomusou.cnblogs.com3

4
Filename : for_loop_tmp_cpp.cpp5
Compiler : Visual C++ 8.06
Description : Demo how to optimize for loop by TMP7
Release : 01/05/2008 1.08
*/9

10
#include <iostream>11

12
using namespace std;13

14
template <unsigned n>15
struct foo {16
enum { value = n + foo<n-1>::value };17
};18

19
template<>20
struct foo<0> {21
enum { value = 0 };22
};23

24
int main() {25
cout << foo<100>::value << endl;26
}C++的template類似C的macro,是否可用c來達成呢?我試著用以下的C,但無法compile成功,主要是C並不支援recursive macro,所以無福消受。
1
#include <stdio.h>
2
#define foo(x) ((x)?((x)+foo(x-1)):(0))
3
4
int main() {
5
int s = foo(100);
6
printf("%d\n", s);
7
}
#include <stdio.h> 2
#define foo(x) ((x)?((x)+foo(x-1)):(0))3

4
int main() { 5
int s = foo(100); 6
printf("%d\n", s); 7
}7.使用硬體加速編譯器,請參閱(轉貼) 快速可編程單晶片系統開發與ANSI C語言功能的硬體加速 (IC Design) (C/C++) (News),(原創) 如何使用Nios II C2H compiler? (IC Design) (DE2) (Nios II) (Quartus II) (SOPC Builder)。
Conclusion
以上野人獻曝了幾種方法,還請各位大俠指教,不勝感激。
See Also
(原創) 無號數及有號數的乘加運算電路設計 (初級) (IC Design) (Verilog) (Linux)
(原創) Template Metaprogramming系列[1] : 使用遞迴/??(Recursive) (高級) (ISO C++) (C++/CLI) (TMP)
(轉貼) 快速可編程單晶片系統開發與ANSI C語言功能的硬體加速 (IC Design) (C/C++) (News),(原創) 如何使用Nios II C2H compiler? (IC Design) (DE2) (Nios II) (Quartus II) (SOPC Builder)。
Reference
Wiki Loop unwinding


浙公网安备 33010602011771号