高性能计算-优化-循环展开性能测试(4)

  1. 测试目标:测试for循环展开优化的提升效率
#include <cstdio>
#include <ctime>
#include <cstdlib>

/*
测试一维和二维循环展开效率区别
*/
#include <cstdio>
#include <ctime>
#include <cstdlib>

/*
测试一维和二维循环展开效率区别
*/
/*
 * Converts two clock() readings into the elapsed time in seconds.
 */
static double elapsed_seconds(clock_t begin, clock_t end)
{
	return static_cast<double>(end - begin) / CLOCKS_PER_SEC;
}

/*
 * Benchmark: sums the same 2000x2000 data set four different ways and prints
 * each sum together with the time measured by clock():
 *   1. 2-D array, nested loops
 *   2. 1-D array, single flat loop
 *   3. 1-D array, single flat loop manually unrolled 4x
 *   4. 1-D array, nested loops with a computed flat index
 *
 * Returns 0.
 */
int main()
{
	const int kDim = 2000;             // rows == columns
	const int kCount = kDim * kDim;    // total number of elements

	// Each array is ~32 MB. As automatic (stack) variables they exceed the
	// default stack limit of common platforms and crash the program on entry,
	// so they are given static storage duration instead. Static objects are
	// zero-initialized, matching the original "= {0}".
	static double arr1[kDim][kDim];
	static double arr2[kCount];

	// Fill both arrays with identical pseudo-random values in [0, 6].
	for (int i = 0; i < kDim; i++)
	{
		for (int j = 0; j < kDim; j++)
		{
			arr1[i][j] = static_cast<float>(rand()) / RAND_MAX * 6.0f;
			arr2[kDim * i + j] = arr1[i][j];
		}
	}

	// Case 1: 2-D array, nested loops.
	double sum1 = 0.0;
	const clock_t start1 = clock();
	for (int i = 0; i < kDim; i++)
	{
		for (int j = 0; j < kDim; j++)
			sum1 += arr1[i][j];
	}
	const clock_t end1 = clock();

	// Case 2: 1-D array, single flat loop.
	double sum2 = 0.0;
	const clock_t start2 = clock();
	for (int i = 0; i < kCount; i++)
	{
		sum2 += arr2[i];
	}
	const clock_t end2 = clock();

	// Case 3: 1-D array, flat loop unrolled 4x. kCount is a multiple of 4, so
	// no remainder loop is needed. A single accumulator is used on purpose so
	// the additions happen in the same order as in case 2.
	double sum3 = 0.0;
	const clock_t start3 = clock();
	for (int i = 0; i < kCount; i += 4)
	{
		sum3 += arr2[i];
		sum3 += arr2[i + 1];
		sum3 += arr2[i + 2];
		sum3 += arr2[i + 3];
	}
	const clock_t end3 = clock();

	// Case 4: 1-D array, nested loops with a computed flat index.
	double sum4 = 0.0;
	const clock_t start4 = clock();
	for (int i = 0; i < kDim; i++)
	{
		for (int j = 0; j < kDim; j++)
			sum4 += arr2[i * kDim + j];
	}
	const clock_t end4 = clock();

	printf("2 dim arr sum1:%lf, cost:%lf s\n", sum1, elapsed_seconds(start1, end1));
	printf("1 dim arr sum2:%lf, cost:%lf s\n", sum2, elapsed_seconds(start2, end2));
	printf("1 dim arr sum3:%lf, 4x cost:%lf s\n", sum3, elapsed_seconds(start3, end3));
	printf("1 dim arr sum4:%lf, double loop cost:%lf s\n", sum4, elapsed_seconds(start4, end4));
	return 0;
}
  2. 测试数据
//4x O0优化
2 dim arr sum1:11999297.167960, cost:0.021406 s
1 dim arr sum2:11999297.167960, cost:0.020094 s
1 dim arr sum3:11999297.167960, 4x cost:0.022917 s

2 dim arr sum1:11999297.167960, cost:0.022119 s
1 dim arr sum2:11999297.167960, cost:0.021239 s
1 dim arr sum3:11999297.167960, 4x cost:0.023906 s

2 dim arr sum1:11999297.167960, cost:0.023547 s
1 dim arr sum2:11999297.167960, cost:0.021229 s
1 dim arr sum3:11999297.167960, 4x cost:0.021825 s

2 dim arr sum1:11999297.167960, cost:0.023784 s
1 dim arr sum2:11999297.167960, cost:0.021436 s
1 dim arr sum3:11999297.167960, 4x cost:0.021964 s

//2x O0优化
2 dim arr sum1:11999297.167960, cost:0.021334 s
1 dim arr sum2:11999297.167960, cost:0.019663 s
1 dim arr sum3:11999297.167960, 2x cost:0.021414 s

2 dim arr sum1:11999297.167960, cost:0.021309 s
1 dim arr sum2:11999297.167960, cost:0.019634 s
1 dim arr sum3:11999297.167960, 2x cost:0.021390 s

2 dim arr sum1:11999297.167960, cost:0.022325 s
1 dim arr sum2:11999297.167960, cost:0.021339 s
1 dim arr sum3:11999297.167960, 2x cost:0.021407 s

//4x O3优化
2 dim arr sum1:11999297.167960, cost:0.009233 s
1 dim arr sum2:11999297.167960, cost:0.008192 s
1 dim arr sum3:11999297.167960, 4x cost:0.008167 s

2 dim arr sum1:11999297.167960, cost:0.009286 s
1 dim arr sum2:11999297.167960, cost:0.008169 s
1 dim arr sum3:11999297.167960, 4x cost:0.008215 s

2 dim arr sum1:11999297.167960, cost:0.009352 s
1 dim arr sum2:11999297.167960, cost:0.008152 s
1 dim arr sum3:11999297.167960, 4x cost:0.008134 s

//4x Ofast优化
2 dim arr sum1:11999297.167960, cost:0.007150 s
1 dim arr sum2:11999297.167960, cost:0.005946 s
1 dim arr sum3:11999297.167960, 2x cost:0.005869 s

2 dim arr sum1:11999297.167960, cost:0.006816 s
1 dim arr sum2:11999297.167960, cost:0.005367 s
1 dim arr sum3:11999297.167960, 4x cost:0.005357 s

2 dim arr sum1:11999297.167960, cost:0.007004 s
1 dim arr sum2:11999297.167960, cost:0.005852 s
1 dim arr sum3:11999297.167960, 4x cost:0.005808 s

//4x O0 双层一维数组测试
2 dim arr sum1:11999297.167960, cost:0.022291 s
1 dim arr sum2:11999297.167960, cost:0.021354 s
1 dim arr sum3:11999297.167960, 4x cost:0.022473 s
1 dim arr sum4:11999297.167960, double loop cost:0.020899 s

2 dim arr sum1:11999297.167960, cost:0.022396 s
1 dim arr sum2:11999297.167960, cost:0.021465 s
1 dim arr sum3:11999297.167960, 4x cost:0.022987 s
1 dim arr sum4:11999297.167960, double loop cost:0.022999 s

2 dim arr sum1:11999297.167960, cost:0.021562 s
1 dim arr sum2:11999297.167960, cost:0.019639 s
1 dim arr sum3:11999297.167960, 4x cost:0.021876 s
1 dim arr sum4:11999297.167960, double loop cost:0.021314 s

//4x O3 双层一维数组测试
2 dim arr sum1:11999297.167960, cost:0.009570 s
1 dim arr sum2:11999297.167960, cost:0.008306 s
1 dim arr sum3:11999297.167960, 4x cost:0.008309 s
1 dim arr sum4:11999297.167960, double loop cost:0.008273 s

2 dim arr sum1:11999297.167960, cost:0.009535 s
1 dim arr sum2:11999297.167960, cost:0.008302 s
1 dim arr sum3:11999297.167960, 4x cost:0.008289 s
1 dim arr sum4:11999297.167960, double loop cost:0.008264 s

2 dim arr sum1:11999297.167960, cost:0.009337 s
1 dim arr sum2:11999297.167960, cost:0.008195 s
1 dim arr sum3:11999297.167960, 4x cost:0.008210 s
1 dim arr sum4:11999297.167960, double loop cost:0.008193 s

  3. 测试结论
    (1)循环展开减少了for循环的条件转移指令,但这对性能的提升并不明显;其更大的意义在于提高循环体内指令的计算效率,例如便于向量化优化。向量化优化可以通过编译器优化参数 O2、O3、Ofast 自动实现,也可以通过代码级循环展开实现(仍需配合编译器优化参数),代码级循环展开的提升约 0.2-0.3%。
    (2)双层循环与单层循环处理相同的数据,循环体执行次数相同,但双层循环增加了循环条件分支指令的数量:无编译优化(O0)时双层循环性能下降约0.6-0.8%,O3优化下双层循环反而提升约 0.4-0.6%。
posted @ 2024-10-15 23:43  安洛8  阅读(71)  评论(0)    收藏  举报