OpenCL性能优化实例研究系列4:if-else/switch-case优化案例
1. if-else 两个调用相同的Function,但是参数不同, 通常情况性能下降一半
2 使用Constants 或者Load/Fetch来替代大量的if-elseif实现查找表 lookup table
3 使用多个if语句替代if-elseif以使用少量的ALU来替代分支语句。
4 switch-case优化
1. if-else 两个调用相同的Function,但是参数不同, 通常情况性能下降一半
曾经遇到这样的一个OpenCL Kernel,
// Baseline form: both branches invoke the same function_call; the only
// difference between them is the order of the last two arguments (e and f).
if( condition ) {
result = function_call(a,b,c,d,f,e);
}
else
{
result = function_call(a,b,c,d,e,f);
}
// Whichever branch ran, its result is accumulated into sum.
sum += result;
其中function_call执行HD5870的 1700条 ALU指令和320条CF指令。这段代码在CPU single Core中没有任何性能下降,因为每个线程都是独立执行的。但是GPU以SIMD的方式运行,性能会非常差,拿HD5870为例,每个wavefront有64个work-items, 如果其中任何n个work-item满足if( condition ), 那么剩余64-n个work-item就会满足条件else. 这样
n of 64 workitems 执行条件 result = function_call(a,b,c,d,f,e); 只有 n/64的系统执行效率。
(64-n) workitems 必然执行条件 result = function_call(a,b,c,d,e,f); 只有 (64-n)/64的系统执行效率。
总的执行时间会变成两倍的result = function_call(a,b,c,d,f,e),50%的执行效率。
优化办法:参数重命名,在if-else之外调用result = function_call(a,b,c,d,e,f);
// Decide the order of the last two arguments up front so that function_call
// is issued exactly once, outside of any branch.
// The chosen values go into fresh locals rather than swapping e and f in
// place: e and f keep their values after the call, exactly as in the
// if/else form, so later code (or a later loop iteration) still sees them
// unswapped.
float4 arg5 = condition ? f : e;
float4 arg6 = condition ? e : f;
result = function_call(a,b,c,d,arg5,arg6);
sum += result;
这样一个简单的改动,就能让这段代码的整体性能提高约一倍。
2 使用Constants 或者Load/Fetch来替代大量的if-elseif实现查找表 lookup table
DX11 SDK有一个例子是实现BC6/BC7这两种纹理文件的压缩编码和解压缩的。这个例子实际上效率是相当的低。
D:\Program Files\Microsoft DirectX SDK (June 2010)\Samples\C++\Direct3D11\BC6HBC7EncoderDecoder11\BC6HDecode.hlsl
其中函数extract_mode_index最为典型:
// Masks applied to block.r: entry 0 keeps the low 2 bits, entry 1 keeps the
// low 5 bits.
static const uint candidateModeMask[2] = { 0x03, 0x1f };
// Bit patterns compared against the masked value. The position of a pattern
// in this array is the mode index returned for it.
static const uint candidateModeMemory[14] = { 0x00, 0x01,
0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E, 0x03, 0x07, 0x0B, 0x0F };
// Returns the position (0..13) in candidateModeMemory of the pattern found in
// the low bits of block.r, testing the candidates one after another.
// NOTE(review): when none of the fourteen patterns matches, mode_index is
// returned without ever having been assigned.
int extract_mode_index( uint4 block )
{
int mode_index;
// First try the two patterns that are only 2 bits wide.
uint type = block.r & candidateModeMask[0];
if ( type == candidateModeMemory[0] )
{
mode_index = 0;
}
else if ( type == candidateModeMemory[1] )
{
mode_index = 1;
}
else
{
// Neither 2-bit pattern matched: widen the test to the low 5 bits and walk
// the remaining twelve patterns in array order.
type = block.r & candidateModeMask[1];
if ( type == candidateModeMemory[2] )
{
mode_index = 2;
}
else if ( type == candidateModeMemory[3] )
{
mode_index = 3;
}
else if ( type == candidateModeMemory[4] )
{
mode_index = 4;
}
else if ( type == candidateModeMemory[5] )
{
mode_index = 5;
}
else if ( type == candidateModeMemory[6] )
{
mode_index = 6;
}
else if ( type == candidateModeMemory[7] )
{
mode_index = 7;
}
else if ( type == candidateModeMemory[8] )
{
mode_index = 8;
}
else if ( type == candidateModeMemory[9] )
{
mode_index = 9;
}
else if ( type == candidateModeMemory[10] )
{
mode_index = 10;
}
else if ( type == candidateModeMemory[11] )
{
mode_index = 11;
}
else if ( type == candidateModeMemory[12] )
{
mode_index = 12;
}
else if ( type == candidateModeMemory[13] )
{
mode_index = 13;
}
// No final else: an unmatched 5-bit value leaves mode_index unassigned.
}
return mode_index;
}
将它转化为一小段OpenCL代码,Stream KernelAnalyzer编译为46条ALU和37条CF指令,预计HD5870处理能力为15335M threads/sec。如果计算最坏情况,也就是经过每个分支,总共14个分支,性能会下降到约1/7,即2200M threads/sec。
/*
 * Branch-chain version of the mode lookup.
 *
 * Reads input[gid], tests its low bits against candidate bit patterns one
 * after another and stores the index of the matching pattern in output[gid].
 * The two 2-bit patterns are tested first; if neither matches, the low
 * 5 bits are compared against the remaining twelve patterns. A pattern's
 * position in candidateModeMemory is its mode index.
 *
 * mode_index starts at -1 so that a value matching none of the fourteen
 * patterns stores a defined "no match" result instead of the contents of an
 * uninitialized variable.
 */
__kernel void extract_mode_index( __global int* input , __global int* output )
{
    int gid = get_global_id(0);
    int mode_index = -1;
    const uint candidateModeMask[2] = { 0x03, 0x1f };
    const uint candidateModeMemory[14] = { 0x00, 0x01,
        0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E, 0x03, 0x07, 0x0B, 0x0F };
    uint block_r = input[gid];

    /* 2-bit patterns first. */
    uint type = block_r & candidateModeMask[0];
    if ( type == candidateModeMemory[0] )
    {
        mode_index = 0;
    }
    else if ( type == candidateModeMemory[1] )
    {
        mode_index = 1;
    }
    else
    {
        /* Widen to 5 bits and walk the remaining patterns in array order. */
        type = block_r & candidateModeMask[1];
        if ( type == candidateModeMemory[2] )
        {
            mode_index = 2;
        }
        else if ( type == candidateModeMemory[3] )
        {
            mode_index = 3;
        }
        else if ( type == candidateModeMemory[4] )
        {
            mode_index = 4;
        }
        else if ( type == candidateModeMemory[5] )
        {
            mode_index = 5;
        }
        else if ( type == candidateModeMemory[6] )
        {
            mode_index = 6;
        }
        else if ( type == candidateModeMemory[7] )
        {
            mode_index = 7;
        }
        else if ( type == candidateModeMemory[8] )
        {
            mode_index = 8;
        }
        else if ( type == candidateModeMemory[9] )
        {
            mode_index = 9;
        }
        else if ( type == candidateModeMemory[10] )
        {
            mode_index = 10;
        }
        else if ( type == candidateModeMemory[11] )
        {
            mode_index = 11;
        }
        else if ( type == candidateModeMemory[12] )
        {
            mode_index = 12;
        }
        else if ( type == candidateModeMemory[13] )
        {
            mode_index = 13;
        }
    }
    output[gid] = mode_index;
}
使用查找表, Stream KernelAnalyzer编译为14条ALU和5条CF指令,1条VFetch指令,性能稳定为19429M threads/sec。
/*
 * Mode index for every possible value of the low 5 bits of a block word.
 *
 * Each row covers four consecutive 5-bit values (low 2 bits = 0, 1, 2, 3):
 *   - low 2 bits == 0  -> mode 0   (2-bit pattern 0x00)
 *   - low 2 bits == 1  -> mode 1   (2-bit pattern 0x01)
 *   - 0x02,0x06,0x0A,0x0E,0x12,0x16,0x1A,0x1E -> modes 2..9
 *   - 0x03,0x07,0x0B,0x0F                     -> modes 10..13
 *   - 0x13,0x17,0x1B,0x1F match no pattern    -> -1
 *
 * The table stores mode indices (not bit patterns) and has one entry per
 * 5-bit value, so the masked word can index it directly without ever
 * reading past the end. Program-scope data must live in __constant memory.
 */
__constant int modeIndexTable[32] = {
    /* 0x00 */ 0, 1, 2, 10,
    /* 0x04 */ 0, 1, 3, 11,
    /* 0x08 */ 0, 1, 4, 12,
    /* 0x0C */ 0, 1, 5, 13,
    /* 0x10 */ 0, 1, 6, -1,
    /* 0x14 */ 0, 1, 7, -1,
    /* 0x18 */ 0, 1, 8, -1,
    /* 0x1C */ 0, 1, 9, -1
};

/*
 * Lookup-table version of the mode lookup: stores in output[gid] the mode
 * index selected by the low 5 bits of input[gid], or -1 when those bits
 * match none of the fourteen patterns. A single table fetch replaces the
 * whole comparison chain, so every work-item follows the same path.
 */
__kernel void extract_mode_index( __global int* input , __global int* output )
{
    int gid = get_global_id(0);
    uint block_r = input[gid];
    output[gid] = modeIndexTable[block_r & 0x1f];
}
你可以将这些代码输入Stream KernelAnalyzer 1.6仔细体会。另外input/output会带来额外的7条ALU、1条VFETCH和1条MEM_RAT指令。这里获得的性能优化效果是巨大的。
3 使用多个if语句替代if-else if以使用少量的ALU来替代分支语句。
在CPU的实现中,我们通常把出现频率最高的条件作为多重 if -else if的第一条语句,或者来实现一些分段查找功能。对于SIMD的并行程序,通常很难取得好的执行效率。下面分析一个实例
/*
 * Returns the slot (0..5) that normalizedPos falls into, using a cascade of
 * dependent threshold tests against descending boundaries: the first
 * boundary that normalizedPos reaches (>=) decides the slot, and a value
 * below all five tested boundaries gets slot 5.
 * Only lanes s0..s4 of the boundary vector are read.
 */
unsigned int calSlotIdInQuadrant6(float normalizedPos)
{
    const float8 bounds = (float8)(0.965926f, 0.866025f, 0.707107f, 0.5f,
                                   0.258819f, 0.f, 0.f, -1.f);
    unsigned int slot;

    if (normalizedPos >= bounds.s0) {
        slot = 0;
    } else if (normalizedPos >= bounds.s1) {
        slot = 1;
    } else if (normalizedPos >= bounds.s2) {
        slot = 2;
    } else if (normalizedPos >= bounds.s3) {
        slot = 3;
    } else if (normalizedPos >= bounds.s4) {
        slot = 4;
    } else {
        /* Below every tested boundary. */
        slot = 5;
    }
    return slot;
}
使用IF语句来替代 if -else if,
/*
 * Returns the slot (0..5) that normalizedPos falls into.
 *
 * The slot equals the number of boundaries that normalizedPos lies strictly
 * below, so it is computed with five independent tests instead of an
 * if / else-if cascade. Every test must be a plain `if`: chaining any of
 * them with `else` skips later boundaries and under-counts the slot.
 *
 * Results: >= 0.965926 -> 0, >= 0.866025 -> 1, >= 0.707107 -> 2,
 *          >= 0.5 -> 3, >= 0.258819 -> 4, otherwise 5.
 * A NaN input fails every `<` comparison and therefore yields 0.
 */
unsigned int calSlotIdInQuadrant6(float normalizedPos)
{
    /* Slot boundaries in descending order. */
    const float bound0 = 0.965926f;
    const float bound1 = 0.866025f;
    const float bound2 = 0.707107f;
    const float bound3 = 0.5f;
    const float bound4 = 0.258819f;

    unsigned int rtnId = 0;

    if (normalizedPos < bound4)
    {
        rtnId++;
    }
    if (normalizedPos < bound3)
    {
        rtnId++;
    }
    if (normalizedPos < bound2)
    {
        rtnId++;
    }
    if (normalizedPos < bound1)
    {
        rtnId++;
    }
    if (normalizedPos < bound0)
    {
        rtnId++;
    }
    return rtnId;
}
4 switch-case优化
下面这个实例取材自DX11 SDK SubD11. switch-case总共有16个不同的分支,调用了三个不同的函数。我们可以将它合并为3个函数的调用。
D:\Program Files\Microsoft DirectX SDK (June 2010)\Samples\C++\Direct3D11\SubD11\SubD11.hlsl
原始Tessellation Shader:
// Computes the position of output control point i and returns it in
// Output.vPosition. The switch maps each of the sixteen values of i (0..15)
// to one of three helpers (ComputeInteriorVertex, ComputeCornerVertex4444,
// ComputeEdgeVertex) together with a constant first argument; any other
// value of i leaves the position at (0,0,0).
// PatchID is not referenced in the body.
BEZIER_CONTROL_POINT SubDToBezierHS4444( InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> p,
uint i : SV_OutputControlPointID,
uint PatchID : SV_PrimitiveID )
{
// Valences and prefixes are Constant for this case (4,4,4,4)
static const uint Val[4] = (uint[4])uint4(4,4,4,4);
static const uint Prefixes[4] = (uint[4])uint4(7,10,13,16);
// Outputs of ComputeCornerVertex4444; only CornerB is read afterwards here.
float3 CornerB = float3(0,0,0);
float3 CornerU = float3(0,0,0);
float3 CornerV = float3(0,0,0);
BEZIER_CONTROL_POINT Output;
Output.vPosition = float3(0,0,0);
// !! PERFORMANCE NOTE: As mentioned above, this switch statement generates
// inefficient code for the sake of readability.
switch( i )
{
// Interior vertices
case 5:
Output.vPosition = ComputeInteriorVertex( 0, Val, p );
break;
case 6:
Output.vPosition = ComputeInteriorVertex( 1, Val, p );
break;
case 10:
Output.vPosition = ComputeInteriorVertex( 2, Val, p );
break;
case 9:
Output.vPosition = ComputeInteriorVertex( 3, Val, p );
break;
// Corner vertices
case 0:
ComputeCornerVertex4444( 0, CornerB, CornerU, CornerV, p, Val, Prefixes );
Output.vPosition = CornerB;
break;
case 3:
ComputeCornerVertex4444( 1, CornerB, CornerU, CornerV, p, Val, Prefixes );
Output.vPosition = CornerB;
break;
case 15:
ComputeCornerVertex4444( 2, CornerB, CornerU, CornerV, p, Val, Prefixes );
Output.vPosition = CornerB;
break;
case 12:
ComputeCornerVertex4444( 3, CornerB, CornerU, CornerV, p, Val, Prefixes );
Output.vPosition = CornerB;
break;
// Edge vertices
case 1:
Output.vPosition = ComputeEdgeVertex( 0, p, Val, Prefixes );
break;
case 2:
Output.vPosition = ComputeEdgeVertex( 1, p, Val, Prefixes );
break;
case 13:
Output.vPosition = ComputeEdgeVertex( 2, p, Val, Prefixes );
break;
case 14:
Output.vPosition = ComputeEdgeVertex( 3, p, Val, Prefixes );
break;
case 4:
Output.vPosition = ComputeEdgeVertex( 4, p, Val, Prefixes );
break;
case 8:
Output.vPosition = ComputeEdgeVertex( 5, p, Val, Prefixes );
break;
case 7:
Output.vPosition = ComputeEdgeVertex( 6, p, Val, Prefixes );
break;
case 11:
Output.vPosition = ComputeEdgeVertex( 7, p, Val, Prefixes );
break;
}
return Output;
}
优化后,我们将Switch合并为三个函数的调用,不过对程序的整体性能帮助不大,因为这个Shader不是主要的Bottleneck.
// Computes the position of output control point i and returns it in
// Output.vPosition.
//
// The switch only classifies i into a vertex kind (interior / corner / edge)
// and the index passed to that kind's helper; the helpers themselves are
// called once each, after the switch, so the sixteen cases share three call
// sites instead of sixteen.
//
// A value of i that matches no case keeps vertexKind at "none", so no helper
// runs and the position stays (0,0,0).
// PatchID is not referenced in the body.
BEZIER_CONTROL_POINT SubDToBezierHS4444( InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> p,
                                         uint i : SV_OutputControlPointID,
                                         uint PatchID : SV_PrimitiveID )
{
    // Valences and prefixes are constant for this case (4,4,4,4)
    static const uint Val[4] = (uint[4])uint4(4,4,4,4);
    static const uint Prefixes[4] = (uint[4])uint4(7,10,13,16);

    // Outputs of ComputeCornerVertex4444; only CornerB is read afterwards.
    float3 CornerB = float3(0,0,0);
    float3 CornerU = float3(0,0,0);
    float3 CornerV = float3(0,0,0);

    BEZIER_CONTROL_POINT Output;
    Output.vPosition = float3(0,0,0);

    // vertexKind: 0 = interior, 1 = corner, 2 = edge, 3 = none (no helper).
    // vertexIndex: first argument forwarded to the selected helper.
    uint vertexKind = 3;
    uint vertexIndex = 0;

    switch( i )
    {
    // Interior vertices
    case 5:  vertexKind = 0; vertexIndex = 0; break;
    case 6:  vertexKind = 0; vertexIndex = 1; break;
    case 10: vertexKind = 0; vertexIndex = 2; break;
    case 9:  vertexKind = 0; vertexIndex = 3; break;

    // Corner vertices
    case 0:  vertexKind = 1; vertexIndex = 0; break;
    case 3:  vertexKind = 1; vertexIndex = 1; break;
    case 15: vertexKind = 1; vertexIndex = 2; break;
    case 12: vertexKind = 1; vertexIndex = 3; break;

    // Edge vertices
    case 1:  vertexKind = 2; vertexIndex = 0; break;
    case 2:  vertexKind = 2; vertexIndex = 1; break;
    case 13: vertexKind = 2; vertexIndex = 2; break;
    case 14: vertexKind = 2; vertexIndex = 3; break;
    case 4:  vertexKind = 2; vertexIndex = 4; break;
    case 8:  vertexKind = 2; vertexIndex = 5; break;
    case 7:  vertexKind = 2; vertexIndex = 6; break;
    case 11: vertexKind = 2; vertexIndex = 7; break;

    default: break;
    }

    // The three kinds are mutually exclusive, so at most one helper runs.
    if( vertexKind == 0 )
    {
        Output.vPosition = ComputeInteriorVertex( vertexIndex, Val, p );
    }
    if( vertexKind == 1 )
    {
        ComputeCornerVertex4444( vertexIndex, CornerB, CornerU, CornerV, p, Val, Prefixes );
        Output.vPosition = CornerB;
    }
    if( vertexKind == 2 )
    {
        Output.vPosition = ComputeEdgeVertex( vertexIndex, p, Val, Prefixes );
    }
    return Output;
}

浙公网安备 33010602011771号