OpenCL性能优化实例研究系列4：if-else/switch-case优化案例

原文地址

1. if-else 两个调用相同的Function，但是参数不同, 通常情况性能下降一半

2 使用Constants 或者Load/Fetch来替代大量的if-elseif实现查找表 lookup table

3 使用多个if语句替代if-elseif以使用少量的ALU来替代分支语句。

4 switch-case优化

1. if-else 两个调用相同的Function，但是参数不同, 通常情况性能下降一半

曾经遇到这样的一个OpenCL Kernel,

// Baseline (divergent) form: both branches call the same function_call and
// differ only in the order of the last two arguments (f,e versus e,f).
// The call therefore appears once in each branch of the if/else.
if( condition ) {

result = function_call(a,b,c,d,f,e);

}

else

{

result = function_call(a,b,c,d,e,f);

}

// The result of whichever branch ran is accumulated into sum.
sum += result;

其中function_call执行HD5870的 1700条 ALU指令和320条CF指令。这段代码在CPU single Core中没有任何性能下降，因为每个线程都是独立执行的。但是GPU以SIMD的方式运行，性能会非常差，拿HD5870为例，每个wavefront有64个work-items, 如果其中任何n个work-item满足if( condition ), 那么剩余64-n个work-item就会满足条件else. 这样

n of 64 workitems 执行条件 result = function_call(a,b,c,d,f,e); 只有 n/64的系统执行效率。

(64-n) workitems 必然执行条件 result = function_call(a,b,c,d,e,f); 只有 (64-n)/64的系统执行效率。

总的执行时间会变成单次 result = function_call(a,b,c,d,f,e) 调用的两倍，即只有50%的执行效率。

优化办法：参数重命名，在if-else之外调用result = function_call(a,b,c,d,e,f);

// Optimized form: only the cheap argument swap stays inside the branch.
// When condition holds, e and f are exchanged through a float4 temporary, so
// the single call below receives them in the (f,e) order of the original
// if-branch; otherwise they are passed unchanged.
// NOTE: unlike the if/else form, e and f remain swapped after this block when
// condition is true.
if( condition ) {

float4 temp = e;

e = f;

f = temp;

}

// One call site serves both paths.
result = function_call(a,b,c,d,e,f);

sum += result;

简单优化就能达到整体优化提高一倍的效果。

2 使用Constants 或者Load/Fetch来替代大量的if-elseif实现查找表 lookup table

DX11 SDK有一个例子是实现BC6/BC7这两种纹理文件的压缩编码和解压缩的。这个例子实际上效率是相当的低。

D:\Program Files\Microsoft DirectX SDK (June 2010)\Samples\C++\Direct3D11\BC6HBC7EncoderDecoder11\BC6HDecode.hlsl

其中函数extract_mode_index最为典型：

// Bit masks applied to block.r: [0] keeps the low 2 bits, [1] keeps the low 5 bits.
static const uint candidateModeMask[2] = { 0x03, 0x1f };
// Bit patterns compared against the masked value. Entries [0..1] are tested
// against the 2-bit mask, entries [2..13] against the 5-bit mask. The position
// of the matching entry is the returned mode index.
static const uint candidateModeMemory[14] = { 0x00, 0x01,
    0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E, 0x03, 0x07, 0x0B, 0x0F };

// Returns the index (0..13) of the candidateModeMemory entry that matches the
// low bits of block.r, or -1 when no entry matches.
//
// The low 2 bits are tested first; if they are neither 0x00 nor 0x01 the low
// 5 bits are compared against entries 2..13 one by one.
//
// The 5-bit values 0x13, 0x17, 0x1B and 0x1F match no entry. Previously
// mode_index was left unassigned on that path and an indeterminate value was
// returned; it is now initialized to -1 so the result is always defined.
int extract_mode_index( uint4 block )
{
    int mode_index = -1;

    uint type = block.r & candidateModeMask[0];
    if ( type == candidateModeMemory[0] )
    {
        mode_index = 0;
    }
    else if ( type == candidateModeMemory[1] )
    {
        mode_index = 1;
    }
    else
    {
        // Low 2 bits are 2 or 3: widen the comparison to the low 5 bits.
        type = block.r & candidateModeMask[1];
        if ( type == candidateModeMemory[2] )
        {
            mode_index = 2;
        }
        else if ( type == candidateModeMemory[3] )
        {
            mode_index = 3;
        }
        else if ( type == candidateModeMemory[4] )
        {
            mode_index = 4;
        }
        else if ( type == candidateModeMemory[5] )
        {
            mode_index = 5;
        }
        else if ( type == candidateModeMemory[6] )
        {
            mode_index = 6;
        }
        else if ( type == candidateModeMemory[7] )
        {
            mode_index = 7;
        }
        else if ( type == candidateModeMemory[8] )
        {
            mode_index = 8;
        }
        else if ( type == candidateModeMemory[9] )
        {
            mode_index = 9;
        }
        else if ( type == candidateModeMemory[10] )
        {
            mode_index = 10;
        }
        else if ( type == candidateModeMemory[11] )
        {
            mode_index = 11;
        }
        else if ( type == candidateModeMemory[12] )
        {
            mode_index = 12;
        }
        else if ( type == candidateModeMemory[13] )
        {
            mode_index = 13;
        }
    }
    return mode_index;
}

将它转化为一小段OpenCL代码，Stream KernelAnalyzer编译为46条ALU和37条CF指令，预计HD5870处理能力为15335M threads/sec。如果计算最坏情况，也就是经过每个分支（总共14个分支），性能会下降到约1/7，即2200M threads/sec。

// If/else-if port of extract_mode_index: one work-item classifies one input word.
//
// input  - one word per work-item; only its low 5 bits are examined.
// output - receives, per work-item, the index (0..13) of the matching
//          candidateModeMemory entry, or -1 when nothing matches.
//
// The 5-bit values 0x13, 0x17, 0x1B and 0x1F match no entry. Previously
// mode_index was stored without ever being assigned on that path; it is now
// initialized to -1 so the stored value is always defined.
__kernel void extract_mode_index( __global int* input , __global int* output )
{
    int gid = get_global_id(0);
    int mode_index = -1;
    // [0] keeps the low 2 bits, [1] keeps the low 5 bits.
    const uint candidateModeMask[2] = { 0x03, 0x1f };
    // Entries [0..1] are compared under the 2-bit mask, [2..13] under the 5-bit mask.
    const uint candidateModeMemory[14] = { 0x00, 0x01,
        0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E, 0x03, 0x07, 0x0B, 0x0F };

    uint block_r = input[gid];
    uint type = block_r & candidateModeMask[0];
    if ( type == candidateModeMemory[0] )
    {
        mode_index = 0;
    }
    else if ( type == candidateModeMemory[1] )
    {
        mode_index = 1;
    }
    else
    {
        // Low 2 bits are 2 or 3: widen the comparison to the low 5 bits.
        type = block_r & candidateModeMask[1];
        if ( type == candidateModeMemory[2] )
        {
            mode_index = 2;
        }
        else if ( type == candidateModeMemory[3] )
        {
            mode_index = 3;
        }
        else if ( type == candidateModeMemory[4] )
        {
            mode_index = 4;
        }
        else if ( type == candidateModeMemory[5] )
        {
            mode_index = 5;
        }
        else if ( type == candidateModeMemory[6] )
        {
            mode_index = 6;
        }
        else if ( type == candidateModeMemory[7] )
        {
            mode_index = 7;
        }
        else if ( type == candidateModeMemory[8] )
        {
            mode_index = 8;
        }
        else if ( type == candidateModeMemory[9] )
        {
            mode_index = 9;
        }
        else if ( type == candidateModeMemory[10] )
        {
            mode_index = 10;
        }
        else if ( type == candidateModeMemory[11] )
        {
            mode_index = 11;
        }
        else if ( type == candidateModeMemory[12] )
        {
            mode_index = 12;
        }
        else if ( type == candidateModeMemory[13] )
        {
            mode_index = 13;
        }
    }
    output[gid] = mode_index;
}

使用查找表， Stream KernelAnalyzer编译为14条ALU和5条CF指令，1条VFetch指令，性能稳定为19429M threads/sec。

// Inverse lookup table: entry [p] is the mode index that the if/else-if
// version of extract_mode_index produces for the 5-bit pattern p.
//   low 2 bits == 0        -> 0
//   low 2 bits == 1        -> 1
//   0x02,0x06,...,0x1E     -> 2..9
//   0x03,0x07,0x0B,0x0F    -> 10..13
//   0x13,0x17,0x1B,0x1F    -> -1 (no pattern matches in the if/else-if version)
//
// The previous table held the 14 bit patterns themselves and was indexed with
// a 5-bit value (up to 31), which read past its end and returned a pattern
// rather than a mode index. Program-scope data is placed in __constant.
__constant int candidateModeIndex[32] = {
    /* 0x00 */ 0, 1, 2, 10,
    /* 0x04 */ 0, 1, 3, 11,
    /* 0x08 */ 0, 1, 4, 12,
    /* 0x0C */ 0, 1, 5, 13,
    /* 0x10 */ 0, 1, 6, -1,
    /* 0x14 */ 0, 1, 7, -1,
    /* 0x18 */ 0, 1, 8, -1,
    /* 0x1C */ 0, 1, 9, -1
};

// Table-driven extract_mode_index: one work-item classifies one input word
// with a single masked table fetch and no branches.
//
// input  - one word per work-item; only its low 5 bits are examined.
// output - receives, per work-item, the mode index 0..13, or -1 for the four
//          5-bit patterns that match no mode.
__kernel void extract_mode_index( __global int* input , __global int* output )
{
    int gid = get_global_id(0);

    uint block_r = input[gid];

    // The mask keeps the index within 0..31, the bounds of the table.
    output[gid] = candidateModeIndex[block_r & 0x1f];
}

你可以将这些代码输入Stream Kernel Analyzer 1.6仔细体会。另外input/output会带来额外的7条ALU、1条VFETCH和1条MEM_RAT。这里获得的性能优化效果是巨大的。

3 使用多个if语句替代if-else if以使用少量的ALU来替代分支语句。

在CPU的实现中，我们通常把出现频率最高的条件作为多重 if -else if的第一条语句，或者来实现一些分段查找功能。对于SIMD的并行程序，通常很难取得好的执行效率。下面分析一个实例

// Maps normalizedPos to a slot id 0..5 using an if/else-if chain.
// The bounds in lanes s0..s4 are in descending order; the function returns the
// index of the first bound that normalizedPos is greater than or equal to, or
// 5 when normalizedPos is below all five (including when every comparison is
// false, e.g. for NaN).
unsigned int calSlotIdInQuadrant6(float normalizedPos)
{
// Only lanes s0..s4 are read below; s5..s7 are unused by this function.
// NOTE(review): s0..s4 numerically equal cos(15), cos(30), cos(45), cos(60),
// cos(75) degrees - presumably 15-degree angular slots; confirm with the caller.
float8 kSlotBound6 = (float8)(0.965926f,0.866025f,0.707107f,0.5f,0.258819f,0.f,0.f,-1.f);
// Default result when no bound is reached.
unsigned int rtnId =5;
if(normalizedPos>=kSlotBound6.s0)
{
   rtnId = 0;
}
else if(normalizedPos>=kSlotBound6.s1)
{
   rtnId = 1;
}
else if(normalizedPos>=kSlotBound6.s2)
{
   rtnId = 2;
}
else if(normalizedPos>=kSlotBound6.s3)
{
   rtnId = 3;
}
else if(normalizedPos>=kSlotBound6.s4)
{
   rtnId = 4;
}
return rtnId;
}

使用IF语句来替代 if -else if，

// Maps normalizedPos to a slot id 0..5 without an else-if chain.
//
// The bounds in lanes s0..s4 are in descending order, so the slot id equals
// the number of bounds that normalizedPos lies strictly below. Each test is an
// independent `if`, so every work-item executes the same five compare-and-add
// steps regardless of its input.
//
// The last two tests used to be `else if`, which stopped the count at 3
// whenever normalizedPos < s2 (e.g. 0.1 returned 3 instead of 5).
//
// NOTE: for NaN every `<` is false, so this returns 0, whereas the
// if/else-if formulation using `>=` returns 5.
unsigned int calSlotIdInQuadrant6(float normalizedPos)
{
    // Only lanes s0..s4 are read; s5..s7 are unused by this function.
    float8 kSlotBound6 = (float8)(0.965926f,0.866025f,0.707107f,0.5f,0.258819f,0.f,0.f,-1.f);
    unsigned int rtnId = 0;
    if(normalizedPos < kSlotBound6.s4)
    {
        rtnId++;
    }
    if(normalizedPos < kSlotBound6.s3)
    {
        rtnId++;
    }
    if(normalizedPos < kSlotBound6.s2)
    {
        rtnId++;
    }
    if(normalizedPos < kSlotBound6.s1)
    {
        rtnId++;
    }
    if(normalizedPos < kSlotBound6.s0)
    {
        rtnId++;
    }
    return rtnId;
}

4 switch-case优化

下面这个实例取材自DX11 SDK SubD11. switch-case总共有16个不同的分支，调用了三个不同的函数。我们可以将它合并为3个函数的调用。

D:\Program Files\Microsoft DirectX SDK (June 2010)\Samples\C++\Direct3D11\SubD11\SubD11.hlsl

原始Tessellation Shader:
// Original (unmerged) form: computes the position of output control point i.
// The switch has 16 cases (i = 0..15); each one calls one of three helpers
// (ComputeInteriorVertex, ComputeCornerVertex4444, ComputeEdgeVertex) with a
// literal index as its first argument. There is no default case, so for any
// other i Output.vPosition keeps its float3(0,0,0) initial value.
// PatchID is not used in this body.
BEZIER_CONTROL_POINT SubDToBezierHS4444( InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> p,
                                     uint i : SV_OutputControlPointID,
                                     uint PatchID : SV_PrimitiveID )
{
    // Valences and prefixes are Constant for this case (4,4,4,4)
    static const uint Val[4] = (uint[4])uint4(4,4,4,4);
    static const uint Prefixes[4] = (uint[4])uint4(7,10,13,16);

    // Passed to ComputeCornerVertex4444; only CornerB is read back in this function.
    float3 CornerB = float3(0,0,0);
    float3 CornerU = float3(0,0,0);
    float3 CornerV = float3(0,0,0);

    BEZIER_CONTROL_POINT Output;
    Output.vPosition = float3(0,0,0);

    // !! PERFORMANCE NOTE: As mentioned above, this switch statement generates
    // inefficient code for the sake of readability.
    switch( i )
    {
    // Interior vertices
    case 5:
        Output.vPosition = ComputeInteriorVertex( 0, Val, p );
        break;
    case 6:
        Output.vPosition = ComputeInteriorVertex( 1, Val, p );
        break;
    case 10:
        Output.vPosition = ComputeInteriorVertex( 2, Val, p );
        break;
    case 9:
        Output.vPosition = ComputeInteriorVertex( 3, Val, p );
        break;

    // Corner vertices
    case 0:
        ComputeCornerVertex4444( 0, CornerB, CornerU, CornerV, p, Val, Prefixes );
        Output.vPosition = CornerB;
        break;
    case 3:
        ComputeCornerVertex4444( 1, CornerB, CornerU, CornerV, p, Val, Prefixes );
        Output.vPosition = CornerB;
        break;
    case 15:
        ComputeCornerVertex4444( 2, CornerB, CornerU, CornerV, p, Val, Prefixes );
        Output.vPosition = CornerB;
        break;
    case 12:
        ComputeCornerVertex4444( 3, CornerB, CornerU, CornerV, p, Val, Prefixes );
        Output.vPosition = CornerB;
        break;

    // Edge vertices
    case 1:
        Output.vPosition = ComputeEdgeVertex( 0, p, Val, Prefixes );
        break;
    case 2:
        Output.vPosition = ComputeEdgeVertex( 1, p, Val, Prefixes );
        break;
    case 13:
        Output.vPosition = ComputeEdgeVertex( 2, p, Val, Prefixes );
        break;
    case 14:
        Output.vPosition = ComputeEdgeVertex( 3, p, Val, Prefixes );
        break;
    case 4:
        Output.vPosition = ComputeEdgeVertex( 4, p, Val, Prefixes );
        break;
    case 8:
        Output.vPosition = ComputeEdgeVertex( 5, p, Val, Prefixes );
        break;
    case 7:
        Output.vPosition = ComputeEdgeVertex( 6, p, Val, Prefixes );
        break;
    case 11:
        Output.vPosition = ComputeEdgeVertex( 7, p, Val, Prefixes );
        break;
    }

    return Output;
}

优化后，我们将Switch合并为三个函数的调用，不过对程序的整体性能帮助不大，因为这个Shader不是主要的Bottleneck.

// Merged form: computes the position of output control point i with exactly
// one call site per helper.
//
// The switch no longer calls the helpers; it only selects
//   t1 - which helper to run (0 = ComputeInteriorVertex,
//        1 = ComputeCornerVertex4444, 2 = ComputeEdgeVertex), and
//   t2 - the index passed as that helper's first argument.
// The three `if` blocks after the switch then perform the single call.
//
// Fixes relative to the previous listing:
//   - the return type was misspelled "EZIER_CONTROL_POINT";
//   - t1/t2 were read uninitialized when i was outside 0..15. t1 now starts at
//     a value that selects no helper, so Output.vPosition stays float3(0,0,0)
//     for such i, matching the unmerged switch.
// PatchID is not used in this body.
BEZIER_CONTROL_POINT SubDToBezierHS4444( InputPatch<VS_CONTROL_POINT_OUTPUT, MAX_POINTS> p,
                                     uint i : SV_OutputControlPointID,
                                     uint PatchID : SV_PrimitiveID )
{
    // Valences and prefixes are Constant for this case (4,4,4,4)
    static const uint Val[4] = (uint[4])uint4(4,4,4,4);
    static const uint Prefixes[4] = (uint[4])uint4(7,10,13,16);

    // Passed to ComputeCornerVertex4444; only CornerB is read back in this function.
    float3 CornerB = float3(0,0,0);
    float3 CornerU = float3(0,0,0);
    float3 CornerV = float3(0,0,0);

    BEZIER_CONTROL_POINT Output;
    Output.vPosition = float3(0,0,0);

    uint t1 = 3;   // helper selector; 3 matches none of the blocks below
    uint t2 = 0;   // index forwarded to the selected helper
    switch( i )
    {
    // Interior vertices
    case 5:  t1 = 0; t2 = 0; break;
    case 6:  t1 = 0; t2 = 1; break;
    case 10: t1 = 0; t2 = 2; break;
    case 9:  t1 = 0; t2 = 3; break;

    // Corner vertices
    case 0:  t1 = 1; t2 = 0; break;
    case 3:  t1 = 1; t2 = 1; break;
    case 15: t1 = 1; t2 = 2; break;
    case 12: t1 = 1; t2 = 3; break;

    // Edge vertices
    case 1:  t1 = 2; t2 = 0; break;
    case 2:  t1 = 2; t2 = 1; break;
    case 13: t1 = 2; t2 = 2; break;
    case 14: t1 = 2; t2 = 3; break;
    case 4:  t1 = 2; t2 = 4; break;
    case 8:  t1 = 2; t2 = 5; break;
    case 7:  t1 = 2; t2 = 6; break;
    case 11: t1 = 2; t2 = 7; break;
    }

    if( t1 == 0 )
    {
        Output.vPosition = ComputeInteriorVertex( t2, Val, p );
    }
    if( t1 == 1 )
    {
        ComputeCornerVertex4444( t2, CornerB, CornerU, CornerV, p, Val, Prefixes );
        Output.vPosition = CornerB;
    }
    if( t1 == 2 )
    {
        Output.vPosition = ComputeEdgeVertex( t2, p, Val, Prefixes );
    }
    return Output;
}

posted @ 2014-03-09 11:05 Charlie@ZJU 阅读(1708) 评论(0) 收藏举报

刷新页面返回顶部

Forly

OpenCL性能优化实例研究系列4：if-else/switch-case优化案例

公告