SwapBuffers的效率问题

最近看了ms实现的opengl 1.1 source,写的非常不错,但也发现了不少问题,而且这个问题在后续版本中并没有改掉(难道ms 为保 d3d 故意的?太恶劣了。。。)
这里就SwapBuffers API的问题简单说一下,希望大家看到他的缺陷,并且避开它。

这是ms opengl 1.1 中的source:

/*
 * Loads a DLL and looks up one export in it.
 *
 *   szDll - DLL name handed to LoadLibraryA.
 *   szAPI - export name handed to GetProcAddress.
 *   phDll - receives the handle returned by LoadLibraryA (NULL when the
 *           load failed).
 *
 * Returns whatever GetProcAddress returns, or NULL when the DLL could not
 * be loaded.  This function never releases the module: *phDll is left set
 * even if GetProcAddress fails, so the handle is the caller's to deal with.
 */
__inline FARPROC GetAPI(char *szDll, char *szAPI, HMODULE *phDll)
{
    *phDll = LoadLibraryA(szDll);

    /* Load failed: *phDll is already NULL, report "no entry point". */
    if (*phDll == NULL)
    {
        return NULL;
    }

    return GetProcAddress(*phDll, szAPI);
}

/***********************************************************************/

/*
 * Forwards the call to the "wglSwapBuffers" export of the DLL named by
 * szOpenGL (a string defined outside this excerpt).
 *
 * Nothing is cached between calls: each invocation goes through GetAPI
 * (LoadLibraryA + GetProcAddress) and then calls FreeLibrary on the handle
 * it got back.
 *
 * Returns the value produced by the resolved function, or FALSE when the
 * entry point could not be resolved.
 */
BOOL WINAPI SwapBuffers(HDC hdc)
{
    HMODULE hDll;   /* always written by GetAPI below, NULL if the load failed */
    PFN5    pfn = (PFN5)GetAPI(szOpenGL, "wglSwapBuffers", &hDll);
    BOOL    bRet = FALSE;

    if (pfn)
    {
        bRet = (*pfn)(hdc);
    }
    
    /* hDll can be non-NULL even when pfn is NULL, so it is checked separately. */
    if (hDll)
    {
        FreeLibrary(hDll);
    }
        
    return bRet;
}

可以看到SwapBuffers API的问题,很低效!为了证实opengl 的后续版本也存在这个问题,我在windows xp sp2 下用ida 5.2 打开gdi32.dll 找到了SwapBuffers 的反汇编代码如下:

.text:77F2599E ; *************** S U B R O U T I N E ***************************************
.text:77F2599E
.text:77F2599E ; Attributes: bp-based frame
.text:77F2599E
.text:77F2599E ; BOOL __stdcall SwapBuffers(HDC)
.text:77F2599E                 public __stdcall SwapBuffers(x)
.text:77F2599E __stdcall SwapBuffers(x) proc near
.text:77F2599E
.text:77F2599E hLibModule      = dword ptr -4
.text:77F2599E arg_0           = dword ptr  8
.text:77F2599E
.text:77F2599E                 mov     edi, edi        ; two-byte no-op at function entry
.text:77F259A0                 push    ebp
.text:77F259A1                 mov     ebp, esp        ; establish the ebp-based frame
.text:77F259A3                 push    ecx             ; reserves the 4-byte slot at [ebp-4] used as hLibModule
.text:77F259A4                 push    esi             ; esi is restored by the pop at 77F259D6
.text:77F259A5                 lea     eax, [ebp+hLibModule] ; eax = address of the local hLibModule
.text:77F259A8                 push    eax             ; int
.text:77F259A9                 push    offset s_Wglswapbuffer ; "wglSwapBuffers"
.text:77F259AE                 push    offset s_Opengl32 ; "OPENGL32"
.text:77F259B3                 call    GetAPI(x,x,x)   ; result comes back in eax
.text:77F259B3
.text:77F259B8                 xor     esi, esi        ; esi = 0: value returned if the call below is skipped
.text:77F259BA                 test    eax, eax
.text:77F259BC                 jz      short loc_77F259C5 ; GetAPI returned 0 -> skip the indirect call
.text:77F259BC
.text:77F259BE                 push    [ebp+arg_0]     ; pass the caller's HDC argument through
.text:77F259C1                 call    eax             ; call the address GetAPI returned
.text:77F259C3                 mov     esi, eax        ; keep its return value in esi
.text:77F259C3
.text:77F259C5
.text:77F259C5 loc_77F259C5:                           ; CODE XREF: SwapBuffers(x)+1Ej
.text:77F259C5                 cmp     [ebp+hLibModule], 0
.text:77F259C9                 jz      short loc_77F259D4 ; no module handle -> nothing to free
.text:77F259C9
.text:77F259CB                 push    [ebp+hLibModule] ; hLibModule
.text:77F259CE                 call    ds:FreeLibrary(x)
.text:77F259CE
.text:77F259D4
.text:77F259D4 loc_77F259D4:                           ; CODE XREF: SwapBuffers(x)+2Bj
.text:77F259D4                 mov     eax, esi        ; return value
.text:77F259D6                 pop     esi
.text:77F259D7                 leave
.text:77F259D8                 retn    4               ; callee pops the single 4-byte argument
.text:77F259D8
.text:77F259D8 __stdcall SwapBuffers(x) endp

可以看到SwapBuffers确实每次调用都在GetAPI,而GetAPI先LoadLibrary,再GetProcAddress;虽然SwapBuffers每次LoadLibrary/FreeLibrary也许只是改变引用计数,但是我想GetProcAddress总是有开销的,即便有Cache机制也会有开销的。追根溯源,我又找到了GetProcAddress的source code,发现GetProcAddress是调用LdrGetProcedureAddress,继续追查,真正干活的是LdrpGetProcedureAddress:

NTSTATUS
LdrpGetProcedureAddress (
    IN PVOID DllHandle,
    IN PANSI_STRING ProcedureName OPTIONAL,
    IN ULONG ProcedureNumber OPTIONAL,
    OUT PVOID *ProcedureAddress,
    IN BOOLEAN RunInitRoutines
    )

/*++

Routine Description:

    This function locates the address of the specified procedure in the
    specified DLL and returns its address.

    The lookup is done by building a thunk (by name or by ordinal) and
    handing it, together with the DLL's export directory, to
    LdrpSnapThunk.  Unless LdrpInLdrInit is set, the whole lookup runs
    with the PEB loader lock held.

Arguments:

    DllHandle - Supplies a handle to the DLL that the address is being
        looked up in.

    ProcedureName - Supplies that address of a string that contains the
        name of the procedure to lookup in the DLL.  If this argument is
        not specified, then the ProcedureNumber is used.

    ProcedureNumber - Supplies the procedure number to lookup.  If
        ProcedureName is specified, then this argument is ignored.
        Otherwise, it specifies the procedure ordinal number to locate
        in the DLL.  Zero is rejected.

    ProcedureAddress - Returns the address of the procedure found in
        the DLL.  Only written when the final status is a success code.

    RunInitRoutines - When TRUE, after the lookup the last entry of the
        init-order module list is checked and, if it has not been
        processed yet, LdrpRunInitializeRoutines is called.

Return Value:

    STATUS_INVALID_PARAMETER - no name was given and ProcedureNumber is
        zero, or the heap allocation for the name copy failed.

    STATUS_DLL_NOT_FOUND - LdrpCheckForLoadedDllHandle did not find
        DllHandle.

    STATUS_PROCEDURE_NOT_FOUND - the DLL has no export directory.

    Otherwise the status from LdrpSnapThunk, replaced by the status of
    LdrpRunInitializeRoutines (or the exception code it raised) when
    init routines were run.

--*/

{
    NTSTATUS st;
    UCHAR FunctionNameBuffer[64];
    PUCHAR src, dst;
    ULONG cb, ExportSize;
    PLDR_DATA_TABLE_ENTRY LdrDataTableEntry;
    IMAGE_THUNK_DATA Thunk;
    PVOID ImageBase;
    PIMAGE_IMPORT_BY_NAME FunctionName;
    PIMAGE_EXPORT_DIRECTORY ExportDirectory;
    PLIST_ENTRY Next;

    if (ShowSnaps) {
        DbgPrint("LDR: LdrGetProcedureAddress by ");
    }

    FunctionName = NULL;
    if ( ARGUMENT_PRESENT(ProcedureName) ) {

        if (ShowSnaps) {
            DbgPrint("NAME - %s\n", ProcedureName->Buffer);
        }

        //
        // BUGBUG need STRING to PSZ
        //
        // The name is copied into an IMAGE_IMPORT_BY_NAME record so it can
        // be referenced from Thunk below.  Short names use the on-stack
        // buffer; anything that would not fit goes to the process heap.
        //


        if (ProcedureName->Length >= sizeof( FunctionNameBuffer )-1 ) {
            FunctionName = RtlAllocateHeap(RtlProcessHeap(), MAKE_TAG( TEMP_TAG ),ProcedureName->Length+1+sizeof(USHORT));
            if ( !FunctionName ) {
                return STATUS_INVALID_PARAMETER;
                }
        } else {
            FunctionName = (PIMAGE_IMPORT_BY_NAME) FunctionNameBuffer;
        }

        FunctionName->Hint = 0;

        cb = ProcedureName->Length;
        src = ProcedureName->Buffer;
        dst = FunctionName->Name;

        // NOTE(review): ImageBase is assigned here but never read again in
        // this function.
        ImageBase = NtCurrentPeb()->ImageBaseAddress;

        // Byte-by-byte copy of Length characters, then explicit terminator.
        while (cb--) {
            *dst++ = *src++;
        }
        *dst = '\0';

        Thunk.u1.AddressOfData = FunctionName;

    } else {
             if (ShowSnaps) {
                 DbgPrint("ORDINAL - %lx\n", ProcedureNumber);
             }

             // Lookup by ordinal: flag the thunk as an ordinal reference.
             if (ProcedureNumber) {
                 Thunk.u1.Ordinal = ProcedureNumber | IMAGE_ORDINAL_FLAG;
             } else {
                      return STATUS_INVALID_PARAMETER;
                    }
          }

    // From here on the loader lock is held (unless already inside loader
    // init); it is released in the finally block below.
    if ( LdrpInLdrInit == FALSE ) {
        RtlEnterCriticalSection((PRTL_CRITICAL_SECTION)NtCurrentPeb()->LoaderLock);
        }
    try {

        // NOTE(review): the early returns inside this try rely on the
        // finally block running on the way out (name buffer free, lock
        // release) - presumably try/finally map to SEH __try/__finally.
        if (!LdrpCheckForLoadedDllHandle(DllHandle, &LdrDataTableEntry)) {
            st = STATUS_DLL_NOT_FOUND;
            return st;
        }

        ExportDirectory = (PIMAGE_EXPORT_DIRECTORY)RtlImageDirectoryEntryToData(
                           LdrDataTableEntry->DllBase,
                           TRUE,
                           IMAGE_DIRECTORY_ENTRY_EXPORT,
                           &ExportSize
                           );

        if (!ExportDirectory) {
            return STATUS_PROCEDURE_NOT_FOUND;
        }

        // Thunk is passed as both thunk arguments; the resolved address is
        // read back from Thunk.u1.Function after the call.
        st = LdrpSnapThunk(LdrDataTableEntry->DllBase,
                           0,
                           &Thunk,
                           &Thunk,
                           ExportDirectory,
                           ExportSize,
                           FALSE,
                           NULL
                          );

        if ( RunInitRoutines ) {

            //
            // Look at last entry in init order list. If entry processed
            // flag is not set, then a forwarded dll was loaded during the
            // getprocaddr call and we need to run init routines
            //

            Next = NtCurrentPeb()->Ldr->InInitializationOrderModuleList.Blink;
            LdrDataTableEntry = CONTAINING_RECORD(Next, LDR_DATA_TABLE_ENTRY, InInitializationOrderLinks);
            if ( !(LdrDataTableEntry->Flags & LDRP_ENTRY_PROCESSED) ) {
                // Note: st from LdrpSnapThunk is overwritten here.
                try {
                    st = LdrpRunInitializeRoutines(NULL);
                    }
                except( EXCEPTION_EXECUTE_HANDLER ) {
                    st = GetExceptionCode();
                    }

                }
            }

        if ( NT_SUCCESS(st) ) {
            *ProcedureAddress = Thunk.u1.Function;
            }
    } finally {
        // Free the name copy only if it came from the heap rather than the
        // on-stack FunctionNameBuffer.
        if ( FunctionName && (FunctionName != (PIMAGE_IMPORT_BY_NAME) FunctionNameBuffer) ) {
            RtlFreeHeap(RtlProcessHeap(),0,FunctionName);
            }
        if ( LdrpInLdrInit == FALSE ) {
            RtlLeaveCriticalSection((PRTL_CRITICAL_SECTION)NtCurrentPeb()->LoaderLock);
            }
    }
    return st;
}

LdrpGetProcedureAddress 其实是去查目标DLL的导出表(Export Directory,注意不是IAT),找到函数地址,并返回;而且整个查找过程要先进入LoaderLock临界区,这些都是开销。

我想,关于SwapBuffers这个被频繁调用的API的性能问题,也许用性能分析工具,比如intel vtune或者amd codeanalyst 就可以看得出来。下面使用vs2005自带的Performance Tools做了一次性能分析,测试代码如下:

for( int i = 0;i<100;i++ )
{        
    ::glClear( GL_COLOR_BUFFER_BIT );
    ::SwapBuffers( wnd->m_hdc );
}
        
for( int i = 0;i<100;i++ )
{
    ::glClear( GL_COLOR_BUFFER_BIT );
    wglSwapBuffers( wnd->m_hdc );
}

测试结果分三次:

第一次:
                                  Time( msecs )             %
SwapBuffers               1379.503985                21.323
wglSwapBuffers          1333.716537                20.615

第二次:
                                  Time( msecs )             %
SwapBuffers               1374.064678                29.757
wglSwapBuffers          1333.665637                28.882

第三次:
                                  Time( msecs )             %
SwapBuffers               1382.822897                21.076
wglSwapBuffers          1334.037943                20.332

测试环境:

CPU:Intel Core2 Duo  E4500 2.20 GHz  
显卡:NVIDIA GEFORCE 8600 GT
OS:Windows XP SP2

上面的数据就是最好的说明:平均每帧多开销约0.5个毫秒。想一下,FPS达到30帧的时候,每帧可用的时间片只有33毫秒,却被它白白浪费了0.5个毫秒。

要避免这个问题,最好还是自己去获得wglSwapBuffers这个函数,才是最高效的,因为wglSwapBuffers是厂商驱动提供的。在软模式opengl下,它也会回调os 的 GdiSwapBuffers

另外其他API的问题也是这样的 ChoosePixelFormat,DescribePixelFormat,GetPixelFormat,SetPixelFormat,它们都是先得到wglXXX版本的函数地址,比如wglChoosePixelFormat,wglDescribePixelFormat,wglGetPixelFormat,wglSetPixelFormat然后再调用。

我曾经遇到过这样的问题:在真正的opengl调用之前调用DescribePixelFormat,会非常慢。这时它每次一定是真的在LoadLibrary、FreeLibrary,从输出窗口可以看到,执行的时候会输出加载、卸载opengl32.dll的信息。但是当opengl api 被调用后,这个时候LoadLibrary、FreeLibrary就只是改变引用计数了。一种解决的方法是,在调用ChoosePixelFormat,DescribePixelFormat,GetPixelFormat,SetPixelFormat这些函数之前,先调用 LoadLibrary("OpenGL32.dll");这样就不会频繁地加载卸载dll;更好的方法还是自己获取wgl版本的相应函数。
如果你要列举所有像素格式,如果不这么做,会发现慢的要死,每个调用都要LoadLibrary/FreeLibrary,大概需要1分钟时间才能列举完所有像素格式,不是一般的恐怖。。。

posted on 2009-04-01 16:15  cgwolver  阅读(9488)  评论(6)  编辑  收藏  举报

导航