fast powf - Len3d - 博客园

公告

测试结果：
sum (fast) in clock 1562
sum (fast2) in clock 1407
sum (fast3) in clock 3156
sum in clock 7797
Error is 1.512115
Error2 is 0.030914
Error3 is 0.001389
#include <stdio.h>
#include <xmmintrin.h>
#define NOMINMAX
#include <windows.h>
#include <math.h>
#include <time.h>

/*
 * (c) Ian Stephenson
 *
 * ian@dctsystems.co.uk
 *
 * Fast pow() reference implementation
 */

/*
 * http://www.dctsystems.co.uk/Software/power.html
 * http://www.dctsystems.co.uk/Software/power.c
 */
const float shift23=(1<<23);
const float OOshift23=1.0/(1<<23);

__forceinline float myFloorf(float a)
{
    return (float)((int)a - (a < 0.0f));
}

__forceinline float myLog2(float i)
    {
    float LogBodge=0.346607f;
    float x;
    float y;
    x=(float)(*(int *)&i);
    x*= OOshift23; //1/pow(2,23);
    x=x-127;

    y=x-myFloorf(x);
    y=(y-y*y)*LogBodge;
    return x+y;
    }
__forceinline float myPow2(float i)
    {
    float PowBodge=0.33971f;
    float x;
    float y=i-myFloorf(i);
    y=(y-y*y)*PowBodge;

    x=i+127-y;
    x*= shift23; //pow(2,23);
    *(int*)&x=(int)x;
    return x;
    }

__forceinline float myPow(float a, float b)
    {
    return myPow2(b*myLog2(a));
    }

///////////////////////////////////////

/* Code below are from http://code.google.com/p/fastapprox/ */
__forceinline float fastpow2(float p)
{
    float offset = (p < 0) ? 1.0f : 0.0f;
    float clipp = (p < -126) ? -126.0f : p;
    int w = (int)clipp;
    float z = clipp - w + offset;
    union { unsigned int i; float f; } v = { (unsigned int)((1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z)) };
    return v.f;
}

__forceinline float fastlog2(float x)
{
    union { float f; unsigned int i; } vx = { x };
    union { unsigned int i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
    float y = (float)vx.i;
    y *= 1.1920928955078125e-7f;
    return y - 124.22551499f 
        - 1.498030302f * mx.f 
        - 1.72587999f / (0.3520887068f + mx.f);
}

__forceinline float fastpow(float x, float p)
{
    return fastpow2(p * fastlog2(x));
}

/////////////////////////////////////////////////

#define FLT_MIN        1.175494351e-38F
#define FLT_MAX        3.402823466e+38F

template <typename T>
__forceinline T min(T a, T b)
{
    return ((a < b) ? a : b);
}

__forceinline float fast_fabs(float x)
{
    union { float f; unsigned int i; } v = {x};
    v.i &= 0x7FFFFFFF;
    return v.f;
}

/// Multiply and add: (a * b) + c
template <typename T>
__forceinline T madd (const T& a, const T& b, const T& c) {
    // NOTE:  in the future we may want to explicitly ask for a fused
    // multiply-add in a specialized version for float.
    // NOTE2: GCC/ICC will turn this (for float) into a FMA unless
    // explicitly asked not to, clang seems to leave the code alone.
    return a * b + c;
}

template <typename IN_TYPE, typename OUT_TYPE>
__forceinline OUT_TYPE bit_cast (const IN_TYPE in) {
    union { IN_TYPE in_val; OUT_TYPE out_val; } cvt;
    cvt.in_val = in;
    return cvt.out_val;
}

__forceinline float fast_log2 (float x) {
    // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans
    if (x < FLT_MIN) x = FLT_MIN;
    if (x > FLT_MAX) x = FLT_MAX;
    // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
    unsigned bits = bit_cast<float, unsigned>(x);
    int exponent = int(bits >> 23) - 127;
    float f = bit_cast<unsigned, float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
    // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error
    // ulp histogram:
    //  0  = 97.46%
    //  1  =  2.29%
    //  2  =  0.11%
    float f2 = f * f;
    float f4 = f2 * f2;
    float hi = madd(f, -0.00931049621349f,  0.05206469089414f);
    float lo = madd(f,  0.47868480909345f, -0.72116591947498f);
    hi = madd(f, hi, -0.13753123777116f);
    hi = madd(f, hi,  0.24187369696082f);
    hi = madd(f, hi, -0.34730547155299f);
    lo = madd(f, lo,  1.442689881667200f);
    return ((f4 * hi) + (f * lo)) + exponent;
}

__forceinline float fast_exp2 (float x) {
    // clamp to safe range for final addition
    if (x < -126.0f) x = -126.0f;
    if (x >  126.0f) x =  126.0f;
    // range reduction
    int m = int(x); x -= m;
    x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!)
    // 5th degree polynomial generated with sollya
    // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp
    // ulp histogram:
    //  0  = 87.81%
    //  1  =  4.18%
    float r = 1.33336498402e-3f;
    r = madd(x, r, 9.810352697968e-3f);
    r = madd(x, r, 5.551834031939e-2f);
    r = madd(x, r, 0.2401793301105f);
    r = madd(x, r, 0.693144857883f);
    r = madd(x, r, 1.0f);
    // multiply by 2 ^ m by adding in the exponent
    // NOTE: left-shift of negative number is undefined behavior
    return bit_cast<unsigned, float>(bit_cast<float, unsigned>(r) + (unsigned(m) << 23));
}

__forceinline float fast_safe_pow (float x, float y) {
    if (y == 0) return 1.0f; // x^0=1
    if (x == 0) return 0.0f; // 0^y=0
    // be cheap & exact for special case of squaring and identity
    if (y == 1.0f)
        return x;
    if (y == 2.0f)
        return min (x*x, FLT_MAX);
    float sign = 1.0f;
    if (x < 0) {
        // if x is negative, only deal with integer powers
        // powf returns NaN for non-integers, we will return 0 instead
        int ybits = bit_cast<float, int>(y) & 0x7fffffff;
        if (ybits >= 0x4b800000) {
            // always even int, keep positive
        } else if (ybits >= 0x3f800000) {
            // bigger than 1, check
            int k = (ybits >> 23) - 127;  // get exponent
            int j =  ybits >> (23 - k);   // shift out possible fractional bits
            if ((j << (23 - k)) == ybits) // rebuild number and check for a match
                sign = bit_cast<int, float>(0x3f800000 | (j << 31)); // +1 for even, -1 for odd
            else
                return 0.0f; // not integer
        } else {
            return 0.0f; // not integer
        }
    }
    return sign * fast_exp2(y * fast_log2(fast_fabs(x)));
}

/////////
int main(int argc, char *argv[])
{
    const int N = 100000000;
    float *buf = new float[N];
    float *a = new float[N];
    float *b = new float[N];
    float *c = new float[N];
    float *d = new float[N];
    for (int i = 0; i < N; ++i)
    {
        buf[i] = 1000.0f * (float)rand() / (float)RAND_MAX;
    }

    int start_time;


    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        a[i] = myPow(buf[i], 0.8f);
    }
    printf("sum (fast) in clock %d\n", clock() - start_time);



    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        c[i] = fastpow(buf[i], 0.8f);
    }
    printf("sum (fast2) in clock %d\n", clock() - start_time);



    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        d[i] = fast_safe_pow(buf[i], 0.8f);
    }
    printf("sum (fast3) in clock %d\n", clock() - start_time);



    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        b[i] = powf(buf[i], 0.8f);
    }
    printf("sum in clock %d\n", clock() - start_time);


    float max_err = 0.0f;
    for (int i = 0; i < N; ++i)
    {
        float err = fabsf(a[i] - b[i]);
        if (err > max_err)
            max_err = err;
    }
    printf("Error is %f\n", max_err);


    max_err = 0.0f;
    for (int i = 0; i < N; ++i)
    {
        float err = fabsf(b[i] - c[i]);
        if (err > max_err)
            max_err = err;
    }
    printf("Error2 is %f\n", max_err);


    max_err = 0.0f;
    for (int i = 0; i < N; ++i)
    {
        float err = fabsf(b[i] - d[i]);
        if (err > max_err)
            max_err = err;
    }
    printf("Error3 is %f\n", max_err);


    delete[]buf;
    delete[]a;
    delete[]b;
    delete[]c;
    delete[]d;
    return 0;
}
posted on 2017-11-28 21:29 Len3d 阅读(563) 评论(0) 收藏举报
刷新页面返回顶部
len3d

公告