len3d

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

测试结果:

sum (fast) in clock 1562
sum (fast2) in clock 1407
sum (fast3) in clock 3156
sum in clock 7797
Error is 1.512115
Error2 is 0.030914
Error3 is 0.001389

 

 

#include <stdio.h>
#include <xmmintrin.h>
#define NOMINMAX
#include <windows.h>
#include <math.h>
#include <time.h>

/*
 * (c) Ian Stephenson
 *
 * ian@dctsystems.co.uk
 *
 * Fast pow() reference implementation
 */

/*
 * http://www.dctsystems.co.uk/Software/power.html
 * http://www.dctsystems.co.uk/Software/power.c
 */
const float shift23=(1<<23);
const float OOshift23=1.0/(1<<23);

__forceinline float myFloorf(float a)
{
    return (float)((int)a - (a < 0.0f));
}

__forceinline float myLog2(float i)
    {
    float LogBodge=0.346607f;
    float x;
    float y;
    x=(float)(*(int *)&i);
    x*= OOshift23; //1/pow(2,23);
    x=x-127;

    y=x-myFloorf(x);
    y=(y-y*y)*LogBodge;
    return x+y;
    }
__forceinline float myPow2(float i)
    {
    float PowBodge=0.33971f;
    float x;
    float y=i-myFloorf(i);
    y=(y-y*y)*PowBodge;

    x=i+127-y;
    x*= shift23; //pow(2,23);
    *(int*)&x=(int)x;
    return x;
    }

__forceinline float myPow(float a, float b)
    {
    return myPow2(b*myLog2(a));
    }

///////////////////////////////////////

/* Code below are from http://code.google.com/p/fastapprox/ */
__forceinline float fastpow2(float p)
{
    float offset = (p < 0) ? 1.0f : 0.0f;
    float clipp = (p < -126) ? -126.0f : p;
    int w = (int)clipp;
    float z = clipp - w + offset;
    union { unsigned int i; float f; } v = { (unsigned int)((1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z)) };
    return v.f;
}

__forceinline float fastlog2(float x)
{
    union { float f; unsigned int i; } vx = { x };
    union { unsigned int i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 };
    float y = (float)vx.i;
    y *= 1.1920928955078125e-7f;
    return y - 124.22551499f 
        - 1.498030302f * mx.f 
        - 1.72587999f / (0.3520887068f + mx.f);
}

__forceinline float fastpow(float x, float p)
{
    return fastpow2(p * fastlog2(x));
}

/////////////////////////////////////////////////

#define FLT_MIN        1.175494351e-38F
#define FLT_MAX        3.402823466e+38F

template <typename T>
__forceinline T min(T a, T b)
{
    return ((a < b) ? a : b);
}

__forceinline float fast_fabs(float x)
{
    union { float f; unsigned int i; } v = {x};
    v.i &= 0x7FFFFFFF;
    return v.f;
}

/// Multiply and add: (a * b) + c
template <typename T>
__forceinline T madd (const T& a, const T& b, const T& c) {
    // NOTE:  in the future we may want to explicitly ask for a fused
    // multiply-add in a specialized version for float.
    // NOTE2: GCC/ICC will turn this (for float) into a FMA unless
    // explicitly asked not to, clang seems to leave the code alone.
    return a * b + c;
}

template <typename IN_TYPE, typename OUT_TYPE>
__forceinline OUT_TYPE bit_cast (const IN_TYPE in) {
    union { IN_TYPE in_val; OUT_TYPE out_val; } cvt;
    cvt.in_val = in;
    return cvt.out_val;
}

__forceinline float fast_log2 (float x) {
    // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans
    if (x < FLT_MIN) x = FLT_MIN;
    if (x > FLT_MAX) x = FLT_MAX;
    // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
    unsigned bits = bit_cast<float, unsigned>(x);
    int exponent = int(bits >> 23) - 127;
    float f = bit_cast<unsigned, float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
    // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error
    // ulp histogram:
    //  0  = 97.46%
    //  1  =  2.29%
    //  2  =  0.11%
    float f2 = f * f;
    float f4 = f2 * f2;
    float hi = madd(f, -0.00931049621349f,  0.05206469089414f);
    float lo = madd(f,  0.47868480909345f, -0.72116591947498f);
    hi = madd(f, hi, -0.13753123777116f);
    hi = madd(f, hi,  0.24187369696082f);
    hi = madd(f, hi, -0.34730547155299f);
    lo = madd(f, lo,  1.442689881667200f);
    return ((f4 * hi) + (f * lo)) + exponent;
}

__forceinline float fast_exp2 (float x) {
    // clamp to safe range for final addition
    if (x < -126.0f) x = -126.0f;
    if (x >  126.0f) x =  126.0f;
    // range reduction
    int m = int(x); x -= m;
    x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!)
    // 5th degree polynomial generated with sollya
    // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp
    // ulp histogram:
    //  0  = 87.81%
    //  1  =  4.18%
    float r = 1.33336498402e-3f;
    r = madd(x, r, 9.810352697968e-3f);
    r = madd(x, r, 5.551834031939e-2f);
    r = madd(x, r, 0.2401793301105f);
    r = madd(x, r, 0.693144857883f);
    r = madd(x, r, 1.0f);
    // multiply by 2 ^ m by adding in the exponent
    // NOTE: left-shift of negative number is undefined behavior
    return bit_cast<unsigned, float>(bit_cast<float, unsigned>(r) + (unsigned(m) << 23));
}

__forceinline float fast_safe_pow (float x, float y) {
    if (y == 0) return 1.0f; // x^0=1
    if (x == 0) return 0.0f; // 0^y=0
    // be cheap & exact for special case of squaring and identity
    if (y == 1.0f)
        return x;
    if (y == 2.0f)
        return min (x*x, FLT_MAX);
    float sign = 1.0f;
    if (x < 0) {
        // if x is negative, only deal with integer powers
        // powf returns NaN for non-integers, we will return 0 instead
        int ybits = bit_cast<float, int>(y) & 0x7fffffff;
        if (ybits >= 0x4b800000) {
            // always even int, keep positive
        } else if (ybits >= 0x3f800000) {
            // bigger than 1, check
            int k = (ybits >> 23) - 127;  // get exponent
            int j =  ybits >> (23 - k);   // shift out possible fractional bits
            if ((j << (23 - k)) == ybits) // rebuild number and check for a match
                sign = bit_cast<int, float>(0x3f800000 | (j << 31)); // +1 for even, -1 for odd
            else
                return 0.0f; // not integer
        } else {
            return 0.0f; // not integer
        }
    }
    return sign * fast_exp2(y * fast_log2(fast_fabs(x)));
}

/////////
int main(int argc, char *argv[])
{
    const int N = 100000000;
    float *buf = new float[N];
    float *a = new float[N];
    float *b = new float[N];
    float *c = new float[N];
    float *d = new float[N];
    for (int i = 0; i < N; ++i)
    {
        buf[i] = 1000.0f * (float)rand() / (float)RAND_MAX;
    }

    int start_time;


    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        a[i] = myPow(buf[i], 0.8f);
    }
    printf("sum (fast) in clock %d\n", clock() - start_time);



    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        c[i] = fastpow(buf[i], 0.8f);
    }
    printf("sum (fast2) in clock %d\n", clock() - start_time);



    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        d[i] = fast_safe_pow(buf[i], 0.8f);
    }
    printf("sum (fast3) in clock %d\n", clock() - start_time);



    start_time = clock();
    for (int i = 0; i < N; ++i)
    {
        b[i] = powf(buf[i], 0.8f);
    }
    printf("sum in clock %d\n", clock() - start_time);


    float max_err = 0.0f;
    for (int i = 0; i < N; ++i)
    {
        float err = fabsf(a[i] - b[i]);
        if (err > max_err)
            max_err = err;
    }
    printf("Error is %f\n", max_err);


    max_err = 0.0f;
    for (int i = 0; i < N; ++i)
    {
        float err = fabsf(b[i] - c[i]);
        if (err > max_err)
            max_err = err;
    }
    printf("Error2 is %f\n", max_err);


    max_err = 0.0f;
    for (int i = 0; i < N; ++i)
    {
        float err = fabsf(b[i] - d[i]);
        if (err > max_err)
            max_err = err;
    }
    printf("Error3 is %f\n", max_err);


    delete[]buf;
    delete[]a;
    delete[]b;
    delete[]c;
    delete[]d;
    return 0;
}

 

posted on 2017-11-28 21:29  Len3d  阅读(444)  评论(0编辑  收藏  举报