对比使用C# unsafe代码和OpenCV进行图像处理的效率(下)

        经过前面的讨论,我对Image类进行了优化,代码如下:

    // Grayscale (8-bit, single-channel) image processing class for C#. Author: wmesci
    // http://blog.csdn.net/wmesci
    //
    // Pixel rows are stored top-down in one unmanaged buffer. Every row is padded to a
    // multiple of 4 bytes (Stride) so the per-pixel loops can be unrolled four bytes at
    // a time. The padding bytes carry no image data; the element-wise operations
    // overwrite them freely and Smooth ignores them.
    unsafe class Image : CriticalHandle, IDisposable
    {
        // LocalAlloc flag LPTR (LMEM_FIXED | LMEM_ZEROINIT): fixed, zero-filled block.
        const uint ZeroedFixedMemory = 0x40;

        // The byte counts of these Win32 functions are SIZE_T (pointer sized), hence UIntPtr.
        [DllImport("kernel32.dll", SetLastError = true)]
        static extern IntPtr LocalAlloc(uint flags, UIntPtr size);

        [DllImport("kernel32.dll", SetLastError = true)]
        static extern IntPtr LocalFree(IntPtr memBlock);

        [DllImport("kernel32.dll", EntryPoint = "RtlMoveMemory")]
        static extern unsafe void CopyMemory(void* dst, void* src, UIntPtr count);

        // Currently unused. memset is a C runtime export and therefore cdecl.
        [DllImport("ntdll.dll", CallingConvention = CallingConvention.Cdecl)]
        static extern unsafe void* memset(void* dst, int value, UIntPtr size);

        const byte Max = 255;
        const byte Min = 0;

        /// <summary>Creates a zero-filled image of the given size.</summary>
        /// <param name="width">Number of pixels per row; must be positive.</param>
        /// <param name="height">Number of rows; must be positive.</param>
        /// <exception cref="ArgumentOutOfRangeException">A dimension is zero or negative.</exception>
        /// <exception cref="OverflowException">The padded buffer size does not fit in an int.</exception>
        /// <exception cref="OutOfMemoryException">The unmanaged buffer could not be allocated.</exception>
        public Image(int width, int height)
            : base(IntPtr.Zero)
        {
            if (width <= 0)
            {
                throw new ArgumentOutOfRangeException("width");
            }
            if (height <= 0)
            {
                throw new ArgumentOutOfRangeException("height");
            }

            Width = width;
            Height = height;
            // Round the row length up to the next multiple of 4.
            Stride = checked((width + 3) & ~3);
            Length = checked(Stride * Height);

            IntPtr block = LocalAlloc(ZeroedFixedMemory, new UIntPtr((uint)Length));
            if (block == IntPtr.Zero)
            {
                // Without this check a failed allocation would surface later as an
                // access violation through the null Pointer.
                throw new OutOfMemoryException();
            }
            base.SetHandle(block);

            Pointer = (byte*)block.ToPointer();
        }

        /// <summary>Creates an image and fills it from an existing buffer.</summary>
        /// <param name="width">Number of pixels per row; must be positive.</param>
        /// <param name="height">Number of rows; must be positive.</param>
        /// <param name="data">
        /// Source buffer of at least <see cref="Length"/> bytes, laid out with the same
        /// 4-byte-aligned row stride as this image.
        /// </param>
        public Image(int width, int height, byte* data)
            : this(width, height)
        {
            SetData(data);
        }

        /// <summary>Copies the whole pixel buffer (<see cref="Length"/> bytes, padding included) to <paramref name="dst"/>.</summary>
        /// <param name="dst">Destination buffer of at least <see cref="Length"/> bytes.</param>
        public void GetData(void* dst)
        {
            if (dst == null)
            {
                throw new ArgumentNullException("dst");
            }
            EnsureOpen();
            CopyMemory(dst, Pointer, new UIntPtr((uint)Length));
        }

        /// <summary>Overwrites the whole pixel buffer with <see cref="Length"/> bytes read from <paramref name="src"/>.</summary>
        /// <param name="src">Source buffer of at least <see cref="Length"/> bytes using this image's stride.</param>
        public void SetData(void* src)
        {
            if (src == null)
            {
                throw new ArgumentNullException("src");
            }
            EnsureOpen();
            CopyMemory(Pointer, src, new UIntPtr((uint)Length));
        }

        /// <summary>Number of pixels per row.</summary>
        public readonly int Width;

        /// <summary>Number of rows.</summary>
        public readonly int Height;

        /// <summary>Size of the pixel buffer in bytes (Stride * Height).</summary>
        public readonly int Length;

        /// <summary>Bytes per row: Width rounded up to a multiple of 4.</summary>
        public readonly int Stride;

        /// <summary>Start of the pixel buffer. The memory is freed when the image is disposed.</summary>
        public readonly byte* Pointer;

        /// <summary>
        /// Reads or writes the pixel at column <paramref name="x"/>, row <paramref name="y"/>.
        /// No bounds checking is performed; the caller must keep the coordinates inside the image.
        /// </summary>
        public byte this[int x, int y]
        {
            get
            {
                return *(Pointer + y * Stride + x);
            }
            set
            {
                *(Pointer + y * Stride + x) = value;
            }
        }

        /// <summary>Returns a new image with the same size and a copy of the pixel data.</summary>
        public Image Clone()
        {
            EnsureOpen();
            return new Image(Width, Height, Pointer);
        }

        /// <summary>Adds <paramref name="img"/> to this image pixel by pixel, saturating at 255.</summary>
        /// <param name="img">Second operand; must be at least as wide and as tall as this image.</param>
        public void Add(Image img)
        {
            CheckOperand(img);
            Parallel.For(0, Height, y =>
            {
                byte* p1 = Pointer + y * Stride;
                byte* p2 = img.Pointer + y * img.Stride;
                // Stride is a multiple of 4, so the unrolled loop never runs past the row.
                for (int x = 0; x < Stride; x += 4, p1 += 4, p2 += 4)
                {
                    p1[0] = Saturate(p1[0] + p2[0]);
                    p1[1] = Saturate(p1[1] + p2[1]);
                    p1[2] = Saturate(p1[2] + p2[2]);
                    p1[3] = Saturate(p1[3] + p2[3]);
                }
            });
        }

        /// <summary>Subtracts <paramref name="img"/> from this image pixel by pixel, saturating at 0.</summary>
        /// <param name="img">Second operand; must be at least as wide and as tall as this image.</param>
        public void Sub(Image img)
        {
            CheckOperand(img);
            Parallel.For(0, Height, y =>
            {
                byte* p1 = Pointer + y * Stride;
                byte* p2 = img.Pointer + y * img.Stride;
                for (int x = 0; x < Stride; x += 4, p1 += 4, p2 += 4)
                {
                    p1[0] = Saturate(p1[0] - p2[0]);
                    p1[1] = Saturate(p1[1] - p2[1]);
                    p1[2] = Saturate(p1[2] - p2[2]);
                    p1[3] = Saturate(p1[3] - p2[3]);
                }
            });
        }

        /// <summary>
        /// Replaces every pixel with this * img * <paramref name="scale"/>, truncated toward
        /// zero and saturated to 0..255.
        /// </summary>
        /// <param name="img">Second operand; must be at least as wide and as tall as this image.</param>
        /// <param name="scale">Factor applied to the product of the two pixel values.</param>
        public void Mul(Image img, double scale)
        {
            CheckOperand(img);
            Parallel.For(0, Height, y =>
            {
                byte* p1 = Pointer + y * Stride;
                byte* p2 = img.Pointer + y * img.Stride;
                for (int x = 0; x < Stride; x += 4, p1 += 4, p2 += 4)
                {
                    // Integer product first, then a single floating-point multiply.
                    p1[0] = Saturate(p1[0] * p2[0] * scale);
                    p1[1] = Saturate(p1[1] * p2[1] * scale);
                    p1[2] = Saturate(p1[2] * p2[2] * scale);
                    p1[3] = Saturate(p1[3] * p2[3] * scale);
                }
            });
        }

        /// <summary>Binarizes the image: pixels below <paramref name="threshold"/> become 0, all others 255.</summary>
        /// <param name="threshold">Smallest value that maps to 255.</param>
        public void Threshold(byte threshold)
        {
            EnsureOpen();
            Parallel.For(0, Height, y =>
            {
                byte* p = Pointer + y * Stride;
                for (int x = 0; x < Stride; x += 4, p += 4)
                {
                    p[0] = p[0] < threshold ? Min : Max;
                    p[1] = p[1] < threshold ? Min : Max;
                    p[2] = p[2] < threshold ? Min : Max;
                    p[3] = p[3] < threshold ? Min : Max;
                }
            });
        }

        /// <summary>
        /// Replaces every pixel with this * <paramref name="a"/> + img * <paramref name="b"/>,
        /// saturated to 0..255.
        /// </summary>
        /// <remarks>
        /// Both products come from 256-entry lookup tables whose entries are truncated to
        /// integers before they are added, so a result can be up to 2 below the exact value.
        /// </remarks>
        /// <param name="img">Second operand; must be at least as wide and as tall as this image.</param>
        /// <param name="a">Weight of this image.</param>
        /// <param name="b">Weight of <paramref name="img"/>.</param>
        public void AddWeighted(Image img, double a, double b)
        {
            CheckOperand(img);

            // The tables live on this thread's stack. The worker threads may read them
            // because Parallel.For does not return before every iteration has finished.
            int* taba = stackalloc int[256];
            int* tabb = stackalloc int[256];
            for (int i = 0; i < 256; i++)
            {
                taba[i] = (int)(i * a);
                tabb[i] = (int)(i * b);
            }

            Parallel.For(0, Height, y =>
            {
                byte* p1 = Pointer + y * Stride;
                byte* p2 = img.Pointer + y * img.Stride;
                for (int x = 0; x < Stride; x += 4, p1 += 4, p2 += 4)
                {
                    // The second operand must go through tabb (weight b), not taba.
                    p1[0] = Saturate(taba[p1[0]] + tabb[p2[0]]);
                    p1[1] = Saturate(taba[p1[1]] + tabb[p2[1]]);
                    p1[2] = Saturate(taba[p1[2]] + tabb[p2[2]]);
                    p1[3] = Saturate(taba[p1[3]] + tabb[p2[3]]);
                }
            });
        }

        /// <summary>
        /// Box-filters <paramref name="src"/> with a (2n+1) x (2n+1) window and writes the
        /// result to <paramref name="dst"/>. Pixels outside the image are replaced by the
        /// nearest edge pixel. <paramref name="src"/> and <paramref name="dst"/> may be the
        /// same image.
        /// </summary>
        /// <remarks>
        /// The window sums are kept in 32-bit integers, so very large radii (roughly
        /// n above 1400) can overflow. The mean is truncated, not rounded.
        /// </remarks>
        /// <param name="src">Image to smooth.</param>
        /// <param name="dst">Receives the result; must be at least as wide and as tall as <paramref name="src"/>.</param>
        /// <param name="n">Window radius; 0 copies the image unchanged.</param>
        public static void Smooth(Image src, Image dst, int n)
        {
            if (src == null)
            {
                throw new ArgumentNullException("src");
            }
            if (dst == null)
            {
                throw new ArgumentNullException("dst");
            }
            if (n < 0)
            {
                throw new ArgumentOutOfRangeException("n");
            }
            src.EnsureOpen();
            dst.EnsureOpen();
            if (dst.Width < src.Width || dst.Height < src.Height)
            {
                throw new ArgumentException("Destination image is smaller than the source image.", "dst");
            }

            int width = src.Width;
            int height = src.Height;

            // Temporary buffer with one horizontal window sum per real pixel (no padding).
            int* tmp = (int*)Marshal.AllocHGlobal(new IntPtr((long)width * height * sizeof(int))).ToPointer();
            try
            {
                // Pass 1: sliding horizontal sums. Indices are clamped to Width, not
                // Stride, so the row padding never leaks into the right-hand edge.
                Parallel.For(0, height, y =>
                {
                    byte* row = src.Pointer + y * src.Stride;
                    int* sums = tmp + y * width;

                    int acc = 0;
                    for (int i = -n; i <= n; i++)
                    {
                        acc += row[Clamp(i, width)];
                    }
                    sums[0] = acc;

                    for (int x = 1; x < width; x++)
                    {
                        // Slide the window: add the entering pixel, drop the leaving one.
                        acc += row[Clamp(x + n, width)] - row[Clamp(x - n - 1, width)];
                        sums[x] = acc;
                    }
                });

                // Reciprocal of the window area, (2n+1)^2.
                double f = 1.0 / (2 * n + 1);
                f *= f;

                // Pass 2: sliding vertical sums over the horizontal sums, scaled to the mean.
                Parallel.For(0, width, x =>
                {
                    // dst is stepped by its own stride, which may differ from src's.
                    byte* p = dst.Pointer + x;

                    int acc = 0;
                    for (int j = -n; j <= n; j++)
                    {
                        acc += tmp[x + Clamp(j, height) * width];
                    }
                    *p = (byte)(acc * f);
                    p += dst.Stride;

                    for (int y = 1; y < height; y++, p += dst.Stride)
                    {
                        acc += tmp[x + Clamp(y + n, height) * width]
                             - tmp[x + Clamp(y - n - 1, height) * width];
                        *p = (byte)(acc * f);
                    }
                });
            }
            finally
            {
                // Release the scratch buffer even when a worker throws.
                Marshal.FreeHGlobal(new IntPtr(tmp));
            }
        }

        // Limits an index to the valid range 0 .. max - 1.
        private static int Clamp(int i, int max)
        {
            if (i < 0) return 0;
            if (i >= max) return max - 1;
            return i;
        }

        // Converts an integer result to a pixel value, saturating at 0 and 255.
        private static byte Saturate(int value)
        {
            if (value < 0) return Min;
            if (value > 255) return Max;
            return (byte)value;
        }

        // Converts a floating-point result to a pixel value: truncates toward zero and
        // saturates at 0 and 255. NaN maps to 0.
        private static byte Saturate(double value)
        {
            if (!(value > 0)) return Min;
            if (value > 255) return Max;
            return (byte)value;
        }

        // Throws if the pixel buffer has already been released.
        private void EnsureOpen()
        {
            if (IsClosed || IsInvalid)
            {
                throw new ObjectDisposedException("Image");
            }
        }

        // Validates the second operand of an element-wise operation. The row loops read
        // Stride bytes from each of the first Height rows of img, so img must cover this image.
        private void CheckOperand(Image img)
        {
            if (img == null)
            {
                throw new ArgumentNullException("img");
            }
            EnsureOpen();
            img.EnsureOpen();
            if (img.Width < Width || img.Height < Height)
            {
                throw new ArgumentException("Operand image is smaller than this image.", "img");
            }
        }

        /// <summary>True while no buffer is attached to the handle.</summary>
        public override bool IsInvalid
        {
            get { return handle == IntPtr.Zero; }
        }

        /// <summary>Frees the pixel buffer.</summary>
        /// <returns>True if the block was released; LocalFree returns NULL on success.</returns>
        protected override bool ReleaseHandle()
        {
            return LocalFree(handle) == IntPtr.Zero;
        }
    }

        主要修改的地方如下:

        1、将图像的每一行4字节对齐,增加Stride属性,其值等于Width向上取最近的4的倍数,然后在所有的for循环里,每次操作4个字节。这样一来,减少了循环次数。

        2、减少浮点运算

                A:Add/Sub方法中的临时变量d改为int型

                B:Mul方法中,调整运算顺序,由scale * *p1 * *p2改为p1[0] * p2[0] * scale,区别在于,前一种先算scale * *p1,是一个浮点乘法,其结果也是浮点数,然后再算和*p2的乘积,共两次浮点乘法;而后一种先算p1[0] * p2[0],这是一次整数乘法,然后再算和scale的积,共一次整数乘法一次浮点乘法。由于浮点乘法比整数乘法慢,因此效率会有所提高。

    3、AddWeighted改为使用查表法进行运算,首先算出0~255这256个数和a、b的积,放在数组taba、tabb中,其后的循环中只需查表再相加即可,效率大幅提高!


下面是优化后的测试结果(数值表示Image类方法和对应的OpenCV方法执行时间之比):

CPU:AMD Athlon(tm) II X4 640 3.00GHz (四核)
样本:600 X 896
-------------------------------------
Add         1.446  1.315
Sub         1.171  1.109
Mul         0.651  0.580
Threshold   1.511  1.432
Smooth      0.938  0.908
AddWeighted 0.528  0.474

CPU:AMD Athlon(tm) II X4 640 3.00GHz (四核)
样本:1600 X 1200
-------------------------------------
Add         1.041  1.052
Sub         0.910  0.906
Mul         0.562  0.558
Threshold   1.277  1.236
Smooth      1.020  1.024
AddWeighted 0.462  0.461

CPU:AMD Athlon(tm) II X2 245 2.91GHz (双核)
样本:1600 X 1200
-------------------------------------
Add         1.514  1.533
Sub         1.225  1.163
Mul         1.085  1.095
Threshold   1.643  1.630
Smooth      1.847  1.867
AddWeighted 0.957  0.924

CPU:AMD Athlon(tm) II X2 245 2.91GHz (双核)
样本:600 X 896
-------------------------------------
Add         2.559  2.073  2.676
Sub         2.240  1.784  1.856
Mul         1.261  1.352  1.284
Threshold   2.453  2.511  3.101
Smooth      1.660  1.647  1.663
AddWeighted 0.978  1.017  0.961

CPU:Intel Core i3 M330 2.13GHz  (双核四线程)
样本:1600 X 1200
-------------------------------------
Add         2.611
Sub         2.545
Mul         1.011
Threshold   2.882
Smooth      1.891
AddWeighted 0.525

CPU:Intel Core i3 M330 2.13GHz  (双核四线程)
样本:600 X 896
-------------------------------------
Add         4.483
Sub         3.576
Mul         1.101
Threshold   5.953
Smooth      2.029
AddWeighted 0.581

CPU:Intel Core i7 2360QM 2.00GHz  (四核八线程)
样本:600 X 896
-------------------------------------
Add         1.080  1.020
Sub         0.977  1.010
Mul         0.575  0.558
Threshold   0.842  0.898
Smooth      1.447  1.386
AddWeighted 0.325  0.366

CPU:Intel Core i7 2360QM 2.00GHz  (四核八线程)
样本:1600 X 1200
-------------------------------------
Add         1.420
Sub         1.134
Mul         0.535
Threshold   0.878
Smooth      1.379
AddWeighted 0.325


    分析以上数据,我们不难发现以下几点:

    1、样本大小相同时,CPU核心数越多,Image/OpenCV就越小,这说明了多线程算法在多核CPU下的优势。

    2、CPU相同时,样本大小越大,比值越小。

    3、OpenCV针对Intel CPU使用IPP进行了优化,因此在Intel CPU上跑,比值会比在AMD CPU上大很多。

    4、OpenCV里使用SSE优化过的方法,用C#实现时差距比较明显,怎么才能达到差不多的效率,这个暂时还没想到。但是OpenCV里没使用SSE优化的方法,如Mul、AddWeighted,使用C#完全可以达到相同的性能,甚至超过OpenCV,如AddWeighted,在双核CPU上也比OpenCV要快,在4核以上CPU上远超OpenCV!!

    5、使用自写方法替代OpenCV是完全可行的!!


    以上测试有几点需要说明:

    1、CLR是在第一次运行某个方法时才进行编译,因此第一次执行某个方法时会慢很多,在计算时间时要排除第一次执行的时间。

    2、C#调用OpenCV需要经过封送处理,但封送处理所消耗的时间在这里无法避免。


    各位看官如有其它优化修改意见,还望不吝赐教!!!同时我会不断对这个类进行修改以及添加其它图像处理方法,对图像处理、OpenCV以及C#代码优化感兴趣的同学,请关注本贴!!

posted @ 2011-11-25 20:17  RayTracer  阅读(1432)  评论(0编辑  收藏  举报