对比使用C# unsafe代码和OpenCV进行图像处理的效率(下)
经过前面的讨论,我对Image类进行了优化,代码如下:
//C#灰度图像处理类,作者:wmesci
//http://blog.csdn.net/wmesci
/// <summary>
/// 8-bit single-channel (grayscale) image backed by a zero-initialised block of
/// unmanaged memory obtained from <c>LocalAlloc</c>.
/// </summary>
/// <remarks>
/// Every row is padded to a multiple of 4 bytes (<see cref="Stride"/>) so the pixel
/// loops can process 4 bytes per iteration. The arithmetic operations therefore also
/// touch the padding bytes of each row.
/// <see cref="Pointer"/> is not reset when the image is disposed; it must not be
/// dereferenced after <c>Dispose</c>.
/// </remarks>
unsafe class Image : CriticalHandle, IDisposable
{
    [DllImport("kernel32.dll")]
    static extern IntPtr LocalAlloc(int flags, int size);

    [DllImport("kernel32.dll")]
    static extern IntPtr LocalFree(IntPtr memBlock);

    [DllImport("kernel32.dll", EntryPoint = "RtlMoveMemory")]
    static extern void CopyMemory(void* dst, void* src, int count);

    const byte Max = 255;
    const byte Min = 0;

    // LPTR: fixed memory, zero-initialised.
    const int ZeroInitFixed = 0x40;

    // Largest radius for which (2n+1)^2 * 255 still fits in a 32-bit signed integer.
    const int MaxSmoothRadius = 1450;

    // Lookup-table entries are limited to this magnitude so that the sum of two
    // entries can never overflow a 32-bit signed integer.
    const double TableLimit = 1e9;

    /// <summary>Creates a zero-filled image.</summary>
    /// <param name="width">Width in pixels; must be positive.</param>
    /// <param name="height">Height in pixels; must be positive.</param>
    /// <exception cref="ArgumentOutOfRangeException">A dimension is not positive.</exception>
    /// <exception cref="OverflowException">The buffer size does not fit in an <c>int</c>.</exception>
    /// <exception cref="OutOfMemoryException">The unmanaged allocation failed.</exception>
    public Image(int width, int height)
        : base(IntPtr.Zero)
    {
        if (width <= 0 || height <= 0)
        {
            throw new ArgumentOutOfRangeException();
        }

        Width = width;
        Height = height;
        // Round the row length up to the next multiple of 4.
        Stride = checked((width + 3) & ~3);
        Length = checked(Stride * Height);

        IntPtr block = LocalAlloc(ZeroInitFixed, Length);
        if (block == IntPtr.Zero)
        {
            // Without this check every later pixel access would dereference null.
            throw new OutOfMemoryException();
        }

        base.SetHandle(block);
        Pointer = (byte*)handle.ToPointer();
    }

    /// <summary>Creates an image and fills it from <paramref name="data"/>.</summary>
    /// <param name="width">Width in pixels; must be positive.</param>
    /// <param name="height">Height in pixels; must be positive.</param>
    /// <param name="data">Source buffer of at least <see cref="Length"/> bytes, laid out with this image's stride.</param>
    public Image(int width, int height, byte* data)
        : this(width, height)
    {
        SetData(data);
    }

    /// <summary>Copies the whole pixel buffer (<see cref="Length"/> bytes, padding included) to <paramref name="dst"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="dst"/> is null.</exception>
    public void GetData(void* dst)
    {
        if (dst == null)
        {
            throw new ArgumentNullException("dst");
        }

        CopyMemory(dst, Pointer, Length);
    }

    /// <summary>Overwrites the whole pixel buffer with <see cref="Length"/> bytes read from <paramref name="src"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="src"/> is null.</exception>
    public void SetData(void* src)
    {
        if (src == null)
        {
            throw new ArgumentNullException("src");
        }

        CopyMemory(Pointer, src, Length);
    }

    /// <summary>Width in pixels.</summary>
    public readonly int Width;

    /// <summary>Height in pixels.</summary>
    public readonly int Height;

    /// <summary>Size of the pixel buffer in bytes (<c>Stride * Height</c>).</summary>
    public readonly int Length;

    /// <summary>Bytes per row: <see cref="Width"/> rounded up to a multiple of 4.</summary>
    public readonly int Stride;

    /// <summary>Address of the first pixel of the first row.</summary>
    public readonly byte* Pointer;

    /// <summary>
    /// Reads or writes the pixel at column <paramref name="x"/>, row <paramref name="y"/>.
    /// No bounds checking is performed.
    /// </summary>
    public byte this[int x, int y]
    {
        get { return *(Pointer + y * Stride + x); }
        set { *(Pointer + y * Stride + x) = value; }
    }

    /// <summary>Returns a new image with the same size and a copy of the pixel data.</summary>
    public Image Clone()
    {
        return new Image(Width, Height, Pointer);
    }

    /// <summary>Saturating in-place addition: <c>this = min(this + img, 255)</c>.</summary>
    /// <param name="img">Image of the same size as this one.</param>
    public void Add(Image img)
    {
        EnsureCompatible(img);

        Action<int> act = y =>
        {
            byte* p1 = Pointer + y * Stride, p2 = img.Pointer + y * img.Stride;
            for (int x = 0; x < Stride; x += 4, p1 += 4, p2 += 4)
            {
                // A sum of two bytes is never negative, so only the upper bound is checked.
                int d = p1[0] + p2[0];
                p1[0] = d > Max ? Max : (byte)d;
                d = p1[1] + p2[1];
                p1[1] = d > Max ? Max : (byte)d;
                d = p1[2] + p2[2];
                p1[2] = d > Max ? Max : (byte)d;
                d = p1[3] + p2[3];
                p1[3] = d > Max ? Max : (byte)d;
            }
        };
        Parallel.For(0, Height, act);
    }

    /// <summary>Saturating in-place subtraction: <c>this = max(this - img, 0)</c>.</summary>
    /// <param name="img">Image of the same size as this one.</param>
    public void Sub(Image img)
    {
        EnsureCompatible(img);

        Action<int> act = y =>
        {
            byte* p1 = Pointer + y * Stride, p2 = img.Pointer + y * img.Stride;
            for (int x = 0; x < Stride; x += 4, p1 += 4, p2 += 4)
            {
                // A difference of two bytes never exceeds 255, so only the lower bound is checked.
                int d = p1[0] - p2[0];
                p1[0] = d < Min ? Min : (byte)d;
                d = p1[1] - p2[1];
                p1[1] = d < Min ? Min : (byte)d;
                d = p1[2] - p2[2];
                p1[2] = d < Min ? Min : (byte)d;
                d = p1[3] - p2[3];
                p1[3] = d < Min ? Min : (byte)d;
            }
        };
        Parallel.For(0, Height, act);
    }

    /// <summary>Saturating in-place multiplication: <c>this = clamp(this * img * scale, 0, 255)</c>.</summary>
    /// <param name="img">Image of the same size as this one.</param>
    /// <param name="scale">Factor applied to every product; must not be NaN.</param>
    public void Mul(Image img, double scale)
    {
        EnsureCompatible(img);
        if (double.IsNaN(scale))
        {
            throw new ArgumentException("scale must be a number.", "scale");
        }

        Action<int> act = y =>
        {
            byte* p1 = Pointer + y * Stride, p2 = img.Pointer + y * img.Stride;
            for (int x = 0; x < Stride; x += 4, p1 += 4, p2 += 4)
            {
                // Integer product first, then a single floating-point multiplication.
                double d = p1[0] * p2[0] * scale;
                p1[0] = d < Min ? Min : d > Max ? Max : (byte)d;
                d = p1[1] * p2[1] * scale;
                p1[1] = d < Min ? Min : d > Max ? Max : (byte)d;
                d = p1[2] * p2[2] * scale;
                p1[2] = d < Min ? Min : d > Max ? Max : (byte)d;
                d = p1[3] * p2[3] * scale;
                p1[3] = d < Min ? Min : d > Max ? Max : (byte)d;
            }
        };
        Parallel.For(0, Height, act);
    }

    /// <summary>
    /// In-place binarisation: bytes below <paramref name="threshold"/> become 0,
    /// all others become 255.
    /// </summary>
    public void Threshold(byte threshold)
    {
        Action<int> act = y =>
        {
            byte* p = Pointer + y * Stride;
            for (int x = 0; x < Stride; x += 4, p += 4)
            {
                p[0] = p[0] < threshold ? Min : Max;
                p[1] = p[1] < threshold ? Min : Max;
                p[2] = p[2] < threshold ? Min : Max;
                p[3] = p[3] < threshold ? Min : Max;
            }
        };
        Parallel.For(0, Height, act);
    }

    /// <summary>
    /// In-place weighted sum: <c>this = clamp((int)(this * a) + (int)(img * b), 0, 255)</c>,
    /// evaluated through two 256-entry lookup tables.
    /// </summary>
    /// <param name="img">Image of the same size as this one.</param>
    /// <param name="a">Weight applied to this image's pixels; must not be NaN.</param>
    /// <param name="b">Weight applied to <paramref name="img"/>'s pixels; must not be NaN.</param>
    public void AddWeighted(Image img, double a, double b)
    {
        EnsureCompatible(img);
        if (double.IsNaN(a))
        {
            throw new ArgumentException("a must be a number.", "a");
        }
        if (double.IsNaN(b))
        {
            throw new ArgumentException("b must be a number.", "b");
        }

        int* taba = stackalloc int[256];
        int* tabb = stackalloc int[256];
        for (int i = 0; i < 256; i++)
        {
            double va = i * a;
            double vb = i * b;
            taba[i] = (int)(va < -TableLimit ? -TableLimit : va > TableLimit ? TableLimit : va);
            tabb[i] = (int)(vb < -TableLimit ? -TableLimit : vb > TableLimit ? TableLimit : vb);
        }

        Action<int> act = y =>
        {
            byte* p1 = this.Pointer + y * this.Stride, p2 = img.Pointer + y * img.Stride;
            for (int x = 0; x < this.Stride; x += 4, p1 += 4, p2 += 4)
            {
                // This image is weighted by a (taba), the other image by b (tabb).
                int d = taba[p1[0]] + tabb[p2[0]];
                p1[0] = d < Min ? Min : d > Max ? Max : (byte)d;
                d = taba[p1[1]] + tabb[p2[1]];
                p1[1] = d < Min ? Min : d > Max ? Max : (byte)d;
                d = taba[p1[2]] + tabb[p2[2]];
                p1[2] = d < Min ? Min : d > Max ? Max : (byte)d;
                d = taba[p1[3]] + tabb[p2[3]];
                p1[3] = d < Min ? Min : d > Max ? Max : (byte)d;
            }
        };
        Parallel.For(0, this.Height, act);
    }

    /// <summary>
    /// Box filter with a (2n+1) x (2n+1) window, computed as a horizontal running
    /// sum followed by a vertical running sum. Coordinates outside the image are
    /// clamped to the nearest edge pixel.
    /// </summary>
    /// <param name="src">Input image.</param>
    /// <param name="dst">Output image of the same size as <paramref name="src"/>; may be <paramref name="src"/> itself.</param>
    /// <param name="n">Window radius, from 0 to 1450 inclusive.</param>
    public static void Smooth(Image src, Image dst, int n)
    {
        if (src == null)
        {
            throw new ArgumentNullException("src");
        }
        if (dst == null)
        {
            throw new ArgumentNullException("dst");
        }
        if (src.IsClosed || dst.IsClosed)
        {
            throw new ObjectDisposedException("Image");
        }
        if (dst.Width != src.Width || dst.Height != src.Height)
        {
            throw new ArgumentException("Source and destination dimensions must match.", "dst");
        }
        if (n < 0 || n > MaxSmoothRadius)
        {
            throw new ArgumentOutOfRangeException("n");
        }

        // Scratch buffer holding one 32-bit horizontal window sum per source byte.
        int* tmp = (int*)Marshal.AllocHGlobal(new IntPtr((long)src.Length * sizeof(int))).ToPointer();
        try
        {
            // Horizontal pass: seed the window sum for column 0, then slide it along the row.
            Action<int> rowPass = y =>
            {
                byte* p = src.Pointer + y * src.Stride;
                int* t = tmp + y * src.Stride;
                int d = 0;
                for (int i = -n; i <= n; i++)
                {
                    d += p[Clamp(i, src.Width)];
                }
                t[0] = d;
                for (int x = 1; x < src.Stride; x++)
                {
                    // Clamp against Width, not Stride, so padding bytes never enter the window.
                    d += p[Clamp(x + n, src.Width)] - p[Clamp(x - n - 1, src.Width)];
                    t[x] = d;
                }
            };
            Parallel.For(0, src.Height, rowPass);

            // Normalisation factor 1 / (2n+1)^2.
            double f = 1.0 / (2 * n + 1);
            f *= f;

            // Vertical pass: same sliding-window scheme applied down each column of tmp.
            Action<int> columnPass = x =>
            {
                byte* p = dst.Pointer + x;
                int d = 0;
                for (int j = -n; j <= n; j++)
                {
                    d += tmp[x + Clamp(j, src.Height) * src.Stride];
                }
                *p = (byte)(d * f);
                p += dst.Stride;
                for (int y = 1; y < src.Height; y++, p += dst.Stride)
                {
                    int y1 = Clamp(y - n - 1, src.Height);
                    int y2 = Clamp(y + n, src.Height);
                    d += tmp[x + y2 * src.Stride] - tmp[x + y1 * src.Stride];
                    *p = (byte)(d * f);
                }
            };
            Parallel.For(0, src.Stride, columnPass);
        }
        finally
        {
            // Release the scratch buffer even when a worker throws.
            Marshal.FreeHGlobal(new IntPtr(tmp));
        }
    }

    /// <summary>Clamps <paramref name="i"/> to the range [0, <paramref name="max"/> - 1].</summary>
    private static int Clamp(int i, int max)
    {
        if (i < 0)
        {
            return 0;
        }
        if (i >= max)
        {
            return max - 1;
        }
        return i;
    }

    /// <summary>
    /// Verifies that <paramref name="img"/> can be combined pixel-by-pixel with this image.
    /// </summary>
    private void EnsureCompatible(Image img)
    {
        if (img == null)
        {
            throw new ArgumentNullException("img");
        }
        if (IsClosed || img.IsClosed)
        {
            throw new ObjectDisposedException("Image");
        }
        if (img.Width != Width || img.Height != Height)
        {
            // Equal Width and Height imply equal Stride and Length.
            throw new ArgumentException("Image dimensions must match.", "img");
        }
    }

    /// <summary>True while no unmanaged block is attached to this handle.</summary>
    public override bool IsInvalid
    {
        get { return handle == IntPtr.Zero; }
    }

    /// <summary>Frees the unmanaged pixel buffer.</summary>
    /// <returns>True when <c>LocalFree</c> reports success (a null return value).</returns>
    protected override bool ReleaseHandle()
    {
        return LocalFree(handle) == IntPtr.Zero;
    }
}
主要修改的地方如下:
1、将图像的每一行4字节对齐,增加Stride属性,其值等于Width向上取最近的4的倍数,然后在所有的for循环里,每次操作4个字节。这样一来,减少了循环次数。
2、减少浮点运算
A:Add/Sub方法中的临时变量d改为int型
B:Mul方法中,调整运算顺序,由scale * *p1 * *p2改为p1[0] * p2[0] * scale,区别在于,前一种先算scale * *p1,是一个浮点乘法,其结果也是浮点数,然后再算和*p2的乘积,共两次浮点乘法;而后一种先算p1[0] * p2[0],这是一次整数乘法,然后再算和scale的积,共一次整数乘法一次浮点乘法。由于浮点乘法比整数乘法慢,因此效率会有所提高。
3、AddWeighted改为使用查表法进行运算,首先算出0~255这256个数和a、b的积,放在数组taba、tabb中,其后的循环中只需查表再相加即可,效率大幅提高!
下面是优化后的测试结果(数值表示Image类方法和对应的OpenCV方法执行时间之比):
CPU:AMD Athlon(tm) II X4 640 3.00GHz (四核)
样本:600 X 896
-------------------------------------
Add 1.446 1.315
Sub 1.171 1.109
Mul 0.651 0.580
Threshold 1.511 1.432
Smooth 0.938 0.908
AddWeighted 0.528 0.474
CPU:AMD Athlon(tm) II X4 640 3.00GHz (四核)
样本:1600 X 1200
-------------------------------------
Add 1.041 1.052
Sub 0.910 0.906
Mul 0.562 0.558
Threshold 1.277 1.236
Smooth 1.020 1.024
AddWeighted 0.462 0.461
CPU:AMD Athlon(tm) II X2 245 2.91GHz (双核)
样本:1600 X 1200
-------------------------------------
Add 1.514 1.533
Sub 1.225 1.163
Mul 1.085 1.095
Threshold 1.643 1.630
Smooth 1.847 1.867
AddWeighted 0.957 0.924
CPU:AMD Athlon(tm) II X2 245 2.91GHz (双核)
样本:600 X 896
-------------------------------------
Add 2.559 2.073 2.676
Sub 2.240 1.784 1.856
Mul 1.261 1.352 1.284
Threshold 2.453 2.511 3.101
Smooth 1.660 1.647 1.663
AddWeighted 0.978 1.017 0.961
CPU:Intel Core i3 M330 2.13GHz (双核四线程)
样本:1600 X 1200
-------------------------------------
Add 2.611
Sub 2.545
Mul 1.011
Threshold 2.882
Smooth 1.891
AddWeighted 0.525
CPU:Intel Core i3 M330 2.13GHz (双核四线程)
样本:600 X 896
-------------------------------------
Add 4.483
Sub 3.576
Mul 1.101
Threshold 5.953
Smooth 2.029
AddWeighted 0.581
CPU:Intel Core i7 2630QM 2.00GHz (四核八线程)
样本:600 X 896
-------------------------------------
Add 1.080 1.020
Sub 0.977 1.010
Mul 0.575 0.558
Threshold 0.842 0.898
Smooth 1.447 1.386
AddWeighted 0.325 0.366
CPU:Intel Core i7 2630QM 2.00GHz (四核八线程)
样本:1600 X 1200
-------------------------------------
Add 1.420
Sub 1.134
Mul 0.535
Threshold 0.878
Smooth 1.379
AddWeighted 0.325
分析以上数据,我们不难发现以下几点:
1、样本大小相同时,CPU核心数越多,Image/OpenCV就越小,这说明了多线程算法在多核CPU下的优势。
2、CPU相同时,样本大小越大,比值越小。
3、OpenCV针对Intel CPU使用IPP进行了优化,因此在Intel CPU上跑,比值会比在AMD CPU上大很多。
4、OpenCV里使用SSE优化过的方法,用C#实现时差距比较明显,怎么才能达到差不多的效率,这个暂时还没想到。但是OpenCV里没使用SSE优化的方法,如Mul、AddWeighted,使用C#完全可以达到相同的性能,甚至超过OpenCV,如AddWeighted,在双核CPU上也比OpenCV要快,在4核以上CPU上远超OpenCV!!
5、使用自写方法替代OpenCV是完全可行的!!
以上测试有几点需要说明:
1、CLR是在第一次运行某个方法时才进行编译,因此第一次执行某个方法时会慢很多,在计算时间时要排除第一次执行的时间。
2、C#调用OpenCV需要经过封送处理,但封送处理所消耗的时间在这里无法避免。
各位看官如有其它优化修改意见,还望不吝赐教!!!同时我会不断对这个类进行修改以及添加其它图像处理方法,对图像处理、OpenCV以及C#代码优化感兴趣的同学,请关注本贴!!