# zyl910

 博客园 :: 首页 :: 博问 :: 闪存 :: 新随笔 :: 联系 :: 订阅 :: 管理 ::

## 一、背景

### 1.1 循环展开简介

循环展开（Loop unrolling）技术是一种提升程序执行速度的非常有效的优化方法，它可以由程序员手工编写，也可由编译器自动优化。循环展开的本质是，利用CPU指令级并行，来降低循环的开销，当然，同时也有利于指令流水线的高效调度。
……



## 二、在C#中使用

### 2.1 对基础算法做循环展开

/// <summary>
/// Sum - base (scalar baseline).
/// Adds the first <paramref name="count"/> elements of <paramref name="src"/>,
/// repeated <paramref name="loops"/> times (the repetition exists purely for benchmarking).
/// </summary>
/// <param name="src">Source array.</param>
/// <param name="count">Number of elements of src to sum.</param>
/// <param name="loops">Benchmark repetition count.</param>
/// <returns>Return the sum value.</returns>
private static float SumBase(float[] src, int count, int loops) {
    float total = 0; // Running sum.
    for (int pass = 0; pass < loops; ++pass) {
        for (int k = 0; k < count; ++k) {
            total += src[k];
        }
    }
    return total;
}


/// <summary>
/// Sum - base - Loop unrolling *4.
/// Scalar sum with the inner loop manually unrolled by 4, using four
/// independent accumulators (rt..rt3) so the additions can overlap in the
/// CPU pipeline instead of forming one long dependency chain.
/// </summary>
/// <param name="src">Source array.</param>
/// <param name="count">Number of elements of src to sum.</param>
/// <param name="loops">Benchmark repetition count.</param>
/// <returns>Return the sum value.</returns>
private static float SumBaseU4(float[] src, int count, int loops) {
float rt = 0; // Result.
float rt1 = 0;
float rt2 = 0;
float rt3 = 0;
int nBlockWidth = 4; // Block width.
int cntBlock = count / nBlockWidth; // Block count.
int cntRem = count % nBlockWidth; // Remainder count.
int p; // Index for src data.
int i;
for (int j = 0; j < loops; ++j) {
p = 0;
// Block process.
for (i = 0; i < cntBlock; ++i) {
rt += src[p];
rt1 += src[p + 1];
rt2 += src[p + 2];
rt3 += src[p + 3];
p += nBlockWidth;
}
// Remainder process. After the block loop, p already equals cntBlock * nBlockWidth.
//p = cntBlock * nBlockWidth;
for (i = 0; i < cntRem; ++i) {
rt += src[p + i];
}
}
// Reduce the four partial sums into the final result.
rt = rt + rt1 + rt2 + rt3;
return rt;
}


#### 2.1.1 测试结果:

SumBase:        6.871948E+10    # msUsed=4938, MFLOPS/s=829.485621709194
SumBaseU4:      2.748779E+11    # msUsed=1875, MFLOPS/s=2184.5333333333333, scale=2.6336


### 2.2 对 Vector4 版算法做循环展开

/// <summary>
/// Sum - Vector4 (System.Numerics.Vector4, 4-lane SIMD).
/// Packs src into a Vector4 array once (outside the timed loop), then the
/// benchmark loop accumulates it vector-wise; the tail that does not fill a
/// Vector4 is summed scalar-wise on every pass.
/// </summary>
/// <param name="src">Source array.</param>
/// <param name="count">Number of elements of src to sum.</param>
/// <param name="loops">Benchmark repetition count.</param>
/// <returns>Return the sum value.</returns>
private static float SumVector4(float[] src, int count, int loops) {
float rt = 0; // Result.
const int VectorWidth = 4;
int nBlockWidth = VectorWidth; // Block width.
int cntBlock = count / nBlockWidth; // Block count.
int cntRem = count % nBlockWidth; // Remainder count.
Vector4 vrt = Vector4.Zero; // Vector result.
int p; // Index for src data.
int i;
// Load: pack src into vectors once, outside the timed loop.
Vector4[] vsrc = new Vector4[cntBlock]; // Vector src.
p = 0;
for (i = 0; i < vsrc.Length; ++i) {
vsrc[i] = new Vector4(src[p], src[p + 1], src[p + 2], src[p + 3]);
p += VectorWidth;
}
// Body.
for (int j = 0; j < loops; ++j) {
// Vector process.
for (i = 0; i < cntBlock; ++i) {
// Equivalent to scalar model: rt += src[i];
vrt += vsrc[i]; // Add.
}
// Remainder process.
p = cntBlock * nBlockWidth;
for (i = 0; i < cntRem; ++i) {
rt += src[p + i];
}
}
// Reduce: horizontal add of the 4 lanes plus the scalar tail sum.
rt += vrt.X + vrt.Y + vrt.Z + vrt.W;
return rt;
}


/// <summary>
/// Sum - Vector4 - Loop unrolling *4.
/// Same contract as SumVector4, but each iteration of the vector loop
/// processes 4 Vector4 values into 4 independent accumulators (vrt..vrt3)
/// to reduce the loop-carried dependency chain.
/// </summary>
/// <param name="src">Source array.</param>
/// <param name="count">Number of elements of src to sum.</param>
/// <param name="loops">Benchmark repetition count.</param>
/// <returns>Return the sum value.</returns>
private static float SumVector4U4(float[] src, int count, int loops) {
float rt = 0; // Result.
const int LoopUnrolling = 4;
const int VectorWidth = 4;
int nBlockWidth = VectorWidth * LoopUnrolling; // Block width (elements per unrolled iteration).
int cntBlock = count / nBlockWidth; // Block count.
int cntRem = count % nBlockWidth; // Remainder count.
Vector4 vrt = Vector4.Zero; // Vector result.
Vector4 vrt1 = Vector4.Zero;
Vector4 vrt2 = Vector4.Zero;
Vector4 vrt3 = Vector4.Zero;
int p; // Index for src data.
int i;
// Load: pack src into vectors once, outside the timed loop.
Vector4[] vsrc = new Vector4[count / VectorWidth]; // Vector src.
p = 0;
for (i = 0; i < vsrc.Length; ++i) {
vsrc[i] = new Vector4(src[p], src[p + 1], src[p + 2], src[p + 3]);
p += VectorWidth;
}
// Body.
for (int j = 0; j < loops; ++j) {
p = 0;
// Vector process. p indexes vsrc (vectors), not src (floats).
for (i = 0; i < cntBlock; ++i) {
vrt += vsrc[p]; // Add.
vrt1 += vsrc[p + 1];
vrt2 += vsrc[p + 2];
vrt3 += vsrc[p + 3];
p += LoopUnrolling;
}
// Remainder process (scalar, back in src index space).
p = cntBlock * nBlockWidth;
for (i = 0; i < cntRem; ++i) {
rt += src[p + i];
}
}
// Reduce: combine partial vectors, then horizontal add of the 4 lanes.
vrt = vrt + vrt1 + vrt2 + vrt3;
rt += vrt.X + vrt.Y + vrt.Z + vrt.W;
return rt;
}


#### 2.2.1 测试结果:

SumBase:        6.871948E+10    # msUsed=4938, MFLOPS/s=829.485621709194
SumBaseU4:      2.748779E+11    # msUsed=1875, MFLOPS/s=2184.5333333333333, scale=2.6336
SumVector4:     2.748779E+11    # msUsed=1218, MFLOPS/s=3362.8899835796387, scale=4.054187192118227
SumVector4U4:   1.0995116E+12   # msUsed=532, MFLOPS/s=7699.248120300752, scale=9.281954887218046


SumVector4U4对比基础算法（SumBase），性能倍数是 9.281954887218046。
SumVector4U4对比未循环展开的算法（SumVector4），倍数是 9.281954887218046/4.054187192118227=2.2894736842105263092984587836542

### 2.3 对 Vector<T> 版算法做循环展开

/// <summary>
/// Sum - Vector&lt;T&gt; (System.Numerics.Vector&lt;float&gt;, hardware-sized SIMD;
/// 8 floats on the 256-bit machine shown in the results below).
/// Packs src into vectors once, then accumulates them in the benchmark loop;
/// the tail is summed scalar-wise on every pass.
/// </summary>
/// <param name="src">Source array.</param>
/// <param name="count">Number of elements of src to sum.</param>
/// <param name="loops">Benchmark repetition count.</param>
/// <returns>Return the sum value.</returns>
private static float SumVectorT(float[] src, int count, int loops) {
float rt = 0; // Result.
int VectorWidth = Vector<float>.Count; // Vector width (floats per vector).
int nBlockWidth = VectorWidth; // Block width.
int cntBlock = count / nBlockWidth; // Block count.
int cntRem = count % nBlockWidth; // Remainder count.
Vector<float> vrt = Vector<float>.Zero; // Vector result.
int p; // Index for src data.
int i;
// Load: pack src into vectors once, outside the timed loop.
Vector<float>[] vsrc = new Vector<float>[cntBlock]; // Vector src.
p = 0;
for (i = 0; i < vsrc.Length; ++i) {
vsrc[i] = new Vector<float>(src, p);
p += VectorWidth;
}
// Body.
for (int j = 0; j < loops; ++j) {
// Vector process.
for (i = 0; i < cntBlock; ++i) {
vrt += vsrc[i]; // Add.
}
// Remainder process.
p = cntBlock * nBlockWidth;
for (i = 0; i < cntRem; ++i) {
rt += src[p + i];
}
}
// Reduce: horizontal add of all lanes plus the scalar tail sum.
for (i = 0; i < VectorWidth; ++i) {
rt += vrt[i];
}
return rt;
}


/// <summary>
/// Sum - Vector&lt;T&gt; - Loop unrolling *4.
/// Same contract as SumVectorT, but each iteration of the vector loop
/// processes 4 vectors into 4 independent accumulators (vrt..vrt3).
/// </summary>
/// <param name="src">Source array.</param>
/// <param name="count">Number of elements of src to sum.</param>
/// <param name="loops">Benchmark repetition count.</param>
/// <returns>Return the sum value.</returns>
private static float SumVectorTU4(float[] src, int count, int loops) {
float rt = 0; // Result.
const int LoopUnrolling = 4;
int VectorWidth = Vector<float>.Count; // Vector width (floats per vector).
int nBlockWidth = VectorWidth * LoopUnrolling; // Block width (elements per unrolled iteration).
int cntBlock = count / nBlockWidth; // Block count.
int cntRem = count % nBlockWidth; // Remainder count.
Vector<float> vrt = Vector<float>.Zero; // Vector result.
Vector<float> vrt1 = Vector<float>.Zero;
Vector<float> vrt2 = Vector<float>.Zero;
Vector<float> vrt3 = Vector<float>.Zero;
int p; // Index for src data.
int i;
// Load: pack src into vectors once, outside the timed loop.
Vector<float>[] vsrc = new Vector<float>[count / VectorWidth]; // Vector src.
p = 0;
for (i = 0; i < vsrc.Length; ++i) {
vsrc[i] = new Vector<float>(src, p);
p += VectorWidth;
}
// Body.
for (int j = 0; j < loops; ++j) {
p = 0;
// Vector process. p indexes vsrc (vectors), not src (floats).
for (i = 0; i < cntBlock; ++i) {
vrt += vsrc[p]; // Add.
vrt1 += vsrc[p + 1];
vrt2 += vsrc[p + 2]; // BUGFIX: was vsrc[p + 1], which counted that vector twice and skipped this one.
vrt3 += vsrc[p + 3]; // BUGFIX: was vsrc[p + 1].
p += LoopUnrolling;
}
// Remainder process (scalar, back in src index space).
p = cntBlock * nBlockWidth;
for (i = 0; i < cntRem; ++i) {
rt += src[p + i];
}
}
// Reduce: combine partial vectors, then horizontal add of all lanes.
vrt = vrt + vrt1 + vrt2 + vrt3;
for (i = 0; i < VectorWidth; ++i) {
rt += vrt[i];
}
return rt;
}



#### 2.3.1 测试结果:

SumBase:        6.871948E+10    # msUsed=4938, MFLOPS/s=829.485621709194
SumBaseU4:      2.748779E+11    # msUsed=1875, MFLOPS/s=2184.5333333333333, scale=2.6336
SumVectorT:     5.497558E+11    # msUsed=609, MFLOPS/s=6725.7799671592775, scale=8.108374384236454
SumVectorTU4:   2.1990233E+12   # msUsed=203, MFLOPS/s=20177.339901477833, scale=24.32512315270936


SumVectorTU4对比基础算法（SumBase），性能倍数是 24.32512315270936。
SumVectorTU4对比未循环展开的算法（SumVectorT），倍数是 24.32512315270936/8.108374384236454=2.9999999999999997533414337788579

### 2.4 对 Avx版算法做循环展开

/// <summary>
/// Sum - Vector AVX.
/// Packs src into Vector256&lt;float&gt; values once (outside the timed loop),
/// then accumulates them with Avx.Add; the tail that does not fill a vector
/// is summed scalar-wise on every pass.
/// </summary>
/// <param name="src">Source array.</param>
/// <param name="count">Source array count.</param>
/// <param name="loops">Benchmark loops.</param>
/// <returns>Return the sum value.</returns>
private static float SumVectorAvx(float[] src, int count, int loops) {
#if Allow_Intrinsics
float rt = 0; // Result.
//int VectorWidth = 32 / 4; // sizeof(__m256) / sizeof(float);
int VectorWidth = Vector256<float>.Count; // Vector width: floats per 256-bit vector.
int nBlockWidth = VectorWidth; // Block width.
int cntBlock = count / nBlockWidth; // Block count.
int cntRem = count % nBlockWidth; // Remainder count.
Vector256<float> vrt = Vector256<float>.Zero; // Vector result.
int p; // Index for src data.
int i;
// Load: pack src into vectors once, outside the timed loop.
Vector256<float>[] vsrc = new Vector256<float>[cntBlock]; // Vector src.
p = 0;
for (i = 0; i < cntBlock; ++i) {
vsrc[i] = Vector256.Create(src[p], src[p + 1], src[p + 2], src[p + 3], src[p + 4], src[p + 5], src[p + 6], src[p + 7]); // Load.
p += VectorWidth;
}
// Body.
for (int j = 0; j < loops; ++j) {
// Vector process.
for (i = 0; i < cntBlock; ++i) {
vrt = Avx.Add(vrt, vsrc[i]);    // Add. vrt += vsrc[i];
}
// Remainder process.
p = cntBlock * nBlockWidth;
for (i = 0; i < cntRem; ++i) {
rt += src[p + i];
}
}
// Reduce: horizontal add of the 8 lanes plus the scalar tail sum.
for (i = 0; i < VectorWidth; ++i) {
rt += vrt.GetElement(i);
}
return rt;
#else
// Built without the Allow_Intrinsics symbol: hardware intrinsics unavailable.
throw new NotSupportedException();
#endif
}

/// <summary>
/// Sum - Vector AVX - Loop unrolling *4.
/// Same contract as SumVectorAvx, but each iteration of the vector loop
/// processes 4 vectors into 4 independent accumulators (vrt..vrt3).
/// </summary>
/// <param name="src">Source array.</param>
/// <param name="count">Source array count.</param>
/// <param name="loops">Benchmark loops.</param>
/// <returns>Return the sum value.</returns>
private static float SumVectorAvxU4(float[] src, int count, int loops) {
#if Allow_Intrinsics
float rt = 0; // Result.
const int LoopUnrolling = 4;
int VectorWidth = Vector256<float>.Count; // Vector width: floats per 256-bit vector.
int nBlockWidth = VectorWidth * LoopUnrolling; // Block width (elements per unrolled iteration).
int cntBlock = count / nBlockWidth; // Block count.
int cntRem = count % nBlockWidth; // Remainder count.
Vector256<float> vrt = Vector256<float>.Zero; // Vector result.
Vector256<float> vrt1 = Vector256<float>.Zero;
Vector256<float> vrt2 = Vector256<float>.Zero;
Vector256<float> vrt3 = Vector256<float>.Zero;
int p; // Index for src data.
int i;
// Load: pack src into vectors once, outside the timed loop.
Vector256<float>[] vsrc = new Vector256<float>[count / VectorWidth]; // Vector src.
p = 0;
for (i = 0; i < vsrc.Length; ++i) {
vsrc[i] = Vector256.Create(src[p], src[p + 1], src[p + 2], src[p + 3], src[p + 4], src[p + 5], src[p + 6], src[p + 7]); // Load.
p += VectorWidth;
}
// Body.
for (int j = 0; j < loops; ++j) {
p = 0;
// Vector process. p indexes vsrc (vectors), not src (floats).
for (i = 0; i < cntBlock; ++i) {
vrt = Avx.Add(vrt, vsrc[p]);    // Add. vrt += vsrc[p];
vrt1 = Avx.Add(vrt1, vsrc[p + 1]);
vrt2 = Avx.Add(vrt2, vsrc[p + 2]);
vrt3 = Avx.Add(vrt3, vsrc[p + 3]);
p += LoopUnrolling;
}
// Remainder process (scalar, back in src index space).
p = cntBlock * nBlockWidth;
for (i = 0; i < cntRem; ++i) {
rt += src[p + i];
}
}
// Reduce: combine partial vectors, then horizontal add of the 8 lanes.
vrt = Avx.Add(Avx.Add(vrt, vrt1), Avx.Add(vrt2, vrt3)); // vrt = vrt + vrt1 + vrt2 + vrt3;
for (i = 0; i < VectorWidth; ++i) {
rt += vrt.GetElement(i);
}
return rt;
#else
// Built without the Allow_Intrinsics symbol: hardware intrinsics unavailable.
throw new NotSupportedException();
#endif
}

/// <summary>
/// Sum - Vector AVX - Span.
/// Like SumVectorAvx, but instead of pre-packing a vector array it
/// reinterprets the float[] in place as a span of Vector256&lt;float&gt; via
/// MemoryMarshal.Cast, so no copy of the data is made.
/// </summary>
/// <param name="src">Source array.</param>
/// <param name="count">Source array count.</param>
/// <param name="loops">Benchmark loops.</param>
/// <returns>Return the sum value.</returns>
private static float SumVectorAvxSpan(float[] src, int count, int loops) {
#if Allow_Intrinsics
float rt = 0; // Result.
int VectorWidth = Vector256<float>.Count; // Vector width: floats per 256-bit vector.
int nBlockWidth = VectorWidth; // Block width.
int cntBlock = count / nBlockWidth; // Block count.
int cntRem = count % nBlockWidth; // Remainder count.
Vector256<float> vrt = Vector256<float>.Zero; // Vector result.
int p; // Index for src data.
ReadOnlySpan<Vector256<float>> vsrc; // Vector src.
int i;
// Body.
for (int j = 0; j < loops; ++j) {
// Vector process.
// NOTE(review): this cast is loop-invariant and could be hoisted above the j loop;
// it is re-done each pass here (spans cannot be kept across certain scopes pre-C# 7.2 rules).
vsrc = System.Runtime.InteropServices.MemoryMarshal.Cast<float, Vector256<float> >(new Span<float>(src)); // Reinterpret cast. float* to Vector256<float>*.
for (i = 0; i < cntBlock; ++i) {
vrt = Avx.Add(vrt, vsrc[i]);    // Add. vrt += vsrc[i];
}
// Remainder process.
p = cntBlock * nBlockWidth;
for (i = 0; i < cntRem; ++i) {
rt += src[p + i];
}
}
// Reduce: horizontal add of the 8 lanes plus the scalar tail sum.
for (i = 0; i < VectorWidth; ++i) {
rt += vrt.GetElement(i);
}
return rt;
#else
// Built without the Allow_Intrinsics symbol: hardware intrinsics unavailable.
throw new NotSupportedException();
#endif
}

/// <summary>
/// Sum - Vector AVX - Span - Loop unrolling *4.
/// Same contract as SumVectorAvxSpan, but each iteration of the vector loop
/// processes 4 vectors into 4 independent accumulators (vrt..vrt3).
/// </summary>
/// <param name="src">Source array.</param>
/// <param name="count">Source array count.</param>
/// <param name="loops">Benchmark loops.</param>
/// <returns>Return the sum value.</returns>
private static float SumVectorAvxSpanU4(float[] src, int count, int loops) {
#if Allow_Intrinsics
float rt = 0; // Result.
const int LoopUnrolling = 4;
int VectorWidth = Vector256<float>.Count; // Vector width: floats per 256-bit vector.
int nBlockWidth = VectorWidth * LoopUnrolling; // Block width (elements per unrolled iteration).
int cntBlock = count / nBlockWidth; // Block count.
int cntRem = count % nBlockWidth; // Remainder count.
Vector256<float> vrt = Vector256<float>.Zero; // Vector result.
Vector256<float> vrt1 = Vector256<float>.Zero;
Vector256<float> vrt2 = Vector256<float>.Zero;
Vector256<float> vrt3 = Vector256<float>.Zero;
int p; // Index for src data.
ReadOnlySpan<Vector256<float>> vsrc; // Vector src.
int i;
// Body.
for (int j = 0; j < loops; ++j) {
p = 0;
// Vector process. p indexes vsrc (vectors), not src (floats).
vsrc = System.Runtime.InteropServices.MemoryMarshal.Cast<float, Vector256<float>>(new Span<float>(src)); // Reinterpret cast. float* to Vector256<float>*.
for (i = 0; i < cntBlock; ++i) {
vrt = Avx.Add(vrt, vsrc[p]);    // Add. vrt += vsrc[p];
vrt1 = Avx.Add(vrt1, vsrc[p + 1]);
vrt2 = Avx.Add(vrt2, vsrc[p + 2]);
vrt3 = Avx.Add(vrt3, vsrc[p + 3]);
p += LoopUnrolling;
}
// Remainder process (scalar, back in src index space).
p = cntBlock * nBlockWidth;
for (i = 0; i < cntRem; ++i) {
rt += src[p + i];
}
}
// Reduce: combine partial vectors, then horizontal add of the 8 lanes.
vrt = Avx.Add(Avx.Add(vrt, vrt1), Avx.Add(vrt2, vrt3)); // vrt = vrt + vrt1 + vrt2 + vrt3;
for (i = 0; i < VectorWidth; ++i) {
rt += vrt.GetElement(i);
}
return rt;
#else
// Built without the Allow_Intrinsics symbol: hardware intrinsics unavailable.
throw new NotSupportedException();
#endif
}

/// <summary>
/// Sum - Vector AVX - Ptr.
/// Like SumVectorAvx, but reads the data through a pinned raw pointer
/// (fixed + Avx.LoadVector256), avoiding both the packed copy and
/// span bounds checks. Requires unsafe code.
/// </summary>
/// <param name="src">Source array.</param>
/// <param name="count">Source array count.</param>
/// <param name="loops">Benchmark loops.</param>
/// <returns>Return the sum value.</returns>
private static float SumVectorAvxPtr(float[] src, int count, int loops) {
#if Allow_Intrinsics && UNSAFE
unsafe {
float rt = 0; // Result.
int VectorWidth = Vector256<float>.Count; // Vector width: floats per 256-bit vector.
int nBlockWidth = VectorWidth; // Block width.
int cntBlock = count / nBlockWidth; // Block count.
int cntRem = count % nBlockWidth; // Remainder count.
Vector256<float> vrt = Vector256<float>.Zero; // Vector result.
Vector256<float> vload;
float* p; // Pointer for src data.
int i;
// Body. NOTE: throws IndexOutOfRangeException if src is empty (src[0]).
fixed(float* p0 = &src[0]) {
for (int j = 0; j < loops; ++j) {
p = p0;
// Vector process.
for (i = 0; i < cntBlock; ++i) {
vload = Avx.LoadVector256(p);    // Load. vload = *(*__m256)p;
vrt = Avx.Add(vrt, vload);    // Add. vrt += vsrc[i];
p += nBlockWidth;
}
// Remainder process: p now points at the tail elements.
for (i = 0; i < cntRem; ++i) {
rt += p[i];
}
}
}
// Reduce: horizontal add of the 8 lanes plus the scalar tail sum.
for (i = 0; i < VectorWidth; ++i) {
rt += vrt.GetElement(i);
}
return rt;
}
#else
// Built without Allow_Intrinsics/UNSAFE symbols: this path is unavailable.
throw new NotSupportedException();
#endif
}

/// <summary>
/// Sum - Vector AVX - Ptr - Loop unrolling *4.
/// Same contract as SumVectorAvxPtr, but each iteration loads and
/// accumulates 4 vectors into 4 independent accumulators (vrt..vrt3).
/// </summary>
/// <param name="src">Source array.</param>
/// <param name="count">Source array count.</param>
/// <param name="loops">Benchmark loops.</param>
/// <returns>Return the sum value.</returns>
private static float SumVectorAvxPtrU4(float[] src, int count, int loops) {
#if Allow_Intrinsics && UNSAFE
unsafe {
float rt = 0; // Result.
const int LoopUnrolling = 4;
int VectorWidth = Vector256<float>.Count; // Vector width: floats per 256-bit vector.
int nBlockWidth = VectorWidth * LoopUnrolling; // Block width (elements per unrolled iteration).
int cntBlock = count / nBlockWidth; // Block count.
int cntRem = count % nBlockWidth; // Remainder count.
Vector256<float> vrt = Vector256<float>.Zero; // Vector result.
Vector256<float> vrt1 = Vector256<float>.Zero;
Vector256<float> vrt2 = Vector256<float>.Zero;
Vector256<float> vrt3 = Vector256<float>.Zero;
Vector256<float> vload;
Vector256<float> vload1;
Vector256<float> vload2;
Vector256<float> vload3;
float* p; // Pointer for src data.
int i;
// Body. NOTE: throws IndexOutOfRangeException if src is empty (src[0]).
fixed (float* p0 = &src[0]) {
for (int j = 0; j < loops; ++j) {
p = p0;
// Vector process: 4 loads, then 4 independent adds.
for (i = 0; i < cntBlock; ++i) {
vload = Avx.LoadVector256(p);    // Load. vload = *(*__m256)p;
vload1 = Avx.LoadVector256(p + VectorWidth * 1);
vload2 = Avx.LoadVector256(p + VectorWidth * 2);
vload3 = Avx.LoadVector256(p + VectorWidth * 3);
vrt = Avx.Add(vrt, vload);    // Add. vrt += vsrc[i];
vrt1 = Avx.Add(vrt1, vload1);
vrt2 = Avx.Add(vrt2, vload2);
vrt3 = Avx.Add(vrt3, vload3);
p += nBlockWidth;
}
// Remainder process: p now points at the tail elements.
for (i = 0; i < cntRem; ++i) {
rt += p[i];
}
}
}
// Reduce: combine partial vectors, then horizontal add of the 8 lanes.
vrt = Avx.Add(Avx.Add(vrt, vrt1), Avx.Add(vrt2, vrt3)); // vrt = vrt + vrt1 + vrt2 + vrt3;
for (i = 0; i < VectorWidth; ++i) {
rt += vrt.GetElement(i);
}
return rt;
}
#else
// Built without Allow_Intrinsics/UNSAFE symbols: this path is unavailable.
throw new NotSupportedException();
#endif
}



#### 2.4.1 测试结果:

SumBase:        6.871948E+10    # msUsed=4938, MFLOPS/s=829.485621709194
SumBaseU4:      2.748779E+11    # msUsed=1875, MFLOPS/s=2184.5333333333333, scale=2.6336
SumVectorAvx:   5.497558E+11    # msUsed=609, MFLOPS/s=6725.7799671592775, scale=8.108374384236454
SumVectorAvxSpan:       5.497558E+11    # msUsed=625, MFLOPS/s=6553.6, scale=7.9008
SumVectorAvxPtr:        5.497558E+11    # msUsed=610, MFLOPS/s=6714.754098360656, scale=8.095081967213115
SumVectorAvxU4: 2.1990233E+12   # msUsed=328, MFLOPS/s=12487.80487804878, scale=15.054878048780488
SumVectorAvxSpanU4:     2.1990233E+12   # msUsed=312, MFLOPS/s=13128.205128205129, scale=15.826923076923078
SumVectorAvxPtrU4:      2.1990233E+12   # msUsed=157, MFLOPS/s=26089.171974522294, scale=31.452229299363058


SumVectorAvxPtrU4 对比基础算法（SumBase），性能倍数是 31.452229299363058。
SumVectorAvxPtrU4 对比未循环展开的算法（SumVectorAvxPtr），倍数是 31.452229299363058/8.095081967213115=3.8853503184713375449974589366746。

### 2.5 对 Avx版算法做循环展开16次

/// <summary>
/// Sum - Vector AVX - Ptr - Loop unrolling *16.
/// Same contract as SumVectorAvxPtr, unrolled 16x with 16 independent
/// accumulators (vrt..vrt15); loads are folded directly into the Add calls.
/// </summary>
/// <param name="src">Source array.</param>
/// <param name="count">Source array count.</param>
/// <param name="loops">Benchmark loops.</param>
/// <returns>Return the sum value.</returns>
private static float SumVectorAvxPtrU16(float[] src, int count, int loops) {
#if Allow_Intrinsics && UNSAFE
unsafe {
float rt = 0; // Result.
const int LoopUnrolling = 16;
int VectorWidth = Vector256<float>.Count; // Vector width: floats per 256-bit vector.
int nBlockWidth = VectorWidth * LoopUnrolling; // Block width (elements per unrolled iteration).
int cntBlock = count / nBlockWidth; // Block count.
int cntRem = count % nBlockWidth; // Remainder count.
Vector256<float> vrt = Vector256<float>.Zero; // Vector result.
Vector256<float> vrt1 = Vector256<float>.Zero;
Vector256<float> vrt2 = Vector256<float>.Zero;
Vector256<float> vrt3 = Vector256<float>.Zero;
Vector256<float> vrt4 = Vector256<float>.Zero;
Vector256<float> vrt5 = Vector256<float>.Zero;
Vector256<float> vrt6 = Vector256<float>.Zero;
Vector256<float> vrt7 = Vector256<float>.Zero;
Vector256<float> vrt8 = Vector256<float>.Zero;
Vector256<float> vrt9 = Vector256<float>.Zero;
Vector256<float> vrt10 = Vector256<float>.Zero;
Vector256<float> vrt11 = Vector256<float>.Zero;
Vector256<float> vrt12 = Vector256<float>.Zero;
Vector256<float> vrt13 = Vector256<float>.Zero;
Vector256<float> vrt14 = Vector256<float>.Zero;
Vector256<float> vrt15 = Vector256<float>.Zero;
float* p; // Pointer for src data.
int i;
// Body. NOTE: throws IndexOutOfRangeException if src is empty (src[0]).
fixed (float* p0 = &src[0]) {
for (int j = 0; j < loops; ++j) {
p = p0;
// Vector process: 16 independent load+add chains per iteration.
for (i = 0; i < cntBlock; ++i) {
//vload = Avx.LoadVector256(p);    // Load. vload = *(*__m256)p;
vrt = Avx.Add(vrt, Avx.LoadVector256(p)); // Add. vrt[k] += *((*__m256)(p)+k);
vrt1 = Avx.Add(vrt1, Avx.LoadVector256(p + VectorWidth * 1));
vrt2 = Avx.Add(vrt2, Avx.LoadVector256(p + VectorWidth * 2));
vrt3 = Avx.Add(vrt3, Avx.LoadVector256(p + VectorWidth * 3));
vrt4 = Avx.Add(vrt4, Avx.LoadVector256(p + VectorWidth * 4));
vrt5 = Avx.Add(vrt5, Avx.LoadVector256(p + VectorWidth * 5));
vrt6 = Avx.Add(vrt6, Avx.LoadVector256(p + VectorWidth * 6));
vrt7 = Avx.Add(vrt7, Avx.LoadVector256(p + VectorWidth * 7));
vrt8 = Avx.Add(vrt8, Avx.LoadVector256(p + VectorWidth * 8));
vrt9 = Avx.Add(vrt9, Avx.LoadVector256(p + VectorWidth * 9));
vrt10 = Avx.Add(vrt10, Avx.LoadVector256(p + VectorWidth * 10));
vrt11 = Avx.Add(vrt11, Avx.LoadVector256(p + VectorWidth * 11));
vrt12 = Avx.Add(vrt12, Avx.LoadVector256(p + VectorWidth * 12));
vrt13 = Avx.Add(vrt13, Avx.LoadVector256(p + VectorWidth * 13));
vrt14 = Avx.Add(vrt14, Avx.LoadVector256(p + VectorWidth * 14));
vrt15 = Avx.Add(vrt15, Avx.LoadVector256(p + VectorWidth * 15));
p += nBlockWidth;
}
// Remainder process: p now points at the tail elements.
for (i = 0; i < cntRem; ++i) {
rt += p[i];
}
}
}
// Reduce: pairwise tree-combine the 16 partial vectors, then horizontal add.
vrt = Avx.Add( Avx.Add( Avx.Add(Avx.Add(vrt, vrt1), Avx.Add(vrt2, vrt3))
, Avx.Add(Avx.Add(vrt4, vrt5), Avx.Add(vrt6, vrt7)) )
, Avx.Add( Avx.Add(Avx.Add(vrt8, vrt9), Avx.Add(vrt10, vrt11))
, Avx.Add(Avx.Add(vrt12, vrt13), Avx.Add(vrt14, vrt15)) ) )
; // vrt = vrt + vrt1 + vrt2 + vrt3 + ... vrt15;
for (i = 0; i < VectorWidth; ++i) {
rt += vrt.GetElement(i);
}
return rt;
}
#else
// Built without Allow_Intrinsics/UNSAFE symbols: this path is unavailable.
throw new NotSupportedException();
#endif
}


#### 2.5.1 测试结果:

SumBase:        6.871948E+10    # msUsed=4938, MFLOPS/s=829.485621709194
SumBaseU4:      2.748779E+11    # msUsed=1875, MFLOPS/s=2184.5333333333333, scale=2.6336
SumVectorAvxPtr:        5.497558E+11    # msUsed=610, MFLOPS/s=6714.754098360656, scale=8.095081967213115
SumVectorAvxPtrU4:      2.1990233E+12   # msUsed=157, MFLOPS/s=26089.171974522294, scale=31.452229299363058
SumVectorAvxPtrU16:     8.386202E+12    # msUsed=125, MFLOPS/s=32768, scale=39.504


SumVectorAvxPtrU16 对比基础算法（SumBase），性能倍数是 39.504。
SumVectorAvxPtrU16 对比未循环展开的算法（SumVectorAvxPtr），倍数是 39.504/8.095081967213115=4.8799999999999998517618469015796。
SumVectorAvxPtrU16 对比循环展开4次的算法（SumVectorAvxPtrU4），倍数是 39.504/31.452229299363058=1.2559999999999999730384771162414。

### 2.6 尝试用数组来存储循环展开的临时变量

/// <summary>
/// Sum - Vector AVX - Ptr - Loop unrolling *16, accumulators in a stack array.
/// Variant of SumVectorAvxPtrU16 that keeps the 16 accumulators in a
/// stackalloc'd array instead of 16 locals.
/// NOTE(review): the benchmark table below shows this is slower than the
/// 16-local version — presumably the indexed stack array keeps the
/// accumulators in memory rather than registers; confirm with disassembly.
/// </summary>
/// <param name="src">Source array.</param>
/// <param name="count">Source array count.</param>
/// <param name="loops">Benchmark loops.</param>
/// <returns>Return the sum value.</returns>
private static float SumVectorAvxPtrU16A(float[] src, int count, int loops) {
#if Allow_Intrinsics && UNSAFE
unsafe {
float rt = 0; // Result.
const int LoopUnrolling = 16;
int VectorWidth = Vector256<float>.Count; // Vector width: floats per 256-bit vector.
int nBlockWidth = VectorWidth * LoopUnrolling; // Block width (elements per unrolled iteration).
int cntBlock = count / nBlockWidth; // Block count.
int cntRem = count % nBlockWidth; // Remainder count.
int i;
//Vector256<float>[] vrt = new Vector256<float>[LoopUnrolling]; // Vector result.
Vector256<float>* vrt = stackalloc Vector256<float>[LoopUnrolling]; ; // Vector result (stack-allocated accumulators).
for (i = 0; i< LoopUnrolling; ++i) {
vrt[i] = Vector256<float>.Zero;
}
float* p; // Pointer for src data.
// Body. NOTE: throws IndexOutOfRangeException if src is empty (src[0]).
fixed (float* p0 = &src[0]) {
for (int j = 0; j < loops; ++j) {
p = p0;
// Vector process: 16 independent load+add chains per iteration.
for (i = 0; i < cntBlock; ++i) {
//vload = Avx.LoadVector256(p);    // Load. vload = *(*__m256)p;
vrt[0] = Avx.Add(vrt[0], Avx.LoadVector256(p)); // Add. vrt[k] += *((*__m256)(p)+k);
vrt[1] = Avx.Add(vrt[1], Avx.LoadVector256(p + VectorWidth * 1));
vrt[2] = Avx.Add(vrt[2], Avx.LoadVector256(p + VectorWidth * 2));
vrt[3] = Avx.Add(vrt[3], Avx.LoadVector256(p + VectorWidth * 3));
vrt[4] = Avx.Add(vrt[4], Avx.LoadVector256(p + VectorWidth * 4));
vrt[5] = Avx.Add(vrt[5], Avx.LoadVector256(p + VectorWidth * 5));
vrt[6] = Avx.Add(vrt[6], Avx.LoadVector256(p + VectorWidth * 6));
vrt[7] = Avx.Add(vrt[7], Avx.LoadVector256(p + VectorWidth * 7));
vrt[8] = Avx.Add(vrt[8], Avx.LoadVector256(p + VectorWidth * 8));
vrt[9] = Avx.Add(vrt[9], Avx.LoadVector256(p + VectorWidth * 9));
vrt[10] = Avx.Add(vrt[10], Avx.LoadVector256(p + VectorWidth * 10));
vrt[11] = Avx.Add(vrt[11], Avx.LoadVector256(p + VectorWidth * 11));
vrt[12] = Avx.Add(vrt[12], Avx.LoadVector256(p + VectorWidth * 12));
vrt[13] = Avx.Add(vrt[13], Avx.LoadVector256(p + VectorWidth * 13));
vrt[14] = Avx.Add(vrt[14], Avx.LoadVector256(p + VectorWidth * 14));
vrt[15] = Avx.Add(vrt[15], Avx.LoadVector256(p + VectorWidth * 15));
p += nBlockWidth;
}
// Remainder process: p now points at the tail elements.
for (i = 0; i < cntRem; ++i) {
rt += p[i];
}
}
}
// Reduce: fold all partial vectors into vrt[0], then horizontal add.
for (i = 1; i < LoopUnrolling; ++i) {
vrt[0] = Avx.Add(vrt[0], vrt[i]); // vrt[0] += vrt[i]
}
for (i = 0; i < VectorWidth; ++i) {
rt += vrt[0].GetElement(i);
}
return rt;
}
#else
// Built without Allow_Intrinsics/UNSAFE symbols: this path is unavailable.
throw new NotSupportedException();
#endif
}


#### 2.6.1 测试结果:

SumBase:        6.871948E+10    # msUsed=4938, MFLOPS/s=829.485621709194
SumBaseU4:      2.748779E+11    # msUsed=1875, MFLOPS/s=2184.5333333333333, scale=2.6336
SumVectorAvxPtr:        5.497558E+11    # msUsed=610, MFLOPS/s=6714.754098360656, scale=8.095081967213115
SumVectorAvxPtrU4:      2.1990233E+12   # msUsed=157, MFLOPS/s=26089.171974522294, scale=31.452229299363058
SumVectorAvxPtrU16:     8.386202E+12    # msUsed=125, MFLOPS/s=32768, scale=39.504
SumVectorAvxPtrU16A:    8.3862026E+12   # msUsed=187, MFLOPS/s=21903.74331550802, scale=26.406417112299465


### 2.7 尝试用参数指定循环展开次数（栈数组存放累加器）

/// <summary>
/// Sum - Vector AVX - Ptr - parameterized loop unrolling.
/// Like SumVectorAvxPtrU16A, but the unroll factor is a runtime parameter
/// and the unrolled body is an inner loop over k.
/// NOTE(review): the benchmark table below shows this is much slower than the
/// hard-coded variants — presumably the runtime-bounded inner loop defeats the
/// JIT's unrolling/register allocation; confirm with disassembly.
/// </summary>
/// <param name="src">Source array.</param>
/// <param name="count">Source array count.</param>
/// <param name="loops">Benchmark loops.</param>
/// <param name="LoopUnrolling">Unroll factor (number of vectors per block); must be positive.</param>
/// <returns>Return the sum value.</returns>
/// <exception cref="ArgumentOutOfRangeException">LoopUnrolling is not positive.</exception>
private static float SumVectorAvxPtrUX(float[] src, int count, int loops, int LoopUnrolling) {
#if Allow_Intrinsics && UNSAFE
unsafe {
float rt = 0; // Result.
//const int LoopUnrolling = 16;
if (LoopUnrolling <= 0) throw new ArgumentOutOfRangeException("LoopUnrolling", "Argument LoopUnrolling must >0 !");
int VectorWidth = Vector256<float>.Count; // Vector width: floats per 256-bit vector.
int nBlockWidth = VectorWidth * LoopUnrolling; // Block width (elements per unrolled iteration).
int cntBlock = count / nBlockWidth; // Block count.
int cntRem = count % nBlockWidth; // Remainder count.
int i;
//Vector256<float>[] vrt = new Vector256<float>[LoopUnrolling]; // Vector result.
Vector256<float>* vrt = stackalloc Vector256<float>[LoopUnrolling]; ; // Vector result (stack-allocated accumulators).
for (i = 0; i < LoopUnrolling; ++i) {
vrt[i] = Vector256<float>.Zero;
}
float* p; // Pointer for src data.
// Body. NOTE: throws IndexOutOfRangeException if src is empty (src[0]).
fixed (float* p0 = &src[0]) {
for (int j = 0; j < loops; ++j) {
p = p0;
// Vector process: LoopUnrolling independent load+add chains per iteration.
for (i = 0; i < cntBlock; ++i) {
for(int k=0; k< LoopUnrolling; ++k) {
vrt[k] = Avx.Add(vrt[k], Avx.LoadVector256(p + VectorWidth * k)); // Add. vrt[k] += *(*__m256)(p+k);
}
p += nBlockWidth;
}
// Remainder process: p now points at the tail elements.
for (i = 0; i < cntRem; ++i) {
rt += p[i];
}
}
}
// Reduce: fold all partial vectors into vrt[0], then horizontal add.
for (i = 1; i < LoopUnrolling; ++i) {
vrt[0] = Avx.Add(vrt[0], vrt[i]); // vrt[0] += vrt[i]
} // vrt = vrt + vrt1 + vrt2 + vrt3 + ... vrt15;
for (i = 0; i < VectorWidth; ++i) {
rt += vrt[0].GetElement(i);
}
return rt;
}
#else
// Built without Allow_Intrinsics/UNSAFE symbols: this path is unavailable.
throw new NotSupportedException();
#endif
}


// SumVectorAvxPtrUX benchmark driver: run the parameterized-unrolling kernel
// with several unroll factors and report throughput for each.
// NOTE(review): relies on tickBegin/msUsed/mFlops/scale/rt/tw/indent/src/count/
// loops/countMFlops/mFlopsBase declared by the surrounding harness (not shown here).
int[] LoopUnrollingArray = { 4, 8, 16 };
foreach (int loopUnrolling in LoopUnrollingArray) {
tickBegin = Environment.TickCount;
rt = SumVectorAvxPtrUX(src, count, loops, loopUnrolling);
msUsed = Environment.TickCount - tickBegin;
mFlops = countMFlops * 1000 / msUsed;
scale = mFlops / mFlopsBase;
tw.WriteLine(indent + string.Format("SumVectorAvxPtrUX[{4}]:\t{0}\t# msUsed={1}, MFLOPS/s={2}, scale={3}", rt, msUsed, mFlops, scale, loopUnrolling));
}


#### 2.7.1 测试结果:

SumBase:        6.871948E+10    # msUsed=4938, MFLOPS/s=829.485621709194
SumBaseU4:      2.748779E+11    # msUsed=1875, MFLOPS/s=2184.5333333333333, scale=2.6336
SumVectorAvxPtr:        5.497558E+11    # msUsed=610, MFLOPS/s=6714.754098360656, scale=8.095081967213115
SumVectorAvxPtrU4:      2.1990233E+12   # msUsed=157, MFLOPS/s=26089.171974522294, scale=31.452229299363058
SumVectorAvxPtrU16:     8.386202E+12    # msUsed=125, MFLOPS/s=32768, scale=39.504
SumVectorAvxPtrU16A:    8.3862026E+12   # msUsed=187, MFLOPS/s=21903.74331550802, scale=26.406417112299465
SumVectorAvxPtrUX[4]:   2.1990233E+12   # msUsed=547, MFLOPS/s=7488.117001828154, scale=9.027422303473491
SumVectorAvxPtrUX[8]:   4.3980465E+12   # msUsed=500, MFLOPS/s=8192, scale=9.876
SumVectorAvxPtrUX[16]:  8.3862026E+12   # msUsed=500, MFLOPS/s=8192, scale=9.876


### 2.8 测试结果汇总

BenchmarkVectorCore30

IsRelease:      True
EnvironmentVariable(PROCESSOR_IDENTIFIER):      Intel64 Family 6 Model 142 Stepping 10, GenuineIntel
Environment.ProcessorCount:     8
Environment.Is64BitOperatingSystem:     True
Environment.Is64BitProcess:     True
Environment.OSVersion:  Microsoft Windows NT 6.2.9200.0
Environment.Version:    3.1.26
RuntimeEnvironment.GetRuntimeDirectory: C:\Program Files\dotnet\shared\Microsoft.NETCore.App\3.1.26\
RuntimeInformation.FrameworkDescription:        .NET Core 3.1.26
BitConverter.IsLittleEndian:    True
IntPtr.Size:    8
Vector.IsHardwareAccelerated:   True
Vector<byte>.Count:     32      # 256bit
Vector<float>.Count:    8       # 256bit
Vector<double>.Count:   4       # 256bit
Vector4.Assembly.CodeBase:      file:///C:/Program Files/dotnet/shared/Microsoft.NETCore.App/3.1.26/System.Numerics.Vectors.dll
Vector<T>.Assembly.CodeBase:    file:///C:/Program Files/dotnet/shared/Microsoft.NETCore.App/3.1.26/System.Private.CoreLib.dll

Benchmark:      count=4096, loops=1000000, countMFlops=4096
SumBase:        6.871948E+10    # msUsed=4938, MFLOPS/s=829.485621709194
SumBaseU4:      2.748779E+11    # msUsed=1875, MFLOPS/s=2184.5333333333333, scale=2.6336
SumVector4:     2.748779E+11    # msUsed=1218, MFLOPS/s=3362.8899835796387, scale=4.054187192118227
SumVector4U4:   1.0995116E+12   # msUsed=532, MFLOPS/s=7699.248120300752, scale=9.281954887218046
SumVectorT:     5.497558E+11    # msUsed=609, MFLOPS/s=6725.7799671592775, scale=8.108374384236454
SumVectorTU4:   2.1990233E+12   # msUsed=203, MFLOPS/s=20177.339901477833, scale=24.32512315270936
SumVectorAvx:   5.497558E+11    # msUsed=609, MFLOPS/s=6725.7799671592775, scale=8.108374384236454
SumVectorAvxSpan:       5.497558E+11    # msUsed=625, MFLOPS/s=6553.6, scale=7.9008
SumVectorAvxPtr:        5.497558E+11    # msUsed=610, MFLOPS/s=6714.754098360656, scale=8.095081967213115
SumVectorAvxU4: 2.1990233E+12   # msUsed=328, MFLOPS/s=12487.80487804878, scale=15.054878048780488
SumVectorAvxSpanU4:     2.1990233E+12   # msUsed=312, MFLOPS/s=13128.205128205129, scale=15.826923076923078
SumVectorAvxPtrU4:      2.1990233E+12   # msUsed=157, MFLOPS/s=26089.171974522294, scale=31.452229299363058
SumVectorAvxPtrU16:     8.386202E+12    # msUsed=125, MFLOPS/s=32768, scale=39.504
SumVectorAvxPtrU16A:    8.3862026E+12   # msUsed=187, MFLOPS/s=21903.74331550802, scale=26.406417112299465
SumVectorAvxPtrUX[4]:   2.1990233E+12   # msUsed=547, MFLOPS/s=7488.117001828154, scale=9.027422303473491
SumVectorAvxPtrUX[8]:   4.3980465E+12   # msUsed=500, MFLOPS/s=8192, scale=9.876
SumVectorAvxPtrUX[16]:  8.3862026E+12   # msUsed=500, MFLOPS/s=8192, scale=9.876


## 三、在C++中使用

### 3.1 修改代码

BenchmarkVectorCpp.cpp 的全部代码如下:

// BenchmarkVectorCpp.cpp : This file contains the 'main' function. Program execution begins and ends there.
//

#include <immintrin.h>
#include <malloc.h>
#include <stdio.h>
#include <time.h>

#ifndef EXCEPTION_EXECUTE_HANDLER
#define EXCEPTION_EXECUTE_HANDLER (1)
#endif // !EXCEPTION_EXECUTE_HANDLER

// Sum - base (scalar baseline).
// Accumulates all `count` elements of `src`; the whole pass is repeated
// `loops` times purely for benchmarking purposes.
float SumBase(const float* src, size_t count, int loops) {
    float total = 0; // Running sum.
    for (int pass = 0; pass < loops; ++pass) {
        for (size_t k = 0; k < count; ++k) {
            total += src[k];
        }
    }
    return total;
}

// Sum - base - Loop unrolling *4.
// Same contract as SumBase, but the inner loop is manually unrolled by 4
// with four independent accumulators to expose instruction-level parallelism.
// Accumulation order matches the straightforward unrolled form: lane sums
// first, scalar tail into the first accumulator, then a final reduce.
float SumBaseU4(const float* src, size_t count, int loops) {
    float acc0 = 0; // Partial sums (acc0 also absorbs the tail).
    float acc1 = 0;
    float acc2 = 0;
    float acc3 = 0;
    const size_t step = 4;              // Unroll width.
    const size_t blocks = count / step; // Number of full 4-element groups.
    const size_t tail = count % step;   // Leftover elements.
    for (int pass = 0; pass < loops; ++pass) {
        size_t idx = 0; // Index into src.
        // Unrolled block loop.
        for (size_t b = 0; b < blocks; ++b) {
            acc0 += src[idx];
            acc1 += src[idx + 1];
            acc2 += src[idx + 2];
            acc3 += src[idx + 3];
            idx += step;
        }
        // Scalar tail (idx already equals blocks * step).
        for (size_t r = 0; r < tail; ++r) {
            acc0 += src[idx + r];
        }
    }
    // Reduce the four partial sums.
    return acc0 + acc1 + acc2 + acc3;
}

// Sum - Vector AVX.
// Accumulates `count` floats from `src` with 256-bit AVX adds, repeated
// `loops` times for benchmarking; the tail that does not fill a __m256 is
// summed scalar-wise on every pass.
// NOTE(review): _mm256_load_ps requires a 32-byte-aligned pointer — this
// assumes the caller allocated src with 32-byte alignment (confirm, or
// switch to _mm256_loadu_ps).
float SumVectorAvx(const float* src, size_t count, int loops) {
float rt = 0; // Result.
size_t VectorWidth = sizeof(__m256) / sizeof(float); // Vector width: 8 floats per 256-bit vector.
size_t nBlockWidth = VectorWidth; // Block width.
size_t cntBlock = count / nBlockWidth; // Block count.
size_t cntRem = count % nBlockWidth; // Remainder count.
__m256 vrt = _mm256_setzero_ps(); // Vector result. [AVX] Set zero.
__m256 vload; // Vector load.
const float* p; // Pointer for src data.
size_t i;
// Body.
for (int j = 0; j < loops; ++j) {
p = src;
// Vector process.
for (i = 0; i < cntBlock; ++i) {
vload = _mm256_load_ps(p);    // Load. vload = *(*__m256)p;
vrt = _mm256_add_ps(vrt, vload);    // Add. vrt += vload;
p += nBlockWidth;
}
// Remainder process: p now points at the tail elements.
for (i = 0; i < cntRem; ++i) {
rt += p[i];
}
}
// Reduce: view vrt as 8 floats and horizontally add the lanes.
p = (const float*)&vrt;
for (i = 0; i < VectorWidth; ++i) {
rt += p[i];
}
return rt;
}

// Sum - Vector AVX - Loop unrolling *4.
// Same contract as SumVectorAvx, but each iteration loads and accumulates
// 4 vectors into 4 independent accumulators (vrt..vrt3) to shorten the
// loop-carried dependency chain.
// NOTE(review): _mm256_load_ps requires 32-byte alignment — same caveat
// as SumVectorAvx.
float SumVectorAvxU4(const float* src, size_t count, int loops) {
float rt = 0;    // Result.
const int LoopUnrolling = 4;
size_t VectorWidth = sizeof(__m256) / sizeof(float); // Vector width: 8 floats per 256-bit vector.
size_t nBlockWidth = VectorWidth * LoopUnrolling; // Block width (elements per unrolled iteration).
size_t cntBlock = count / nBlockWidth; // Block count.
size_t cntRem = count % nBlockWidth; // Remainder count.
__m256 vrt = _mm256_setzero_ps(); // Vector result. [AVX] Set zero.
__m256 vrt1 = _mm256_setzero_ps();
__m256 vrt2 = _mm256_setzero_ps();
__m256 vrt3 = _mm256_setzero_ps();
__m256 vload; // Vector load.
__m256 vload1, vload2, vload3;
const float* p; // Pointer for src data.
size_t i;
// Body.
for (int j = 0; j < loops; ++j) {
p = src;
// Block process: 4 loads, then 4 independent adds.
for (i = 0; i < cntBlock; ++i) {
vload = _mm256_load_ps(p);    // Load. vload = *(*__m256)p;
vload1 = _mm256_load_ps(p + VectorWidth * 1);
vload2 = _mm256_load_ps(p + VectorWidth * 2);
vload3 = _mm256_load_ps(p + VectorWidth * 3);
vrt = _mm256_add_ps(vrt, vload);    // Add. vrt += vload;
vrt1 = _mm256_add_ps(vrt1, vload1);
vrt2 = _mm256_add_ps(vrt2, vload2);
vrt3 = _mm256_add_ps(vrt3, vload3);
p += nBlockWidth;
}
// Remainder process: p now points at the tail elements.
for (i = 0; i < cntRem; ++i) {
rt += p[i];
}
}
// Reduce: combine partial vectors, then horizontally add the 8 lanes.
vrt = _mm256_add_ps(_mm256_add_ps(vrt, vrt1), _mm256_add_ps(vrt2, vrt3)); // vrt = vrt + vrt1 + vrt2 + vrt3;
p = (const float*)&vrt;
for (i = 0; i < VectorWidth; ++i) {
rt += p[i];
}
return rt;
}

// Sum - Vector AVX - Loop unrolling *16.
float SumVectorAvxU16(const float* src, size_t count, int loops) {
float rt = 0;    // Result.
const int LoopUnrolling = 16;
size_t VectorWidth = sizeof(__m256) / sizeof(float); // Block width.
size_t nBlockWidth = VectorWidth * LoopUnrolling; // Block width.
size_t cntBlock = count / nBlockWidth; // Block count.
size_t cntRem = count % nBlockWidth; // Remainder count.
__m256 vrt = _mm256_setzero_ps(); // Vector result. [AVX] Set zero.
__m256 vrt1 = _mm256_setzero_ps();
__m256 vrt2 = _mm256_setzero_ps();
__m256 vrt3 = _mm256_setzero_ps();
__m256 vrt4 = _mm256_setzero_ps();
__m256 vrt5 = _mm256_setzero_ps();
__m256 vrt6 = _mm256_setzero_ps();
__m256 vrt7 = _mm256_setzero_ps();
__m256 vrt8 = _mm256_setzero_ps();
__m256 vrt9 = _mm256_setzero_ps();
__m256 vrt10 = _mm256_setzero_ps();
__m256 vrt11 = _mm256_setzero_ps();
__m256 vrt12 = _mm256_setzero_ps();
__m256 vrt13 = _mm256_setzero_ps();
__m256 vrt14 = _mm256_setzero_ps();
__m256 vrt15 = _mm256_setzero_ps();
const float* p; // Pointer for src data.
size_t i;
// Body.
for (int j = 0; j < loops; ++j) {
p = src;
// Block processs.
for (i = 0; i < cntBlock; ++i) {
vrt = _mm256_add_ps(vrt, _mm256_load_ps(p));    // Add. vrt += *((*__m256)(p)+k);
vrt1 = _mm256_add_ps(vrt1, _mm256_load_ps(p + VectorWidth * 1));
vrt2 = _mm256_add_ps(vrt2, _mm256_load_ps(p + VectorWidth * 2));
vrt3 = _mm256_add_ps(vrt3, _mm256_load_ps(p + VectorWidth * 3));
vrt4 = _mm256_add_ps(vrt4, _mm256_load_ps(p + VectorWidth * 4));
vrt5 = _mm256_add_ps(vrt5, _mm256_load_ps(p + VectorWidth * 5));
vrt6 = _mm256_add_ps(vrt6, _mm256_load_ps(p + VectorWidth * 6));
vrt7 = _mm256_add_ps(vrt7, _mm256_load_ps(p + VectorWidth * 7));
vrt8 = _mm256_add_ps(vrt8, _mm256_load_ps(p + VectorWidth * 8));
vrt9 = _mm256_add_ps(vrt9, _mm256_load_ps(p + VectorWidth * 9));
vrt10 = _mm256_add_ps(vrt10, _mm256_load_ps(p + VectorWidth * 10));
vrt11 = _mm256_add_ps(vrt11, _mm256_load_ps(p + VectorWidth * 11));
vrt12 = _mm256_add_ps(vrt12, _mm256_load_ps(p + VectorWidth * 12));
vrt13 = _mm256_add_ps(vrt13, _mm256_load_ps(p + VectorWidth * 13));
vrt14 = _mm256_add_ps(vrt14, _mm256_load_ps(p + VectorWidth * 14));
vrt15 = _mm256_add_ps(vrt15, _mm256_load_ps(p + VectorWidth * 15));
p += nBlockWidth;
}
// Remainder processs.
for (i = 0; i < cntRem; ++i) {
rt += p[i];
}
}
// Reduce.
vrt = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vrt, vrt1), _mm256_add_ps(vrt2, vrt3))
, _mm256_add_ps(_mm256_add_ps(vrt4, vrt5), _mm256_add_ps(vrt6, vrt7)))
, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vrt8, vrt9), _mm256_add_ps(vrt10, vrt11))
, _mm256_add_ps(_mm256_add_ps(vrt12, vrt13), _mm256_add_ps(vrt14, vrt15))))
; // vrt = vrt + vrt1 + vrt2 + vrt3 + ... vrt15;
p = (const float*)&vrt;
for (i = 0; i < VectorWidth; ++i) {
rt += p[i];
}
return rt;
}

// Sum - Vector AVX - Loop unrolling *16 - Array.
float SumVectorAvxU16A(const float* src, size_t count, int loops) {
float rt = 0;    // Result.
const int LoopUnrolling = 16;
size_t VectorWidth = sizeof(__m256) / sizeof(float); // Block width.
size_t nBlockWidth = VectorWidth * LoopUnrolling; // Block width.
size_t cntBlock = count / nBlockWidth; // Block count.
size_t cntRem = count % nBlockWidth; // Remainder count.
size_t i;
__m256 vrt[LoopUnrolling]; // Vector result.
for (i = 0; i < LoopUnrolling; ++i) {
vrt[i] = _mm256_setzero_ps(); // [AVX] Set zero.
}
const float* p; // Pointer for src data.
// Body.
for (int j = 0; j < loops; ++j) {
p = src;
// Block processs.
for (i = 0; i < cntBlock; ++i) {
vrt[0] = _mm256_add_ps(vrt[0], _mm256_load_ps(p));    // Add. vrt += *((*__m256)(p)+k);
vrt[1] = _mm256_add_ps(vrt[1], _mm256_load_ps(p + VectorWidth * 1));
vrt[2] = _mm256_add_ps(vrt[2], _mm256_load_ps(p + VectorWidth * 2));
vrt[3] = _mm256_add_ps(vrt[3], _mm256_load_ps(p + VectorWidth * 3));
vrt[4] = _mm256_add_ps(vrt[4], _mm256_load_ps(p + VectorWidth * 4));
vrt[5] = _mm256_add_ps(vrt[5], _mm256_load_ps(p + VectorWidth * 5));
vrt[6] = _mm256_add_ps(vrt[6], _mm256_load_ps(p + VectorWidth * 6));
vrt[7] = _mm256_add_ps(vrt[7], _mm256_load_ps(p + VectorWidth * 7));
vrt[8] = _mm256_add_ps(vrt[8], _mm256_load_ps(p + VectorWidth * 8));
vrt[9] = _mm256_add_ps(vrt[9], _mm256_load_ps(p + VectorWidth * 9));
vrt[10] = _mm256_add_ps(vrt[10], _mm256_load_ps(p + VectorWidth * 10));
vrt[11] = _mm256_add_ps(vrt[11], _mm256_load_ps(p + VectorWidth * 11));
vrt[12] = _mm256_add_ps(vrt[12], _mm256_load_ps(p + VectorWidth * 12));
vrt[13] = _mm256_add_ps(vrt[13], _mm256_load_ps(p + VectorWidth * 13));
vrt[14] = _mm256_add_ps(vrt[14], _mm256_load_ps(p + VectorWidth * 14));
vrt[15] = _mm256_add_ps(vrt[15], _mm256_load_ps(p + VectorWidth * 15));
p += nBlockWidth;
}
// Remainder processs.
for (i = 0; i < cntRem; ++i) {
rt += p[i];
}
}
// Reduce.
vrt[0] = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vrt[0], vrt[1]), _mm256_add_ps(vrt[2], vrt[3]))
, _mm256_add_ps(_mm256_add_ps(vrt[4], vrt[5]), _mm256_add_ps(vrt[6], vrt[7])))
, _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vrt[8], vrt[9]), _mm256_add_ps(vrt[10], vrt[11]))
, _mm256_add_ps(_mm256_add_ps(vrt[12], vrt[13]), _mm256_add_ps(vrt[14], vrt[15]))))
; // vrt = vrt + vrt1 + vrt2 + vrt3 + ... vrt15;
p = (const float*)&vrt;
for (i = 0; i < VectorWidth; ++i) {
rt += p[i];
}
return rt;
}

// Sum - Vector AVX - Loop unrolling *X - Array.
float SumVectorAvxUX(const float* src, size_t count, int loops, const int LoopUnrolling) {
float rt = 0;    // Result.
size_t VectorWidth = sizeof(__m256) / sizeof(float); // Block width.
size_t nBlockWidth = VectorWidth * LoopUnrolling; // Block width.
size_t cntBlock = count / nBlockWidth; // Block count.
size_t cntRem = count % nBlockWidth; // Remainder count.
size_t i;
__m256* vrt = new __m256[LoopUnrolling]; // Vector result.
for (i = 0; i < LoopUnrolling; ++i) {
vrt[i] = _mm256_setzero_ps(); // [AVX] Set zero.
}
const float* p; // Pointer for src data.
// Body.
for (int j = 0; j < loops; ++j) {
p = src;
// Block processs.
for (i = 0; i < cntBlock; ++i) {
for (int k = 0; k < LoopUnrolling; ++k) {
vrt[k] = _mm256_add_ps(vrt[k], _mm256_load_ps(p + VectorWidth * k));    // Add. vrt += *((*__m256)(p)+k);
}
p += nBlockWidth;
}
// Remainder processs.
for (i = 0; i < cntRem; ++i) {
rt += p[i];
}
}
// Reduce.
for (i = 1; i < LoopUnrolling; ++i) {
vrt[0] = _mm256_add_ps(vrt[0], vrt[i]); // vrt[0] += vrt[i]
} // vrt = vrt + vrt1 + vrt2 + vrt3 + ... vrt15;
p = (const float*)&vrt[0];
for (i = 0; i < VectorWidth; ++i) {
rt += p[i];
}
delete[] vrt;
return rt;
}

// Safe division helper: returns 0 when the denominator is not positive.
// clock() has coarse granularity, so a fast run can yield msUsed == 0 and
// the naive MFLOPS/scale computations would divide by zero.
static double SafeDiv(double num, double den) {
    return (den > 0) ? (num / den) : 0.0;
}

// Do Benchmark.
// Allocates a 32-byte-aligned source buffer, times every Sum* variant over
// `loops` passes, and prints the throughput (MFLOPS/s) plus its speedup
// ("scale") relative to the scalar SumBase baseline.
void Benchmark() {
    const size_t alignment = 256 / 8; // sizeof(__m256) / sizeof(BYTE): AVX load alignment.
    // init.
    clock_t tickBegin, msUsed;
    double mFlops; // MFLOPS/s .
    double scale;  // Speedup relative to SumBase.
    float rt;
    const int count = 1024 * 4;
    const int loops = 1000 * 1000;
    //const int loops = 1;
    const double countMFlops = count * (double)loops / (1000.0 * 1000);
    float* src = (float*)_aligned_malloc(sizeof(float) * count, alignment); // Aligned for _mm256_load_ps.
    if (NULL == src) {
        printf("Memory alloc fail!");
        return;
    }
    for (int i = 0; i < count; ++i) {
        src[i] = (float)i;
    }
    printf("Benchmark: \tcount=%d, loops=%d, countMFlops=%f\n", count, loops, countMFlops);
    // SumBase.
    tickBegin = clock();
    rt = SumBase(src, count, loops);
    msUsed = clock() - tickBegin;
    mFlops = SafeDiv(countMFlops * CLOCKS_PER_SEC, (double)msUsed);
    printf("SumBase:\t%g\t# msUsed=%d, MFLOPS/s=%f\n", rt, (int)msUsed, mFlops);
    double mFlopsBase = mFlops;
    // SumBaseU4.
    tickBegin = clock();
    rt = SumBaseU4(src, count, loops);
    msUsed = clock() - tickBegin;
    mFlops = SafeDiv(countMFlops * CLOCKS_PER_SEC, (double)msUsed);
    scale = SafeDiv(mFlops, mFlopsBase);
    printf("SumBaseU4:\t%g\t# msUsed=%d, MFLOPS/s=%f, scale=%f\n", rt, (int)msUsed, mFlops, scale);
    // SumVectorAvx.
    // SEH guard (MSVC): presumably catches the illegal-instruction exception
    // when the CPU/OS lacks AVX support — TODO confirm on non-AVX hardware.
    __try {
        tickBegin = clock();
        rt = SumVectorAvx(src, count, loops);
        msUsed = clock() - tickBegin;
        mFlops = SafeDiv(countMFlops * CLOCKS_PER_SEC, (double)msUsed);
        scale = SafeDiv(mFlops, mFlopsBase);
        printf("SumVectorAvx:\t%g\t# msUsed=%d, MFLOPS/s=%f, scale=%f\n", rt, (int)msUsed, mFlops, scale);
        // SumVectorAvxU4.
        tickBegin = clock();
        rt = SumVectorAvxU4(src, count, loops);
        msUsed = clock() - tickBegin;
        mFlops = SafeDiv(countMFlops * CLOCKS_PER_SEC, (double)msUsed);
        scale = SafeDiv(mFlops, mFlopsBase);
        printf("SumVectorAvxU4:\t%g\t# msUsed=%d, MFLOPS/s=%f, scale=%f\n", rt, (int)msUsed, mFlops, scale);
        // SumVectorAvxU16.
        tickBegin = clock();
        rt = SumVectorAvxU16(src, count, loops);
        msUsed = clock() - tickBegin;
        mFlops = SafeDiv(countMFlops * CLOCKS_PER_SEC, (double)msUsed);
        scale = SafeDiv(mFlops, mFlopsBase);
        printf("SumVectorAvxU16:\t%g\t# msUsed=%d, MFLOPS/s=%f, scale=%f\n", rt, (int)msUsed, mFlops, scale);
        // SumVectorAvxU16A.
        tickBegin = clock();
        rt = SumVectorAvxU16A(src, count, loops);
        msUsed = clock() - tickBegin;
        mFlops = SafeDiv(countMFlops * CLOCKS_PER_SEC, (double)msUsed);
        scale = SafeDiv(mFlops, mFlopsBase);
        printf("SumVectorAvxU16A:\t%g\t# msUsed=%d, MFLOPS/s=%f, scale=%f\n", rt, (int)msUsed, mFlops, scale);
        // SumVectorAvxUX with several unroll factors.
        int LoopUnrollingArray[] = { 4, 8, 16 };
        for (int i = 0; i < (int)(sizeof(LoopUnrollingArray) / sizeof(LoopUnrollingArray[0])); ++i) {
            int loopUnrolling = LoopUnrollingArray[i];
            tickBegin = clock();
            rt = SumVectorAvxUX(src, count, loops, loopUnrolling);
            msUsed = clock() - tickBegin;
            mFlops = SafeDiv(countMFlops * CLOCKS_PER_SEC, (double)msUsed);
            scale = SafeDiv(mFlops, mFlopsBase);
            printf("SumVectorAvxUX[%d]:\t%g\t# msUsed=%d, MFLOPS/s=%f, scale=%f\n", loopUnrolling, rt, (int)msUsed, mFlops, scale);
        }
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        printf("Run SumVectorAvx fail!");
    }
    // done.
    _aligned_free(src);
}

// Program entry point: print build/environment information, then run the
// benchmark suite.
int main() {
    printf("BenchmarkVectorCpp\n");
    printf("\n");
    // Pointer width reveals whether this is a 32-bit (4) or 64-bit (8) build.
    const int pointerSize = (int)(sizeof(void*));
    printf("Pointer size:\t%d\n", pointerSize);
#ifdef _DEBUG
    printf("IsRelease:\tFalse\n");
#else
    printf("IsRelease:\tTrue\n");
#endif // _DEBUG
#ifdef _MSC_VER
    printf("_MSC_VER:\t%d\n", _MSC_VER);
#endif // _MSC_VER
#ifdef __AVX__
    printf("__AVX__:\t%d\n", __AVX__);
#endif // __AVX__
    printf("\n");
    // Benchmark.
    Benchmark();
    return 0;
}


### 3.2 测试结果

BenchmarkVectorCpp

Pointer size:   8
IsRelease:      True
_MSC_VER:       1916
__AVX__:        1

Benchmark:      count=4096, loops=1000000, countMFlops=4096.000000
SumBase:        6.87195e+10     # msUsed=4938, MFLOPS/s=829.485622
SumBaseU4:      2.74878e+11     # msUsed=1229, MFLOPS/s=3332.790887, scale=4.017901
SumVectorAvx:   5.49756e+11     # msUsed=616, MFLOPS/s=6649.350649, scale=8.016234
SumVectorAvxU4: 2.19902e+12     # msUsed=247, MFLOPS/s=16582.995951, scale=19.991903
SumVectorAvxU16:        8.3862e+12      # msUsed=89, MFLOPS/s=46022.471910, scale=55.483146
SumVectorAvxU16A:       8.3862e+12      # msUsed=89, MFLOPS/s=46022.471910, scale=55.483146
SumVectorAvxUX[4]:      2.19902e+12     # msUsed=465, MFLOPS/s=8808.602151, scale=10.619355
SumVectorAvxUX[8]:      4.39805e+12     # msUsed=336, MFLOPS/s=12190.476190, scale=14.696429
SumVectorAvxUX[16]:     8.3862e+12      # msUsed=323, MFLOPS/s=12681.114551, scale=15.287926


• SumBaseU4：C++版的MFLOPS/s为 3332.790887，C#版的MFLOPS/s为 2184.5333333333333。3332.790887/2184.5333333333333=1.5256305940246582264042754215188，即大约是 1.5256 倍。
• SumVectorAvxU4：C++版的MFLOPS/s为 16582.995951，C#版的MFLOPS/s为 12487.80487804878。16582.995951/12487.80487804878=1.3279352226386719268724696343231，即大约是 1.3279 倍。
• SumVectorAvxU16：C++版的MFLOPS/s为 46022.471910，C#版的MFLOPS/s为 32768。46022.471910/32768=1.40449438201904296875，即大约是 1.4045 倍。

• SumVectorAvxU16A 与 SumVectorAvxU16 的性能差不多，表示C++编译器能很好地优化数组访问，能达到局部变量同级别的速度。故C++中可以放心使用数组来存储循环展开的临时变量。
• SumVectorAvxUX 的 C++ 版性能比C#好一些。而且临时数组长度大于8时，也能带来一定的性能提升。只可惜还是存在性价比不高的问题，还是经典的循环展开更好用。

## 四、小结

C#中使用循环展开的心得总结——

• 使用循环展开能提高性能，但由于编码比较麻烦，且会增加代码维护的成本。故应作为最后手段，应该先尝试其他优化手段。
• 使用循环展开后，指针版代码的性能比数组版、Span版要高一些，可能是因为指针更贴近底层硬件、更易于编译器优化。故推荐优先使用指针。
• 在C#中做循环展开，一般展开4次就行。展开16次只有少量提升，性价比不高。
• 循环展开会引起临时变量倍增，应该坚持逐个定义局部变量。若用数组会造成性能下降，包括用指针操作栈分配数组也会下降。

C++还有个优点，是可以用数组来存储循环展开的临时变量，且性能不会下降。

## 参考文献

posted on 2022-11-16 23:57  zyl910  阅读(189)  评论(0编辑  收藏  举报