Matrix Transpose with SIMD

背景

这里介绍下在写Parallel Computing Final Project项目时候用到的一些技术。Parallel Computer Architecture and Programming是一门非常赞的课,这里面有公开的视频,大家可以学习。这里只介绍下,Final Project里面用到的一些技术。

什么是SIMD

这篇的重点不在介绍SIMD是什么,所以这里只做简单的描述性介绍。一般而言,普通的指令,一次只能在一个数据上使用,比如,\(a+b\) 只能对\(a\)\(b\) 做加法。而SIMD则是对一个vector做加法,vector的长度不同的指令集不一样。所以如果用SIMD,一个指令可以对多个数据一次操作。这个就是简单的概念。指令集合可以参考Intrinsics Guide.

并行矩阵转置

这里我们首先考虑\(4\times 4\) 矩阵:

\[\begin{bmatrix} a_1 & a_2 & a_3 & a_4 \\\\ b_1 & b_2 & b_3 & b_4 \\\\ c_1 & c_2 & c_3 & c_4 \\\\ d_1 & d_2 & d_3 & d_4 \end{bmatrix} \]

转置的结果则为

\[\begin{bmatrix} a_1 & b_1 & c_1 & d_1 \\\\ a_2 & b_2 & c_2 & d_2 \\\\ a_3 & b_3 & c_3 & d_3 \\\\ a_4 & b_4 & c_4 & d_4 \end{bmatrix} \]

方法1

从这里我们可以看出,我们希望\(a_1\)\(b_1\) 在一行。那么我们第一步就是想法子到达这一步:如果吧\(\mathbf{a}\)\(\mathbf{b}\) 做交叉(interleave),那么我们可以得到\((a_1,b_1,a_2,b_2)\)\((a_3,b_3,a_4,b_4)\)。Intel指令集合中有如下函数:

// return (a[0], b[0], a[1], b[1])
__m128 _mm_unpacklo_ps (__m128 a, __m128 b);

// return (a[2], b[2], a[3], b[3])
__m128 _mm_unpackhi_ps (__m128 a, __m128 b);

// return (a[0], a[1], b[0], b[1])
__m128 _mm_movelh_ps (__m128 a, __m128 b);

// return (a[2], a[3], b[2], b[3])
__m128 _mm_movehl_ps (__m128 a, __m128 b);

这样,如下代码是可以做到\(4\times 4\) 矩阵的转置

__m128 a, b, c, d; // load data into these variables
__m128 t0 = _mm_unpacklo_ps(a, b); // t1 = (a[0], b[0], a[1], b[1])
__m128 t1 = _mm_unpackhi_ps(a, b); // t2 = (a[2], b[2], a[3], b[3])
__m128 t2 = _mm_unpacklo_ps(c, d); // t3 = (c[0], d[0], c[1], d[1])
__m128 t3 = _mm_unpackhi_ps(c, d); // t4 = (c[2], d[2], c[3], d[3])
__m128 w = _mm_movelh_ps(t0, t2); // w = (a[0], b[0], c[0], d[0])
__m128 x = _mm_movehl_ps(t0, t2); // x = (a[1], b[1], c[1], d[1])
__m128 y = _mm_movelh_ps(t1, t3); // y = (a[2], b[2], c[2], d[2])
__m128 z = _mm_movehl_ps(t1, t3); // z = (a[3], b[3], c[3], d[3])

至此\(w,x,y,z\) 为已经转置好的数据。

方法2

考虑一下是否可以只用 _mm_unpacklo_ps/_mm_unpackhi_ps 来完成相应的功能?

__m128 a, b, c, d; // load data into these variables
__m128 t0 = _mm_unpacklo_ps(a, c); // t0 = (a[0], c[0], a[1], c[1])
__m128 t1 = _mm_unpackhi_ps(a, c); // t1 = (a[2], c[2], a[3], c[3])
__m128 t2 = _mm_unpacklo_ps(b, d); // t2 = (b[0], d[0], b[1], d[1])
__m128 t3 = _mm_unpackhi_ps(b, d); // t3 = (b[2], d[2], b[3], d[3])
__m128 w = _mm_unpacklo_ps(t0, t2); // w = (a[0], b[0], c[0], d[0])
__m128 x = _mm_unpackhi_ps(t0, t2); // x = (a[1], b[1], c[1], d[1])
__m128 y = _mm_unpacklo_ps(t1, t3); // y = (a[2], b[2], c[2], d[2])
__m128 z = _mm_unpackhi_ps(t1, t3); // z = (a[3], b[3], c[3], d[3])

至此\((w, x, y, z)\) 为已经转置好的矩阵。

方法2的推广

上一篇提到的方法2还可以推广到更大的矩阵,这里我们考虑一个\(8\times 8\)的矩阵$$(\mathbf{a}, \mathbf{b}, \mathbf{c}, \mathbf{d}, \mathbf{d}, \mathbf{e}, \mathbf{f}, \mathbf{g}, \mathbf{h})$$,然后我们首先对\(\mathbf{a}\)\(\mathbf{e}\)做interleave操作,再对\(\mathbf{b}\)\(\mathbf{f}\)做interleave操作,后面以此类推,代码如下:

/*
 [a_l, a_h;
  b_l, b_h;
  c_l, c_h;
  d_l, d_h;
  e_l, e_h;
  f_l, f_h;
  g_l, g_h]
*/
__m128 a_l, a_h, b_l, b_h, c_l, c_h, d_l, d_h;
__m128 e_l, e_h, f_l, f_h, g_l, g_h, h_l, h_h;

__m128 t0_l = _mm_unpacklo_ps(a_l, e_l);
__m128 t0_h = _mm_unpackhi_ps(a_l, e_l);
// (t0_l, t0_h) = (a[0], e[0], a[1], e[1], a[2], e[2], a[3], e[3])

__m128 t1_l = _mm_unpacklo_ps(a_h, e_h);
__m128 t1_h = _mm_unpackhi_ps(a_h, e_h);
// (t1_l, t1_h) = (a[4], e[4], a[5], e[5], a[6], e[6], a[7], e[7])

__m128 t2_l = _mm_unpacklo_ps(b_l, f_l);
__m128 t2_h = _mm_unpackhi_ps(b_l, f_l);
// (t2_l, t2_h) = (b[0], f[0], b[1], f[1], b[2], f[2], b[3], f[3])

__m128 t3_l = _mm_unpacklo_ps(b_h, f_h);
__m128 t3_h = _mm_unpackhi_ps(b_h, f_h);
// (t3_l, t3_h) = (b[4], f[4], b[5], f[5], b[6], f[6], b[7], f[7])

__m128 t4_l = _mm_unpacklo_ps(c_l, g_l);
__m128 t4_h = _mm_unpackhi_ps(c_l, g_l);
// (t4_l, t4_h) = (c[0], g[0], c[1], g[1], c[2], g[2], c[3], g[3])

__m128 t5_l = _mm_unpacklo_ps(c_h, g_h);
__m128 t5_h = _mm_unpackhi_ps(c_h, g_h);
// (t5_l, t5_h) = (c[4], g[4], c[5], g[5], c[6], g[6], c[7], g[7])

__m128 t6_l = _mm_unpacklo_ps(d_l, h_l);
__m128 t6_h = _mm_unpackhi_ps(d_l, h_l);
// (t6_l, t6_h) = (d[0], h[0], d[1], h[1], d[2], h[2], d[3], h[3])

__m128 t7_l = _mm_unpacklo_ps(d_h, h_h);
__m128 t7_h = _mm_unpackhi_ps(d_h, h_h);
// (t7_l, t7_h) = (d[4], h[4], d[5], h[5], d[6], h[6], d[7], h[7])

/*
Finally, we have
[a[0], e[0], a[1], e[1], a[2], e[2], a[3], e[3]],
[a[4], e[4], a[5], e[5], a[6], e[6], a[7], e[7]],
[b[0], f[0], b[1], f[1], b[2], f[2], b[3], f[3]],
[b[4], f[4], b[5], f[5], b[6], f[6], b[7], f[7]],
[c[0], g[0], c[1], g[1], c[2], g[2], c[3], g[3]],
[c[4], g[4], c[5], g[5], c[6], g[6], c[7], g[7]],
[d[0], h[0], d[1], h[1], d[2], h[2], d[3], h[3]],
[d[4], h[4], d[5], h[5], d[6], h[6], d[7], h[7]],
*/

然后对\(t_0 \sim t_7\)做同样的操作,也就是替换上面的\(a \sim h\), 我们可以得到\(m_0 \sim m_7\):

__m128 m0_l = _mm_unpacklo_ps(t0_l, t4_l); 
__m128 m0_h = _mm_unpackhi_ps(t0_l, t4_l);
// (m0_l, m0_h) = (a[0], c[0], e[0], g[0], a[1], c[1], e[1], g[1])

__m128 m1_l = _mm_unpacklo_ps(t0_h, t4_h);
__m128 m1_h = _mm_unpackhi_ps(t0_h, t4_h);
// (m1_l, m1_h) = (a[2], c[2], e[2], g[2], a[3], c[3], e[3], g[3])

__m128 m2_l = _mm_unpacklo_ps(t1_l, t5_l);
__m128 m2_h = _mm_unpackhi_ps(t1_l, t5_l);
// (m2_l, m2_h) = (a[4], c[4], e[4], g[4], a[5], c[5], e[5], g[5])

__m128 m3_l = _mm_unpacklo_ps(t1_h, t5_h);
__m128 m3_h = _mm_unpackhi_ps(t1_h, t5_h);
// (m3_l, m3_h) = (a[6], c[6], e[6], g[6], a[7], c[7], e[7], g[7])

__m128 m4_l = _mm_unpacklo_ps(t2_l, t6_l);
__m128 m4_h = _mm_unpackhi_ps(t2_l, t6_l);
// (m4_l, m4_h) = (b[0], d[0], f[0], h[0], b[1], d[1], f[1], h[1])

__m128 m5_l = _mm_unpacklo_ps(t2_h, t6_h);
__m128 m5_h = _mm_unpackhi_ps(t2_h, t6_h);
// (m5_l, m5_h) = (b[2], d[2], f[2], h[2], b[3], d[3], f[3], h[3])

__m128 m6_l = _mm_unpacklo_ps(t3_l, t7_l);
__m128 m6_h = _mm_unpackhi_ps(t3_l, t7_l);
// (m6_l, m6_h) = (b[4], d[4], f[4], h[4], b[5], d[5], f[5], h[5])

__m128 m7_l = _mm_unpacklo_ps(t3_h, t7_h);
__m128 m7_h = _mm_unpackhi_ps(t3_h, t7_h);
// (m7_l, m7_h) = (b[6], d[6], f[6], h[6], b[7], d[7], f[7], h[7])

然后再对\(m_0 ~ m_7\)做同样的操作,就可以得到最后的结果了,参考以下代码:

__m128 w_l = _mm_unpacklo_ps(m0_l, m4_l); 
__m128 w_h = _mm_unpackhi_ps(m0_l, m4_l);
// (w_l, w_h) = (a[0], b[0], c[0], d[0], e[0], f[0], g[0], h[0])

__m128 x_l = _mm_unpacklo_ps(m0_h, m4_h);
__m128 x_h = _mm_unpackhi_ps(m0_h, m4_h);
// (x_l, x_h) = (a[1], b[1], c[1], d[1], e[1], f[1], g[1], h[1])

__m128 y_l = _mm_unpacklo_ps(m1_l, m5_l);
__m128 y_h = _mm_unpackhi_ps(m1_l, m5_l);
// (y_l, y_h) = (a[2], b[2], c[2], d[2], e[2], f[2], g[2], h[2])

__m128 z_l = _mm_unpacklo_ps(m1_h, m5_h);
__m128 z_h = _mm_unpackhi_ps(m1_h, m5_h);
// (z_l, z_h) = (a[3], b[3], c[3], d[3], e[3], f[3], g[3], h[3])

__m128 a_l = _mm_unpacklo_ps(m2_l, m6_l);
__m128 a_h = _mm_unpackhi_ps(m2_l, m6_l);
// (a_l, a_h) = (a[4], b[4], c[4], d[4], e[4], f[4], g[4], h[4])

__m128 b_l = _mm_unpacklo_ps(m2_h, m6_h);
__m128 b_h = _mm_unpackhi_ps(m2_h, m6_h);
// (b_l, b_h) = (a[5], b[5], c[5], d[5], e[5], f[5], g[5], h[5])

__m128 c_l = _mm_unpacklo_ps(m3_l, m7_l);
__m128 c_h = _mm_unpackhi_ps(m3_l, m7_l);
// (c_l, c_h) = (a[6], b[6], c[6], d[6], e[6], f[6], g[6], h[6])

__m128 d_l = _mm_unpacklo_ps(m3_h, m7_h);
__m128 d_h = _mm_unpackhi_ps(m3_h, m7_h);
// (d_l, d_h) = (a[7], b[7], c[7], d[7], e[7], f[7], g[7], h[7])

至此,完成转置。

原理

其实方法2中提供的方法在Bit粒度有一个精妙的解释。对于矩阵转置,我们需要做到的是\(a_{i,j} = a_{j,i}\)。其实也就是把元素存储到下标转换之后的地方。考虑二进制的表示\((r_2r_1r_0c_2c_1c_0)\),用方法2做奇-偶interleave,我们其实是得到如下的转换

\[(r_2r_1r_0c_2c_1c_0) \rightarrow (r_1r_0c_2c_1c_0r_2) \]

这样,我们做三次类似的操作,就可以得到如下转换

\[(r_2r_1r_0c_2c_1c_0) \rightarrow (c_2c_1c_0r_2r_1r_0) \]

所以从本质上来看,是对元素的索引做了bit粒度的rotate。用这个原理,我们也可以对非方阵来做转换,假设\(4\times 8\)的方阵,bit表示下标则有\((r_1r_0c_2c_1c_0)\),这样我们做两次rotate,则可以达到目的。

posted on 2015-05-03 01:40  FishBoneX  阅读(1040)  评论(0编辑  收藏  举报

导航