[转]Auto-vectorization in GCC
原文:http://gcc.gnu.org/projects/tree-ssa/vectorization.html
文章2:Google-Using GCC Auto-Vectorizer - Linaro
Contributing
This project was started by Dorit (Naishlos) Nuzman. Current contributors to this project include Revital Eres, Richard Guenther, Jakub Jelinek, Michael Matz, Richard Sandiford, and Ira Rosen. This web page is maintained by Ira Rosen <irar@il.ibm.com>. For a list of missing features and possible enhancements see http://gcc.gnu.org/wiki/VectorizationTasks.
Using the Vectorizer
Vectorization is enabled by the flag -ftree-vectorize and by default at -O3. To allow vectorization on powerpc* platforms also use -maltivec. On i?86 and x86_64 platforms use -msse/-msse2. To enable vectorization of floating point reductions use -ffast-math or -fassociative-math.
The vectorizer test cases demonstrate the current vectorization capabilities; these can be found under gcc/gcc/testsuite/gcc.dg/vect/. Information on which loops were or were not vectorized, and why, can be obtained using the flag -ftree-vectorizer-verbose. For details see http://gcc.gnu.org/ml/gcc-patches/2005-01/msg01247.html. Example output using -ftree-vectorizer-verbose=2:
vect-1.c:82: note: not vectorized, possible dependence between data-refs a[i_124] and a[i_83]
vect-1.c:72: note: LOOP VECTORIZED.
vect-1.c:64: note: LOOP VECTORIZED.
vect-1.c:56: note: LOOP VECTORIZED.
vect-1.c:49: note: LOOP VECTORIZED.
vect-1.c:41: note: not vectorized: unsupported use in stmt.
vect-1.c:31: note: not vectorized: unsupported use in stmt.
vect-1.c:13: note: vectorized 4 loops in function.
Basic block vectorization, aka SLP, is enabled by the flag -ftree-slp-vectorize, and requires the same platform dependent flags as loop vectorization. Basic block SLP is enabled by default at -O3 and when -ftree-vectorize is enabled.
Vectorizable Loops
"feature" indicates the vectorization capabilities demonstrated by the example.
example1:
/*
 * Element-wise sum of two file-scope arrays: a[i] = b[i] + c[i] for every
 * index 0..255.  The trip count is a compile-time constant and all three
 * arrays are accessed with the same index.
 *
 * The return type is spelled out as void: the function returns no value,
 * and an omitted return type (implicit int) is not valid since C99.
 */
int a[256], b[256], c[256];
void foo (void) {
    int i;
    for (i=0; i<256; i++){
        a[i] = b[i] + c[i];
    }
}
/*
 * Presumably example2; the label appears to be missing from this copy.
 *
 * Runs two loops over the file-scope arrays:
 *   1. stores the loop-invariant value x into b[0..n-1];
 *   2. stores b[i] & c[i] into a[i], counting n down to zero.
 *
 * n is expected to be non-negative.  The return type is spelled out as
 * void: the function returns no value, and implicit int is not valid
 * since C99.
 */
int a[256], b[256], c[256];
void foo (int n, int x) {
    int i;

    /* feature: support for unknown loop bound */
    /* feature: support for loop invariants */
    /* The body is braced explicitly; an unmatched closing brace here used
       to end the function before the second loop. */
    for (i=0; i<n; i++) {
        b[i] = x;
    }

    /* feature: general loop exit condition */
    /* feature: support for bitwise operations */
    /* NOTE(review): i is not reset, so this loop continues at index n and
       touches elements n..2n-1; the arrays hold 256 elements, so callers
       need 2*n <= 256.  Confirm whether restarting at 0 was intended. */
    while (n--){
        a[i] = b[i]&c[i]; i++;
    }
}
/*
 * Presumably example3; the label appears to be missing from this copy.
 *
 * Copies n ints from q to p.  Both pointers use the aint typedef, which
 * carries a 16-byte alignment attribute, and both are restrict-qualified,
 * i.e. the two ranges are declared not to overlap.
 *
 * n is expected to be non-negative.  The return type is spelled out as
 * void (implicit int is not valid since C99), and both parameters use the
 * same __restrict__ spelling.
 */
typedef int aint __attribute__ ((__aligned__(16)));
void foo (int n, aint * __restrict__ p, aint * __restrict__ q) {
    /* feature: support for (aligned) pointer accesses. */
    while (n--){
        *p++ = *q++;
    }
}
example4:
/*
 * Three loops, each illustrating a different feature:
 *   1. copy n ints from q to p, adding the constant 5 to each;
 *   2. a[i] = b[i+1] + c[i+3], i.e. reads at fixed offsets from the
 *      write index;
 *   3. b[i] = MAX when a[i] > MAX, otherwise 0.
 *
 * MAX is not defined in this snippet and must be supplied by the
 * surrounding code.
 *
 * NOTE(review): the first loop counts n down, leaving it at -1, so the two
 * for loops below do not iterate when the function is run as written.
 * Confirm whether they are meant as independent illustrations.
 *
 * The return type is spelled out as void (implicit int is not valid since
 * C99), and j is declared because the last loop uses it.
 */
typedef int aint __attribute__ ((__aligned__(16)));
int a[256], b[256], c[256];
void foo (int n, aint * __restrict__ p, aint * __restrict__ q) {
    int i, j;

    /* feature: support for (aligned) pointer accesses */
    /* feature: support for constants */
    while (n--){
        *p++ = *q++ + 5;
    }

    /* feature: support for read accesses with a compile time known misalignment */
    for (i=0; i<n; i++){
        a[i] = b[i+1] + c[i+3];
    }

    /* feature: support for if-conversion */
    for (i=0; i<n; i++){
        j = a[i];
        b[i] = (j > MAX ? MAX : 0);
    }
}
example5:
/* A single global object s whose only member is an int array of N
elements. N is not defined in this snippet. */
struct a {
int ca[N];
} s;
/* Fragment: i is declared outside this snippet. Stores the constant 5
into every element of the array member. */
for (i = 0; i < N; i++)
{
/* feature: support for alignable struct access */
s.ca[i] = 5;
}
! Presumably example6 (Fortran); the label appears to be missing from
! this copy.  Reads two scalars X and Y, fills A and B with LOG(X) and
! LOG(Y) through whole-array assignment, adds them element-wise into C,
! and prints one element of the result.
! One statement per line (the three assignments stay separated by
! semicolons); the statements had been run together on a single line.
      DIMENSION A(1000000), B(1000000), C(1000000)
      READ*, X, Y
      A = LOG(X); B = LOG(Y); C = A + B
      PRINT*, C(500000)
      END
example7:
/*
 * Copies a window of b into a: a[i] = b[i+x] for i in [0, N).  The read
 * offset x is only known at run time.
 *
 * N is not defined in this snippet.  Both arrays hold 256 elements, so the
 * caller needs N <= 256 and 0 <= x with N - 1 + x < 256.
 *
 * The return type is spelled out as void: the function returns no value,
 * and implicit int is not valid since C99.
 */
int a[256], b[256];
void foo (int x) {
    int i;

    /* feature: support for read accesses with an unknown misalignment */
    for (i=0; i<N; i++){
        a[i] = b[i+x];
    }
}
/*
 * Presumably example8; the label appears to be missing from this copy.
 *
 * Fills every element of the two-dimensional array a with x, iterating
 * rows in the outer loop and columns in the inner loop.  M and N are not
 * defined in this snippet.
 *
 * The return type is spelled out as void: the function returns no value,
 * and implicit int is not valid since C99.
 */
int a[M][N];
void foo (int x) {
    int i,j;

    /* feature: support for multidimensional arrays */
    for (i=0; i<M; i++) {
        for (j=0; j<N; j++) {
            a[i][j] = x;
        }
    }
}
/*
 * Presumably example9; the label appears to be missing from this copy.
 *
 * Accumulates the element-wise differences ub[i] - uc[i] into a single
 * unsigned sum.  N is not defined in this snippet.
 *
 * The accumulator is named diff throughout (the loop used to add into an
 * undeclared udiff), the function body is closed, and the return type is
 * spelled out as void (implicit int is not valid since C99).
 */
unsigned int ub[N], uc[N];
void foo (void) {
    int i;

    /* feature: support summation reduction.
       note: in case of floats use -funsafe-math-optimizations */
    unsigned int diff = 0;
    for (i = 0; i < N; i++) {
        diff += (ub[i] - uc[i]);
    }
    /* NOTE(review): diff is neither stored nor returned in this snippet. */
}
example10:
/* feature: support data-types of different sizes.
Currently only a single vector-size per target is supported;
it can accommodate n elements such that n = vector-size/element-size
(e.g., 4 ints, 8 shorts, or 16 chars for a vector of size 16 bytes).
A combination of data-types of different sizes in the same loop
requires special handling. This support is now present in mainline,
and also includes support for type conversions. */
/* Fragment: i and N are not declared in this snippet, and nothing here
makes the pointers point at storage; that is left to the surrounding
code. */
short *sa, *sb, *sc;
int *ia, *ib, *ic;
/* One loop body that mixes int additions with short additions. */
for (i = 0; i < N; i++) {
ia[i] = ib[i] + ic[i];
sa[i] = sb[i] + sc[i];
}
/* Type conversion: each short element is converted to int when stored. */
for (i = 0; i < N; i++) {
ia[i] = (int) sb[i];
}
/* Presumably example11; the label appears to be missing from this copy. */
/* feature: support strided accesses - the data elements
that are to be operated upon in parallel are not consecutive - they
are accessed with a stride > 1 (in the example, the stride is 2): */
/* Fragment: a, b, c, d, i and N are declared outside this snippet.
Each iteration reads the element pair (2*i, 2*i+1) from both b and c
and writes one element of a and one element of d. */
for (i = 0; i < N/2; i++){
a[i] = b[2*i+1] * c[2*i+1] - b[2*i] * c[2*i];
d[i] = b[2*i] * c[2*i+1] + b[2*i+1] * c[2*i];
}
/* Presumably example12; the label appears to be missing from this copy.
Fragment: a, i and N are declared outside this snippet. Stores the loop
index itself into each element, so the stored value changes with every
iteration. */
for (i = 0; i < N; i++) {
a[i] = i;
}
/* Presumably example13; the label appears to be missing from this copy.
Fragment: a, b, out, diff, i, j, M and N are declared outside this
snippet. For every row i, sums a[i][j] - b[i][j] over every eighth
column (j advances by 8) and stores the row total in out[i].
The snippet ends with the outer loop; a trailing closing brace with no
matching opener in this fragment has been dropped. */
for (i = 0; i < M; i++) {
    diff = 0;
    for (j = 0; j < N; j+=8) {
        diff += (a[i][j] - b[i][j]);
    }
    out[i] = diff;
}
/* Presumably example14; the label appears to be missing from this copy.
Fragment: in, coeff, out, sum, i, j, k, K, M and N are declared outside
this snippet. Triple loop nest: for each k, accumulates
in[i+k][j] * coeff[i][j] over all j and i into sum, then stores the
total in out[k]. The first index of in reaches N-1 + K-1. */
for (k = 0; k < K; k++) {
sum = 0;
for (j = 0; j < M; j++)
for (i = 0; i < N; i++)
sum += in[i+k][j] * coeff[i][j];
out[k] = sum;
}
example15: condition in nested loop:
/* Fragment: every variable used here is declared outside this snippet.
For each j, the inner loop walks a[] and conditionally updates curr_a:
it keeps the current value when x > c[i], otherwise it takes a[i+1].
The value left after the inner loop is stored in x_out[j]. */
for (j = 0; j < M; j++)
{
x = x_in[j];
curr_a = a[0];
for (i = 0; i < N; i++)
{
/* Reads one element past index i, so a needs at least N+1 elements. */
next_a = a[i+1];
curr_a = x > c[i] ? curr_a : next_a;
}
x_out[j] = curr_a;
}
example16: load permutation in loop-aware SLP:
/* Fragment: pInput, pOutput, a, b, c, i, N and the Mxy coefficients are
declared outside this snippet. Each iteration consumes three consecutive
input values and produces three consecutive outputs, each a linear
combination of the three inputs (a 3x3 matrix times a 3-vector). */
for (i = 0; i < N; i++)
{
a = *pInput++;
b = *pInput++;
c = *pInput++;
*pOutput++ = M00 * a + M01 * b + M02 * c;
*pOutput++ = M10 * a + M11 * b + M12 * c;
*pOutput++ = M20 * a + M21 * b + M22 * c;
}
/* Presumably example17; the label appears to be missing from this copy.
Copies the first four unsigned ints of in[] to out[] with four
straight-line statements (no loop). in and out are declared outside this
snippet. */
void foo ()
{
unsigned int *pin = &in[0];
unsigned int *pout = &out[0];
*pout++ = *pin++;
*pout++ = *pin++;
*pout++ = *pin++;
*pout++ = *pin++;
}
example18: Simple reduction in SLP:
/* Two independent running sums over a[]: even-indexed elements are added
to sum1 and odd-indexed elements to sum2. Both accumulators are
file-scope ints that foo does not reset, so the totals carry over
between calls. */
int sum1;
int sum2;
int a[128];
void foo (void)
{
int i;
/* 64 iterations, two elements each: covers a[0..127]. */
for (i = 0; i < 64; i++)
{
sum1 += a[2*i];
sum2 += a[2*i+1];
}
}
example19: Reduction chain in SLP:
/* A single running sum over a[]: the same accumulator is updated twice
per iteration, first with the even-indexed element and then with the
odd-indexed one. sum is a file-scope int that foo does not reset, so the
total carries over between calls. */
int sum;
int a[128];
void foo (void)
{
int i;
/* 64 iterations, two elements each: covers a[0..127]. */
for (i = 0; i < 64; i++)
{
sum += a[2*i];
sum += a[2*i+1];
}
}
example20: Basic block SLP with multiple types, loads with different offsets, misaligned load, and not-affine accesses:
/* For each of h rows, adds A*src[k] + B*src[k+1] to dst[k] for k = 0..7,
then advances both pointers by stride elements. Each row therefore reads
nine consecutive shorts (src[0..8]) and updates eight consecutive ints.
dst and src are restrict-qualified, i.e. declared not to overlap. */
void foo (int * __restrict__ dst, short * __restrict__ src,
int h, int stride, short A, short B)
{
int i;
for (i = 0; i < h; i++)
{
dst[0] += A*src[0] + B*src[1];
dst[1] += A*src[1] + B*src[2];
dst[2] += A*src[2] + B*src[3];
dst[3] += A*src[3] + B*src[4];
dst[4] += A*src[4] + B*src[5];
dst[5] += A*src[5] + B*src[6];
dst[6] += A*src[6] + B*src[7];
dst[7] += A*src[7] + B*src[8];
/* The step between rows is a run-time value. */
dst += stride;
src += stride;
}
}
/*
 * Presumably example21; the label appears to be missing from this copy.
 *
 * Returns the sum of b[0..n-1], walking the array from the last element
 * down to the first.  Returns 0 when n <= 0, because the loop body is
 * never entered.
 *
 * The loop condition uses the C operator >=; the typographic sign that
 * stood here is not valid C.
 */
int foo (int *b, int n)
{
    int i, a = 0;

    for (i = n-1; i >= 0; i--)
        a += b[i];
    return a;
}
/* Presumably example22; the label appears to be missing from this copy.
Element-wise product: out1[i] = in1[i] * in2[i] for i in [0, n).
Before the loop each pointer is passed through __builtin_assume_aligned,
a GCC builtin that returns its first argument and lets the compiler
assume the stated alignment. */
void foo (int *out1, int *in1, int *in2, int n)
{
int i;
/* Alignment 32 with a misalignment offset of 16 bytes for out1 and in1,
and alignment 32 with offset 0 for in2. NOTE(review): these are promises
made to the compiler; callers are expected to pass pointers that
actually satisfy them. */
out1 = __builtin_assume_aligned (out1, 32, 16);
in1 = __builtin_assume_aligned (in1, 32, 16);
in2 = __builtin_assume_aligned (in2, 32, 0);
for (i = 0; i < n; i++)
out1[i] = in1[i] * in2[i];
}
/* Presumably example23; the label appears to be missing from this copy.
Reads 256 unsigned shorts from src, shifts each left by 7 and stores the
result as an unsigned int in dst, so the destination element type
differs from the source element type. Both pointers advance by one
element per iteration. */
void foo (unsigned short *src, unsigned int *dst)
{
int i;
for (i = 0; i < 256; i++)
*dst++ = *src++ << 7;
}
example24: Condition with mixed types:
/* Per-element select: c[i] receives x when a[i] < b[i] and y otherwise.
The comparison operates on floats, the selected values are shorts, and
the destination array holds ints. */
#define N 1024
float a[N], b[N];
int c[N];
void foo (short x, short y)
{
int i;
for (i = 0; i < N; i++)
c[i] = a[i] < b[i] ? x : y;
}
/* Presumably example25; the label appears to be missing from this copy.
For each i, stores into j[i] the AND of two float comparisons: 1 when
both a[i] < b[i] and c[i] < d[i] hold, 0 otherwise. */
#define N 1024
float a[N], b[N], c[N], d[N];
int j[N];
void foo (void)
{
int i;
/* The comparison results are held in _Bool temporaries before being
combined. */
_Bool x, y;
for (i = 0; i < N; i++)
{
x = (a[i] < b[i]);
y = (c[i] < d[i]);
j[i] = x & y;
}
}
Unvectorizable Loops
Examples of loops that currently cannot be vectorized:
example1: uncountable loop:
/* Fragment: p and q are declared outside this snippet. Copies elements
from p to q until the element at *p compares equal to NULL. The number
of iterations depends on the data being read, so it is not known before
the loop starts. */
while (*p != NULL) {
*q++ = *p++;
}
Also see http://gcc.gnu.org/wiki/VectorizationTasks and a list of vectorizer missed-optimization PRs in GCC Bugzilla.

浙公网安备 33010602011771号