[转]Auto-vectorization in GCC

原文:http://gcc.gnu.org/projects/tree-ssa/vectorization.html

文章2:Google-Using GCC Auto-Vectorizer - Linaro

 

Contributing

This project was started by Dorit (Naishlos) Nuzman. Current contributors to this project include Revital Eres, Richard Guenther, Jakub Jelinek, Michael Matz, Richard Sandiford, and Ira Rosen. This web page is maintained by Ira Rosen <irar@il.ibm.com>. For a list of missing features and possible enhancements see http://gcc.gnu.org/wiki/VectorizationTasks.

Using the Vectorizer

Vectorization is enabled by the flag -ftree-vectorize and by default at -O3. To allow vectorization on powerpc* platforms also use -maltivec. On i?86 and x86_64 platforms use -msse/-msse2. To enable vectorization of floating point reductions use -ffast-math or -fassociative-math.

The vectorizer test cases demonstrate the current vectorization capabilities; these can be found under gcc/gcc/testsuite/gcc.dg/vect/. Information on which loops were or were not vectorized, and why, can be obtained using the flag -ftree-vectorizer-verbose. For details see http://gcc.gnu.org/ml/gcc-patches/2005-01/msg01247.html. Example output using -ftree-vectorizer-verbose=2:

vect-1.c:82: note: not vectorized, possible dependence between data-refs a[i_124] and a[i_83]
vect-1.c:72: note: LOOP VECTORIZED.
vect-1.c:64: note: LOOP VECTORIZED.
vect-1.c:56: note: LOOP VECTORIZED.
vect-1.c:49: note: LOOP VECTORIZED.
vect-1.c:41: note: not vectorized: unsupported use in stmt.
vect-1.c:31: note: not vectorized: unsupported use in stmt.
vect-1.c:13: note: vectorized 4 loops in function.

Basic block vectorization, aka SLP, is enabled by the flag -ftree-slp-vectorize, and requires the same platform dependent flags as loop vectorization. Basic block SLP is enabled by default at -O3 and when -ftree-vectorize is enabled.

Vectorizable Loops

"feature" indicates the vectorization capabilities demonstrated by the example.

example1:

int a[256], b[256], c[256];

/* Stores the element-wise sum of the global arrays b and c into a.
   The trip count (256) is a compile-time constant equal to the array
   length.  */
void foo (void) {
  int i;

  for (i=0; i<256; i++){
    a[i] = b[i] + c[i];
  }
}

example2:

int a[256], b[256], c[256];

/* Fills the first n elements of b with x, then stores b[i] & c[i] into
   a[i] for those same n elements.  n must be in the range 0..256 (the
   array length).  */
void foo (int n, int x) {
   int i;

   /* feature: support for unknown loop bound  */
   /* feature: support for loop invariants  */
   for (i=0; i<n; i++){
      b[i] = x;
   }

   /* Restart at element 0: the loop above leaves i equal to n.  */
   i = 0;

   /* feature: general loop exit condition  */
   /* feature: support for bitwise operations  */
   while (n--){
      a[i] = b[i]&c[i]; i++;
   }
}

example3:

typedef int aint __attribute__ ((__aligned__(16)));

/* Copies n ints from q to p, advancing both pointers after every
   element.  n must be non-negative.  Both pointers are
   restrict-qualified and typed as aint, i.e. int declared with
   __aligned__(16).  */
void foo (int n, aint * __restrict__ p, aint * __restrict__ q) {

   /* feature: support for (aligned) pointer accesses.  */
   while (n--){
      *p++ = *q++;
   }
}

example4:

typedef int aint __attribute__ ((__aligned__(16)));
int a[256], b[256], c[256];

/* Runs three independent loops over n elements:
     1. stores *q + 5 through p, advancing both pointers;
     2. a[i] = b[i+1] + c[i+3];
     3. b[i] = MAX where a[i] > MAX, otherwise 0.
   n must be in the range 0..253 so that c[i+3] stays inside the
   256-element array.  MAX is not defined in this example and must be
   provided by the surrounding code.  */
void foo (int n, aint * __restrict__ p, aint * __restrict__ q) {
   int i, j;
   int count = n;

   /* feature: support for (aligned) pointer accesses  */
   /* feature: support for constants  */
   /* Count down a copy: decrementing n itself would leave it at -1 and
      the two array loops below would never execute.  */
   while (count--){
      *p++ = *q++ + 5;
   }

   /* feature: support for read accesses with a compile time known misalignment  */
   for (i=0; i<n; i++){
      a[i] = b[i+1] + c[i+3];
   }

   /* feature: support for if-conversion  */
   for (i=0; i<n; i++){
      j = a[i];
      b[i] = (j > MAX ? MAX : 0);
   }
}

example5:

/* Fragment: N and i are not declared here and must come from the
   enclosing code.  Every element of the array member s.ca is set to 5.  */
struct a {
  int ca[N];
} s;
for (i = 0; i < N; i++)
  {
    /* feature: support for alignable struct access  */
    s.ca[i] = 5;
  }

example6 (gfortran):

! A, B and C are arrays of 1000000 elements.  X and Y are read with
! list-directed input; A and B are filled with LOG(X) and LOG(Y) by
! whole-array assignment, C is their element-wise sum, and element
! 500000 of C is printed.
DIMENSION A(1000000), B(1000000), C(1000000)
READ*, X, Y
A = LOG(X); B = LOG(Y); C = A + B
PRINT*, C(500000)
END

example7:

int a[256], b[256];

/* Copies N ints from b, starting at offset x, into a.
   N is not defined in this example; it and x must satisfy N <= 256 and
   0 <= x <= 256 - N so that b[i+x] stays inside the array.  */
void foo (int x) {
   int i;

   /* feature: support for read accesses with an unknown misalignment  */
   for (i=0; i<N; i++){
      a[i] = b[i+x];
   }
}

example8:

int a[M][N];

/* Sets every element of the M-by-N array a to x, walking rows in the
   outer loop and columns in the inner loop.  M and N are not defined
   in this example and must be provided by the surrounding code.  */
void foo (int x) {
   int i,j;

   /* feature: support for multidimensional arrays  */
   for (i=0; i<M; i++) {
     for (j=0; j<N; j++) {
       a[i][j] = x;
     }
   }
}

example9:

unsigned int ub[N], uc[N];

/* Accumulates the element-wise differences ub[i] - uc[i] into udiff.
   The sum is not used after the loop in this excerpt.  N is not
   defined in this example and must be provided by the surrounding
   code.  */
void foo (void) {
  int i;

  /* feature: support summation reduction.
     note: in case of floats use -funsafe-math-optimizations  */
  unsigned int udiff = 0;
  for (i = 0; i < N; i++) {
    udiff += (ub[i] - uc[i]);
  }
}

example10:

/* feature: support data-types of different sizes.
   Currently only a single vector-size per target is supported;
   it can accommodate n elements such that n = vector-size/element-size
   (e.g., 4 ints, 8 shorts, or 16 chars for a vector of size 16 bytes).
   A combination of data-types of different sizes in the same loop
   requires special handling. This support is now present in mainline,
   and also includes support for type conversions.  */

short *sa, *sb, *sc;
int *ia, *ib, *ic;
/* An int addition and a short addition in the same loop body.
   i and N are not declared in this fragment.  */
for (i = 0; i < N; i++) {
  ia[i] = ib[i] + ic[i];
  sa[i] = sb[i] + sc[i];
}

/* Type conversion: each short element is converted to int.  */
for (i = 0; i < N; i++) {
  ia[i] = (int) sb[i];
}

example11:

/* feature: support strided accesses - the data elements
   that are to be operated upon in parallel are not consecutive - they
   are accessed with a stride > 1 (in the example, the stride is 2):  */

/* Each iteration reads the even/odd pair b[2*i], b[2*i+1] and
   c[2*i], c[2*i+1] and writes one element of a and one of d.
   The arrays, i and N are not declared in this fragment.  */
for (i = 0; i < N/2; i++){
  a[i] = b[2*i+1] * c[2*i+1] - b[2*i] * c[2*i];
  d[i] = b[2*i] * c[2*i+1] + b[2*i+1] * c[2*i];
}

example12: induction:

/* Stores the loop counter itself into each element: a[i] becomes i.
   a, i and N are not declared in this fragment.  */
for (i = 0; i < N; i++) {
  a[i] = i;
}

example13: outer-loop:

  /* For each row i, sums a[i][j] - b[i][j] over every eighth column
     (j advances by 8) and stores the row total in out[i].
     The arrays, diff, i, j, M and N are not declared in this fragment.  */
  for (i = 0; i < M; i++) {
    diff = 0;
    for (j = 0; j < N; j+=8) {
      diff += (a[i][j] - b[i][j]);
    }
    out[i] = diff;
  }

example14: double reduction:

  /* For each k, sum is reset and then accumulated across both inner
     loops (j and i) before being stored in out[k].
     The arrays and scalars used are not declared in this fragment.  */
  for (k = 0; k < K; k++) {
    sum = 0;
    for (j = 0; j < M; j++)
      for (i = 0; i < N; i++)
          sum += in[i+k][j] * coeff[i][j];

    out[k] = sum;
  }

example15: condition in nested loop:

  /* For each j, the inner loop carries curr_a from one iteration to the
     next: it keeps its value while x > c[i] and otherwise takes a[i+1].
     The final curr_a is stored in x_out[j].  The inner loop reads up to
     a[N].  The arrays and scalars used are not declared in this
     fragment.  */
  for (j = 0; j < M; j++)
    {
      x = x_in[j];
      curr_a = a[0];

      for (i = 0; i < N; i++)
        {
          next_a = a[i+1];
          curr_a = x > c[i] ? curr_a : next_a;
        }

      x_out[j] = curr_a;
    }

example16: load permutation in loop-aware SLP:

  /* Each iteration consumes three consecutive inputs (a, b, c) and
     produces three consecutive outputs, each a linear combination of
     all three inputs with coefficients M00..M22.  The pointers,
     coefficients and scalars are not declared in this fragment.  */
  for (i = 0; i < N; i++)
    {
       a = *pInput++;
       b = *pInput++;
       c = *pInput++;

       *pOutput++ = M00 * a + M01 * b + M02 * c;
       *pOutput++ = M10 * a + M11 * b + M12 * c;
       *pOutput++ = M20 * a + M21 * b + M22 * c;
    }

example17: basic block SLP:

/* Copies four consecutive unsigned ints from in to out as straight-line
   code with no loop.  in and out are not declared in this example and
   must be provided by the surrounding code.  */
void foo ()
{
  unsigned int *pin = &in[0];
  unsigned int *pout = &out[0];

  *pout++ = *pin++;
  *pout++ = *pin++;
  *pout++ = *pin++;
  *pout++ = *pin++;
}

example18: Simple reduction in SLP:

/* Two independent running sums: sum1 accumulates the even-indexed
   elements of a and sum2 the odd-indexed ones.  The 64 iterations cover
   all 128 elements.  */
int sum1;
int sum2;
int a[128];
void foo (void)
{
  int i;

  for (i = 0; i < 64; i++)
    {
      sum1 += a[2*i];
      sum2 += a[2*i+1];
    }
}

example19: Reduction chain in SLP:

/* A single running sum updated twice per iteration, first with the
   even-indexed element and then with the odd-indexed one.  The 64
   iterations cover all 128 elements of a.  */
int sum;
int a[128];
void foo (void)
{
  int i;

  for (i = 0; i < 64; i++)
    {
      sum += a[2*i];
      sum += a[2*i+1];
    }
}

example20: Basic block SLP with multiple types, loads with different offsets, misaligned load, and not-affine accesses:

/* Runs h iterations.  Each one adds A*src[k] + B*src[k+1] to dst[k]
   for k = 0..7 (reading src[0]..src[8]) and then advances both dst and
   src by stride elements.  The short operands feed an int
   accumulation.  */
void foo (int * __restrict__ dst, short * __restrict__ src,
          int h, int stride, short A, short B)
{
  int i;
  for (i = 0; i < h; i++)
    {
      dst[0] += A*src[0] + B*src[1];
      dst[1] += A*src[1] + B*src[2];
      dst[2] += A*src[2] + B*src[3];
      dst[3] += A*src[3] + B*src[4];
      dst[4] += A*src[4] + B*src[5];
      dst[5] += A*src[5] + B*src[6];
      dst[6] += A*src[6] + B*src[7];
      dst[7] += A*src[7] + B*src[8];
      dst += stride;
      src += stride;
    }
}

example21: Backward access:

/* Returns the sum of the first n elements of b, visiting them from the
   last element down to the first.  Returns 0 when n is 0 or negative.  */
int foo (int *b, int n)
{
  int i, a = 0;

  for (i = n-1; i >= 0; i--)
    a += b[i];

  return a;
}

example22: Alignment hints:

/* Stores in1[i] * in2[i] into out1[i] for i in 0..n-1 after giving the
   compiler alignment information about all three pointers.  */
void foo (int *out1, int *in1, int *in2, int n)
{
  int i;

  /* Per the GCC documentation of __builtin_assume_aligned, the second
     argument is the assumed alignment in bytes and the optional third
     argument is a misalignment offset.  out1 and in1 are therefore
     assumed to lie 16 bytes past a 32-byte boundary, and in2 exactly on
     a 32-byte boundary.  */
  out1 = __builtin_assume_aligned (out1, 32, 16);
  in1 = __builtin_assume_aligned (in1, 32, 16);
  in2 = __builtin_assume_aligned (in2, 32, 0);

  for (i = 0; i < n; i++)
    out1[i] = in1[i] * in2[i];
}

example23: Widening shift:

/* Reads 256 unsigned short values from src, shifts each left by 7 and
   stores the result as an unsigned int in dst.  The destination
   element type differs from the source element type.  */
void foo (unsigned short *src, unsigned int *dst)
{
  int i;

  for (i = 0; i < 256; i++)
    *dst++ = *src++ << 7;
}

example24: Condition with mixed types:

#define N 1024
float a[N], b[N];
int c[N];

/* For each element, a float comparison (a[i] < b[i]) selects between
   the short values x and y, and the selected value is stored in the
   int array c.  */
void foo (short x, short y)
{
  int i;
  for (i = 0; i < N; i++)
    c[i] = a[i] < b[i] ? x : y;
}

example25: Loop with bool:

#define N 1024
float a[N], b[N], c[N], d[N];
int j[N];

/* For each element, holds the results of two float comparisons in
   _Bool variables and stores their bitwise AND in the int array j:
   1 when both a[i] < b[i] and c[i] < d[i] hold, 0 otherwise.  */
void foo (void)
{
  int i;
  _Bool x, y;
  for (i = 0; i < N; i++)
    {
      x = (a[i] < b[i]);
      y = (c[i] < d[i]);
      j[i] = x & y;
    }
}

Unvectorizable Loops

Examples of loops that currently cannot be vectorized:

example1: uncountable loop:

/* The number of iterations depends on the data: the loop copies
   elements from p to q until it reaches an element of p that compares
   equal to NULL.  p and q are not declared in this fragment.  */
while (*p != NULL) {
  *q++ = *p++;
}

Also see http://gcc.gnu.org/wiki/VectorizationTasks and a list of vectorizer missed-optimization PRs in GCC Bugzilla.

posted @ 2013-09-09 22:03  Scan.  阅读(1145)  评论(0)    收藏  举报