高性能计算-gemv-向量化优化(16)

1. 目标:矩阵向量乘法 y = A * x (列向量 = 矩阵 *列向量),进行串行,循环展开+simd, simd+omp的效率对比。

2. 源码

#include <iostream>
#include <ctime>
#include <arm_neon.h>
#include <omp.h>
using namespace std;


void dgemv(const int n, const int m, const double *const A,
           const double *const x, double *y)
{
    for (int i = 0; i < n; i++)
    {
        double y_tmp = 0.0;
        for (int j = 0; j < m; j++)
        {
            y_tmp += A[i * m + j] * x[j];
        }
        y[i] = y_tmp;
    }
    return;
}

//循环展开 + neon
void dgemv_neon(const int n, const int m, const double *const A,
           const double *const x, double *y)
{
    for (int i = 0; i < n; i+=2)
    {
        float64x2_t vsum1 = {0.0};
        float64x2_t vsum2 = {0.0};
        for (int j = 0; j < m; j+=2)
        {
            float64x2_t vX = vld1q_f64(x+j);
            float64x2_t vA1 = vld1q_f64(A+i*m+j);
            float64x2_t vA2 = vld1q_f64(A+(i+1)*m+j);
            vsum1 = vfmaq_f64(vsum1,vA1,vX);
            vsum2 = vfmaq_f64(vsum2,vA2,vX);
        }
        y[i] = vaddvq_f64(vsum1);
        y[i+1] = vaddvq_f64(vsum2);
    }
    return;
}

//循环展开 + neon + omp
void dgemv_neon_omp(const int n, const int m, const double *const A,
           const double *const x, double *y)
{
    omp_set_num_threads(4);
    #pragma omp paralel for proc_bind(close)
    for (int i = 0; i < n; i+=2)
    {
        float64x2_t vsum1 = {0.0};
        float64x2_t vsum2 = {0.0};
        for (int j = 0; j < m; j+=2)
        {
            float64x2_t vX = vld1q_f64(x+j);
            float64x2_t vA1 = vld1q_f64(A+i*m+j);
            float64x2_t vA2 = vld1q_f64(A+(i+1)*m+j);
            vsum1 = vfmaq_f64(vsum1,vA1,vX);
            vsum2 = vfmaq_f64(vsum2,vA2,vX);
        }
        y[i] = vaddvq_f64(vsum1);
        y[i+1] = vaddvq_f64(vsum2);
        #pragma omp barrier
    }
    return;
}

void printM(double* A,int n,int m)
{
    for(int i=0;i<n;i++)
    {
        for(int j=0;j<m;j++)
            printf("%.2f ",A[i*m+j]);
        printf("\n");
    }
}

int main(int argc,char** argv)
{
    if(argc!=2)
    {
        printf("should parameter 0:serial 1:neon 2:neon+omp");
        return 0;
    }
    int mode = atoi(argv[1]);
    int nloop = 20;
    int n = 10240;
    double *mat_A = new double[n * n];
    double *vec_x = new double[n];
    double *vec_y = new double[n];

    srand(0);
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
        {
            vec_x[j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
            mat_A[i * n + j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
        }
        vec_y[i] = 0.0;
    }

    clock_t t = clock();
    for (int iloop = 0; iloop < nloop; iloop++)
    {
        switch (mode)
        {
            case 0:
                dgemv(n, n, mat_A, vec_x, vec_y);
                break;
            case 1:
                dgemv_neon(n, n, mat_A, vec_x, vec_y);
                break;
            case 2:
                dgemv_neon_omp(n, n, mat_A, vec_x, vec_y);
                break;
            default:
                break;
        }
    }
    t = clock() - t;
    // printM(mat_A,n,n);

    cout << "y[0]:" << vec_y[0] << endl;
    cout << "y[1]:" << vec_y[1] << endl;
    cout << "y[2]:" << vec_y[2] << endl;
    cout << "y[3]:" << vec_y[3] << endl;
    cout << "y[4]:" << vec_y[4] << endl;

    cout << "gemv mode "<<mode <<" ave cost time(clock):" << t / nloop << endl;
}

3. 数据

g++ gemv.cpp -o gemv -fopenmp
g++ gemv.cpp -o gemv -fopenmp -O1

耗时为clock_t时间单位

函数\编译优化设置 O0无优化 O1无附加参数优化
串行 721297 235374
循环展开+SIMD 759981 173499
循环展开+SIMD+OMP 690216 170610

4. 结果分析

(1). O0优化耗时差别不大

(2). O1优化下,循环展开+SIMD 加速比为 1.35

(3). neon经测试效率没有明显提升

posted @ 2024-12-01 18:30  安洛8  阅读(414)  评论(0)    收藏  举报