转载-稀疏矩阵乘法
MPI_INT,
0,
MPI_COMM_WORLD);
if (!comRank)
{
for (int i = disp.data[0] = 0; i < comSize - 1; ++i)
disp.data[i + 1] = disp.data[i] + count.data[i];
c_csr.nnz = disp.data[comSize - 1] + count.data[comSize - 1];
ScooMalloc(&c_csr);
}
MPI_Gatherv(
c_csr_local.RowInd.data,
c_csr_local.nnz,
MPI_INT,
c_csr.RowInd.data,
count.data,
disp.data,
MPI_INT,
0,
MPI_COMM_WORLD);
MPI_Gatherv(
c_csr_local.ColInd.data,
c_csr_local.nnz,
MPI_INT,
c_csr.ColInd.data,
count.data,
disp.data,
MPI_INT,
0,
MPI_COMM_WORLD);
MPI_Gatherv(
c_csr_local.Val.data,
c_csr_local.nnz,
MPI_FLOAT,
c_csr.ColInd.data,
count.data,
disp.data,
MPI_FLOAT,
0,
MPI_COMM_WORLD);
ScooFree(&c_csr_local);
IntVectorFree(&count);
IntVectorFree(&disp);
if (!comRank)
{
ScooFree(&a_csr);
ScsrInit(&c_csr);
ScooWrite(&c_csr, stdout);
ScooFree(&c_csr);
}
MPI_Finalize();
}
### CUDA 算法`sparseScsrmm_cuda.cu`
终端执行下述指令,可以计时执行并将结果写入`sparseScsrmm_cuda_out.txt`。
```bash
nvcc sparseScsrmm_cuda.cu -o sparseScsrmm_cuda
time ./sparseScsrmm_cuda matrix12.mat matrix.mat > sparseScsrmm_cuda_out.txt
#include "WuKSPARSE.h"
void __global__ cuScsrMulScs(
const int *a_csr_RowPtr,
const int *a_csr_CowInd,
const float *a_csr_Val,
const int *b_csc_ColPtr,
const int *b_csc_RowInd,
const float *b_csc_Val,
float *c,
int m,
int n)
{
int
i = blockIdx.x * blockDim.x + threadIdx.x,
j = blockIdx.y * blockDim.y + threadIdx.y;
float res = 0;
for (int
kb = b_csc_ColPtr[j],
ka = a_csr_RowPtr[i];
kb < b_csc_ColPtr[j + 1] &&
ka < a_csr_RowPtr[i + 1];
++kb)
{
while (a_csr_CowInd[ka] < b_csc_RowInd[kb])
++ka;
while (a_csr_CowInd[ka] == b_csc_RowInd[kb])
res += a_csr_Val[ka] * b_csc_Val[kb], ++ka;
}
c[i * n + j] = res;
}
void ScsrMulScsr(const Scoo *a_csr, const Scoo *b_csr, Scoo *c_csr)
{
Scoo B_csc;
ScooToScsc(b_csr, &B_csc);
int
*a_RowPtr,
*a_ColInd,
*b_ColPtr,
*b_RowInd;
float
*a_Val,
*b_Val,
*d_c;
cudaMalloc(&a_RowPtr, sizeof(int) * (a_csr->m + 1));
cudaMemcpy(a_RowPtr, a_csr->RowPtr.data, sizeof(int) * (a_csr->m + 1), cudaMemcpyHostToDevice);
cudaMalloc(&a_ColInd, sizeof(int) * a_csr->nnz);
cudaMemcpy(a_ColInd, a_csr->ColInd.data, sizeof(int) * a_csr->nnz, cudaMemcpyHostToDevice);
cudaMalloc(&a_Val, sizeof(float) * a_csr->nnz);
cudaMemcpy(a_Val, a_csr->Val.data, sizeof(float) * a_csr->nnz, cudaMemcpyHostToDevice);
cudaMalloc(&b_ColPtr, sizeof(int) * (B_csc.n + 1));
cudaMemcpy(b_ColPtr, B_csc.ColPtr.data, sizeof(int) * (B_csc.n + 1), cudaMemcpyHostToDevice);
cudaMalloc(&b_RowInd, sizeof(int) * B_csc.nnz);
cudaMemcpy(b_RowInd, B_csc.RowInd.data, sizeof(int) * B_csc.nnz, cudaMemcpyHostToDevice);
cudaMalloc(&b_Val, sizeof(float) * B_csc.nnz);
cudaMemcpy(b_Val, B_csc.Val.data, sizeof(float) * B_csc.nnz, cudaMemcpyHostToDevice);
cudaMalloc(&d_c, sizeof(float) * a_csr->m * B_csc.n);
dim3
block(16, 16),
grid(a_csr->m / block.x, B_csc.n / block.y);
cuScsrMulScs<<<grid, block>>>(
a_RowPtr,
a_ColInd,
a_Val,
b_ColPtr,
b_RowInd,
b_Val,
d_c,
a_csr->m,
B_csc.n);
FloatVector c;
FloatVectorInit(&c, a_csr->m * B_csc.n);
cudaMemcpy(c.data, d_c, sizeof(float) * a_csr->m * B_csc.n, cudaMemcpyDeviceToHost);
cudaFree(a_RowPtr);
cudaFree(a_ColInd);
cudaFree(b_ColPtr);
cudaFree(b_RowInd);
cudaFree(a_Val);
cudaFree(b_Val);
cudaFree(d_c);
c_csr->m = a_csr->m;
c_csr->n = B_csc.n;
c_csr->nnz = 0;
ScooMalloc(c_csr);
for (int i = 0; i < c_csr->m; ++i)
for (int j = 0; j < c_csr->n; ++j)
if (c.data[i * c_csr->n + j])
ScooPushBack(c_csr, i, j, c.data[i * c_csr->n + j]);
FloatVectorFree(&c);
ScooFree(&B_csc);
ScsrInit(c_csr);
}
int main(int argc, char **argv)
{
Scoo a_coo, b_coo, a_csr, b_csr, c_csr;
ScooRead(&a_coo, fopen(argv[1], "r"));
ScooRead(&b_coo, fopen(argv[2], "r"));
ScooToScsr(&a_coo, &a_csr);
ScooToScsr(&b_coo, &b_csr);
ScooFree(&a_coo);
ScooFree(&b_coo);
ScsrMulScsr(&a_csr, &b_csr, &c_csr);
ScooWrite(&c_csr, stdout);
ScooFree(&a_csr);
ScooFree(&b_csr);
ScooFree(&c_csr);
}
稀疏矩阵处理库WuKSPARSE.h
#ifndef __WuKSPARSE_hpp__
#define __WuKSPARSE_hpp__
#include <stdio.h>
#include "WuKfloatVector.h"
#include "WuKintVector.h"
typedef struct Scoo
{
int
m, //行
n, //列
nnz; //非零元素数量
IntVector
RowPtr, //大小m+1
ColPtr, //大小n+1
RowInd, //大小nnz,行坐标
ColInd; //大小nnz,列坐标
FloatVector
Val; //大小为nnz
} Scoo;
void ScooFree(Scoo *a)
{
IntVectorFree(&a->RowPtr);
IntVectorFree(&a->ColPtr);
IntVectorFree(&a->RowInd);
IntVectorFree(&a->ColInd);
FloatVectorFree(&a->Val);
}
void ScooMalloc(Scoo *a)
{
IntVectorInit(&a->RowPtr, a->m + 1);
IntVectorInit(&a->ColPtr, a->n + 1);
IntVectorInit(&a->RowInd, a->nnz);
IntVectorInit(&a->ColInd, a->nnz);
FloatVectorInit(&a->Val, a->nnz);
}
void ScooPushBack(Scoo *a, int r, int c, float v)
{
IntVectorPushBack(&a->RowInd, r);
IntVectorPushBack(&a->ColInd, c);
FloatVectorPushBack(&a->Val, v);
++a->nnz;
}
void ScooRead(Scoo *a, FILE *f)
{
#define MAXLEN 511
for (char s[MAXLEN]; fgets(s, MAXLEN, f) && !feof(f);)
if (s[0] != '%')
{
int nnz;
sscanf(s, "%d%d%d", &a->m, &a->n, &nnz);
a->nnz = 0;
ScooMalloc(a);
for (int line = 0; line < nnz; ++line)
{
int r, c;
float v;
fscanf(f, "%d%d%f", &r, &c, &v);
if (0 <= r && r < a->m &&
0 <= c && c < a->n)
ScooPushBack(a, r, c, v);
}
break;
}
#undef MAXLEN
}
void ScooWrite(const Scoo *a, FILE *f)
{
fprintf(
f,
"%%MatrixMarket matrix coordinate real general\n%d %d %d\n",
a->m,
a->n,
a->nnz);
for (int j = 0; j < a->nnz; ++j)
fprintf(
f,
"%d %d %f\n",
a->RowInd.data[j],
a->ColInd.data[j],
a->Val.data[j]);
}
void ScsrInit(Scoo *a)
{
int cur = 0;
for (int i = a->RowPtr.data[0] = 0; i < a->nnz; ++i)
for (; cur < a->RowInd.data[i]; ++cur)
a->RowPtr.data[cur + 1] = i;
for (; cur < a->m; ++cur)
a->RowPtr.data[cur + 1] = a->nnz;
}
void ScooToScsr(const Scoo *a, Scoo *b)
{
IntVector beg, s;
IntVectorInit(&beg, a->nnz);
IntVectorInit(&s, 2);
for (int i = 0; i < a->nnz; ++i)
beg.data[i] = i;
s.data[0] = 0, s.data[1] = a->nnz - 1;
while (s.size)
{
int right = s.data[--s.size], left = s.data[--s.size], l = left, r = right, pivot = beg.data[l];
while (l < r)
{
#define LESS(x, y) (a->RowInd.data[x] < a->RowInd.data[y] || a->RowInd.data[x] == a->RowInd.data[y] && a->ColInd.data[x] < a->ColInd.data[y])
while (l < r && !LESS(beg.data[r], pivot))
--r;
beg.data[l] = beg.data[r];
while (l < r && !LESS(pivot, beg.data[l]))
++l;
beg.data[r] = beg.data[l];
#undef LESS
}
beg.data[l] = pivot;
if (l - 1 > left)
{
IntVectorPushBack(&s, left);
IntVectorPushBack(&s, l - 1);
}
if (l + 1 < right)
{
IntVectorPushBack(&s, l + 1);
IntVectorPushBack(&s, right);
}
}
IntVectorFree(&s);
*b = *a;
ScooMalloc(b);
for (int i = 0; i < b->nnz; ++i)
{
b->RowInd.data[i] = a->RowInd.data[beg.data[i]];
b->ColInd.data[i] = a->ColInd.data[beg.data[i]];
b->Val.data[i] = a->Val.data[beg.data[i]];
}
ScsrInit(b);
}
void ScscInit(Scoo *a)
{
int cur = 0;
for (int i = a->ColPtr.data[0] = 0; i < a->nnz; ++i)
for (; cur < a->ColInd.data[i]; ++cur)
a->ColPtr.data[cur + 1] = i;
for (; cur < a->m; ++cur)
a->ColPtr.data[cur + 1] = a->nnz;
}
void ScooToScsc(const Scoo *a, Scoo *b)
{
IntVector beg, s;
IntVectorInit(&beg, a->nnz);
IntVectorInit(&s, 2);
for (int i = 0; i < a->nnz; ++i)
beg.data[i] = i;
s.data[0] = 0, s.data[1] = a->nnz - 1;
while (s.size)
{
int right = s.data[--s.size], left = s.data[--s.size], l = left, r = right, pivot = beg.data[l];
while (l < r)
{
#define LESS(x, y) (a->ColInd.data[x] < a->ColInd.data[y] || a->ColInd.data[x] == a->ColInd.data[y] && a->RowInd.data[x] < a->RowInd.data[y])
while (l < r && !LESS(beg.data[r], pivot))
--r;
beg.data[l] = beg.data[r];
while (l < r && !LESS(pivot, beg.data[l]))
++l;
beg.data[r] = beg.data[l];
#undef LESS
}
beg.data[l] = pivot;
if (l - 1 > left)
{
IntVectorPushBack(&s, left);
IntVectorPushBack(&s, l - 1);
}
if (l + 1 < right)
{
IntVectorPushBack(&s, l + 1);
IntVectorPushBack(&s, right);
}
}
IntVectorFree(&s);
*b = *a;
ScooMalloc(b);
for (int i = 0; i < b->nnz; ++i)
{
b->RowInd.data[i] = a->RowInd.data[beg.data[i]];
b->ColInd.data[i] = a->ColInd.data[beg.data[i]];
b->Val.data[i] = a->Val.data[beg.data[i]];
}
ScscInit(b);
IntVectorFree(&beg);
}
#endif
浮点动态向量库WuKfloatVector.h
#ifndef __WuKfloatVector_h__
#define __WuKfloatVector_h__
#include <malloc.h>
#include <string.h>
typedef struct FloatVector
{
int size, cap;
float *data;
} FloatVector;
void FloatVectorFree(FloatVector *v) { free(v->data); }
void FloatVectorInit(FloatVector *v, int size)
{
v->size = v->cap = size;
if (v->cap < 1)
v->cap = 1;
v->data = (float *)malloc(sizeof(float) * v->cap);
}
void FloatVectorPushBack(FloatVector *v, float val)
{
while (v->cap <= v->size)
{
FloatVector v_new;
FloatVectorInit(&v_new, v->cap << 1);
memcpy(v_new.data, v->data, sizeof(float) * v->size);
FloatVectorFree(v);
*v = v_new;
v->size >>= 1;
}
v->data[v->size++] = val;
}
#endif
整型动态向量库WuKintVector.h
#ifndef __WuKintVector_h__
#define __WuKintVector_h__
#include <malloc.h>
#include <string.h>
typedef struct IntVector
{
int size, cap;
int *data;
} IntVector;
void IntVectorFree(IntVector *v) { free(v->data); }
void IntVectorInit(IntVector *v, int size)
{
v->size = v->cap = size;
if (v->cap < 1)
v->cap = 1;
v->data = (int *)malloc(sizeof(int) * v->cap);
}
void IntVectorPushBack(IntVector *v, int val)
{
while (v->cap <= v->size)
{
IntVector v_new;
IntVectorInit(&v_new, v->cap << 1);
memcpy(v_new.data, v->data, sizeof(int) * v->size);
IntVectorFree(v);
*v = v_new;
v->size >>= 1;
}
v->data[v->size++] = val;
}
#endif