转载-稀疏矩阵乘法

	MPI_INT,
	0,
	MPI_COMM_WORLD);

if (!comRank)
{
	for (int i = disp.data[0] = 0; i < comSize - 1; ++i)
		disp.data[i + 1] = disp.data[i] + count.data[i];
	c_csr.nnz = disp.data[comSize - 1] + count.data[comSize - 1];
	ScooMalloc(&c_csr);
}

MPI_Gatherv(
	c_csr_local.RowInd.data,
	c_csr_local.nnz,
	MPI_INT,
	c_csr.RowInd.data,
	count.data,
	disp.data,
	MPI_INT,
	0,
	MPI_COMM_WORLD);

MPI_Gatherv(
	c_csr_local.ColInd.data,
	c_csr_local.nnz,
	MPI_INT,
	c_csr.ColInd.data,
	count.data,
	disp.data,
	MPI_INT,
	0,
	MPI_COMM_WORLD);

MPI_Gatherv(
	c_csr_local.Val.data,
	c_csr_local.nnz,
	MPI_FLOAT,
	c_csr.ColInd.data,
	count.data,
	disp.data,
	MPI_FLOAT,
	0,
	MPI_COMM_WORLD);

ScooFree(&c_csr_local);
IntVectorFree(&count);
IntVectorFree(&disp);

if (!comRank)
{
	ScooFree(&a_csr);
	ScsrInit(&c_csr);
	ScooWrite(&c_csr, stdout);
	ScooFree(&c_csr);
}

MPI_Finalize();

}


### CUDA 算法`sparseScsrmm_cuda.cu`

终端执行下述指令,可以计时执行并将结果写入`sparseScsrmm_cuda_out.txt`。

```bash
nvcc sparseScsrmm_cuda.cu -o sparseScsrmm_cuda
time ./sparseScsrmm_cuda matrix12.mat matrix.mat > sparseScsrmm_cuda_out.txt
#include "WuKSPARSE.h"
void __global__ cuScsrMulScs(
	const int *a_csr_RowPtr,
	const int *a_csr_CowInd,
	const float *a_csr_Val,
	const int *b_csc_ColPtr,
	const int *b_csc_RowInd,
	const float *b_csc_Val,
	float *c,
	int m,
	int n)
{
	int
		i = blockIdx.x * blockDim.x + threadIdx.x,
		j = blockIdx.y * blockDim.y + threadIdx.y;
	float res = 0;
	for (int
			 kb = b_csc_ColPtr[j],
			 ka = a_csr_RowPtr[i];
		 kb < b_csc_ColPtr[j + 1] &&
		 ka < a_csr_RowPtr[i + 1];
		 ++kb)
	{
		while (a_csr_CowInd[ka] < b_csc_RowInd[kb])
			++ka;
		while (a_csr_CowInd[ka] == b_csc_RowInd[kb])
			res += a_csr_Val[ka] * b_csc_Val[kb], ++ka;
	}
	c[i * n + j] = res;
}
void ScsrMulScsr(const Scoo *a_csr, const Scoo *b_csr, Scoo *c_csr)
{
	Scoo B_csc;
	ScooToScsc(b_csr, &B_csc);

	int
		*a_RowPtr,
		*a_ColInd,
		*b_ColPtr,
		*b_RowInd;
	float
		*a_Val,
		*b_Val,
		*d_c;

	cudaMalloc(&a_RowPtr, sizeof(int) * (a_csr->m + 1));
	cudaMemcpy(a_RowPtr, a_csr->RowPtr.data, sizeof(int) * (a_csr->m + 1), cudaMemcpyHostToDevice);

	cudaMalloc(&a_ColInd, sizeof(int) * a_csr->nnz);
	cudaMemcpy(a_ColInd, a_csr->ColInd.data, sizeof(int) * a_csr->nnz, cudaMemcpyHostToDevice);

	cudaMalloc(&a_Val, sizeof(float) * a_csr->nnz);
	cudaMemcpy(a_Val, a_csr->Val.data, sizeof(float) * a_csr->nnz, cudaMemcpyHostToDevice);

	cudaMalloc(&b_ColPtr, sizeof(int) * (B_csc.n + 1));
	cudaMemcpy(b_ColPtr, B_csc.ColPtr.data, sizeof(int) * (B_csc.n + 1), cudaMemcpyHostToDevice);

	cudaMalloc(&b_RowInd, sizeof(int) * B_csc.nnz);
	cudaMemcpy(b_RowInd, B_csc.RowInd.data, sizeof(int) * B_csc.nnz, cudaMemcpyHostToDevice);

	cudaMalloc(&b_Val, sizeof(float) * B_csc.nnz);
	cudaMemcpy(b_Val, B_csc.Val.data, sizeof(float) * B_csc.nnz, cudaMemcpyHostToDevice);

	cudaMalloc(&d_c, sizeof(float) * a_csr->m * B_csc.n);

	dim3
		block(16, 16),
		grid(a_csr->m / block.x, B_csc.n / block.y);

	cuScsrMulScs<<<grid, block>>>(
		a_RowPtr,
		a_ColInd,
		a_Val,
		b_ColPtr,
		b_RowInd,
		b_Val,
		d_c,
		a_csr->m,
		B_csc.n);

	FloatVector c;
	FloatVectorInit(&c, a_csr->m * B_csc.n);
	cudaMemcpy(c.data, d_c, sizeof(float) * a_csr->m * B_csc.n, cudaMemcpyDeviceToHost);

	cudaFree(a_RowPtr);
	cudaFree(a_ColInd);
	cudaFree(b_ColPtr);
	cudaFree(b_RowInd);
	cudaFree(a_Val);
	cudaFree(b_Val);
	cudaFree(d_c);

	c_csr->m = a_csr->m;
	c_csr->n = B_csc.n;
	c_csr->nnz = 0;
	ScooMalloc(c_csr);

	for (int i = 0; i < c_csr->m; ++i)
		for (int j = 0; j < c_csr->n; ++j)
			if (c.data[i * c_csr->n + j])
				ScooPushBack(c_csr, i, j, c.data[i * c_csr->n + j]);
	FloatVectorFree(&c);
	ScooFree(&B_csc);
	ScsrInit(c_csr);
}
int main(int argc, char **argv)
{
	Scoo a_coo, b_coo, a_csr, b_csr, c_csr;

	ScooRead(&a_coo, fopen(argv[1], "r"));
	ScooRead(&b_coo, fopen(argv[2], "r"));

	ScooToScsr(&a_coo, &a_csr);
	ScooToScsr(&b_coo, &b_csr);

	ScooFree(&a_coo);
	ScooFree(&b_coo);

	ScsrMulScsr(&a_csr, &b_csr, &c_csr);

	ScooWrite(&c_csr, stdout);

	ScooFree(&a_csr);
	ScooFree(&b_csr);
	ScooFree(&c_csr);
}

稀疏矩阵处理库WuKSPARSE.h

#ifndef __WuKSPARSE_hpp__
#define __WuKSPARSE_hpp__
#include <stdio.h>
#include "WuKfloatVector.h"
#include "WuKintVector.h"
typedef struct Scoo
{
	int
		m,   //行
		n,   //列
		nnz; //非零元素数量
	IntVector
		RowPtr, //大小m+1
		ColPtr, //大小n+1
		RowInd, //大小nnz,行坐标
		ColInd; //大小nnz,列坐标
	FloatVector
		Val; //大小为nnz
} Scoo;
void ScooFree(Scoo *a)
{
	IntVectorFree(&a->RowPtr);
	IntVectorFree(&a->ColPtr);
	IntVectorFree(&a->RowInd);
	IntVectorFree(&a->ColInd);
	FloatVectorFree(&a->Val);
}
void ScooMalloc(Scoo *a)
{
	IntVectorInit(&a->RowPtr, a->m + 1);
	IntVectorInit(&a->ColPtr, a->n + 1);
	IntVectorInit(&a->RowInd, a->nnz);
	IntVectorInit(&a->ColInd, a->nnz);
	FloatVectorInit(&a->Val, a->nnz);
}
void ScooPushBack(Scoo *a, int r, int c, float v)
{
	IntVectorPushBack(&a->RowInd, r);
	IntVectorPushBack(&a->ColInd, c);
	FloatVectorPushBack(&a->Val, v);
	++a->nnz;
}
void ScooRead(Scoo *a, FILE *f)
{
#define MAXLEN 511
	for (char s[MAXLEN]; fgets(s, MAXLEN, f) && !feof(f);)
		if (s[0] != '%')
		{
			int nnz;
			sscanf(s, "%d%d%d", &a->m, &a->n, &nnz);
			a->nnz = 0;
			ScooMalloc(a);
			for (int line = 0; line < nnz; ++line)
			{
				int r, c;
				float v;
				fscanf(f, "%d%d%f", &r, &c, &v);
				if (0 <= r && r < a->m &&
					0 <= c && c < a->n)
					ScooPushBack(a, r, c, v);
			}
			break;
		}
#undef MAXLEN
}
void ScooWrite(const Scoo *a, FILE *f)
{
	fprintf(
		f,
		"%%MatrixMarket matrix coordinate real general\n%d %d %d\n",
		a->m,
		a->n,
		a->nnz);
	for (int j = 0; j < a->nnz; ++j)
		fprintf(
			f,
			"%d %d %f\n",
			a->RowInd.data[j],
			a->ColInd.data[j],
			a->Val.data[j]);
}
void ScsrInit(Scoo *a)
{
	int cur = 0;
	for (int i = a->RowPtr.data[0] = 0; i < a->nnz; ++i)
		for (; cur < a->RowInd.data[i]; ++cur)
			a->RowPtr.data[cur + 1] = i;
	for (; cur < a->m; ++cur)
		a->RowPtr.data[cur + 1] = a->nnz;
}
void ScooToScsr(const Scoo *a, Scoo *b)
{
	IntVector beg, s;
	IntVectorInit(&beg, a->nnz);
	IntVectorInit(&s, 2);
	for (int i = 0; i < a->nnz; ++i)
		beg.data[i] = i;
	s.data[0] = 0, s.data[1] = a->nnz - 1;
	while (s.size)
	{
		int right = s.data[--s.size], left = s.data[--s.size], l = left, r = right, pivot = beg.data[l];
		while (l < r)
		{
#define LESS(x, y) (a->RowInd.data[x] < a->RowInd.data[y] || a->RowInd.data[x] == a->RowInd.data[y] && a->ColInd.data[x] < a->ColInd.data[y])
			while (l < r && !LESS(beg.data[r], pivot))
				--r;
			beg.data[l] = beg.data[r];
			while (l < r && !LESS(pivot, beg.data[l]))
				++l;
			beg.data[r] = beg.data[l];
#undef LESS
		}
		beg.data[l] = pivot;
		if (l - 1 > left)
		{
			IntVectorPushBack(&s, left);
			IntVectorPushBack(&s, l - 1);
		}
		if (l + 1 < right)
		{
			IntVectorPushBack(&s, l + 1);
			IntVectorPushBack(&s, right);
		}
	}
	IntVectorFree(&s);
	*b = *a;
	ScooMalloc(b);
	for (int i = 0; i < b->nnz; ++i)
	{
		b->RowInd.data[i] = a->RowInd.data[beg.data[i]];
		b->ColInd.data[i] = a->ColInd.data[beg.data[i]];
		b->Val.data[i] = a->Val.data[beg.data[i]];
	}
	ScsrInit(b);
}
void ScscInit(Scoo *a)
{
	int cur = 0;
	for (int i = a->ColPtr.data[0] = 0; i < a->nnz; ++i)
		for (; cur < a->ColInd.data[i]; ++cur)
			a->ColPtr.data[cur + 1] = i;
	for (; cur < a->m; ++cur)
		a->ColPtr.data[cur + 1] = a->nnz;
}
void ScooToScsc(const Scoo *a, Scoo *b)
{
	IntVector beg, s;
	IntVectorInit(&beg, a->nnz);
	IntVectorInit(&s, 2);
	for (int i = 0; i < a->nnz; ++i)
		beg.data[i] = i;
	s.data[0] = 0, s.data[1] = a->nnz - 1;
	while (s.size)
	{
		int right = s.data[--s.size], left = s.data[--s.size], l = left, r = right, pivot = beg.data[l];
		while (l < r)
		{
#define LESS(x, y) (a->ColInd.data[x] < a->ColInd.data[y] || a->ColInd.data[x] == a->ColInd.data[y] && a->RowInd.data[x] < a->RowInd.data[y])
			while (l < r && !LESS(beg.data[r], pivot))
				--r;
			beg.data[l] = beg.data[r];
			while (l < r && !LESS(pivot, beg.data[l]))
				++l;
			beg.data[r] = beg.data[l];
#undef LESS
		}
		beg.data[l] = pivot;
		if (l - 1 > left)
		{
			IntVectorPushBack(&s, left);
			IntVectorPushBack(&s, l - 1);
		}
		if (l + 1 < right)
		{
			IntVectorPushBack(&s, l + 1);
			IntVectorPushBack(&s, right);
		}
	}
	IntVectorFree(&s);
	*b = *a;
	ScooMalloc(b);
	for (int i = 0; i < b->nnz; ++i)
	{
		b->RowInd.data[i] = a->RowInd.data[beg.data[i]];
		b->ColInd.data[i] = a->ColInd.data[beg.data[i]];
		b->Val.data[i] = a->Val.data[beg.data[i]];
	}
	ScscInit(b);
	IntVectorFree(&beg);
}
#endif

浮点动态向量库WuKfloatVector.h

#ifndef __WuKfloatVector_h__
#define __WuKfloatVector_h__
#include <malloc.h>
#include <string.h>
typedef struct FloatVector
{
    int size, cap;
    float *data;
} FloatVector;
void FloatVectorFree(FloatVector *v) { free(v->data); }
void FloatVectorInit(FloatVector *v, int size)
{
    v->size = v->cap = size;
    if (v->cap < 1)
        v->cap = 1;
    v->data = (float *)malloc(sizeof(float) * v->cap);
}
void FloatVectorPushBack(FloatVector *v, float val)
{
    while (v->cap <= v->size)
    {
        FloatVector v_new;
        FloatVectorInit(&v_new, v->cap << 1);
        memcpy(v_new.data, v->data, sizeof(float) * v->size);
        FloatVectorFree(v);
        *v = v_new;
        v->size >>= 1;
    }
    v->data[v->size++] = val;
}
#endif

整型动态向量库WuKintVector.h

#ifndef __WuKintVector_h__
#define __WuKintVector_h__
#include <malloc.h>
#include <string.h>
typedef struct IntVector
{
    int size, cap;
    int *data;
} IntVector;
void IntVectorFree(IntVector *v) { free(v->data); }
void IntVectorInit(IntVector *v, int size)
{
    v->size = v->cap = size;
    if (v->cap < 1)
        v->cap = 1;
    v->data = (int *)malloc(sizeof(int) * v->cap);
}
void IntVectorPushBack(IntVector *v, int val)
{
    while (v->cap <= v->size)
    {
        IntVector v_new;
        IntVectorInit(&v_new, v->cap << 1);
        memcpy(v_new.data, v->data, sizeof(int) * v->size);
        IntVectorFree(v);
        *v = v_new;
        v->size >>= 1;
    }
    v->data[v->size++] = val;
}
#endif
posted @ 2020-07-07 11:11  Neo_KH  阅读(228)  评论(0)    收藏  举报