Multicolumn HLG product

cuda-multivect
Salvatore Filippone 10 months ago
parent 5b95f1920c
commit 897cfb4028

@ -182,20 +182,27 @@ int spmvHllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
}
void
dspmdmmhll_gpu (double *z, int s, int vPitch, double *y, double alpha, double* cM, int* rP,
int* rS, int hackSize, int* hackOffs, int avgNnzPerRow, int rows, double *x, double beta, int firstIndex)
dspmdmmhll_gpu (double *z, int count, int zPitch, double alpha, double* cM, int* rP,
int* rS, int hackSize, int* hackOffs, int avgNnzPerRow, int rows,
double *x, int xPitch, double beta, int firstIndex)
{
int i=0;
spgpuHandle_t handle=psb_cudaGetHandle();
for (i=0; i<s; i++)
{
spgpuDhellspmv (handle, (double*) z, (double*)y, alpha, (double*) cM, rP,
#if defined(NEW_MM)
spgpuDhellspmm(handle, count, (double*) z, zPitch, (double*)z, zPitch,
alpha, (double*) cM, rP,hackSize, hackOffs, rS, NULL,
rows, (double*)x, xPitch, beta, firstIndex);
#else
for (i=0; i<count; i++) {
spgpuDhellspmv (handle, (double*) z, (double*)z, alpha,
(double*) cM, rP,
hackSize, hackOffs, rS, NULL,
avgNnzPerRow, rows, (double*)x, beta, firstIndex);
z += vPitch;
y += vPitch;
x += vPitch;
z += zPitch;
x += xPitch;
}
#endif
}
//new
@ -211,11 +218,11 @@ int spmvHllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
dspmdmmhll_gpu ((double *)y->v_, y->count_, y->pitch_, (double *)y->v_,
dspmdmmhll_gpu ((double *)y->v_, y->count_, y->pitch_,
alpha, (double *)devMat->cM,
devMat->rP, devMat->rS, devMat->hackSize, devMat->hackOffs,
devMat->avgNzr, devMat->rows,
(double *)x->v_, beta, devMat->baseIndex);
(double *)x->v_, x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}

@ -104,7 +104,26 @@ void spgpuDhellspmv (spgpuHandle_t handle,
const __device double *x,
double beta,
int baseIndex);
#if defined(NEW_MM)
void spgpuDhellspmm(spgpuHandle_t handle,
int count,
__device double *z,
int zpitch,
const __device double *y,
int ypitch,
double alpha,
const __device double* cM,
const __device int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const __device double *x,
int xpitch,
double beta,
int baseIndex);
#endif
/**
* \fn void spgpuChellspmv (spgpuHandle_t handle,__device cuFloatComplex *z,const __device cuFloatComplex *y, cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int rows, const __device cuFloatComplex *x, cuFloatComplex beta, int baseIndex)

@ -16,7 +16,7 @@
#include "cudadebug.h"
#include "cudalang.h"
#include <stdio.h>
extern "C"
{
#include "core.h"
@ -30,3 +30,202 @@ extern "C"
#define TEX_FETCH_TYPE int2
#include "hell_spmv_base.cuh"
#if defined(NEW_MM)
#define MMBSZ 12
#undef GEN_SPGPU_HELL_NAME
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm)
#undef GEN_SPGPU_HELL_NAME_VANILLA
#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
__global__ void
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
int hackSize, const int* hackOffsets, const int* rS, int rows,
const VALUE_TYPE *x, int xPitch,
VALUE_TYPE beta, int baseIndex)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[THREAD_BLOCK][MMBSZ];
int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
if (i < rows) {
int j;
int hackId = i / hackSize;
int hackLaneId = i % hackSize;
int hackOffset;
unsigned int laneId = threadIdx.x % 32;
if (laneId == 0)
hackOffset = hackOffsets[hackId];
//__syncthreads();
hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId;
rP += hackOffset;
cM += hackOffset;
int rowSize = rS[i];
for (int k=0; k<count; k++) {
temp[threadIdx.x][k] = CONCAT(zero_,VALUE_TYPE)();
}
for (int j = 0; j < rowSize; j++) {
int pointer;
VALUE_TYPE value;
VALUE_TYPE fetch;
pointer = rP[0] - baseIndex;
rP += hackSize;
value = cM[0];
cM += hackSize;
px = (VALUE_TYPE *) x;
for (int k=0; k<count; k++) {
fetch = px[pointer];
temp[threadIdx.x][k] =
CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[threadIdx.x][k]);
px = px + xPitch;
}
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha, temp[threadIdx.x][k]));
py += yPitch;
pz += zPitch;
}
else
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha, temp[threadIdx.x][k]);
pz += zPitch;
}
}
}
void
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL))
(spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int hackSize, const int* hackOffsets,
const int* rS, const __device int* rIdx, int rows,
const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta, int baseIndex)
{
dim3 block (THREAD_BLOCK, 1);
dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize;
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch,y, yPitch,
alpha, cM, rP, hackSize, hackOffsets, rS, rows,
x, xPitch, beta, baseIndex);
}
void
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta,
int baseIndex)
{
VALUE_TYPE *px,*py, *pz;
int cnt;
int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
// maxNForACall should be a multiple of hackSize
maxNForACall = (maxNForACall/hackSize)*hackSize;
//fprintf(stderr,"Entering kernel %d maxNForACall\n",maxNForACall);
while (rows > maxNForACall) {//managing large vectors
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, rP,
hackSize, hackOffsets,
rS, rIdx,
maxNForACall,
px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, rP,
hackSize, hackOffsets,
rS, rIdx,
maxNForACall,
px, xPitch, beta, baseIndex);
}
y = y + maxNForACall;
z = z + maxNForACall;
hackOffsets = hackOffsets + maxNForACall/hackSize;
rS = rS + maxNForACall;
rows -= maxNForACall;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rIdx, rows,
px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, rP,
hackSize, hackOffsets,
rS, rIdx,
rows,
px, xPitch, beta, baseIndex);
}
cudaCheckError("CUDA error on hell_spmm");
}
#endif

@ -14,7 +14,6 @@
* GNU General Public License for more details.
*/
#define PRE_CONCAT(A, B) A ## B
#define CONCAT(A, B) PRE_CONCAT(A, B)
@ -156,4 +155,3 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
cudaCheckError("CUDA error on hell_spmv");
}

@ -38,8 +38,10 @@
#include "core.h"
//new
MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count, unsigned int size,
unsigned int elementType)
MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count,
unsigned int size,
unsigned int elementType,
unsigned int storage2d)
{
struct MultiVectorDeviceParams params;
@ -75,6 +77,7 @@ MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count, unsigned
params.count = count;
params.size = size;
params.storage2d = storage2d ;
return params;
@ -89,6 +92,7 @@ int allocMultiVecDevice(void ** remoteMultiVec, struct MultiVectorDeviceParams *
*remoteMultiVec = (void *)tmp;
tmp->size_ = params->size;
tmp->count_ = params->count;
tmp->storage2d_ = params->storage2d;
if (params->elementType == SPGPU_TYPE_INT)
{
@ -163,9 +167,9 @@ int FallocMultiVecDevice(void** deviceMultiVec, unsigned int count,
{ int i;
struct MultiVectorDeviceParams p;
p = getMultiVectorDeviceParams(count, size, elementType);
p = getMultiVectorDeviceParams(count, size, elementType, PSB_VECT_STOR_BY_COLS);
i = allocMultiVecDevice(deviceMultiVec, &p);
//fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.size);
//fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.count);
//cudaSync();
if (i != 0) {
fprintf(stderr,"From routine : %s : %d, %d %d \n","FallocMultiVecDevice",i, count, size);
@ -194,3 +198,9 @@ int getMultiVecDevicePitch(void* deviceVec)
return(i);
}
int getMultiVecDeviceStorage2d(void* deviceVec)
{ int i;
struct MultiVectDevice *dev = (struct MultiVectDevice *) deviceVec;
i = dev->storage2d_;
return(i);
}

@ -37,8 +37,14 @@
#include "cintrf.h"
#include <complex.h>
#define PSB_VECT_STOR_BY_COLS 0
#define PSB_VECT_STOR_BY_ROWS 1
struct MultiVectDevice
{
// storage of 2d data
int storage2d_;
// number of vectors
int count_;
@ -54,6 +60,8 @@ struct MultiVectDevice
typedef struct MultiVectorDeviceParams
{
// storage of 2d data
unsigned int storage2d;
// number on vectors
unsigned int count; //1 for a simple vector
@ -76,7 +84,8 @@ int unregisterMapped(void *);
MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count,
unsigned int size,
unsigned int elementType);
unsigned int elementType,
unsigned int storage2d);
int FallocMultiVecDevice(void** deviceMultiVec, unsigned count,
unsigned int size, unsigned int elementType);
@ -85,3 +94,4 @@ int allocMultiVecDevice(void ** remoteMultiVec, struct MultiVectorDeviceParams *
int getMultiVecDeviceSize(void* deviceVec);
int getMultiVecDeviceCount(void* deviceVec);
int getMultiVecDevicePitch(void* deviceVec);
int getMultiVecDeviceStorage2d(void* deviceVec);

@ -596,7 +596,7 @@ program pdegenmm
! solver parameters
integer(psb_epk_) :: amatsize, precsize, descsize, annz, nbytes
real(psb_dpk_) :: err, eps
integer, parameter :: ntests=50, ngpu=50, ncnv=20
integer, parameter :: ntests=50, ngpu=10, ncnv=20
type(psb_d_coo_sparse_mat), target :: acoo
type(psb_d_csr_sparse_mat), target :: acsr
type(psb_d_ell_sparse_mat), target :: aell
@ -659,12 +659,12 @@ program pdegenmm
!
! get parameters
!
!call get_parms(ctxt,nrhs,acfmt,agfmt,idim,tnd)
nrhs=2
acfmt='CSR'
agfmt='HLG'
idim=2
tnd=.false.
call get_parms(ctxt,nrhs,acfmt,agfmt,idim,tnd)
!nrhs=8
!acfmt='CSR'
!agfmt='CSRG'
!idim=100
!tnd=.false.
call psb_init_timers()
!
! allocate and fill in the coefficient matrix and initial vectors
@ -911,7 +911,7 @@ program pdegenmm
write(psb_out_unit,'("Size of matrix: ",i20)') nr
write(psb_out_unit,'("Number of nonzeros: ",i20)') annz
write(psb_out_unit,'("Memory occupation: ",i20)') amatsize
flops = ntests*(2.d0*annz)
flops = ntests*(2.d0*annz)*nrhs
tflops = flops
gflops = flops * ngpu
write(psb_out_unit,'("Storage type for A: ",a)') a%get_fmt()
@ -970,7 +970,7 @@ program pdegenmm
write(psb_out_unit,*)
write(psb_out_unit,'("MBYTES/S sust. effective bandwidth (CPU) : ",F20.3)') bdwdth
#ifdef HAVE_CUDA
bdwdth = ngpu*ntests*nbytes/(gt2*1.d6)
bdwdth = nrhs*ngpu*ntests*nbytes/(gt2*1.d6)
write(psb_out_unit,'("MBYTES/S sust. effective bandwidth (GPU) : ",F20.3)') bdwdth
bdwdth = psb_cuda_MemoryPeakBandwidth()
write(psb_out_unit,'("MBYTES/S peak bandwidth (GPU) : ",F20.3)') bdwdth

Loading…
Cancel
Save