diff --git a/cuda/hlldev.c b/cuda/hlldev.c index f9359bf7..eb78c16c 100644 --- a/cuda/hlldev.c +++ b/cuda/hlldev.c @@ -182,20 +182,27 @@ int spmvHllDeviceFloat(void *deviceMat, float alpha, void* deviceX, } void -dspmdmmhll_gpu (double *z, int s, int vPitch, double *y, double alpha, double* cM, int* rP, - int* rS, int hackSize, int* hackOffs, int avgNnzPerRow, int rows, double *x, double beta, int firstIndex) +dspmdmmhll_gpu (double *z, int count, int zPitch, double alpha, double* cM, int* rP, + int* rS, int hackSize, int* hackOffs, int avgNnzPerRow, int rows, + double *x, int xPitch, double beta, int firstIndex) { int i=0; spgpuHandle_t handle=psb_cudaGetHandle(); - for (i=0; isize_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/ /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/ #endif - dspmdmmhll_gpu ((double *)y->v_, y->count_, y->pitch_, (double *)y->v_, - alpha, (double *)devMat->cM, - devMat->rP, devMat->rS, devMat->hackSize, devMat->hackOffs, - devMat->avgNzr, devMat->rows, - (double *)x->v_, beta, devMat->baseIndex); + dspmdmmhll_gpu ((double *)y->v_, y->count_, y->pitch_, + alpha, (double *)devMat->cM, + devMat->rP, devMat->rS, devMat->hackSize, devMat->hackOffs, + devMat->avgNzr, devMat->rows, + (double *)x->v_, x->pitch_, beta, devMat->baseIndex); return SPGPU_SUCCESS; } diff --git a/cuda/spgpu/hell.h b/cuda/spgpu/hell.h index d177686c..e62f9ae0 100644 --- a/cuda/spgpu/hell.h +++ b/cuda/spgpu/hell.h @@ -104,7 +104,26 @@ void spgpuDhellspmv (spgpuHandle_t handle, const __device double *x, double beta, int baseIndex); - +#if defined(NEW_MM) +void spgpuDhellspmm(spgpuHandle_t handle, + int count, + __device double *z, + int zpitch, + const __device double *y, + int ypitch, + double alpha, + const __device double* cM, + const __device int* rP, + int hackSize, + const __device int* hackOffsets, + const __device int* rS, + const __device int* rIdx, + int rows, + const __device double *x, + int xpitch, + double beta, + int baseIndex); +#endif /** * \fn void spgpuChellspmv (spgpuHandle_t handle,__device cuFloatComplex *z,const __device cuFloatComplex *y, cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int rows, const __device cuFloatComplex *x, cuFloatComplex beta, int baseIndex) diff --git a/cuda/spgpu/kernels/hell_dspmv.cu b/cuda/spgpu/kernels/hell_dspmv.cu index b06f8cfb..dc99e842 100644 --- a/cuda/spgpu/kernels/hell_dspmv.cu +++ b/cuda/spgpu/kernels/hell_dspmv.cu @@ -16,7 +16,7 @@ #include "cudadebug.h" #include "cudalang.h" - +#include extern "C" { #include "core.h" @@ -30,3 +30,202 @@ extern "C" #define TEX_FETCH_TYPE int2 #include "hell_spmv_base.cuh" + + +#if defined(NEW_MM) + +#define MMBSZ 12 + +#undef GEN_SPGPU_HELL_NAME +#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm) +#undef GEN_SPGPU_HELL_NAME_VANILLA +#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla) + + +__global__ void +CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) + (int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch, + VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, + int hackSize, const int* hackOffsets, const int* rS, int rows, + const VALUE_TYPE *x, int xPitch, + VALUE_TYPE beta, int baseIndex) +{ + VALUE_TYPE *pz,*px,*py; + VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)(); + VALUE_TYPE yVal; + __shared__ VALUE_TYPE temp[THREAD_BLOCK][MMBSZ]; + + int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK); + + if (i < rows) { + int j; + int hackId = i / hackSize; + int hackLaneId = i % hackSize; + + int hackOffset; + unsigned int laneId = threadIdx.x % 32; + if (laneId == 0) + hackOffset = hackOffsets[hackId]; + //__syncthreads(); + hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId; + + rP += hackOffset; + cM += hackOffset; + + int rowSize = rS[i]; + for (int k=0; kcurrentStream >>> (count, z, zPitch,y, yPitch, + alpha, cM, rP, hackSize, hackOffsets, rS, rows, + x, xPitch, beta, baseIndex); +} + +void +GEN_SPGPU_HELL_NAME(TYPE_SYMBOL) + (spgpuHandle_t handle, + int count, + VALUE_TYPE* z, + int zPitch, + const VALUE_TYPE *y, + int yPitch, + VALUE_TYPE alpha, + const VALUE_TYPE* cM, + const int* rP, + int hackSize, + const __device int* hackOffsets, + const __device int* rS, + const __device int* rIdx, + int rows, + const VALUE_TYPE *x, + int xPitch, + VALUE_TYPE beta, + int baseIndex) +{ + VALUE_TYPE *px,*py, *pz; + int cnt; + int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX); + + // maxNForACall should be a multiple of hackSize + maxNForACall = (maxNForACall/hackSize)*hackSize; + //fprintf(stderr,"Entering kernel %d maxNForACall\n",maxNForACall); + while (rows > maxNForACall) {//managing large vectors + cnt = count; + px = (VALUE_TYPE *) x; + py = (VALUE_TYPE *) y; + pz = (VALUE_TYPE *) z; + while (cnt > MMBSZ) { + //fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz); + CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, + py, yPitch, + alpha, cM, rP, + hackSize, hackOffsets, + rS, rIdx, + maxNForACall, + px, xPitch, beta, baseIndex); + px += xPitch*MMBSZ; + py += yPitch*MMBSZ; + pz += zPitch*MMBSZ; + cnt -= MMBSZ; + } + if (cnt >0) { + CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch, + py, yPitch, + alpha, cM, rP, + hackSize, hackOffsets, + rS, rIdx, + maxNForACall, + px, xPitch, beta, baseIndex); + } + + y = y + maxNForACall; + z = z + maxNForACall; + hackOffsets = hackOffsets + maxNForACall/hackSize; + rS = rS + maxNForACall; + + rows -= maxNForACall; + } + cnt = count; + px = (VALUE_TYPE *) x; + py = (VALUE_TYPE *) y; + pz = (VALUE_TYPE *) z; + while (cnt > MMBSZ) { + //fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz); + CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, py, yPitch, + alpha, cM, rP, hackSize, hackOffsets, + rS, rIdx, rows, + px, xPitch, beta, baseIndex); + px += xPitch*MMBSZ; + py += yPitch*MMBSZ; + pz += zPitch*MMBSZ; + cnt -= MMBSZ; + } + if (cnt >0) { + CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch, + py, yPitch, + alpha, cM, rP, + hackSize, hackOffsets, + rS, rIdx, + rows, + px, xPitch, beta, baseIndex); + } + + + cudaCheckError("CUDA error on hell_spmm"); +} + +#endif diff --git a/cuda/spgpu/kernels/hell_spmv_base.cuh b/cuda/spgpu/kernels/hell_spmv_base.cuh index ca074d33..b2f9b053 100644 --- a/cuda/spgpu/kernels/hell_spmv_base.cuh +++ b/cuda/spgpu/kernels/hell_spmv_base.cuh @@ -14,7 +14,6 @@ * GNU General Public License for more details. */ - #define PRE_CONCAT(A, B) A ## B #define CONCAT(A, B) PRE_CONCAT(A, B) @@ -156,4 +155,3 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL) cudaCheckError("CUDA error on hell_spmv"); } - diff --git a/cuda/vectordev.c b/cuda/vectordev.c index 22039b5c..877be2d2 100644 --- a/cuda/vectordev.c +++ b/cuda/vectordev.c @@ -38,8 +38,10 @@ #include "core.h" //new -MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count, unsigned int size, - unsigned int elementType) +MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count, + unsigned int size, + unsigned int elementType, + unsigned int storage2d) { struct MultiVectorDeviceParams params; @@ -75,6 +77,7 @@ MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count, unsigned params.count = count; params.size = size; + params.storage2d = storage2d ; return params; @@ -87,8 +90,9 @@ int allocMultiVecDevice(void ** remoteMultiVec, struct MultiVectorDeviceParams * struct MultiVectDevice *tmp = (struct MultiVectDevice *)malloc(sizeof(struct MultiVectDevice)); *remoteMultiVec = (void *)tmp; - tmp->size_ = params->size; - tmp->count_ = params->count; + tmp->size_ = params->size; + tmp->count_ = params->count; + tmp->storage2d_ = params->storage2d; if (params->elementType == SPGPU_TYPE_INT) { @@ -163,9 +167,9 @@ int FallocMultiVecDevice(void** deviceMultiVec, unsigned int count, { int i; struct MultiVectorDeviceParams p; - p = getMultiVectorDeviceParams(count, size, elementType); + p = getMultiVectorDeviceParams(count, size, elementType, PSB_VECT_STOR_BY_COLS); i = allocMultiVecDevice(deviceMultiVec, &p); - //fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.size); + //fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.count); //cudaSync(); if (i != 0) { fprintf(stderr,"From routine : %s : %d, %d %d \n","FallocMultiVecDevice",i, count, size); @@ -194,3 +198,9 @@ int getMultiVecDevicePitch(void* deviceVec) return(i); } +int getMultiVecDeviceStorage2d(void* deviceVec) +{ int i; + struct MultiVectDevice *dev = (struct MultiVectDevice *) deviceVec; + i = dev->storage2d_; + return(i); +} diff --git a/cuda/vectordev.h b/cuda/vectordev.h index 6d5a16b0..80fc1678 100644 --- a/cuda/vectordev.h +++ b/cuda/vectordev.h @@ -37,8 +37,14 @@ #include "cintrf.h" #include +#define PSB_VECT_STOR_BY_COLS 0 +#define PSB_VECT_STOR_BY_ROWS 1 + struct MultiVectDevice { + // storage of 2d data + int storage2d_; + // number of vectors int count_; @@ -54,17 +60,19 @@ struct MultiVectDevice typedef struct MultiVectorDeviceParams { - // number on vectors - unsigned int count; //1 for a simple vector - - // The resulting allocation will be pitch*s*(size of the elementType) - unsigned int elementType; - - // Pitch (in number of elements) - unsigned int pitch; - - // Size of a single vector (in number of elements). - unsigned int size; + // storage of 2d data + unsigned int storage2d; + // number on vectors + unsigned int count; //1 for a simple vector + + // The resulting allocation will be pitch*s*(size of the elementType) + unsigned int elementType; + + // Pitch (in number of elements) + unsigned int pitch; + + // Size of a single vector (in number of elements). + unsigned int size; } MultiVectorDeviceParams; @@ -76,7 +84,8 @@ int unregisterMapped(void *); MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count, unsigned int size, - unsigned int elementType); + unsigned int elementType, + unsigned int storage2d); int FallocMultiVecDevice(void** deviceMultiVec, unsigned count, unsigned int size, unsigned int elementType); @@ -85,3 +94,4 @@ int allocMultiVecDevice(void ** remoteMultiVec, struct MultiVectorDeviceParams * int getMultiVecDeviceSize(void* deviceVec); int getMultiVecDeviceCount(void* deviceVec); int getMultiVecDevicePitch(void* deviceVec); +int getMultiVecDeviceStorage2d(void* deviceVec); diff --git a/test/block_krylov/kernel/dpdegenmm.F90 b/test/block_krylov/kernel/dpdegenmm.F90 index cece5417..2b0db219 100644 --- a/test/block_krylov/kernel/dpdegenmm.F90 +++ b/test/block_krylov/kernel/dpdegenmm.F90 @@ -596,7 +596,7 @@ program pdegenmm ! solver parameters integer(psb_epk_) :: amatsize, precsize, descsize, annz, nbytes real(psb_dpk_) :: err, eps - integer, parameter :: ntests=50, ngpu=50, ncnv=20 + integer, parameter :: ntests=50, ngpu=10, ncnv=20 type(psb_d_coo_sparse_mat), target :: acoo type(psb_d_csr_sparse_mat), target :: acsr type(psb_d_ell_sparse_mat), target :: aell @@ -659,12 +659,12 @@ program pdegenmm ! ! get parameters ! - !call get_parms(ctxt,nrhs,acfmt,agfmt,idim,tnd) - nrhs=2 - acfmt='CSR' - agfmt='HLG' - idim=2 - tnd=.false. + call get_parms(ctxt,nrhs,acfmt,agfmt,idim,tnd) + !nrhs=8 + !acfmt='CSR' + !agfmt='CSRG' + !idim=100 + !tnd=.false. call psb_init_timers() ! ! allocate and fill in the coefficient matrix and initial vectors @@ -911,7 +911,7 @@ program pdegenmm write(psb_out_unit,'("Size of matrix: ",i20)') nr write(psb_out_unit,'("Number of nonzeros: ",i20)') annz write(psb_out_unit,'("Memory occupation: ",i20)') amatsize - flops = ntests*(2.d0*annz) + flops = ntests*(2.d0*annz)*nrhs tflops = flops gflops = flops * ngpu write(psb_out_unit,'("Storage type for A: ",a)') a%get_fmt() @@ -970,7 +970,7 @@ program pdegenmm write(psb_out_unit,*) write(psb_out_unit,'("MBYTES/S sust. effective bandwidth (CPU) : ",F20.3)') bdwdth #ifdef HAVE_CUDA - bdwdth = ngpu*ntests*nbytes/(gt2*1.d6) + bdwdth = nrhs*ngpu*ntests*nbytes/(gt2*1.d6) write(psb_out_unit,'("MBYTES/S sust. effective bandwidth (GPU) : ",F20.3)') bdwdth bdwdth = psb_cuda_MemoryPeakBandwidth() write(psb_out_unit,'("MBYTES/S peak bandwidth (GPU) : ",F20.3)') bdwdth @@ -1053,4 +1053,4 @@ contains end subroutine get_parms -end program pdegenmm \ No newline at end of file +end program pdegenmm