From 7d74ebf5c4cf38a215422176f4fc1842cc22de6d Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Wed, 10 Apr 2024 04:46:18 -0400 Subject: [PATCH] Make multivectors work --- cuda/cuda_util.c | 8 ++++---- cuda/cuda_util.h | 4 ++-- cuda/dvectordev.c | 17 ++++++++++++----- cuda/fcusparse_fct.h | 6 +++--- cuda/vectordev.h | 2 +- 5 files changed, 22 insertions(+), 15 deletions(-) diff --git a/cuda/cuda_util.c b/cuda/cuda_util.c index c66fb96f..28a159c1 100644 --- a/cuda/cuda_util.c +++ b/cuda/cuda_util.c @@ -182,9 +182,9 @@ int writeRemoteBuffer(void* hostSrc, void* buffer, int count) } // TODO -int writeRemoteBufferR2(void* hostSrc, void* buffer, int count, int pitch, int size) +int writeRemoteBufferR2(void* hostSrc, int hpitch, void* buffer, int count, int pitch, int size) { - cudaError_t err = cudaMemcpy2D(buffer, pitch, hostSrc, count, count, size, cudaMemcpyHostToDevice); + cudaError_t err = cudaMemcpy2D(buffer, pitch, hostSrc, hpitch, size, count, cudaMemcpyHostToDevice); if (err == cudaSuccess) return SPGPU_SUCCESS; @@ -221,9 +221,9 @@ int readRemoteBuffer(void* hostDest, void* buffer, int count) } // TODO sistemare pitch e size (si possono gestire senza realloc su fortran) -int readRemoteBufferR2(void* hostDest, void* buffer, int count, int pitch, int size) +int readRemoteBufferR2(void* hostDest, int hpitch, void* buffer, int count, int pitch, int size) { - cudaError_t err = cudaMemcpy2D(hostDest, count, buffer, pitch, count, size, cudaMemcpyDeviceToHost); + cudaError_t err = cudaMemcpy2D(hostDest, hpitch, buffer, pitch, size, count, cudaMemcpyDeviceToHost); if (err == cudaSuccess) return SPGPU_SUCCESS; diff --git a/cuda/cuda_util.h b/cuda/cuda_util.h index 6921d98e..190a372f 100644 --- a/cuda/cuda_util.h +++ b/cuda/cuda_util.h @@ -49,9 +49,9 @@ int allocMappedMemory(void **buffer, void **dp, int size); int registerMappedMemory(void *buffer, void **dp, int size); int unregisterMappedMemory(void *buffer); int writeRemoteBuffer(void* hostSrc, void* buffer, int count); -int writeRemoteBufferR2(void* hostSrc, void* buffer, int count, int pitch, int size); +int writeRemoteBufferR2(void* hostSrc, int hpitch, void* buffer, int count, int pitch, int size); int readRemoteBuffer(void* hostDest, void* buffer, int count); -int readRemoteBufferR2(void* hostDest, void* buffer, int count, int pitch, int size); +int readRemoteBufferR2(void* hostDest, int hpitch, void* buffer, int count, int pitch, int size); int freeRemoteBuffer(void* buffer); int gpuInit(int dev); int getDeviceCount(); diff --git a/cuda/dvectordev.c b/cuda/dvectordev.c index 28b9055a..9df0f625 100644 --- a/cuda/dvectordev.c +++ b/cuda/dvectordev.c @@ -56,9 +56,12 @@ int writeMultiVecDeviceDouble(void* deviceVec, double* hostVec) int writeMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld) { int i; struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec; - i = writeRemoteBufferR2((void*) hostVec, (void *)devVec->v_, devVec->count_*sizeof(double), devVec->pitch_, devVec->size_); + double *hv, *dv; + i = writeRemoteBufferR2((void*) hostVec, ld*sizeof(double), + (void *)devVec->v_, (devVec->count_), + sizeof(double)*(devVec->pitch_), (devVec->size_)*sizeof(double)); // i = writeMultiVecDeviceDouble(deviceVec, (void *) hostVec); - fprintf(stderr,"From routine : %s : %p %p\n","writeMultiVecDeviceDoubleR2",devVec->v_,devVec->v_+devVec->pitch_); + //fprintf(stderr,"From routine : %s : %p %p\n","writeMultiVecDeviceDoubleR2",devVec->v_,devVec->v_+devVec->pitch_); if (i != 0) { fprintf(stderr,"From routine : %s : %d \n","writeMultiVecDeviceDoubleR2",i); } @@ -78,10 +81,14 @@ int readMultiVecDeviceDouble(void* deviceVec, double* hostVec) int readMultiVecDeviceDoubleR2(void* deviceVec, double* hostVec, int ld) { int i; + double *hv, *dv; struct MultiVectDevice *devVec = (struct MultiVectDevice *) deviceVec; - i = readRemoteBufferR2((void *) hostVec, (void *)devVec->v_, devVec->count_*sizeof(double), devVec->pitch_, devVec->size_); + + i = readRemoteBufferR2((void *) hostVec, ld*sizeof(double), + (void *)devVec->v_, devVec->count_, + sizeof(double)*(devVec->pitch_), (devVec->size_)*sizeof(double)); // i = readMultiVecDeviceDouble(deviceVec, hostVec); - fprintf(stderr,"From routine : %s : %p \n","readMultiVecDeviceDoubleR2",devVec->v_); +// fprintf(stderr,"From routine : %s : %p \n","readMultiVecDeviceDoubleR2",devVec->v_); if (i != 0) { fprintf(stderr,"From routine : %s : %d \n","readMultiVecDeviceDoubleR2",i); } @@ -242,7 +249,7 @@ int axpbyMultiVecDeviceDouble(int n,double alpha, void* devMultiVecX, return SPGPU_UNSUPPORTED; for(j=0;jcount_;j++) - fprintf(stderr,"CUDA ENTERED %d %d %d %d \n",j, n, pitch, devVecY->size_); + //fprintf(stderr,"CUDA ENTERED %d %d %d %d \n",j, n, pitch, devVecY->size_); spgpuDaxpby(handle,(double*)devVecY->v_+pitch*j, n, beta, (double*)devVecY->v_+pitch*j, alpha,(double*) devVecX->v_+pitch*j); return(i); diff --git a/cuda/fcusparse_fct.h b/cuda/fcusparse_fct.h index 4d901134..4fabdd40 100644 --- a/cuda/fcusparse_fct.h +++ b/cuda/fcusparse_fct.h @@ -234,8 +234,8 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX, // CHECK_CUSPARSE(cusparseDestroyDnMat(vecX)); // CHECK_CUSPARSE(cusparseDestroyDnMat(vecY)); for(j=0;jcount_;j++) { - vX=x->v_+pitch*j; - vY=y->v_+pitch*j; + vX=x->v_+pitch*j*sizeof(TYPE); + vY=y->v_+pitch*j*sizeof(TYPE); fprintf(stderr,"CUDA ENTERED 1 %d %p %p %d %d %d %d\n",j, vX, vY, pitch, y->size_, cMat->m, cMat->n); CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, cMat->m, vY, CUSPARSE_BASE_TYPE) ); CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, cMat->n, vX, CUSPARSE_BASE_TYPE) ); @@ -257,7 +257,7 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX, &alpha,(*(cMat->spmvDescr)),vecX,&beta,vecY, CUSPARSE_BASE_TYPE,CUSPARSE_SPMV_ALG_DEFAULT, cMat->mvbuffer)); - fprintf(stderr,"CUDA ENTERED 2 %d %p %p %d %d %d %d\n",j, vX, vY, *((double*)vX), *((double*)vY), pitch, y->size_); + //fprintf(stderr,"CUDA ENTERED 2 %d %p %p %d %d %d %d\n",j, vX, vY, *((double*)vX), *((double*)vY), pitch, y->size_); CHECK_CUSPARSE(cusparseDestroyDnVec(vecX) ); CHECK_CUSPARSE(cusparseDestroyDnVec(vecY) ); } diff --git a/cuda/vectordev.h b/cuda/vectordev.h index 8eca7063..6d5a16b0 100644 --- a/cuda/vectordev.h +++ b/cuda/vectordev.h @@ -45,7 +45,7 @@ struct MultiVectDevice //number of elements for a single vector int size_; - //pithc in number of elements + //pitch in number of elements int pitch_; // Vectors in device memory (single allocation)