diff --git a/cuda/cuda_util.c b/cuda/cuda_util.c index 3fe61cc0..4f60cb69 100644 --- a/cuda/cuda_util.c +++ b/cuda/cuda_util.c @@ -37,14 +37,14 @@ static int hasUVA=-1; static struct cudaDeviceProp *prop=NULL; static spgpuHandle_t psb_cuda_handle = NULL; static cublasHandle_t psb_cublas_handle = NULL; -#if defined(TRACK_CUDA_MALLOC) +#if 0&& defined(TRACK_CUDA_MALLOC) static long long total_cuda_mem = 0; #endif int allocRemoteBuffer(void** buffer, int count) { cudaError_t err = cudaMalloc(buffer, count); -#if defined(TRACK_CUDA_MALLOC) +#if 0&& defined(TRACK_CUDA_MALLOC) total_cuda_mem += count; fprintf(stderr,"Tracking CUDA allocRemoteBuffer for %ld bytes total %ld address %p\n", count, total_cuda_mem, *buffer); @@ -205,6 +205,21 @@ int readRemoteBuffer(void* hostDest, void* buffer, int count) return SPGPU_UNSPECIFIED; } } +#if 0&& defined(TRACK_CUDA_MALLOC) +int freeAndTrackRemoteBuffer(void* buffer,int size) +{ + cudaError_t err = cudaFree(buffer); + total_cuda_mem -= size; + fprintf(stderr,"Tracking CUDA free for %ld bytes total %ld address %p\n", + size, total_cuda_mem, buffer); + if (err == cudaSuccess) + return SPGPU_SUCCESS; + else { + fprintf(stderr,"CUDA Error freeRemoteBuffer: %s %p\n", cudaGetErrorString(err),buffer); + return SPGPU_UNSPECIFIED; + } +} +#endif int freeRemoteBuffer(void* buffer) { diff --git a/cuda/cuda_util.h b/cuda/cuda_util.h index 95c8d1dc..526e83f1 100644 --- a/cuda/cuda_util.h +++ b/cuda/cuda_util.h @@ -51,6 +51,9 @@ int unregisterMappedMemory(void *buffer); int writeRemoteBuffer(void* hostSrc, void* buffer, int count); int readRemoteBuffer(void* hostDest, void* buffer, int count); int freeRemoteBuffer(void* buffer); +#if 0&&defined(TRACK_CUDA_MALLOC) +int freeAndTrackRemoteBuffer(void* buffer,int size); +#endif int gpuInit(int dev); int getDeviceCount(); int getDevice(); diff --git a/cuda/hlldev.c b/cuda/hlldev.c index 9da6a48c..b14ecb73 100644 --- a/cuda/hlldev.c +++ b/cuda/hlldev.c @@ -72,7 +72,7 @@ int allocHllDevice(void ** remoteMatrix, HllDeviceParams* params) HllDevice *tmp = (HllDevice *)malloc(sizeof(HllDevice)); int ret=SPGPU_SUCCESS; *remoteMatrix = (void *)tmp; - + fprintf(stderr,"Allocated HllDevice %p\n",tmp); tmp->hackSize = params->hackSize; tmp->allocsize = params->allocsize; @@ -131,13 +131,16 @@ int allocHllDevice(void ** remoteMatrix, HllDeviceParams* params) void freeHllDevice(void* remoteMatrix) { HllDevice *devMat = (HllDevice *) remoteMatrix; - //fprintf(stderr,"freeHllDevice\n"); + fprintf(stderr,"freeHllDevice: %p \n",devMat); if (devMat != NULL) { + fprintf(stderr,"freeHllDevice: doing free(s) %p\n",devMat); freeRemoteBuffer(devMat->rS); freeRemoteBuffer(devMat->diag); freeRemoteBuffer(devMat->rP); freeRemoteBuffer(devMat->cM); free(remoteMatrix); + } else { + fprintf(stderr,"Just called FreeHllDevice on a NULL pointer!\n"); } } diff --git a/cuda/impl/psb_c_cuda_cp_hlg_from_coo.F90 b/cuda/impl/psb_c_cuda_cp_hlg_from_coo.F90 index 8b0d9f2a..927c982a 100644 --- a/cuda/impl/psb_c_cuda_cp_hlg_from_coo.F90 +++ b/cuda/impl/psb_c_cuda_cp_hlg_from_coo.F90 @@ -73,12 +73,14 @@ subroutine psb_c_cuda_cp_hlg_from_coo(a,b,info) if (debug)write(0,*) ' From psi_compute_hckoff:',noffs,isz,a%hkoffs(1:min(10,noffs+1)) if (c_associated(a%deviceMat)) then + call trackCudafree(' c_hlg ',a%sizeof()) call freeHllDevice(a%deviceMat) endif info = FallochllDevice(a%deviceMat,hksz,nr,nza,isz,spgpu_type_double,1) if (info == 0) info = psi_CopyCooToHlg(nr,nc,nza, hksz,noffs,isz,& & a%irn,a%hkoffs,idisp,b%ja, b%val, a%deviceMat) call a%set_dev() + call trackCudaAlloc(' c_hlg ',a%sizeof()) else ! This is to guarantee tmp%is_by_rows() call b%cp_to_coo(tmp,info) @@ -95,7 +97,8 @@ subroutine psb_c_cuda_cp_hlg_from_coo(a,b,info) end if if (debug)write(0,*) ' From psi_compute_hckoff:',noffs,isz,a%hkoffs(1:min(10,noffs+1)) - if (c_associated(a%deviceMat)) then + if (c_associated(a%deviceMat)) then + call trackCudaFree(' c_hlg ',a%sizeof()) call freeHllDevice(a%deviceMat) endif info = FallochllDevice(a%deviceMat,hksz,nr,nza,isz,spgpu_type_double,1) @@ -104,6 +107,7 @@ subroutine psb_c_cuda_cp_hlg_from_coo(a,b,info) call tmp%free() call a%set_dev() + call trackCudaAlloc(' c_hlg ',a%sizeof()) end if if (info /= 0) goto 9999 diff --git a/cuda/impl/psb_c_cuda_csrg_to_gpu.F90 b/cuda/impl/psb_c_cuda_csrg_to_gpu.F90 index cc3fbaaf..f027302b 100644 --- a/cuda/impl/psb_c_cuda_csrg_to_gpu.F90 +++ b/cuda/impl/psb_c_cuda_csrg_to_gpu.F90 @@ -33,6 +33,7 @@ subroutine psb_c_cuda_csrg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use cusparse_mod use psb_c_cuda_csrg_mat_mod, psb_protect_name => psb_c_cuda_csrg_to_gpu implicit none @@ -52,7 +53,8 @@ subroutine psb_c_cuda_csrg_to_gpu(a,info,nzrm) m = a%get_nrows() n = a%get_ncols() nz = a%get_nzeros() - if (c_associated(a%deviceMat%Mat)) then + if (c_associated(a%deviceMat%Mat)) then + call trackCudaFree(' c_csrg ',a%sizeof()) info = CSRGDeviceFree(a%deviceMat) end if #if (CUDA_SHORT_VERSION <= 10 ) @@ -369,6 +371,7 @@ subroutine psb_c_cuda_csrg_to_gpu(a,info,nzrm) endif #endif + call trackCudaAlloc(' c_csrg ',a%sizeof()) call a%set_sync() if (info /= 0) then diff --git a/cuda/impl/psb_c_cuda_diag_to_gpu.F90 b/cuda/impl/psb_c_cuda_diag_to_gpu.F90 index 88bbd8b5..0b9dc139 100644 --- a/cuda/impl/psb_c_cuda_diag_to_gpu.F90 +++ b/cuda/impl/psb_c_cuda_diag_to_gpu.F90 @@ -33,6 +33,7 @@ subroutine psb_c_cuda_diag_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use diagdev_mod use psb_vectordev_mod use psb_c_cuda_diag_mat_mod, psb_protect_name => psb_c_cuda_diag_to_gpu @@ -55,12 +56,14 @@ subroutine psb_c_cuda_diag_to_gpu(a,info,nzrm) !allocsize = a%get_size() !write(*,*) 'Create the DIAG matrix' gpu_parms = FgetDiagDeviceParams(n,c,d,spgpu_type_complex_float) - if (c_associated(a%deviceMat)) then - call freeDiagDevice(a%deviceMat) + if (c_associated(a%deviceMat)) then + call trackCudaFree(' c_diag ',a%sizeof()) + call freeDiagDevice(a%deviceMat) endif info = FallocDiagDevice(a%deviceMat,n,c,d,spgpu_type_complex_float) if (info == 0) info = & & writeDiagDevice(a%deviceMat,a%data,a%offset,n) + call trackCudaAlloc(' c_diag ',a%sizeof()) ! if (info /= 0) goto 9999 end subroutine psb_c_cuda_diag_to_gpu diff --git a/cuda/impl/psb_c_cuda_elg_to_gpu.F90 b/cuda/impl/psb_c_cuda_elg_to_gpu.F90 index 495207c7..a6f93957 100644 --- a/cuda/impl/psb_c_cuda_elg_to_gpu.F90 +++ b/cuda/impl/psb_c_cuda_elg_to_gpu.F90 @@ -32,6 +32,7 @@ subroutine psb_c_cuda_elg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use elldev_mod use psb_vectordev_mod use psb_c_cuda_elg_mat_mod, psb_protect_name => psb_c_cuda_elg_to_gpu @@ -64,7 +65,8 @@ subroutine psb_c_cuda_elg_to_gpu(a,info,nzrm) end if if ((pitch /= gpu_parms%pitch).or.(maxrowsize /= gpu_parms%maxRowSize)) then - if (c_associated(a%deviceMat)) then + if (c_associated(a%deviceMat)) then + call trackCudaAlloc(' c_elg ',a%sizeof()) call freeEllDevice(a%deviceMat) endif info = FallocEllDevice(a%deviceMat,m,nzm,nzt,n,spgpu_type_complex_float,1) @@ -79,6 +81,7 @@ subroutine psb_c_cuda_elg_to_gpu(a,info,nzrm) end if if (info == 0) info = & & writeEllDevice(a%deviceMat,a%val,a%ja,size(a%ja,1),a%irn,a%idiag) + call trackCudaAlloc(' c_elg ',a%sizeof()) call a%set_sync() end subroutine psb_c_cuda_elg_to_gpu diff --git a/cuda/impl/psb_c_cuda_hdiag_to_gpu.F90 b/cuda/impl/psb_c_cuda_hdiag_to_gpu.F90 index 8d1b61a1..a2977952 100644 --- a/cuda/impl/psb_c_cuda_hdiag_to_gpu.F90 +++ b/cuda/impl/psb_c_cuda_hdiag_to_gpu.F90 @@ -32,6 +32,7 @@ subroutine psb_c_cuda_hdiag_to_gpu(a,info) use psb_base_mod + use psb_cuda_env_mod use hdiagdev_mod use psb_vectordev_mod use psb_c_cuda_hdiag_mat_mod, psb_protect_name => psb_c_cuda_hdiag_to_gpu @@ -65,12 +66,13 @@ subroutine psb_c_cuda_hdiag_to_gpu(a,info) end if if (c_associated(a%deviceMat)) then - call freeHdiagDevice(a%deviceMat) + call trackCudaFree(' c_hdiag ',a%sizeof()) + call freeHdiagDevice(a%deviceMat) endif info = FAllocHdiagDevice(a%deviceMat,nr,nc,& & allocheight,hacksize,hackCount,spgpu_type_double) if (info == 0) info = & & writeHdiagDevice(a%deviceMat,a%val,a%diaOffsets,a%hackOffsets) - + call trackCudaAlloc(' c_hdiag ',a%sizeof()) end subroutine psb_c_cuda_hdiag_to_gpu diff --git a/cuda/impl/psb_c_cuda_hlg_to_gpu.F90 b/cuda/impl/psb_c_cuda_hlg_to_gpu.F90 index d7d179e7..5e800f9d 100644 --- a/cuda/impl/psb_c_cuda_hlg_to_gpu.F90 +++ b/cuda/impl/psb_c_cuda_hlg_to_gpu.F90 @@ -32,6 +32,7 @@ subroutine psb_c_cuda_hlg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use hlldev_mod use psb_vectordev_mod use psb_c_cuda_hlg_mat_mod, psb_protect_name => psb_c_cuda_hlg_to_gpu @@ -51,11 +52,13 @@ subroutine psb_c_cuda_hlg_to_gpu(a,info,nzrm) allocsize = a%get_size() nza = a%get_nzeros() if (c_associated(a%deviceMat)) then - call freehllDevice(a%deviceMat) + call trackCudaFree(' to_gpu c_hlg ',a%sizeof()) + call freehllDevice(a%deviceMat) endif info = FallochllDevice(a%deviceMat,a%hksz,n,nza,allocsize,spgpu_type_complex_float,1) if (info == 0) info = & & writehllDevice(a%deviceMat,a%val,a%ja,a%hkoffs,a%irn,a%idiag) + call trackCudaAlloc(' c_hlg ',a%sizeof()) ! if (info /= 0) goto 9999 end subroutine psb_c_cuda_hlg_to_gpu diff --git a/cuda/impl/psb_c_cuda_hybg_to_gpu.F90 b/cuda/impl/psb_c_cuda_hybg_to_gpu.F90 index 15a65abc..631145e8 100644 --- a/cuda/impl/psb_c_cuda_hybg_to_gpu.F90 +++ b/cuda/impl/psb_c_cuda_hybg_to_gpu.F90 @@ -33,6 +33,7 @@ subroutine psb_c_cuda_hybg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use cusparse_mod use psb_c_cuda_hybg_mat_mod, psb_protect_name => psb_c_cuda_hybg_to_gpu implicit none @@ -53,6 +54,7 @@ subroutine psb_c_cuda_hybg_to_gpu(a,info,nzrm) n = a%get_ncols() nz = a%get_nzeros() if (c_associated(a%deviceMat%Mat)) then + call trackCudaFree(' c_hybg ',a%sizeof()) info = HYBGDeviceFree(a%deviceMat) end if if (a%is_unit()) then @@ -139,6 +141,7 @@ subroutine psb_c_cuda_hybg_to_gpu(a,info,nzrm) info = HYBGDeviceHybsmAnalysis(a%deviceMat) end if + call trackCudaAlloc(' c_hybg ',a%sizeof()) if (info /= 0) then write(0,*) 'Error in HYBG_TO_GPU ',info diff --git a/cuda/impl/psb_d_cuda_cp_hlg_from_coo.F90 b/cuda/impl/psb_d_cuda_cp_hlg_from_coo.F90 index 34b999a9..e228feeb 100644 --- a/cuda/impl/psb_d_cuda_cp_hlg_from_coo.F90 +++ b/cuda/impl/psb_d_cuda_cp_hlg_from_coo.F90 @@ -73,12 +73,14 @@ subroutine psb_d_cuda_cp_hlg_from_coo(a,b,info) if (debug)write(0,*) ' From psi_compute_hckoff:',noffs,isz,a%hkoffs(1:min(10,noffs+1)) if (c_associated(a%deviceMat)) then + call trackCudafree(' d_hlg ',a%sizeof()) call freeHllDevice(a%deviceMat) endif info = FallochllDevice(a%deviceMat,hksz,nr,nza,isz,spgpu_type_double,1) if (info == 0) info = psi_CopyCooToHlg(nr,nc,nza, hksz,noffs,isz,& & a%irn,a%hkoffs,idisp,b%ja, b%val, a%deviceMat) call a%set_dev() + call trackCudaAlloc(' d_hlg ',a%sizeof()) else ! This is to guarantee tmp%is_by_rows() call b%cp_to_coo(tmp,info) @@ -95,7 +97,8 @@ subroutine psb_d_cuda_cp_hlg_from_coo(a,b,info) end if if (debug)write(0,*) ' From psi_compute_hckoff:',noffs,isz,a%hkoffs(1:min(10,noffs+1)) - if (c_associated(a%deviceMat)) then + if (c_associated(a%deviceMat)) then + call trackCudaFree(' d_hlg ',a%sizeof()) call freeHllDevice(a%deviceMat) endif info = FallochllDevice(a%deviceMat,hksz,nr,nza,isz,spgpu_type_double,1) @@ -104,6 +107,7 @@ subroutine psb_d_cuda_cp_hlg_from_coo(a,b,info) call tmp%free() call a%set_dev() + call trackCudaAlloc(' d_hlg ',a%sizeof()) end if if (info /= 0) goto 9999 diff --git a/cuda/impl/psb_d_cuda_csrg_to_gpu.F90 b/cuda/impl/psb_d_cuda_csrg_to_gpu.F90 index d7a1b1e7..1fd1fca9 100644 --- a/cuda/impl/psb_d_cuda_csrg_to_gpu.F90 +++ b/cuda/impl/psb_d_cuda_csrg_to_gpu.F90 @@ -33,6 +33,7 @@ subroutine psb_d_cuda_csrg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use cusparse_mod use psb_d_cuda_csrg_mat_mod, psb_protect_name => psb_d_cuda_csrg_to_gpu implicit none @@ -52,7 +53,8 @@ subroutine psb_d_cuda_csrg_to_gpu(a,info,nzrm) m = a%get_nrows() n = a%get_ncols() nz = a%get_nzeros() - if (c_associated(a%deviceMat%Mat)) then + if (c_associated(a%deviceMat%Mat)) then + call trackCudaFree(' d_csrg ',a%sizeof()) info = CSRGDeviceFree(a%deviceMat) end if #if (CUDA_SHORT_VERSION <= 10 ) @@ -369,6 +371,7 @@ subroutine psb_d_cuda_csrg_to_gpu(a,info,nzrm) endif #endif + call trackCudaAlloc(' d_csrg ',a%sizeof()) call a%set_sync() if (info /= 0) then diff --git a/cuda/impl/psb_d_cuda_diag_to_gpu.F90 b/cuda/impl/psb_d_cuda_diag_to_gpu.F90 index 9b648962..ca194dc1 100644 --- a/cuda/impl/psb_d_cuda_diag_to_gpu.F90 +++ b/cuda/impl/psb_d_cuda_diag_to_gpu.F90 @@ -33,6 +33,7 @@ subroutine psb_d_cuda_diag_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use diagdev_mod use psb_vectordev_mod use psb_d_cuda_diag_mat_mod, psb_protect_name => psb_d_cuda_diag_to_gpu @@ -55,12 +56,14 @@ subroutine psb_d_cuda_diag_to_gpu(a,info,nzrm) !allocsize = a%get_size() !write(*,*) 'Create the DIAG matrix' gpu_parms = FgetDiagDeviceParams(n,c,d,spgpu_type_double) - if (c_associated(a%deviceMat)) then - call freeDiagDevice(a%deviceMat) + if (c_associated(a%deviceMat)) then + call trackCudaFree(' d_diag ',a%sizeof()) + call freeDiagDevice(a%deviceMat) endif info = FallocDiagDevice(a%deviceMat,n,c,d,spgpu_type_double) if (info == 0) info = & & writeDiagDevice(a%deviceMat,a%data,a%offset,n) + call trackCudaAlloc(' d_diag ',a%sizeof()) ! if (info /= 0) goto 9999 end subroutine psb_d_cuda_diag_to_gpu diff --git a/cuda/impl/psb_d_cuda_elg_to_gpu.F90 b/cuda/impl/psb_d_cuda_elg_to_gpu.F90 index 9b88af69..f162dc35 100644 --- a/cuda/impl/psb_d_cuda_elg_to_gpu.F90 +++ b/cuda/impl/psb_d_cuda_elg_to_gpu.F90 @@ -32,6 +32,7 @@ subroutine psb_d_cuda_elg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use elldev_mod use psb_vectordev_mod use psb_d_cuda_elg_mat_mod, psb_protect_name => psb_d_cuda_elg_to_gpu @@ -64,7 +65,8 @@ subroutine psb_d_cuda_elg_to_gpu(a,info,nzrm) end if if ((pitch /= gpu_parms%pitch).or.(maxrowsize /= gpu_parms%maxRowSize)) then - if (c_associated(a%deviceMat)) then + if (c_associated(a%deviceMat)) then + call trackCudaAlloc(' d_elg ',a%sizeof()) call freeEllDevice(a%deviceMat) endif info = FallocEllDevice(a%deviceMat,m,nzm,nzt,n,spgpu_type_double,1) @@ -79,6 +81,7 @@ subroutine psb_d_cuda_elg_to_gpu(a,info,nzrm) end if if (info == 0) info = & & writeEllDevice(a%deviceMat,a%val,a%ja,size(a%ja,1),a%irn,a%idiag) + call trackCudaAlloc(' d_elg ',a%sizeof()) call a%set_sync() end subroutine psb_d_cuda_elg_to_gpu diff --git a/cuda/impl/psb_d_cuda_hdiag_to_gpu.F90 b/cuda/impl/psb_d_cuda_hdiag_to_gpu.F90 index 73c4a47d..063b8730 100644 --- a/cuda/impl/psb_d_cuda_hdiag_to_gpu.F90 +++ b/cuda/impl/psb_d_cuda_hdiag_to_gpu.F90 @@ -32,6 +32,7 @@ subroutine psb_d_cuda_hdiag_to_gpu(a,info) use psb_base_mod + use psb_cuda_env_mod use hdiagdev_mod use psb_vectordev_mod use psb_d_cuda_hdiag_mat_mod, psb_protect_name => psb_d_cuda_hdiag_to_gpu @@ -65,12 +66,13 @@ subroutine psb_d_cuda_hdiag_to_gpu(a,info) end if if (c_associated(a%deviceMat)) then - call freeHdiagDevice(a%deviceMat) + call trackCudaFree(' d_hdiag ',a%sizeof()) + call freeHdiagDevice(a%deviceMat) endif info = FAllocHdiagDevice(a%deviceMat,nr,nc,& & allocheight,hacksize,hackCount,spgpu_type_double) if (info == 0) info = & & writeHdiagDevice(a%deviceMat,a%val,a%diaOffsets,a%hackOffsets) - + call trackCudaAlloc(' d_hdiag ',a%sizeof()) end subroutine psb_d_cuda_hdiag_to_gpu diff --git a/cuda/impl/psb_d_cuda_hlg_to_gpu.F90 b/cuda/impl/psb_d_cuda_hlg_to_gpu.F90 index 566c94bd..04cac6a5 100644 --- a/cuda/impl/psb_d_cuda_hlg_to_gpu.F90 +++ b/cuda/impl/psb_d_cuda_hlg_to_gpu.F90 @@ -32,6 +32,7 @@ subroutine psb_d_cuda_hlg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use hlldev_mod use psb_vectordev_mod use psb_d_cuda_hlg_mat_mod, psb_protect_name => psb_d_cuda_hlg_to_gpu @@ -51,11 +52,13 @@ subroutine psb_d_cuda_hlg_to_gpu(a,info,nzrm) allocsize = a%get_size() nza = a%get_nzeros() if (c_associated(a%deviceMat)) then - call freehllDevice(a%deviceMat) + call trackCudaFree(' to_gpu d_hlg ',a%sizeof()) + call freehllDevice(a%deviceMat) endif info = FallochllDevice(a%deviceMat,a%hksz,n,nza,allocsize,spgpu_type_double,1) if (info == 0) info = & & writehllDevice(a%deviceMat,a%val,a%ja,a%hkoffs,a%irn,a%idiag) + call trackCudaAlloc(' d_hlg ',a%sizeof()) ! if (info /= 0) goto 9999 end subroutine psb_d_cuda_hlg_to_gpu diff --git a/cuda/impl/psb_d_cuda_hybg_to_gpu.F90 b/cuda/impl/psb_d_cuda_hybg_to_gpu.F90 index 7b8e2e5f..50849ab6 100644 --- a/cuda/impl/psb_d_cuda_hybg_to_gpu.F90 +++ b/cuda/impl/psb_d_cuda_hybg_to_gpu.F90 @@ -33,6 +33,7 @@ subroutine psb_d_cuda_hybg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use cusparse_mod use psb_d_cuda_hybg_mat_mod, psb_protect_name => psb_d_cuda_hybg_to_gpu implicit none @@ -53,6 +54,7 @@ subroutine psb_d_cuda_hybg_to_gpu(a,info,nzrm) n = a%get_ncols() nz = a%get_nzeros() if (c_associated(a%deviceMat%Mat)) then + call trackCudaFree(' d_hybg ',a%sizeof()) info = HYBGDeviceFree(a%deviceMat) end if if (a%is_unit()) then @@ -139,6 +141,7 @@ subroutine psb_d_cuda_hybg_to_gpu(a,info,nzrm) info = HYBGDeviceHybsmAnalysis(a%deviceMat) end if + call trackCudaAlloc(' d_hybg ',a%sizeof()) if (info /= 0) then write(0,*) 'Error in HYBG_TO_GPU ',info diff --git a/cuda/impl/psb_s_cuda_cp_hlg_from_coo.F90 b/cuda/impl/psb_s_cuda_cp_hlg_from_coo.F90 index c254b15a..9a9e7d06 100644 --- a/cuda/impl/psb_s_cuda_cp_hlg_from_coo.F90 +++ b/cuda/impl/psb_s_cuda_cp_hlg_from_coo.F90 @@ -73,12 +73,14 @@ subroutine psb_s_cuda_cp_hlg_from_coo(a,b,info) if (debug)write(0,*) ' From psi_compute_hckoff:',noffs,isz,a%hkoffs(1:min(10,noffs+1)) if (c_associated(a%deviceMat)) then + call trackCudafree(' s_hlg ',a%sizeof()) call freeHllDevice(a%deviceMat) endif info = FallochllDevice(a%deviceMat,hksz,nr,nza,isz,spgpu_type_double,1) if (info == 0) info = psi_CopyCooToHlg(nr,nc,nza, hksz,noffs,isz,& & a%irn,a%hkoffs,idisp,b%ja, b%val, a%deviceMat) call a%set_dev() + call trackCudaAlloc(' s_hlg ',a%sizeof()) else ! This is to guarantee tmp%is_by_rows() call b%cp_to_coo(tmp,info) @@ -95,7 +97,8 @@ subroutine psb_s_cuda_cp_hlg_from_coo(a,b,info) end if if (debug)write(0,*) ' From psi_compute_hckoff:',noffs,isz,a%hkoffs(1:min(10,noffs+1)) - if (c_associated(a%deviceMat)) then + if (c_associated(a%deviceMat)) then + call trackCudaFree(' s_hlg ',a%sizeof()) call freeHllDevice(a%deviceMat) endif info = FallochllDevice(a%deviceMat,hksz,nr,nza,isz,spgpu_type_double,1) @@ -104,6 +107,7 @@ subroutine psb_s_cuda_cp_hlg_from_coo(a,b,info) call tmp%free() call a%set_dev() + call trackCudaAlloc(' s_hlg ',a%sizeof()) end if if (info /= 0) goto 9999 diff --git a/cuda/impl/psb_s_cuda_csrg_to_gpu.F90 b/cuda/impl/psb_s_cuda_csrg_to_gpu.F90 index cc5b9c8d..85733f84 100644 --- a/cuda/impl/psb_s_cuda_csrg_to_gpu.F90 +++ b/cuda/impl/psb_s_cuda_csrg_to_gpu.F90 @@ -33,6 +33,7 @@ subroutine psb_s_cuda_csrg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use cusparse_mod use psb_s_cuda_csrg_mat_mod, psb_protect_name => psb_s_cuda_csrg_to_gpu implicit none @@ -52,7 +53,8 @@ subroutine psb_s_cuda_csrg_to_gpu(a,info,nzrm) m = a%get_nrows() n = a%get_ncols() nz = a%get_nzeros() - if (c_associated(a%deviceMat%Mat)) then + if (c_associated(a%deviceMat%Mat)) then + call trackCudaFree(' s_csrg ',a%sizeof()) info = CSRGDeviceFree(a%deviceMat) end if #if (CUDA_SHORT_VERSION <= 10 ) @@ -369,6 +371,7 @@ subroutine psb_s_cuda_csrg_to_gpu(a,info,nzrm) endif #endif + call trackCudaAlloc(' s_csrg ',a%sizeof()) call a%set_sync() if (info /= 0) then diff --git a/cuda/impl/psb_s_cuda_diag_to_gpu.F90 b/cuda/impl/psb_s_cuda_diag_to_gpu.F90 index c8578e75..f4f62b10 100644 --- a/cuda/impl/psb_s_cuda_diag_to_gpu.F90 +++ b/cuda/impl/psb_s_cuda_diag_to_gpu.F90 @@ -33,6 +33,7 @@ subroutine psb_s_cuda_diag_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use diagdev_mod use psb_vectordev_mod use psb_s_cuda_diag_mat_mod, psb_protect_name => psb_s_cuda_diag_to_gpu @@ -55,12 +56,14 @@ subroutine psb_s_cuda_diag_to_gpu(a,info,nzrm) !allocsize = a%get_size() !write(*,*) 'Create the DIAG matrix' gpu_parms = FgetDiagDeviceParams(n,c,d,spgpu_type_float) - if (c_associated(a%deviceMat)) then - call freeDiagDevice(a%deviceMat) + if (c_associated(a%deviceMat)) then + call trackCudaFree(' s_diag ',a%sizeof()) + call freeDiagDevice(a%deviceMat) endif info = FallocDiagDevice(a%deviceMat,n,c,d,spgpu_type_float) if (info == 0) info = & & writeDiagDevice(a%deviceMat,a%data,a%offset,n) + call trackCudaAlloc(' s_diag ',a%sizeof()) ! if (info /= 0) goto 9999 end subroutine psb_s_cuda_diag_to_gpu diff --git a/cuda/impl/psb_s_cuda_elg_to_gpu.F90 b/cuda/impl/psb_s_cuda_elg_to_gpu.F90 index 9c16ea8d..cc5ac789 100644 --- a/cuda/impl/psb_s_cuda_elg_to_gpu.F90 +++ b/cuda/impl/psb_s_cuda_elg_to_gpu.F90 @@ -32,6 +32,7 @@ subroutine psb_s_cuda_elg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use elldev_mod use psb_vectordev_mod use psb_s_cuda_elg_mat_mod, psb_protect_name => psb_s_cuda_elg_to_gpu @@ -64,7 +65,8 @@ subroutine psb_s_cuda_elg_to_gpu(a,info,nzrm) end if if ((pitch /= gpu_parms%pitch).or.(maxrowsize /= gpu_parms%maxRowSize)) then - if (c_associated(a%deviceMat)) then + if (c_associated(a%deviceMat)) then + call trackCudaAlloc(' s_elg ',a%sizeof()) call freeEllDevice(a%deviceMat) endif info = FallocEllDevice(a%deviceMat,m,nzm,nzt,n,spgpu_type_float,1) @@ -79,6 +81,7 @@ subroutine psb_s_cuda_elg_to_gpu(a,info,nzrm) end if if (info == 0) info = & & writeEllDevice(a%deviceMat,a%val,a%ja,size(a%ja,1),a%irn,a%idiag) + call trackCudaAlloc(' s_elg ',a%sizeof()) call a%set_sync() end subroutine psb_s_cuda_elg_to_gpu diff --git a/cuda/impl/psb_s_cuda_hdiag_to_gpu.F90 b/cuda/impl/psb_s_cuda_hdiag_to_gpu.F90 index bc3fa325..2074bb68 100644 --- a/cuda/impl/psb_s_cuda_hdiag_to_gpu.F90 +++ b/cuda/impl/psb_s_cuda_hdiag_to_gpu.F90 @@ -32,6 +32,7 @@ subroutine psb_s_cuda_hdiag_to_gpu(a,info) use psb_base_mod + use psb_cuda_env_mod use hdiagdev_mod use psb_vectordev_mod use psb_s_cuda_hdiag_mat_mod, psb_protect_name => psb_s_cuda_hdiag_to_gpu @@ -65,12 +66,13 @@ subroutine psb_s_cuda_hdiag_to_gpu(a,info) end if if (c_associated(a%deviceMat)) then - call freeHdiagDevice(a%deviceMat) + call trackCudaFree(' s_hdiag ',a%sizeof()) + call freeHdiagDevice(a%deviceMat) endif info = FAllocHdiagDevice(a%deviceMat,nr,nc,& & allocheight,hacksize,hackCount,spgpu_type_double) if (info == 0) info = & & writeHdiagDevice(a%deviceMat,a%val,a%diaOffsets,a%hackOffsets) - + call trackCudaAlloc(' s_hdiag ',a%sizeof()) end subroutine psb_s_cuda_hdiag_to_gpu diff --git a/cuda/impl/psb_s_cuda_hlg_to_gpu.F90 b/cuda/impl/psb_s_cuda_hlg_to_gpu.F90 index 91cfd5ad..b2913f2d 100644 --- a/cuda/impl/psb_s_cuda_hlg_to_gpu.F90 +++ b/cuda/impl/psb_s_cuda_hlg_to_gpu.F90 @@ -32,6 +32,7 @@ subroutine psb_s_cuda_hlg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use hlldev_mod use psb_vectordev_mod use psb_s_cuda_hlg_mat_mod, psb_protect_name => psb_s_cuda_hlg_to_gpu @@ -51,11 +52,13 @@ subroutine psb_s_cuda_hlg_to_gpu(a,info,nzrm) allocsize = a%get_size() nza = a%get_nzeros() if (c_associated(a%deviceMat)) then - call freehllDevice(a%deviceMat) + call trackCudaFree(' to_gpu s_hlg ',a%sizeof()) + call freehllDevice(a%deviceMat) endif info = FallochllDevice(a%deviceMat,a%hksz,n,nza,allocsize,spgpu_type_float,1) if (info == 0) info = & & writehllDevice(a%deviceMat,a%val,a%ja,a%hkoffs,a%irn,a%idiag) + call trackCudaAlloc(' s_hlg ',a%sizeof()) ! if (info /= 0) goto 9999 end subroutine psb_s_cuda_hlg_to_gpu diff --git a/cuda/impl/psb_s_cuda_hybg_to_gpu.F90 b/cuda/impl/psb_s_cuda_hybg_to_gpu.F90 index 168a0981..c1369a4b 100644 --- a/cuda/impl/psb_s_cuda_hybg_to_gpu.F90 +++ b/cuda/impl/psb_s_cuda_hybg_to_gpu.F90 @@ -33,6 +33,7 @@ subroutine psb_s_cuda_hybg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use cusparse_mod use psb_s_cuda_hybg_mat_mod, psb_protect_name => psb_s_cuda_hybg_to_gpu implicit none @@ -53,6 +54,7 @@ subroutine psb_s_cuda_hybg_to_gpu(a,info,nzrm) n = a%get_ncols() nz = a%get_nzeros() if (c_associated(a%deviceMat%Mat)) then + call trackCudaFree(' s_hybg ',a%sizeof()) info = HYBGDeviceFree(a%deviceMat) end if if (a%is_unit()) then @@ -139,6 +141,7 @@ subroutine psb_s_cuda_hybg_to_gpu(a,info,nzrm) info = HYBGDeviceHybsmAnalysis(a%deviceMat) end if + call trackCudaAlloc(' s_hybg ',a%sizeof()) if (info /= 0) then write(0,*) 'Error in HYBG_TO_GPU ',info diff --git a/cuda/impl/psb_z_cuda_cp_hlg_from_coo.F90 b/cuda/impl/psb_z_cuda_cp_hlg_from_coo.F90 index f7be0835..56fc3116 100644 --- a/cuda/impl/psb_z_cuda_cp_hlg_from_coo.F90 +++ b/cuda/impl/psb_z_cuda_cp_hlg_from_coo.F90 @@ -73,12 +73,14 @@ subroutine psb_z_cuda_cp_hlg_from_coo(a,b,info) if (debug)write(0,*) ' From psi_compute_hckoff:',noffs,isz,a%hkoffs(1:min(10,noffs+1)) if (c_associated(a%deviceMat)) then + call trackCudafree(' z_hlg ',a%sizeof()) call freeHllDevice(a%deviceMat) endif info = FallochllDevice(a%deviceMat,hksz,nr,nza,isz,spgpu_type_double,1) if (info == 0) info = psi_CopyCooToHlg(nr,nc,nza, hksz,noffs,isz,& & a%irn,a%hkoffs,idisp,b%ja, b%val, a%deviceMat) call a%set_dev() + call trackCudaAlloc(' z_hlg ',a%sizeof()) else ! This is to guarantee tmp%is_by_rows() call b%cp_to_coo(tmp,info) @@ -95,7 +97,8 @@ subroutine psb_z_cuda_cp_hlg_from_coo(a,b,info) end if if (debug)write(0,*) ' From psi_compute_hckoff:',noffs,isz,a%hkoffs(1:min(10,noffs+1)) - if (c_associated(a%deviceMat)) then + if (c_associated(a%deviceMat)) then + call trackCudaFree(' z_hlg ',a%sizeof()) call freeHllDevice(a%deviceMat) endif info = FallochllDevice(a%deviceMat,hksz,nr,nza,isz,spgpu_type_double,1) @@ -104,6 +107,7 @@ subroutine psb_z_cuda_cp_hlg_from_coo(a,b,info) call tmp%free() call a%set_dev() + call trackCudaAlloc(' z_hlg ',a%sizeof()) end if if (info /= 0) goto 9999 diff --git a/cuda/impl/psb_z_cuda_csrg_to_gpu.F90 b/cuda/impl/psb_z_cuda_csrg_to_gpu.F90 index 56943f37..b101b1ef 100644 --- a/cuda/impl/psb_z_cuda_csrg_to_gpu.F90 +++ b/cuda/impl/psb_z_cuda_csrg_to_gpu.F90 @@ -33,6 +33,7 @@ subroutine psb_z_cuda_csrg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use cusparse_mod use psb_z_cuda_csrg_mat_mod, psb_protect_name => psb_z_cuda_csrg_to_gpu implicit none @@ -52,7 +53,8 @@ subroutine psb_z_cuda_csrg_to_gpu(a,info,nzrm) m = a%get_nrows() n = a%get_ncols() nz = a%get_nzeros() - if (c_associated(a%deviceMat%Mat)) then + if (c_associated(a%deviceMat%Mat)) then + call trackCudaFree(' z_csrg ',a%sizeof()) info = CSRGDeviceFree(a%deviceMat) end if #if (CUDA_SHORT_VERSION <= 10 ) @@ -369,6 +371,7 @@ subroutine psb_z_cuda_csrg_to_gpu(a,info,nzrm) endif #endif + call trackCudaAlloc(' z_csrg ',a%sizeof()) call a%set_sync() if (info /= 0) then diff --git a/cuda/impl/psb_z_cuda_diag_to_gpu.F90 b/cuda/impl/psb_z_cuda_diag_to_gpu.F90 index 672ce938..f2ff306e 100644 --- a/cuda/impl/psb_z_cuda_diag_to_gpu.F90 +++ b/cuda/impl/psb_z_cuda_diag_to_gpu.F90 @@ -33,6 +33,7 @@ subroutine psb_z_cuda_diag_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use diagdev_mod use psb_vectordev_mod use psb_z_cuda_diag_mat_mod, psb_protect_name => psb_z_cuda_diag_to_gpu @@ -55,12 +56,14 @@ subroutine psb_z_cuda_diag_to_gpu(a,info,nzrm) !allocsize = a%get_size() !write(*,*) 'Create the DIAG matrix' gpu_parms = FgetDiagDeviceParams(n,c,d,spgpu_type_complex_double) - if (c_associated(a%deviceMat)) then - call freeDiagDevice(a%deviceMat) + if (c_associated(a%deviceMat)) then + call trackCudaFree(' z_diag ',a%sizeof()) + call freeDiagDevice(a%deviceMat) endif info = FallocDiagDevice(a%deviceMat,n,c,d,spgpu_type_complex_double) if (info == 0) info = & & writeDiagDevice(a%deviceMat,a%data,a%offset,n) + call trackCudaAlloc(' z_diag ',a%sizeof()) ! if (info /= 0) goto 9999 end subroutine psb_z_cuda_diag_to_gpu diff --git a/cuda/impl/psb_z_cuda_elg_to_gpu.F90 b/cuda/impl/psb_z_cuda_elg_to_gpu.F90 index 3a0ecd14..021386f4 100644 --- a/cuda/impl/psb_z_cuda_elg_to_gpu.F90 +++ b/cuda/impl/psb_z_cuda_elg_to_gpu.F90 @@ -32,6 +32,7 @@ subroutine psb_z_cuda_elg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use elldev_mod use psb_vectordev_mod use psb_z_cuda_elg_mat_mod, psb_protect_name => psb_z_cuda_elg_to_gpu @@ -64,7 +65,8 @@ subroutine psb_z_cuda_elg_to_gpu(a,info,nzrm) end if if ((pitch /= gpu_parms%pitch).or.(maxrowsize /= gpu_parms%maxRowSize)) then - if (c_associated(a%deviceMat)) then + if (c_associated(a%deviceMat)) then + call trackCudaAlloc(' z_elg ',a%sizeof()) call freeEllDevice(a%deviceMat) endif info = FallocEllDevice(a%deviceMat,m,nzm,nzt,n,spgpu_type_complex_double,1) @@ -79,6 +81,7 @@ subroutine psb_z_cuda_elg_to_gpu(a,info,nzrm) end if if (info == 0) info = & & writeEllDevice(a%deviceMat,a%val,a%ja,size(a%ja,1),a%irn,a%idiag) + call trackCudaAlloc(' z_elg ',a%sizeof()) call a%set_sync() end subroutine psb_z_cuda_elg_to_gpu diff --git a/cuda/impl/psb_z_cuda_hdiag_to_gpu.F90 b/cuda/impl/psb_z_cuda_hdiag_to_gpu.F90 index a1140961..a81c2508 100644 --- a/cuda/impl/psb_z_cuda_hdiag_to_gpu.F90 +++ b/cuda/impl/psb_z_cuda_hdiag_to_gpu.F90 @@ -32,6 +32,7 @@ subroutine psb_z_cuda_hdiag_to_gpu(a,info) use psb_base_mod + use psb_cuda_env_mod use hdiagdev_mod use psb_vectordev_mod use psb_z_cuda_hdiag_mat_mod, psb_protect_name => psb_z_cuda_hdiag_to_gpu @@ -65,12 +66,13 @@ subroutine psb_z_cuda_hdiag_to_gpu(a,info) end if if (c_associated(a%deviceMat)) then - call freeHdiagDevice(a%deviceMat) + call trackCudaFree(' z_hdiag ',a%sizeof()) + call freeHdiagDevice(a%deviceMat) endif info = FAllocHdiagDevice(a%deviceMat,nr,nc,& & allocheight,hacksize,hackCount,spgpu_type_double) if (info == 0) info = & & writeHdiagDevice(a%deviceMat,a%val,a%diaOffsets,a%hackOffsets) - + call trackCudaAlloc(' z_hdiag ',a%sizeof()) end subroutine psb_z_cuda_hdiag_to_gpu diff --git a/cuda/impl/psb_z_cuda_hlg_to_gpu.F90 b/cuda/impl/psb_z_cuda_hlg_to_gpu.F90 index 8f81842a..431ebdc8 100644 --- a/cuda/impl/psb_z_cuda_hlg_to_gpu.F90 +++ b/cuda/impl/psb_z_cuda_hlg_to_gpu.F90 @@ -32,6 +32,7 @@ subroutine psb_z_cuda_hlg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use hlldev_mod use psb_vectordev_mod use psb_z_cuda_hlg_mat_mod, psb_protect_name => psb_z_cuda_hlg_to_gpu @@ -51,11 +52,13 @@ subroutine psb_z_cuda_hlg_to_gpu(a,info,nzrm) allocsize = a%get_size() nza = a%get_nzeros() if (c_associated(a%deviceMat)) then - call freehllDevice(a%deviceMat) + call trackCudaFree(' to_gpu z_hlg ',a%sizeof()) + call freehllDevice(a%deviceMat) endif info = FallochllDevice(a%deviceMat,a%hksz,n,nza,allocsize,spgpu_type_complex_double,1) if (info == 0) info = & & writehllDevice(a%deviceMat,a%val,a%ja,a%hkoffs,a%irn,a%idiag) + call trackCudaAlloc(' z_hlg ',a%sizeof()) ! if (info /= 0) goto 9999 end subroutine psb_z_cuda_hlg_to_gpu diff --git a/cuda/impl/psb_z_cuda_hybg_to_gpu.F90 b/cuda/impl/psb_z_cuda_hybg_to_gpu.F90 index f3a32c81..d0132a95 100644 --- a/cuda/impl/psb_z_cuda_hybg_to_gpu.F90 +++ b/cuda/impl/psb_z_cuda_hybg_to_gpu.F90 @@ -33,6 +33,7 @@ subroutine psb_z_cuda_hybg_to_gpu(a,info,nzrm) use psb_base_mod + use psb_cuda_env_mod use cusparse_mod use psb_z_cuda_hybg_mat_mod, psb_protect_name => psb_z_cuda_hybg_to_gpu implicit none @@ -53,6 +54,7 @@ subroutine psb_z_cuda_hybg_to_gpu(a,info,nzrm) n = a%get_ncols() nz = a%get_nzeros() if (c_associated(a%deviceMat%Mat)) then + call trackCudaFree(' z_hybg ',a%sizeof()) info = HYBGDeviceFree(a%deviceMat) end if if (a%is_unit()) then @@ -139,6 +141,7 @@ subroutine psb_z_cuda_hybg_to_gpu(a,info,nzrm) info = HYBGDeviceHybsmAnalysis(a%deviceMat) end if + call trackCudaAlloc(' z_hybg ',a%sizeof()) if (info /= 0) then write(0,*) 'Error in HYBG_TO_GPU ',info diff --git a/cuda/psb_c_cuda_csrg_mat_mod.F90 b/cuda/psb_c_cuda_csrg_mat_mod.F90 index 1fdeec4a..c68fa56a 100644 --- a/cuda/psb_c_cuda_csrg_mat_mod.F90 +++ b/cuda/psb_c_cuda_csrg_mat_mod.F90 @@ -35,7 +35,8 @@ module psb_c_cuda_csrg_mat_mod use iso_c_binding use psb_c_mat_mod use cusparse_mod - + use psb_cuda_env_mod + integer(psb_ipk_), parameter, private :: is_host = -1 integer(psb_ipk_), parameter, private :: is_sync = 0 integer(psb_ipk_), parameter, private :: is_dev = 1 @@ -352,6 +353,7 @@ contains class(psb_c_cuda_csrg_sparse_mat), intent(inout) :: a + call trackCudaFree(' c_csrg ',a%sizeof()) info = CSRGDeviceFree(a%deviceMat) call a%psb_c_csr_sparse_mat%free() @@ -366,6 +368,7 @@ contains type(psb_c_cuda_csrg_sparse_mat), intent(inout) :: a + call trackCudaFree(' c_csrg ',a%sizeof()) info = CSRGDeviceFree(a%deviceMat) return diff --git a/cuda/psb_c_cuda_diag_mat_mod.F90 b/cuda/psb_c_cuda_diag_mat_mod.F90 index 93ffe498..607cfc39 100644 --- a/cuda/psb_c_cuda_diag_mat_mod.F90 +++ b/cuda/psb_c_cuda_diag_mat_mod.F90 @@ -34,6 +34,7 @@ module psb_c_cuda_diag_mat_mod use iso_c_binding use psb_base_mod + use psb_cuda_env_mod use psb_c_dia_mat_mod type, extends(psb_c_dia_sparse_mat) :: psb_c_cuda_diag_sparse_mat @@ -265,8 +266,10 @@ contains integer(psb_ipk_) :: info class(psb_c_cuda_diag_sparse_mat), intent(inout) :: a - if (c_associated(a%deviceMat)) & - & call freeDiagDevice(a%deviceMat) + if (c_associated(a%deviceMat)) then + call trackCudaFree(' c_diag ',a%sizeof()) + call freeDiagDevice(a%deviceMat) + end if a%deviceMat = c_null_ptr call a%psb_c_dia_sparse_mat%free() diff --git a/cuda/psb_c_cuda_dnsg_mat_mod.F90 b/cuda/psb_c_cuda_dnsg_mat_mod.F90 index b0ca8c46..14264800 100644 --- a/cuda/psb_c_cuda_dnsg_mat_mod.F90 +++ b/cuda/psb_c_cuda_dnsg_mat_mod.F90 @@ -35,6 +35,7 @@ module psb_c_cuda_dnsg_mat_mod use iso_c_binding use psb_c_mat_mod use psb_c_dns_mat_mod + use psb_cuda_env_mod use dnsdev_mod type, extends(psb_c_dns_sparse_mat) :: psb_c_cuda_dnsg_sparse_mat @@ -251,8 +252,10 @@ contains integer(psb_ipk_) :: info class(psb_c_cuda_dnsg_sparse_mat), intent(inout) :: a - if (c_associated(a%deviceMat)) & - & call freeDnsDevice(a%deviceMat) + if (c_associated(a%deviceMat)) then + call trackCudaFree(' c_dnsg ',a%sizeof()) + call freeDnsDevice(a%deviceMat) + end if a%deviceMat = c_null_ptr call a%psb_c_dns_sparse_mat%free() diff --git a/cuda/psb_c_cuda_elg_mat_mod.F90 b/cuda/psb_c_cuda_elg_mat_mod.F90 index c9b48005..3df12815 100644 --- a/cuda/psb_c_cuda_elg_mat_mod.F90 +++ b/cuda/psb_c_cuda_elg_mat_mod.F90 @@ -35,6 +35,7 @@ module psb_c_cuda_elg_mat_mod use iso_c_binding use psb_c_mat_mod use psb_c_ell_mat_mod + use psb_cuda_env_mod use psb_i_cuda_vect_mod integer(psb_ipk_), parameter, private :: is_host = -1 @@ -369,8 +370,10 @@ contains class(psb_c_cuda_elg_sparse_mat), intent(inout) :: a - if (c_associated(a%deviceMat)) & - & call freeEllDevice(a%deviceMat) + if (c_associated(a%deviceMat)) then + call trackCudaFree(' c_elg ',a%sizeof()) + call freeEllDevice(a%deviceMat) + end if a%deviceMat = c_null_ptr call a%psb_c_ell_sparse_mat%free() call a%set_sync() diff --git a/cuda/psb_c_cuda_hdiag_mat_mod.F90 b/cuda/psb_c_cuda_hdiag_mat_mod.F90 index f06e501e..6b23987f 100644 --- a/cuda/psb_c_cuda_hdiag_mat_mod.F90 +++ b/cuda/psb_c_cuda_hdiag_mat_mod.F90 @@ -34,6 +34,7 @@ module psb_c_cuda_hdiag_mat_mod use iso_c_binding use psb_base_mod + use psb_cuda_env_mod use psb_c_hdia_mat_mod type, extends(psb_c_hdia_sparse_mat) :: psb_c_cuda_hdiag_sparse_mat @@ -243,8 +244,10 @@ contains integer(psb_ipk_) :: info class(psb_c_cuda_hdiag_sparse_mat), intent(inout) :: a - if (c_associated(a%deviceMat)) & - & call freeHdiagDevice(a%deviceMat) + if (c_associated(a%deviceMat)) then + call trackCudaFree(' c_hdiag ',a%sizeof()) + call freeHdiagDevice(a%deviceMat) + end if a%deviceMat = c_null_ptr call a%psb_c_hdia_sparse_mat%free() diff --git a/cuda/psb_c_cuda_hlg_mat_mod.F90 b/cuda/psb_c_cuda_hlg_mat_mod.F90 index e98f2474..4a0d888d 100644 --- a/cuda/psb_c_cuda_hlg_mat_mod.F90 +++ b/cuda/psb_c_cuda_hlg_mat_mod.F90 @@ -34,6 +34,7 @@ module psb_c_cuda_hlg_mat_mod use iso_c_binding use psb_c_mat_mod + use psb_cuda_env_mod use psb_c_hll_mat_mod @@ -291,8 +292,10 @@ contains integer(psb_ipk_) :: info class(psb_c_cuda_hlg_sparse_mat), intent(inout) :: a - if (c_associated(a%deviceMat)) & - & call freeHllDevice(a%deviceMat) + if (c_associated(a%deviceMat)) then + call trackCudaFree(' hlg_free c_hlg ',a%sizeof()) + call freeHllDevice(a%deviceMat) + end if a%deviceMat = c_null_ptr call a%psb_c_hll_sparse_mat%free() diff --git a/cuda/psb_c_cuda_hybg_mat_mod.F90 b/cuda/psb_c_cuda_hybg_mat_mod.F90 index 1c94bc0f..34efbf66 100644 --- a/cuda/psb_c_cuda_hybg_mat_mod.F90 +++ b/cuda/psb_c_cuda_hybg_mat_mod.F90 @@ -35,6 +35,7 @@ module psb_c_cuda_hybg_mat_mod use iso_c_binding use psb_c_mat_mod + use psb_cuda_env_mod use cusparse_mod type, extends(psb_c_csr_sparse_mat) :: psb_c_cuda_hybg_sparse_mat @@ -265,6 +266,7 @@ contains integer(psb_ipk_) :: info class(psb_c_cuda_hybg_sparse_mat), intent(inout) :: a + call trackCudaFree(' c_hybg ',a%sizeof()) info = HYBGDeviceFree(a%deviceMat) call a%psb_c_csr_sparse_mat%free() diff --git a/cuda/psb_c_cuda_vect_mod.F90 b/cuda/psb_c_cuda_vect_mod.F90 index 38480e34..fb0b8dc7 100644 --- a/cuda/psb_c_cuda_vect_mod.F90 +++ b/cuda/psb_c_cuda_vect_mod.F90 @@ -30,7 +30,7 @@ ! -module psb_c_cuda_vect_mod +module psb_c_cuda_vect_mod use iso_c_binding use psb_const_mod use psb_error_mod @@ -728,12 +728,14 @@ contains end if if (c_associated(x%deviceVect)) then nd = getMultiVecDeviceSize(x%deviceVect) - if (nd < nh ) then + if (nd < nh ) then + call trackCudaFree(' c_vect_cuda ',x%sizeof()) call freeMultiVecDevice(x%deviceVect) x%deviceVect=c_null_ptr end if end if - if (.not.c_associated(x%deviceVect)) then + if (.not.c_associated(x%deviceVect)) then + call trackCudaAlloc(' c_vect_cuda ',x%sizeof()) info = FallocMultiVecDevice(x%deviceVect,1,nh,spgpu_type_complex_float) if (info /= 0) then if (info == spgpu_outofmem) then @@ -755,6 +757,7 @@ contains if (x%is_host()) then if (.not.c_associated(x%deviceVect)) then n = size(x%v) + call trackCudaAlloc(' c_vect_cuda ',x%sizeof()) info = FallocMultiVecDevice(x%deviceVect,1,n,spgpu_type_complex_float) end if if (info == 0) & @@ -790,6 +793,7 @@ contains if (allocated(x%v)) deallocate(x%v, stat=info) if (c_associated(x%deviceVect)) then !!$ write(0,*)'d_cuda_free Calling freeMultiVecDevice' + call trackCudaFree(' c_vect_cuda ',x%sizeof()) call freeMultiVecDevice(x%deviceVect) x%deviceVect=c_null_ptr end if @@ -1972,12 +1976,14 @@ contains md = getMultiVecDevicePitch(x%deviceVect) nd = getMultiVecDeviceCount(x%deviceVect) if ((md < mh).or.(ndpitch_ = (((params->size*sizeof(int) + 255)/256)*256)/sizeof(int); //fprintf(stderr,"Allocating an INT vector %ld\n",tmp->pitch_*tmp->count_*sizeof(double)); - + tmp->msize_ = tmp->pitch_*params->count*sizeof(int); return allocRemoteBuffer((void **)&(tmp->v_), tmp->pitch_*params->count*sizeof(int)); } else if (params->elementType == SPGPU_TYPE_FLOAT) @@ -106,7 +106,7 @@ int allocMultiVecDevice(void ** remoteMultiVec, struct MultiVectorDeviceParams * tmp->pitch_ = params->size; else tmp->pitch_ = (((params->size*sizeof(float) + 255)/256)*256)/sizeof(float); - + tmp->msize_ = tmp->pitch_*params->count*sizeof(float); return allocRemoteBuffer((void **)&(tmp->v_), tmp->pitch_*params->count*sizeof(float)); } else if (params->elementType == SPGPU_TYPE_DOUBLE) @@ -117,7 +117,7 @@ int allocMultiVecDevice(void ** remoteMultiVec, struct MultiVectorDeviceParams * else tmp->pitch_ = (int)(((params->size*sizeof(double) + 255)/256)*256)/sizeof(double); //fprintf(stderr,"Allocating a DOUBLE vector %ld\n",tmp->pitch_*tmp->count_*sizeof(double)); - + tmp->msize_ = tmp->pitch_*params->count*sizeof(double); return allocRemoteBuffer((void **)&(tmp->v_), tmp->pitch_*tmp->count_*sizeof(double)); } else if (params->elementType == SPGPU_TYPE_COMPLEX_FLOAT) @@ -126,6 +126,7 @@ int allocMultiVecDevice(void ** remoteMultiVec, struct MultiVectorDeviceParams * tmp->pitch_ = params->size; else tmp->pitch_ = (int)(((params->size*sizeof(cuFloatComplex) + 255)/256)*256)/sizeof(cuFloatComplex); + tmp->msize_ = tmp->pitch_*params->count*sizeof(cuFloatComplex); return allocRemoteBuffer((void **)&(tmp->v_), tmp->pitch_*tmp->count_*sizeof(cuFloatComplex)); } else if (params->elementType == SPGPU_TYPE_COMPLEX_DOUBLE) @@ -134,6 +135,7 @@ int allocMultiVecDevice(void ** remoteMultiVec, struct MultiVectorDeviceParams * tmp->pitch_ = params->size; else tmp->pitch_ = (int)(((params->size*sizeof(cuDoubleComplex) + 255)/256)*256)/sizeof(cuDoubleComplex); + tmp->msize_ = tmp->pitch_*params->count*sizeof(cuDoubleComplex); return allocRemoteBuffer((void **)&(tmp->v_), tmp->pitch_*tmp->count_*sizeof(cuDoubleComplex)); } else @@ -153,7 +155,11 @@ void freeMultiVecDevice(void* deviceVec) // fprintf(stderr,"freeMultiVecDevice\n"); if (devVec != NULL) { //fprintf(stderr,"Before freeMultiVecDevice% ld\n",devVec->pitch_*devVec->count_*sizeof(double)); +#if 0&& defined(TRACK_CUDA_MALLOC) + freeAndTrackRemoteBuffer(devVec->v_,devVec->msize_); +#else freeRemoteBuffer(devVec->v_); +#endif free(deviceVec); } } diff --git a/cuda/vectordev.h b/cuda/vectordev.h index 8eca7063..7b5604c8 100644 --- a/cuda/vectordev.h +++ b/cuda/vectordev.h @@ -45,9 +45,12 @@ struct MultiVectDevice //number of elements for a single vector int size_; - //pithc in number of elements + //pitch in number of elements int pitch_; + // malloc size + int msize_; + // Vectors in device memory (single allocation) void *v_; }; diff --git a/test/cudakern/dpdegenmv.F90 b/test/cudakern/dpdegenmv.F90 index 85059e81..121bbe8f 100644 --- a/test/cudakern/dpdegenmv.F90 +++ b/test/cudakern/dpdegenmv.F90 @@ -594,7 +594,7 @@ program pdgenmv ! solver parameters integer(psb_epk_) :: amatsize, precsize, descsize, annz, nbytes - real(psb_dpk_) :: err, eps + real(psb_dpk_) :: err, eps, td integer, parameter :: ntests=200, ngpu=50, ncnv=20 type(psb_d_coo_sparse_mat), target :: acoo type(psb_d_csr_sparse_mat), target :: acsr @@ -823,6 +823,28 @@ program pdgenmv ! FIXME: cache flush needed here call xg%set(x0) call xg%sync() + + call psb_barrier(ctxt) + gt1 = psb_wtime() + do i=1,ntests*ngpu + td = psb_gedot(xg,bg,desc_a,info) + ! For timing purposes we need to make sure all threads + ! in the device are done. + if ((info /= 0).or.(psb_get_errstatus()/=0)) then + write(0,*) 'From 1 gedot',info,i,ntests + call psb_error() + stop + end if + + end do + call psb_cuda_DeviceSync() + call psb_barrier(ctxt) + gt2 = psb_wtime() - gt1 + call psb_amx(ctxt,gt2) + write(0,*) 'DOT time : ',gt2,gt2/(ntests*ngpu) + + + call psb_barrier(ctxt) gt1 = psb_wtime() do i=1,ntests*ngpu