diff --git a/cuda/cuda_util.c b/cuda/cuda_util.c index e0ab9bdc..8e5385b4 100644 --- a/cuda/cuda_util.c +++ b/cuda/cuda_util.c @@ -375,6 +375,13 @@ int getGPUMaxRegistersPerBlock() return(count); } +int getGPUSharedMemPerBlock() +{ int count=0; + if (prop!=NULL) + count = prop->sharedMemPerBlock; + return(count); +} + void cpyGPUNameString(char *cstring) { *cstring='\0'; diff --git a/cuda/cuda_util.h b/cuda/cuda_util.h index 190a372f..dfb4e9d3 100644 --- a/cuda/cuda_util.h +++ b/cuda/cuda_util.h @@ -65,6 +65,7 @@ int getGPUWarpSize(); int getGPUMaxThreadsPerBlock(); int getGPUMaxThreadsPerMP(); int getGPUMaxRegistersPerBlock(); +int getGPUSharedMemPerBlock(); void cpyGPUNameString(char *cstring); diff --git a/cuda/psb_cuda_env_mod.F90 b/cuda/psb_cuda_env_mod.F90 index 4778d229..2fe8cf5a 100644 --- a/cuda/psb_cuda_env_mod.F90 +++ b/cuda/psb_cuda_env_mod.F90 @@ -165,6 +165,11 @@ module psb_cuda_env_mod use iso_c_binding integer(c_int) :: res end function psb_C_get_MaxRegistersPerBlock + function psb_C_get_SharedMemPerBlock() & + & result(res) bind(c,name='getGPUSharedMemPerBlock') + use iso_c_binding + integer(c_int) :: res + end function psb_C_get_SharedMemPerBlock end interface interface subroutine psb_C_cpy_NameString(cstring) & diff --git a/cuda/spgpu/kernels/hell_dspmv.cu b/cuda/spgpu/kernels/hell_dspmv.cu index dc99e842..b8c02be9 100644 --- a/cuda/spgpu/kernels/hell_dspmv.cu +++ b/cuda/spgpu/kernels/hell_dspmv.cu @@ -21,6 +21,7 @@ extern "C" { #include "core.h" #include "hell.h" + int getGPUSharedMemPerBlock(); } #include "debug.h" @@ -34,7 +35,7 @@ extern "C" #if defined(NEW_MM) -#define MMBSZ 12 +#define MMBSZ 8 #undef GEN_SPGPU_HELL_NAME #define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm) @@ -53,7 +54,7 @@ CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) VALUE_TYPE *pz,*px,*py; VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)(); VALUE_TYPE yVal; - __shared__ VALUE_TYPE temp[THREAD_BLOCK][MMBSZ]; + __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK]; int i 
= threadIdx.x + blockIdx.x * (THREAD_BLOCK); @@ -74,7 +75,7 @@ CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) int rowSize = rS[i]; for (int k=0; kcurrentStream >>> (count, z, zPitch,y, yPitch, @@ -229,3 +231,4 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL) } #endif +