/*
 * COO -> HLG copy: device kernel plus host entry point.
 *
 * What the visible text establishes:
 *   - GEN_PSI_FUNC_NAME(x) is redefined to build the identifier
 *     psi_cuda_ ## x ## _CopyCooToHlg through the CONCAT macro, and
 *     THREAD_BLOCK is fixed at 256.
 *   - The host entry point GEN_PSI_FUNC_NAME(TYPE_SYMBOL) is declared with C
 *     linkage when compiled as C++ and takes
 *     (handle, nr, nc, nza, baseIdx, hacksz, noffs, isz, rS, hackOffs,
 *      devIdisp, devJa, devVal, idiag, rP, cM).
 *   - The kernel <entry point name>_krn computes, per thread,
 *       ki = threadIdx.x + blockIdx.x * THREAD_BLOCK   (offset inside this launch)
 *       i  = ii + ki                                   (offset shifted by ii)
 *     and returns immediately when ki >= nrws.
 *   - The kernel is launched on handle's currentStream with the arguments
 *     (i, nrws, nr, nza, baseIdx, hacksz, noffs, isz, rS, hackOffs, devIdisp,
 *      devJa, devVal, idiag, rP, cM).
 *   - The host entry point computes
 *       maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK * handle->maxGridSizeX)
 *     and then starts a loop at i = 0.
 *
 * NOTE(review): this text looks damaged and will not compile as it stands.
 *   - The first two #include directives have lost their header names.
 *   - The kernel body after the ki >= nrws guard, the header of the launcher
 *     function, and the start of its launch statement are missing; the text
 *     jumps from "if (i" straight to "currentStream".
 *   - The host entry point is cut off inside its for loop.
 *   - All line breaks are gone, so the preprocessor directives and the "//"
 *     comments now swallow the code that follows them on the same line.
 *   Presumably everything between an opening and a closing angle bracket was
 *   stripped by a markup filter; restore the file from the upstream source
 *   rather than patching it by hand.
 *
 * NOTE(review): THREAD_BLOCK * handle->maxGridSizeX is evaluated in int if
 *   maxGridSizeX is an int, so it may overflow for large grid limits; the
 *   surrounding max(...) presumably falls back to maxGridSizeX in that case.
 *   TODO confirm against the spgpuHandle_t definition.
 */
#include #include #include "psi_cuda_common.cuh" #undef GEN_PSI_FUNC_NAME #define GEN_PSI_FUNC_NAME(x) CONCAT(CONCAT(psi_cuda_,x),_CopyCooToHlg) #define THREAD_BLOCK 256 #ifdef __cplusplus extern "C" { #endif void GEN_PSI_FUNC_NAME(TYPE_SYMBOL)(spgpuHandle_t handle, int nr, int nc, int nza, int baseIdx, int hacksz, int noffs, int isz, int *rS, int *hackOffs, int *devIdisp, int *devJa, VALUE_TYPE *devVal, int *idiag, int *rP, VALUE_TYPE *cM); #ifdef __cplusplus } #endif __global__ void CONCAT(GEN_PSI_FUNC_NAME(TYPE_SYMBOL),_krn)(int ii, int nrws, int nr, int nza, int baseIdx, int hacksz, int noffs, int isz, int *rS, int *hackOffs, int *devIdisp, int *devJa, VALUE_TYPE *devVal, int *idiag, int *rP, VALUE_TYPE *cM) { int ir, k, ipnt, rsz,jc; int ki = threadIdx.x + blockIdx.x * (THREAD_BLOCK); int i=ii+ki; if (ki >= nrws) return; if (icurrentStream >>>(i,nrws,nr, nza, baseIdx, hacksz, noffs, isz, rS,hackOffs,devIdisp,devJa,devVal,idiag,rP,cM); } void GEN_PSI_FUNC_NAME(TYPE_SYMBOL)(spgpuHandle_t handle, int nr, int nc, int nza, int baseIdx, int hacksz, int noffs, int isz, int *rS, int *hackOffs, int *devIdisp, int *devJa, VALUE_TYPE *devVal, int *idiag, int *rP, VALUE_TYPE *cM) { int i, nrws; //int maxNForACall = THREAD_BLOCK*handle->maxGridSizeX; int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX); //fprintf(stderr,"Loop on j: %d\n",j); for (i=0; i