diff --git a/base/modules/serial/psb_c_base_mat_mod.F90 b/base/modules/serial/psb_c_base_mat_mod.F90 index 52fd8111..5bd41e6b 100644 --- a/base/modules/serial/psb_c_base_mat_mod.F90 +++ b/base/modules/serial/psb_c_base_mat_mod.F90 @@ -104,33 +104,34 @@ module psb_c_base_mat_mod ! ! Computational methods: defined here but not implemented. ! - procedure, pass(a) :: vect_mv => psb_c_base_vect_mv - procedure, pass(a) :: csmv => psb_c_base_csmv - procedure, pass(a) :: csmm => psb_c_base_csmm - generic, public :: spmm => csmm, csmv, vect_mv - procedure, pass(a) :: in_vect_sv => psb_c_base_inner_vect_sv - procedure, pass(a) :: inner_cssv => psb_c_base_inner_cssv - procedure, pass(a) :: inner_cssm => psb_c_base_inner_cssm - generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv - procedure, pass(a) :: vect_cssv => psb_c_base_vect_cssv - procedure, pass(a) :: cssv => psb_c_base_cssv - procedure, pass(a) :: cssm => psb_c_base_cssm - generic, public :: spsm => cssm, cssv, vect_cssv - procedure, pass(a) :: scals => psb_c_base_scals - procedure, pass(a) :: scalv => psb_c_base_scal - generic, public :: scal => scals, scalv - procedure, pass(a) :: maxval => psb_c_base_maxval - procedure, pass(a) :: spnmi => psb_c_base_csnmi - procedure, pass(a) :: spnm1 => psb_c_base_csnm1 - procedure, pass(a) :: rowsum => psb_c_base_rowsum - procedure, pass(a) :: arwsum => psb_c_base_arwsum - procedure, pass(a) :: colsum => psb_c_base_colsum - procedure, pass(a) :: aclsum => psb_c_base_aclsum - procedure, pass(a) :: scalpid => psb_c_base_scalplusidentity - procedure, pass(a) :: spaxpby => psb_c_base_spaxpby - procedure, pass(a) :: cmpval => psb_c_base_cmpval - procedure, pass(a) :: cmpmat => psb_c_base_cmpmat - generic, public :: spcmp => cmpval, cmpmat + procedure, pass(a) :: vect_mv => psb_c_base_vect_mv + procedure, pass(a) :: multivect_mv => psb_c_base_multivect_mv + procedure, pass(a) :: csmv => psb_c_base_csmv + procedure, pass(a) :: csmm => psb_c_base_csmm + generic, public :: spmm => csmm, csmv, vect_mv, multivect_mv + procedure, pass(a) :: in_vect_sv => psb_c_base_inner_vect_sv + procedure, pass(a) :: inner_cssv => psb_c_base_inner_cssv + procedure, pass(a) :: inner_cssm => psb_c_base_inner_cssm + generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv + procedure, pass(a) :: vect_cssv => psb_c_base_vect_cssv + procedure, pass(a) :: cssv => psb_c_base_cssv + procedure, pass(a) :: cssm => psb_c_base_cssm + generic, public :: spsm => cssm, cssv, vect_cssv + procedure, pass(a) :: scals => psb_c_base_scals + procedure, pass(a) :: scalv => psb_c_base_scal + generic, public :: scal => scals, scalv + procedure, pass(a) :: maxval => psb_c_base_maxval + procedure, pass(a) :: spnmi => psb_c_base_csnmi + procedure, pass(a) :: spnm1 => psb_c_base_csnm1 + procedure, pass(a) :: rowsum => psb_c_base_rowsum + procedure, pass(a) :: arwsum => psb_c_base_arwsum + procedure, pass(a) :: colsum => psb_c_base_colsum + procedure, pass(a) :: aclsum => psb_c_base_aclsum + procedure, pass(a) :: scalpid => psb_c_base_scalplusidentity + procedure, pass(a) :: spaxpby => psb_c_base_spaxpby + procedure, pass(a) :: cmpval => psb_c_base_cmpval + procedure, pass(a) :: cmpmat => psb_c_base_cmpmat + generic, public :: spcmp => cmpval, cmpmat end type psb_c_base_sparse_mat private :: c_base_mat_sync, c_base_mat_is_host, c_base_mat_is_dev, & @@ -1240,6 +1241,42 @@ module psb_c_base_mat_mod end subroutine psb_c_base_vect_mv end interface + !> Function multivect_mv: + !! \memberof psb_c_base_sparse_mat + !! \brief Product by an encapsulated array type(psb_c_multivect_type) + !! + !! Compute + !! Y = alpha*op(A)*X + beta*Y + !! Usually the unwrapping of the encapsulated multivector is done + !! here, so that all the derived classes need only the + !! versions with the standard arrays. + !! Must be overridden explicitly in case of non standard memory + !! management; an example would be external memory allocation + !! in attached processors such as GPUs. + !! + !! + !! \param alpha Scaling factor for Ax + !! \param A the input sparse matrix + !! \param x the input X + !! \param beta Scaling factor for y + !! \param y the input/output Y + !! \param info return code + !! \param trans [N] Whether to use A (N), its transpose (T) + !! or its conjugate transpose (C) + !! + ! + interface + subroutine psb_c_base_multivect_mv(alpha,a,x,beta,y,info,trans) + import + class(psb_c_base_sparse_mat), intent(in) :: a + complex(psb_spk_), intent(in) :: alpha, beta + class(psb_c_base_multivect_type), intent(inout) :: x + class(psb_c_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_c_base_multivect_mv + end interface + ! !> Function cssm: !! \memberof psb_c_base_sparse_mat diff --git a/base/modules/serial/psb_s_base_mat_mod.F90 b/base/modules/serial/psb_s_base_mat_mod.F90 index 29b1c2dc..37bd1436 100644 --- a/base/modules/serial/psb_s_base_mat_mod.F90 +++ b/base/modules/serial/psb_s_base_mat_mod.F90 @@ -104,33 +104,34 @@ module psb_s_base_mat_mod ! ! Computational methods: defined here but not implemented. ! - procedure, pass(a) :: vect_mv => psb_s_base_vect_mv - procedure, pass(a) :: csmv => psb_s_base_csmv - procedure, pass(a) :: csmm => psb_s_base_csmm - generic, public :: spmm => csmm, csmv, vect_mv - procedure, pass(a) :: in_vect_sv => psb_s_base_inner_vect_sv - procedure, pass(a) :: inner_cssv => psb_s_base_inner_cssv - procedure, pass(a) :: inner_cssm => psb_s_base_inner_cssm - generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv - procedure, pass(a) :: vect_cssv => psb_s_base_vect_cssv - procedure, pass(a) :: cssv => psb_s_base_cssv - procedure, pass(a) :: cssm => psb_s_base_cssm - generic, public :: spsm => cssm, cssv, vect_cssv - procedure, pass(a) :: scals => psb_s_base_scals - procedure, pass(a) :: scalv => psb_s_base_scal - generic, public :: scal => scals, scalv - procedure, pass(a) :: maxval => psb_s_base_maxval - procedure, pass(a) :: spnmi => psb_s_base_csnmi - procedure, pass(a) :: spnm1 => psb_s_base_csnm1 - procedure, pass(a) :: rowsum => psb_s_base_rowsum - procedure, pass(a) :: arwsum => psb_s_base_arwsum - procedure, pass(a) :: colsum => psb_s_base_colsum - procedure, pass(a) :: aclsum => psb_s_base_aclsum - procedure, pass(a) :: scalpid => psb_s_base_scalplusidentity - procedure, pass(a) :: spaxpby => psb_s_base_spaxpby - procedure, pass(a) :: cmpval => psb_s_base_cmpval - procedure, pass(a) :: cmpmat => psb_s_base_cmpmat - generic, public :: spcmp => cmpval, cmpmat + procedure, pass(a) :: vect_mv => psb_s_base_vect_mv + procedure, pass(a) :: multivect_mv => psb_s_base_multivect_mv + procedure, pass(a) :: csmv => psb_s_base_csmv + procedure, pass(a) :: csmm => psb_s_base_csmm + generic, public :: spmm => csmm, csmv, vect_mv, multivect_mv + procedure, pass(a) :: in_vect_sv => psb_s_base_inner_vect_sv + procedure, pass(a) :: inner_cssv => psb_s_base_inner_cssv + procedure, pass(a) :: inner_cssm => psb_s_base_inner_cssm + generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv + procedure, pass(a) :: vect_cssv => psb_s_base_vect_cssv + procedure, pass(a) :: cssv => psb_s_base_cssv + procedure, pass(a) :: cssm => psb_s_base_cssm + generic, public :: spsm => cssm, cssv, vect_cssv + procedure, pass(a) :: scals => psb_s_base_scals + procedure, pass(a) :: scalv => psb_s_base_scal + generic, public :: scal => scals, scalv + procedure, pass(a) :: maxval => psb_s_base_maxval + procedure, pass(a) :: spnmi => psb_s_base_csnmi + procedure, pass(a) :: spnm1 => psb_s_base_csnm1 + procedure, pass(a) :: rowsum => psb_s_base_rowsum + procedure, pass(a) :: arwsum => psb_s_base_arwsum + procedure, pass(a) :: colsum => psb_s_base_colsum + procedure, pass(a) :: aclsum => psb_s_base_aclsum + procedure, pass(a) :: scalpid => psb_s_base_scalplusidentity + procedure, pass(a) :: spaxpby => psb_s_base_spaxpby + procedure, pass(a) :: cmpval => psb_s_base_cmpval + procedure, pass(a) :: cmpmat => psb_s_base_cmpmat + generic, public :: spcmp => cmpval, cmpmat end type psb_s_base_sparse_mat private :: s_base_mat_sync, s_base_mat_is_host, s_base_mat_is_dev, & @@ -1240,6 +1241,42 @@ module psb_s_base_mat_mod end subroutine psb_s_base_vect_mv end interface + !> Function multivect_mv: + !! \memberof psb_s_base_sparse_mat + !! \brief Product by an encapsulated array type(psb_s_multivect_type) + !! + !! Compute + !! Y = alpha*op(A)*X + beta*Y + !! Usually the unwrapping of the encapsulated multivector is done + !! here, so that all the derived classes need only the + !! versions with the standard arrays. + !! Must be overridden explicitly in case of non standard memory + !! management; an example would be external memory allocation + !! in attached processors such as GPUs. + !! + !! + !! \param alpha Scaling factor for Ax + !! \param A the input sparse matrix + !! \param x the input X + !! \param beta Scaling factor for y + !! \param y the input/output Y + !! \param info return code + !! \param trans [N] Whether to use A (N), its transpose (T) + !! or its conjugate transpose (C) + !! + ! + interface + subroutine psb_s_base_multivect_mv(alpha,a,x,beta,y,info,trans) + import + class(psb_s_base_sparse_mat), intent(in) :: a + real(psb_spk_), intent(in) :: alpha, beta + class(psb_s_base_multivect_type), intent(inout) :: x + class(psb_s_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_s_base_multivect_mv + end interface + ! !> Function cssm: !! \memberof psb_s_base_sparse_mat diff --git a/base/modules/serial/psb_z_base_mat_mod.F90 b/base/modules/serial/psb_z_base_mat_mod.F90 index 0cee142e..37d810c3 100644 --- a/base/modules/serial/psb_z_base_mat_mod.F90 +++ b/base/modules/serial/psb_z_base_mat_mod.F90 @@ -104,33 +104,34 @@ module psb_z_base_mat_mod ! ! Computational methods: defined here but not implemented. ! - procedure, pass(a) :: vect_mv => psb_z_base_vect_mv - procedure, pass(a) :: csmv => psb_z_base_csmv - procedure, pass(a) :: csmm => psb_z_base_csmm - generic, public :: spmm => csmm, csmv, vect_mv - procedure, pass(a) :: in_vect_sv => psb_z_base_inner_vect_sv - procedure, pass(a) :: inner_cssv => psb_z_base_inner_cssv - procedure, pass(a) :: inner_cssm => psb_z_base_inner_cssm - generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv - procedure, pass(a) :: vect_cssv => psb_z_base_vect_cssv - procedure, pass(a) :: cssv => psb_z_base_cssv - procedure, pass(a) :: cssm => psb_z_base_cssm - generic, public :: spsm => cssm, cssv, vect_cssv - procedure, pass(a) :: scals => psb_z_base_scals - procedure, pass(a) :: scalv => psb_z_base_scal - generic, public :: scal => scals, scalv - procedure, pass(a) :: maxval => psb_z_base_maxval - procedure, pass(a) :: spnmi => psb_z_base_csnmi - procedure, pass(a) :: spnm1 => psb_z_base_csnm1 - procedure, pass(a) :: rowsum => psb_z_base_rowsum - procedure, pass(a) :: arwsum => psb_z_base_arwsum - procedure, pass(a) :: colsum => psb_z_base_colsum - procedure, pass(a) :: aclsum => psb_z_base_aclsum - procedure, pass(a) :: scalpid => psb_z_base_scalplusidentity - procedure, pass(a) :: spaxpby => psb_z_base_spaxpby - procedure, pass(a) :: cmpval => psb_z_base_cmpval - procedure, pass(a) :: cmpmat => psb_z_base_cmpmat - generic, public :: spcmp => cmpval, cmpmat + procedure, pass(a) :: vect_mv => psb_z_base_vect_mv + procedure, pass(a) :: multivect_mv => psb_z_base_multivect_mv + procedure, pass(a) :: csmv => psb_z_base_csmv + procedure, pass(a) :: csmm => psb_z_base_csmm + generic, public :: spmm => csmm, csmv, vect_mv, multivect_mv + procedure, pass(a) :: in_vect_sv => psb_z_base_inner_vect_sv + procedure, pass(a) :: inner_cssv => psb_z_base_inner_cssv + procedure, pass(a) :: inner_cssm => psb_z_base_inner_cssm + generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv + procedure, pass(a) :: vect_cssv => psb_z_base_vect_cssv + procedure, pass(a) :: cssv => psb_z_base_cssv + procedure, pass(a) :: cssm => psb_z_base_cssm + generic, public :: spsm => cssm, cssv, vect_cssv + procedure, pass(a) :: scals => psb_z_base_scals + procedure, pass(a) :: scalv => psb_z_base_scal + generic, public :: scal => scals, scalv + procedure, pass(a) :: maxval => psb_z_base_maxval + procedure, pass(a) :: spnmi => psb_z_base_csnmi + procedure, pass(a) :: spnm1 => psb_z_base_csnm1 + procedure, pass(a) :: rowsum => psb_z_base_rowsum + procedure, pass(a) :: arwsum => psb_z_base_arwsum + procedure, pass(a) :: colsum => psb_z_base_colsum + procedure, pass(a) :: aclsum => psb_z_base_aclsum + procedure, pass(a) :: scalpid => psb_z_base_scalplusidentity + procedure, pass(a) :: spaxpby => psb_z_base_spaxpby + procedure, pass(a) :: cmpval => psb_z_base_cmpval + procedure, pass(a) :: cmpmat => psb_z_base_cmpmat + generic, public :: spcmp => cmpval, cmpmat end type psb_z_base_sparse_mat private :: z_base_mat_sync, z_base_mat_is_host, z_base_mat_is_dev, & @@ -1240,6 +1241,42 @@ module psb_z_base_mat_mod end subroutine psb_z_base_vect_mv end interface + !> Function multivect_mv: + !! \memberof psb_z_base_sparse_mat + !! \brief Product by an encapsulated array type(psb_z_multivect_type) + !! + !! Compute + !! Y = alpha*op(A)*X + beta*Y + !! Usually the unwrapping of the encapsulated multivector is done + !! here, so that all the derived classes need only the + !! versions with the standard arrays. + !! Must be overridden explicitly in case of non standard memory + !! management; an example would be external memory allocation + !! in attached processors such as GPUs. + !! + !! + !! \param alpha Scaling factor for Ax + !! \param A the input sparse matrix + !! \param x the input X + !! \param beta Scaling factor for y + !! \param y the input/output Y + !! \param info return code + !! \param trans [N] Whether to use A (N), its transpose (T) + !! or its conjugate transpose (C) + !! + ! + interface + subroutine psb_z_base_multivect_mv(alpha,a,x,beta,y,info,trans) + import + class(psb_z_base_sparse_mat), intent(in) :: a + complex(psb_dpk_), intent(in) :: alpha, beta + class(psb_z_base_multivect_type), intent(inout) :: x + class(psb_z_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_z_base_multivect_mv + end interface + ! !> Function cssm: !! \memberof psb_z_base_sparse_mat diff --git a/base/serial/impl/psb_c_base_mat_impl.F90 b/base/serial/impl/psb_c_base_mat_impl.F90 index 17f2cdc8..474eff9f 100644 --- a/base/serial/impl/psb_c_base_mat_impl.F90 +++ b/base/serial/impl/psb_c_base_mat_impl.F90 @@ -2012,6 +2012,26 @@ subroutine psb_c_base_vect_mv(alpha,a,x,beta,y,info,trans) call y%set_host() end subroutine psb_c_base_vect_mv +subroutine psb_c_base_multivect_mv(alpha,a,x,beta,y,info,trans) + use psb_error_mod + use psb_const_mod + use psb_c_base_mat_mod, psb_protect_name => psb_c_base_multivect_mv + implicit none + class(psb_c_base_sparse_mat), intent(in) :: a + complex(psb_spk_), intent(in) :: alpha, beta + class(psb_c_base_multivect_type), intent(inout) :: x + class(psb_c_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + + ! For the time being we just throw everything back + ! onto the normal routines. + call x%sync() + call y%sync() + call a%spmm(alpha,x%v,beta,y%v,info,trans) + call y%set_host() +end subroutine psb_c_base_multivect_mv + subroutine psb_c_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d) use psb_c_base_mat_mod, psb_protect_name => psb_c_base_vect_cssv use psb_c_base_vect_mod diff --git a/base/serial/impl/psb_s_base_mat_impl.F90 b/base/serial/impl/psb_s_base_mat_impl.F90 index 4a99a684..3d3e2ca5 100644 --- a/base/serial/impl/psb_s_base_mat_impl.F90 +++ b/base/serial/impl/psb_s_base_mat_impl.F90 @@ -2012,6 +2012,26 @@ subroutine psb_s_base_vect_mv(alpha,a,x,beta,y,info,trans) call y%set_host() end subroutine psb_s_base_vect_mv +subroutine psb_s_base_multivect_mv(alpha,a,x,beta,y,info,trans) + use psb_error_mod + use psb_const_mod + use psb_s_base_mat_mod, psb_protect_name => psb_s_base_multivect_mv + implicit none + class(psb_s_base_sparse_mat), intent(in) :: a + real(psb_spk_), intent(in) :: alpha, beta + class(psb_s_base_multivect_type), intent(inout) :: x + class(psb_s_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + + ! For the time being we just throw everything back + ! onto the normal routines. + call x%sync() + call y%sync() + call a%spmm(alpha,x%v,beta,y%v,info,trans) + call y%set_host() +end subroutine psb_s_base_multivect_mv + subroutine psb_s_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d) use psb_s_base_mat_mod, psb_protect_name => psb_s_base_vect_cssv use psb_s_base_vect_mod diff --git a/base/serial/impl/psb_z_base_mat_impl.F90 b/base/serial/impl/psb_z_base_mat_impl.F90 index 404027c5..123f4c73 100644 --- a/base/serial/impl/psb_z_base_mat_impl.F90 +++ b/base/serial/impl/psb_z_base_mat_impl.F90 @@ -2012,6 +2012,26 @@ subroutine psb_z_base_vect_mv(alpha,a,x,beta,y,info,trans) call y%set_host() end subroutine psb_z_base_vect_mv +subroutine psb_z_base_multivect_mv(alpha,a,x,beta,y,info,trans) + use psb_error_mod + use psb_const_mod + use psb_z_base_mat_mod, psb_protect_name => psb_z_base_multivect_mv + implicit none + class(psb_z_base_sparse_mat), intent(in) :: a + complex(psb_dpk_), intent(in) :: alpha, beta + class(psb_z_base_multivect_type), intent(inout) :: x + class(psb_z_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + + ! For the time being we just throw everything back + ! onto the normal routines. + call x%sync() + call y%sync() + call a%spmm(alpha,x%v,beta,y%v,info,trans) + call y%set_host() +end subroutine psb_z_base_multivect_mv + subroutine psb_z_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d) use psb_z_base_mat_mod, psb_protect_name => psb_z_base_vect_cssv use psb_z_base_vect_mod diff --git a/cuda/hdiagdev.c b/cuda/hdiagdev.c index 6302eed1..1f55457f 100644 --- a/cuda/hdiagdev.c +++ b/cuda/hdiagdev.c @@ -267,6 +267,27 @@ int spmvHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX, return SPGPU_SUCCESS; } +int spmmHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX, + double beta, void* deviceY) +{ + struct HdiagDevice *devMat = (struct HdiagDevice *) deviceMat; + struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX; + struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY; + spgpuHandle_t handle=psb_cudaGetHandle(); + +#ifdef VERBOSE + /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/ + /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/ + /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/ +#endif + spgpuDhdiaspmm (handle, y->count_, (double*)y->v_, y->pitch_, + (double *)y->v_, y->pitch_, alpha, (double *)devMat->cM, + devMat->hdiaOffsets, devMat->hackSize, devMat->hackOffsets, + devMat->rows, devMat->cols, (double *)x->v_, x->pitch_, beta); + + return SPGPU_SUCCESS; +} + int writeHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets) { int i=0,fo,fa,j,k,p; char buf_a[255], buf_o[255],tmp[255]; @@ -384,3 +405,23 @@ int spmvHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX, return SPGPU_SUCCESS; } +int spmmHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX, + float beta, void* deviceY) +{ + struct HdiagDevice *devMat = (struct HdiagDevice *) deviceMat; + struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX; + struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY; + spgpuHandle_t handle=psb_cudaGetHandle(); + +#ifdef VERBOSE + /*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/ + /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/ + /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/ +#endif + spgpuShdiaspmm (handle, y->count_, (float*)y->v_, y->pitch_, + (float *)y->v_, y->pitch_, alpha, (float *)devMat->cM, + devMat->hdiaOffsets, devMat->hackSize, devMat->hackOffsets, + devMat->rows, devMat->cols, (float *)x->v_, x->pitch_, beta); + + return SPGPU_SUCCESS; +} diff --git a/cuda/hdiagdev.h b/cuda/hdiagdev.h index 5cd9f803..5e7fed61 100644 --- a/cuda/hdiagdev.h +++ b/cuda/hdiagdev.h @@ -95,12 +95,13 @@ int allocHdiagDevice(void ** remoteMatrix, HdiagDeviceParams* params); void freeHdiagDevice(void* remoteMatrix); int writeHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets); -int spmvHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX, - float beta, void* deviceY); - int writeHdiagDeviceDouble(void* deviceMat, double* val, int* hdiaOffsets, int *hackOffsets); -int spmvHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX, - double beta, void* deviceY); +//int writeHdiagDeviceFloatComplex(void* deviceMat, float complex* val, int* hdiaOffsets, int *hackOffsets); +//int writeHdiagDeviceDoubleComplex(void* deviceMat, double complex* val, int* hdiaOffsets, int *hackOffsets); +//int readHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets); +//int readHdiagDeviceDouble(void* deviceMat, double* val, int* hdiaOffsets, int *hackOffsets); +//int readHdiagDeviceFloatComplex(void* deviceMat, float complex* val, int* hdiaOffsets, int *hackOffsets); +//int readHdiagDeviceDoubleComplex(void* deviceMat, double complex* val, int* hdiaOffsets, int *hackOffsets); #endif diff --git a/cuda/hdiagdev_mod.F90 b/cuda/hdiagdev_mod.F90 index 9a3530e7..64a63818 100644 --- a/cuda/hdiagdev_mod.F90 +++ b/cuda/hdiagdev_mod.F90 @@ -196,4 +196,35 @@ module hdiagdev_mod !!$ end function spmvHdiagDeviceDoubleComplex end interface spmvHdiagDevice + interface spmmHdiagDevice + function spmmHdiagDeviceFloat(deviceMat,alpha,x,beta,y) & + & result(res) bind(c,name='spmmHdiagDeviceFloat') + use iso_c_binding + integer(c_int) :: res + type(c_ptr), value :: deviceMat, x, y + real(c_float),value :: alpha, beta + end function spmmHdiagDeviceFloat + function spmmHdiagDeviceDouble(deviceMat,alpha,x,beta,y) & + & result(res) bind(c,name='spmmHdiagDeviceDouble') + use iso_c_binding + integer(c_int) :: res + type(c_ptr), value :: deviceMat, x, y + real(c_double),value :: alpha, beta + end function spmmHdiagDeviceDouble +!!$ function spmmHdiagDeviceFloatComplex(deviceMat,alpha,x,beta,y) & +!!$ & result(res) bind(c,name='spmmHdiagDeviceFloatComplex') +!!$ use iso_c_binding +!!$ integer(c_int) :: res +!!$ type(c_ptr), value :: deviceMat, x, y +!!$ complex(c_float_complex),value :: alpha, beta +!!$ end function spmmHdiagDeviceFloatComplex +!!$ function spmmHdiagDeviceDoubleComplex(deviceMat,alpha,x,beta,y) & +!!$ & result(res) bind(c,name='spmmHdiagDeviceDoubleComplex') +!!$ use iso_c_binding +!!$ integer(c_int) :: res +!!$ type(c_ptr), value :: deviceMat, x, y +!!$ complex(c_double_complex),value :: alpha, beta +!!$ end function spmmHdiagDeviceDoubleComplex + end interface spmmHdiagDevice + end module hdiagdev_mod diff --git a/cuda/impl/Makefile b/cuda/impl/Makefile index 0f8536b8..e9d70568 100644 --- a/cuda/impl/Makefile +++ b/cuda/impl/Makefile @@ -288,13 +288,7 @@ psb_s_cuda_hdiag_vect_mv.o \ psb_s_cuda_dnsg_mat_impl.o \ psb_d_cuda_dnsg_mat_impl.o \ psb_c_cuda_dnsg_mat_impl.o \ -psb_z_cuda_dnsg_mat_impl.o \ -psb_z_cuda_hdiag_csmv.o \ -psb_z_cuda_hdiag_csmm.o \ -psb_z_cuda_hdiag_vect_mv.o \ -psb_c_cuda_hdiag_csmv.o \ -psb_c_cuda_hdiag_csmm.o \ -psb_c_cuda_hdiag_vect_mv.o +psb_z_cuda_dnsg_mat_impl.o objs: $(OBJS) lib: objs diff --git a/cuda/impl/psb_d_cuda_hdiag_vect_mv.F90 b/cuda/impl/psb_d_cuda_hdiag_vect_mv.F90 index 824ef63d..f69a3bdf 100644 --- a/cuda/impl/psb_d_cuda_hdiag_vect_mv.F90 +++ b/cuda/impl/psb_d_cuda_hdiag_vect_mv.F90 @@ -124,7 +124,7 @@ subroutine psb_d_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans) use psb_d_cuda_hdiag_mat_mod, psb_protect_name => psb_d_cuda_hdiag_multivect_mv use psb_d_cuda_multivect_mod implicit none - class(psb_d_cuda_hdiag_mat_mod), intent(in) :: a + class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a real(psb_dpk_), intent(in) :: alpha, beta class(psb_d_base_multivect_type), intent(inout) :: x class(psb_d_base_multivect_type), intent(inout) :: y diff --git a/cuda/impl/psb_s_cuda_hdiag_csmm.F90 b/cuda/impl/psb_s_cuda_hdiag_csmm.F90 index 7066daf4..a57ba57c 100644 --- a/cuda/impl/psb_s_cuda_hdiag_csmm.F90 +++ b/cuda/impl/psb_s_cuda_hdiag_csmm.F90 @@ -89,7 +89,7 @@ subroutine psb_s_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans) if (tra) then if (a%is_dev()) call a%sync() - call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans) + call a%psb_s_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans) else ! ! Just to test, move X/Y to/from the GPU. diff --git a/cuda/impl/psb_s_cuda_hdiag_vect_mv.F90 b/cuda/impl/psb_s_cuda_hdiag_vect_mv.F90 index 6e1eb35a..11e8afb4 100644 --- a/cuda/impl/psb_s_cuda_hdiag_vect_mv.F90 +++ b/cuda/impl/psb_s_cuda_hdiag_vect_mv.F90 @@ -124,7 +124,7 @@ subroutine psb_s_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans) use psb_s_cuda_hdiag_mat_mod, psb_protect_name => psb_s_cuda_hdiag_multivect_mv use psb_s_cuda_multivect_mod implicit none - class(psb_s_cuda_hdiag_mat_mod), intent(in) :: a + class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a real(psb_spk_), intent(in) :: alpha, beta class(psb_s_base_multivect_type), intent(inout) :: x class(psb_s_base_multivect_type), intent(inout) :: y @@ -159,7 +159,7 @@ subroutine psb_s_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans) if (beta /= dzero) then if (.not.y%is_host()) call y%sync() end if - call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans) + call a%psb_s_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans) call y%set_host() else if (a%is_host()) call a%sync() diff --git a/cuda/spgpu/hdia.h b/cuda/spgpu/hdia.h index e8808fb7..680323fd 100644 --- a/cuda/spgpu/hdia.h +++ b/cuda/spgpu/hdia.h @@ -27,6 +27,10 @@ extern "C" { #endif +// DIA/HDIA Compressed Matrix Format routines + +/// This is the pitch alignment that must be fullfilled by the coefficients and the row pointers allocations. +#define DIA_PITCH_ALIGN_BYTE 128 /** * \fn spgpuShdiaspmv (spgpuHandle_t handle, float* z, const float *y, float alpha, const float* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const float *x, float beta) @@ -59,6 +63,44 @@ spgpuShdiaspmv (spgpuHandle_t handle, float beta); +/** +* \fn spgpuShdiaspmm (int count, float *z, int zpitch, const float *y, int ypitch, float alpha, const __device float* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device float *x, int xpitch, float beta) + * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU. + * \param handle The spgpu handle used to call this routine + * \param count The cols count + * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y). + * \param zpitch The pitch of the output vector + * \param y The y input vector + * \param ypitch The pitch of the y input vector + * \param alpha The alpha scalar + * \param dM The stacked HDIA non zero values allocation pointer + * \param offsets The stacked HDIA diagonals offsets vector + * \param hackSize The constant size of every hack (must be a multiple of 32) + * \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector + * \param rows the rows count + * \param cols the columns count + * \param x the x vector + * \param xpitch The pitch of the x input vector + * \param beta the beta scalar + */ +void +spgpuShdiaspmm (spgpuHandle_t handle, + int count, + __device float *z, + int zpitch, + const __device float *y, + int ypitch, + float alpha, + const __device float* dM, + const __device int* offsets, + int hackSize, + const __device int* hackOffsets, + int rows, + int cols, + const __device float *x, + int xpitch, + float beta); + /** * \fn spgpuDhdiaspmv (spgpuHandle_t handle, double* z, const double *y, double alpha, const double* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const double *x, double beta) * Computes double precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU. @@ -89,6 +131,43 @@ spgpuDhdiaspmv (spgpuHandle_t handle, const double *x, double beta); +/** +* \fn spgpuDhdiaspmm (int count, double *z, int zpitch, const double *y, int ypitch, double alpha, const __device double* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device float *x, int xpitch, double beta) + * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU. + * \param handle The spgpu handle used to call this routine + * \param count The cols count + * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y). + * \param zpitch The pitch of the output vector + * \param y The y input vector + * \param ypitch The pitch of the y input vector + * \param alpha The alpha scalar + * \param dM The stacked HDIA non zero values allocation pointer + * \param offsets The stacked HDIA diagonals offsets vector + * \param hackSize The constant size of every hack (must be a multiple of 32) + * \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector + * \param rows the rows count + * \param cols the columns count + * \param x the x vector + * \param xpitch The pitch of the x input vector + * \param beta the beta scalar + */ +void +spgpuDhdiaspmm (spgpuHandle_t handle, + int count, + __device double *z, + int zpitch, + const __device double *y, + int ypitch, + double alpha, + const __device double* dM, + const __device int* offsets, + int hackSize, + const __device int* hackOffsets, + int rows, + int cols, + const __device double *x, + int xpitch, + double beta); /** * \fn spgpuChdiaspmv (spgpuHandle_t handle, cuFloatComplex* z, const cuFloatComplex *y, cuFloatComplex alpha, const cuFloatComplex* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const cuFloatComplex *x, cuFloatComplex beta) @@ -120,6 +199,43 @@ spgpuChdiaspmv (spgpuHandle_t handle, const cuFloatComplex *x, cuFloatComplex beta); +/** +* \fn spgpuChdiaspmm (int count, cuFloatComplex *z, int zpitch, const cuFloatComplex *y, int ypitch, cuFloatComplex alpha, const __device cuFloatComplex* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device cuFloatComplex *x, int xpitch, cuFloatComplex beta) + * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU. + * \param handle The spgpu handle used to call this routine + * \param count The cols count + * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y). + * \param zpitch The pitch of the output vector + * \param y The y input vector + * \param ypitch The pitch of the y input vector + * \param alpha The alpha scalar + * \param dM The stacked HDIA non zero values allocation pointer + * \param offsets The stacked HDIA diagonals offsets vector + * \param hackSize The constant size of every hack (must be a multiple of 32) + * \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector + * \param rows the rows count + * \param cols the columns count + * \param x the x vector + * \param xpitch The pitch of the x input vector + * \param beta the beta scalar + */ +void +spgpuChdiaspmm (spgpuHandle_t handle, + int count, + __device cuFloatComplex *z, + int zpitch, + const __device cuFloatComplex *y, + int ypitch, + cuFloatComplex alpha, + const __device cuFloatComplex* dM, + const __device int* offsets, + int hackSize, + const __device int* hackOffsets, + int rows, + int cols, + const __device cuFloatComplex *x, + int xpitch, + cuFloatComplex beta); /** * \fn spgpuZhdiaspmv (spgpuHandle_t handle, cuDoubleComplex* z, const cuDoubleComplex *y, cuDoubleComplex alpha, const cuDoubleComplex* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const cuDoubleComplex *x, cuDoubleComplex beta) @@ -150,6 +266,45 @@ spgpuZhdiaspmv (spgpuHandle_t handle, int cols, const cuDoubleComplex *x, cuDoubleComplex beta); + +/** +* \fn spgpuZhdiaspmm (int count, cuDoubleComplex *z, int zpitch, const cuDoubleComplex *y, int ypitch, cuDoubleComplex alpha, const __device cuDoubleComplex* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device cuDoubleComplex *x, int xpitch, cuDoubleComplex beta) + * Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU. + * \param handle The spgpu handle used to call this routine + * \param count The cols count + * \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y). + * \param zpitch The pitch of the output vector + * \param y The y input vector + * \param ypitch The pitch of the y input vector + * \param alpha The alpha scalar + * \param dM The stacked HDIA non zero values allocation pointer + * \param offsets The stacked HDIA diagonals offsets vector + * \param hackSize The constant size of every hack (must be a multiple of 32) + * \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector + * \param rows the rows count + * \param cols the columns count + * \param x the x vector + * \param xpitch The pitch of the x input vector + * \param beta the beta scalar + */ +void +spgpuZhdiaspmm (spgpuHandle_t handle, + int count, + __device cuDoubleComplex *z, + int zpitch, + const __device cuDoubleComplex *y, + int ypitch, + cuDoubleComplex alpha, + const __device cuDoubleComplex* dM, + const __device int* offsets, + int hackSize, + const __device int* hackOffsets, + int rows, + int cols, + const __device cuDoubleComplex *x, + int xpitch, + cuDoubleComplex beta); + /** @}*/ diff --git a/cuda/spgpu/kernels/ell_spmm_base.cuh b/cuda/spgpu/kernels/ell_spmm_base.cuh index da54794c..cccc165d 100644 --- a/cuda/spgpu/kernels/ell_spmm_base.cuh +++ b/cuda/spgpu/kernels/ell_spmm_base.cuh @@ -99,6 +99,86 @@ fetchTex (int pointer) #undef GEN_SPGPU_ELL_NAME #define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm) +#if 0 + +void +GEN_SPGPU_ELL_NAME(TYPE_SYMBOL) +(spgpuHandle_t handle, + int count, + VALUE_TYPE* z, + int zPitch, + const VALUE_TYPE *y, + int yPitch, + VALUE_TYPE alpha, + const VALUE_TYPE* cM, + const int* rP, + int cMPitch, + int rPPitch, + const __device int* rS, + const __device int* rIdx, + int avgNnzPerRow, + int maxNnzPerRow, + int rows, + const VALUE_TYPE *x, + int xPitch, + VALUE_TYPE beta, + int baseIndex) +{ + VALUE_TYPE *px,*py, *pz; + int cnt, c1; + + dim3 block (THREAD_BLOCK, 1); + // dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK); + // Should we generalize the code to 1/2/4/8 threads per row? + // And maybe adjust THREAD_BLOCK size? + int shrMemSize,maxShmemSz; + int numMp=getGPUMultiProcessors(); + int maxThMp=getGPUMaxThreadsPerMP(); + int nmblksMp=maxThMp/THREAD_BLOCK; + int nmblk=nmblksMp*numMp; + dim3 grid (nmblk); + + maxShmemSz=getGPUSharedMemPerBlock(); + shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE); + if (shrMemSize > maxShmemSz) { + fprintf(stderr,"Fatal error: SHMEM size too large %ld %ld\n",shrMemSize,maxShmemSz); + return; + } + cnt = count; + px = (VALUE_TYPE *) x; + py = (VALUE_TYPE *) y; + pz = (VALUE_TYPE *) z; + while (cnt > 2*MMBSZ) { + CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn) + <<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch, py, yPitch, + alpha, cM, rP, cMPitch, rPPitch, + rS, rows, px, xPitch, beta, baseIndex); + px += xPitch*MMBSZ; + py += yPitch*MMBSZ; + pz += zPitch*MMBSZ; + cnt -= MMBSZ; + } + if (cnt > MMBSZ) { + c1 = cnt/2; + CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn) + <<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch, py, yPitch, + alpha, cM, rP, cMPitch, rPPitch, + rS, rows, px, xPitch, beta, baseIndex); + cnt -= c1; + } + if (cnt > MMBSZ) { + fprintf(stderr,"Invalid residual count %d\n",cnt); + } else if (cnt > 0){ + CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn) + <<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch, py, yPitch, + alpha, cM, rP, cMPitch, rPPitch, + rS, rows, px, xPitch, beta, baseIndex); + } + cudaCheckError("CUDA error on ell_spmm"); +} + +#endif + void GEN_SPGPU_ELL_NAME(TYPE_SYMBOL) (spgpuHandle_t handle, @@ -110,12 +190,12 @@ GEN_SPGPU_ELL_NAME(TYPE_SYMBOL) VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, - int cMPitch, - int rPPitch, + int cMPitch, + int rPPitch, const __device int* rS, const __device int* rIdx, - int avgNnzPerRow, - int maxNnzPerRow, + int avgNnzPerRow, + int maxNnzPerRow, int rows, const VALUE_TYPE *x, int xPitch, @@ -193,4 +273,4 @@ GEN_SPGPU_ELL_NAME(TYPE_SYMBOL) } cudaCheckError("CUDA error on ell_spmm"); -} \ No newline at end of file +} diff --git a/cuda/spgpu/kernels/ell_spmm_base_template.cuh b/cuda/spgpu/kernels/ell_spmm_base_template.cuh index 712a432a..be243bad 100644 --- a/cuda/spgpu/kernels/ell_spmm_base_template.cuh +++ b/cuda/spgpu/kernels/ell_spmm_base_template.cuh @@ -17,6 +17,75 @@ #define THREAD_BLOCK 128 #define MMBSZ 8 +#if 0 + +__global__ void +CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn) + (int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch, + VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, + int cMPitch, int rPPitch, const int* rS, int rows, + const VALUE_TYPE *x, int xPitch, + VALUE_TYPE beta, int baseIndex) +{ + VALUE_TYPE *pz,*px,*py; + VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)(); + VALUE_TYPE yVal; + __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK]; + + unsigned int i = threadIdx.x + blockIdx.x * blockDim.x; + unsigned int gridSize = gridDim.x * blockDim.x; + + while (i < rows) { + + rS += i; rP += i; cM += i; + + int rowSize = rS[i]; + for (int k=0; kcurrentStream >>> (count, z, zPitch, y, yPitch, alpha, cM, rP, cMPitch, rPPitch, rS, rows, x, xPitch, beta, baseIndex); -} \ No newline at end of file +} diff --git a/cuda/spgpu/kernels/hdia_spmm_base.cuh b/cuda/spgpu/kernels/hdia_spmm_base.cuh index 96cb7e2c..38e682f5 100644 --- a/cuda/spgpu/kernels/hdia_spmm_base.cuh +++ b/cuda/spgpu/kernels/hdia_spmm_base.cuh @@ -74,27 +74,115 @@ fetchTex (int pointer) return CONCAT(readValue_,VALUE_TYPE) (fetch); } #endif -#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_vanilla) -#define GEN_SPGPU_HDIA_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_vanilla) -#include "hdia_spmv_base_template.cuh" +#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_vanilla) +#define GEN_SPGPU_HDIA_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_vanilla) +#include "hdia_spmm_base_template.cuh" #if 0 #undef GEN_SPGPU_HDIA_NAME -#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_prefetch) -#define GEN_SPGPU_HDIA_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_prefetch) +#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_prefetch) +#define GEN_SPGPU_HDIA_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_prefetch) #undef USE_PREFETCHING #define USE_PREFETCHING -#include "hdia_spmv_base_template.cuh" +#include "hdia_spmm_base_template.cuh" #define ENABLE_CACHE #undef GEN_SPGPU_HDIA_NAME -#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache_prefetch) -#define GEN_SPGPU_HDIA_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache_prefetch) -#include "hdia_spmv_base_template.cuh" +#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache_prefetch) +#define GEN_SPGPU_HDIA_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache_prefetch) +#include "hdia_spmm_base_template.cuh" #undef GEN_SPGPU_HDIA_NAME #undef USE_PREFETCHING -#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache) -#define GEN_SPGPU_HDIA_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache) -#include "hdia_spmv_base_template.cuh" +#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache) +#define GEN_SPGPU_HDIA_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache) +#include "hdia_spmm_base_template.cuh" #endif #undef GEN_SPGPU_HDIA_NAME -#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv) +#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm) +void +GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL) +(spgpuHandle_t handle, + int count, + VALUE_TYPE* z, + int zPitch, + const VALUE_TYPE *y, + int yPitch, + VALUE_TYPE alpha, + const VALUE_TYPE* cM, + const int* hdiaOffsets, + int hackSize, + const __device int* hackOffsets, + int rows, + int cols, + const VALUE_TYPE *x, + int xPitch, + VALUE_TYPE beta) +{ + VALUE_TYPE *px,*py, *pz; + int cnt; + int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX); + + // maxNForACall should be a multiple of hackSize + maxNForACall = (maxNForACall/hackSize)*hackSize; + int maxShmemSz; + maxShmemSz=getGPUSharedMemPerBlock(); + //fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz); + while (rows > maxNForACall) {//managing large vectors + cnt = count; + px = (VALUE_TYPE *) x; + py = (VALUE_TYPE *) y; + pz = (VALUE_TYPE *) z; + while (cnt > MMBSZ) { + //fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz); + CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, + py, yPitch, + alpha, cM, hdiaOffsets, + hackSize, hackOffsets, + maxNForACall, cols, + px, xPitch, beta); + px += xPitch*MMBSZ; + py += yPitch*MMBSZ; + pz += zPitch*MMBSZ; + cnt -= MMBSZ; + } + if (cnt >0) { + CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch, + py, yPitch, + alpha, cM, hdiaOffsets, + hackSize, hackOffsets, + maxNForACall, cols, + px, xPitch, beta); + } + + y = y + maxNForACall; + z = z + maxNForACall; + hackOffsets = hackOffsets + maxNForACall/hackSize; + rows -= maxNForACall; + } + cnt = count; + px = (VALUE_TYPE *) x; + py = (VALUE_TYPE *) y; + pz = (VALUE_TYPE *) z; + while (cnt > MMBSZ) { + //fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz); + CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, + py, yPitch, + alpha, cM, hdiaOffsets, + hackSize, hackOffsets, + rows, cols, + px, xPitch, beta); + px += xPitch*MMBSZ; + py += yPitch*MMBSZ; + pz += zPitch*MMBSZ; + cnt -= MMBSZ; + } + if (cnt >0) { + CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch, + py, yPitch, + alpha, cM, hdiaOffsets, + hackSize, hackOffsets, + rows, cols, + px, xPitch, beta); + } + + cudaCheckError("CUDA error on hdiag_spmm"); +} \ No newline at end of file diff --git a/cuda/spgpu/kernels/hdia_spmm_base_template.cuh b/cuda/spgpu/kernels/hdia_spmm_base_template.cuh index b1902ab5..a06377bc 100644 --- a/cuda/spgpu/kernels/hdia_spmm_base_template.cuh +++ b/cuda/spgpu/kernels/hdia_spmm_base_template.cuh @@ -15,4 +15,127 @@ */ #define THREAD_BLOCK 128 +#define MMBSZ 8 +__global__ void +CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn) + (int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y, + int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* hdiaOffsets, + int hackSize, const int* hackOffsets, int rows, int cols, + const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta) +{ + VALUE_TYPE *pz,*px,*py; + VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)(); + VALUE_TYPE yVal; + __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK]; + + int hackCount = (rows + hackSize - 1)/hackSize; + + int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK); + + int hackId = i / hackSize; + int hackLaneId = i % hackSize; + + // shared between offsetsChunks and warpHackOffsetTemp + extern __shared__ int dynShrMem[]; + + int hackOffset = 0; + int nextOffset = 0; + + unsigned int laneId = threadIdx.x % warpSize; + unsigned int warpId = threadIdx.x / warpSize; + + if (laneId == 0 && i < rows) { + hackOffset = hackOffsets[hackId]; + nextOffset = hackOffsets[hackId+1]; + } + + hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0); + nextOffset = __shfl_sync(0xFFFFFFFF,nextOffset, 0); + + if (hackId >= hackCount) + return; + + cM += hackOffset*hackSize + hackLaneId; + hdiaOffsets += hackOffset; + + for (int k=0; k= 0 && column < cols) { + px = (VALUE_TYPE *) x; + for (int k = 0; k < count; k++) { + VALUE_TYPE xValue = px[column]; + VALUE_TYPE mValue = cM[0]; + temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, temp[k][threadIdx.x]); + px = px + xPitch; + } + } + cM += hackSize; + } + } + diags -= warpSize; + hdiaOffsets += warpSize; + } + + // Since z and y are accessed with the same offset by the same thread, + // and the write to z follows the y read, y and z can share the same base address (in-place computing). + if (i >= rows) + return; + + py = (VALUE_TYPE *) y; + pz = z; + + if (CONCAT(VALUE_TYPE, _isNotZero(beta))) + for (int k=0; kcurrentStream >>> (count, z, zPitch, y, yPitch, + alpha, cM, hdiaOffsets, hackSize, hackOffsets, rows, cols, + x, xPitch, beta); +} diff --git a/cuda/spgpu/kernels/hell_spmm_base.cuh b/cuda/spgpu/kernels/hell_spmm_base.cuh index 1aaf6b85..fda20e4b 100644 --- a/cuda/spgpu/kernels/hell_spmm_base.cuh +++ b/cuda/spgpu/kernels/hell_spmm_base.cuh @@ -106,11 +106,11 @@ extern __shared__ int dynShrMem[]; void GEN_SPGPU_HELL_NAME(TYPE_SYMBOL) (spgpuHandle_t handle, - int count, - VALUE_TYPE* z, - int zPitch, + int count, + VALUE_TYPE* z, + int zPitch, const VALUE_TYPE *y, - int yPitch, + int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, @@ -120,9 +120,9 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL) const __device int* rIdx, int rows, const VALUE_TYPE *x, - int xPitch, + int xPitch, VALUE_TYPE beta, - int baseIndex) + int baseIndex) { VALUE_TYPE *px,*py, *pz; int cnt, c1; @@ -149,7 +149,7 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL) py = (VALUE_TYPE *) y; pz = (VALUE_TYPE *) z; while (cnt > 2*MMBSZ) { - CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) + CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn) <<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch,py, yPitch, alpha, cM, rP, hackSize, hackOffsets, rS, rows, px, xPitch, beta, baseIndex); @@ -160,7 +160,7 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL) } if (cnt > MMBSZ) { c1 = cnt/2; - CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) + CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn) <<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch,py, yPitch, alpha, cM, rP, hackSize, hackOffsets, rS, rows, px, xPitch, beta, baseIndex); @@ -169,7 +169,7 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL) if (cnt > MMBSZ) { fprintf(stderr,"Invalid residual count %d\n",cnt); } else if (cnt > 0){ - CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) + CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn) <<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch,py, yPitch, alpha, cM, rP, hackSize, hackOffsets, rS, rows, px, xPitch, beta, baseIndex); @@ -182,11 +182,11 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL) void GEN_SPGPU_HELL_NAME(TYPE_SYMBOL) (spgpuHandle_t handle, - int count, - VALUE_TYPE* z, - int zPitch, + int count, + VALUE_TYPE* z, + int zPitch, const VALUE_TYPE *y, - int yPitch, + int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, @@ -196,9 +196,9 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL) const __device int* rIdx, int rows, const VALUE_TYPE *x, - int xPitch, + int xPitch, VALUE_TYPE beta, - int baseIndex) + int baseIndex) { VALUE_TYPE *px,*py, *pz; int cnt; @@ -270,6 +270,5 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL) px, xPitch, beta, baseIndex); } - cudaCheckError("CUDA error on hell_spmm"); } diff --git a/cuda/spgpu/kernels/hell_spmm_base_template.cuh b/cuda/spgpu/kernels/hell_spmm_base_template.cuh index 1d085875..4ee4ed07 100644 --- a/cuda/spgpu/kernels/hell_spmm_base_template.cuh +++ b/cuda/spgpu/kernels/hell_spmm_base_template.cuh @@ -71,10 +71,9 @@ CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) px = (VALUE_TYPE *) x; for (int k=0; k