SpMM HDIAG working

cuda-multivect
gabrielequatrana 9 months ago
parent bdd04a6911
commit 4ff0f112a9

@@ -104,33 +104,34 @@ module psb_c_base_mat_mod
    !
    ! Computational methods: defined here but not implemented.
    !
    procedure, pass(a) :: vect_mv => psb_c_base_vect_mv
+   procedure, pass(a) :: multivect_mv => psb_c_base_multivect_mv
    procedure, pass(a) :: csmv => psb_c_base_csmv
    procedure, pass(a) :: csmm => psb_c_base_csmm
-   generic, public :: spmm => csmm, csmv, vect_mv
+   generic, public :: spmm => csmm, csmv, vect_mv, multivect_mv
    procedure, pass(a) :: in_vect_sv => psb_c_base_inner_vect_sv
    procedure, pass(a) :: inner_cssv => psb_c_base_inner_cssv
    procedure, pass(a) :: inner_cssm => psb_c_base_inner_cssm
    generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
    procedure, pass(a) :: vect_cssv => psb_c_base_vect_cssv
    procedure, pass(a) :: cssv => psb_c_base_cssv
    procedure, pass(a) :: cssm => psb_c_base_cssm
    generic, public :: spsm => cssm, cssv, vect_cssv
    procedure, pass(a) :: scals => psb_c_base_scals
    procedure, pass(a) :: scalv => psb_c_base_scal
    generic, public :: scal => scals, scalv
    procedure, pass(a) :: maxval => psb_c_base_maxval
    procedure, pass(a) :: spnmi => psb_c_base_csnmi
    procedure, pass(a) :: spnm1 => psb_c_base_csnm1
    procedure, pass(a) :: rowsum => psb_c_base_rowsum
    procedure, pass(a) :: arwsum => psb_c_base_arwsum
    procedure, pass(a) :: colsum => psb_c_base_colsum
    procedure, pass(a) :: aclsum => psb_c_base_aclsum
    procedure, pass(a) :: scalpid => psb_c_base_scalplusidentity
    procedure, pass(a) :: spaxpby => psb_c_base_spaxpby
    procedure, pass(a) :: cmpval => psb_c_base_cmpval
    procedure, pass(a) :: cmpmat => psb_c_base_cmpmat
    generic, public :: spcmp => cmpval, cmpmat
  end type psb_c_base_sparse_mat
  private :: c_base_mat_sync, c_base_mat_is_host, c_base_mat_is_dev, &
@@ -1240,6 +1241,42 @@ module psb_c_base_mat_mod
    end subroutine psb_c_base_vect_mv
  end interface
!> Function multivect_mv:
!! \memberof psb_c_base_sparse_mat
!! \brief Product by an encapsulated array type(psb_c_multivect_type)
!!
!! Compute
!! Y = alpha*op(A)*X + beta*Y
!! Usually the unwrapping of the encapsulated multivector is done
!! here, so that all the derived classes need only the
!! versions with the standard arrays.
!! Must be overridden explicitly in case of non standard memory
!! management; an example would be external memory allocation
!! in attached processors such as GPUs.
!!
!!
!! \param alpha Scaling factor for Ax
!! \param A the input sparse matrix
!! \param x the input X
!! \param beta Scaling factor for y
!! \param y the input/output Y
!! \param info return code
!! \param trans [N] Whether to use A (N), its transpose (T)
!! or its conjugate transpose (C)
!!
!
interface
subroutine psb_c_base_multivect_mv(alpha,a,x,beta,y,info,trans)
import
class(psb_c_base_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_base_multivect_mv
end interface
  !
  !> Function cssm:
  !! \memberof psb_c_base_sparse_mat
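For reference, the operation documented above, written entrywise for a multivector X with n_rhs columns (n_rhs is just notation for the number of right-hand sides; op(A) is A, its transpose, or its conjugate transpose according to trans):

\[ y_{ij} \leftarrow \alpha \sum_{k} \mathrm{op}(A)_{ik}\, x_{kj} + \beta\, y_{ij}, \qquad j = 1,\dots,n_{\mathrm{rhs}} \]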

@@ -104,33 +104,34 @@ module psb_s_base_mat_mod
    !
    ! Computational methods: defined here but not implemented.
    !
    procedure, pass(a) :: vect_mv => psb_s_base_vect_mv
+   procedure, pass(a) :: multivect_mv => psb_s_base_multivect_mv
    procedure, pass(a) :: csmv => psb_s_base_csmv
    procedure, pass(a) :: csmm => psb_s_base_csmm
-   generic, public :: spmm => csmm, csmv, vect_mv
+   generic, public :: spmm => csmm, csmv, vect_mv, multivect_mv
    procedure, pass(a) :: in_vect_sv => psb_s_base_inner_vect_sv
    procedure, pass(a) :: inner_cssv => psb_s_base_inner_cssv
    procedure, pass(a) :: inner_cssm => psb_s_base_inner_cssm
    generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
    procedure, pass(a) :: vect_cssv => psb_s_base_vect_cssv
    procedure, pass(a) :: cssv => psb_s_base_cssv
    procedure, pass(a) :: cssm => psb_s_base_cssm
    generic, public :: spsm => cssm, cssv, vect_cssv
    procedure, pass(a) :: scals => psb_s_base_scals
    procedure, pass(a) :: scalv => psb_s_base_scal
    generic, public :: scal => scals, scalv
    procedure, pass(a) :: maxval => psb_s_base_maxval
    procedure, pass(a) :: spnmi => psb_s_base_csnmi
    procedure, pass(a) :: spnm1 => psb_s_base_csnm1
    procedure, pass(a) :: rowsum => psb_s_base_rowsum
    procedure, pass(a) :: arwsum => psb_s_base_arwsum
    procedure, pass(a) :: colsum => psb_s_base_colsum
    procedure, pass(a) :: aclsum => psb_s_base_aclsum
    procedure, pass(a) :: scalpid => psb_s_base_scalplusidentity
    procedure, pass(a) :: spaxpby => psb_s_base_spaxpby
    procedure, pass(a) :: cmpval => psb_s_base_cmpval
    procedure, pass(a) :: cmpmat => psb_s_base_cmpmat
    generic, public :: spcmp => cmpval, cmpmat
  end type psb_s_base_sparse_mat
  private :: s_base_mat_sync, s_base_mat_is_host, s_base_mat_is_dev, &
@@ -1240,6 +1241,42 @@ module psb_s_base_mat_mod
    end subroutine psb_s_base_vect_mv
  end interface
!> Function multivect_mv:
!! \memberof psb_s_base_sparse_mat
!! \brief Product by an encapsulated array type(psb_s_multivect_type)
!!
!! Compute
!! Y = alpha*op(A)*X + beta*Y
!! Usually the unwrapping of the encapsulated multivector is done
!! here, so that all the derived classes need only the
!! versions with the standard arrays.
!! Must be overridden explicitly in case of non standard memory
!! management; an example would be external memory allocation
!! in attached processors such as GPUs.
!!
!!
!! \param alpha Scaling factor for Ax
!! \param A the input sparse matrix
!! \param x the input X
!! \param beta Scaling factor for y
!! \param y the input/output Y
!! \param info return code
!! \param trans [N] Whether to use A (N), its transpose (T)
!! or its conjugate transpose (C)
!!
!
interface
subroutine psb_s_base_multivect_mv(alpha,a,x,beta,y,info,trans)
import
class(psb_s_base_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_base_multivect_mv
end interface
  !
  !> Function cssm:
  !! \memberof psb_s_base_sparse_mat

@@ -104,33 +104,34 @@ module psb_z_base_mat_mod
    !
    ! Computational methods: defined here but not implemented.
    !
    procedure, pass(a) :: vect_mv => psb_z_base_vect_mv
+   procedure, pass(a) :: multivect_mv => psb_z_base_multivect_mv
    procedure, pass(a) :: csmv => psb_z_base_csmv
    procedure, pass(a) :: csmm => psb_z_base_csmm
-   generic, public :: spmm => csmm, csmv, vect_mv
+   generic, public :: spmm => csmm, csmv, vect_mv, multivect_mv
    procedure, pass(a) :: in_vect_sv => psb_z_base_inner_vect_sv
    procedure, pass(a) :: inner_cssv => psb_z_base_inner_cssv
    procedure, pass(a) :: inner_cssm => psb_z_base_inner_cssm
    generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
    procedure, pass(a) :: vect_cssv => psb_z_base_vect_cssv
    procedure, pass(a) :: cssv => psb_z_base_cssv
    procedure, pass(a) :: cssm => psb_z_base_cssm
    generic, public :: spsm => cssm, cssv, vect_cssv
    procedure, pass(a) :: scals => psb_z_base_scals
    procedure, pass(a) :: scalv => psb_z_base_scal
    generic, public :: scal => scals, scalv
    procedure, pass(a) :: maxval => psb_z_base_maxval
    procedure, pass(a) :: spnmi => psb_z_base_csnmi
    procedure, pass(a) :: spnm1 => psb_z_base_csnm1
    procedure, pass(a) :: rowsum => psb_z_base_rowsum
    procedure, pass(a) :: arwsum => psb_z_base_arwsum
    procedure, pass(a) :: colsum => psb_z_base_colsum
    procedure, pass(a) :: aclsum => psb_z_base_aclsum
    procedure, pass(a) :: scalpid => psb_z_base_scalplusidentity
    procedure, pass(a) :: spaxpby => psb_z_base_spaxpby
    procedure, pass(a) :: cmpval => psb_z_base_cmpval
    procedure, pass(a) :: cmpmat => psb_z_base_cmpmat
    generic, public :: spcmp => cmpval, cmpmat
  end type psb_z_base_sparse_mat
  private :: z_base_mat_sync, z_base_mat_is_host, z_base_mat_is_dev, &
@@ -1240,6 +1241,42 @@ module psb_z_base_mat_mod
    end subroutine psb_z_base_vect_mv
  end interface
!> Function multivect_mv:
!! \memberof psb_z_base_sparse_mat
!! \brief Product by an encapsulated array type(psb_z_multivect_type)
!!
!! Compute
!! Y = alpha*op(A)*X + beta*Y
!! Usually the unwrapping of the encapsulated multivector is done
!! here, so that all the derived classes need only the
!! versions with the standard arrays.
!! Must be overridden explicitly in case of non standard memory
!! management; an example would be external memory allocation
!! in attached processors such as GPUs.
!!
!!
!! \param alpha Scaling factor for Ax
!! \param A the input sparse matrix
!! \param x the input X
!! \param beta Scaling factor for y
!! \param y the input/output Y
!! \param info return code
!! \param trans [N] Whether to use A (N), its transpose (T)
!! or its conjugate transpose (C)
!!
!
interface
subroutine psb_z_base_multivect_mv(alpha,a,x,beta,y,info,trans)
import
class(psb_z_base_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_base_multivect_mv
end interface
  !
  !> Function cssm:
  !! \memberof psb_z_base_sparse_mat

@@ -2012,6 +2012,26 @@ subroutine psb_c_base_vect_mv(alpha,a,x,beta,y,info,trans)
  call y%set_host()
end subroutine psb_c_base_vect_mv
subroutine psb_c_base_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_error_mod
use psb_const_mod
use psb_c_base_mat_mod, psb_protect_name => psb_c_base_multivect_mv
implicit none
class(psb_c_base_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
! For the time being we just throw everything back
! onto the normal routines.
call x%sync()
call y%sync()
call a%spmm(alpha,x%v,beta,y%v,info,trans)
call y%set_host()
end subroutine psb_c_base_multivect_mv
subroutine psb_c_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d)
  use psb_c_base_mat_mod, psb_protect_name => psb_c_base_vect_cssv
  use psb_c_base_vect_mod

@@ -2012,6 +2012,26 @@ subroutine psb_s_base_vect_mv(alpha,a,x,beta,y,info,trans)
  call y%set_host()
end subroutine psb_s_base_vect_mv
subroutine psb_s_base_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_error_mod
use psb_const_mod
use psb_s_base_mat_mod, psb_protect_name => psb_s_base_multivect_mv
implicit none
class(psb_s_base_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
! For the time being we just throw everything back
! onto the normal routines.
call x%sync()
call y%sync()
call a%spmm(alpha,x%v,beta,y%v,info,trans)
call y%set_host()
end subroutine psb_s_base_multivect_mv
subroutine psb_s_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d)
  use psb_s_base_mat_mod, psb_protect_name => psb_s_base_vect_cssv
  use psb_s_base_vect_mod

@@ -2012,6 +2012,26 @@ subroutine psb_z_base_vect_mv(alpha,a,x,beta,y,info,trans)
  call y%set_host()
end subroutine psb_z_base_vect_mv
subroutine psb_z_base_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_error_mod
use psb_const_mod
use psb_z_base_mat_mod, psb_protect_name => psb_z_base_multivect_mv
implicit none
class(psb_z_base_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
! For the time being we just throw everything back
! onto the normal routines.
call x%sync()
call y%sync()
call a%spmm(alpha,x%v,beta,y%v,info,trans)
call y%set_host()
end subroutine psb_z_base_multivect_mv
subroutine psb_z_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d)
  use psb_z_base_mat_mod, psb_protect_name => psb_z_base_vect_cssv
  use psb_z_base_vect_mod

@@ -267,6 +267,27 @@ int spmvHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX,
  return SPGPU_SUCCESS;
}
int spmmHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY)
{
struct HdiagDevice *devMat = (struct HdiagDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuDhdiaspmm (handle, y->count_, (double*)y->v_, y->pitch_,
(double *)y->v_, y->pitch_, alpha, (double *)devMat->cM,
devMat->hdiaOffsets, devMat->hackSize, devMat->hackOffsets,
devMat->rows, devMat->cols, (double *)x->v_, x->pitch_, beta);
return SPGPU_SUCCESS;
}
int writeHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets)
{ int i=0,fo,fa,j,k,p;
  char buf_a[255], buf_o[255],tmp[255];
@@ -384,3 +405,23 @@ int spmvHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX,
  return SPGPU_SUCCESS;
}
int spmmHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY)
{
struct HdiagDevice *devMat = (struct HdiagDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuShdiaspmm (handle, y->count_, (float*)y->v_, y->pitch_,
(float *)y->v_, y->pitch_, alpha, (float *)devMat->cM,
devMat->hdiaOffsets, devMat->hackSize, devMat->hackOffsets,
devMat->rows, devMat->cols, (float *)x->v_, x->pitch_, beta);
return SPGPU_SUCCESS;
}

@@ -95,12 +95,13 @@ int allocHdiagDevice(void ** remoteMatrix, HdiagDeviceParams* params);
void freeHdiagDevice(void* remoteMatrix);
int writeHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets);
int spmvHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX,
                         float beta, void* deviceY);
int writeHdiagDeviceDouble(void* deviceMat, double* val, int* hdiaOffsets, int *hackOffsets);
int spmvHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX,
                          double beta, void* deviceY);
//int writeHdiagDeviceFloatComplex(void* deviceMat, float complex* val, int* hdiaOffsets, int *hackOffsets);
//int writeHdiagDeviceDoubleComplex(void* deviceMat, double complex* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceDouble(void* deviceMat, double* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceFloatComplex(void* deviceMat, float complex* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceDoubleComplex(void* deviceMat, double complex* val, int* hdiaOffsets, int *hackOffsets);
#endif

@@ -196,4 +196,35 @@ module hdiagdev_mod
!!$  end function spmvHdiagDeviceDoubleComplex
  end interface spmvHdiagDevice
interface spmmHdiagDevice
function spmmHdiagDeviceFloat(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHdiagDeviceFloat')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_float),value :: alpha, beta
end function spmmHdiagDeviceFloat
function spmmHdiagDeviceDouble(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHdiagDeviceDouble')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_double),value :: alpha, beta
end function spmmHdiagDeviceDouble
!!$ function spmmHdiagDeviceFloatComplex(deviceMat,alpha,x,beta,y) &
!!$ & result(res) bind(c,name='spmmHdiagDeviceFloatComplex')
!!$ use iso_c_binding
!!$ integer(c_int) :: res
!!$ type(c_ptr), value :: deviceMat, x, y
!!$ complex(c_float_complex),value :: alpha, beta
!!$ end function spmmHdiagDeviceFloatComplex
!!$ function spmmHdiagDeviceDoubleComplex(deviceMat,alpha,x,beta,y) &
!!$ & result(res) bind(c,name='spmmHdiagDeviceDoubleComplex')
!!$ use iso_c_binding
!!$ integer(c_int) :: res
!!$ type(c_ptr), value :: deviceMat, x, y
!!$ complex(c_double_complex),value :: alpha, beta
!!$ end function spmmHdiagDeviceDoubleComplex
end interface spmmHdiagDevice
end module hdiagdev_mod

@@ -288,13 +288,7 @@ psb_s_cuda_hdiag_vect_mv.o \
 psb_s_cuda_dnsg_mat_impl.o \
 psb_d_cuda_dnsg_mat_impl.o \
 psb_c_cuda_dnsg_mat_impl.o \
-psb_z_cuda_dnsg_mat_impl.o \
-psb_z_cuda_hdiag_csmv.o \
-psb_z_cuda_hdiag_csmm.o \
-psb_z_cuda_hdiag_vect_mv.o \
-psb_c_cuda_hdiag_csmv.o \
-psb_c_cuda_hdiag_csmm.o \
-psb_c_cuda_hdiag_vect_mv.o
+psb_z_cuda_dnsg_mat_impl.o
objs: $(OBJS)
lib: objs

@@ -124,7 +124,7 @@ subroutine psb_d_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
  use psb_d_cuda_hdiag_mat_mod, psb_protect_name => psb_d_cuda_hdiag_multivect_mv
  use psb_d_cuda_multivect_mod
  implicit none
- class(psb_d_cuda_hdiag_mat_mod), intent(in) :: a
+ class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a
  real(psb_dpk_), intent(in) :: alpha, beta
  class(psb_d_base_multivect_type), intent(inout) :: x
  class(psb_d_base_multivect_type), intent(inout) :: y

@@ -89,7 +89,7 @@ subroutine psb_s_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
  if (tra) then
    if (a%is_dev()) call a%sync()
-   call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
+   call a%psb_s_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
  else
    !
    ! Just to test, move X/Y to/from the GPU.

@@ -124,7 +124,7 @@ subroutine psb_s_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
  use psb_s_cuda_hdiag_mat_mod, psb_protect_name => psb_s_cuda_hdiag_multivect_mv
  use psb_s_cuda_multivect_mod
  implicit none
- class(psb_s_cuda_hdiag_mat_mod), intent(in) :: a
+ class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a
  real(psb_spk_), intent(in) :: alpha, beta
  class(psb_s_base_multivect_type), intent(inout) :: x
  class(psb_s_base_multivect_type), intent(inout) :: y
@@ -159,7 +159,7 @@ subroutine psb_s_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
    if (beta /= dzero) then
      if (.not.y%is_host()) call y%sync()
    end if
-   call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
+   call a%psb_s_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
    call y%set_host()
  else
    if (a%is_host()) call a%sync()

@@ -27,6 +27,10 @@
extern "C" {
#endif
// DIA/HDIA Compressed Matrix Format routines
/// This is the pitch alignment that must be fullfilled by the coefficients and the row pointers allocations.
#define DIA_PITCH_ALIGN_BYTE 128
/**
 * \fn spgpuShdiaspmv (spgpuHandle_t handle, float* z, const float *y, float alpha, const float* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const float *x, float beta)
@@ -59,6 +63,44 @@ spgpuShdiaspmv (spgpuHandle_t handle,
  float beta);
/**
* \fn spgpuShdiaspmm (int count, float *z, int zpitch, const float *y, int ypitch, float alpha, const __device float* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device float *x, int xpitch, float beta)
* Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuShdiaspmm (spgpuHandle_t handle,
int count,
__device float *z,
int zpitch,
const __device float *y,
int ypitch,
float alpha,
const __device float* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device float *x,
int xpitch,
float beta);
/**
 * \fn spgpuDhdiaspmv (spgpuHandle_t handle, double* z, const double *y, double alpha, const double* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const double *x, double beta)
 * Computes double precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
@@ -89,6 +131,43 @@ spgpuDhdiaspmv (spgpuHandle_t handle,
  const double *x,
  double beta);
/**
* \fn spgpuDhdiaspmm (int count, double *z, int zpitch, const double *y, int ypitch, double alpha, const __device double* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device float *x, int xpitch, double beta)
* Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuDhdiaspmm (spgpuHandle_t handle,
int count,
__device double *z,
int zpitch,
const __device double *y,
int ypitch,
double alpha,
const __device double* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device double *x,
int xpitch,
double beta);
/**
 * \fn spgpuChdiaspmv (spgpuHandle_t handle, cuFloatComplex* z, const cuFloatComplex *y, cuFloatComplex alpha, const cuFloatComplex* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const cuFloatComplex *x, cuFloatComplex beta)
@@ -120,6 +199,43 @@ spgpuChdiaspmv (spgpuHandle_t handle,
  const cuFloatComplex *x,
  cuFloatComplex beta);
/**
* \fn spgpuChdiaspmm (int count, cuFloatComplex *z, int zpitch, const cuFloatComplex *y, int ypitch, cuFloatComplex alpha, const __device cuFloatComplex* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device cuFloatComplex *x, int xpitch, cuFloatComplex beta)
* Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuChdiaspmm (spgpuHandle_t handle,
int count,
__device cuFloatComplex *z,
int zpitch,
const __device cuFloatComplex *y,
int ypitch,
cuFloatComplex alpha,
const __device cuFloatComplex* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device cuFloatComplex *x,
int xpitch,
cuFloatComplex beta);
/**
 * \fn spgpuZhdiaspmv (spgpuHandle_t handle, cuDoubleComplex* z, const cuDoubleComplex *y, cuDoubleComplex alpha, const cuDoubleComplex* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const cuDoubleComplex *x, cuDoubleComplex beta)
@@ -150,6 +266,45 @@ spgpuZhdiaspmv (spgpuHandle_t handle,
  int cols,
  const cuDoubleComplex *x,
  cuDoubleComplex beta);
/**
* \fn spgpuZhdiaspmm (int count, cuDoubleComplex *z, int zpitch, const cuDoubleComplex *y, int ypitch, cuDoubleComplex alpha, const __device cuDoubleComplex* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device cuDoubleComplex *x, int xpitch, cuDoubleComplex beta)
* Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuZhdiaspmm (spgpuHandle_t handle,
int count,
__device cuDoubleComplex *z,
int zpitch,
const __device cuDoubleComplex *y,
int ypitch,
cuDoubleComplex alpha,
const __device cuDoubleComplex* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device cuDoubleComplex *x,
int xpitch,
cuDoubleComplex beta);
/** @}*/
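As a usage illustration of the new spmm entry points declared above (a sketch only, not part of the commit): the call below assumes the HDIA data and the right-hand sides are already resident on the device and stored column-major with a leading dimension equal to their pitch; hdia_spmm_example is a hypothetical helper name, and the spgpuHandle_t is assumed to come from the surrounding library (the PSBLAS bridge in hdiagdev.c uses psb_cudaGetHandle()).

#include "hdia.h"   /* assumed include path for the prototypes above */

/* Hypothetical sketch: multiply ncols right-hand sides by an HDIA matrix
   already on the device, updating d_y in place (z and y may alias, as in
   spmmHdiagDeviceDouble). */
void hdia_spmm_example(spgpuHandle_t handle,
                       const double *d_val, const int *d_offsets,
                       int hackSize, const int *d_hackOffsets,
                       int rows, int cols,
                       const double *d_x, int xpitch,
                       double *d_y, int ypitch, int ncols)
{
  double alpha = 1.0, beta = 1.0;
  spgpuDhdiaspmm(handle, ncols,
                 d_y, ypitch,            /* z, zpitch: output columns        */
                 d_y, ypitch,            /* y, ypitch: input term beta*y     */
                 alpha,
                 d_val, d_offsets,       /* stacked HDIA values and offsets  */
                 hackSize, d_hackOffsets,
                 rows, cols,
                 d_x, xpitch,            /* input columns                    */
                 beta);
}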

@@ -99,6 +99,86 @@ fetchTex (int pointer)
#undef GEN_SPGPU_ELL_NAME
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm)
#if 0
void
GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* rP,
int cMPitch,
int rPPitch,
const __device int* rS,
const __device int* rIdx,
int avgNnzPerRow,
int maxNnzPerRow,
int rows,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta,
int baseIndex)
{
VALUE_TYPE *px,*py, *pz;
int cnt, c1;
dim3 block (THREAD_BLOCK, 1);
// dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
int numMp=getGPUMultiProcessors();
int maxThMp=getGPUMaxThreadsPerMP();
int nmblksMp=maxThMp/THREAD_BLOCK;
int nmblk=nmblksMp*numMp;
dim3 grid (nmblk);
maxShmemSz=getGPUSharedMemPerBlock();
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
if (shrMemSize > maxShmemSz) {
fprintf(stderr,"Fatal error: SHMEM size too large %ld %ld\n",shrMemSize,maxShmemSz);
return;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > 2*MMBSZ) {
CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch, py, yPitch,
alpha, cM, rP, cMPitch, rPPitch,
rS, rows, px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt > MMBSZ) {
c1 = cnt/2;
CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch, py, yPitch,
alpha, cM, rP, cMPitch, rPPitch,
rS, rows, px, xPitch, beta, baseIndex);
cnt -= c1;
}
if (cnt > MMBSZ) {
fprintf(stderr,"Invalid residual count %d\n",cnt);
} else if (cnt > 0){
CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch, py, yPitch,
alpha, cM, rP, cMPitch, rPPitch,
rS, rows, px, xPitch, beta, baseIndex);
}
cudaCheckError("CUDA error on ell_spmm");
}
#endif
void
GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
@@ -110,12 +190,12 @@ GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
 VALUE_TYPE alpha,
 const VALUE_TYPE* cM,
 const int* rP,
 int cMPitch,
 int rPPitch,
 const __device int* rS,
 const __device int* rIdx,
 int avgNnzPerRow,
 int maxNnzPerRow,
 int rows,
 const VALUE_TYPE *x,
 int xPitch,
@@ -193,4 +273,4 @@ GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
  }
  cudaCheckError("CUDA error on ell_spmm");
}

@@ -17,6 +17,75 @@
#define THREAD_BLOCK 128
#define MMBSZ 8
#if 0
__global__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
int cMPitch, int rPPitch, const int* rS, int rows,
const VALUE_TYPE *x, int xPitch,
VALUE_TYPE beta, int baseIndex)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int gridSize = gridDim.x * blockDim.x;
while (i < rows) {
rS += i; rP += i; cM += i;
int rowSize = rS[i];
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
for (int j = 0; j < rowSize; j++) {
int pointer;
VALUE_TYPE value;
VALUE_TYPE fetch;
pointer = rP[0] - baseIndex;
rP += rPPitch;
value = cM[0];
cM += cMPitch;
px = (VALUE_TYPE *) x;
for (int k=0; k<count; k++) {
fetch = px[pointer];
temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
} else {
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
i += gridSize;
}
}
#endif
__global__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
@@ -53,9 +122,9 @@ CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
      px = (VALUE_TYPE *) x;
      for (int k=0; k<count; k++) {
        fetch = px[pointer];
        temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
        px = px + xPitch;
      }
    }
@@ -97,4 +166,4 @@ CONCAT(_,GEN_SPGPU_ELL_NAME(TYPE_SYMBOL))
    <<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch, y, yPitch,
      alpha, cM, rP, cMPitch, rPPitch, rS, rows,
      x, xPitch, beta, baseIndex);
}

@@ -74,27 +74,115 @@ fetchTex (int pointer)
  return CONCAT(readValue_,VALUE_TYPE) (fetch);
}
#endif
- #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_vanilla)
+ #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_vanilla)
- #define GEN_SPGPU_HDIA_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_vanilla)
+ #define GEN_SPGPU_HDIA_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_vanilla)
- #include "hdia_spmv_base_template.cuh"
+ #include "hdia_spmm_base_template.cuh"
#if 0
#undef GEN_SPGPU_HDIA_NAME
- #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_prefetch)
+ #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_prefetch)
- #define GEN_SPGPU_HDIA_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_prefetch)
+ #define GEN_SPGPU_HDIA_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_prefetch)
#undef USE_PREFETCHING
#define USE_PREFETCHING
- #include "hdia_spmv_base_template.cuh"
+ #include "hdia_spmm_base_template.cuh"
#define ENABLE_CACHE
#undef GEN_SPGPU_HDIA_NAME
- #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache_prefetch)
+ #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache_prefetch)
- #define GEN_SPGPU_HDIA_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache_prefetch)
+ #define GEN_SPGPU_HDIA_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache_prefetch)
- #include "hdia_spmv_base_template.cuh"
+ #include "hdia_spmm_base_template.cuh"
#undef GEN_SPGPU_HDIA_NAME
#undef USE_PREFETCHING
- #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache)
+ #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache)
- #define GEN_SPGPU_HDIA_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache)
+ #define GEN_SPGPU_HDIA_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache)
- #include "hdia_spmv_base_template.cuh"
+ #include "hdia_spmm_base_template.cuh"
#endif
#undef GEN_SPGPU_HDIA_NAME
- #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv)
+ #define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm)
void
GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* hdiaOffsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta)
{
VALUE_TYPE *px,*py, *pz;
int cnt;
int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
// maxNForACall should be a multiple of hackSize
maxNForACall = (maxNForACall/hackSize)*hackSize;
int maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
//fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz);
while (rows > maxNForACall) {//managing large vectors
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
maxNForACall, cols,
px, xPitch, beta);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
maxNForACall, cols,
px, xPitch, beta);
}
y = y + maxNForACall;
z = z + maxNForACall;
hackOffsets = hackOffsets + maxNForACall/hackSize;
rows -= maxNForACall;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
rows, cols,
px, xPitch, beta);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
rows, cols,
px, xPitch, beta);
}
cudaCheckError("CUDA error on hdiag_spmm");
}

@@ -15,4 +15,127 @@
 */
#define THREAD_BLOCK 128
#define MMBSZ 8
__global__ void
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y,
int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* hdiaOffsets,
int hackSize, const int* hackOffsets, int rows, int cols,
const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
int hackCount = (rows + hackSize - 1)/hackSize;
int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
int hackId = i / hackSize;
int hackLaneId = i % hackSize;
// shared between offsetsChunks and warpHackOffsetTemp
extern __shared__ int dynShrMem[];
int hackOffset = 0;
int nextOffset = 0;
unsigned int laneId = threadIdx.x % warpSize;
unsigned int warpId = threadIdx.x / warpSize;
if (laneId == 0 && i < rows) {
hackOffset = hackOffsets[hackId];
nextOffset = hackOffsets[hackId+1];
}
hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0);
nextOffset = __shfl_sync(0xFFFFFFFF,nextOffset, 0);
if (hackId >= hackCount)
return;
cM += hackOffset*hackSize + hackLaneId;
hdiaOffsets += hackOffset;
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
// diags for this hack is next hackOffset minus current hackOffset
int diags = nextOffset - hackOffset;
// Warp oriented
int rounds = (diags + warpSize - 1)/warpSize;
volatile int *offsetsChunk = dynShrMem + warpId*warpSize;
for (int r = 0; r < rounds; r++) {
// in the last round diags will be <= warpSize
if (laneId < diags)
offsetsChunk[laneId] = hdiaOffsets[laneId];
if (i < rows) {
int dCount = min(diags, warpSize);
for (int j = 0; j < dCount; ++j) {
int column = offsetsChunk[j] + i;
if(column >= 0 && column < cols) {
px = (VALUE_TYPE *) x;
for (int k = 0; k < count; k++) {
VALUE_TYPE xValue = px[column];
VALUE_TYPE mValue = cM[0];
temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
cM += hackSize;
}
}
diags -= warpSize;
hdiaOffsets += warpSize;
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
if (i >= rows)
return;
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
else
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
void
CONCAT(_,GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL))
(spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y,
int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* hdiaOffsets, int hackSize,
const __device int* hackOffsets, int rows, int cols, const VALUE_TYPE *x,
int xPitch, VALUE_TYPE beta)
{
dim3 block (THREAD_BLOCK, 1);
dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
maxShmemSz = getGPUSharedMemPerBlock();
shrMemSize = MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch, y, yPitch,
alpha, cM, hdiaOffsets, hackSize, hackOffsets, rows, cols,
x, xPitch, beta);
}
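As a cross-check of the data layout used by the kernel and wrapper above, here is a plain host-side reference loop (an illustrative sketch, not part of the commit; hdia_spmm_reference is a hypothetical name) that computes the same z = alpha*A*x + beta*y for count pitched right-hand sides, assuming zero-based diagonal offsets as in the kernel:

/* Host-side reference for the HDIA spmm kernel above: x, y and z hold `count`
 * columns stored column-major with the given pitches (leading dimensions). */
static void hdia_spmm_reference(int count, double *z, int zPitch,
                                const double *y, int yPitch, double alpha,
                                const double *cM, const int *hdiaOffsets,
                                int hackSize, const int *hackOffsets,
                                int rows, int cols,
                                const double *x, int xPitch, double beta)
{
  for (int i = 0; i < rows; i++) {
    int hackId = i / hackSize;
    int lane   = i % hackSize;
    int first  = hackOffsets[hackId];      /* first diagonal of this hack  */
    int last   = hackOffsets[hackId + 1];  /* one past the last diagonal   */
    for (int k = 0; k < count; k++) {
      double acc = 0.0;
      for (int d = first; d < last; d++) {
        int column = hdiaOffsets[d] + i;   /* diagonal offset plus row     */
        if (column >= 0 && column < cols)
          acc += cM[d * hackSize + lane] * x[k * xPitch + column];
      }
      z[k * zPitch + i] = alpha * acc + beta * y[k * yPitch + i];
    }
  }
}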

@@ -106,11 +106,11 @@ extern __shared__ int dynShrMem[];
void
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
 int count,
 VALUE_TYPE* z,
 int zPitch,
 const VALUE_TYPE *y,
 int yPitch,
 VALUE_TYPE alpha,
 const VALUE_TYPE* cM,
 const int* rP,
@@ -120,9 +120,9 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
 const __device int* rIdx,
 int rows,
 const VALUE_TYPE *x,
 int xPitch,
 VALUE_TYPE beta,
 int baseIndex)
{
  VALUE_TYPE *px,*py, *pz;
  int cnt, c1;
@@ -149,7 +149,7 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
  py = (VALUE_TYPE *) y;
  pz = (VALUE_TYPE *) z;
  while (cnt > 2*MMBSZ) {
-   CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
+   CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
      <<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch,py, yPitch,
        alpha, cM, rP, hackSize, hackOffsets,
        rS, rows, px, xPitch, beta, baseIndex);
@@ -160,7 +160,7 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
  }
  if (cnt > MMBSZ) {
    c1 = cnt/2;
-   CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
+   CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
      <<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch,py, yPitch,
        alpha, cM, rP, hackSize, hackOffsets,
        rS, rows, px, xPitch, beta, baseIndex);
@@ -169,7 +169,7 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
  if (cnt > MMBSZ) {
    fprintf(stderr,"Invalid residual count %d\n",cnt);
  } else if (cnt > 0){
-   CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
+   CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
      <<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch,py, yPitch,
        alpha, cM, rP, hackSize, hackOffsets,
        rS, rows, px, xPitch, beta, baseIndex);
@@ -182,11 +182,11 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
void
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
 int count,
 VALUE_TYPE* z,
 int zPitch,
 const VALUE_TYPE *y,
 int yPitch,
 VALUE_TYPE alpha,
 const VALUE_TYPE* cM,
 const int* rP,
@@ -196,9 +196,9 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
 const __device int* rIdx,
 int rows,
 const VALUE_TYPE *x,
 int xPitch,
 VALUE_TYPE beta,
 int baseIndex)
{
  VALUE_TYPE *px,*py, *pz;
  int cnt;
@@ -270,6 +270,5 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
        px, xPitch, beta, baseIndex);
  }
  cudaCheckError("CUDA error on hell_spmm");
}

@@ -71,10 +71,9 @@ CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
      px = (VALUE_TYPE *) x;
      for (int k=0; k<count; k++) {
        fetch = px[pointer];
-       temp[k][threadIdx.x] =
-         CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
+       temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
        px = px + xPitch;
      }
    }
    // Since z and y are accessed with the same offset by the same thread,
@@ -83,16 +82,15 @@ CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
    pz = z;
    if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
      for (int k=0; k<count; k++) {
        yVal = py[i];
-       pz[i] = CONCAT(VALUE_TYPE, _fma)(beta,
-         yVal, CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]));
+       pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
        py += yPitch;
        pz += zPitch;
      }
    } else {
      for (int k=0; k<count; k++) {
-       pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]);
+       pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
        pz += zPitch;
      }
    }
@@ -150,10 +148,9 @@ CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
      px = (VALUE_TYPE *) x;
      for (int k=0; k<count; k++) {
        fetch = px[pointer];
-       temp[k][threadIdx.x] =
-         CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
+       temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
        px = px + xPitch;
      }
    }
    // Since z and y are accessed with the same offset by the same thread,
@@ -162,15 +159,15 @@ CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
    pz = z;
    if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
      for (int k=0; k<count; k++) {
        yVal = py[i];
-       pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]));
+       pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
        py += yPitch;
        pz += zPitch;
      }
    else
      for (int k=0; k<count; k++) {
-       pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]);
+       pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
        pz += zPitch;
      }
  }
}
