Merge remote-tracking branch 'origin/cuda-multivect' into psblas-bgmres

gabrielequatrana 5 months ago
commit c08431d71e

@@ -35,6 +35,7 @@ module psb_c_base_mat_mod
use psb_base_mat_mod
use psb_c_base_vect_mod
use psb_c_base_multivect_mod
!> \namespace psb_base_mod \class psb_c_base_sparse_mat
@@ -103,33 +104,34 @@ module psb_c_base_mat_mod
!
! Computational methods: defined here but not implemented.
!
procedure, pass(a) :: vect_mv => psb_c_base_vect_mv
procedure, pass(a) :: csmv => psb_c_base_csmv
procedure, pass(a) :: csmm => psb_c_base_csmm
generic, public :: spmm => csmm, csmv, vect_mv
procedure, pass(a) :: in_vect_sv => psb_c_base_inner_vect_sv
procedure, pass(a) :: inner_cssv => psb_c_base_inner_cssv
procedure, pass(a) :: inner_cssm => psb_c_base_inner_cssm
generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
procedure, pass(a) :: vect_cssv => psb_c_base_vect_cssv
procedure, pass(a) :: cssv => psb_c_base_cssv
procedure, pass(a) :: cssm => psb_c_base_cssm
generic, public :: spsm => cssm, cssv, vect_cssv
procedure, pass(a) :: scals => psb_c_base_scals
procedure, pass(a) :: scalv => psb_c_base_scal
generic, public :: scal => scals, scalv
procedure, pass(a) :: maxval => psb_c_base_maxval
procedure, pass(a) :: spnmi => psb_c_base_csnmi
procedure, pass(a) :: spnm1 => psb_c_base_csnm1
procedure, pass(a) :: rowsum => psb_c_base_rowsum
procedure, pass(a) :: arwsum => psb_c_base_arwsum
procedure, pass(a) :: colsum => psb_c_base_colsum
procedure, pass(a) :: aclsum => psb_c_base_aclsum
procedure, pass(a) :: scalpid => psb_c_base_scalplusidentity
procedure, pass(a) :: spaxpby => psb_c_base_spaxpby
procedure, pass(a) :: cmpval => psb_c_base_cmpval
procedure, pass(a) :: cmpmat => psb_c_base_cmpmat
generic, public :: spcmp => cmpval, cmpmat
procedure, pass(a) :: vect_mv => psb_c_base_vect_mv
procedure, pass(a) :: multivect_mv => psb_c_base_multivect_mv
procedure, pass(a) :: csmv => psb_c_base_csmv
procedure, pass(a) :: csmm => psb_c_base_csmm
generic, public :: spmm => csmm, csmv, vect_mv, multivect_mv
procedure, pass(a) :: in_vect_sv => psb_c_base_inner_vect_sv
procedure, pass(a) :: inner_cssv => psb_c_base_inner_cssv
procedure, pass(a) :: inner_cssm => psb_c_base_inner_cssm
generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
procedure, pass(a) :: vect_cssv => psb_c_base_vect_cssv
procedure, pass(a) :: cssv => psb_c_base_cssv
procedure, pass(a) :: cssm => psb_c_base_cssm
generic, public :: spsm => cssm, cssv, vect_cssv
procedure, pass(a) :: scals => psb_c_base_scals
procedure, pass(a) :: scalv => psb_c_base_scal
generic, public :: scal => scals, scalv
procedure, pass(a) :: maxval => psb_c_base_maxval
procedure, pass(a) :: spnmi => psb_c_base_csnmi
procedure, pass(a) :: spnm1 => psb_c_base_csnm1
procedure, pass(a) :: rowsum => psb_c_base_rowsum
procedure, pass(a) :: arwsum => psb_c_base_arwsum
procedure, pass(a) :: colsum => psb_c_base_colsum
procedure, pass(a) :: aclsum => psb_c_base_aclsum
procedure, pass(a) :: scalpid => psb_c_base_scalplusidentity
procedure, pass(a) :: spaxpby => psb_c_base_spaxpby
procedure, pass(a) :: cmpval => psb_c_base_cmpval
procedure, pass(a) :: cmpmat => psb_c_base_cmpmat
generic, public :: spcmp => cmpval, cmpmat
end type psb_c_base_sparse_mat
private :: c_base_mat_sync, c_base_mat_is_host, c_base_mat_is_dev, &
@@ -1239,6 +1241,42 @@ module psb_c_base_mat_mod
end subroutine psb_c_base_vect_mv
end interface
!> Function multivect_mv:
!! \memberof psb_c_base_sparse_mat
!! \brief Product by an encapsulated array type(psb_c_multivect_type)
!!
!! Compute
!! Y = alpha*op(A)*X + beta*Y
!! Usually the unwrapping of the encapsulated multivector is done
!! here, so that derived classes need only provide the
!! versions working on standard arrays.
!! Must be overridden explicitly in case of non-standard memory
!! management, e.g. external memory allocation on attached
!! processors such as GPUs.
!!
!! \param alpha Scaling factor for Ax
!! \param A the input sparse matrix
!! \param x the input X
!! \param beta Scaling factor for y
!! \param y the input/output Y
!! \param info return code
!! \param trans [N] Whether to use A (N), its transpose (T)
!! or its conjugate transpose (C)
!!
!
interface
subroutine psb_c_base_multivect_mv(alpha,a,x,beta,y,info,trans)
import
class(psb_c_base_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_base_multivect_mv
end interface
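A minimal usage sketch of the new generic (illustrative only, not part of this changeset; demo_multivect_spmm, amat, x, y are hypothetical names, and all objects are assumed already assembled):
subroutine demo_multivect_spmm(amat,x,y,info)
  use psb_base_mod
  implicit none
  class(psb_c_base_sparse_mat), intent(in) :: amat
  class(psb_c_base_multivect_type), intent(inout) :: x, y
  integer(psb_ipk_), intent(out) :: info
  ! Y = 1*A*X + 0*Y across all columns of X in a single call;
  ! the spmm generic resolves to multivect_mv from the argument types.
  call amat%spmm(cone,x,czero,y,info,trans='N')
end subroutine demo_multivect_spmm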
!
!> Function cssm:
!! \memberof psb_c_base_sparse_mat

@@ -35,6 +35,7 @@ module psb_s_base_mat_mod
use psb_base_mat_mod
use psb_s_base_vect_mod
use psb_s_base_multivect_mod
!> \namespace psb_base_mod \class psb_s_base_sparse_mat
@@ -103,33 +104,34 @@ module psb_s_base_mat_mod
!
! Computational methods: defined here but not implemented.
!
procedure, pass(a) :: vect_mv => psb_s_base_vect_mv
procedure, pass(a) :: csmv => psb_s_base_csmv
procedure, pass(a) :: csmm => psb_s_base_csmm
generic, public :: spmm => csmm, csmv, vect_mv
procedure, pass(a) :: in_vect_sv => psb_s_base_inner_vect_sv
procedure, pass(a) :: inner_cssv => psb_s_base_inner_cssv
procedure, pass(a) :: inner_cssm => psb_s_base_inner_cssm
generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
procedure, pass(a) :: vect_cssv => psb_s_base_vect_cssv
procedure, pass(a) :: cssv => psb_s_base_cssv
procedure, pass(a) :: cssm => psb_s_base_cssm
generic, public :: spsm => cssm, cssv, vect_cssv
procedure, pass(a) :: scals => psb_s_base_scals
procedure, pass(a) :: scalv => psb_s_base_scal
generic, public :: scal => scals, scalv
procedure, pass(a) :: maxval => psb_s_base_maxval
procedure, pass(a) :: spnmi => psb_s_base_csnmi
procedure, pass(a) :: spnm1 => psb_s_base_csnm1
procedure, pass(a) :: rowsum => psb_s_base_rowsum
procedure, pass(a) :: arwsum => psb_s_base_arwsum
procedure, pass(a) :: colsum => psb_s_base_colsum
procedure, pass(a) :: aclsum => psb_s_base_aclsum
procedure, pass(a) :: scalpid => psb_s_base_scalplusidentity
procedure, pass(a) :: spaxpby => psb_s_base_spaxpby
procedure, pass(a) :: cmpval => psb_s_base_cmpval
procedure, pass(a) :: cmpmat => psb_s_base_cmpmat
generic, public :: spcmp => cmpval, cmpmat
procedure, pass(a) :: vect_mv => psb_s_base_vect_mv
procedure, pass(a) :: multivect_mv => psb_s_base_multivect_mv
procedure, pass(a) :: csmv => psb_s_base_csmv
procedure, pass(a) :: csmm => psb_s_base_csmm
generic, public :: spmm => csmm, csmv, vect_mv, multivect_mv
procedure, pass(a) :: in_vect_sv => psb_s_base_inner_vect_sv
procedure, pass(a) :: inner_cssv => psb_s_base_inner_cssv
procedure, pass(a) :: inner_cssm => psb_s_base_inner_cssm
generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
procedure, pass(a) :: vect_cssv => psb_s_base_vect_cssv
procedure, pass(a) :: cssv => psb_s_base_cssv
procedure, pass(a) :: cssm => psb_s_base_cssm
generic, public :: spsm => cssm, cssv, vect_cssv
procedure, pass(a) :: scals => psb_s_base_scals
procedure, pass(a) :: scalv => psb_s_base_scal
generic, public :: scal => scals, scalv
procedure, pass(a) :: maxval => psb_s_base_maxval
procedure, pass(a) :: spnmi => psb_s_base_csnmi
procedure, pass(a) :: spnm1 => psb_s_base_csnm1
procedure, pass(a) :: rowsum => psb_s_base_rowsum
procedure, pass(a) :: arwsum => psb_s_base_arwsum
procedure, pass(a) :: colsum => psb_s_base_colsum
procedure, pass(a) :: aclsum => psb_s_base_aclsum
procedure, pass(a) :: scalpid => psb_s_base_scalplusidentity
procedure, pass(a) :: spaxpby => psb_s_base_spaxpby
procedure, pass(a) :: cmpval => psb_s_base_cmpval
procedure, pass(a) :: cmpmat => psb_s_base_cmpmat
generic, public :: spcmp => cmpval, cmpmat
end type psb_s_base_sparse_mat
private :: s_base_mat_sync, s_base_mat_is_host, s_base_mat_is_dev, &
@@ -1239,6 +1241,42 @@ module psb_s_base_mat_mod
end subroutine psb_s_base_vect_mv
end interface
!> Function multivect_mv:
!! \memberof psb_s_base_sparse_mat
!! \brief Product by an encapsulated array type(psb_s_multivect_type)
!!
!! Compute
!! Y = alpha*op(A)*X + beta*Y
!! Usually the unwrapping of the encapsulated multivector is done
!! here, so that derived classes need only provide the
!! versions working on standard arrays.
!! Must be overridden explicitly in case of non-standard memory
!! management, e.g. external memory allocation on attached
!! processors such as GPUs.
!!
!! \param alpha Scaling factor for Ax
!! \param A the input sparse matrix
!! \param x the input X
!! \param beta Scaling factor for y
!! \param y the input/output Y
!! \param info return code
!! \param trans [N] Whether to use A (N), its transpose (T)
!! or its conjugate transpose (C)
!!
!
interface
subroutine psb_s_base_multivect_mv(alpha,a,x,beta,y,info,trans)
import
class(psb_s_base_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_base_multivect_mv
end interface
!
!> Function cssm:
!! \memberof psb_s_base_sparse_mat

@@ -35,6 +35,7 @@ module psb_z_base_mat_mod
use psb_base_mat_mod
use psb_z_base_vect_mod
use psb_z_base_multivect_mod
!> \namespace psb_base_mod \class psb_z_base_sparse_mat
@@ -103,33 +104,34 @@ module psb_z_base_mat_mod
!
! Computational methods: defined here but not implemented.
!
procedure, pass(a) :: vect_mv => psb_z_base_vect_mv
procedure, pass(a) :: csmv => psb_z_base_csmv
procedure, pass(a) :: csmm => psb_z_base_csmm
generic, public :: spmm => csmm, csmv, vect_mv
procedure, pass(a) :: in_vect_sv => psb_z_base_inner_vect_sv
procedure, pass(a) :: inner_cssv => psb_z_base_inner_cssv
procedure, pass(a) :: inner_cssm => psb_z_base_inner_cssm
generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
procedure, pass(a) :: vect_cssv => psb_z_base_vect_cssv
procedure, pass(a) :: cssv => psb_z_base_cssv
procedure, pass(a) :: cssm => psb_z_base_cssm
generic, public :: spsm => cssm, cssv, vect_cssv
procedure, pass(a) :: scals => psb_z_base_scals
procedure, pass(a) :: scalv => psb_z_base_scal
generic, public :: scal => scals, scalv
procedure, pass(a) :: maxval => psb_z_base_maxval
procedure, pass(a) :: spnmi => psb_z_base_csnmi
procedure, pass(a) :: spnm1 => psb_z_base_csnm1
procedure, pass(a) :: rowsum => psb_z_base_rowsum
procedure, pass(a) :: arwsum => psb_z_base_arwsum
procedure, pass(a) :: colsum => psb_z_base_colsum
procedure, pass(a) :: aclsum => psb_z_base_aclsum
procedure, pass(a) :: scalpid => psb_z_base_scalplusidentity
procedure, pass(a) :: spaxpby => psb_z_base_spaxpby
procedure, pass(a) :: cmpval => psb_z_base_cmpval
procedure, pass(a) :: cmpmat => psb_z_base_cmpmat
generic, public :: spcmp => cmpval, cmpmat
procedure, pass(a) :: vect_mv => psb_z_base_vect_mv
procedure, pass(a) :: multivect_mv => psb_z_base_multivect_mv
procedure, pass(a) :: csmv => psb_z_base_csmv
procedure, pass(a) :: csmm => psb_z_base_csmm
generic, public :: spmm => csmm, csmv, vect_mv, multivect_mv
procedure, pass(a) :: in_vect_sv => psb_z_base_inner_vect_sv
procedure, pass(a) :: inner_cssv => psb_z_base_inner_cssv
procedure, pass(a) :: inner_cssm => psb_z_base_inner_cssm
generic, public :: inner_spsm => inner_cssm, inner_cssv, in_vect_sv
procedure, pass(a) :: vect_cssv => psb_z_base_vect_cssv
procedure, pass(a) :: cssv => psb_z_base_cssv
procedure, pass(a) :: cssm => psb_z_base_cssm
generic, public :: spsm => cssm, cssv, vect_cssv
procedure, pass(a) :: scals => psb_z_base_scals
procedure, pass(a) :: scalv => psb_z_base_scal
generic, public :: scal => scals, scalv
procedure, pass(a) :: maxval => psb_z_base_maxval
procedure, pass(a) :: spnmi => psb_z_base_csnmi
procedure, pass(a) :: spnm1 => psb_z_base_csnm1
procedure, pass(a) :: rowsum => psb_z_base_rowsum
procedure, pass(a) :: arwsum => psb_z_base_arwsum
procedure, pass(a) :: colsum => psb_z_base_colsum
procedure, pass(a) :: aclsum => psb_z_base_aclsum
procedure, pass(a) :: scalpid => psb_z_base_scalplusidentity
procedure, pass(a) :: spaxpby => psb_z_base_spaxpby
procedure, pass(a) :: cmpval => psb_z_base_cmpval
procedure, pass(a) :: cmpmat => psb_z_base_cmpmat
generic, public :: spcmp => cmpval, cmpmat
end type psb_z_base_sparse_mat
private :: z_base_mat_sync, z_base_mat_is_host, z_base_mat_is_dev, &
@@ -1239,6 +1241,42 @@ module psb_z_base_mat_mod
end subroutine psb_z_base_vect_mv
end interface
!> Function multivect_mv:
!! \memberof psb_z_base_sparse_mat
!! \brief Product by an encapsulated array type(psb_z_multivect_type)
!!
!! Compute
!! Y = alpha*op(A)*X + beta*Y
!! Usually the unwrapping of the encapsulated multivector is done
!! here, so that derived classes need only provide the
!! versions working on standard arrays.
!! Must be overridden explicitly in case of non-standard memory
!! management, e.g. external memory allocation on attached
!! processors such as GPUs.
!!
!! \param alpha Scaling factor for Ax
!! \param A the input sparse matrix
!! \param x the input X
!! \param beta Scaling factor for y
!! \param y the input/output Y
!! \param info return code
!! \param trans [N] Whether to use A (N), its transpose (T)
!! or its conjugate transpose (C)
!!
!
interface
subroutine psb_z_base_multivect_mv(alpha,a,x,beta,y,info,trans)
import
class(psb_z_base_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_base_multivect_mv
end interface
!
!> Function cssm:
!! \memberof psb_z_base_sparse_mat

@@ -2012,6 +2012,26 @@ subroutine psb_c_base_vect_mv(alpha,a,x,beta,y,info,trans)
call y%set_host()
end subroutine psb_c_base_vect_mv
subroutine psb_c_base_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_error_mod
use psb_const_mod
use psb_c_base_mat_mod, psb_protect_name => psb_c_base_multivect_mv
implicit none
class(psb_c_base_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
! For the time being we just throw everything back
! onto the normal routines.
call x%sync()
call y%sync()
call a%spmm(alpha,x%v,beta,y%v,info,trans)
call y%set_host()
end subroutine psb_c_base_multivect_mv
subroutine psb_c_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d)
use psb_c_base_mat_mod, psb_protect_name => psb_c_base_vect_cssv
use psb_c_base_vect_mod

@@ -2012,6 +2012,26 @@ subroutine psb_s_base_vect_mv(alpha,a,x,beta,y,info,trans)
call y%set_host()
end subroutine psb_s_base_vect_mv
subroutine psb_s_base_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_error_mod
use psb_const_mod
use psb_s_base_mat_mod, psb_protect_name => psb_s_base_multivect_mv
implicit none
class(psb_s_base_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
! For the time being we just throw everything back
! onto the normal routines.
call x%sync()
call y%sync()
call a%spmm(alpha,x%v,beta,y%v,info,trans)
call y%set_host()
end subroutine psb_s_base_multivect_mv
subroutine psb_s_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d)
use psb_s_base_mat_mod, psb_protect_name => psb_s_base_vect_cssv
use psb_s_base_vect_mod

@@ -2012,6 +2012,26 @@ subroutine psb_z_base_vect_mv(alpha,a,x,beta,y,info,trans)
call y%set_host()
end subroutine psb_z_base_vect_mv
subroutine psb_z_base_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_error_mod
use psb_const_mod
use psb_z_base_mat_mod, psb_protect_name => psb_z_base_multivect_mv
implicit none
class(psb_z_base_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
! For the time being we just throw everything back
! onto the normal routines.
call x%sync()
call y%sync()
call a%spmm(alpha,x%v,beta,y%v,info,trans)
call y%set_host()
end subroutine psb_z_base_multivect_mv
subroutine psb_z_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d)
use psb_z_base_mat_mod, psb_protect_name => psb_z_base_vect_cssv
use psb_z_base_vect_mod

configure

@@ -11710,7 +11710,7 @@ fi
CCOPT : ${CCOPT}
CUDA : ${HAVE_CUDA}
CUDA_CC : ${CUDA_CC}
CUDA_CC : ${pac_cv_cudacc}
BLAS : ${BLAS_LIBS}
@@ -11743,7 +11743,7 @@ $as_echo "$as_me:
CCOPT : ${CCOPT}
CUDA : ${HAVE_CUDA}
CUDA_CC : ${CUDA_CC}
CUDA_CC : ${pac_cv_cudacc}
BLAS : ${BLAS_LIBS}

@@ -970,7 +970,7 @@ AC_MSG_NOTICE([
CCOPT : ${CCOPT}
CUDA : ${HAVE_CUDA}
CUDA_CC : ${CUDA_CC}
CUDA_CC : ${pac_cv_cudacc}
BLAS : ${BLAS_LIBS}

@@ -375,6 +375,13 @@ int getGPUMaxRegistersPerBlock()
return(count);
}
int getGPUSharedMemPerBlock()
{ int count=0;
if (prop!=NULL)
count = prop->sharedMemPerBlock;
return(count);
}
void cpyGPUNameString(char *cstring)
{
*cstring='\0';

@@ -65,6 +65,7 @@ int getGPUWarpSize();
int getGPUMaxThreadsPerBlock();
int getGPUMaxThreadsPerMP();
int getGPUMaxRegistersPerBlock();
int getGPUSharedMemPerBlock();
void cpyGPUNameString(char *cstring);
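The new query can be reached from Fortran through a thin iso_c_binding interface, as in this sketch (the interface block below is illustrative, not necessarily the binding the plugin ships):
program demo_shmem
  use iso_c_binding
  implicit none
  interface
    function getGPUSharedMemPerBlock() &
         & result(res) bind(c,name='getGPUSharedMemPerBlock')
      use iso_c_binding
      integer(c_int) :: res
    end function getGPUSharedMemPerBlock
  end interface
  ! bytes of shared memory per block on the current device, 0 if no device is set up
  write(*,*) 'shared memory per block: ',getGPUSharedMemPerBlock(),' bytes'
end program demo_shmem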

@@ -148,6 +148,9 @@ int FallocEllDevice(void** deviceMat,unsigned int rows, unsigned int maxRowSize,
return(i);
}
//
// Single Precision Float
//
void sspmdmm_gpu(float *z,int s, int vPitch, float *y, float alpha, float* cM, int* rP, int* rS,
int avgRowSize, int maxRowSize, int rows, int pitch, float *x, float beta, int firstIndex)
{
@@ -168,7 +171,7 @@ void sspmdmm_gpu(float *z,int s, int vPitch, float *y, float alpha, float* cM, i
x += vPitch;
}
}
//new
int spmvEllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY)
{ int i=SPGPU_SUCCESS;
@@ -191,7 +194,31 @@ int spmvEllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
return(i);
}
int spmmEllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY)
{
struct EllDevice *devMat = (struct EllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuSellspmm(handle, y->count_, (float *)y->v_, y->pitch_,
(float*)y->v_, y->pitch_, alpha, (float*)devMat->cM,
devMat->rP, devMat->cMPitch, devMat->rPPitch,
devMat->rS, NULL, devMat->avgRowSize, devMat->maxRowSize,
devMat->rows, (float*)x->v_, x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
//
// Double Precision
//
void
dspmdmm_gpu (double *z,int s, int vPitch, double *y, double alpha, double* cM, int* rP,
int* rS, int avgRowSize, int maxRowSize, int rows, int pitch,
@@ -237,6 +264,31 @@ int spmvEllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
return SPGPU_SUCCESS;
}
int spmmEllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY)
{
struct EllDevice *devMat = (struct EllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuDellspmm(handle, y->count_, (double *)y->v_, y->pitch_,
(double*)y->v_, y->pitch_, alpha, (double*)devMat->cM,
devMat->rP, devMat->cMPitch, devMat->rPPitch,
devMat->rS, NULL, devMat->avgRowSize, devMat->maxRowSize,
devMat->rows, (double*)x->v_, x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
//
// Single Precision Float Complex
//
void
cspmdmm_gpu (cuFloatComplex *z, int s, int vPitch, cuFloatComplex *y,
cuFloatComplex alpha, cuFloatComplex* cM,
@@ -276,6 +328,31 @@ int spmvEllDeviceFloatComplex(void *deviceMat, float complex alpha, void* device
return SPGPU_SUCCESS;
}
int spmmEllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* deviceX,
cuFloatComplex beta, void* deviceY)
{
struct EllDevice *devMat = (struct EllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuCellspmm(handle, y->count_, (cuFloatComplex *)y->v_, y->pitch_,
(cuFloatComplex*)y->v_, y->pitch_, alpha, (cuFloatComplex*)devMat->cM,
devMat->rP, devMat->cMPitch, devMat->rPPitch,
devMat->rS, NULL, devMat->avgRowSize, devMat->maxRowSize,
devMat->rows, (cuFloatComplex*)x->v_, x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
//
// Double Precision Complex
//
void
zspmdmm_gpu (cuDoubleComplex *z, int s, int vPitch, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* cM,
int* rP, int* rS, int avgRowSize, int maxRowSize, int rows, int pitch,
@@ -314,6 +391,28 @@ int spmvEllDeviceDoubleComplex(void *deviceMat, double complex alpha, void* devi
return SPGPU_SUCCESS;
}
int spmmEllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* deviceX,
cuDoubleComplex beta, void* deviceY)
{
struct EllDevice *devMat = (struct EllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuZellspmm(handle, y->count_, (cuDoubleComplex *)y->v_, y->pitch_,
(cuDoubleComplex*)y->v_, y->pitch_, alpha, (cuDoubleComplex*)devMat->cM,
devMat->rP, devMat->cMPitch, devMat->rPPitch,
devMat->rS, NULL, devMat->avgRowSize, devMat->maxRowSize,
devMat->rows, (cuDoubleComplex*)x->v_, x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
int writeEllDeviceFloat(void* deviceMat, float* val, int* ja, int ldj, int* irn, int *idiag)
{ int i;
struct EllDevice *devMat = (struct EllDevice *) deviceMat;

@@ -113,16 +113,6 @@ int readEllDeviceDouble(void* deviceMat, double* val, int* ja, int ldj, int* irn
int readEllDeviceFloatComplex(void* deviceMat, float complex* val, int* ja, int ldj, int* irn, int *idiag);
int readEllDeviceDoubleComplex(void* deviceMat, double complex* val, int* ja, int ldj, int* irn, int *idiag);
int spmvEllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY);
int spmvEllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY);
int spmvEllDeviceFloatComplex(void *deviceMat, float complex alpha, void* deviceX,
float complex beta, void* deviceY);
int spmvEllDeviceDoubleComplex(void *deviceMat, double complex alpha, void* deviceX,
double complex beta, void* deviceY);
int psiCopyCooToElgFloat(int nr, int nc, int nza, int hacksz, int ldv, int nzm, int *irn,
int *idisp, int *ja, float *val, void *deviceMat);

@@ -318,4 +318,35 @@ module elldev_mod
end function spmvEllDeviceDoubleComplex
end interface
interface spmmEllDevice
function spmmEllDeviceFloat(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmEllDeviceFloat')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_float),value :: alpha, beta
end function spmmEllDeviceFloat
function spmmEllDeviceDouble(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmEllDeviceDouble')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_double),value :: alpha, beta
end function spmmEllDeviceDouble
function spmmEllDeviceFloatComplex(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmEllDeviceFloatComplex')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
complex(c_float_complex),value :: alpha, beta
end function spmmEllDeviceFloatComplex
function spmmEllDeviceDoubleComplex(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmEllDeviceDoubleComplex')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
complex(c_double_complex),value :: alpha, beta
end function spmmEllDeviceDoubleComplex
end interface
end module elldev_mod
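Host-side usage of the new bindings mirrors the existing spmv path, as in this sketch (assumptions: deviceMat already holds an assembled ELL device matrix, x and y are conforming host arrays, and demo_spmm_ell is a hypothetical name):
subroutine demo_spmm_ell(deviceMat,x,y,info)
  use iso_c_binding
  use elldev_mod
  use psb_vectordev_mod
  implicit none
  type(c_ptr) :: deviceMat, gpX, gpY
  real(c_double), intent(in) :: x(:,:)
  real(c_double), intent(inout) :: y(:,:)
  integer :: info, nxy
  nxy = min(size(x,2),size(y,2))
  ! Stage X/Y on the device, compute Y = 2*A*X + 1*Y, then copy Y back.
  info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_double)
  if (info == 0) info = writeMultiVecDevice(gpX,x,size(x,1))
  if (info == 0) info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_double)
  if (info == 0) info = writeMultiVecDevice(gpY,y,size(y,1))
  if (info == 0) info = spmmEllDevice(deviceMat,2.0_c_double,gpX,1.0_c_double,gpY)
  if (info == 0) info = readMultiVecDevice(gpY,y,size(y,1))
  call freeMultiVecDevice(gpX)
  call freeMultiVecDevice(gpY)
end subroutine demo_spmm_ell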

@@ -267,6 +267,27 @@ int spmvHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX,
return SPGPU_SUCCESS;
}
int spmmHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY)
{
struct HdiagDevice *devMat = (struct HdiagDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuDhdiaspmm (handle, y->count_, (double*)y->v_, y->pitch_,
(double *)y->v_, y->pitch_, alpha, (double *)devMat->cM,
devMat->hdiaOffsets, devMat->hackSize, devMat->hackOffsets,
devMat->rows, devMat->cols, (double *)x->v_, x->pitch_, beta);
return SPGPU_SUCCESS;
}
int writeHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets)
{ int i=0,fo,fa,j,k,p;
char buf_a[255], buf_o[255],tmp[255];
@@ -384,3 +405,23 @@ int spmvHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX,
return SPGPU_SUCCESS;
}
int spmmHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY)
{
struct HdiagDevice *devMat = (struct HdiagDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuShdiaspmm (handle, y->count_, (float*)y->v_, y->pitch_,
(float *)y->v_, y->pitch_, alpha, (float *)devMat->cM,
devMat->hdiaOffsets, devMat->hackSize, devMat->hackOffsets,
devMat->rows, devMat->cols, (float *)x->v_, x->pitch_, beta);
return SPGPU_SUCCESS;
}

@@ -95,12 +95,13 @@ int allocHdiagDevice(void ** remoteMatrix, HdiagDeviceParams* params);
void freeHdiagDevice(void* remoteMatrix);
int writeHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets);
int spmvHdiagDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY);
int writeHdiagDeviceDouble(void* deviceMat, double* val, int* hdiaOffsets, int *hackOffsets);
int spmvHdiagDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY);
//int writeHdiagDeviceFloatComplex(void* deviceMat, float complex* val, int* hdiaOffsets, int *hackOffsets);
//int writeHdiagDeviceDoubleComplex(void* deviceMat, double complex* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceFloat(void* deviceMat, float* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceDouble(void* deviceMat, double* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceFloatComplex(void* deviceMat, float complex* val, int* hdiaOffsets, int *hackOffsets);
//int readHdiagDeviceDoubleComplex(void* deviceMat, double complex* val, int* hdiaOffsets, int *hackOffsets);
#endif

@@ -196,4 +196,35 @@ module hdiagdev_mod
!!$ end function spmvHdiagDeviceDoubleComplex
end interface spmvHdiagDevice
interface spmmHdiagDevice
function spmmHdiagDeviceFloat(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHdiagDeviceFloat')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_float),value :: alpha, beta
end function spmmHdiagDeviceFloat
function spmmHdiagDeviceDouble(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHdiagDeviceDouble')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_double),value :: alpha, beta
end function spmmHdiagDeviceDouble
!!$ function spmmHdiagDeviceFloatComplex(deviceMat,alpha,x,beta,y) &
!!$ & result(res) bind(c,name='spmmHdiagDeviceFloatComplex')
!!$ use iso_c_binding
!!$ integer(c_int) :: res
!!$ type(c_ptr), value :: deviceMat, x, y
!!$ complex(c_float_complex),value :: alpha, beta
!!$ end function spmmHdiagDeviceFloatComplex
!!$ function spmmHdiagDeviceDoubleComplex(deviceMat,alpha,x,beta,y) &
!!$ & result(res) bind(c,name='spmmHdiagDeviceDoubleComplex')
!!$ use iso_c_binding
!!$ integer(c_int) :: res
!!$ type(c_ptr), value :: deviceMat, x, y
!!$ complex(c_double_complex),value :: alpha, beta
!!$ end function spmmHdiagDeviceDoubleComplex
end interface spmmHdiagDevice
end module hdiagdev_mod

@@ -156,7 +156,9 @@ int FallocHllDevice(void** deviceMat,unsigned int hksize, unsigned int rows, un
return(i);
}
//
// Single Precision Float
//
int spmvHllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY)
{
@@ -170,101 +172,170 @@ int spmvHllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
/*dspmdmm_gpu ((double *)z->v_, y->count_, y->pitch_, (double *)y->v_, alpha, (double *)devMat->cM,
devMat->rP, devMat->rS, devMat->rows, devMat->pitch, (double *)x->v_, beta,
devMat->baseIndex);*/
spgpuShellspmv (handle, (float *)y->v_, (float *)y->v_, alpha,
(float *)devMat->cM, devMat->rP,
devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
devMat->avgNzr, devMat->rows, (float *)x->v_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
int spmmHllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuShellspmm(handle, y->count_, (float *)y->v_, y->pitch_,
(float*)y->v_, y->pitch_, alpha, (float*)devMat->cM,
devMat->rP, devMat->hackSize, devMat->hackOffs,
devMat->rS, NULL, devMat->rows, (float*)x->v_,
x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
//new
//
// Double Precision
//
int spmvHllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuDhellspmv (handle, (double *)y->v_, (double *)y->v_, alpha,
(double *)devMat->cM, devMat->rP,
devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
devMat->avgNzr, devMat->rows, (double *)x->v_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
int spmmHllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuDhellspmm(handle, y->count_, (double *)y->v_, y->pitch_,
(double *)y->v_, y->pitch_, alpha, (double *)devMat->cM,
devMat->rP, devMat->hackSize, devMat->hackOffs,
devMat->rS, NULL, devMat->rows, (double *)x->v_,
x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
//
// Single Precision Float Complex
//
int spmvHllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* deviceX,
cuFloatComplex beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuChellspmv (handle, (cuFloatComplex *)y->v_, (cuFloatComplex *)y->v_, alpha,
(cuFloatComplex *)devMat->cM, devMat->rP,
devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
devMat->avgNzr, devMat->rows, (cuFloatComplex *)x->v_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
int spmmHllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* deviceX,
cuFloatComplex beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuChellspmm(handle, y->count_, (cuFloatComplex *)y->v_, y->pitch_,
(cuFloatComplex*)y->v_, y->pitch_, alpha, (cuFloatComplex*)devMat->cM,
devMat->rP, devMat->hackSize, devMat->hackOffs,
devMat->rS, NULL, devMat->rows, (cuFloatComplex*)x->v_,
x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
//
// Double Precision Complex
//
int spmvHllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* deviceX,
cuDoubleComplex beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuZhellspmv (handle, (cuDoubleComplex *)y->v_, (cuDoubleComplex *)y->v_, alpha,
(cuDoubleComplex *)devMat->cM, devMat->rP,
devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
devMat->avgNzr, devMat->rows, (cuDoubleComplex *)x->v_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
int spmmHllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* deviceX,
cuDoubleComplex beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuZhellspmm(handle, y->count_, (cuDoubleComplex *)y->v_, y->pitch_,
(cuDoubleComplex*)y->v_, y->pitch_, alpha, (cuDoubleComplex*)devMat->cM,
devMat->rP, devMat->hackSize, devMat->hackOffs,
devMat->rS, NULL, devMat->rows, (cuDoubleComplex*)x->v_,
x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}

@@ -265,4 +265,40 @@ module hlldev_mod
end interface
interface spmmHllDevice
function spmmHllDeviceFloat(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHllDeviceFloat')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_float),value :: alpha, beta
end function spmmHllDeviceFloat
function spmmHllDeviceDouble(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHllDeviceDouble')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_double),value :: alpha, beta
end function spmmHllDeviceDouble
function spmmHllDeviceFloatComplex(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHllDeviceFloatComplex')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
complex(c_float_complex),value :: alpha, beta
end function spmmHllDeviceFloatComplex
function spmmHllDeviceDoubleComplex(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHllDeviceDoubleComplex')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
complex(c_double_complex),value :: alpha, beta
end function spmmHllDeviceDoubleComplex
end interface
end module hlldev_mod

@@ -275,19 +275,20 @@ psb_d_cuda_cp_hdiag_from_coo.o \
psb_d_cuda_mv_hdiag_from_coo.o \
psb_d_cuda_hdiag_to_gpu.o \
psb_d_cuda_hdiag_csmv.o \
psb_d_cuda_hdiag_csmm.o \
psb_d_cuda_hdiag_mold.o \
psb_d_cuda_hdiag_vect_mv.o \
psb_s_cuda_cp_hdiag_from_coo.o \
psb_s_cuda_mv_hdiag_from_coo.o \
psb_s_cuda_hdiag_to_gpu.o \
psb_s_cuda_hdiag_csmv.o \
psb_s_cuda_hdiag_csmm.o \
psb_s_cuda_hdiag_mold.o \
psb_s_cuda_hdiag_vect_mv.o \
psb_s_cuda_dnsg_mat_impl.o \
psb_d_cuda_dnsg_mat_impl.o \
psb_c_cuda_dnsg_mat_impl.o \
psb_z_cuda_dnsg_mat_impl.o
objs: $(OBJS)
lib: objs

@@ -98,16 +98,16 @@ subroutine psb_c_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_float)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_float)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@@ -119,3 +119,94 @@ subroutine psb_c_cuda_elg_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_c_cuda_elg_vect_mv
subroutine psb_c_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use elldev_mod
use psb_vectordev_mod
use psb_c_cuda_elg_mat_mod, psb_protect_name => psb_c_cuda_elg_multivect_mv
use psb_c_cuda_multivect_mod
implicit none
class(psb_c_cuda_elg_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
complex(psb_spk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='c_cuda_elg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (a%is_dev()) call a%sync()
if (.not.x%is_host()) call x%sync()
if (beta /= czero) then
if (.not.y%is_host()) call y%sync()
end if
call a%psb_c_ell_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_c_multivect_cuda)
select type(yy => y)
type is (psb_c_multivect_cuda)
if (a%is_host()) call a%sync()
if (xx%is_host()) call xx%sync()
if (beta /= czero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmELLDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
if (a%is_dev()) call a%sync()
rx = xx%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
if (a%is_dev()) call a%sync()
rx = x%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_c_cuda_elg_multivect_mv

@@ -0,0 +1,123 @@
! Parallel Sparse BLAS GPU plugin
! (C) Copyright 2013
!
! Salvatore Filippone
! Alessandro Fanfarillo
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
subroutine psb_c_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_c_cuda_hdiag_mat_mod, psb_protect_name => psb_c_cuda_hdiag_csmm
implicit none
class(psb_c_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta, x(:,:)
complex(psb_spk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
character :: trans_
integer(psb_ipk_) :: i,j,k,m,n, nnz, ir, jc, nxy
complex(psb_spk_), allocatable :: acc(:)
type(c_ptr) :: gpX, gpY
logical :: tra
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='c_cuda_hdiag_csmm'
logical, parameter :: debug=.false.
info = psb_success_
call psb_erractionsave(err_act)
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
m = a%get_ncols()
n = a%get_nrows()
else
n = a%get_ncols()
m = a%get_nrows()
end if
if (size(x,1)<n) then
info = 36
call psb_errpush(info,name,i_err=(/3*ione,n,izero,izero,izero/))
goto 9999
end if
if (size(y,1)<m) then
info = 36
call psb_errpush(info,name,i_err=(/5*ione,m,izero,izero,izero/))
goto 9999
end if
if (tra) then
if (a%is_dev()) call a%sync()
call a%psb_c_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
else
!
! Just to test, move X/Y to/from the GPU.
!
nxy = min(size(x,2),size(y,2))
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_float)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_float)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmmHdiagDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)
endif
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_c_cuda_hdiag_csmm

@@ -115,3 +115,94 @@ subroutine psb_c_cuda_hdiag_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_c_cuda_hdiag_vect_mv
subroutine psb_c_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_c_cuda_hdiag_mat_mod, psb_protect_name => psb_c_cuda_hdiag_multivect_mv
use psb_c_cuda_multivect_mod
implicit none
class(psb_c_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
complex(psb_spk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='c_cuda_hdiag_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (a%is_dev()) call a%sync()
if (.not.x%is_host()) call x%sync()
if (beta /= czero) then
if (.not.y%is_host()) call y%sync()
end if
call a%psb_c_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_c_multivect_cuda)
select type(yy => y)
type is (psb_c_multivect_cuda)
if (a%is_host()) call a%sync()
if (xx%is_host()) call xx%sync()
if (beta /= czero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmHdiagDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHDIAGDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
if (a%is_dev()) call a%sync()
rx = xx%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
if (a%is_dev()) call a%sync()
rx = x%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_c_cuda_hdiag_multivect_mv

@@ -98,16 +98,16 @@ subroutine psb_c_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_float)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_float)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@@ -117,3 +117,92 @@ subroutine psb_c_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_c_cuda_hlg_vect_mv
subroutine psb_c_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hlldev_mod
use psb_vectordev_mod
use psb_c_cuda_hlg_mat_mod, psb_protect_name => psb_c_cuda_hlg_multivect_mv
use psb_c_cuda_multivect_mod
implicit none
class(psb_c_cuda_hlg_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
complex(psb_spk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='c_cuda_hlg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (.not.x%is_host()) call x%sync()
if (beta /= czero) then
if (.not.y%is_host()) call y%sync()
end if
if (a%is_dev()) call a%sync()
call a%psb_c_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_c_multivect_cuda)
select type(yy => y)
type is (psb_c_multivect_cuda)
if (xx%is_host()) call xx%sync()
if (beta /= czero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
rx = xx%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
rx = x%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_c_cuda_hlg_multivect_mv

@@ -105,7 +105,7 @@ subroutine psb_d_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999

@@ -176,7 +176,7 @@ subroutine psb_d_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
if (beta /= dzero) then
if (yy%is_host()) call yy%sync()
end if
info = spmvEllDevice(a%deviceMat,alpha,xx%deviceVect,&
info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&

@@ -0,0 +1,123 @@
! Parallel Sparse BLAS GPU plugin
! (C) Copyright 2013
!
! Salvatore Filippone
! Alessandro Fanfarillo
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
subroutine psb_d_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_d_cuda_hdiag_mat_mod, psb_protect_name => psb_d_cuda_hdiag_csmm
implicit none
class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_dpk_), intent(in) :: alpha, beta, x(:,:)
real(psb_dpk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
character :: trans_
integer(psb_ipk_) :: i,j,k,m,n, nnz, ir, jc, nxy
real(psb_dpk_), allocatable :: acc(:)
type(c_ptr) :: gpX, gpY
logical :: tra
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='d_cuda_hdiag_csmm'
logical, parameter :: debug=.false.
info = psb_success_
call psb_erractionsave(err_act)
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
m = a%get_ncols()
n = a%get_nrows()
else
n = a%get_ncols()
m = a%get_nrows()
end if
if (size(x,1)<n) then
info = 36
call psb_errpush(info,name,i_err=(/3*ione,n,izero,izero,izero/))
goto 9999
end if
if (size(y,1)<m) then
info = 36
call psb_errpush(info,name,i_err=(/5*ione,m,izero,izero,izero/))
goto 9999
end if
if (tra) then
if (a%is_dev()) call a%sync()
call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
else
!
! Just to test, move X/Y to/from the GPU.
!
nxy = min(size(x,2),size(y,2))
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_double)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_double)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmmHdiagDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)
endif
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_d_cuda_hdiag_csmm

@ -115,3 +115,94 @@ subroutine psb_d_cuda_hdiag_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_d_cuda_hdiag_vect_mv
subroutine psb_d_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_d_cuda_hdiag_mat_mod, psb_protect_name => psb_d_cuda_hdiag_multivect_mv
use psb_d_cuda_multivect_mod
implicit none
class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_dpk_), intent(in) :: alpha, beta
class(psb_d_base_multivect_type), intent(inout) :: x
class(psb_d_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
real(psb_dpk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='d_cuda_hdiag_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (a%is_dev()) call a%sync()
if (.not.x%is_host()) call x%sync()
if (beta /= dzero) then
if (.not.y%is_host()) call y%sync()
end if
call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_d_multivect_cuda)
select type(yy => y)
type is (psb_d_multivect_cuda)
if (a%is_host()) call a%sync()
if (xx%is_host()) call xx%sync()
if (beta /= dzero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmHdiagDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHDIAGDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
if (a%is_dev()) call a%sync()
rx = xx%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
if (a%is_dev()) call a%sync()
rx = x%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_d_cuda_hdiag_multivect_mv

@ -105,7 +105,7 @@ subroutine psb_d_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999

@ -172,7 +172,7 @@ subroutine psb_d_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
if (beta /= dzero) then
if (yy%is_host()) call yy%sync()
end if
info = spmvhllDevice(a%deviceMat,alpha,xx%deviceVect,&
info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&

@ -98,16 +98,16 @@ subroutine psb_s_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_float)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_float)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@ -119,3 +119,94 @@ subroutine psb_s_cuda_elg_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_s_cuda_elg_vect_mv
subroutine psb_s_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use elldev_mod
use psb_vectordev_mod
use psb_s_cuda_elg_mat_mod, psb_protect_name => psb_s_cuda_elg_multivect_mv
use psb_s_cuda_multivect_mod
implicit none
class(psb_s_cuda_elg_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
real(psb_spk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='s_cuda_elg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (a%is_dev()) call a%sync()
if (.not.x%is_host()) call x%sync()
if (beta /= szero) then
if (.not.y%is_host()) call y%sync()
end if
call a%psb_s_ell_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_s_multivect_cuda)
select type(yy => y)
type is (psb_s_multivect_cuda)
if (a%is_host()) call a%sync()
if (xx%is_host()) call xx%sync()
if (beta /= szero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmELLDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
if (a%is_dev()) call a%sync()
rx = xx%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
if (a%is_dev()) call a%sync()
rx = x%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_s_cuda_elg_multivect_mv

@ -0,0 +1,123 @@
! Parallel Sparse BLAS GPU plugin
! (C) Copyright 2013
!
! Salvatore Filippone
! Alessandro Fanfarillo
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
subroutine psb_s_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_s_cuda_hdiag_mat_mod, psb_protect_name => psb_s_cuda_hdiag_csmm
implicit none
class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta, x(:,:)
real(psb_spk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
character :: trans_
integer(psb_ipk_) :: i,j,k,m,n, nnz, ir, jc, nxy
real(psb_spk_), allocatable :: acc(:)
type(c_ptr) :: gpX, gpY
logical :: tra
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='s_cuda_hdiag_csmm'
logical, parameter :: debug=.false.
info = psb_success_
call psb_erractionsave(err_act)
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
m = a%get_ncols()
n = a%get_nrows()
else
n = a%get_ncols()
m = a%get_nrows()
end if
if (size(x,1)<n) then
info = 36
call psb_errpush(info,name,i_err=(/3*ione,n,izero,izero,izero/))
goto 9999
end if
if (size(y,1)<m) then
info = 36
call psb_errpush(info,name,i_err=(/5*ione,m,izero,izero,izero/))
goto 9999
end if
if (tra) then
if (a%is_dev()) call a%sync()
call a%psb_s_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
else
!
! Just to test, move X/Y to/from the GPU.
!
nxy = min(size(x,2),size(y,2))
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_float)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_float)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmmHdiagDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)
endif
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_s_cuda_hdiag_csmm

@ -115,3 +115,94 @@ subroutine psb_s_cuda_hdiag_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_s_cuda_hdiag_vect_mv
subroutine psb_s_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_s_cuda_hdiag_mat_mod, psb_protect_name => psb_s_cuda_hdiag_multivect_mv
use psb_s_cuda_multivect_mod
implicit none
class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
real(psb_spk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='s_cuda_hdiag_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (a%is_dev()) call a%sync()
if (.not.x%is_host()) call x%sync()
if (beta /= szero) then
if (.not.y%is_host()) call y%sync()
end if
call a%psb_s_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_s_multivect_cuda)
select type(yy => y)
type is (psb_s_multivect_cuda)
if (a%is_host()) call a%sync()
if (xx%is_host()) call xx%sync()
if (beta /= szero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmHdiagDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHDIAGDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
if (a%is_dev()) call a%sync()
rx = xx%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
if (a%is_dev()) call a%sync()
rx = x%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_s_cuda_hdiag_multivect_mv

@ -98,16 +98,16 @@ subroutine psb_s_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_float)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_float)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@ -117,3 +117,92 @@ subroutine psb_s_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_s_cuda_hlg_vect_mv
subroutine psb_s_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hlldev_mod
use psb_vectordev_mod
use psb_s_cuda_hlg_mat_mod, psb_protect_name => psb_s_cuda_hlg_multivect_mv
use psb_s_cuda_multivect_mod
implicit none
class(psb_s_cuda_hlg_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
real(psb_spk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='s_cuda_hlg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (.not.x%is_host()) call x%sync()
if (beta /= szero) then
if (.not.y%is_host()) call y%sync()
end if
if (a%is_dev()) call a%sync()
call a%psb_s_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_s_multivect_cuda)
select type(yy => y)
type is (psb_s_multivect_cuda)
if (xx%is_host()) call xx%sync()
if (beta /= szero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
rx = xx%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
rx = x%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_s_cuda_hlg_multivect_mv

@ -98,16 +98,16 @@ subroutine psb_z_cuda_elg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_double)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_double)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmEllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@ -119,3 +119,94 @@ subroutine psb_z_cuda_elg_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_z_cuda_elg_vect_mv
subroutine psb_z_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use elldev_mod
use psb_vectordev_mod
use psb_z_cuda_elg_mat_mod, psb_protect_name => psb_z_cuda_elg_multivect_mv
use psb_z_cuda_multivect_mod
implicit none
class(psb_z_cuda_elg_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
complex(psb_dpk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='z_cuda_elg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (a%is_dev()) call a%sync()
if (.not.x%is_host()) call x%sync()
if (beta /= zzero) then
if (.not.y%is_host()) call y%sync()
end if
call a%psb_z_ell_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_z_multivect_cuda)
select type(yy => y)
type is (psb_z_multivect_cuda)
if (a%is_host()) call a%sync()
if (xx%is_host()) call xx%sync()
if (beta /= zzero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmEllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmELLDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
if (a%is_dev()) call a%sync()
rx = xx%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
if (a%is_dev()) call a%sync()
rx = x%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_z_cuda_elg_multivect_mv

@ -0,0 +1,123 @@
! Parallel Sparse BLAS GPU plugin
! (C) Copyright 2013
!
! Salvatore Filippone
! Alessandro Fanfarillo
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
subroutine psb_z_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_z_cuda_hdiag_mat_mod, psb_protect_name => psb_z_cuda_hdiag_csmm
implicit none
class(psb_z_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta, x(:,:)
complex(psb_dpk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
character :: trans_
integer(psb_ipk_) :: i,j,k,m,n, nnz, ir, jc, nxy
complex(psb_dpk_), allocatable :: acc(:)
type(c_ptr) :: gpX, gpY
logical :: tra
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='z_cuda_hdiag_csmm'
logical, parameter :: debug=.false.
info = psb_success_
call psb_erractionsave(err_act)
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
m = a%get_ncols()
n = a%get_nrows()
else
n = a%get_ncols()
m = a%get_nrows()
end if
if (size(x,1)<n) then
info = 36
call psb_errpush(info,name,i_err=(/3*ione,n,izero,izero,izero/))
goto 9999
end if
if (size(y,1)<m) then
info = 36
call psb_errpush(info,name,i_err=(/5*ione,m,izero,izero,izero/))
goto 9999
end if
if (tra) then
if (a%is_dev()) call a%sync()
call a%psb_z_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
else
!
! Just to test, move X/Y to/from the GPU.
!
nxy = min(size(x,2),size(y,2))
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_double)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_double)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmmHdiagDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)
endif
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_z_cuda_hdiag_csmm

@ -115,3 +115,94 @@ subroutine psb_z_cuda_hdiag_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_z_cuda_hdiag_vect_mv
subroutine psb_z_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hdiagdev_mod
use psb_vectordev_mod
use psb_z_cuda_hdiag_mat_mod, psb_protect_name => psb_z_cuda_hdiag_multivect_mv
use psb_z_cuda_multivect_mod
implicit none
class(psb_z_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
complex(psb_dpk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='z_cuda_hdiag_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (a%is_dev()) call a%sync()
if (.not.x%is_host()) call x%sync()
if (beta /= zzero) then
if (.not.y%is_host()) call y%sync()
end if
call a%psb_z_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_z_multivect_cuda)
select type(yy => y)
type is (psb_z_multivect_cuda)
if (a%is_host()) call a%sync()
if (xx%is_host()) call xx%sync()
if (beta /= zzero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmHdiagDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHDIAGDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
if (a%is_dev()) call a%sync()
rx = xx%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
if (a%is_dev()) call a%sync()
rx = x%get_vect()
ry = y%get_vect()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_z_cuda_hdiag_multivect_mv

@ -98,16 +98,16 @@ subroutine psb_z_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_double)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_double)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@ -117,3 +117,92 @@ subroutine psb_z_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_z_cuda_hlg_vect_mv
subroutine psb_z_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hlldev_mod
use psb_vectordev_mod
use psb_z_cuda_hlg_mat_mod, psb_protect_name => psb_z_cuda_hlg_multivect_mv
use psb_z_cuda_multivect_mod
implicit none
class(psb_z_cuda_hlg_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
complex(psb_dpk_), allocatable :: rx(:,:), ry(:,:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='z_cuda_hlg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (.not.x%is_host()) call x%sync()
if (beta /= zzero) then
if (.not.y%is_host()) call y%sync()
end if
if (a%is_dev()) call a%sync()
call a%psb_z_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_z_multivect_cuda)
select type(yy => y)
type is (psb_z_multivect_cuda)
if (xx%is_host()) call xx%sync()
if (beta /= zzero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
rx = xx%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
rx = x%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_z_cuda_hlg_multivect_mv

@ -56,6 +56,7 @@ module psb_c_cuda_elg_mat_mod
procedure, nopass :: get_fmt => c_cuda_elg_get_fmt
procedure, pass(a) :: sizeof => c_cuda_elg_sizeof
procedure, pass(a) :: vect_mv => psb_c_cuda_elg_vect_mv
procedure, pass(a) :: multivect_mv => psb_c_cuda_elg_multivect_mv
procedure, pass(a) :: csmm => psb_c_cuda_elg_csmm
procedure, pass(a) :: csmv => psb_c_cuda_elg_csmv
procedure, pass(a) :: in_vect_sv => psb_c_cuda_elg_inner_vect_sv
@ -101,6 +102,15 @@ module psb_c_cuda_elg_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_elg_vect_mv
subroutine psb_c_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_c_cuda_elg_sparse_mat, psb_spk_, psb_c_base_multivect_type, psb_ipk_
class(psb_c_cuda_elg_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_elg_multivect_mv
end interface
interface

@ -44,7 +44,8 @@ module psb_c_cuda_hdiag_mat_mod
procedure, nopass :: get_fmt => c_cuda_hdiag_get_fmt
! procedure, pass(a) :: sizeof => c_cuda_hdiag_sizeof
procedure, pass(a) :: vect_mv => psb_c_cuda_hdiag_vect_mv
! procedure, pass(a) :: csmm => psb_c_cuda_hdiag_csmm
procedure, pass(a) :: multivect_mv => psb_c_cuda_hdiag_multivect_mv
procedure, pass(a) :: csmm => psb_c_cuda_hdiag_csmm
procedure, pass(a) :: csmv => psb_c_cuda_hdiag_csmv
! procedure, pass(a) :: in_vect_sv => psb_c_cuda_hdiag_inner_vect_sv
! procedure, pass(a) :: scals => psb_c_cuda_hdiag_scals
@ -77,6 +78,15 @@ module psb_c_cuda_hdiag_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_hdiag_vect_mv
subroutine psb_c_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_c_cuda_hdiag_sparse_mat, psb_spk_, psb_c_base_multivect_type, psb_ipk_
class(psb_c_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_hdiag_multivect_mv
end interface
!!$ interface
@ -172,17 +182,17 @@ module psb_c_cuda_hdiag_mat_mod
end subroutine psb_c_cuda_hdiag_csmv
end interface
!!$ interface
!!$ subroutine psb_c_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
!!$ import :: psb_c_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_
!!$ class(psb_c_cuda_hdiag_sparse_mat), intent(in) :: a
!!$ complex(psb_spk_), intent(in) :: alpha, beta, x(:,:)
!!$ complex(psb_spk_), intent(inout) :: y(:,:)
!!$ integer(psb_ipk_), intent(out) :: info
!!$ character, optional, intent(in) :: trans
!!$ end subroutine psb_c_cuda_hdiag_csmm
!!$ end interface
!!$
interface
subroutine psb_c_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
import :: psb_c_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_
class(psb_c_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta, x(:,:)
complex(psb_spk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_hdiag_csmm
end interface
!!$ interface
!!$ subroutine psb_c_cuda_hdiag_scal(d,a,info, side)
!!$ import :: psb_c_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_

@ -55,6 +55,7 @@ module psb_c_cuda_hlg_mat_mod
contains
procedure, nopass :: get_fmt => c_cuda_hlg_get_fmt
procedure, pass(a) :: sizeof => c_cuda_hlg_sizeof
procedure, pass(a) :: multivect_mv => psb_c_cuda_hlg_multivect_mv
procedure, pass(a) :: vect_mv => psb_c_cuda_hlg_vect_mv
procedure, pass(a) :: csmm => psb_c_cuda_hlg_csmm
procedure, pass(a) :: csmv => psb_c_cuda_hlg_csmv
@ -97,6 +98,15 @@ module psb_c_cuda_hlg_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_hlg_vect_mv
subroutine psb_c_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_c_cuda_hlg_sparse_mat, psb_spk_, psb_c_base_multivect_type, psb_ipk_
class(psb_c_cuda_hlg_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_hlg_multivect_mv
end interface
interface

@ -165,6 +165,11 @@ module psb_cuda_env_mod
use iso_c_binding
integer(c_int) :: res
end function psb_C_get_MaxRegistersPerBlock
function psb_C_get_SharedMemPerBlock() &
& result(res) bind(c,name='getGPUSharedMemsPerBlock')
use iso_c_binding
integer(c_int) :: res
end function psb_C_get_SharedMemPerBlock
end interface
interface
subroutine psb_C_cpy_NameString(cstring) &

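The new psb_C_get_SharedMemPerBlock interface binds to a C helper named getGPUSharedMemsPerBlock whose definition is not part of this hunk; a minimal sketch of such a helper, assuming only the standard CUDA runtime API, could read:

#include <cuda_runtime.h>

/* Hypothetical sketch of the C-side helper bound by psb_C_get_SharedMemPerBlock;
   the real definition lives in the plugin's environment code. Returns 0 on any
   runtime-API failure. */
int getGPUSharedMemsPerBlock(void)
{
  int dev = 0, shmem = 0;
  if (cudaGetDevice(&dev) != cudaSuccess) return 0;
  /* bytes of shared memory available to a single block on this device */
  if (cudaDeviceGetAttribute(&shmem, cudaDevAttrMaxSharedMemoryPerBlock,
                             dev) != cudaSuccess)
    return 0;
  return shmem;
}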
@ -44,7 +44,8 @@ module psb_d_cuda_hdiag_mat_mod
procedure, nopass :: get_fmt => d_cuda_hdiag_get_fmt
! procedure, pass(a) :: sizeof => d_cuda_hdiag_sizeof
procedure, pass(a) :: vect_mv => psb_d_cuda_hdiag_vect_mv
! procedure, pass(a) :: csmm => psb_d_cuda_hdiag_csmm
procedure, pass(a) :: multivect_mv => psb_d_cuda_hdiag_multivect_mv
procedure, pass(a) :: csmm => psb_d_cuda_hdiag_csmm
procedure, pass(a) :: csmv => psb_d_cuda_hdiag_csmv
! procedure, pass(a) :: in_vect_sv => psb_d_cuda_hdiag_inner_vect_sv
! procedure, pass(a) :: scals => psb_d_cuda_hdiag_scals
@ -77,6 +78,15 @@ module psb_d_cuda_hdiag_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_d_cuda_hdiag_vect_mv
subroutine psb_d_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_d_cuda_hdiag_sparse_mat, psb_dpk_, psb_d_base_multivect_type, psb_ipk_
class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_dpk_), intent(in) :: alpha, beta
class(psb_d_base_multivect_type), intent(inout) :: x
class(psb_d_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_d_cuda_hdiag_multivect_mv
end interface
!!$ interface
@ -172,17 +182,17 @@ module psb_d_cuda_hdiag_mat_mod
end subroutine psb_d_cuda_hdiag_csmv
end interface
!!$ interface
!!$ subroutine psb_d_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
!!$ import :: psb_d_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_
!!$ class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a
!!$ real(psb_dpk_), intent(in) :: alpha, beta, x(:,:)
!!$ real(psb_dpk_), intent(inout) :: y(:,:)
!!$ integer(psb_ipk_), intent(out) :: info
!!$ character, optional, intent(in) :: trans
!!$ end subroutine psb_d_cuda_hdiag_csmm
!!$ end interface
!!$
interface
subroutine psb_d_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
import :: psb_d_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_
class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_dpk_), intent(in) :: alpha, beta, x(:,:)
real(psb_dpk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_d_cuda_hdiag_csmm
end interface
!!$ interface
!!$ subroutine psb_d_cuda_hdiag_scal(d,a,info, side)
!!$ import :: psb_d_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_

@ -56,6 +56,7 @@ module psb_s_cuda_elg_mat_mod
procedure, nopass :: get_fmt => s_cuda_elg_get_fmt
procedure, pass(a) :: sizeof => s_cuda_elg_sizeof
procedure, pass(a) :: vect_mv => psb_s_cuda_elg_vect_mv
procedure, pass(a) :: multivect_mv => psb_s_cuda_elg_multivect_mv
procedure, pass(a) :: csmm => psb_s_cuda_elg_csmm
procedure, pass(a) :: csmv => psb_s_cuda_elg_csmv
procedure, pass(a) :: in_vect_sv => psb_s_cuda_elg_inner_vect_sv
@ -101,6 +102,15 @@ module psb_s_cuda_elg_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_elg_vect_mv
subroutine psb_s_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_s_cuda_elg_sparse_mat, psb_spk_, psb_s_base_multivect_type, psb_ipk_
class(psb_s_cuda_elg_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_elg_multivect_mv
end interface
interface

@ -44,7 +44,8 @@ module psb_s_cuda_hdiag_mat_mod
procedure, nopass :: get_fmt => s_cuda_hdiag_get_fmt
! procedure, pass(a) :: sizeof => s_cuda_hdiag_sizeof
procedure, pass(a) :: vect_mv => psb_s_cuda_hdiag_vect_mv
! procedure, pass(a) :: csmm => psb_s_cuda_hdiag_csmm
procedure, pass(a) :: multivect_mv => psb_s_cuda_hdiag_multivect_mv
procedure, pass(a) :: csmm => psb_s_cuda_hdiag_csmm
procedure, pass(a) :: csmv => psb_s_cuda_hdiag_csmv
! procedure, pass(a) :: in_vect_sv => psb_s_cuda_hdiag_inner_vect_sv
! procedure, pass(a) :: scals => psb_s_cuda_hdiag_scals
@ -77,6 +78,15 @@ module psb_s_cuda_hdiag_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_hdiag_vect_mv
subroutine psb_s_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_s_cuda_hdiag_sparse_mat, psb_spk_, psb_s_base_multivect_type, psb_ipk_
class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_hdiag_multivect_mv
end interface
!!$ interface
@ -172,17 +182,17 @@ module psb_s_cuda_hdiag_mat_mod
end subroutine psb_s_cuda_hdiag_csmv
end interface
!!$ interface
!!$ subroutine psb_s_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
!!$ import :: psb_s_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_
!!$ class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a
!!$ real(psb_spk_), intent(in) :: alpha, beta, x(:,:)
!!$ real(psb_spk_), intent(inout) :: y(:,:)
!!$ integer(psb_ipk_), intent(out) :: info
!!$ character, optional, intent(in) :: trans
!!$ end subroutine psb_s_cuda_hdiag_csmm
!!$ end interface
!!$
interface
subroutine psb_s_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
import :: psb_s_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_
class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta, x(:,:)
real(psb_spk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_hdiag_csmm
end interface
!!$ interface
!!$ subroutine psb_s_cuda_hdiag_scal(d,a,info, side)
!!$ import :: psb_s_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_

@ -56,6 +56,7 @@ module psb_s_cuda_hlg_mat_mod
procedure, nopass :: get_fmt => s_cuda_hlg_get_fmt
procedure, pass(a) :: sizeof => s_cuda_hlg_sizeof
procedure, pass(a) :: vect_mv => psb_s_cuda_hlg_vect_mv
procedure, pass(a) :: multivect_mv => psb_s_cuda_hlg_multivect_mv
procedure, pass(a) :: csmm => psb_s_cuda_hlg_csmm
procedure, pass(a) :: csmv => psb_s_cuda_hlg_csmv
procedure, pass(a) :: in_vect_sv => psb_s_cuda_hlg_inner_vect_sv
@ -97,6 +98,15 @@ module psb_s_cuda_hlg_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_hlg_vect_mv
subroutine psb_s_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_s_cuda_hlg_sparse_mat, psb_spk_, psb_s_base_multivect_type, psb_ipk_
class(psb_s_cuda_hlg_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_hlg_multivect_mv
end interface
interface

@ -56,6 +56,7 @@ module psb_z_cuda_elg_mat_mod
procedure, nopass :: get_fmt => z_cuda_elg_get_fmt
procedure, pass(a) :: sizeof => z_cuda_elg_sizeof
procedure, pass(a) :: vect_mv => psb_z_cuda_elg_vect_mv
procedure, pass(a) :: multivect_mv => psb_z_cuda_elg_multivect_mv
procedure, pass(a) :: csmm => psb_z_cuda_elg_csmm
procedure, pass(a) :: csmv => psb_z_cuda_elg_csmv
procedure, pass(a) :: in_vect_sv => psb_z_cuda_elg_inner_vect_sv
@ -101,6 +102,15 @@ module psb_z_cuda_elg_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_elg_vect_mv
subroutine psb_z_cuda_elg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_z_cuda_elg_sparse_mat, psb_dpk_, psb_z_base_multivect_type, psb_ipk_
class(psb_z_cuda_elg_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_elg_multivect_mv
end interface
interface

@ -44,7 +44,8 @@ module psb_z_cuda_hdiag_mat_mod
procedure, nopass :: get_fmt => z_cuda_hdiag_get_fmt
! procedure, pass(a) :: sizeof => z_cuda_hdiag_sizeof
procedure, pass(a) :: vect_mv => psb_z_cuda_hdiag_vect_mv
! procedure, pass(a) :: csmm => psb_z_cuda_hdiag_csmm
procedure, pass(a) :: multivect_mv => psb_z_cuda_hdiag_multivect_mv
procedure, pass(a) :: csmm => psb_z_cuda_hdiag_csmm
procedure, pass(a) :: csmv => psb_z_cuda_hdiag_csmv
! procedure, pass(a) :: in_vect_sv => psb_z_cuda_hdiag_inner_vect_sv
! procedure, pass(a) :: scals => psb_z_cuda_hdiag_scals
@ -77,6 +78,15 @@ module psb_z_cuda_hdiag_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_hdiag_vect_mv
subroutine psb_z_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_z_cuda_hdiag_sparse_mat, psb_dpk_, psb_z_base_multivect_type, psb_ipk_
class(psb_z_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_hdiag_multivect_mv
end interface
!!$ interface
@ -172,17 +182,17 @@ module psb_z_cuda_hdiag_mat_mod
end subroutine psb_z_cuda_hdiag_csmv
end interface
!!$ interface
!!$ subroutine psb_z_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
!!$ import :: psb_z_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_
!!$ class(psb_z_cuda_hdiag_sparse_mat), intent(in) :: a
!!$ complex(psb_dpk_), intent(in) :: alpha, beta, x(:,:)
!!$ complex(psb_dpk_), intent(inout) :: y(:,:)
!!$ integer(psb_ipk_), intent(out) :: info
!!$ character, optional, intent(in) :: trans
!!$ end subroutine psb_z_cuda_hdiag_csmm
!!$ end interface
!!$
interface
subroutine psb_z_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)
import :: psb_z_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_
class(psb_z_cuda_hdiag_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta, x(:,:)
complex(psb_dpk_), intent(inout) :: y(:,:)
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_hdiag_csmm
end interface
!!$ interface
!!$ subroutine psb_z_cuda_hdiag_scal(d,a,info, side)
!!$ import :: psb_z_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_

@ -56,6 +56,7 @@ module psb_z_cuda_hlg_mat_mod
procedure, nopass :: get_fmt => z_cuda_hlg_get_fmt
procedure, pass(a) :: sizeof => z_cuda_hlg_sizeof
procedure, pass(a) :: vect_mv => psb_z_cuda_hlg_vect_mv
procedure, pass(a) :: multivect_mv => psb_z_cuda_hlg_multivect_mv
procedure, pass(a) :: csmm => psb_z_cuda_hlg_csmm
procedure, pass(a) :: csmv => psb_z_cuda_hlg_csmv
procedure, pass(a) :: in_vect_sv => psb_z_cuda_hlg_inner_vect_sv
@ -97,6 +98,15 @@ module psb_z_cuda_hlg_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_hlg_vect_mv
subroutine psb_z_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_z_cuda_hlg_sparse_mat, psb_dpk_, psb_z_base_multivect_type, psb_ipk_
class(psb_z_cuda_hlg_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_hlg_multivect_mv
end interface
interface

@ -70,6 +70,51 @@ void spgpuSellspmv (spgpuHandle_t handle,
float beta,
int baseIndex);
/**
* \fn void spgpuSellspmm (spgpuHandle_t handle,int count,__device float *z,int zpitch,const __device float *y,int ypitch,float alpha, const __device float* cM, const __device int* rP,int cMPitch,int rPPitch,const __device int* rS,const __device int* rIdx, int avgNnzPerRow,int maxNnzPerRow,int rows, const __device float *x,int xpitch,float beta,int baseIndex)
* Computes single precision z = alpha*A*x + beta*y on count right-hand-side columns, with A stored in ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The number of columns (right-hand sides) in x, y and z
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The ELL non zero values allocation pointer
* \param rP The ELL column indices allocation pointer
* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values
* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
* \param maxNnzPerRow Maximum number of non zeroes per row.
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuSellspmm(spgpuHandle_t handle,
int count,
__device float *z,
int zpitch,
const __device float *y,
int ypitch,
float alpha,
const __device float* cM,
const __device int* rP,
int cMPitch,
int rPPitch,
const __device int* rS,
const __device int* rIdx,
int avgNnzPerRow,
int maxNnzPerRow,
int rows,
const __device float *x,
int xpitch,
float beta,
int baseIndex);
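/*
 * A hedged usage sketch of spgpuSellspmm (illustrative only, not part of the
 * spgpu API): the d_* arguments, pitches and format metadata are placeholders
 * for device allocations the caller already owns. It computes z = A*x on
 * count right-hand-side columns with a Fortran-style base index.
 */
static inline void exampleSellspmm(spgpuHandle_t handle, int count,
                                   float *d_z, float *d_y, const float *d_x,
                                   int pitch, const float *d_cM,
                                   const int *d_rP, int cMPitch, int rPPitch,
                                   const int *d_rS, int maxNnzPerRow, int rows)
{
  spgpuSellspmm(handle, count, d_z, pitch, d_y, pitch, 1.0f,
                d_cM, d_rP, cMPitch, rPPitch, d_rS,
                NULL /* no reorder array */, 0 /* avg nnz/row unknown */,
                maxNnzPerRow, rows, d_x, pitch, 0.0f /* beta */, 1);
}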
/**
* \fn void spgpuDellspmv (spgpuHandle_t handle,__device double *z,const __device double *y, double alpha, const __device double* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device double *x, double beta,int baseIndex)
* Computes double precision z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
@ -107,6 +152,50 @@ void spgpuDellspmv (spgpuHandle_t handle,
double beta,
int baseIndex);
/**
* \fn void spgpuDellspmm (spgpuHandle_t handle,int count,__device double *z,int zpitch,const __device double *y,int ypitch,double alpha, const __device double* cM, const __device int* rP,int cMPitch,int rPPitch,const __device int* rS,const __device int* rIdx, int avgNnzPerRow,int maxNnzPerRow,int rows, const __device double *x,int xpitch,double beta,int baseIndex)
* Computes double precision z = alpha*A*x + beta*y on count right-hand-side columns, with A stored in ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The number of columns (right-hand sides) in x, y and z
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The ELL non zero values allocation pointer
* \param rP The ELL column indices allocation pointer
* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values
* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
* \param maxNnzPerRow Maximum number of non zeroes per row.
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuDellspmm(spgpuHandle_t handle,
int count,
__device double *z,
int zpitch,
const __device double *y,
int ypitch,
double alpha,
const __device double* cM,
const __device int* rP,
int cMPitch,
int rPPitch,
const __device int* rS,
const __device int* rIdx,
int avgNnzPerRow,
int maxNnzPerRow,
int rows,
const __device double *x,
int xpitch,
double beta,
int baseIndex);
/**
* \fn void spgpuCellspmv (spgpuHandle_t handle,__device cuFloatComplex *z,const __device cuFloatComplex *y, cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device cuFloatComplex *x, cuFloatComplex beta, int baseIndex)
@ -145,6 +234,51 @@ void spgpuCellspmv (spgpuHandle_t handle,
cuFloatComplex beta,
int baseIndex);
/**
* \fn void spgpuCellspmm (spgpuHandle_t handle,int count,__device cuFloatComplex *z,int zpitch,const __device cuFloatComplex *y,int ypitch,cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int cMPitch,int rPPitch,const __device int* rS,const __device int* rIdx, int avgNnzPerRow,int maxNnzPerRow,int rows, const __device cuFloatComplex *x,int xpitch,cuFloatComplex beta,int baseIndex)
* Computes single precision complex z = alpha*A*x + beta*y on count right-hand-side columns, with A stored in ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The number of columns (right-hand sides) in x, y and z
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The ELL non zero values allocation pointer
* \param rP The ELL column indices allocation pointer
* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values
* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
* \param maxNnzPerRow Maximum number of non zeroes per row.
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuCellspmm(spgpuHandle_t handle,
int count,
__device cuFloatComplex *z,
int zpitch,
const __device cuFloatComplex *y,
int ypitch,
cuFloatComplex alpha,
const __device cuFloatComplex* cM,
const __device int* rP,
int cMPitch,
int rPPitch,
const __device int* rS,
const __device int* rIdx,
int avgNnzPerRow,
int maxNnzPerRow,
int rows,
const __device cuFloatComplex *x,
int xpitch,
cuFloatComplex beta,
int baseIndex);
/**
* \fn void spgpuZellspmv (spgpuHandle_t handle,__device cuDoubleComplex *z,const __device cuDoubleComplex *y, cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device cuDoubleComplex *x, cuDoubleComplex beta, int baseIndex)
* Computes double precision complex z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
@ -182,6 +316,50 @@ void spgpuZellspmv (spgpuHandle_t handle,
cuDoubleComplex beta,
int baseIndex);
/**
* \fn void spgpuZellspmm (spgpuHandle_t handle,int count,__device cuDoubleComplex *z,int zpitch,const __device cuDoubleComplex *y,int ypitch,cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP,int cMPitch,int rPPitch,const __device int* rS,const __device int* rIdx, int avgNnzPerRow,int maxNnzPerRow,int rows, const __device cuDoubleComplex *x,int xpitch,cuDoubleComplex beta,int baseIndex)
* Computes double precision complex z = alpha*A*x + beta*y on count right-hand-side columns, with A stored in ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The number of columns (right-hand sides) in x, y and z
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The ELL non zero values allocation pointer
* \param rP The ELL column indices allocation pointer
* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values
* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
* \param maxNnzPerRow Maximum number of non zeroes per row.
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuZellspmm(spgpuHandle_t handle,
int count,
__device cuDoubleComplex *z,
int zpitch,
const __device cuDoubleComplex *y,
int ypitch,
cuDoubleComplex alpha,
const __device cuDoubleComplex* cM,
const __device int* rP,
int cMPitch,
int rPPitch,
const __device int* rS,
const __device int* rIdx,
int avgNnzPerRow,
int maxNnzPerRow,
int rows,
const __device cuDoubleComplex *x,
int xpitch,
cuDoubleComplex beta,
int baseIndex);
/**
* \fn void spgpuSellcsput (spgpuHandle_t handle, float alpha, __device float *cM, __device const int* rP, int cMPitch, int rPPitch, __device const int* rS, int nnz, __device int *aI, __device int *aJ, __device float *aVal, int baseIndex)

@ -27,6 +27,10 @@
extern "C" {
#endif
// DIA/HDIA Compressed Matrix Format routines
/// This is the pitch alignment that must be fulfilled by the coefficients and the row pointers allocations.
#define DIA_PITCH_ALIGN_BYTE 128
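/*
 * Illustrative helper, not part of the spgpu API: round a pitch given in
 * elements up so that the corresponding byte pitch is a multiple of
 * DIA_PITCH_ALIGN_BYTE. Assumes elemSize is a power of two that divides
 * the alignment (4, 8 or 16 bytes for the types used here).
 */
static inline int diaAlignedPitch(int nElems, int elemSize)
{
  int bytes = nElems * elemSize;
  int aligned = ((bytes + DIA_PITCH_ALIGN_BYTE - 1) / DIA_PITCH_ALIGN_BYTE)
                * DIA_PITCH_ALIGN_BYTE;
  return aligned / elemSize; /* pitch expressed in elements again */
}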
/**
* \fn spgpuShdiaspmv (spgpuHandle_t handle, float* z, const float *y, float alpha, const float* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const float *x, float beta)
@ -59,6 +63,44 @@ spgpuShdiaspmv (spgpuHandle_t handle,
float beta);
/**
* \fn spgpuShdiaspmm (spgpuHandle_t handle, int count, float *z, int zpitch, const float *y, int ypitch, float alpha, const __device float* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device float *x, int xpitch, float beta)
* Computes single precision z = alpha*A*x + beta*y on count right-hand-side columns, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The number of columns (right-hand sides) in x, y and z
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offsets for every hack of the HDIA offsets vector, plus a final value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuShdiaspmm (spgpuHandle_t handle,
int count,
__device float *z,
int zpitch,
const __device float *y,
int ypitch,
float alpha,
const __device float* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device float *x,
int xpitch,
float beta);
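/* Worked example of the hackOffsets layout (illustrative figures): with
 * hackSize = 32 and three hacks holding 3, 5 and 2 diagonals each, the
 * offsets vector has 10 entries and hackOffsets = {0, 3, 8, 10}; entry h
 * gives where hack h starts in `offsets`, and the final entry equals the
 * size of `offsets`. A hedged host-side call sketch follows; all names
 * are hypothetical. */
#if 0
int pitch = rows;          /* leading dimension of the dense operands */
spgpuShdiaspmm(handle, nrhs,
               dz, pitch, dy, pitch, 1.0f,
               d_dM, d_offsets, hackSize, d_hackOffsets,
               rows, cols,
               dx, pitch, 0.0f);
#endif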
/**
* \fn spgpuDhdiaspmv (spgpuHandle_t handle, double* z, const double *y, double alpha, const double* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const double *x, double beta)
* Computes double precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
@ -89,6 +131,43 @@ spgpuDhdiaspmv (spgpuHandle_t handle,
const double *x,
double beta);
/**
* \fn spgpuDhdiaspmm (spgpuHandle_t handle, int count, double *z, int zpitch, const double *y, int ypitch, double alpha, const __device double* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device double *x, int xpitch, double beta)
* Computes double precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offsets for every hack of the HDIA offsets vector, plus a final value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuDhdiaspmm (spgpuHandle_t handle,
int count,
__device double *z,
int zpitch,
const __device double *y,
int ypitch,
double alpha,
const __device double* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device double *x,
int xpitch,
double beta);
/**
* \fn spgpuChdiaspmv (spgpuHandle_t handle, cuFloatComplex* z, const cuFloatComplex *y, cuFloatComplex alpha, const cuFloatComplex* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const cuFloatComplex *x, cuFloatComplex beta)
@ -120,6 +199,43 @@ spgpuChdiaspmv (spgpuHandle_t handle,
const cuFloatComplex *x,
cuFloatComplex beta);
/**
* \fn spgpuChdiaspmm (spgpuHandle_t handle, int count, cuFloatComplex *z, int zpitch, const cuFloatComplex *y, int ypitch, cuFloatComplex alpha, const __device cuFloatComplex* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device cuFloatComplex *x, int xpitch, cuFloatComplex beta)
* Computes single precision complex z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offsets for every hack of the HDIA offsets vector, plus a final value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuChdiaspmm (spgpuHandle_t handle,
int count,
__device cuFloatComplex *z,
int zpitch,
const __device cuFloatComplex *y,
int ypitch,
cuFloatComplex alpha,
const __device cuFloatComplex* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device cuFloatComplex *x,
int xpitch,
cuFloatComplex beta);
/**
* \fn spgpuZhdiaspmv (spgpuHandle_t handle, cuDoubleComplex* z, const cuDoubleComplex *y, cuDoubleComplex alpha, const cuDoubleComplex* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const cuDoubleComplex *x, cuDoubleComplex beta)
@ -150,6 +266,45 @@ spgpuZhdiaspmv (spgpuHandle_t handle,
int cols,
const cuDoubleComplex *x,
cuDoubleComplex beta);
/**
* \fn spgpuZhdiaspmm (spgpuHandle_t handle, int count, cuDoubleComplex *z, int zpitch, const cuDoubleComplex *y, int ypitch, cuDoubleComplex alpha, const __device cuDoubleComplex* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device cuDoubleComplex *x, int xpitch, cuDoubleComplex beta)
* Computes double precision complex z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param dM The stacked HDIA non zero values allocation pointer
* \param offsets The stacked HDIA diagonals offsets vector
* \param hackSize The constant size of every hack (must be a multiple of 32)
* \param hackOffsets the array of base index offsets for every hack of the HDIA offsets vector, plus a final value equal to the size of the offsets vector
* \param rows the rows count
* \param cols the columns count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
*/
void
spgpuZhdiaspmm (spgpuHandle_t handle,
int count,
__device cuDoubleComplex *z,
int zpitch,
const __device cuDoubleComplex *y,
int ypitch,
cuDoubleComplex alpha,
const __device cuDoubleComplex* dM,
const __device int* offsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const __device cuDoubleComplex *x,
int xpitch,
cuDoubleComplex beta);
/** @}*/

@ -68,7 +68,46 @@ void spgpuShellspmv (spgpuHandle_t handle,
float beta,
int baseIndex);
/**
* \fn void spgpuShellspmm (spgpuHandle_t handle,int count,__device float *z,int zpitch,const __device float *y,int ypitch,float alpha, const __device float* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device float *x,int xpitch,float beta,int baseIndex)
* Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The HELL non zero values allocation pointer
* \param rP The HELL column indices allocation pointer
* \param hackSize The constant size of every hack (must be a multiple of 32).
* \param hackOffsets the array of base index offsets for every hack into the HELL non zero values and column indices allocations.
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the HELL matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuShellspmm(spgpuHandle_t handle,
int count,
__device float *z,
int zpitch,
const __device float *y,
int ypitch,
float alpha,
const __device float* cM,
const __device int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const __device float *x,
int xpitch,
float beta,
int baseIndex);
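/* Hedged sketch of a HELL multi-RHS call, mirroring the ELL one earlier:
 * d_hackOffsets points each hack into the cM/rP allocations, and the
 * dense operands again use a common column pitch. All names below are
 * hypothetical. */
#if 0
int pitch = rows;
spgpuShellspmm(handle, nrhs,
               dz, pitch, dy, pitch, 1.0f,
               d_cM, d_rP, hackSize, d_hackOffsets,
               d_rS, NULL /* no reorder array */, rows,
               dx, pitch, 0.0f,
               1 /* Fortran-style base index */);
#endif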
/**
* \fn void spgpuDhellspmv (spgpuHandle_t handle,__device double *z,const __device double *y, double alpha, const __device double* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int avgNnzPerRow, int rows, const __device double *x, double beta,int baseIndex)
@ -105,6 +144,46 @@ void spgpuDhellspmv (spgpuHandle_t handle,
double beta,
int baseIndex);
/**
* \fn void spgpuDhellspmm (spgpuHandle_t handle,int count,__device double *z,int zpitch,const __device double *y,int ypitch,double alpha, const __device double* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device double *x,int xpitch,double beta,int baseIndex)
* Computes double precision z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The HELL non zero values allocation pointer
* \param rP The HELL column indices allocation pointer
* \param hackSize The constant size of every hack (must be a multiple of 32).
* \param hackOffsets the array of base index offsets for every hack into the HELL non zero values and column indices allocations.
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the HELL matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuDhellspmm(spgpuHandle_t handle,
int count,
__device double *z,
int zpitch,
const __device double *y,
int ypitch,
double alpha,
const __device double* cM,
const __device int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const __device double *x,
int xpitch,
double beta,
int baseIndex);
/**
* \fn void spgpuChellspmv (spgpuHandle_t handle,__device cuFloatComplex *z,const __device cuFloatComplex *y, cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int rows, const __device cuFloatComplex *x, cuFloatComplex beta, int baseIndex)
@ -141,7 +220,46 @@ void spgpuChellspmv (spgpuHandle_t handle,
cuFloatComplex beta,
int baseIndex);
/**
* \fn void spgpuChellspmm (spgpuHandle_t handle,int count,__device cuFloatComplex *z,int zpitch,const __device cuFloatComplex *y,int ypitch,cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device cuFloatComplex *x,int xpitch,cuFloatComplex beta,int baseIndex)
* Computes single precision complex z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The HELL non zero values allocation pointer
* \param rP The HELL column indices allocation pointer
* \param hackSize The constant size of every hack (must be a multiple of 32).
* \param hackOffsets the array of base index offsets for every hack into the HELL non zero values and column indices allocations.
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the HELL matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuChellspmm(spgpuHandle_t handle,
int count,
__device cuFloatComplex *z,
int zpitch,
const __device cuFloatComplex *y,
int ypitch,
cuFloatComplex alpha,
const __device cuFloatComplex* cM,
const __device int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const __device cuFloatComplex *x,
int xpitch,
cuFloatComplex beta,
int baseIndex);
/**
* \fn void spgpuZhellspmv (spgpuHandle_t handle,__device cuDoubleComplex *z,const __device cuDoubleComplex *y, cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int avgNnzPerRow, int rows, const __device cuDoubleComplex *x, cuDoubleComplex beta, int baseIndex)
@ -178,6 +296,46 @@ void spgpuZhellspmv (spgpuHandle_t handle,
cuDoubleComplex beta,
int baseIndex);
/**
* \fn void spgpuZhellspmm (spgpuHandle_t handle,int count,__device cuDoubleComplex *z,int zpitch,const __device cuDoubleComplex *y,int ypitch,cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device cuDoubleComplex *x,int xpitch,cuDoubleComplex beta,int baseIndex)
* Computes double precision complex z = alpha*A*x + beta*y, with A stored in Hacked ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The cols count
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The HELL non zero values allocation pointer
* \param rP The HELL column indices allocation pointer
* \param hackSize The constant size of every hack (must be a multiple of 32).
* \param hackOffsets the array of base index offsets for every hack into the HELL non zero values and column indices allocations.
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the HELL matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuZhellspmm(spgpuHandle_t handle,
int count,
__device cuDoubleComplex *z,
int zpitch,
const __device cuDoubleComplex *y,
int ypitch,
cuDoubleComplex alpha,
const __device cuDoubleComplex* cM,
const __device int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const __device cuDoubleComplex *x,
int xpitch,
cuDoubleComplex beta,
int baseIndex);
/** @}*/

@ -14,10 +14,14 @@ OBJS=cabs.o camax.o casum.o caxpby.o caxy.o cdot.o cgath.o \
cnrm2.o cscal.o cscat.o csetscal.o cabgdxyz.o\
dabs.o damax.o dasum.o daxpby.o daxy.o ddot.o dgath.o dabgdxyz.o\
dia_cspmv.o dia_dspmv.o dia_sspmv.o dia_zspmv.o dnrm2.o \
dscal.o dscat.o dsetscal.o ell_ccsput.o ell_cspmv.o \
ell_dcsput.o ell_dspmv.o ell_scsput.o ell_sspmv.o ell_zcsput.o ell_zspmv.o \
hdia_cspmv.o hdia_dspmv.o hdia_sspmv.o hdia_zspmv.o hell_cspmv.o hell_dspmv.o \
hell_sspmv.o hell_zspmv.o igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \
dscal.o dscat.o dsetscal.o ell_ccsput.o ell_cspmv.o ell_dcsput.o \
ell_dspmv.o ell_scsput.o ell_sspmv.o ell_zcsput.o ell_zspmv.o \
ell_cspmm.o ell_dspmm.o ell_sspmm.o ell_zspmm.o \
hdia_cspmv.o hdia_cspmm.o hdia_dspmv.o hdia_dspmm.o \
hdia_sspmv.o hdia_sspmm.o hdia_zspmv.o hdia_zspmm.o \
hell_cspmv.o hell_dspmv.o hell_sspmv.o hell_zspmv.o \
hell_cspmm.o hell_dspmm.o hell_sspmm.o hell_zspmm.o \
igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \
saxpby.o saxy.o sdot.o sgath.o snrm2.o sscal.o sscat.o ssetscal.o zabs.o zamax.o sabgdxyz.o\
zasum.o zaxpby.o zaxy.o zdot.o zgath.o znrm2.o zscal.o zscat.o zsetscal.o zabgdxyz.o \
sxyzw.o cxyzw.o dxyzw.o zxyzw.o

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include "cuComplex.h"
extern "C"
{
#include "core.h"
#include "ell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE cuFloatComplex
#define TYPE_SYMBOL C
#define TEX_FETCH_TYPE cuFloatComplex
#include "ell_spmm_base.cuh"

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include <stdio.h>
extern "C"
{
#include "core.h"
#include "ell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE double
#define TYPE_SYMBOL D
#define TEX_FETCH_TYPE int2
#include "ell_spmm_base.cuh"

@ -0,0 +1,276 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define PRE_CONCAT(A, B) A ## B
#define CONCAT(A, B) PRE_CONCAT(A, B)
#undef GEN_SPGPU_ELL_NAME
#undef X_TEX
#define X_TEX CONCAT(x_tex_, FUNC_SUFFIX)
__device__ __host__ static float zero_float() { return 0.0f; }
__device__ __host__ static cuFloatComplex zero_cuFloatComplex() { return make_cuFloatComplex(0.0, 0.0); }
__device__ __host__ static bool float_isNotZero(float x) { return x != 0.0f; }
__device__ static float float_fma(float a, float b, float c) { return PREC_FADD(PREC_FMUL (a, b), c); }
__device__ static float float_add(float a, float b) { return PREC_FADD (a, b); }
__device__ static float float_mul(float a, float b) { return PREC_FMUL (a, b); }
__device__ static cuFloatComplex cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c) { return cuCfmaf(a, b, c); }
__device__ static cuFloatComplex cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b) { return cuCaddf(a, b); }
__device__ static cuFloatComplex cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b) { return cuCmulf(a, b); }
__device__ static float readValue_float(float fetch) { return fetch; }
__device__ static cuFloatComplex readValue_cuFloatComplex(cuFloatComplex fetch) { return fetch; }
// host or c.c >= 1.3
#if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__)
__device__ __host__ static double zero_double() { return 0.0; }
__device__ __host__ static cuDoubleComplex zero_cuDoubleComplex() { return make_cuDoubleComplex(0.0, 0.0); }
__device__ __host__ static bool double_isNotZero(double x) { return x != 0.0; }
__device__ static double double_fma(double a, double b, double c) { return PREC_DADD(PREC_DMUL (a, b), c); }
__device__ static double double_add(double a, double b) { return PREC_DADD (a, b); }
__device__ static double double_mul(double a, double b) { return PREC_DMUL (a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c) { return cuCfma(a, b, c); }
__device__ static cuDoubleComplex cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a, b); }
__device__ static double readValue_double(int2 fetch) { return __hiloint2double (fetch.y, fetch.x); }
__device__ static cuDoubleComplex readValue_cuDoubleComplex(int4 fetch)
{
cuDoubleComplex c;
c.x = __hiloint2double (fetch.y, fetch.x);
c.y = __hiloint2double (fetch.w, fetch.z);
return c;
}
#endif
#if 0
// Texture cache management
texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX;
#define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x)
#define unbind_tex_x(x) cudaUnbindTexture(X_TEX)
__device__ static VALUE_TYPE
fetchTex (int pointer)
{
TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer);
return CONCAT(readValue_,VALUE_TYPE) (fetch);
}
#endif
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_vanilla)
#define GEN_SPGPU_ELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),ellspmm_vanilla)
#include "ell_spmm_base_template.cuh"
#if 0
#undef GEN_SPGPU_ELL_NAME
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_prefetch)
#define GEN_SPGPU_ELL_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),ellspmm_prefetch)
#undef USE_PREFETCHING
#define USE_PREFETCHING
#include "ell_spmm_base_template.cuh"
#define ENABLE_CACHE
#undef GEN_SPGPU_ELL_NAME
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache_prefetch)
#define GEN_SPGPU_ELL_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache_prefetch)
#include "ell_spmm_base_template.cuh"
#undef GEN_SPGPU_ELL_NAME
#undef USE_PREFETCHING
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache)
#define GEN_SPGPU_ELL_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache)
#include "ell_spmm_base_template.cuh"
#endif
#undef GEN_SPGPU_ELL_NAME
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm)
#if 0
void
GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* rP,
int cMPitch,
int rPPitch,
const __device int* rS,
const __device int* rIdx,
int avgNnzPerRow,
int maxNnzPerRow,
int rows,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta,
int baseIndex)
{
VALUE_TYPE *px,*py, *pz;
int cnt, c1;
dim3 block (THREAD_BLOCK, 1);
// dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
int numMp=getGPUMultiProcessors();
int maxThMp=getGPUMaxThreadsPerMP();
int nmblksMp=maxThMp/THREAD_BLOCK;
int nmblk=nmblksMp*numMp;
dim3 grid (nmblk);
maxShmemSz=getGPUSharedMemPerBlock();
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
if (shrMemSize > maxShmemSz) {
fprintf(stderr,"Fatal error: SHMEM size too large %ld %ld\n",shrMemSize,maxShmemSz);
return;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > 2*MMBSZ) {
CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch, py, yPitch,
alpha, cM, rP, cMPitch, rPPitch,
rS, rows, px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt > MMBSZ) {
c1 = cnt/2;
CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch, py, yPitch,
alpha, cM, rP, cMPitch, rPPitch,
rS, rows, px, xPitch, beta, baseIndex);
cnt -= c1;
}
if (cnt > MMBSZ) {
fprintf(stderr,"Invalid residual count %d\n",cnt);
} else if (cnt > 0){
CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch, py, yPitch,
alpha, cM, rP, cMPitch, rPPitch,
rS, rows, px, xPitch, beta, baseIndex);
}
cudaCheckError("CUDA error on ell_spmm");
}
#endif
void
GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* rP,
int cMPitch,
int rPPitch,
const __device int* rS,
const __device int* rIdx,
int avgNnzPerRow,
int maxNnzPerRow,
int rows,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta,
int baseIndex)
{
VALUE_TYPE *px,*py,*pz;
int cnt;
int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
int maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
//fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz);
while (rows > maxNForACall) {//managing large vectors
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
CONCAT(_,GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, rP,
cMPitch, rPPitch,
rS, rIdx, avgNnzPerRow,
maxNnzPerRow, maxNForACall,
px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, rP,
cMPitch, rPPitch,
rS, rIdx, avgNnzPerRow,
maxNnzPerRow, maxNForACall,
px, xPitch, beta, baseIndex);
}
y = y + maxNForACall;
z = z + maxNForACall;
cM = cM + maxNForACall;
rP = rP + maxNForACall;
rS = rS + maxNForACall;
rows -= maxNForACall;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
CONCAT(_,GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, rP,
cMPitch, rPPitch,
rS, rIdx, avgNnzPerRow,
maxNnzPerRow, rows,
px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, rP,
cMPitch, rPPitch,
rS, rIdx, avgNnzPerRow,
maxNnzPerRow, rows,
px, xPitch, beta, baseIndex);
}
cudaCheckError("CUDA error on ell_spmm");
}
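/* The wrapper above processes the `count` right-hand sides in panels of
   MMBSZ columns: e.g. (illustrative figures) count = 20 with MMBSZ = 8
   yields panels of 8, 8 and 4 columns, each handled by one kernel launch
   and advancing x, y and z by pitch*MMBSZ elements between panels. */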

@ -0,0 +1,169 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define IDX2
#define THREAD_BLOCK 128
#define MMBSZ 8
#if 0
__global__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
int cMPitch, int rPPitch, const int* rS, int rows,
const VALUE_TYPE *x, int xPitch,
VALUE_TYPE beta, int baseIndex)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int gridSize = gridDim.x * blockDim.x;
while (i < rows) {
rS += i; rP += i; cM += i;
		int rowSize = rS[0];
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
for (int j = 0; j < rowSize; j++) {
int pointer;
VALUE_TYPE value;
VALUE_TYPE fetch;
pointer = rP[0] - baseIndex;
rP += rPPitch;
value = cM[0];
cM += cMPitch;
px = (VALUE_TYPE *) x;
for (int k=0; k<count; k++) {
fetch = px[pointer];
temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
} else {
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
i += gridSize;
}
}
#endif
__global__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
int cMPitch, int rPPitch, const int* rS, int rows,
const VALUE_TYPE *x, int xPitch,
VALUE_TYPE beta, int baseIndex)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
if (i < rows) {
rS += i; rP += i; cM += i;
int rowSize = rS[0];
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
for (int j = 0; j < rowSize; j++) {
int pointer;
VALUE_TYPE value;
VALUE_TYPE fetch;
pointer = rP[0] - baseIndex;
rP += rPPitch;
value = cM[0];
cM += cMPitch;
px = (VALUE_TYPE *) x;
for (int k=0; k<count; k++) {
fetch = px[pointer];
temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
} else {
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
}
}
void
CONCAT(_,GEN_SPGPU_ELL_NAME(TYPE_SYMBOL))
(spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch,
const int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows,
const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta, int baseIndex)
{
dim3 block (THREAD_BLOCK, 1);
dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch, y, yPitch,
alpha, cM, rP, cMPitch, rPPitch, rS, rows,
x, xPitch, beta, baseIndex);
}

@ -0,0 +1,34 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
extern "C"
{
#include "core.h"
#include "ell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE float
#define TYPE_SYMBOL S
#define TEX_FETCH_TYPE float
#include "ell_spmm_base.cuh"

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include "cuComplex.h"
extern "C"
{
#include "core.h"
#include "ell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE cuDoubleComplex
#define TYPE_SYMBOL Z
#define TEX_FETCH_TYPE int4
#include "ell_spmm_base.cuh"

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include "cuComplex.h"
extern "C"
{
#include "core.h"
#include "hdia.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE cuFloatComplex
#define TYPE_SYMBOL C
#define TEX_FETCH_TYPE cuFloatComplex
#include "hdia_spmm_base.cuh"

@ -0,0 +1,36 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
extern "C"
{
#include "core.h"
#include "hdia.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
//#define ENABLE_CACHE
#define VALUE_TYPE double
#define TYPE_SYMBOL D
//#define TEX_FETCH_TYPE int2
#include "hdia_spmm_base.cuh"

@ -0,0 +1,188 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define PRE_CONCAT(A, B) A ## B
#define CONCAT(A, B) PRE_CONCAT(A, B)
#undef GEN_SPGPU_HDIA_NAME
#undef X_TEX
#define X_TEX CONCAT(x_tex_, FUNC_SUFFIX)
__device__ __host__ static float zero_float() { return 0.0f; }
__device__ __host__ static cuFloatComplex zero_cuFloatComplex() { return make_cuFloatComplex(0.0, 0.0); }
__device__ __host__ static bool float_isNotZero(float x) { return x != 0.0f; }
__device__ static float float_fma(float a, float b, float c) { return PREC_FADD(PREC_FMUL (a, b), c); }
__device__ static float float_add(float a, float b) { return PREC_FADD (a, b); }
__device__ static float float_mul(float a, float b) { return PREC_FMUL (a, b); }
__device__ static cuFloatComplex cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c) { return cuCfmaf(a, b, c); }
__device__ static cuFloatComplex cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b) { return cuCaddf(a, b); }
__device__ static cuFloatComplex cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b) { return cuCmulf(a, b); }
__device__ static float readValue_float(float fetch) { return fetch; }
__device__ static cuFloatComplex readValue_cuFloatComplex(cuFloatComplex fetch) { return fetch; }
// host or c.c >= 1.3
#if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__)
__device__ __host__ static double zero_double() { return 0.0; }
__device__ __host__ static cuDoubleComplex zero_cuDoubleComplex() { return make_cuDoubleComplex(0.0, 0.0); }
__device__ __host__ static bool double_isNotZero(double x) { return x != 0.0; }
__device__ static double double_fma(double a, double b, double c) { return PREC_DADD(PREC_DMUL (a, b), c); }
__device__ static double double_add(double a, double b) { return PREC_DADD (a, b); }
__device__ static double double_mul(double a, double b) { return PREC_DMUL (a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c) { return cuCfma(a, b, c); }
__device__ static cuDoubleComplex cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a, b); }
__device__ static double readValue_double(int2 fetch) { return __hiloint2double (fetch.y, fetch.x); }
__device__ static cuDoubleComplex readValue_cuDoubleComplex(int4 fetch)
{
cuDoubleComplex c;
c.x = __hiloint2double (fetch.y, fetch.x);
c.y = __hiloint2double (fetch.w, fetch.z);
return c;
}
#endif
#if 0
// Texture cache management
texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX;
#define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x)
#define unbind_tex_x(x) cudaUnbindTexture(X_TEX)
__device__ static VALUE_TYPE
fetchTex (int pointer)
{
TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer);
return CONCAT(readValue_,VALUE_TYPE) (fetch);
}
#endif
#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_vanilla)
#define GEN_SPGPU_HDIA_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_vanilla)
#include "hdia_spmm_base_template.cuh"
#if 0
#undef GEN_SPGPU_HDIA_NAME
#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_prefetch)
#define GEN_SPGPU_HDIA_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_prefetch)
#undef USE_PREFETCHING
#define USE_PREFETCHING
#include "hdia_spmm_base_template.cuh"
#define ENABLE_CACHE
#undef GEN_SPGPU_HDIA_NAME
#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache_prefetch)
#define GEN_SPGPU_HDIA_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache_prefetch)
#include "hdia_spmm_base_template.cuh"
#undef GEN_SPGPU_HDIA_NAME
#undef USE_PREFETCHING
#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache)
#define GEN_SPGPU_HDIA_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache)
#include "hdia_spmm_base_template.cuh"
#endif
#undef GEN_SPGPU_HDIA_NAME
#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm)
void
GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* hdiaOffsets,
int hackSize,
const __device int* hackOffsets,
int rows,
int cols,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta)
{
VALUE_TYPE *px,*py, *pz;
int cnt;
int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
// maxNForACall should be a multiple of hackSize
maxNForACall = (maxNForACall/hackSize)*hackSize;
int maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
//fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz);
while (rows > maxNForACall) {//managing large vectors
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
maxNForACall, cols,
px, xPitch, beta);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
maxNForACall, cols,
px, xPitch, beta);
}
y = y + maxNForACall;
z = z + maxNForACall;
hackOffsets = hackOffsets + maxNForACall/hackSize;
rows -= maxNForACall;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
rows, cols,
px, xPitch, beta);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, hdiaOffsets,
hackSize, hackOffsets,
rows, cols,
px, xPitch, beta);
}
cudaCheckError("CUDA error on hdiag_spmm");
}

@ -0,0 +1,141 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define THREAD_BLOCK 128
#define MMBSZ 8
__global__ void
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y,
int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* hdiaOffsets,
int hackSize, const int* hackOffsets, int rows, int cols,
const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
int hackCount = (rows + hackSize - 1)/hackSize;
int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
int hackId = i / hackSize;
int hackLaneId = i % hackSize;
// shared between offsetsChunks and warpHackOffsetTemp
extern __shared__ int dynShrMem[];
int hackOffset = 0;
int nextOffset = 0;
unsigned int laneId = threadIdx.x % warpSize;
unsigned int warpId = threadIdx.x / warpSize;
if (laneId == 0 && i < rows) {
hackOffset = hackOffsets[hackId];
nextOffset = hackOffsets[hackId+1];
}
hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0);
nextOffset = __shfl_sync(0xFFFFFFFF,nextOffset, 0);
if (hackId >= hackCount)
return;
cM += hackOffset*hackSize + hackLaneId;
hdiaOffsets += hackOffset;
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
// diags for this hack is next hackOffset minus current hackOffset
int diags = nextOffset - hackOffset;
// Warp oriented
int rounds = (diags + warpSize - 1)/warpSize;
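	// e.g. (illustrative figures) a hack holding 70 diagonals is swept
	// in rounds of 32, 32 and 6 offsets when warpSize is 32.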
volatile int *offsetsChunk = dynShrMem + warpId*warpSize;
for (int r = 0; r < rounds; r++) {
// in the last round diags will be <= warpSize
if (laneId < diags)
offsetsChunk[laneId] = hdiaOffsets[laneId];
if (i < rows) {
int dCount = min(diags, warpSize);
for (int j = 0; j < dCount; ++j) {
int column = offsetsChunk[j] + i;
if(column >= 0 && column < cols) {
px = (VALUE_TYPE *) x;
for (int k = 0; k < count; k++) {
VALUE_TYPE xValue = px[column];
VALUE_TYPE mValue = cM[0];
temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
cM += hackSize;
}
}
diags -= warpSize;
hdiaOffsets += warpSize;
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
if (i >= rows)
return;
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
else
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
void
CONCAT(_,GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL))
(spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y,
int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* hdiaOffsets, int hackSize,
const __device int* hackOffsets, int rows, int cols, const VALUE_TYPE *x,
int xPitch, VALUE_TYPE beta)
{
dim3 block (THREAD_BLOCK, 1);
dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
maxShmemSz = getGPUSharedMemPerBlock();
shrMemSize = MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch, y, yPitch,
alpha, cM, hdiaOffsets, hackSize, hackOffsets, rows, cols,
x, xPitch, beta);
}

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
extern "C"
{
#include "core.h"
#include "hdia.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE float
#define TYPE_SYMBOL S
#define TEX_FETCH_TYPE float
#include "hdia_spmm_base.cuh"

@ -0,0 +1,36 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include "cuComplex.h"
extern "C"
{
#include "core.h"
#include "hdia.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE cuDoubleComplex
#define TYPE_SYMBOL Z
#define TEX_FETCH_TYPE int4
#include "hdia_spmm_base.cuh"

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include "cuComplex.h"
extern "C"
{
#include "core.h"
#include "hell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE cuFloatComplex
#define TYPE_SYMBOL C
#define TEX_FETCH_TYPE cuFloatComplex
#include "hell_spmm_base.cuh"

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include <stdio.h>
extern "C"
{
#include "core.h"
#include "hell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE double
#define TYPE_SYMBOL D
#define TEX_FETCH_TYPE int2
#include "hell_spmm_base.cuh"

@ -16,11 +16,14 @@
#include "cudadebug.h"
#include "cudalang.h"
#include <stdio.h>
extern "C"
{
#include "core.h"
#include "hell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
@ -29,4 +32,3 @@ extern "C"
#define TYPE_SYMBOL D
#define TEX_FETCH_TYPE int2
#include "hell_spmv_base.cuh"

@ -0,0 +1,274 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define PRE_CONCAT(A, B) A ## B
#define CONCAT(A, B) PRE_CONCAT(A, B)
#undef GEN_SPGPU_HELL_NAME
#undef X_TEX
#define X_TEX CONCAT(x_tex_, FUNC_SUFFIX)
__device__ __host__ static float zero_float() { return 0.0f; }
__device__ __host__ static cuFloatComplex zero_cuFloatComplex() { return make_cuFloatComplex(0.0, 0.0); }
__device__ __host__ static bool float_isNotZero(float x) { return x != 0.0f; }
__device__ static float float_fma(float a, float b, float c) { return PREC_FADD(PREC_FMUL (a, b), c); }
__device__ static float float_add(float a, float b) { return PREC_FADD (a, b); }
__device__ static float float_mul(float a, float b) { return PREC_FMUL (a, b); }
__device__ static cuFloatComplex cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c) { return cuCfmaf(a, b, c); }
__device__ static cuFloatComplex cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b) { return cuCaddf(a, b); }
__device__ static cuFloatComplex cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b) { return cuCmulf(a, b); }
__device__ static float readValue_float(float fetch) { return fetch; }
__device__ static cuFloatComplex readValue_cuFloatComplex(cuFloatComplex fetch) { return fetch; }
// host or c.c >= 1.3
#if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__)
__device__ __host__ static double zero_double() { return 0.0; }
__device__ __host__ static cuDoubleComplex zero_cuDoubleComplex() { return make_cuDoubleComplex(0.0, 0.0); }
__device__ __host__ static bool double_isNotZero(double x) { return x != 0.0; }
__device__ static double double_fma(double a, double b, double c) { return PREC_DADD(PREC_DMUL (a, b), c); }
__device__ static double double_add(double a, double b) { return PREC_DADD (a, b); }
__device__ static double double_mul(double a, double b) { return PREC_DMUL (a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c) { return cuCfma(a, b, c); }
__device__ static cuDoubleComplex cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a, b); }
__device__ static double readValue_double(int2 fetch) { return __hiloint2double (fetch.y, fetch.x); }
__device__ static cuDoubleComplex readValue_cuDoubleComplex(int4 fetch)
{
cuDoubleComplex c;
c.x = __hiloint2double (fetch.y, fetch.x);
c.y = __hiloint2double (fetch.w, fetch.z);
return c;
}
#endif
#if 0
// Texture cache management
texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX;
#define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x)
#define unbind_tex_x(x) cudaUnbindTexture(X_TEX)
__device__ static VALUE_TYPE
fetchTex (int pointer)
{
TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer);
return CONCAT(readValue_,VALUE_TYPE) (fetch);
}
#endif
#if __CUDA_ARCH__ < 300
extern __shared__ int dynShrMem[];
#endif
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
#include "hell_spmm_base_template.cuh"
#undef GEN_SPGPU_HELL_NAME
#if 0
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_prefetch)
#define GEN_SPGPU_HELL_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hellspmm_prefetch)
#undef USE_PREFETCHING
#define USE_PREFETCHING
#include "hell_spmm_base_template.cuh"
#define ENABLE_CACHE
#undef GEN_SPGPU_HELL_NAME
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache_prefetch)
#define GEN_SPGPU_HELL_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache_prefetch)
#include "hell_spmm_base_template.cuh"
#undef GEN_SPGPU_HELL_NAME
#undef USE_PREFETCHING
#endif
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache)
#define GEN_SPGPU_HELL_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache)
#include "hell_spmm_base_template.cuh"
#undef GEN_SPGPU_HELL_NAME
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm)
#if 0
void
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta,
int baseIndex)
{
VALUE_TYPE *px,*py, *pz;
int cnt, c1;
dim3 block (THREAD_BLOCK, 1);
// dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
int numMp=getGPUMultiProcessors();
int maxThMp=getGPUMaxThreadsPerMP();
int nmblksMp=maxThMp/THREAD_BLOCK;
int nmblk=nmblksMp*numMp;
dim3 grid (nmblk);
maxShmemSz=getGPUSharedMemPerBlock();
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
if (shrMemSize > maxShmemSz) {
fprintf(stderr,"Fatal error: SHMEM size too large %ld %ld\n",shrMemSize,maxShmemSz);
return;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > 2*MMBSZ) {
CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch,py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rows, px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt > MMBSZ) {
c1 = cnt/2;
CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch,py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rows, px, xPitch, beta, baseIndex);
cnt -= c1;
}
if (cnt > MMBSZ) {
fprintf(stderr,"Invalid residual count %d\n",cnt);
} else if (cnt > 0){
CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch,py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rows, px, xPitch, beta, baseIndex);
}
cudaCheckError("CUDA error on hell_spmm");
}
#endif
void
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta,
int baseIndex)
{
VALUE_TYPE *px,*py, *pz;
int cnt;
int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
// maxNForACall should be a multiple of hackSize
maxNForACall = (maxNForACall/hackSize)*hackSize;
int maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
//fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz);
while (rows > maxNForACall) {//managing large vectors
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, rP,
hackSize, hackOffsets,
rS, rIdx,
maxNForACall,
px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt > 0) {
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, rP,
hackSize, hackOffsets,
rS, rIdx,
maxNForACall,
px, xPitch, beta, baseIndex);
}
y = y + maxNForACall;
z = z + maxNForACall;
hackOffsets = hackOffsets + maxNForACall/hackSize;
rS = rS + maxNForACall;
rows -= maxNForACall;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rIdx, rows,
px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt > 0) {
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, rP,
hackSize, hackOffsets,
rS, rIdx,
rows,
px, xPitch, beta, baseIndex);
}
cudaCheckError("CUDA error on hell_spmm");
}
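/* The routine above tiles the product twice: rows are swept in chunks of
   maxNForACall (rounded down to a multiple of hackSize so every chunk starts
   on a hack boundary), and within each chunk the right-hand sides are
   processed MMBSZ at a time to fit the kernel's shared-memory buffer; e.g.
   with count = 20 and MMBSZ = 8 the kernel is launched on 8, 8 and 4
   right-hand sides in turn. */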

@ -0,0 +1,194 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define IDX2
#define THREAD_BLOCK 128
#define MMBSZ 8
#if 0
__global__ void
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
int hackSize, const int* hackOffsets, const int* rS, int rows,
const VALUE_TYPE *x, int xPitch,
VALUE_TYPE beta, int baseIndex)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
int *rrP;
VALUE_TYPE *rcM;
unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int gridSize = gridDim.x * blockDim.x;
while (i < rows) {
int j;
int hackId = i / hackSize;
int hackLaneId = i % hackSize;
int hackOffset;
unsigned int laneId = threadIdx.x % 32;
if (laneId == 0)
hackOffset = hackOffsets[hackId];
//__syncthreads();
hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId;
rrP = (int *) rP + hackOffset;
rcM = (VALUE_TYPE *) cM + hackOffset;
int rowSize = rS[i];
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
for (int j = 0; j < rowSize; j++) {
int pointer;
VALUE_TYPE value;
VALUE_TYPE fetch;
pointer = rrP[0] - baseIndex;
rrP += hackSize;
value = rcM[0];
rcM += hackSize;
px = (VALUE_TYPE *) x;
for (int k=0; k<count; k++) {
fetch = px[pointer];
temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
} else {
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
i += gridSize;
}
}
#endif
__global__ void
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
int hackSize, const int* hackOffsets, const int* rS, int rows,
const VALUE_TYPE *x, int xPitch,
VALUE_TYPE beta, int baseIndex)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
if (i < rows) {
int j;
int hackId = i / hackSize;
int hackLaneId = i % hackSize;
int hackOffset;
unsigned int laneId = threadIdx.x % 32;
if (laneId == 0)
hackOffset = hackOffsets[hackId];
//__syncthreads();
hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId;
rP += hackOffset;
cM += hackOffset;
int rowSize = rS[i];
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
for (int j = 0; j < rowSize; j++) {
int pointer;
VALUE_TYPE value;
VALUE_TYPE fetch;
pointer = rP[0] - baseIndex;
rP += hackSize;
value = cM[0];
cM += hackSize;
px = (VALUE_TYPE *) x;
for (int k=0; k<count; k++) {
fetch = px[pointer];
temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
else
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
}
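/* For reference, a hypothetical host-side sketch (not part of the library) of
   the hacked-ELLPACK layout walked by the kernel above, for one right-hand
   side and without the alpha/beta scaling: row i lives in hack i/hackSize at
   lane i%hackSize, and its j-th entry sits hackSize elements apart in both rP
   and cM. */
#if 0
static void hellReferenceSpmv(int rows, int hackSize, const int *hackOffsets,
                              const int *rS, const int *rP, const double *cM,
                              const double *x, double *y, int baseIndex)
{
  for (int i = 0; i < rows; i++) {
    int off = hackOffsets[i / hackSize] + i % hackSize;
    double t = 0.0;
    for (int j = 0; j < rS[i]; j++) {
      t += cM[off] * x[rP[off] - baseIndex]; /* j-th coefficient of row i */
      off += hackSize;                       /* columns are hackSize apart */
    }
    y[i] = t;
  }
}
#endif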
void
CONCAT(_,GEN_SPGPU_HELL_NAME(TYPE_SYMBOL))
(spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int hackSize, const int* hackOffsets,
const int* rS, const __device int* rIdx, int rows,
const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta, int baseIndex)
{
dim3 block (THREAD_BLOCK, 1);
dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
if (shrMemSize > maxShmemSz) {
  fprintf(stderr,"Fatal error: SHMEM size too large %d %d\n",shrMemSize,maxShmemSz);
  return;
}
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch,y, yPitch,
alpha, cM, rP, hackSize, hackOffsets, rS, rows,
x, xPitch, beta, baseIndex);
}

@ -14,7 +14,6 @@
* GNU General Public License for more details.
*/
#define PRE_CONCAT(A, B) A ## B
#define CONCAT(A, B) PRE_CONCAT(A, B)
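/* Two-stage expansion example: with VALUE_TYPE defined as double,
   CONCAT(zero_, VALUE_TYPE)() expands to zero_double(); the PRE_CONCAT
   indirection is what lets VALUE_TYPE be macro-expanded before the tokens
   are pasted with ##. */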
@ -156,4 +155,3 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
cudaCheckError("CUDA error on hell_spmv");
}

@ -0,0 +1,34 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
extern "C"
{
#include "core.h"
#include "hell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE float
#define TYPE_SYMBOL S
#define TEX_FETCH_TYPE float
#include "hell_spmm_base.cuh"

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include "cuComplex.h"
extern "C"
{
#include "core.h"
#include "hell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE cuDoubleComplex
#define TYPE_SYMBOL Z
#define TEX_FETCH_TYPE int4
#include "hell_spmm_base.cuh"

@ -38,8 +38,10 @@
#include "core.h"
//new
MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count, unsigned int size,
unsigned int elementType)
MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count,
unsigned int size,
unsigned int elementType,
unsigned int storage2d)
{
struct MultiVectorDeviceParams params;
@ -75,6 +77,7 @@ MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count, unsigned
params.count = count;
params.size = size;
params.storage2d = storage2d;
return params;
@ -87,8 +90,9 @@ int allocMultiVecDevice(void ** remoteMultiVec, struct MultiVectorDeviceParams *
struct MultiVectDevice *tmp = (struct MultiVectDevice *)malloc(sizeof(struct MultiVectDevice));
*remoteMultiVec = (void *)tmp;
tmp->size_ = params->size;
tmp->count_ = params->count;
tmp->size_ = params->size;
tmp->count_ = params->count;
tmp->storage2d_ = params->storage2d;
if (params->elementType == SPGPU_TYPE_INT)
{
@ -163,9 +167,9 @@ int FallocMultiVecDevice(void** deviceMultiVec, unsigned int count,
{ int i;
struct MultiVectorDeviceParams p;
p = getMultiVectorDeviceParams(count, size, elementType);
p = getMultiVectorDeviceParams(count, size, elementType, PSB_VECT_STOR_BY_COLS);
i = allocMultiVecDevice(deviceMultiVec, &p);
//fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.size);
//fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.count);
//cudaSync();
if (i != 0) {
fprintf(stderr,"From routine : %s : %d, %d %d \n","FallocMultiVecDevice",i, count, size);
@ -194,3 +198,9 @@ int getMultiVecDevicePitch(void* deviceVec)
return(i);
}
int getMultiVecDeviceStorage2d(void* deviceVec)
{ int i;
struct MultiVectDevice *dev = (struct MultiVectDevice *) deviceVec;
i = dev->storage2d_;
return(i);
}

@ -37,8 +37,14 @@
#include "cintrf.h"
#include <complex.h>
#define PSB_VECT_STOR_BY_COLS 0
#define PSB_VECT_STOR_BY_ROWS 1
struct MultiVectDevice
{
// storage of 2d data
int storage2d_;
// number of vectors
int count_;
@ -54,17 +60,19 @@ struct MultiVectDevice
typedef struct MultiVectorDeviceParams
{
// number of vectors
unsigned int count; //1 for a simple vector
// The resulting allocation will be pitch*count*(size of the elementType)
unsigned int elementType;
// Pitch (in number of elements)
unsigned int pitch;
// Size of a single vector (in number of elements).
unsigned int size;
// storage of 2d data
unsigned int storage2d;
// number of vectors
unsigned int count; //1 for a simple vector
// The resulting allocation will be pitch*count*(size of the elementType)
unsigned int elementType;
// Pitch (in number of elements)
unsigned int pitch;
// Size of a single vector (in number of elements).
unsigned int size;
} MultiVectorDeviceParams;
@ -76,7 +84,8 @@ int unregisterMapped(void *);
MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count,
unsigned int size,
unsigned int elementType);
unsigned int elementType,
unsigned int storage2d);
int FallocMultiVecDevice(void** deviceMultiVec, unsigned count,
unsigned int size, unsigned int elementType);
@ -85,3 +94,4 @@ int allocMultiVecDevice(void ** remoteMultiVec, struct MultiVectorDeviceParams *
int getMultiVecDeviceSize(void* deviceVec);
int getMultiVecDeviceCount(void* deviceVec);
int getMultiVecDevicePitch(void* deviceVec);
int getMultiVecDeviceStorage2d(void* deviceVec);
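/* Layout note (an assumption, not stated by this interface): with
   PSB_VECT_STOR_BY_COLS each of the count vectors is expected to be stored
   contiguously with leading dimension pitch, i.e. entry i of vector k at
   v[k*pitch + i]; PSB_VECT_STOR_BY_ROWS would interleave the vectors
   row by row instead. */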

@ -7,16 +7,16 @@ HERE=..
OBJS=psb_s_prec_type_impl.o psb_d_prec_type_impl.o \
psb_c_prec_type_impl.o psb_z_prec_type_impl.o \
psb_d_diagprec_impl.o psb_d_bjacprec_impl.o psb_d_nullprec_impl.o \
psb_dilu_fct.o psb_d_ilu0_fact.o psb_d_iluk_fact.o psb_d_ilut_fact.o \
psb_d_ilu0_fact.o psb_d_iluk_fact.o psb_d_ilut_fact.o \
psb_dprecbld.o psb_dprecinit.o \
psb_s_diagprec_impl.o psb_s_bjacprec_impl.o psb_s_nullprec_impl.o \
psb_silu_fct.o psb_s_ilu0_fact.o psb_s_iluk_fact.o psb_s_ilut_fact.o \
psb_s_ilu0_fact.o psb_s_iluk_fact.o psb_s_ilut_fact.o \
psb_sprecbld.o psb_sprecinit.o \
psb_c_diagprec_impl.o psb_c_bjacprec_impl.o psb_c_nullprec_impl.o \
psb_cilu_fct.o psb_c_ilu0_fact.o psb_c_iluk_fact.o psb_c_ilut_fact.o \
psb_c_ilu0_fact.o psb_c_iluk_fact.o psb_c_ilut_fact.o \
psb_cprecbld.o psb_cprecinit.o \
psb_z_diagprec_impl.o psb_z_bjacprec_impl.o psb_z_nullprec_impl.o \
psb_zilu_fct.o psb_z_ilu0_fact.o psb_z_iluk_fact.o psb_z_ilut_fact.o \
psb_z_ilu0_fact.o psb_z_iluk_fact.o psb_z_ilut_fact.o \
psb_zprecbld.o psb_zprecinit.o \
psb_c_sparsify.o psb_d_sparsify.o psb_s_sparsify.o psb_z_sparsify.o \
psb_crwclip.o psb_drwclip.o psb_srwclip.o psb_zrwclip.o \

@ -1,438 +0,0 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
subroutine psb_cilu_fct(a,l,u,d,info,blck)
!
! This routine copies and factors "on the fly" from A and BLCK
! into L/D/U.
!
!
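! The algorithm is the classical row-wise (IKJ) ILU(0): for each row i the
! multiplier l(i,k) is obtained as a(i,k)*d(k) (d holds the inverses of the
! pivots), row i is then updated with row k of U, and finally row i of U is
! scaled by the inverted pivot so that both factors apply with unit diagonal.
!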
use psb_base_mod
implicit none
! .. Scalar Arguments ..
integer(psb_ipk_), intent(out) :: info
! .. Array Arguments ..
type(psb_cspmat_type),intent(in) :: a
type(psb_c_csr_sparse_mat),intent(inout) :: l,u
type(psb_cspmat_type),intent(in), optional, target :: blck
complex(psb_spk_), intent(inout) :: d(:)
! .. Local Scalars ..
integer(psb_ipk_) :: l1, l2,m,err_act
type(psb_cspmat_type), pointer :: blck_
character(len=20) :: name, ch_err
name='psb_ilu_fct'
info = psb_success_
call psb_erractionsave(err_act)
! .. Executable Statements ..
!
if (present(blck)) then
blck_ => blck
else
allocate(blck_,stat=info)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate')
goto 9999
end if
call blck_%csall(izero,izero,info,ione)
endif
call psb_cilu_fctint(m,a%get_nrows(),a,blck_%get_nrows(),blck_,&
& d,l%val,l%ja,l%irp,u%val,u%ja,u%irp,l1,l2,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_cilu_fctint'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
call l%set_triangle()
call l%set_lower()
call l%set_unit()
call u%set_triangle()
call u%set_upper()
call u%set_unit()
call l%set_nrows(m)
call l%set_ncols(m)
call u%set_nrows(m)
call u%set_ncols(m)
if (present(blck)) then
blck_ => null()
else
call blck_%free()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_free'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
deallocate(blck_)
endif
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
contains
subroutine psb_cilu_fctint(m,ma,a,mb,b,&
& d,laspk,lia1,lia2,uaspk,uia1,uia2,l1,l2,info)
implicit none
type(psb_cspmat_type) :: a,b
integer(psb_ipk_) :: m,ma,mb,l1,l2,info
integer(psb_ipk_), dimension(:) :: lia1,lia2,uia1,uia2
complex(psb_spk_), dimension(:) :: laspk,uaspk,d
integer(psb_ipk_) :: i,j,k,l,low1,low2,kk,jj,ll, irb, ktrw,err_act, nz
complex(psb_spk_) :: dia,temp
integer(psb_ipk_), parameter :: nrb=60
type(psb_c_coo_sparse_mat) :: trw
integer(psb_ipk_) :: int_err(5)
character(len=20) :: name, ch_err
name='psb_cilu_fctint'
if(psb_get_errstatus() /= 0) return
info=psb_success_
call psb_erractionsave(err_act)
call trw%allocate(izero,izero,ione)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_all'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
lia2(1) = 1
uia2(1) = 1
l1=0
l2=0
m = ma+mb
do i = 1, ma
d(i) = czero
!
!
select type(aa => a%a)
type is (psb_c_csr_sparse_mat)
do j = aa%irp(i), aa%irp(i+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i,i+irb-1,trw,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
nz = trw%get_nzeros()
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
!!$
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloop: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloop
end if
enddo
!
else if (j == i) then
! j=i update diagonal
! write(psb_err_unit,*)'updating dia',dia,'temp',temp,'jj',jj,'u%aspk',uaspk(jj)
dia = dia - temp*uaspk(jj)
! write(psb_err_unit,*)'dia',dia,'temp',temp,'jj',jj,'aspk',uaspk(jj)
cycle updateloop
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloop
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloop
enddo
!
!
! Non singularity
!
if (abs(dia) < s_epstol) then
!
! Pivot too small: unstable factorization
!
info = psb_err_pivot_too_small_
int_err(1) = i
write(ch_err,'(g20.10)') abs(dia)
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = cone/dia
end if
d(i) = dia
! write(psb_err_unit,*)'diag(',i,')=',d(i)
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
do i = ma+1, m
d(i) = czero
select type(aa => b%a)
type is (psb_c_csr_sparse_mat)
do j = aa%irp(i-ma), aa%irp(i-ma+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i-ma,i-ma+irb-1,trw,info)
nz = trw%get_nzeros()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloopb: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloopb
end if
enddo
!
else if (j == i) then
! j=i update diagonal
dia = dia - temp*uaspk(jj)
cycle updateloopb
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloopb
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloopb
enddo
!
!
! Non singularity
!
if (abs(dia) < s_epstol) then
!
! Pivot too small: unstable factorization
!
int_err(1) = i
write(ch_err,'(g20.10)') abs(dia)
info = psb_err_pivot_too_small_
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = cone/dia
end if
d(i) = dia
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
call trw%free()
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
end subroutine psb_cilu_fctint
end subroutine psb_cilu_fct

@ -1,441 +0,0 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
subroutine psb_dilu_fct(a,l,u,d,info,blck)
!
! This routine copies and factors "on the fly" from A and BLCK
! into L/D/U.
!
!
use psb_base_mod
implicit none
! .. Scalar Arguments ..
integer(psb_ipk_), intent(out) :: info
! .. Array Arguments ..
type(psb_dspmat_type),intent(in) :: a
type(psb_d_csr_sparse_mat),intent(inout) :: l,u
type(psb_dspmat_type),intent(in), optional, target :: blck
real(psb_dpk_), intent(inout) :: d(:)
! .. Local Scalars ..
integer(psb_ipk_) :: l1,l2,m,err_act
type(psb_dspmat_type), pointer :: blck_
character(len=20) :: name, ch_err
name='psb_ilu_fct'
info = psb_success_
call psb_erractionsave(err_act)
! .. Executable Statements ..
!
if (present(blck)) then
blck_ => blck
else
allocate(blck_,stat=info)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate')
goto 9999
end if
call blck_%csall(izero,izero,info,ione)
endif
call psb_dilu_fctint(m,a%get_nrows(),a,blck_%get_nrows(),blck_,&
& d,l%val,l%ja,l%irp,u%val,u%ja,u%irp,l1,l2,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_dilu_fctint'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
call l%set_triangle()
call l%set_lower()
call l%set_unit()
call u%set_triangle()
call u%set_upper()
call u%set_unit()
call l%set_nrows(m)
call l%set_ncols(m)
call u%set_nrows(m)
call u%set_ncols(m)
if (present(blck)) then
blck_ => null()
else
call blck_%free()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_free'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
deallocate(blck_)
endif
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
contains
subroutine psb_dilu_fctint(m,ma,a,mb,b,&
& d,laspk,lia1,lia2,uaspk,uia1,uia2,l1,l2,info)
use psb_mat_mod
implicit none
type(psb_dspmat_type), target :: a
type(psb_dspmat_type), target :: b
integer(psb_ipk_) :: m,ma,mb,l1,l2,info
integer(psb_ipk_), dimension(:) :: lia1,lia2,uia1,uia2
real(psb_dpk_), dimension(:) :: laspk,uaspk,d
integer(psb_ipk_) :: i,j,k,l,low1,low2,kk,jj,ll, irb, ktrw,err_act, nz
real(psb_dpk_) :: dia,temp
integer(psb_ipk_), parameter :: nrb=60
type(psb_d_coo_sparse_mat) :: trw
integer(psb_ipk_) :: int_err(5)
character(len=20) :: name, ch_err
name='psb_dilu_fctint'
if(psb_get_errstatus() /= 0) return
info=psb_success_
call psb_erractionsave(err_act)
call trw%allocate(izero,izero,ione)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_all'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
lia2(1) = 1
uia2(1) = 1
l1=0
l2=0
m = ma+mb
do i = 1, ma
d(i) = dzero
!
!
select type(aa => a%a)
type is (psb_d_csr_sparse_mat)
do j = aa%irp(i), aa%irp(i+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i,i+irb-1,trw,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
nz = trw%get_nzeros()
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
!!$
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloop: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloop
end if
enddo
!
else if (j == i) then
! j=i update diagonal
! write(psb_err_unit,*)'updating dia',dia,'temp',temp,'jj',jj,'u%aspk',uaspk(jj)
dia = dia - temp*uaspk(jj)
! write(psb_err_unit,*)'dia',dia,'temp',temp,'jj',jj,'aspk',uaspk(jj)
cycle updateloop
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloop
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloop
enddo
!
!
! Non singularity
!
if (dabs(dia) < d_epstol) then
!
! Pivot too small: unstable factorization
!
info = psb_err_pivot_too_small_
int_err(1) = i
write(ch_err,'(g20.10)') dia
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = done/dia
end if
d(i) = dia
! write(psb_err_unit,*)'diag(',i,')=',d(i)
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
do i = ma+1, m
d(i) = dzero
select type(aa => b%a)
type is (psb_d_csr_sparse_mat)
do j = aa%irp(i-ma), aa%irp(i-ma+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i-ma,i-ma+irb-1,trw,info)
nz = trw%get_nzeros()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloopb: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloopb
end if
enddo
!
else if (j == i) then
! j=i update diagonal
dia = dia - temp*uaspk(jj)
cycle updateloopb
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloopb
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloopb
enddo
!
!
! Non singularity
!
if (dabs(dia) < d_epstol) then
!
! Pivot too small: unstable factorization
!
int_err(1) = i
write(ch_err,'(g20.10)') dia
info = psb_err_pivot_too_small_
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = done/dia
end if
d(i) = dia
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
call trw%free()
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
end subroutine psb_dilu_fctint
end subroutine psb_dilu_fct

@ -1,440 +0,0 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
subroutine psb_silu_fct(a,l,u,d,info,blck)
!
! This routine copies and factors "on the fly" from A and BLCK
! into L/D/U.
!
!
use psb_base_mod
implicit none
! .. Scalar Arguments ..
integer(psb_ipk_), intent(out) :: info
! .. Array Arguments ..
type(psb_sspmat_type),intent(in) :: a
type(psb_s_csr_sparse_mat),intent(inout) :: l,u
type(psb_sspmat_type),intent(in), optional, target :: blck
real(psb_spk_), intent(inout) :: d(:)
! .. Local Scalars ..
integer(psb_ipk_) :: l1,l2,m,err_act
type(psb_sspmat_type), pointer :: blck_
character(len=20) :: name, ch_err
name='psb_ilu_fct'
info = psb_success_
call psb_erractionsave(err_act)
! .. Executable Statements ..
!
if (present(blck)) then
blck_ => blck
else
allocate(blck_,stat=info)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate')
goto 9999
end if
call blck_%csall(izero,izero,info,ione)
endif
call psb_silu_fctint(m,a%get_nrows(),a,blck_%get_nrows(),blck_,&
& d,l%val,l%ja,l%irp,u%val,u%ja,u%irp,l1,l2,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_silu_fctint'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
call l%set_triangle()
call l%set_lower()
call l%set_unit()
call u%set_triangle()
call u%set_upper()
call u%set_unit()
call l%set_nrows(m)
call l%set_ncols(m)
call u%set_nrows(m)
call u%set_ncols(m)
if (present(blck)) then
blck_ => null()
else
call blck_%free()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_free'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
deallocate(blck_)
endif
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
contains
subroutine psb_silu_fctint(m,ma,a,mb,b,&
& d,laspk,lia1,lia2,uaspk,uia1,uia2,l1,l2,info)
use psb_mat_mod
implicit none
type(psb_sspmat_type) :: a
type(psb_sspmat_type) :: b
integer(psb_ipk_) :: m,ma,mb,l1,l2,info
integer(psb_ipk_), dimension(:) :: lia1,lia2,uia1,uia2
real(psb_spk_), dimension(:) :: laspk,uaspk,d
integer(psb_ipk_) :: i,j,k,l,low1,low2,kk,jj,ll, irb, ktrw,err_act, nz
real(psb_spk_) :: dia,temp
integer(psb_ipk_), parameter :: nrb=60
type(psb_s_coo_sparse_mat) :: trw
integer(psb_ipk_) :: int_err(5)
character(len=20) :: name, ch_err
name='psb_silu_fctint'
if(psb_get_errstatus() /= 0) return
info=psb_success_
call psb_erractionsave(err_act)
call trw%allocate(izero,izero,ione)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_all'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
lia2(1) = 1
uia2(1) = 1
l1=0
l2=0
m = ma+mb
do i = 1, ma
d(i) = szero
!
!
select type(aa => a%a)
type is (psb_s_csr_sparse_mat)
do j = aa%irp(i), aa%irp(i+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i,i+irb-1,trw,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
nz = trw%get_nzeros()
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
!!$
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloop: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloop
end if
enddo
!
else if (j == i) then
! j=i update diagonal
! write(psb_err_unit,*)'updating dia',dia,'temp',temp,'jj',jj,'u%aspk',uaspk(jj)
dia = dia - temp*uaspk(jj)
! write(psb_err_unit,*)'dia',dia,'temp',temp,'jj',jj,'aspk',uaspk(jj)
cycle updateloop
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloop
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloop
enddo
!
!
! Non singularity
!
if (abs(dia) < s_epstol) then
!
! Pivot too small: unstable factorization
!
info = psb_err_pivot_too_small_
int_err(1) = i
write(ch_err,'(g20.10)') dia
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = sone/dia
end if
d(i) = dia
! write(psb_err_unit,*)'diag(',i,')=',d(i)
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
do i = ma+1, m
d(i) = szero
select type(aa => b%a)
type is (psb_s_csr_sparse_mat)
do j = aa%irp(i-ma), aa%irp(i-ma+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i-ma,i-ma+irb-1,trw,info)
nz = trw%get_nzeros()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloopb: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloopb
end if
enddo
!
else if (j == i) then
! j=i update diagonal
dia = dia - temp*uaspk(jj)
cycle updateloopb
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloopb
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloopb
enddo
!
!
! Non singularity
!
if (abs(dia) < s_epstol) then
!
! Pivot too small: unstable factorization
!
int_err(1) = i
write(ch_err,'(g20.10)') dia
info = psb_err_pivot_too_small_
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = sone/dia
end if
d(i) = dia
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
call trw%free()
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
end subroutine psb_silu_fctint
end subroutine psb_silu_fct

@ -1,438 +0,0 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
subroutine psb_zilu_fct(a,l,u,d,info,blck)
!
! This routine copies and factors "on the fly" from A and BLCK
! into L/D/U.
!
!
use psb_base_mod
implicit none
! .. Scalar Arguments ..
integer(psb_ipk_), intent(out) :: info
! .. Array Arguments ..
type(psb_zspmat_type),intent(in) :: a
type(psb_z_csr_sparse_mat),intent(inout) :: l,u
type(psb_zspmat_type),intent(in), optional, target :: blck
complex(psb_dpk_), intent(inout) :: d(:)
! .. Local Scalars ..
integer(psb_ipk_) :: l1, l2,m,err_act
type(psb_zspmat_type), pointer :: blck_
character(len=20) :: name, ch_err
name='psb_ilu_fct'
info = psb_success_
call psb_erractionsave(err_act)
! .. Executable Statements ..
!
if (present(blck)) then
blck_ => blck
else
allocate(blck_,stat=info)
if (info /= psb_success_) then
call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate')
goto 9999
end if
call blck_%csall(izero,izero,info,ione)
endif
call psb_zilu_fctint(m,a%get_nrows(),a,blck_%get_nrows(),blck_,&
& d,l%val,l%ja,l%irp,u%val,u%ja,u%irp,l1,l2,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_zilu_fctint'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
call l%set_triangle()
call l%set_lower()
call l%set_unit()
call u%set_triangle()
call u%set_upper()
call u%set_unit()
call l%set_nrows(m)
call l%set_ncols(m)
call u%set_nrows(m)
call u%set_ncols(m)
if (present(blck)) then
blck_ => null()
else
call blck_%free()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_free'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
deallocate(blck_)
endif
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
contains
subroutine psb_zilu_fctint(m,ma,a,mb,b,&
& d,laspk,lia1,lia2,uaspk,uia1,uia2,l1,l2,info)
implicit none
type(psb_zspmat_type) :: a,b
integer(psb_ipk_) :: m,ma,mb,l1,l2,info
integer(psb_ipk_), dimension(:) :: lia1,lia2,uia1,uia2
complex(psb_dpk_), dimension(:) :: laspk,uaspk,d
integer(psb_ipk_) :: i,j,k,l,low1,low2,kk,jj,ll, irb, ktrw,err_act, nz
complex(psb_dpk_) :: dia,temp
integer(psb_ipk_), parameter :: nrb=60
type(psb_z_coo_sparse_mat) :: trw
integer(psb_ipk_) :: int_err(5)
character(len=20) :: name, ch_err
name='psb_zilu_fctint'
if(psb_get_errstatus() /= 0) return
info=psb_success_
call psb_erractionsave(err_act)
call trw%allocate(izero,izero,ione)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='psb_sp_all'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
lia2(1) = 1
uia2(1) = 1
l1=0
l2=0
m = ma+mb
do i = 1, ma
d(i) = zzero
!
!
select type(aa => a%a)
type is (psb_z_csr_sparse_mat)
do j = aa%irp(i), aa%irp(i+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i,i+irb-1,trw,info)
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
nz = trw%get_nzeros()
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
!!$
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloop: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloop
end if
enddo
!
else if (j == i) then
! j=i update diagonal
! write(psb_err_unit,*)'updating dia',dia,'temp',temp,'jj',jj,'u%aspk',uaspk(jj)
dia = dia - temp*uaspk(jj)
! write(psb_err_unit,*)'dia',dia,'temp',temp,'jj',jj,'aspk',uaspk(jj)
cycle updateloop
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloop
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloop
enddo
!
!
! Non singularity
!
if (abs(dia) < d_epstol) then
!
! Pivot too small: unstable factorization
!
info = psb_err_pivot_too_small_
int_err(1) = i
write(ch_err,'(g20.10)') abs(dia)
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = zone/dia
end if
d(i) = dia
! write(psb_err_unit,*)'diag(',i,')=',d(i)
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
do i = ma+1, m
d(i) = zzero
select type(aa => b%a)
type is (psb_z_csr_sparse_mat)
do j = aa%irp(i-ma), aa%irp(i-ma+1) - 1
k = aa%ja(j)
! write(psb_err_unit,*)'KKKKK',k
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = aa%val(j)
lia1(l1) = k
else if (k == i) then
d(i) = aa%val(j)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = aa%val(j)
uia1(l2) = k
end if
enddo
class default
if ((mod(i,nrb) == 1).or.(nrb == 1)) then
irb = min(ma-i+1,nrb)
call aa%csget(i-ma,i-ma+irb-1,trw,info)
nz = trw%get_nzeros()
if(info /= psb_success_) then
info=psb_err_from_subroutine_
ch_err='a%csget'
call psb_errpush(info,name,a_err=ch_err)
goto 9999
end if
ktrw=1
end if
do
if (ktrw > nz ) exit
if (trw%ia(ktrw) > i) exit
k = trw%ja(ktrw)
if ((k < i).and.(k >= 1)) then
l1 = l1 + 1
laspk(l1) = trw%val(ktrw)
lia1(l1) = k
else if (k == i) then
d(i) = trw%val(ktrw)
else if ((k > i).and.(k <= m)) then
l2 = l2 + 1
uaspk(l2) = trw%val(ktrw)
uia1(l2) = k
end if
ktrw = ktrw + 1
enddo
end select
lia2(i+1) = l1 + 1
uia2(i+1) = l2 + 1
dia = d(i)
do kk = lia2(i), lia2(i+1) - 1
!
! compute element alo(i,k) of incomplete factorization
!
temp = laspk(kk)
k = lia1(kk)
laspk(kk) = temp*d(k)
! update the rest of row i using alo(i,k)
low1 = kk + 1
low2 = uia2(i)
updateloopb: do jj = uia2(k), uia2(k+1) - 1
j = uia1(jj)
!
if (j < i) then
! search alo(i,*) for matching index J
do ll = low1, lia2(i+1) - 1
l = lia1(ll)
if (l > j) then
low1 = ll
exit
else if (l == j) then
laspk(ll) = laspk(ll) - temp*uaspk(jj)
low1 = ll + 1
cycle updateloopb
end if
enddo
!
else if (j == i) then
! j=i update diagonal
dia = dia - temp*uaspk(jj)
cycle updateloopb
!
else if (j > i) then
! search aup(i,*) for matching index j
do ll = low2, uia2(i+1) - 1
l = uia1(ll)
if (l > j) then
low2 = ll
exit
else if (l == j) then
uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
low2 = ll + 1
cycle updateloopb
end if
enddo
end if
!
! for milu al=1.; for ilu al=0.
! al = 1.d0
! dia = dia - al*temp*aup(jj)
enddo updateloopb
enddo
!
!
! Non singularity
!
if (abs(dia) < d_epstol) then
!
! Pivot too small: unstable factorization
!
int_err(1) = i
write(ch_err,'(g20.10)') abs(dia)
info = psb_err_pivot_too_small_
call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
goto 9999
else
dia = zone/dia
end if
d(i) = dia
! Scale row i of upper triangle
do kk = uia2(i), uia2(i+1) - 1
uaspk(kk) = uaspk(kk)*dia
enddo
enddo
call trw%free()
call psb_erractionrestore(err_act)
return
9999 continue
call psb_erractionrestore(err_act)
if (err_act == psb_act_abort_) then
call psb_error()
return
end if
return
end subroutine psb_zilu_fctint
end subroutine psb_zilu_fct

@ -596,7 +596,7 @@ program pdegenmm
! solver parameters
integer(psb_epk_) :: amatsize, precsize, descsize, annz, nbytes
real(psb_dpk_) :: err, eps
integer, parameter :: ntests=50, ngpu=50, ncnv=20
integer, parameter :: ntests=50, ngpu=10, ncnv=20
type(psb_d_coo_sparse_mat), target :: acoo
type(psb_d_csr_sparse_mat), target :: acsr
type(psb_d_ell_sparse_mat), target :: aell
@ -613,6 +613,7 @@ program pdegenmm
type(psb_d_cuda_hybg_sparse_mat), target :: ahybg
#endif
type(psb_d_cuda_hlg_sparse_mat), target :: ahlg
! TODO: HDIAG and DNSG do not even have CSMM
type(psb_d_cuda_hdiag_sparse_mat), target :: ahdiag
type(psb_d_cuda_dnsg_sparse_mat), target :: adnsg
#endif
@ -658,12 +659,12 @@ program pdegenmm
!
! get parameters
!
!call get_parms(ctxt,nrhs,acfmt,agfmt,idim,tnd)
nrhs=2
acfmt='CSR'
agfmt='CSRG'
idim=2
tnd=.false.
call get_parms(ctxt,nrhs,acfmt,agfmt,idim,tnd)
!nrhs=8
!acfmt='CSR'
!agfmt='CSRG'
!idim=100
!tnd=.false.
call psb_init_timers()
!
! allocate and fill in the coefficient matrix and initial vectors
@ -803,6 +804,40 @@ program pdegenmm
x1 = b_mv%get_vect()
x2 = b_mv_g%get_vect()
! ! TODO test AXPBY
! call psb_geall(xg,desc_a,info)
! call psb_geasb(xg,desc_a,info,mold=tmold)
! call xg%set(done)
! call xg%sync()
! call psb_geall(bg,desc_a,info)
! call psb_geasb(bg,desc_a,info,mold=tmold)
! !call bg%set(done+done)
! ! TODO: spgpuDaxpby (axpbyMultiVecDeviceDouble) does not work
! call psb_geaxpby(done,xg,dzero,bg,desc_a,info)
! call psb_cuda_DeviceSync()
! write(*,*) 'BG ', bg%is_dev(), bg%is_host(), bg%is_sync()
! call bg%sync()
! write(*,*) 'BG ', bg%is_dev(), bg%is_host(), bg%is_sync()
! do i=1,8
! write(*,*) bg%v%v(i)
! end do
! return
! call x_mv_g%set(done)
! call x_mv_g%sync()
! call psb_geaxpby(done,x_mv_g,dzero,b_mv_g,desc_a,info)
! call b_mv_g%sync()
! do i=1,size(b_mv_g%v%v,1)
! write(*,*) b_mv_g%v%v(i,:)
! end do
! return
call psb_barrier(ctxt)
tt1 = psb_wtime()
do i=1,ntests
@ -876,7 +911,7 @@ program pdegenmm
write(psb_out_unit,'("Size of matrix: ",i20)') nr
write(psb_out_unit,'("Number of nonzeros: ",i20)') annz
write(psb_out_unit,'("Memory occupation: ",i20)') amatsize
flops = ntests*(2.d0*annz)
flops = ntests*(2.d0*annz)*nrhs
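! each sweep costs 2*annz flops per right-hand side, hence the extra
! factor nrhs on top of the ntests repetitions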
tflops = flops
gflops = flops * ngpu
write(psb_out_unit,'("Storage type for A: ",a)') a%get_fmt()
@ -935,7 +970,7 @@ program pdegenmm
write(psb_out_unit,*)
write(psb_out_unit,'("MBYTES/S sust. effective bandwidth (CPU) : ",F20.3)') bdwdth
#ifdef HAVE_CUDA
bdwdth = ngpu*ntests*nbytes/(gt2*1.d6)
bdwdth = nrhs*ngpu*ntests*nbytes/(gt2*1.d6)
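! the estimate assumes the matrix and vector data are streamed once
! per right-hand side, hence the extra factor nrhs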
write(psb_out_unit,'("MBYTES/S sust. effective bandwidth (GPU) : ",F20.3)') bdwdth
bdwdth = psb_cuda_MemoryPeakBandwidth()
write(psb_out_unit,'("MBYTES/S peak bandwidth (GPU) : ",F20.3)') bdwdth
@ -1018,4 +1053,4 @@ contains
end subroutine get_parms
end program pdegenmm
