Updated HLG SpMM (s,d,c,z)

cuda-multivect
gabrielequatrana 10 months ago
parent ccb4f73dca
commit 9daa04c3dc

@ -35,6 +35,7 @@ module psb_c_base_mat_mod
use psb_base_mat_mod
use psb_c_base_vect_mod
use psb_c_base_multivect_mod
!> \namespace psb_base_mod \class psb_c_base_sparse_mat

@ -35,6 +35,7 @@ module psb_s_base_mat_mod
use psb_base_mat_mod
use psb_s_base_vect_mod
use psb_s_base_multivect_mod
!> \namespace psb_base_mod \class psb_s_base_sparse_mat

@ -35,6 +35,7 @@ module psb_z_base_mat_mod
use psb_base_mat_mod
use psb_z_base_vect_mod
use psb_z_base_multivect_mod
!> \namespace psb_base_mod \class psb_z_base_sparse_mat

@ -156,7 +156,9 @@ int FallocHllDevice(void** deviceMat,unsigned int hksize, unsigned int rows, un
return(i);
}
//
// Single Precision Float
//
int spmvHllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY)
{
@ -170,108 +172,170 @@ int spmvHllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
/*dspmdmm_gpu ((double *)z->v_, y->count_, y->pitch_, (double *)y->v_, alpha, (double *)devMat->cM,
devMat->rP, devMat->rS, devMat->rows, devMat->pitch, (double *)x->v_, beta,
devMat->baseIndex);*/
spgpuShellspmv (handle, (float *)y->v_, (float *)y->v_, alpha, (float *)devMat->cM,
devMat->rP,devMat->hackSize,devMat->hackOffs, devMat->rS, NULL,
spgpuShellspmv (handle, (float *)y->v_, (float *)y->v_, alpha,
(float *)devMat->cM, devMat->rP,
devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
devMat->avgNzr, devMat->rows, (float *)x->v_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
void
dspmdmmhll_gpu (double *z, int count, int zPitch, double alpha, double* cM, int* rP,
int* rS, int hackSize, int* hackOffs, int avgNnzPerRow, int rows,
double *x, int xPitch, double beta, int firstIndex)
int spmmHllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
float beta, void* deviceY)
{
int i=0;
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#if defined(NEW_MM)
spgpuDhellspmm(handle, count, (double*) z, zPitch, (double*)z, zPitch,
alpha, (double*) cM, rP,hackSize, hackOffs, rS, NULL,
rows, (double*)x, xPitch, beta, firstIndex);
#else
for (i=0; i<count; i++) {
spgpuDhellspmv (handle, (double*) z, (double*)z, alpha,
(double*) cM, rP,
hackSize, hackOffs, rS, NULL,
avgNnzPerRow, rows, (double*)x, beta, firstIndex);
z += zPitch;
x += xPitch;
}
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuShellspmm(handle, y->count_, (float *)y->v_, y->pitch_,
(float*)y->v_, y->pitch_, alpha, (float*)devMat->cM,
devMat->rP, devMat->hackSize, devMat->hackOffs,
devMat->rS, NULL, devMat->rows, (float*)x->v_,
x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
//new
//
// Double Precision
//
int spmvHllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
dspmdmmhll_gpu ((double *)y->v_, y->count_, y->pitch_,
alpha, (double *)devMat->cM,
devMat->rP, devMat->rS, devMat->hackSize, devMat->hackOffs,
devMat->avgNzr, devMat->rows,
(double *)x->v_, x->pitch_, beta, devMat->baseIndex);
spgpuDhellspmv (handle, (double *)y->v_, (double *)y->v_, alpha,
(double *)devMat->cM, devMat->rP,
devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
devMat->avgNzr, devMat->rows, (double *)x->v_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
int spmvHllDeviceFloatComplex(void *deviceMat, float complex alpha, void* deviceX,
float complex beta, void* deviceY)
int spmmHllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
double beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
cuFloatComplex a = make_cuFloatComplex(crealf(alpha),cimagf(alpha));
cuFloatComplex b = make_cuFloatComplex(crealf(beta),cimagf(beta));
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
/*dspmdmm_gpu ((double *)z->v_, y->count_, y->pitch_, (double *)y->v_, alpha, (double *)devMat->cM,
devMat->rP, devMat->rS, devMat->rows, devMat->pitch, (double *)x->v_, beta,
devMat->baseIndex);*/
spgpuDhellspmm(handle, y->count_, (double *)y->v_, y->pitch_,
(double *)y->v_, y->pitch_, alpha, (double *)devMat->cM,
devMat->rP, devMat->hackSize, devMat->hackOffs,
devMat->rS, NULL, devMat->rows, (double *)x->v_,
x->pitch_, beta, devMat->baseIndex);
spgpuChellspmv (handle, (cuFloatComplex *)y->v_, (cuFloatComplex *)y->v_, a, (cuFloatComplex *)devMat->cM,
devMat->rP,devMat->hackSize,devMat->hackOffs, devMat->rS, NULL,
devMat->avgNzr, devMat->rows, (cuFloatComplex *)x->v_, b, devMat->baseIndex);
return SPGPU_SUCCESS;
}
int spmvHllDeviceDoubleComplex(void *deviceMat, double complex alpha, void* deviceX,
double complex beta, void* deviceY)
//
// Single Precision Float Complex
//
int spmvHllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* deviceX,
cuFloatComplex beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
cuDoubleComplex a = make_cuDoubleComplex(creal(alpha),cimag(alpha));
cuDoubleComplex b = make_cuDoubleComplex(creal(beta),cimag(beta));
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuChellspmv (handle, (cuFloatComplex *)y->v_, (cuFloatComplex *)y->v_, alpha,
(cuFloatComplex *)devMat->cM, devMat->rP,
devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
devMat->avgNzr, devMat->rows, (cuFloatComplex *)x->v_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
int spmmHllDeviceFloatComplex(void *deviceMat, cuFloatComplex alpha, void* deviceX,
cuFloatComplex beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
spgpuZhellspmv (handle, (cuDoubleComplex *)y->v_, (cuDoubleComplex *)y->v_, a, (cuDoubleComplex *)devMat->cM,
devMat->rP,devMat->hackSize,devMat->hackOffs, devMat->rS, NULL,
devMat->avgNzr,devMat->rows, (cuDoubleComplex *)x->v_, b, devMat->baseIndex);
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuChellspmm(handle, y->count_, (cuFloatComplex *)y->v_, y->pitch_,
(cuFloatComplex*)y->v_, y->pitch_, alpha, (cuFloatComplex*)devMat->cM,
devMat->rP, devMat->hackSize, devMat->hackOffs,
devMat->rS, NULL, devMat->rows, (cuFloatComplex*)x->v_,
x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
//
// Double Precision Complex
//
int spmvHllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* deviceX,
cuDoubleComplex beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuZhellspmv (handle, (cuDoubleComplex *)y->v_, (cuDoubleComplex *)y->v_, alpha,
(cuDoubleComplex *)devMat->cM, devMat->rP,
devMat->hackSize, devMat->hackOffs, devMat->rS, NULL,
devMat->avgNzr, devMat->rows, (cuDoubleComplex *)x->v_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}
int spmmHllDeviceDoubleComplex(void *deviceMat, cuDoubleComplex alpha, void* deviceX,
cuDoubleComplex beta, void* deviceY)
{
HllDevice *devMat = (HllDevice *) deviceMat;
struct MultiVectDevice *x = (struct MultiVectDevice *) deviceX;
struct MultiVectDevice *y = (struct MultiVectDevice *) deviceY;
spgpuHandle_t handle=psb_cudaGetHandle();
#ifdef VERBOSE
/*__assert(x->count_ == x->count_, "ERROR: x and y don't share the same number of vectors");*/
/*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
/*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
#endif
spgpuZhellspmm(handle, y->count_, (cuDoubleComplex *)y->v_, y->pitch_,
(cuDoubleComplex*)y->v_, y->pitch_, alpha, (cuDoubleComplex*)devMat->cM,
devMat->rP, devMat->hackSize, devMat->hackOffs,
devMat->rS, NULL, devMat->rows, (cuDoubleComplex*)x->v_,
x->pitch_, beta, devMat->baseIndex);
return SPGPU_SUCCESS;
}

@ -265,4 +265,40 @@ module hlldev_mod
end interface
interface spmmHllDevice
function spmmHllDeviceFloat(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHllDeviceFloat')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_float),value :: alpha, beta
end function spmmHllDeviceFloat
function spmmHllDeviceDouble(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHllDeviceDouble')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
real(c_double),value :: alpha, beta
end function spmmHllDeviceDouble
function spmmHllDeviceFloatComplex(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHllDeviceFloatComplex')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
complex(c_float_complex),value :: alpha, beta
end function spmmHllDeviceFloatComplex
function spmmHllDeviceDoubleComplex(deviceMat,alpha,x,beta,y) &
& result(res) bind(c,name='spmmHllDeviceDoubleComplex')
use iso_c_binding
integer(c_int) :: res
type(c_ptr), value :: deviceMat, x, y
complex(c_double_complex),value :: alpha, beta
end function spmmHllDeviceDoubleComplex
end interface
end module hlldev_mod

@ -98,16 +98,16 @@ subroutine psb_c_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_float)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_float)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@ -117,3 +117,92 @@ subroutine psb_c_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_c_cuda_hlg_vect_mv
subroutine psb_c_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hlldev_mod
use psb_vectordev_mod
use psb_c_cuda_hlg_mat_mod, psb_protect_name => psb_c_cuda_hlg_multivect_mv
use psb_c_cuda_vect_mod
implicit none
class(psb_c_cuda_hlg_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_vect_type), intent(inout) :: x
class(psb_c_base_vect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
complex(psb_spk_), allocatable :: rx(:), ry(:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='c_cuda_hlg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (.not.x%is_host()) call x%sync()
if (beta /= czero) then
if (.not.y%is_host()) call y%sync()
end if
if (a%is_dev()) call a%sync()
call a%psb_c_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_c_vect_cuda)
select type(yy => y)
type is (psb_c_vect_cuda)
if (xx%is_host()) call xx%sync()
if (beta /= czero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
rx = xx%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
rx = x%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_c_cuda_hlg_multivect_mv

@ -105,7 +105,7 @@ subroutine psb_d_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999

@ -172,7 +172,7 @@ subroutine psb_d_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
if (beta /= dzero) then
if (yy%is_host()) call yy%sync()
end if
info = spmvhllDevice(a%deviceMat,alpha,xx%deviceVect,&
info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&

@ -98,16 +98,16 @@ subroutine psb_s_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_float)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_float)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@ -117,3 +117,92 @@ subroutine psb_s_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_s_cuda_hlg_vect_mv
subroutine psb_s_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hlldev_mod
use psb_vectordev_mod
use psb_s_cuda_hlg_mat_mod, psb_protect_name => psb_s_cuda_hlg_multivect_mv
use psb_s_cuda_vect_mod
implicit none
class(psb_s_cuda_hlg_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_vect_type), intent(inout) :: x
class(psb_s_base_vect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
real(psb_spk_), allocatable :: rx(:), ry(:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='s_cuda_hlg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (.not.x%is_host()) call x%sync()
if (beta /= szero) then
if (.not.y%is_host()) call y%sync()
end if
if (a%is_dev()) call a%sync()
call a%psb_s_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_s_vect_cuda)
select type(yy => y)
type is (psb_s_vect_cuda)
if (xx%is_host()) call xx%sync()
if (beta /= szero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
rx = xx%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
rx = x%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_s_cuda_hlg_multivect_mv

@ -98,16 +98,16 @@ subroutine psb_z_cuda_hlg_csmm(alpha,a,x,beta,y,info,trans)
if (info == 0) &
& info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_double)
if (info == 0) &
& info = writeMultiVecDevice(gpX,x,nxy)
& info = writeMultiVecDevice(gpX,x,size(x,1))
if (info == 0) &
& info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_double)
if (info == 0) &
& info = writeMultiVecDevice(gpY,y,nxy)
& info = writeMultiVecDevice(gpY,y,size(y,1))
if (info == 0) &
& info = spmvhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
& info = spmmhllDevice(a%deviceMat,alpha,gpX,beta,gpY)
if (info == 0) &
& info = readMultiVecDevice(gpY,y,nxy)
& info = readMultiVecDevice(gpY,y,size(y,1))
if (info /= 0) goto 9999
call freeMultiVecDevice(gpX)
call freeMultiVecDevice(gpY)

@ -117,3 +117,92 @@ subroutine psb_z_cuda_hlg_vect_mv(alpha,a,x,beta,y,info,trans)
return
end subroutine psb_z_cuda_hlg_vect_mv
subroutine psb_z_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
use psb_base_mod
use hlldev_mod
use psb_vectordev_mod
use psb_z_cuda_hlg_mat_mod, psb_protect_name => psb_z_cuda_hlg_multivect_mv
use psb_z_cuda_vect_mod
implicit none
class(psb_z_cuda_hlg_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_vect_type), intent(inout) :: x
class(psb_z_base_vect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
complex(psb_dpk_), allocatable :: rx(:), ry(:)
logical :: tra
character :: trans_
Integer(Psb_ipk_) :: err_act
character(len=20) :: name='z_cuda_hlg_multivect_mv'
call psb_erractionsave(err_act)
info = psb_success_
if (present(trans)) then
trans_ = trans
else
trans_ = 'N'
end if
if (.not.a%is_asb()) then
info = psb_err_invalid_mat_state_
call psb_errpush(info,name)
goto 9999
endif
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
if (tra) then
if (.not.x%is_host()) call x%sync()
if (beta /= zzero) then
if (.not.y%is_host()) call y%sync()
end if
if (a%is_dev()) call a%sync()
call a%psb_z_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans)
call y%set_host()
else
if (a%is_host()) call a%sync()
select type (xx => x)
type is (psb_z_vect_cuda)
select type(yy => y)
type is (psb_z_vect_cuda)
if (xx%is_host()) call xx%sync()
if (beta /= zzero) then
if (yy%is_host()) call yy%sync()
end if
info = spmmhllDevice(a%deviceMat,alpha,xx%deviceVect,&
& beta,yy%deviceVect)
if (info /= 0) then
call psb_errpush(psb_err_from_subroutine_ai_,name,&
& a_err='spmmHLLDevice',i_err=(/info,izero,izero,izero,izero/))
info = psb_err_from_subroutine_ai_
goto 9999
end if
call yy%set_dev()
class default
rx = xx%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
class default
rx = x%get_vect()
ry = y%get_vect()
if (a%is_dev()) call a%sync()
call a%spmm(alpha,rx,beta,ry,info)
call y%bld(ry)
end select
end if
if (info /= 0) goto 9999
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
end subroutine psb_z_cuda_hlg_multivect_mv

@ -55,6 +55,7 @@ module psb_c_cuda_hlg_mat_mod
contains
procedure, nopass :: get_fmt => c_cuda_hlg_get_fmt
procedure, pass(a) :: sizeof => c_cuda_hlg_sizeof
procedure, pass(a) :: multivect_mv => psb_c_cuda_hlg_multivect_mv
procedure, pass(a) :: vect_mv => psb_c_cuda_hlg_vect_mv
procedure, pass(a) :: csmm => psb_c_cuda_hlg_csmm
procedure, pass(a) :: csmv => psb_c_cuda_hlg_csmv
@ -97,6 +98,15 @@ module psb_c_cuda_hlg_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_hlg_vect_mv
subroutine psb_c_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_c_cuda_hlg_sparse_mat, psb_spk_, psb_c_base_multivect_type, psb_ipk_
class(psb_c_cuda_hlg_sparse_mat), intent(in) :: a
complex(psb_spk_), intent(in) :: alpha, beta
class(psb_c_base_multivect_type), intent(inout) :: x
class(psb_c_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_c_cuda_hlg_multivect_mv
end interface
interface

@ -56,6 +56,7 @@ module psb_s_cuda_hlg_mat_mod
procedure, nopass :: get_fmt => s_cuda_hlg_get_fmt
procedure, pass(a) :: sizeof => s_cuda_hlg_sizeof
procedure, pass(a) :: vect_mv => psb_s_cuda_hlg_vect_mv
procedure, pass(a) :: multivect_mv => psb_s_cuda_hlg_multivect_mv
procedure, pass(a) :: csmm => psb_s_cuda_hlg_csmm
procedure, pass(a) :: csmv => psb_s_cuda_hlg_csmv
procedure, pass(a) :: in_vect_sv => psb_s_cuda_hlg_inner_vect_sv
@ -97,6 +98,15 @@ module psb_s_cuda_hlg_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_hlg_vect_mv
subroutine psb_s_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_s_cuda_hlg_sparse_mat, psb_spk_, psb_s_base_multivect_type, psb_ipk_
class(psb_s_cuda_hlg_sparse_mat), intent(in) :: a
real(psb_spk_), intent(in) :: alpha, beta
class(psb_s_base_multivect_type), intent(inout) :: x
class(psb_s_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_s_cuda_hlg_multivect_mv
end interface
interface

@ -56,6 +56,7 @@ module psb_z_cuda_hlg_mat_mod
procedure, nopass :: get_fmt => z_cuda_hlg_get_fmt
procedure, pass(a) :: sizeof => z_cuda_hlg_sizeof
procedure, pass(a) :: vect_mv => psb_z_cuda_hlg_vect_mv
procedure, pass(a) :: multivect_mv => psb_z_cuda_hlg_multivect_mv
procedure, pass(a) :: csmm => psb_z_cuda_hlg_csmm
procedure, pass(a) :: csmv => psb_z_cuda_hlg_csmv
procedure, pass(a) :: in_vect_sv => psb_z_cuda_hlg_inner_vect_sv
@ -97,6 +98,15 @@ module psb_z_cuda_hlg_mat_mod
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_hlg_vect_mv
subroutine psb_z_cuda_hlg_multivect_mv(alpha,a,x,beta,y,info,trans)
import :: psb_z_cuda_hlg_sparse_mat, psb_dpk_, psb_z_base_multivect_type, psb_ipk_
class(psb_z_cuda_hlg_sparse_mat), intent(in) :: a
complex(psb_dpk_), intent(in) :: alpha, beta
class(psb_z_base_multivect_type), intent(inout) :: x
class(psb_z_base_multivect_type), intent(inout) :: y
integer(psb_ipk_), intent(out) :: info
character, optional, intent(in) :: trans
end subroutine psb_z_cuda_hlg_multivect_mv
end interface
interface

@ -68,7 +68,46 @@ void spgpuShellspmv (spgpuHandle_t handle,
float beta,
int baseIndex);
/**
* \fn void spgpuShellspmm (spgpuHandle_t handle,int count,__device float *z,int zpitch,const __device float *y,int ypitch,float alpha, const __device float* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device float *x,int xpitch,float beta,int baseIndex)
* Computes single precision z = alpha*A*x + beta*y for each of the count columns of x, y and z, with A stored in Hacked ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The number of columns (right-hand sides) in x, y and z
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The HELL non zero values allocation pointer
* \param rP The HELL column indices allocation pointer
* \param hackSize The constant size of every hack (must be a multiple of 32).
* \param hackOffsets the array of base index offset for every hack of HELL non zero values allocation and HELL indices allocation.
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuShellspmm(spgpuHandle_t handle,
int count,
__device float *z,
int zpitch,
const __device float *y,
int ypitch,
float alpha,
const __device float* cM,
const __device int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const __device float *x,
int xpitch,
float beta,
int baseIndex);
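A minimal host-side sketch of how spgpuShellspmm could be invoked, assuming the caller already holds device pointers for the HELL matrix and for column-major multivectors with leading dimensions ldX/ldY; the wrapper name and its argument bundling are illustrative only, not part of this header:
/* Hypothetical helper (not part of spGPU): Y = alpha*A*X + beta*Y on count columns. */
void example_hell_spmm_float(spgpuHandle_t handle, int count,
                             float *dY, int ldY, float alpha,
                             const float *dCM, const int *dRP,
                             int hackSize, const int *dHackOffs, const int *dRS,
                             int rows, const float *dX, int ldX, float beta)
{
  spgpuShellspmm(handle, count,
                 dY, ldY,            /* z, zpitch: output written in place over y */
                 dY, ldY,            /* y, ypitch */
                 alpha, dCM, dRP,
                 hackSize, dHackOffs, dRS,
                 NULL,               /* rIdx: no row reordering */
                 rows, dX, ldX, beta,
                 0);                 /* baseIndex: 0 for C-style indices */
}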
/**
* \fn void spgpuDhellspmv (spgpuHandle_t handle,__device double *z,const __device double *y, double alpha, const __device double* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int avgNnzPerRow, int rows, const __device double *x, double beta,int baseIndex)
@ -104,26 +143,47 @@ void spgpuDhellspmv (spgpuHandle_t handle,
const __device double *x,
double beta,
int baseIndex);
#if defined(NEW_MM)
/**
* \fn void spgpuDhellspmm (spgpuHandle_t handle,int count,__device double *z,int zpitch,const __device double *y,int ypitch,double alpha, const __device double* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device double *x,int xpitch,double beta,int baseIndex)
* Computes double precision z = alpha*A*x + beta*y for each of the count columns of x, y and z, with A stored in Hacked ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The number of columns (right-hand sides) in x, y and z
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The HELL non zero values allocation pointer
* \param rP The HELL column indices allocation pointer
* \param hackSize The constant size of every hack (must be a multiple of 32).
* \param hackOffsets the array of base index offset for every hack of HELL non zero values allocation and HELL indices allocation.
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuDhellspmm(spgpuHandle_t handle,
int count,
__device double *z,
int zpitch,
const __device double *y,
int ypitch,
double alpha,
const __device double* cM,
const __device int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const __device double *x,
int xpitch,
double beta,
int baseIndex);
#endif
int count,
__device double *z,
int zpitch,
const __device double *y,
int ypitch,
double alpha,
const __device double* cM,
const __device int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const __device double *x,
int xpitch,
double beta,
int baseIndex);
/**
* \fn void spgpuChellspmv (spgpuHandle_t handle,__device cuFloatComplex *z,const __device cuFloatComplex *y, cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int rows, const __device cuFloatComplex *x, cuFloatComplex beta, int baseIndex)
@ -160,7 +220,46 @@ void spgpuChellspmv (spgpuHandle_t handle,
cuFloatComplex beta,
int baseIndex);
/**
* \fn void spgpuChellspmm (spgpuHandle_t handle,int count,__device cuFloatComplex *z,int zpitch,const __device cuFloatComplex *y,int ypitch,cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device cuFloatComplex *x,int xpitch,cuFloatComplex beta,int baseIndex)
* Computes single precision complex z = alpha*A*x + beta*y for each of the count columns of x, y and z, with A stored in Hacked ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The number of columns (right-hand sides) in x, y and z
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The HELL non zero values allocation pointer
* \param rP The HELL column indices allocation pointer
* \param hackSize The constant size of every hack (must be a multiple of 32).
* \param hackOffsets the array of base index offset for every hack of HELL non zero values allocation and HELL indices allocation.
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuChellspmm(spgpuHandle_t handle,
int count,
__device cuFloatComplex *z,
int zpitch,
const __device cuFloatComplex *y,
int ypitch,
cuFloatComplex alpha,
const __device cuFloatComplex* cM,
const __device int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const __device cuFloatComplex *x,
int xpitch,
cuFloatComplex beta,
int baseIndex);
/**
* \fn void spgpuZhellspmv (spgpuHandle_t handle,__device cuDoubleComplex *z,const __device cuDoubleComplex *y, cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int avgNnzPerRow, int rows, const __device cuDoubleComplex *x, cuDoubleComplex beta, int baseIndex)
@ -197,6 +296,46 @@ void spgpuZhellspmv (spgpuHandle_t handle,
cuDoubleComplex beta,
int baseIndex);
/**
* \fn void spgpuZhellspmm (spgpuHandle_t handle,int count,__device cuDoubleComplex *z,int zpitch,const __device cuDoubleComplex *y,int ypitch,cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS,const __device int* rIdx, int rows, const __device cuDoubleComplex *x,int xpitch,cuDoubleComplex beta,int baseIndex)
* Computes double precision complex z = alpha*A*x + beta*y for each of the count columns of x, y and z, with A stored in Hacked ELLpack Format on GPU.
* \param handle The spgpu handle used to call this routine
* \param count The number of columns (right-hand sides) in x, y and z
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
* \param zpitch The pitch of the output vector
* \param y The y input vector
* \param ypitch The pitch of the y input vector
* \param alpha The alpha scalar
* \param cM The HELL non zero values allocation pointer
* \param rP The HELL column indices allocation pointer
* \param hackSize The constant size of every hack (must be a multiple of 32).
* \param hackOffsets the array of base index offset for every hack of HELL non zero values allocation and HELL indices allocation.
* \param rS the array containing the row sizes (in non zero elements)
* \param rIdx (optional) An array containing the row index for every row (i.e. the reorder array) of the Hell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the HELL format).
* \param rows the rows count
* \param x the x vector
* \param xpitch The pitch of the x input vector
* \param beta the beta scalar
* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).
*/
void spgpuZhellspmm(spgpuHandle_t handle,
int count,
__device cuDoubleComplex *z,
int zpitch,
const __device cuDoubleComplex *y,
int ypitch,
cuDoubleComplex alpha,
const __device cuDoubleComplex* cM,
const __device int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const __device cuDoubleComplex *x,
int xpitch,
cuDoubleComplex beta,
int baseIndex);
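Because the complex entry points take cuComplex scalars by value, a caller working with C99 complex types needs an explicit conversion; the pattern below mirrors the one used in the HLL device wrappers earlier in this commit (the helper name is illustrative, not part of the library):
#include <complex.h>
/* Hypothetical helper: convert C99 scalars, then call the double-complex SpMM. */
void example_hell_spmm_zcomplex(spgpuHandle_t handle, int count,
                                cuDoubleComplex *dY, int ldY, double complex alpha,
                                const cuDoubleComplex *dCM, const int *dRP,
                                int hackSize, const int *dHackOffs, const int *dRS,
                                int rows, const cuDoubleComplex *dX, int ldX,
                                double complex beta)
{
  cuDoubleComplex a = make_cuDoubleComplex(creal(alpha), cimag(alpha));
  cuDoubleComplex b = make_cuDoubleComplex(creal(beta), cimag(beta));
  spgpuZhellspmm(handle, count, dY, ldY, dY, ldY, a, dCM, dRP,
                 hackSize, dHackOffs, dRS, NULL, rows, dX, ldX, b, 0);
}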
/** @}*/

@ -17,7 +17,8 @@ OBJS=cabs.o camax.o casum.o caxpby.o caxy.o cdot.o cgath.o \
dscal.o dscat.o dsetscal.o ell_ccsput.o ell_cspmv.o \
ell_dcsput.o ell_dspmv.o ell_scsput.o ell_sspmv.o ell_zcsput.o ell_zspmv.o \
hdia_cspmv.o hdia_dspmv.o hdia_sspmv.o hdia_zspmv.o hell_cspmv.o hell_dspmv.o \
hell_sspmv.o hell_zspmv.o igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \
hell_sspmv.o hell_zspmv.o hell_cspmm.o hell_dspmm.o hell_sspmm.o hell_zspmm.o \
igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \
saxpby.o saxy.o sdot.o sgath.o snrm2.o sscal.o sscat.o ssetscal.o zabs.o zamax.o sabgdxyz.o\
zasum.o zaxpby.o zaxy.o zdot.o zgath.o znrm2.o zscal.o zscat.o zsetscal.o zabgdxyz.o \
sxyzw.o cxyzw.o dxyzw.o zxyzw.o

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include "cuComplex.h"
extern "C"
{
#include "core.h"
#include "hell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE cuFloatComplex
#define TYPE_SYMBOL C
#define TEX_FETCH_TYPE cuFloatComplex
#include "hell_spmm_base.cuh"

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include <stdio.h>
extern "C"
{
#include "core.h"
#include "hell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE double
#define TYPE_SYMBOL D
#define TEX_FETCH_TYPE int2
#include "hell_spmm_base.cuh"

@ -32,374 +32,3 @@ extern "C"
#define TYPE_SYMBOL D
#define TEX_FETCH_TYPE int2
#include "hell_spmv_base.cuh"
#if 0
#define MMBSZ 8
#undef GEN_SPGPU_HELL_NAME
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm)
#undef GEN_SPGPU_HELL_NAME_VANILLA
#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
__global__ void
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
int hackSize, const int* hackOffsets, const int* rS, int rows,
const VALUE_TYPE *x, int xPitch,
VALUE_TYPE beta, int baseIndex)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
int *rrP;
VALUE_TYPE *rcM;
unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int gridSize = gridDim.x * blockDim.x;
while (i < rows) {
int j;
int hackId = i / hackSize;
int hackLaneId = i % hackSize;
int hackOffset;
unsigned int laneId = threadIdx.x % 32;
if (laneId == 0)
hackOffset = hackOffsets[hackId];
//__syncthreads();
hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId;
rrP = (int *) rP + hackOffset;
rcM = (VALUE_TYPE *) cM + hackOffset;
int rowSize = rS[i];
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
for (int j = 0; j < rowSize; j++) {
int pointer;
VALUE_TYPE value;
VALUE_TYPE fetch;
pointer = rrP[0] - baseIndex;
rrP += hackSize;
value = rcM[0];
rcM += hackSize;
px = (VALUE_TYPE *) x;
for (int k=0; k<count; k++) {
fetch = px[pointer];
temp[k][threadIdx.x] =
CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta,
yVal, CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
} else {
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
i += gridSize;
}
}
void
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta,
int baseIndex)
{
VALUE_TYPE *px,*py, *pz;
int cnt, c1;
dim3 block (THREAD_BLOCK, 1);
// dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
int numMp=getGPUMultiProcessors();
int maxThMp=getGPUMaxThreadsPerMP();
int nmblksMp=maxThMp/THREAD_BLOCK;
int nmblk=nmblksMp*numMp;
dim3 grid (nmblk);
maxShmemSz=getGPUSharedMemPerBlock();
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
if (shrMemSize > maxShmemSz) {
fprintf(stderr,"Fatal error: SHMEM size too large %ld %ld\n",shrMemSize,maxShmemSz);
return;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > 2*MMBSZ) {
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch,py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rows, px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt > MMBSZ) {
c1 = cnt/2;
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch,py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rows, px, xPitch, beta, baseIndex);
cnt -= c1;
}
if (cnt > MMBSZ) {
fprintf(stderr,"Invalid residual count %d\n",cnt);
} else if (cnt > 0){
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch,py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rows, px, xPitch, beta, baseIndex);
}
cudaCheckError("CUDA error on hell_spmm");
}
#elif defined(NEW_MM)
#define MMBSZ 8
#undef GEN_SPGPU_HELL_NAME
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm)
#undef GEN_SPGPU_HELL_NAME_VANILLA
#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
__global__ void
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
int hackSize, const int* hackOffsets, const int* rS, int rows,
const VALUE_TYPE *x, int xPitch,
VALUE_TYPE beta, int baseIndex)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
if (i < rows) {
int j;
int hackId = i / hackSize;
int hackLaneId = i % hackSize;
int hackOffset;
unsigned int laneId = threadIdx.x % 32;
if (laneId == 0)
hackOffset = hackOffsets[hackId];
//__syncthreads();
hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId;
rP += hackOffset;
cM += hackOffset;
int rowSize = rS[i];
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
for (int j = 0; j < rowSize; j++) {
int pointer;
VALUE_TYPE value;
VALUE_TYPE fetch;
pointer = rP[0] - baseIndex;
rP += hackSize;
value = cM[0];
cM += hackSize;
px = (VALUE_TYPE *) x;
for (int k=0; k<count; k++) {
fetch = px[pointer];
temp[k][threadIdx.x] =
CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
else
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
}
void
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL))
(spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int hackSize, const int* hackOffsets,
const int* rS, const __device int* rIdx, int rows,
const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta, int baseIndex)
{
dim3 block (THREAD_BLOCK, 1);
dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch,y, yPitch,
alpha, cM, rP, hackSize, hackOffsets, rS, rows,
x, xPitch, beta, baseIndex);
}
void
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta,
int baseIndex)
{
VALUE_TYPE *px,*py, *pz;
int cnt;
int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
// maxNForACall should be a multiple of hackSize
maxNForACall = (maxNForACall/hackSize)*hackSize;
int maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
//fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz);
while (rows > maxNForACall) {//managing large vectors
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, rP,
hackSize, hackOffsets,
rS, rIdx,
maxNForACall,
px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, rP,
hackSize, hackOffsets,
rS, rIdx,
maxNForACall,
px, xPitch, beta, baseIndex);
}
y = y + maxNForACall;
z = z + maxNForACall;
hackOffsets = hackOffsets + maxNForACall/hackSize;
rS = rS + maxNForACall;
rows -= maxNForACall;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rIdx, rows,
px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, rP,
hackSize, hackOffsets,
rS, rIdx,
rows,
px, xPitch, beta, baseIndex);
}
cudaCheckError("CUDA error on hell_spmm");
}
#endif

@ -0,0 +1,275 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define PRE_CONCAT(A, B) A ## B
#define CONCAT(A, B) PRE_CONCAT(A, B)
#undef GEN_SPGPU_HELL_NAME
#undef X_TEX
#define X_TEX CONCAT(x_tex_, FUNC_SUFFIX)
__device__ __host__ static float zero_float() { return 0.0f; }
__device__ __host__ static cuFloatComplex zero_cuFloatComplex() { return make_cuFloatComplex(0.0, 0.0); }
__device__ __host__ static bool float_isNotZero(float x) { return x != 0.0f; }
__device__ static float float_fma(float a, float b, float c) { return PREC_FADD(PREC_FMUL (a, b), c); }
__device__ static float float_add(float a, float b) { return PREC_FADD (a, b); }
__device__ static float float_mul(float a, float b) { return PREC_FMUL (a, b); }
__device__ static cuFloatComplex cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c) { return cuCfmaf(a, b, c); }
__device__ static cuFloatComplex cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b) { return cuCaddf(a, b); }
__device__ static cuFloatComplex cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b) { return cuCmulf(a, b); }
__device__ static float readValue_float(float fetch) { return fetch; }
__device__ static cuFloatComplex readValue_cuFloatComplex(cuFloatComplex fetch) { return fetch; }
// host or c.c >= 1.3
#if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__)
__device__ __host__ static double zero_double() { return 0.0; }
__device__ __host__ static cuDoubleComplex zero_cuDoubleComplex() { return make_cuDoubleComplex(0.0, 0.0); }
__device__ __host__ static bool double_isNotZero(double x) { return x != 0.0; }
__device__ static double double_fma(double a, double b, double c) { return PREC_DADD(PREC_DMUL (a, b), c); }
__device__ static double double_add(double a, double b) { return PREC_DADD (a, b); }
__device__ static double double_mul(double a, double b) { return PREC_DMUL (a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c) { return cuCfma(a, b, c); }
__device__ static cuDoubleComplex cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a, b); }
__device__ static double readValue_double(int2 fetch) { return __hiloint2double (fetch.y, fetch.x); }
__device__ static cuDoubleComplex readValue_cuDoubleComplex(int4 fetch)
{
cuDoubleComplex c;
c.x = __hiloint2double (fetch.y, fetch.x);
c.y = __hiloint2double (fetch.w, fetch.z);
return c;
}
#endif
#if 0
// Texture cache management
texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX;
#define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x)
#define unbind_tex_x(x) cudaUnbindTexture(X_TEX)
__device__ static VALUE_TYPE
fetchTex (int pointer)
{
TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer);
return CONCAT(readValue_,VALUE_TYPE) (fetch);
}
#endif
#if __CUDA_ARCH__ < 300
extern __shared__ int dynShrMem[];
#endif
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
#include "hell_spmm_base_template.cuh"
#undef GEN_SPGPU_HELL_NAME
#if 0
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_prefetch)
#define GEN_SPGPU_HELL_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hellspmm_prefetch)
#undef USE_PREFETCHING
#define USE_PREFETCHING
#include "hell_spmm_base_template.cuh"
#define ENABLE_CACHE
#undef GEN_SPGPU_HELL_NAME
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache_prefetch)
#define GEN_SPGPU_HELL_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache_prefetch)
#include "hell_spmm_base_template.cuh"
#undef GEN_SPGPU_HELL_NAME
#undef USE_PREFETCHING
#endif
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache)
#define GEN_SPGPU_HELL_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache)
#include "hell_spmm_base_template.cuh"
#undef GEN_SPGPU_HELL_NAME
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm)
#if 0
void
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta,
int baseIndex)
{
VALUE_TYPE *px,*py, *pz;
int cnt, c1;
dim3 block (THREAD_BLOCK, 1);
// dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
int numMp=getGPUMultiProcessors();
int maxThMp=getGPUMaxThreadsPerMP();
int nmblksMp=maxThMp/THREAD_BLOCK;
int nmblk=nmblksMp*numMp;
dim3 grid (nmblk);
maxShmemSz=getGPUSharedMemPerBlock();
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
if (shrMemSize > maxShmemSz) {
fprintf(stderr,"Fatal error: SHMEM size too large %ld %ld\n",shrMemSize,maxShmemSz);
return;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > 2*MMBSZ) {
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch,py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rows, px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt > MMBSZ) {
c1 = cnt/2;
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch,py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rows, px, xPitch, beta, baseIndex);
cnt -= c1;
}
if (cnt > MMBSZ) {
fprintf(stderr,"Invalid residual count %d\n",cnt);
} else if (cnt > 0){
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch,py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rows, px, xPitch, beta, baseIndex);
}
cudaCheckError("CUDA error on hell_spmm");
}
#endif
void
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
int count,
VALUE_TYPE* z,
int zPitch,
const VALUE_TYPE *y,
int yPitch,
VALUE_TYPE alpha,
const VALUE_TYPE* cM,
const int* rP,
int hackSize,
const __device int* hackOffsets,
const __device int* rS,
const __device int* rIdx,
int rows,
const VALUE_TYPE *x,
int xPitch,
VALUE_TYPE beta,
int baseIndex)
{
VALUE_TYPE *px,*py, *pz;
int cnt;
int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
// maxNForACall should be a multiple of hackSize
maxNForACall = (maxNForACall/hackSize)*hackSize;
int maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
//fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz);
while (rows > maxNForACall) {//managing large vectors
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
py, yPitch,
alpha, cM, rP,
hackSize, hackOffsets,
rS, rIdx,
maxNForACall,
px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, rP,
hackSize, hackOffsets,
rS, rIdx,
maxNForACall,
px, xPitch, beta, baseIndex);
}
y = y + maxNForACall;
z = z + maxNForACall;
hackOffsets = hackOffsets + maxNForACall/hackSize;
rS = rS + maxNForACall;
rows -= maxNForACall;
}
cnt = count;
px = (VALUE_TYPE *) x;
py = (VALUE_TYPE *) y;
pz = (VALUE_TYPE *) z;
while (cnt > MMBSZ) {
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, py, yPitch,
alpha, cM, rP, hackSize, hackOffsets,
rS, rIdx, rows,
px, xPitch, beta, baseIndex);
px += xPitch*MMBSZ;
py += yPitch*MMBSZ;
pz += zPitch*MMBSZ;
cnt -= MMBSZ;
}
if (cnt >0) {
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
py, yPitch,
alpha, cM, rP,
hackSize, hackOffsets,
rS, rIdx,
rows,
px, xPitch, beta, baseIndex);
}
cudaCheckError("CUDA error on hell_spmm");
}
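The driver above applies two levels of blocking: rows are processed in slices of at most maxNForACall (rounded down to a multiple of hackSize), and within each slice the right-hand sides are consumed MMBSZ columns per kernel launch, with a final launch for the remainder. A small standalone sketch of the same chunking arithmetic, with made-up sizes (MMBSZ, hackSize, rows, count and the grid limit are assumptions here), just to make the launch pattern explicit:
#include <stdio.h>
/* Illustrative only: mirrors the column/row chunking of the hellspmm driver. */
int main(void)
{
  const int MMBSZ = 8;            /* columns handled per kernel launch */
  int hackSize = 32, rows = 100000, count = 20;
  int maxNForACall = 65535 * 128; /* e.g. maxGridSizeX * THREAD_BLOCK */
  maxNForACall = (maxNForACall / hackSize) * hackSize;

  for (int r = 0; r < rows; r += maxNForACall) {
    int rchunk = (rows - r < maxNForACall) ? rows - r : maxNForACall;
    for (int c = 0; c < count; c += MMBSZ) {
      int cchunk = (count - c < MMBSZ) ? count - c : MMBSZ;
      printf("launch: rows [%d,%d) columns [%d,%d)\n",
             r, r + rchunk, c, c + cchunk);
    }
  }
  return 0;
}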

@ -0,0 +1,197 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define IDX2
#define THREAD_BLOCK 128
#define MMBSZ 8
#if 0
__global__ void
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
int hackSize, const int* hackOffsets, const int* rS, int rows,
const VALUE_TYPE *x, int xPitch,
VALUE_TYPE beta, int baseIndex)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
int *rrP;
VALUE_TYPE *rcM;
unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int gridSize = gridDim.x * blockDim.x;
while (i < rows) {
int j;
int hackId = i / hackSize;
int hackLaneId = i % hackSize;
int hackOffset;
unsigned int laneId = threadIdx.x % 32;
if (laneId == 0)
hackOffset = hackOffsets[hackId];
//__syncthreads();
hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId;
rrP = (int *) rP + hackOffset;
rcM = (VALUE_TYPE *) cM + hackOffset;
int rowSize = rS[i];
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
for (int j = 0; j < rowSize; j++) {
int pointer;
VALUE_TYPE value;
VALUE_TYPE fetch;
pointer = rrP[0] - baseIndex;
rrP += hackSize;
value = rcM[0];
rcM += hackSize;
px = (VALUE_TYPE *) x;
for (int k=0; k<count; k++) {
fetch = px[pointer];
temp[k][threadIdx.x] =
CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta,
yVal, CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
} else {
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
i += gridSize;
}
}
#endif
__global__ void
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
int hackSize, const int* hackOffsets, const int* rS, int rows,
const VALUE_TYPE *x, int xPitch,
VALUE_TYPE beta, int baseIndex)
{
VALUE_TYPE *pz,*px,*py;
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
VALUE_TYPE yVal;
__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];
int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
if (i < rows) {
int j;
int hackId = i / hackSize;
int hackLaneId = i % hackSize;
int hackOffset;
unsigned int laneId = threadIdx.x % 32;
if (laneId == 0)
hackOffset = hackOffsets[hackId];
//__syncthreads();
hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId;
rP += hackOffset;
cM += hackOffset;
int rowSize = rS[i];
for (int k=0; k<count; k++) {
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
}
for (int j = 0; j < rowSize; j++) {
int pointer;
VALUE_TYPE value;
VALUE_TYPE fetch;
pointer = rP[0] - baseIndex;
rP += hackSize;
value = cM[0];
cM += hackSize;
px = (VALUE_TYPE *) x;
for (int k=0; k<count; k++) {
fetch = px[pointer];
temp[k][threadIdx.x] =
CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
px = px + xPitch;
}
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
py = (VALUE_TYPE *) y;
pz = z;
if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
for (int k=0; k<count; k++) {
yVal = py[i];
pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]));
py += yPitch;
pz += zPitch;
}
else
for (int k=0; k<count; k++) {
pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][threadIdx.x]);
pz += zPitch;
}
}
}
void
CONCAT(_,GEN_SPGPU_HELL_NAME(TYPE_SYMBOL))
(spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y, int yPitch,
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int hackSize, const int* hackOffsets,
const int* rS, const __device int* rIdx, int rows,
const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta, int baseIndex)
{
dim3 block (THREAD_BLOCK, 1);
dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
// Should we generalize the code to 1/2/4/8 threads per row?
// And maybe adjust THREAD_BLOCK size?
int shrMemSize,maxShmemSz;
maxShmemSz=getGPUSharedMemPerBlock();
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
<<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch,y, yPitch,
alpha, cM, rP, hackSize, hackOffsets, rS, rows,
x, xPitch, beta, baseIndex);
}

@ -0,0 +1,34 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
extern "C"
{
#include "core.h"
#include "hell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE float
#define TYPE_SYMBOL S
#define TEX_FETCH_TYPE float
#include "hell_spmm_base.cuh"

@ -0,0 +1,35 @@
/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2014
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "cudadebug.h"
#include "cudalang.h"
#include "cuComplex.h"
extern "C"
{
#include "core.h"
#include "hell.h"
int getGPUSharedMemPerBlock();
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
#include "debug.h"
#define VALUE_TYPE cuDoubleComplex
#define TYPE_SYMBOL Z
#define TEX_FETCH_TYPE int4
#include "hell_spmm_base.cuh"