First attempt at running CSGA. To be debugged.

repack-csga
sfilippone 8 months ago
parent c35b3b9ef3
commit 3ba2002e60

@@ -120,7 +120,8 @@ psb_cuda_mod.o: psb_cuda_env_mod.o psb_i_cuda_vect_mod.o\
 psb_d_cuda_diag_mat_mod.o psb_d_cuda_csga_mat_mod.o\
 psb_d_cuda_hdiag_mat_mod.o psb_s_cuda_hdiag_mat_mod.o\
 psb_s_cuda_dnsg_mat_mod.o psb_d_cuda_dnsg_mat_mod.o \
-psb_c_cuda_dnsg_mat_mod.o psb_z_cuda_dnsg_mat_mod.o
+psb_c_cuda_dnsg_mat_mod.o psb_z_cuda_dnsg_mat_mod.o \
+psb_d_cuda_csga_mat_mod.o
 iobjs: $(FOBJS)
 	$(MAKE) -C impl objs

@@ -33,16 +33,13 @@ module d_csga_mod
 integer, parameter :: MAX_NNZ_PER_WG = 4096
 integer, parameter :: MAX_GRID_SIZE = 65536
-type, bind(c) :: d_CAmat
-type(c_ptr) :: Mat = c_null_ptr
-end type d_CAmat
 interface CSGADeviceFree
 function d_CSGADeviceFree(Mat) &
 & bind(c,name="d_CSGADeviceFree") result(res)
 use iso_c_binding
-import d_CAmat
-type(d_CAmat) :: Mat
+import d_CMat
+type(d_CMat) :: Mat
 integer(c_int) :: res
 end function d_CSGADeviceFree
 end interface
@@ -51,62 +48,61 @@ module d_csga_mod
 function d_CSGADeviceAlloc(Mat,nr,nc,nz) &
 & bind(c,name="d_CSGADeviceAlloc") result(res)
 use iso_c_binding
-import d_CAmat
-type(d_CAmat) :: Mat
+import d_CMat
+type(d_CMat) :: Mat
 integer(c_int), value :: nr, nc, nz
 integer(c_int) :: res
 end function d_CSGADeviceAlloc
 end interface
 interface CSGADeviceSetMatDiagType
 function d_CSGADeviceSetMatDiagType(Mat,type) &
 & bind(c,name="d_CSGADeviceSetMatDiagType") result(res)
 use iso_c_binding
 import d_CAmat
 type(d_CAmat) :: Mat
 integer(c_int),value :: type
 integer(c_int) :: res
 end function d_CSGADeviceSetMatDiagType
 end interface
 interface CSGADeviceSetMatFillMode
 function d_CSGADeviceSetMatFillMode(Mat,type) &
 & bind(c,name="d_CSGADeviceSetMatFillMode") result(res)
 use iso_c_binding
 import d_CAmat
 type(d_CAmat) :: Mat
 integer(c_int),value :: type
 integer(c_int) :: res
 end function d_CSGADeviceSetMatFillMode
 end interface
 interface CSGAHost2Device
-function d_CSGAHost2Device(Mat,m,n,nz,irp,ja,val,rowBlocks) &
+function d_CSGAHost2Device(Mat,m,n,nz,irp,ja,val,numBlocks,rowBlocks) &
 & bind(c,name="d_CSGAHost2Device") result(res)
 use iso_c_binding
-import d_CAmat
-type(d_CAmat) :: Mat
-integer(c_int), value :: m,n,nz
+import d_CMat
+type(d_CMat) :: Mat
+integer(c_int), value :: m,n,nz,numBlocks
 integer(c_int) :: irp(*), ja(*), rowBlocks(*)
 real(c_double) :: val(*)
 integer(c_int) :: res
 end function d_CSGAHost2Device
 end interface
 interface CSGADevice2Host
-function d_CSGADevice2Host(Mat,m,n,nz,irp,ja,val,rowBlocks) &
+function d_CSGADevice2Host(Mat,m,n,nz,irp,ja,val,numBlocks,rowBlocks) &
 & bind(c,name="d_CSGADevice2Host") result(res)
 use iso_c_binding
-import d_CAmat
-type(d_CAmat) :: Mat
+import d_CMat
+type(d_CMat) :: Mat
 integer(c_int), value :: m,n,nz
+integer(c_int) :: numBlocks
 integer(c_int) :: irp(*), ja(*), rowBlocks(*)
 real(c_double) :: val(*)
 integer(c_int) :: res
 end function d_CSGADevice2Host
 end interface
 interface CSGAComputeRowBlocks
 subroutine d_CSGAComputeRowBlocks(m,irp,numBlocks,rowBlocks) &
 & bind(c,name="d_CSGAComputeRowBlocks")
 use iso_c_binding
 integer(c_int), value :: m
 integer(c_int) :: numBlocks
 integer(c_int) :: irp(*), rowBlocks(*)
 end subroutine d_CSGAComputeRowBlocks
 end interface
 interface spmvCSGADevice
 function d_spmvCSGADevice(Mat,alpha,x,beta,y) &
 & bind(c,name="d_spmvCSGADevice") result(res)
 use iso_c_binding
 import d_Cmat
 type(d_Cmat) :: Mat
 type(c_ptr), value :: x
 type(c_ptr), value :: y
 real(c_double), value :: alpha,beta
 integer(c_int) :: res
 end function d_spmvCSGADevice
 end interface
 end module d_csga_mod
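For reference, the C entry points these bind(c) interfaces assume would look roughly as follows. This is a sketch reconstructed from the interface block above, not the actual C header (which is not part of this commit); the layout of d_CMat as a single opaque pointer is an assumption.

/* Hypothetical C-side declarations matching the bind(c) names above. */
typedef struct d_CMat { void *mat; } d_CMat;

int  d_CSGADeviceFree(d_CMat *Mat);
int  d_CSGADeviceAlloc(d_CMat *Mat, int nr, int nc, int nz);
int  d_CSGAHost2Device(d_CMat *Mat, int m, int n, int nz,
                       int *irp, int *ja, double *val,
                       int numBlocks, int *rowBlocks);
int  d_CSGADevice2Host(d_CMat *Mat, int m, int n, int nz,
                       int *irp, int *ja, double *val,
                       int *numBlocks, int *rowBlocks);
void d_CSGAComputeRowBlocks(int m, int *irp, int *numBlocks, int *rowBlocks);
int  d_spmvCSGADevice(d_CMat *Mat, double alpha, void *x, double beta, void *y);

Note how numBlocks goes by value into d_CSGAHost2Device but by reference out of d_CSGADevice2Host and d_CSGAComputeRowBlocks, mirroring the value attributes in the Fortran interfaces.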

@@ -51,6 +51,8 @@ typedef struct T_CSRGDeviceMat
 TYPE *val;
 int *irp;
 int *ja;
+int *rowBlocks;
+int numBlocks;
 } T_CSRGDeviceMat;
 /* Interoperability: type coming from Fortran side to distinguish D/S/C/Z. */
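The rowBlocks/numBlocks fields added here carry the CSR-Adaptive partition consumed by the spmv kernel: rowBlocks[b] and rowBlocks[b+1] delimit the rows assigned to thread block b, cut so that no chunk holds more than MAX_NNZ_PER_WG nonzeros. A minimal host-side sketch of how such a partition can be built (the helper name is hypothetical and 0-based irp is assumed; d_CSGAComputeRowBlocks itself is not shown in this diff):

/* Walk irp and emit a cut whenever the running nonzero count would
 * exceed MAX_NNZ_PER_WG. irp has m+1 entries; rowBlocks must have room
 * for m+1 entries in the worst case (one row per block). */
#define MAX_NNZ_PER_WG 4096
static void csga_compute_row_blocks(int m, const int *irp,
                                    int *numBlocks, int *rowBlocks)
{
  int nb = 0, start = 0;
  rowBlocks[nb++] = 0;
  for (int i = 1; i <= m; i++) {
    if (irp[i] - irp[start] > MAX_NNZ_PER_WG) {
      /* close the block at row i-1; a single row over budget
       * becomes a block of its own (handled by CSR-Vector). */
      int cut = (i - 1 > start) ? (i - 1) : i;
      rowBlocks[nb++] = cut;
      start = cut;
      i = cut;
    }
  }
  if (rowBlocks[nb - 1] != m) rowBlocks[nb++] = m;
  *numBlocks = nb - 1;
}

A block spanning several rows takes the kernel's CSR-Stream path; a block holding one oversized row takes the CSR-Vector path.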

@@ -96,7 +96,6 @@ int T_spmvCSRGDevice(T_Cmat *Matrix, TYPE alpha, void *deviceX,
 #else
 cusparseDnVecDescr_t vecX, vecY;
 size_t bfsz;
 if (T_CSRGIsNullMvDescr(cMat)) {
 cMat->spmvDescr = (cusparseSpMatDescr_t *) malloc(sizeof(cusparseSpMatDescr_t *));
 }
@@ -246,9 +245,13 @@ int T_CSRGDeviceAlloc(T_Cmat *Matrix,int nr, int nc, int nz)
 return((int) CUSPARSE_STATUS_INVALID_VALUE);
 if ((cMat=(T_CSRGDeviceMat *) malloc(sizeof(T_CSRGDeviceMat)))==NULL)
 return((int) CUSPARSE_STATUS_ALLOC_FAILED);
+cMat->numBlocks = 0;
+cMat->rowBlocks = NULL;
 cMat->m = nr;
 cMat->n = nc;
 cMat->nz = nz;
+fprintf(stderr,"Setting M %d N %d \n",cMat->m,cMat->n);
 if (nr1 == 0) nr1 = 1;
 if (nz1 == 0) nz1 = 1;
 if ((rc= allocRemoteBuffer(((void **) &(cMat->irp)), ((nr1+1)*sizeof(int)))) != 0)
@@ -344,6 +347,8 @@ int T_CSRGDeviceFree(T_Cmat *Matrix)
 cMat->svbuffer=NULL;
 cMat->svbsize=0;
 #endif
+if (cMat->rowBlocks != NULL) freeRemoteBuffer(cMat->rowBlocks);
+cMat->numBlocks = 0;
 free(cMat);
 Matrix->mat = NULL;
 }

@@ -286,7 +286,11 @@ psb_s_cuda_hdiag_vect_mv.o \
 psb_s_cuda_dnsg_mat_impl.o \
 psb_d_cuda_dnsg_mat_impl.o \
 psb_c_cuda_dnsg_mat_impl.o \
-psb_z_cuda_dnsg_mat_impl.o
+psb_z_cuda_dnsg_mat_impl.o \
+psb_d_cuda_csga_to_gpu.o \
+psb_d_cuda_csga_from_gpu.o \
+psb_d_cuda_csga_mold.o \
+psb_d_cuda_csga_vect_mv.o
 objs: $(OBJS)

@@ -63,6 +63,8 @@ module psb_cuda_mod
 use psb_c_cuda_hlg_mat_mod
 use psb_z_hll_mat_mod
 use psb_z_cuda_hlg_mat_mod
+use psb_d_cuda_csga_mat_mod
 use psb_s_cuda_csrg_mat_mod
 use psb_d_cuda_csrg_mat_mod

@@ -41,11 +41,8 @@ module psb_d_cuda_csga_mat_mod
 !
 ! Format for CSR Adaptive.
 !
-type(d_CAmat) :: deviceAMat
-integer(psb_ipk_), allocatable :: rowBlocks(:)
 contains
 procedure, nopass :: get_fmt => d_cuda_csga_get_fmt
-procedure, pass(a) :: sizeof => d_cuda_csga_sizeof
 procedure, pass(a) :: vect_mv => psb_d_cuda_csga_vect_mv
 !!$ procedure, pass(a) :: in_vect_sv => psb_d_cuda_csga_inner_vect_sv
 !!$ procedure, pass(a) :: csmm => psb_d_cuda_csga_csmm
@@ -56,11 +53,11 @@ module psb_d_cuda_csga_mat_mod
 !!$ procedure, pass(a) :: allocate_mnnz => psb_d_cuda_csga_allocate_mnnz
 ! Note: we do *not* need the TO methods, because the parent type
 ! methods will work.
-procedure, pass(a) :: cp_from_coo => psb_d_cuda_cp_csga_from_coo
-procedure, pass(a) :: cp_from_fmt => psb_d_cuda_cp_csga_from_fmt
-procedure, pass(a) :: mv_from_coo => psb_d_cuda_mv_csga_from_coo
-procedure, pass(a) :: mv_from_fmt => psb_d_cuda_mv_csga_from_fmt
-procedure, pass(a) :: free => d_cuda_csga_free
+!!$ procedure, pass(a) :: cp_from_coo => psb_d_cuda_cp_csga_from_coo
+!!$ procedure, pass(a) :: cp_from_fmt => psb_d_cuda_cp_csga_from_fmt
+!!$ procedure, pass(a) :: mv_from_coo => psb_d_cuda_mv_csga_from_coo
+!!$ procedure, pass(a) :: mv_from_fmt => psb_d_cuda_mv_csga_from_fmt
+!!$ procedure, pass(a) :: free => d_cuda_csga_free
 procedure, pass(a) :: mold => psb_d_cuda_csga_mold
 !!$ procedure, pass(a) :: is_host => d_cuda_csga_is_host
 !!$ procedure, pass(a) :: is_dev => d_cuda_csga_is_dev
@@ -74,7 +71,7 @@ module psb_d_cuda_csga_mat_mod
 final :: d_cuda_csga_finalize
 end type psb_d_cuda_csga_sparse_mat
-private :: d_cuda_csga_free, d_cuda_csga_get_fmt, d_cuda_csga_sizeof
+private :: d_cuda_csga_get_fmt
 !!$
 !!$ interface
@@ -145,41 +142,41 @@ module psb_d_cuda_csga_mat_mod
 end subroutine psb_d_cuda_csga_from_gpu
 end interface
-interface
-subroutine psb_d_cuda_cp_csga_from_coo(a,b,info)
-import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_
-class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-class(psb_d_coo_sparse_mat), intent(in) :: b
-integer(psb_ipk_), intent(out) :: info
-end subroutine psb_d_cuda_cp_csga_from_coo
-end interface
-interface
-subroutine psb_d_cuda_cp_csga_from_fmt(a,b,info)
-import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
-class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-class(psb_d_base_sparse_mat), intent(in) :: b
-integer(psb_ipk_), intent(out) :: info
-end subroutine psb_d_cuda_cp_csga_from_fmt
-end interface
-interface
-subroutine psb_d_cuda_mv_csga_from_coo(a,b,info)
-import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_
-class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-class(psb_d_coo_sparse_mat), intent(inout) :: b
-integer(psb_ipk_), intent(out) :: info
-end subroutine psb_d_cuda_mv_csga_from_coo
-end interface
-interface
-subroutine psb_d_cuda_mv_csga_from_fmt(a,b,info)
-import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
-class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-class(psb_d_base_sparse_mat), intent(inout) :: b
-integer(psb_ipk_), intent(out) :: info
-end subroutine psb_d_cuda_mv_csga_from_fmt
-end interface
+!!$ interface
+!!$ subroutine psb_d_cuda_cp_csga_from_coo(a,b,info)
+!!$ import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_
+!!$ class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
+!!$ class(psb_d_coo_sparse_mat), intent(in) :: b
+!!$ integer(psb_ipk_), intent(out) :: info
+!!$ end subroutine psb_d_cuda_cp_csga_from_coo
+!!$ end interface
+!!$
+!!$ interface
+!!$ subroutine psb_d_cuda_cp_csga_from_fmt(a,b,info)
+!!$ import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
+!!$ class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
+!!$ class(psb_d_base_sparse_mat), intent(in) :: b
+!!$ integer(psb_ipk_), intent(out) :: info
+!!$ end subroutine psb_d_cuda_cp_csga_from_fmt
+!!$ end interface
+!!$
+!!$ interface
+!!$ subroutine psb_d_cuda_mv_csga_from_coo(a,b,info)
+!!$ import :: psb_d_cuda_csga_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_
+!!$ class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
+!!$ class(psb_d_coo_sparse_mat), intent(inout) :: b
+!!$ integer(psb_ipk_), intent(out) :: info
+!!$ end subroutine psb_d_cuda_mv_csga_from_coo
+!!$ end interface
+!!$
+!!$ interface
+!!$ subroutine psb_d_cuda_mv_csga_from_fmt(a,b,info)
+!!$ import :: psb_d_cuda_csga_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
+!!$ class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
+!!$ class(psb_d_base_sparse_mat), intent(inout) :: b
+!!$ integer(psb_ipk_), intent(out) :: info
+!!$ end subroutine psb_d_cuda_mv_csga_from_fmt
+!!$ end interface
 !!$ interface
 !!$ subroutine psb_d_cuda_csga_csmv(alpha,a,x,beta,y,info,trans)
@@ -237,22 +234,6 @@ contains
 ! == ===================================
-function d_cuda_csga_sizeof(a) result(res)
-implicit none
-class(psb_d_cuda_csga_sparse_mat), intent(in) :: a
-integer(psb_epk_) :: res
-if (a%is_dev()) call a%sync()
-res = 8
-res = res + psb_sizeof_dp * size(a%val)
-res = res + psb_sizeof_ip * size(a%irp)
-res = res + psb_sizeof_ip * size(a%ja)
-res = res + psb_sizeof_ip * size(a%rowBlocks)
-! Should we account for the shadow data structure
-! on the GPU device side?
-! res = 2*res
-end function d_cuda_csga_sizeof
 function d_cuda_csga_get_fmt() result(res)
 implicit none
 character(len=5) :: res
@@ -337,18 +318,18 @@ contains
 !!$
 !!$ end subroutine d_cuda_csga_sync
-subroutine d_cuda_csga_free(a)
-implicit none
-integer(psb_ipk_) :: info
-class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-info = CSGADeviceFree(a%deviceAMat)
-call a%psb_d_csr_sparse_mat%free()
-return
-end subroutine d_cuda_csga_free
+!!$ subroutine d_cuda_csga_free(a)
+!!$ implicit none
+!!$ integer(psb_ipk_) :: info
+!!$
+!!$ class(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
+!!$
+!!$ info = CSGADeviceFree(a%deviceMat)
+!!$ call a%psb_d_csr_sparse_mat%free()
+!!$
+!!$ return
+!!$
+!!$ end subroutine d_cuda_csga_free
 subroutine d_cuda_csga_finalize(a)
 implicit none
@@ -356,7 +337,7 @@ contains
 type(psb_d_cuda_csga_sparse_mat), intent(inout) :: a
-info = CSGADeviceFree(a%deviceAMat)
+info = CSRGDeviceFree(a%deviceMat)
 return

@@ -49,6 +49,8 @@ module psb_d_cuda_csrg_mat_mod
 !
 !
 type(d_Cmat) :: deviceMat
+integer(psb_ipk_), allocatable :: rowBlocks(:)
+integer(psb_ipk_) :: numBlocks=0
 integer(psb_ipk_) :: devstate = is_host
 contains
@@ -252,9 +254,10 @@ contains
 integer(psb_epk_) :: res
 if (a%is_dev()) call a%sync()
 res = 8
-res = res + psb_sizeof_dp * size(a%val)
-res = res + psb_sizeof_ip * size(a%irp)
-res = res + psb_sizeof_ip * size(a%ja)
+res = res + psb_sizeof_dp * psb_size(a%val)
+res = res + psb_sizeof_ip * psb_size(a%irp)
+res = res + psb_sizeof_ip * psb_size(a%ja)
+res = res + psb_sizeof_ip * psb_size(a%rowBlocks)
 ! Should we account for the shadow data structure
 ! on the GPU device side?
 ! res = 2*res
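The switch from the size intrinsic to psb_size above is not cosmetic: rowBlocks is allocatable and may still be unallocated when sizeof is queried, and size is not defined on an unallocated array, whereas psb_size (judging from its use elsewhere in PSBLAS) presumably returns 0 in that case.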
@@ -353,6 +356,8 @@ contains
 class(psb_d_cuda_csrg_sparse_mat), intent(inout) :: a
 info = CSRGDeviceFree(a%deviceMat)
+if (allocated(a%rowBlocks)) deallocate(a%rowBlocks)
+a%numBlocks=0
 call a%psb_d_csr_sparse_mat%free()
 return

@@ -20,7 +20,7 @@ OBJS=cabs.o camax.o casum.o caxpby.o caxy.o cdot.o cgath.o \
 hell_sspmv.o hell_zspmv.o igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \
 saxpby.o saxy.o sdot.o sgath.o snrm2.o sscal.o sscat.o ssetscal.o zabs.o zamax.o sabgdxyz.o\
 zasum.o zaxpby.o zaxy.o zdot.o zgath.o znrm2.o zscal.o zscat.o zsetscal.o zabgdxyz.o \
-sxyzw.o cxyzw.o dxyzw.o zxyzw.o
+sxyzw.o cxyzw.o dxyzw.o zxyzw.o dcsgamv.o
 objs: $(OBJS)
 lib: objs

@@ -0,0 +1,145 @@
+#include "stdio.h"
+#include "cudalang.h"
+#include "cudadebug.h"
+extern "C"
+{
+#include "core.h"
+#include "csga.h"
+}
+#include "debug.h"
+//#define MAX_NNZ_PER_WG 6144
+#define MAX_NNZ_PER_WG 4096
+#define THREAD_BLOCK 1024
+#define MAX_GRID_SIZE 65536
+#define WARP_SIZE 32
+__device__ double warp_reduce(double val){
+  // butterfly reduction with shuffles: lane 0 ends up holding the warp total
+  for(int offset=warpSize/2; offset>0; offset/=2){
+    val += __shfl_down_sync(0xffffffff,val, offset);
+  }
+  return val;
+}
+__global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
+                             int m, int n, int col_multivector,
+                             int* rowBlocks, double* resultData, int baseIndex){
+  __shared__ double vals[MAX_NNZ_PER_WG];
+  __shared__ int cols[MAX_NNZ_PER_WG];
+  int startRow = rowBlocks[blockIdx.x];
+  int stopRow = rowBlocks[blockIdx.x+1];
+  long int numRows = stopRow - startRow;
+  int nnz = irp[stopRow]-irp[startRow];
+  int tid = threadIdx.x; // thread index within the block
+  if (numRows > 1){
+    //CSR-Stream: the block covers several rows; stage their nonzeros in shared memory
+    int localCol;
+    for (int i = tid; i < nnz; i+= blockDim.x){
+      localCol = irp[startRow]+i;
+      vals[i] = as[localCol];
+      //vals[i] *= multivector[ja[localCol]*col_multivector+j];
+      cols[i] = ja[localCol];
+    }
+    int firstCol = irp[startRow];
+    __syncthreads();
+    for (int t = tid; t < numRows*col_multivector; t += blockDim.x){
+      int localRow = startRow + t/col_multivector;
+      int j = t%col_multivector;
+      double temp = 0;
+      for (int i = irp[localRow]-firstCol; i < irp[localRow+1]-firstCol; i++){
+        temp += vals[i]*multivector[cols[i]*col_multivector + j];
+      }
+      resultData[localRow*col_multivector +j] = temp;
+    }
+    __syncthreads();
+  } else {
+    //CSR-Vector: the block holds a single (long) row
+    int warpId = tid / 32; // warp index within the block
+    int lane = tid &(32-1); // thread index within the warp
+    //one warp per column of the multivector
+    double val;
+    int col;
+    double sum[64] = {0}; // assumes col_multivector <= 64
+    if (nnz < MAX_NNZ_PER_WG){
+      int localCol;
+      for (int i = tid; i < nnz; i+= blockDim.x){
+        localCol = irp[startRow]+i;
+        vals[i] = as[localCol];
+        cols[i] = ja[localCol];
+      }
+    }
+    __syncthreads();
+    if (warpId < col_multivector){
+      for (int col_m = warpId; col_m < col_multivector; col_m +=32){
+        for (int i = irp[startRow] + lane; i < irp[startRow+1]; i +=32){
+          if (nnz < MAX_NNZ_PER_WG){
+            val = vals[i-irp[startRow]];
+            col = cols[i-irp[startRow]];
+          } else {
+            val = as[i];
+            col = ja[i];
+          }
+          sum[col_m] += val*multivector[col*col_multivector + col_m];
+        }
+        sum[col_m] = warp_reduce(sum[col_m]);
+        if (lane == 0){
+          resultData[startRow*col_multivector + col_m] = sum[col_m];
+        }
+      }
+    }
+  }
+}
+__host__ int dCSGAMV(spgpuHandle_t handle,
+                     double beta,
+                     double* y,
+                     double alpha,
+                     const double* as,
+                     const int* ja,
+                     const int* irp,
+                     int m,
+                     int n,
+                     int numBlocks,
+                     const int* rowBlocks,
+                     const double *x,
+                     int baseIndex)
+{
+  int maxBForACall = min(handle->maxGridSizeX, numBlocks); // blocks per call, capped by the grid-size limit
+  int blockX = THREAD_BLOCK;
+  int gridX = maxBForACall;
+  int rp, rows, blcks, bp, numb;
+  dim3 blockSize(blockX);
+  dim3 gridSize(gridX);
+  fprintf(stderr," dcsgamv %d %d %d \n",numBlocks,rowBlocks[0],rowBlocks[1]);
+  bp = 0;
+  rp = 0;
+  numb = numBlocks;
+  while (numb > maxBForACall) { // split into multiple calls when there are more row-blocks than the grid allows
+    blcks = maxBForACall;
+    rp = rowBlocks[bp];
+    rows = rowBlocks[bp+blcks]-rp;
+    fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp);
+    bp += blcks;
+    numb -= blcks;
+  }
+  blcks = numb;
+  rp = rowBlocks[bp];
+  rows = rowBlocks[bp+blcks]-rp;
+  fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp);
+  rp += rows;
+  fprintf(stderr," Final rows %d \n",rows);
+  // no kernel launch yet: this first attempt only traces the chunk boundaries
+  return(SPGPU_SUCCESS);
+}
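As committed, dCSGAMV only computes and prints the chunk boundaries: no kernel is launched, and dCSGAmvINNER ignores alpha and beta, consistent with the "To be debugged" commit message. Each chunk would presumably turn into a launch along these lines (a sketch only: the currentStream field and col_multivector = 1 for a single right-hand side are assumptions):

/* hypothetical per-chunk launch: one thread block per row-block,
 * starting at rowBlocks[bp]; y is overwritten, alpha/beta handling TBD */
dim3 grid(blcks), block(THREAD_BLOCK);
dCSGAmvINNER<<<grid, block, 0, handle->currentStream>>>(
    (double *)as, (int *)ja, (int *)irp, (double *)x,
    m, n, 1, (int *)rowBlocks + bp, y, baseIndex);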

@@ -21,7 +21,9 @@
 __device__ void
 CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _ridx_4)
 (int i, VALUE_TYPE yVal, int outRow,
-VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch, const int* rS, int rows, const VALUE_TYPE *x, VALUE_TYPE beta, int baseIndex)
+VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* cM,
+const int* rP, int cMPitch, int rPPitch, const int* rS, int rows,
+const VALUE_TYPE *x, VALUE_TYPE beta, int baseIndex)
 {
 VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();

@@ -608,6 +608,7 @@ program pdgenmv
 #ifdef HAVE_CUDA
 type(psb_d_cuda_elg_sparse_mat), target :: aelg
 type(psb_d_cuda_csrg_sparse_mat), target :: acsrg
+type(psb_d_cuda_csga_sparse_mat), target :: acsga
 #if CUDA_SHORT_VERSION <= 10
 type(psb_d_cuda_hybg_sparse_mat), target :: ahybg
 #endif
@@ -719,6 +720,8 @@ program pdgenmv
 agmold => ahdiag
 case('CSRG')
 agmold => acsrg
+case('CSGA')
+agmold => acsga
 case('DNSG')
 agmold => adnsg
 #if CUDA_SHORT_VERSION <= 10
