First working version of CSGA. To be tested and refined.

repack-csga
Salvatore Filippone 10 months ago
parent 3ba2002e60
commit 173ffec2d3

@ -93,7 +93,7 @@ module d_csga_mod
end interface end interface
interface spmvCSGADevice interface spmvCSGADevice
function d_spmvCSGADevice(Mat,alpha,x,beta,y) & function d_spmvCSGADevice(Mat,alpha,x,beta,y,rb) &
& bind(c,name="d_spmvCSGADevice") result(res) & bind(c,name="d_spmvCSGADevice") result(res)
use iso_c_binding use iso_c_binding
import d_Cmat import d_Cmat
@ -101,6 +101,7 @@ module d_csga_mod
type(c_ptr), value :: x type(c_ptr), value :: x
type(c_ptr), value :: y type(c_ptr), value :: y
real(c_double), value :: alpha,beta real(c_double), value :: alpha,beta
integer(c_int) :: rb(*)
integer(c_int) :: res integer(c_int) :: res
end function d_spmvCSGADevice end function d_spmvCSGADevice
end interface end interface

@ -251,7 +251,7 @@ int T_CSRGDeviceAlloc(T_Cmat *Matrix,int nr, int nc, int nz)
cMat->m = nr; cMat->m = nr;
cMat->n = nc; cMat->n = nc;
cMat->nz = nz; cMat->nz = nz;
fprintf(stderr,"Setting M %d N %d \n",cMat->m,cMat->n); //fprintf(stderr,"Setting M %d N %d \n",cMat->m,cMat->n);
if (nr1 == 0) nr1 = 1; if (nr1 == 0) nr1 = 1;
if (nz1 == 0) nz1 = 1; if (nz1 == 0) nz1 = 1;
if ((rc= allocRemoteBuffer(((void **) &(cMat->irp)), ((nr1+1)*sizeof(int)))) != 0) if ((rc= allocRemoteBuffer(((void **) &(cMat->irp)), ((nr1+1)*sizeof(int)))) != 0)

@ -23,9 +23,9 @@ __device__ double warp_reduce(double val){
return val; return val;
} }
__global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector, __global__ void dCSGAmvINNER(double alpha, const double* as, const int* ja, const int* irp,
int m, int n, int col_multivector, const double* multivector, int m, int n, int col_multivector,
int* rowBlocks, double* resultData, int baseIndex){ const int* rowBlocks, double beta, double* resultData, int baseIndex){
__shared__ double vals[MAX_NNZ_PER_WG]; __shared__ double vals[MAX_NNZ_PER_WG];
__shared__ int cols[MAX_NNZ_PER_WG]; __shared__ int cols[MAX_NNZ_PER_WG];
@ -34,6 +34,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
long int numRows = stopRow - startRow; long int numRows = stopRow - startRow;
int nnz = irp[stopRow]-irp[startRow]; int nnz = irp[stopRow]-irp[startRow];
int tid = threadIdx.x; // indice del thread nel blocco int tid = threadIdx.x; // indice del thread nel blocco
if (numRows > 1){ if (numRows > 1){
//CSR-Stream //CSR-Stream
//printf("csr stream\n"); //printf("csr stream\n");
@ -46,6 +47,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
//vals[i] *= multivector[ja[localCol]*col_multivector+j]; //vals[i] *= multivector[ja[localCol]*col_multivector+j];
cols[i] = ja[localCol]; cols[i] = ja[localCol];
} }
//return;
int firstCol = irp[startRow]; int firstCol = irp[startRow];
__syncthreads(); __syncthreads();
@ -56,7 +58,11 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
for (int i = irp[localRow]-firstCol; i < irp[localRow+1]-firstCol; i++){ for (int i = irp[localRow]-firstCol; i < irp[localRow+1]-firstCol; i++){
temp += vals[i]*multivector[cols[i]*col_multivector + j]; temp += vals[i]*multivector[cols[i]*col_multivector + j];
} }
resultData[localRow*col_multivector +j] = temp; if (beta == 0.0) {
resultData[localRow*col_multivector +j] = alpha*temp;
} else {
resultData[localRow*col_multivector +j] = alpha*temp + beta*resultData[localRow*col_multivector +j];
}
} }
__syncthreads(); __syncthreads();
@ -65,7 +71,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
//CSR-Vector //CSR-Vector
//printf("csr vector\n"); //printf("csr vector\n");
int warpId = tid / 32; // Global warp index int warpId = tid / 32; // Global warp index
int lane = tid &(32-1); // thread index within the warp int lane = tid &(0xFFFF); // thread index within the warp
//one warp per row //one warp per row
double val; double val;
int col; int col;
@ -78,6 +84,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
cols[i] = ja[localCol]; cols[i] = ja[localCol];
} }
} }
//return;
__syncthreads(); __syncthreads();
if (warpId < col_multivector){ if (warpId < col_multivector){
for (int col_m = warpId; col_m < col_multivector; col_m +=32){ for (int col_m = warpId; col_m < col_multivector; col_m +=32){
@ -93,7 +100,12 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
} }
sum[col_m] = warp_reduce(sum[col_m]); sum[col_m] = warp_reduce(sum[col_m]);
if (lane == 0){ if (lane == 0){
resultData[startRow*col_multivector + col_m] = sum[col_m]; if (beta == 0.0) {
resultData[startRow*col_multivector + col_m] = alpha*sum[col_m];
} else {
resultData[startRow*col_multivector + col_m] = alpha*sum[col_m] +
beta*resultData[startRow*col_multivector + col_m];
}
} }
} }
} }
@ -102,44 +114,51 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
__host__ int dCSGAMV(spgpuHandle_t handle, __host__ int dCSGAMV(spgpuHandle_t handle,
double beta, double beta,
double* y, double* y,
double alpha, double alpha,
const double* as, const double* as,
const int* ja, const int* ja,
const int* irp, const int* irp,
int m, int m,
int n, int n,
int numBlocks, int ncol,
const int* rowBlocks, int numBlocks,
const double *x, const int* rowBlocks,
int baseIndex) const double *x,
int baseIndex,
int *rb)
{ {
int maxBForACall = max(handle->maxGridSizeX, numBlocks); // fprintf(stderr," dcsgamv %d \n",numBlocks);
int maxBForACall = min(handle->maxGridSizeX, numBlocks);
//int maxBForACall = 1024;
int blockX = THREAD_BLOCK; int blockX = THREAD_BLOCK;
int gridX = maxBForACall; int gridX = maxBForACall;
int rp,rows, blcks, bp, numb; int rp,rows, blcks, bp, numb;
dim3 blockSize(blockX); dim3 blockSize(blockX);
dim3 gridSize(gridX); dim3 gridSize(gridX);
//fprintf(stderr," dcsgamv start %d %d %d \n",m,n,ncol);
fprintf(stderr," dcsgamv %d %d \n",numBlocks,rowBlocks[0],rowBlocks[1]);
bp = 0; bp = 0;
rp = 0; rp = 0;
numb = numBlocks; numb = numBlocks;
while (numb > maxBForACall) {//managing large vectors while (numb > maxBForACall) {//managing large vectors
blcks = maxBForACall; blcks = maxBForACall;
rp = rowBlocks[bp]; rp = rb[bp]-baseIndex;
rows = rowBlocks[bp+blcks]-rp; rows = rb[bp+blcks]-rp;
fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp); fprintf(stderr," rp %d rows %d bp %d blcks %d\n",rp,rows,bp,blcks);
dCSGAmvINNER<<<gridSize,blockSize>>>(alpha,as,ja,irp,x,rows,n,ncol,
&(rowBlocks[bp]),beta,&(y[rp]),baseIndex);
bp += blcks; bp += blcks;
numb -= blcks; numb -= blcks;
} }
blcks = numb; blcks = numb;
rp = rowBlocks[bp]; rp = rb[bp]-baseIndex;
rows = rowBlocks[bp+blcks]-rp; rows = rb[bp+blcks]-rp;
fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp); //fprintf(stderr," rp %d rows %d bp %d blcks %d\n",rp,rows,bp,blcks);
dCSGAmvINNER<<<gridSize,blockSize>>>(alpha,as,ja,irp,x,rows,n,ncol,
&(rowBlocks[bp]),beta,&(y[rp]),baseIndex);
rp += rows; rp += rows;
fprintf(stderr," Final rows %d \n",rows); //fprintf(stderr," Final rows %d %d \n",rows,rp);
return(SPGPU_SUCCESS); return(SPGPU_SUCCESS);
} }

@ -32,7 +32,7 @@ dir:
(if test ! -d $(EXEDIR); then mkdir $(EXEDIR); fi) (if test ! -d $(EXEDIR); then mkdir $(EXEDIR); fi)
dpdegenmv: $(DPGOBJS) dpdegenmv: $(DPGOBJS)
$(FLINK) $(LOPT) $(DPGOBJS) -fopenmp -o dpdegenmv $(FINCLUDES) $(PSBLAS_LIB) $(LDLIBS) $(FLINK) $(LOPT) $(DPGOBJS) -o dpdegenmv $(FINCLUDES) $(PSBLAS_LIB) $(LDLIBS)
/bin/mv dpdegenmv $(EXEDIR) /bin/mv dpdegenmv $(EXEDIR)
spdegenmv: $(SPGOBJS) spdegenmv: $(SPGOBJS)
$(FLINK) $(LOPT) $(SPGOBJS) -o spdegenmv $(PSBLAS_LIB) $(LDLIBS) $(FLINK) $(LOPT) $(SPGOBJS) -o spdegenmv $(PSBLAS_LIB) $(LDLIBS)

@ -596,6 +596,7 @@ program pdgenmv
integer(psb_epk_) :: amatsize, precsize, descsize, annz, nbytes integer(psb_epk_) :: amatsize, precsize, descsize, annz, nbytes
real(psb_dpk_) :: err, eps real(psb_dpk_) :: err, eps
integer, parameter :: ntests=200, ngpu=50, ncnv=20 integer, parameter :: ntests=200, ngpu=50, ncnv=20
!integer, parameter :: ntests=1, ngpu=1, ncnv=1
type(psb_d_coo_sparse_mat), target :: acoo type(psb_d_coo_sparse_mat), target :: acoo
type(psb_d_csr_sparse_mat), target :: acsr type(psb_d_csr_sparse_mat), target :: acsr
type(psb_d_ell_sparse_mat), target :: aell type(psb_d_ell_sparse_mat), target :: aell
@ -853,6 +854,11 @@ program pdgenmv
eps = maxval(abs(x1(1:nr)-x2(1:nr))) eps = maxval(abs(x1(1:nr)-x2(1:nr)))
call psb_amx(ctxt,eps) call psb_amx(ctxt,eps)
if (iam==0) write(*,*) 'Max diff on GPU',eps if (iam==0) write(*,*) 'Max diff on GPU',eps
if (.false.) then
do i=1,nr
write(0,*) i,x1(i),x2(i)
end do
end if
if (dump) then if (dump) then
write(fname,'(a,i3.3,a,i3.3,a)')'XCPU-out-',iam,'-',np,'.mtx' write(fname,'(a,i3.3,a,i3.3,a)')'XCPU-out-',iam,'-',np,'.mtx'
call mm_array_write(x1(1:nr),'Local part CPU',info,filename=fname) call mm_array_write(x1(1:nr),'Local part CPU',info,filename=fname)

Loading…
Cancel
Save