First working version of CSGA. To be tested and refined.

repack-csga
Salvatore Filippone 8 months ago
parent 3ba2002e60
commit 173ffec2d3

@ -93,7 +93,7 @@ module d_csga_mod
end interface
interface spmvCSGADevice
function d_spmvCSGADevice(Mat,alpha,x,beta,y) &
function d_spmvCSGADevice(Mat,alpha,x,beta,y,rb) &
& bind(c,name="d_spmvCSGADevice") result(res)
use iso_c_binding
import d_Cmat
@ -101,6 +101,7 @@ module d_csga_mod
type(c_ptr), value :: x
type(c_ptr), value :: y
real(c_double), value :: alpha,beta
integer(c_int) :: rb(*)
integer(c_int) :: res
end function d_spmvCSGADevice
end interface

@ -251,7 +251,7 @@ int T_CSRGDeviceAlloc(T_Cmat *Matrix,int nr, int nc, int nz)
cMat->m = nr;
cMat->n = nc;
cMat->nz = nz;
fprintf(stderr,"Setting M %d N %d \n",cMat->m,cMat->n);
//fprintf(stderr,"Setting M %d N %d \n",cMat->m,cMat->n);
if (nr1 == 0) nr1 = 1;
if (nz1 == 0) nz1 = 1;
if ((rc= allocRemoteBuffer(((void **) &(cMat->irp)), ((nr1+1)*sizeof(int)))) != 0)

@ -23,9 +23,9 @@ __device__ double warp_reduce(double val){
return val;
}
__global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
int m, int n, int col_multivector,
int* rowBlocks, double* resultData, int baseIndex){
__global__ void dCSGAmvINNER(double alpha, const double* as, const int* ja, const int* irp,
const double* multivector, int m, int n, int col_multivector,
const int* rowBlocks, double beta, double* resultData, int baseIndex){
__shared__ double vals[MAX_NNZ_PER_WG];
__shared__ int cols[MAX_NNZ_PER_WG];
@ -34,6 +34,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
long int numRows = stopRow - startRow;
int nnz = irp[stopRow]-irp[startRow];
int tid = threadIdx.x; // indice del thread nel blocco
if (numRows > 1){
//CSR-Stream
//printf("csr stream\n");
@ -46,6 +47,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
//vals[i] *= multivector[ja[localCol]*col_multivector+j];
cols[i] = ja[localCol];
}
//return;
int firstCol = irp[startRow];
__syncthreads();
@ -56,7 +58,11 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
for (int i = irp[localRow]-firstCol; i < irp[localRow+1]-firstCol; i++){
temp += vals[i]*multivector[cols[i]*col_multivector + j];
}
resultData[localRow*col_multivector +j] = temp;
if (beta == 0.0) {
resultData[localRow*col_multivector +j] = alpha*temp;
} else {
resultData[localRow*col_multivector +j] = alpha*temp + beta*resultData[localRow*col_multivector +j];
}
}
__syncthreads();
@ -65,7 +71,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
//CSR-Vector
//printf("csr vector\n");
int warpId = tid / 32; // Global warp index
int lane = tid &(32-1); // thread index within the warp
int lane = tid &(0xFFFF); // thread index within the warp
//one warp per row
double val;
int col;
@ -78,6 +84,7 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
cols[i] = ja[localCol];
}
}
//return;
__syncthreads();
if (warpId < col_multivector){
for (int col_m = warpId; col_m < col_multivector; col_m +=32){
@ -93,7 +100,12 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
}
sum[col_m] = warp_reduce(sum[col_m]);
if (lane == 0){
resultData[startRow*col_multivector + col_m] = sum[col_m];
if (beta == 0.0) {
resultData[startRow*col_multivector + col_m] = alpha*sum[col_m];
} else {
resultData[startRow*col_multivector + col_m] = alpha*sum[col_m] +
beta*resultData[startRow*col_multivector + col_m];
}
}
}
}
@ -102,44 +114,51 @@ __global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
__host__ int dCSGAMV(spgpuHandle_t handle,
double beta,
double* y,
double alpha,
const double* as,
const int* ja,
const int* irp,
int m,
int n,
int numBlocks,
const int* rowBlocks,
const double *x,
int baseIndex)
{
int maxBForACall = max(handle->maxGridSizeX, numBlocks);
double beta,
double* y,
double alpha,
const double* as,
const int* ja,
const int* irp,
int m,
int n,
int ncol,
int numBlocks,
const int* rowBlocks,
const double *x,
int baseIndex,
int *rb)
{
// fprintf(stderr," dcsgamv %d \n",numBlocks);
int maxBForACall = min(handle->maxGridSizeX, numBlocks);
//int maxBForACall = 1024;
int blockX = THREAD_BLOCK;
int gridX = maxBForACall;
int rp,rows, blcks, bp, numb;
dim3 blockSize(blockX);
dim3 gridSize(gridX);
fprintf(stderr," dcsgamv %d %d \n",numBlocks,rowBlocks[0],rowBlocks[1]);
//fprintf(stderr," dcsgamv start %d %d %d \n",m,n,ncol);
bp = 0;
rp = 0;
numb = numBlocks;
while (numb > maxBForACall) {//managing large vectors
blcks = maxBForACall;
rp = rowBlocks[bp];
rows = rowBlocks[bp+blcks]-rp;
fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp);
rp = rb[bp]-baseIndex;
rows = rb[bp+blcks]-rp;
fprintf(stderr," rp %d rows %d bp %d blcks %d\n",rp,rows,bp,blcks);
dCSGAmvINNER<<<gridSize,blockSize>>>(alpha,as,ja,irp,x,rows,n,ncol,
&(rowBlocks[bp]),beta,&(y[rp]),baseIndex);
bp += blcks;
numb -= blcks;
}
blcks = numb;
rp = rowBlocks[bp];
rows = rowBlocks[bp+blcks]-rp;
fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp);
rp = rb[bp]-baseIndex;
rows = rb[bp+blcks]-rp;
//fprintf(stderr," rp %d rows %d bp %d blcks %d\n",rp,rows,bp,blcks);
dCSGAmvINNER<<<gridSize,blockSize>>>(alpha,as,ja,irp,x,rows,n,ncol,
&(rowBlocks[bp]),beta,&(y[rp]),baseIndex);
rp += rows;
fprintf(stderr," Final rows %d \n",rows);
//fprintf(stderr," Final rows %d %d \n",rows,rp);
return(SPGPU_SUCCESS);
}

@ -32,7 +32,7 @@ dir:
(if test ! -d $(EXEDIR); then mkdir $(EXEDIR); fi)
dpdegenmv: $(DPGOBJS)
$(FLINK) $(LOPT) $(DPGOBJS) -fopenmp -o dpdegenmv $(FINCLUDES) $(PSBLAS_LIB) $(LDLIBS)
$(FLINK) $(LOPT) $(DPGOBJS) -o dpdegenmv $(FINCLUDES) $(PSBLAS_LIB) $(LDLIBS)
/bin/mv dpdegenmv $(EXEDIR)
spdegenmv: $(SPGOBJS)
$(FLINK) $(LOPT) $(SPGOBJS) -o spdegenmv $(PSBLAS_LIB) $(LDLIBS)

@ -596,6 +596,7 @@ program pdgenmv
integer(psb_epk_) :: amatsize, precsize, descsize, annz, nbytes
real(psb_dpk_) :: err, eps
integer, parameter :: ntests=200, ngpu=50, ncnv=20
!integer, parameter :: ntests=1, ngpu=1, ncnv=1
type(psb_d_coo_sparse_mat), target :: acoo
type(psb_d_csr_sparse_mat), target :: acsr
type(psb_d_ell_sparse_mat), target :: aell
@ -853,6 +854,11 @@ program pdgenmv
eps = maxval(abs(x1(1:nr)-x2(1:nr)))
call psb_amx(ctxt,eps)
if (iam==0) write(*,*) 'Max diff on GPU',eps
if (.false.) then
do i=1,nr
write(0,*) i,x1(i),x2(i)
end do
end if
if (dump) then
write(fname,'(a,i3.3,a,i3.3,a)')'XCPU-out-',iam,'-',np,'.mtx'
call mm_array_write(x1(1:nr),'Local part CPU',info,filename=fname)

Loading…
Cancel
Save