First attempt at running CSGA. To be debugged.
parent
c35b3b9ef3
commit
3ba2002e60
@ -0,0 +1,145 @@
|
||||
#include "stdio.h"
|
||||
#include "cudalang.h"
|
||||
#include "cudadebug.h"
|
||||
extern "C"
|
||||
{
|
||||
#include "core.h"
|
||||
#include "csga.h"
|
||||
}
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
//#define MAX_NNZ_PER_WG 6144
|
||||
#define MAX_NNZ_PER_WG 4096
|
||||
#define THREAD_BLOCK 1024
|
||||
#define MAX_GRID_SIZE 65536
|
||||
#define WARP_SIZE 32
|
||||
|
||||
|
||||
__device__ double warp_reduce(double val){
|
||||
for(int offset=warpSize/2; offset>0; offset/=2){
|
||||
val += __shfl_down_sync(0xffffffff,val, offset);
|
||||
}
|
||||
return val;
|
||||
}
|
||||
|
||||
__global__ void dCSGAmvINNER(double* as, int* ja, int* irp, double* multivector,
|
||||
int m, int n, int col_multivector,
|
||||
int* rowBlocks, double* resultData, int baseIndex){
|
||||
__shared__ double vals[MAX_NNZ_PER_WG];
|
||||
__shared__ int cols[MAX_NNZ_PER_WG];
|
||||
|
||||
int startRow = rowBlocks[blockIdx.x];
|
||||
int stopRow = rowBlocks[blockIdx.x+1];
|
||||
long int numRows = stopRow - startRow;
|
||||
int nnz = irp[stopRow]-irp[startRow];
|
||||
int tid = threadIdx.x; // indice del thread nel blocco
|
||||
if (numRows > 1){
|
||||
//CSR-Stream
|
||||
//printf("csr stream\n");
|
||||
|
||||
int localCol;
|
||||
|
||||
for (int i = tid; i < nnz; i+= blockDim.x){
|
||||
localCol = irp[startRow]+i;
|
||||
vals[i] = as[localCol];
|
||||
//vals[i] *= multivector[ja[localCol]*col_multivector+j];
|
||||
cols[i] = ja[localCol];
|
||||
}
|
||||
int firstCol = irp[startRow];
|
||||
|
||||
__syncthreads();
|
||||
for (int t = tid; t < numRows*col_multivector; t += blockDim.x){
|
||||
int localRow = startRow + t/col_multivector;
|
||||
int j = t%col_multivector;
|
||||
double temp = 0;
|
||||
for (int i = irp[localRow]-firstCol; i < irp[localRow+1]-firstCol; i++){
|
||||
temp += vals[i]*multivector[cols[i]*col_multivector + j];
|
||||
}
|
||||
resultData[localRow*col_multivector +j] = temp;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
} else {
|
||||
//CSR-Vector
|
||||
//printf("csr vector\n");
|
||||
int warpId = tid / 32; // Global warp index
|
||||
int lane = tid &(32-1); // thread index within the warp
|
||||
//one warp per row
|
||||
double val;
|
||||
int col;
|
||||
double sum[64] = {0};
|
||||
if (nnz < 4096){
|
||||
int localCol;
|
||||
for (int i = tid; i < nnz; i+= blockDim.x){
|
||||
localCol = irp[startRow]+i;
|
||||
vals[i] = as[localCol];
|
||||
cols[i] = ja[localCol];
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
if (warpId < col_multivector){
|
||||
for (int col_m = warpId; col_m < col_multivector; col_m +=32){
|
||||
for (int i = irp[startRow] + lane; i < irp[startRow+1]; i +=32){
|
||||
if (nnz < 4096){
|
||||
val = vals[i-irp[startRow]];
|
||||
col = cols[i-irp[startRow]];
|
||||
} else {
|
||||
val = as[i];
|
||||
col = ja[i];
|
||||
}
|
||||
sum[col_m] += val*multivector[col*col_multivector + col_m];
|
||||
}
|
||||
sum[col_m] = warp_reduce(sum[col_m]);
|
||||
if (lane == 0){
|
||||
resultData[startRow*col_multivector + col_m] = sum[col_m];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__host__ int dCSGAMV(spgpuHandle_t handle,
|
||||
double beta,
|
||||
double* y,
|
||||
double alpha,
|
||||
const double* as,
|
||||
const int* ja,
|
||||
const int* irp,
|
||||
int m,
|
||||
int n,
|
||||
int numBlocks,
|
||||
const int* rowBlocks,
|
||||
const double *x,
|
||||
int baseIndex)
|
||||
{
|
||||
int maxBForACall = max(handle->maxGridSizeX, numBlocks);
|
||||
int blockX = THREAD_BLOCK;
|
||||
int gridX = maxBForACall;
|
||||
int rp,rows, blcks, bp, numb;
|
||||
dim3 blockSize(blockX);
|
||||
dim3 gridSize(gridX);
|
||||
|
||||
fprintf(stderr," dcsgamv %d %d \n",numBlocks,rowBlocks[0],rowBlocks[1]);
|
||||
|
||||
bp = 0;
|
||||
rp = 0;
|
||||
numb = numBlocks;
|
||||
while (numb > maxBForACall) {//managing large vectors
|
||||
blcks = maxBForACall;
|
||||
rp = rowBlocks[bp];
|
||||
rows = rowBlocks[bp+blcks]-rp;
|
||||
fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp);
|
||||
bp += blcks;
|
||||
numb -= blcks;
|
||||
}
|
||||
blcks = numb;
|
||||
rp = rowBlocks[bp];
|
||||
rows = rowBlocks[bp+blcks]-rp;
|
||||
fprintf(stderr," rp %d rows %d bp %d \n",rp,rows,bp);
|
||||
rp += rows;
|
||||
fprintf(stderr," Final rows %d \n",rows);
|
||||
return(SPGPU_SUCCESS);
|
||||
}
|
Loading…
Reference in New Issue