You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
psblas3/cuda/spgpu/kernels/hdia_spmv_base_template.cuh

254 lines
6.8 KiB
Plaintext

/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define THREAD_BLOCK 128
__device__ void
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _)
(VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets,
int rows, int cols, const VALUE_TYPE *x, VALUE_TYPE beta, int hackCount)
{
int i = threadIdx.x + blockIdx.x * (blockDim.x);
VALUE_TYPE yVal = CONCAT(zero_,VALUE_TYPE)();
if (i < rows && CONCAT(VALUE_TYPE, _isNotZero(beta)))
yVal = y[i];
VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
int hackId = i / hackSize;
int hackLaneId = i % hackSize;
// shared between offsetsChunks and warpHackOffsetTemp
extern __shared__ int dynShrMem[];
int hackOffset = 0;
int nextOffset = 0;
unsigned int laneId = threadIdx.x % warpSize;
unsigned int warpId = threadIdx.x / warpSize;
#if __CUDA_ARCH__ < 300
int* warpHackOffset = dynShrMem;
if (laneId == 0 && i < rows)
{
warpHackOffset[warpId] = hackOffsets[hackId];
warpHackOffset[warpId + (blockDim.x / warpSize)] = hackOffsets[hackId+1];
}
__syncthreads();
hackOffset = warpHackOffset[warpId];
nextOffset = warpHackOffset[warpId + blockDim.x / warpSize];
__syncthreads();
#elif __CUDA_ARCH__ < 700
if (laneId == 0 && i < rows)
{
hackOffset = hackOffsets[hackId];
nextOffset = hackOffsets[hackId+1];
}
hackOffset = __shfl(hackOffset, 0);
nextOffset = __shfl(nextOffset, 0);
#else
if (laneId == 0 && i < rows)
{
hackOffset = hackOffsets[hackId];
nextOffset = hackOffsets[hackId+1];
}
hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0);
nextOffset = __shfl_sync(0xFFFFFFFF,nextOffset, 0);
#endif
if (hackId >= hackCount)
return;
dM += hackOffset*hackSize + hackLaneId;
offsets += hackOffset;
// diags for this hack is next hackOffset minus current hackOffset
int diags = nextOffset - hackOffset;
// Warp oriented
int rounds = (diags + warpSize - 1)/warpSize;
volatile int *offsetsChunk = dynShrMem + warpId*warpSize;
for (int r = 0; r < rounds; r++)
{
// in the last round diags will be <= warpSize
if (laneId < diags)
offsetsChunk[laneId] = offsets[laneId];
if (i < rows)
{
int count = min(diags, warpSize);
#ifdef USE_PREFETCHING
int j;
for (j=0; j<=count-2; j += 2)
{
// prefetch 2 values
int column1 = offsetsChunk[j] + i;
int column2 = offsetsChunk[j+1] + i;
VALUE_TYPE xValue1, xValue2;
VALUE_TYPE mValue1, mValue2;
bool inside1 = column1 >= 0 && column1 < cols;
bool inside2 = column2 >= 0 && column2 < cols;
if(inside1)
{
mValue1 = dM[0];
#ifdef ENABLE_CACHE
xValue1 = fetchTex (column1);
#else
xValue1 = x[column1];
#endif
}
dM += hackSize;
if(inside2)
{
mValue2 = dM[0];
#ifdef ENABLE_CACHE
xValue2 = fetchTex (column2);
#else
xValue2 = x[column2];
#endif
}
dM += hackSize;
if(inside1)
zProd = CONCAT(VALUE_TYPE, _fma)(mValue1, xValue1, zProd);
if(inside2)
zProd = CONCAT(VALUE_TYPE, _fma)(mValue2, xValue2, zProd);
}
for (;j<count; ++j)
{
int column = offsetsChunk[j] + i;
if(column >= 0 && column < cols)
{
VALUE_TYPE xValue;
#ifdef ENABLE_CACHE
xValue = fetchTex (column);
#else
xValue = x[column];
#endif
VALUE_TYPE mValue = dM[0];
zProd = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, zProd);
}
dM += hackSize;
}
#else
for (int j=0;j<count; ++j)
{
int column = offsetsChunk[j] + i;
if(column >= 0 && column < cols)
{
VALUE_TYPE xValue;
#ifdef ENABLE_CACHE
xValue = fetchTex (column);
#else
xValue = x[column];
#endif
VALUE_TYPE mValue = dM[0];
zProd = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, zProd);
}
dM += hackSize;
}
#endif
}
diags -= warpSize;
offsets += warpSize;
}
// Since z and y are accessed with the same offset by the same thread,
// and the write to z follows the y read, y and z can share the same base address (in-place computing).
if (i >= rows)
return;
if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
z[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha, zProd));
else
z[i] = CONCAT(VALUE_TYPE, _mul)(alpha, zProd);
}
// Force to recompile and optimize with llvm
__global__ void
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn_b0)
(VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const VALUE_TYPE *x, int hackCount)
{
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _)
(z, y, alpha, dM, offsets, hackSize, hackOffsets, rows, cols, x, CONCAT(zero_,VALUE_TYPE)(), hackCount);
}
__global__ void
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)
(VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const VALUE_TYPE *x, VALUE_TYPE beta, int hackCount)
{
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _)
(z, y, alpha, dM, offsets, hackSize, hackOffsets, rows, cols, x, beta, hackCount);
}
void
CONCAT(_,GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL))
(spgpuHandle_t handle, VALUE_TYPE* z, const VALUE_TYPE *y, VALUE_TYPE alpha,
const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols,
const VALUE_TYPE *x, VALUE_TYPE beta)
{
dim3 block (THREAD_BLOCK);
dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
int hackCount = (rows + hackSize - 1)/hackSize;
#ifdef ENABLE_CACHE
bind_tex_x ((const TEX_FETCH_TYPE *) x);
#endif
cudaFuncSetCacheConfig(CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn), cudaFuncCachePreferL1);
cudaFuncSetCacheConfig(CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn_b0), cudaFuncCachePreferL1);
if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn) <<< grid, block, block.x*sizeof(int), handle->currentStream >>> (z, y, alpha, dM, offsets, hackSize, hackOffsets, rows, cols, x, beta, hackCount);
else
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn_b0) <<< grid, block, block.x*sizeof(int), handle->currentStream >>> (z, y, alpha, dM, offsets, hackSize, hackOffsets, rows, cols, x, hackCount);
#ifdef ENABLE_CACHE
unbind_tex_x ((const TEX_FETCH_TYPE *) x);
#endif
}