psblas3/cuda/spgpu/kernels/hell_spmm_base.cuh

/*
* spGPU - Sparse matrices on GPU library.
*
* Copyright (C) 2010 - 2015
* Davide Barbieri - University of Rome Tor Vergata
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 3 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define PRE_CONCAT(A, B) A ## B
#define CONCAT(A, B) PRE_CONCAT(A, B)
#undef GEN_SPGPU_HELL_NAME
#undef X_TEX
#define X_TEX CONCAT(x_tex_, FUNC_SUFFIX)
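// Two-level macro concatenation: CONCAT expands its arguments (e.g.
// FUNC_SUFFIX above) before PRE_CONCAT pastes the tokens together.
// The helpers below provide type-generic zero, fma/add/mul and
// texture-fetch readers for each supported value type.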
__device__ __host__ static float zero_float() { return 0.0f; }
__device__ __host__ static cuFloatComplex zero_cuFloatComplex() { return make_cuFloatComplex(0.0, 0.0); }
__device__ __host__ static bool float_isNotZero(float x) { return x != 0.0f; }
__device__ static float float_fma(float a, float b, float c) { return PREC_FADD(PREC_FMUL (a, b), c); }
__device__ static float float_add(float a, float b) { return PREC_FADD (a, b); }
__device__ static float float_mul(float a, float b) { return PREC_FMUL (a, b); }
__device__ static cuFloatComplex cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c) { return cuCfmaf(a, b, c); }
__device__ static cuFloatComplex cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b) { return cuCaddf(a, b); }
__device__ static cuFloatComplex cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b) { return cuCmulf(a, b); }
__device__ static float readValue_float(float fetch) { return fetch; }
__device__ static cuFloatComplex readValue_cuFloatComplex(cuFloatComplex fetch) { return fetch; }
// Host pass, or device code for compute capability >= 1.3
// (the first to support double precision).
#if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__)
__device__ __host__ static double zero_double() { return 0.0; }
__device__ __host__ static cuDoubleComplex zero_cuDoubleComplex() { return make_cuDoubleComplex(0.0, 0.0); }
__device__ __host__ static bool double_isNotZero(double x) { return x != 0.0; }
__device__ static double double_fma(double a, double b, double c) { return PREC_DADD(PREC_DMUL (a, b), c); }
__device__ static double double_add(double a, double b) { return PREC_DADD (a, b); }
__device__ static double double_mul(double a, double b) { return PREC_DMUL (a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c) { return cuCfma(a, b, c); }
__device__ static cuDoubleComplex cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a, b); }
__device__ static cuDoubleComplex cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a, b); }
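// tex1Dfetch cannot return 64-bit wide elements: doubles are fetched as
// int2 (and double complex as int4) and reassembled with __hiloint2double.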
__device__ static double readValue_double(int2 fetch) { return __hiloint2double (fetch.y, fetch.x); }
__device__ static cuDoubleComplex readValue_cuDoubleComplex(int4 fetch)
{
  cuDoubleComplex c;
  c.x = __hiloint2double (fetch.y, fetch.x);
  c.y = __hiloint2double (fetch.w, fetch.z);
  return c;
}
#endif
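// The legacy texture-reference cache path below is compiled out (#if 0);
// texture references are deprecated in recent CUDA releases.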
#if 0
// Texture cache management
texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX;
#define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x)
#define unbind_tex_x(x) cudaUnbindTexture(X_TEX)
__device__ static VALUE_TYPE
fetchTex (int pointer)
{
  TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer);
  return CONCAT(readValue_,VALUE_TYPE) (fetch);
}
#endif
#if __CUDA_ARCH__ < 300
extern __shared__ int dynShrMem[];
#endif
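// Each #include of hell_spmm_base_template.cuh below instantiates one kernel
// variant, named by the GEN_SPGPU_HELL_NAME macro in force at that point
// (e.g. spgpuShellspmm_vanilla when TYPE_SYMBOL is S).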
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
#include "hell_spmm_base_template.cuh"
#undef GEN_SPGPU_HELL_NAME
#if 0
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_prefetch)
#define GEN_SPGPU_HELL_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hellspmm_prefetch)
#undef USE_PREFETCHING
#define USE_PREFETCHING
#include "hell_spmm_base_template.cuh"
#define ENABLE_CACHE
#undef GEN_SPGPU_HELL_NAME
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache_prefetch)
#define GEN_SPGPU_HELL_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache_prefetch)
#include "hell_spmm_base_template.cuh"
#undef GEN_SPGPU_HELL_NAME
#undef USE_PREFETCHING
#endif
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache)
#define GEN_SPGPU_HELL_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache)
#include "hell_spmm_base_template.cuh"
#undef GEN_SPGPU_HELL_NAME
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm)
#if 0
void
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
  (spgpuHandle_t handle,
   int count,
   VALUE_TYPE* z,
   int zPitch,
   const VALUE_TYPE *y,
   int yPitch,
   VALUE_TYPE alpha,
   const VALUE_TYPE* cM,
   const int* rP,
   int hackSize,
   const __device int* hackOffsets,
   const __device int* rS,
   const __device int* rIdx,
   int rows,
   const VALUE_TYPE *x,
   int xPitch,
   VALUE_TYPE beta,
   int baseIndex)
{
  VALUE_TYPE *px, *py, *pz;
  int cnt, c1;
  dim3 block (THREAD_BLOCK, 1);
  // dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
  // Should we generalize the code to 1/2/4/8 threads per row?
  // And maybe adjust THREAD_BLOCK size?
  int shrMemSize, maxShmemSz;
  int numMp = getGPUMultiProcessors();
  int maxThMp = getGPUMaxThreadsPerMP();
  int nmblksMp = maxThMp/THREAD_BLOCK;
  int nmblk = nmblksMp*numMp;
  dim3 grid (nmblk);

  maxShmemSz = getGPUSharedMemPerBlock();
  shrMemSize = MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
  if (shrMemSize > maxShmemSz) {
    fprintf(stderr, "Fatal error: SHMEM size too large %d %d\n", shrMemSize, maxShmemSz);
    return;
  }
  cnt = count;
  px = (VALUE_TYPE *) x;
  py = (VALUE_TYPE *) y;
  pz = (VALUE_TYPE *) z;
  // Process the multivector MMBSZ columns at a time, leaving at most
  // 2*MMBSZ columns for the two balanced launches below.
  while (cnt > 2*MMBSZ) {
    CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
      <<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch, py, yPitch,
                                                              alpha, cM, rP, hackSize, hackOffsets,
                                                              rS, rows, px, xPitch, beta, baseIndex);
    px += xPitch*MMBSZ;
    py += yPitch*MMBSZ;
    pz += zPitch*MMBSZ;
    cnt -= MMBSZ;
  }
  if (cnt > MMBSZ) {
    // Split the remaining columns into two batches of similar size.
    c1 = cnt/2;
    CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
      <<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch, py, yPitch,
                                                              alpha, cM, rP, hackSize, hackOffsets,
                                                              rS, rows, px, xPitch, beta, baseIndex);
    // Step past the c1 columns just processed.
    px += xPitch*c1;
    py += yPitch*c1;
    pz += zPitch*c1;
    cnt -= c1;
  }
  if (cnt > MMBSZ) {
    fprintf(stderr, "Invalid residual count %d\n", cnt);
  } else if (cnt > 0) {
    CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
      <<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch, py, yPitch,
                                                              alpha, cM, rP, hackSize, hackOffsets,
                                                              rS, rows, px, xPitch, beta, baseIndex);
  }
  cudaCheckError("CUDA error on hell_spmm");
}
#endif
void
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
  (spgpuHandle_t handle,
   int count,
   VALUE_TYPE* z,
   int zPitch,
   const VALUE_TYPE *y,
   int yPitch,
   VALUE_TYPE alpha,
   const VALUE_TYPE* cM,
   const int* rP,
   int hackSize,
   const __device int* hackOffsets,
   const __device int* rS,
   const __device int* rIdx,
   int rows,
   const VALUE_TYPE *x,
   int xPitch,
   VALUE_TYPE beta,
   int baseIndex)
{
  VALUE_TYPE *px, *py, *pz;
  int cnt;
  int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
  // Round maxNForACall down to a multiple of hackSize, so that every call
  // starts on a hack boundary.
  maxNForACall = (maxNForACall/hackSize)*hackSize;
  int maxShmemSz = getGPUSharedMemPerBlock();
  //fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz);
  while (rows > maxNForACall) { // manage large vectors in blocks of rows
    cnt = count;
    px = (VALUE_TYPE *) x;
    py = (VALUE_TYPE *) y;
    pz = (VALUE_TYPE *) z;
    while (cnt > MMBSZ) {
      //fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
      CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
                                                          py, yPitch,
                                                          alpha, cM, rP,
                                                          hackSize, hackOffsets,
                                                          rS, rIdx,
                                                          maxNForACall,
                                                          px, xPitch, beta, baseIndex);
      px += xPitch*MMBSZ;
      py += yPitch*MMBSZ;
      pz += zPitch*MMBSZ;
      cnt -= MMBSZ;
    }
    if (cnt > 0) {
      CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
                                                          py, yPitch,
                                                          alpha, cM, rP,
                                                          hackSize, hackOffsets,
                                                          rS, rIdx,
                                                          maxNForACall,
                                                          px, xPitch, beta, baseIndex);
    }
    // Advance to the next block of rows. x, cM and rP stay fixed: rP holds
    // global column indices, and the kernel reaches the next hacks through
    // the shifted hackOffsets.
    y = y + maxNForACall;
    z = z + maxNForACall;
    hackOffsets = hackOffsets + maxNForACall/hackSize;
    rS = rS + maxNForACall;
    rows -= maxNForACall;
  }
  // Remaining rows (at most maxNForACall).
  cnt = count;
  px = (VALUE_TYPE *) x;
  py = (VALUE_TYPE *) y;
  pz = (VALUE_TYPE *) z;
  while (cnt > MMBSZ) {
    //fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
    CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, py, yPitch,
                                                        alpha, cM, rP, hackSize, hackOffsets,
                                                        rS, rIdx, rows,
                                                        px, xPitch, beta, baseIndex);
    px += xPitch*MMBSZ;
    py += yPitch*MMBSZ;
    pz += zPitch*MMBSZ;
    cnt -= MMBSZ;
  }
  if (cnt > 0) {
    CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
                                                        py, yPitch,
                                                        alpha, cM, rP,
                                                        hackSize, hackOffsets,
                                                        rS, rIdx,
                                                        rows,
                                                        px, xPitch, beta, baseIndex);
  }
  cudaCheckError("CUDA error on hell_spmm");
}
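
/* Usage sketch (illustrative only, under the assumption that this header is
 * included with TYPE_SYMBOL = D and VALUE_TYPE = double, in which case the
 * macros above generate an entry point named spgpuDhellspmm). A call
 * computing Z = alpha*A*X + beta*Y on `count` column vectors of a matrix in
 * HELL (hacked ELLPACK) format might look like:
 *
 *   spgpuDhellspmm(handle, count,
 *                  z, zPitch, y, yPitch, alpha,
 *                  cM, rP, hackSize, hackOffsets, rS, rIdx,
 *                  rows, x, xPitch, beta, baseIndex);
 *
 * cM, rP, hackOffsets, rS and rIdx are device pointers describing the
 * matrix; z, y and x are device pointers to the dense multivectors.
 */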