/* * spGPU - Sparse matrices on GPU library. * * Copyright (C) 2010 - 2015 * Davide Barbieri - University of Rome Tor Vergata * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * version 3 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #define THREAD_BLOCK 128 __device__ void CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _) (VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const VALUE_TYPE *x, VALUE_TYPE beta, int hackCount) { int i = threadIdx.x + blockIdx.x * (blockDim.x); VALUE_TYPE yVal = CONCAT(zero_,VALUE_TYPE)(); if (i < rows && CONCAT(VALUE_TYPE, _isNotZero(beta))) yVal = y[i]; VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)(); int hackId = i / hackSize; int hackLaneId = i % hackSize; // shared between offsetsChunks and warpHackOffsetTemp extern __shared__ int dynShrMem[]; int hackOffset = 0; int nextOffset = 0; unsigned int laneId = threadIdx.x % warpSize; unsigned int warpId = threadIdx.x / warpSize; #if __CUDA_ARCH__ < 300 int* warpHackOffset = dynShrMem; if (laneId == 0 && i < rows) { warpHackOffset[warpId] = hackOffsets[hackId]; warpHackOffset[warpId + (blockDim.x / warpSize)] = hackOffsets[hackId+1]; } __syncthreads(); hackOffset = warpHackOffset[warpId]; nextOffset = warpHackOffset[warpId + blockDim.x / warpSize]; __syncthreads(); #elif __CUDA_ARCH__ < 700 if (laneId == 0 && i < rows) { hackOffset = hackOffsets[hackId]; nextOffset = hackOffsets[hackId+1]; } hackOffset = __shfl(hackOffset, 0); nextOffset = __shfl(nextOffset, 0); #else if (laneId == 0 && i < rows) { hackOffset = hackOffsets[hackId]; nextOffset = hackOffsets[hackId+1]; } hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0); nextOffset = __shfl_sync(0xFFFFFFFF,nextOffset, 0); #endif if (hackId >= hackCount) return; dM += hackOffset*hackSize + hackLaneId; offsets += hackOffset; // diags for this hack is next hackOffset minus current hackOffset int diags = nextOffset - hackOffset; // Warp oriented int rounds = (diags + warpSize - 1)/warpSize; volatile int *offsetsChunk = dynShrMem + warpId*warpSize; for (int r = 0; r < rounds; r++) { // in the last round diags will be <= warpSize if (laneId < diags) offsetsChunk[laneId] = offsets[laneId]; if (i < rows) { int count = min(diags, warpSize); #ifdef USE_PREFETCHING int j; for (j=0; j<=count-2; j += 2) { // prefetch 2 values int column1 = offsetsChunk[j] + i; int column2 = offsetsChunk[j+1] + i; VALUE_TYPE xValue1, xValue2; VALUE_TYPE mValue1, mValue2; bool inside1 = column1 >= 0 && column1 < cols; bool inside2 = column2 >= 0 && column2 < cols; if(inside1) { mValue1 = dM[0]; #ifdef ENABLE_CACHE xValue1 = fetchTex (column1); #else xValue1 = x[column1]; #endif } dM += hackSize; if(inside2) { mValue2 = dM[0]; #ifdef ENABLE_CACHE xValue2 = fetchTex (column2); #else xValue2 = x[column2]; #endif } dM += hackSize; if(inside1) zProd = CONCAT(VALUE_TYPE, _fma)(mValue1, xValue1, zProd); if(inside2) zProd = CONCAT(VALUE_TYPE, _fma)(mValue2, xValue2, zProd); } for (;j= 0 && column < cols) { VALUE_TYPE xValue; #ifdef ENABLE_CACHE xValue = fetchTex (column); #else xValue = x[column]; #endif VALUE_TYPE mValue = dM[0]; zProd = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, zProd); } dM += hackSize; } #else for (int j=0;j= 0 && column < cols) { VALUE_TYPE xValue; #ifdef ENABLE_CACHE xValue = fetchTex (column); #else xValue = x[column]; #endif VALUE_TYPE mValue = dM[0]; zProd = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, zProd); } dM += hackSize; } #endif } diags -= warpSize; offsets += warpSize; } // Since z and y are accessed with the same offset by the same thread, // and the write to z follows the y read, y and z can share the same base address (in-place computing). if (i >= rows) return; if (CONCAT(VALUE_TYPE, _isNotZero(beta))) z[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha, zProd)); else z[i] = CONCAT(VALUE_TYPE, _mul)(alpha, zProd); } // Force to recompile and optimize with llvm __global__ void CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn_b0) (VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const VALUE_TYPE *x, int hackCount) { CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _) (z, y, alpha, dM, offsets, hackSize, hackOffsets, rows, cols, x, CONCAT(zero_,VALUE_TYPE)(), hackCount); } __global__ void CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn) (VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const VALUE_TYPE *x, VALUE_TYPE beta, int hackCount) { CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _) (z, y, alpha, dM, offsets, hackSize, hackOffsets, rows, cols, x, beta, hackCount); } void CONCAT(_,GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL)) (spgpuHandle_t handle, VALUE_TYPE* z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const VALUE_TYPE *x, VALUE_TYPE beta) { dim3 block (THREAD_BLOCK); dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK); int hackCount = (rows + hackSize - 1)/hackSize; #ifdef ENABLE_CACHE bind_tex_x ((const TEX_FETCH_TYPE *) x); #endif cudaFuncSetCacheConfig(CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn), cudaFuncCachePreferL1); cudaFuncSetCacheConfig(CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn_b0), cudaFuncCachePreferL1); if (CONCAT(VALUE_TYPE, _isNotZero(beta))) CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn) <<< grid, block, block.x*sizeof(int), handle->currentStream >>> (z, y, alpha, dM, offsets, hackSize, hackOffsets, rows, cols, x, beta, hackCount); else CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn_b0) <<< grid, block, block.x*sizeof(int), handle->currentStream >>> (z, y, alpha, dM, offsets, hackSize, hackOffsets, rows, cols, x, hackCount); #ifdef ENABLE_CACHE unbind_tex_x ((const TEX_FETCH_TYPE *) x); #endif }