psblas3/cuda/spgpu/kernels/hdia_spmv_base_template.cuh

/* 
 * spGPU - Sparse matrices on GPU library.
 * 
 * Copyright (C) 2010 - 2015
 *     Davide Barbieri - University of Rome Tor Vergata
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 3 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
 
// Threads per block used by the host launcher in this file. The launcher also
// sizes the dynamic shared memory from it (one int per thread), and the device
// code indexes that buffer as warpId*warpSize + laneId.
#define THREAD_BLOCK 128

/*
 * Per-thread body of the HDIA sparse matrix-vector product.
 *
 * Thread i (global 1D index) accumulates, over the diagonals of the hack that
 * row i belongs to, the products dM[...] * x[offset + i] for every column
 * offset + i that lies in [0, cols), and finally writes
 *     z[i] = _fma(beta, y[i], _mul(alpha, zProd))   when beta is non-zero,
 *     z[i] = _mul(alpha, zProd)                     otherwise.
 * y is only read when beta is non-zero, so z and y may alias.
 *
 * Parameters:
 *   z, y        output / input vectors, indexed by row.
 *   alpha, beta scalars applied as described above.
 *   dM          diagonal values; diagonal d of a hack starts at
 *               (hackOffsets[hack] + d) * hackSize, one value per hack lane.
 *   offsets     column offset of each stored diagonal (column = offset + row).
 *   hackSize    number of rows per hack.
 *   hackOffsets hackOffsets[h] .. hackOffsets[h+1] delimit the diagonals of hack h.
 *   rows, cols  matrix dimensions.
 *   x           input vector (read through fetchTex when ENABLE_CACHE is defined).
 *   hackCount   number of hacks; threads whose hack index is >= hackCount exit.
 *
 * Launch requirements visible from this code:
 *   - 1D grid/block; dynamic shared memory of at least blockDim.x ints.
 *   - The hack offsets are loaded by lane 0 and broadcast to the whole warp, so
 *     every lane of a warp is assumed to belong to the same hack
 *     (NOTE(review): i.e. hackSize is presumably a multiple of warpSize - confirm
 *     against the code that builds the HDIA format).
 */
__device__ void
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _)
(VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets, 
	int rows, int cols, const VALUE_TYPE *x, VALUE_TYPE beta, int hackCount)
{
	int i = threadIdx.x + blockIdx.x * (blockDim.x);
	
	VALUE_TYPE yVal = CONCAT(zero_,VALUE_TYPE)();

	// y is read up-front so that z may share its storage (in-place update).
	if (i < rows && CONCAT(VALUE_TYPE, _isNotZero(beta)))
		yVal = y[i];

	VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
	
	int hackId = i / hackSize;
	int hackLaneId = i % hackSize;
	
	
	// shared between offsetsChunks and warpHackOffsetTemp
	extern __shared__ int dynShrMem[]; 

	int hackOffset = 0;
	int nextOffset = 0;
	
	unsigned int laneId = threadIdx.x % warpSize;
	unsigned int warpId = threadIdx.x / warpSize;
	
	// Lane 0 of each warp loads the hack boundaries and broadcasts them to the
	// other lanes; the mechanism depends on the target architecture.
#if __CUDA_ARCH__ < 300	
	// No shuffle instructions: broadcast through shared memory.
	int* warpHackOffset = dynShrMem;


	if (laneId == 0 && i < rows)
	{
		warpHackOffset[warpId] = hackOffsets[hackId];
		warpHackOffset[warpId + (blockDim.x / warpSize)] = hackOffsets[hackId+1];
	}
	
	__syncthreads();
	hackOffset = warpHackOffset[warpId];
	nextOffset = warpHackOffset[warpId + blockDim.x / warpSize];
	// Second barrier: the same shared buffer is reused as offsetsChunk below.
	__syncthreads();
#elif __CUDA_ARCH__ < 700
	if (laneId == 0 && i < rows)
	{
		hackOffset = hackOffsets[hackId];
		nextOffset = hackOffsets[hackId+1];
	}
	
	hackOffset = __shfl(hackOffset, 0);	
	nextOffset = __shfl(nextOffset, 0);
#else
	if (laneId == 0 && i < rows)
	{
		hackOffset = hackOffsets[hackId];
		nextOffset = hackOffsets[hackId+1];
	}
	
	hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0);	
	nextOffset = __shfl_sync(0xFFFFFFFF,nextOffset, 0);
	
#endif
	
	if (hackId >= hackCount)
		return;

	// Widen before multiplying: hackOffset*hackSize can exceed the int range
	// for large matrices.
	dM += (size_t)hackOffset*hackSize + hackLaneId;
	offsets += hackOffset;
	
	// diags for this hack is next hackOffset minus current hackOffset
	int diags = nextOffset - hackOffset;
	
	
	// Warp oriented: each round stages up to warpSize diagonal offsets in the
	// warp's private slice of shared memory, then every lane walks them.
	int rounds = (diags + warpSize - 1)/warpSize;
	
	volatile int *offsetsChunk = dynShrMem + warpId*warpSize;
	
	for (int r = 0; r < rounds; r++)
	{
		// in the last round diags will be <= warpSize
		if (laneId < diags)
			offsetsChunk[laneId] = offsets[laneId];

#if __CUDA_ARCH__ >= 700
		// With independent thread scheduling the lanes are not guaranteed to
		// run in lockstep: make the staged offsets visible to the whole warp
		// before any lane reads them.
		__syncwarp();
#endif
	
		if (i < rows)
		{
			int count = min(diags, warpSize);

#ifdef USE_PREFETCHING	
			int j;
			for (j=0; j<=count-2; j += 2)
			{
				// prefetch 2 values
				int column1 = offsetsChunk[j] + i;
				int column2 = offsetsChunk[j+1] + i;			

				VALUE_TYPE xValue1, xValue2;
				VALUE_TYPE mValue1, mValue2;
				
				bool inside1 = column1 >= 0 && column1 < cols;
				bool inside2 = column2 >= 0 && column2 < cols;
				
				if(inside1)
				{
					mValue1 = dM[0];
#ifdef ENABLE_CACHE
					xValue1 = fetchTex (column1);
#else
					xValue1 = x[column1];
#endif				
				}
				
				dM += hackSize;
							
				if(inside2)
				{
					mValue2 = dM[0];
#ifdef ENABLE_CACHE
					xValue2 = fetchTex (column2);
#else
					xValue2 = x[column2];
#endif					
				}

				dM += hackSize;					
											
				// The m/x values are only defined (and only used) when the
				// corresponding column is inside the matrix.
				if(inside1)
					zProd = CONCAT(VALUE_TYPE, _fma)(mValue1, xValue1, zProd);
				if(inside2)
					zProd = CONCAT(VALUE_TYPE, _fma)(mValue2, xValue2, zProd);
			}
	
			// Remaining diagonal when count is odd.
			for (;j<count; ++j)
			{
				int column = offsetsChunk[j] + i;
				
				if(column >= 0 && column < cols)
				{
					VALUE_TYPE xValue;
#ifdef ENABLE_CACHE
					xValue = fetchTex (column);
#else
					xValue = x[column];
#endif				
					VALUE_TYPE mValue = dM[0];
					zProd = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, zProd);				
				}
				
				dM += hackSize;
			}
#else
			for (int j=0;j<count; ++j)
			{
				int column = offsetsChunk[j] + i;
				
				if(column >= 0 && column < cols)
				{
					VALUE_TYPE xValue;
#ifdef ENABLE_CACHE
					xValue = fetchTex (column);
#else
					xValue = x[column];
#endif				
					VALUE_TYPE mValue = dM[0];
					zProd = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, zProd);				
				}
				
				// Advance to the same hack lane of the next stored diagonal.
				dM += hackSize;
			}
#endif
	
		}

#if __CUDA_ARCH__ >= 700
		// All lanes must be done reading this round's offsets before the next
		// round overwrites the shared slice.
		__syncwarp();
#endif
	
		diags -= warpSize;
		offsets += warpSize;
	}


	// Since z and y are accessed with the same offset by the same thread,
	// and the write to z follows the y read, y and z can share the same base address (in-place computing).
	
	if (i >= rows)
		return;
	
	if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
		z[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha, zProd));
	else
		z[i] = CONCAT(VALUE_TYPE, _mul)(alpha, zProd);
}

// Kernel entry point for the beta == 0 case.
// It forwards to the shared device routine with a zero beta built in the
// kernel itself rather than taken from a kernel argument; this exists to force
// a separate compilation/optimization of the routine (original note:
// "Force to recompile and optimize with llvm").
// y is forwarded unchanged; the device routine only reads it when beta is non-zero.
__global__ void
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn_b0)
(VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const VALUE_TYPE *x, int hackCount)
{
	const VALUE_TYPE zeroBeta = CONCAT(zero_,VALUE_TYPE)();

	CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _)
		(z, y, alpha,
		 dM, offsets, hackSize, hackOffsets,
		 rows, cols, x,
		 zeroBeta, hackCount);
}

// General kernel entry point: forwards every argument, including the
// caller-supplied beta, to the shared device routine.
__global__ void
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)
(VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const VALUE_TYPE *x, VALUE_TYPE beta, int hackCount)
{
	CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _)
		(z, y, alpha,
		 dM, offsets, hackSize, hackOffsets,
		 rows, cols, x,
		 beta, hackCount);
}

/*
 * Host-side launcher for the HDIA sparse matrix-vector product.
 *
 * Launches one thread per row in blocks of THREAD_BLOCK threads on
 * handle->currentStream, with one int of dynamic shared memory per thread.
 * The beta == 0 case is dispatched to the dedicated _krn_b0 kernel, every
 * other case to _krn. When ENABLE_CACHE is defined, x is bound to the texture
 * before the launch and unbound afterwards.
 *
 * Parameters mirror the kernels: z/y output and input vectors, alpha/beta
 * scalars, dM/offsets/hackSize/hackOffsets the HDIA storage, rows/cols the
 * matrix dimensions, x the input vector.
 *
 * The call returns without launching anything when rows <= 0 or
 * hackSize <= 0: a zero-sized grid is an invalid launch configuration and a
 * non-positive hackSize would divide by zero below.
 */
void
CONCAT(_,GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL))
(spgpuHandle_t handle, VALUE_TYPE* z, const VALUE_TYPE *y, VALUE_TYPE alpha, 
	const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols,
	const VALUE_TYPE *x, VALUE_TYPE beta)
{
	// Empty or malformed problem: nothing to compute, and launching would fail.
	if (rows <= 0 || hackSize <= 0)
		return;

	// Ceil-divisions written as (n - 1)/d + 1 (valid because rows >= 1) so
	// that they cannot overflow int for very large row counts.
	dim3 block (THREAD_BLOCK);
	dim3 grid ((rows - 1) / THREAD_BLOCK + 1);

	int hackCount = (rows - 1)/hackSize + 1;

	// One int per thread: each warp stages up to warpSize diagonal offsets.
	const size_t shrMemBytes = block.x*sizeof(int);
	
#ifdef ENABLE_CACHE
	bind_tex_x ((const TEX_FETCH_TYPE *) x);
#endif

	cudaFuncSetCacheConfig(CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn), cudaFuncCachePreferL1);
	cudaFuncSetCacheConfig(CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn_b0), cudaFuncCachePreferL1);

	if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
		CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn) <<< grid, block, shrMemBytes, handle->currentStream >>> (z, y, alpha, dM, offsets, hackSize, hackOffsets, rows, cols, x, beta, hackCount);
	else
		CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn_b0) <<< grid, block, shrMemBytes, handle->currentStream >>> (z, y, alpha, dM, offsets, hackSize, hackOffsets, rows, cols, x, hackCount);

#ifdef ENABLE_CACHE
  	unbind_tex_x ((const TEX_FETCH_TYPE *) x);
#endif

}
Rename GPU into cuda, and merge SPGPU code. 1 year ago			`/*`
			`* spGPU - Sparse matrices on GPU library.`
			`*`
			`* Copyright (C) 2010 - 2015`
			`* Davide Barbieri - University of Rome Tor Vergata`
			`*`
			`* This program is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU General Public License`
			`* version 3 as published by the Free Software Foundation.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*/`

			`#define THREAD_BLOCK 128`

			`__device__ void`
			`CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _)`
			`(VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets,`
			`int rows, int cols, const VALUE_TYPE *x, VALUE_TYPE beta, int hackCount)`
			`{`
			`int i = threadIdx.x + blockIdx.x * (blockDim.x);`

			`VALUE_TYPE yVal = CONCAT(zero_,VALUE_TYPE)();`

			`if (i < rows && CONCAT(VALUE_TYPE, _isNotZero(beta)))`
			`yVal = y[i];`

			`VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();`

			`int hackId = i / hackSize;`
			`int hackLaneId = i % hackSize;`


			`// shared between offsetsChunks and warpHackOffsetTemp`
			`extern __shared__ int dynShrMem[];`

			`int hackOffset = 0;`
			`int nextOffset = 0;`

			`unsigned int laneId = threadIdx.x % warpSize;`
			`unsigned int warpId = threadIdx.x / warpSize;`

			`#if __CUDA_ARCH__ < 300`
			`int* warpHackOffset = dynShrMem;`


			`if (laneId == 0 && i < rows)`
			`{`
			`warpHackOffset[warpId] = hackOffsets[hackId];`
			`warpHackOffset[warpId + (blockDim.x / warpSize)] = hackOffsets[hackId+1];`
			`}`

			`__syncthreads();`
			`hackOffset = warpHackOffset[warpId];`
			`nextOffset = warpHackOffset[warpId + blockDim.x / warpSize];`
			`__syncthreads();`
			`#elif __CUDA_ARCH__ < 700`
			`if (laneId == 0 && i < rows)`
			`{`
			`hackOffset = hackOffsets[hackId];`
			`nextOffset = hackOffsets[hackId+1];`
			`}`

			`hackOffset = __shfl(hackOffset, 0);`
			`nextOffset = __shfl(nextOffset, 0);`
			`#else`
			`if (laneId == 0 && i < rows)`
			`{`
			`hackOffset = hackOffsets[hackId];`
			`nextOffset = hackOffsets[hackId+1];`
			`}`

			`hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0);`
			`nextOffset = __shfl_sync(0xFFFFFFFF,nextOffset, 0);`

			`#endif`

			`if (hackId >= hackCount)`
			`return;`

			`dM += hackOffset*hackSize + hackLaneId;`
			`offsets += hackOffset;`

			`// diags for this hack is next hackOffset minus current hackOffset`
			`int diags = nextOffset - hackOffset;`


			`// Warp oriented`
			`int rounds = (diags + warpSize - 1)/warpSize;`

			`volatile int *offsetsChunk = dynShrMem + warpId*warpSize;`

			`for (int r = 0; r < rounds; r++)`
			`{`
			`// in the last round diags will be <= warpSize`
			`if (laneId < diags)`
			`offsetsChunk[laneId] = offsets[laneId];`

			`if (i < rows)`
			`{`
			`int count = min(diags, warpSize);`

			`#ifdef USE_PREFETCHING`
			`int j;`
			`for (j=0; j<=count-2; j += 2)`
			`{`
			`// prefetch 2 values`
			`int column1 = offsetsChunk[j] + i;`
			`int column2 = offsetsChunk[j+1] + i;`

			`VALUE_TYPE xValue1, xValue2;`
			`VALUE_TYPE mValue1, mValue2;`

			`bool inside1 = column1 >= 0 && column1 < cols;`
			`bool inside2 = column2 >= 0 && column2 < cols;`

			`if(inside1)`
			`{`
			`mValue1 = dM[0];`
			`#ifdef ENABLE_CACHE`
			`xValue1 = fetchTex (column1);`
			`#else`
			`xValue1 = x[column1];`
			`#endif`
			`}`

			`dM += hackSize;`

			`if(inside2)`
			`{`
			`mValue2 = dM[0];`
			`#ifdef ENABLE_CACHE`
			`xValue2 = fetchTex (column2);`
			`#else`
			`xValue2 = x[column2];`
			`#endif`
			`}`

			`dM += hackSize;`

			`if(inside1)`
			`zProd = CONCAT(VALUE_TYPE, _fma)(mValue1, xValue1, zProd);`
			`if(inside2)`
			`zProd = CONCAT(VALUE_TYPE, _fma)(mValue2, xValue2, zProd);`
			`}`

			`for (;j<count; ++j)`
			`{`
			`int column = offsetsChunk[j] + i;`

			`if(column >= 0 && column < cols)`
			`{`
			`VALUE_TYPE xValue;`
			`#ifdef ENABLE_CACHE`
			`xValue = fetchTex (column);`
			`#else`
			`xValue = x[column];`
			`#endif`
			`VALUE_TYPE mValue = dM[0];`
			`zProd = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, zProd);`
			`}`

			`dM += hackSize;`
			`}`
			`#else`
			`for (int j=0;j<count; ++j)`
			`{`
			`int column = offsetsChunk[j] + i;`

			`if(column >= 0 && column < cols)`
			`{`
			`VALUE_TYPE xValue;`
			`#ifdef ENABLE_CACHE`
			`xValue = fetchTex (column);`
			`#else`
			`xValue = x[column];`
			`#endif`
			`VALUE_TYPE mValue = dM[0];`
			`zProd = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, zProd);`
			`}`

			`dM += hackSize;`
			`}`
			`#endif`

			`}`

			`diags -= warpSize;`
			`offsets += warpSize;`
			`}`


			`// Since z and y are accessed with the same offset by the same thread,`
			`// and the write to z follows the y read, y and z can share the same base address (in-place computing).`

			`if (i >= rows)`
			`return;`

			`if (CONCAT(VALUE_TYPE, _isNotZero(beta)))`
			`z[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha, zProd));`
			`else`
			`z[i] = CONCAT(VALUE_TYPE, _mul)(alpha, zProd);`
			`}`

			`// Force to recompile and optimize with llvm`
			`__global__ void`
			`CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn_b0)`
			`(VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const VALUE_TYPE *x, int hackCount)`
			`{`
			`CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _)`
			`(z, y, alpha, dM, offsets, hackSize, hackOffsets, rows, cols, x, CONCAT(zero_,VALUE_TYPE)(), hackCount);`
			`}`

			`__global__ void`
			`CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)`
			`(VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha, const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const VALUE_TYPE *x, VALUE_TYPE beta, int hackCount)`
			`{`
			`CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _)`
			`(z, y, alpha, dM, offsets, hackSize, hackOffsets, rows, cols, x, beta, hackCount);`
			`}`

			`void`
			`CONCAT(_,GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL))`
			`(spgpuHandle_t handle, VALUE_TYPE* z, const VALUE_TYPE *y, VALUE_TYPE alpha,`
			`const VALUE_TYPE* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols,`
			`const VALUE_TYPE *x, VALUE_TYPE beta)`
			`{`
			`dim3 block (THREAD_BLOCK);`
			`dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);`

			`int hackCount = (rows + hackSize - 1)/hackSize;`

			`#ifdef ENABLE_CACHE`
			`bind_tex_x ((const TEX_FETCH_TYPE *) x);`
			`#endif`

			`cudaFuncSetCacheConfig(CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn), cudaFuncCachePreferL1);`
			`cudaFuncSetCacheConfig(CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn_b0), cudaFuncCachePreferL1);`

			`if (CONCAT(VALUE_TYPE, _isNotZero(beta)))`
			`CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn) <<< grid, block, block.x*sizeof(int), handle->currentStream >>> (z, y, alpha, dM, offsets, hackSize, hackOffsets, rows, cols, x, beta, hackCount);`
			`else`
			`CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn_b0) <<< grid, block, block.x*sizeof(int), handle->currentStream >>> (z, y, alpha, dM, offsets, hackSize, hackOffsets, rows, cols, x, hackCount);`

			`#ifdef ENABLE_CACHE`
			`unbind_tex_x ((const TEX_FETCH_TYPE *) x);`
			`#endif`

			`}`