/*
 * spGPU - Sparse matrices on GPU library.
 *
 * Copyright (C) 2010 - 2015
 *     Davide Barbieri - University of Rome Tor Vergata
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 3 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

__device__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _ridx_4_noRs)
(int i, VALUE_TYPE yVal, int outRow,
 VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha,
 const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch,
 int maxNnzPerRow, int rows, const VALUE_TYPE *x,
 VALUE_TYPE beta, int baseIndex)
{
    VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();

    __shared__ VALUE_TYPE temp[2][THREAD_BLOCK+1];

    if (i < rows) {
        rP += i;
        cM += i;

        // Distribute the row entries over the four threadIdx.y lanes:
        // the first (maxNnzPerRow % 4) lanes process one extra entry.
        int rowSizeM = maxNnzPerRow / 4;

        if ((maxNnzPerRow % 4) > threadIdx.y)
            ++rowSizeM;

        rP += threadIdx.y*rPPitch;
        cM += threadIdx.y*cMPitch;

        for (int j = 0; j < rowSizeM; j++) {
            int pointer;
            VALUE_TYPE value;
            VALUE_TYPE fetch;

            pointer = rP[0] - baseIndex;
            rP += 4*rPPitch;

            value = cM[0];
            cM += 4*cMPitch;

#ifdef ENABLE_CACHE
            fetch = fetchTex(pointer);
#else
            fetch = x[pointer];
#endif
            // avoid MAD on pre-Fermi
            zProd = CONCAT(VALUE_TYPE, _fma)(value, fetch, zProd);
        }

        // Reduction: lanes 2 and 3 publish their partial sums...
        if (threadIdx.y > 1)
            temp[threadIdx.y - 2][threadIdx.x] = zProd;
    }

    __syncthreads();

    if (i < rows) {
        // ...lanes 0 and 1 fold them in, then lane 1 publishes its sum...
        if (threadIdx.y <= 1)
            zProd = CONCAT(VALUE_TYPE, _add)(zProd, temp[threadIdx.y][threadIdx.x]);
        if (threadIdx.y == 1)
            temp[1][threadIdx.x] = zProd;
    }

    __syncthreads();

    if (i < rows) {
        // ...and lane 0 produces the final dot product and writes z.
        if (threadIdx.y == 0) {
            zProd = CONCAT(VALUE_TYPE, _add)(zProd, temp[1][threadIdx.x]);

            // Since z and y are accessed with the same offset by the same thread,
            // and the write to z follows the y read, y and z can share the same
            // base address (in-place computing).
            if (CONCAT(VALUE_TYPE, _isNotZero)(beta))
                z[outRow] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, zProd));
            else
                z[outRow] = CONCAT(VALUE_TYPE, _mul)(alpha, zProd);
        }
    }
}

__device__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _ridx_2_noRs)
(int i, VALUE_TYPE yVal, int outRow,
 VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha,
 const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch,
 int maxNnzPerRow, int rows, const VALUE_TYPE *x,
 VALUE_TYPE beta, int baseIndex)
{
    VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();

    __shared__ VALUE_TYPE temp[THREAD_BLOCK];

    if (i < rows) {
        rP += i;
        cM += i;

        // Split the row between the two threadIdx.y lanes: lane 0 takes the
        // even entries (plus the leftover one, if any), lane 1 the odd ones.
        int rowSizeM = maxNnzPerRow / 2;

        if (threadIdx.y == 0) {
            if (maxNnzPerRow % 2)
                ++rowSizeM;
        } else {
            rP += rPPitch;
            cM += cMPitch;
        }

        for (int j = 0; j < rowSizeM; j++) {
            int pointer;
            VALUE_TYPE value;
            VALUE_TYPE fetch;

            pointer = rP[0] - baseIndex;
            rP += 2*rPPitch;

            value = cM[0];
            cM += 2*cMPitch;

#ifdef ENABLE_CACHE
            fetch = fetchTex(pointer);
#else
            fetch = x[pointer];
#endif
            // avoid MAD on pre-Fermi
            zProd = CONCAT(VALUE_TYPE, _fma)(value, fetch, zProd);
        }

        // Reduction: lane 1 publishes its partial sum...
        if (threadIdx.y == 1)
            temp[threadIdx.x] = zProd;
    }

    __syncthreads();

    if (i < rows) {
        // ...and lane 0 folds it in and writes the result.
        if (threadIdx.y == 0) {
            zProd = CONCAT(VALUE_TYPE, _add)(zProd, temp[threadIdx.x]);

            // Since z and y are accessed with the same offset by the same thread,
            // and the write to z follows the y read, y and z can share the same
            // base address (in-place computing).
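/*
 * Note on the multi-lane variants above: with blockDim.y == 4 (or 2), each
 * matrix row is strip-mined across the threadIdx.y lanes, lane k accumulating
 * the entries at positions k, k + blockDim.y, k + 2*blockDim.y, ... of its
 * row. The partial sums are then folded through the shared `temp` buffer and
 * only lane 0 writes the final result. For example, with maxNnzPerRow == 5
 * and blockDim.y == 2, lane 0 processes entries {0, 2, 4} (rowSizeM == 3)
 * while lane 1 processes entries {1, 3} (rowSizeM == 2).
 */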
            if (CONCAT(VALUE_TYPE, _isNotZero)(beta))
                z[outRow] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, zProd));
            else
                z[outRow] = CONCAT(VALUE_TYPE, _mul)(alpha, zProd);
        }
    }
}

__device__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _ridx_noRs)
(int i, VALUE_TYPE yVal, int outRow,
 VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha,
 const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch,
 int maxNnzPerRow, int rows, const VALUE_TYPE *x,
 VALUE_TYPE beta, int baseIndex)
{
    VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();

    if (i < rows) {
        rP += i;
        cM += i;

#ifdef USE_PREFETCHING
        // Unroll by two: issue both index/value loads before either fetch of x.
        for (int j = 0; j < maxNnzPerRow / 2; j++) {
            int pointers1, pointers2;
            VALUE_TYPE values1, values2;
            VALUE_TYPE fetches1, fetches2;

            pointers1 = rP[0] - baseIndex;
            rP += rPPitch;
            pointers2 = rP[0] - baseIndex;
            rP += rPPitch;

            values1 = cM[0];
            cM += cMPitch;
            values2 = cM[0];
            cM += cMPitch;

#ifdef ENABLE_CACHE
            fetches1 = fetchTex(pointers1);
            fetches2 = fetchTex(pointers2);
#else
            fetches1 = x[pointers1];
            fetches2 = x[pointers2];
#endif
            // avoid MAD on pre-Fermi
            zProd = CONCAT(VALUE_TYPE, _fma)(values1, fetches1, zProd);
            zProd = CONCAT(VALUE_TYPE, _fma)(values2, fetches2, zProd);
        }

        // odd row size
        if (maxNnzPerRow % 2) {
            int pointer = rP[0] - baseIndex;
            VALUE_TYPE value = cM[0];
            VALUE_TYPE fetch;

#ifdef ENABLE_CACHE
            fetch = fetchTex(pointer);
#else
            fetch = x[pointer];
#endif
            zProd = CONCAT(VALUE_TYPE, _fma)(value, fetch, zProd);
        }
#else
        for (int j = 0; j < maxNnzPerRow; j++) {
            int pointer;
            VALUE_TYPE value;
            VALUE_TYPE fetch;

            pointer = rP[0] - baseIndex;
            rP += rPPitch;

            value = cM[0];
            cM += cMPitch;

#ifdef ENABLE_CACHE
            fetch = fetchTex(pointer);
#else
            fetch = x[pointer];
#endif
            zProd = CONCAT(VALUE_TYPE, _fma)(value, fetch, zProd);
        }
#endif

        // Since z and y are accessed with the same offset by the same thread,
        // and the write to z follows the y read, y and z can share the same
        // base address (in-place computing).
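/*
 * The USE_PREFETCHING path above unrolls the dot product by two, issuing both
 * index/value loads before either fetch of x so that two global memory
 * transactions can be in flight at once; the trailing (maxNnzPerRow % 2) test
 * picks up the last entry of odd-sized rows. When ENABLE_CACHE is defined,
 * x is read through fetchTex(), i.e. via the texture cache, which tends to
 * tolerate the irregular, rP-driven access pattern better than plain global
 * loads on older (pre-Fermi) hardware.
 */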
        if (CONCAT(VALUE_TYPE, _isNotZero)(beta))
            z[outRow] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, zProd));
        else
            z[outRow] = CONCAT(VALUE_TYPE, _mul)(alpha, zProd);
    }
}

__global__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn_ridx_noRs)
(VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha,
 const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch,
 const int* rIdx, int maxNnzPerRow, int rows, const VALUE_TYPE *x,
 VALUE_TYPE beta, int baseIndex)
{
    int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);

    VALUE_TYPE yVal = CONCAT(zero_,VALUE_TYPE)();
    int outRow = 0;

    if (i < rows) {
        // Scattered output: the destination row comes from rIdx.
        outRow = rIdx[i];
        if (CONCAT(VALUE_TYPE, _isNotZero)(beta))
            yVal = y[outRow];
    }

    if (blockDim.y == 1)
        CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _ridx_noRs)
            (i, yVal, outRow, z, y, alpha, cM, rP, cMPitch, rPPitch,
             maxNnzPerRow, rows, x, beta, baseIndex);
    else //if (blockDim.y == 2)
        CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _ridx_2_noRs)
            (i, yVal, outRow, z, y, alpha, cM, rP, cMPitch, rPPitch,
             maxNnzPerRow, rows, x, beta, baseIndex);
    /*
    else if (blockDim.y == 4)
        CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _ridx_4_noRs)
            (i, yVal, outRow, z, y, alpha, cM, rP, cMPitch, rPPitch,
             maxNnzPerRow, rows, x, beta, baseIndex);
    */
}

__device__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _noRs)
(VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha,
 const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch,
 int maxNnzPerRow, int rows, const VALUE_TYPE *x,
 VALUE_TYPE beta, int baseIndex)
{
    int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);

    VALUE_TYPE yVal = CONCAT(zero_,VALUE_TYPE)();

    if (i < rows) {
        if (CONCAT(VALUE_TYPE, _isNotZero)(beta))
            yVal = y[i];
    }

    if (blockDim.y == 1)
        CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _ridx_noRs)
            (i, yVal, i, z, y, alpha, cM, rP, cMPitch, rPPitch,
             maxNnzPerRow, rows, x, beta, baseIndex);
    else //if (blockDim.y == 2)
        CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _ridx_2_noRs)
            (i, yVal, i, z, y, alpha, cM, rP, cMPitch, rPPitch,
             maxNnzPerRow, rows, x, beta, baseIndex);
    /*
    else if (blockDim.y == 4)
        CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _ridx_4_noRs)
            (i, yVal, i, z, y, alpha, cM, rP, cMPitch, rPPitch,
             maxNnzPerRow, rows, x, beta, baseIndex);
    */
}

// Separate beta == 0 entry point: forces the compiler (llvm) to recompile
// and optimize this instance with a constant zero beta.
__global__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn_b0_noRs)
(VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha,
 const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch,
 int maxNnzPerRow, int rows, const VALUE_TYPE *x, int baseIndex)
{
    CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _noRs)
        (z, y, alpha, cM, rP, cMPitch, rPPitch, maxNnzPerRow, rows, x,
         CONCAT(zero_,VALUE_TYPE)(), baseIndex);
}

__global__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn_noRs)
(VALUE_TYPE *z, const VALUE_TYPE *y, VALUE_TYPE alpha,
 const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch,
 int maxNnzPerRow, int rows, const VALUE_TYPE *x,
 VALUE_TYPE beta, int baseIndex)
{
    CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _noRs)
        (z, y, alpha, cM, rP, cMPitch, rPPitch, maxNnzPerRow, rows, x,
         beta, baseIndex);
}
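/*
 * A minimal host-side launch sketch. This is illustrative only: the
 * instantiated kernel name `spgpuDellspmv_krn_noRs` assumes VALUE_TYPE ==
 * double with TYPE_SYMBOL == D, and the launch configuration is an example,
 * not part of this file.
 *
 *     dim3 block(THREAD_BLOCK, 2);  // blockDim.y selects the 2-lane variant
 *     dim3 grid((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
 *     spgpuDellspmv_krn_noRs<<<grid, block>>>(z, y, alpha, cM, rP,
 *         cMPitch, rPPitch, maxNnzPerRow, rows, x, beta, baseIndex);
 *
 * With blockDim.y == 1 the serial per-row variant (_ridx_noRs) runs instead;
 * any other value currently falls through to the 2-lane variant.
 */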