psblas3/cuda/spgpu/kernels/axy_base.cuh

/*
 * spGPU - Sparse matrices on GPU library.
 * 
 * Copyright (C) 2010 - 2014
 *     Salvatore Filippone - University of Rome Tor Vergata
 *     Davide Barbieri - University of Rome Tor Vergata
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 3 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */


#define PRE_CONCAT(A, B) A ## B
#define CONCAT(A, B) PRE_CONCAT(A, B)

#undef GEN_SPGPU_FUNC_NAME
#define GEN_SPGPU_FUNC_NAME(x) CONCAT(CONCAT(spgpu,x),axy)
#define GEN_SPGPU_MFUNC_NAME(x) CONCAT(CONCAT(spgpu,x),maxy)
#define GEN_SPGPU_FUNC_NAME_2(x) CONCAT(CONCAT(spgpu,x),axypbz)
#define GEN_SPGPU_MFUNC_NAME_2(x) CONCAT(CONCAT(spgpu,x),maxypbz)
#define GEN_SPGPU_SCAL_NAME(x) CONCAT(CONCAT(spgpu,x),scal)

#define BLOCK_SIZE 256

// Define:
//#define VALUE_TYPE
//#define TYPE_SYMBOL

#include "mathbase.cuh"

__global__ void 
CONCAT(GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL),_kern)
	(VALUE_TYPE *z, int n, VALUE_TYPE alpha, VALUE_TYPE* x, VALUE_TYPE* y)
{
	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
	
	if (id < n)
	{
		// Since z, x and y are accessed with the same offset by the same thread,
		// and the write to z follows the x and y reads, x, y and z can share the same base address (in-place computing).

		z[id] = CONCAT(VALUE_TYPE, _mul)(alpha, CONCAT(VALUE_TYPE, _mul)(x[id], y[id]));
	}
}

void 
CONCAT(GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL),_)
	(spgpuHandle_t handle, VALUE_TYPE *z, int n, VALUE_TYPE alpha, VALUE_TYPE* x, VALUE_TYPE* y)
{
	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;

	dim3 block(BLOCK_SIZE);
	dim3 grid(msize);

	CONCAT(GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL),_kern)<<<grid, block, 0, handle->currentStream>>>(z, n, alpha, x, y);

}

void 
GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
	__device VALUE_TYPE *z,
	int n,
	VALUE_TYPE alpha,
	__device VALUE_TYPE *x,
	__device VALUE_TYPE *y)
{
	int maxNForACall = max(handle->maxGridSizeX, BLOCK_SIZE*handle->maxGridSizeX);
	while (n > maxNForACall) //managing large vectors
	{
		CONCAT(GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL),_)
			(handle, z, maxNForACall, alpha, x, y);
		x = x + maxNForACall;
		y = y + maxNForACall;
		z = z + maxNForACall;
		n -= maxNForACall;
	}
	
    	CONCAT(GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL),_) (handle, z, n, alpha, x, y);
	cudaCheckError("CUDA error on axy");
}

void 
GEN_SPGPU_MFUNC_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
	__device VALUE_TYPE *z,
	int n,
	VALUE_TYPE alpha,
	__device VALUE_TYPE* x,
	__device VALUE_TYPE *y,
	int count,
	int pitch)
{
	for (int i=0; i<count; i++)
	{
    		GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL)(handle, z, n, alpha, x, y);
		
		x += pitch;
		y += pitch;
		z += pitch;
	}
}

__global__ void 
CONCAT(GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL),_kern)
(VALUE_TYPE *w, int n, VALUE_TYPE beta, VALUE_TYPE* z, VALUE_TYPE alpha, VALUE_TYPE* x, VALUE_TYPE* y)
{
	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
	
	if (id < n)
	{
		// Since w, x and y and z are accessed with the same offset by the same thread,
		// and the write to z follows the x, y and z reads, x, y, z and w can share the same base address (in-place computing).
		w[id] = CONCAT(VALUE_TYPE, _fma) (
				alpha,
				CONCAT(VALUE_TYPE, _mul)(x[id],y[id]), 
				CONCAT(VALUE_TYPE, _mul)(beta,z[id]));
	}
}


void 
CONCAT(GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL),_)
	(spgpuHandle_t handle, VALUE_TYPE *w, int n, VALUE_TYPE beta, VALUE_TYPE* z, VALUE_TYPE alpha, VALUE_TYPE* x, VALUE_TYPE* y)
{
	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;

	dim3 block(BLOCK_SIZE);
	dim3 grid(msize);
	
	CONCAT(GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL),_kern)
		<<<grid, block, 0, handle->currentStream>>>(w, n, beta, z, alpha, x, y);

}

void
GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL)
	(spgpuHandle_t handle,
	__device VALUE_TYPE *w,
	int n,
	VALUE_TYPE beta,
	__device VALUE_TYPE *z,
	VALUE_TYPE alpha,
	__device VALUE_TYPE* x,
	__device VALUE_TYPE *y
	)
{

	if (CONCAT(VALUE_TYPE, _isZero(alpha)))
	{
		GEN_SPGPU_SCAL_NAME(TYPE_SYMBOL)
			(handle, w, n, beta, z);
	}
	else if (CONCAT(VALUE_TYPE, _isZero(beta))) {
		GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL)
			(handle, w, n, alpha, x, y);
	} 
	else {
		int maxNForACall = max(handle->maxGridSizeX, BLOCK_SIZE*handle->maxGridSizeX);
		
		while (n > maxNForACall) //managing large vectors
		{
			
			CONCAT(GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL),_)
				(handle, w, maxNForACall, beta, z, alpha, x, y);
	
			x = x + maxNForACall;
			y = y + maxNForACall;
			z = z + maxNForACall;
			w = w + maxNForACall;
			n -= maxNForACall;
		}
    
		CONCAT(GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL),_)
			(handle, w, n, beta, z, alpha, x, y);
    }	
  
	cudaCheckError("CUDA error on axypbz");
}

void
GEN_SPGPU_MFUNC_NAME_2(TYPE_SYMBOL)
	(spgpuHandle_t handle,
	__device VALUE_TYPE *w,
	int n,
	VALUE_TYPE beta,
	__device VALUE_TYPE *z,
	VALUE_TYPE alpha,
	__device VALUE_TYPE* x,
	__device VALUE_TYPE *y,
	int count,
	int pitch)
{
  for (int i=0; i<count; i++)
    {
	GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL)(handle, w, n, beta, z, alpha, x, y);
		
	x += pitch;
	y += pitch;
	z += pitch;
	w += pitch;
    }
}
Rename GPU into cuda, and merge SPGPU code. 1 year ago			`/*`
			`* spGPU - Sparse matrices on GPU library.`
			`*`
			`* Copyright (C) 2010 - 2014`
			`* Salvatore Filippone - University of Rome Tor Vergata`
			`* Davide Barbieri - University of Rome Tor Vergata`
			`*`
			`* This program is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU General Public License`
			`* version 3 as published by the Free Software Foundation.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*/`


			`#define PRE_CONCAT(A, B) A ## B`
			`#define CONCAT(A, B) PRE_CONCAT(A, B)`

			`#undef GEN_SPGPU_FUNC_NAME`
			`#define GEN_SPGPU_FUNC_NAME(x) CONCAT(CONCAT(spgpu,x),axy)`
			`#define GEN_SPGPU_MFUNC_NAME(x) CONCAT(CONCAT(spgpu,x),maxy)`
			`#define GEN_SPGPU_FUNC_NAME_2(x) CONCAT(CONCAT(spgpu,x),axypbz)`
			`#define GEN_SPGPU_MFUNC_NAME_2(x) CONCAT(CONCAT(spgpu,x),maxypbz)`
			`#define GEN_SPGPU_SCAL_NAME(x) CONCAT(CONCAT(spgpu,x),scal)`

			`#define BLOCK_SIZE 256`

			`// Define:`
			`//#define VALUE_TYPE`
			`//#define TYPE_SYMBOL`

			`#include "mathbase.cuh"`

			`__global__ void`
			`CONCAT(GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL),_kern)`
			`(VALUE_TYPE z, int n, VALUE_TYPE alpha, VALUE_TYPE x, VALUE_TYPE* y)`
			`{`
			`int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;`

			`if (id < n)`
			`{`
			`// Since z, x and y are accessed with the same offset by the same thread,`
			`// and the write to z follows the x and y reads, x, y and z can share the same base address (in-place computing).`

			`z[id] = CONCAT(VALUE_TYPE, _mul)(alpha, CONCAT(VALUE_TYPE, _mul)(x[id], y[id]));`
			`}`
			`}`

			`void`
			`CONCAT(GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL),_)`
			`(spgpuHandle_t handle, VALUE_TYPE z, int n, VALUE_TYPE alpha, VALUE_TYPE x, VALUE_TYPE* y)`
			`{`
			`int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;`

			`dim3 block(BLOCK_SIZE);`
			`dim3 grid(msize);`

			`CONCAT(GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL),_kern)<<<grid, block, 0, handle->currentStream>>>(z, n, alpha, x, y);`

			`}`

			`void`
			`GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL)`
			`(spgpuHandle_t handle,`
			`__device VALUE_TYPE *z,`
			`int n,`
			`VALUE_TYPE alpha,`
			`__device VALUE_TYPE *x,`
			`__device VALUE_TYPE *y)`
			`{`
			`int maxNForACall = max(handle->maxGridSizeX, BLOCK_SIZE*handle->maxGridSizeX);`
			`while (n > maxNForACall) //managing large vectors`
			`{`
			`CONCAT(GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL),_)`
			`(handle, z, maxNForACall, alpha, x, y);`
			`x = x + maxNForACall;`
			`y = y + maxNForACall;`
			`z = z + maxNForACall;`
			`n -= maxNForACall;`
			`}`

			`CONCAT(GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL),_) (handle, z, n, alpha, x, y);`
			`cudaCheckError("CUDA error on axy");`
			`}`

			`void`
			`GEN_SPGPU_MFUNC_NAME(TYPE_SYMBOL)`
			`(spgpuHandle_t handle,`
			`__device VALUE_TYPE *z,`
			`int n,`
			`VALUE_TYPE alpha,`
			`__device VALUE_TYPE* x,`
			`__device VALUE_TYPE *y,`
			`int count,`
			`int pitch)`
			`{`
			`for (int i=0; i<count; i++)`
			`{`
			`GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL)(handle, z, n, alpha, x, y);`

			`x += pitch;`
			`y += pitch;`
			`z += pitch;`
			`}`
			`}`

			`__global__ void`
			`CONCAT(GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL),_kern)`
			`(VALUE_TYPE w, int n, VALUE_TYPE beta, VALUE_TYPE z, VALUE_TYPE alpha, VALUE_TYPE* x, VALUE_TYPE* y)`
			`{`
			`int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;`

			`if (id < n)`
			`{`
			`// Since w, x and y and z are accessed with the same offset by the same thread,`
			`// and the write to z follows the x, y and z reads, x, y, z and w can share the same base address (in-place computing).`
			`w[id] = CONCAT(VALUE_TYPE, _fma) (`
			`alpha,`
			`CONCAT(VALUE_TYPE, _mul)(x[id],y[id]),`
			`CONCAT(VALUE_TYPE, _mul)(beta,z[id]));`
			`}`
			`}`



			`void`
			`CONCAT(GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL),_)`
			`(spgpuHandle_t handle, VALUE_TYPE w, int n, VALUE_TYPE beta, VALUE_TYPE z, VALUE_TYPE alpha, VALUE_TYPE* x, VALUE_TYPE* y)`
			`{`
			`int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;`

			`dim3 block(BLOCK_SIZE);`
			`dim3 grid(msize);`

			`CONCAT(GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL),_kern)`
			`<<<grid, block, 0, handle->currentStream>>>(w, n, beta, z, alpha, x, y);`

			`}`

			`void`
			`GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL)`
			`(spgpuHandle_t handle,`
			`__device VALUE_TYPE *w,`
			`int n,`
			`VALUE_TYPE beta,`
			`__device VALUE_TYPE *z,`
			`VALUE_TYPE alpha,`
			`__device VALUE_TYPE* x,`
			`__device VALUE_TYPE *y`
			`)`
			`{`

			`if (CONCAT(VALUE_TYPE, _isZero(alpha)))`
			`{`
			`GEN_SPGPU_SCAL_NAME(TYPE_SYMBOL)`
			`(handle, w, n, beta, z);`
			`}`
			`else if (CONCAT(VALUE_TYPE, _isZero(beta))) {`
			`GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL)`
			`(handle, w, n, alpha, x, y);`
			`}`
			`else {`
			`int maxNForACall = max(handle->maxGridSizeX, BLOCK_SIZE*handle->maxGridSizeX);`

			`while (n > maxNForACall) //managing large vectors`
			`{`

			`CONCAT(GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL),_)`
			`(handle, w, maxNForACall, beta, z, alpha, x, y);`

			`x = x + maxNForACall;`
			`y = y + maxNForACall;`
			`z = z + maxNForACall;`
			`w = w + maxNForACall;`
			`n -= maxNForACall;`
			`}`

			`CONCAT(GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL),_)`
			`(handle, w, n, beta, z, alpha, x, y);`
			`}`

			`cudaCheckError("CUDA error on axypbz");`
			`}`

			`void`
			`GEN_SPGPU_MFUNC_NAME_2(TYPE_SYMBOL)`
			`(spgpuHandle_t handle,`
			`__device VALUE_TYPE *w,`
			`int n,`
			`VALUE_TYPE beta,`
			`__device VALUE_TYPE *z,`
			`VALUE_TYPE alpha,`
			`__device VALUE_TYPE* x,`
			`__device VALUE_TYPE *y,`
			`int count,`
			`int pitch)`
			`{`
			`for (int i=0; i<count; i++)`
			`{`
			`GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL)(handle, w, n, beta, z, alpha, x, y);`

			`x += pitch;`
			`y += pitch;`
			`z += pitch;`
			`w += pitch;`
			`}`
			`}`