// psblas3/cuda/spgpu/ell.h

#pragma once

/*
 * spGPU - Sparse matrices on GPU library.
 * 
 * Copyright (C) 2010 - 2014
 *     Davide Barbieri - University of Rome Tor Vergata
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 3 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include "core.h"
#include "cuComplex.h"

/** \addtogroup ellFun ELL/HELL Format
 *  @{
 */
 
#ifdef __cplusplus
extern "C" {
#endif


// ELL/HELL Compressed Matrix Format routines

/// This is the pitch alignment that must be fulfilled by the coefficients and the row pointers allocations.
/// NOTE(review): the macro name suggests the value is expressed in bytes - confirm against the allocation code.
#define ELL_PITCH_ALIGN_BYTE 128

/** 
 * \fn void spgpuSellspmv (spgpuHandle_t handle, __device float *z, const __device float *y, float alpha, const __device float* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device float *x, float beta,int baseIndex)
 * \brief Computes single precision z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
 *
 * The function is declared void: no status code is handed back through the return value.
 * NOTE(review): the __device marker on the pointer parameters is defined outside this header;
 * presumably it flags pointers that must reference GPU memory - confirm in core.h.
 *
 * \param handle The spgpu handle used to call this routine
 * \param[out] z The output vector of the routine. z may be the same pointer as y, but not y + k (i.e. an area overlapping y, but starting from a base index different from y).
 * \param[in] y The y input vector
 * \param[in] alpha The alpha scalar
 * \param[in] cM The ELL non zero values allocation pointer
 * \param[in] rP The ELL column indices allocation pointer
 * \param[in] cMPitch The pitch (in number of elements) of the allocation containing the matrix non zero values
 * \param[in] rPPitch The pitch (in number of elements) of the allocation containing the matrix non zero column indices
 * \param[in] rS The array containing the row sizes (in non zero elements)
 * \param[in] rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the ELL matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the ELL format).
 * \param[in] avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
 * \param[in] maxNnzPerRow Maximum number of non zeroes per row.
 * \param[in] rows The rows count
 * \param[in] x The x vector
 * \param[in] beta The beta scalar
 * \param[in] baseIndex The ELL format base index used (i.e. 0 for C, 1 for Fortran).
 */
void spgpuSellspmv (spgpuHandle_t handle,
	__device float *z,
	const __device float *y, 
	float alpha, 
	const __device float* cM, 
	const __device int* rP, 
	int cMPitch, 
	int rPPitch, 
	const __device int* rS, 
	const __device int* rIdx, 
	int avgNnzPerRow,
	int maxNnzPerRow,
	int rows, 
	const __device float *x, 
	float beta,
	int baseIndex);

/** 
 * \fn void spgpuDellspmv (spgpuHandle_t handle,__device double *z,const __device double *y, double alpha, const __device double* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device double *x, double beta,int baseIndex)
 * \brief Computes double precision z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
 *
 * The function is declared void: no status code is handed back through the return value.
 * NOTE(review): the __device marker on the pointer parameters is defined outside this header;
 * presumably it flags pointers that must reference GPU memory - confirm in core.h.
 *
 * \param handle The spgpu handle used to call this routine
 * \param[out] z The output vector of the routine. z may be the same pointer as y, but not y + k (i.e. an area overlapping y, but starting from a base index different from y).
 * \param[in] y The y input vector
 * \param[in] alpha The alpha scalar
 * \param[in] cM The ELL non zero values allocation pointer
 * \param[in] rP The ELL column indices allocation pointer
 * \param[in] cMPitch The pitch (in number of elements) of the allocation containing the matrix non zero values
 * \param[in] rPPitch The pitch (in number of elements) of the allocation containing the matrix non zero column indices
 * \param[in] rS The array containing the row sizes (in non zero elements)
 * \param[in] rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the ELL matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the ELL format).
 * \param[in] avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
 * \param[in] maxNnzPerRow Maximum number of non zeroes per row.
 * \param[in] rows The rows count
 * \param[in] x The x vector
 * \param[in] beta The beta scalar
 * \param[in] baseIndex The ELL format base index used (i.e. 0 for C, 1 for Fortran).
 */
void spgpuDellspmv (spgpuHandle_t handle,
	__device double *z,
	const __device double *y, 
	double alpha, 
	const __device double* cM, 
	const __device int* rP, 
	int cMPitch, 
	int rPPitch, 
	const __device int* rS, 
	const __device int* rIdx,  
	int avgNnzPerRow,
	int maxNnzPerRow,
	int rows, 
	const __device double *x, 
	double beta,
	int baseIndex);


/** 
 * \fn void spgpuCellspmv (spgpuHandle_t handle,__device cuFloatComplex *z,const __device cuFloatComplex *y, cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device cuFloatComplex *x, cuFloatComplex beta, int baseIndex)
 * \brief Computes single precision complex z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
 *
 * The function is declared void: no status code is handed back through the return value.
 * NOTE(review): the __device marker on the pointer parameters is defined outside this header;
 * presumably it flags pointers that must reference GPU memory - confirm in core.h.
 *
 * \param handle The spgpu handle used to call this routine
 * \param[out] z The output vector of the routine. z may be the same pointer as y, but not y + k (i.e. an area overlapping y, but starting from a base index different from y).
 * \param[in] y The y input vector
 * \param[in] alpha The alpha scalar
 * \param[in] cM The ELL non zero values allocation pointer
 * \param[in] rP The ELL column indices allocation pointer
 * \param[in] cMPitch The pitch (in number of elements) of the allocation containing the matrix non zero values
 * \param[in] rPPitch The pitch (in number of elements) of the allocation containing the matrix non zero column indices
 * \param[in] rS The array containing the row sizes (in non zero elements)
 * \param[in] rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the ELL matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the ELL format).
 * \param[in] avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
 * \param[in] maxNnzPerRow Maximum number of non zeroes per row.
 * \param[in] rows The rows count
 * \param[in] x The x vector
 * \param[in] beta The beta scalar
 * \param[in] baseIndex The ELL format base index used (i.e. 0 for C, 1 for Fortran).
 */
void spgpuCellspmv (spgpuHandle_t handle,
	__device cuFloatComplex *z,
	const __device cuFloatComplex *y, 
	cuFloatComplex alpha, 
	const __device cuFloatComplex* cM, 
	const __device int* rP, 
	int cMPitch, 
	int rPPitch, 
	const __device int* rS, 
	const __device int* rIdx,  
	int avgNnzPerRow,
	int maxNnzPerRow,
	int rows, 
	const __device cuFloatComplex *x, 
	cuFloatComplex beta,
	int baseIndex);

/** 
 * \fn void spgpuZellspmv (spgpuHandle_t handle,__device cuDoubleComplex *z,const __device cuDoubleComplex *y, cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device cuDoubleComplex *x, cuDoubleComplex beta, int baseIndex)
 * \brief Computes double precision complex z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.
 *
 * The function is declared void: no status code is handed back through the return value.
 * NOTE(review): the __device marker on the pointer parameters is defined outside this header;
 * presumably it flags pointers that must reference GPU memory - confirm in core.h.
 *
 * \param handle The spgpu handle used to call this routine
 * \param[out] z The output vector of the routine. z may be the same pointer as y, but not y + k (i.e. an area overlapping y, but starting from a base index different from y).
 * \param[in] y The y input vector
 * \param[in] alpha The alpha scalar
 * \param[in] cM The ELL non zero values allocation pointer
 * \param[in] rP The ELL column indices allocation pointer
 * \param[in] cMPitch The pitch (in number of elements) of the allocation containing the matrix non zero values
 * \param[in] rPPitch The pitch (in number of elements) of the allocation containing the matrix non zero column indices
 * \param[in] rS The array containing the row sizes (in non zero elements)
 * \param[in] rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the ELL matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the ELL format).
 * \param[in] avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.
 * \param[in] maxNnzPerRow Maximum number of non zeroes per row.
 * \param[in] rows The rows count
 * \param[in] x The x vector
 * \param[in] beta The beta scalar
 * \param[in] baseIndex The ELL format base index used (i.e. 0 for C, 1 for Fortran).
 */
void spgpuZellspmv (spgpuHandle_t handle,
	__device cuDoubleComplex *z,
	const __device cuDoubleComplex *y, 
	cuDoubleComplex alpha, 
	const __device cuDoubleComplex* cM, 
	const __device int* rP, 
	int cMPitch, 
	int rPPitch, 
	const __device int* rS, 
	const __device int* rIdx,  
	int avgNnzPerRow,
	int maxNnzPerRow,
	int rows, 
	const __device cuDoubleComplex *x, 
	cuDoubleComplex beta,
	int baseIndex);
	
	
/** 
 * \fn void spgpuSellcsput (spgpuHandle_t handle, float alpha, __device float *cM, __device const int* rP, int cMPitch, int rPPitch, __device const int* rS, int nnz, __device int *aI, __device int *aJ, __device float *aVal, int baseIndex)
 * \brief Replaces the values at coordinate (aI[i], aJ[i]) inside A with the value aVal[i] * alpha, with A stored in ELLpack Format on GPU.
 *
 * It assumes that, within each row, the column indices stored in rP are sorted in ascending order.
 * Values are single precision floating point numbers.
 * The function is declared void: no status code is handed back through the return value.
 * NOTE(review): this header does not state what happens when a coordinate (aI[i], aJ[i]) has no
 * entry in A, nor whether aI, aJ and aVal (declared non-const) are modified - confirm against the implementation.
 *
 * \param handle The spgpu handle used to call this routine
 * \param[in] alpha The alpha scalar
 * \param[in,out] cM The ELL non zero values allocation pointer; the addressed values are replaced
 * \param[in] rP The ELL column indices allocation pointer
 * \param[in] cMPitch The pitch (in number of elements) of the allocation containing the matrix non zero values
 * \param[in] rPPitch The pitch (in number of elements) of the allocation containing the matrix non zero column indices
 * \param[in] rS The array containing the row sizes (in non zero elements)
 * \param[in] nnz The number of triples (aI, aJ, aVal) to process
 * \param aI The row coordinates vector
 * \param aJ The column coordinates vector
 * \param aVal The values vector
 * \param[in] baseIndex The ELL format base index used (i.e. 0 for C, 1 for Fortran).
 */
void spgpuSellcsput
	(spgpuHandle_t handle, 
	float alpha, 
	__device float *cM, 
	__device const int* rP, 
	int cMPitch, 
	int rPPitch, 
	__device const int* rS,
	int nnz, 
	__device int *aI, 
	__device int *aJ, 
	__device float *aVal, 
	int baseIndex);	

/** 
 * \fn void spgpuDellcsput (spgpuHandle_t handle, double alpha, __device double *cM, __device const int* rP, int cMPitch, int rPPitch, __device const int* rS, int nnz, __device int *aI, __device int *aJ, __device double *aVal, int baseIndex)
 * \brief Replaces the values at coordinate (aI[i], aJ[i]) inside A with the value aVal[i] * alpha, with A stored in ELLpack Format on GPU.
 *
 * It assumes that, within each row, the column indices stored in rP are sorted in ascending order.
 * Values are double precision floating point numbers.
 * The function is declared void: no status code is handed back through the return value.
 * NOTE(review): this header does not state what happens when a coordinate (aI[i], aJ[i]) has no
 * entry in A, nor whether aI, aJ and aVal (declared non-const) are modified - confirm against the implementation.
 *
 * \param handle The spgpu handle used to call this routine
 * \param[in] alpha The alpha scalar
 * \param[in,out] cM The ELL non zero values allocation pointer; the addressed values are replaced
 * \param[in] rP The ELL column indices allocation pointer
 * \param[in] cMPitch The pitch (in number of elements) of the allocation containing the matrix non zero values
 * \param[in] rPPitch The pitch (in number of elements) of the allocation containing the matrix non zero column indices
 * \param[in] rS The array containing the row sizes (in non zero elements)
 * \param[in] nnz The number of triples (aI, aJ, aVal) to process
 * \param aI The row coordinates vector
 * \param aJ The column coordinates vector
 * \param aVal The values vector
 * \param[in] baseIndex The ELL format base index used (i.e. 0 for C, 1 for Fortran).
 */	
void spgpuDellcsput
	(spgpuHandle_t handle, 
	double alpha, 
	__device double *cM, 
	__device const int* rP, 
	int cMPitch, 
	int rPPitch, 
	__device const int* rS,
	int nnz, 
	__device int *aI, 
	__device int *aJ, 
	__device double *aVal, 
	int baseIndex);	

/** 
 * \fn void spgpuCellcsput (spgpuHandle_t handle, cuFloatComplex alpha, __device cuFloatComplex *cM, __device const int* rP, int cMPitch, int rPPitch, __device const int* rS, int nnz, __device int *aI, __device int *aJ, __device cuFloatComplex *aVal, int baseIndex)
 * \brief Replaces the values at coordinate (aI[i], aJ[i]) inside A with the value aVal[i] * alpha, with A stored in ELLpack Format on GPU.
 *
 * It assumes that, within each row, the column indices stored in rP are sorted in ascending order.
 * Values are single precision floating point complex numbers.
 * The function is declared void: no status code is handed back through the return value.
 * NOTE(review): this header does not state what happens when a coordinate (aI[i], aJ[i]) has no
 * entry in A, nor whether aI, aJ and aVal (declared non-const) are modified - confirm against the implementation.
 *
 * \param handle The spgpu handle used to call this routine
 * \param[in] alpha The alpha scalar
 * \param[in,out] cM The ELL non zero values allocation pointer; the addressed values are replaced
 * \param[in] rP The ELL column indices allocation pointer
 * \param[in] cMPitch The pitch (in number of elements) of the allocation containing the matrix non zero values
 * \param[in] rPPitch The pitch (in number of elements) of the allocation containing the matrix non zero column indices
 * \param[in] rS The array containing the row sizes (in non zero elements)
 * \param[in] nnz The number of triples (aI, aJ, aVal) to process
 * \param aI The row coordinates vector
 * \param aJ The column coordinates vector
 * \param aVal The values vector
 * \param[in] baseIndex The ELL format base index used (i.e. 0 for C, 1 for Fortran).
 */
void spgpuCellcsput
	(spgpuHandle_t handle, 
	cuFloatComplex alpha, 
	__device cuFloatComplex *cM, 
	__device const int* rP, 
	int cMPitch, 
	int rPPitch, 
	__device const int* rS,
	int nnz, 
	__device int *aI, 
	__device int *aJ, 
	__device cuFloatComplex *aVal, 
	int baseIndex);	

/** 
 * \fn void spgpuZellcsput (spgpuHandle_t handle, cuDoubleComplex alpha, __device cuDoubleComplex *cM, __device const int* rP, int cMPitch, int rPPitch, __device const int* rS, int nnz, __device int *aI, __device int *aJ, __device cuDoubleComplex *aVal, int baseIndex)
 * \brief Replaces the values at coordinate (aI[i], aJ[i]) inside A with the value aVal[i] * alpha, with A stored in ELLpack Format on GPU.
 *
 * It assumes that, within each row, the column indices stored in rP are sorted in ascending order.
 * Values are double precision floating point complex numbers.
 * The function is declared void: no status code is handed back through the return value.
 * NOTE(review): this header does not state what happens when a coordinate (aI[i], aJ[i]) has no
 * entry in A, nor whether aI, aJ and aVal (declared non-const) are modified - confirm against the implementation.
 *
 * \param handle The spgpu handle used to call this routine
 * \param[in] alpha The alpha scalar
 * \param[in,out] cM The ELL non zero values allocation pointer; the addressed values are replaced
 * \param[in] rP The ELL column indices allocation pointer
 * \param[in] cMPitch The pitch (in number of elements) of the allocation containing the matrix non zero values
 * \param[in] rPPitch The pitch (in number of elements) of the allocation containing the matrix non zero column indices
 * \param[in] rS The array containing the row sizes (in non zero elements)
 * \param[in] nnz The number of triples (aI, aJ, aVal) to process
 * \param aI The row coordinates vector
 * \param aJ The column coordinates vector
 * \param aVal The values vector
 * \param[in] baseIndex The ELL format base index used (i.e. 0 for C, 1 for Fortran).
 */
void spgpuZellcsput
	(spgpuHandle_t handle, 
	cuDoubleComplex alpha, 
	__device cuDoubleComplex *cM, 
	__device const int* rP, 
	int cMPitch, 
	int rPPitch, 
	__device const int* rS,
	int nnz, 
	__device int *aI, 
	__device int *aJ, 
	__device cuDoubleComplex *aVal, 
	int baseIndex);		
	
	
/** @}*/

#ifdef __cplusplus
}
#endif
Rename GPU into cuda, and merge SPGPU code. 1 year ago			`#pragma once`

			`/*`
			`* spGPU - Sparse matrices on GPU library.`
			`*`
			`* Copyright (C) 2010 - 2014`
			`* Davide Barbieri - University of Rome Tor Vergata`
			`*`
			`* This program is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU General Public License`
			`* version 3 as published by the Free Software Foundation.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*/`

			`#include "core.h"`
			`#include "cuComplex.h"`

			`/** \addtogroup ellFun ELL/HELL Format`
			`* @{`
			`*/`

			`#ifdef __cplusplus`
			`extern "C" {`
			`#endif`


			`// ELL/HELL Compressed Matrix Format routines`

			`/// This is the pitch alignment that must be fullfilled by the coefficients and the row pointers allocations.`
			`#define ELL_PITCH_ALIGN_BYTE 128`

			`/**`
			`* \fn void spgpuSellspmv (spgpuHandle_t handle, __device float *z, const __device float *y, float alpha, const __device float* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device float *x, float beta,int baseIndex)`
			`* Computes single precision z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.`
			`* \param handle The spgpu handle used to call this routine`
			`* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).`
			`* \param y The y input vector`
			`* \param alpha The alpha scalar`
			`* \param cM The ELL non zero values allocation pointer`
			`* \param rP The ELL column indices allocation pointer`
			`* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values`
			`* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices`
			`* \param rS the array containing the row sized (in non zero elements)`
			`* \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Ell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the ELL format).`
			`* \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.`
			`* \param maxNnzPerRow Maximum number of non zeroes per row.`
			`* \param rows the rows count`
			`* \param x the x vector`
			`* \param beta the beta scalar`
			`* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).`
			`*/`
			`void spgpuSellspmv (spgpuHandle_t handle,`
			`__device float *z,`
			`const __device float *y,`
			`float alpha,`
			`const __device float* cM,`
			`const __device int* rP,`
			`int cMPitch,`
			`int rPPitch,`
			`const __device int* rS,`
			`const __device int* rIdx,`
			`int avgNnzPerRow,`
			`int maxNnzPerRow,`
			`int rows,`
			`const __device float *x,`
			`float beta,`
			`int baseIndex);`

			`/**`
			`* \fn void spgpuDellspmv (spgpuHandle_t handle,__device double *z,const __device double *y, double alpha, const __device double* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device double *x, double beta,int baseIndex)`
			`* Computes double precision z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.`
			`* \param handle The spgpu handle used to call this routine`
			`* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).`
			`* \param y The y input vector`
			`* \param alpha The alpha scalar`
			`* \param cM The ELL non zero values allocation pointer`
			`* \param rP The ELL column indices allocation pointer`
			`* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values`
			`* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices`
			`* \param rS the array containing the row sized (in non zero elements)`
			`* \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Ell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the ELL format).`
			`* \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.`
			`* \param maxNnzPerRow Maximum number of non zeroes per row.`
			`* \param rows the rows count`
			`* \param x the x vector`
			`* \param beta the beta scalar`
			`* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).`
			`*/`
			`void spgpuDellspmv (spgpuHandle_t handle,`
			`__device double *z,`
			`const __device double *y,`
			`double alpha,`
			`const __device double* cM,`
			`const __device int* rP,`
			`int cMPitch,`
			`int rPPitch,`
			`const __device int* rS,`
			`const __device int* rIdx,`
			`int avgNnzPerRow,`
			`int maxNnzPerRow,`
			`int rows,`
			`const __device double *x,`
			`double beta,`
			`int baseIndex);`


			`/**`
			`* \fn void spgpuCellspmv (spgpuHandle_t handle,__device cuFloatComplex *z,const __device cuFloatComplex *y, cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device cuFloatComplex *x, cuFloatComplex beta, int baseIndex)`
			`* Computes single precision complex z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.`
			`* \param handle The spgpu handle used to call this routine`
			`* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).`
			`* \param y The y input vector`
			`* \param alpha The alpha scalar`
			`* \param cM The ELL non zero values allocation pointer`
			`* \param rP The ELL column indices allocation pointer`
			`* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values`
			`* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices`
			`* \param rS the array containing the row sized (in non zero elements)`
			`* \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Ell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the ELL format).`
			`* \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.`
			`* \param maxNnzPerRow Maximum number of non zeroes per row.`
			`* \param rows the rows count`
			`* \param x the x vector`
			`* \param beta the beta scalar`
			`* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).`
			`*/`
			`void spgpuCellspmv (spgpuHandle_t handle,`
			`__device cuFloatComplex *z,`
			`const __device cuFloatComplex *y,`
			`cuFloatComplex alpha,`
			`const __device cuFloatComplex* cM,`
			`const __device int* rP,`
			`int cMPitch,`
			`int rPPitch,`
			`const __device int* rS,`
			`const __device int* rIdx,`
			`int avgNnzPerRow,`
			`int maxNnzPerRow,`
			`int rows,`
			`const __device cuFloatComplex *x,`
			`cuFloatComplex beta,`
			`int baseIndex);`

			`/**`
			`* \fn void spgpuZellspmv (spgpuHandle_t handle,__device cuDoubleComplex *z,const __device cuDoubleComplex *y, cuDoubleComplex alpha, const __device cuDoubleComplex* cM, const __device int* rP, int cMPitch, int rPPitch, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows, const __device cuDoubleComplex *x, cuDoubleComplex beta, int baseIndex)`
			`* Computes double precision complex z = alpha*A*x + beta*y, with A stored in ELLpack Format on GPU.`
			`* \param handle The spgpu handle used to call this routine`
			`* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).`
			`* \param y The y input vector`
			`* \param alpha The alpha scalar`
			`* \param cM The ELL non zero values allocation pointer`
			`* \param rP The ELL column indices allocation pointer`
			`* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values`
			`* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices`
			`* \param rS the array containing the row sized (in non zero elements)`
			`* \param rIdx (optional) An array containing the row index per every row (i.e. the reorder array) of the Ell matrix. Pass NULL if you don't use a reorder array (i.e. the k-th row is stored in the k-th position in the ELL format).`
			`* \param avgNnzPerRow (optional) Average number of non zeroes per row. Pass 0 if you don't have such information.`
			`* \param maxNnzPerRow Maximum number of non zeroes per row.`
			`* \param rows the rows count`
			`* \param x the x vector`
			`* \param beta the beta scalar`
			`* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).`
			`*/`
			`void spgpuZellspmv (spgpuHandle_t handle,`
			`__device cuDoubleComplex *z,`
			`const __device cuDoubleComplex *y,`
			`cuDoubleComplex alpha,`
			`const __device cuDoubleComplex* cM,`
			`const __device int* rP,`
			`int cMPitch,`
			`int rPPitch,`
			`const __device int* rS,`
			`const __device int* rIdx,`
			`int avgNnzPerRow,`
			`int maxNnzPerRow,`
			`int rows,`
			`const __device cuDoubleComplex *x,`
			`cuDoubleComplex beta,`
			`int baseIndex);`


			`/**`
			`* \fn void spgpuSellcsput (spgpuHandle_t handle, float alpha, __device float *cM, __device const int* rP, int cMPitch, int rPPitch, __device const int* rS, int nnz, __device int *aI, __device int *aJ, __device float *aVal, int baseIndex)`
			`* Replaces the values at coordinate (aI[i], aJ[i]) inside A with the value aVal[i] * alpha, with A stored in ELLpack Format on GPU.`
			`* It assumes that indices from the same row inside rP are sorted by ascending order.`
			`* Values are single precision floating point numbers.`
			`* \param handle The spgpu handle used to call this routine`
			`* \param alpha The alpha scalar`
			`* \param cM The ELL non zero values allocation pointer`
			`* \param rP The ELL column indices allocation pointer`
			`* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values`
			`* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices`
			`* \param rS the array containing the row sized (in non zero elements)`
			`* \param nnz the number of triples (aI, aJ, aVal) to process`
			`* \param aI The row coordinates vector`
			`* \param aJ The column coordinates vector`
			`* \param aVal The values vector`
			`* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).`
			`*/`
			`void spgpuSellcsput`
			`(spgpuHandle_t handle,`
			`float alpha,`
			`__device float *cM,`
			`__device const int* rP,`
			`int cMPitch,`
			`int rPPitch,`
			`__device const int* rS,`
			`int nnz,`
			`__device int *aI,`
			`__device int *aJ,`
			`__device float *aVal,`
			`int baseIndex);`

			`/**`
			`* \fn void spgpuDellcsput (spgpuHandle_t handle, double alpha, __device double *cM, __device const int* rP, int cMPitch, int rPPitch, __device const int* rS, int nnz, __device int *aI, __device int *aJ, __device double *aVal, int baseIndex)`
			`* Replaces the values at coordinate (aI[i], aJ[i]) inside A with the value aVal[i] * alpha, with A stored in ELLpack Format on GPU.`
			`* It assumes that indices from the same row inside rP are sorted by ascending order.`
			`* Values are double precision floating point numbers.`
			`* \param handle The spgpu handle used to call this routine`
			`* \param alpha The alpha scalar`
			`* \param cM The ELL non zero values allocation pointer`
			`* \param rP The ELL column indices allocation pointer`
			`* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values`
			`* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices`
			`* \param rS the array containing the row sized (in non zero elements)`
			`* \param nnz the number of triples (aI, aJ, aVal) to process`
			`* \param aI The row coordinates vector`
			`* \param aJ The column coordinates vector`
			`* \param aVal The values vector`
			`* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).`
			`*/`
			`void spgpuDellcsput`
			`(spgpuHandle_t handle,`
			`double alpha,`
			`__device double *cM,`
			`__device const int* rP,`
			`int cMPitch,`
			`int rPPitch,`
			`__device const int* rS,`
			`int nnz,`
			`__device int *aI,`
			`__device int *aJ,`
			`__device double *aVal,`
			`int baseIndex);`

			`/**`
			`* \fn void spgpuCellcsput (spgpuHandle_t handle, cuFloatComplex alpha, __device cuFloatComplex *cM, __device const int* rP, int cMPitch, int rPPitch, __device const int* rS, int nnz, __device int *aI, __device int *aJ, __device cuFloatComplex *aVal, int baseIndex)`
			`* Replaces the values at coordinate (aI[i], aJ[i]) inside A with the value aVal[i] * alpha, with A stored in ELLpack Format on GPU.`
			`* It assumes that indices from the same row inside rP are sorted by ascending order.`
			`* Values are single precision floating point complex numbers.`
			`* \param handle The spgpu handle used to call this routine`
			`* \param alpha The alpha scalar`
			`* \param cM The ELL non zero values allocation pointer`
			`* \param rP The ELL column indices allocation pointer`
			`* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values`
			`* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices`
			`* \param rS the array containing the row sized (in non zero elements)`
			`* \param nnz the number of triples (aI, aJ, aVal) to process`
			`* \param aI The row coordinates vector`
			`* \param aJ The column coordinates vector`
			`* \param aVal The values vector`
			`* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).`
			`*/`
			`void spgpuCellcsput`
			`(spgpuHandle_t handle,`
			`cuFloatComplex alpha,`
			`__device cuFloatComplex *cM,`
			`__device const int* rP,`
			`int cMPitch,`
			`int rPPitch,`
			`__device const int* rS,`
			`int nnz,`
			`__device int *aI,`
			`__device int *aJ,`
			`__device cuFloatComplex *aVal,`
			`int baseIndex);`

			`/**`
			`* \fn void spgpuZellcsput (spgpuHandle_t handle, cuDoubleComplex alpha, __device cuDoubleComplex *cM, __device const int* rP, int cMPitch, int rPPitch, __device const int* rS, int nnz, __device int *aI, __device int *aJ, __device cuDoubleComplex *aVal, int baseIndex)`
			`* Replaces the values at coordinate (aI[i], aJ[i]) inside A with the value aVal[i] * alpha, with A stored in ELLpack Format on GPU.`
			`* It assumes that indices from the same row inside rP are sorted by ascending order.`
			`* Values are double precision floating point complex numbers.`
			`* \param handle The spgpu handle used to call this routine`
			`* \param alpha The alpha scalar`
			`* \param cM The ELL non zero values allocation pointer`
			`* \param rP The ELL column indices allocation pointer`
			`* \param cMPitch the pitch (in number of elements) of the allocation containing the matrix non zero values`
			`* \param rPPitch the pitch (in number of elements) of the allocation containing the matrix non zero column indices`
			`* \param rS the array containing the row sized (in non zero elements)`
			`* \param nnz the number of triples (aI, aJ, aVal) to process`
			`* \param aI The row coordinates vector`
			`* \param aJ The column coordinates vector`
			`* \param aVal The values vector`
			`* \param baseIndex the ELL format base index used (i.e. 0 for C, 1 for Fortran).`
			`*/`
			`void spgpuZellcsput`
			`(spgpuHandle_t handle,`
			`cuDoubleComplex alpha,`
			`__device cuDoubleComplex *cM,`
			`__device const int* rP,`
			`int cMPitch,`
			`int rPPitch,`
			`__device const int* rS,`
			`int nnz,`
			`__device int *aI,`
			`__device int *aJ,`
			`__device cuDoubleComplex *aVal,`
			`int baseIndex);`




			`/** @}*/`

			`#ifdef __cplusplus`
			`}`
			`#endif`