|
|
|
@ -27,6 +27,10 @@
|
|
|
|
|
extern "C" {
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
// DIA/HDIA Compressed Matrix Format routines
|
|
|
|
|
|
|
|
|
|
/// This is the pitch alignment that must be fullfilled by the coefficients and the row pointers allocations.
|
|
|
|
|
#define DIA_PITCH_ALIGN_BYTE 128
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \fn spgpuShdiaspmv (spgpuHandle_t handle, float* z, const float *y, float alpha, const float* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const float *x, float beta)
|
|
|
|
@ -59,6 +63,44 @@ spgpuShdiaspmv (spgpuHandle_t handle,
|
|
|
|
|
float beta);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \fn spgpuShdiaspmm (int count, float *z, int zpitch, const float *y, int ypitch, float alpha, const __device float* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device float *x, int xpitch, float beta)
|
|
|
|
|
* Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
|
|
|
|
|
* \param handle The spgpu handle used to call this routine
|
|
|
|
|
* \param count The cols count
|
|
|
|
|
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
|
|
|
|
|
* \param zpitch The pitch of the output vector
|
|
|
|
|
* \param y The y input vector
|
|
|
|
|
* \param ypitch The pitch of the y input vector
|
|
|
|
|
* \param alpha The alpha scalar
|
|
|
|
|
* \param dM The stacked HDIA non zero values allocation pointer
|
|
|
|
|
* \param offsets The stacked HDIA diagonals offsets vector
|
|
|
|
|
* \param hackSize The constant size of every hack (must be a multiple of 32)
|
|
|
|
|
* \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
|
|
|
|
|
* \param rows the rows count
|
|
|
|
|
* \param cols the columns count
|
|
|
|
|
* \param x the x vector
|
|
|
|
|
* \param xpitch The pitch of the x input vector
|
|
|
|
|
* \param beta the beta scalar
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
spgpuShdiaspmm (spgpuHandle_t handle,
|
|
|
|
|
int count,
|
|
|
|
|
__device float *z,
|
|
|
|
|
int zpitch,
|
|
|
|
|
const __device float *y,
|
|
|
|
|
int ypitch,
|
|
|
|
|
float alpha,
|
|
|
|
|
const __device float* dM,
|
|
|
|
|
const __device int* offsets,
|
|
|
|
|
int hackSize,
|
|
|
|
|
const __device int* hackOffsets,
|
|
|
|
|
int rows,
|
|
|
|
|
int cols,
|
|
|
|
|
const __device float *x,
|
|
|
|
|
int xpitch,
|
|
|
|
|
float beta);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \fn spgpuDhdiaspmv (spgpuHandle_t handle, double* z, const double *y, double alpha, const double* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const double *x, double beta)
|
|
|
|
|
* Computes double precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
|
|
|
|
@ -89,6 +131,43 @@ spgpuDhdiaspmv (spgpuHandle_t handle,
|
|
|
|
|
const double *x,
|
|
|
|
|
double beta);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \fn spgpuDhdiaspmm (int count, double *z, int zpitch, const double *y, int ypitch, double alpha, const __device double* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device float *x, int xpitch, double beta)
|
|
|
|
|
* Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
|
|
|
|
|
* \param handle The spgpu handle used to call this routine
|
|
|
|
|
* \param count The cols count
|
|
|
|
|
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
|
|
|
|
|
* \param zpitch The pitch of the output vector
|
|
|
|
|
* \param y The y input vector
|
|
|
|
|
* \param ypitch The pitch of the y input vector
|
|
|
|
|
* \param alpha The alpha scalar
|
|
|
|
|
* \param dM The stacked HDIA non zero values allocation pointer
|
|
|
|
|
* \param offsets The stacked HDIA diagonals offsets vector
|
|
|
|
|
* \param hackSize The constant size of every hack (must be a multiple of 32)
|
|
|
|
|
* \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
|
|
|
|
|
* \param rows the rows count
|
|
|
|
|
* \param cols the columns count
|
|
|
|
|
* \param x the x vector
|
|
|
|
|
* \param xpitch The pitch of the x input vector
|
|
|
|
|
* \param beta the beta scalar
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
spgpuDhdiaspmm (spgpuHandle_t handle,
|
|
|
|
|
int count,
|
|
|
|
|
__device double *z,
|
|
|
|
|
int zpitch,
|
|
|
|
|
const __device double *y,
|
|
|
|
|
int ypitch,
|
|
|
|
|
double alpha,
|
|
|
|
|
const __device double* dM,
|
|
|
|
|
const __device int* offsets,
|
|
|
|
|
int hackSize,
|
|
|
|
|
const __device int* hackOffsets,
|
|
|
|
|
int rows,
|
|
|
|
|
int cols,
|
|
|
|
|
const __device double *x,
|
|
|
|
|
int xpitch,
|
|
|
|
|
double beta);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \fn spgpuChdiaspmv (spgpuHandle_t handle, cuFloatComplex* z, const cuFloatComplex *y, cuFloatComplex alpha, const cuFloatComplex* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const cuFloatComplex *x, cuFloatComplex beta)
|
|
|
|
@ -120,6 +199,43 @@ spgpuChdiaspmv (spgpuHandle_t handle,
|
|
|
|
|
const cuFloatComplex *x,
|
|
|
|
|
cuFloatComplex beta);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \fn spgpuChdiaspmm (int count, cuFloatComplex *z, int zpitch, const cuFloatComplex *y, int ypitch, cuFloatComplex alpha, const __device cuFloatComplex* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device cuFloatComplex *x, int xpitch, cuFloatComplex beta)
|
|
|
|
|
* Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
|
|
|
|
|
* \param handle The spgpu handle used to call this routine
|
|
|
|
|
* \param count The cols count
|
|
|
|
|
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
|
|
|
|
|
* \param zpitch The pitch of the output vector
|
|
|
|
|
* \param y The y input vector
|
|
|
|
|
* \param ypitch The pitch of the y input vector
|
|
|
|
|
* \param alpha The alpha scalar
|
|
|
|
|
* \param dM The stacked HDIA non zero values allocation pointer
|
|
|
|
|
* \param offsets The stacked HDIA diagonals offsets vector
|
|
|
|
|
* \param hackSize The constant size of every hack (must be a multiple of 32)
|
|
|
|
|
* \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
|
|
|
|
|
* \param rows the rows count
|
|
|
|
|
* \param cols the columns count
|
|
|
|
|
* \param x the x vector
|
|
|
|
|
* \param xpitch The pitch of the x input vector
|
|
|
|
|
* \param beta the beta scalar
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
spgpuChdiaspmm (spgpuHandle_t handle,
|
|
|
|
|
int count,
|
|
|
|
|
__device cuFloatComplex *z,
|
|
|
|
|
int zpitch,
|
|
|
|
|
const __device cuFloatComplex *y,
|
|
|
|
|
int ypitch,
|
|
|
|
|
cuFloatComplex alpha,
|
|
|
|
|
const __device cuFloatComplex* dM,
|
|
|
|
|
const __device int* offsets,
|
|
|
|
|
int hackSize,
|
|
|
|
|
const __device int* hackOffsets,
|
|
|
|
|
int rows,
|
|
|
|
|
int cols,
|
|
|
|
|
const __device cuFloatComplex *x,
|
|
|
|
|
int xpitch,
|
|
|
|
|
cuFloatComplex beta);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \fn spgpuZhdiaspmv (spgpuHandle_t handle, cuDoubleComplex* z, const cuDoubleComplex *y, cuDoubleComplex alpha, const cuDoubleComplex* dM, const int* offsets, int hackSize, const int* hackOffsets, int rows, int cols, const cuDoubleComplex *x, cuDoubleComplex beta)
|
|
|
|
@ -150,6 +266,45 @@ spgpuZhdiaspmv (spgpuHandle_t handle,
|
|
|
|
|
int cols,
|
|
|
|
|
const cuDoubleComplex *x,
|
|
|
|
|
cuDoubleComplex beta);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* \fn spgpuZhdiaspmm (int count, cuDoubleComplex *z, int zpitch, const cuDoubleComplex *y, int ypitch, cuDoubleComplex alpha, const __device cuDoubleComplex* dM, const __device int* offsets, int hackSize, const __device int* hackOffsets, int rows, int cols, const __device cuDoubleComplex *x, int xpitch, cuDoubleComplex beta)
|
|
|
|
|
* Computes single precision z = alpha*A*x + beta*y, with A stored in Hacked Diagonal Format on GPU.
|
|
|
|
|
* \param handle The spgpu handle used to call this routine
|
|
|
|
|
* \param count The cols count
|
|
|
|
|
* \param z The output vector of the routine. z could be y, but not y + k (i.e. an overlapping area over y, but starting from a base index different from y).
|
|
|
|
|
* \param zpitch The pitch of the output vector
|
|
|
|
|
* \param y The y input vector
|
|
|
|
|
* \param ypitch The pitch of the y input vector
|
|
|
|
|
* \param alpha The alpha scalar
|
|
|
|
|
* \param dM The stacked HDIA non zero values allocation pointer
|
|
|
|
|
* \param offsets The stacked HDIA diagonals offsets vector
|
|
|
|
|
* \param hackSize The constant size of every hack (must be a multiple of 32)
|
|
|
|
|
* \param hackOffsets the array of base index offset for every hack of HDIA offsets vector, plus a last value equal to the size of the offsets vector
|
|
|
|
|
* \param rows the rows count
|
|
|
|
|
* \param cols the columns count
|
|
|
|
|
* \param x the x vector
|
|
|
|
|
* \param xpitch The pitch of the x input vector
|
|
|
|
|
* \param beta the beta scalar
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
spgpuZhdiaspmm (spgpuHandle_t handle,
|
|
|
|
|
int count,
|
|
|
|
|
__device cuDoubleComplex *z,
|
|
|
|
|
int zpitch,
|
|
|
|
|
const __device cuDoubleComplex *y,
|
|
|
|
|
int ypitch,
|
|
|
|
|
cuDoubleComplex alpha,
|
|
|
|
|
const __device cuDoubleComplex* dM,
|
|
|
|
|
const __device int* offsets,
|
|
|
|
|
int hackSize,
|
|
|
|
|
const __device int* hackOffsets,
|
|
|
|
|
int rows,
|
|
|
|
|
int cols,
|
|
|
|
|
const __device cuDoubleComplex *x,
|
|
|
|
|
int xpitch,
|
|
|
|
|
cuDoubleComplex beta);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** @}*/
|
|
|
|
|
|
|
|
|
|