diff --git a/cuda/cvectordev.c b/cuda/cvectordev.c index 518154d5..9db5202e 100644 --- a/cuda/cvectordev.c +++ b/cuda/cvectordev.c @@ -255,6 +255,24 @@ int axpbyMultiVecDeviceFloatComplex(int n,cuFloatComplex alpha, void* devMultiVe return(i); } +int abgdxyzMultiVecDeviceFloatComplex(int n,cuFloatComplex alpha,cuFloatComplex beta, + cuFloatComplex gamma, cuFloatComplex delta, + void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ) +{ int j=0, i=0; + int pitch = 0; + struct MultiVectDevice *devVecX = (struct MultiVectDevice *) devMultiVecX; + struct MultiVectDevice *devVecY = (struct MultiVectDevice *) devMultiVecY; + struct MultiVectDevice *devVecZ = (struct MultiVectDevice *) devMultiVecZ; + spgpuHandle_t handle=psb_cudaGetHandle(); + pitch = devVecY->pitch_; + if ((n > devVecY->size_) || (n>devVecX->size_ )) + return SPGPU_UNSUPPORTED; + + spgpuCabgdxyz(handle,n, alpha,beta,gamma,delta, + (cuFloatComplex *)devVecX->v_,(cuFloatComplex *) devVecY->v_,(cuFloatComplex *) devVecZ->v_); + return(i); +} + int axyMultiVecDeviceFloatComplex(int n, cuFloatComplex alpha, void *deviceVecA, void *deviceVecB) { int i = 0; diff --git a/cuda/cvectordev.h b/cuda/cvectordev.h index 27c8984a..fc18e328 100644 --- a/cuda/cvectordev.h +++ b/cuda/cvectordev.h @@ -69,6 +69,9 @@ int asumMultiVecDeviceFloatComplex(cuFloatComplex* y_res, int n, void* devVecA); int dotMultiVecDeviceFloatComplex(cuFloatComplex* y_res, int n, void* devVecA, void* devVecB); int axpbyMultiVecDeviceFloatComplex(int n, cuFloatComplex alpha, void* devVecX, cuFloatComplex beta, void* devVecY); +int abgdxyzMultiVecDeviceFloatComplex(int n,cuFloatComplex alpha,cuFloatComplex beta, + cuFloatComplex gamma, cuFloatComplex delta, + void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ); int axyMultiVecDeviceFloatComplex(int n, cuFloatComplex alpha, void *deviceVecA, void *deviceVecB); int axybzMultiVecDeviceFloatComplex(int n, cuFloatComplex alpha, void *deviceVecA, void *deviceVecB, cuFloatComplex beta, void *deviceVecZ); diff --git a/cuda/dvectordev.c b/cuda/dvectordev.c index 785753dd..b4ca95f4 100644 --- a/cuda/dvectordev.c +++ b/cuda/dvectordev.c @@ -241,7 +241,6 @@ int axpbyMultiVecDeviceDouble(int n,double alpha, void* devMultiVecX, return(i); } - int abgdxyzMultiVecDeviceDouble(int n,double alpha,double beta, double gamma, double delta, void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ) { int j=0, i=0; @@ -254,14 +253,8 @@ int abgdxyzMultiVecDeviceDouble(int n,double alpha,double beta, double gamma, do if ((n > devVecY->size_) || (n>devVecX->size_ )) return SPGPU_UNSUPPORTED; -#if 1 spgpuDabgdxyz(handle,n, alpha,beta,gamma,delta, (double*)devVecX->v_,(double*) devVecY->v_,(double*) devVecZ->v_); -#else - for(j=0;jcount_;j++) - spgpuDaxpby(handle,(double*)devVecY->v_+pitch*j, n, beta, - (double*)devVecY->v_+pitch*j, alpha,(double*) devVecX->v_+pitch*j); -#endif return(i); } diff --git a/cuda/dvectordev.h b/cuda/dvectordev.h index 25905c60..81a2e8f6 100644 --- a/cuda/dvectordev.h +++ b/cuda/dvectordev.h @@ -67,6 +67,8 @@ int asumMultiVecDeviceDouble(double* y_res, int n, void* devVecA); int dotMultiVecDeviceDouble(double* y_res, int n, void* devVecA, void* devVecB); int axpbyMultiVecDeviceDouble(int n, double alpha, void* devVecX, double beta, void* devVecY); +int abgdxyzMultiVecDeviceDouble(int n,double alpha,double beta, double gamma, double delta, + void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ); int axyMultiVecDeviceDouble(int n, double alpha, void *deviceVecA, void *deviceVecB); int axybzMultiVecDeviceDouble(int n, double alpha, void *deviceVecA, void *deviceVecB, double beta, void *deviceVecZ); diff --git a/cuda/spgpu/kernels/Makefile b/cuda/spgpu/kernels/Makefile index 5ada698a..3e668b4e 100644 --- a/cuda/spgpu/kernels/Makefile +++ b/cuda/spgpu/kernels/Makefile @@ -11,15 +11,15 @@ LIBNAME=$(UP)/libspgpu.a CINCLUDES=-I$(INCDIR) OBJS=cabs.o camax.o casum.o caxpby.o caxy.o cdot.o cgath.o \ - cnrm2.o cscal.o cscat.o csetscal.o \ + cnrm2.o cscal.o cscat.o csetscal.o cabgdxyz.o\ dabs.o damax.o dasum.o daxpby.o daxy.o ddot.o dgath.o dabgdxyz.o\ dia_cspmv.o dia_dspmv.o dia_sspmv.o dia_zspmv.o dnrm2.o \ dscal.o dscat.o dsetscal.o ell_ccsput.o ell_cspmv.o \ ell_dcsput.o ell_dspmv.o ell_scsput.o ell_sspmv.o ell_zcsput.o ell_zspmv.o \ hdia_cspmv.o hdia_dspmv.o hdia_sspmv.o hdia_zspmv.o hell_cspmv.o hell_dspmv.o \ hell_sspmv.o hell_zspmv.o igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \ - saxpby.o saxy.o sdot.o sgath.o snrm2.o sscal.o sscat.o ssetscal.o zabs.o zamax.o \ - zasum.o zaxpby.o zaxy.o zdot.o zgath.o znrm2.o zscal.o zscat.o zsetscal.o + saxpby.o saxy.o sdot.o sgath.o snrm2.o sscal.o sscat.o ssetscal.o zabs.o zamax.o sabgdxyz.o\ + zasum.o zaxpby.o zaxy.o zdot.o zgath.o znrm2.o zscal.o zscat.o zsetscal.o zabgdxyz.o objs: $(OBJS) lib: objs diff --git a/cuda/spgpu/kernels/cabgdxyz.cu b/cuda/spgpu/kernels/cabgdxyz.cu new file mode 100644 index 00000000..00dc6ab4 --- /dev/null +++ b/cuda/spgpu/kernels/cabgdxyz.cu @@ -0,0 +1,80 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2012 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "cudadebug.h" +#include "cudalang.h" +#include + +extern "C" +{ +#include "core.h" +#include "vector.h" + int getGPUMultiProcessors(); + int getGPUMaxThreadsPerMP(); +} + + +#include "debug.h" + +#define BLOCK_SIZE 512 + +__global__ void spgpuCabgdxyz_krn(int n, cuFloatComplex alpha, cuFloatComplex beta, + cuFloatComplex gamma, cuFloatComplex delta, + cuFloatComplex * x, cuFloatComplex *y, cuFloatComplex *z) +{ + int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; + unsigned int gridSize = blockDim.x * gridDim.x; + cuFloatComplex t; + for ( ; id < n; id +=gridSize) + //if (id,n) + { + + if (cuFloatComplex_isZero(beta)) + t = cuCmulf(alpha,x[id]); + else + t = cuCfmaf(alpha, x[id], cuCmulf(beta,y[id])); + if (cuFloatComplex_isZero(delta)) + z[id] = cuCmulf(gamma, t); + else + z[id] = cuCfmafmulf(gamma, t, cuCmulf(delta,z[id])); + y[id] = t; + } +} + + +void spgpuCabgdxyz(spgpuHandle_t handle, + int n, + cuFloatComplex alpha, + cuFloatComplex beta, + cuFloatComplex gamma, + cuFloatComplex delta, + __device cuFloatComplex * x, + __device cuFloatComplex * y, + __device cuFloatComplex *z) +{ + int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE; + int num_mp, max_threads_mp, num_blocks_mp, num_blocks; + dim3 block(BLOCK_SIZE); + num_mp = getGPUMultiProcessors(); + max_threads_mp = getGPUMaxThreadsPerMP(); + num_blocks_mp = max_threads_mp/BLOCK_SIZE; + num_blocks = num_blocks_mp*num_mp; + dim3 grid(num_blocks); + + spgpuCabgdxyz_krn<<currentStream>>>(n, alpha, beta, gamma, delta, + x, y, z); +} + diff --git a/cuda/spgpu/kernels/sabgdxyz.cu b/cuda/spgpu/kernels/sabgdxyz.cu new file mode 100644 index 00000000..8c137ed3 --- /dev/null +++ b/cuda/spgpu/kernels/sabgdxyz.cu @@ -0,0 +1,79 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2012 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "cudadebug.h" +#include "cudalang.h" +#include + +extern "C" +{ +#include "core.h" +#include "vector.h" + int getGPUMultiProcessors(); + int getGPUMaxThreadsPerMP(); +} + + +#include "debug.h" + +#define BLOCK_SIZE 512 + +__global__ void spgpuSabgdxyz_krn(int n, float alpha, float beta, float gamma, float delta, + float* x, float *y, float *z) +{ + int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; + unsigned int gridSize = blockDim.x * gridDim.x; + float t; + for ( ; id < n; id +=gridSize) + //if (id,n) + { + + if (beta == 0.0) + t = PREC_FMUL(alpha,x[id]); + else + t = PREC_FADD(PREC_FMUL(alpha, x[id]), PREC_FMUL(beta,y[id])); + if (delta == 0.0) + z[id] = gamma * t; + else + z[id] = PREC_FADD(PREC_FMUL(gamma, t), PREC_FMUL(delta,z[id])); + y[id] = t; + } +} + + +void spgpuSabgdxyz(spgpuHandle_t handle, + int n, + float alpha, + float beta, + float gamma, + float delta, + __device float* x, + __device float* y, + __device float *z) +{ + int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE; + int num_mp, max_threads_mp, num_blocks_mp, num_blocks; + dim3 block(BLOCK_SIZE); + num_mp = getGPUMultiProcessors(); + max_threads_mp = getGPUMaxThreadsPerMP(); + num_blocks_mp = max_threads_mp/BLOCK_SIZE; + num_blocks = num_blocks_mp*num_mp; + dim3 grid(num_blocks); + + spgpuSabgdxyz_krn<<currentStream>>>(n, alpha, beta, gamma, delta, + x, y, z); +} + diff --git a/cuda/spgpu/kernels/zabgdxyz.cu b/cuda/spgpu/kernels/zabgdxyz.cu new file mode 100644 index 00000000..48def937 --- /dev/null +++ b/cuda/spgpu/kernels/zabgdxyz.cu @@ -0,0 +1,80 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2012 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "cudadebug.h" +#include "cudalang.h" +#include + +extern "C" +{ +#include "core.h" +#include "vector.h" + int getGPUMultiProcessors(); + int getGPUMaxThreadsPerMP(); +} + + +#include "debug.h" + +#define BLOCK_SIZE 512 + +__global__ void spgpuZabgdxyz_krn(int n, cuDoubleComplex alpha, cuDoubleComplex beta, + cuDoubleComplex gamma, cuDoubleComplex delta, + cuDoubleComplex * x, cuDoubleComplex *y, cuDoubleComplex *z) +{ + int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; + unsigned int gridSize = blockDim.x * gridDim.x; + cuDoubleComplex t; + for ( ; id < n; id +=gridSize) + //if (id,n) + { + + if (cuDoubleComplex_isZero(beta)) + t = cuCmul(alpha,x[id]); + else + t = cuCfma(alpha, x[id], cuCmul(beta,y[id])); + if (cuDoubleComplex_isZero(delta)) + z[id] = cuCmul(gamma, t); + else + z[id] = cuCfma(gamma, t, cuCmul(delta,z[id])); + y[id] = t; + } +} + + +void spgpuZabgdxyz(spgpuHandle_t handle, + int n, + cuDoubleComplex alpha, + cuDoubleComplex beta, + cuDoubleComplex gamma, + cuDoubleComplex delta, + __device cuDoubleComplex * x, + __device cuDoubleComplex * y, + __device cuDoubleComplex *z) +{ + int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE; + int num_mp, max_threads_mp, num_blocks_mp, num_blocks; + dim3 block(BLOCK_SIZE); + num_mp = getGPUMultiProcessors(); + max_threads_mp = getGPUMaxThreadsPerMP(); + num_blocks_mp = max_threads_mp/BLOCK_SIZE; + num_blocks = num_blocks_mp*num_mp; + dim3 grid(num_blocks); + + spgpuZabgdxyz_krn<<currentStream>>>(n, alpha, beta, gamma, delta, + x, y, z); +} + diff --git a/cuda/spgpu/vector.h b/cuda/spgpu/vector.h index 69ffedf0..9fc3e658 100644 --- a/cuda/spgpu/vector.h +++ b/cuda/spgpu/vector.h @@ -181,6 +181,18 @@ void spgpuSaxpby(spgpuHandle_t handle, float alpha, __device float* x); + +void spgpuSabgdxyz(spgpuHandle_t handle, + int n, + float alpha, + float beta, + float gamma, + float delta, + __device float* x, + __device float *y, + __device float *z) +; + /** * \fn void spgpuSmaxpby(spgpuHandle_t handle, __device float *z, int n, float beta, __device float *y, float alpha, __device float* x, int count, int pitch) * Computes the single precision z = beta * y + alpha * x of x and y multivectors. z could be exactly x or y (without offset) or another vector. @@ -755,6 +767,18 @@ void spgpuCaxpby(spgpuHandle_t handle, cuFloatComplex alpha, __device cuFloatComplex* x); + +void spgpuCabgdxyz(spgpuHandle_t handle, + int n, + cuFloatComplex alpha, + cuFloatComplex beta, + cuFloatComplex gamma, + cuFloatComplex delta, + __device cuFloatComplex* x, + __device cuFloatComplex *y, + __device cuFloatComplex *z) +; + /** * \fn void spgpuCmaxpby(spgpuHandle_t handle, __device cuFloatComplex *z, int n, cuFloatComplex beta, __device cuFloatComplex *y, cuFloatComplex alpha, __device cuFloatComplex* x, int count, int pitch) * Computes the single precision complex z = beta * y + alpha * x of x and y multivectors. z could be exactly x or y (without offset) or another vector. @@ -1034,6 +1058,18 @@ void spgpuZaxpby(spgpuHandle_t handle, cuDoubleComplex alpha, __device cuDoubleComplex* x); + +void spgpuZabgdxyz(spgpuHandle_t handle, + int n, + cuDoubleComplex alpha, + cuDoubleComplex beta, + cuDoubleComplex gamma, + cuDoubleComplex delta, + __device cuDoubleComplex* x, + __device cuDoubleComplex *y, + __device cuDoubleComplex *z) +; + /** * \fn void spgpuZmaxpby(spgpuHandle_t handle, __device cuDoubleComplex *z, int n, cuDoubleComplex beta, __device cuDoubleComplex *y, cuDoubleComplex alpha, __device cuDoubleComplex* x, int count, int pitch) * Computes the double precision complex z = beta * y + alpha * x of x and y multivectors. z could be exactly x or y (without offset) or another vector. diff --git a/cuda/svectordev.c b/cuda/svectordev.c index bfa4061a..b84718f5 100644 --- a/cuda/svectordev.c +++ b/cuda/svectordev.c @@ -241,6 +241,23 @@ int axpbyMultiVecDeviceFloat(int n,float alpha, void* devMultiVecX, return(i); } +int abgdxyzMultiVecDeviceFloat(int n,float alpha,float beta, float gamma, float delta, + void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ) +{ int j=0, i=0; + int pitch = 0; + struct MultiVectDevice *devVecX = (struct MultiVectDevice *) devMultiVecX; + struct MultiVectDevice *devVecY = (struct MultiVectDevice *) devMultiVecY; + struct MultiVectDevice *devVecZ = (struct MultiVectDevice *) devMultiVecZ; + spgpuHandle_t handle=psb_cudaGetHandle(); + pitch = devVecY->pitch_; + if ((n > devVecY->size_) || (n>devVecX->size_ )) + return SPGPU_UNSUPPORTED; + + spgpuSabgdxyz(handle,n, alpha,beta,gamma,delta, + (float*)devVecX->v_,(float*) devVecY->v_,(float*) devVecZ->v_); + return(i); +} + int axyMultiVecDeviceFloat(int n, float alpha, void *deviceVecA, void *deviceVecB) { int i = 0; struct MultiVectDevice *devVecA = (struct MultiVectDevice *) deviceVecA; diff --git a/cuda/svectordev.h b/cuda/svectordev.h index bf25fcb1..730f929a 100644 --- a/cuda/svectordev.h +++ b/cuda/svectordev.h @@ -67,6 +67,8 @@ int asumMultiVecDeviceFloat(float* y_res, int n, void* devVecA); int dotMultiVecDeviceFloat(float* y_res, int n, void* devVecA, void* devVecB); int axpbyMultiVecDeviceFloat(int n, float alpha, void* devVecX, float beta, void* devVecY); +int abgdxyzMultiVecDeviceFloat(int n,float alpha,float beta, float gamma, float delta, + void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ); int axyMultiVecDeviceFloat(int n, float alpha, void *deviceVecA, void *deviceVecB); int axybzMultiVecDeviceFloat(int n, float alpha, void *deviceVecA, void *deviceVecB, float beta, void *deviceVecZ); diff --git a/cuda/zvectordev.c b/cuda/zvectordev.c index 0fb1c67e..d1f23f2a 100644 --- a/cuda/zvectordev.c +++ b/cuda/zvectordev.c @@ -234,6 +234,24 @@ int dotMultiVecDeviceDoubleComplex(cuDoubleComplex* y_res, int n, void* devMulti return(i); } +int abgdxyzMultiVecDeviceDoubleComplex(int n,cuDoubleComplex alpha, + cuDoubleComplex beta, cuDoubleComplex gamma, cuDoubleComplex delta, + void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ) +{ int j=0, i=0; + int pitch = 0; + struct MultiVectDevice *devVecX = (struct MultiVectDevice *) devMultiVecX; + struct MultiVectDevice *devVecY = (struct MultiVectDevice *) devMultiVecY; + struct MultiVectDevice *devVecZ = (struct MultiVectDevice *) devMultiVecZ; + spgpuHandle_t handle=psb_cudaGetHandle(); + pitch = devVecY->pitch_; + if ((n > devVecY->size_) || (n>devVecX->size_ )) + return SPGPU_UNSUPPORTED; + + spgpuZabgdxyz(handle,n, alpha,beta,gamma,delta, + (cuDoubleComplex *)devVecX->v_,(cuDoubleComplex *) devVecY->v_,(cuDoubleComplex *) devVecZ->v_); + return(i); +} + int axpbyMultiVecDeviceDoubleComplex(int n,cuDoubleComplex alpha, void* devMultiVecX, cuDoubleComplex beta, void* devMultiVecY) { int j=0, i=0; diff --git a/cuda/zvectordev.h b/cuda/zvectordev.h index 96330a7a..4c32f11c 100644 --- a/cuda/zvectordev.h +++ b/cuda/zvectordev.h @@ -77,6 +77,9 @@ int dotMultiVecDeviceDoubleComplex(cuDoubleComplex* y_res, int n, int axpbyMultiVecDeviceDoubleComplex(int n, cuDoubleComplex alpha, void* devVecX, cuDoubleComplex beta, void* devVecY); +int abgdxyzMultiVecDeviceDoubleComplex(int n,cuDoubleComplex alpha, + cuDoubleComplex beta, cuDoubleComplex gamma, cuDoubleComplex delta, + void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ); int axyMultiVecDeviceDoubleComplex(int n, cuDoubleComplex alpha, void *deviceVecA, void *deviceVecB); int axybzMultiVecDeviceDoubleComplex(int n, cuDoubleComplex alpha, void *deviceVecA,