CUDA kernels for ABGDXYZ

2 years ago · 2d3773df98
parent 2a75d677d0
commit 2d3773df98
13 changed files with 341 additions and 10 deletions
--- a/cuda/cvectordev.c
+++ b/cuda/cvectordev.c
@ -255,6 +255,24 @@ int axpbyMultiVecDeviceFloatComplex(int n,cuFloatComplex alpha, void* devMultiVe
  return(i);
 }

+int abgdxyzMultiVecDeviceFloatComplex(int n,cuFloatComplex  alpha,cuFloatComplex  beta,
+				      cuFloatComplex  gamma, cuFloatComplex  delta, 
+				      void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ)
+{ int j=0, i=0;
+  int pitch = 0;
+  struct MultiVectDevice *devVecX = (struct MultiVectDevice *) devMultiVecX;
+  struct MultiVectDevice *devVecY = (struct MultiVectDevice *) devMultiVecY;
+  struct MultiVectDevice *devVecZ = (struct MultiVectDevice *) devMultiVecZ;
+  spgpuHandle_t handle=psb_cudaGetHandle();
+  pitch = devVecY->pitch_;
+  if ((n > devVecY->size_) || (n>devVecX->size_ )) 
+    return SPGPU_UNSUPPORTED;
+
+  spgpuCabgdxyz(handle,n, alpha,beta,gamma,delta, 
+		(cuFloatComplex *)devVecX->v_,(cuFloatComplex *) devVecY->v_,(cuFloatComplex *) devVecZ->v_);
+  return(i);
+}
+
 int axyMultiVecDeviceFloatComplex(int n, cuFloatComplex alpha, 
 				  void *deviceVecA, void *deviceVecB)
 { int i = 0; 
--- a/cuda/cvectordev.h
+++ b/cuda/cvectordev.h
@ -69,6 +69,9 @@ int asumMultiVecDeviceFloatComplex(cuFloatComplex* y_res, int n, void* devVecA);
 int dotMultiVecDeviceFloatComplex(cuFloatComplex* y_res, int n, void* devVecA, void* devVecB);

 int axpbyMultiVecDeviceFloatComplex(int n, cuFloatComplex alpha, void* devVecX, cuFloatComplex beta, void* devVecY);
+int abgdxyzMultiVecDeviceFloatComplex(int n,cuFloatComplex  alpha,cuFloatComplex  beta,
+				      cuFloatComplex  gamma, cuFloatComplex  delta, 
+				      void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ);
 int axyMultiVecDeviceFloatComplex(int n, cuFloatComplex alpha, void *deviceVecA, void *deviceVecB);
 int axybzMultiVecDeviceFloatComplex(int n, cuFloatComplex alpha, void *deviceVecA,
 			      void *deviceVecB, cuFloatComplex beta, void *deviceVecZ);
--- a/cuda/dvectordev.c
+++ b/cuda/dvectordev.c
@ -241,7 +241,6 @@ int axpbyMultiVecDeviceDouble(int n,double alpha, void* devMultiVecX,
  return(i);
 }
 
-
 int abgdxyzMultiVecDeviceDouble(int n,double alpha,double beta, double gamma, double delta, 
 				void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ)
 { int j=0, i=0;
@ -254,14 +253,8 @@ int abgdxyzMultiVecDeviceDouble(int n,double alpha,double beta, double gamma, do
  if ((n > devVecY->size_) || (n>devVecX->size_ )) 
    return SPGPU_UNSUPPORTED;

-#if 1
  spgpuDabgdxyz(handle,n, alpha,beta,gamma,delta, 
 	      (double*)devVecX->v_,(double*) devVecY->v_,(double*) devVecZ->v_);
-#else
-  for(j=0;j<devVecY->count_;j++)
-    spgpuDaxpby(handle,(double*)devVecY->v_+pitch*j, n, beta, 
-		(double*)devVecY->v_+pitch*j, alpha,(double*) devVecX->v_+pitch*j);
-#endif
  return(i);
 }
 
--- a/cuda/dvectordev.h
+++ b/cuda/dvectordev.h
@ -67,6 +67,8 @@ int asumMultiVecDeviceDouble(double* y_res, int n, void* devVecA);
 int dotMultiVecDeviceDouble(double* y_res, int n, void* devVecA, void* devVecB);

 int axpbyMultiVecDeviceDouble(int n, double alpha, void* devVecX, double beta, void* devVecY);
+int abgdxyzMultiVecDeviceDouble(int n,double alpha,double beta, double gamma, double delta, 
+				void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ);
 int axyMultiVecDeviceDouble(int n, double alpha, void *deviceVecA, void *deviceVecB);
 int axybzMultiVecDeviceDouble(int n, double alpha, void *deviceVecA,
 			      void *deviceVecB, double beta, void *deviceVecZ);
--- a/cuda/spgpu/kernels/Makefile
+++ b/cuda/spgpu/kernels/Makefile
@ -11,15 +11,15 @@ LIBNAME=$(UP)/libspgpu.a
 CINCLUDES=-I$(INCDIR)

 OBJS=cabs.o  camax.o  casum.o  caxpby.o  caxy.o  cdot.o  cgath.o \
-  cnrm2.o cscal.o cscat.o csetscal.o \
+  cnrm2.o cscal.o cscat.o csetscal.o cabgdxyz.o\
  dabs.o damax.o dasum.o daxpby.o daxy.o ddot.o dgath.o dabgdxyz.o\
  dia_cspmv.o dia_dspmv.o dia_sspmv.o dia_zspmv.o dnrm2.o \
  dscal.o dscat.o dsetscal.o ell_ccsput.o ell_cspmv.o \
  ell_dcsput.o ell_dspmv.o ell_scsput.o ell_sspmv.o ell_zcsput.o ell_zspmv.o \
  hdia_cspmv.o hdia_dspmv.o hdia_sspmv.o hdia_zspmv.o hell_cspmv.o hell_dspmv.o \
  hell_sspmv.o hell_zspmv.o igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \
-  saxpby.o saxy.o sdot.o sgath.o snrm2.o sscal.o sscat.o ssetscal.o zabs.o zamax.o \
-  zasum.o zaxpby.o zaxy.o zdot.o zgath.o znrm2.o zscal.o zscat.o zsetscal.o  
+  saxpby.o saxy.o sdot.o sgath.o snrm2.o sscal.o sscat.o ssetscal.o zabs.o zamax.o sabgdxyz.o\
+  zasum.o zaxpby.o zaxy.o zdot.o zgath.o znrm2.o zscal.o zscat.o zsetscal.o zabgdxyz.o

 objs: $(OBJS)
 lib: objs
--- a/cuda/spgpu/kernels/cabgdxyz.cu
+++ b/cuda/spgpu/kernels/cabgdxyz.cu
@ -0,0 +1,80 @@
+/*
+ * spGPU - Sparse matrices on GPU library.
+ * 
+ * Copyright (C) 2010 - 2012 
+ *     Davide Barbieri - University of Rome Tor Vergata
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "cudadebug.h"
+#include "cudalang.h"
+#include <cuda_runtime.h>
+
+extern "C"
+{
+#include "core.h"
+#include "vector.h"
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+}
+
+
+#include "debug.h"
+
+#define BLOCK_SIZE 512
+
+__global__ void spgpuCabgdxyz_krn(int n, cuFloatComplex  alpha, cuFloatComplex  beta,
+				  cuFloatComplex  gamma, cuFloatComplex  delta,
+				  cuFloatComplex * x, cuFloatComplex  *y, cuFloatComplex  *z)
+{
+	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
+	unsigned int gridSize = blockDim.x * gridDim.x;
+	cuFloatComplex  t;
+	for ( ; id < n; id +=gridSize)
+		//if (id,n) 
+	{
+
+	  if (cuFloatComplex_isZero(beta)) 
+	    t = cuCmulf(alpha,x[id]);
+	  else
+	    t = cuCfmaf(alpha, x[id], cuCmulf(beta,y[id]));
+	  if (cuFloatComplex_isZero(delta))
+	    z[id] = cuCmulf(gamma, t);
+	  else
+	    z[id] = cuCfmafmulf(gamma, t, cuCmulf(delta,z[id]));
+	  y[id] = t;
+	}
+}
+
+
+void spgpuCabgdxyz(spgpuHandle_t handle,
+		   int n,
+		   cuFloatComplex  alpha,
+		   cuFloatComplex  beta,
+		   cuFloatComplex  gamma,
+		   cuFloatComplex  delta,
+		   __device cuFloatComplex * x,
+		   __device cuFloatComplex * y,
+		   __device cuFloatComplex  *z)
+{
+	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+	dim3 block(BLOCK_SIZE);
+	num_mp         = getGPUMultiProcessors();
+	max_threads_mp = getGPUMaxThreadsPerMP();
+	num_blocks_mp  = max_threads_mp/BLOCK_SIZE;
+	num_blocks     = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
+
+	spgpuCabgdxyz_krn<<<grid, block, 0, handle->currentStream>>>(n, alpha, beta, gamma, delta,
+								   x, y, z);
+}
+
--- a/cuda/spgpu/kernels/sabgdxyz.cu
+++ b/cuda/spgpu/kernels/sabgdxyz.cu
@ -0,0 +1,79 @@
+/*
+ * spGPU - Sparse matrices on GPU library.
+ * 
+ * Copyright (C) 2010 - 2012 
+ *     Davide Barbieri - University of Rome Tor Vergata
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "cudadebug.h"
+#include "cudalang.h"
+#include <cuda_runtime.h>
+
+extern "C"
+{
+#include "core.h"
+#include "vector.h"
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+}
+
+
+#include "debug.h"
+
+#define BLOCK_SIZE 512
+
+__global__ void spgpuSabgdxyz_krn(int n, float alpha, float beta, float gamma, float delta,
+				  float* x, float *y, float *z)
+{
+	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
+	unsigned int gridSize = blockDim.x * gridDim.x;
+	float t;
+	for ( ; id < n; id +=gridSize)
+		//if (id,n) 
+	{
+
+	  if (beta == 0.0)
+	    t = PREC_FMUL(alpha,x[id]);
+	  else
+	    t = PREC_FADD(PREC_FMUL(alpha, x[id]), PREC_FMUL(beta,y[id]));
+	  if (delta == 0.0)
+	    z[id] = gamma * t;
+	  else
+	    z[id] = PREC_FADD(PREC_FMUL(gamma, t), PREC_FMUL(delta,z[id]));
+	  y[id] = t;
+	}
+}
+
+
+void spgpuSabgdxyz(spgpuHandle_t handle,
+		   int n,
+		   float alpha,
+		   float beta,
+		   float gamma,
+		   float delta,
+		   __device float* x,
+		   __device float* y,
+		   __device float *z)
+{
+	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+	dim3 block(BLOCK_SIZE);
+	num_mp         = getGPUMultiProcessors();
+	max_threads_mp = getGPUMaxThreadsPerMP();
+	num_blocks_mp  = max_threads_mp/BLOCK_SIZE;
+	num_blocks     = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
+
+	spgpuSabgdxyz_krn<<<grid, block, 0, handle->currentStream>>>(n, alpha, beta, gamma, delta,
+								   x, y, z);
+}
+
--- a/cuda/spgpu/kernels/zabgdxyz.cu
+++ b/cuda/spgpu/kernels/zabgdxyz.cu
@ -0,0 +1,80 @@
+/*
+ * spGPU - Sparse matrices on GPU library.
+ * 
+ * Copyright (C) 2010 - 2012 
+ *     Davide Barbieri - University of Rome Tor Vergata
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 3 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "cudadebug.h"
+#include "cudalang.h"
+#include <cuda_runtime.h>
+
+extern "C"
+{
+#include "core.h"
+#include "vector.h"
+  int getGPUMultiProcessors();
+  int getGPUMaxThreadsPerMP();
+}
+
+
+#include "debug.h"
+
+#define BLOCK_SIZE 512
+
+__global__ void spgpuZabgdxyz_krn(int n, cuDoubleComplex  alpha, cuDoubleComplex  beta,
+				  cuDoubleComplex  gamma, cuDoubleComplex  delta,
+				  cuDoubleComplex * x, cuDoubleComplex  *y, cuDoubleComplex  *z)
+{
+	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
+	unsigned int gridSize = blockDim.x * gridDim.x;
+	cuDoubleComplex  t;
+	for ( ; id < n; id +=gridSize)
+		//if (id,n) 
+	{
+
+	  if (cuDoubleComplex_isZero(beta)) 
+	    t = cuCmul(alpha,x[id]);
+	  else
+	    t = cuCfma(alpha, x[id], cuCmul(beta,y[id]));
+	  if (cuDoubleComplex_isZero(delta))
+	    z[id] = cuCmul(gamma, t);
+	  else
+	    z[id] = cuCfma(gamma, t, cuCmul(delta,z[id]));
+	  y[id] = t;
+	}
+}
+
+
+void spgpuZabgdxyz(spgpuHandle_t handle,
+		   int n,
+		   cuDoubleComplex  alpha,
+		   cuDoubleComplex  beta,
+		   cuDoubleComplex  gamma,
+		   cuDoubleComplex  delta,
+		   __device cuDoubleComplex * x,
+		   __device cuDoubleComplex * y,
+		   __device cuDoubleComplex  *z)
+{
+	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
+	int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
+	dim3 block(BLOCK_SIZE);
+	num_mp         = getGPUMultiProcessors();
+	max_threads_mp = getGPUMaxThreadsPerMP();
+	num_blocks_mp  = max_threads_mp/BLOCK_SIZE;
+	num_blocks     = num_blocks_mp*num_mp;
+	dim3 grid(num_blocks);
+
+	spgpuZabgdxyz_krn<<<grid, block, 0, handle->currentStream>>>(n, alpha, beta, gamma, delta,
+								   x, y, z);
+}
+
--- a/cuda/spgpu/vector.h
+++ b/cuda/spgpu/vector.h
@ -181,6 +181,18 @@ void spgpuSaxpby(spgpuHandle_t handle,
 	float alpha,
 	__device float* x);

+
+void spgpuSabgdxyz(spgpuHandle_t handle,
+		   int n,
+		   float alpha,
+		   float beta,
+		   float gamma,
+		   float delta,
+		   __device float* x,
+		   __device float *y,
+		   __device float *z)
+;
+	
 /** 
 * \fn void spgpuSmaxpby(spgpuHandle_t handle, __device float *z, int n, float beta, __device float *y, float alpha, __device float* x, int count, int pitch)
 * Computes the single precision z = beta * y + alpha * x of x and y multivectors. z could be exactly x or y (without offset) or another vector.
@ -755,6 +767,18 @@ void spgpuCaxpby(spgpuHandle_t handle,
 	cuFloatComplex alpha,
 	__device cuFloatComplex* x);

+
+void spgpuCabgdxyz(spgpuHandle_t handle,
+		   int n,
+		   cuFloatComplex alpha,
+		   cuFloatComplex beta,
+		   cuFloatComplex gamma,
+		   cuFloatComplex delta,
+		   __device cuFloatComplex* x,
+		   __device cuFloatComplex *y,
+		   __device cuFloatComplex *z)
+;
+	
 /** 
 * \fn void spgpuCmaxpby(spgpuHandle_t handle, __device cuFloatComplex *z, int n, cuFloatComplex beta, __device cuFloatComplex *y, cuFloatComplex alpha, __device cuFloatComplex* x, int count, int pitch)
 * Computes the single precision complex z = beta * y + alpha * x of x and y multivectors. z could be exactly x or y (without offset) or another vector.
@ -1034,6 +1058,18 @@ void spgpuZaxpby(spgpuHandle_t handle,
 	cuDoubleComplex alpha,
 	__device cuDoubleComplex* x);
 	
+
+void spgpuZabgdxyz(spgpuHandle_t handle,
+		   int n,
+		   cuDoubleComplex alpha,
+		   cuDoubleComplex beta,
+		   cuDoubleComplex gamma,
+		   cuDoubleComplex delta,
+		   __device cuDoubleComplex* x,
+		   __device cuDoubleComplex *y,
+		   __device cuDoubleComplex *z)
+;
+	
 /** 
 * \fn void spgpuZmaxpby(spgpuHandle_t handle, __device cuDoubleComplex *z, int n, cuDoubleComplex beta, __device cuDoubleComplex *y, cuDoubleComplex alpha, __device cuDoubleComplex* x, int count, int pitch)
 * Computes the double precision complex z = beta * y + alpha * x of x and y multivectors. z could be exactly x or y (without offset) or another vector.
--- a/cuda/svectordev.c
+++ b/cuda/svectordev.c
@ -241,6 +241,23 @@ int axpbyMultiVecDeviceFloat(int n,float alpha, void* devMultiVecX,
  return(i);
 }

+int abgdxyzMultiVecDeviceFloat(int n,float alpha,float beta, float gamma, float delta, 
+				void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ)
+{ int j=0, i=0;
+  int pitch = 0;
+  struct MultiVectDevice *devVecX = (struct MultiVectDevice *) devMultiVecX;
+  struct MultiVectDevice *devVecY = (struct MultiVectDevice *) devMultiVecY;
+  struct MultiVectDevice *devVecZ = (struct MultiVectDevice *) devMultiVecZ;
+  spgpuHandle_t handle=psb_cudaGetHandle();
+  pitch = devVecY->pitch_;
+  if ((n > devVecY->size_) || (n>devVecX->size_ )) 
+    return SPGPU_UNSUPPORTED;
+
+  spgpuSabgdxyz(handle,n, alpha,beta,gamma,delta, 
+	      (float*)devVecX->v_,(float*) devVecY->v_,(float*) devVecZ->v_);
+  return(i);
+}
+ 
 int axyMultiVecDeviceFloat(int n, float alpha, void *deviceVecA, void *deviceVecB)
 { int i = 0; 
  struct MultiVectDevice *devVecA = (struct MultiVectDevice *) deviceVecA;
--- a/cuda/svectordev.h
+++ b/cuda/svectordev.h
@ -67,6 +67,8 @@ int asumMultiVecDeviceFloat(float* y_res, int n, void* devVecA);
 int dotMultiVecDeviceFloat(float* y_res, int n, void* devVecA, void* devVecB);

 int axpbyMultiVecDeviceFloat(int n, float alpha, void* devVecX, float beta, void* devVecY);
+int abgdxyzMultiVecDeviceFloat(int n,float alpha,float beta, float gamma, float delta, 
+			       void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ);
 int axyMultiVecDeviceFloat(int n, float alpha, void *deviceVecA, void *deviceVecB);
 int axybzMultiVecDeviceFloat(int n, float alpha, void *deviceVecA,
 			      void *deviceVecB, float beta, void *deviceVecZ);
--- a/cuda/zvectordev.c
+++ b/cuda/zvectordev.c
@ -234,6 +234,24 @@ int dotMultiVecDeviceDoubleComplex(cuDoubleComplex* y_res, int n, void* devMulti
  return(i);
 }

+int abgdxyzMultiVecDeviceDoubleComplex(int n,cuDoubleComplex  alpha,
+				       cuDoubleComplex  beta, cuDoubleComplex  gamma, cuDoubleComplex  delta, 
+				       void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ)
+{ int j=0, i=0;
+  int pitch = 0;
+  struct MultiVectDevice *devVecX = (struct MultiVectDevice *) devMultiVecX;
+  struct MultiVectDevice *devVecY = (struct MultiVectDevice *) devMultiVecY;
+  struct MultiVectDevice *devVecZ = (struct MultiVectDevice *) devMultiVecZ;
+  spgpuHandle_t handle=psb_cudaGetHandle();
+  pitch = devVecY->pitch_;
+  if ((n > devVecY->size_) || (n>devVecX->size_ )) 
+    return SPGPU_UNSUPPORTED;
+
+  spgpuZabgdxyz(handle,n, alpha,beta,gamma,delta, 
+	      (cuDoubleComplex *)devVecX->v_,(cuDoubleComplex *) devVecY->v_,(cuDoubleComplex *) devVecZ->v_);
+  return(i);
+}
+ 
 int axpbyMultiVecDeviceDoubleComplex(int n,cuDoubleComplex alpha, void* devMultiVecX, 
 			      cuDoubleComplex beta, void* devMultiVecY)
 { int j=0, i=0;
--- a/cuda/zvectordev.h
+++ b/cuda/zvectordev.h
@ -77,6 +77,9 @@ int dotMultiVecDeviceDoubleComplex(cuDoubleComplex* y_res, int n,

 int axpbyMultiVecDeviceDoubleComplex(int n, cuDoubleComplex alpha, void* devVecX, 
 				     cuDoubleComplex beta, void* devVecY);
+int abgdxyzMultiVecDeviceDoubleComplex(int n,cuDoubleComplex  alpha,
+				       cuDoubleComplex  beta, cuDoubleComplex  gamma, cuDoubleComplex  delta, 
+				       void* devMultiVecX, void* devMultiVecY, void* devMultiVecZ);
 int axyMultiVecDeviceDoubleComplex(int n, cuDoubleComplex alpha, 
 				   void *deviceVecA, void *deviceVecB);
 int axybzMultiVecDeviceDoubleComplex(int n, cuDoubleComplex alpha, void *deviceVecA,