From 48455190ecd4e78a43be5a5d6f9c9749cce606a2 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Tue, 5 Mar 2024 13:57:03 +0100 Subject: [PATCH] Add GPU version of XYZW --- cuda/spgpu/kernels/cxyzw.cu | 78 +++++++++++++++++++++++++++++++++++++ cuda/spgpu/kernels/dxyzw.cu | 78 +++++++++++++++++++++++++++++++++++++ cuda/spgpu/kernels/sxyzw.cu | 78 +++++++++++++++++++++++++++++++++++++ cuda/spgpu/kernels/zxyzw.cu | 78 +++++++++++++++++++++++++++++++++++++ 4 files changed, 312 insertions(+) create mode 100644 cuda/spgpu/kernels/cxyzw.cu create mode 100644 cuda/spgpu/kernels/dxyzw.cu create mode 100644 cuda/spgpu/kernels/sxyzw.cu create mode 100644 cuda/spgpu/kernels/zxyzw.cu diff --git a/cuda/spgpu/kernels/cxyzw.cu b/cuda/spgpu/kernels/cxyzw.cu new file mode 100644 index 00000000..d2b332b1 --- /dev/null +++ b/cuda/spgpu/kernels/cxyzw.cu @@ -0,0 +1,78 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2012 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "cudadebug.h" +#include "cudalang.h" +#include + +extern "C" +{ +#include "core.h" +#include "vector.h" + int getGPUMultiProcessors(); + int getGPUMaxThreadsPerMP(); +} + + +#include "debug.h" + +#define BLOCK_SIZE 512 + +__global__ void spgpuCxyzw_krn(int n, cuFloatComplex a, cuFloatComplex b, + cuFloatComplex c, cuFloatComplex d, + cuFloatComplex e, cuFloatComplex f, + cuFloatComplex * x, cuFloatComplex *y, + cuFloatComplex *z, cuFloatComplex *w) +{ + int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; + unsigned int gridSize = blockDim.x * gridDim.x; + cuFloatComplex ty, tz; + for ( ; id < n; id +=gridSize) + //if (id,n) + { + + ty = cuCfmaf(a, x[id], cuCmulf(b,y[id])); + tz = cuCfmaf(c, ty, cuCmulf(d,z[id])); + w[id] = cuCfmaf(e, tz, cuCmulf(f,w[id])); + y[id] = ty; + z[id] = tz; + } +} + + +void spgpuCxyzw(spgpuHandle_t handle, + int n, + cuFloatComplex a, cuFloatComplex b, + cuFloatComplex c, cuFloatComplex d, + cuFloatComplex e, cuFloatComplex f, + __device cuFloatComplex * x, + __device cuFloatComplex * y, + __device cuFloatComplex * z, + __device cuFloatComplex *w) +{ + int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE; + int num_mp, max_threads_mp, num_blocks_mp, num_blocks; + dim3 block(BLOCK_SIZE); + num_mp = getGPUMultiProcessors(); + max_threads_mp = getGPUMaxThreadsPerMP(); + num_blocks_mp = max_threads_mp/BLOCK_SIZE; + num_blocks = num_blocks_mp*num_mp; + dim3 grid(num_blocks); + + spgpuCxyzw_krn<<currentStream>>>(n, a,b,c,d,e,f, + x, y, z,w); +} + diff --git a/cuda/spgpu/kernels/dxyzw.cu b/cuda/spgpu/kernels/dxyzw.cu new file mode 100644 index 00000000..afd36651 --- /dev/null +++ b/cuda/spgpu/kernels/dxyzw.cu @@ -0,0 +1,78 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2012 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "cudadebug.h" +#include "cudalang.h" +#include + +extern "C" +{ +#include "core.h" +#include "vector.h" + int getGPUMultiProcessors(); + int getGPUMaxThreadsPerMP(); +} + + +#include "debug.h" + +#define BLOCK_SIZE 512 + +__global__ void spgpuDxyzw_krn(int n, double a, double b, + double c, double d, + double e, double f, + double * x, double *y, + double *z, double *w) +{ + int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; + unsigned int gridSize = blockDim.x * gridDim.x; + double ty, tz; + for ( ; id < n; id +=gridSize) + //if (id,n) + { + + ty = PREC_DADD(PREC_DADD(a, x[id]), PREC_DMUL(b,y[id])); + tz = PREC_DADD(PREC_DADD(c, ty), PREC_DMUL(d,z[id])); + w[id] = PREC_DADD(PREC_DADD(e, tz), PREC_DMUL(f,w[id])); + y[id] = ty; + z[id] = tz; + } +} + + +void spgpuDxyzw(spgpuHandle_t handle, + int n, + double a, double b, + double c, double d, + double e, double f, + __device double * x, + __device double * y, + __device double * z, + __device double *w) +{ + int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE; + int num_mp, max_threads_mp, num_blocks_mp, num_blocks; + dim3 block(BLOCK_SIZE); + num_mp = getGPUMultiProcessors(); + max_threads_mp = getGPUMaxThreadsPerMP(); + num_blocks_mp = max_threads_mp/BLOCK_SIZE; + num_blocks = num_blocks_mp*num_mp; + dim3 grid(num_blocks); + + spgpuDxyzw_krn<<currentStream>>>(n, a,b,c,d,e,f, + x, y, z,w); +} + diff --git a/cuda/spgpu/kernels/sxyzw.cu b/cuda/spgpu/kernels/sxyzw.cu new file mode 100644 index 00000000..9cedd02f --- /dev/null +++ b/cuda/spgpu/kernels/sxyzw.cu @@ -0,0 +1,78 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2012 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "cudadebug.h" +#include "cudalang.h" +#include + +extern "C" +{ +#include "core.h" +#include "vector.h" + int getGPUMultiProcessors(); + int getGPUMaxThreadsPerMP(); +} + + +#include "debug.h" + +#define BLOCK_SIZE 512 + +__global__ void spgpuSxyzw_krn(int n, float a, float b, + float c, float d, + float e, float f, + float * x, float *y, + float *z, float *w) +{ + int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; + unsigned int gridSize = blockDim.x * gridDim.x; + float ty, tz; + for ( ; id < n; id +=gridSize) + //if (id,n) + { + + ty = PREC_FADD(PREC_FMUL(a, x[id]), PREC_FMUL(b,y[id])); + tz = PREC_FADD(PREC_FMUL(c, ty), PREC_FMUL(d,z[id])); + w[id] = PREC_FADD(PREC_FMUL(e, tz), PREC_FMUL(f,w[id])); + y[id] = ty; + z[id] = tz; + } +} + + +void spgpuSxyzw(spgpuHandle_t handle, + int n, + float a, float b, + float c, float d, + float e, float f, + __device float * x, + __device float * y, + __device float * z, + __device float *w) +{ + int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE; + int num_mp, max_threads_mp, num_blocks_mp, num_blocks; + dim3 block(BLOCK_SIZE); + num_mp = getGPUMultiProcessors(); + max_threads_mp = getGPUMaxThreadsPerMP(); + num_blocks_mp = max_threads_mp/BLOCK_SIZE; + num_blocks = num_blocks_mp*num_mp; + dim3 grid(num_blocks); + + spgpuSxyzw_krn<<currentStream>>>(n, a,b,c,d,e,f, + x, y, z,w); +} + diff --git a/cuda/spgpu/kernels/zxyzw.cu b/cuda/spgpu/kernels/zxyzw.cu new file mode 100644 index 00000000..7a61edee --- /dev/null +++ b/cuda/spgpu/kernels/zxyzw.cu @@ -0,0 +1,78 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2012 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "cudadebug.h" +#include "cudalang.h" +#include + +extern "C" +{ +#include "core.h" +#include "vector.h" + int getGPUMultiProcessors(); + int getGPUMaxThreadsPerMP(); +} + + +#include "debug.h" + +#define BLOCK_SIZE 512 + +__global__ void spgpuZxyzw_krn(int n, cuDoubleComplex a, cuDoubleComplex b, + cuDoubleComplex c, cuDoubleComplex d, + cuDoubleComplex e, cuDoubleComplex f, + cuDoubleComplex * x, cuDoubleComplex *y, + cuDoubleComplex *z, cuDoubleComplex *w) +{ + int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; + unsigned int gridSize = blockDim.x * gridDim.x; + cuDoubleComplex ty, tz; + for ( ; id < n; id +=gridSize) + //if (id,n) + { + + ty = cuCfma(a, x[id], cuCmul(b,y[id])); + tz = cuCfma(c, ty, cuCmul(d,z[id])); + w[id] = cuCfma(e, tz, cuCmul(f,w[id])); + y[id] = ty; + z[id] = tz; + } +} + + +void spgpuZxyzw(spgpuHandle_t handle, + int n, + cuDoubleComplex a, cuDoubleComplex b, + cuDoubleComplex c, cuDoubleComplex d, + cuDoubleComplex e, cuDoubleComplex f, + __device cuDoubleComplex * x, + __device cuDoubleComplex * y, + __device cuDoubleComplex * z, + __device cuDoubleComplex *w) +{ + int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE; + int num_mp, max_threads_mp, num_blocks_mp, num_blocks; + dim3 block(BLOCK_SIZE); + num_mp = getGPUMultiProcessors(); + max_threads_mp = getGPUMaxThreadsPerMP(); + num_blocks_mp = max_threads_mp/BLOCK_SIZE; + num_blocks = num_blocks_mp*num_mp; + dim3 grid(num_blocks); + + spgpuZxyzw_krn<<currentStream>>>(n, a,b,c,d,e,f, + x, y, z,w); +} +