Reuse calls to getDeviceProperties done at init time

nond-rep
Salvatore Filippone 11 months ago
parent 1ba8dfc7b7
commit 35d68aa4e3

@ -22,6 +22,9 @@ extern "C"
{
#include "core.h"
#include "vector.h"
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
//#include "cuda_util.h"
}
@ -29,6 +32,8 @@ extern "C"
#define BLOCK_SIZE 512
#if 1
__global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, cuFloatComplex *y, cuFloatComplex alpha, cuFloatComplex* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@ -51,7 +56,6 @@ __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, c
}
}
#if 1
void spgpuCaxpby(spgpuHandle_t handle,
__device cuFloatComplex *z,
int n,
@ -63,10 +67,8 @@ void spgpuCaxpby(spgpuHandle_t handle,
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_mp = getGPUMultiProcessors();
max_threads_mp = getGPUMaxThreadsPerMP();
num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks);
@ -75,6 +77,23 @@ void spgpuCaxpby(spgpuHandle_t handle,
}
#else
__global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, cuFloatComplex *y, cuFloatComplex alpha, cuFloatComplex* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
if (id < n)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (cuFloatComplex_isZero(beta))
z[id] = cuCmulf(alpha,x[id]);
else
z[id] = cuCfmaf(beta, y[id], cuCmulf(alpha, x[id]));
}
}
void spgpuCaxpby_(spgpuHandle_t handle,
__device cuFloatComplex *z,
int n,

@ -22,6 +22,8 @@ extern "C"
{
#include "core.h"
#include "vector.h"
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
}
@ -65,10 +67,8 @@ void spgpuDabgdxyz(spgpuHandle_t handle,
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_mp = getGPUMultiProcessors();
max_threads_mp = getGPUMaxThreadsPerMP();
num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks);

@ -22,6 +22,9 @@ extern "C"
{
#include "core.h"
#include "vector.h"
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
//#include "cuda_util.h"
}
@ -29,6 +32,8 @@ extern "C"
#define BLOCK_SIZE 512
#if 1
__global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double alpha, double* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@ -36,23 +41,17 @@ __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double
if (beta == 0.0) {
for ( ; id < n; id +=gridSize)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
z[id] = PREC_DMUL(alpha,x[id]);
}
} else {
for ( ; id < n; id +=gridSize)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id]));
}
}
}
#if 1
void spgpuDaxpby(spgpuHandle_t handle,
__device double *z,
int n,
@ -64,10 +63,8 @@ void spgpuDaxpby(spgpuHandle_t handle,
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_mp = getGPUMultiProcessors();
max_threads_mp = getGPUMaxThreadsPerMP();
num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks);
@ -75,6 +72,23 @@ void spgpuDaxpby(spgpuHandle_t handle,
spgpuDaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
}
#else
__global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double alpha, double* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
if (id < n)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (beta == 0.0)
z[id] = PREC_DMUL(alpha,x[id]);
else
z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id]));
}
}
void spgpuDaxpby_(spgpuHandle_t handle,
__device double *z,
int n,

@ -20,6 +20,9 @@ extern "C"
{
#include "core.h"
#include "vector.h"
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
//#include "cuda_util.h"
}
@ -27,6 +30,8 @@ extern "C"
#define BLOCK_SIZE 512
#if 1
__global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alpha, float* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@ -49,8 +54,6 @@ __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alp
}
}
#if 1
void spgpuSaxpby(spgpuHandle_t handle,
__device float *z,
int n,
@ -62,17 +65,35 @@ void spgpuSaxpby(spgpuHandle_t handle,
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_mp = getGPUMultiProcessors();
max_threads_mp = getGPUMaxThreadsPerMP();
num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks);
spgpuSaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
}
#else
__global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alpha, float* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
if (id < n)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (beta == 0.0f)
z[id] = PREC_FMUL(alpha,x[id]);
else
z[id] = PREC_FADD(PREC_FMUL(alpha, x[id]), PREC_FMUL(beta,y[id]));
}
}
void spgpuSaxpby_(spgpuHandle_t handle,
__device float *z,
int n,

@ -23,6 +23,9 @@ extern "C"
{
#include "core.h"
#include "vector.h"
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
//#include "cuda_util.h"
}
@ -30,6 +33,7 @@ extern "C"
#define BLOCK_SIZE 512
#if 1
__global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@ -52,7 +56,6 @@ __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta,
}
}
#if 1
void spgpuZaxpby(spgpuHandle_t handle,
__device cuDoubleComplex *z,
int n,
@ -64,10 +67,8 @@ void spgpuZaxpby(spgpuHandle_t handle,
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_mp = getGPUMultiProcessors();
max_threads_mp = getGPUMaxThreadsPerMP();
num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks);
@ -75,6 +76,23 @@ void spgpuZaxpby(spgpuHandle_t handle,
spgpuZaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
}
#else
__global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
if (id < n)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (cuDoubleComplex_isZero(beta))
z[id] = cuCmul(alpha,x[id]);
else
z[id] = cuCfma(alpha, x[id], cuCmul(beta,y[id]));
}
}
void spgpuZaxpby_(spgpuHandle_t handle,
__device cuDoubleComplex *z,
int n,

Loading…
Cancel
Save