Reuse calls to getDeviceProperties done at init time

nond-rep
Salvatore Filippone 11 months ago
parent 1ba8dfc7b7
commit 35d68aa4e3

@ -22,6 +22,9 @@ extern "C"
{ {
#include "core.h" #include "core.h"
#include "vector.h" #include "vector.h"
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
//#include "cuda_util.h"
} }
@ -29,6 +32,8 @@ extern "C"
#define BLOCK_SIZE 512 #define BLOCK_SIZE 512
#if 1
__global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, cuFloatComplex *y, cuFloatComplex alpha, cuFloatComplex* x) __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, cuFloatComplex *y, cuFloatComplex alpha, cuFloatComplex* x)
{ {
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@ -51,7 +56,6 @@ __global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, c
} }
} }
#if 1
void spgpuCaxpby(spgpuHandle_t handle, void spgpuCaxpby(spgpuHandle_t handle,
__device cuFloatComplex *z, __device cuFloatComplex *z,
int n, int n,
@ -63,10 +67,8 @@ void spgpuCaxpby(spgpuHandle_t handle,
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE; int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks; int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE); dim3 block(BLOCK_SIZE);
cudaDeviceProp deviceProp; num_mp = getGPUMultiProcessors();
cudaGetDeviceProperties(&deviceProp, 0); max_threads_mp = getGPUMaxThreadsPerMP();
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_blocks_mp = max_threads_mp/BLOCK_SIZE; num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp; num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks); dim3 grid(num_blocks);
@ -75,6 +77,23 @@ void spgpuCaxpby(spgpuHandle_t handle,
} }
#else #else
__global__ void spgpuCaxpby_krn(cuFloatComplex *z, int n, cuFloatComplex beta, cuFloatComplex *y, cuFloatComplex alpha, cuFloatComplex* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
if (id < n)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (cuFloatComplex_isZero(beta))
z[id] = cuCmulf(alpha,x[id]);
else
z[id] = cuCfmaf(beta, y[id], cuCmulf(alpha, x[id]));
}
}
void spgpuCaxpby_(spgpuHandle_t handle, void spgpuCaxpby_(spgpuHandle_t handle,
__device cuFloatComplex *z, __device cuFloatComplex *z,
int n, int n,

@ -22,6 +22,8 @@ extern "C"
{ {
#include "core.h" #include "core.h"
#include "vector.h" #include "vector.h"
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
} }
@ -65,10 +67,8 @@ void spgpuDabgdxyz(spgpuHandle_t handle,
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE; int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks; int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE); dim3 block(BLOCK_SIZE);
cudaDeviceProp deviceProp; num_mp = getGPUMultiProcessors();
cudaGetDeviceProperties(&deviceProp, 0); max_threads_mp = getGPUMaxThreadsPerMP();
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_blocks_mp = max_threads_mp/BLOCK_SIZE; num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp; num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks); dim3 grid(num_blocks);

@ -22,6 +22,9 @@ extern "C"
{ {
#include "core.h" #include "core.h"
#include "vector.h" #include "vector.h"
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
//#include "cuda_util.h"
} }
@ -29,6 +32,8 @@ extern "C"
#define BLOCK_SIZE 512 #define BLOCK_SIZE 512
#if 1
__global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double alpha, double* x) __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double alpha, double* x)
{ {
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@ -36,23 +41,17 @@ __global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double
if (beta == 0.0) { if (beta == 0.0) {
for ( ; id < n; id +=gridSize) for ( ; id < n; id +=gridSize)
{ {
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
z[id] = PREC_DMUL(alpha,x[id]); z[id] = PREC_DMUL(alpha,x[id]);
} }
} else { } else {
for ( ; id < n; id +=gridSize) for ( ; id < n; id +=gridSize)
{ {
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id])); z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id]));
} }
} }
} }
#if 1
void spgpuDaxpby(spgpuHandle_t handle, void spgpuDaxpby(spgpuHandle_t handle,
__device double *z, __device double *z,
int n, int n,
@ -64,10 +63,8 @@ void spgpuDaxpby(spgpuHandle_t handle,
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE; int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks; int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE); dim3 block(BLOCK_SIZE);
cudaDeviceProp deviceProp; num_mp = getGPUMultiProcessors();
cudaGetDeviceProperties(&deviceProp, 0); max_threads_mp = getGPUMaxThreadsPerMP();
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_blocks_mp = max_threads_mp/BLOCK_SIZE; num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp; num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks); dim3 grid(num_blocks);
@ -75,6 +72,23 @@ void spgpuDaxpby(spgpuHandle_t handle,
spgpuDaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x); spgpuDaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
} }
#else #else
__global__ void spgpuDaxpby_krn(double *z, int n, double beta, double *y, double alpha, double* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
if (id < n)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (beta == 0.0)
z[id] = PREC_DMUL(alpha,x[id]);
else
z[id] = PREC_DADD(PREC_DMUL(alpha, x[id]), PREC_DMUL(beta,y[id]));
}
}
void spgpuDaxpby_(spgpuHandle_t handle, void spgpuDaxpby_(spgpuHandle_t handle,
__device double *z, __device double *z,
int n, int n,

@ -20,6 +20,9 @@ extern "C"
{ {
#include "core.h" #include "core.h"
#include "vector.h" #include "vector.h"
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
//#include "cuda_util.h"
} }
@ -27,6 +30,8 @@ extern "C"
#define BLOCK_SIZE 512 #define BLOCK_SIZE 512
#if 1
__global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alpha, float* x) __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alpha, float* x)
{ {
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@ -49,8 +54,6 @@ __global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alp
} }
} }
#if 1
void spgpuSaxpby(spgpuHandle_t handle, void spgpuSaxpby(spgpuHandle_t handle,
__device float *z, __device float *z,
int n, int n,
@ -62,17 +65,35 @@ void spgpuSaxpby(spgpuHandle_t handle,
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE; int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks; int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE); dim3 block(BLOCK_SIZE);
cudaDeviceProp deviceProp; num_mp = getGPUMultiProcessors();
cudaGetDeviceProperties(&deviceProp, 0); max_threads_mp = getGPUMaxThreadsPerMP();
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_blocks_mp = max_threads_mp/BLOCK_SIZE; num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp; num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks); dim3 grid(num_blocks);
spgpuSaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x); spgpuSaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
} }
#else #else
__global__ void spgpuSaxpby_krn(float *z, int n, float beta, float *y, float alpha, float* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
if (id < n)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (beta == 0.0f)
z[id] = PREC_FMUL(alpha,x[id]);
else
z[id] = PREC_FADD(PREC_FMUL(alpha, x[id]), PREC_FMUL(beta,y[id]));
}
}
void spgpuSaxpby_(spgpuHandle_t handle, void spgpuSaxpby_(spgpuHandle_t handle,
__device float *z, __device float *z,
int n, int n,

@ -23,6 +23,9 @@ extern "C"
{ {
#include "core.h" #include "core.h"
#include "vector.h" #include "vector.h"
int getGPUMultiProcessors();
int getGPUMaxThreadsPerMP();
//#include "cuda_util.h"
} }
@ -30,6 +33,7 @@ extern "C"
#define BLOCK_SIZE 512 #define BLOCK_SIZE 512
#if 1
__global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* x) __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* x)
{ {
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x; int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
@ -52,7 +56,6 @@ __global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta,
} }
} }
#if 1
void spgpuZaxpby(spgpuHandle_t handle, void spgpuZaxpby(spgpuHandle_t handle,
__device cuDoubleComplex *z, __device cuDoubleComplex *z,
int n, int n,
@ -64,10 +67,8 @@ void spgpuZaxpby(spgpuHandle_t handle,
int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE; int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;
int num_mp, max_threads_mp, num_blocks_mp, num_blocks; int num_mp, max_threads_mp, num_blocks_mp, num_blocks;
dim3 block(BLOCK_SIZE); dim3 block(BLOCK_SIZE);
cudaDeviceProp deviceProp; num_mp = getGPUMultiProcessors();
cudaGetDeviceProperties(&deviceProp, 0); max_threads_mp = getGPUMaxThreadsPerMP();
num_mp = deviceProp.multiProcessorCount;
max_threads_mp = deviceProp.maxThreadsPerMultiProcessor;
num_blocks_mp = max_threads_mp/BLOCK_SIZE; num_blocks_mp = max_threads_mp/BLOCK_SIZE;
num_blocks = num_blocks_mp*num_mp; num_blocks = num_blocks_mp*num_mp;
dim3 grid(num_blocks); dim3 grid(num_blocks);
@ -75,6 +76,23 @@ void spgpuZaxpby(spgpuHandle_t handle,
spgpuZaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x); spgpuZaxpby_krn<<<grid, block, 0, handle->currentStream>>>(z, n, beta, y, alpha, x);
} }
#else #else
__global__ void spgpuZaxpby_krn(cuDoubleComplex *z, int n, cuDoubleComplex beta, cuDoubleComplex *y, cuDoubleComplex alpha, cuDoubleComplex* x)
{
int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;
if (id < n)
{
// Since z, x and y are accessed with the same offset by the same thread,
// and the write to z follows the x and y read, x, y and z can share the same base address (in-place computing).
if (cuDoubleComplex_isZero(beta))
z[id] = cuCmul(alpha,x[id]);
else
z[id] = cuCfma(alpha, x[id], cuCmul(beta,y[id]));
}
}
void spgpuZaxpby_(spgpuHandle_t handle, void spgpuZaxpby_(spgpuHandle_t handle,
__device cuDoubleComplex *z, __device cuDoubleComplex *z,
int n, int n,

Loading…
Cancel
Save