You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
psblas3/cuda/cuda_util.c

808 lines
19 KiB
C

/* Parallel Sparse BLAS GPU plugin */
/* (C) Copyright 2013 */
/* Salvatore Filippone */
/* Alessandro Fanfarillo */
/* Redistribution and use in source and binary forms, with or without */
/* modification, are permitted provided that the following conditions */
/* are met: */
/* 1. Redistributions of source code must retain the above copyright */
/* notice, this list of conditions and the following disclaimer. */
/* 2. Redistributions in binary form must reproduce the above copyright */
/* notice, this list of conditions, and the following disclaimer in the */
/* documentation and/or other materials provided with the distribution. */
/* 3. The name of the PSBLAS group or the names of its contributors may */
/* not be used to endorse or promote products derived from this */
/* software without specific written permission. */
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS */
/* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED */
/* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */
/* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS */
/* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR */
/* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF */
/* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS */
/* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */
/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) */
/* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
#include "cuda_util.h"
static int hasUVA=-1;
static struct cudaDeviceProp *prop=NULL;
static spgpuHandle_t psb_cuda_handle = NULL;
static cublasHandle_t psb_cublas_handle = NULL;
#if defined(TRACK_CUDA_MALLOC)
static long long total_cuda_mem = 0;
#endif
int allocRemoteBuffer(void** buffer, int count)
{
cudaError_t err = cudaMalloc(buffer, count);
#if defined(TRACK_CUDA_MALLOC)
total_cuda_mem += count;
fprintf(stderr,"Tracking CUDA allocRemoteBuffer for %ld bytes total %ld address %p\n",
count, total_cuda_mem, *buffer);
#endif
if (err == cudaSuccess)
{
return SPGPU_SUCCESS;
}
else
{
fprintf(stderr,"CUDA allocRemoteBuffer for %d bytes Error: %s \n",
count, cudaGetErrorString(err));
if(err == cudaErrorMemoryAllocation)
return SPGPU_OUTOFMEMORY;
else
return SPGPU_UNSPECIFIED;
}
}
int hostRegisterMapped(void *pointer, long size)
{
cudaError_t err = cudaHostRegister(pointer, size, cudaHostRegisterMapped);
if (err == cudaSuccess)
{
return SPGPU_SUCCESS;
}
else
{
fprintf(stderr,"CUDA hostRegisterMapped Error: %s\n", cudaGetErrorString(err));
if(err == cudaErrorMemoryAllocation)
return SPGPU_OUTOFMEMORY;
else
return SPGPU_UNSPECIFIED;
}
}
int getDevicePointer(void **d_p, void * h_p)
{
cudaError_t err = cudaHostGetDevicePointer(d_p,h_p,0);
if (err == cudaSuccess)
{
return SPGPU_SUCCESS;
}
else
{
fprintf(stderr,"CUDA getDevicePointer Error: %s\n", cudaGetErrorString(err));
if(err == cudaErrorMemoryAllocation)
return SPGPU_OUTOFMEMORY;
else
return SPGPU_UNSPECIFIED;
}
}
int registerMappedMemory(void *buffer, void **dp, int size)
{
//cudaError_t err = cudaHostAlloc(buffer,size,cudaHostAllocMapped);
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterMapped);
if (err == cudaSuccess) err = cudaHostGetDevicePointer(dp,buffer,0);
if (err == cudaSuccess)
{
err = cudaHostGetDevicePointer(dp,buffer,0);
if (err == cudaSuccess)
{
return SPGPU_SUCCESS;
}
else
{
fprintf(stderr,"CUDA registerMappedMemory Error: %s\n", cudaGetErrorString(err));
return SPGPU_UNSPECIFIED;
}
}
else
{
fprintf(stderr,"CUDA registerMappedMemory Error: %s\n", cudaGetErrorString(err));
if(err == cudaErrorMemoryAllocation)
return SPGPU_OUTOFMEMORY;
else
return SPGPU_UNSPECIFIED;
}
}
int allocMappedMemory(void **buffer, void **dp, int size)
{
cudaError_t err = cudaHostAlloc(buffer,size,cudaHostAllocMapped);
if (err == 0) err = cudaHostGetDevicePointer(dp,*buffer,0);
if (err == cudaSuccess)
{
return SPGPU_SUCCESS;
}
else
{
fprintf(stderr,"CUDA allocMappedMemory Error: %s\n", cudaGetErrorString(err));
if(err == cudaErrorMemoryAllocation)
return SPGPU_OUTOFMEMORY;
else
return SPGPU_UNSPECIFIED;
}
}
int unregisterMappedMemory(void *buffer)
{
//cudaError_t err = cudaHostAlloc(buffer,size,cudaHostAllocMapped);
cudaError_t err = cudaHostUnregister(buffer);
if (err == cudaSuccess)
{
return SPGPU_SUCCESS;
}
else
{
fprintf(stderr,"CUDA unregisterMappedMemory Error: %s\n", cudaGetErrorString(err));
if(err == cudaErrorMemoryAllocation)
return SPGPU_OUTOFMEMORY;
else
return SPGPU_UNSPECIFIED;
}
}
int writeRemoteBuffer(void* hostSrc, void* buffer, int count)
{
cudaError_t err = cudaMemcpy(buffer, hostSrc, count, cudaMemcpyHostToDevice);
if (err == cudaSuccess)
return SPGPU_SUCCESS;
else {
fprintf(stderr,"CUDA Error writeRemoteBuffer: %s %p %p %d\n",
cudaGetErrorString(err),buffer, hostSrc, count);
return SPGPU_UNSPECIFIED;
}
}
int readRemoteBuffer(void* hostDest, void* buffer, int count)
{
cudaError_t err1;
cudaError_t err;
#if 0
{
err1 =cudaGetLastError();
fprintf(stderr,"CUDA Error prior to readRemoteBuffer: %s %d\n",
cudaGetErrorString(err1),err1);
}
#endif
err = cudaMemcpy(hostDest, buffer, count, cudaMemcpyDeviceToHost);
if (err == cudaSuccess)
return SPGPU_SUCCESS;
else {
fprintf(stderr,"CUDA Error readRemoteBuffer: %s %p %p %d %d\n",
cudaGetErrorString(err),hostDest,buffer,count,err);
return SPGPU_UNSPECIFIED;
}
}
int freeRemoteBuffer(void* buffer)
{
cudaError_t err = cudaFree(buffer);
if (err == cudaSuccess)
return SPGPU_SUCCESS;
else {
fprintf(stderr,"CUDA Error freeRemoteBuffer: %s %p\n", cudaGetErrorString(err),buffer);
return SPGPU_UNSPECIFIED;
}
}
int gpuInit(int dev)
{
int count,err;
if ((err=cudaSetDeviceFlags(cudaDeviceMapHost))!=cudaSuccess)
fprintf(stderr,"Error On SetDeviceFlags: %d '%s'\n",err,cudaGetErrorString(err));
if ((prop=(struct cudaDeviceProp *) malloc(sizeof(struct cudaDeviceProp)))==NULL) {
fprintf(stderr,"CUDA Error gpuInit3: not malloced prop\n");
return SPGPU_UNSPECIFIED;
}
err = setDevice(dev);
if (err != cudaSuccess) {
fprintf(stderr,"CUDA Error gpuInit2: %s\n", cudaGetErrorString(err));
return SPGPU_UNSPECIFIED;
}
if (!psb_cublas_handle)
psb_cudaCreateCublasHandle();
hasUVA=getDeviceHasUVA();
FcusparseCreate();
return err;
}
void gpuClose()
{
cudaStream_t st1, st2;
if (! psb_cuda_handle)
st1=spgpuGetStream(psb_cuda_handle);
if (! psb_cublas_handle)
cublasGetStream(psb_cublas_handle,&st2);
FcusparseDestroy();
psb_cudaDestroyHandle();
if (st1 != st2)
psb_cudaDestroyCublasHandle();
free(prop);
prop=NULL;
hasUVA=-1;
}
int setDevice(int dev)
{
int count,err,idev;
err = cudaGetDeviceCount(&count);
if (err != cudaSuccess) {
fprintf(stderr,"CUDA Error setDevice: %s\n", cudaGetErrorString(err));
return SPGPU_UNSPECIFIED;
}
if ((0<=dev)&&(dev<count))
idev = dev;
else
idev = 0;
err = cudaSetDevice(idev);
if (err != cudaSuccess) {
fprintf(stderr,"CUDA Error gpuInit2: %s\n", cudaGetErrorString(err));
return SPGPU_UNSPECIFIED;
}
err = cudaGetDeviceProperties(prop,idev);
if (err != cudaSuccess) {
fprintf(stderr,"CUDA Error gpuInit4: %s\n", cudaGetErrorString(err));
return SPGPU_UNSPECIFIED;
}
return SPGPU_SUCCESS;
}
int getDevice()
{ int count;
cudaGetDevice(&count);
return(count);
}
int getDeviceHasUVA()
{ int count=0;
if (prop!=NULL)
count = prop->unifiedAddressing;
return(count);
}
int getGPUMultiProcessors()
{ int count=0;
if (prop!=NULL)
count = prop->multiProcessorCount;
return(count);
}
int getGPUMemoryBusWidth()
{ int count=0;
#if CUDART_VERSION >= 5000
if (prop!=NULL)
count = prop->memoryBusWidth;
#endif
return(count);
}
int getGPUMemoryClockRate()
{ int count=0;
#if CUDART_VERSION >= 5000
if (prop!=NULL)
count = prop->memoryClockRate;
#endif
return(count);
}
int getGPUWarpSize()
{ int count=0;
if (prop!=NULL)
count = prop->warpSize;
return(count);
}
int getGPUMaxThreadsPerBlock()
{ int count=0;
if (prop!=NULL)
count = prop->maxThreadsPerBlock;
return(count);
}
int getGPUMaxThreadsPerMP()
{ int count=0;
if (prop!=NULL)
count = prop->maxThreadsPerMultiProcessor;
return(count);
}
int getGPUMaxRegistersPerBlock()
{ int count=0;
if (prop!=NULL)
count = prop->regsPerBlock;
return(count);
}
void cpyGPUNameString(char *cstring)
{
*cstring='\0';
if (prop!=NULL)
strcpy(cstring,prop->name);
}
int DeviceHasUVA()
{
return(hasUVA == 1);
}
int getDeviceCount()
{ int count;
cudaError_t err;
err = cudaGetDeviceCount(&count);
if (err != cudaSuccess) {
fprintf(stderr,"CUDA Error getDeviceCount: %s\n", cudaGetErrorString(err));
return SPGPU_UNSPECIFIED;
}
return(count);
}
void cudaSync()
{
cudaError_t err;
err = cudaDeviceSynchronize();
if (err == cudaSuccess)
return SPGPU_SUCCESS;
else {
fprintf(stderr,"CUDA Error cudaSync: %s\n", cudaGetErrorString(err));
return SPGPU_UNSPECIFIED;
}
}
void cudaReset()
{
cudaError_t err;
err = cudaDeviceReset();
if (err != cudaSuccess) {
fprintf(stderr,"CUDA Error Reset: %s\n", cudaGetErrorString(err));
return SPGPU_UNSPECIFIED;
}
}
spgpuHandle_t psb_cudaGetHandle()
{
return psb_cuda_handle;
}
void psb_cudaCreateHandle()
{
if (!psb_cuda_handle)
spgpuCreate(&psb_cuda_handle, getDevice());
}
void psb_cudaDestroyHandle()
{
if (!psb_cuda_handle)
spgpuDestroy(psb_cuda_handle);
psb_cuda_handle = NULL;
}
cudaStream_t psb_cudaGetStream()
{
return spgpuGetStream(psb_cuda_handle);
}
void psb_cudaSetStream(cudaStream_t stream)
{
spgpuSetStream(psb_cuda_handle, stream);
return ;
}
cublasHandle_t psb_cudaGetCublasHandle()
{
if (!psb_cublas_handle)
psb_cudaCreateCublasHandle();
return psb_cublas_handle;
}
void psb_cudaCreateCublasHandle()
{ if (!psb_cublas_handle)
cublasCreate(&psb_cublas_handle);
}
void psb_cudaDestroyCublasHandle()
{
if (!psb_cublas_handle)
cublasDestroy(psb_cublas_handle);
psb_cublas_handle=NULL;
}
/* Simple memory tools */
int allocateInt(void **d_int, int n)
{
return allocRemoteBuffer((void **)(d_int), n*sizeof(int));
}
int writeInt(void *d_int, int* h_int, int n)
{
int i,j;
int *di;
i = writeRemoteBuffer((void*)h_int, (void*)d_int, n*sizeof(int));
return i;
}
int readInt(void* d_int, int* h_int, int n)
{ int i;
i = readRemoteBuffer((void *) h_int, (void *) d_int, n*sizeof(int));
//cudaSync();
return(i);
}
int writeIntFirst(int first, void *d_int, int* h_int, int n, int IndexBase)
{
int i,j;
int *di=(int *) d_int;
di = &(di[first-IndexBase]);
i = writeRemoteBuffer((void*)h_int, (void*)di, n*sizeof(int));
return i;
}
int readIntFirst(int first,void* d_int, int* h_int, int n, int IndexBase)
{ int i;
int *di=(int *) d_int;
di = &(di[first-IndexBase]);
i = readRemoteBuffer((void *) h_int, (void *) di, n*sizeof(int));
//cudaSync();
return(i);
}
int allocateMultiInt(void **d_int, int m, int n)
{
return allocRemoteBuffer((void **)(d_int), m*n*sizeof(int));
}
int writeMultiInt(void *d_int, int* h_int, int m, int n)
{
int i,j;
int *di;
i = writeRemoteBuffer((void*)h_int, (void*)d_int, m*n*sizeof(int));
return i;
}
int readMultiInt(void* d_int, int* h_int, int m, int n)
{ int i;
i = readRemoteBuffer((void *) h_int, (void *) d_int, m*n*sizeof(int));
//cudaSync();
return(i);
}
void freeInt(void *d_int)
{
//printf("Before freeInt\n");
freeRemoteBuffer(d_int);
}
int allocateFloat(void **d_float, int n)
{
return allocRemoteBuffer((void **)(d_float), n*sizeof(float));
}
int writeFloat(void *d_float, float* h_float, int n)
{
int i;
i = writeRemoteBuffer((void*)h_float, (void*)d_float, n*sizeof(float));
return i;
}
int readFloat(void* d_float, float* h_float, int n)
{ int i;
i = readRemoteBuffer((void *) h_float, (void *) d_float, n*sizeof(float));
return(i);
}
int writeFloatFirst(int df, void *d_float, float* h_float, int n, int IndexBase)
{
int i;
float *dv=(float *) d_float;
dv = &dv[df-IndexBase];
i = writeRemoteBuffer((void*)h_float, (void*)dv, n*sizeof(float));
return i;
}
int readFloatFirst(int df, void* d_float, float* h_float, int n, int IndexBase)
{ int i;
float *dv=(float *) d_float;
dv = &dv[df-IndexBase];
//fprintf(stderr,"readFloatFirst: %d %p %p %p %d \n",df,d_float,dv,h_float,n);
i = readRemoteBuffer((void *) h_float, (void *) dv, n*sizeof(float));
return(i);
}
int allocateMultiFloat(void **d_float, int m, int n)
{
return allocRemoteBuffer((void **)(d_float), m*n*sizeof(float));
}
int writeMultiFloat(void *d_float, float* h_float, int m, int n)
{
int i,j;
i = writeRemoteBuffer((void*)h_float, (void*)d_float, m*n*sizeof(float));
return i;
}
int readMultiFloat(void* d_float, float* h_float, int m, int n)
{ int i;
i = readRemoteBuffer((void *) h_float, (void *) d_float, m*n*sizeof(float));
//cudaSync();
return(i);
}
void freeFloat(void *d_float)
{
freeRemoteBuffer(d_float);
}
int allocateDouble(void **d_double, int n)
{
return allocRemoteBuffer((void **)(d_double), n*sizeof(double));
}
int writeDouble(void *d_double, double* h_double, int n)
{
int i;
i = writeRemoteBuffer((void*)h_double, (void*)d_double, n*sizeof(double));
return i;
}
int readDouble(void* d_double, double* h_double, int n)
{ int i;
i = readRemoteBuffer((void *) h_double, (void *) d_double, n*sizeof(double));
return(i);
}
int writeDoubleFirst(int df, void *d_double, double* h_double, int n, int IndexBase)
{
int i;
double *dv=(double *) d_double;
dv = &dv[df-IndexBase];
i = writeRemoteBuffer((void*)h_double, (void*)dv, n*sizeof(double));
return i;
}
int readDoubleFirst(int df, void* d_double, double* h_double, int n, int IndexBase)
{ int i;
double *dv=(double *) d_double;
dv = &dv[df-IndexBase];
//fprintf(stderr,"readDoubleFirst: %d %p %p %p %d \n",df,d_double,dv,h_double,n);
i = readRemoteBuffer((void *) h_double, (void *) dv, n*sizeof(double));
return(i);
}
int allocateMultiDouble(void **d_double, int m, int n)
{
return allocRemoteBuffer((void **)(d_double), m*n*sizeof(double));
}
int writeMultiDouble(void *d_double, double* h_double, int m, int n)
{
int i,j;
i = writeRemoteBuffer((void*)h_double, (void*)d_double, m*n*sizeof(double));
return i;
}
int readMultiDouble(void* d_double, double* h_double, int m, int n)
{ int i;
i = readRemoteBuffer((void *) h_double, (void *) d_double, m*n*sizeof(double));
//cudaSync();
return(i);
}
void freeDouble(void *d_double)
{
freeRemoteBuffer(d_double);
}
int allocateFloatComplex(void **d_FloatComplex, int n)
{
return allocRemoteBuffer((void **)(d_FloatComplex), n*sizeof(cuFloatComplex));
}
int writeFloatComplex(void *d_FloatComplex, cuFloatComplex* h_FloatComplex, int n)
{
int i;
i = writeRemoteBuffer((void*)h_FloatComplex, (void*)d_FloatComplex, n*sizeof(cuFloatComplex));
return i;
}
int readFloatComplex(void* d_FloatComplex, cuFloatComplex* h_FloatComplex, int n)
{ int i;
i = readRemoteBuffer((void *) h_FloatComplex, (void *) d_FloatComplex, n*sizeof(cuFloatComplex));
return(i);
}
int allocateMultiFloatComplex(void **d_FloatComplex, int m, int n)
{
return allocRemoteBuffer((void **)(d_FloatComplex), m*n*sizeof(cuFloatComplex));
}
int writeMultiFloatComplex(void *d_FloatComplex, cuFloatComplex* h_FloatComplex, int m, int n)
{
int i,j;
i = writeRemoteBuffer((void*)h_FloatComplex, (void*)d_FloatComplex, m*n*sizeof(cuFloatComplex));
return i;
}
int readMultiFloatComplex(void* d_FloatComplex, cuFloatComplex* h_FloatComplex, int m, int n)
{ int i;
i = readRemoteBuffer((void *) h_FloatComplex, (void *) d_FloatComplex, m*n*sizeof(cuFloatComplex));
//cudaSync();
return(i);
}
int writeFloatComplexFirst(int df, void *d_floatComplex,
cuFloatComplex* h_floatComplex, int n, int IndexBase)
{
int i;
cuFloatComplex *dv=(cuFloatComplex *) d_floatComplex;
dv = &dv[df-IndexBase];
i = writeRemoteBuffer((void*)h_floatComplex, (void*)dv, n*sizeof(cuFloatComplex));
return i;
}
int readFloatComplexFirst(int df, void* d_floatComplex, cuFloatComplex* h_floatComplex,
int n, int IndexBase)
{ int i;
cuFloatComplex *dv=(cuFloatComplex *) d_floatComplex;
dv = &dv[df-IndexBase];
i = readRemoteBuffer((void *) h_floatComplex, (void *) dv, n*sizeof(cuFloatComplex));
return(i);
}
void freeFloatComplex(void *d_FloatComplex)
{
freeRemoteBuffer(d_FloatComplex);
}
int allocateDoubleComplex(void **d_DoubleComplex, int n)
{
return allocRemoteBuffer((void **)(d_DoubleComplex), n*sizeof(cuDoubleComplex));
}
int writeDoubleComplex(void *d_DoubleComplex, cuDoubleComplex* h_DoubleComplex, int n)
{
int i;
i = writeRemoteBuffer((void*)h_DoubleComplex, (void*)d_DoubleComplex, n*sizeof(cuDoubleComplex));
return i;
}
int readDoubleComplex(void* d_DoubleComplex, cuDoubleComplex* h_DoubleComplex, int n)
{ int i;
i = readRemoteBuffer((void *) h_DoubleComplex, (void *) d_DoubleComplex, n*sizeof(cuDoubleComplex));
return(i);
}
int writeDoubleComplexFirst(int df, void *d_doubleComplex,
cuDoubleComplex* h_doubleComplex, int n, int IndexBase)
{
int i;
cuDoubleComplex *dv=(cuDoubleComplex *) d_doubleComplex;
dv = &dv[df-IndexBase];
i = writeRemoteBuffer((void*)h_doubleComplex, (void*)dv, n*sizeof(cuDoubleComplex));
return i;
}
int readDoubleComplexFirst(int df, void* d_doubleComplex, cuDoubleComplex* h_doubleComplex,
int n, int IndexBase)
{ int i;
cuDoubleComplex *dv=(cuDoubleComplex *) d_doubleComplex;
dv = &dv[df-IndexBase];
i = readRemoteBuffer((void *) h_doubleComplex, (void *) dv, n*sizeof(cuDoubleComplex));
return(i);
}
int allocateMultiDoubleComplex(void **d_DoubleComplex, int m, int n)
{
return allocRemoteBuffer((void **)(d_DoubleComplex), m*n*sizeof(cuDoubleComplex));
}
int writeMultiDoubleComplex(void *d_DoubleComplex, cuDoubleComplex* h_DoubleComplex, int m, int n)
{
int i,j;
i = writeRemoteBuffer((void*)h_DoubleComplex, (void*)d_DoubleComplex, m*n*sizeof(cuDoubleComplex));
return i;
}
int readMultiDoubleComplex(void* d_DoubleComplex, cuDoubleComplex* h_DoubleComplex, int m, int n)
{ int i;
i = readRemoteBuffer((void *) h_DoubleComplex, (void *) d_DoubleComplex, m*n*sizeof(cuDoubleComplex));
//cudaSync();
return(i);
}
void freeDoubleComplex(void *d_DoubleComplex)
{
freeRemoteBuffer(d_DoubleComplex);
}
double etime()
{
struct timeval tt;
struct timezone tz;
double temp;
if (gettimeofday(&tt,&tz) != 0) {
fprintf(stderr,"Fatal error for gettimeofday ??? \n");
exit(-1);
}
temp = ((double)tt.tv_sec) + ((double)tt.tv_usec)*1.0e-6;
return(temp);
}