/*
 * spGPU - Sparse matrices on GPU library.
 *
 * Copyright (C) 2010 - 2014
 *     Salvatore Filippone - University of Rome Tor Vergata
 *     Davide Barbieri - University of Rome Tor Vergata
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 3 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

#define PRE_CONCAT(A, B) A ## B
#define CONCAT(A, B) PRE_CONCAT(A, B)

#undef GEN_SPGPU_FUNC_NAME
#define GEN_SPGPU_FUNC_NAME(x) CONCAT(CONCAT(spgpu,x),axy)
#define GEN_SPGPU_MFUNC_NAME(x) CONCAT(CONCAT(spgpu,x),maxy)
#define GEN_SPGPU_FUNC_NAME_2(x) CONCAT(CONCAT(spgpu,x),axypbz)
#define GEN_SPGPU_MFUNC_NAME_2(x) CONCAT(CONCAT(spgpu,x),maxypbz)
#define GEN_SPGPU_SCAL_NAME(x) CONCAT(CONCAT(spgpu,x),scal)

#define BLOCK_SIZE 256

// The includer of this template must define:
//#define VALUE_TYPE
//#define TYPE_SYMBOL

#include "mathbase.cuh"

// Element-wise product: z[i] = alpha * x[i] * y[i]
__global__ void CONCAT(GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL),_kern)
	(VALUE_TYPE *z, int n, VALUE_TYPE alpha, VALUE_TYPE* x, VALUE_TYPE* y)
{
	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;

	if (id < n)
	{
		// Since z, x and y are accessed with the same offset by the same thread,
		// and the write to z follows the x and y reads, x, y and z can share the
		// same base address (in-place computing).
		z[id] = CONCAT(VALUE_TYPE, _mul)(alpha, CONCAT(VALUE_TYPE, _mul)(x[id], y[id]));
	}
}

void CONCAT(GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL),_)
	(spgpuHandle_t handle, VALUE_TYPE *z, int n, VALUE_TYPE alpha, VALUE_TYPE* x, VALUE_TYPE* y)
{
	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;

	dim3 block(BLOCK_SIZE);
	dim3 grid(msize);

	CONCAT(GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL),_kern)
		<<<grid, block, 0, handle->currentStream>>>(z, n, alpha, x, y);
}

void GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL)
	(spgpuHandle_t handle,
	__device VALUE_TYPE *z,
	int n,
	VALUE_TYPE alpha,
	__device VALUE_TYPE *x,
	__device VALUE_TYPE *y)
{
	int maxNForACall = max(handle->maxGridSizeX, BLOCK_SIZE*handle->maxGridSizeX);

	while (n > maxNForACall) //managing large vectors
	{
		CONCAT(GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL),_) (handle, z, maxNForACall, alpha, x, y);

		x = x + maxNForACall;
		y = y + maxNForACall;
		z = z + maxNForACall;
		n -= maxNForACall;
	}

	CONCAT(GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL),_) (handle, z, n, alpha, x, y);

	cudaCheckError("CUDA error on axy");
}

// Multi-vector variant: apply axy to 'count' vectors laid out 'pitch' elements apart.
void GEN_SPGPU_MFUNC_NAME(TYPE_SYMBOL)
	(spgpuHandle_t handle,
	__device VALUE_TYPE *z,
	int n,
	VALUE_TYPE alpha,
	__device VALUE_TYPE* x,
	__device VALUE_TYPE *y,
	int count,
	int pitch)
{
	for (int i=0; i < count; i++)
	{
		GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL) (handle, z, n, alpha, x, y);

		x += pitch;
		y += pitch;
		z += pitch;
	}
}
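/* A minimal instantiation sketch (illustrative only: the driver file name and
 * the name of this template file are assumptions, not necessarily the library's
 * actual layout). This template is compiled once per numeric type, with
 * VALUE_TYPE and TYPE_SYMBOL defined by the includer; a float driver would
 * generate spgpuSaxy, spgpuSmaxy, spgpuSaxypbz and spgpuSmaxypbz.
 */
#if 0
/* hypothetical driver, e.g. saxy.cu */
#define VALUE_TYPE float
#define TYPE_SYMBOL S
#include "axy_base.cuh"   /* hypothetical name for this template file */
#endif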
// w[i] = beta * z[i] + alpha * x[i] * y[i]
// (kernel body reconstructed; assumes the VALUE_TYPE_fma(a, b, c) = a*b + c
// helper from mathbase.cuh)
__global__ void CONCAT(GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL),_kern)
	(VALUE_TYPE *w, int n, VALUE_TYPE beta, VALUE_TYPE* z, VALUE_TYPE alpha, VALUE_TYPE* x, VALUE_TYPE* y)
{
	int id = threadIdx.x + BLOCK_SIZE*blockIdx.x;

	if (id < n)
	{
		// As above, the write to w follows all reads at the same offset,
		// so w may alias z, x or y (in-place computing).
		w[id] = CONCAT(VALUE_TYPE, _fma)(beta, z[id],
			CONCAT(VALUE_TYPE, _mul)(alpha, CONCAT(VALUE_TYPE, _mul)(x[id], y[id])));
	}
}

void CONCAT(GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL),_)
	(spgpuHandle_t handle, VALUE_TYPE *w, int n, VALUE_TYPE beta, VALUE_TYPE* z, VALUE_TYPE alpha, VALUE_TYPE* x, VALUE_TYPE* y)
{
	int msize = (n+BLOCK_SIZE-1)/BLOCK_SIZE;

	dim3 block(BLOCK_SIZE);
	dim3 grid(msize);

	CONCAT(GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL),_kern)
		<<<grid, block, 0, handle->currentStream>>>(w, n, beta, z, alpha, x, y);
}

void GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL)
	(spgpuHandle_t handle,
	__device VALUE_TYPE *w,
	int n,
	VALUE_TYPE beta,
	__device VALUE_TYPE *z,
	VALUE_TYPE alpha,
	__device VALUE_TYPE* x,
	__device VALUE_TYPE *y)
{
	if (CONCAT(VALUE_TYPE, _isZero(alpha)))
	{
		// alpha == 0: reduces to w = beta*z
		GEN_SPGPU_SCAL_NAME(TYPE_SYMBOL) (handle, w, n, beta, z);
	}
	else if (CONCAT(VALUE_TYPE, _isZero(beta)))
	{
		// beta == 0: reduces to w = alpha*x*y
		GEN_SPGPU_FUNC_NAME(TYPE_SYMBOL) (handle, w, n, alpha, x, y);
	}
	else
	{
		int maxNForACall = max(handle->maxGridSizeX, BLOCK_SIZE*handle->maxGridSizeX);

		while (n > maxNForACall) //managing large vectors
		{
			CONCAT(GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL),_) (handle, w, maxNForACall, beta, z, alpha, x, y);

			x = x + maxNForACall;
			y = y + maxNForACall;
			z = z + maxNForACall;
			w = w + maxNForACall;
			n -= maxNForACall;
		}

		CONCAT(GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL),_) (handle, w, n, beta, z, alpha, x, y);
	}

	cudaCheckError("CUDA error on axypbz");
}

// Multi-vector variant: apply axypbz to 'count' vectors laid out 'pitch' elements apart.
void GEN_SPGPU_MFUNC_NAME_2(TYPE_SYMBOL)
	(spgpuHandle_t handle,
	__device VALUE_TYPE *w,
	int n,
	VALUE_TYPE beta,
	__device VALUE_TYPE *z,
	VALUE_TYPE alpha,
	__device VALUE_TYPE* x,
	__device VALUE_TYPE *y,
	int count,
	int pitch)
{
	for (int i=0; i < count; i++)
	{
		GEN_SPGPU_FUNC_NAME_2(TYPE_SYMBOL) (handle, w, n, beta, z, alpha, x, y);

		x += pitch;
		y += pitch;
		z += pitch;
		w += pitch;
	}
}
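/* A minimal host-side usage sketch for the float instantiation, kept inert
 * with #if 0 since this translation unit is type-generic. The header names
 * and the spgpuCreate/spgpuDestroy calls are assumptions based on the
 * library's core API; error handling is omitted for brevity.
 */
#if 0
#include "core.h"     /* assumed: spgpuHandle_t, spgpuCreate, spgpuDestroy */
#include "vector.h"   /* assumed: declaration of spgpuSaxy */

void axy_example(void)
{
	const int n = 1 << 20;
	float *x, *y, *z;

	spgpuHandle_t handle;
	spgpuCreate(&handle, 0);                  /* attach to CUDA device 0 */

	cudaMalloc((void**)&x, n*sizeof(float));
	cudaMalloc((void**)&y, n*sizeof(float));
	cudaMalloc((void**)&z, n*sizeof(float));
	/* ... initialize x and y on the device ... */

	/* z[i] = 2.0f * x[i] * y[i], element-wise */
	spgpuSaxy(handle, z, n, 2.0f, x, y);

	cudaFree(x); cudaFree(y); cudaFree(z);
	spgpuDestroy(handle);
}
#endif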