From 655c86caeda10756c110adf8863280bfab85da63 Mon Sep 17 00:00:00 2001
From: sfilippone <filippone.salvatore@gmail.com>
Date: Wed, 29 Nov 2023 10:20:38 +0100
Subject: [PATCH] Updated docs.

---
 README.md                   | 16 ++++++++++++++-
 cuda/License-spgpu.md       | 21 ++++++++++++++++++++
 test/cudakern/dpdegenmv.F90 |  4 ++--
 test/cudakern/spdegenmv.F90 | 39 ++++++++++++++++++++-----------------
 4 files changed, 59 insertions(+), 21 deletions(-)
 create mode 100644 cuda/License-spgpu.md
diff --git a/README.md b/README.md
index a9813f5e..afab1646 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-PSBLAS library, version 3.8
+PSBLAS library, version 3.9
 ===========================
 
 The architecture of the Fortran 2003 sparse BLAS is described in:
@@ -40,6 +40,15 @@ The main reference for the serial sparse BLAS is:
 >linear algebra subprograms for sparse matrices: a user level interface,
 >ACM Trans. Math. Softw., 23(3), 379-401, 1997.
 
+CUDA and GPU support
+--------------------
+This version of PSBLAS incorporates into a single package three
+entities that were previously separate:
+1. PSBLAS     -- the base library
+2. PSBLAS-EXT -- a library providing additional storage formats
+3. SPGPU      -- a package of kernels for NVIDIA GPUs originally
+                 written by Davide Barbieri and Salvatore Filippone;
+                 see the license file cuda/License-spgpu.md
 
 INSTALLING
 ----------
@@ -61,6 +70,11 @@ prerequisites (see also SERIAL below):
    specify `--with-amd` (see `./configure --help` for more details).
    We use the C interface to AMD.
 
+5. If you have CUDA available, use
+   `--with-cuda=<path>` to specify the CUDA toolkit location, and
+   `--with-cudacc=XX,YY,ZZ` to specify a list of target CCs (compute
+   capabilities) to compile the CUDA code for.
+
 The configure script will generate a Make.inc file suitable for building
 the library. The script is capable of recognizing the needed libraries
 with their default names; if they are in unusual places consider adding
diff --git a/cuda/License-spgpu.md b/cuda/License-spgpu.md
new file mode 100644
index 00000000..7f4b8ff4
--- /dev/null
+++ b/cuda/License-spgpu.md
@@ -0,0 +1,21 @@
+(c) Copyright 2011-2021 Davide Barbieri, Salvatore Filippone
+ 
+  Redistribution and use in source and binary forms, with or without modification, 
+  are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, 
+  this list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice, 
+  this list of conditions and the following disclaimer in the documentation and/or 
+  other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY 
+  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 
+  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 
+  SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
+  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
+  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/test/cudakern/dpdegenmv.F90 b/test/cudakern/dpdegenmv.F90
index d2cc2172..bde57f5f 100644
--- a/test/cudakern/dpdegenmv.F90
+++ b/test/cudakern/dpdegenmv.F90
@@ -631,7 +631,7 @@ program pdgenmv
     stop
   endif
   if(psb_get_errstatus() /= 0) goto 9999
-  name='pdegenmv-gpu'
+  name='pdegenmv-cuda'
   !
   ! Hello world
   !
@@ -974,7 +974,7 @@ contains
     if (iam == 0) then
       write(*,*) 'CPU side format?'
       read(psb_inp_unit,*) acfmt
-      write(*,*) 'GPU side format?'
+      write(*,*) 'CUDA side format?'
       read(psb_inp_unit,*) agfmt
       write(*,*) 'Size of discretization cube?'
       read(psb_inp_unit,*) idim
diff --git a/test/cudakern/spdegenmv.F90 b/test/cudakern/spdegenmv.F90
index 1c7d646f..9644d8c7 100644
--- a/test/cudakern/spdegenmv.F90
+++ b/test/cudakern/spdegenmv.F90
@@ -548,7 +548,7 @@ program pdgenmv
   use psb_util_mod 
   use psb_ext_mod
 #ifdef HAVE_GPU
-  use psb_gpu_mod
+  use psb_cuda_mod
 #endif
   use psb_s_pde3d_mod
   implicit none
@@ -570,8 +570,8 @@ program pdgenmv
   ! dense matrices
   type(psb_s_vect_type), target :: xv,bv, xg, bg 
 #ifdef HAVE_GPU
-  type(psb_s_vect_gpu)  :: vmold
-  type(psb_i_vect_gpu)  :: imold
+  type(psb_s_vect_cuda)  :: vmold
+  type(psb_i_vect_cuda)  :: imold 
 #endif
   real(psb_spk_), allocatable :: x1(:), x2(:), x0(:)
   ! blacs parameters
@@ -589,14 +589,14 @@ program pdgenmv
   type(psb_s_dia_sparse_mat), target   :: adia
   type(psb_s_hdia_sparse_mat), target   :: ahdia
 #ifdef HAVE_GPU
-  type(psb_s_elg_sparse_mat), target   :: aelg
-  type(psb_s_csrg_sparse_mat), target  :: acsrg
+  type(psb_s_cuda_elg_sparse_mat), target   :: aelg
+  type(psb_s_cuda_csrg_sparse_mat), target  :: acsrg
 #if CUDA_SHORT_VERSION <= 10
-  type(psb_s_hybg_sparse_mat), target  :: ahybg
+  type(psb_s_cuda_hybg_sparse_mat), target  :: ahybg
 #endif
-  type(psb_s_hlg_sparse_mat), target   :: ahlg
-  type(psb_s_dnsg_sparse_mat), target   :: adnsg
-  type(psb_s_hdiag_sparse_mat), target   :: ahdiag
+  type(psb_s_cuda_hlg_sparse_mat), target   :: ahlg
+  type(psb_s_cuda_hdiag_sparse_mat), target   :: ahdiag
+  type(psb_s_cuda_dnsg_sparse_mat), target   :: adnsg
 #endif
   class(psb_s_base_sparse_mat), pointer :: agmold, acmold
   ! other variables
@@ -613,7 +613,10 @@ program pdgenmv
   call psb_info(ctxt,iam,np)
 
 #ifdef HAVE_GPU
-  call psb_gpu_init(ctxt)
+  call psb_cuda_init(ctxt)
+#endif
+#ifdef HAVE_RSB
+  call psb_rsb_init()
 #endif
 
   if (iam < 0) then 
@@ -622,7 +625,7 @@ program pdgenmv
     stop
   endif
   if(psb_get_errstatus() /= 0) goto 9999
-  name='pdegenmv-gpu'
+  name='pdegenmv-cuda'
   !
   ! Hello world
   !
@@ -632,7 +635,7 @@ program pdgenmv
   end if
 #ifdef HAVE_GPU
   write(*,*) 'Process ',iam,' running on device: ', psb_cuda_getDevice(),' out of', psb_cuda_getDeviceCount()
-  write(*,*) 'Process ',iam,' device ', psb_cuda_getDevice(),' is a: ', trim(psb_gpu_DeviceName())  
+  write(*,*) 'Process ',iam,' device ', psb_cuda_getDevice(),' is a: ', trim(psb_cuda_DeviceName())  
 #endif
   !
   !  get parameters
@@ -752,7 +755,7 @@ program pdgenmv
     call psb_barrier(ctxt)
     t1 = psb_wtime()
     call agpu%cscnv(info,mold=agmold)
-    call psb_gpu_DeviceSync()
+    call psb_cuda_DeviceSync()
     t2 = psb_Wtime() -t1
     call psb_amx(ctxt,t2)
     if (j==1) tcnvg1 = t2
@@ -789,7 +792,7 @@ program pdgenmv
     end if
 
   end do
-  call psb_gpu_DeviceSync()
+  call psb_cuda_DeviceSync()
   call psb_barrier(ctxt)
   tt2 = psb_wtime() - tt1
   call psb_amx(ctxt,tt2)
@@ -817,7 +820,7 @@ program pdgenmv
     end if
     
   end do
-  call psb_gpu_DeviceSync()
+  call psb_cuda_DeviceSync()
   call psb_barrier(ctxt)
   gt2 = psb_wtime() - gt1
   call psb_amx(ctxt,gt2)
@@ -919,7 +922,7 @@ program pdgenmv
 #ifdef HAVE_GPU
     bdwdth = ngpu*ntests*nbytes/(gt2*1.d6)
     write(psb_out_unit,'("MBYTES/S sust. effective bandwidth  (GPU)  : ",F20.3)') bdwdth
-    bdwdth = psb_gpu_MemoryPeakBandwidth()
+    bdwdth = psb_cuda_MemoryPeakBandwidth()
     write(psb_out_unit,'("MBYTES/S peak bandwidth             (GPU)  : ",F20.3)') bdwdth
 #endif
     write(psb_out_unit,'("Storage type for DESC_A: ",a)') desc_a%indxmap%get_fmt()
@@ -941,7 +944,7 @@ program pdgenmv
     goto 9999
   end if
 #ifdef HAVE_GPU
-  call psb_gpu_exit()
+  call psb_cuda_exit()
 #endif
   call psb_exit(ctxt)
   stop
@@ -965,7 +968,7 @@ contains
     if (iam == 0) then
       write(*,*) 'CPU side format?'
       read(psb_inp_unit,*) acfmt
-      write(*,*) 'GPU side format?'
+      write(*,*) 'CUDA side format?'
       read(psb_inp_unit,*) agfmt
       write(*,*) 'Size of discretization cube?'
       read(psb_inp_unit,*) idim