diff --git a/amgprec/amg_base_prec_type.F90 b/amgprec/amg_base_prec_type.F90 index dbe5fe5f..e9eb8cab 100644 --- a/amgprec/amg_base_prec_type.F90 +++ b/amgprec/amg_base_prec_type.F90 @@ -266,15 +266,18 @@ module amg_base_prec_type ! integer(psb_ipk_), parameter :: amg_dec_aggr_ = 0 integer(psb_ipk_), parameter :: amg_sym_dec_aggr_ = 1 - integer(psb_ipk_), parameter :: amg_ext_aggr_ = 2 - integer(psb_ipk_), parameter :: amg_max_par_aggr_alg_ = amg_ext_aggr_ + integer(psb_ipk_), parameter :: amg_ext_aggr_ = 2 + integer(psb_ipk_), parameter :: amg_coupled_aggr_ = 3 + integer(psb_ipk_), parameter :: amg_max_par_aggr_alg_ = amg_coupled_aggr_ ! ! Legal values for entry: amg_aggr_type_ ! integer(psb_ipk_), parameter :: amg_noalg_ = 0 integer(psb_ipk_), parameter :: amg_soc1_ = 1 integer(psb_ipk_), parameter :: amg_soc2_ = 2 + integer(psb_ipk_), parameter :: amg_matchboxp_ = 3 ! + ! Legal values for entry: amg_aggr_prol_ ! integer(psb_ipk_), parameter :: amg_no_smooth_ = 0 @@ -506,7 +509,7 @@ contains val = amg_soc2_ case('SOC1') val = amg_soc1_ - case('DEC') + case('DEC', 'DECOUPLED') val = amg_dec_aggr_ case('SYMDEC') val = amg_sym_dec_aggr_ diff --git a/docs/amg4psblas_1.0-guide.pdf b/docs/amg4psblas_1.0-guide.pdf index 6e6f8b21..090666a9 100644 Binary files a/docs/amg4psblas_1.0-guide.pdf and b/docs/amg4psblas_1.0-guide.pdf differ diff --git a/docs/html/index.html b/docs/html/index.html index 18ae1e4d..f1d24e40 100644 --- a/docs/html/index.html +++ b/docs/html/index.html @@ -112,73 +112,73 @@ href="userhtmlsu7.html#x16-150004.2" id="QQ2-16-21">GPU example
5 User Interface
 5.1 Method init
 5.2 Method set
 5.3 Method hierarchy_build
 5.4 Method smoothers_build
 5.5 Method build
 5.6 Method apply
 5.7 Method free
 5.8 Method descr
 5.9 Auxiliary Methods
6 Adding new smoother and solver objects to AMG4PSBLAS
7 Error Handling
A License
B Contributor Covenant Code of Conduct
References diff --git a/docs/html/userhtml.html b/docs/html/userhtml.html index 18ae1e4d..f1d24e40 100644 --- a/docs/html/userhtml.html +++ b/docs/html/userhtml.html @@ -112,73 +112,73 @@ href="userhtmlsu7.html#x16-150004.2" id="QQ2-16-21">GPU example
5 User Interface
 5.1 Method init
 5.2 Method set
 5.3 Method hierarchy_build
 5.4 Method smoothers_build
 5.5 Method build
 5.6 Method apply
 5.7 Method free
 5.8 Method descr
 5.9 Auxiliary Methods
6 Adding new smoother and solver objects to AMG4PSBLAS
7 Error Handling
A License
B Contributor Covenant Code of Conduct
References diff --git a/docs/html/userhtmlli2.html b/docs/html/userhtmlli2.html index f7e08c84..1d6a77b7 100644 --- a/docs/html/userhtmlli2.html +++ b/docs/html/userhtmlli2.html @@ -137,32 +137,32 @@ class="cmr-12">Auxiliary Methods class="cmr-12">  5.9.1 Method: dump
  5.9.2 Method: clone
  5.9.3 Method: sizeof
  5.9.4 Method: allocate_wrk
  5.9.5 Method: free_wrk
pass it as follows: -
+   
   ! sparse matrix and preconditioner
   type(psb_dspmat_type) :: a
   type(amg_dprec_type)  :: prec
diff --git a/docs/html/userhtmlse8.html b/docs/html/userhtmlse8.html
index b840cd20..028b6cbf 100644
--- a/docs/html/userhtmlse8.html
+++ b/docs/html/userhtmlse8.html
@@ -36,7 +36,7 @@ class="cmr-12">AMG4PSBLAS is freely distributable under the following copyright
                                                                                
 
                                                                                
-   
+   
                            AMG4PSBLAS  version 1.0
               Algebraic MultiGrid Preconditioners Package
              based on PSBLAS (Parallel Sparse BLAS version 3.7)
@@ -78,7 +78,7 @@ class="cmr-12">abide by its terms:
                                                                                
 
                                                                                
-   
+   
                            MLD2P4  version 2.2
   MultiLevel Domain Decomposition Parallel Preconditioners Package
              based on PSBLAS (Parallel Sparse BLAS version 3.5)
@@ -127,7 +127,7 @@ class="cmr-12">here.
                                                                                
 
                                                                                
-   
+   
 // ***********************************************************************
 //
 //        MatchboxP: A C++ library for approximate weighted matching
diff --git a/docs/html/userhtmlsu7.html b/docs/html/userhtmlsu7.html
index 93848cfa..fbd845fb 100644
--- a/docs/html/userhtmlsu7.html
+++ b/docs/html/userhtmlsu7.html
@@ -29,17 +29,21 @@ class="cmr-12">4.2    GPU example
 

The code reported in Figure 5 shows how to set up a program exploiting the combined +class="cmr-12">The code discussed here shows how to set up a program exploiting the combined GPU GPU capabilities of PSBLAS and AMG4PSBLAS. +class="cmr-12">capabilities of PSBLAS and AMG4PSBLAS. The code example is available in the +source distribution directory amg4psblas/tests/gpu. +

First of all, we need to include the appropriate modules and declare some auxiliary +variables: -


@@ -47,24 +51,143 @@ class="cmr-12">GPU capabilities of PSBLAS and AMG4PSBLAS.
-

+

-! build a one-level RAS with overlap 2 and ILU(0) on the local blocks.
-call P%init(’AS’,info)
-call P%set(’SUB_OVR’,2,info)
-call P%build(A,desc_A,info)
-... ...
-! solve Ax=b with preconditioned BiCGSTAB
-  call psb_krylov(’BICGSTAB’,A,P,b,x,tol,desc_A,info)
+program amg_d_pde3d
+  use psb_base_mod
+  use amg_prec_mod
+  use psb_krylov_mod
+  use psb_util_mod
+  use psb_gpu_mod
+  use data_input
+  use amg_d_pde3d_base_mod
+  use amg_d_pde3d_exp_mod
+  use amg_d_pde3d_gauss_mod
+  use amg_d_genpde_mod
+  implicit none
+  .......
+  ! GPU variables
+  type(psb_d_hlg_sparse_mat) :: agmold
+  type(psb_d_vect_gpu)       :: vgmold
+  type(psb_i_vect_gpu)       :: igmold
+ 
 
-

+


Listing 5: setup of a one-level Schwarz preconditioner.
+class="content">setup of a GPU-enabled test program part one.
+

We then have to initialize the GPU environment, and pass the appropriate MOLD +variables to the build methods: + + + +


+ + + +
+

+

+  call psb_init(ctxt)
+  call psb_info(ctxt,iam,np)
+  !
+  ! BEWARE: if you have NGPUS  per node, the default is to
+  ! attach to mod(IAM,NGPUS)
+  !
+  call psb_gpu_init(ctxt)
+  ......
+  t1 = psb_wtime()
+  call prec%smoothers_build(a,desc_a,info, amold=agmold, vmold=vgmold, imold=igmold)
+ 
+
+

+
Listing 6: setup of a GPU-enabled test program part two.
+ + + +

+

Finally, we convert the input matrix, the descriptor and the vectors, then +preallocate the preconditioner workspace before entering the Krylov method. At the +end of the code, we close the GPU environment: + + + +


+ + + +
+

+

+  call desc_a%cnv(mold=igmold)
+  call a%cscnv(info,mold=agmold)
+  call psb_geasb(x,desc_a,info,mold=vgmold)
+  call psb_geasb(b,desc_a,info,mold=vgmold)
+  !
+  ! iterative method parameters
+  !
+  call psb_barrier(ctxt)
+  call prec%allocate_wrk(info)
+  t1 = psb_wtime()
+  call psb_krylov(s_choice%kmethd,a,prec,b,x,s_choice%eps,&
+       & desc_a,info,itmax=s_choice%itmax,iter=iter,err=err,itrace=s_choice%itrace,&
+       & istop=s_choice%istopc,irst=s_choice%irst)
+  call prec%deallocate_wrk(info)
+  call psb_barrier(ctxt)
+  tslv = psb_wtime() - t1
+  ......
+  call psb_gpu_exit()
+  call psb_exit(ctxt)
+  stop
+ 
+
+

+
Listing 7: setup of a GPU-enabled test program part three.
+ + + +

+

It is very important to employ solvers that are suited to the GPU, i.e. solvers that +do NOT employ triangular system solve kernels. Solvers that satisfy this constraint +include: +

    +
  • JACOBI +
  • +
  • INVK +
  • +
  • INVT +
  • +
  • AINV
+

and their L1 variants. diff --git a/docs/src/gettingstarted.tex b/docs/src/gettingstarted.tex index e4322ce0..7e5e0568 100644 --- a/docs/src/gettingstarted.tex +++ b/docs/src/gettingstarted.tex @@ -419,41 +419,182 @@ call P%build(A,desc_A,info) \subsection{GPU example\label{sec:gpu-example}} -The code reported in Figure~\ref{fig:gpu-ex1} shows how to set up a +The code discussed here shows how to set up a program exploiting the combined GPU capabilities of PSBLAS and -AMG4PSBLAS. +AMG4PSBLAS. The code example is available in the source distribution +directory \verb|amg4psblas/tests/gpu|. + +First of all, we need to include the appropriate modules and +declare some auxiliary variables: \begin{listing}[h!] \ifpdf \begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran} -! build a one-level RAS with overlap 2 and ILU(0) on the local blocks. -call P%init('AS',info) -call P%set('SUB_OVR',2,info) -call P%build(A,desc_A,info) -... ... -! solve Ax=b with preconditioned BiCGSTAB - call psb_krylov('BICGSTAB',A,P,b,x,tol,desc_A,info) +program amg_d_pde3d + use psb_base_mod + use amg_prec_mod + use psb_krylov_mod + use psb_util_mod + use psb_gpu_mod + use data_input + use amg_d_pde3d_base_mod + use amg_d_pde3d_exp_mod + use amg_d_pde3d_gauss_mod + use amg_d_genpde_mod + implicit none + ....... + ! GPU variables + type(psb_d_hlg_sparse_mat) :: agmold + type(psb_d_vect_gpu) :: vgmold + type(psb_i_vect_gpu) :: igmold \end{minted} \else \begin{center} \begin{minipage}{.90\textwidth} {\small \begin{verbatim} -! build a one-level RAS with overlap 2 and ILU(0) on the local blocks. -call P%init('AS',info) -call P%set('SUB_OVR',2,info) -call P%build(A,desc_A,info) -... ... -! 
solve Ax=b with preconditioned BiCGSTAB - call psb_krylov('BICGSTAB',A,P,b,x,tol,desc_A,info) -\end{verbatim} +program amg_d_pde3d + use psb_base_mod + use amg_prec_mod + use psb_krylov_mod + use psb_util_mod + use psb_gpu_mod + use data_input + use amg_d_pde3d_base_mod + use amg_d_pde3d_exp_mod + use amg_d_pde3d_gauss_mod + use amg_d_genpde_mod + implicit none + ....... + ! GPU variables + type(psb_d_hlg_sparse_mat) :: agmold + type(psb_d_vect_gpu) :: vgmold + type(psb_i_vect_gpu) :: igmold + + \end{verbatim} +} +\end{minipage} +\end{center} +\fi +\caption{setup of a GPU-enabled test program part one.\label{fig:gpu-ex1}} +\end{listing} + +We then have to initialize the GPU environment, and pass the +appropriate MOLD variables to the build methods: +\begin{listing}[h!] +\ifpdf +\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran} + call psb_init(ctxt) + call psb_info(ctxt,iam,np) + ! + ! BEWARE: if you have NGPUS per node, the default is to + ! attach to mod(IAM,NGPUS) + ! + call psb_gpu_init(ctxt) + ...... + t1 = psb_wtime() + call prec%smoothers_build(a,desc_a,info, amold=agmold, vmold=vgmold, imold=igmold) + +\end{minted} +\else +\begin{center} +\begin{minipage}{.90\textwidth} +{\small +\begin{verbatim} + call psb_init(ctxt) + call psb_info(ctxt,iam,np) + ! + ! BEWARE: if you have NGPUS per node, the default is to + ! attach to mod(IAM,NGPUS) + ! + call psb_gpu_init(ctxt) + ...... + t1 = psb_wtime() + call prec%smoothers_build(a,desc_a,info, amold=agmold, vmold=vgmold, imold=igmold) + + \end{verbatim} } \end{minipage} \end{center} \fi -\caption{setup of a one-level Schwarz preconditioner.\label{fig:gpu-ex1}} +\caption{setup of a GPU-enabled test program part two.\label{fig:gpu-ex2}} \end{listing} +Finally, we convert the input matrix, the descriptor and the vectors, +then preallocate the preconditioner workspace before entering the +Krylov method. At the end of the code, we close the GPU environment: +\begin{listing}[h!] 
+\ifpdf +\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran} + call desc_a%cnv(mold=igmold) + call a%cscnv(info,mold=agmold) + call psb_geasb(x,desc_a,info,mold=vgmold) + call psb_geasb(b,desc_a,info,mold=vgmold) + + ! + ! iterative method parameters + ! + call psb_barrier(ctxt) + call prec%allocate_wrk(info) + t1 = psb_wtime() + call psb_krylov(s_choice%kmethd,a,prec,b,x,s_choice%eps,& + & desc_a,info,itmax=s_choice%itmax,iter=iter,err=err,itrace=s_choice%itrace,& + & istop=s_choice%istopc,irst=s_choice%irst) + call prec%deallocate_wrk(info) + call psb_barrier(ctxt) + tslv = psb_wtime() - t1 + ...... + call psb_gpu_exit() + call psb_exit(ctxt) + stop + + +\end{minted} +\else +\begin{center} +\begin{minipage}{.90\textwidth} +{\small +\begin{verbatim} + call desc_a%cnv(mold=igmold) + call a%cscnv(info,mold=agmold) + call psb_geasb(x,desc_a,info,mold=vgmold) + call psb_geasb(b,desc_a,info,mold=vgmold) + + ! + ! iterative method parameters + ! + call psb_barrier(ctxt) + call prec%allocate_wrk(info) + t1 = psb_wtime() + call psb_krylov(s_choice%kmethd,a,prec,b,x,s_choice%eps,& + & desc_a,info,itmax=s_choice%itmax,iter=iter,err=err,itrace=s_choice%itrace,& + & istop=s_choice%istopc,irst=s_choice%irst) + call prec%deallocate_wrk(info) + call psb_barrier(ctxt) + tslv = psb_wtime() - t1 + + ...... + call psb_gpu_exit() + call psb_exit(ctxt) + stop + + \end{verbatim} +} +\end{minipage} +\end{center} +\fi +\caption{setup of a GPU-enabled test program part three.\label{fig:gpu-ex3}} +\end{listing} +It is very important to employ solvers that are suited +to the GPU, i.e. solvers that do NOT employ triangular +system solve kernels. Solvers that satisfy this constraint include: +\begin{itemize} +\item \verb|JACOBI| +\item \verb|INVK| +\item \verb|INVT| +\item \verb|AINV| +\end{itemize} +and their \verb|L1| variants. %%% Local Variables: %%% mode: latex