New GPU comments and sample program in docs.

mergeparmatch
Salvatore Filippone 4 years ago
parent 97237e709b
commit 47acde313f

@ -267,14 +267,17 @@ module amg_base_prec_type
integer(psb_ipk_), parameter :: amg_dec_aggr_ = 0
integer(psb_ipk_), parameter :: amg_sym_dec_aggr_ = 1
integer(psb_ipk_), parameter :: amg_ext_aggr_ = 2
integer(psb_ipk_), parameter :: amg_max_par_aggr_alg_ = amg_ext_aggr_
integer(psb_ipk_), parameter :: amg_coupled_aggr_ = 3
integer(psb_ipk_), parameter :: amg_max_par_aggr_alg_ = amg_coupled_aggr_
!
! Legal values for entry: amg_aggr_type_
!
integer(psb_ipk_), parameter :: amg_noalg_ = 0
integer(psb_ipk_), parameter :: amg_soc1_ = 1
integer(psb_ipk_), parameter :: amg_soc2_ = 2
integer(psb_ipk_), parameter :: amg_matchboxp_ = 3
!
! Legal values for entry: amg_aggr_prol_
!
integer(psb_ipk_), parameter :: amg_no_smooth_ = 0
@ -506,7 +509,7 @@ contains
val = amg_soc2_
case('SOC1')
val = amg_soc1_
case('DEC')
case('DEC', 'DECOUPLED')
val = amg_dec_aggr_
case('SYMDEC')
val = amg_sym_dec_aggr_

Binary file not shown.

@ -112,73 +112,73 @@ href="userhtmlsu7.html#x16-150004.2" id="QQ2-16-21"><span
class="cmr-12">GPU example</span></a></span>
<br /> <span class="sectionToc" ><span
class="cmr-12">5 </span><a
href="userhtmlse5.html#x17-160005" id="QQ2-17-23"><span
href="userhtmlse5.html#x17-160005" id="QQ2-17-25"><span
class="cmr-12">User Interface</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.1 </span><a
href="userhtmlsu8.html#x18-170005.1" id="QQ2-18-24"><span
href="userhtmlsu8.html#x18-170005.1" id="QQ2-18-26"><span
class="cmr-12">Method init</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.2 </span><a
href="userhtmlsu9.html#x19-180005.2" id="QQ2-19-25"><span
href="userhtmlsu9.html#x19-180005.2" id="QQ2-19-27"><span
class="cmr-12">Method set</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.3 </span><a
href="userhtmlsu10.html#x20-190005.3" id="QQ2-20-33"><span
href="userhtmlsu10.html#x20-190005.3" id="QQ2-20-35"><span
class="cmr-12">Method hierarchy</span><span
class="cmr-12">_build</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.4 </span><a
href="userhtmlsu11.html#x21-200005.4" id="QQ2-21-34"><span
href="userhtmlsu11.html#x21-200005.4" id="QQ2-21-36"><span
class="cmr-12">Method smoothers</span><span
class="cmr-12">_build</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.5 </span><a
href="userhtmlsu12.html#x22-210005.5" id="QQ2-22-35"><span
href="userhtmlsu12.html#x22-210005.5" id="QQ2-22-37"><span
class="cmr-12">Method build</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.6 </span><a
href="userhtmlsu13.html#x23-220005.6" id="QQ2-23-36"><span
href="userhtmlsu13.html#x23-220005.6" id="QQ2-23-38"><span
class="cmr-12">Method apply</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.7 </span><a
href="userhtmlsu14.html#x24-230005.7" id="QQ2-24-37"><span
href="userhtmlsu14.html#x24-230005.7" id="QQ2-24-39"><span
class="cmr-12">Method free</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.8 </span><a
href="userhtmlsu15.html#x25-240005.8" id="QQ2-25-38"><span
href="userhtmlsu15.html#x25-240005.8" id="QQ2-25-40"><span
class="cmr-12">Method descr</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.9 </span><a
href="userhtmlsu16.html#x26-250005.9" id="QQ2-26-39"><span
href="userhtmlsu16.html#x26-250005.9" id="QQ2-26-41"><span
class="cmr-12">Auxiliary Methods</span></a></span>
<br /> <span class="sectionToc" ><span
class="cmr-12">6 </span><a
href="userhtmlse6.html#x27-310006" id="QQ2-27-45"><span
href="userhtmlse6.html#x27-310006" id="QQ2-27-47"><span
class="cmr-12">Adding new smoother and solver objects to AMG4PSBLAS</span></a></span>
<br /> <span class="sectionToc" ><span
class="cmr-12">7 </span><a
href="userhtmlse7.html#x28-320007" id="QQ2-28-46"><span
href="userhtmlse7.html#x28-320007" id="QQ2-28-48"><span
class="cmr-12">Error Handling</span></a></span>
<br /> <span class="sectionToc" ><span
class="cmr-12">A </span><a
href="userhtmlse8.html#x29-33000A" id="QQ2-29-47"><span
href="userhtmlse8.html#x29-33000A" id="QQ2-29-49"><span
class="cmr-12">License</span></a></span>
<br /> <span class="sectionToc" ><span
class="cmr-12">B </span><a
href="userhtmlse9.html#x30-34000B" id="QQ2-30-48"><span
href="userhtmlse9.html#x30-34000B" id="QQ2-30-50"><span
class="cmr-12">Contributor Covenant Code of Conduct</span></a></span>
<br /> <span class="likesectionToc" ><a
href="userhtmlli5.html#x31-39000B" id="QQ2-31-53"><span
href="userhtmlli5.html#x31-39000B" id="QQ2-31-55"><span
class="cmr-12">References</span></a></span>
</div>

@ -112,73 +112,73 @@ href="userhtmlsu7.html#x16-150004.2" id="QQ2-16-21"><span
class="cmr-12">GPU example</span></a></span>
<br /> <span class="sectionToc" ><span
class="cmr-12">5 </span><a
href="userhtmlse5.html#x17-160005" id="QQ2-17-23"><span
href="userhtmlse5.html#x17-160005" id="QQ2-17-25"><span
class="cmr-12">User Interface</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.1 </span><a
href="userhtmlsu8.html#x18-170005.1" id="QQ2-18-24"><span
href="userhtmlsu8.html#x18-170005.1" id="QQ2-18-26"><span
class="cmr-12">Method init</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.2 </span><a
href="userhtmlsu9.html#x19-180005.2" id="QQ2-19-25"><span
href="userhtmlsu9.html#x19-180005.2" id="QQ2-19-27"><span
class="cmr-12">Method set</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.3 </span><a
href="userhtmlsu10.html#x20-190005.3" id="QQ2-20-33"><span
href="userhtmlsu10.html#x20-190005.3" id="QQ2-20-35"><span
class="cmr-12">Method hierarchy</span><span
class="cmr-12">_build</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.4 </span><a
href="userhtmlsu11.html#x21-200005.4" id="QQ2-21-34"><span
href="userhtmlsu11.html#x21-200005.4" id="QQ2-21-36"><span
class="cmr-12">Method smoothers</span><span
class="cmr-12">_build</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.5 </span><a
href="userhtmlsu12.html#x22-210005.5" id="QQ2-22-35"><span
href="userhtmlsu12.html#x22-210005.5" id="QQ2-22-37"><span
class="cmr-12">Method build</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.6 </span><a
href="userhtmlsu13.html#x23-220005.6" id="QQ2-23-36"><span
href="userhtmlsu13.html#x23-220005.6" id="QQ2-23-38"><span
class="cmr-12">Method apply</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.7 </span><a
href="userhtmlsu14.html#x24-230005.7" id="QQ2-24-37"><span
href="userhtmlsu14.html#x24-230005.7" id="QQ2-24-39"><span
class="cmr-12">Method free</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.8 </span><a
href="userhtmlsu15.html#x25-240005.8" id="QQ2-25-38"><span
href="userhtmlsu15.html#x25-240005.8" id="QQ2-25-40"><span
class="cmr-12">Method descr</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span class="subsectionToc" ><span
class="cmr-12">5.9 </span><a
href="userhtmlsu16.html#x26-250005.9" id="QQ2-26-39"><span
href="userhtmlsu16.html#x26-250005.9" id="QQ2-26-41"><span
class="cmr-12">Auxiliary Methods</span></a></span>
<br /> <span class="sectionToc" ><span
class="cmr-12">6 </span><a
href="userhtmlse6.html#x27-310006" id="QQ2-27-45"><span
href="userhtmlse6.html#x27-310006" id="QQ2-27-47"><span
class="cmr-12">Adding new smoother and solver objects to AMG4PSBLAS</span></a></span>
<br /> <span class="sectionToc" ><span
class="cmr-12">7 </span><a
href="userhtmlse7.html#x28-320007" id="QQ2-28-46"><span
href="userhtmlse7.html#x28-320007" id="QQ2-28-48"><span
class="cmr-12">Error Handling</span></a></span>
<br /> <span class="sectionToc" ><span
class="cmr-12">A </span><a
href="userhtmlse8.html#x29-33000A" id="QQ2-29-47"><span
href="userhtmlse8.html#x29-33000A" id="QQ2-29-49"><span
class="cmr-12">License</span></a></span>
<br /> <span class="sectionToc" ><span
class="cmr-12">B </span><a
href="userhtmlse9.html#x30-34000B" id="QQ2-30-48"><span
href="userhtmlse9.html#x30-34000B" id="QQ2-30-50"><span
class="cmr-12">Contributor Covenant Code of Conduct</span></a></span>
<br /> <span class="likesectionToc" ><a
href="userhtmlli5.html#x31-39000B" id="QQ2-31-53"><span
href="userhtmlli5.html#x31-39000B" id="QQ2-31-55"><span
class="cmr-12">References</span></a></span>
</div>

@ -137,32 +137,32 @@ class="cmr-12">Auxiliary Methods</span></a></span>
class="cmr-12">&#x00A0;</span><span
class="cmr-12">&#x00A0;</span><span class="subsubsectionToc" ><span
class="cmr-12">5.9.1 </span><a
href="userhtmlsu16.html#x26-260005.9.1" id="QQ2-26-40"><span
href="userhtmlsu16.html#x26-260005.9.1" id="QQ2-26-42"><span
class="cmr-12">Method: dump</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span
class="cmr-12">&#x00A0;</span><span class="subsubsectionToc" ><span
class="cmr-12">5.9.2 </span><a
href="userhtmlsu16.html#x26-270005.9.2" id="QQ2-26-41"><span
href="userhtmlsu16.html#x26-270005.9.2" id="QQ2-26-43"><span
class="cmr-12">Method: clone</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span
class="cmr-12">&#x00A0;</span><span class="subsubsectionToc" ><span
class="cmr-12">5.9.3 </span><a
href="userhtmlsu16.html#x26-280005.9.3" id="QQ2-26-42"><span
href="userhtmlsu16.html#x26-280005.9.3" id="QQ2-26-44"><span
class="cmr-12">Method: sizeof</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span
class="cmr-12">&#x00A0;</span><span class="subsubsectionToc" ><span
class="cmr-12">5.9.4 </span><a
href="userhtmlsu16.html#x26-290005.9.4" id="QQ2-26-43"><span
href="userhtmlsu16.html#x26-290005.9.4" id="QQ2-26-45"><span
class="cmr-12">Method: allocate</span><span
class="cmr-12">_wrk</span></a></span>
<br /> <span
class="cmr-12">&#x00A0;</span><span
class="cmr-12">&#x00A0;</span><span class="subsubsectionToc" ><span
class="cmr-12">5.9.5 </span><a
href="userhtmlsu16.html#x26-300005.9.5" id="QQ2-26-44"><span
href="userhtmlsu16.html#x26-300005.9.5" id="QQ2-26-46"><span
class="cmr-12">Method: free</span><span
class="cmr-12">_wrk</span></a></span>
<br /> <span class="sectionToc" ><span

@ -215,7 +215,7 @@ class="cmr-12">pass it as follows:</span>
<pre class="verbatim" id="verbatim-11">
<pre class="verbatim" id="verbatim-13">
&#x00A0;&#x00A0;!&#x00A0;sparse&#x00A0;matrix&#x00A0;and&#x00A0;preconditioner
&#x00A0;&#x00A0;type(psb_dspmat_type)&#x00A0;::&#x00A0;a
&#x00A0;&#x00A0;type(amg_dprec_type)&#x00A0;&#x00A0;::&#x00A0;prec

@ -36,7 +36,7 @@ class="cmr-12">AMG4PSBLAS is freely distributable under the following copyright
<pre class="verbatim" id="verbatim-12">
<pre class="verbatim" id="verbatim-14">
&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;AMG4PSBLAS&#x00A0;&#x00A0;version&#x00A0;1.0
&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;Algebraic&#x00A0;MultiGrid&#x00A0;Preconditioners&#x00A0;Package
&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;based&#x00A0;on&#x00A0;PSBLAS&#x00A0;(Parallel&#x00A0;Sparse&#x00A0;BLAS&#x00A0;version&#x00A0;3.7)
@ -78,7 +78,7 @@ class="cmr-12">abide by its terms:</span>
<pre class="verbatim" id="verbatim-13">
<pre class="verbatim" id="verbatim-15">
&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;MLD2P4&#x00A0;&#x00A0;version&#x00A0;2.2
&#x00A0;&#x00A0;MultiLevel&#x00A0;Domain&#x00A0;Decomposition&#x00A0;Parallel&#x00A0;Preconditioners&#x00A0;Package
&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;based&#x00A0;on&#x00A0;PSBLAS&#x00A0;(Parallel&#x00A0;Sparse&#x00A0;BLAS&#x00A0;version&#x00A0;3.5)
@ -127,7 +127,7 @@ class="cmr-12">here.</span>
<pre class="verbatim" id="verbatim-14">
<pre class="verbatim" id="verbatim-16">
//&#x00A0;***********************************************************************
//
//&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;MatchboxP:&#x00A0;A&#x00A0;C++&#x00A0;library&#x00A0;for&#x00A0;approximate&#x00A0;weighted&#x00A0;matching

@ -29,17 +29,21 @@ class="cmr-12">4.2 </span></span> <a
id="x16-150004.2"></a><span
class="cmr-12">GPU example</span></h4>
<!--l. 422--><p class="noindent" ><span
class="cmr-12">The code reported in Figure</span><span
class="cmr-12">&#x00A0;</span><a
href="#x16-15001r5"><span
class="cmr-12">5</span><!--tex4ht:ref: fig:gpu-ex1 --></a> <span
class="cmr-12">shows how to set up a program exploiting the combined</span>
class="cmr-12">The code discussed here shows how to set up a program exploiting the combined GPU</span>
<span
class="cmr-12">GPU capabilities of PSBLAS and AMG4PSBLAS.</span>
class="cmr-12">capabilities of PSBLAS and AMG4PSBLAS. The code example is available in the</span>
<span
class="cmr-12">source distribution directory </span><span class="obeylines-h"><span class="verb"><span
class="cmtt-12">amg4psblas/tests/gpu</span></span></span><span
class="cmr-12">.</span>
<!--l. 427--><p class="indent" > <span
class="cmr-12">First of all, we need to include the appropriate modules and declare some auxiliary</span>
<span
class="cmr-12">variables:</span>
<!--l. 425--><p class="indent" > <a
<!--l. 429--><p class="indent" > <a
id="x16-15001r5"></a><hr class="float"><div class="float"
>
@ -47,24 +51,143 @@ class="cmr-12">GPU capabilities of PSBLAS and AMG4PSBLAS.</span>
<div class="center"
>
<!--l. 437--><p class="noindent" >
<!--l. 451--><p class="noindent" >
<div class="minipage"><pre class="verbatim" id="verbatim-10">
!&#x00A0;build&#x00A0;a&#x00A0;one-level&#x00A0;RAS&#x00A0;with&#x00A0;overlap&#x00A0;2&#x00A0;and&#x00A0;ILU(0)&#x00A0;on&#x00A0;the&#x00A0;local&#x00A0;blocks.
call&#x00A0;P%init(&#8217;AS&#8217;,info)
call&#x00A0;P%set(&#8217;SUB_OVR&#8217;,2,info)
call&#x00A0;P%build(A,desc_A,info)
...&#x00A0;...
!&#x00A0;solve&#x00A0;Ax=b&#x00A0;with&#x00A0;preconditioned&#x00A0;BiCGSTAB
&#x00A0;&#x00A0;call&#x00A0;psb_krylov(&#8217;BICGSTAB&#8217;,A,P,b,x,tol,desc_A,info)
program&#x00A0;amg_d_pde3d
&#x00A0;&#x00A0;use&#x00A0;psb_base_mod
&#x00A0;&#x00A0;use&#x00A0;amg_prec_mod
&#x00A0;&#x00A0;use&#x00A0;psb_krylov_mod
&#x00A0;&#x00A0;use&#x00A0;psb_util_mod
&#x00A0;&#x00A0;use&#x00A0;psb_gpu_mod
&#x00A0;&#x00A0;use&#x00A0;data_input
&#x00A0;&#x00A0;use&#x00A0;amg_d_pde3d_base_mod
&#x00A0;&#x00A0;use&#x00A0;amg_d_pde3d_exp_mod
&#x00A0;&#x00A0;use&#x00A0;amg_d_pde3d_gauss_mod
&#x00A0;&#x00A0;use&#x00A0;amg_d_genpde_mod
&#x00A0;&#x00A0;implicit&#x00A0;none
&#x00A0;&#x00A0;.......
&#x00A0;&#x00A0;!&#x00A0;GPU&#x00A0;variables
&#x00A0;&#x00A0;type(psb_d_hlg_sparse_mat)&#x00A0;::&#x00A0;agmold
&#x00A0;&#x00A0;type(psb_d_vect_gpu)&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;::&#x00A0;vgmold
&#x00A0;&#x00A0;type(psb_i_vect_gpu)&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;::&#x00A0;igmold
&#x00A0;
</pre>
<!--l. 448--><p class="nopar" ></div></div>
<!--l. 473--><p class="nopar" ></div></div>
<br /> <div class="caption"
><span class="id">Listing 5: </span><span
class="content">setup of a one-level Schwarz preconditioner.</span></div><!--tex4ht:label?: x16-15001r5 -->
class="content">setup of a GPU-enabled test program part one.</span></div><!--tex4ht:label?: x16-15001r5 -->
</div><hr class="endfloat" />
<!--l. 481--><p class="indent" > <span
class="cmr-12">We then have to initialize the GPU environment, and pass the appropriate MOLD</span>
<span
class="cmr-12">variables to the build methods:</span>
<!--l. 483--><p class="indent" > <a
id="x16-15002r6"></a><hr class="float"><div class="float"
>
<div class="center"
>
<!--l. 499--><p class="noindent" >
<div class="minipage"><pre class="verbatim" id="verbatim-11">
&#x00A0;&#x00A0;call&#x00A0;psb_init(ctxt)
&#x00A0;&#x00A0;call&#x00A0;psb_info(ctxt,iam,np)
&#x00A0;&#x00A0;!
&#x00A0;&#x00A0;!&#x00A0;BEWARE:&#x00A0;if&#x00A0;you&#x00A0;have&#x00A0;NGPUS&#x00A0;&#x00A0;per&#x00A0;node,&#x00A0;the&#x00A0;default&#x00A0;is&#x00A0;to
&#x00A0;&#x00A0;!&#x00A0;attach&#x00A0;to&#x00A0;mod(IAM,NGPUS)
&#x00A0;&#x00A0;!
&#x00A0;&#x00A0;call&#x00A0;psb_gpu_init(ctxt)
&#x00A0;&#x00A0;......
&#x00A0;&#x00A0;t1&#x00A0;=&#x00A0;psb_wtime()
&#x00A0;&#x00A0;call&#x00A0;prec%smoothers_build(a,desc_a,info,&#x00A0;amold=agmold,&#x00A0;vmold=vgmold,&#x00A0;imold=igmold)
&#x00A0;
</pre>
<!--l. 514--><p class="nopar" ></div></div>
<br /> <div class="caption"
><span class="id">Listing 6: </span><span
class="content">setup of a GPU-enabled test program part two.</span></div><!--tex4ht:label?: x16-15002r6 -->
</div><hr class="endfloat" />
<!--l. 521--><p class="indent" > <span
class="cmr-12">Finally, we convert the input matrix, the descriptor and the vectors, then</span>
<span
class="cmr-12">preallocate the preconditioner workspace before entering the Krylov method. At the</span>
<span
class="cmr-12">end of the code, we close the GPU environment:</span>
<!--l. 524--><p class="indent" > <a
id="x16-15003r7"></a><hr class="float"><div class="float"
>
<div class="center"
>
<!--l. 553--><p class="noindent" >
<div class="minipage"><pre class="verbatim" id="verbatim-12">
&#x00A0;&#x00A0;call&#x00A0;desc_a%cnv(mold=igmold)
&#x00A0;&#x00A0;call&#x00A0;a%cscnv(info,mold=agmold)
&#x00A0;&#x00A0;call&#x00A0;psb_geasb(x,desc_a,info,mold=vgmold)
&#x00A0;&#x00A0;call&#x00A0;psb_geasb(b,desc_a,info,mold=vgmold)
&#x00A0;&#x00A0;!
&#x00A0;&#x00A0;!&#x00A0;iterative&#x00A0;method&#x00A0;parameters
&#x00A0;&#x00A0;!
&#x00A0;&#x00A0;call&#x00A0;psb_barrier(ctxt)
&#x00A0;&#x00A0;call&#x00A0;prec%allocate_wrk(info)
&#x00A0;&#x00A0;t1&#x00A0;=&#x00A0;psb_wtime()
&#x00A0;&#x00A0;call&#x00A0;psb_krylov(s_choice%kmethd,a,prec,b,x,s_choice%eps,&amp;
&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&amp;&#x00A0;desc_a,info,itmax=s_choice%itmax,iter=iter,err=err,itrace=s_choice%itrace,&amp;
&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&amp;&#x00A0;istop=s_choice%istopc,irst=s_choice%irst)
&#x00A0;&#x00A0;call&#x00A0;prec%free_wrk(info)
&#x00A0;&#x00A0;call&#x00A0;psb_barrier(ctxt)
&#x00A0;&#x00A0;tslv&#x00A0;=&#x00A0;psb_wtime()&#x00A0;-&#x00A0;t1
&#x00A0;&#x00A0;......
&#x00A0;&#x00A0;call&#x00A0;psb_gpu_exit()
&#x00A0;&#x00A0;call&#x00A0;psb_exit(ctxt)
&#x00A0;&#x00A0;stop
&#x00A0;
</pre>
<!--l. 580--><p class="nopar" ></div></div>
<br /> <div class="caption"
><span class="id">Listing 7: </span><span
class="content">setup of a GPU-enabled test program part three.</span></div><!--tex4ht:label?: x16-15003r7 -->
</div><hr class="endfloat" />
<!--l. 588--><p class="indent" > <span
class="cmr-12">It is very important to employ solvers that are suited to the GPU, i.e. solvers that</span>
<span
class="cmr-12">do NOT employ triangular system solve kernels. Solvers that satisfy this constraint</span>
<span
class="cmr-12">include:</span>
<ul class="itemize1">
<li class="itemize"><span class="obeylines-h"><span class="verb"><span
class="cmtt-12">JACOBI</span></span></span>
</li>
<li class="itemize"><span class="obeylines-h"><span class="verb"><span
class="cmtt-12">INVK</span></span></span>
</li>
<li class="itemize"><span class="obeylines-h"><span class="verb"><span
class="cmtt-12">INVT</span></span></span>
</li>
<li class="itemize"><span class="obeylines-h"><span class="verb"><span
class="cmtt-12">AINV</span></span></span></li></ul>
<!--l. 597--><p class="noindent" ><span
class="cmr-12">and their </span><span class="obeylines-h"><span class="verb"><span
class="cmtt-12">L1</span></span></span> <span
class="cmr-12">variants.</span>

@ -419,41 +419,182 @@ call P%build(A,desc_A,info)
\subsection{GPU example\label{sec:gpu-example}}
The code reported in Figure~\ref{fig:gpu-ex1} shows how to set up a
The code discussed here shows how to set up a
program exploiting the combined GPU capabilities of PSBLAS and
AMG4PSBLAS.
AMG4PSBLAS. The code example is available in the source distribution
directory \verb|amg4psblas/tests/gpu|.
First of all, we need to include the appropriate modules and
declare some auxiliary variables:
\begin{listing}[h!]
\ifpdf
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
! build a one-level RAS with overlap 2 and ILU(0) on the local blocks.
call P%init('AS',info)
call P%set('SUB_OVR',2,info)
call P%build(A,desc_A,info)
program amg_d_pde3d
use psb_base_mod
use amg_prec_mod
use psb_krylov_mod
use psb_util_mod
use psb_gpu_mod
use data_input
use amg_d_pde3d_base_mod
use amg_d_pde3d_exp_mod
use amg_d_pde3d_gauss_mod
use amg_d_genpde_mod
implicit none
.......
! GPU variables
type(psb_d_hlg_sparse_mat) :: agmold
type(psb_d_vect_gpu) :: vgmold
type(psb_i_vect_gpu) :: igmold
\end{minted}
\else
\begin{center}
\begin{minipage}{.90\textwidth}
{\small
\begin{verbatim}
program amg_d_pde3d
use psb_base_mod
use amg_prec_mod
use psb_krylov_mod
use psb_util_mod
use psb_gpu_mod
use data_input
use amg_d_pde3d_base_mod
use amg_d_pde3d_exp_mod
use amg_d_pde3d_gauss_mod
use amg_d_genpde_mod
implicit none
.......
! GPU variables
type(psb_d_hlg_sparse_mat) :: agmold
type(psb_d_vect_gpu) :: vgmold
type(psb_i_vect_gpu) :: igmold
\end{verbatim}
}
\end{minipage}
\end{center}
\fi
\caption{setup of a GPU-enabled test program part one.\label{fig:gpu-ex1}}
\end{listing}
We then have to initialize the GPU environment, and pass the
appropriate MOLD variables to the build methods:
\begin{listing}[h!]
\ifpdf
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
call psb_init(ctxt)
call psb_info(ctxt,iam,np)
!
! BEWARE: if you have NGPUS per node, the default is to
! attach to mod(IAM,NGPUS)
!
call psb_gpu_init(ctxt)
......
! solve Ax=b with preconditioned BiCGSTAB
call psb_krylov('BICGSTAB',A,P,b,x,tol,desc_A,info)
t1 = psb_wtime()
call prec%smoothers_build(a,desc_a,info, amold=agmold, vmold=vgmold, imold=igmold)
\end{minted}
\else
\begin{center}
\begin{minipage}{.90\textwidth}
{\small
\begin{verbatim}
! build a one-level RAS with overlap 2 and ILU(0) on the local blocks.
call P%init('AS',info)
call P%set('SUB_OVR',2,info)
call P%build(A,desc_A,info)
call psb_init(ctxt)
call psb_info(ctxt,iam,np)
!
! BEWARE: if you have NGPUS per node, the default is to
! attach to mod(IAM,NGPUS)
!
call psb_gpu_init(ctxt)
......
! solve Ax=b with preconditioned BiCGSTAB
call psb_krylov('BICGSTAB',A,P,b,x,tol,desc_A,info)
t1 = psb_wtime()
call prec%smoothers_build(a,desc_a,info, amold=agmold, vmold=vgmold, imold=igmold)
\end{verbatim}
}
\end{minipage}
\end{center}
\fi
\caption{setup of a one-level Schwarz preconditioner.\label{fig:gpu-ex1}}
\caption{setup of a GPU-enabled test program part two.\label{fig:gpu-ex2}}
\end{listing}
Finally, we convert the input matrix, the descriptor and the vectors,
then preallocate the preconditioner workspace before entering the
Krylov method. At the end of the code, we close the GPU environment:
\begin{listing}[h!]
\ifpdf
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
call desc_a%cnv(mold=igmold)
call a%cscnv(info,mold=agmold)
call psb_geasb(x,desc_a,info,mold=vgmold)
call psb_geasb(b,desc_a,info,mold=vgmold)
!
! iterative method parameters
!
call psb_barrier(ctxt)
call prec%allocate_wrk(info)
t1 = psb_wtime()
call psb_krylov(s_choice%kmethd,a,prec,b,x,s_choice%eps,&
& desc_a,info,itmax=s_choice%itmax,iter=iter,err=err,itrace=s_choice%itrace,&
& istop=s_choice%istopc,irst=s_choice%irst)
call prec%free_wrk(info)
call psb_barrier(ctxt)
tslv = psb_wtime() - t1
......
call psb_gpu_exit()
call psb_exit(ctxt)
stop
\end{minted}
\else
\begin{center}
\begin{minipage}{.90\textwidth}
{\small
\begin{verbatim}
call desc_a%cnv(mold=igmold)
call a%cscnv(info,mold=agmold)
call psb_geasb(x,desc_a,info,mold=vgmold)
call psb_geasb(b,desc_a,info,mold=vgmold)
!
! iterative method parameters
!
call psb_barrier(ctxt)
call prec%allocate_wrk(info)
t1 = psb_wtime()
call psb_krylov(s_choice%kmethd,a,prec,b,x,s_choice%eps,&
& desc_a,info,itmax=s_choice%itmax,iter=iter,err=err,itrace=s_choice%itrace,&
& istop=s_choice%istopc,irst=s_choice%irst)
call prec%free_wrk(info)
call psb_barrier(ctxt)
tslv = psb_wtime() - t1
......
call psb_gpu_exit()
call psb_exit(ctxt)
stop
\end{verbatim}
}
\end{minipage}
\end{center}
\fi
\caption{setup of a GPU-enabled test program part three.\label{fig:gpu-ex3}}
\end{listing}
It is very important to employ solvers that are suited
to the GPU, i.e. solvers that do NOT employ triangular
system solve kernels. Solvers that satisfy this constraint include:
\begin{itemize}
\item \verb|JACOBI|
\item \verb|INVK|
\item \verb|INVT|
\item \verb|AINV|
\end{itemize}
and their \verb|L1| variants.
%%% Local Variables:
%%% mode: latex

Loading…
Cancel
Save