Update docs for linsolve

repack-newsolve
sfilippone 3 months ago
parent 4f4006cf6b
commit a02440afff

@ -9,7 +9,7 @@ FINCLUDES=$(FMFLAG). $(FMFLAG)$(HERE) $(FMFLAG)$(MODDIR)
CINCLUDES=-I. -I$(HERE) -I$(INCLUDEDIR) CINCLUDES=-I. -I$(HERE) -I$(INCLUDEDIR)
PSBC_LIBS= -L$(LIBDIR) -lpsb_cbind PSBC_LIBS= -L$(LIBDIR) -lpsb_cbind
PSB_LIBS=-lpsb_util -lpsb_krylov -lpsb_prec -lpsb_base -L$(LIBDIR) PSB_LIBS=-lpsb_util -lpsb_linsolve -lpsb_prec -lpsb_base -L$(LIBDIR)
# #
# Compilers and such # Compilers and such

@ -54,11 +54,11 @@ href="userhtmlse10.html#x15-13500010" id="QQ2-15-165">Preconditioner routines</a
<br /> &#x00A0;<span class="sectionToc" >11 <a <br /> &#x00A0;<span class="sectionToc" >11 <a
href="userhtmlse11.html#x17-14200011" id="QQ2-17-172">Iterative Methods</a></span> href="userhtmlse11.html#x17-14200011" id="QQ2-17-172">Iterative Methods</a></span>
<br /> &#x00A0;<span class="sectionToc" >12 <a <br /> &#x00A0;<span class="sectionToc" >12 <a
href="userhtmlse12.html#x19-14400012" id="QQ2-19-174">Extensions</a></span> href="userhtmlse12.html#x19-14500012" id="QQ2-19-175">Extensions</a></span>
<br /> &#x00A0;<span class="sectionToc" >13 <a <br /> &#x00A0;<span class="sectionToc" >13 <a
href="userhtmlse13.html#x20-15300013" id="QQ2-20-189">CUDA Environment Routines</a></span> href="userhtmlse13.html#x20-15400013" id="QQ2-20-190">CUDA Environment Routines</a></span>
<br /> &#x00A0;<span class="likesectionToc" ><a <br /> &#x00A0;<span class="likesectionToc" ><a
href="userhtmlli2.html#x21-168000" id="QQ2-21-218">References</a></span> href="userhtmlli2.html#x21-169000" id="QQ2-21-219">References</a></span>
</div> </div>

@ -54,11 +54,11 @@ href="userhtmlse10.html#x15-13500010" id="QQ2-15-165">Preconditioner routines</a
<br /> &#x00A0;<span class="sectionToc" >11 <a <br /> &#x00A0;<span class="sectionToc" >11 <a
href="userhtmlse11.html#x17-14200011" id="QQ2-17-172">Iterative Methods</a></span> href="userhtmlse11.html#x17-14200011" id="QQ2-17-172">Iterative Methods</a></span>
<br /> &#x00A0;<span class="sectionToc" >12 <a <br /> &#x00A0;<span class="sectionToc" >12 <a
href="userhtmlse12.html#x19-14400012" id="QQ2-19-174">Extensions</a></span> href="userhtmlse12.html#x19-14500012" id="QQ2-19-175">Extensions</a></span>
<br /> &#x00A0;<span class="sectionToc" >13 <a <br /> &#x00A0;<span class="sectionToc" >13 <a
href="userhtmlse13.html#x20-15300013" id="QQ2-20-189">CUDA Environment Routines</a></span> href="userhtmlse13.html#x20-15400013" id="QQ2-20-190">CUDA Environment Routines</a></span>
<br /> &#x00A0;<span class="likesectionToc" ><a <br /> &#x00A0;<span class="likesectionToc" ><a
href="userhtmlli2.html#x21-168000" id="QQ2-21-218">References</a></span> href="userhtmlli2.html#x21-169000" id="QQ2-21-219">References</a></span>
</div> </div>

@ -310,46 +310,48 @@ href="userhtmlse10.html#x15-14100010.6" id="QQ2-15-171">free &#8212; Free a prec
href="userhtmlse11.html#x17-14200011">Iterative Methods</a></span> href="userhtmlse11.html#x17-14200011">Iterative Methods</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >11.1 <a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" >11.1 <a
href="userhtmlse11.html#x17-14300011.1" id="QQ2-17-173">psb_krylov &#8212; Krylov Methods Driver Routine</a></span> href="userhtmlse11.html#x17-14300011.1" id="QQ2-17-173">psb_krylov &#8212; Krylov Methods Driver Routine</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >11.2 <a
href="userhtmlse11.html#x17-14400011.2" id="QQ2-17-174">psb_richardson &#8212; Richardson Iteration Driver Routine</a></span>
<br /> &#x00A0;<span class="sectionToc" >12 <a <br /> &#x00A0;<span class="sectionToc" >12 <a
href="userhtmlse12.html#x19-14400012">Extensions</a></span> href="userhtmlse12.html#x19-14500012">Extensions</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >12.1 <a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" >12.1 <a
href="userhtmlse12.html#x19-14500012.1" id="QQ2-19-175">Using the extensions</a></span> href="userhtmlse12.html#x19-14600012.1" id="QQ2-19-176">Using the extensions</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >12.2 <a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" >12.2 <a
href="userhtmlse12.html#x19-14600012.2" id="QQ2-19-176">Extensions&#8217; Data Structures</a></span> href="userhtmlse12.html#x19-14700012.2" id="QQ2-19-177">Extensions&#8217; Data Structures</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >12.3 <a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" >12.3 <a
href="userhtmlse12.html#x19-14700012.3" id="QQ2-19-179">CPU-class extensions</a></span> href="userhtmlse12.html#x19-14800012.3" id="QQ2-19-180">CPU-class extensions</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >12.4 <a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" >12.4 <a
href="userhtmlse12.html#x19-15200012.4" id="QQ2-19-188">CUDA-class extensions</a></span> href="userhtmlse12.html#x19-15300012.4" id="QQ2-19-189">CUDA-class extensions</a></span>
<br /> &#x00A0;<span class="sectionToc" >13 <a <br /> &#x00A0;<span class="sectionToc" >13 <a
href="userhtmlse13.html#x20-15300013">CUDA Environment Routines</a></span> href="userhtmlse13.html#x20-15400013">CUDA Environment Routines</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-191">psb_cuda_init</a></span> href="userhtmlse13.html#Q1-20-192">psb_cuda_init</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-193">psb_cuda_exit</a></span> href="userhtmlse13.html#Q1-20-194">psb_cuda_exit</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-195">psb_cuda_DeviceSync</a></span> href="userhtmlse13.html#Q1-20-196">psb_cuda_DeviceSync</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-197">psb_cuda_getDeviceCount</a></span> href="userhtmlse13.html#Q1-20-198">psb_cuda_getDeviceCount</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-199">psb_cuda_getDevice</a></span> href="userhtmlse13.html#Q1-20-200">psb_cuda_getDevice</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-201">psb_cuda_setDevice</a></span> href="userhtmlse13.html#Q1-20-202">psb_cuda_setDevice</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-203">psb_cuda_DeviceHasUVA</a></span> href="userhtmlse13.html#Q1-20-204">psb_cuda_DeviceHasUVA</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-205">psb_cuda_WarpSize</a></span> href="userhtmlse13.html#Q1-20-206">psb_cuda_WarpSize</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-207">psb_cuda_MultiProcessors</a></span> href="userhtmlse13.html#Q1-20-208">psb_cuda_MultiProcessors</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-209">psb_cuda_MaxThreadsPerMP</a></span> href="userhtmlse13.html#Q1-20-210">psb_cuda_MaxThreadsPerMP</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-211">psb_cuda_MaxRegisterPerBlock</a></span> href="userhtmlse13.html#Q1-20-212">psb_cuda_MaxRegisterPerBlock</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-213">psb_cuda_MemoryClockRate</a></span> href="userhtmlse13.html#Q1-20-214">psb_cuda_MemoryClockRate</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-215">psb_cuda_MemoryBusWidth</a></span> href="userhtmlse13.html#Q1-20-216">psb_cuda_MemoryBusWidth</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a <br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-217">psb_cuda_MemoryPeakBandwidth</a></span> href="userhtmlse13.html#Q1-20-218">psb_cuda_MemoryPeakBandwidth</a></span>
</div> </div>

@ -16,7 +16,7 @@ href="userhtmlse13.html#tailuserhtmlse13.html" >prev-tail</a>] [<a
href="#tailuserhtmlli2.html">tail</a>] [<a href="#tailuserhtmlli2.html">tail</a>] [<a
href="userhtml.html# " >up</a>] </p></div> href="userhtml.html# " >up</a>] </p></div>
<h3 class="likesectionHead"><a <h3 class="likesectionHead"><a
id="x21-168000"></a>References</h3> id="x21-169000"></a>References</h3>
<!--l. 2--><p class="noindent" > <!--l. 2--><p class="noindent" >
<div class="thebibliography"> <div class="thebibliography">
<p class="bibitem" ><span class="biblabel"> <p class="bibitem" ><span class="biblabel">

@ -17,10 +17,9 @@ href="userhtmlse8.html#tailuserhtmlse11.html">tail</a>] [<a
href="userhtml.html# " >up</a>] </p></div> href="userhtml.html# " >up</a>] </p></div>
<h3 class="sectionHead"><span class="titlemark">11 </span> <a <h3 class="sectionHead"><span class="titlemark">11 </span> <a
id="x17-14200011"></a>Iterative Methods</h3> id="x17-14200011"></a>Iterative Methods</h3>
<!--l. 4--><p class="noindent" >In this chapter we provide routines for preconditioners and iterative methods. <!--l. 4--><p class="noindent" >In this chapter we provide routines for preconditioners and iterative methods. The
The interfaces for Krylov subspace methods are available in the module interfaces for iterative methods are available in the module <span class="obeylines-h"><span class="verb"><span
<span class="obeylines-h"><span class="verb"><span class="cmtt-10">psb_linsolve_mod</span></span></span>.
class="cmtt-10">psb_krylov_mod</span></span></span>.
@ -456,6 +455,344 @@ class="newline" />An integer value; 0 means no error has been detected.</dd></dl
<h4 class="subsectionHead"><span class="titlemark">11.2 </span> <a
id="x17-14400011.2"></a>psb_richardson &#8212; Richardson Iteration Driver Routine</h4>
<!--l. 158--><p class="noindent" >This subroutine is a driver implementing a Richardson iteration
<div class="math-display" >
<img
src="userhtml33x.png" alt="x_{k+1} = M^{-1}(b - A x_k) + x_k,
" class="math-display" ></div>
<!--l. 159--><p class="nopar" > with the preconditioner operator <span
class="cmmi-10">M </span>defined in the previous section.
<!--l. 162--><p class="indent" > The stopping criterion can take the following values:
<dl class="description"><dt class="description">
<!--l. 164--><p class="noindent" >
<span
class="cmbx-10">1</span> </dt><dd
class="description">
<!--l. 164--><p class="noindent" >normwise backward error in the infinity norm; the iteration is stopped
when
<div class="math-display" >
<img
src="userhtml34x.png" alt="err = \frac{\|r_i\|}{\|A\| \|x_i\| + \|b\|} &#x003C; eps
" class="math-display" ></div>
<!--l. 166--><p class="nopar" >
</dd><dt class="description">
<!--l. 167--><p class="noindent" >
<span
class="cmbx-10">2</span> </dt><dd
class="description">
<!--l. 167--><p class="noindent" >Relative residual in the 2-norm; the iteration is stopped when
<div class="math-display" >
<img
src="userhtml35x.png" alt="err = \frac{\|r_i\|}{\|b\|_2} &#x003C; eps
" class="math-display" ></div>
<!--l. 169--><p class="nopar" >
</dd><dt class="description">
<!--l. 170--><p class="noindent" >
<span
class="cmbx-10">3</span> </dt><dd
class="description">
<!--l. 170--><p class="noindent" >Relative residual reduction in the 2-norm; the iteration is stopped when
<div class="math-display" >
<img
src="userhtml36x.png" alt="err = \frac{\|r_i\|}{\|r_0\|_2} &#x003C; eps
" class="math-display" ></div>
<!--l. 172--><p class="nopar" ></dd></dl>
<!--l. 174--><p class="noindent" >The behaviour is controlled by the istop argument (see later). In the above formulae, <span
class="cmmi-10">x</span><sub><span
class="cmmi-7">i</span></sub>
is the tentative solution and <span
class="cmmi-10">r</span><sub><span
class="cmmi-7">i</span></sub> = <span
class="cmmi-10">b </span><span
class="cmsy-10">- </span><span
class="cmmi-10">Ax</span><sub><span
class="cmmi-7">i</span></sub> the corresponding residual at the <span
class="cmmi-10">i</span>-th
iteration.
<!--l. 179-->
<pre class="lstlisting" id="listing-168"><span class="label"><a
id="x17-144001r1"></a></span><span style="color:#000000"><span
class="cmtt-10">call</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
class="cmtt-10">psb_richardson</span></span><span style="color:#000000"><span
class="cmtt-10">(</span></span><span style="color:#000000"><span
class="cmtt-10">a</span></span><span style="color:#000000"><span
class="cmtt-10">,</span></span><span style="color:#000000"><span
class="cmtt-10">prec</span></span><span style="color:#000000"><span
class="cmtt-10">,</span></span><span style="color:#000000"><span
class="cmtt-10">b</span></span><span style="color:#000000"><span
class="cmtt-10">,</span></span><span style="color:#000000"><span
class="cmtt-10">x</span></span><span style="color:#000000"><span
class="cmtt-10">,</span></span><span style="color:#000000"><span
class="cmtt-10">eps</span></span><span style="color:#000000"><span
class="cmtt-10">,</span></span><span style="color:#000000"><span
class="cmtt-10">desc_a</span></span><span style="color:#000000"><span
class="cmtt-10">,</span></span><span style="color:#000000"><span
class="cmtt-10">info</span></span><span style="color:#000000"><span
class="cmtt-10">,&amp;</span></span>
<span class="label"><a
id="x17-144002r2"></a></span><span
class="cmtt-10">&#x00A0;</span><span
class="cmtt-10">&#x00A0;</span><span
class="cmtt-10">&#x00A0;</span><span
class="cmtt-10">&#x00A0;</span><span style="color:#000000"><span
class="cmtt-10">&amp;</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
class="cmtt-10">itmax</span></span><span style="color:#000000"><span
class="cmtt-10">,</span></span><span style="color:#000000"><span
class="cmtt-10">iter</span></span><span style="color:#000000"><span
class="cmtt-10">,</span></span><span style="color:#000000"><span
class="cmtt-10">err</span></span><span style="color:#000000"><span
class="cmtt-10">,</span></span><span style="color:#000000"><span
class="cmtt-10">itrace</span></span><span style="color:#000000"><span
class="cmtt-10">,</span></span><span style="color:#000000"><span
class="cmtt-10">istop</span></span><span style="color:#000000"><span
class="cmtt-10">)</span></span></pre>
<!--l. 184--><p class="indent" >
<dl class="description"><dt class="description">
<!--l. 185--><p class="noindent" >
<span
class="cmbx-10">Type:</span> </dt><dd
class="description">
<!--l. 185--><p class="noindent" >Synchronous.
</dd><dt class="description">
<!--l. 186--><p class="noindent" >
<span
class="cmbx-10">On Entry</span> </dt><dd
class="description">
<!--l. 186--><p class="noindent" >
</dd><dt class="description">
<!--l. 187--><p class="noindent" >
<span
class="cmbx-10">a</span> </dt><dd
class="description">
<!--l. 187--><p class="noindent" >the local portion of global sparse matrix <span
class="cmmi-10">A</span>. <br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="newline" />Specified as: a structured data of type <a
href="userhtmlse3.html#spdata"><span
class="cmtt-10">psb</span><span
class="cmtt-10">_Tspmat</span><span
class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 193--><p class="noindent" >
<span
class="cmbx-10">prec</span> </dt><dd
class="description">
<!--l. 193--><p class="noindent" >The data structure containing the preconditioner.<br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="newline" />Specified as: a structured data of type <a
href="userhtmlse3.html#precdata"><span
class="cmtt-10">psb</span><span
class="cmtt-10">_prec</span><span
class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 198--><p class="noindent" >
<span
class="cmbx-10">b</span> </dt><dd
class="description">
<!--l. 198--><p class="noindent" >The RHS vector. <br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="newline" />Specified as: a rank one array or an object of type <a
href="userhtmlse3.html#vdata"><span
class="cmtt-10">psb</span><span
class="cmtt-10">_T</span><span
class="cmtt-10">_vect</span><span
class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 203--><p class="noindent" >
<span
class="cmbx-10">x</span> </dt><dd
class="description">
<!--l. 203--><p class="noindent" >The initial guess. <br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="newline" />Intent: <span
class="cmbx-10">inout</span>.<br
class="newline" />Specified as: a rank one array or an object of type <a
href="userhtmlse3.html#vdata"><span
class="cmtt-10">psb</span><span
class="cmtt-10">_T</span><span
class="cmtt-10">_vect</span><span
class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 208--><p class="noindent" >
<span
class="cmbx-10">eps</span> </dt><dd
class="description">
<!--l. 208--><p class="noindent" >The stopping tolerance. <br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="newline" />Specified as: a real number.
</dd><dt class="description">
<!--l. 213--><p class="noindent" >
<span
class="cmbx-10">desc</span><span
class="cmbx-10">_a</span> </dt><dd
class="description">
<!--l. 213--><p class="noindent" >contains data structures for communications.<br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="newline" />Specified as: a structured data of type <a
href="userhtmlse3.html#descdata"><span
class="cmtt-10">psb</span><span
class="cmtt-10">_desc</span><span
class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 218--><p class="noindent" >
<span
class="cmbx-10">itmax</span> </dt><dd
class="description">
<!--l. 218--><p class="noindent" >The maximum number of iterations to perform.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="newline" />Default: <span
class="cmmi-10">itmax </span>= 1000.<br
class="newline" />Specified as: an integer variable <span
class="cmmi-10">itmax </span><span
class="cmsy-10">&#x2265; </span>1.
</dd><dt class="description">
<!--l. 224--><p class="noindent" >
<span
class="cmbx-10">itrace</span> </dt><dd
class="description">
<!--l. 224--><p class="noindent" >If <span
class="cmmi-10">&#x003E; </span>0 print out an informational message about convergence every <span
class="cmmi-10">itrace</span>
iterations. If = 0 print a message in case of convergence failure.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="newline" />Default: <span
class="cmmi-10">itrace </span>= <span
class="cmsy-10">-</span>1.<br
class="newline" />
</dd><dt class="description">
<!--l. 232--><p class="noindent" >
<span
class="cmbx-10">istop</span> </dt><dd
class="description">
<!--l. 232--><p class="noindent" >An integer specifying the stopping criterion.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span>.<br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="newline" />Values: 1: use the normwise backward error, 2: use the scaled 2-norm of
the residual, 3: use the residual reduction in the 2-norm. Default: 2.
</dd><dt class="description">
<!--l. 238--><p class="noindent" >
<span
class="cmbx-10">On Return</span> </dt><dd
class="description">
<!--l. 238--><p class="noindent" >
</dd><dt class="description">
<!--l. 239--><p class="noindent" >
<span
class="cmbx-10">x</span> </dt><dd
class="description">
<!--l. 239--><p class="noindent" >The computed solution. <br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="newline" />Intent: <span
class="cmbx-10">inout</span>.<br
class="newline" />Specified as: a rank one array or an object of type <a
href="userhtmlse3.html#vdata"><span
class="cmtt-10">psb</span><span
class="cmtt-10">_T</span><span
class="cmtt-10">_vect</span><span
class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 244--><p class="noindent" >
<span
class="cmbx-10">iter</span> </dt><dd
class="description">
<!--l. 244--><p class="noindent" >The number of iterations performed.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="newline" />Intent: <span
class="cmbx-10">out</span>.<br
class="newline" />Returned as: an integer variable.
</dd><dt class="description">
<!--l. 249--><p class="noindent" >
<span
class="cmbx-10">err</span> </dt><dd
class="description">
<!--l. 249--><p class="noindent" >The convergence estimate on exit.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="newline" />Intent: <span
class="cmbx-10">out</span>.<br
class="newline" />Returned as: a real number.
</dd><dt class="description">
<!--l. 254--><p class="noindent" >
<span
class="cmbx-10">info</span> </dt><dd
class="description">
<!--l. 254--><p class="noindent" >Error code.<br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="newline" />Type: <span
class="cmbx-10">required </span><br
class="newline" />Intent: <span
class="cmbx-10">out</span>.<br
class="newline" />An integer value; 0 means no error has been detected.</dd></dl>

@ -16,7 +16,7 @@ href="userhtmlse11.html#tailuserhtmlse11.html" >prev-tail</a>] [<a
href="userhtmlse9.html#tailuserhtmlse12.html">tail</a>] [<a href="userhtmlse9.html#tailuserhtmlse12.html">tail</a>] [<a
href="userhtml.html# " >up</a>] </p></div> href="userhtml.html# " >up</a>] </p></div>
<h3 class="sectionHead"><span class="titlemark">12 </span> <a <h3 class="sectionHead"><span class="titlemark">12 </span> <a
id="x19-14400012"></a>Extensions</h3> id="x19-14500012"></a>Extensions</h3>
<!--l. 3--><p class="noindent" >The EXT, CUDA and RSB subdirectories contain a set of extensions to the base <!--l. 3--><p class="noindent" >The EXT, CUDA and RSB subdirectories contain a set of extensions to the base
library. The extensions provide additional storage formats beyond the ones already library. The extensions provide additional storage formats beyond the ones already
contained in the base library, as well as interfaces to: contained in the base library, as well as interfaces to:
@ -49,7 +49,7 @@ in&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#XOurTechRep">22</a>]</span>. href="userhtmlli2.html#XOurTechRep">22</a>]</span>.
<!--l. 19--><p class="noindent" > <!--l. 19--><p class="noindent" >
<h4 class="subsectionHead"><span class="titlemark">12.1 </span> <a <h4 class="subsectionHead"><span class="titlemark">12.1 </span> <a
id="x19-14500012.1"></a>Using the extensions</h4> id="x19-14600012.1"></a>Using the extensions</h4>
<!--l. 21--><p class="noindent" >A sample application using the PSBLAS extensions will contain the following <!--l. 21--><p class="noindent" >A sample application using the PSBLAS extensions will contain the following
steps: steps:
<ul class="itemize1"> <ul class="itemize1">
@ -142,7 +142,7 @@ speed of the sparse matrix-vector product with the various data structures inclu
in the library. in the library.
<!--l. 146--><p class="noindent" > <!--l. 146--><p class="noindent" >
<h4 class="subsectionHead"><span class="titlemark">12.2 </span> <a <h4 class="subsectionHead"><span class="titlemark">12.2 </span> <a
id="x19-14600012.2"></a>Extensions&#8217; Data Structures</h4> id="x19-14700012.2"></a>Extensions&#8217; Data Structures</h4>
<!--l. 150--><p class="noindent" >Access to the facilities provided by the EXT library is mainly achieved through <!--l. 150--><p class="noindent" >Access to the facilities provided by the EXT library is mainly achieved through
the data types that are provided within. The data classes are derived from the data types that are provided within. The data classes are derived from
the base classes in PSBLAS, through the Fortran&#x00A0;2003 mechanism of <span the base classes in PSBLAS, through the Fortran&#x00A0;2003 mechanism of <span
@ -153,20 +153,20 @@ href="userhtmlli2.html#XMRC:11">17</a>]</span>.
<!--l. 155--><p class="indent" > The data classes are divided between the general purpose CPU extensions, the <!--l. 155--><p class="indent" > The data classes are divided between the general purpose CPU extensions, the
GPU interfaces and the RSB interfaces. In the description we will make use of the GPU interfaces and the RSB interfaces. In the description we will make use of the
notation introduced in Table&#x00A0;<a notation introduced in Table&#x00A0;<a
href="#x19-146001r21">21<!--tex4ht:ref: tab:notation --></a>. href="#x19-147001r21">21<!--tex4ht:ref: tab:notation --></a>.
<div class="table"> <div class="table">
<!--l. 160--><p class="indent" > <a <!--l. 160--><p class="indent" > <a
id="x19-146001r21"></a><hr class="float"><div class="float" id="x19-147001r21"></a><hr class="float"><div class="float"
> >
<div class="caption" <div class="caption"
><span class="id">Table&#x00A0;21: </span><span ><span class="id">Table&#x00A0;21: </span><span
class="content">Notation for parameters describing a sparse matrix</span></div><!--tex4ht:label?: x19-146001r21 --> class="content">Notation for parameters describing a sparse matrix</span></div><!--tex4ht:label?: x19-147001r21 -->
<div class="center" <div class="center"
> >
<!--l. 162--><p class="noindent" > <!--l. 162--><p class="noindent" >
@ -274,7 +274,7 @@ class="td11"> </td></tr></table>
<a <a
id="x19-146002r5"></a> id="x19-147002r5"></a>
@ -283,18 +283,18 @@ src="mat.png" alt="PIC"
width="147" height="147" > width="147" height="147" >
<br /> <div class="caption" <br /> <div class="caption"
><span class="id">Figure&#x00A0;5: </span><span ><span class="id">Figure&#x00A0;5: </span><span
class="content">Example of sparse matrix</span></div><!--tex4ht:label?: x19-146002r5 --> class="content">Example of sparse matrix</span></div><!--tex4ht:label?: x19-147002r5 -->
<!--l. 198--><p class="indent" > </div><hr class="endfigure"> <!--l. 198--><p class="indent" > </div><hr class="endfigure">
<h4 class="subsectionHead"><span class="titlemark">12.3 </span> <a <h4 class="subsectionHead"><span class="titlemark">12.3 </span> <a
id="x19-14700012.3"></a>CPU-class extensions</h4> id="x19-14800012.3"></a>CPU-class extensions</h4>
<!--l. 203--><p class="noindent" > <!--l. 203--><p class="noindent" >
<h5 class="likesubsubsectionHead"><a <h5 class="likesubsubsectionHead"><a
id="x19-148000"></a>ELLPACK</h5> id="x19-149000"></a>ELLPACK</h5>
<!--l. 205--><p class="noindent" >The ELLPACK/ITPACK format (shown in Figure&#x00A0;<a <!--l. 205--><p class="noindent" >The ELLPACK/ITPACK format (shown in Figure&#x00A0;<a
href="#x19-148001r6">6<!--tex4ht:ref: fig:ell --></a>) comprises two 2-dimensional href="#x19-149001r6">6<!--tex4ht:ref: fig:ell --></a>) comprises two 2-dimensional
arrays <span class="obeylines-h"><span class="verb"><span arrays <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">JA</span></span></span> with <span class="obeylines-h"><span class="verb"><span class="cmtt-10">JA</span></span></span> with <span class="obeylines-h"><span class="verb"><span
@ -315,7 +315,7 @@ row.
<a <a
id="x19-148001r6"></a> id="x19-149001r6"></a>
@ -325,13 +325,13 @@ width="233" height="233" >
<br /> <div class="caption" <br /> <div class="caption"
><span class="id">Figure&#x00A0;6: </span><span ><span class="id">Figure&#x00A0;6: </span><span
class="content">ELLPACK compression of matrix in Figure&#x00A0;<a class="content">ELLPACK compression of matrix in Figure&#x00A0;<a
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-148001r6 --> href="#x19-147002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-149001r6 -->
<!--l. 225--><p class="indent" > </div><hr class="endfigure"> <!--l. 225--><p class="indent" > </div><hr class="endfigure">
<a <a
id="x19-148002r1"></a> id="x19-149002r1"></a>
@ -341,8 +341,8 @@ href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:l
<!--l. 231--> <!--l. 231-->
<pre class="lstlisting" id="listing-168"><span class="label"><a <pre class="lstlisting" id="listing-169"><span class="label"><a
id="x19-148003r1"></a></span><span id="x19-149003r1"></a></span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
@ -352,7 +352,7 @@ class="cmtt-9">i</span></span><span style="color:#000000"><span
class="cmtt-9">=1,</span></span><span style="color:#000000"><span class="cmtt-9">=1,</span></span><span style="color:#000000"><span
class="cmtt-9">n</span></span> class="cmtt-9">n</span></span>
<span class="label"><a <span class="label"><a
id="x19-148004r2"></a></span><span id="x19-149004r2"></a></span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
@ -362,7 +362,7 @@ class="cmtt-9">&#x00A0;</span><span style="color:#000000"><span
class="cmtt-9">t</span></span><span style="color:#000000"><span class="cmtt-9">t</span></span><span style="color:#000000"><span
class="cmtt-9">=0</span></span> class="cmtt-9">=0</span></span>
<span class="label"><a <span class="label"><a
id="x19-148005r3"></a></span><span id="x19-149005r3"></a></span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
@ -374,7 +374,7 @@ class="cmtt-9">j</span></span><span style="color:#000000"><span
class="cmtt-9">=1,</span></span><span style="color:#000000"><span class="cmtt-9">=1,</span></span><span style="color:#000000"><span
class="cmtt-9">maxnzr</span></span> class="cmtt-9">maxnzr</span></span>
<span class="label"><a <span class="label"><a
id="x19-148006r4"></a></span><span id="x19-149006r4"></a></span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
@ -403,7 +403,7 @@ class="cmtt-9">,</span></span><span style="color:#000000"><span
class="cmtt-9">j</span></span><span style="color:#000000"><span class="cmtt-9">j</span></span><span style="color:#000000"><span
class="cmtt-9">))</span></span> class="cmtt-9">))</span></span>
<span class="label"><a <span class="label"><a
id="x19-148007r5"></a></span><span id="x19-149007r5"></a></span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
@ -413,7 +413,7 @@ class="cmtt-9">&#x00A0;</span><span style="color:#000000"><span
class="cmtt-9">end</span></span><span style="color:#000000"> </span><span style="color:#000000"><span class="cmtt-9">end</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
class="cmtt-9">do</span></span> class="cmtt-9">do</span></span>
<span class="label"><a <span class="label"><a
id="x19-148008r6"></a></span><span id="x19-149008r6"></a></span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
@ -427,7 +427,7 @@ class="cmtt-9">)</span></span><span style="color:#000000"> </span><span style="c
class="cmtt-9">=</span></span><span style="color:#000000"> </span><span style="color:#000000"><span class="cmtt-9">=</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
class="cmtt-9">t</span></span> class="cmtt-9">t</span></span>
<span class="label"><a <span class="label"><a
id="x19-148009r7"></a></span><span id="x19-149009r7"></a></span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span class="cmtt-9">&#x00A0;</span><span
@ -436,9 +436,9 @@ class="cmtt-9">end</span></span><span style="color:#000000"> </span><span style=
class="cmtt-9">do</span></span></pre> class="cmtt-9">do</span></span></pre>
<a <a
id="x19-148010r1"></a> id="x19-149010r1"></a>
<a <a
id="x19-148011"></a> id="x19-149011"></a>
<span <span
class="cmbx-10">Algorithm</span><span class="cmbx-10">Algorithm</span><span
class="cmbx-10">&#x00A0;1:</span>&#x00A0; Matrix-Vector product in ELL format class="cmbx-10">&#x00A0;1:</span>&#x00A0; Matrix-Vector product in ELL format
@ -450,7 +450,7 @@ class="cmbx-10">&#x00A0;1:</span>&#x00A0; Matrix-Vector product in ELL format
class="cmmi-10">y </span>= <span class="cmmi-10">y </span>= <span
class="cmmi-10">Ax </span>can be computed with the code shown in class="cmmi-10">Ax </span>can be computed with the code shown in
Alg.&#x00A0;<a Alg.&#x00A0;<a
href="#x19-148010r1">1<!--tex4ht:ref: alg:ell --></a>; it costs one memory write per outer iteration, plus three memory reads and href="#x19-149010r1">1<!--tex4ht:ref: alg:ell --></a>; it costs one memory write per outer iteration, plus three memory reads and
two floating-point operations per inner iteration. two floating-point operations per inner iteration.
<!--l. 247--><p class="indent" > Unless all rows have exactly the same number of nonzeros, some of the coefficients <!--l. 247--><p class="indent" > Unless all rows have exactly the same number of nonzeros, some of the coefficients
in the <span class="obeylines-h"><span class="verb"><span in the <span class="obeylines-h"><span class="verb"><span
@ -459,12 +459,12 @@ in terms of memory space and redundant operations (multiplications by zero). The
overhead can be acceptable if: overhead can be acceptable if:
<ol class="enumerate1" > <ol class="enumerate1" >
<li <li
class="enumerate" id="x19-148013x1"> class="enumerate" id="x19-149013x1">
<!--l. 253--><p class="noindent" >The maximum number of nonzeros per row is not much larger than the <!--l. 253--><p class="noindent" >The maximum number of nonzeros per row is not much larger than the
average; average;
</li> </li>
<li <li
class="enumerate" id="x19-148015x2"> class="enumerate" id="x19-149015x2">
<!--l. 255--><p class="noindent" >The regularity of the data structure allows for faster code, e.g. by allowing <!--l. 255--><p class="noindent" >The regularity of the data structure allows for faster code, e.g. by allowing
vectorization, thereby offsetting the additional storage requirements.</li></ol> vectorization, thereby offsetting the additional storage requirements.</li></ol>
<!--l. 259--><p class="noindent" >In the extreme case where the input matrix has one full row, the ELLPACK <!--l. 259--><p class="noindent" >In the extreme case where the input matrix has one full row, the ELLPACK
@ -492,7 +492,7 @@ class="cmtt-10">psb_T_ell_sparse_mat</span></span></span>:
</pre> </pre>
<!--l. 295--><p class="nopar" > </div></div> <!--l. 295--><p class="nopar" > </div></div>
<h5 class="likesubsubsectionHead"><a <h5 class="likesubsubsectionHead"><a
id="x19-149000"></a>Hacked ELLPACK</h5> id="x19-150000"></a>Hacked ELLPACK</h5>
<!--l. 303--><p class="noindent" >The <span <!--l. 303--><p class="noindent" >The <span
class="cmti-10">hacked ELLPACK </span>(<span class="cmti-10">hacked ELLPACK </span>(<span
class="cmbx-10">HLL</span>) format alleviates the main problem of the ELLPACK class="cmbx-10">HLL</span>) format alleviates the main problem of the ELLPACK
@ -558,7 +558,7 @@ format.
<a <a
id="x19-149001r7"></a> id="x19-150001r7"></a>
@ -568,7 +568,7 @@ width="248" height="248" >
<br /> <div class="caption" <br /> <div class="caption"
><span class="id">Figure&#x00A0;7: </span><span ><span class="id">Figure&#x00A0;7: </span><span
class="content">Hacked ELLPACK compression of matrix in Figure&#x00A0;<a class="content">Hacked ELLPACK compression of matrix in Figure&#x00A0;<a
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-149001r7 --> href="#x19-147002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-150001r7 -->
@ -595,9 +595,9 @@ class="cmtt-10">psb_T_hll_sparse_mat</span></span></span>:
</pre> </pre>
<!--l. 388--><p class="nopar" > </div></div> <!--l. 388--><p class="nopar" > </div></div>
<h5 class="likesubsubsectionHead"><a <h5 class="likesubsubsectionHead"><a
id="x19-150000"></a>Diagonal storage</h5> id="x19-151000"></a>Diagonal storage</h5>
<!--l. 396--><p class="noindent" >The DIAgonal (DIA) format (shown in Figure&#x00A0;<a <!--l. 396--><p class="noindent" >The DIAgonal (DIA) format (shown in Figure&#x00A0;<a
href="#x19-150001r8">8<!--tex4ht:ref: fig:dia --></a>) has a 2-dimensional array <span class="obeylines-h"><span class="verb"><span href="#x19-151001r8">8<!--tex4ht:ref: fig:dia --></a>) has a 2-dimensional array <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">AS</span></span></span> class="cmtt-10">AS</span></span></span>
containing in each column the coefficients along a diagonal of the matrix, and an containing in each column the coefficients along a diagonal of the matrix, and an
integer array <span class="obeylines-h"><span class="verb"><span integer array <span class="obeylines-h"><span class="verb"><span
@ -607,7 +607,7 @@ are padded with zeros as necessary.
<!--l. 402--><p class="indent" > The code to compute the matrix-vector product <span <!--l. 402--><p class="indent" > The code to compute the matrix-vector product <span
class="cmmi-10">y </span>= <span class="cmmi-10">y </span>= <span
class="cmmi-10">Ax </span>is shown in Alg.&#x00A0;<a class="cmmi-10">Ax </span>is shown in Alg.&#x00A0;<a
href="#x19-150003r2">2<!--tex4ht:ref: alg:dia --></a>; it href="#x19-151003r2">2<!--tex4ht:ref: alg:dia --></a>; it
costs one memory read per outer iteration, plus three memory reads, one memory costs one memory read per outer iteration, plus three memory reads, one memory
write and two floating-point operations per inner iteration. The accesses to write and two floating-point operations per inner iteration. The accesses to
<span class="obeylines-h"><span class="verb"><span <span class="obeylines-h"><span class="verb"><span
@ -620,7 +620,7 @@ required.
<a <a
id="x19-150001r8"></a> id="x19-151001r8"></a>
@ -630,13 +630,13 @@ width="248" height="248" >
<br /> <div class="caption" <br /> <div class="caption"
><span class="id">Figure&#x00A0;8: </span><span ><span class="id">Figure&#x00A0;8: </span><span
class="content">DIA compression of matrix in Figure&#x00A0;<a class="content">DIA compression of matrix in Figure&#x00A0;<a
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-150001r8 --> href="#x19-147002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-151001r8 -->
<!--l. 419--><p class="indent" > </div><hr class="endfigure"> <!--l. 419--><p class="indent" > </div><hr class="endfigure">
<a <a
id="x19-150002r2"></a> id="x19-151002r2"></a>
@ -662,9 +662,9 @@ href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:l
</pre> </pre>
<!--l. 450--><p class="nopar" > </div></div> <!--l. 450--><p class="nopar" > </div></div>
<a <a
id="x19-150003r2"></a> id="x19-151003r2"></a>
<a <a
id="x19-150004"></a> id="x19-151004"></a>
<span <span
class="cmbx-10">Algorithm</span><span class="cmbx-10">Algorithm</span><span
class="cmbx-10">&#x00A0;2:</span>&#x00A0; Matrix-Vector product in DIA format class="cmbx-10">&#x00A0;2:</span>&#x00A0; Matrix-Vector product in DIA format
@ -691,7 +691,7 @@ class="cmtt-10">psb_T_dia_sparse_mat</span></span></span>:
</pre> </pre>
<!--l. 486--><p class="nopar" > </div></div> <!--l. 486--><p class="nopar" > </div></div>
<h5 class="likesubsubsectionHead"><a <h5 class="likesubsubsectionHead"><a
id="x19-151000"></a>Hacked DIA</h5> id="x19-152000"></a>Hacked DIA</h5>
<!--l. 495--><p class="noindent" >Storage by DIAgonals is an attractive option for matrices whose coefficients are <!--l. 495--><p class="noindent" >Storage by DIAgonals is an attractive option for matrices whose coefficients are
located on a small set of diagonals, since they do away with storing explicitly the located on a small set of diagonals, since they do away with storing explicitly the
indices and therefore reduce significantly memory traffic. However, having a few indices and therefore reduce significantly memory traffic. However, having a few
@ -738,7 +738,7 @@ class="cmti-10">hackOffsets[k]</span>.
<a <a
id="x19-151001r9"></a> id="x19-152001r9"></a>
@ -748,7 +748,7 @@ width="248" height="248" >
<br /> <div class="caption" <br /> <div class="caption"
><span class="id">Figure&#x00A0;9: </span><span ><span class="id">Figure&#x00A0;9: </span><span
class="content">Hacked DIA compression of matrix in Figure&#x00A0;<a class="content">Hacked DIA compression of matrix in Figure&#x00A0;<a
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-151001r9 --> href="#x19-147002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-152001r9 -->
@ -793,7 +793,7 @@ class="cmtt-10">psb_T_hdia_sparse_mat</span></span></span>:
<h4 class="subsectionHead"><span class="titlemark">12.4 </span> <a <h4 class="subsectionHead"><span class="titlemark">12.4 </span> <a
id="x19-15200012.4"></a>CUDA-class extensions</h4> id="x19-15300012.4"></a>CUDA-class extensions</h4>
<!--l. 4--><p class="noindent" >For computing with CUDA we define a dual memorization strategy in which each <!--l. 4--><p class="noindent" >For computing with CUDA we define a dual memorization strategy in which each
variable on the CPU (&#8220;host&#8221;) side has a GPU (&#8220;device&#8221;) side. When a GPU-type variable on the CPU (&#8220;host&#8221;) side has a GPU (&#8220;device&#8221;) side. When a GPU-type
variable is initialized, the data contained is (usually) the same on both sides. Each variable is initialized, the data contained is (usually) the same on both sides. Each

@ -16,12 +16,12 @@ href="userhtmlse12.html#tailuserhtmlse12.html" >prev-tail</a>] [<a
href="userhtmlse10.html#tailuserhtmlse13.html">tail</a>] [<a href="userhtmlse10.html#tailuserhtmlse13.html">tail</a>] [<a
href="userhtml.html# " >up</a>] </p></div> href="userhtml.html# " >up</a>] </p></div>
<h3 class="sectionHead"><span class="titlemark">13 </span> <a <h3 class="sectionHead"><span class="titlemark">13 </span> <a
id="x20-15300013"></a>CUDA Environment Routines</h3> id="x20-15400013"></a>CUDA Environment Routines</h3>
<!--l. 91--><p class="noindent" > <!--l. 91--><p class="noindent" >
<h4 class="likesubsectionHead"><a <h4 class="likesubsectionHead"><a
id="x20-154000"></a>psb_cuda_init &#8212; Initializes PSBLAS-CUDA environment</h4> id="x20-155000"></a>psb_cuda_init &#8212; Initializes PSBLAS-CUDA environment</h4>
<a <a
id="Q1-20-191"></a> id="Q1-20-192"></a>
<div class="center" <div class="center"
> >
<!--l. 99--><p class="noindent" > <!--l. 99--><p class="noindent" >
@ -64,13 +64,13 @@ class="cmbx-12">Notes</span>
<ol class="enumerate1" > <ol class="enumerate1" >
<li <li
class="enumerate" id="x20-154002x1"> class="enumerate" id="x20-155002x1">
<!--l. 125--><p class="noindent" >A call to this routine must precede any other PSBLAS-CUDA call.</li></ol> <!--l. 125--><p class="noindent" >A call to this routine must precede any other PSBLAS-CUDA call.</li></ol>
<!--l. 129--><p class="noindent" > <!--l. 129--><p class="noindent" >
<h4 class="likesubsectionHead"><a <h4 class="likesubsectionHead"><a
id="x20-155000"></a>psb_cuda_exit &#8212; Exit from PSBLAS-CUDA environment</h4> id="x20-156000"></a>psb_cuda_exit &#8212; Exit from PSBLAS-CUDA environment</h4>
<a <a
id="Q1-20-193"></a> id="Q1-20-194"></a>
<div class="center" <div class="center"
> >
<!--l. 137--><p class="noindent" > <!--l. 137--><p class="noindent" >
@ -106,9 +106,9 @@ class="cmbx-10">in</span>.<br
class="newline" />Specified as: an integer variable.</dd></dl> class="newline" />Specified as: an integer variable.</dd></dl>
<!--l. 161--><p class="noindent" > <!--l. 161--><p class="noindent" >
<h4 class="likesubsectionHead"><a <h4 class="likesubsectionHead"><a
id="x20-156000"></a>psb_cuda_DeviceSync &#8212; Synchronize CUDA device</h4> id="x20-157000"></a>psb_cuda_DeviceSync &#8212; Synchronize CUDA device</h4>
<a <a
id="Q1-20-195"></a> id="Q1-20-196"></a>
@ -123,9 +123,9 @@ call&#x00A0;psb_cuda_DeviceSync()
CUDA-side code, have completed. CUDA-side code, have completed.
<!--l. 182--><p class="noindent" > <!--l. 182--><p class="noindent" >
<h4 class="likesubsectionHead"><a <h4 class="likesubsectionHead"><a
id="x20-157000"></a>psb_cuda_getDeviceCount </h4> id="x20-158000"></a>psb_cuda_getDeviceCount </h4>
<a <a
id="Q1-20-197"></a> id="Q1-20-198"></a>
<div class="center" <div class="center"
> >
<!--l. 190--><p class="noindent" > <!--l. 190--><p class="noindent" >
@ -136,9 +136,9 @@ ngpus&#x00A0;=&#x00A0;&#x00A0;psb_cuda_getDeviceCount()
<!--l. 199--><p class="noindent" >Get number of devices available on current computing node. <!--l. 199--><p class="noindent" >Get number of devices available on current computing node.
<!--l. 201--><p class="noindent" > <!--l. 201--><p class="noindent" >
<h4 class="likesubsectionHead"><a <h4 class="likesubsectionHead"><a
id="x20-158000"></a>psb_cuda_getDevice </h4> id="x20-159000"></a>psb_cuda_getDevice </h4>
<a <a
id="Q1-20-199"></a> id="Q1-20-200"></a>
<div class="center" <div class="center"
> >
<!--l. 209--><p class="noindent" > <!--l. 209--><p class="noindent" >
@ -149,9 +149,9 @@ ngpus&#x00A0;=&#x00A0;&#x00A0;psb_cuda_getDevice()
<!--l. 218--><p class="noindent" >Get device in use by current process. <!--l. 218--><p class="noindent" >Get device in use by current process.
<!--l. 220--><p class="noindent" > <!--l. 220--><p class="noindent" >
<h4 class="likesubsectionHead"><a <h4 class="likesubsectionHead"><a
id="x20-159000"></a>psb_cuda_setDevice </h4> id="x20-160000"></a>psb_cuda_setDevice </h4>
<a <a
id="Q1-20-201"></a> id="Q1-20-202"></a>
@ -165,9 +165,9 @@ info&#x00A0;=&#x00A0;psb_cuda_setDevice(dev)
<!--l. 237--><p class="noindent" >Set device to be used by current process. <!--l. 237--><p class="noindent" >Set device to be used by current process.
<!--l. 239--><p class="noindent" > <!--l. 239--><p class="noindent" >
<h4 class="likesubsectionHead"><a <h4 class="likesubsectionHead"><a
id="x20-160000"></a>psb_cuda_DeviceHasUVA </h4> id="x20-161000"></a>psb_cuda_DeviceHasUVA </h4>
<a <a
id="Q1-20-203"></a> id="Q1-20-204"></a>
<div class="center" <div class="center"
> >
<!--l. 247--><p class="noindent" > <!--l. 247--><p class="noindent" >
@ -178,9 +178,9 @@ hasUva&#x00A0;=&#x00A0;psb_cuda_DeviceHasUVA()
<!--l. 256--><p class="noindent" >Returns true if device currently in use supports UVA (Unified Virtual Addressing). <!--l. 256--><p class="noindent" >Returns true if device currently in use supports UVA (Unified Virtual Addressing).
<!--l. 259--><p class="noindent" > <!--l. 259--><p class="noindent" >
<h4 class="likesubsectionHead"><a <h4 class="likesubsectionHead"><a
id="x20-161000"></a>psb_cuda_WarpSize </h4> id="x20-162000"></a>psb_cuda_WarpSize </h4>
<a <a
id="Q1-20-205"></a> id="Q1-20-206"></a>
<div class="center" <div class="center"
> >
<!--l. 267--><p class="noindent" > <!--l. 267--><p class="noindent" >
@ -191,9 +191,9 @@ nw&#x00A0;=&#x00A0;psb_cuda_WarpSize()
<!--l. 276--><p class="noindent" >Returns the warp size. <!--l. 276--><p class="noindent" >Returns the warp size.
<!--l. 279--><p class="noindent" > <!--l. 279--><p class="noindent" >
<h4 class="likesubsectionHead"><a <h4 class="likesubsectionHead"><a
id="x20-162000"></a>psb_cuda_MultiProcessors </h4> id="x20-163000"></a>psb_cuda_MultiProcessors </h4>
<a <a
id="Q1-20-207"></a> id="Q1-20-208"></a>
@ -207,9 +207,9 @@ nmp&#x00A0;=&#x00A0;psb_cuda_MultiProcessors()
<!--l. 296--><p class="noindent" >Returns the number of multiprocessors in the CUDA device. <!--l. 296--><p class="noindent" >Returns the number of multiprocessors in the CUDA device.
<!--l. 298--><p class="noindent" > <!--l. 298--><p class="noindent" >
<h4 class="likesubsectionHead"><a <h4 class="likesubsectionHead"><a
id="x20-163000"></a>psb_cuda_MaxThreadsPerMP </h4> id="x20-164000"></a>psb_cuda_MaxThreadsPerMP </h4>
<a <a
id="Q1-20-209"></a> id="Q1-20-210"></a>
<div class="center" <div class="center"
> >
<!--l. 306--><p class="noindent" > <!--l. 306--><p class="noindent" >
@ -220,9 +220,9 @@ nt&#x00A0;=&#x00A0;psb_cuda_MaxThreadsPerMP()
<!--l. 315--><p class="noindent" >Returns the maximum number of threads per multiprocessor. <!--l. 315--><p class="noindent" >Returns the maximum number of threads per multiprocessor.
<!--l. 318--><p class="noindent" > <!--l. 318--><p class="noindent" >
<h4 class="likesubsectionHead"><a <h4 class="likesubsectionHead"><a
id="x20-164000"></a>psb_cuda_MaxRegistersPerBlock </h4> id="x20-165000"></a>psb_cuda_MaxRegistersPerBlock </h4>
<a <a
id="Q1-20-211"></a> id="Q1-20-212"></a>
<div class="center" <div class="center"
> >
<!--l. 326--><p class="noindent" > <!--l. 326--><p class="noindent" >
@ -233,9 +233,9 @@ nr&#x00A0;=&#x00A0;psb_cuda_MaxRegistersPerBlock()
<!--l. 335--><p class="noindent" >Returns the maximum number of registers per thread block. <!--l. 335--><p class="noindent" >Returns the maximum number of registers per thread block.
<!--l. 338--><p class="noindent" > <!--l. 338--><p class="noindent" >
<h4 class="likesubsectionHead"><a <h4 class="likesubsectionHead"><a
id="x20-165000"></a>psb_cuda_MemoryClockRate </h4> id="x20-166000"></a>psb_cuda_MemoryClockRate </h4>
<a <a
id="Q1-20-213"></a> id="Q1-20-214"></a>
@ -249,9 +249,9 @@ cl&#x00A0;=&#x00A0;psb_cuda_MemoryClockRate()
<!--l. 355--><p class="noindent" >Returns the memory clock rate in KHz, as an integer. <!--l. 355--><p class="noindent" >Returns the memory clock rate in KHz, as an integer.
<!--l. 357--><p class="noindent" > <!--l. 357--><p class="noindent" >
<h4 class="likesubsectionHead"><a <h4 class="likesubsectionHead"><a
id="x20-166000"></a>psb_cuda_MemoryBusWidth </h4> id="x20-167000"></a>psb_cuda_MemoryBusWidth </h4>
<a <a
id="Q1-20-215"></a> id="Q1-20-216"></a>
<div class="center" <div class="center"
> >
<!--l. 365--><p class="noindent" > <!--l. 365--><p class="noindent" >
@ -262,9 +262,9 @@ nb&#x00A0;=&#x00A0;psb_cuda_MemoryBusWidth()
<!--l. 374--><p class="noindent" >Returns the memory bus width in bits. <!--l. 374--><p class="noindent" >Returns the memory bus width in bits.
<!--l. 376--><p class="noindent" > <!--l. 376--><p class="noindent" >
<h4 class="likesubsectionHead"><a <h4 class="likesubsectionHead"><a
id="x20-167000"></a>psb_cuda_MemoryPeakBandwidth </h4> id="x20-168000"></a>psb_cuda_MemoryPeakBandwidth </h4>
<a <a
id="Q1-20-217"></a> id="Q1-20-218"></a>
<div class="center" <div class="center"
> >
<!--l. 384--><p class="noindent" > <!--l. 384--><p class="noindent" >

File diff suppressed because one or more lines are too long

@ -2,8 +2,8 @@
\label{sec:methods} \label{sec:methods}
In this chapter we provide routines for preconditioners and iterative In this chapter we provide routines for preconditioners and iterative
methods. The interfaces for Krylov subspace methods are available in methods. The interfaces for iterative methods are available in
the module \verb|psb_krylov_mod|. the module \verb|psb_linsolve_mod|.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %
@ -146,6 +146,119 @@ An integer value; 0 means no error has been detected.
\end{description} \end{description}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Richardson driver routine
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage\subsection{psb\_richardson \label{richardson} ---
Richardson Iteration Driver Routine}
This subroutine is a driver implementing a Richardson iteration
\[ x_{k+1} = M^{-1} (b-Ax_k) +x_k,\]
with the preconditioner operator $M$ defined in the previous section.
The stopping criterion can take the following values:
\begin{description}
\item[1] normwise backward error in the infinity
norm; the iteration is stopped when
\[ err = \frac{\|r_i\|}{(\|A\|\|x_i\|+\|b\|)} < eps \]
\item[2] Relative residual in the 2-norm; the iteration is stopped
when
\[ err = \frac{\|r_i\|_2}{\|b\|_2} < eps \]
\item[3] Relative residual reduction in the 2-norm; the iteration is stopped
when
\[ err = \frac{\|r_i\|_2}{\|r_0\|_2} < eps \]
\end{description}
The behaviour is controlled by the istop argument (see
later). In the above formulae, $x_i$ is the tentative solution and
$r_i=b-Ax_i$ the corresponding residual at the $i$-th iteration.
\begin{lstlisting}
call psb_richardson(a,prec,b,x,eps,desc_a,info,&
& itmax,iter,err,itrace,istop)
\end{lstlisting}
\begin{description}
\item[Type:] Synchronous.
\item[\bf On Entry]
\item[a] the local portion of global sparse matrix
$A$. \\
Scope: {\bf local} \\
Type: {\bf required}\\
Intent: {\bf in}.\\
Specified as: a structured data of type \spdata.
\item[prec] The data structure containing the preconditioner.\\
Scope: {\bf local} \\
Type: {\bf required}\\
Intent: {\bf in}.\\
Specified as: a structured data of type \precdata.
\item[b] The RHS vector. \\
Scope: {\bf local} \\
Type: {\bf required}\\
Intent: {\bf in}.\\
Specified as: a rank one array or an object of type \vdata.
\item[x] The initial guess. \\
Scope: {\bf local} \\
Type: {\bf required}\\
Intent: {\bf inout}.\\
Specified as: a rank one array or an object of type \vdata.
\item[eps] The stopping tolerance. \\
Scope: {\bf global} \\
Type: {\bf required}\\
Intent: {\bf in}.\\
Specified as: a real number.
\item[desc\_a] contains data structures for communications.\\
Scope: {\bf local} \\
Type: {\bf required}\\
Intent: {\bf in}.\\
Specified as: a structured data of type \descdata.
\item[itmax] The maximum number of iterations to perform.\\
Scope: {\bf global} \\
Type: {\bf optional}\\
Intent: {\bf in}.\\
Default: $itmax = 1000$.\\
Specified as: an integer variable $itmax \ge 1$.
\item[itrace] If $>0$ print out an informational message about
convergence every $itrace$ iterations. If $=0$ print a message in
case of convergence failure.\\
Scope: {\bf global} \\
Type: {\bf optional}\\
Intent: {\bf in}.\\
Default: $itrace = -1$.\\
\item[istop] An integer specifying the stopping criterion.\\
Scope: {\bf global} \\
Type: {\bf optional}\\
Intent: {\bf in}.\\
Values: 1: use the normwise backward error, 2: use the scaled 2-norm
of the residual, 3: use the residual reduction in the 2-norm. Default: 2.
\item[\bf On Return]
\item[x] The computed solution. \\
Scope: {\bf local} \\
Type: {\bf required}\\
Intent: {\bf inout}.\\
Specified as: a rank one array or an object of type \vdata.
\item[iter] The number of iterations performed.\\
Scope: {\bf global} \\
Type: {\bf optional}\\
Intent: {\bf out}.\\
Returned as: an integer variable.
\item[err] The convergence estimate on exit.\\
Scope: {\bf global} \\
Type: {\bf optional}\\
Intent: {\bf out}.\\
Returned as: a real number.
\item[info] Error code.\\
Scope: {\bf local} \\
Type: {\bf required} \\
Intent: {\bf out}.\\
An integer value; 0 means no error has been detected.
\end{description}
%%% Local Variables: %%% Local Variables:
%%% mode: latex %%% mode: latex
%%% TeX-master: "userguide" %%% TeX-master: "userguide"

Loading…
Cancel
Save