Merge branch 'repackage' of github.com:sfilippone/psblas3 into repackage

openacc
sfilippone 3 months ago
commit f825bf37a1

@ -1,7 +1,7 @@
all: guide
guide:
cd src && $(MAKE)
cd src && $(MAKE) clean all
doxy:
doxygen doxypsb
doxygen doxypsb

Binary file not shown.

After

Width:  |  Height:  |  Size: 50 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 58 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 61 KiB

@ -10,16 +10,16 @@
<link rel="stylesheet" type="text/css" href="userhtml.css">
</head><body
>
<!--l. 91--><p class="noindent" ><span
class="cmbx-12x-x-144">PSBLAS</span><br
<!--l. 99--><p class="noindent" ><span
class="pplb7t-x-x-172">PSBLAS</span><br
class="newline" /> <span
class="cmbx-12x-x-144">User&#8217;s and Reference Guide</span><br
class="pplb7t-x-x-172">User&#8217;s and Reference Guide</span><br
class="newline" /> <span
class="cmti-12">A reference guide for the Parallel Sparse BLAS library</span><br
class="pplri7t-x-x-120">A reference guide for the Parallel Sparse BLAS library</span><br
class="newline" /> <span
class="cmbx-10">Salvatore Filippone</span><br
class="pplb7t-">Salvatore Filippone</span><br
class="newline" /><span
class="cmbx-10">Alfredo Buttari </span><br
class="pplb7t-">Alfredo Buttari </span><br
class="newline" />Software version: 3.9.0<br
class="newline" />Aug 1st, 2024
@ -52,13 +52,13 @@ href="userhtmlse9.html#x14-1280009" id="QQ2-14-158">Utilities</a></span>
<br /> &#x00A0;<span class="sectionToc" >10 <a
href="userhtmlse10.html#x15-13500010" id="QQ2-15-165">Preconditioner routines</a></span>
<br /> &#x00A0;<span class="sectionToc" >11 <a
href="userhtmlse11.html#x17-14200011" id="QQ2-17-172">Iterative Methods</a></span>
href="userhtmlse11.html#x17-14300011" id="QQ2-17-174">Iterative Methods</a></span>
<br /> &#x00A0;<span class="sectionToc" >12 <a
href="userhtmlse12.html#x19-14500012" id="QQ2-19-175">Extensions</a></span>
href="userhtmlse12.html#x19-14600012" id="QQ2-19-177">Extensions</a></span>
<br /> &#x00A0;<span class="sectionToc" >13 <a
href="userhtmlse13.html#x20-15400013" id="QQ2-20-190">CUDA Environment Routines</a></span>
href="userhtmlse13.html#x20-15500013" id="QQ2-20-192">CUDA Environment Routines</a></span>
<br /> &#x00A0;<span class="likesectionToc" ><a
href="userhtmlli2.html#x21-169000" id="QQ2-21-219">References</a></span>
href="userhtmlli2.html#x21-170000" id="QQ2-21-221">References</a></span>
</div>

Binary file not shown.

After

Width:  |  Height:  |  Size: 90 KiB

@ -1,33 +1,62 @@
/* start css.sty */
.cmr-7{font-size:70%;}
.cmmi-5{font-size:50%;font-style: italic;}
.cmmi-7{font-size:70%;font-style: italic;}
.cmmi-10{font-style: italic;}
.cmsy-7{font-size:70%;}
.cmbx-12x-x-144{font-size:172%; font-weight: bold;}
.cmbx-12x-x-144{ font-weight: bold;}
.cmbx-12x-x-144{ font-weight: bold;}
.cmti-10{ font-style: italic;}
.cmti-12{font-size:120%; font-style: italic;}
.cmbx-10{ font-weight: bold;}
.cmbx-10{ font-weight: bold;}
.cmbx-10{ font-weight: bold;}
.pplb7t-x-x-172{font-size:172%;font-weight: bold;}
.pplb7t-x-x-172{font-weight: bold;}
.pplb7t-x-x-172{font-weight: bold;}
.pplri7t-{font-style: italic;}
.pplri7t-{font-style: italic;}
.pplri7t-x-x-120{font-size:120%;font-style: italic;}
.pplri7t-x-x-120{font-style: italic;}
.pplb7t-{font-weight: bold;}
.pplb7t-{font-weight: bold;}
.pplb7t-{font-weight: bold;}
.cmtt-10{font-family: monospace,monospace;}
.cmtt-10{font-family: monospace,monospace;}
.cmtt-10{font-family: monospace,monospace;}
.cmr-9{font-size:90%;}
.cmr-8{font-size:80%;}
.cmbx-12{font-size:120%; font-weight: bold;}
.cmbx-12{ font-weight: bold;}
.cmbx-12{ font-weight: bold;}
.pplr7t-x-x-76{font-size:76%;}
.zplmr7m-{font-style: italic;}
.zplmr7m-{font-style: italic;}
.zplmr7m-{font-style: italic;}
.zplmr7m-{font-style: italic;}
.zplmr7m-{font-style: italic;}
.zplmr7m-x-x-76{font-size:76%;font-style: italic;}
.zplmr7m-x-x-76{font-style: italic;}
.zplmr7m-x-x-76{font-style: italic;}
.zplmr7m-x-x-76{font-style: italic;}
.zplmr7m-x-x-76{font-style: italic;}
.zplmr7m-x-x-60{font-size:60%;font-style: italic;}
.zplmr7m-x-x-60{font-style: italic;}
.zplmr7m-x-x-60{font-style: italic;}
.zplmr7m-x-x-60{font-style: italic;}
.zplmr7m-x-x-60{font-style: italic;}
.zplmr7y-x-x-76{font-size:76%;}
.zplmr7t-x-x-76{font-size:76%;}
.pplr7t-x-x-90{font-size:90%;}
.pplr7t-x-x-80{font-size:80%;}
.pplb7t-x-x-120{font-size:120%;font-weight: bold;}
.pplb7t-x-x-120{font-weight: bold;}
.pplb7t-x-x-120{font-weight: bold;}
.cmtt-8{font-size:80%;font-family: monospace,monospace;}
.cmtt-8{font-family: monospace,monospace;}
.cmtt-8{font-family: monospace,monospace;}
.cmtt-9{font-size:90%;font-family: monospace,monospace;}
.cmtt-9{font-family: monospace,monospace;}
.cmtt-9{font-family: monospace,monospace;}
.cmmi-8{font-size:80%;font-style: italic;}
.pplr7t-x-x-70{font-size:70%;}
.zplmr7m-x-x-90{font-size:90%;font-style: italic;}
.zplmr7m-x-x-90{font-style: italic;}
.zplmr7m-x-x-90{font-style: italic;}
.zplmr7m-x-x-90{font-style: italic;}
.zplmr7m-x-x-90{font-style: italic;}
.zplmr7y-x-x-90{font-size:90%;}
.zplmr7m-x-x-80{font-size:80%;font-style: italic;}
.zplmr7m-x-x-80{font-style: italic;}
.zplmr7m-x-x-80{font-style: italic;}
.zplmr7m-x-x-80{font-style: italic;}
.zplmr7m-x-x-80{font-style: italic;}
.zplmr7t-x-x-80{font-size:80%;}
.pplrc7t-x-x-90{font-size:90%;}
.small-caps{font-variant: small-caps; }
p{margin-top:0;margin-bottom:0}
p.indent{text-indent:0;}
p + p{margin-top:1em;}
@ -158,5 +187,11 @@ pre.listings{font-family: monospace,monospace; white-space: pre-wrap; margin-top
pre.lstlisting{font-family: monospace,monospace; white-space: pre-wrap; margin-top:0.5em; margin-bottom:0.5em; }
pre.lstinputlisting{ font-family: monospace,monospace; white-space: pre-wrap; }
.lstinputlisting .label{margin-right:0.5em;}
#TBL-24-1{border-left: 1px solid black;}
#TBL-24-1{border-right:1px solid black;}
#TBL-24-2{border-right:1px solid black;}
#TBL-24-3{border-right:1px solid black;}
#TBL-24-4{border-right:1px solid black;}
#TBL-24-5{border-right:1px solid black;}
/* end css.sty */

@ -10,16 +10,16 @@
<link rel="stylesheet" type="text/css" href="userhtml.css">
</head><body
>
<!--l. 91--><p class="noindent" ><span
class="cmbx-12x-x-144">PSBLAS</span><br
<!--l. 99--><p class="noindent" ><span
class="pplb7t-x-x-172">PSBLAS</span><br
class="newline" /> <span
class="cmbx-12x-x-144">User&#8217;s and Reference Guide</span><br
class="pplb7t-x-x-172">User&#8217;s and Reference Guide</span><br
class="newline" /> <span
class="cmti-12">A reference guide for the Parallel Sparse BLAS library</span><br
class="pplri7t-x-x-120">A reference guide for the Parallel Sparse BLAS library</span><br
class="newline" /> <span
class="cmbx-10">Salvatore Filippone</span><br
class="pplb7t-">Salvatore Filippone</span><br
class="newline" /><span
class="cmbx-10">Alfredo Buttari </span><br
class="pplb7t-">Alfredo Buttari </span><br
class="newline" />Software version: 3.9.0<br
class="newline" />Aug 1st, 2024
@ -52,13 +52,13 @@ href="userhtmlse9.html#x14-1280009" id="QQ2-14-158">Utilities</a></span>
<br /> &#x00A0;<span class="sectionToc" >10 <a
href="userhtmlse10.html#x15-13500010" id="QQ2-15-165">Preconditioner routines</a></span>
<br /> &#x00A0;<span class="sectionToc" >11 <a
href="userhtmlse11.html#x17-14200011" id="QQ2-17-172">Iterative Methods</a></span>
href="userhtmlse11.html#x17-14300011" id="QQ2-17-174">Iterative Methods</a></span>
<br /> &#x00A0;<span class="sectionToc" >12 <a
href="userhtmlse12.html#x19-14500012" id="QQ2-19-175">Extensions</a></span>
href="userhtmlse12.html#x19-14600012" id="QQ2-19-177">Extensions</a></span>
<br /> &#x00A0;<span class="sectionToc" >13 <a
href="userhtmlse13.html#x20-15400013" id="QQ2-20-190">CUDA Environment Routines</a></span>
href="userhtmlse13.html#x20-15500013" id="QQ2-20-192">CUDA Environment Routines</a></span>
<br /> &#x00A0;<span class="likesectionToc" ><a
href="userhtmlli2.html#x21-169000" id="QQ2-21-219">References</a></span>
href="userhtmlli2.html#x21-170000" id="QQ2-21-221">References</a></span>
</div>

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.8 KiB

After

Width:  |  Height:  |  Size: 1.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 KiB

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.7 KiB

After

Width:  |  Height:  |  Size: 1.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.2 KiB

After

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.2 KiB

After

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

After

Width:  |  Height:  |  Size: 1.4 KiB

@ -10,10 +10,10 @@
<link rel="stylesheet" type="text/css" href="userhtml.css">
</head><body
>
<div class="footnote-text">
<!--l. 72--><p class="indent" > <span class="footnote-mark"><a
<div class="footnote-text">
<!--l. 72--><p class="indent" > <span class="footnote-mark"><a
id="fn4x0"><a
id="x16-136002x10.1"></a> <sup class="textsuperscript">4</sup></a></span><span
class="cmr-8">The string is case-insensitive</span></div>
id="x16-136002x10.1"></a> <sup class="textsuperscript">4</sup></a></span><span
class="pplr7t-x-x-80">The string is case-insensitive</span></div>
</body></html>

Binary file not shown.

Before

Width:  |  Height:  |  Size: 968 B

After

Width:  |  Height:  |  Size: 1021 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 KiB

After

Width:  |  Height:  |  Size: 1.1 KiB

@ -13,8 +13,11 @@
<div class="footnote-text">
<!--l. 53--><p class="noindent" ><span class="footnote-mark"><a
id="fn5x0"><a
id="x18-143004x11.1"></a> <sup class="textsuperscript">5</sup></a></span><span
class="cmr-8">Note: the implementation is for </span><span
class="cmmi-8">FCG</span><span
class="cmr-8">(1).</span></div>
id="x18-144004x11.1"></a> <sup class="textsuperscript">5</sup></a></span><span
class="pplr7t-x-x-80">Note: the implementation is for </span><span
class="zplmr7m-x-x-80">FCG</span><span
class="zplmr7t-x-x-80">(</span><span
class="pplr7t-x-x-80">1</span><span
class="zplmr7t-x-x-80">)</span><span
class="pplr7t-x-x-80">.</span></div>
</body></html>

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 KiB

After

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

After

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 KiB

After

Width:  |  Height:  |  Size: 1.0 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

After

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.5 KiB

After

Width:  |  Height:  |  Size: 8.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

After

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 KiB

After

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 970 B

After

Width:  |  Height:  |  Size: 978 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 420 B

After

Width:  |  Height:  |  Size: 399 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 710 B

After

Width:  |  Height:  |  Size: 700 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.7 KiB

After

Width:  |  Height:  |  Size: 1.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.8 KiB

After

Width:  |  Height:  |  Size: 2.0 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.0 KiB

After

Width:  |  Height:  |  Size: 1.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1016 B

After

Width:  |  Height:  |  Size: 1.0 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.1 KiB

After

Width:  |  Height:  |  Size: 2.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.5 KiB

After

Width:  |  Height:  |  Size: 1.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.5 KiB

After

Width:  |  Height:  |  Size: 1.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.8 KiB

After

Width:  |  Height:  |  Size: 1.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 KiB

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.6 KiB

After

Width:  |  Height:  |  Size: 1.7 KiB

@ -10,10 +10,10 @@
<link rel="stylesheet" type="text/css" href="userhtml.css">
</head><body
>
<div class="footnote-text">
<!--l. 151--><p class="indent" > <span class="footnote-mark"><a
<div class="footnote-text">
<!--l. 151--><p class="indent" > <span class="footnote-mark"><a
id="fn1x0"><a
id="x5-3003x2"></a> <sup class="textsuperscript">1</sup></a></span><span
class="cmr-8">In our prototype implementation we provide sample scatter/gather routines.</span></div>
id="x5-3003x2"></a> <sup class="textsuperscript">1</sup></a></span><span
class="pplr7t-x-x-80">In our prototype implementation we provide sample scatter/gather routines.</span></div>
</body></html>

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

After

Width:  |  Height:  |  Size: 1.3 KiB

@ -13,12 +13,12 @@
<div class="footnote-text">
<!--l. 195--><p class="noindent" ><span class="footnote-mark"><a
id="fn2x0"><a
id="x6-4002x2.1"></a> <sup class="textsuperscript">2</sup></a></span><span
class="cmr-8">This is the normal situation when the pattern of the sparse matrix is symmetric, which is</span>
id="x6-4002x2.1"></a> <sup class="textsuperscript">2</sup></a></span><span
class="pplr7t-x-x-80">This is the normal situation when the pattern of the sparse matrix is symmetric, which is equivalent to</span>
<span
class="cmr-8">equivalent to say that the interaction between two variables is reciprocal. If the matrix pattern is</span>
class="pplr7t-x-x-80">say that the interaction between two variables is reciprocal. If the matrix pattern is non-symmetric we may</span>
<span
class="cmr-8">non-symmetric we may have one-way interactions, and these could cause a situation in which a</span>
class="pplr7t-x-x-80">have one-way interactions, and these could cause a situation in which a boundary point is not a halo point</span>
<span
class="cmr-8">boundary point is not a halo point for its neighbour.</span></div>
class="pplr7t-x-x-80">for its neighbour.</span></div>
</body></html>

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.1 KiB

After

Width:  |  Height:  |  Size: 2.3 KiB

@ -11,16 +11,16 @@
</head><body
>
<div class="footnote-text">
<!--l. 362--><p class="noindent" ><span class="footnote-mark"><a
<!--l. 363--><p class="noindent" ><span class="footnote-mark"><a
id="fn3x0"><a
id="x7-6020x3"></a> <sup class="textsuperscript">3</sup></a></span><span
class="cmr-8">The subroutine style </span><span
id="x7-6020x3"></a> <sup class="textsuperscript">3</sup></a></span><span
class="pplr7t-x-x-80">The subroutine style </span><span
class="cmtt-8">psb</span><span
class="cmtt-8">_precinit </span><span
class="cmr-8">and </span><span
class="pplr7t-x-x-80">and </span><span
class="cmtt-8">psb</span><span
class="cmtt-8">_precbl </span><span
class="cmr-8">are still supported for backward</span>
class="cmtt-8">_precbld </span><span
class="pplr7t-x-x-80">are still supported for backward</span>
<span
class="cmr-8">compatibility</span></div>
class="pplr7t-x-x-80">compatibility</span></div>
</body></html>

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.8 KiB

After

Width:  |  Height:  |  Size: 1.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 KiB

After

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.7 KiB

After

Width:  |  Height:  |  Size: 1.8 KiB

@ -10,7 +10,7 @@
<link rel="stylesheet" type="text/css" href="userhtml.css">
</head><body
>
<!--l. 106--><div class="crosslinks"><p class="noindent">[<a
<!--l. 114--><div class="crosslinks"><p class="noindent">[<a
href="userhtmlse1.html" >next</a>] [<a
href="#tailuserhtmlli1.html">tail</a>] [<a
href="userhtml.html#userhtmlli1.html" >up</a>] </p></div>
@ -297,61 +297,63 @@ href="userhtmlse10.html#x15-13500010">Preconditioner routines</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >10.1 <a
href="userhtmlse10.html#x15-13600010.1" id="QQ2-15-166">init &#8212; Initialize a preconditioner</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >10.2 <a
href="userhtmlse10.html#x15-13700010.2" id="QQ2-15-167">build &#8212; Builds a preconditioner</a></span>
href="userhtmlse10.html#x15-13700010.2" id="QQ2-15-167">Set &#8212; set preconditioner parameters</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >10.3 <a
href="userhtmlse10.html#x15-13800010.3" id="QQ2-15-168">apply &#8212; Preconditioner application routine</a></span>
href="userhtmlse10.html#x15-13800010.3" id="QQ2-15-169">build &#8212; Builds a preconditioner</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >10.4 <a
href="userhtmlse10.html#x15-13900010.4" id="QQ2-15-169">descr &#8212; Prints a description of current preconditioner</a></span>
href="userhtmlse10.html#x15-13900010.4" id="QQ2-15-170">apply &#8212; Preconditioner application routine</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >10.5 <a
href="userhtmlse10.html#x15-14000010.5" id="QQ2-15-170">clone &#8212; clone current preconditioner</a></span>
href="userhtmlse10.html#x15-14000010.5" id="QQ2-15-171">descr &#8212; Prints a description of current preconditioner</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >10.6 <a
href="userhtmlse10.html#x15-14100010.6" id="QQ2-15-171">free &#8212; Free a preconditioner</a></span>
href="userhtmlse10.html#x15-14100010.6" id="QQ2-15-172">clone &#8212; clone current preconditioner</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >10.7 <a
href="userhtmlse10.html#x15-14200010.7" id="QQ2-15-173">free &#8212; Free a preconditioner</a></span>
<br /> &#x00A0;<span class="sectionToc" >11 <a
href="userhtmlse11.html#x17-14200011">Iterative Methods</a></span>
href="userhtmlse11.html#x17-14300011">Iterative Methods</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >11.1 <a
href="userhtmlse11.html#x17-14300011.1" id="QQ2-17-173">psb_krylov &#8212; Krylov Methods Driver Routine</a></span>
href="userhtmlse11.html#x17-14400011.1" id="QQ2-17-175">psb_krylov &#8212; Krylov Methods Driver Routine</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >11.2 <a
href="userhtmlse11.html#x17-14400011.2" id="QQ2-17-174">psb_richardson &#8212; Richardson Iteration Driver Routine</a></span>
href="userhtmlse11.html#x17-14500011.2" id="QQ2-17-176">psb_richardson &#8212; Richardson Iteration Driver Routine</a></span>
<br /> &#x00A0;<span class="sectionToc" >12 <a
href="userhtmlse12.html#x19-14500012">Extensions</a></span>
href="userhtmlse12.html#x19-14600012">Extensions</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >12.1 <a
href="userhtmlse12.html#x19-14600012.1" id="QQ2-19-176">Using the extensions</a></span>
href="userhtmlse12.html#x19-14700012.1" id="QQ2-19-178">Using the extensions</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >12.2 <a
href="userhtmlse12.html#x19-14700012.2" id="QQ2-19-177">Extensions&#8217; Data Structures</a></span>
href="userhtmlse12.html#x19-14800012.2" id="QQ2-19-179">Extensions&#8217; Data Structures</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >12.3 <a
href="userhtmlse12.html#x19-14800012.3" id="QQ2-19-180">CPU-class extensions</a></span>
href="userhtmlse12.html#x19-14900012.3" id="QQ2-19-182">CPU-class extensions</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" >12.4 <a
href="userhtmlse12.html#x19-15300012.4" id="QQ2-19-189">CUDA-class extensions</a></span>
href="userhtmlse12.html#x19-15400012.4" id="QQ2-19-191">CUDA-class extensions</a></span>
<br /> &#x00A0;<span class="sectionToc" >13 <a
href="userhtmlse13.html#x20-15400013">CUDA Environment Routines</a></span>
href="userhtmlse13.html#x20-15500013">CUDA Environment Routines</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-192">psb_cuda_init</a></span>
href="userhtmlse13.html#Q1-20-194">psb_cuda_init</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-194">psb_cuda_exit</a></span>
href="userhtmlse13.html#Q1-20-196">psb_cuda_exit</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-196">psb_cuda_DeviceSync</a></span>
href="userhtmlse13.html#Q1-20-198">psb_cuda_DeviceSync</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-198">psb_cuda_getDeviceCount</a></span>
href="userhtmlse13.html#Q1-20-200">psb_cuda_getDeviceCount</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-200">psb_cuda_getDevice</a></span>
href="userhtmlse13.html#Q1-20-202">psb_cuda_getDevice</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-202">psb_cuda_setDevice</a></span>
href="userhtmlse13.html#Q1-20-204">psb_cuda_setDevice</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-204">psb_cuda_DeviceHasUVA</a></span>
href="userhtmlse13.html#Q1-20-206">psb_cuda_DeviceHasUVA</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-206">psb_cuda_WarpSize</a></span>
href="userhtmlse13.html#Q1-20-208">psb_cuda_WarpSize</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-208">psb_cuda_MultiProcessors</a></span>
href="userhtmlse13.html#Q1-20-210">psb_cuda_MultiProcessors</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-210">psb_cuda_MaxThreadsPerMP</a></span>
href="userhtmlse13.html#Q1-20-212">psb_cuda_MaxThreadsPerMP</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-212">psb_cuda_MaxRegisterPerBlock</a></span>
href="userhtmlse13.html#Q1-20-214">psb_cuda_MaxRegisterPerBlock</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-214">psb_cuda_MemoryClockRate</a></span>
href="userhtmlse13.html#Q1-20-216">psb_cuda_MemoryClockRate</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-216">psb_cuda_MemoryBusWidth</a></span>
href="userhtmlse13.html#Q1-20-218">psb_cuda_MemoryBusWidth</a></span>
<br /> &#x00A0;&#x00A0;<span class="subsectionToc" ><a
href="userhtmlse13.html#Q1-20-218">psb_cuda_MemoryPeakBandwidth</a></span>
href="userhtmlse13.html#Q1-20-220">psb_cuda_MemoryPeakBandwidth</a></span>
</div>

@ -16,194 +16,210 @@ href="userhtmlse13.html#tailuserhtmlse13.html" >prev-tail</a>] [<a
href="#tailuserhtmlli2.html">tail</a>] [<a
href="userhtml.html# " >up</a>] </p></div>
<h3 class="likesectionHead"><a
id="x21-169000"></a>References</h3>
id="x21-170000"></a>References</h3>
<!--l. 2--><p class="noindent" >
<div class="thebibliography">
<p class="bibitem" ><span class="biblabel">
[1]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XPARA04FOREST"></a>G.&#x00A0;Bella, S.&#x00A0;Filippone, A.&#x00A0;De Maio and M.&#x00A0;Testa, <span
class="cmti-10">A Simulation Model</span>
class="pplri7t-">A Simulation Model</span>
<span
class="cmti-10">for Forest Fires</span>, in J.&#x00A0;Dongarra, K.&#x00A0;Madsen, J.&#x00A0;Wasniewski, editors,
class="pplri7t-">for Forest Fires</span>, in J.&#x00A0;Dongarra, K.&#x00A0;Madsen, J.&#x00A0;Wasniewski, editors,
Proceedings of PARA&#x00A0;04 Workshop on State of the Art in Scientific
Computing, pp.&#x00A0;546&#8211;553, Lecture Notes in Computer Science, Springer,
2005.
</p>
<p class="bibitem" ><span class="biblabel">
[2]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="X2007d"></a>A. Buttari, D. di Serafino, P. D&#8217;Ambra, S. Filippone, 2LEV-D2P4:
a package of high-performance preconditioners, Applicable Algebra in
Engineering, Communications and Computing, Volume 18, Number 3, May,
2007, pp. 223-239
id="XBERTACCINIFILIPPONE"></a>D. Bertaccini&#x00A0;and&#x00A0;S. Filippone, <span
class="pplri7t-">Sparse approximate</span>
<span
class="pplri7t-">inverse preconditioners on high performance GPU platforms</span>, Comput. Math.
Appl., 71, (2016), no.&#x00A0;3, 693&#8211;711.
</p>
<p class="bibitem" ><span class="biblabel">
[3]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="X2007c"></a>P. D&#8217;Ambra, S. Filippone, D. Di Serafino On the Development
of PSBLAS-based Parallel Two-level Schwarz Preconditioners Applied
Numerical Mathematics, Elsevier Science, Volume 57, Issues 11-12,
November-December 2007, Pages 1181-1196.
id="X2007d"></a>A. Buttari, D. di Serafino, P. D&#8217;Ambra, S. Filippone, 2LEV-D2P4:
a package of high-performance preconditioners, Applicable Algebra in
Engineering, Communications and Computing, Volume 18, Number 3,
May, 2007, pp. 223-239
</p>
<p class="bibitem" ><span class="biblabel">
[4]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="X2007c"></a>P. D&#8217;Ambra, S. Filippone, D. Di Serafino On the Development of
PSBLAS-based Parallel Two-level Schwarz Preconditioners Applied
Numerical Mathematics, Elsevier Science, Volume 57, Issues 11-12,
November-December 2007, Pages 1181-1196.
</p>
<p class="bibitem" ><span class="biblabel">
[5]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XBLAS2"></a>Dongarra, J. J., DuCroz, J., Hammarling, S. and Hanson, R., An
Extended Set of Fortran Basic Linear Algebra Subprograms, ACM Trans.
Math. Softw. vol.&#x00A0;14, 1&#8211;17, 1988.
</p>
<p class="bibitem" ><span class="biblabel">
[5]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
[6]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XBLAS3"></a>Dongarra, J., DuCroz, J., Hammarling, S. and Duff, I., A Set of level
3 Basic Linear Algebra Subprograms, ACM Trans. Math. Softw. vol.&#x00A0;16,
1&#8211;17, 1990.
</p>
<p class="bibitem" ><span class="biblabel">
[6]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XBLACS"></a>J.&#x00A0;J.&#x00A0;Dongarra and R.&#x00A0;C.&#x00A0;Whaley, <span
class="cmti-10">A User&#8217;s Guide to the BLACS</span>
<span
class="cmti-10">v.</span><span
class="cmti-10">&#x00A0;1.1</span>, Lapack Working Note 94, Tech.&#x00A0;Rep.&#x00A0;UT-CS-95-281, University of
Tennessee, March 1995 (updated May 1997).
</p>
<p class="bibitem" ><span class="biblabel">
[7]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XBLACS"></a>J.&#x00A0;J.&#x00A0;Dongarra and R.&#x00A0;C.&#x00A0;Whaley, <span
class="pplri7t-">A User&#8217;s Guide to the BLACS</span>
<span
class="pplri7t-">v.</span><span
class="pplri7t-">&#x00A0;1.1</span>, Lapack Working Note 94, Tech.&#x00A0;Rep.&#x00A0;UT-CS-95-281, University of
Tennessee, March 1995 (updated May 1997).
</p>
<p class="bibitem" ><span class="biblabel">
[8]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="Xsblas97"></a>I.&#x00A0;Duff, M.&#x00A0;Marrone, G.&#x00A0;Radicati and C.&#x00A0;Vittoli, <span
class="cmti-10">Level 3 Basic Linear</span>
class="pplri7t-">Level 3 Basic Linear</span>
<span
class="cmti-10">Algebra Subprograms for Sparse Matrices: a User Level Interface</span>, ACM
class="pplri7t-">Algebra Subprograms for Sparse Matrices: a User Level Interface</span>, ACM
Transactions on Mathematical Software, 23(3), pp.&#x00A0;379&#8211;401, 1997.
</p>
<p class="bibitem" ><span class="biblabel">
[8]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
[9]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="Xsblas02"></a>I.&#x00A0;Duff, M.&#x00A0;Heroux and R.&#x00A0;Pozo, <span
class="cmti-10">An Overview of the Sparse Basic</span>
<span
class="cmti-10">Linear Algebra Subprograms: the New Standard from the BLAS Technical</span>
class="pplri7t-">An Overview of the Sparse Basic Linear</span>
<span
class="cmti-10">Forum</span>, ACM Transactions on Mathematical Software, 28(2), pp.&#x00A0;239&#8211;267,
2002.
class="pplri7t-">Algebra Subprograms: the New Standard from the BLAS Technical Forum</span>, ACM
Transactions on Mathematical Software, 28(2), pp.&#x00A0;239&#8211;267, 2002.
</p>
<p class="bibitem" ><span class="biblabel">
[9]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XPSBLAS"></a>S.&#x00A0;Filippone and M.&#x00A0;Colajanni, <span
class="cmti-10">PSBLAS: A Library for Parallel</span>
[10]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XPSBLAS"></a>S.&#x00A0;Filippone and M.&#x00A0;Colajanni, <span
class="pplri7t-">PSBLAS: A Library for Parallel</span>
<span
class="cmti-10">Linear Algebra Computation on Sparse Matrices</span>, ACM Transactions on
class="pplri7t-">Linear Algebra Computation on Sparse Matrices</span>, ACM Transactions on
Mathematical Software, 26(4), pp.&#x00A0;527&#8211;550, 2000.
</p>
<p class="bibitem" ><span class="biblabel">
[10]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XSparse03"></a>S.&#x00A0;Filippone and A.&#x00A0;Buttari, <span
class="cmti-10">Object-Oriented Techniques for Sparse</span>
[11]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XSparse03"></a>S.&#x00A0;Filippone and A.&#x00A0;Buttari, <span
class="pplri7t-">Object-Oriented Techniques for Sparse</span>
<span
class="cmti-10">Matrix Computations in Fortran 2003</span>, ACM Transactions on Mathematical
class="pplri7t-">Matrix Computations in Fortran 2003</span>, ACM Transactions on Mathematical
Software, 38(4), 2012.
</p>
<p class="bibitem" ><span class="biblabel">
[11]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
[12]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XKIVA3PSBLAS"></a>S.&#x00A0;Filippone, P.&#x00A0;D&#8217;Ambra, M.&#x00A0;Colajanni, <span
class="cmti-10">Using a Parallel Library</span>
class="pplri7t-">Using a Parallel Library of</span>
<span
class="cmti-10">of Sparse Linear Algebra in a Fluid Dynamics Applications Code on</span>
class="pplri7t-">Sparse Linear Algebra in a Fluid Dynamics Applications Code on Linux</span>
<span
class="cmti-10">Linux Clusters</span>, in G.&#x00A0;Joubert, A.&#x00A0;Murli, F.&#x00A0;Peters, M.&#x00A0;Vanneschi, editors,
class="pplri7t-">Clusters</span>, in G.&#x00A0;Joubert, A.&#x00A0;Murli, F.&#x00A0;Peters, M.&#x00A0;Vanneschi, editors,
Parallel Computing - Advances &amp; Current Issues, pp.&#x00A0;441&#8211;448, Imperial
College Press, 2002.
</p>
<p class="bibitem" ><span class="biblabel">
[12]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XDesignPatterns"></a> Gamma, E., Helm, R., Johnson, R., and Vlissides, J. 1995. <span
class="cmti-10">Design</span>
[13]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XDesignPatterns"></a> Gamma, E., Helm, R., Johnson, R., and Vlissides, J. 1995. <span
class="pplri7t-">Design</span>
<span
class="cmti-10">Patterns: Elements of Reusable Object-Oriented Software</span>. Addison-Wesley.
class="pplri7t-">Patterns: Elements of Reusable Object-Oriented Software</span>. Addison-Wesley.
</p>
<p class="bibitem" ><span class="biblabel">
[13]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XMETIS"></a>Karypis, G. and Kumar, V., <span
class="cmti-10">METIS: Unstructured Graph Partitioning</span>
[14]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XMETIS"></a>Karypis, G. and Kumar, V., <span
class="pplri7t-">METIS: Unstructured Graph Partitioning</span>
<span
class="cmti-10">and Sparse Matrix Ordering System</span>. Minneapolis, MN 55455: University
class="pplri7t-">and Sparse Matrix Ordering System</span>. Minneapolis, MN 55455: University
of Minnesota, Department of Computer Science, 1995. Internet Address:
<span class="obeylines-h"><span class="verb"><span
class="cmtt-10">http://www.cs.umn.edu/~karypis</span></span></span>.
</p>
<p class="bibitem" ><span class="biblabel">
[14]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
[15]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XBLAS1"></a>Lawson, C., Hanson, R., Kincaid, D. and Krogh, F., Basic Linear
Algebra Subprograms for Fortran usage, ACM Trans. Math. Softw. vol.&#x00A0;5,
308&#8211;323, 1979.
</p>
<p class="bibitem" ><span class="biblabel">
[15]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="Xmachiels"></a>Machiels, L. and Deville, M. <span
class="cmti-10">Fortran 90: An entry to object-oriented</span>
[16]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="Xmachiels"></a>Machiels, L. and Deville, M. <span
class="pplri7t-">Fortran 90: An entry to object-oriented</span>
<span
class="cmti-10">programming for the solution of partial differential equations. </span>ACM Trans.
class="pplri7t-">programming for the solution of partial differential equations. </span>ACM Trans.
Math. Softw. vol.&#x00A0;23, 32&#8211;49.
</p>
<p class="bibitem" ><span class="biblabel">
[16]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
[17]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="Xmetcalf"></a>Metcalf, M., Reid, J. and Cohen, M. <span
class="cmti-10">Fortran 95/2003 explained. </span>Oxford
class="pplri7t-">Fortran 95/2003 explained. </span>Oxford
University Press, 2004.
</p>
<p class="bibitem" ><span class="biblabel">
[17]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
[18]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XMRC:11"></a>Metcalf, M., Reid, J. and Cohen, M. <span
class="cmti-10">Modern Fortran explained. </span>Oxford
class="pplri7t-">Modern Fortran explained. </span>Oxford
University Press, 2011.
</p>
<p class="bibitem" ><span class="biblabel">
[18]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XRouXiaXu:11"></a>Rouson, D.W.I., Xia, J., Xu, X.: Scientific Software Design: The
[19]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XRouXiaXu:11"></a>Rouson, D.W.I., Xia, J., Xu, X.: Scientific Software Design: The
Object-Oriented Way. Cambridge University Press (2011)
</p>
<p class="bibitem" ><span class="biblabel">
[19]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
[20]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XMPI1"></a>M.&#x00A0;Snir, S.&#x00A0;Otto, S.&#x00A0;Huss-Lederman, D.&#x00A0;Walker and J.&#x00A0;Dongarra,
<span
class="cmti-10">MPI: The Complete Reference. Volume 1 - The MPI Core</span>, second edition,
MIT Press, 1998.
class="pplri7t-">MPI: The Complete Reference. Volume 1 - The MPI Core</span>, second edition, MIT
Press, 1998.
</p>
<p class="bibitem" ><span class="biblabel">
[20]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span>
<a
id="XDesPat:11"></a>D.&#x00A0;Barbieri, V.&#x00A0;Cardellini, S.&#x00A0;Filippone and D.&#x00A0;Rouson <span
class="cmti-10">Design Patterns</span>
[21]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XDesPat:11"></a>D.&#x00A0;Barbieri, V.&#x00A0;Cardellini,
S.&#x00A0;Filippone and D.&#x00A0;Rouson <span
class="pplri7t-">Design Patterns for Scientific Computations</span>
<span
class="cmti-10">for Scientific Computations on Sparse Matrices</span>, HPSS 2011, Algorithms
and Programming Tools for Next-Generation High-Performance Scientific
Software, Bordeaux, Sep. 2011
class="pplri7t-">on Sparse Matrices</span>, HPSS 2011, Algorithms and Programming Tools for
Next-Generation High-Performance Scientific Software, Bordeaux, Sep.
2011
</p>
<p class="bibitem" ><span class="biblabel">
[21]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XCaFiRo:2014"></a> Cardellini, V., Filippone, S., and Rouson, D. 2014, Design patterns
[22]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XCaFiRo:2014"></a> Cardellini, V., Filippone, S., and Rouson, D. 2014, Design patterns
for sparse-matrix computations on hybrid CPU/GPU platforms, <span
class="cmti-10">Scientific</span>
class="pplri7t-">Scientific</span>
<span
class="cmti-10">Programming</span>&#x00A0;<span
class="cmti-10">22,</span>&#x00A0;1, 1&#8211;19.
class="pplri7t-">Programming</span>&#x00A0;<span
class="pplri7t-">22,</span>&#x00A0;1, 1&#8211;19.
</p>
<p class="bibitem" ><span class="biblabel">
[22]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
[23]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XOurTechRep"></a>D.&#x00A0;Barbieri, V.&#x00A0;Cardellini, A.&#x00A0;Fanfarillo, S.&#x00A0;Filippone, Three storage
formats for sparse matrices on GPGPUs, Tech. Rep. DICII RR-15.6,
Università di Roma Tor Vergata (February 2015).
</p>
<p class="bibitem" ><span class="biblabel">
[24]<span class="bibsp">&#x00A0;&#x00A0;&#x00A0;</span></span><a
id="XFilippone:2017:SMM:3034774.3017994"></a>S.&#x00A0;Filippone, V.&#x00A0;Cardellini, D.&#x00A0;Barbieri, and A.&#x00A0;Fanfarillo. Sparse
matrix-vector multiplication on GPGPUs. <span
class="pplri7t-">ACM Trans. Math. Softw.</span>,
43(4):30:1&#8211;30:49, 2017.
</p>
</div>
<!--l. 130--><div class="crosslinks"><p class="noindent">[<a
<!--l. 138--><div class="crosslinks"><p class="noindent">[<a
href="userhtmlse13.html" >prev</a>] [<a
href="userhtmlse13.html#tailuserhtmlse13.html" >prev-tail</a>] [<a
href="userhtmlli2.html" >front</a>] [<a
href="userhtml.html# " >up</a>] </p></div>
<!--l. 130--><p class="indent" > <a
<!--l. 138--><p class="indent" > <a
id="tailuserhtmlli2.html"></a>
</body></html>

@ -16,64 +16,66 @@ href="userhtmlli1.html" >prev</a>] [<a
href="userhtmlli1.html#tailuserhtmlli1.html" >prev-tail</a>] [<a
href="#tailuserhtmlse1.html">tail</a>] [<a
href="userhtml.html#userhtmlse1.html" >up</a>] </p></div>
<h3 class="sectionHead"><span class="titlemark">1 </span> <a
<h3 class="sectionHead"><span class="titlemark">1 </span> <a
id="x3-20001"></a>Introduction</h3>
<!--l. 3--><p class="noindent" >The PSBLAS library, developed with the aim to facilitate the parallelization of
computationally intensive scientific applications, is designed to address parallel
implementation of iterative solvers for sparse linear systems through the distributed
memory paradigm. It includes routines for multiplying sparse matrices by dense
matrices, solving block diagonal systems with triangular diagonal entries,
preprocessing sparse matrices, and contains additional routines for dense matrix
operations. The current implementation of PSBLAS addresses a distributed memory
execution model operating with message passing.
implementation of iterative solvers for sparse linear systems through the
distributed memory paradigm. It includes routines for multiplying sparse
matrices by dense matrices, solving block diagonal systems with triangular
diagonal entries, preprocessing sparse matrices, and contains additional
routines for dense matrix operations. The current implementation of PSBLAS
addresses a distributed memory execution model operating with message
passing.
<!--l. 14--><p class="indent" > The PSBLAS library version 3 is implemented in the Fortran&#x00A0;2003&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#Xmetcalf">16</a>]</span>
href="userhtmlli2.html#Xmetcalf">17</a>]</span>
programming language, with reuse and/or adaptation of existing Fortran&#x00A0;77 and
Fortran&#x00A0;95 software, plus a handful of C routines.
<!--l. 19--><p class="indent" > The use of Fortran&#x00A0;2003 offers a number of advantages over Fortran&#x00A0;95, mostly in
the handling of requirements for evolution and adaptation of the library to new
computing architectures and integration of new algorithms. For a detailed discussion
of our design see&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#XSparse03">10</a>]</span>; other works discussing advanced programming in Fortran&#x00A0;2003
include&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#XDesPat:11">20</a>,&#x00A0;<a
href="userhtmlli2.html#XRouXiaXu:11">18</a>]</span>; sufficient support for Fortran&#x00A0;2003 is now available from many
compilers, including the GNU Fortran compiler from the Free Software Foundation
(as of version 4.8).
<!--l. 19--><p class="indent" > The use of Fortran&#x00A0;2003 offers a number of advantages over Fortran&#x00A0;95, mostly
in the handling of requirements for evolution and adaptation of the library to new
computing architectures and integration of new algorithms. For a detailed
discussion of our design see&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#XSparse03">11</a>]</span>; other works discussing advanced programming in
Fortran&#x00A0;2003 include&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#XDesPat:11">21</a>,&#x00A0;<a
href="userhtmlli2.html#XRouXiaXu:11">19</a>]</span>; sufficient support for Fortran&#x00A0;2003 is now available
from many compilers, including the GNU Fortran compiler from the Free Software
Foundation (as of version 4.8).
<!--l. 30--><p class="indent" > Previous approaches have been based on mixing Fortran&#x00A0;95, with its support for
object-based design, with other languages; these have been advocated by a number of
authors, e.g.&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#Xmachiels">15</a>]</span>. Moreover, the Fortran&#x00A0;95 facilities for dynamic memory
object-based design, with other languages; these have been advocated by a number
of authors, e.g.&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#Xmachiels">16</a>]</span>. Moreover, the Fortran&#x00A0;95 facilities for dynamic memory
management and interface overloading greatly enhance the usability of the PSBLAS
subroutines. In this way, the library can take care of runtime memory requirements
that are quite difficult or even impossible to predict at implementation or
compilation time.
<!--l. 40--><p class="indent" > The presentation of the PSBLAS library follows the general structure of the
proposal for serial Sparse BLAS&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#Xsblas97">7</a>,&#x00A0;<a
href="userhtmlli2.html#Xsblas02">8</a>]</span>, which in its turn is based on the proposal for
href="userhtmlli2.html#Xsblas97">8</a>,&#x00A0;<a
href="userhtmlli2.html#Xsblas02">9</a>]</span>, which in its turn is based on the proposal for
BLAS on dense matrices&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#XBLAS1">14</a>,&#x00A0;<a
href="userhtmlli2.html#XBLAS2">4</a>,&#x00A0;<a
href="userhtmlli2.html#XBLAS3">5</a>]</span>.
<!--l. 45--><p class="indent" > The applicability of sparse iterative solvers to many different areas causes some
terminology problems because the same concept may be denoted through different
names depending on the application area. The PSBLAS features presented in this
document will be discussed referring to a finite difference discretization of a Partial
Differential Equation (PDE). However, the scope of the library is wider than that: for
example, it can be applied to finite element discretizations of PDEs, and even to
different classes of problems such as nonlinear optimization, for example in optimal
control problems.
href="userhtmlli2.html#XBLAS1">15</a>,&#x00A0;<a
href="userhtmlli2.html#XBLAS2">5</a>,&#x00A0;<a
href="userhtmlli2.html#XBLAS3">6</a>]</span>.
<!--l. 45--><p class="indent" > The applicability of sparse iterative solvers to many different areas causes
some terminology problems because the same concept may be denoted
through different names depending on the application area. The PSBLAS
features presented in this document will be discussed referring to a finite
difference discretization of a Partial Differential Equation (PDE). However,
the scope of the library is wider than that: for example, it can be applied
to finite element discretizations of PDEs, and even to different classes of
problems such as nonlinear optimization, for example in optimal control
problems.
<!--l. 55--><p class="indent" > The design of a solver for sparse linear systems is driven by many conflicting
objectives, such as limiting occupation of storage resources, exploiting regularities in
the input data, exploiting hardware characteristics of the parallel platform. To
achieve an optimal communication to computation ratio on distributed memory
machines it is essential to keep the <span
class="cmti-10">data locality </span>as high as possible; this can be
done through an appropriate data allocation strategy. The choice of the
machines it is essential to keep the <span
class="pplri7t-">data locality </span>as high as possible; this can
be done through an appropriate data allocation strategy. The choice of the
preconditioner is another very important factor that affects efficiency of the
implemented application. Optimal data distribution requirements for a given
preconditioner may conflict with distribution requirements of the rest of the solver.

File diff suppressed because it is too large Load Diff

@ -15,48 +15,48 @@ href="userhtmlse10.html" >prev</a>] [<a
href="userhtmlse10.html#tailuserhtmlse10.html" >prev-tail</a>] [<a
href="userhtmlse8.html#tailuserhtmlse11.html">tail</a>] [<a
href="userhtml.html# " >up</a>] </p></div>
<h3 class="sectionHead"><span class="titlemark">11 </span> <a
id="x17-14200011"></a>Iterative Methods</h3>
<h3 class="sectionHead"><span class="titlemark">11 </span> <a
id="x17-14300011"></a>Iterative Methods</h3>
<!--l. 4--><p class="noindent" >In this chapter we provide routines for preconditioners and iterative methods. The
interfaces for iterative methods are available in the module <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_linsolve_mod</span></span></span>.
<h4 class="subsectionHead"><span class="titlemark">11.1 </span> <a
id="x17-14300011.1"></a>psb_krylov &#8212; Krylov Methods Driver Routine</h4>
<h4 class="subsectionHead"><span class="titlemark">11.1 </span> <a
id="x17-14400011.1"></a>psb_krylov &#8212; Krylov Methods Driver Routine</h4>
<!--l. 17--><p class="noindent" >This subroutine is a driver that provides a general interface for all the Krylov-Subspace
family methods implemented in PSBLAS version 2.
<!--l. 20--><p class="indent" > The stopping criterion can take the following values:
<dl class="description"><dt class="description">
<!--l. 22--><p class="noindent" >
<span
class="cmbx-10">1</span> </dt><dd
class="pplb7t-">1</span> </dt><dd
class="description">
<!--l. 22--><p class="noindent" >normwise backward error in the infinity norm; the iteration is stopped
when
<div class="math-display" >
<img
src="userhtml30x.png" alt=" -----&#x2225;ri&#x2225;------
err = (&#x2225;A&#x2225;&#x2225;xi&#x2225;+ &#x2225;b&#x2225;) &#x003C; eps
src="userhtml30x.png" alt=" ------&#x2225;ri&#x2225;------
err = (&#x2225;A &#x2225;&#x2225;xi&#x2225; + &#x2225;b&#x2225;) &#x003C; eps
" class="math-display" ></div>
<!--l. 24--><p class="nopar" >
</dd><dt class="description">
<!--l. 25--><p class="noindent" >
<span
class="cmbx-10">2</span> </dt><dd
class="pplb7t-">2</span> </dt><dd
class="description">
<!--l. 25--><p class="noindent" >Relative residual in the 2-norm; the iteration is stopped when
<div class="math-display" >
<img
src="userhtml31x.png" alt=" &#x2225;ri&#x2225;-
err = &#x2225;b&#x2225;2 &#x003C; eps
src="userhtml31x.png" alt=" &#x2225;ri&#x2225;
err = &#x2225;b&#x2225;-2 &#x003C; eps
" class="math-display" ></div>
<!--l. 27--><p class="nopar" >
</dd><dt class="description">
<!--l. 28--><p class="noindent" >
<span
class="cmbx-10">3</span> </dt><dd
class="pplb7t-">3</span> </dt><dd
class="description">
<!--l. 28--><p class="noindent" >Relative residual reduction in the 2-norm; the iteration is stopped when
<div class="math-display" >
@ -66,23 +66,24 @@ err = &#x2225;r0&#x2225;2 &#x003C; eps
" class="math-display" ></div>
<!--l. 30--><p class="nopar" ></dd></dl>
<!--l. 32--><p class="noindent" >The behaviour is controlled by the istop argument (see later). In the above formulae, <span
class="cmmi-10">x</span><sub><span
class="cmmi-7">i</span></sub>
class="zplmr7m-">x</span><sub><span
class="zplmr7m-x-x-76">i</span></sub>
is the tentative solution and <span
class="cmmi-10">r</span><sub><span
class="cmmi-7">i</span></sub> = <span
class="cmmi-10">b </span><span
class="cmsy-10">- </span><span
class="cmmi-10">Ax</span><sub><span
class="cmmi-7">i</span></sub> the corresponding residual at the <span
class="cmmi-10">i</span>-th
class="zplmr7m-">r</span><sub><span
class="zplmr7m-x-x-76">i</span></sub> <span
class="zplmr7t-">= </span><span
class="zplmr7m-">b</span><span
class="zplmr7y-">-</span><span
class="zplmr7m-">Ax</span><sub><span
class="zplmr7m-x-x-76">i</span></sub> the corresponding residual at the <span
class="zplmr7m-">i</span>-th
iteration.
<!--l. 37-->
<pre class="lstlisting" id="listing-167"><span class="label"><a
id="x17-143001r1"></a></span><span style="color:#000000"><span
<pre class="lstlisting" id="listing-218"><span class="label"><a
id="x17-144001r1"></a></span><span style="color:#000000"><span
class="cmtt-10">call</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
class="cmtt-10">psb_krylov</span></span><span style="color:#000000"><span
class="cmtt-10">(</span></span><span style="color:#000000"><span
@ -103,7 +104,7 @@ class="cmtt-10">,</span></span><span style="color:#000000"><span
class="cmtt-10">info</span></span><span style="color:#000000"><span
class="cmtt-10">,&amp;</span></span>
<span class="label"><a
id="x17-143002r2"></a></span><span
id="x17-144002r2"></a></span><span
class="cmtt-10">&#x00A0;</span><span
class="cmtt-10">&#x00A0;</span><span
class="cmtt-10">&#x00A0;</span><span
@ -128,58 +129,58 @@ class="cmtt-10">)</span></span></pre>
<dl class="description"><dt class="description">
<!--l. 43--><p class="noindent" >
<span
class="cmbx-10">Type:</span> </dt><dd
class="pplb7t-">Type:</span> </dt><dd
class="description">
<!--l. 43--><p class="noindent" >Synchronous.
</dd><dt class="description">
<!--l. 44--><p class="noindent" >
<span
class="cmbx-10">On Entry</span> </dt><dd
class="pplb7t-">On Entry</span> </dt><dd
class="description">
<!--l. 44--><p class="noindent" >
</dd><dt class="description">
<!--l. 45--><p class="noindent" >
<span
class="cmbx-10">method</span> </dt><dd
class="pplb7t-">method</span> </dt><dd
class="description">
<!--l. 45--><p class="noindent" >a string that defines the iterative method to be used. Supported values
are:
<dl class="description"><dt class="description">
<!--l. 48--><p class="noindent" >
<span
class="cmbx-10">CG:</span> </dt><dd
class="pplb7t-">CG:</span> </dt><dd
class="description">
<!--l. 48--><p class="noindent" >the Conjugate Gradient method;
</dd><dt class="description">
<!--l. 49--><p class="noindent" >
<span
class="cmbx-10">CGS:</span> </dt><dd
class="pplb7t-">CGS:</span> </dt><dd
class="description">
<!--l. 49--><p class="noindent" >the Conjugate Gradient Squared method;
</dd><dt class="description">
<!--l. 51--><p class="noindent" >
<span
class="cmbx-10">GCR:</span> </dt><dd
class="pplb7t-">GCR:</span> </dt><dd
class="description">
<!--l. 51--><p class="noindent" >the Generalized Conjugate Residual method;
</dd><dt class="description">
<!--l. 52--><p class="noindent" >
<span
class="cmbx-10">FCG:</span> </dt><dd
class="pplb7t-">FCG:</span> </dt><dd
class="description">
<!--l. 52--><p class="noindent" >the Flexible Conjugate Gradient method<span class="footnote-mark"><a
href="userhtml18.html#fn5x0"><sup class="textsuperscript">5</sup></a></span><a
id="x17-143003f5"></a> ;
id="x17-144003f5"></a> ;
</dd><dt class="description">
<!--l. 55--><p class="noindent" >
<span
class="cmbx-10">BICG:</span> </dt><dd
class="pplb7t-">BICG:</span> </dt><dd
class="description">
<!--l. 55--><p class="noindent" >the Bi-Conjugate Gradient method;
</dd><dt class="description">
<!--l. 56--><p class="noindent" >
<span
class="cmbx-10">BICGSTAB:</span> </dt><dd
class="pplb7t-">BICGSTAB:</span> </dt><dd
class="description">
<!--l. 56--><p class="noindent" >the Bi-Conjugate Gradient Stabilized method;
@ -188,28 +189,28 @@ class="description">
</dd><dt class="description">
<!--l. 57--><p class="noindent" >
<span
class="cmbx-10">BICGSTABL:</span> </dt><dd
class="pplb7t-">BICGSTABL:</span> </dt><dd
class="description">
<!--l. 57--><p class="noindent" >the Bi-Conjugate Gradient Stabilized method with restarting;
</dd><dt class="description">
<!--l. 58--><p class="noindent" >
<span
class="cmbx-10">RGMRES:</span> </dt><dd
class="pplb7t-">RGMRES:</span> </dt><dd
class="description">
<!--l. 58--><p class="noindent" >the Generalized Minimal Residual method with restarting.</dd></dl>
</dd><dt class="description">
<!--l. 60--><p class="noindent" >
<span
class="cmbx-10">a</span> </dt><dd
class="pplb7t-">a</span> </dt><dd
class="description">
<!--l. 60--><p class="noindent" >the local portion of global sparse matrix <span
class="cmmi-10">A</span>. <br
class="zplmr7m-">A</span>. <br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: a structured data of type <a
href="userhtmlse3.html#spdata"><span
class="cmtt-10">psb</span><span
@ -218,32 +219,32 @@ class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 66--><p class="noindent" >
<span
class="cmbx-10">prec</span> </dt><dd
class="pplb7t-">prec</span> </dt><dd
class="description">
<!--l. 66--><p class="noindent" >The data structure containing the preconditioner.<br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: a structured data of type <a
href="userhtmlse3.html#precdata"><span
class="cmtt-10">psb</span><span
class="cmtt-10">_prec</span><span
class="cmtt-10">_Tprec</span><span
class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 71--><p class="noindent" >
<span
class="cmbx-10">b</span> </dt><dd
class="pplb7t-">b</span> </dt><dd
class="description">
<!--l. 71--><p class="noindent" >The RHS vector. <br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: a rank one array or an object of type <a
href="userhtmlse3.html#vdata"><span
class="cmtt-10">psb</span><span
@ -253,15 +254,15 @@ class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 76--><p class="noindent" >
<span
class="cmbx-10">x</span> </dt><dd
class="pplb7t-">x</span> </dt><dd
class="description">
<!--l. 76--><p class="noindent" >The initial guess. <br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">inout</span>.<br
class="pplb7t-">inout</span>.<br
class="newline" />Specified as: a rank one array or an object of type <a
href="userhtmlse3.html#vdata"><span
class="cmtt-10">psb</span><span
@ -271,32 +272,32 @@ class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 81--><p class="noindent" >
<span
class="cmbx-10">eps</span> </dt><dd
class="pplb7t-">eps</span> </dt><dd
class="description">
<!--l. 81--><p class="noindent" >The stopping tolerance. <br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: a real number.
</dd><dt class="description">
<!--l. 86--><p class="noindent" >
<span
class="cmbx-10">desc</span><span
class="cmbx-10">_a</span> </dt><dd
class="pplb7t-">desc</span><span
class="pplb7t-">_a</span> </dt><dd
class="description">
<!--l. 86--><p class="noindent" >contains data structures for communications.<br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: a structured data of type <a
href="userhtmlse3.html#descdata"><span
class="cmtt-10">psb</span><span
@ -305,90 +306,93 @@ class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 91--><p class="noindent" >
<span
class="cmbx-10">itmax</span> </dt><dd
class="pplb7t-">itmax</span> </dt><dd
class="description">
<!--l. 91--><p class="noindent" >The maximum number of iterations to perform.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="pplb7t-">optional</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Default: <span
class="cmmi-10">itmax </span>= 1000.<br
class="zplmr7m-">itmax </span><span
class="zplmr7t-">= </span>1000.<br
class="newline" />Specified as: an integer variable <span
class="cmmi-10">itmax </span><span
class="cmsy-10">&#x2265; </span>1.
class="zplmr7m-">itmax </span><span
class="zplmr7y-">&#x2265; </span>1.
</dd><dt class="description">
<!--l. 97--><p class="noindent" >
<span
class="cmbx-10">itrace</span> </dt><dd
class="pplb7t-">itrace</span> </dt><dd
class="description">
<!--l. 97--><p class="noindent" >If <span
class="cmmi-10">&#x003E; </span>0 print out an informational message about convergence every <span
class="cmmi-10">itrace</span>
iterations. If = 0 print a message in case of convergence failure.<br
class="zplmr7m-">&#x003E; </span>0 print out an informational message about convergence every <span
class="zplmr7m-">itrace</span>
iterations. If <span
class="zplmr7t-">= </span>0 print a message in case of convergence failure.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="pplb7t-">optional</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Default: <span
class="cmmi-10">itrace </span>= <span
class="cmsy-10">-</span>1.<br
class="zplmr7m-">itrace </span><span
class="zplmr7t-">= </span><span
class="zplmr7y-">-</span>1.<br
class="newline" />
</dd><dt class="description">
<!--l. 104--><p class="noindent" >
<span
class="cmbx-10">irst</span> </dt><dd
class="pplb7t-">irst</span> </dt><dd
class="description">
<!--l. 104--><p class="noindent" >An integer specifying the restart parameter.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span>.<br
class="pplb7t-">optional</span>.<br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Values: <span
class="cmmi-10">irst &#x003E; </span>0. This is employed for the BiCGSTABL or RGMRES methods,
class="zplmr7m-">irst &#x003E; </span>0. This is employed for the BiCGSTABL or RGMRES methods,
otherwise it is ignored.
</dd><dt class="description">
<!--l. 111--><p class="noindent" >
<span
class="cmbx-10">istop</span> </dt><dd
class="pplb7t-">istop</span> </dt><dd
class="description">
<!--l. 111--><p class="noindent" >An integer specifying the stopping criterion.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span>.<br
class="pplb7t-">optional</span>.<br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Values: 1: use the normwise backward error, 2: use the scaled 2-norm
of the residual, 3: use the residual reduction in the 2-norm. Default:
2.
</dd><dt class="description">
<!--l. 117--><p class="noindent" >
<span
class="cmbx-10">On Return</span> </dt><dd
class="pplb7t-">On Return</span> </dt><dd
class="description">
<!--l. 117--><p class="noindent" >
</dd><dt class="description">
<!--l. 118--><p class="noindent" >
<span
class="cmbx-10">x</span> </dt><dd
class="pplb7t-">x</span> </dt><dd
class="description">
<!--l. 118--><p class="noindent" >The computed solution. <br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">inout</span>.<br
class="pplb7t-">inout</span>.<br
class="newline" />Specified as: a rank one array or an object of type <a
href="userhtmlse3.html#vdata"><span
class="cmtt-10">psb</span><span
@ -398,65 +402,65 @@ class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 123--><p class="noindent" >
<span
class="cmbx-10">iter</span> </dt><dd
class="pplb7t-">iter</span> </dt><dd
class="description">
<!--l. 123--><p class="noindent" >The number of iterations performed.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="pplb7t-">optional</span><br
class="newline" />Intent: <span
class="cmbx-10">out</span>.<br
class="pplb7t-">out</span>.<br
class="newline" />Returned as: an integer variable.
</dd><dt class="description">
<!--l. 128--><p class="noindent" >
<span
class="cmbx-10">err</span> </dt><dd
class="pplb7t-">err</span> </dt><dd
class="description">
<!--l. 128--><p class="noindent" >The convergence estimate on exit.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="pplb7t-">optional</span><br
class="newline" />Intent: <span
class="cmbx-10">out</span>.<br
class="pplb7t-">out</span>.<br
class="newline" />Returned as: a real number.
</dd><dt class="description">
<!--l. 133--><p class="noindent" >
<span
class="cmbx-10">cond</span> </dt><dd
class="pplb7t-">cond</span> </dt><dd
class="description">
<!--l. 133--><p class="noindent" >An estimate of the condition number of matrix <span
class="cmmi-10">A</span>; only available with the <span
class="cmmi-10">CG</span>
class="zplmr7m-">A</span>; only available with the <span
class="zplmr7m-">CG</span>
method on real data.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="pplb7t-">optional</span><br
class="newline" />Intent: <span
class="cmbx-10">out</span>.<br
class="pplb7t-">out</span>.<br
class="newline" />Returned as: a real number. A correct result will be greater than or
equal to one; if specified for non-real data, or an error occurred, zero is
returned.
</dd><dt class="description">
<!--l. 141--><p class="noindent" >
<span
class="cmbx-10">info</span> </dt><dd
class="pplb7t-">info</span> </dt><dd
class="description">
<!--l. 141--><p class="noindent" >Error code.<br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required </span><br
class="pplb7t-">required </span><br
class="newline" />Intent: <span
class="cmbx-10">out</span>.<br
class="pplb7t-">out</span>.<br
class="newline" />An integer value; 0 means no error has been detected.</dd></dl>
<h4 class="subsectionHead"><span class="titlemark">11.2 </span> <a
id="x17-14400011.2"></a>psb_richardson &#8212; Richardson Iteration Driver Routine</h4>
<h4 class="subsectionHead"><span class="titlemark">11.2 </span> <a
id="x17-14500011.2"></a>psb_richardson &#8212; Richardson Iteration Driver Routine</h4>
<!--l. 158--><p class="noindent" >This subroutine is a driver implementing a Richardson iteration
<div class="math-display" >
<img
@ -464,37 +468,37 @@ src="userhtml33x.png" alt="x_{k+1} = M^{-1}(b - A x_k) + x_k,
" class="math-display" ></div>
<!--l. 159--><p class="nopar" > with the preconditioner operator <span
class="cmmi-10">M </span>defined in the previous section.
class="zplmr7m-">M </span>defined in the previous section.
<!--l. 162--><p class="indent" > The stopping criterion can take the following values:
<dl class="description"><dt class="description">
<!--l. 164--><p class="noindent" >
<span
class="cmbx-10">1</span> </dt><dd
class="pplb7t-">1</span> </dt><dd
class="description">
<!--l. 164--><p class="noindent" >normwise backward error in the infinity norm; the iteration is stopped
when
<div class="math-display" >
<img
src="userhtml34x.png" alt=" -----&#x2225;ri&#x2225;------
err = (&#x2225;A&#x2225;&#x2225;xi&#x2225;+ &#x2225;b&#x2225;) &#x003C; eps
src="userhtml34x.png" alt="err = \frac{\|r_i\|}{(\|A\|\|x_i\| + \|b\|)} &#x003C; eps
" class="math-display" ></div>
<!--l. 166--><p class="nopar" >
</dd><dt class="description">
<!--l. 167--><p class="noindent" >
<span
class="cmbx-10">2</span> </dt><dd
class="pplb7t-">2</span> </dt><dd
class="description">
<!--l. 167--><p class="noindent" >Relative residual in the 2-norm; the iteration is stopped when
<div class="math-display" >
<img
src="userhtml35x.png" alt=" &#x2225;ri&#x2225;-
err = &#x2225;b&#x2225;2 &#x003C; eps
err = &#x2225;b&#x2225; 2 &#x003C; eps
" class="math-display" ></div>
<!--l. 169--><p class="nopar" >
</dd><dt class="description">
<!--l. 170--><p class="noindent" >
<span
class="cmbx-10">3</span> </dt><dd
class="pplb7t-">3</span> </dt><dd
class="description">
<!--l. 170--><p class="noindent" >Relative residual reduction in the 2-norm; the iteration is stopped when
<div class="math-display" >
@ -507,20 +511,21 @@ err = &#x2225;r0&#x2225;2 &#x003C; eps
<!--l. 174--><p class="noindent" >The behaviour is controlled by the istop argument (see later). In the above formulae, <span
class="cmmi-10">x</span><sub><span
class="cmmi-7">i</span></sub>
class="zplmr7m-">x</span><sub><span
class="zplmr7m-x-x-76">i</span></sub>
is the tentative solution and <span
class="cmmi-10">r</span><sub><span
class="cmmi-7">i</span></sub> = <span
class="cmmi-10">b </span><span
class="cmsy-10">- </span><span
class="cmmi-10">Ax</span><sub><span
class="cmmi-7">i</span></sub> the corresponding residual at the <span
class="cmmi-10">i</span>-th
class="zplmr7m-">r</span><sub><span
class="zplmr7m-x-x-76">i</span></sub> <span
class="zplmr7t-">= </span><span
class="zplmr7m-">b</span><span
class="zplmr7y-">-</span><span
class="zplmr7m-">Ax</span><sub><span
class="zplmr7m-x-x-76">i</span></sub> the corresponding residual at the <span
class="zplmr7m-">i</span>-th
iteration.
<!--l. 179-->
<pre class="lstlisting" id="listing-168"><span class="label"><a
id="x17-144001r1"></a></span><span style="color:#000000"><span
<pre class="lstlisting" id="listing-219"><span class="label"><a
id="x17-145001r1"></a></span><span style="color:#000000"><span
class="cmtt-10">call</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
class="cmtt-10">psb_richardson</span></span><span style="color:#000000"><span
class="cmtt-10">(</span></span><span style="color:#000000"><span
@ -539,7 +544,7 @@ class="cmtt-10">,</span></span><span style="color:#000000"><span
class="cmtt-10">info</span></span><span style="color:#000000"><span
class="cmtt-10">,&amp;</span></span>
<span class="label"><a
id="x17-144002r2"></a></span><span
id="x17-145002r2"></a></span><span
class="cmtt-10">&#x00A0;</span><span
class="cmtt-10">&#x00A0;</span><span
class="cmtt-10">&#x00A0;</span><span
@ -560,28 +565,28 @@ class="cmtt-10">)</span></span></pre>
<dl class="description"><dt class="description">
<!--l. 185--><p class="noindent" >
<span
class="cmbx-10">Type:</span> </dt><dd
class="pplb7t-">Type:</span> </dt><dd
class="description">
<!--l. 185--><p class="noindent" >Synchronous.
</dd><dt class="description">
<!--l. 186--><p class="noindent" >
<span
class="cmbx-10">On Entry</span> </dt><dd
class="pplb7t-">On Entry</span> </dt><dd
class="description">
<!--l. 186--><p class="noindent" >
</dd><dt class="description">
<!--l. 187--><p class="noindent" >
<span
class="cmbx-10">a</span> </dt><dd
class="pplb7t-">a</span> </dt><dd
class="description">
<!--l. 187--><p class="noindent" >the local portion of global sparse matrix <span
class="cmmi-10">A</span>. <br
class="zplmr7m-">A</span>. <br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: a structured data of type <a
href="userhtmlse3.html#spdata"><span
class="cmtt-10">psb</span><span
@ -590,32 +595,32 @@ class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 193--><p class="noindent" >
<span
class="cmbx-10">prec</span> </dt><dd
class="pplb7t-">prec</span> </dt><dd
class="description">
<!--l. 193--><p class="noindent" >The data structure containing the preconditioner.<br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: a structured data of type <a
href="userhtmlse3.html#precdata"><span
class="cmtt-10">psb</span><span
class="cmtt-10">_prec</span><span
class="cmtt-10">_Tprec</span><span
class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 198--><p class="noindent" >
<span
class="cmbx-10">b</span> </dt><dd
class="pplb7t-">b</span> </dt><dd
class="description">
<!--l. 198--><p class="noindent" >The RHS vector. <br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: a rank one array or an object of type <a
href="userhtmlse3.html#vdata"><span
class="cmtt-10">psb</span><span
@ -625,18 +630,18 @@ class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 203--><p class="noindent" >
<span
class="cmbx-10">x</span> </dt><dd
class="pplb7t-">x</span> </dt><dd
class="description">
<!--l. 203--><p class="noindent" >The initial guess. <br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">inout</span>.<br
class="pplb7t-">inout</span>.<br
class="newline" />Specified as: a rank one array or an object of type <a
href="userhtmlse3.html#vdata"><span
class="cmtt-10">psb</span><span
@ -646,29 +651,29 @@ class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 208--><p class="noindent" >
<span
class="cmbx-10">eps</span> </dt><dd
class="pplb7t-">eps</span> </dt><dd
class="description">
<!--l. 208--><p class="noindent" >The stopping tolerance. <br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: a real number.
</dd><dt class="description">
<!--l. 213--><p class="noindent" >
<span
class="cmbx-10">desc</span><span
class="cmbx-10">_a</span> </dt><dd
class="pplb7t-">desc</span><span
class="pplb7t-">_a</span> </dt><dd
class="description">
<!--l. 213--><p class="noindent" >contains data structures for communications.<br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: a structured data of type <a
href="userhtmlse3.html#descdata"><span
class="cmtt-10">psb</span><span
@ -677,74 +682,77 @@ class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 218--><p class="noindent" >
<span
class="cmbx-10">itmax</span> </dt><dd
class="pplb7t-">itmax</span> </dt><dd
class="description">
<!--l. 218--><p class="noindent" >The maximum number of iterations to perform.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="pplb7t-">optional</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Default: <span
class="cmmi-10">itmax </span>= 1000.<br
class="zplmr7m-">itmax </span><span
class="zplmr7t-">= </span>1000.<br
class="newline" />Specified as: an integer variable <span
class="cmmi-10">itmax </span><span
class="cmsy-10">&#x2265; </span>1.
class="zplmr7m-">itmax </span><span
class="zplmr7y-">&#x2265; </span>1.
</dd><dt class="description">
<!--l. 224--><p class="noindent" >
<span
class="cmbx-10">itrace</span> </dt><dd
class="pplb7t-">itrace</span> </dt><dd
class="description">
<!--l. 224--><p class="noindent" >If <span
class="cmmi-10">&#x003E; </span>0 print out an informational message about convergence every <span
class="cmmi-10">itrace</span>
iterations. If = 0 print a message in case of convergence failure.<br
class="zplmr7m-">&#x003E; </span>0 print out an informational message about convergence every <span
class="zplmr7m-">itrace</span>
iterations. If <span
class="zplmr7t-">= </span>0 print a message in case of convergence failure.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="pplb7t-">optional</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Default: <span
class="cmmi-10">itrace </span>= <span
class="cmsy-10">-</span>1.<br
class="zplmr7m-">itrace </span><span
class="zplmr7t-">= </span><span
class="zplmr7y-">-</span>1.<br
class="newline" />
</dd><dt class="description">
<!--l. 232--><p class="noindent" >
<span
class="cmbx-10">istop</span> </dt><dd
class="pplb7t-">istop</span> </dt><dd
class="description">
<!--l. 232--><p class="noindent" >An integer specifying the stopping criterion.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span>.<br
class="pplb7t-">optional</span>.<br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Values: 1: use the normwise backward error, 2: use the scaled 2-norm of
the residual, 3: use the residual reduction in the 2-norm. Default: 2.
</dd><dt class="description">
<!--l. 238--><p class="noindent" >
<span
class="cmbx-10">On Return</span> </dt><dd
class="pplb7t-">On Return</span> </dt><dd
class="description">
<!--l. 238--><p class="noindent" >
</dd><dt class="description">
<!--l. 239--><p class="noindent" >
<span
class="cmbx-10">x</span> </dt><dd
class="pplb7t-">x</span> </dt><dd
class="description">
<!--l. 239--><p class="noindent" >The computed solution. <br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">inout</span>.<br
class="pplb7t-">inout</span>.<br
class="newline" />Specified as: a rank one array or an object of type <a
href="userhtmlse3.html#vdata"><span
class="cmtt-10">psb</span><span
@ -754,41 +762,41 @@ class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 244--><p class="noindent" >
<span
class="cmbx-10">iter</span> </dt><dd
class="pplb7t-">iter</span> </dt><dd
class="description">
<!--l. 244--><p class="noindent" >The number of iterations performed.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="pplb7t-">optional</span><br
class="newline" />Intent: <span
class="cmbx-10">out</span>.<br
class="pplb7t-">out</span>.<br
class="newline" />Returned as: an integer variable.
</dd><dt class="description">
<!--l. 249--><p class="noindent" >
<span
class="cmbx-10">err</span> </dt><dd
class="pplb7t-">err</span> </dt><dd
class="description">
<!--l. 249--><p class="noindent" >The convergence estimate on exit.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="pplb7t-">optional</span><br
class="newline" />Intent: <span
class="cmbx-10">out</span>.<br
class="pplb7t-">out</span>.<br
class="newline" />Returned as: a real number.
</dd><dt class="description">
<!--l. 254--><p class="noindent" >
<span
class="cmbx-10">info</span> </dt><dd
class="pplb7t-">info</span> </dt><dd
class="description">
<!--l. 254--><p class="noindent" >Error code.<br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required </span><br
class="pplb7t-">required </span><br
class="newline" />Intent: <span
class="cmbx-10">out</span>.<br
class="pplb7t-">out</span>.<br
class="newline" />An integer value; 0 means no error has been detected.</dd></dl>

@ -15,17 +15,17 @@ href="userhtmlse11.html" >prev</a>] [<a
href="userhtmlse11.html#tailuserhtmlse11.html" >prev-tail</a>] [<a
href="userhtmlse9.html#tailuserhtmlse12.html">tail</a>] [<a
href="userhtml.html# " >up</a>] </p></div>
<h3 class="sectionHead"><span class="titlemark">12 </span> <a
id="x19-14500012"></a>Extensions</h3>
<h3 class="sectionHead"><span class="titlemark">12 </span> <a
id="x19-14600012"></a>Extensions</h3>
<!--l. 3--><p class="noindent" >The EXT, CUDA and RSB subdirectories contain a set of extensions to the base
library. The extensions provide additional storage formats beyond the ones already
contained in the base library, as well as interfaces to:
<dl class="description"><dt class="description">
<!--l. 8--><p class="noindent" >
<span
class="cmbx-10">SPGPU</span> </dt><dd
class="pplb7t-">SPGPU</span> </dt><dd
class="description">
<!--l. 8--><p class="noindent" >a CUDA library originally
<!--l. 8--><p class="noindent" >a CUDA library originally
published as <a
href="https://code.google.com/p/spgpu/" class="url" ><span
class="cmtt-10">https://code.google.com/p/spgpu/</span></a> and now included
@ -34,22 +34,22 @@ class="cmtt-10">cuda</span></span></span> subdir, for computations on NVIDIA GPU
</dd><dt class="description">
<!--l. 11--><p class="noindent" >
<span
class="cmbx-10">LIBRSB</span> </dt><dd
class="pplb7t-">LIBRSB</span> </dt><dd
class="description">
<!--l. 11--><p class="noindent" ><a
href="http://sourceforge.net/projects/librsb/" class="url" ><span
class="cmtt-10">http://sourceforge.net/projects/librsb/</span></a>, for computations on
class="cmtt-10">http://sourceforge.net/projects/librsb/</span></a>, for computations on
multicore parallel machines.</dd></dl>
<!--l. 14--><p class="noindent" >The infrastructure laid out in the base library to allow for these extensions is detailed in
the references&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#XDesPat:11">20</a>,&#x00A0;<a
href="userhtmlli2.html#XCaFiRo:2014">21</a>,&#x00A0;<a
href="userhtmlli2.html#XSparse03">10</a>]</span>; the CUDA-specific data formats are described
href="userhtmlli2.html#XDesPat:11">21</a>,&#x00A0;<a
href="userhtmlli2.html#XCaFiRo:2014">22</a>,&#x00A0;<a
href="userhtmlli2.html#XSparse03">11</a>]</span>; the CUDA-specific data formats are described
in&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#XOurTechRep">22</a>]</span>.
href="userhtmlli2.html#XOurTechRep">23</a>]</span>.
<!--l. 19--><p class="noindent" >
<h4 class="subsectionHead"><span class="titlemark">12.1 </span> <a
id="x19-14600012.1"></a>Using the extensions</h4>
<h4 class="subsectionHead"><span class="titlemark">12.1 </span> <a
id="x19-14700012.1"></a>Using the extensions</h4>
<!--l. 21--><p class="noindent" >A sample application using the PSBLAS extensions will contain the following
steps:
<ul class="itemize1">
@ -60,16 +60,16 @@ class="cmtt-10">psb_ext_mod</span></span></span>, <span class="obeylines-h"><spa
class="cmtt-10">psb_cuda_mod</span></span></span>);
</li>
<li class="itemize">
<!--l. 26--><p class="noindent" >Declare a <span
class="cmti-10">mold </span>variable of the necessary type (e.g.
<!--l. 26--><p class="noindent" >Declare a <span
class="pplri7t-">mold </span>variable of the necessary type (e.g.
<span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_d_ell_sparse_mat</span></span></span>, <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_d_hlg_sparse_mat</span></span></span>, <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_d_vect_cuda</span></span></span>);
</li>
<li class="itemize">
<!--l. 29--><p class="noindent" >Pass the mold variable to the base library interface where needed to ensure
the appropriate dynamic type.</li></ul>
<!--l. 29--><p class="noindent" >Pass the mold variable to the base library interface where needed to
ensure the appropriate dynamic type.</li></ul>
@ -141,126 +141,128 @@ class="cmtt-10">test/cuda/kernel </span>subdirectories, where we provide sample
speed of the sparse matrix-vector product with the various data structures included
in the library.
<!--l. 146--><p class="noindent" >
<h4 class="subsectionHead"><span class="titlemark">12.2 </span> <a
id="x19-14700012.2"></a>Extensions&#8217; Data Structures</h4>
<h4 class="subsectionHead"><span class="titlemark">12.2 </span> <a
id="x19-14800012.2"></a>Extensions&#8217; Data Structures</h4>
<!--l. 150--><p class="noindent" >Access to the facilities provided by the EXT library is mainly achieved through
the data types that are provided within. The data classes are derived from
the base classes in PSBLAS, through the Fortran&#x00A0;2003 mechanism of <span
class="cmti-10">type</span>
class="pplri7t-">type</span>
<span
class="cmti-10">extension</span>&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#XMRC:11">17</a>]</span>.
class="pplri7t-">extension</span>&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#XMRC:11">18</a>]</span>.
<!--l. 155--><p class="indent" > The data classes are divided between the general purpose CPU extensions, the
GPU interfaces and the RSB interfaces. In the description we will make use of the
notation introduced in Table&#x00A0;<a
href="#x19-147001r21">21<!--tex4ht:ref: tab:notation --></a>.
href="#x19-148001r22">22<!--tex4ht:ref: tab:notation --></a>.
<div class="table">
<!--l. 160--><p class="indent" > <a
id="x19-147001r21"></a><hr class="float"><div class="float"
id="x19-148001r22"></a><hr class="float"><div class="float"
>
<div class="caption"
><span class="id">Table&#x00A0;21: </span><span
class="content">Notation for parameters describing a sparse matrix</span></div><!--tex4ht:label?: x19-147001r21 -->
><span class="id">Table&#x00A0;22: </span><span
class="content">Notation for parameters describing a sparse matrix</span></div><!--tex4ht:label?: x19-148001r22 -->
<div class="center"
>
<!--l. 162--><p class="noindent" >
<div class="tabular"> <table id="TBL-23" class="tabular"
<div class="tabular"> <table id="TBL-25" class="tabular"
><colgroup id="TBL-23-1g"><col
id="TBL-23-1"><col
id="TBL-23-2"></colgroup><tr
><colgroup id="TBL-25-1g"><col
id="TBL-25-1"><col
id="TBL-25-2"></colgroup><tr
class="hline"><td><hr></td><td><hr></td></tr><tr
style="vertical-align:baseline;" id="TBL-23-1-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-1-1"
style="vertical-align:baseline;" id="TBL-25-1-"><td style="white-space:nowrap; text-align:left;" id="TBL-25-1-1"
class="td11"><span
class="cmr-8">Name </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-1-2"
class="pplr7t-x-x-80">Name </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-25-1-2"
class="td11"><span
class="cmr-8">Description </span></td>
class="pplr7t-x-x-80">Description </span></td>
</tr><tr
class="hline"><td><hr></td><td><hr></td></tr><tr
style="vertical-align:baseline;" id="TBL-23-2-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-2-1"
style="vertical-align:baseline;" id="TBL-25-2-"><td style="white-space:nowrap; text-align:left;" id="TBL-25-2-1"
class="td11"><span
class="cmr-8">M </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-2-2"
class="pplr7t-x-x-80">M </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-25-2-2"
class="td11"><span
class="cmr-8">Number of rows in matrix </span></td></tr><tr
style="vertical-align:baseline;" id="TBL-23-3-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-3-1"
class="pplr7t-x-x-80">Number of rows in matrix </span></td></tr><tr
style="vertical-align:baseline;" id="TBL-25-3-"><td style="white-space:nowrap; text-align:left;" id="TBL-25-3-1"
class="td11"><span
class="pplr7t-x-x-80">N </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-25-3-2"
class="td11"><span
class="cmr-8">N </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-3-2"
class="pplr7t-x-x-80">Number of columns in matrix </span></td></tr><tr
style="vertical-align:baseline;" id="TBL-25-4-"><td style="white-space:nowrap; text-align:left;" id="TBL-25-4-1"
class="td11"><span
class="pplr7t-x-x-80">NZ </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-25-4-2"
class="td11"><span
class="cmr-8">Number of columns in matrix</span></td>
</tr><tr
style="vertical-align:baseline;" id="TBL-23-4-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-4-1"
class="td11"><span
class="cmr-8">NZ </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-4-2"
class="td11"><span
class="cmr-8">Number of nonzeros in matrix </span></td></tr><tr
style="vertical-align:baseline;" id="TBL-23-5-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-5-1"
class="pplr7t-x-x-80">Number of nonzeros in matrix </span></td></tr><tr
style="vertical-align:baseline;" id="TBL-25-5-"><td style="white-space:nowrap; text-align:left;" id="TBL-25-5-1"
class="td11"><span
class="pplr7t-x-x-80">AVGNZR </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-25-5-2"
class="td11"><span
class="cmr-8">AVGNZR </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-5-2"
class="td11"><span
class="cmr-8">Average number of nonzeros per row</span></td>
class="pplr7t-x-x-80">Average number of nonzeros per row </span></td>
</tr><tr
style="vertical-align:baseline;" id="TBL-23-6-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-6-1"
style="vertical-align:baseline;" id="TBL-25-6-"><td style="white-space:nowrap; text-align:left;" id="TBL-25-6-1"
class="td11"><span
class="cmr-8">MAXNZR</span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-6-2"
class="pplr7t-x-x-80">MAXNZR</span></td><td style="white-space:nowrap; text-align:left;" id="TBL-25-6-2"
class="td11"><span
class="cmr-8">Maximum number of nonzeros per row</span></td>
class="pplr7t-x-x-80">Maximum number of nonzeros per row</span></td>
</tr><tr
style="vertical-align:baseline;" id="TBL-23-7-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-7-1"
style="vertical-align:baseline;" id="TBL-25-7-"><td style="white-space:nowrap; text-align:left;" id="TBL-25-7-1"
class="td11"><span
class="cmr-8">NDIAG </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-7-2"
class="pplr7t-x-x-80">NDIAG </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-25-7-2"
class="td11"><span
class="cmr-8">Number of nonzero diagonals </span></td>
class="pplr7t-x-x-80">Number of nonzero diagonals </span></td>
</tr><tr
style="vertical-align:baseline;" id="TBL-23-8-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-8-1"
style="vertical-align:baseline;" id="TBL-25-8-"><td style="white-space:nowrap; text-align:left;" id="TBL-25-8-1"
class="td11"><span
class="cmr-8">AS </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-8-2"
class="pplr7t-x-x-80">AS </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-25-8-2"
class="td11"><span
class="cmr-8">Coefficients array </span></td>
class="pplr7t-x-x-80">Coefficients array </span></td>
</tr><tr
style="vertical-align:baseline;" id="TBL-23-9-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-9-1"
style="vertical-align:baseline;" id="TBL-25-9-"><td style="white-space:nowrap; text-align:left;" id="TBL-25-9-1"
class="td11"><span
class="cmr-8">IA </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-9-2"
class="pplr7t-x-x-80">IA </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-25-9-2"
class="td11"><span
class="cmr-8">Row indices array </span></td>
class="pplr7t-x-x-80">Row indices array </span></td>
</tr><tr
style="vertical-align:baseline;" id="TBL-23-10-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-10-1"
style="vertical-align:baseline;" id="TBL-25-10-"><td style="white-space:nowrap; text-align:left;" id="TBL-25-10-1"
class="td11"><span
class="cmr-8">JA </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-10-2"
class="pplr7t-x-x-80">JA </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-25-10-2"
class="td11"><span
class="cmr-8">Column indices array </span></td>
class="pplr7t-x-x-80">Column indices array </span></td>
</tr><tr
style="vertical-align:baseline;" id="TBL-23-11-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-11-1"
style="vertical-align:baseline;" id="TBL-25-11-"><td style="white-space:nowrap; text-align:left;" id="TBL-25-11-1"
class="td11"><span
class="cmr-8">IRP </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-11-2"
class="pplr7t-x-x-80">IRP </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-25-11-2"
class="td11"><span
class="cmr-8">Row start pointers array </span></td>
class="pplr7t-x-x-80">Row start pointers array </span></td>
</tr><tr
style="vertical-align:baseline;" id="TBL-23-12-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-12-1"
style="vertical-align:baseline;" id="TBL-25-12-"><td style="white-space:nowrap; text-align:left;" id="TBL-25-12-1"
class="td11"><span
class="cmr-8">JCP </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-12-2"
class="pplr7t-x-x-80">JCP </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-25-12-2"
class="td11"><span
class="cmr-8">Column start pointers array </span></td>
class="pplr7t-x-x-80">Column start pointers array </span></td>
</tr><tr
style="vertical-align:baseline;" id="TBL-23-13-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-13-1"
style="vertical-align:baseline;" id="TBL-25-13-"><td style="white-space:nowrap; text-align:left;" id="TBL-25-13-1"
class="td11"><span
class="cmr-8">NZR </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-13-2"
class="pplr7t-x-x-80">NZR </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-25-13-2"
class="td11"><span
class="cmr-8">Number of nonzeros per row array </span></td>
class="pplr7t-x-x-80">Number of nonzeros per row array </span></td>
</tr><tr
style="vertical-align:baseline;" id="TBL-23-14-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-14-1"
style="vertical-align:baseline;" id="TBL-25-14-"><td style="white-space:nowrap; text-align:left;" id="TBL-25-14-1"
class="td11"><span
class="cmr-8">OFFSET </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-14-2"
class="pplr7t-x-x-80">OFFSET </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-25-14-2"
class="td11"><span
class="cmr-8">Offset for diagonals </span></td>
class="pplr7t-x-x-80">Offset for diagonals </span></td>
</tr><tr
class="hline"><td><hr></td><td><hr></td></tr><tr
style="vertical-align:baseline;" id="TBL-23-15-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-15-1"
style="vertical-align:baseline;" id="TBL-25-15-"><td style="white-space:nowrap; text-align:left;" id="TBL-25-15-1"
class="td11"> </td></tr></table> </div>
</div>
@ -274,7 +276,7 @@ class="td11"> </td></tr></table>
<a
id="x19-147002r5"></a>
id="x19-148002r5"></a>
@ -283,18 +285,18 @@ src="mat.png" alt="PIC"
width="147" height="147" >
<br /> <div class="caption"
><span class="id">Figure&#x00A0;5: </span><span
class="content">Example of sparse matrix</span></div><!--tex4ht:label?: x19-147002r5 -->
class="content">Example of sparse matrix</span></div><!--tex4ht:label?: x19-148002r5 -->
<!--l. 198--><p class="indent" > </div><hr class="endfigure">
<h4 class="subsectionHead"><span class="titlemark">12.3 </span> <a
id="x19-14800012.3"></a>CPU-class extensions</h4>
<h4 class="subsectionHead"><span class="titlemark">12.3 </span> <a
id="x19-14900012.3"></a>CPU-class extensions</h4>
<!--l. 203--><p class="noindent" >
<h5 class="likesubsubsectionHead"><a
id="x19-149000"></a>ELLPACK</h5>
id="x19-150000"></a>ELLPACK</h5>
<!--l. 205--><p class="noindent" >The ELLPACK/ITPACK format (shown in Figure&#x00A0;<a
href="#x19-149001r6">6<!--tex4ht:ref: fig:ell --></a>) comprises two 2-dimensional
href="#x19-150001r6">6<!--tex4ht:ref: fig:ell --></a>) comprises two 2-dimensional
arrays <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">JA</span></span></span> with <span class="obeylines-h"><span class="verb"><span
@ -302,7 +304,7 @@ class="cmtt-10">M</span></span></span> rows and <span class="obeylines-h"><span
class="cmtt-10">MAXNZR</span></span></span> columns, where <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">MAXNZR</span></span></span> is the maximum
number of nonzeros in any row&#x00A0;<span class="cite">[<span
class="cmbx-10">?</span>]</span>. Each row of the arrays <span class="obeylines-h"><span class="verb"><span
class="pplb7t-">?</span>]</span>. Each row of the arrays <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">JA</span></span></span> contains the
coefficients and column indices; rows shorter than <span class="obeylines-h"><span class="verb"><span
@ -315,7 +317,7 @@ row.
<a
id="x19-149001r6"></a>
id="x19-150001r6"></a>
@ -325,13 +327,13 @@ width="233" height="233" >
<br /> <div class="caption"
><span class="id">Figure&#x00A0;6: </span><span
class="content">ELLPACK compression of matrix in Figure&#x00A0;<a
href="#x19-147002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-149001r6 -->
href="#x19-148002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-150001r6 -->
<!--l. 225--><p class="indent" > </div><hr class="endfigure">
<a
id="x19-149002r1"></a>
id="x19-150002r1"></a>
@ -341,9 +343,8 @@ href="#x19-147002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:l
<!--l. 231-->
<pre class="lstlisting" id="listing-169"><span class="label"><a
id="x19-149003r1"></a></span><span
class="cmtt-9">&#x00A0;</span><span
<pre class="lstlisting" id="listing-220"><span class="label"><a
id="x19-150003r1"></a></span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span style="color:#000000"><span
@ -352,8 +353,7 @@ class="cmtt-9">i</span></span><span style="color:#000000"><span
class="cmtt-9">=1,</span></span><span style="color:#000000"><span
class="cmtt-9">n</span></span>
<span class="label"><a
id="x19-149004r2"></a></span><span
class="cmtt-9">&#x00A0;</span><span
id="x19-150004r2"></a></span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
@ -362,8 +362,7 @@ class="cmtt-9">&#x00A0;</span><span style="color:#000000"><span
class="cmtt-9">t</span></span><span style="color:#000000"><span
class="cmtt-9">=0</span></span>
<span class="label"><a
id="x19-149005r3"></a></span><span
class="cmtt-9">&#x00A0;</span><span
id="x19-150005r3"></a></span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
@ -374,8 +373,7 @@ class="cmtt-9">j</span></span><span style="color:#000000"><span
class="cmtt-9">=1,</span></span><span style="color:#000000"><span
class="cmtt-9">maxnzr</span></span>
<span class="label"><a
id="x19-149006r4"></a></span><span
class="cmtt-9">&#x00A0;</span><span
id="x19-150006r4"></a></span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
@ -403,8 +401,7 @@ class="cmtt-9">,</span></span><span style="color:#000000"><span
class="cmtt-9">j</span></span><span style="color:#000000"><span
class="cmtt-9">))</span></span>
<span class="label"><a
id="x19-149007r5"></a></span><span
class="cmtt-9">&#x00A0;</span><span
id="x19-150007r5"></a></span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
@ -413,8 +410,7 @@ class="cmtt-9">&#x00A0;</span><span style="color:#000000"><span
class="cmtt-9">end</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
class="cmtt-9">do</span></span>
<span class="label"><a
id="x19-149008r6"></a></span><span
class="cmtt-9">&#x00A0;</span><span
id="x19-150008r6"></a></span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
@ -427,8 +423,7 @@ class="cmtt-9">)</span></span><span style="color:#000000"> </span><span style="c
class="cmtt-9">=</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
class="cmtt-9">t</span></span>
<span class="label"><a
id="x19-149009r7"></a></span><span
class="cmtt-9">&#x00A0;</span><span
id="x19-150009r7"></a></span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span style="color:#000000"><span
@ -436,35 +431,36 @@ class="cmtt-9">end</span></span><span style="color:#000000"> </span><span style=
class="cmtt-9">do</span></span></pre>
<a
id="x19-149010r1"></a>
id="x19-150010r1"></a>
<a
id="x19-149011"></a>
<span
class="cmbx-10">Algorithm</span><span
class="cmbx-10">&#x00A0;1:</span>&#x00A0; Matrix-Vector product in ELL format
id="x19-150011"></a>
<span
class="pplb7t-">Algorithm</span><span
class="pplb7t-">&#x00A0;1:</span>&#x00A0; Matrix-Vector product in ELL format
</div><hr class="endfloat" />
<!--l. 242--><p class="indent" > The matrix-vector product <span
class="cmmi-10">y </span>= <span
class="cmmi-10">Ax </span>can be computed with the code shown in
class="zplmr7m-">y </span><span
class="zplmr7t-">= </span><span
class="zplmr7m-">Ax </span>can be computed with the code shown in
Alg.&#x00A0;<a
href="#x19-149010r1">1<!--tex4ht:ref: alg:ell --></a>; it costs one memory write per outer iteration, plus three memory reads and
href="#x19-150010r1">1<!--tex4ht:ref: alg:ell --></a>; it costs one memory write per outer iteration, plus three memory reads and
two floating-point operations per inner iteration.
<!--l. 247--><p class="indent" > Unless all rows have exactly the same number of nonzeros, some of the coefficients
in the <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">AS</span></span></span> array will be zeros; therefore this data structure will have an overhead both
in terms of memory space and redundant operations (multiplications by zero). The
overhead can be acceptable if:
<!--l. 247--><p class="indent" > Unless all rows have exactly the same number of nonzeros, some of the
coefficients in the <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">AS</span></span></span> array will be zeros; therefore this data structure will have an
overhead both in terms of memory space and redundant operations (multiplications
by zero). The overhead can be acceptable if:
<ol class="enumerate1" >
<li
class="enumerate" id="x19-149013x1">
class="enumerate" id="x19-150013x1">
<!--l. 253--><p class="noindent" >The maximum number of nonzeros per row is not much larger than the
average;
</li>
<li
class="enumerate" id="x19-149015x2">
class="enumerate" id="x19-150015x2">
<!--l. 255--><p class="noindent" >The regularity of the data structure allows for faster code, e.g. by allowing
vectorization, thereby offsetting the additional storage requirements.</li></ol>
<!--l. 259--><p class="noindent" >In the extreme case where the input matrix has one full row, the ELLPACK
@ -492,62 +488,72 @@ class="cmtt-10">psb_T_ell_sparse_mat</span></span></span>:
</pre>
<!--l. 295--><p class="nopar" > </div></div>
<h5 class="likesubsubsectionHead"><a
id="x19-150000"></a>Hacked ELLPACK</h5>
<!--l. 303--><p class="noindent" >The <span
class="cmti-10">hacked ELLPACK </span>(<span
class="cmbx-10">HLL</span>) format alleviates the main problem of the ELLPACK
format, that is, the amount of memory required by padding for sparse matrices in
which the maximum row length is larger than the average.
id="x19-151000"></a>Hacked ELLPACK</h5>
<!--l. 303--><p class="noindent" >The <span
class="pplri7t-">hacked ELLPACK </span>(<span
class="pplb7t-">HLL</span>) format alleviates the main problem of the ELLPACK
format, that is, the amount of memory required by padding for sparse matrices in
which the maximum row length is larger than the average.
<!--l. 308--><p class="indent" > The number of elements allocated to padding is
[(<span
class="cmmi-10">m</span><span
class="cmsy-10">*</span><span
class="cmmi-10">maxNR</span>) <span
class="cmsy-10">- </span>(<span
class="cmmi-10">m</span><span
class="cmsy-10">*</span><span
class="cmmi-10">avgNR</span>) = <span
class="cmmi-10">m</span><span
class="cmsy-10">* </span>(<span
class="cmmi-10">maxNR</span><span
class="cmsy-10">-</span><span
class="cmmi-10">avgNR</span>)] for both <span class="obeylines-h"><span class="verb"><span
<span
class="zplmr7t-">[(</span><span
class="zplmr7m-">m</span><span
class="zplmr7y-">*</span><span
class="zplmr7m-">maxNR</span><span
class="zplmr7t-">) </span><span
class="zplmr7y-">- </span><span
class="zplmr7t-">(</span><span
class="zplmr7m-">m</span><span
class="zplmr7y-">*</span><span
class="zplmr7m-">avgNR</span><span
class="zplmr7t-">) = </span><span
class="zplmr7m-">m</span><span
class="zplmr7y-">* </span><span
class="zplmr7t-">(</span><span
class="zplmr7m-">maxNR</span><span
class="zplmr7y-">-</span><span
class="zplmr7m-">avgNR</span><span
class="zplmr7t-">)] </span>for both <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">JA</span></span></span> arrays,
where <span
class="cmmi-10">m </span>is equal to the number of rows of the matrix, <span
class="cmmi-10">maxNR </span>is the maximum
class="zplmr7m-">m </span>is equal to the number of rows of the matrix, <span
class="zplmr7m-">maxNR </span>is the maximum
number of nonzero elements in any row and <span
class="cmmi-10">avgNR </span>is the average number of
class="zplmr7m-">avgNR </span>is the average number of
nonzeros. Therefore a single densely populated row can seriously affect the total size
of the allocation.
<!--l. 317--><p class="indent" > To limit this effect, in the HLL format we break the original matrix into equally
sized groups of rows (called <span
class="cmti-10">hacks</span>), and then store these groups as independent
class="pplri7t-">hacks</span>), and then store these groups as independent
matrices in ELLPACK format. The groups can be arranged selecting rows in an
arbitrary manner; indeed, if the rows are sorted by decreasing number of nonzeros
we obtain essentially the JAgged Diagonals format. If the rows are not in the original
order, then an additional vector <span
class="cmti-10">rIdx </span>is required, storing the actual row index for
each row in the data structure.
class="pplri7t-">rIdx </span>is required, storing the actual row index for each
row in the data structure.
<!--l. 327--><p class="indent" > The multiple ELLPACK-like buffers are stacked together inside a single, one
dimensional array; an additional vector <span
class="cmti-10">hackOffsets </span>is provided to keep track of the
class="pplri7t-">hackOffsets </span>is provided to keep track of the
individual submatrices. All hacks have the same number of rows <span
class="cmti-10">hackSize</span>; hence, the
class="pplri7t-">hackSize</span>; hence, the
<span
class="cmti-10">hackOffsets </span>vector is an array of (<span
class="cmmi-10">m&#x2215;hackSize</span>) + 1 elements, each one pointing to
the first index of a submatrix inside the stacked <span
class="cmti-10">cM</span>/<span
class="cmti-10">rP </span>buffers, plus an additional
class="pplri7t-">hackOffsets </span>vector is an array of <span
class="zplmr7t-">(</span><span
class="zplmr7m-">m</span><span
class="zplmr7t-">/</span><span
class="zplmr7m-">hackSize</span><span
class="zplmr7t-">) + </span>1 elements, each one pointing to the
first index of a submatrix inside the stacked <span
class="pplri7t-">cM</span>/<span
class="pplri7t-">rP </span>buffers, plus an additional
element pointing past the end of the last block, where the next one would begin. We
thus have the property that the elements of the <span
class="cmmi-10">k</span>-th <span
class="cmti-10">hack </span>are stored between
class="zplmr7m-">k</span>-th <span
class="pplri7t-">hack </span>are stored between
<span class="obeylines-h"><span class="verb"><span
class="cmtt-10">hackOffsets[k]</span></span></span> and <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">hackOffsets[k+1]</span></span></span>, similarly to what happens in the CSR
@ -558,7 +564,7 @@ format.
<a
id="x19-150001r7"></a>
id="x19-151001r7"></a>
@ -568,7 +574,7 @@ width="248" height="248" >
<br /> <div class="caption"
><span class="id">Figure&#x00A0;7: </span><span
class="content">Hacked ELLPACK compression of matrix in Figure&#x00A0;<a
href="#x19-147002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-150001r7 -->
href="#x19-148002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-151001r7 -->
@ -595,9 +601,9 @@ class="cmtt-10">psb_T_hll_sparse_mat</span></span></span>:
</pre>
<!--l. 388--><p class="nopar" > </div></div>
<h5 class="likesubsubsectionHead"><a
id="x19-151000"></a>Diagonal storage</h5>
id="x19-152000"></a>Diagonal storage</h5>
<!--l. 396--><p class="noindent" >The DIAgonal (DIA) format (shown in Figure&#x00A0;<a
href="#x19-151001r8">8<!--tex4ht:ref: fig:dia --></a>) has a 2-dimensional array <span class="obeylines-h"><span class="verb"><span
href="#x19-152001r8">8<!--tex4ht:ref: fig:dia --></a>) has a 2-dimensional array <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">AS</span></span></span>
containing in each column the coefficients along a diagonal of the matrix, and an
integer array <span class="obeylines-h"><span class="verb"><span
@ -605,9 +611,10 @@ class="cmtt-10">OFFSET</span></span></span> that determines where each diagonal
class="cmtt-10">AS</span></span></span>
are padded with zeros as necessary.
<!--l. 402--><p class="indent" > The code to compute the matrix-vector product <span
class="cmmi-10">y </span>= <span
class="cmmi-10">Ax </span>is shown in Alg.&#x00A0;<a
href="#x19-151003r2">2<!--tex4ht:ref: alg:dia --></a>; it
class="zplmr7m-">y </span><span
class="zplmr7t-">= </span><span
class="zplmr7m-">Ax </span>is shown in Alg.&#x00A0;<a
href="#x19-152003r2">2<!--tex4ht:ref: alg:dia --></a>; it
costs one memory read per outer iteration, plus three memory reads, one memory
write and two floating-point operations per inner iteration. The accesses to
<span class="obeylines-h"><span class="verb"><span
@ -620,7 +627,7 @@ required.
<a
id="x19-151001r8"></a>
id="x19-152001r8"></a>
@ -630,13 +637,13 @@ width="248" height="248" >
<br /> <div class="caption"
><span class="id">Figure&#x00A0;8: </span><span
class="content">DIA compression of matrix in Figure&#x00A0;<a
href="#x19-147002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-151001r8 -->
href="#x19-148002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-152001r8 -->
<!--l. 419--><p class="indent" > </div><hr class="endfigure">
<a
id="x19-151002r2"></a>
id="x19-152002r2"></a>
@ -662,12 +669,12 @@ href="#x19-147002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:l
</pre>
<!--l. 450--><p class="nopar" > </div></div>
<a
id="x19-151003r2"></a>
id="x19-152003r2"></a>
<a
id="x19-151004"></a>
<span
class="cmbx-10">Algorithm</span><span
class="cmbx-10">&#x00A0;2:</span>&#x00A0; Matrix-Vector product in DIA format
id="x19-152004"></a>
<span
class="pplb7t-">Algorithm</span><span
class="pplb7t-">&#x00A0;2:</span>&#x00A0; Matrix-Vector product in DIA format
@ -691,7 +698,7 @@ class="cmtt-10">psb_T_dia_sparse_mat</span></span></span>:
</pre>
<!--l. 486--><p class="nopar" > </div></div>
<h5 class="likesubsubsectionHead"><a
id="x19-152000"></a>Hacked DIA</h5>
id="x19-153000"></a>Hacked DIA</h5>
<!--l. 495--><p class="noindent" >Storage by DIAgonals is an attractive option for matrices whose coefficients are
located on a small set of diagonals, since they do away with storing explicitly the
indices and therefore reduce significantly memory traffic. However, having a few
@ -705,40 +712,44 @@ class="cmtt-10">y</span></span></span> is too large to remain
in the cache memory, the associated cache miss penalty is paid multiple
times.
<!--l. 507--><p class="indent" > The <span
class="cmti-10">hacked DIA </span>(<span
class="cmbx-10">HDIA</span>) format was designed to contain the amount of padding,
class="pplri7t-">hacked DIA </span>(<span
class="pplb7t-">HDIA</span>) format was designed to contain the amount of padding,
by breaking the original matrix into equally sized groups of rows (<span
class="cmti-10">hacks</span>), and then
storing these groups as independent matrices in DIA format. This approach is similar
to that of HLL, and requires using an offset vector for each submatrix. Again,
class="pplri7t-">hacks</span>), and then
storing these groups as independent matrices in DIA format. This approach is
similar to that of HLL, and requires using an offset vector for each submatrix. Again,
similarly to HLL, the various submatrices are stacked inside a linear array to
improve memory management. The fact that the matrix is accessed in slices
helps in reducing cache misses, especially regarding accesses to the vector
<span class="obeylines-h"><span class="verb"><span
class="cmtt-10">y</span></span></span>.
<!--l. 519--><p class="indent" > An additional vector <span
class="cmti-10">hackOffsets </span>is provided to complete the matrix format; given
class="pplri7t-">hackOffsets </span>is provided to complete the matrix format; given
that <span
class="cmti-10">hackSize </span>is the number of rows of each hack, the <span
class="cmti-10">hackOffsets </span>vector is made by
an array of (<span
class="cmmi-10">m&#x2215;hackSize</span>) + 1 elements, pointing to the first diagonal offset of a
class="pplri7t-">hackSize </span>is the number of rows of each hack, the <span
class="pplri7t-">hackOffsets </span>vector is made by an
array of <span
class="zplmr7t-">(</span><span
class="zplmr7m-">m</span><span
class="zplmr7t-">/</span><span
class="zplmr7m-">hackSize</span><span
class="zplmr7t-">) + </span>1 elements, pointing to the first diagonal offset of a
submatrix inside the stacked <span
class="cmti-10">offsets </span>buffers, plus an additional element equal to the
number of nonzero diagonals in the whole matrix. We thus have the property that
the number of diagonals of the <span
class="cmmi-10">k</span>-th <span
class="cmti-10">hack </span>is given by <span
class="cmti-10">hackOffsets[k+1] -</span>
class="pplri7t-">offsets </span>buffers, plus an additional element equal to the
number of nonzero diagonals in the whole matrix. We thus have the property
that the number of diagonals of the <span
class="zplmr7m-">k</span>-th <span
class="pplri7t-">hack </span>is given by <span
class="pplri7t-">hackOffsets[k+1] -</span>
<span
class="cmti-10">hackOffsets[k]</span>.
class="pplri7t-">hackOffsets[k]</span>.
<!--l. 529--><p class="indent" > <hr class="figure"><div class="figure"
>
<a
id="x19-152001r9"></a>
id="x19-153001r9"></a>
@ -748,7 +759,7 @@ width="248" height="248" >
<br /> <div class="caption"
><span class="id">Figure&#x00A0;9: </span><span
class="content">Hacked DIA compression of matrix in Figure&#x00A0;<a
href="#x19-147002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-152001r9 -->
href="#x19-148002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-153001r9 -->
@ -792,8 +803,8 @@ class="cmtt-10">psb_T_hdia_sparse_mat</span></span></span>:
<h4 class="subsectionHead"><span class="titlemark">12.4 </span> <a
id="x19-15300012.4"></a>CUDA-class extensions</h4>
<h4 class="subsectionHead"><span class="titlemark">12.4 </span> <a
id="x19-15400012.4"></a>CUDA-class extensions</h4>
<!--l. 4--><p class="noindent" >For computing with CUDA we define a dual storage strategy in which each
variable on the CPU (&#8220;host&#8221;) side has a GPU (&#8220;device&#8221;) side. When a GPU-type
variable is initialized, the data contained is (usually) the same on both sides. Each
@ -801,10 +812,10 @@ operator invoked on the variable may change the data so that only the host side
the device side are up-to-date.
<!--l. 11--><p class="indent" > Keeping track of the updates to data in the variables is essential: we want to
perform most computations on the GPU, but we cannot afford the time needed to
move data between the host memory and the device memory because the bandwidth
of the interconnection bus would become the main bottleneck of the computation.
Thus, each and every computational routine in the library is built according to the
following principles:
move data between the host memory and the device memory because the
bandwidth of the interconnection bus would become the main bottleneck of the
computation. Thus, each and every computational routine in the library is built
according to the following principles:
<ul class="itemize1">
<li class="itemize">
<!--l. 18--><p class="noindent" >If the data type being handled is GPU-enabled, make sure that its device
@ -818,20 +829,20 @@ following principles:
<dl class="description"><dt class="description">
<!--l. 25--><p class="noindent" >
<span
class="cmbx-10">explicitly</span> </dt><dd
class="pplb7t-">explicitly</span> </dt><dd
class="description">
<!--l. 25--><p class="noindent" >by invoking a synchronization method;
</dd><dt class="description">
<!--l. 26--><p class="noindent" >
<span
class="cmbx-10">implicitly</span> </dt><dd
class="pplb7t-">implicitly</span> </dt><dd
class="description">
<!--l. 26--><p class="noindent" >by invoking a method that involves other data items that are not
GPU-enabled, e.g., by assignment of a vector to a normal array.</dd></dl>
</li></ul>
<!--l. 31--><p class="noindent" >In this way, data items are put on the GPU memory &#8220;on demand&#8221; and remain there as
long as &#8220;normal&#8221; computations are carried out. As an example, the following call to a
matrix-vector product
long as &#8220;normal&#8221; computations are carried out. As an example, the following call to
a matrix-vector product
<div class="center"
>
<!--l. 39--><p class="noindent" >
@ -850,11 +861,11 @@ then
<!--l. 52--><p class="noindent" >The first kernel invocation will find the data in main memory, and will
copy it to the GPU memory, thus incurring a significant overhead; the
result is however <span
class="cmti-10">not </span>copied back, and therefore:
class="pplri7t-">not </span>copied back, and therefore:
</li>
<li class="itemize">
<!--l. 56--><p class="noindent" >Subsequent kernel invocations involving the same vector will find the data
on the GPU side so that they will run at full speed.</li></ul>
<!--l. 56--><p class="noindent" >Subsequent kernel invocations involving the same vector will find the
data on the GPU side so that they will run at full speed.</li></ul>
<!--l. 60--><p class="noindent" >For all invocations after the first the only data that will have to be transferred to/from
the main memory will be the scalars <code class="lstinline"><span style="color:#000000">alpha</span></code> and <code class="lstinline"><span style="color:#000000">beta</span></code>, and the return code
<code class="lstinline"><span style="color:#000000">info</span></code>.
@ -862,7 +873,7 @@ the main memory will be the scalars <code class="lstinline"><span style="color:#
<dl class="description"><dt class="description">
<!--l. 65--><p class="noindent" >
<span
class="cmbx-10">Vectors:</span> </dt><dd
class="pplb7t-">Vectors:</span> </dt><dd
class="description">
<!--l. 65--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_vect_gpu</span></code> provides a GPU-enabled extension of
the inner type <code class="lstinline"><span style="color:#000000">psb_T_base_vect_type</span></code>, and must be used together with
@ -871,23 +882,23 @@ class="description">
</dd><dt class="description">
<!--l. 69--><p class="noindent" >
<span
class="cmbx-10">CSR:</span> </dt><dd
class="pplb7t-">CSR:</span> </dt><dd
class="description">
<!--l. 69--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_csrg_sparse_mat</span></code> provides an interface to the GPU
version of CSR available in the NVIDIA CuSPARSE library;
</dd><dt class="description">
<!--l. 72--><p class="noindent" >
<span
class="cmbx-10">HYB:</span> </dt><dd
class="pplb7t-">HYB:</span> </dt><dd
class="description">
<!--l. 72--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_hybg_sparse_mat</span></code> provides an interface to the HYB
GPU storage available in the NVIDIA CuSPARSE library. The internal
GPU storage available in the NVIDIA CuSPARSE library. The internal
structure is opaque, hence the host side is just CSR; the HYB data format
is only available up to CUDA version 10.
</dd><dt class="description">
<!--l. 77--><p class="noindent" >
<span
class="cmbx-10">ELL:</span> </dt><dd
class="pplb7t-">ELL:</span> </dt><dd
class="description">
<!--l. 77--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_elg_sparse_mat</span></code> provides an interface to the
ELLPACK implementation from SPGPU;
@ -897,14 +908,14 @@ class="description">
</dd><dt class="description">
<!--l. 80--><p class="noindent" >
<span
class="cmbx-10">HLL:</span> </dt><dd
class="pplb7t-">HLL:</span> </dt><dd
class="description">
<!--l. 80--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_hlg_sparse_mat</span></code> provides an interface to the Hacked
ELLPACK implementation from SPGPU;
<!--l. 80--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_hlg_sparse_mat</span></code> provides an interface to the
Hacked ELLPACK implementation from SPGPU;
</dd><dt class="description">
<!--l. 82--><p class="noindent" >
<span
class="cmbx-10">HDIA:</span> </dt><dd
class="pplb7t-">HDIA:</span> </dt><dd
class="description">
<!--l. 82--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_hdiag_sparse_mat</span></code> provides an interface to the
Hacked DIAgonals implementation from SPGPU;</dd></dl>

@ -15,13 +15,13 @@ href="userhtmlse12.html" >prev</a>] [<a
href="userhtmlse12.html#tailuserhtmlse12.html" >prev-tail</a>] [<a
href="userhtmlse10.html#tailuserhtmlse13.html">tail</a>] [<a
href="userhtml.html# " >up</a>] </p></div>
<h3 class="sectionHead"><span class="titlemark">13 </span> <a
id="x20-15400013"></a>CUDA Environment Routines</h3>
<h3 class="sectionHead"><span class="titlemark">13 </span> <a
id="x20-15500013"></a>CUDA Environment Routines</h3>
<!--l. 91--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-155000"></a>psb_cuda_init &#8212; Initializes PSBLAS-CUDA environment</h4>
id="x20-156000"></a>psb_cuda_init &#8212; Initializes PSBLAS-CUDA environment</h4>
<a
id="Q1-20-192"></a>
id="Q1-20-194"></a>
<div class="center"
>
<!--l. 99--><p class="noindent" >
@ -33,44 +33,44 @@ call&#x00A0;psb_cuda_init(ctxt&#x00A0;[,&#x00A0;device])
<dl class="description"><dt class="description">
<!--l. 110--><p class="noindent" >
<span
class="cmbx-10">Type:</span> </dt><dd
class="pplb7t-">Type:</span> </dt><dd
class="description">
<!--l. 110--><p class="noindent" >Synchronous.
</dd><dt class="description">
<!--l. 111--><p class="noindent" >
<span
class="cmbx-10">On Entry</span> </dt><dd
class="pplb7t-">On Entry</span> </dt><dd
class="description">
<!--l. 111--><p class="noindent" >
</dd><dt class="description">
<!--l. 112--><p class="noindent" >
<span
class="cmbx-10">device</span> </dt><dd
class="pplb7t-">device</span> </dt><dd
class="description">
<!--l. 112--><p class="noindent" >ID of CUDA device to attach to.<br
class="newline" />Scope: <span
class="cmbx-10">local</span>.<br
class="pplb7t-">local</span>.<br
class="newline" />Type: <span
class="cmbx-10">optional</span>.<br
class="pplb7t-">optional</span>.<br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: an integer value. &#x00A0;Default: use <code class="lstinline"><span style="color:#000000">mod</span><span style="color:#000000">(</span><span style="color:#000000">iam</span><span style="color:#000000">,</span><span style="color:#000000">ngpu</span><span style="color:#000000">)</span></code> where <code class="lstinline"><span style="color:#000000">iam</span></code> is
the calling process index and <code class="lstinline"><span style="color:#000000">ngpu</span></code> is the total number of CUDA devices
available on the current node.</dd></dl>
<!--l. 123--><p class="noindent" ><span
class="cmbx-12">Notes</span>
class="pplb7t-x-x-120">Notes</span>
<ol class="enumerate1" >
<li
class="enumerate" id="x20-155002x1">
class="enumerate" id="x20-156002x1">
<!--l. 125--><p class="noindent" >A call to this routine must precede any other PSBLAS-CUDA call.</li></ol>
<!--l. 129--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-156000"></a>psb_cuda_exit &#8212; Exit from PSBLAS-CUDA environment</h4>
id="x20-157000"></a>psb_cuda_exit &#8212; Exit from PSBLAS-CUDA environment</h4>
<a
id="Q1-20-194"></a>
id="Q1-20-196"></a>
<div class="center"
>
<!--l. 137--><p class="noindent" >
@ -82,33 +82,33 @@ call&#x00A0;psb_cuda_exit(ctxt)
<dl class="description"><dt class="description">
<!--l. 148--><p class="noindent" >
<span
class="cmbx-10">Type:</span> </dt><dd
class="pplb7t-">Type:</span> </dt><dd
class="description">
<!--l. 148--><p class="noindent" >Synchronous.
</dd><dt class="description">
<!--l. 149--><p class="noindent" >
<span
class="cmbx-10">On Entry</span> </dt><dd
class="pplb7t-">On Entry</span> </dt><dd
class="description">
<!--l. 149--><p class="noindent" >
</dd><dt class="description">
<!--l. 150--><p class="noindent" >
<span
class="cmbx-10">ctxt</span> </dt><dd
class="pplb7t-">ctxt</span> </dt><dd
class="description">
<!--l. 150--><p class="noindent" >the communication context identifying the virtual parallel machine.<br
class="newline" />Scope: <span
class="cmbx-10">global</span>.<br
class="pplb7t-">global</span>.<br
class="newline" />Type: <span
class="cmbx-10">required</span>.<br
class="pplb7t-">required</span>.<br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: an integer variable.</dd></dl>
<!--l. 161--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-157000"></a>psb_cuda_DeviceSync &#8212; Synchronize CUDA device</h4>
id="x20-158000"></a>psb_cuda_DeviceSync &#8212; Synchronize CUDA device</h4>
<a
id="Q1-20-196"></a>
id="Q1-20-198"></a>
@ -123,9 +123,9 @@ call&#x00A0;psb_cuda_DeviceSync()
CUDA-side code, have completed.
<!--l. 182--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-158000"></a>psb_cuda_getDeviceCount </h4>
id="x20-159000"></a>psb_cuda_getDeviceCount </h4>
<a
id="Q1-20-198"></a>
id="Q1-20-200"></a>
<div class="center"
>
<!--l. 190--><p class="noindent" >
@ -136,9 +136,9 @@ ngpus&#x00A0;=&#x00A0;&#x00A0;psb_cuda_getDeviceCount()
<!--l. 199--><p class="noindent" >Get number of devices available on current computing node.
<!--l. 201--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-159000"></a>psb_cuda_getDevice </h4>
id="x20-160000"></a>psb_cuda_getDevice </h4>
<a
id="Q1-20-200"></a>
id="Q1-20-202"></a>
<div class="center"
>
<!--l. 209--><p class="noindent" >
@ -147,14 +147,14 @@ ngpus&#x00A0;=&#x00A0;&#x00A0;psb_cuda_getDevice()
</pre>
<!--l. 213--><p class="nopar" > </div></div>
<!--l. 218--><p class="noindent" >Get device in use by current process.
<!--l. 220--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-160000"></a>psb_cuda_setDevice </h4>
<a
id="Q1-20-202"></a>
<!--l. 220--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-161000"></a>psb_cuda_setDevice </h4>
<a
id="Q1-20-204"></a>
<div class="center"
>
<!--l. 228--><p class="noindent" >
@ -165,9 +165,9 @@ info&#x00A0;=&#x00A0;psb_cuda_setDevice(dev)
<!--l. 237--><p class="noindent" >Set device to be used by current process.
<!--l. 239--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-161000"></a>psb_cuda_DeviceHasUVA </h4>
id="x20-162000"></a>psb_cuda_DeviceHasUVA </h4>
<a
id="Q1-20-204"></a>
id="Q1-20-206"></a>
<div class="center"
>
<!--l. 247--><p class="noindent" >
@ -178,9 +178,9 @@ hasUva&#x00A0;=&#x00A0;psb_cuda_DeviceHasUVA()
<!--l. 256--><p class="noindent" >Returns true if device currently in use supports UVA (Unified Virtual Addressing).
<!--l. 259--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-162000"></a>psb_cuda_WarpSize </h4>
id="x20-163000"></a>psb_cuda_WarpSize </h4>
<a
id="Q1-20-206"></a>
id="Q1-20-208"></a>
<div class="center"
>
<!--l. 267--><p class="noindent" >
@ -189,14 +189,14 @@ nw&#x00A0;=&#x00A0;psb_cuda_WarpSize()
</pre>
<!--l. 271--><p class="nopar" > </div></div>
<!--l. 276--><p class="noindent" >Returns the warp size.
<!--l. 279--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-163000"></a>psb_cuda_MultiProcessors </h4>
<a
id="Q1-20-208"></a>
<!--l. 279--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-164000"></a>psb_cuda_MultiProcessors </h4>
<a
id="Q1-20-210"></a>
<div class="center"
>
<!--l. 287--><p class="noindent" >
@ -207,9 +207,9 @@ nmp&#x00A0;=&#x00A0;psb_cuda_MultiProcessors()
<!--l. 296--><p class="noindent" >Returns the number of multiprocessors in the CUDA device.
<!--l. 298--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-164000"></a>psb_cuda_MaxThreadsPerMP </h4>
id="x20-165000"></a>psb_cuda_MaxThreadsPerMP </h4>
<a
id="Q1-20-210"></a>
id="Q1-20-212"></a>
<div class="center"
>
<!--l. 306--><p class="noindent" >
@ -220,9 +220,9 @@ nt&#x00A0;=&#x00A0;psb_cuda_MaxThreadsPerMP()
<!--l. 315--><p class="noindent" >Returns the maximum number of threads per multiprocessor.
<!--l. 318--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-165000"></a>psb_cuda_MaxRegistersPerBlock </h4>
id="x20-166000"></a>psb_cuda_MaxRegistersPerBlock </h4>
<a
id="Q1-20-212"></a>
id="Q1-20-214"></a>
<div class="center"
>
<!--l. 326--><p class="noindent" >
@ -231,14 +231,14 @@ nr&#x00A0;=&#x00A0;psb_cuda_MaxRegistersPerBlock()
</pre>
<!--l. 330--><p class="nopar" > </div></div>
<!--l. 335--><p class="noindent" >Returns the maximum number of registers per thread block.
<!--l. 338--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-166000"></a>psb_cuda_MemoryClockRate </h4>
<a
id="Q1-20-214"></a>
<!--l. 338--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-167000"></a>psb_cuda_MemoryClockRate </h4>
<a
id="Q1-20-216"></a>
<div class="center"
>
<!--l. 346--><p class="noindent" >
@ -249,9 +249,9 @@ cl&#x00A0;=&#x00A0;psb_cuda_MemoryClockRate()
<!--l. 355--><p class="noindent" >Returns the memory clock rate in KHz, as an integer.
<!--l. 357--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-167000"></a>psb_cuda_MemoryBusWidth </h4>
id="x20-168000"></a>psb_cuda_MemoryBusWidth </h4>
<a
id="Q1-20-216"></a>
id="Q1-20-218"></a>
<div class="center"
>
<!--l. 365--><p class="noindent" >
@ -262,9 +262,9 @@ nb&#x00A0;=&#x00A0;psb_cuda_MemoryBusWidth()
<!--l. 374--><p class="noindent" >Returns the memory bus width in bits.
<!--l. 376--><p class="noindent" >
<h4 class="likesubsectionHead"><a
id="x20-168000"></a>psb_cuda_MemoryPeakBandwidth </h4>
id="x20-169000"></a>psb_cuda_MemoryPeakBandwidth </h4>
<a
id="Q1-20-218"></a>
id="Q1-20-220"></a>
<div class="center"
>
<!--l. 384--><p class="noindent" >
@ -282,7 +282,7 @@ bw&#x00A0;=&#x00A0;psb_cuda_MemoryPeakBandwidth()
<!--l. 126--><p class="indent" >
<!--l. 134--><p class="indent" >

@ -16,26 +16,26 @@ href="userhtmlse1.html" >prev</a>] [<a
href="userhtmlse1.html#tailuserhtmlse1.html" >prev-tail</a>] [<a
href="#tailuserhtmlse2.html">tail</a>] [<a
href="userhtml.html#userhtmlse2.html" >up</a>] </p></div>
<h3 class="sectionHead"><span class="titlemark">2 </span> <a
<h3 class="sectionHead"><span class="titlemark">2 </span> <a
id="x4-30002"></a>General overview</h3>
<!--l. 74--><p class="noindent" >The PSBLAS library is designed to handle the implementation of iterative solvers for
sparse linear systems on distributed memory parallel computers. The system
coefficient matrix <span
class="cmmi-10">A </span>must be square; it may be real or complex, nonsymmetric, and
class="zplmr7m-">A </span>must be square; it may be real or complex, nonsymmetric, and
its sparsity pattern need not be symmetric. The serial computation parts are
based on the serial sparse BLAS, so that any extension made to the data structures
of the serial kernels is available to the parallel version. The overall design and
based on the serial sparse BLAS, so that any extension made to the data structures of
the serial kernels is available to the parallel version. The overall design and
parallelization strategy have been influenced by the structure of the ScaLAPACK
parallel library. The layered structure of the PSBLAS library is shown in figure&#x00A0;<a
href="#x4-3001r1">1<!--tex4ht:ref: fig:psblas --></a>;
lower layers of the library indicate an encapsulation relationship with upper
layers. The ongoing discussion focuses on the Fortran&#x00A0;2003 layer immediately
lower layers of the library indicate an encapsulation relationship with upper layers.
The ongoing discussion focuses on the Fortran&#x00A0;2003 layer immediately
below the application layer. The serial parts of the computation on each
process are executed through calls to the serial sparse BLAS subroutines. In a
similar way, the inter-process message exchanges are encapsulated in an
application layer that has been strongly inspired by the Basic Linear Algebra
Communication Subroutines (BLACS) library&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#XBLACS">6</a>]</span>. Usually there is no need to deal
href="userhtmlli2.html#XBLACS">7</a>]</span>. Usually there is no need to deal
directly with MPI; however, in some cases, MPI routines are used directly
to improve efficiency. For further details on our communication layer see
Sec.&#x00A0;<a
@ -76,7 +76,7 @@ mesh.
process that will own the corresponding row in the coefficient matrix and will
carry out all related computations. This allocation strategy is equivalent to a
partition of the discretization mesh into <span
class="cmti-10">sub-domains</span>. Our library supports any
class="pplri7t-">sub-domains</span>. Our library supports any
distribution that keeps together the coefficients of each matrix row; there are no
other constraints on the variable assignment. This choice is consistent with
simple data distributions such as <span class="obeylines-h"><span class="verb"><span
@ -85,7 +85,7 @@ class="cmtt-10">BLOCK</span></span></span>, as well as completely
arbitrary assignments of equation indices to processes. In particular it is
consistent with the usage of graph partitioning tools commonly available in
the literature, e.g. METIS&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#XMETIS">13</a>]</span>. Dense vectors conform to sparse matrices,
href="userhtmlli2.html#XMETIS">14</a>]</span>. Dense vectors conform to sparse matrices,
that is, the entries of a vector follow the same distribution of the matrix
rows.
<!--l. 146--><p class="indent" > We assume that the sparse matrix is built in parallel, where each process generates
@ -94,35 +94,35 @@ node. However, it is possible to hold the entire matrix in one process and distr
explicitly<span class="footnote-mark"><a
href="userhtml5.html#fn1x0"><sup class="textsuperscript">1</sup></a></span><a
id="x4-3002f1"></a> ,
even though the resulting memory bottleneck would make this option unattractive in
most cases.
<h4 class="subsectionHead"><span class="titlemark">2.1 </span> <a
even though the resulting memory bottleneck would make this option unattractive
in most cases.
<h4 class="subsectionHead"><span class="titlemark">2.1 </span> <a
id="x4-40002.1"></a>Basic Nomenclature</h4>
<!--l. 158--><p class="noindent" >Our computational model implies that the data allocation on the parallel distributed
memory machine is guided by the structure of the physical model, and specifically by
the discretization mesh of the PDE.
memory machine is guided by the structure of the physical model, and specifically
by the discretization mesh of the PDE.
<!--l. 163--><p class="indent" > Each point of the discretization mesh will have (at least) one associated
equation/variable, and therefore one index. We say that point <span
class="cmmi-10">i </span><span
class="cmti-10">depends </span>on point <span
class="cmmi-10">j </span>if
class="zplmr7m-">i </span><span
class="pplri7t-">depends </span>on point <span
class="zplmr7m-">j </span>if
the equation for a variable associated with <span
class="cmmi-10">i </span>contains a term in <span
class="cmmi-10">j</span>, or equivalently if
class="zplmr7m-">i </span>contains a term in <span
class="zplmr7m-">j</span>, or equivalently if
<span
class="cmmi-10">a</span><sub><span
class="cmmi-7">ij</span></sub><span
class="cmmi-10">&#x2260;</span>0. After the partition of the discretization mesh into <span
class="cmti-10">sub-domains </span>assigned
class="zplmr7m-">a</span><sub><span
class="zplmr7m-x-x-76">ij</span></sub><span
class="zplmr7m-">&#x2260;</span>0. After the partition of the discretization mesh into <span
class="pplri7t-">sub-domains </span>assigned
to the parallel processes, we classify the points of a given sub-domain as
follows.
<dl class="description"><dt class="description">
<!--l. 172--><p class="noindent" >
<span
class="cmbx-10">Internal.</span> </dt><dd
class="pplb7t-">Internal.</span> </dt><dd
class="description">
<!--l. 172--><p class="noindent" >An internal point of a given domain <span
class="cmti-10">depends </span>only on points of the same
class="pplri7t-">depends </span>only on points of the same
domain. If all points of a domain are assigned to one process, then
a computational step (e.g., a matrix-vector product) of the equations
@ -133,22 +133,22 @@ class="cmti-10">depends </span>only on points of the same
</dd><dt class="description">
<!--l. 181--><p class="noindent" >
<span
class="cmbx-10">Boundary.</span> </dt><dd
class="pplb7t-">Boundary.</span> </dt><dd
class="description">
<!--l. 181--><p class="noindent" >A point of a given domain is a boundary point if it <span
class="cmti-10">depends </span>on points
<!--l. 181--><p class="noindent" >A point of a given domain is a boundary point if it <span
class="pplri7t-">depends </span>on points
belonging to other domains.
</dd><dt class="description">
<!--l. 185--><p class="noindent" >
<span
class="cmbx-10">Halo.</span> </dt><dd
class="pplb7t-">Halo.</span> </dt><dd
class="description">
<!--l. 185--><p class="noindent" >A halo point for a given domain is a point belonging to another domain
such that there is a boundary point which <span
class="cmti-10">depends </span>on it. Whenever performing
class="pplri7t-">depends </span>on it. Whenever performing
a computational step, such as a matrix-vector product, the values associated
with halo points are requested from other domains. A boundary point of a
given domain is usually a halo point for some other domain<span class="footnote-mark"><a
with halo points are requested from other domains. A boundary point of
a given domain is usually a halo point for some other domain<span class="footnote-mark"><a
href="userhtml6.html#fn2x0"><sup class="textsuperscript">2</sup></a></span><a
id="x4-4001f2"></a> ;
therefore the cardinality of the boundary points set denotes the amount
@ -156,50 +156,53 @@ href="userhtml6.html#fn2x0"><sup class="textsuperscript">2</sup></a></span><a
</dd><dt class="description">
<!--l. 198--><p class="noindent" >
<span
class="cmbx-10">Overlap.</span> </dt><dd
class="pplb7t-">Overlap.</span> </dt><dd
class="description">
<!--l. 198--><p class="noindent" >An overlap point is a boundary point assigned to multiple domains. Any
operation that involves an overlap point has to be replicated for each
operation that involves an overlap point has to be replicated for each
assignment.</dd></dl>
<!--l. 202--><p class="noindent" >Overlap points do not usually exist in the basic data distributions; however they are a
feature of Domain Decomposition Schwarz preconditioners which are the subject of
related research work&#x00A0;<span class="cite">[<a
href="userhtmlli2.html#X2007c">3</a>,&#x00A0;<a
href="userhtmlli2.html#X2007d">2</a>]</span>.
href="userhtmlli2.html#X2007c">4</a>,&#x00A0;<a
href="userhtmlli2.html#X2007d">3</a>]</span>.
<!--l. 207--><p class="indent" > We denote the sets of internal, boundary and halo points for a given subdomain
by <span
class="cmsy-10"><img
src="cmsy10-49.png" alt="I" class="10x-x-49" /></span>, <span
class="cmsy-10"><img
src="cmsy10-42.png" alt="B" class="10x-x-42" /> </span>and <span
class="cmsy-10"><img
src="cmsy10-48.png" alt="H" class="10x-x-48" /></span>. Each subdomain is assigned to one process; each process usually owns
one subdomain, although the user may choose to assign more than one subdomain to
a process. If each process <span
class="cmmi-10">i </span>owns one subdomain, the number of rows in
the local sparse matrix is <span
class="cmsy-10">|<img
src="cmsy10-49.png" alt="I" class="10x-x-49" /></span><sub><span
class="cmmi-7">i</span></sub><span
class="cmsy-10">| </span>+ <span
class="cmsy-10">|<img
src="cmsy10-42.png" alt="B" class="10x-x-42" /></span><sub><span
class="cmmi-7">i</span></sub><span
class="cmsy-10">|</span>, and the number of local columns (i.e.
class="zplmr7y-"><img
src="zplmr7y-49.png" alt="I" class="x-x-49" /></span>, <span
class="zplmr7y-"><img
src="zplmr7y-42.png" alt="B" class="x-x-42" /> </span>and <span
class="zplmr7y-"><img
src="zplmr7y-48.png" alt="H" class="x-x-48" /></span>. Each subdomain is assigned to one process; each process usually
owns one subdomain, although the user may choose to assign more than one
subdomain to a process. If each process <span
class="zplmr7m-">i </span>owns one subdomain, the number of rows
in the local sparse matrix is <span
class="zplmr7y-">|<img
src="zplmr7y-49.png" alt="I" class="x-x-49" /></span><sub><span
class="zplmr7m-x-x-76">i</span></sub><span
class="zplmr7y-">| </span><span
class="zplmr7t-">+ </span><span
class="zplmr7y-">|<img
src="zplmr7y-42.png" alt="B" class="x-x-42" /></span><sub><span
class="zplmr7m-x-x-76">i</span></sub><span
class="zplmr7y-">|</span>, and the number of local columns (i.e.
those for which there exists at least one non-zero entry in the local rows) is
<span
class="cmsy-10">|<img
src="cmsy10-49.png" alt="I" class="10x-x-49" /></span><sub><span
class="cmmi-7">i</span></sub><span
class="cmsy-10">| </span>+ <span
class="cmsy-10">|<img
src="cmsy10-42.png" alt="B" class="10x-x-42" /></span><sub><span
class="cmmi-7">i</span></sub><span
class="cmsy-10">| </span>+ <span
class="cmsy-10">|<img
src="cmsy10-48.png" alt="H" class="10x-x-48" /></span><sub><span
class="cmmi-7">i</span></sub><span
class="cmsy-10">|</span>.
class="zplmr7y-">|<img
src="zplmr7y-49.png" alt="I" class="x-x-49" /></span><sub><span
class="zplmr7m-x-x-76">i</span></sub><span
class="zplmr7y-">| </span><span
class="zplmr7t-">+ </span><span
class="zplmr7y-">|<img
src="zplmr7y-42.png" alt="B" class="x-x-42" /></span><sub><span
class="zplmr7m-x-x-76">i</span></sub><span
class="zplmr7y-">| </span><span
class="zplmr7t-">+ </span><span
class="zplmr7y-">|<img
src="zplmr7y-48.png" alt="H" class="x-x-48" /></span><sub><span
class="zplmr7m-x-x-76">i</span></sub><span
class="zplmr7y-">|</span>.
<!--l. 217--><p class="indent" > <hr class="figure"><div class="figure"
>
@ -226,13 +229,13 @@ class="content">Point classification.</span></div><!--tex4ht:label?: x4-4003r2 --
<!--l. 229--><p class="indent" > This classification of mesh points guides the naming scheme that we adopted in
the library internals and in the data structures. We explicitly note that &#8220;Halo&#8221; points
are also often called &#8220;ghost&#8221; points in the literature.
<h4 class="subsectionHead"><span class="titlemark">2.2 </span> <a
<h4 class="subsectionHead"><span class="titlemark">2.2 </span> <a
id="x4-50002.2"></a>Library contents</h4>
<!--l. 238--><p class="noindent" >The PSBLAS library consists of various classes of subroutines:
<dl class="description"><dt class="description">
<!--l. 240--><p class="noindent" >
<span
class="cmbx-10">Computational routines</span> </dt><dd
class="pplb7t-">Computational routines</span> </dt><dd
class="description">
<!--l. 240--><p class="noindent" >comprising:
<ul class="itemize1">
@ -253,13 +256,13 @@ class="description">
</dd><dt class="description">
<!--l. 249--><p class="noindent" >
<span
class="cmbx-10">Communication routines</span> </dt><dd
class="pplb7t-">Communication routines</span> </dt><dd
class="description">
<!--l. 249--><p class="noindent" >handling halo and overlap communications;
</dd><dt class="description">
<!--l. 251--><p class="noindent" >
<span
class="cmbx-10">Data management and auxiliary routines</span> </dt><dd
class="pplb7t-">Data management and auxiliary routines</span> </dt><dd
class="description">
<!--l. 251--><p class="noindent" >including:
<ul class="itemize1">
@ -283,17 +286,17 @@ class="description">
</dd><dt class="description">
<!--l. 259--><p class="noindent" >
<span
class="cmbx-10">Preconditioner routines</span> </dt><dd
class="pplb7t-">Preconditioner routines</span> </dt><dd
class="description">
<!--l. 259--><p class="noindent" >
</dd><dt class="description">
<!--l. 260--><p class="noindent" >
<span
class="cmbx-10">Iterative methods</span> </dt><dd
class="pplb7t-">Iterative methods</span> </dt><dd
class="description">
<!--l. 260--><p class="noindent" >a subset of Krylov subspace iterative methods</dd></dl>
<!--l. 263--><p class="noindent" >The following naming scheme has been adopted for all the symbols internally defined in
the PSBLAS software package:
<!--l. 263--><p class="noindent" >The following naming scheme has been adopted for all the symbols internally defined
in the PSBLAS software package:
<ul class="itemize1">
<li class="itemize">
<!--l. 266--><p class="noindent" >all symbols (i.e. subroutine names, data types...) are prefixed by <span class="obeylines-h"><span class="verb"><span
@ -341,15 +344,15 @@ as:
<dl class="description"><dt class="description">
<!--l. 288--><p class="noindent" >
<span
class="cmbx-10">global</span> </dt><dd
class="pplb7t-">global</span> </dt><dd
class="description">
<!--l. 288--><p class="noindent" >For input arguments, the value must be the same on all processes
participating in the subroutine call; for output arguments the value is
participating in the subroutine call; for output arguments the value is
guaranteed to be the same.
</dd><dt class="description">
<!--l. 291--><p class="noindent" >
<span
class="cmbx-10">local</span> </dt><dd
class="pplb7t-">local</span> </dt><dd
class="description">
<!--l. 291--><p class="noindent" >Each process has its own value(s) independently.</dd></dl>
<!--l. 293--><p class="noindent" >To finish our general description, we define a version string with the constant
@ -360,36 +363,36 @@ src="userhtml0x.png" alt="psb_version_string_
<!--l. 295--><p class="nopar" > whose current value is <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">3.8.0</span></span></span>
<!--l. 298--><p class="noindent" >
<h4 class="subsectionHead"><span class="titlemark">2.3 </span> <a
<h4 class="subsectionHead"><span class="titlemark">2.3 </span> <a
id="x4-60002.3"></a>Application structure</h4>
<!--l. 301--><p class="noindent" >The main underlying principle of the PSBLAS library is that the library objects are
created and exist with reference to a discretized space to which there corresponds
an index space and a matrix sparsity pattern. As an example, consider a
cell-centered finite-volume discretization of the Navier-Stokes equations on a
simulation domain; the index space 1<span
class="cmmi-10">&#x2026;</span><span
class="cmmi-10">n </span>is isomorphic to the set of cell centers,
class="zplmr7m-">&#x2026;</span><span
class="zplmr7m-">n </span>is isomorphic to the set of cell centers,
whereas the pattern of the associated linear system matrix is isomorphic to the
adjacency graph imposed on the discretization mesh by the discretization
stencil.
<!--l. 311--><p class="indent" > Thus the first order of business is to establish an index space, and this is done
with a call to <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_cdall</span></span></span> in which we specify the size of the index space <span
class="cmmi-10">n </span>and the
class="zplmr7m-">n </span>and the
allocation of the elements of the index space to the various processes making up the
MPI (virtual) parallel machine.
<!--l. 317--><p class="indent" > The index space is partitioned among processes, and this creates a mapping from
the &#8220;global&#8221; numbering 1<span
class="cmmi-10">&#x2026;</span><span
class="cmmi-10">n </span>to a numbering &#8220;local&#8221; to each process; each process <span
class="cmmi-10">i</span>
class="zplmr7m-">&#x2026;</span><span
class="zplmr7m-">n </span>to a numbering &#8220;local&#8221; to each process; each process <span
class="zplmr7m-">i</span>
will own a certain subset 1<span
class="cmmi-10">&#x2026;</span><span
class="cmmi-10">n</span><sub>row<sub><span
class="cmmi-5">i</span></sub></sub>, each element of which corresponds to a certain
class="zplmr7m-">&#x2026;</span><span
class="zplmr7m-">n</span><sub>row<sub><span
class="zplmr7m-x-x-60">i</span></sub></sub>, each element of which corresponds to a certain
element of 1<span
class="cmmi-10">&#x2026;</span><span
class="cmmi-10">n</span>. The user does not explicitly set this mapping; when the application
class="zplmr7m-">&#x2026;</span><span
class="zplmr7m-">n</span>. The user does not explicitly set this mapping; when the application
needs to indicate to which element of the index space a certain item is related,
such as the row and column index of a matrix coefficient, it does so in the
&#8220;global&#8221; numbering, and the library will translate into the appropriate &#8220;local&#8221;
@ -398,8 +401,8 @@ numbering.
<!--l. 327--><p class="indent" > For a given index space 1<span
class="cmmi-10">&#x2026;</span><span
class="cmmi-10">n </span>there are many possible associated topologies, i.e.
class="zplmr7m-">&#x2026;</span><span
class="zplmr7m-">n </span>there are many possible associated topologies, i.e.
many different discretization stencils; thus the description of the index space is not
completed until the user has defined a sparsity pattern, either explicitly through
<span class="obeylines-h"><span class="verb"><span
@ -410,19 +413,20 @@ class="cmtt-10">psb_cdasb</span></span></span> and a sparse matrix with a call t
class="cmtt-10">psb_spasb</span></span></span>. After <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_cdasb</span></span></span> each
process <span
class="cmmi-10">i </span>will have defined a set of &#8220;halo&#8221; (or &#8220;ghost&#8221;) indices <span
class="cmmi-10">n</span><sub>row<sub><span
class="cmmi-5">i</span></sub></sub> + 1<span
class="cmmi-10">&#x2026;</span><span
class="cmmi-10">n</span><sub>col<sub>
class="zplmr7m-">i </span>will have defined a set of &#8220;halo&#8221; (or &#8220;ghost&#8221;) indices <span
class="zplmr7m-">n</span><sub>row<sub><span
class="zplmr7m-x-x-60">i</span></sub></sub> <span
class="zplmr7t-">+ </span>1<span
class="zplmr7m-">&#x2026;</span><span
class="zplmr7m-">n</span><sub>col<sub>
<span
class="cmmi-5">i</span></sub></sub>,
class="zplmr7m-x-x-60">i</span></sub></sub>,
denoting elements of the index space that are <span
class="cmti-10">not </span>assigned to process <span
class="cmmi-10">i</span>; however the
class="pplri7t-">not </span>assigned to process <span
class="zplmr7m-">i</span>; however the
variables associated with them are needed to complete computations associated with
the sparse matrix <span
class="cmmi-10">A</span>, and thus they have to be fetched from (neighbouring)
class="zplmr7m-">A</span>, and thus they have to be fetched from (neighbouring)
processes. The descriptor of the index space is built exactly for the purpose
of properly sequencing the communication steps required to achieve this
objective.
@ -432,18 +436,18 @@ matrix/vector creation and linear system solution as follows:
<li
class="enumerate" id="x4-6002x1">
<!--l. 347--><p class="noindent" >Initialize parallel environment with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_init</span></span></span>
class="cmtt-10">psb_init</span></span></span>;
</li>
<li
class="enumerate" id="x4-6004x2">
<!--l. 348--><p class="noindent" >Initialize index space with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_cdall</span></span></span>
class="cmtt-10">psb_cdall</span></span></span>;
</li>
<li
class="enumerate" id="x4-6006x3">
<!--l. 349--><p class="noindent" >Allocate sparse matrix and dense vectors with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_spall</span></span></span> and <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_geall</span></span></span>
class="cmtt-10">psb_geall</span></span></span>;
</li>
<li
class="enumerate" id="x4-6008x4">
@ -459,12 +463,12 @@ class="cmtt-10">psb_geins</span></span></span>
<li
class="enumerate" id="x4-6012x1">
<!--l. 355--><p class="noindent" ><span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_cdasb</span></span></span>
class="cmtt-10">psb_cdasb</span></span></span>,
</li>
<li
class="enumerate" id="x4-6014x2">
<!--l. 356--><p class="noindent" ><span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_spasb</span></span></span>
class="cmtt-10">psb_spasb</span></span></span>,
@ -472,147 +476,157 @@ class="cmtt-10">psb_spasb</span></span></span>
<li
class="enumerate" id="x4-6016x3">
<!--l. 357--><p class="noindent" ><span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_geasb</span></span></span></li></ol>
class="cmtt-10">psb_geasb</span></span></span>;</li></ol>
</li>
<li
class="enumerate" id="x4-6018x6">
<!--l. 359--><p class="noindent" >Choose the preconditioner to be used with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">prec%init</span></span></span> and build it with
class="cmtt-10">prec%init</span></span></span> and <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">prec%set</span></span></span>, and build it with
<span class="obeylines-h"><span class="verb"><span
class="cmtt-10">prec%build</span></span></span><span class="footnote-mark"><a
href="userhtml7.html#fn3x0"><sup class="textsuperscript">3</sup></a></span><a
id="x4-6019f3"></a> .
id="x4-6019f3"></a> ;
</li>
<li
class="enumerate" id="x4-6022x7">
<!--l. 363--><p class="noindent" >Call the iterative driver <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_krylov</span></span></span> with the method of choice, e.g.
<span class="obeylines-h"><span class="verb"><span
<!--l. 364--><p class="noindent" >Call one of the iterative drivers with the method of choice, e.g. <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_krylov</span></span></span>
with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">bicgstab</span></span></span>.</li></ol>
<!--l. 366--><p class="noindent" >This is the structure of the sample programs in the directory <span class="obeylines-h"><span class="verb"><span
<!--l. 367--><p class="noindent" >This is the structure of the sample programs in the directory <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">test/pargen/</span></span></span>.
<!--l. 369--><p class="indent" > For a simulation in which the same discretization mesh is used over multiple time
steps, the following structure may be more appropriate:
<!--l. 370--><p class="indent" > For a simulation in which the same discretization mesh is used over multiple
time steps, the following structure may be more appropriate:
<ol class="enumerate1" >
<li
class="enumerate" id="x4-6024x1">
<!--l. 372--><p class="noindent" >Initialize parallel environment with <span class="obeylines-h"><span class="verb"><span
<!--l. 373--><p class="noindent" >Initialize parallel environment with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_init</span></span></span>
</li>
<li
class="enumerate" id="x4-6026x2">
<!--l. 373--><p class="noindent" >Initialize index space with <span class="obeylines-h"><span class="verb"><span
<!--l. 374--><p class="noindent" >Initialize index space with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_cdall</span></span></span>
</li>
<li
class="enumerate" id="x4-6028x3">
<!--l. 374--><p class="noindent" >Loop over the topology of the discretization mesh and build the descriptor
with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_cdins</span></span></span>
<!--l. 375--><p class="noindent" >Loop over the topology of the discretization mesh and build the
descriptor with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_cdins</span></span></span>;
</li>
<li
class="enumerate" id="x4-6030x4">
<!--l. 376--><p class="noindent" >Assemble the descriptor with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_cdasb</span></span></span>
<!--l. 377--><p class="noindent" >Assemble the descriptor with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_cdasb</span></span></span>;
</li>
<li
class="enumerate" id="x4-6032x5">
<!--l. 377--><p class="noindent" >Allocate the sparse matrices and dense vectors with <span class="obeylines-h"><span class="verb"><span
<!--l. 378--><p class="noindent" >Allocate the sparse matrices and dense vectors with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_spall</span></span></span> and
<span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_geall</span></span></span>
class="cmtt-10">psb_geall</span></span></span>;
</li>
<li
class="enumerate" id="x4-6034x6">
<!--l. 379--><p class="noindent" >Loop over the time steps:
<!--l. 380--><p class="noindent" >Loop over the time steps:
<ol class="enumerate2" >
<li
class="enumerate" id="x4-6036x1">
<!--l. 381--><p class="noindent" >If after first time step, reinitialize the sparse matrix with <span class="obeylines-h"><span class="verb"><span
<!--l. 382--><p class="noindent" >If after first time step, reinitialize the sparse matrix with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_sprn</span></span></span>;
also zero out the dense vectors;
</li>
<li
class="enumerate" id="x4-6038x2">
<!--l. 384--><p class="noindent" >Loop over the mesh, generate the coefficients and insert/update them
with <span class="obeylines-h"><span class="verb"><span
<!--l. 385--><p class="noindent" >Loop over the mesh, generate the coefficients and insert/update
them with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_spins</span></span></span> and <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_geins</span></span></span>
class="cmtt-10">psb_geins</span></span></span>;
</li>
<li
class="enumerate" id="x4-6040x3">
<!--l. 386--><p class="noindent" >Assemble with <span class="obeylines-h"><span class="verb"><span
<!--l. 387--><p class="noindent" >Assemble with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_spasb</span></span></span> and <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_geasb</span></span></span>
class="cmtt-10">psb_geasb</span></span></span>;
</li>
<li
class="enumerate" id="x4-6042x4">
<!--l. 387--><p class="noindent" >Choose and build preconditioner with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">prec%init</span></span></span> and <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">prec%build</span></span></span>
<!--l. 388--><p class="noindent" >
</li>
<li
class="enumerate" id="x4-6044x5">
<!--l. 389--><p class="noindent" >Call the iterative method of choice, e.g. <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_bicgstab</span></span></span></li></ol>
<!--l. 388--><p class="noindent" >Choose the preconditioner to be used with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">prec%init</span></span></span> and
<span class="obeylines-h"><span class="verb"><span
class="cmtt-10">prec%set</span></span></span>, and build it with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">prec%build</span></span></span>;
</li>
<li
class="enumerate" id="x4-6046x6">
<!--l. 391--><p class="noindent" >Call one of the iterative drivers with the method of choice, e.g.
<span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_krylov</span></span></span> with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">bicgstab</span></span></span>.</li></ol>
</li></ol>
<!--l. 392--><p class="noindent" >The insertion routines will be called as many times as needed; they only need to be
<!--l. 395--><p class="noindent" >The insertion routines will be called as many times as needed; they only need to be
called on the data that is actually allocated to the current process, i.e. each process
generates its own data.
<!--l. 397--><p class="indent" > In principle there is no specific order in the calls to <span class="obeylines-h"><span class="verb"><span
<!--l. 400--><p class="indent" > In principle there is no specific order in the calls to <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_spins</span></span></span>, nor is there a
requirement to build a matrix row in its entirety before calling the routine; this
allows the application programmer to walk through the discretization mesh element
by element, generating the main part of a given matrix row but also contributions to
the rows corresponding to neighbouring elements.
<!--l. 404--><p class="indent" > From a functional point of view it is even possible to execute one call for each
<!--l. 407--><p class="indent" > From a functional point of view it is even possible to execute one call for each
nonzero coefficient; however this would have a substantial computational
overhead. It is therefore advisable to pack a certain amount of data into each
call to the insertion routine, say touching on a few tens of rows; the best
performing value would depend on both the architecture of the computer being
used and on the problem structure. At the opposite extreme, it would be
possible to generate the entire part of a coefficient matrix residing on a
performing value would depend on both the architecture of the computer
being used and on the problem structure. At the opposite extreme, it would
be possible to generate the entire part of a coefficient matrix residing on a
process and pass it in a single call to <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_spins</span></span></span>; this, however, would entail a
doubling of memory occupation, and thus would be almost always far from
optimal.
<!--l. 417--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">2.3.1 </span> <a
<!--l. 420--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">2.3.1 </span> <a
id="x4-70002.3.1"></a>User-defined index mappings</h5>
<!--l. 419--><p class="noindent" >PSBLAS supports user-defined global to local index mappings, subject to the
<!--l. 422--><p class="noindent" >PSBLAS supports user-defined global to local index mappings, subject to the
constraints outlined in sec.&#x00A0;<a
href="#x4-60002.3">2.3<!--tex4ht:ref: sec:appstruct --></a>:
<ol class="enumerate1" >
<li
class="enumerate" id="x4-7002x1">
<!--l. 422--><p class="noindent" >The set of indices owned locally must be mapped to the set 1<span
class="cmmi-10">&#x2026;</span><span
class="cmmi-10">n</span><sub>row<sub><span
class="cmmi-5">i</span></sub></sub>;
<!--l. 425--><p class="noindent" >The set of indices owned locally must be mapped to the set 1<span
class="zplmr7m-">&#x2026;</span><span
class="zplmr7m-">n</span><sub>row<sub><span
class="zplmr7m-x-x-60">i</span></sub></sub>;
</li>
<li
class="enumerate" id="x4-7004x2">
<!--l. 424--><p class="noindent" >The set of halo points must be mapped to the set <span
class="cmmi-10">n</span><sub>row<sub><span
class="cmmi-5">i</span></sub></sub> + 1<span
class="cmmi-10">&#x2026;</span><span
class="cmmi-10">n</span><sub>col<sub>
<!--l. 427--><p class="noindent" >The set of halo points must be mapped to the set <span
class="zplmr7m-">n</span><sub>row<sub><span
class="zplmr7m-x-x-60">i</span></sub></sub> <span
class="zplmr7t-">+ </span>1<span
class="zplmr7m-">&#x2026;</span><span
class="zplmr7m-">n</span><sub>col<sub>
<span
class="cmmi-5">i</span></sub></sub>;</li></ol>
<!--l. 427--><p class="noindent" >but otherwise the mapping is arbitrary. The user application is responsible for ensuring
class="zplmr7m-x-x-60">i</span></sub></sub>;</li></ol>
<!--l. 430--><p class="noindent" >but otherwise the mapping is arbitrary. The user application is responsible for ensuring
consistency of this mapping; some errors may be caught by the library, but
this is not guaranteed. The application structure to support this usage is as
follows:
<ol class="enumerate1" >
<li
class="enumerate" id="x4-7006x1">
<!--l. 433--><p class="noindent" >Initialize index
<!--l. 436--><p class="noindent" >Initialize index
space with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_cdall(ictx,desc,info,vl=vl,lidx=lidx)</span></span></span> passing the
vectors <span class="obeylines-h"><span class="verb"><span
@ -622,66 +636,66 @@ class="cmtt-10">lidx(:)</span></span></span> containing the corresponding local
</li>
<li
class="enumerate" id="x4-7008x2">
<!--l. 438--><p class="noindent" >Add the halo points <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">ja(:)</span></span></span> and their associated local indices <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">lidx(:)</span></span></span> with
a(some) call(s) to <span class="obeylines-h"><span class="verb"><span
<!--l. 441--><p class="noindent" >Add the halo points <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">ja(:)</span></span></span> and their associated local indices <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">lidx(:)</span></span></span>
with a(some) call(s) to <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_cdins(nz,ja,desc,info,lidx=lidx)</span></span></span>;
</li>
<li
class="enumerate" id="x4-7010x3">
<!--l. 441--><p class="noindent" >Assemble the descriptor with <span class="obeylines-h"><span class="verb"><span
<!--l. 444--><p class="noindent" >Assemble the descriptor with <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_cdasb</span></span></span>;
</li>
<li
class="enumerate" id="x4-7012x4">
<!--l. 442--><p class="noindent" >Build the sparse matrices and vectors, optionally making use in <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_spins</span></span></span>
and <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_geins</span></span></span> of the <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">local</span></span></span> argument specifying that the indices in <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">ia</span></span></span>,
<span class="obeylines-h"><span class="verb"><span
class="cmtt-10">ja</span></span></span> and <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">irw</span></span></span>, respectively, are already local indices.</li></ol>
<!--l. 449--><p class="noindent" >
<h4 class="subsectionHead"><span class="titlemark">2.4 </span> <a
<!--l. 445--><p class="noindent" >Build the sparse matrices and vectors, optionally making use in
<span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_spins</span></span></span> and <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_geins</span></span></span> of the <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">local</span></span></span> argument specifying that the
indices in <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">ia</span></span></span>, <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">ja</span></span></span> and <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">irw</span></span></span>, respectively, are already local indices.</li></ol>
<!--l. 452--><p class="noindent" >
<h4 class="subsectionHead"><span class="titlemark">2.4 </span> <a
id="x4-80002.4"></a>Programming model</h4>
<!--l. 451--><p class="noindent" >The PSBLAS library is based on the Single Program Multiple Data (SPMD)
<!--l. 454--><p class="noindent" >The PSBLAS library is based on the Single Program Multiple Data (SPMD)
programming model: each process participating in the computation performs the
same actions on a chunk of data. Parallelism is thus data-driven.
<!--l. 456--><p class="indent" > Because of this structure, many subroutines coordinate their action across the
<!--l. 459--><p class="indent" > Because of this structure, many subroutines coordinate their action across the
various processes, thus providing an implicit synchronization point, and therefore
<span
class="cmti-10">must </span>be called simultaneously by all processes participating in the computation. This
class="pplri7t-">must </span>be called simultaneously by all processes participating in the computation. This
is certainly true for the data allocation and assembly routines, for all the
computational routines and for some of the tools routines.
<!--l. 464--><p class="indent" > However there are many cases where no synchronization, and indeed no
<!--l. 467--><p class="indent" > However there are many cases where no synchronization, and indeed no
communication among processes, is implied; for instance, all the routines in sec.&#x00A0;<a
href="userhtmlse3.html#x8-90003">3<!--tex4ht:ref: sec:datastruct --></a>
are only acting on the local data structures, and thus may be called independently.
The most important case is that of the coefficient insertion routines: since the
number of coefficients in the sparse and dense matrices varies among the processors,
and since the user is free to choose an arbitrary order in building the matrix entries,
The most important case is that of the coefficient insertion routines: since the number
of coefficients in the sparse and dense matrices varies among the processors, and
since the user is free to choose an arbitrary order in building the matrix entries,
these routines cannot imply a synchronization.
<!--l. 474--><p class="indent" > Throughout this user&#8217;s guide each subroutine will be clearly indicated
<!--l. 477--><p class="indent" > Throughout this user&#8217;s guide each subroutine will be clearly indicated
as:
<dl class="description"><dt class="description">
<!--l. 477--><p class="noindent" >
<!--l. 480--><p class="noindent" >
<span
class="cmbx-10">Synchronous:</span> </dt><dd
class="pplb7t-">Synchronous:</span> </dt><dd
class="description">
<!--l. 477--><p class="noindent" >must be called simultaneously by all the processes in the relevant
<!--l. 480--><p class="noindent" >must be called simultaneously by all the processes in the relevant
communication context;
</dd><dt class="description">
<!--l. 479--><p class="noindent" >
<!--l. 482--><p class="noindent" >
<span
class="cmbx-10">Asynchronous:</span> </dt><dd
class="pplb7t-">Asynchronous:</span> </dt><dd
class="description">
<!--l. 479--><p class="noindent" >may be called in a totally independent manner.</dd></dl>
<!--l. 482--><p class="noindent" >may be called in a totally independent manner.</dd></dl>

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -16,21 +16,21 @@ href="userhtmlse7.html" >prev</a>] [<a
href="userhtmlse7.html#tailuserhtmlse7.html" >prev-tail</a>] [<a
href="userhtmlse5.html#tailuserhtmlse8.html">tail</a>] [<a
href="userhtml.html#userhtmlse11.html" >up</a>] </p></div>
<h3 class="sectionHead"><span class="titlemark">8 </span> <a
<h3 class="sectionHead"><span class="titlemark">8 </span> <a
id="x13-1230008"></a>Error handling</h3>
<!--l. 5--><p class="noindent" >The PSBLAS library error handling policy has been completely rewritten in version
2.0. The idea behind the design of this new error handling strategy is to keep error
messages on a stack allowing the user to trace back up to the point where the first
error message has been generated. Every routine in the PSBLAS-2.0 library has, as
last non-optional argument, an integer <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">info</span></span></span> variable; whenever, inside the routine, an
error is detected, this variable is set to a value corresponding to a specific
error code. Then this error code is also pushed on the error stack and then
either control is returned to the caller routine or the execution is aborted,
depending on the user&#8217;s choice. At the time when the execution is aborted,
an error message is printed on standard output with a level of verbosity
that can be chosen by the user. If the execution is not aborted, then, the
caller routine checks the value returned in the <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">info</span></span></span> variable; whenever, inside the
routine, an error is detected, this variable is set to a value corresponding to a
specific error code. Then this error code is also pushed on the error stack
and then either control is returned to the caller routine or the execution is
aborted, depending on the user&#8217;s choice. At the time when the execution is
aborted, an error message is printed on standard output with a level of
verbosity that can be chosen by the user. If the execution is not aborted, then,
the caller routine checks the value returned in the <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">info</span></span></span> variable and, if not
zero, an error condition is raised. This process continues on all the levels of
nested calls until the level where the user decides to abort the program
@ -100,7 +100,6 @@ class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span style="color:#000000"><span
class="cmtt-9">info</span></span><span style="color:#000000"><span
class="cmtt-9">=</span></span><span style="color:#000000"><span
@ -111,7 +110,6 @@ class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span style="color:#000000"><span
class="cmtt-9">call</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
class="cmtt-9">psb_errpush</span></span><span style="color:#000000"><span
@ -128,7 +126,6 @@ class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span style="color:#000000"><span
class="cmtt-9">goto</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
class="cmtt-9">9999</span></span>
@ -178,7 +175,6 @@ class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span style="color:#000000"><span
class="cmtt-9">info</span></span><span style="color:#000000"><span
class="cmtt-9">=</span></span><span style="color:#000000"><span
@ -189,7 +185,6 @@ class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span style="color:#000000"><span
class="cmtt-9">call</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
class="cmtt-9">psb_errpush</span></span><span style="color:#000000"><span
@ -206,7 +201,6 @@ class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span style="color:#000000"><span
class="cmtt-9">goto</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
class="cmtt-9">9999</span></span>
@ -246,7 +240,6 @@ class="cmtt-9">then</span></span>
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span style="color:#000000"><span
class="cmtt-9">call</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
class="cmtt-9">psb_error</span></span><span style="color:#000000"><span
@ -258,7 +251,6 @@ class="cmtt-9">)</span></span>
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span style="color:#000000"><span
class="cmtt-9">return</span></span>
<span class="label"><a
@ -272,7 +264,6 @@ class="cmtt-9">else</span></span>
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span
class="cmtt-9">&#x00A0;</span><span style="color:#000000"><span
class="cmtt-9">return</span></span>
<span class="label"><a
@ -337,7 +328,7 @@ Format&#x00A0;FOO&#x00A0;is&#x00A0;unknown
==========================================================
Aborting...
</pre>
<!--l. 156--><p class="nopar" > </div> </div>
<!--l. 156--><p class="nopar" > </div></div>
</div>
<br /> <div class="caption"
><span class="id">Listing 6: </span><span
@ -350,7 +341,7 @@ condition inside the psb_cest subroutine</span></div><!--tex4ht:label?: x13-1230
<h4 class="subsectionHead"><span class="titlemark">8.1 </span> <a
<h4 class="subsectionHead"><span class="titlemark">8.1 </span> <a
id="x13-1240008.1"></a>psb_errpush &#8212; Pushes an error code onto the error stack</h4>
<!--l. 174-->
<pre class="lstlisting" id="listing-155"><span class="label"><a
@ -371,77 +362,77 @@ class="cmtt-10">)</span></span></pre>
<dl class="description"><dt class="description">
<!--l. 179--><p class="noindent" >
<span
class="cmbx-10">Type:</span> </dt><dd
class="pplb7t-">Type:</span> </dt><dd
class="description">
<!--l. 179--><p class="noindent" >Asynchronous.
</dd><dt class="description">
<!--l. 180--><p class="noindent" >
<span
class="cmbx-10">On Entry</span> </dt><dd
class="pplb7t-">On Entry</span> </dt><dd
class="description">
<!--l. 180--><p class="noindent" >
</dd><dt class="description">
<!--l. 181--><p class="noindent" >
<span
class="cmbx-10">err</span><span
class="cmbx-10">_c</span> </dt><dd
class="pplb7t-">err</span><span
class="pplb7t-">_c</span> </dt><dd
class="description">
<!--l. 181--><p class="noindent" >the error code<br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: an integer.
</dd><dt class="description">
<!--l. 186--><p class="noindent" >
<span
class="cmbx-10">r</span><span
class="cmbx-10">_name</span> </dt><dd
class="pplb7t-">r</span><span
class="pplb7t-">_name</span> </dt><dd
class="description">
<!--l. 186--><p class="noindent" >the routine where the error has been caught.<br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: a string.<br
class="newline" />
</dd><dt class="description">
<!--l. 191--><p class="noindent" >
<span
class="cmbx-10">i</span><span
class="cmbx-10">_err</span> </dt><dd
class="pplb7t-">i</span><span
class="pplb7t-">_err</span> </dt><dd
class="description">
<!--l. 191--><p class="noindent" >additional info for error code<br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="pplb7t-">optional</span><br
class="newline" />Specified as: an integer array<br
class="newline" />
</dd><dt class="description">
<!--l. 195--><p class="noindent" >
<span
class="cmbx-10">a</span><span
class="cmbx-10">_err</span> </dt><dd
class="pplb7t-">a</span><span
class="pplb7t-">_err</span> </dt><dd
class="description">
<!--l. 195--><p class="noindent" >additional info for error code<br
class="newline" />Scope: <span
class="cmbx-10">local </span><br
class="pplb7t-">local </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="pplb7t-">optional</span><br
class="newline" />Specified as: a string.<br
class="newline" /></dd></dl>
<h4 class="subsectionHead"><span class="titlemark">8.2 </span> <a
<h4 class="subsectionHead"><span class="titlemark">8.2 </span> <a
id="x13-1250008.2"></a>psb_error &#8212; Prints the error stack content and aborts execution</h4>
<!--l. 204-->
<pre class="lstlisting" id="listing-156"><span class="label"><a
@ -456,32 +447,32 @@ class="cmtt-10">)</span></span></pre>
<dl class="description"><dt class="description">
<!--l. 209--><p class="noindent" >
<span
class="cmbx-10">Type:</span> </dt><dd
class="pplb7t-">Type:</span> </dt><dd
class="description">
<!--l. 209--><p class="noindent" >Asynchronous.
</dd><dt class="description">
<!--l. 210--><p class="noindent" >
<span
class="cmbx-10">On Entry</span> </dt><dd
class="pplb7t-">On Entry</span> </dt><dd
class="description">
<!--l. 210--><p class="noindent" >
</dd><dt class="description">
<!--l. 211--><p class="noindent" >
<span
class="cmbx-10">icontxt</span> </dt><dd
class="pplb7t-">icontxt</span> </dt><dd
class="description">
<!--l. 211--><p class="noindent" >the communication context.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">optional</span><br
class="pplb7t-">optional</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: an integer.</dd></dl>
<h4 class="subsectionHead"><span class="titlemark">8.3 </span> <a
<h4 class="subsectionHead"><span class="titlemark">8.3 </span> <a
id="x13-1260008.3"></a>psb_set_errverbosity &#8212; Sets the verbosity of error messages</h4>
<!--l. 224-->
<pre class="lstlisting" id="listing-157"><span class="label"><a
@ -496,32 +487,32 @@ class="cmtt-10">)</span></span></pre>
<dl class="description"><dt class="description">
<!--l. 229--><p class="noindent" >
<span
class="cmbx-10">Type:</span> </dt><dd
class="pplb7t-">Type:</span> </dt><dd
class="description">
<!--l. 229--><p class="noindent" >Asynchronous.
</dd><dt class="description">
<!--l. 230--><p class="noindent" >
<span
class="cmbx-10">On Entry</span> </dt><dd
class="pplb7t-">On Entry</span> </dt><dd
class="description">
<!--l. 230--><p class="noindent" >
</dd><dt class="description">
<!--l. 231--><p class="noindent" >
<span
class="cmbx-10">v</span> </dt><dd
class="pplb7t-">v</span> </dt><dd
class="description">
<!--l. 231--><p class="noindent" >the verbosity level<br
class="newline" />Scope: <span
class="cmbx-10">global</span><br
class="pplb7t-">global</span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: an integer.</dd></dl>
<h4 class="subsectionHead"><span class="titlemark">8.4 </span> <a
<h4 class="subsectionHead"><span class="titlemark">8.4 </span> <a
id="x13-1270008.4"></a>psb_set_erraction &#8212; Set the type of action to be taken upon error
condition</h4>
<!--l. 241-->
@ -537,28 +528,28 @@ class="cmtt-10">)</span></span></pre>
<dl class="description"><dt class="description">
<!--l. 246--><p class="noindent" >
<span
class="cmbx-10">Type:</span> </dt><dd
class="pplb7t-">Type:</span> </dt><dd
class="description">
<!--l. 246--><p class="noindent" >Asynchronous.
</dd><dt class="description">
<!--l. 247--><p class="noindent" >
<span
class="cmbx-10">On Entry</span> </dt><dd
class="pplb7t-">On Entry</span> </dt><dd
class="description">
<!--l. 247--><p class="noindent" >
</dd><dt class="description">
<!--l. 248--><p class="noindent" >
<span
class="cmbx-10">err</span><span
class="cmbx-10">_act</span> </dt><dd
class="pplb7t-">err</span><span
class="pplb7t-">_act</span> </dt><dd
class="description">
<!--l. 248--><p class="noindent" >the type of action.<br
class="newline" />Scope: <span
class="cmbx-10">global </span><br
class="pplb7t-">global </span><br
class="newline" />Type: <span
class="cmbx-10">required</span><br
class="pplb7t-">required</span><br
class="newline" />Intent: <span
class="cmbx-10">in</span>.<br
class="pplb7t-">in</span>.<br
class="newline" />Specified as: an integer. Possible values: <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_act_ret</span></span></span>, <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">psb_act_abort</span></span></span>.</dd></dl>

@ -16,7 +16,7 @@ href="userhtmlse8.html" >prev</a>] [<a
href="userhtmlse8.html#tailuserhtmlse8.html" >prev-tail</a>] [<a
href="userhtmlse6.html#tailuserhtmlse9.html">tail</a>] [<a
href="userhtml.html#userhtmlse12.html" >up</a>] </p></div>
<h3 class="sectionHead"><span class="titlemark">9 </span> <a
<h3 class="sectionHead"><span class="titlemark">9 </span> <a
id="x14-1280009"></a>Utilities</h3>
<!--l. 4--><p class="noindent" >We have some utilities available for input and output of sparse matrices; the
interfaces to these routines are available in the module <span class="obeylines-h"><span class="verb"><span
@ -24,7 +24,7 @@ class="cmtt-10">psb_util_mod</span></span></span>.
<h4 class="subsectionHead"><span class="titlemark">9.1 </span> <a
<h4 class="subsectionHead"><span class="titlemark">9.1 </span> <a
id="x14-1290009.1"></a> hb_read &#8212; Read a sparse matrix from a file in the Harwell&#8211;Boeing
format</h4>
<!--l. 16-->
@ -50,53 +50,53 @@ class="cmtt-10">)</span></span></pre>
<dl class="description"><dt class="description">
<!--l. 21--><p class="noindent" >
<span
class="cmbx-10">Type:</span> </dt><dd
class="pplb7t-">Type:</span> </dt><dd
class="description">
<!--l. 21--><p class="noindent" >Asynchronous.
</dd><dt class="description">
<!--l. 22--><p class="noindent" >
<span
class="cmbx-10">On Entry</span> </dt><dd
class="pplb7t-">On Entry</span> </dt><dd
class="description">
<!--l. 22--><p class="noindent" >
</dd><dt class="description">
<!--l. 23--><p class="noindent" >
<span
class="cmbx-10">filename</span> </dt><dd
class="pplb7t-">filename</span> </dt><dd
class="description">
<!--l. 23--><p class="noindent" >The name of the file to be read.<br
class="newline" />Type:<span
class="cmbx-10">optional</span>.<br
class="newline" />Specified as: a character variable containing a valid file name, or <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>, in
class="pplb7t-">optional</span>.<br
class="newline" />Specified as: a character variable containing a valid file name, or <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>, in
which case the default input unit 5 (i.e. standard input in Unix jargon) is
used. Default: <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>.
</dd><dt class="description">
<!--l. 28--><p class="noindent" >
<span
class="cmbx-10">iunit</span> </dt><dd
class="pplb7t-">iunit</span> </dt><dd
class="description">
<!--l. 28--><p class="noindent" >The Fortran file unit number.<br
class="newline" />Type:<span
class="cmbx-10">optional</span>.<br
class="pplb7t-">optional</span>.<br
class="newline" />Specified as: an integer value. Only meaningful if filename is not <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>.</dd></dl>
<!--l. 33--><p class="noindent" >
<dl class="description"><dt class="description">
<!--l. 34--><p class="noindent" >
<span
class="cmbx-10">On Return</span> </dt><dd
class="pplb7t-">On Return</span> </dt><dd
class="description">
<!--l. 34--><p class="noindent" >
</dd><dt class="description">
<!--l. 35--><p class="noindent" >
<span
class="cmbx-10">a</span> </dt><dd
class="pplb7t-">a</span> </dt><dd
class="description">
<!--l. 35--><p class="noindent" >the sparse matrix read from file.<br
class="newline" />Type:<span
class="cmbx-10">required</span>.<br
class="pplb7t-">required</span>.<br
class="newline" />Specified as: a structured data of type <a
href="userhtmlse3.html#spdata"><span
class="cmtt-10">psb</span><span
@ -108,37 +108,37 @@ class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 38--><p class="noindent" >
<span
class="cmbx-10">b</span> </dt><dd
class="pplb7t-">b</span> </dt><dd
class="description">
<!--l. 38--><p class="noindent" >Right hand side(s).<br
class="newline" />Type: <span
class="cmbx-10">Optional </span><br
class="pplb7t-">Optional </span><br
class="newline" />An array of type real or complex, rank 2 and having the ALLOCATABLE
attribute; will be allocated and filled in if the input file contains a right
hand side, otherwise will be left in the UNALLOCATED state.
</dd><dt class="description">
<!--l. 43--><p class="noindent" >
<span
class="cmbx-10">mtitle</span> </dt><dd
class="pplb7t-">mtitle</span> </dt><dd
class="description">
<!--l. 43--><p class="noindent" >Matrix title.<br
class="newline" />Type: <span
class="cmbx-10">Optional </span><br
class="pplb7t-">Optional </span><br
class="newline" />A character variable of length 72 holding a copy of the matrix title as
specified by the Harwell-Boeing format and contained in the input file.
</dd><dt class="description">
<!--l. 48--><p class="noindent" >
<span
class="cmbx-10">iret</span> </dt><dd
class="pplb7t-">iret</span> </dt><dd
class="description">
<!--l. 48--><p class="noindent" >Error code.<br
class="newline" />Type: <span
class="cmbx-10">required </span><br
class="pplb7t-">required </span><br
class="newline" />An integer value; 0 means no error has been detected.</dd></dl>
<h4 class="subsectionHead"><span class="titlemark">9.2 </span> <a
<h4 class="subsectionHead"><span class="titlemark">9.2 </span> <a
id="x14-1300009.2"></a>hb_write &#8212; Write a sparse matrix to a file in the Harwell&#8211;Boeing
format</h4>
<!--l. 59-->
@ -166,23 +166,23 @@ class="cmtt-10">)</span></span></pre>
<dl class="description"><dt class="description">
<!--l. 66--><p class="noindent" >
<span
class="cmbx-10">Type:</span> </dt><dd
class="pplb7t-">Type:</span> </dt><dd
class="description">
<!--l. 66--><p class="noindent" >Asynchronous.
</dd><dt class="description">
<!--l. 67--><p class="noindent" >
<span
class="cmbx-10">On Entry</span> </dt><dd
class="pplb7t-">On Entry</span> </dt><dd
class="description">
<!--l. 67--><p class="noindent" >
</dd><dt class="description">
<!--l. 68--><p class="noindent" >
<span
class="cmbx-10">a</span> </dt><dd
class="pplb7t-">a</span> </dt><dd
class="description">
<!--l. 68--><p class="noindent" >the sparse matrix to be written.<br
class="newline" />Type:<span
class="cmbx-10">required</span>.<br
class="pplb7t-">required</span>.<br
class="newline" />Specified as: a structured data of type <a
href="userhtmlse3.html#spdata"><span
class="cmtt-10">psb</span><span
@ -191,35 +191,35 @@ class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 71--><p class="noindent" >
<span
class="cmbx-10">b</span> </dt><dd
class="pplb7t-">b</span> </dt><dd
class="description">
<!--l. 71--><p class="noindent" >Right hand side.<br
class="newline" />Type: <span
class="cmbx-10">Optional </span><br
class="pplb7t-">Optional </span><br
class="newline" />An array of type real or complex, rank 1 and having the ALLOCATABLE
attribute; will be allocated and filled in if the input file contains a right
hand side.
</dd><dt class="description">
<!--l. 76--><p class="noindent" >
<span
class="cmbx-10">filename</span> </dt><dd
class="pplb7t-">filename</span> </dt><dd
class="description">
<!--l. 76--><p class="noindent" >The name of the file to be written to.<br
class="newline" />Type:<span
class="cmbx-10">optional</span>.<br
class="newline" />Specified as: a character variable containing a valid file name, or <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>, in
class="pplb7t-">optional</span>.<br
class="newline" />Specified as: a character variable containing a valid file name, or <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>, in
which case the default output unit 6 (i.e. standard output in Unix jargon)
is used. Default: <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>.
</dd><dt class="description">
<!--l. 81--><p class="noindent" >
<span
class="cmbx-10">iunit</span> </dt><dd
class="pplb7t-">iunit</span> </dt><dd
class="description">
<!--l. 81--><p class="noindent" >The Fortran file unit number.<br
class="newline" />Type:<span
class="cmbx-10">optional</span>.<br
class="pplb7t-">optional</span>.<br
class="newline" />Specified as: an integer value. Only meaningful if filename is not <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>.
@ -228,43 +228,43 @@ class="cmtt-10">-</span></span></span>.
</dd><dt class="description">
<!--l. 84--><p class="noindent" >
<span
class="cmbx-10">key</span> </dt><dd
class="pplb7t-">key</span> </dt><dd
class="description">
<!--l. 84--><p class="noindent" >Matrix key.<br
class="newline" />Type: <span
class="cmbx-10">Optional </span><br
class="pplb7t-">Optional </span><br
class="newline" />A character variable of length 8 holding the matrix key as specified by
the Harwell-Boeing format and to be written to file.
</dd><dt class="description">
<!--l. 89--><p class="noindent" >
<span
class="cmbx-10">mtitle</span> </dt><dd
class="pplb7t-">mtitle</span> </dt><dd
class="description">
<!--l. 89--><p class="noindent" >Matrix title.<br
class="newline" />Type: <span
class="cmbx-10">Optional </span><br
class="pplb7t-">Optional </span><br
class="newline" />A character variable of length 72 holding the matrix title as specified by
the Harwell-Boeing format and to be written to file.</dd></dl>
<!--l. 96--><p class="noindent" >
<dl class="description"><dt class="description">
<!--l. 97--><p class="noindent" >
<span
class="cmbx-10">On Return</span> </dt><dd
class="pplb7t-">On Return</span> </dt><dd
class="description">
<!--l. 97--><p class="noindent" >
</dd><dt class="description">
<!--l. 98--><p class="noindent" >
<span
class="cmbx-10">iret</span> </dt><dd
class="pplb7t-">iret</span> </dt><dd
class="description">
<!--l. 98--><p class="noindent" >Error code.<br
class="newline" />Type: <span
class="cmbx-10">required </span><br
class="pplb7t-">required </span><br
class="newline" />An integer value; 0 means no error has been detected.</dd></dl>
<h4 class="subsectionHead"><span class="titlemark">9.3 </span> <a
<h4 class="subsectionHead"><span class="titlemark">9.3 </span> <a
id="x14-1310009.3"></a>mm_mat_read &#8212; Read a sparse matrix from a file in the MatrixMarket
format</h4>
<!--l. 111-->
@ -286,53 +286,53 @@ class="cmtt-10">)</span></span></pre>
<dl class="description"><dt class="description">
<!--l. 116--><p class="noindent" >
<span
class="cmbx-10">Type:</span> </dt><dd
class="pplb7t-">Type:</span> </dt><dd
class="description">
<!--l. 116--><p class="noindent" >Asynchronous.
</dd><dt class="description">
<!--l. 117--><p class="noindent" >
<span
class="cmbx-10">On Entry</span> </dt><dd
class="pplb7t-">On Entry</span> </dt><dd
class="description">
<!--l. 117--><p class="noindent" >
</dd><dt class="description">
<!--l. 118--><p class="noindent" >
<span
class="cmbx-10">filename</span> </dt><dd
class="pplb7t-">filename</span> </dt><dd
class="description">
<!--l. 118--><p class="noindent" >The name of the file to be read.<br
class="newline" />Type:<span
class="cmbx-10">optional</span>.<br
class="newline" />Specified as: a character variable containing a valid file name, or <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>, in
class="pplb7t-">optional</span>.<br
class="newline" />Specified as: a character variable containing a valid file name, or <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>, in
which case the default input unit 5 (i.e. standard input in Unix jargon) is
used. Default: <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>.
</dd><dt class="description">
<!--l. 123--><p class="noindent" >
<span
class="cmbx-10">iunit</span> </dt><dd
class="pplb7t-">iunit</span> </dt><dd
class="description">
<!--l. 123--><p class="noindent" >The Fortran file unit number.<br
class="newline" />Type:<span
class="cmbx-10">optional</span>.<br
class="pplb7t-">optional</span>.<br
class="newline" />Specified as: an integer value. Only meaningful if filename is not <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>.</dd></dl>
<!--l. 128--><p class="noindent" >
<dl class="description"><dt class="description">
<!--l. 129--><p class="noindent" >
<span
class="cmbx-10">On Return</span> </dt><dd
class="pplb7t-">On Return</span> </dt><dd
class="description">
<!--l. 129--><p class="noindent" >
</dd><dt class="description">
<!--l. 130--><p class="noindent" >
<span
class="cmbx-10">a</span> </dt><dd
class="pplb7t-">a</span> </dt><dd
class="description">
<!--l. 130--><p class="noindent" >the sparse matrix read from file.<br
class="newline" />Type:<span
class="cmbx-10">required</span>.<br
class="pplb7t-">required</span>.<br
class="newline" />Specified as: a structured data of type <a
href="userhtmlse3.html#spdata"><span
class="cmtt-10">psb</span><span
@ -344,16 +344,16 @@ class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 133--><p class="noindent" >
<span
class="cmbx-10">iret</span> </dt><dd
class="pplb7t-">iret</span> </dt><dd
class="description">
<!--l. 133--><p class="noindent" >Error code.<br
class="newline" />Type: <span
class="cmbx-10">required </span><br
class="pplb7t-">required </span><br
class="newline" />An integer value; 0 means no error has been detected.</dd></dl>
<h4 class="subsectionHead"><span class="titlemark">9.4 </span> <a
<h4 class="subsectionHead"><span class="titlemark">9.4 </span> <a
id="x14-1320009.4"></a>mm_array_read &#8212; Read a dense array from a file in the MatrixMarket
format</h4>
<!--l. 142-->
@ -375,54 +375,54 @@ class="cmtt-10">)</span></span></pre>
<dl class="description"><dt class="description">
<!--l. 147--><p class="noindent" >
<span
class="cmbx-10">Type:</span> </dt><dd
class="pplb7t-">Type:</span> </dt><dd
class="description">
<!--l. 147--><p class="noindent" >Asynchronous.
</dd><dt class="description">
<!--l. 148--><p class="noindent" >
<span
class="cmbx-10">On Entry</span> </dt><dd
class="pplb7t-">On Entry</span> </dt><dd
class="description">
<!--l. 148--><p class="noindent" >
</dd><dt class="description">
<!--l. 149--><p class="noindent" >
<span
class="cmbx-10">filename</span> </dt><dd
class="pplb7t-">filename</span> </dt><dd
class="description">
<!--l. 149--><p class="noindent" >The name of the file to be read.<br
class="newline" />Type:<span
class="cmbx-10">optional</span>.<br
class="newline" />Specified as: a character variable containing a valid file name, or <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>, in
class="pplb7t-">optional</span>.<br
class="newline" />Specified as: a character variable containing a valid file name, or <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>, in
which case the default input unit 5 (i.e. standard input in Unix jargon) is
used. Default: <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>.
</dd><dt class="description">
<!--l. 154--><p class="noindent" >
<span
class="cmbx-10">iunit</span> </dt><dd
class="pplb7t-">iunit</span> </dt><dd
class="description">
<!--l. 154--><p class="noindent" >The Fortran file unit number.<br
class="newline" />Type:<span
class="cmbx-10">optional</span>.<br
class="pplb7t-">optional</span>.<br
class="newline" />Specified as: an integer value. Only meaningful if filename is not <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>.</dd></dl>
<!--l. 159--><p class="noindent" >
<dl class="description"><dt class="description">
<!--l. 160--><p class="noindent" >
<span
class="cmbx-10">On Return</span> </dt><dd
class="pplb7t-">On Return</span> </dt><dd
class="description">
<!--l. 160--><p class="noindent" >
</dd><dt class="description">
<!--l. 161--><p class="noindent" >
<span
class="cmbx-10">b</span> </dt><dd
class="pplb7t-">b</span> </dt><dd
class="description">
<!--l. 161--><p class="noindent" >Right hand side(s).<br
class="newline" />Type: <span
class="cmbx-10">required </span><br
class="newline" />An array of type real or complex, rank 1 or 2 and having the
class="pplb7t-">required </span><br
class="newline" />An array of type real or complex, rank 1 or 2 and having the
@ -431,24 +431,24 @@ href="userhtmlse3.html#vdata"><span
class="cmtt-10">psb</span><span
class="cmtt-10">_T</span><span
class="cmtt-10">_vect</span><span
class="cmtt-10">_type</span></a>, of
type real or complex.<br
class="cmtt-10">_type</span></a>, of type
real or complex.<br
class="newline" />Will be allocated and filled in if the input file contains a right hand side,
otherwise will be left in the UNALLOCATED state. <br
class="newline" />
</dd><dt class="description">
<!--l. 168--><p class="noindent" >
<span
class="cmbx-10">iret</span> </dt><dd
class="pplb7t-">iret</span> </dt><dd
class="description">
<!--l. 168--><p class="noindent" >Error code.<br
class="newline" />Type: <span
class="cmbx-10">required </span><br
class="pplb7t-">required </span><br
class="newline" />An integer value; 0 means no error has been detected.</dd></dl>
<h4 class="subsectionHead"><span class="titlemark">9.5 </span> <a
<h4 class="subsectionHead"><span class="titlemark">9.5 </span> <a
id="x14-1330009.5"></a>mm_mat_write &#8212; Write a sparse matrix to a file in the MatrixMarket
format</h4>
<!--l. 179-->
@ -472,23 +472,23 @@ class="cmtt-10">)</span></span></pre>
<dl class="description"><dt class="description">
<!--l. 183--><p class="noindent" >
<span
class="cmbx-10">Type:</span> </dt><dd
class="pplb7t-">Type:</span> </dt><dd
class="description">
<!--l. 183--><p class="noindent" >Asynchronous.
</dd><dt class="description">
<!--l. 184--><p class="noindent" >
<span
class="cmbx-10">On Entry</span> </dt><dd
class="pplb7t-">On Entry</span> </dt><dd
class="description">
<!--l. 184--><p class="noindent" >
</dd><dt class="description">
<!--l. 185--><p class="noindent" >
<span
class="cmbx-10">a</span> </dt><dd
class="pplb7t-">a</span> </dt><dd
class="description">
<!--l. 185--><p class="noindent" >the sparse matrix to be written.<br
class="newline" />Type:<span
class="cmbx-10">required</span>.<br
class="pplb7t-">required</span>.<br
class="newline" />Specified as: a structured data of type <a
href="userhtmlse3.html#spdata"><span
class="cmtt-10">psb</span><span
@ -497,34 +497,34 @@ class="cmtt-10">_type</span></a>.
</dd><dt class="description">
<!--l. 188--><p class="noindent" >
<span
class="cmbx-10">mtitle</span> </dt><dd
class="pplb7t-">mtitle</span> </dt><dd
class="description">
<!--l. 188--><p class="noindent" >Matrix title.<br
class="newline" />Type: <span
class="cmbx-10">required </span><br
class="pplb7t-">required </span><br
class="newline" />A character variable holding a descriptive title for the matrix to be
written to file.
</dd><dt class="description">
<!--l. 192--><p class="noindent" >
<span
class="cmbx-10">filename</span> </dt><dd
class="pplb7t-">filename</span> </dt><dd
class="description">
<!--l. 192--><p class="noindent" >The name of the file to be written to.<br
class="newline" />Type:<span
class="cmbx-10">optional</span>.<br
class="newline" />Specified as: a character variable containing a valid file name, or <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>, in
class="pplb7t-">optional</span>.<br
class="newline" />Specified as: a character variable containing a valid file name, or <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>, in
which case the default output unit 6 (i.e. standard output in Unix jargon)
is used. Default: <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>.
</dd><dt class="description">
<!--l. 197--><p class="noindent" >
<span
class="cmbx-10">iunit</span> </dt><dd
class="pplb7t-">iunit</span> </dt><dd
class="description">
<!--l. 197--><p class="noindent" >The Fortran file unit number.<br
class="newline" />Type:<span
class="cmbx-10">optional</span>.<br
class="pplb7t-">optional</span>.<br
class="newline" />Specified as: an integer value. Only meaningful if filename is not <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>.</dd></dl>
@ -534,27 +534,27 @@ class="cmtt-10">-</span></span></span>.</dd></dl>
<dl class="description"><dt class="description">
<!--l. 203--><p class="noindent" >
<span
class="cmbx-10">On Return</span> </dt><dd
class="pplb7t-">On Return</span> </dt><dd
class="description">
<!--l. 203--><p class="noindent" >
</dd><dt class="description">
<!--l. 204--><p class="noindent" >
<span
class="cmbx-10">iret</span> </dt><dd
class="pplb7t-">iret</span> </dt><dd
class="description">
<!--l. 204--><p class="noindent" >Error code.<br
class="newline" />Type: <span
class="cmbx-10">required </span><br
class="pplb7t-">required </span><br
class="newline" />An integer value; 0 means no error has been detected.</dd></dl>
<!--l. 209--><p class="noindent" ><span
class="cmbx-12">Notes</span>
class="pplb7t-x-x-120">Notes</span>
<!--l. 211--><p class="indent" > If this function is called on a matrix <code class="lstinline"><span style="color:#000000">a</span></code> on a distributed communicator only the
local part is written in output. To get a single MatrixMarket file with the whole
matrix when appropriate, e.g. for debugging purposes, one could <span
class="cmti-10">gather </span>the whole
class="pplri7t-">gather </span>the whole
matrix on a single rank and then write it. Consider the following example for a
<span
class="cmti-10">double </span>precision matrix
class="pplri7t-">double </span>precision matrix
<div class="center"
>
<!--l. 227--><p class="noindent" >
@ -581,7 +581,7 @@ psb_i_t&#x00A0;psb_c_&#x003C;s,d,c,z&#x003E;global_mat_write(ah,cdh);
<h4 class="subsectionHead"><span class="titlemark">9.6 </span> <a
<h4 class="subsectionHead"><span class="titlemark">9.6 </span> <a
id="x14-1340009.6"></a>mm_array_write &#8212; Write a dense array to a file in the MatrixMarket
format</h4>
<!--l. 261-->
@ -605,23 +605,23 @@ class="cmtt-10">)</span></span></pre>
<dl class="description"><dt class="description">
<!--l. 266--><p class="noindent" >
<span
class="cmbx-10">Type:</span> </dt><dd
class="pplb7t-">Type:</span> </dt><dd
class="description">
<!--l. 266--><p class="noindent" >Asynchronous.
</dd><dt class="description">
<!--l. 267--><p class="noindent" >
<span
class="cmbx-10">On Entry</span> </dt><dd
class="pplb7t-">On Entry</span> </dt><dd
class="description">
<!--l. 267--><p class="noindent" >
</dd><dt class="description">
<!--l. 268--><p class="noindent" >
<span
class="cmbx-10">b</span> </dt><dd
class="pplb7t-">b</span> </dt><dd
class="description">
<!--l. 268--><p class="noindent" >Right hand side(s).<br
class="newline" />Type: <span
class="cmbx-10">required </span><br
class="pplb7t-">required </span><br
class="newline" />An array of type real or complex, rank 1 or 2, or an object of type
<a
href="userhtmlse3.html#vdata"><span
@ -634,63 +634,63 @@ class="newline" />
</dd><dt class="description">
<!--l. 273--><p class="noindent" >
<span
class="cmbx-10">filename</span> </dt><dd
class="pplb7t-">filename</span> </dt><dd
class="description">
<!--l. 273--><p class="noindent" >The name of the file to be written.<br
class="newline" />
</dd><dt class="description">
<!--l. 274--><p class="noindent" >
<span
class="cmbx-10">vtitle</span> </dt><dd
class="pplb7t-">vtitle</span> </dt><dd
class="description">
<!--l. 274--><p class="noindent" >Matrix title.<br
class="newline" />Type: <span
class="cmbx-10">required </span><br
class="newline" />A charachter variable holding a descriptive title for the vector to be written
to file. Type:<span
class="cmbx-10">optional</span>.<br
class="newline" />Specified as: a character variable containing a valid file name, or <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>, in
class="pplb7t-">required </span><br
class="newline" />A character variable holding a descriptive title for the vector to be
written to file. Type:<span
class="pplb7t-">optional</span>.<br
class="newline" />Specified as: a character variable containing a valid file name, or <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>, in
which case the default output unit 6 (i.e. standard output in Unix jargon) is
used. Default: <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>.
</dd><dt class="description">
<!--l. 282--><p class="noindent" >
<span
class="cmbx-10">iunit</span> </dt><dd
class="pplb7t-">iunit</span> </dt><dd
class="description">
<!--l. 282--><p class="noindent" >The Fortran file unit number.<br
class="newline" />Type:<span
class="cmbx-10">optional</span>.<br
class="pplb7t-">optional</span>.<br
class="newline" />Specified as: an integer value. Only meaningful if filename is not <span class="obeylines-h"><span class="verb"><span
class="cmtt-10">-</span></span></span>.</dd></dl>
<!--l. 287--><p class="noindent" >
<dl class="description"><dt class="description">
<!--l. 288--><p class="noindent" >
<span
class="cmbx-10">On Return</span> </dt><dd
class="pplb7t-">On Return</span> </dt><dd
class="description">
<!--l. 288--><p class="noindent" >
</dd><dt class="description">
<!--l. 289--><p class="noindent" >
<span
class="cmbx-10">iret</span> </dt><dd
class="pplb7t-">iret</span> </dt><dd
class="description">
<!--l. 289--><p class="noindent" >Error code.<br
class="newline" />Type: <span
class="cmbx-10">required </span><br
class="pplb7t-">required </span><br
class="newline" />An integer value; 0 means no error has been detected.</dd></dl>
<!--l. 294--><p class="noindent" ><span
class="cmbx-12">Notes</span>
class="pplb7t-x-x-120">Notes</span>
<!--l. 296--><p class="indent" > If this function is called on a vector <code class="lstinline"><span style="color:#000000">v</span></code> on a distributed communicator only the
local part is written in output. To get a single MatrixMarket file with the whole
vector when appropriate, e.g. for debugging purposes, one could <span
class="cmti-10">gather </span>the whole
class="pplri7t-">gather </span>the whole
vector on a single rank and then write it. Consider the following example for a <span
class="cmti-10">double</span>
class="pplri7t-">double</span>
precision vector
<div class="center"
>

Binary file not shown.

After

Width:  |  Height:  |  Size: 325 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 378 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 290 B

File diff suppressed because one or more lines are too long

@ -8,6 +8,11 @@ in J.~Dongarra, K.~Madsen, J.~Wasniewski, editors,
Proceedings of PARA~04 Workshop on State of the Art
in Scientific Computing, pp.~546--553, Lecture Notes in Computer Science,
Springer, 2005.
\bibitem{BERTACCINIFILIPPONE}
D. Bertaccini\ and\ S. Filippone,
{\em Sparse approximate inverse preconditioners on high performance GPU platforms},
Comput. Math. Appl., 71, (2016), no.~3, 693--711.
%
\bibitem{2007d} A. Buttari, D. di Serafino, P. D'Ambra, S. Filippone,\newblock
2LEV-D2P4: a package of high-performance preconditioners,\newblock
Applicable Algebra in Engineering, Communications and Computing,
@ -183,7 +188,14 @@ MIT Press, 1998.
{\em Scientific Programming\/}~{\em 22,\/}~1, 1--19.
\bibitem{OurTechRep}
D.~Barbieri, V.~Cardellini, A.~Fanfarillo, S.~Filippone, Three storage formats
for sparse matrices on {GPGPUs}, Tech. Rep. DICII RR-15.6, Universit\`a di
for sparse matrices on {GPGPUs}, Tech. Rep. DICII RR-15.6,
Universit\`a di
Roma Tor Vergata (February 2015).
\bibitem{Filippone:2017:SMM:3034774.3017994}
S.~Filippone, V.~Cardellini, D.~Barbieri, and A.~Fanfarillo.
Sparse matrix-vector multiplication on {GPGPUs}.
{\em ACM Trans. Math. Softw.}, 43(4):30:1--30:49, 2017.
\end{thebibliography}

@ -1317,13 +1317,14 @@ like Diagonal Scaling or Block Jacobi with incomplete
factorization ILU(0).
A preconditioner is held in the \hypertarget{precdata}{{\tt
psb\_prec\_type}} data structure reported in
figure~\ref{fig:prectype}. The \fortinline|psb_prec_type|
psb\_Tprec\_type}} data structure reported in
figure~\ref{fig:prectype}. The \fortinline|psb_Tprec_type|
data type may contain a simple preconditioning matrix with the
associated communication descriptor.%% which may be different than the
associated communication descriptor.
%% which may be different from the
%% system communication descriptor in the case of parallel
%% preconditioners like the Additive Schwarz one. Then the
%% \fortinline|psb_prec_type| may contain more than one preconditioning matrix
%% \fortinline|psb_Tprec_type| may contain more than one preconditioning matrix
%% like in the case of Two-Level (in general Multi-Level) preconditioners.
%% The user can choose the type of preconditioner to be used by means of
%% the \fortinline|psb_precset| subroutine; once the type of preconditioning
@ -1407,8 +1408,8 @@ Given a heap object, the following methods are defined on it:
\item[dump] Print on file;
\item[free] Release memory.
\end{description}
These objects are used in AMG4PSBLAS to implement the factorization
algorithms.
These objects are used to implement the factorization
and approximate inversion algorithms.
%%% Local Variables:
%%% mode: latex

@ -1,6 +1,6 @@
\section{Error handling}
\section{Error handling\label{sec:errors}}
The PSBLAS library error handling policy has been completely rewritten
in version 2.0. The idea behind the design of this new error handling

@ -344,24 +344,25 @@ A simple application structure will walk through the index space
allocation, matrix/vector creation and linear system solution as
follows:
\begin{enumerate}
\item Initialize parallel environment with \verb|psb_init|
\item Initialize index space with \verb|psb_cdall|
\item Initialize parallel environment with \verb|psb_init|;
\item Initialize index space with \verb|psb_cdall|;
\item Allocate sparse matrix and dense vectors with \verb|psb_spall|
and \verb|psb_geall|
and \verb|psb_geall|;
\item Loop over all local rows, generate matrix and vector entries,
and insert them with \verb|psb_spins| and \verb|psb_geins|
\item Assemble the various entities:
\begin{enumerate}
\item \verb|psb_cdasb|
\item \verb|psb_spasb|
\item \verb|psb_geasb|
\item \verb|psb_cdasb|,
\item \verb|psb_spasb|,
\item \verb|psb_geasb|;
\end{enumerate}
\item Choose the preconditioner to be used with \verb|prec%init| and
\verb|prec%set|, and
build it with \verb|prec%build|\footnote{The subroutine style {\tt
psb\_precinit} and {\tt psb\_precbl} are still supported for
backward compatibility}.
\item Call the iterative driver \verb|psb_krylov| with the method of
choice, e.g. \verb|bicgstab|.
psb\_precinit} and {\tt psb\_precbld} are still supported for
backward compatibility};
\item Call one of the iterative drivers with the method of
choice, e.g. \verb|psb_krylov| with \verb|bicgstab|.
\end{enumerate}
This is the structure of the sample programs in the directory
\verb|test/pargen/|.
@ -372,21 +373,23 @@ multiple time steps, the following structure may be more appropriate:
\item Initialize parallel environment with \verb|psb_init|
\item Initialize index space with \verb|psb_cdall|
\item Loop over the topology of the discretization mesh and build the
descriptor with \verb|psb_cdins|
\item Assemble the descriptor with \verb|psb_cdasb|
\item Allocate the sparse matrices and dense vectors with
\verb|psb_spall| and \verb|psb_geall|
descriptor with \verb|psb_cdins|;
\item Assemble the descriptor with \verb|psb_cdasb|;
\item Allocate the sparse matrices and dense vectors with
\verb|psb_spall| and \verb|psb_geall|;
\item Loop over the time steps:
\begin{enumerate}
\item If after first time step,
reinitialize the sparse matrix with \verb|psb_sprn|; also zero out
the dense vectors;
\item Loop over the mesh, generate the coefficients and insert/update
them with \verb|psb_spins| and \verb|psb_geins|
\item Assemble with \verb|psb_spasb| and \verb|psb_geasb|
\item Choose and build preconditioner with \verb|prec%init| and
\verb|prec%build|
\item Call the iterative method of choice, e.g. \verb|psb_bicgstab|
them with \verb|psb_spins| and \verb|psb_geins|;
\item Assemble with \verb|psb_spasb| and \verb|psb_geasb|;
\item Choose the preconditioner to be used with \verb|prec%init| and
\verb|prec%set|, and
build it with \verb|prec%build|;
\item Call one of the iterative drivers with the method of
choice, e.g. \verb|psb_krylov| with \verb|bicgstab|.
\end{enumerate}
\end{enumerate}
The insertion routines will be called as many times as needed;

@ -76,14 +76,125 @@ $ptype$ string as follows\footnote{The string is case-insensitive}:
\item[DIAG] Diagonal scaling; each entry of the input vector is
multiplied by the reciprocal of the sum of the absolute values of
the coefficients in the corresponding row of matrix $A$;
\item[BJAC] Precondition by a factorization of the
block-diagonal of matrix $A$, where block boundaries are determined
by the data allocation boundaries for each process; requires no
communication. Only the incomplete factorization $ILU(0)$ is
currently implemented.
\item[BJAC] Precondition by a factorization or an approximate inverse
of the block-diagonal of matrix $A$, where block boundaries are
determined by the data allocation boundaries for each process;
requires no communication. See also Table~\ref{tab:p_subsolve_1}.
\end{description}
\clearpage
\subsection{Set\label{sec:precset} --- set preconditioner parameters}
\begin{center}
\fortinline|call p%set(what,val,info)|
\end{center}
\noindent
This method sets the parameters defining the subdomain solver when the
preconditioner type is \verb|BJAC|. More precisely, the parameter
identified by \fortinline|what| is assigned the value
contained in \fortinline|val|.
{\vskip1.5\baselineskip\noindent\large\bfseries Arguments} \smallskip
\begin{tabular}{p{1.2cm}p{12cm}}
\fortinline|what| & \fortinline|character(len=*)|. \\
& The parameter to be set. It can be specified through its name;
the string is case-insensitive. See
Table~\ref{tab:p_subsolve_1}.\\
\fortinline|val | & \fortinline|integer| \emph{or} \fortinline|character(len=*)| \emph{or}
\fortinline|real(psb_spk_)| \emph{or} \fortinline|real(psb_dpk_)|,
\fortinline|intent(in)|.\\
& The value of the parameter to be set. The list of allowed
values and the corresponding data types is given in
Table~\ref{tab:p_subsolve_1}.
When the value is of type \fortinline|character(len=*)|,
it is also treated as case insensitive.\\
\fortinline|info| & \fortinline|integer, intent(out)|.\\
& Error code. If no error, 0 is returned. See Section~\ref{sec:errors}
for details.
\end{tabular}
\noindent
A number of subdomain solvers can be chosen with this method;
a list of the parameters that can be set, along with their allowed and
default values, is given in Table~\ref{tab:p_subsolve_1}.\\
\bsideways
\begin{center}
\small
% \begin{tabular}{|p{3.6cm}|l|p{1.9cm}|p{3.6cm}|p{6.5cm}|}
\begin{tabular}{|p{3.2cm}|l|p{2.6cm}|p{2.6cm}|p{6.7cm}|}
\hline
\fortinline|what| & \textsc{data type} & \fortinline|val| & \textsc{default} &
\textsc{comments} \\ \hline
\fortinline|'SUB_SOLVE'| & \fortinline|character(len=*)|
& \fortinline|'ILU'| \par
\fortinline|'ILUT'| \par
\par \fortinline|'INVT'| \par \fortinline|'INVK'| \par \fortinline|'AINV'|
&
& The local solver to be used with the smoother or one-level
preconditioner ILU($p$), ILU($p,t$),
Approximate Inverses INVK($p,q$),
INVT($p_1,p_2,t_1,t_2$) and
AINV($t$); note that approximate inverses
are specifically suited for GPUs since they
do not employ triangular system solve
kernels,
see~\cite{BERTACCINIFILIPPONE}.\\ \hline
\fortinline|'SUB_FILLIN'| & \fortinline|integer|
& Any integer \par number~$\ge 0$
& 0
& Fill-in level $p$ of the incomplete LU factorizations. \\ \hline
\fortinline|'SUB_ILUTHRS'| & \fortinline|real(kind_parameter)|
& Any real number~$\ge 0$
& 0
& Drop tolerance $t$ in the ILU($p,t$) factorization. \\ \hline
\fortinline|'ILU_ALG'| & \fortinline|character(len=*)|
& \fortinline|'MILU'|
& \fortinline|'NONE'|
& ILU algorithmic variant \\ \hline
\fortinline|'ILUT_SCALE'| & \fortinline|character(len=*)|
& \fortinline|'MAXVAL'| \par
\fortinline|'DIAG'| \par
\fortinline|'ARWSUM'| \par
\fortinline|'ARCSUM'| \par
\fortinline|'ACLSUM'| \par
\fortinline|'NONE'|
& \fortinline|'NONE'|
& ILU scaling strategy \\ \hline
\fortinline|'INV_FILLIN'| & \fortinline|integer|
& Any integer \par number~$\ge 0$
& 0
& Second fill-in level $q$ of the INVK($p,q$)
approximate inverse. \\ \hline
\fortinline|'INV_ILUTHRS'| & \fortinline|real(kind_parameter)|
& Any real number~$\ge 0$
& 0
& Second drop tolerance $s$ in the
INVT($t,s$) approximate inverse. \\ \hline
\fortinline|'AINV_ALG'| & \fortinline|character(len=*)|
& \fortinline|'LLK'| \par
\fortinline|'SYM-LLK'| \par
\fortinline|'STAB-LLK'| \par
\fortinline|'MLK,LMX'|
& \fortinline|'LLK'|
& AINV algorithmic strategy. \\ \hline
\end{tabular}
\end{center}
\caption{Parameters defining the solver of the BJAC
preconditioner.\label{tab:p_subsolve_1}}
\esideways
\clearpage\subsection{build --- Builds a preconditioner}
\begin{verbatim}

@ -17,6 +17,8 @@
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\usepackage{listings}
\usepackage{rotating}
\usepackage{microtype}
\usepackage{algorithm2e}
\usepackage{minted}
\usemintedstyle{friendly}
@ -91,12 +93,14 @@
\newcommand{\example}{\stepcounter{example}%
\section*{\examplename~\theexample}}
\newcommand{\precdata}{\hyperlink{precdata}{{\tt psb\_prec\_type}}}
\newcommand{\precdata}{\hyperlink{precdata}{{\tt psb\_Tprec\_type}}}
\newcommand{\descdata}{\hyperlink{descdata}{{\tt psb\_desc\_type}}}
\newcommand{\spdata}{\hyperlink{spdata}{{\tt psb\_Tspmat\_type}}}
\newcommand{\vdata}{\hyperlink{vdata}{{\tt psb\_T\_vect\_type}}}
\newcommand{\spbasedata}{\hypertarget{spbasedata}{{\tt psb\_T\_base\_sparse\_mat}}}
\newcommand{\vbasedata}{\hypertarget{vbasedata}{{\tt psb\_T\_base\_vect\_type}}}
\def\bsideways{\begin{sidewaystable}}
\def\esideways{\end{sidewaystable}}
\begin{document}
{

@ -17,8 +17,14 @@
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\usepackage{listings}
\usepackage{algorithm2e}
\usepackage{rotating}
\usepackage{microtype}
\usepackage{algorithm2e}
\definecolor{bg}{rgb}{0.95,0.95,0.95}
\usepackage{breakurl}
\usepackage{mathpazo}
\usepackage[english]{babel}
\ifpdf
\newmintinline[fortinline]{fortran}{}
\else%
@ -78,12 +84,14 @@
\newcommand{\example}{\stepcounter{example}%
\section*{\examplename~\theexample}}
\newcommand{\precdata}{\hyperlink{precdata}{{\tt psb\_prec\_type}}}
\newcommand{\precdata}{\hyperlink{precdata}{{\tt psb\_Tprec\_type}}}
\newcommand{\descdata}{\hyperlink{descdata}{{\tt psb\_desc\_type}}}
\newcommand{\spdata}{\hyperlink{spdata}{{\tt psb\_Tspmat\_type}}}
\newcommand{\vdata}{\hyperlink{vdata}{{\tt psb\_T\_vect\_type}}}
\newcommand{\spbasedata}{\hypertarget{spbasedata}{{\tt psb\_T\_base\_sparse\_mat}}}
\newcommand{\vbasedata}{\hypertarget{vbasedata}{{\tt psb\_T\_base\_vect\_type}}}
\def\bsideways{\begin{table}}
\def\esideways{\end{table}}
\begin{document}
\lstset{language=Fortran}

@ -5,7 +5,7 @@ include $(INCDIR)/Make.inc.psblas
#
# Libraries used
LIBDIR=$(BASEDIR)/lib
PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_krylov -lpsb_prec -lpsb_base
PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_linsolve -lpsb_prec -lpsb_base
LDLIBS=$(PSBLDLIBS)
#
# Compilers and such

@ -6,7 +6,7 @@ INCDIR=$(INSTALLDIR)/include/
MODDIR=$(INSTALLDIR)/modules/
include $(INCDIR)/Make.inc.psblas
LIBDIR=$(INSTALLDIR)/lib/
PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_krylov -lpsb_prec -lpsb_base
PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_linsolve -lpsb_prec -lpsb_base
LDLIBS=$(PSBLDLIBS)
FINCLUDES=$(FMFLAG)$(MODDIR) $(FMFLAG).

@ -5,7 +5,7 @@ include $(INCDIR)/Make.inc.psblas
#
# Libraries used
LIBDIR=$(INSTALLDIR)/lib
PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_krylov -lpsb_prec -lpsb_base
PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_linsolve -lpsb_prec -lpsb_base
LDLIBS=$(PSBLDLIBS)
#
# Compilers and such

@ -658,7 +658,7 @@ end module psb_d_pde3d_mod
program psb_d_pde3d
use psb_base_mod
use psb_prec_mod
use psb_krylov_mod
use psb_linsolve_mod
use psb_util_mod
use psb_d_pde3d_mod
#if defined(OPENMP)

@ -8,11 +8,11 @@ CSR Storage format for matrix A: CSR COO
0200 MAXIT
10 ITRACE
002 IRST restart for RGMRES and BiCGSTABL
INVK Block Solver ILU,ILUT,INVK,AINVT,AORTH
INVK Block Solver ILU,ILUT,INVK,INVT,AINV
NONE If ILU : MILU or NONE otherwise ignored
NONE Scaling if ILUT: NONE, MAXVAL otherwise ignored
0 Level of fill for forward factorization
1 Level of fill for inverse factorization (only INVK)
1 Level of fill for inverse factorization (only INVK,INVT)
1E-1 Threshold for forward factorization
1E-1 Threshold for inverse factorization (Only INVK, AINVT)
LLK What orthogonalization algorithm? (Only AINVT)
1E-1 Threshold for inverse factorization (Only INVK, INVT)
LLK What orthogonalization algorithm? (Only AINV)

@ -6,7 +6,7 @@ INCDIR=$(INSTALLDIR)/include/
MODDIR=$(INSTALLDIR)/modules/
include $(INCDIR)/Make.inc.psblas
LIBDIR=$(INSTALLDIR)/lib/
PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_krylov -lpsb_prec -lpsb_base
PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_linsolve -lpsb_prec -lpsb_base
LDLIBS=$(PSBLDLIBS)
FINCLUDES=$(FMFLAG)$(MODDIR) $(FMFLAG).

@ -3,7 +3,7 @@ INCDIR=$(INSTALLDIR)/include/
MODDIR=$(INSTALLDIR)/modules/
include $(INCDIR)/Make.inc.psblas
LIBDIR=$(INSTALLDIR)/lib/
PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_krylov -lpsb_prec -lpsb_base
PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_linsolve -lpsb_prec -lpsb_base
LDLIBS=$(PSBLDLIBS)
CCOPT= -g
FINCLUDES=$(FMFLAG)$(MODDIR) $(FMFLAG).

@ -6,7 +6,7 @@ include $(INCDIR)/Make.inc.psblas
# Libraries used
#
LIBDIR=$(INSTALLDIR)/lib/
PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_krylov -lpsb_prec -lpsb_base
PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_linsolve -lpsb_prec -lpsb_base
LDLIBS=$(PSBLDLIBS)
FINCLUDES=$(FMFLAG)$(MODDIR) $(FMFLAG).

Loading…
Cancel
Save