Merge branch 'repackage' into oacc_loloum
Before Width: | Height: | Size: 334 B After Width: | Height: | Size: 328 B |
Before Width: | Height: | Size: 382 B After Width: | Height: | Size: 366 B |
Before Width: | Height: | Size: 296 B After Width: | Height: | Size: 289 B |
After Width: | Height: | Size: 50 KiB |
After Width: | Height: | Size: 52 KiB |
After Width: | Height: | Size: 58 KiB |
After Width: | Height: | Size: 61 KiB |
After Width: | Height: | Size: 90 KiB |
After Width: | Height: | Size: 29 KiB |
Before Width: | Height: | Size: 1.6 KiB After Width: | Height: | Size: 1.5 KiB |
Before Width: | Height: | Size: 2.1 KiB After Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 1.1 KiB |
Before Width: | Height: | Size: 2.0 KiB After Width: | Height: | Size: 1.7 KiB |
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.6 KiB After Width: | Height: | Size: 1.3 KiB |
@ -0,0 +1,19 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html >
|
||||
<head><title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="generator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<meta name="originator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<!-- html,3 -->
|
||||
<meta name="src" content="userhtml.tex">
|
||||
<link rel="stylesheet" type="text/css" href="userhtml.css">
|
||||
</head><body
|
||||
>
|
||||
<div class="footnote-text">
|
||||
<!--l. 72--><p class="indent" > <span class="footnote-mark"><a
|
||||
id="fn4x0"><a
|
||||
id="x16-136002x10.1"></a> <sup class="textsuperscript">4</sup></a></span><span
|
||||
class="cmr-8">The string is case-insensitive</span></div>
|
||||
|
||||
</body></html>
|
Before Width: | Height: | Size: 1.1 KiB After Width: | Height: | Size: 968 B |
Before Width: | Height: | Size: 1.2 KiB After Width: | Height: | Size: 1.1 KiB |
@ -0,0 +1,20 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html >
|
||||
<head><title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="generator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<meta name="originator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<!-- html,3 -->
|
||||
<meta name="src" content="userhtml.tex">
|
||||
<link rel="stylesheet" type="text/css" href="userhtml.css">
|
||||
</head><body
|
||||
>
|
||||
<div class="footnote-text">
|
||||
<!--l. 53--><p class="noindent" ><span class="footnote-mark"><a
|
||||
id="fn5x0"><a
|
||||
id="x18-143004x11.1"></a> <sup class="textsuperscript">5</sup></a></span><span
|
||||
class="cmr-8">Note: the implementation is for </span><span
|
||||
class="cmmi-8">FCG</span><span
|
||||
class="cmr-8">(1).</span></div>
|
||||
</body></html>
|
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 1.1 KiB |
Before Width: | Height: | Size: 1.4 KiB After Width: | Height: | Size: 1.3 KiB |
Before Width: | Height: | Size: 1.1 KiB After Width: | Height: | Size: 1.0 KiB |
Before Width: | Height: | Size: 1.4 KiB After Width: | Height: | Size: 1.3 KiB |
Before Width: | Height: | Size: 10 KiB After Width: | Height: | Size: 7.5 KiB |
Before Width: | Height: | Size: 1.4 KiB After Width: | Height: | Size: 1.3 KiB |
Before Width: | Height: | Size: 1.2 KiB After Width: | Height: | Size: 1.0 KiB |
Before Width: | Height: | Size: 1.0 KiB After Width: | Height: | Size: 970 B |
Before Width: | Height: | Size: 439 B After Width: | Height: | Size: 420 B |
Before Width: | Height: | Size: 763 B After Width: | Height: | Size: 710 B |
Before Width: | Height: | Size: 1.9 KiB After Width: | Height: | Size: 1.7 KiB |
Before Width: | Height: | Size: 2.0 KiB After Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 2.4 KiB After Width: | Height: | Size: 2.0 KiB |
Before Width: | Height: | Size: 1.1 KiB After Width: | Height: | Size: 1016 B |
Before Width: | Height: | Size: 2.6 KiB After Width: | Height: | Size: 2.1 KiB |
Before Width: | Height: | Size: 1.7 KiB After Width: | Height: | Size: 1.5 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.5 KiB |
Before Width: | Height: | Size: 1.1 KiB After Width: | Height: | Size: 1.0 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.6 KiB |
Before Width: | Height: | Size: 1.5 KiB After Width: | Height: | Size: 1.3 KiB |
@ -0,0 +1,24 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html >
|
||||
<head><title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="generator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<meta name="originator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<!-- html,3 -->
|
||||
<meta name="src" content="userhtml.tex">
|
||||
<link rel="stylesheet" type="text/css" href="userhtml.css">
|
||||
</head><body
|
||||
>
|
||||
<div class="footnote-text">
|
||||
<!--l. 195--><p class="noindent" ><span class="footnote-mark"><a
|
||||
id="fn2x0"><a
|
||||
id="x6-4002x2.1"></a> <sup class="textsuperscript">2</sup></a></span><span
|
||||
class="cmr-8">This is the normal situation when the pattern of the sparse matrix is symmetric, which is</span>
|
||||
<span
|
||||
class="cmr-8">equivalent to saying that the interaction between two variables is reciprocal. If the matrix pattern is</span>
|
||||
<span
|
||||
class="cmr-8">non-symmetric we may have one-way interactions, and these could cause a situation in which a</span>
|
||||
<span
|
||||
class="cmr-8">boundary point is not a halo point for its neighbour.</span></div>
|
||||
</body></html>
|
Before Width: | Height: | Size: 2.5 KiB After Width: | Height: | Size: 2.1 KiB |
Before Width: | Height: | Size: 2.1 KiB After Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 1.2 KiB After Width: | Height: | Size: 1.0 KiB |
Before Width: | Height: | Size: 2.0 KiB After Width: | Height: | Size: 1.7 KiB |
@ -0,0 +1,921 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html >
|
||||
<head><title>Extensions</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="generator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<meta name="originator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<!-- html,3 -->
|
||||
<meta name="src" content="userhtml.tex">
|
||||
<link rel="stylesheet" type="text/css" href="userhtml.css">
|
||||
</head><body
|
||||
>
|
||||
<!--l. 1--><div class="crosslinks"><p class="noindent">[<a
|
||||
href="userhtmlse11.html" >prev</a>] [<a
|
||||
href="userhtmlse11.html#tailuserhtmlse11.html" >prev-tail</a>] [<a
|
||||
href="userhtmlse9.html#tailuserhtmlse12.html">tail</a>] [<a
|
||||
href="userhtml.html# " >up</a>] </p></div>
|
||||
<h3 class="sectionHead"><span class="titlemark">12 </span> <a
|
||||
id="x19-14400012"></a>Extensions</h3>
|
||||
<!--l. 3--><p class="noindent" >The EXT, CUDA and RSB subdirectories contain a set of extensions to the base
|
||||
library. The extensions provide additional storage formats beyond the ones already
|
||||
contained in the base library, as well as interfaces to:
|
||||
<dl class="description"><dt class="description">
|
||||
<!--l. 8--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">SPGPU</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 8--><p class="noindent" >a CUDA library originally
|
||||
published as <a
|
||||
href="https://code.google.com/p/spgpu/" class="url" ><span
|
||||
class="cmtt-10">https://code.google.com/p/spgpu/</span></a> and now included
|
||||
in the <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">cuda</span></span></span> subdir, for computations on NVIDIA GPUs;
|
||||
</dd><dt class="description">
|
||||
<!--l. 11--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">LIBRSB</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 11--><p class="noindent" ><a
|
||||
href="http://sourceforge.net/projects/librsb/" class="url" ><span
|
||||
class="cmtt-10">http://sourceforge.net/projects/librsb/</span></a>, for computations on
|
||||
multicore parallel machines.</dd></dl>
|
||||
<!--l. 14--><p class="noindent" >The infrastructure laid out in the base library to allow for these extensions is detailed in
|
||||
the references <span class="cite">[<a
|
||||
href="userhtmlli2.html#XDesPat:11">20</a>, <a
|
||||
href="userhtmlli2.html#XCaFiRo:2014">21</a>, <a
|
||||
href="userhtmlli2.html#XSparse03">10</a>]</span>; the CUDA-specific data formats are described
|
||||
in <span class="cite">[<a
|
||||
href="userhtmlli2.html#XOurTechRep">22</a>]</span>.
|
||||
<!--l. 19--><p class="noindent" >
|
||||
<h4 class="subsectionHead"><span class="titlemark">12.1 </span> <a
|
||||
id="x19-14500012.1"></a>Using the extensions</h4>
|
||||
<!--l. 21--><p class="noindent" >A sample application using the PSBLAS extensions will contain the following
|
||||
steps:
|
||||
<ul class="itemize1">
|
||||
<li class="itemize">
|
||||
<!--l. 24--><p class="noindent" ><span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">USE</span></span></span> the appropriate modules (<span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_ext_mod</span></span></span>, <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_cuda_mod</span></span></span>);
|
||||
</li>
|
||||
<li class="itemize">
|
||||
<!--l. 26--><p class="noindent" >Declare a <span
|
||||
class="cmti-10">mold </span>variable of the necessary type (e.g.
|
||||
<span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_d_ell_sparse_mat</span></span></span>, <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_d_hlg_sparse_mat</span></span></span>, <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_d_vect_cuda</span></span></span>);
|
||||
</li>
|
||||
<li class="itemize">
|
||||
<!--l. 29--><p class="noindent" >Pass the mold variable to the base library interface where needed to ensure
|
||||
the appropriate dynamic type.</li></ul>
|
||||
|
||||
|
||||
|
||||
<!--l. 32--><p class="noindent" >Suppose you want to use the CUDA-enabled ELLPACK data structure; you would use a
|
||||
piece of code like this (and don’t forget, you need CUDA-side vectors along with the
|
||||
matrices):
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 85--><p class="noindent" >
|
||||
|
||||
|
||||
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-103">
|
||||
program my_cuda_test
|
||||
  use psb_base_mod
|
||||
  use psb_util_mod
|
||||
  use psb_ext_mod
|
||||
  use psb_cuda_mod
|
||||
  type(psb_dspmat_type) :: a, agpu
|
||||
  type(psb_d_vect_type) :: x, xg, bg
|
||||
|
||||
  real(psb_dpk_), allocatable :: xtmp(:)
|
||||
  type(psb_d_vect_cuda)       :: vmold
|
||||
  type(psb_d_elg_sparse_mat) :: aelg
|
||||
  type(psb_ctxt_type) :: ctxt
|
||||
  integer             :: iam, np
|
||||
|
||||
|
||||
  call psb_init(ctxt)
|
||||
  call psb_info(ctxt,iam,np)
|
||||
  call psb_cuda_init(ctxt, iam)
|
||||
|
||||
|
||||
  ! My own home-grown matrix generator
|
||||
  call gen_matrix(ctxt,idim,desc_a,a,x,info)
|
||||
  if (info /= 0) goto 9999
|
||||
|
||||
  call a%cscnv(agpu,info,mold=aelg)
|
||||
  if (info /= 0) goto 9999
|
||||
  xtmp = x%get_vect()
|
||||
  call xg%bld(xtmp,mold=vmold)
|
||||
  call bg%bld(size(xtmp),mold=vmold)
|
||||
|
||||
  ! Do sparse MV
|
||||
  call psb_spmm(done,agpu,xg,dzero,bg,desc_a,info)
|
||||
|
||||
|
||||
9999 continue
|
||||
  if (info == 0) then
|
||||
     write(*,*) '42'
|
||||
  else
|
||||
     write(*,*) 'Something went wrong ',info
|
||||
  end if
|
||||
|
||||
|
||||
  call psb_cuda_exit()
|
||||
  call psb_exit(ctxt)
|
||||
  stop
|
||||
end program my_cuda_test
|
||||
</pre>
|
||||
<!--l. 134--><p class="nopar" > </div></div>
|
||||
|
||||
|
||||
|
||||
<!--l. 139--><p class="indent" > A full example of this strategy can be seen in the <span
|
||||
class="cmtt-10">test/ext/kernel </span>and
|
||||
<span
|
||||
class="cmtt-10">test/cuda/kernel </span>subdirectories, where we provide sample programs to test the
|
||||
speed of the sparse matrix-vector product with the various data structures included
|
||||
in the library.
|
||||
<!--l. 146--><p class="noindent" >
|
||||
<h4 class="subsectionHead"><span class="titlemark">12.2 </span> <a
|
||||
id="x19-14600012.2"></a>Extensions’ Data Structures</h4>
|
||||
<!--l. 150--><p class="noindent" >Access to the facilities provided by the EXT library is mainly achieved through
|
||||
the data types that are provided within. The data classes are derived from
|
||||
the base classes in PSBLAS, through the Fortran 2003 mechanism of <span
|
||||
class="cmti-10">type</span>
|
||||
<span
|
||||
class="cmti-10">extension</span> <span class="cite">[<a
|
||||
href="userhtmlli2.html#XMRC:11">17</a>]</span>.
|
||||
<!--l. 155--><p class="indent" > The data classes are divided between the general purpose CPU extensions, the
|
||||
GPU interfaces and the RSB interfaces. In the description we will make use of the
|
||||
notation introduced in Table <a
|
||||
href="#x19-146001r21">21<!--tex4ht:ref: tab:notation --></a>.
|
||||
<div class="table">
|
||||
|
||||
|
||||
|
||||
<!--l. 160--><p class="indent" > <a
|
||||
id="x19-146001r21"></a><hr class="float"><div class="float"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<div class="caption"
|
||||
><span class="id">Table 21: </span><span
|
||||
class="content">Notation for parameters describing a sparse matrix</span></div><!--tex4ht:label?: x19-146001r21 -->
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 162--><p class="noindent" >
|
||||
<div class="tabular"> <table id="TBL-23" class="tabular"
|
||||
|
||||
><colgroup id="TBL-23-1g"><col
|
||||
id="TBL-23-1"><col
|
||||
id="TBL-23-2"></colgroup><tr
|
||||
class="hline"><td><hr></td><td><hr></td></tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-1-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-1-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">Name </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-1-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Description </span></td>
|
||||
</tr><tr
|
||||
class="hline"><td><hr></td><td><hr></td></tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-2-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-2-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">M </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-2-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Number of rows in matrix </span></td></tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-3-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-3-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">N </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-3-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Number of columns in matrix</span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-4-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-4-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">NZ </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-4-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Number of nonzeros in matrix </span></td></tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-5-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-5-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">AVGNZR </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-5-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Average number of nonzeros per row</span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-6-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-6-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">MAXNZR</span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-6-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Maximum number of nonzeros per row</span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-7-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-7-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">NDIAG </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-7-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Number of nonzero diagonals </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-8-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-8-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">AS </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-8-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Coefficients array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-9-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-9-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">IA </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-9-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Row indices array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-10-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-10-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">JA </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-10-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Column indices array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-11-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-11-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">IRP </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-11-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Row start pointers array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-12-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-12-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">JCP </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-12-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Column start pointers array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-13-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-13-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">NZR </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-13-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Number of nonzeros per row array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-14-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-14-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">OFFSET </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-14-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Offset for diagonals </span></td>
|
||||
</tr><tr
|
||||
class="hline"><td><hr></td><td><hr></td></tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-15-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-15-1"
|
||||
class="td11"> </td></tr></table> </div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
</div><hr class="endfloat" />
|
||||
</div>
|
||||
<!--l. 188--><p class="indent" > <hr class="figure"><div class="figure"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<a
|
||||
id="x19-146002r5"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 192--><p class="noindent" ><img
|
||||
src="mat.png" alt="PIC"
|
||||
width="147" height="147" >
|
||||
<br /> <div class="caption"
|
||||
><span class="id">Figure 5: </span><span
|
||||
class="content">Example of sparse matrix</span></div><!--tex4ht:label?: x19-146002r5 -->
|
||||
|
||||
|
||||
|
||||
<!--l. 198--><p class="indent" > </div><hr class="endfigure">
|
||||
<h4 class="subsectionHead"><span class="titlemark">12.3 </span> <a
|
||||
id="x19-14700012.3"></a>CPU-class extensions</h4>
|
||||
<!--l. 203--><p class="noindent" >
|
||||
<h5 class="likesubsubsectionHead"><a
|
||||
id="x19-148000"></a>ELLPACK</h5>
|
||||
<!--l. 205--><p class="noindent" >The ELLPACK/ITPACK format (shown in Figure <a
|
||||
href="#x19-148001r6">6<!--tex4ht:ref: fig:ell --></a>) comprises two 2-dimensional
|
||||
arrays <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">JA</span></span></span> with <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">M</span></span></span> rows and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">MAXNZR</span></span></span> columns, where <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">MAXNZR</span></span></span> is the maximum
|
||||
number of nonzeros in any row <span class="cite">[<span
|
||||
class="cmbx-10">?</span>]</span>. Each row of the arrays <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">JA</span></span></span> contains the
|
||||
coefficients and column indices; rows shorter than <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">MAXNZR</span></span></span> are padded with zero
|
||||
coefficients and appropriate column indices, e.g. the last valid one found in the same
|
||||
row.
|
||||
<!--l. 215--><p class="indent" > <hr class="figure"><div class="figure"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<a
|
||||
id="x19-148001r6"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 219--><p class="noindent" ><img
|
||||
src="ell.png" alt="PIC"
|
||||
width="233" height="233" >
|
||||
<br /> <div class="caption"
|
||||
><span class="id">Figure 6: </span><span
|
||||
class="content">ELLPACK compression of matrix in Figure <a
|
||||
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-148001r6 -->
|
||||
|
||||
|
||||
|
||||
<!--l. 225--><p class="indent" > </div><hr class="endfigure">
|
||||
<a
|
||||
id="x19-148002r1"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 229--><p class="indent" > <hr class="float"><div class="float"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<!--l. 231-->
|
||||
<pre class="lstlisting" id="listing-168"><span class="label"><a
|
||||
id="x19-148003r1"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">do</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">i</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">=1,</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">n</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148004r2"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">t</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">=0</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148005r3"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">do</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">j</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">=1,</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">maxnzr</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148006r4"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">t</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">=</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">t</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">+</span></span><span style="color:#000000"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">as</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">(</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">i</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">,</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">j</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">)*</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">x</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">(</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">ja</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">(</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">i</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">,</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">j</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">))</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148007r5"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">end</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">do</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148008r6"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">y</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">(</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">i</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">)</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">=</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">t</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148009r7"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">end</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">do</span></span></pre>
|
||||
|
||||
<a
|
||||
id="x19-148010r1"></a>
|
||||
<a
|
||||
id="x19-148011"></a>
|
||||
<span
|
||||
class="cmbx-10">Algorithm</span><span
|
||||
class="cmbx-10"> 1:</span>  Matrix-Vector product in ELL format
|
||||
|
||||
|
||||
|
||||
</div><hr class="endfloat" />
|
||||
<!--l. 242--><p class="indent" > The matrix-vector product <span
|
||||
class="cmmi-10">y </span>= <span
|
||||
class="cmmi-10">Ax </span>can be computed with the code shown in
|
||||
Alg. <a
|
||||
href="#x19-148010r1">1<!--tex4ht:ref: alg:ell --></a>; it costs one memory write per outer iteration, plus three memory reads and
|
||||
two floating-point operations per inner iteration.
|
||||
<!--l. 247--><p class="indent" > Unless all rows have exactly the same number of nonzeros, some of the coefficients
|
||||
in the <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span> array will be zeros; therefore this data structure will have an overhead both
|
||||
in terms of memory space and redundant operations (multiplications by zero). The
|
||||
overhead can be acceptable if:
|
||||
<ol class="enumerate1" >
|
||||
<li
|
||||
class="enumerate" id="x19-148013x1">
|
||||
<!--l. 253--><p class="noindent" >The maximum number of nonzeros per row is not much larger than the
|
||||
average;
|
||||
</li>
|
||||
<li
|
||||
class="enumerate" id="x19-148015x2">
|
||||
<!--l. 255--><p class="noindent" >The regularity of the data structure allows for faster code, e.g. by allowing
|
||||
vectorization, thereby offsetting the additional storage requirements.</li></ol>
|
||||
<!--l. 259--><p class="noindent" >In the extreme case where the input matrix has one full row, the ELLPACK
|
||||
structure would require more memory than the normal 2D array storage. The
|
||||
ELLPACK storage format was very popular in the vector computing days; in
|
||||
modern CPUs it is not quite as popular, but it is the basis for many GPU
|
||||
formats.
|
||||
<!--l. 265--><p class="indent" > The relevant data type is <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_T_ell_sparse_mat</span></span></span>:
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 281--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-104">
|
||||
  type, extends(psb_d_base_sparse_mat) :: psb_d_ell_sparse_mat
|
||||
    !
|
||||
    ! ITPACK/ELL format, extended.
|
||||
    !
|
||||
|
||||
    integer(psb_ipk_), allocatable :: irn(:), ja(:,:), idiag(:)
|
||||
    real(psb_dpk_), allocatable :: val(:,:)
|
||||
|
||||
  contains
|
||||
    ....
|
||||
  end type psb_d_ell_sparse_mat
|
||||
</pre>
|
||||
<!--l. 295--><p class="nopar" > </div></div>
|
||||
<h5 class="likesubsubsectionHead"><a
|
||||
id="x19-149000"></a>Hacked ELLPACK</h5>
|
||||
<!--l. 303--><p class="noindent" >The <span
|
||||
class="cmti-10">hacked ELLPACK </span>(<span
|
||||
class="cmbx-10">HLL</span>) format alleviates the main problem of the ELLPACK
|
||||
format, that is, the amount of memory required by padding for sparse matrices in
|
||||
which the maximum row length is larger than the average.
|
||||
|
||||
|
||||
|
||||
<!--l. 308--><p class="indent" > The number of elements allocated to padding is
|
||||
[(<span
|
||||
class="cmmi-10">m</span><span
|
||||
class="cmsy-10">*</span><span
|
||||
class="cmmi-10">maxNR</span>) <span
|
||||
class="cmsy-10">- </span>(<span
|
||||
class="cmmi-10">m</span><span
|
||||
class="cmsy-10">*</span><span
|
||||
class="cmmi-10">avgNR</span>) = <span
|
||||
class="cmmi-10">m</span><span
|
||||
class="cmsy-10">* </span>(<span
|
||||
class="cmmi-10">maxNR</span><span
|
||||
class="cmsy-10">-</span><span
|
||||
class="cmmi-10">avgNR</span>)] for both <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">JA</span></span></span> arrays,
|
||||
where <span
|
||||
class="cmmi-10">m </span>is equal to the number of rows of the matrix, <span
|
||||
class="cmmi-10">maxNR </span>is the maximum
|
||||
number of nonzero elements in any row and <span
|
||||
class="cmmi-10">avgNR </span>is the average number of
|
||||
nonzeros. Therefore a single densely populated row can seriously affect the total size
|
||||
of the allocation.
|
||||
<!--l. 317--><p class="indent" > To limit this effect, in the HLL format we break the original matrix into equally
|
||||
sized groups of rows (called <span
|
||||
class="cmti-10">hacks</span>), and then store these groups as independent
|
||||
matrices in ELLPACK format. The groups can be arranged selecting rows in an
|
||||
arbitrary manner; indeed, if the rows are sorted by decreasing number of nonzeros
|
||||
we obtain essentially the JAgged Diagonals format. If the rows are not in the original
|
||||
order, then an additional vector <span
|
||||
class="cmti-10">rIdx </span>is required, storing the actual row index for
|
||||
each row in the data structure.
|
||||
<!--l. 327--><p class="indent" > The multiple ELLPACK-like buffers are stacked together inside a single, one
|
||||
dimensional array; an additional vector <span
|
||||
class="cmti-10">hackOffsets </span>is provided to keep track of the
|
||||
individual submatrices. All hacks have the same number of rows <span
|
||||
class="cmti-10">hackSize</span>; hence, the
|
||||
<span
|
||||
class="cmti-10">hackOffsets </span>vector is an array of (<span
|
||||
class="cmmi-10">m/hackSize</span>) + 1 elements, each one pointing to
|
||||
the first index of a submatrix inside the stacked <span
|
||||
class="cmti-10">cM</span>/<span
|
||||
class="cmti-10">rP </span>buffers, plus an additional
|
||||
element pointing past the end of the last block, where the next one would begin. We
|
||||
thus have the property that the elements of the <span
|
||||
class="cmmi-10">k</span>-th <span
|
||||
class="cmti-10">hack </span>are stored between
|
||||
<span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">hackOffsets[k]</span></span></span> and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">hackOffsets[k+1]</span></span></span>, similarly to what happens in the CSR
|
||||
format.
|
||||
<!--l. 342--><p class="indent" > <hr class="figure"><div class="figure"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<a
|
||||
id="x19-149001r7"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 346--><p class="noindent" ><img
|
||||
src="hll.png" alt="PIC"
|
||||
width="248" height="248" >
|
||||
<br /> <div class="caption"
|
||||
><span class="id">Figure 7: </span><span
|
||||
class="content">Hacked ELLPACK compression of matrix in Figure <a
|
||||
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-149001r7 -->
|
||||
|
||||
|
||||
|
||||
<!--l. 352--><p class="indent" > </div><hr class="endfigure">
|
||||
<!--l. 354--><p class="indent" > With this data structure a very long row only affects one hack, and therefore the
|
||||
additional memory is limited to the hack in which the row appears.
|
||||
<!--l. 358--><p class="indent" > The relevant data type is <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_T_hll_sparse_mat</span></span></span>:
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 374--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-105">
|
||||
  type, extends(psb_d_base_sparse_mat) :: psb_d_hll_sparse_mat
|
||||
    !
|
||||
    ! HLL format. (Hacked ELL)
|
||||
    !
|
||||
    integer(psb_ipk_) :: hksz
|
||||
    integer(psb_ipk_), allocatable :: irn(:), ja(:), idiag(:), hkoffs(:)
|
||||
    real(psb_dpk_), allocatable :: val(:)
|
||||
|
||||
  contains
|
||||
   ....
|
||||
  end type
|
||||
</pre>
|
||||
<!--l. 388--><p class="nopar" > </div></div>
|
||||
<h5 class="likesubsubsectionHead"><a
|
||||
id="x19-150000"></a>Diagonal storage</h5>
|
||||
<!--l. 396--><p class="noindent" >The DIAgonal (DIA) format (shown in Figure <a
|
||||
href="#x19-150001r8">8<!--tex4ht:ref: fig:dia --></a>) has a 2-dimensional array <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span>
|
||||
containing in each column the coefficients along a diagonal of the matrix, and an
|
||||
integer array <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">OFFSET</span></span></span> that determines where each diagonal starts. The diagonals in <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span>
|
||||
are padded with zeros as necessary.
|
||||
<!--l. 402--><p class="indent" > The code to compute the matrix-vector product <span
|
||||
class="cmmi-10">y </span>= <span
|
||||
class="cmmi-10">Ax </span>is shown in Alg. <a
|
||||
href="#x19-150003r2">2<!--tex4ht:ref: alg:dia --></a>; it
|
||||
costs one memory read per outer iteration, plus three memory reads, one memory
|
||||
write and two floating-point operations per inner iteration. The accesses to
|
||||
<span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">x</span></span></span> are in strict sequential order, therefore no indirect addressing is
|
||||
required.
|
||||
<!--l. 409--><p class="indent" > <hr class="figure"><div class="figure"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<a
|
||||
id="x19-150001r8"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 413--><p class="noindent" ><img
|
||||
src="dia.png" alt="PIC"
|
||||
width="248" height="248" >
|
||||
<br /> <div class="caption"
|
||||
><span class="id">Figure 8: </span><span
|
||||
class="content">DIA compression of matrix in Figure <a
|
||||
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-150001r8 -->
|
||||
|
||||
|
||||
|
||||
<!--l. 419--><p class="indent" > </div><hr class="endfigure">
|
||||
<a
|
||||
id="x19-150002r2"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 423--><p class="indent" > <hr class="float"><div class="float"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 437--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-106">
|
||||
    do j=1,ndiag
|
||||
      if (offset(j) > 0) then
|
||||
        ir1 = 1; ir2 = m - offset(j);
|
||||
      else
|
||||
        ir1 = 1 - offset(j); ir2 = m;
|
||||
      end if
|
||||
      do i=ir1,ir2
|
||||
        y(i) = y(i) + alpha*as(i,j)*x(i+offset(j))
|
||||
      end do
|
||||
    end do
|
||||
</pre>
|
||||
<!--l. 450--><p class="nopar" > </div></div>
|
||||
<a
|
||||
id="x19-150003r2"></a>
|
||||
<a
|
||||
id="x19-150004"></a>
|
||||
<span
|
||||
class="cmbx-10">Algorithm</span><span
|
||||
class="cmbx-10"> 2:</span>  Matrix-Vector product in DIA format
|
||||
|
||||
|
||||
|
||||
</div><hr class="endfloat" />
|
||||
<!--l. 458--><p class="indent" > The relevant data type is <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_T_dia_sparse_mat</span></span></span>:
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 473--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-107">
|
||||
  type, extends(psb_d_base_sparse_mat) :: psb_d_dia_sparse_mat
|
||||
    !
|
||||
    ! DIA format, extended.
|
||||
    !
|
||||
|
||||
    integer(psb_ipk_), allocatable :: offset(:)
|
||||
    integer(psb_ipk_) :: nzeros
|
||||
    real(psb_dpk_), allocatable :: data(:,:)
|
||||
|
||||
  end type
|
||||
</pre>
|
||||
<!--l. 486--><p class="nopar" > </div></div>
|
||||
<h5 class="likesubsubsectionHead"><a
|
||||
id="x19-151000"></a>Hacked DIA</h5>
|
||||
<!--l. 495--><p class="noindent" >Storage by DIAgonals is an attractive option for matrices whose coefficients are
|
||||
located on a small set of diagonals, since they do away with storing explicitly the
|
||||
indices and therefore reduce significantly memory traffic. However, having a few
|
||||
coefficients outside of the main set of diagonals may significantly increase the
|
||||
amount of needed padding; moreover, while the DIA code is easily vectorized,
|
||||
it does not necessarily make optimal use of the memory hierarchy. While
|
||||
processing each diagonal we are updating entries in the output vector <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">y</span></span></span>,
|
||||
which is then accessed multiple times; if the vector <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">y</span></span></span> is too large to remain
|
||||
in the cache memory, the associated cache miss penalty is paid multiple
|
||||
times.
|
||||
<!--l. 507--><p class="indent" > The <span
|
||||
class="cmti-10">hacked DIA </span>(<span
|
||||
class="cmbx-10">HDIA</span>) format was designed to contain the amount of padding,
|
||||
by breaking the original matrix into equally sized groups of rows (<span
|
||||
class="cmti-10">hacks</span>), and then
|
||||
storing these groups as independent matrices in DIA format. This approach is similar
|
||||
to that of HLL, and requires using an offset vector for each submatrix. Again,
|
||||
similarly to HLL, the various submatrices are stacked inside a linear array to
|
||||
improve memory management. The fact that the matrix is accessed in slices
|
||||
helps in reducing cache misses, especially regarding accesses to the vector
|
||||
<span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">y</span></span></span>.
|
||||
<!--l. 519--><p class="indent" > An additional vector <span
|
||||
class="cmti-10">hackOffsets </span>is provided to complete the matrix format; given
|
||||
that <span
|
||||
class="cmti-10">hackSize </span>is the number of rows of each hack, the <span
|
||||
class="cmti-10">hackOffsets </span>vector is
|
||||
an array of (<span
|
||||
class="cmmi-10">m/hackSize</span>) + 1 elements, pointing to the first diagonal offset of a
|
||||
submatrix inside the stacked <span
|
||||
class="cmti-10">offsets </span>buffers, plus an additional element equal to the
|
||||
number of nonzero diagonals in the whole matrix. We thus have the property that
|
||||
the number of diagonals of the <span
|
||||
class="cmmi-10">k</span>-th <span
|
||||
class="cmti-10">hack </span>is given by <span
|
||||
class="cmti-10">hackOffsets[k+1] -</span>
|
||||
<span
|
||||
class="cmti-10">hackOffsets[k]</span>.
|
||||
<!--l. 529--><p class="indent" > <hr class="figure"><div class="figure"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<a
|
||||
id="x19-151001r9"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 533--><p class="noindent" ><img
|
||||
src="hdia.png" alt="PIC"
|
||||
width="248" height="248" >
|
||||
<br /> <div class="caption"
|
||||
><span class="id">Figure 9: </span><span
|
||||
class="content">Hacked DIA compression of matrix in Figure <a
|
||||
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-151001r9 -->
|
||||
|
||||
|
||||
|
||||
<!--l. 539--><p class="indent" > </div><hr class="endfigure">
|
||||
<!--l. 541--><p class="indent" > The relevant data type is <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_T_hdia_sparse_mat</span></span></span>:
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 568--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-108">
|
||||
  type pm
|
||||
     real(psb_dpk_), allocatable  :: data(:,:)
|
||||
  end type pm
|
||||
|
||||
  type po
|
||||
     integer(psb_ipk_), allocatable  :: off(:)
|
||||
  end type po
|
||||
|
||||
  type, extends(psb_d_base_sparse_mat) :: psb_d_hdia_sparse_mat
|
||||
    !
|
||||
    ! HDIA format, extended.
|
||||
    !
|
||||
|
||||
    type(pm), allocatable :: hdia(:)
|
||||
    type(po), allocatable :: offset(:)
|
||||
    integer(psb_ipk_) :: nblocks, nzeros
|
||||
    integer(psb_ipk_) :: hack = 64
|
||||
    integer(psb_long_int_k_) :: dim=0
|
||||
|
||||
  contains
|
||||
   ....
|
||||
  end type
|
||||
</pre>
|
||||
<!--l. 593--><p class="nopar" > </div></div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<h4 class="subsectionHead"><span class="titlemark">12.4 </span> <a
|
||||
id="x19-15200012.4"></a>CUDA-class extensions</h4>
|
||||
<!--l. 4--><p class="noindent" >For computing with CUDA we define a dual memorization strategy in which each
|
||||
variable on the CPU (&ldquo;host&rdquo;) side has a GPU (&ldquo;device&rdquo;) side. When a GPU-type
|
||||
variable is initialized, the data contained is (usually) the same on both sides. Each
|
||||
operator invoked on the variable may change the data so that only the host side or
|
||||
the device side is up-to-date.
|
||||
<!--l. 11--><p class="indent" > Keeping track of the updates to data in the variables is essential: we want to
|
||||
perform most computations on the GPU, but we cannot afford the time needed to
|
||||
move data between the host memory and the device memory because the bandwidth
|
||||
of the interconnection bus would become the main bottleneck of the computation.
|
||||
Thus, each and every computational routine in the library is built according to the
|
||||
following principles:
|
||||
<ul class="itemize1">
|
||||
<li class="itemize">
|
||||
<!--l. 18--><p class="noindent" >If the data type being handled is GPU-enabled, make sure that its device
|
||||
copy is up to date, perform any arithmetic operation on the GPU, and
|
||||
if the data has been altered as a result, mark the main-memory copy as
|
||||
outdated.
|
||||
</li>
|
||||
<li class="itemize">
|
||||
<!--l. 22--><p class="noindent" >The main-memory copy is never updated unless this is requested by the user
|
||||
either
|
||||
<dl class="description"><dt class="description">
|
||||
<!--l. 25--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">explicitly</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 25--><p class="noindent" >by invoking a synchronization method;
|
||||
</dd><dt class="description">
|
||||
<!--l. 26--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">implicitly</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 26--><p class="noindent" >by invoking a method that involves other data items that are not
|
||||
GPU-enabled, e.g., by assignment of a vector to a normal array.</dd></dl>
|
||||
</li></ul>
|
||||
<!--l. 31--><p class="noindent" >In this way, data items are put on the GPU memory &ldquo;on demand&rdquo; and remain there as
|
||||
long as &ldquo;normal&rdquo; computations are carried out. As an example, the following call to a
|
||||
matrix-vector product
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 39--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-109">
|
||||
    call psb_spmm(alpha,a,x,beta,y,desc_a,info)
|
||||
</pre>
|
||||
<!--l. 43--><p class="nopar" > </div></div>
|
||||
|
||||
|
||||
|
||||
<!--l. 47--><p class="noindent" >will transparently and automatically be performed on the GPU whenever all three data
|
||||
inputs <code class="lstinline"><span style="color:#000000">a</span></code>, <code class="lstinline"><span style="color:#000000">x</span></code> and <code class="lstinline"><span style="color:#000000">y</span></code> are GPU-enabled. If a program makes many such calls sequentially,
|
||||
then
|
||||
<ul class="itemize1">
|
||||
<li class="itemize">
|
||||
<!--l. 52--><p class="noindent" >The first kernel invocation will find the data in main memory, and will
|
||||
copy it to the GPU memory, thus incurring a significant overhead; the
|
||||
result is however <span
|
||||
class="cmti-10">not </span>copied back, and therefore:
|
||||
</li>
|
||||
<li class="itemize">
|
||||
<!--l. 56--><p class="noindent" >Subsequent kernel invocations involving the same vector will find the data
|
||||
on the GPU side so that they will run at full speed.</li></ul>
|
||||
<!--l. 60--><p class="noindent" >For all invocations after the first the only data that will have to be transferred to/from
|
||||
the main memory will be the scalars <code class="lstinline"><span style="color:#000000">alpha</span></code> and <code class="lstinline"><span style="color:#000000">beta</span></code>, and the return code
|
||||
<code class="lstinline"><span style="color:#000000">info</span></code>.
|
||||
<!--l. 64--><p class="indent" >
|
||||
<dl class="description"><dt class="description">
|
||||
<!--l. 65--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">Vectors:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 65--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_vect_gpu</span></code> provides a GPU-enabled extension of
|
||||
the inner type <code class="lstinline"><span style="color:#000000">psb_T_base_vect_type</span></code>, and must be used together with
|
||||
the other inner matrix type to make full use of the GPU computational
|
||||
capabilities;
|
||||
</dd><dt class="description">
|
||||
<!--l. 69--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">CSR:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 69--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_csrg_sparse_mat</span></code> provides an interface to the GPU
|
||||
version of CSR available in the NVIDIA CuSPARSE library;
|
||||
</dd><dt class="description">
|
||||
<!--l. 72--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">HYB:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 72--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_hybg_sparse_mat</span></code> provides an interface to the HYB
|
||||
GPU storage available in the NVIDIA CuSPARSE library. The internal
|
||||
structure is opaque, hence the host side is just CSR; the HYB data format
|
||||
is only available up to CUDA version 10.
|
||||
</dd><dt class="description">
|
||||
<!--l. 77--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">ELL:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 77--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_elg_sparse_mat</span></code> provides an interface to the
|
||||
ELLPACK implementation from SPGPU;
|
||||
|
||||
|
||||
|
||||
</dd><dt class="description">
|
||||
<!--l. 80--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">HLL:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 80--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_hlg_sparse_mat</span></code> provides an interface to the Hacked
|
||||
ELLPACK implementation from SPGPU;
|
||||
</dd><dt class="description">
|
||||
<!--l. 82--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">HDIA:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 82--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_hdiag_sparse_mat</span></code> provides an interface to the
|
||||
Hacked DIAgonals implementation from SPGPU;</dd></dl>
|
||||
|
||||
|
||||
|
||||
<!--l. 87--><div class="crosslinks"><p class="noindent">[<a
|
||||
href="userhtmlse11.html" >prev</a>] [<a
|
||||
href="userhtmlse11.html#tailuserhtmlse11.html" >prev-tail</a>] [<a
|
||||
href="userhtmlse12.html" >front</a>] [<a
|
||||
href="userhtml.html# " >up</a>] </p></div>
|
||||
<!--l. 87--><p class="indent" > <a
|
||||
id="tailuserhtmlse12.html"></a>
|
||||
</body></html>
|
@ -0,0 +1,299 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html >
|
||||
<head><title>CUDA Environment Routines</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="generator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<meta name="originator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<!-- html,3 -->
|
||||
<meta name="src" content="userhtml.tex">
|
||||
<link rel="stylesheet" type="text/css" href="userhtml.css">
|
||||
</head><body
|
||||
>
|
||||
<!--l. 87--><div class="crosslinks"><p class="noindent">[<a
|
||||
href="userhtmlse12.html" >prev</a>] [<a
|
||||
href="userhtmlse12.html#tailuserhtmlse12.html" >prev-tail</a>] [<a
|
||||
href="userhtmlse13.html#tailuserhtmlse13.html">tail</a>] [<a
|
||||
href="userhtml.html# " >up</a>] </p></div>
|
||||
<h3 class="sectionHead"><span class="titlemark">13 </span> <a
|
||||
id="x20-15300013"></a>CUDA Environment Routines</h3>
|
||||
<!--l. 91--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-154000"></a>psb_cuda_init &mdash; Initializes PSBLAS-CUDA environment</h4>
|
||||
<a
|
||||
id="Q1-20-191"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 99--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-110">
|
||||
call psb_cuda_init(ctxt [, device])
|
||||
</pre>
|
||||
<!--l. 103--><p class="nopar" > </div></div>
|
||||
<!--l. 108--><p class="noindent" >This subroutine initializes the PSBLAS-CUDA environment.
|
||||
<dl class="description"><dt class="description">
|
||||
<!--l. 110--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">Type:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 110--><p class="noindent" >Synchronous.
|
||||
</dd><dt class="description">
|
||||
<!--l. 111--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">On Entry</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 111--><p class="noindent" >
|
||||
</dd><dt class="description">
|
||||
<!--l. 112--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">device</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 112--><p class="noindent" >ID of CUDA device to attach to.<br
|
||||
class="newline" />Scope: <span
|
||||
class="cmbx-10">local</span>.<br
|
||||
class="newline" />Type: <span
|
||||
class="cmbx-10">optional</span>.<br
|
||||
class="newline" />Intent: <span
|
||||
class="cmbx-10">in</span>.<br
|
||||
class="newline" />Specified as: an integer value.  Default: use <code class="lstinline"><span style="color:#000000">mod</span><span style="color:#000000">(</span><span style="color:#000000">iam</span><span style="color:#000000">,</span><span style="color:#000000">ngpu</span><span style="color:#000000">)</span></code> where <code class="lstinline"><span style="color:#000000">iam</span></code> is
|
||||
the calling process index and <code class="lstinline"><span style="color:#000000">ngpu</span></code> is the total number of CUDA devices
|
||||
available on the current node.</dd></dl>
|
||||
<!--l. 123--><p class="noindent" ><span
|
||||
class="cmbx-12">Notes</span>
|
||||
|
||||
|
||||
|
||||
<ol class="enumerate1" >
|
||||
<li
|
||||
class="enumerate" id="x20-154002x1">
|
||||
<!--l. 125--><p class="noindent" >A call to this routine must precede any other PSBLAS-CUDA call.</li></ol>
|
||||
<!--l. 129--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-155000"></a>psb_cuda_exit &mdash; Exit from PSBLAS-CUDA environment</h4>
|
||||
<a
|
||||
id="Q1-20-193"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 137--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-111">
|
||||
call psb_cuda_exit(ctxt)
|
||||
</pre>
|
||||
<!--l. 141--><p class="nopar" > </div></div>
|
||||
<!--l. 146--><p class="noindent" >This subroutine exits from the PSBLAS CUDA context.
|
||||
<dl class="description"><dt class="description">
|
||||
<!--l. 148--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">Type:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 148--><p class="noindent" >Synchronous.
|
||||
</dd><dt class="description">
|
||||
<!--l. 149--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">On Entry</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 149--><p class="noindent" >
|
||||
</dd><dt class="description">
|
||||
<!--l. 150--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">ctxt</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 150--><p class="noindent" >the communication context identifying the virtual parallel machine.<br
|
||||
class="newline" />Scope: <span
|
||||
class="cmbx-10">global</span>.<br
|
||||
class="newline" />Type: <span
|
||||
class="cmbx-10">required</span>.<br
|
||||
class="newline" />Intent: <span
|
||||
class="cmbx-10">in</span>.<br
|
||||
class="newline" />Specified as: an integer variable.</dd></dl>
|
||||
<!--l. 161--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-156000"></a>psb_cuda_DeviceSync &mdash; Synchronize CUDA device</h4>
|
||||
<a
|
||||
id="Q1-20-195"></a>
|
||||
|
||||
|
||||
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 169--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-112">
|
||||
call psb_cuda_DeviceSync()
|
||||
</pre>
|
||||
<!--l. 173--><p class="nopar" > </div></div>
|
||||
<!--l. 178--><p class="noindent" >This subroutine ensures that all previously invoked kernels, i.e. all invocations of
|
||||
CUDA-side code, have completed.
|
||||
<!--l. 182--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-157000"></a>psb_cuda_getDeviceCount </h4>
|
||||
<a
|
||||
id="Q1-20-197"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 190--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-113">
|
||||
ngpus =  psb_cuda_getDeviceCount()
|
||||
</pre>
|
||||
<!--l. 194--><p class="nopar" > </div></div>
|
||||
<!--l. 199--><p class="noindent" >Get number of devices available on current computing node.
|
||||
<!--l. 201--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-158000"></a>psb_cuda_getDevice </h4>
|
||||
<a
|
||||
id="Q1-20-199"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 209--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-114">
|
||||
dev = psb_cuda_getDevice()
|
||||
</pre>
|
||||
<!--l. 213--><p class="nopar" > </div></div>
|
||||
<!--l. 218--><p class="noindent" >Get device in use by current process.
|
||||
<!--l. 220--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-159000"></a>psb_cuda_setDevice </h4>
|
||||
<a
|
||||
id="Q1-20-201"></a>
|
||||
|
||||
|
||||
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 228--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-115">
|
||||
info = psb_cuda_setDevice(dev)
|
||||
</pre>
|
||||
<!--l. 232--><p class="nopar" > </div></div>
|
||||
<!--l. 237--><p class="noindent" >Set device to be used by current process.
|
||||
<!--l. 239--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-160000"></a>psb_cuda_DeviceHasUVA </h4>
|
||||
<a
|
||||
id="Q1-20-203"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 247--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-116">
|
||||
hasUva = psb_cuda_DeviceHasUVA()
|
||||
</pre>
|
||||
<!--l. 251--><p class="nopar" > </div></div>
|
||||
<!--l. 256--><p class="noindent" >Returns true if device currently in use supports UVA (Unified Virtual Addressing).
|
||||
<!--l. 259--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-161000"></a>psb_cuda_WarpSize </h4>
|
||||
<a
|
||||
id="Q1-20-205"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 267--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-117">
|
||||
nw = psb_cuda_WarpSize()
|
||||
</pre>
|
||||
<!--l. 271--><p class="nopar" > </div></div>
|
||||
<!--l. 276--><p class="noindent" >Returns the warp size.
|
||||
<!--l. 279--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-162000"></a>psb_cuda_MultiProcessors </h4>
|
||||
<a
|
||||
id="Q1-20-207"></a>
|
||||
|
||||
|
||||
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 287--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-118">
|
||||
nmp = psb_cuda_MultiProcessors()
|
||||
</pre>
|
||||
<!--l. 291--><p class="nopar" > </div></div>
|
||||
<!--l. 296--><p class="noindent" >Returns the number of multiprocessors in the CUDA device.
|
||||
<!--l. 298--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-163000"></a>psb_cuda_MaxThreadsPerMP </h4>
|
||||
<a
|
||||
id="Q1-20-209"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 306--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-119">
|
||||
nt = psb_cuda_MaxThreadsPerMP()
|
||||
</pre>
|
||||
<!--l. 310--><p class="nopar" > </div></div>
|
||||
<!--l. 315--><p class="noindent" >Returns the maximum number of threads per multiprocessor.
|
||||
<!--l. 318--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-164000"></a>psb_cuda_MaxRegistersPerBlock </h4>
|
||||
<a
|
||||
id="Q1-20-211"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 326--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-120">
|
||||
nr = psb_cuda_MaxRegistersPerBlock()
|
||||
</pre>
|
||||
<!--l. 330--><p class="nopar" > </div></div>
|
||||
<!--l. 335--><p class="noindent" >Returns the maximum number of registers per thread block.
|
||||
<!--l. 338--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-165000"></a>psb_cuda_MemoryClockRate </h4>
|
||||
<a
|
||||
id="Q1-20-213"></a>
|
||||
|
||||
|
||||
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 346--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-121">
|
||||
cl = psb_cuda_MemoryClockRate()
|
||||
</pre>
|
||||
<!--l. 350--><p class="nopar" > </div></div>
|
||||
<!--l. 355--><p class="noindent" >Returns the memory clock rate in kHz, as an integer.
|
||||
<!--l. 357--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-166000"></a>psb_cuda_MemoryBusWidth </h4>
|
||||
<a
|
||||
id="Q1-20-215"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 365--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-122">
|
||||
nb = psb_cuda_MemoryBusWidth()
|
||||
</pre>
|
||||
<!--l. 369--><p class="nopar" > </div></div>
|
||||
<!--l. 374--><p class="noindent" >Returns the memory bus width in bits.
|
||||
<!--l. 376--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-167000"></a>psb_cuda_MemoryPeakBandwidth </h4>
|
||||
<a
|
||||
id="Q1-20-217"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 384--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-123">
|
||||
bw = psb_cuda_MemoryPeakBandwidth()
|
||||
</pre>
|
||||
<!--l. 388--><p class="nopar" > </div></div>
|
||||
<!--l. 392--><p class="noindent" >Returns the peak memory bandwidth in MB/s (real double precision).
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<!--l. 126--><p class="indent" >
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<!--l. 2--><div class="crosslinks"><p class="noindent">[<a
|
||||
href="userhtmlse12.html" >prev</a>] [<a
|
||||
href="userhtmlse12.html#tailuserhtmlse12.html" >prev-tail</a>] [<a
|
||||
href="userhtmlse13.html" >front</a>] [<a
|
||||
href="userhtml.html# " >up</a>] </p></div>
|
||||
<!--l. 2--><p class="indent" > <a
|
||||
id="tailuserhtmlse13.html"></a>
|
||||
</body></html>
|
@ -0,0 +1,395 @@
|
||||
|
||||
\subsection{CUDA-class extensions}
|
||||
|
||||
For computing with CUDA we define a dual storage strategy in
|
||||
which each variable on the CPU (``host'') side has a GPU (``device'')
|
||||
side. When a GPU-type variable is initialized, the data contained is
|
||||
(usually) the same on both sides. Each operator invoked on the
|
||||
variable may change the data so that only the host side or the device
|
||||
side is up-to-date.
|
||||
|
||||
Keeping track of the updates to data in the variables is essential: we want
|
||||
to perform most computations on the GPU, but we cannot afford the time
|
||||
needed to move data between the host memory and the device memory
|
||||
because the bandwidth of the interconnection bus would become the main
|
||||
bottleneck of the computation. Thus, each and every computational
|
||||
routine in the library is built according to the following principles:
|
||||
\begin{itemize}
|
||||
\item If the data type being handled is {GPU}-enabled, make sure that
|
||||
its device copy is up to date, perform any arithmetic operation on
|
||||
the {GPU}, and if the data has been altered as a result, mark
|
||||
the main-memory copy as outdated.
|
||||
\item The main-memory copy is never updated unless this is requested
|
||||
by the user either
|
||||
\begin{description}
|
||||
\item[explicitly] by invoking a synchronization method;
|
||||
\item[implicitly] by invoking a method that involves other data items
|
||||
that are not {GPU}-enabled, e.g., by assignment of a vector to a
|
||||
normal array.
|
||||
\end{description}
|
||||
\end{itemize}
|
||||
In this way, data items are put on the {GPU} memory ``on demand'' and
|
||||
remain there as long as ``normal'' computations are carried out.
|
||||
As an example, the following call to a matrix-vector product
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
|
||||
call psb_spmm(alpha,a,x,beta,y,desc_a,info)
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
call psb_spmm(alpha,a,x,beta,y,desc_a,info)
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
will transparently and automatically be performed on the {GPU} whenever
|
||||
all three data inputs \fortinline|a|, \fortinline|x| and
|
||||
\fortinline|y| are {GPU}-enabled. If a program makes many such calls
|
||||
sequentially, then
|
||||
\begin{itemize}
|
||||
\item The first kernel invocation will find the data in main memory,
|
||||
and will copy it to the {GPU} memory, thus incurring a significant
|
||||
overhead; the result is however \emph{not} copied back, and
|
||||
therefore:
|
||||
\item Subsequent kernel invocations involving the same vector will
|
||||
find the data on the {GPU} side so that they will run at full
|
||||
speed.
|
||||
\end{itemize}
|
||||
For all invocations after the first the only data that will have to be
|
||||
transferred to/from the main memory will be the scalars \fortinline|alpha|
|
||||
and \fortinline|beta|, and the return code \fortinline|info|.
|
||||
|
||||
\begin{description}
|
||||
\item[Vectors:] The data type \fortinline|psb_T_vect_cuda| provides a
|
||||
GPU-enabled extension of the inner type \fortinline|psb_T_base_vect_type|,
|
||||
and must be used together with the other inner matrix type to make
|
||||
full use of the GPU computational capabilities;
|
||||
\item[CSR:] The data type \fortinline|psb_T_csrg_sparse_mat| provides an
|
||||
interface to the GPU version of CSR available in the NVIDIA CuSPARSE
|
||||
library;
|
||||
\item[HYB:] The data type \fortinline|psb_T_hybg_sparse_mat| provides an
|
||||
interface to the HYB GPU storage available in the NVIDIA CuSPARSE
|
||||
library. The internal structure is opaque, hence the host side is
|
||||
just CSR; the HYB data format is only available up to CUDA version
|
||||
10.
|
||||
\item[ELL:] The data type \fortinline|psb_T_elg_sparse_mat| provides an
|
||||
interface to the ELLPACK implementation from SPGPU;
|
||||
|
||||
\item[HLL:] The data type \fortinline|psb_T_hlg_sparse_mat| provides an
|
||||
interface to the Hacked ELLPACK implementation from SPGPU;
|
||||
\item[HDIA:] The data type \fortinline|psb_T_hdiag_sparse_mat| provides an
|
||||
interface to the Hacked DIAgonals implementation from SPGPU;
|
||||
\end{description}
|
||||
|
||||
|
||||
\section{CUDA Environment Routines}
|
||||
\label{sec:cudaenv}
|
||||
|
||||
\subsection*{psb\_cuda\_init --- Initializes PSBLAS-CUDA
|
||||
environment}
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_init}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
call psb_cuda_init(ctxt [, device])
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
call psb_cuda_init(ctxt [, device])
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
This subroutine initializes the PSBLAS-CUDA environment.
|
||||
\begin{description}
|
||||
\item[Type:] Synchronous.
|
||||
\item[\bf On Entry ]
|
||||
\item[device] ID of CUDA device to attach to.\\
|
||||
Scope: {\bf local}.\\
|
||||
Type: {\bf optional}.\\
|
||||
Intent: {\bf in}.\\
|
||||
Specified as: an integer value.\\
|
||||
Default: use \fortinline|mod(iam,ngpu)| where \fortinline|iam| is the calling
|
||||
process index and \fortinline|ngpu| is the total number of CUDA devices
|
||||
available on the current node.
|
||||
\end{description}
|
||||
|
||||
|
||||
{\par\noindent\large\bfseries Notes}
|
||||
\begin{enumerate}
|
||||
\item A call to this routine must precede any other PSBLAS-CUDA call.
|
||||
\end{enumerate}
|
||||
|
||||
\subsection*{psb\_cuda\_exit --- Exit from PSBLAS-CUDA
|
||||
environment}
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_exit}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
call psb_cuda_exit(ctxt)
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
call psb_cuda_exit(ctxt)
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
This subroutine exits from the PSBLAS CUDA context.
|
||||
\begin{description}
|
||||
\item[Type:] Synchronous.
|
||||
\item[\bf On Entry ]
|
||||
\item[ctxt] the communication context identifying the virtual
|
||||
parallel machine.\\
|
||||
Scope: {\bf global}.\\
|
||||
Type: {\bf required}.\\
|
||||
Intent: {\bf in}.\\
|
||||
Specified as: an integer variable.
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
|
||||
\subsection*{psb\_cuda\_DeviceSync --- Synchronize CUDA device}
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_DeviceSync}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
call psb_cuda_DeviceSync()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
call psb_cuda_DeviceSync()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
This subroutine ensures that all previously invoked kernels, i.e. all
|
||||
invocations of CUDA-side code, have completed.
|
||||
|
||||
|
||||
\subsection*{psb\_cuda\_getDeviceCount }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_getDeviceCount}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
ngpus = psb_cuda_getDeviceCount()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
ngpus = psb_cuda_getDeviceCount()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Get number of devices available on current computing node.
|
||||
|
||||
\subsection*{psb\_cuda\_getDevice }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_getDevice}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
ngpus = psb_cuda_getDevice()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
ngpus = psb_cuda_getDevice()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Get device in use by current process.
|
||||
|
||||
\subsection*{psb\_cuda\_setDevice }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_setDevice}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
info = psb_cuda_setDevice(dev)
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
info = psb_cuda_setDevice(dev)
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Set device to be used by current process.
|
||||
|
||||
\subsection*{psb\_cuda\_DeviceHasUVA }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_DeviceHasUVA}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
hasUva = psb_cuda_DeviceHasUVA()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
hasUva = psb_cuda_DeviceHasUVA()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Returns true if device currently in use supports UVA
|
||||
(Unified Virtual Addressing).
|
||||
|
||||
\subsection*{psb\_cuda\_WarpSize }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_WarpSize}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
nw = psb_cuda_WarpSize()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
nw = psb_cuda_WarpSize()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Returns the warp size.
|
||||
|
||||
|
||||
\subsection*{psb\_cuda\_MultiProcessors }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_MultiProcessors}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
nmp = psb_cuda_MultiProcessors()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
nmp = psb_cuda_MultiProcessors()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Returns the number of multiprocessors in the CUDA device.
|
||||
|
||||
\subsection*{psb\_cuda\_MaxThreadsPerMP }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_MaxThreadsPerMP}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
nt = psb_cuda_MaxThreadsPerMP()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
nt = psb_cuda_MaxThreadsPerMP()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Returns the maximum number of threads per multiprocessor.
|
||||
|
||||
|
||||
\subsection*{psb\_cuda\_MaxRegistersPerBlock }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_MaxRegistersPerBlock}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
nr = psb_cuda_MaxRegistersPerBlock()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
nr = psb_cuda_MaxRegistersPerBlock()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Returns the maximum number of registers per thread block.
|
||||
|
||||
|
||||
\subsection*{psb\_cuda\_MemoryClockRate }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_MemoryClockRate}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
cl = psb_cuda_MemoryClockRate()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
cl = psb_cuda_MemoryClockRate()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Returns the memory clock rate in KHz, as an integer.
|
||||
|
||||
\subsection*{psb\_cuda\_MemoryBusWidth }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_MemoryBusWidth}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
nb = psb_cuda_MemoryBusWidth()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
nb = psb_cuda_MemoryBusWidth()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Returns the memory bus width in bits.
|
||||
|
||||
\subsection*{psb\_cuda\_MemoryPeakBandwidth }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_MemoryPeakBandwidth}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
bw = psb_cuda_MemoryPeakBandwidth()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
bw = psb_cuda_MemoryPeakBandwidth()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
Returns the peak memory bandwidth in MB/s (real double precision).
|
||||
|
||||
|
||||
|
@ -0,0 +1,598 @@
|
||||
\section{Extensions}\label{sec:ext-intro}
|
||||
|
||||
The EXT, CUDA and RSB subdirectories contain a set of extensions to the base
|
||||
library. The extensions provide additional storage formats beyond the
|
||||
ones already contained in the base library, as well as interfaces
|
||||
to:
|
||||
\begin{description}
|
||||
\item[SPGPU] a CUDA library originally published as
|
||||
\url{https://code.google.com/p/spgpu/} and now included in the
|
||||
\verb|cuda| subdir, for computations on NVIDIA GPUs;
|
||||
\item[LIBRSB] \url{http://sourceforge.net/projects/librsb/}, for
|
||||
computations on multicore parallel machines.
|
||||
\end{description}
|
||||
The infrastructure laid out in the base library to allow for these
|
||||
extensions is detailed in the references~\cite{DesPat:11,CaFiRo:2014,Sparse03};
|
||||
the CUDA-specific data formats are described in~\cite{OurTechRep}.
|
||||
|
||||
|
||||
\subsection{Using the extensions}
|
||||
\label{sec:ext-appstruct}
|
||||
A sample application using the PSBLAS extensions will contain the
|
||||
following steps:
|
||||
\begin{itemize}
|
||||
\item \verb|USE| the appropriate modules (\verb|psb_ext_mod|,
|
||||
\verb|psb_cuda_mod|);
|
||||
\item Declare a \emph{mold} variable of the necessary type
|
||||
(e.g. \verb|psb_d_ell_sparse_mat|, \verb|psb_d_hlg_sparse_mat|,
|
||||
\verb|psb_d_vect_cuda|);
|
||||
\item Pass the mold variable to the base library interface where
|
||||
needed to ensure the appropriate dynamic type.
|
||||
\end{itemize}
|
||||
Suppose you want to use the CUDA-enabled ELLPACK data structure; you
|
||||
would use a piece of code like this (and don't forget, you need
|
||||
CUDA-side vectors along with the matrices):
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
|
||||
program my_cuda_test
|
||||
use psb_base_mod
|
||||
use psb_util_mod
|
||||
use psb_ext_mod
|
||||
use psb_cuda_mod
|
||||
type(psb_dspmat_type) :: a, agpu
|
||||
type(psb_d_vect_type) :: x, xg, bg
|
||||
|
||||
real(psb_dpk_), allocatable :: xtmp(:)
|
||||
type(psb_d_vect_cuda) :: vmold
|
||||
type(psb_d_elg_sparse_mat) :: aelg
|
||||
type(psb_ctxt_type) :: ctxt
|
||||
integer :: iam, np
|
||||
|
||||
|
||||
call psb_init(ctxt)
|
||||
call psb_info(ctxt,iam,np)
|
||||
call psb_cuda_init(ctxt, iam)
|
||||
|
||||
|
||||
! My own home-grown matrix generator
|
||||
call gen_matrix(ctxt,idim,desc_a,a,x,info)
|
||||
if (info /= 0) goto 9999
|
||||
|
||||
call a%cscnv(agpu,info,mold=aelg)
|
||||
if (info /= 0) goto 9999
|
||||
xtmp = x%get_vect()
|
||||
call xg%bld(xtmp,mold=vmold)
|
||||
call bg%bld(size(xtmp),mold=vmold)
|
||||
|
||||
! Do sparse MV
|
||||
call psb_spmm(done,agpu,xg,dzero,bg,desc_a,info)
|
||||
|
||||
|
||||
9999 continue
|
||||
if (info == 0) then
|
||||
write(*,*) '42'
|
||||
else
|
||||
write(*,*) 'Something went wrong ',info
|
||||
end if
|
||||
|
||||
|
||||
call psb_cuda_exit()
|
||||
call psb_exit(ctxt)
|
||||
stop
|
||||
end program my_cuda_test
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
program my_cuda_test
|
||||
use psb_base_mod
|
||||
use psb_util_mod
|
||||
use psb_ext_mod
|
||||
use psb_cuda_mod
|
||||
type(psb_dspmat_type) :: a, agpu
|
||||
type(psb_d_vect_type) :: x, xg, bg
|
||||
|
||||
real(psb_dpk_), allocatable :: xtmp(:)
|
||||
type(psb_d_vect_cuda) :: vmold
|
||||
type(psb_d_elg_sparse_mat) :: aelg
|
||||
type(psb_ctxt_type) :: ctxt
|
||||
integer :: iam, np
|
||||
|
||||
|
||||
call psb_init(ctxt)
|
||||
call psb_info(ctxt,iam,np)
|
||||
call psb_cuda_init(ctxt, iam)
|
||||
|
||||
|
||||
! My own home-grown matrix generator
|
||||
call gen_matrix(ctxt,idim,desc_a,a,x,info)
|
||||
if (info /= 0) goto 9999
|
||||
|
||||
call a%cscnv(agpu,info,mold=aelg)
|
||||
if (info /= 0) goto 9999
|
||||
xtmp = x%get_vect()
|
||||
call xg%bld(xtmp,mold=vmold)
|
||||
call bg%bld(size(xtmp),mold=vmold)
|
||||
|
||||
! Do sparse MV
|
||||
call psb_spmm(done,agpu,xg,dzero,bg,desc_a,info)
|
||||
|
||||
|
||||
9999 continue
|
||||
if (info == 0) then
|
||||
write(*,*) '42'
|
||||
else
|
||||
write(*,*) 'Something went wrong ',info
|
||||
end if
|
||||
|
||||
|
||||
call psb_cuda_exit()
|
||||
call psb_exit(ctxt)
|
||||
stop
|
||||
end program my_cuda_test
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
A full example of this strategy can be seen in the
|
||||
\texttt{test/ext/kernel} and \texttt{test/\-cuda/\-kernel} subdirectories,
|
||||
where we provide sample programs
|
||||
to test the speed of the sparse matrix-vector product with the various
|
||||
data structures included in the library.
|
||||
|
||||
|
||||
\subsection{Extensions' Data Structures}
|
||||
\label{sec:ext-datastruct}
|
||||
%\ifthenelse{\boolean{mtc}}{\minitoc}{}
|
||||
|
||||
Access to the facilities provided by the EXT library is mainly
|
||||
achieved through the data types that are provided within.
|
||||
The data classes are derived from the base classes in PSBLAS, through
|
||||
the Fortran~2003 mechanism of \emph{type extension}~\cite{MRC:11}.
|
||||
|
||||
The data classes are divided between the general purpose CPU
|
||||
extensions, the GPU interfaces and the RSB interfaces.
|
||||
In the description we will make use of the notation introduced in
|
||||
Table~\ref{tab:notation}.
|
||||
|
||||
\begin{table}[ht]
|
||||
\caption{Notation for parameters describing a sparse matrix}
|
||||
\begin{center}
|
||||
{\footnotesize
|
||||
\begin{tabular}{ll}
|
||||
\hline
|
||||
Name & Description \\
|
||||
\hline
|
||||
M & Number of rows in matrix \\
|
||||
N & Number of columns in matrix \\
|
||||
NZ & Number of nonzeros in matrix \\
|
||||
AVGNZR & Average number of nonzeros per row \\
|
||||
MAXNZR & Maximum number of nonzeros per row \\
|
||||
NDIAG & Number of nonzero diagonals\\
|
||||
AS & Coefficients array \\
|
||||
IA & Row indices array \\
|
||||
JA & Column indices array \\
|
||||
IRP & Row start pointers array \\
|
||||
JCP & Column start pointers array \\
|
||||
NZR & Number of nonzeros per row array \\
|
||||
OFFSET & Offset for diagonals \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
}
|
||||
\end{center}
|
||||
\label{tab:notation}
|
||||
\end{table}
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
% \includegraphics[width=5.2cm]{figures/mat.eps}
|
||||
\ifcase\pdfoutput
|
||||
\includegraphics[width=5.2cm]{mat.png}
|
||||
\or
|
||||
\includegraphics[width=5.2cm]{figures/mat.pdf}
|
||||
\fi
|
||||
\caption{Example of sparse matrix}
|
||||
\label{fig:dense}
|
||||
\end{figure}
|
||||
|
||||
\subsection{CPU-class extensions}
|
||||
|
||||
|
||||
\subsubsection*{ELLPACK}
|
||||
|
||||
The ELLPACK/ITPACK format (shown in Figure~\ref{fig:ell})
|
||||
comprises two 2-dimensional arrays \verb|AS| and
|
||||
\verb|JA| with \verb|M| rows and \verb|MAXNZR| columns, where
|
||||
\verb|MAXNZR| is the maximum
|
||||
number of nonzeros in any row~\cite{ELLPACK}.
|
||||
Each row of the arrays \verb|AS| and \verb|JA| contains the
|
||||
coefficients and column indices; rows shorter than
|
||||
\verb|MAXNZR| are padded with zero coefficients and appropriate column
|
||||
indices, e.g. the last valid one found in the same row.
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
% \includegraphics[width=8.2cm]{figures/ell.eps}
|
||||
\ifcase\pdfoutput
|
||||
\includegraphics[width=8.2cm]{ell.png}
|
||||
\or
|
||||
\includegraphics[width=8.2cm]{figures/ell.pdf}
|
||||
\fi
|
||||
\caption{ELLPACK compression of matrix in Figure~\ref{fig:dense}}
|
||||
\label{fig:ell}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\begin{algorithm}
|
||||
\lstset{language=Fortran}
|
||||
\small
|
||||
\begin{lstlisting}
|
||||
do i=1,n
|
||||
t=0
|
||||
do j=1,maxnzr
|
||||
t = t + as(i,j)*x(ja(i,j))
|
||||
end do
|
||||
y(i) = t
|
||||
end do
|
||||
\end{lstlisting}
|
||||
\caption{\label{alg:ell} Matrix-Vector product in ELL format}
|
||||
\end{algorithm}
|
||||
The matrix-vector product $y=Ax$ can be computed with the code shown in
|
||||
Alg.~\ref{alg:ell}; it costs one memory write per outer iteration,
|
||||
plus three memory reads and two floating-point operations per inner
|
||||
iteration.
|
||||
|
||||
Unless all rows have exactly the same number of nonzeros, some of the
|
||||
coefficients in the \verb|AS| array will be zeros; therefore this
|
||||
data structure will have an overhead both in terms of memory space
|
||||
and redundant operations (multiplications by zero). The overhead can
|
||||
be acceptable if:
|
||||
\begin{enumerate}
|
||||
\item The maximum number of nonzeros per row is not much larger than
|
||||
the average;
|
||||
\item The regularity of the data structure allows for faster code,
|
||||
e.g. by allowing vectorization, thereby offsetting the additional
|
||||
storage requirements.
|
||||
\end{enumerate}
|
||||
In the extreme case where the input matrix has one full row, the
|
||||
ELLPACK structure would require more memory than the normal 2D array
|
||||
storage. The ELLPACK storage format was very popular in the vector
|
||||
computing days; in modern CPUs it is not quite as popular, but it
|
||||
is the basis for many GPU formats.
|
||||
|
||||
The relevant data type is \verb|psb_T_ell_sparse_mat|:
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_ell_sparse_mat
|
||||
!
|
||||
! ITPACK/ELL format, extended.
|
||||
!
|
||||
|
||||
integer(psb_ipk_), allocatable :: irn(:), ja(:,:), idiag(:)
|
||||
real(psb_dpk_), allocatable :: val(:,:)
|
||||
|
||||
contains
|
||||
....
|
||||
end type psb_d_ell_sparse_mat
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_ell_sparse_mat
|
||||
!
|
||||
! ITPACK/ELL format, extended.
|
||||
!
|
||||
|
||||
integer(psb_ipk_), allocatable :: irn(:), ja(:,:), idiag(:)
|
||||
real(psb_dpk_), allocatable :: val(:,:)
|
||||
|
||||
contains
|
||||
....
|
||||
end type psb_d_ell_sparse_mat
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
|
||||
\subsubsection*{Hacked ELLPACK}
|
||||
|
||||
The \textit{hacked ELLPACK} (\textbf{HLL}) format
|
||||
alleviates the main problem of the ELLPACK format, that is,
|
||||
the amount of memory required by padding for sparse matrices in
|
||||
which the maximum row length is larger than the average.
|
||||
|
||||
The number of elements allocated to padding is $[(m*maxNR) -
|
||||
(m*avgNR) = m*(maxNR-avgNR)]$
|
||||
for both \verb|AS| and \verb|JA| arrays,
|
||||
where $m$ is equal to the number of rows of the matrix, $maxNR$ is the
|
||||
maximum number of nonzero elements
|
||||
in any row and $avgNR$ is the average number of nonzeros.
|
||||
Therefore a single densely populated row can seriously affect the
|
||||
total size of the allocation.
|
||||
|
||||
To limit this effect, in the HLL format we break the original matrix
|
||||
into equally sized groups of rows (called \textit{hacks}), and then store
|
||||
these groups as independent matrices in ELLPACK format.
|
||||
The groups can be arranged selecting rows in an arbitrary manner;
|
||||
indeed, if the rows are sorted by decreasing number of nonzeros we
|
||||
obtain essentially the JAgged Diagonals format.
|
||||
If the rows are not in the original order, then an additional vector
|
||||
\textit{rIdx} is required, storing the actual row index for each row
|
||||
in the data structure.
|
||||
|
||||
The multiple ELLPACK-like buffers are stacked together inside a
|
||||
single, one dimensional array;
|
||||
an additional vector \textit{hackOffsets} is provided to keep track
|
||||
of the individual submatrices.
|
||||
All hacks have the same number of rows \textit{hackSize}; hence,
|
||||
the \textit{hackOffsets} vector is an array of
|
||||
$(m/hackSize)+1$ elements, each one pointing to the first index of a
|
||||
submatrix inside the stacked \textit{cM}/\textit{rP} buffers, plus an
|
||||
additional element pointing past the end of the last block, where the
|
||||
next one would begin.
|
||||
We thus have the property that
|
||||
the elements of the $k$-th \textit{hack} are stored between
|
||||
\verb|hackOffsets[k]| and
|
||||
\verb|hackOffsets[k+1]|, similarly to what happens in the CSR format.
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
% \includegraphics[width=8.2cm]{../figures/hll.eps}
|
||||
\ifcase\pdfoutput
|
||||
\includegraphics[width=.72\textwidth]{hll.png}
|
||||
\or
|
||||
\includegraphics[width=.72\textwidth]{../figures/hll.pdf}
|
||||
\fi
|
||||
\caption{Hacked ELLPACK compression of matrix in Figure~\ref{fig:dense}}
|
||||
\label{fig:hll}
|
||||
\end{figure}
|
||||
|
||||
With this data structure a very long row only affects one hack, and
|
||||
therefore the additional memory is limited to the hack in which the
|
||||
row appears.
|
||||
|
||||
The relevant data type is \verb|psb_T_hll_sparse_mat|:
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_hll_sparse_mat
|
||||
!
|
||||
! HLL format. (Hacked ELL)
|
||||
!
|
||||
integer(psb_ipk_) :: hksz
|
||||
integer(psb_ipk_), allocatable :: irn(:), ja(:), idiag(:), hkoffs(:)
|
||||
real(psb_dpk_), allocatable :: val(:)
|
||||
|
||||
contains
|
||||
....
|
||||
end type
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_hll_sparse_mat
|
||||
!
|
||||
! HLL format. (Hacked ELL)
|
||||
!
|
||||
integer(psb_ipk_) :: hksz
|
||||
integer(psb_ipk_), allocatable :: irn(:), ja(:), idiag(:), hkoffs(:)
|
||||
real(psb_dpk_), allocatable :: val(:)
|
||||
|
||||
contains
|
||||
....
|
||||
end type
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
\subsubsection*{Diagonal storage}
|
||||
|
||||
|
||||
The DIAgonal (DIA) format (shown in Figure~\ref{fig:dia})
|
||||
has a 2-dimensional array \verb|AS| containing in each column the
|
||||
coefficients along a diagonal of the matrix, and an integer array
|
||||
\verb|OFFSET| that determines where each diagonal starts. The
|
||||
diagonals in \verb|AS| are padded with zeros as necessary.
|
||||
|
||||
The code to compute the matrix-vector product $y=Ax$ is shown in Alg.~\ref{alg:dia};
|
||||
it costs one memory read per outer iteration,
|
||||
plus three memory reads, one memory write and two floating-point
|
||||
operations per inner iteration. The accesses to \verb|AS| and
|
||||
\verb|x| are in strict sequential order, therefore no indirect
|
||||
addressing is required.
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
% \includegraphics[width=8.2cm]{figures/dia.eps}
|
||||
\ifcase\pdfoutput
|
||||
\includegraphics[width=.72\textwidth]{dia.png}
|
||||
\or
|
||||
\includegraphics[width=.72\textwidth]{figures/dia.pdf}
|
||||
\fi
|
||||
\caption{DIA compression of matrix in Figure~\ref{fig:dense}}
|
||||
\label{fig:dia}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\begin{algorithm}
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
|
||||
do j=1,ndiag
|
||||
if (offset(j) > 0) then
|
||||
ir1 = 1; ir2 = m - offset(j);
|
||||
else
|
||||
ir1 = 1 - offset(j); ir2 = m;
|
||||
end if
|
||||
do i=ir1,ir2
|
||||
y(i) = y(i) + alpha*as(i,j)*x(i+offset(j))
|
||||
end do
|
||||
end do
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
do j=1,ndiag
|
||||
if (offset(j) > 0) then
|
||||
ir1 = 1; ir2 = m - offset(j);
|
||||
else
|
||||
ir1 = 1 - offset(j); ir2 = m;
|
||||
end if
|
||||
do i=ir1,ir2
|
||||
y(i) = y(i) + alpha*as(i,j)*x(i+offset(j))
|
||||
end do
|
||||
end do
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
\caption{\label{alg:dia} Matrix-Vector product in DIA format}
|
||||
\end{algorithm}
|
||||
|
||||
|
||||
The relevant data type is \verb|psb_T_dia_sparse_mat|:
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_dia_sparse_mat
|
||||
!
|
||||
! DIA format, extended.
|
||||
!
|
||||
|
||||
integer(psb_ipk_), allocatable :: offset(:)
|
||||
integer(psb_ipk_) :: nzeros
|
||||
real(psb_dpk_), allocatable :: data(:,:)
|
||||
|
||||
end type
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_dia_sparse_mat
|
||||
!
|
||||
! DIA format, extended.
|
||||
!
|
||||
|
||||
integer(psb_ipk_), allocatable :: offset(:)
|
||||
integer(psb_ipk_) :: nzeros
|
||||
real(psb_dpk_), allocatable :: data(:,:)
|
||||
|
||||
end type
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
|
||||
|
||||
\subsubsection*{Hacked DIA}
|
||||
|
||||
Storage by DIAgonals is an attractive option for matrices whose
|
||||
coefficients are located on a small set of diagonals, since they do
|
||||
away with storing explicitly the indices and therefore reduce
|
||||
significantly memory traffic. However, having a few coefficients
|
||||
outside of the main set of diagonals may significantly increase the
|
||||
amount of needed padding; moreover, while the DIA code is easily
|
||||
vectorized, it does not necessarily make optimal use of the memory
|
||||
hierarchy. While processing each diagonal we are updating entries in
|
||||
the output vector \verb|y|, which is then accessed multiple times; if
|
||||
the vector \verb|y| is too large to remain in the cache memory, the
|
||||
associated cache miss penalty is paid multiple times.
|
||||
|
||||
The \textit{hacked DIA} (\textbf{HDIA}) format was designed to contain
|
||||
the amount of padding, by breaking the original matrix
|
||||
into equally sized groups of rows (\textit{hacks}), and then storing
|
||||
these groups as independent matrices in DIA format. This approach is
|
||||
similar to that of HLL, and requires using an offset vector for each
|
||||
submatrix. Again, similarly to HLL, the various submatrices are
|
||||
stacked inside a linear array to improve memory management. The fact
|
||||
that the matrix is accessed in slices helps in reducing cache misses,
|
||||
especially regarding accesses to the %output
|
||||
vector \verb|y|.
|
||||
|
||||
|
||||
An additional vector \textit{hackOffsets} is provided to complete
|
||||
the matrix format; given that \textit{hackSize} is the number of rows of each hack,
|
||||
the \textit{hackOffsets} vector is an array of
|
||||
$(m/hackSize)+1$ elements, pointing to the first diagonal offset of a
|
||||
submatrix inside the stacked \textit{offsets} buffers, plus an
|
||||
additional element equal to the number of nonzero diagonals in the whole matrix.
|
||||
We thus have the property that
|
||||
the number of diagonals of the $k$-th \textit{hack} is given by
|
||||
\textit{hackOffsets[k+1] - hackOffsets[k]}.
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
% \includegraphics[width=8.2cm]{../figures/hdia.eps}
|
||||
\ifcase\pdfoutput
|
||||
\includegraphics[width=.72\textwidth]{hdia.png}
|
||||
\or
|
||||
\includegraphics[width=.72\textwidth]{../figures/hdia.pdf}
|
||||
\fi
|
||||
\caption{Hacked DIA compression of matrix in Figure~\ref{fig:dense}}
|
||||
\label{fig:hdia}
|
||||
\end{figure}
|
||||
|
||||
The relevant data type is \verb|psb_T_hdia_sparse_mat|:
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
|
||||
type pm
|
||||
real(psb_dpk_), allocatable :: data(:,:)
|
||||
end type pm
|
||||
|
||||
type po
|
||||
integer(psb_ipk_), allocatable :: off(:)
|
||||
end type po
|
||||
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_hdia_sparse_mat
|
||||
!
|
||||
! HDIA format, extended.
|
||||
!
|
||||
|
||||
type(pm), allocatable :: hdia(:)
|
||||
type(po), allocatable :: offset(:)
|
||||
integer(psb_ipk_) :: nblocks, nzeros
|
||||
integer(psb_ipk_) :: hack = 64
|
||||
integer(psb_long_int_k_) :: dim=0
|
||||
|
||||
contains
|
||||
....
|
||||
end type
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
type pm
|
||||
real(psb_dpk_), allocatable :: data(:,:)
|
||||
end type pm
|
||||
|
||||
type po
|
||||
integer(psb_ipk_), allocatable :: off(:)
|
||||
end type po
|
||||
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_hdia_sparse_mat
|
||||
!
|
||||
! HDIA format, extended.
|
||||
!
|
||||
|
||||
type(pm), allocatable :: hdia(:)
|
||||
type(po), allocatable :: offset(:)
|
||||
integer(psb_ipk_) :: nblocks, nzeros
|
||||
integer(psb_ipk_) :: hack = 64
|
||||
integer(psb_long_int_k_) :: dim=0
|
||||
|
||||
contains
|
||||
....
|
||||
end type
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
|
After Width: | Height: | Size: 50 KiB |
After Width: | Height: | Size: 52 KiB |