Merge branch 'repackage' of github.com:sfilippone/psblas3 into repackage
Before Width: | Height: | Size: 334 B After Width: | Height: | Size: 328 B |
Before Width: | Height: | Size: 382 B After Width: | Height: | Size: 366 B |
Before Width: | Height: | Size: 296 B After Width: | Height: | Size: 289 B |
After Width: | Height: | Size: 50 KiB |
After Width: | Height: | Size: 52 KiB |
After Width: | Height: | Size: 58 KiB |
After Width: | Height: | Size: 61 KiB |
After Width: | Height: | Size: 90 KiB |
Before Width: | Height: | Size: 1.6 KiB After Width: | Height: | Size: 1.5 KiB |
Before Width: | Height: | Size: 2.1 KiB After Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 1.1 KiB |
Before Width: | Height: | Size: 2.0 KiB After Width: | Height: | Size: 1.7 KiB |
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.6 KiB After Width: | Height: | Size: 1.3 KiB |
@ -0,0 +1,19 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html >
|
||||
<head><title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="generator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<meta name="originator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<!-- html,3 -->
|
||||
<meta name="src" content="userhtml.tex">
|
||||
<link rel="stylesheet" type="text/css" href="userhtml.css">
|
||||
</head><body
|
||||
>
|
||||
<div class="footnote-text">
|
||||
<!--l. 72--><p class="indent" > <span class="footnote-mark"><a
|
||||
id="fn4x0"><a
|
||||
id="x16-136002x10.1"></a> <sup class="textsuperscript">4</sup></a></span><span
|
||||
class="cmr-8">The string is case-insensitive</span></div>
|
||||
|
||||
</body></html>
|
Before Width: | Height: | Size: 1.1 KiB After Width: | Height: | Size: 968 B |
Before Width: | Height: | Size: 1.2 KiB After Width: | Height: | Size: 1.1 KiB |
@ -0,0 +1,20 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html >
|
||||
<head><title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="generator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<meta name="originator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<!-- html,3 -->
|
||||
<meta name="src" content="userhtml.tex">
|
||||
<link rel="stylesheet" type="text/css" href="userhtml.css">
|
||||
</head><body
|
||||
>
|
||||
<div class="footnote-text">
|
||||
<!--l. 53--><p class="noindent" ><span class="footnote-mark"><a
|
||||
id="fn5x0"><a
|
||||
id="x18-143004x11.1"></a> <sup class="textsuperscript">5</sup></a></span><span
|
||||
class="cmr-8">Note: the implementation is for </span><span
|
||||
class="cmmi-8">FCG</span><span
|
||||
class="cmr-8">(1).</span></div>
|
||||
</body></html>
|
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 1.1 KiB |
Before Width: | Height: | Size: 1.4 KiB After Width: | Height: | Size: 1.3 KiB |
Before Width: | Height: | Size: 1.1 KiB After Width: | Height: | Size: 1.0 KiB |
Before Width: | Height: | Size: 1.4 KiB After Width: | Height: | Size: 1.3 KiB |
Before Width: | Height: | Size: 10 KiB After Width: | Height: | Size: 7.5 KiB |
Before Width: | Height: | Size: 1.4 KiB After Width: | Height: | Size: 1.3 KiB |
Before Width: | Height: | Size: 1.2 KiB After Width: | Height: | Size: 1.0 KiB |
Before Width: | Height: | Size: 1.0 KiB After Width: | Height: | Size: 970 B |
Before Width: | Height: | Size: 439 B After Width: | Height: | Size: 420 B |
Before Width: | Height: | Size: 763 B After Width: | Height: | Size: 710 B |
Before Width: | Height: | Size: 1.9 KiB After Width: | Height: | Size: 1.7 KiB |
Before Width: | Height: | Size: 2.0 KiB After Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 2.4 KiB After Width: | Height: | Size: 2.0 KiB |
Before Width: | Height: | Size: 1.1 KiB After Width: | Height: | Size: 1016 B |
Before Width: | Height: | Size: 2.6 KiB After Width: | Height: | Size: 2.1 KiB |
Before Width: | Height: | Size: 1.7 KiB After Width: | Height: | Size: 1.5 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.5 KiB |
Before Width: | Height: | Size: 1.1 KiB After Width: | Height: | Size: 1.0 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.6 KiB |
Before Width: | Height: | Size: 1.5 KiB After Width: | Height: | Size: 1.3 KiB |
@ -0,0 +1,24 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html >
|
||||
<head><title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="generator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<meta name="originator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<!-- html,3 -->
|
||||
<meta name="src" content="userhtml.tex">
|
||||
<link rel="stylesheet" type="text/css" href="userhtml.css">
|
||||
</head><body
|
||||
>
|
||||
<div class="footnote-text">
|
||||
<!--l. 195--><p class="noindent" ><span class="footnote-mark"><a
|
||||
id="fn2x0"><a
|
||||
id="x6-4002x2.1"></a> <sup class="textsuperscript">2</sup></a></span><span
|
||||
class="cmr-8">This is the normal situation when the pattern of the sparse matrix is symmetric, which is</span>
|
||||
<span
|
||||
class="cmr-8">equivalent to saying that the interaction between two variables is reciprocal. If the matrix pattern is</span>
|
||||
<span
|
||||
class="cmr-8">non-symmetric we may have one-way interactions, and these could cause a situation in which a</span>
|
||||
<span
|
||||
class="cmr-8">boundary point is not a halo point for its neighbour.</span></div>
|
||||
</body></html>
|
Before Width: | Height: | Size: 2.5 KiB After Width: | Height: | Size: 2.1 KiB |
Before Width: | Height: | Size: 2.1 KiB After Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 1.2 KiB After Width: | Height: | Size: 1.0 KiB |
Before Width: | Height: | Size: 2.0 KiB After Width: | Height: | Size: 1.7 KiB |
@ -0,0 +1,921 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html >
|
||||
<head><title>Extensions</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="generator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<meta name="originator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<!-- html,3 -->
|
||||
<meta name="src" content="userhtml.tex">
|
||||
<link rel="stylesheet" type="text/css" href="userhtml.css">
|
||||
</head><body
|
||||
>
|
||||
<!--l. 1--><div class="crosslinks"><p class="noindent">[<a
|
||||
href="userhtmlse11.html" >prev</a>] [<a
|
||||
href="userhtmlse11.html#tailuserhtmlse11.html" >prev-tail</a>] [<a
|
||||
href="#tailuserhtmlse12.html">tail</a>] [<a
|
||||
href="userhtml.html# " >up</a>] </p></div>
|
||||
<h3 class="sectionHead"><span class="titlemark">12 </span> <a
|
||||
id="x19-14400012"></a>Extensions</h3>
|
||||
<!--l. 3--><p class="noindent" >The EXT, CUDA and RSB subdirectories contain a set of extensions to the base
|
||||
library. The extensions provide additional storage formats beyond the ones already
|
||||
contained in the base library, as well as interfaces to:
|
||||
<dl class="description"><dt class="description">
|
||||
<!--l. 8--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">SPGPU</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 8--><p class="noindent" >a CUDA library originally
|
||||
published as <a
|
||||
href="https://code.google.com/p/spgpu/" class="url" ><span
|
||||
class="cmtt-10">https://code.google.com/p/spgpu/</span></a> and now included
|
||||
in the <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">cuda</span></span></span> subdir, for computations on NVIDIA GPUs;
|
||||
</dd><dt class="description">
|
||||
<!--l. 11--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">LIBRSB</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 11--><p class="noindent" ><a
|
||||
href="http://sourceforge.net/projects/librsb/" class="url" ><span
|
||||
class="cmtt-10">http://sourceforge.net/projects/librsb/</span></a>, for computations on
|
||||
multicore parallel machines.</dd></dl>
|
||||
<!--l. 14--><p class="noindent" >The infrastructure laid out in the base library to allow for these extensions is detailed in
|
||||
the references <span class="cite">[<a
|
||||
href="userhtmlli2.html#XDesPat:11">20</a>, <a
|
||||
href="userhtmlli2.html#XCaFiRo:2014">21</a>, <a
|
||||
href="userhtmlli2.html#XSparse03">10</a>]</span>; the CUDA-specific data formats are described
|
||||
in <span class="cite">[<a
|
||||
href="userhtmlli2.html#XOurTechRep">22</a>]</span>.
|
||||
<!--l. 19--><p class="noindent" >
|
||||
<h4 class="subsectionHead"><span class="titlemark">12.1 </span> <a
|
||||
id="x19-14500012.1"></a>Using the extensions</h4>
|
||||
<!--l. 21--><p class="noindent" >A sample application using the PSBLAS extensions will contain the following
|
||||
steps:
|
||||
<ul class="itemize1">
|
||||
<li class="itemize">
|
||||
<!--l. 24--><p class="noindent" ><span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">USE</span></span></span> the appropriate modules (<span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_ext_mod</span></span></span>, <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_cuda_mod</span></span></span>);
|
||||
</li>
|
||||
<li class="itemize">
|
||||
<!--l. 26--><p class="noindent" >Declare a <span
|
||||
class="cmti-10">mold </span>variable of the necessary type (e.g.
|
||||
<span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_d_ell_sparse_mat</span></span></span>, <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_d_hlg_sparse_mat</span></span></span>, <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_d_vect_cuda</span></span></span>);
|
||||
</li>
|
||||
<li class="itemize">
|
||||
<!--l. 29--><p class="noindent" >Pass the mold variable to the base library interface where needed to ensure
|
||||
the appropriate dynamic type.</li></ul>
|
||||
|
||||
|
||||
|
||||
<!--l. 32--><p class="noindent" >Suppose you want to use the CUDA-enabled ELLPACK data structure; you would use a
|
||||
piece of code like this (and don’t forget, you need CUDA-side vectors along with the
|
||||
matrices):
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 85--><p class="noindent" >
|
||||
|
||||
|
||||
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-103">
|
||||
program my_cuda_test
|
||||
  use psb_base_mod
|
||||
  use psb_util_mod
|
||||
  use psb_ext_mod
|
||||
  use psb_cuda_mod
|
||||
  type(psb_dspmat_type) :: a, agpu
|
||||
  type(psb_d_vect_type) :: x, xg, bg
|
||||
|
||||
  real(psb_dpk_), allocatable :: xtmp(:)
|
||||
  type(psb_d_vect_cuda)       :: vmold
|
||||
  type(psb_d_elg_sparse_mat) :: aelg
|
||||
  type(psb_ctxt_type) :: ctxt
|
||||
  integer             :: iam, np
|
||||
|
||||
|
||||
  call psb_init(ctxt)
|
||||
  call psb_info(ctxt,iam,np)
|
||||
  call psb_cuda_init(ctxt, iam)
|
||||
|
||||
|
||||
  ! My own home-grown matrix generator
|
||||
  call gen_matrix(ctxt,idim,desc_a,a,x,info)
|
||||
  if (info /= 0) goto 9999
|
||||
|
||||
  call a%cscnv(agpu,info,mold=aelg)
|
||||
  if (info /= 0) goto 9999
|
||||
  xtmp = x%get_vect()
|
||||
  call xg%bld(xtmp,mold=vmold)
|
||||
  call bg%bld(size(xtmp),mold=vmold)
|
||||
|
||||
  ! Do sparse MV
|
||||
  call psb_spmm(done,agpu,xg,dzero,bg,desc_a,info)
|
||||
|
||||
|
||||
9999 continue
|
||||
  if (info == 0) then
|
||||
     write(*,*) '42'
|
||||
  else
|
||||
     write(*,*) 'Something went wrong ',info
|
||||
  end if
|
||||
|
||||
|
||||
  call psb_cuda_exit()
|
||||
  call psb_exit(ctxt)
|
||||
  stop
|
||||
end program my_cuda_test
|
||||
</pre>
|
||||
<!--l. 134--><p class="nopar" > </div></div>
|
||||
|
||||
|
||||
|
||||
<!--l. 139--><p class="indent" > A full example of this strategy can be seen in the <span
|
||||
class="cmtt-10">test/ext/kernel </span>and
|
||||
<span
|
||||
class="cmtt-10">test/cuda/kernel </span>subdirectories, where we provide sample programs to test the
|
||||
speed of the sparse matrix-vector product with the various data structures included
|
||||
in the library.
|
||||
<!--l. 146--><p class="noindent" >
|
||||
<h4 class="subsectionHead"><span class="titlemark">12.2 </span> <a
|
||||
id="x19-14600012.2"></a>Extensions’ Data Structures</h4>
|
||||
<!--l. 150--><p class="noindent" >Access to the facilities provided by the EXT library is mainly achieved through
|
||||
the data types that are provided within. The data classes are derived from
|
||||
the base classes in PSBLAS, through the Fortran 2003 mechanism of <span
|
||||
class="cmti-10">type</span>
|
||||
<span
|
||||
class="cmti-10">extension</span> <span class="cite">[<a
|
||||
href="userhtmlli2.html#XMRC:11">17</a>]</span>.
|
||||
<!--l. 155--><p class="indent" > The data classes are divided between the general purpose CPU extensions, the
|
||||
GPU interfaces and the RSB interfaces. In the description we will make use of the
|
||||
notation introduced in Table <a
|
||||
href="#x19-146001r21">21<!--tex4ht:ref: tab:notation --></a>.
|
||||
<div class="table">
|
||||
|
||||
|
||||
|
||||
<!--l. 160--><p class="indent" > <a
|
||||
id="x19-146001r21"></a><hr class="float"><div class="float"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<div class="caption"
|
||||
><span class="id">Table 21: </span><span
|
||||
class="content">Notation for parameters describing a sparse matrix</span></div><!--tex4ht:label?: x19-146001r21 -->
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 162--><p class="noindent" >
|
||||
<div class="tabular"> <table id="TBL-23" class="tabular"
|
||||
|
||||
><colgroup id="TBL-23-1g"><col
|
||||
id="TBL-23-1"><col
|
||||
id="TBL-23-2"></colgroup><tr
|
||||
class="hline"><td><hr></td><td><hr></td></tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-1-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-1-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">Name </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-1-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Description </span></td>
|
||||
</tr><tr
|
||||
class="hline"><td><hr></td><td><hr></td></tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-2-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-2-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">M </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-2-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Number of rows in matrix </span></td></tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-3-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-3-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">N </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-3-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Number of columns in matrix</span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-4-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-4-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">NZ </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-4-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Number of nonzeros in matrix </span></td></tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-5-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-5-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">AVGNZR </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-5-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Average number of nonzeros per row</span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-6-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-6-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">MAXNZR</span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-6-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Maximum number of nonzeros per row</span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-7-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-7-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">NDIAG </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-7-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Number of nonzero diagonals </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-8-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-8-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">AS </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-8-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Coefficients array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-9-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-9-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">IA </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-9-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Row indices array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-10-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-10-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">JA </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-10-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Column indices array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-11-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-11-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">IRP </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-11-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Row start pointers array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-12-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-12-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">JCP </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-12-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Column start pointers array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-13-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-13-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">NZR </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-13-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Number of nonzeros per row array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-14-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-14-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">OFFSET </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-14-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Offset for diagonals </span></td>
|
||||
</tr><tr
|
||||
class="hline"><td><hr></td><td><hr></td></tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-15-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-15-1"
|
||||
class="td11"> </td></tr></table> </div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
</div><hr class="endfloat" />
|
||||
</div>
|
||||
<!--l. 188--><p class="indent" > <hr class="figure"><div class="figure"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<a
|
||||
id="x19-146002r5"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 192--><p class="noindent" ><img
|
||||
src="mat.png" alt="Example of sparse matrix"
|
||||
width="147" height="147" >
|
||||
<br /> <div class="caption"
|
||||
><span class="id">Figure 5: </span><span
|
||||
class="content">Example of sparse matrix</span></div><!--tex4ht:label?: x19-146002r5 -->
|
||||
|
||||
|
||||
|
||||
<!--l. 198--><p class="indent" > </div><hr class="endfigure">
|
||||
<h4 class="subsectionHead"><span class="titlemark">12.3 </span> <a
|
||||
id="x19-14700012.3"></a>CPU-class extensions</h4>
|
||||
<!--l. 203--><p class="noindent" >
|
||||
<h5 class="likesubsubsectionHead"><a
|
||||
id="x19-148000"></a>ELLPACK</h5>
|
||||
<!--l. 205--><p class="noindent" >The ELLPACK/ITPACK format (shown in Figure <a
|
||||
href="#x19-148001r6">6<!--tex4ht:ref: fig:ell --></a>) comprises two 2-dimensional
|
||||
arrays <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">JA</span></span></span> with <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">M</span></span></span> rows and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">MAXNZR</span></span></span> columns, where <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">MAXNZR</span></span></span> is the maximum
|
||||
number of nonzeros in any row <span class="cite">[<span
|
||||
class="cmbx-10">?</span>]</span>. Each row of the arrays <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">JA</span></span></span> contains the
|
||||
coefficients and column indices; rows shorter than <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">MAXNZR</span></span></span> are padded with zero
|
||||
coefficients and appropriate column indices, e.g. the last valid one found in the same
|
||||
row.
|
||||
<!--l. 215--><p class="indent" > <hr class="figure"><div class="figure"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<a
|
||||
id="x19-148001r6"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 219--><p class="noindent" ><img
|
||||
src="ell.png" alt="ELLPACK compression of the example sparse matrix"
|
||||
width="233" height="233" >
|
||||
<br /> <div class="caption"
|
||||
><span class="id">Figure 6: </span><span
|
||||
class="content">ELLPACK compression of matrix in Figure <a
|
||||
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-148001r6 -->
|
||||
|
||||
|
||||
|
||||
<!--l. 225--><p class="indent" > </div><hr class="endfigure">
|
||||
<a
|
||||
id="x19-148002r1"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 229--><p class="indent" > <hr class="float"><div class="float"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<!--l. 231-->
|
||||
<pre class="lstlisting" id="listing-168"><span class="label"><a
|
||||
id="x19-148003r1"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">do</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">i</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">=1,</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">n</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148004r2"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">t</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">=0</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148005r3"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">do</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">j</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">=1,</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">maxnzr</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148006r4"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">t</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">=</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">t</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">+</span></span><span style="color:#000000"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">as</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">(</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">i</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">,</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">j</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">)*</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">x</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">(</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">ja</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">(</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">i</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">,</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">j</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">))</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148007r5"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">end</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">do</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148008r6"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">y</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">(</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">i</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">)</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">=</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">t</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148009r7"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">end</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">do</span></span></pre>
|
||||
|
||||
<a
|
||||
id="x19-148010r1"></a>
|
||||
<a
|
||||
id="x19-148011"></a>
|
||||
<span
|
||||
class="cmbx-10">Algorithm</span><span
|
||||
class="cmbx-10"> 1:</span>  Matrix-Vector product in ELL format
|
||||
|
||||
|
||||
|
||||
</div><hr class="endfloat" />
|
||||
<!--l. 242--><p class="indent" > The matrix-vector product <span
|
||||
class="cmmi-10">y </span>= <span
|
||||
class="cmmi-10">Ax </span>can be computed with the code shown in
|
||||
Alg. <a
|
||||
href="#x19-148010r1">1<!--tex4ht:ref: alg:ell --></a>; it costs one memory write per outer iteration, plus three memory reads and
|
||||
two floating-point operations per inner iteration.
|
||||
<!--l. 247--><p class="indent" > Unless all rows have exactly the same number of nonzeros, some of the coefficients
|
||||
in the <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span> array will be zeros; therefore this data structure will have an overhead both
|
||||
in terms of memory space and redundant operations (multiplications by zero). The
|
||||
overhead can be acceptable if:
|
||||
<ol class="enumerate1" >
|
||||
<li
|
||||
class="enumerate" id="x19-148013x1">
|
||||
<!--l. 253--><p class="noindent" >The maximum number of nonzeros per row is not much larger than the
|
||||
average;
|
||||
</li>
|
||||
<li
|
||||
class="enumerate" id="x19-148015x2">
|
||||
<!--l. 255--><p class="noindent" >The regularity of the data structure allows for faster code, e.g. by allowing
|
||||
vectorization, thereby offsetting the additional storage requirements.</li></ol>
|
||||
<!--l. 259--><p class="noindent" >In the extreme case where the input matrix has one full row, the ELLPACK
|
||||
structure would require more memory than the normal 2D array storage. The
|
||||
ELLPACK storage format was very popular in the vector computing days; in
|
||||
modern CPUs it is not quite as popular, but it is the basis for many GPU
|
||||
formats.
|
||||
<!--l. 265--><p class="indent" > The relevant data type is <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_T_ell_sparse_mat</span></span></span>:
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 281--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-104">
|
||||
  type, extends(psb_d_base_sparse_mat) :: psb_d_ell_sparse_mat
|
||||
    !
|
||||
    ! ITPACK/ELL format, extended.
|
||||
    !
|
||||
|
||||
    integer(psb_ipk_), allocatable :: irn(:), ja(:,:), idiag(:)
|
||||
    real(psb_dpk_), allocatable :: val(:,:)
|
||||
|
||||
  contains
|
||||
    ....
|
||||
  end type psb_d_ell_sparse_mat
|
||||
</pre>
|
||||
<!--l. 295--><p class="nopar" > </div></div>
|
||||
<h5 class="likesubsubsectionHead"><a
|
||||
id="x19-149000"></a>Hacked ELLPACK</h5>
|
||||
<!--l. 303--><p class="noindent" >The <span
|
||||
class="cmti-10">hacked ELLPACK </span>(<span
|
||||
class="cmbx-10">HLL</span>) format alleviates the main problem of the ELLPACK
|
||||
format, that is, the amount of memory required by padding for sparse matrices in
|
||||
which the maximum row length is larger than the average.
|
||||
|
||||
|
||||
|
||||
<!--l. 308--><p class="indent" > The number of elements allocated to padding is
|
||||
[(<span
|
||||
class="cmmi-10">m</span><span
|
||||
class="cmsy-10">*</span><span
|
||||
class="cmmi-10">maxNR</span>) <span
|
||||
class="cmsy-10">- </span>(<span
|
||||
class="cmmi-10">m</span><span
|
||||
class="cmsy-10">*</span><span
|
||||
class="cmmi-10">avgNR</span>) = <span
|
||||
class="cmmi-10">m</span><span
|
||||
class="cmsy-10">* </span>(<span
|
||||
class="cmmi-10">maxNR</span><span
|
||||
class="cmsy-10">-</span><span
|
||||
class="cmmi-10">avgNR</span>)] for both <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">JA</span></span></span> arrays,
|
||||
where <span
|
||||
class="cmmi-10">m </span>is equal to the number of rows of the matrix, <span
|
||||
class="cmmi-10">maxNR </span>is the maximum
|
||||
number of nonzero elements in any row and <span
|
||||
class="cmmi-10">avgNR </span>is the average number of
|
||||
nonzeros. Therefore a single densely populated row can seriously affect the total size
|
||||
of the allocation.
|
||||
<!--l. 317--><p class="indent" > To limit this effect, in the HLL format we break the original matrix into equally
|
||||
sized groups of rows (called <span
|
||||
class="cmti-10">hacks</span>), and then store these groups as independent
|
||||
matrices in ELLPACK format. The groups can be arranged selecting rows in an
|
||||
arbitrary manner; indeed, if the rows are sorted by decreasing number of nonzeros
|
||||
we obtain essentially the JAgged Diagonals format. If the rows are not in the original
|
||||
order, then an additional vector <span
|
||||
class="cmti-10">rIdx </span>is required, storing the actual row index for
|
||||
each row in the data structure.
|
||||
<!--l. 327--><p class="indent" > The multiple ELLPACK-like buffers are stacked together inside a single, one
|
||||
dimensional array; an additional vector <span
|
||||
class="cmti-10">hackOffsets </span>is provided to keep track of the
|
||||
individual submatrices. All hacks have the same number of rows <span
|
||||
class="cmti-10">hackSize</span>; hence, the
|
||||
<span
|
||||
class="cmti-10">hackOffsets </span>vector is an array of (<span
|
||||
class="cmmi-10">m&#8725;hackSize</span>) + 1 elements, each one pointing to
|
||||
the first index of a submatrix inside the stacked <span
|
||||
class="cmti-10">cM</span>/<span
|
||||
class="cmti-10">rP </span>buffers, plus an additional
|
||||
element pointing past the end of the last block, where the next one would begin. We
|
||||
thus have the property that the elements of the <span
|
||||
class="cmmi-10">k</span>-th <span
|
||||
class="cmti-10">hack </span>are stored between
|
||||
<span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">hackOffsets[k]</span></span></span> and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">hackOffsets[k+1]</span></span></span>, similarly to what happens in the CSR
|
||||
format.
|
||||
<!--l. 342--><p class="indent" > <hr class="figure"><div class="figure"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<a
|
||||
id="x19-149001r7"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 346--><p class="noindent" ><img
|
||||
src="hll.png" alt="Hacked ELLPACK compression of the matrix in Figure 5"
|
||||
width="248" height="248" >
|
||||
<br /> <div class="caption"
|
||||
><span class="id">Figure 7: </span><span
|
||||
class="content">Hacked ELLPACK compression of matrix in Figure <a
|
||||
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-149001r7 -->
|
||||
|
||||
|
||||
|
||||
<!--l. 352--><p class="indent" > </div><hr class="endfigure">
|
||||
<!--l. 354--><p class="indent" > With this data structure a very long row only affects one hack, and therefore the
|
||||
additional memory is limited to the hack in which the row appears.
|
||||
<!--l. 358--><p class="indent" > The relevant data type is <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_T_hll_sparse_mat</span></span></span>:
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 374--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-105">
|
||||
  type, extends(psb_d_base_sparse_mat) :: psb_d_hll_sparse_mat
|
||||
    !
|
||||
    ! HLL format. (Hacked ELL)
|
||||
    !
|
||||
    integer(psb_ipk_) :: hksz
|
||||
    integer(psb_ipk_), allocatable :: irn(:), ja(:), idiag(:), hkoffs(:)
|
||||
    real(psb_dpk_), allocatable :: val(:)
|
||||
|
||||
  contains
|
||||
   ....
|
||||
  end type
|
||||
</pre>
|
||||
<!--l. 388--><p class="nopar" > </div></div>
|
||||
<h5 class="likesubsubsectionHead"><a
|
||||
id="x19-150000"></a>Diagonal storage</h5>
|
||||
<!--l. 396--><p class="noindent" >The DIAgonal (DIA) format (shown in Figure <a
|
||||
href="#x19-150001r8">8<!--tex4ht:ref: fig:dia --></a>) has a 2-dimensional array <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span>
|
||||
containing in each column the coefficients along a diagonal of the matrix, and an
|
||||
integer array <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">OFFSET</span></span></span> that determines where each diagonal starts. The diagonals in <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span>
|
||||
are padded with zeros as necessary.
|
||||
<!--l. 402--><p class="indent" > The code to compute the matrix-vector product <span
|
||||
class="cmmi-10">y </span>= <span
|
||||
class="cmmi-10">Ax </span>is shown in Alg. <a
|
||||
href="#x19-150003r2">2<!--tex4ht:ref: alg:dia --></a>; it
|
||||
costs one memory read per outer iteration, plus three memory reads, one memory
|
||||
write and two floating-point operations per inner iteration. The accesses to
|
||||
<span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">x</span></span></span> are in strict sequential order, therefore no indirect addressing is
|
||||
required.
|
||||
<!--l. 409--><p class="indent" > <hr class="figure"><div class="figure"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<a
|
||||
id="x19-150001r8"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 413--><p class="noindent" ><img
|
||||
src="dia.png" alt="DIA compression of the matrix in Figure 5"
|
||||
width="248" height="248" >
|
||||
<br /> <div class="caption"
|
||||
><span class="id">Figure 8: </span><span
|
||||
class="content">DIA compression of matrix in Figure <a
|
||||
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-150001r8 -->
|
||||
|
||||
|
||||
|
||||
<!--l. 419--><p class="indent" > </div><hr class="endfigure">
|
||||
<a
|
||||
id="x19-150002r2"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 423--><p class="indent" > <hr class="float"><div class="float"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 437--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-106">
|
||||
    do j=1,ndiag
|
||||
      if (offset(j) > 0) then
|
||||
        ir1 = 1; ir2 = m - offset(j);
|
||||
      else
|
||||
        ir1 = 1 - offset(j); ir2 = m;
|
||||
      end if
|
||||
      do i=ir1,ir2
|
||||
        y(i) = y(i) + alpha*as(i,j)*x(i+offset(j))
|
||||
      end do
|
||||
    end do
|
||||
</pre>
|
||||
<!--l. 450--><p class="nopar" > </div></div>
|
||||
<a
|
||||
id="x19-150003r2"></a>
|
||||
<a
|
||||
id="x19-150004"></a>
|
||||
<span
|
||||
class="cmbx-10">Algorithm</span><span
|
||||
class="cmbx-10"> 2:</span>  Matrix-Vector product in DIA format
|
||||
|
||||
|
||||
|
||||
</div><hr class="endfloat" />
|
||||
<!--l. 458--><p class="indent" > The relevant data type is <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_T_dia_sparse_mat</span></span></span>:
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 473--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-107">
|
||||
  type, extends(psb_d_base_sparse_mat) :: psb_d_dia_sparse_mat
|
||||
    !
|
||||
    ! DIA format, extended.
|
||||
    !
|
||||
|
||||
    integer(psb_ipk_), allocatable :: offset(:)
|
||||
    integer(psb_ipk_) :: nzeros
|
||||
    real(psb_dpk_), allocatable :: data(:,:)
|
||||
|
||||
  end type
|
||||
</pre>
|
||||
<!--l. 486--><p class="nopar" > </div></div>
|
||||
<h5 class="likesubsubsectionHead"><a
|
||||
id="x19-151000"></a>Hacked DIA</h5>
|
||||
<!--l. 495--><p class="noindent" >Storage by DIAgonals is an attractive option for matrices whose coefficients are
|
||||
located on a small set of diagonals, since they do away with storing explicitly the
|
||||
indices and therefore reduce significantly memory traffic. However, having a few
|
||||
coefficients outside of the main set of diagonals may significantly increase the
|
||||
amount of needed padding; moreover, while the DIA code is easily vectorized,
|
||||
it does not necessarily make optimal use of the memory hierarchy. While
|
||||
processing each diagonal we are updating entries in the output vector <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">y</span></span></span>,
|
||||
which is then accessed multiple times; if the vector <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">y</span></span></span> is too large to remain
|
||||
in the cache memory, the associated cache miss penalty is paid multiple
|
||||
times.
|
||||
<!--l. 507--><p class="indent" > The <span
|
||||
class="cmti-10">hacked DIA </span>(<span
|
||||
class="cmbx-10">HDIA</span>) format was designed to contain the amount of padding,
|
||||
by breaking the original matrix into equally sized groups of rows (<span
|
||||
class="cmti-10">hacks</span>), and then
|
||||
storing these groups as independent matrices in DIA format. This approach is similar
|
||||
to that of HLL, and requires using an offset vector for each submatrix. Again,
|
||||
similarly to HLL, the various submatrices are stacked inside a linear array to
|
||||
improve memory management. The fact that the matrix is accessed in slices
|
||||
helps in reducing cache misses, especially regarding accesses to the vector
|
||||
<span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">y</span></span></span>.
|
||||
<!--l. 519--><p class="indent" > An additional vector <span
|
||||
class="cmti-10">hackOffsets </span>is provided to complete the matrix format; given
|
||||
that <span
|
||||
class="cmti-10">hackSize </span>is the number of rows of each hack, the <span
|
||||
class="cmti-10">hackOffsets </span>vector is made by
|
||||
an array of (<span
|
||||
class="cmmi-10">m&#8725;hackSize</span>) + 1 elements, pointing to the first diagonal offset of a
|
||||
submatrix inside the stacked <span
|
||||
class="cmti-10">offsets </span>buffers, plus an additional element equal to the
|
||||
number of nonzero diagonals in the whole matrix. We thus have the property that
|
||||
the number of diagonals of the <span
|
||||
class="cmmi-10">k</span>-th <span
|
||||
class="cmti-10">hack </span>is given by <span
|
||||
class="cmti-10">hackOffsets[k+1] -</span>
|
||||
<span
|
||||
class="cmti-10">hackOffsets[k]</span>.
|
||||
<!--l. 529--><p class="indent" > <hr class="figure"><div class="figure"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<a
|
||||
id="x19-151001r9"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 533--><p class="noindent" ><img
|
||||
src="hdia.png" alt="Hacked DIA compression of the matrix in Figure 5"
|
||||
width="248" height="248" >
|
||||
<br /> <div class="caption"
|
||||
><span class="id">Figure 9: </span><span
|
||||
class="content">Hacked DIA compression of matrix in Figure <a
|
||||
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-151001r9 -->
|
||||
|
||||
|
||||
|
||||
<!--l. 539--><p class="indent" > </div><hr class="endfigure">
|
||||
<!--l. 541--><p class="indent" > The relevant data type is <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_T_hdia_sparse_mat</span></span></span>:
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 568--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-108">
|
||||
  type pm
|
||||
     real(psb_dpk_), allocatable  :: data(:,:)
|
||||
  end type pm
|
||||
|
||||
  type po
|
||||
     integer(psb_ipk_), allocatable  :: off(:)
|
||||
  end type po
|
||||
|
||||
  type, extends(psb_d_base_sparse_mat) :: psb_d_hdia_sparse_mat
|
||||
    !
|
||||
    ! HDIA format, extended.
|
||||
    !
|
||||
|
||||
    type(pm), allocatable :: hdia(:)
|
||||
    type(po), allocatable :: offset(:)
|
||||
    integer(psb_ipk_) :: nblocks, nzeros
|
||||
    integer(psb_ipk_) :: hack = 64
|
||||
    integer(psb_long_int_k_) :: dim=0
|
||||
|
||||
  contains
|
||||
   ....
|
||||
  end type
|
||||
</pre>
|
||||
<!--l. 593--><p class="nopar" > </div></div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<h4 class="subsectionHead"><span class="titlemark">12.4 </span> <a
|
||||
id="x19-15200012.4"></a>CUDA-class extensions</h4>
|
||||
<!--l. 4--><p class="noindent" >For computing with CUDA we define a dual storage strategy in which each
|
||||
variable on the CPU (&ldquo;host&rdquo;) side has a GPU (&ldquo;device&rdquo;) side. When a GPU-type
|
||||
variable is initialized, the data contained is (usually) the same on both sides. Each
|
||||
operator invoked on the variable may change the data so that only the host side or
|
||||
the device side is up-to-date.
|
||||
<!--l. 11--><p class="indent" > Keeping track of the updates to data in the variables is essential: we want to
|
||||
perform most computations on the GPU, but we cannot afford the time needed to
|
||||
move data between the host memory and the device memory because the bandwidth
|
||||
of the interconnection bus would become the main bottleneck of the computation.
|
||||
Thus, each and every computational routine in the library is built according to the
|
||||
following principles:
|
||||
<ul class="itemize1">
|
||||
<li class="itemize">
|
||||
<!--l. 18--><p class="noindent" >If the data type being handled is GPU-enabled, make sure that its device
|
||||
copy is up to date, perform any arithmetic operation on the GPU, and
|
||||
if the data has been altered as a result, mark the main-memory copy as
|
||||
outdated.
|
||||
</li>
|
||||
<li class="itemize">
|
||||
<!--l. 22--><p class="noindent" >The main-memory copy is never updated unless this is requested by the user
|
||||
either
|
||||
<dl class="description"><dt class="description">
|
||||
<!--l. 25--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">explicitly</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 25--><p class="noindent" >by invoking a synchronization method;
|
||||
</dd><dt class="description">
|
||||
<!--l. 26--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">implicitly</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 26--><p class="noindent" >by invoking a method that involves other data items that are not
|
||||
GPU-enabled, e.g., by assignment of a vector to a normal array.</dd></dl>
|
||||
</li></ul>
|
||||
<!--l. 31--><p class="noindent" >In this way, data items are put on the GPU memory &ldquo;on demand&rdquo; and remain there as
|
||||
long as &ldquo;normal&rdquo; computations are carried out. As an example, the following call to a
|
||||
matrix-vector product
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 39--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-109">
|
||||
    call psb_spmm(alpha,a,x,beta,y,desc_a,info)
|
||||
</pre>
|
||||
<!--l. 43--><p class="nopar" > </div></div>
|
||||
|
||||
|
||||
|
||||
<!--l. 47--><p class="noindent" >will transparently and automatically be performed on the GPU whenever all three data
|
||||
inputs <code class="lstinline"><span style="color:#000000">a</span></code>, <code class="lstinline"><span style="color:#000000">x</span></code> and <code class="lstinline"><span style="color:#000000">y</span></code> are GPU-enabled. If a program makes many such calls sequentially,
|
||||
then
|
||||
<ul class="itemize1">
|
||||
<li class="itemize">
|
||||
<!--l. 52--><p class="noindent" >The first kernel invocation will find the data in main memory, and will
|
||||
copy it to the GPU memory, thus incurring a significant overhead; the
|
||||
result is however <span
|
||||
class="cmti-10">not </span>copied back, and therefore:
|
||||
</li>
|
||||
<li class="itemize">
|
||||
<!--l. 56--><p class="noindent" >Subsequent kernel invocations involving the same vector will find the data
|
||||
on the GPU side so that they will run at full speed.</li></ul>
|
||||
<!--l. 60--><p class="noindent" >For all invocations after the first the only data that will have to be transferred to/from
|
||||
the main memory will be the scalars <code class="lstinline"><span style="color:#000000">alpha</span></code> and <code class="lstinline"><span style="color:#000000">beta</span></code>, and the return code
|
||||
<code class="lstinline"><span style="color:#000000">info</span></code>.
|
||||
<!--l. 64--><p class="indent" >
|
||||
<dl class="description"><dt class="description">
|
||||
<!--l. 65--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">Vectors:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 65--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_vect_gpu</span></code> provides a GPU-enabled extension of
|
||||
the inner type <code class="lstinline"><span style="color:#000000">psb_T_base_vect_type</span></code>, and must be used together with
|
||||
the other inner matrix type to make full use of the GPU computational
|
||||
capabilities;
|
||||
</dd><dt class="description">
|
||||
<!--l. 69--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">CSR:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 69--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_csrg_sparse_mat</span></code> provides an interface to the GPU
|
||||
version of CSR available in the NVIDIA CuSPARSE library;
|
||||
</dd><dt class="description">
|
||||
<!--l. 72--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">HYB:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 72--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_hybg_sparse_mat</span></code> provides an interface to the HYB
|
||||
GPU storage available in the NVIDIA CuSPARSE library. The internal
|
||||
structure is opaque, hence the host side is just CSR; the HYB data format
|
||||
is only available up to CUDA version 10.
|
||||
</dd><dt class="description">
|
||||
<!--l. 77--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">ELL:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 77--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_elg_sparse_mat</span></code> provides an interface to the
|
||||
ELLPACK implementation from SPGPU;
|
||||
|
||||
|
||||
|
||||
</dd><dt class="description">
|
||||
<!--l. 80--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">HLL:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 80--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_hlg_sparse_mat</span></code> provides an interface to the Hacked
|
||||
ELLPACK implementation from SPGPU;
|
||||
</dd><dt class="description">
|
||||
<!--l. 82--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">HDIA:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 82--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_hdiag_sparse_mat</span></code> provides an interface to the
|
||||
Hacked DIAgonals implementation from SPGPU;</dd></dl>
|
||||
|
||||
|
||||
|
||||
<!--l. 87--><div class="crosslinks"><p class="noindent">[<a
|
||||
href="userhtmlse11.html" >prev</a>] [<a
|
||||
href="userhtmlse11.html#tailuserhtmlse11.html" >prev-tail</a>] [<a
|
||||
href="userhtmlse12.html" >front</a>] [<a
|
||||
href="userhtml.html# " >up</a>] </p></div>
|
||||
<!--l. 87--><p class="indent" > <a
|
||||
id="tailuserhtmlse12.html"></a>
|
||||
</body></html>
|
@ -0,0 +1,299 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html >
|
||||
<head><title>CUDA Environment Routines</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="generator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<meta name="originator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<!-- html,3 -->
|
||||
<meta name="src" content="userhtml.tex">
|
||||
<link rel="stylesheet" type="text/css" href="userhtml.css">
|
||||
</head><body
|
||||
>
|
||||
<!--l. 87--><div class="crosslinks"><p class="noindent">[<a
|
||||
href="userhtmlse12.html" >prev</a>] [<a
|
||||
href="userhtmlse12.html#tailuserhtmlse12.html" >prev-tail</a>] [<a
|
||||
href="userhtmlse13.html#tailuserhtmlse13.html">tail</a>] [<a
|
||||
href="userhtml.html# " >up</a>] </p></div>
|
||||
<h3 class="sectionHead"><span class="titlemark">13 </span> <a
|
||||
id="x20-15300013"></a>CUDA Environment Routines</h3>
|
||||
<!--l. 91--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-154000"></a>psb_cuda_init &mdash; Initializes PSBLAS-CUDA environment</h4>
|
||||
<a
|
||||
id="Q1-20-191"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 99--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-110">
|
||||
call psb_cuda_init(ctxt [, device])
|
||||
</pre>
|
||||
<!--l. 103--><p class="nopar" > </div></div>
|
||||
<!--l. 108--><p class="noindent" >This subroutine initializes the PSBLAS-CUDA environment.
|
||||
<dl class="description"><dt class="description">
|
||||
<!--l. 110--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">Type:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 110--><p class="noindent" >Synchronous.
|
||||
</dd><dt class="description">
|
||||
<!--l. 111--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">On Entry</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 111--><p class="noindent" >
|
||||
</dd><dt class="description">
|
||||
<!--l. 112--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">device</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 112--><p class="noindent" >ID of CUDA device to attach to.<br
|
||||
class="newline" />Scope: <span
|
||||
class="cmbx-10">local</span>.<br
|
||||
class="newline" />Type: <span
|
||||
class="cmbx-10">optional</span>.<br
|
||||
class="newline" />Intent: <span
|
||||
class="cmbx-10">in</span>.<br
|
||||
class="newline" />Specified as: an integer value.  Default: use <code class="lstinline"><span style="color:#000000">mod</span><span style="color:#000000">(</span><span style="color:#000000">iam</span><span style="color:#000000">,</span><span style="color:#000000">ngpu</span><span style="color:#000000">)</span></code> where <code class="lstinline"><span style="color:#000000">iam</span></code> is
|
||||
the calling process index and <code class="lstinline"><span style="color:#000000">ngpu</span></code> is the total number of CUDA devices
|
||||
available on the current node.</dd></dl>
|
||||
<!--l. 123--><p class="noindent" ><span
|
||||
class="cmbx-12">Notes</span>
|
||||
|
||||
|
||||
|
||||
<ol class="enumerate1" >
|
||||
<li
|
||||
class="enumerate" id="x20-154002x1">
|
||||
<!--l. 125--><p class="noindent" >A call to this routine must precede any other PSBLAS-CUDA call.</li></ol>
|
||||
<!--l. 129--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-155000"></a>psb_cuda_exit &mdash; Exit from PSBLAS-CUDA environment</h4>
|
||||
<a
|
||||
id="Q1-20-193"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 137--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-111">
|
||||
call psb_cuda_exit(ctxt)
|
||||
</pre>
|
||||
<!--l. 141--><p class="nopar" > </div></div>
|
||||
<!--l. 146--><p class="noindent" >This subroutine exits from the PSBLAS CUDA context.
|
||||
<dl class="description"><dt class="description">
|
||||
<!--l. 148--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">Type:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 148--><p class="noindent" >Synchronous.
|
||||
</dd><dt class="description">
|
||||
<!--l. 149--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">On Entry</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 149--><p class="noindent" >
|
||||
</dd><dt class="description">
|
||||
<!--l. 150--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">ctxt</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 150--><p class="noindent" >the communication context identifying the virtual parallel machine.<br
|
||||
class="newline" />Scope: <span
|
||||
class="cmbx-10">global</span>.<br
|
||||
class="newline" />Type: <span
|
||||
class="cmbx-10">required</span>.<br
|
||||
class="newline" />Intent: <span
|
||||
class="cmbx-10">in</span>.<br
|
||||
class="newline" />Specified as: an integer variable.</dd></dl>
|
||||
<!--l. 161--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-156000"></a>psb_cuda_DeviceSync &mdash; Synchronize CUDA device</h4>
|
||||
<a
|
||||
id="Q1-20-195"></a>
|
||||
|
||||
|
||||
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 169--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-112">
|
||||
call psb_cuda_DeviceSync()
|
||||
</pre>
|
||||
<!--l. 173--><p class="nopar" > </div></div>
|
||||
<!--l. 178--><p class="noindent" >This subroutine ensures that all previously invoked kernels, i.e. all invocations of
|
||||
CUDA-side code, have completed.
|
||||
<!--l. 182--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-157000"></a>psb_cuda_getDeviceCount </h4>
|
||||
<a
|
||||
id="Q1-20-197"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 190--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-113">
|
||||
ngpus =  psb_cuda_getDeviceCount()
|
||||
</pre>
|
||||
<!--l. 194--><p class="nopar" > </div></div>
|
||||
<!--l. 199--><p class="noindent" >Get number of devices available on current computing node.
|
||||
<!--l. 201--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-158000"></a>psb_cuda_getDevice </h4>
|
||||
<a
|
||||
id="Q1-20-199"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 209--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-114">
|
||||
dev = psb_cuda_getDevice()
|
||||
</pre>
|
||||
<!--l. 213--><p class="nopar" > </div></div>
|
||||
<!--l. 218--><p class="noindent" >Get device in use by current process.
|
||||
<!--l. 220--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-159000"></a>psb_cuda_setDevice </h4>
|
||||
<a
|
||||
id="Q1-20-201"></a>
|
||||
|
||||
|
||||
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 228--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-115">
|
||||
info = psb_cuda_setDevice(dev)
|
||||
</pre>
|
||||
<!--l. 232--><p class="nopar" > </div></div>
|
||||
<!--l. 237--><p class="noindent" >Set device to be used by current process.
|
||||
<!--l. 239--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-160000"></a>psb_cuda_DeviceHasUVA </h4>
|
||||
<a
|
||||
id="Q1-20-203"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 247--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-116">
|
||||
hasUva = psb_cuda_DeviceHasUVA()
|
||||
</pre>
|
||||
<!--l. 251--><p class="nopar" > </div></div>
|
||||
<!--l. 256--><p class="noindent" >Returns true if device currently in use supports UVA (Unified Virtual Addressing).
|
||||
<!--l. 259--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-161000"></a>psb_cuda_WarpSize </h4>
|
||||
<a
|
||||
id="Q1-20-205"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 267--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-117">
|
||||
nw = psb_cuda_WarpSize()
|
||||
</pre>
|
||||
<!--l. 271--><p class="nopar" > </div></div>
|
||||
<!--l. 276--><p class="noindent" >Returns the warp size.
|
||||
<!--l. 279--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-162000"></a>psb_cuda_MultiProcessors </h4>
|
||||
<a
|
||||
id="Q1-20-207"></a>
|
||||
|
||||
|
||||
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 287--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-118">
|
||||
nmp = psb_cuda_MultiProcessors()
|
||||
</pre>
|
||||
<!--l. 291--><p class="nopar" > </div></div>
|
||||
<!--l. 296--><p class="noindent" >Returns the number of multiprocessors in the CUDA device.
|
||||
<!--l. 298--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-163000"></a>psb_cuda_MaxThreadsPerMP </h4>
|
||||
<a
|
||||
id="Q1-20-209"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 306--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-119">
|
||||
nt = psb_cuda_MaxThreadsPerMP()
|
||||
</pre>
|
||||
<!--l. 310--><p class="nopar" > </div></div>
|
||||
<!--l. 315--><p class="noindent" >Returns the maximum number of threads per multiprocessor.
|
||||
<!--l. 318--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-164000"></a>psb_cuda_MaxRegistersPerBlock </h4>
|
||||
<a
|
||||
id="Q1-20-211"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 326--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-120">
|
||||
nr = psb_cuda_MaxRegistersPerBlock()
|
||||
</pre>
|
||||
<!--l. 330--><p class="nopar" > </div></div>
|
||||
<!--l. 335--><p class="noindent" >Returns the maximum number of registers per thread block.
|
||||
<!--l. 338--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-165000"></a>psb_cuda_MemoryClockRate </h4>
|
||||
<a
|
||||
id="Q1-20-213"></a>
|
||||
|
||||
|
||||
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 346--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-121">
|
||||
cl = psb_cuda_MemoryClockRate()
|
||||
</pre>
|
||||
<!--l. 350--><p class="nopar" > </div></div>
|
||||
<!--l. 355--><p class="noindent" >Returns the memory clock rate in kHz, as an integer.
|
||||
<!--l. 357--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-166000"></a>psb_cuda_MemoryBusWidth </h4>
|
||||
<a
|
||||
id="Q1-20-215"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 365--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-122">
|
||||
nb = psb_cuda_MemoryBusWidth()
|
||||
</pre>
|
||||
<!--l. 369--><p class="nopar" > </div></div>
|
||||
<!--l. 374--><p class="noindent" >Returns the memory bus width in bits.
|
||||
<!--l. 376--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-167000"></a>psb_cuda_MemoryPeakBandwidth </h4>
|
||||
<a
|
||||
id="Q1-20-217"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 384--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-123">
|
||||
bw = psb_cuda_MemoryPeakBandwidth()
|
||||
</pre>
|
||||
<!--l. 388--><p class="nopar" > </div></div>
|
||||
<!--l. 392--><p class="noindent" >Returns the peak memory bandwidth in MB/s (real double precision).
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<!--l. 126--><p class="indent" >
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<!--l. 2--><div class="crosslinks"><p class="noindent">[<a
|
||||
href="userhtmlse12.html" >prev</a>] [<a
|
||||
href="userhtmlse12.html#tailuserhtmlse12.html" >prev-tail</a>] [<a
|
||||
href="userhtmlse13.html" >front</a>] [<a
|
||||
href="userhtml.html# " >up</a>] </p></div>
|
||||
<!--l. 2--><p class="indent" > <a
|
||||
id="tailuserhtmlse13.html"></a>
|
||||
</body></html>
|
After Width: | Height: | Size: 50 KiB |
After Width: | Height: | Size: 52 KiB |
After Width: | Height: | Size: 58 KiB |
After Width: | Height: | Size: 61 KiB |
After Width: | Height: | Size: 90 KiB |