Merge branch 'repackage' into development
Before Width: | Height: | Size: 334 B After Width: | Height: | Size: 328 B |
Before Width: | Height: | Size: 382 B After Width: | Height: | Size: 366 B |
Before Width: | Height: | Size: 296 B After Width: | Height: | Size: 289 B |
After Width: | Height: | Size: 50 KiB |
After Width: | Height: | Size: 52 KiB |
After Width: | Height: | Size: 58 KiB |
After Width: | Height: | Size: 61 KiB |
After Width: | Height: | Size: 90 KiB |
After Width: | Height: | Size: 29 KiB |
Before Width: | Height: | Size: 1.6 KiB After Width: | Height: | Size: 1.5 KiB |
Before Width: | Height: | Size: 2.1 KiB After Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 1.1 KiB |
Before Width: | Height: | Size: 2.0 KiB After Width: | Height: | Size: 1.7 KiB |
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 1.6 KiB After Width: | Height: | Size: 1.3 KiB |
@ -0,0 +1,19 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html >
|
||||
<head><title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="generator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<meta name="originator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<!-- html,3 -->
|
||||
<meta name="src" content="userhtml.tex">
|
||||
<link rel="stylesheet" type="text/css" href="userhtml.css">
|
||||
</head><body
|
||||
>
|
||||
<div class="footnote-text">
|
||||
<!--l. 72--><p class="indent" > <span class="footnote-mark"><a
|
||||
id="fn4x0"><a
|
||||
id="x16-136002x10.1"></a> <sup class="textsuperscript">4</sup></a></span><span
|
||||
class="cmr-8">The string is case-insensitive</span></div>
|
||||
|
||||
</body></html>
|
Before Width: | Height: | Size: 1.1 KiB After Width: | Height: | Size: 968 B |
Before Width: | Height: | Size: 1.2 KiB After Width: | Height: | Size: 1.1 KiB |
@ -0,0 +1,20 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html >
|
||||
<head><title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="generator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<meta name="originator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<!-- html,3 -->
|
||||
<meta name="src" content="userhtml.tex">
|
||||
<link rel="stylesheet" type="text/css" href="userhtml.css">
|
||||
</head><body
|
||||
>
|
||||
<div class="footnote-text">
|
||||
<!--l. 53--><p class="noindent" ><span class="footnote-mark"><a
|
||||
id="fn5x0"><a
|
||||
id="x18-143004x11.1"></a> <sup class="textsuperscript">5</sup></a></span><span
|
||||
class="cmr-8">Note: the implementation is for </span><span
|
||||
class="cmmi-8">FCG</span><span
|
||||
class="cmr-8">(1).</span></div>
|
||||
</body></html>
|
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 1.1 KiB |
Before Width: | Height: | Size: 1.4 KiB After Width: | Height: | Size: 1.3 KiB |
Before Width: | Height: | Size: 1.1 KiB After Width: | Height: | Size: 1.0 KiB |
Before Width: | Height: | Size: 1.4 KiB After Width: | Height: | Size: 1.3 KiB |
Before Width: | Height: | Size: 10 KiB After Width: | Height: | Size: 7.5 KiB |
Before Width: | Height: | Size: 1.4 KiB After Width: | Height: | Size: 1.3 KiB |
Before Width: | Height: | Size: 1.2 KiB After Width: | Height: | Size: 1.0 KiB |
Before Width: | Height: | Size: 1.0 KiB After Width: | Height: | Size: 970 B |
Before Width: | Height: | Size: 439 B After Width: | Height: | Size: 420 B |
Before Width: | Height: | Size: 763 B After Width: | Height: | Size: 710 B |
Before Width: | Height: | Size: 1.9 KiB After Width: | Height: | Size: 1.7 KiB |
Before Width: | Height: | Size: 2.0 KiB After Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 2.4 KiB After Width: | Height: | Size: 2.0 KiB |
Before Width: | Height: | Size: 1.1 KiB After Width: | Height: | Size: 1016 B |
Before Width: | Height: | Size: 2.6 KiB After Width: | Height: | Size: 2.1 KiB |
Before Width: | Height: | Size: 1.7 KiB After Width: | Height: | Size: 1.5 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.5 KiB |
Before Width: | Height: | Size: 1.1 KiB After Width: | Height: | Size: 1.0 KiB |
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.6 KiB |
Before Width: | Height: | Size: 1.5 KiB After Width: | Height: | Size: 1.3 KiB |
@ -0,0 +1,24 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html >
|
||||
<head><title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="generator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<meta name="originator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<!-- html,3 -->
|
||||
<meta name="src" content="userhtml.tex">
|
||||
<link rel="stylesheet" type="text/css" href="userhtml.css">
|
||||
</head><body
|
||||
>
|
||||
<div class="footnote-text">
|
||||
<!--l. 195--><p class="noindent" ><span class="footnote-mark"><a
|
||||
id="fn2x0"><a
|
||||
id="x6-4002x2.1"></a> <sup class="textsuperscript">2</sup></a></span><span
|
||||
class="cmr-8">This is the normal situation when the pattern of the sparse matrix is symmetric, which is</span>
|
||||
<span
|
||||
class="cmr-8">equivalent to say that the interaction between two variables is reciprocal. If the matrix pattern is</span>
|
||||
<span
|
||||
class="cmr-8">non-symmetric we may have one-way interactions, and these could cause a situation in which a</span>
|
||||
<span
|
||||
class="cmr-8">boundary point is not a halo point for its neighbour.</span></div>
|
||||
</body></html>
|
Before Width: | Height: | Size: 2.5 KiB After Width: | Height: | Size: 2.1 KiB |
Before Width: | Height: | Size: 2.1 KiB After Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 1.2 KiB After Width: | Height: | Size: 1.0 KiB |
Before Width: | Height: | Size: 2.0 KiB After Width: | Height: | Size: 1.7 KiB |
@ -0,0 +1,921 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html >
|
||||
<head><title>Extensions</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="generator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<meta name="originator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<!-- html,3 -->
|
||||
<meta name="src" content="userhtml.tex">
|
||||
<link rel="stylesheet" type="text/css" href="userhtml.css">
|
||||
</head><body
|
||||
>
|
||||
<!--l. 1--><div class="crosslinks"><p class="noindent">[<a
|
||||
href="userhtmlse11.html" >prev</a>] [<a
|
||||
href="userhtmlse11.html#tailuserhtmlse11.html" >prev-tail</a>] [<a
|
||||
href="userhtmlse9.html#tailuserhtmlse12.html">tail</a>] [<a
|
||||
href="userhtml.html# " >up</a>] </p></div>
|
||||
<h3 class="sectionHead"><span class="titlemark">12 </span> <a
|
||||
id="x19-14400012"></a>Extensions</h3>
|
||||
<!--l. 3--><p class="noindent" >The EXT, CUDA and RSB subdirectories contains a set of extensions to the base
|
||||
library. The extensions provide additional storage formats beyond the ones already
|
||||
contained in the base library, as well as interfaces to:
|
||||
<dl class="description"><dt class="description">
|
||||
<!--l. 8--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">SPGPU</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 8--><p class="noindent" >a CUDA library originally
|
||||
published as <a
|
||||
href="https://code.google.com/p/spgpu/" class="url" ><span
|
||||
class="cmtt-10">https://code.google.com/p/spgpu/</span></a> and now included
|
||||
in the <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">cuda</span></span></span> subdir, for computations on NVIDIA GPUs;
|
||||
</dd><dt class="description">
|
||||
<!--l. 11--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">LIBRSB</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 11--><p class="noindent" ><a
|
||||
href="http://sourceforge.net/projects/librsb/" class="url" ><span
|
||||
class="cmtt-10">http://sourceforge.net/projects/librsb/</span></a>, for computations on
|
||||
multicore parallel machines.</dd></dl>
|
||||
<!--l. 14--><p class="noindent" >The infrastructure laid out in the base library to allow for these extensions is detailed in
|
||||
the references <span class="cite">[<a
|
||||
href="userhtmlli2.html#XDesPat:11">20</a>, <a
|
||||
href="userhtmlli2.html#XCaFiRo:2014">21</a>, <a
|
||||
href="userhtmlli2.html#XSparse03">10</a>]</span>; the CUDA-specific data formats are described
|
||||
in <span class="cite">[<a
|
||||
href="userhtmlli2.html#XOurTechRep">22</a>]</span>.
|
||||
<!--l. 19--><p class="noindent" >
|
||||
<h4 class="subsectionHead"><span class="titlemark">12.1 </span> <a
|
||||
id="x19-14500012.1"></a>Using the extensions</h4>
|
||||
<!--l. 21--><p class="noindent" >A sample application using the PSBLAS extensions will contain the following
|
||||
steps:
|
||||
<ul class="itemize1">
|
||||
<li class="itemize">
|
||||
<!--l. 24--><p class="noindent" ><span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">USE</span></span></span> the appropriat modules (<span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_ext_mod</span></span></span>, <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_cuda_mod</span></span></span>);
|
||||
</li>
|
||||
<li class="itemize">
|
||||
<!--l. 26--><p class="noindent" >Declare a <span
|
||||
class="cmti-10">mold </span>variable of the necessary type (e.g.
|
||||
<span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_d_ell_sparse_mat</span></span></span>, <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_d_hlg_sparse_mat</span></span></span>, <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_d_vect_cuda</span></span></span>);
|
||||
</li>
|
||||
<li class="itemize">
|
||||
<!--l. 29--><p class="noindent" >Pass the mold variable to the base library interface where needed to ensure
|
||||
the appropriate dynamic type.</li></ul>
|
||||
|
||||
|
||||
|
||||
<!--l. 32--><p class="noindent" >Suppose you want to use the CUDA-enabled ELLPACK data structure; you would use a
|
||||
piece of code like this (and don’t forget, you need CUDA-side vectors along with the
|
||||
matrices):
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 85--><p class="noindent" >
|
||||
|
||||
|
||||
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-103">
|
||||
program my_cuda_test
|
||||
  use psb_base_mod
|
||||
  use psb_util_mod
|
||||
  use psb_ext_mod
|
||||
  use psb_cuda_mod
|
||||
  type(psb_dspmat_type) :: a, agpu
|
||||
  type(psb_d_vect_type) :: x, xg, bg
|
||||
|
||||
  real(psb_dpk_), allocatable :: xtmp(:)
|
||||
  type(psb_d_vect_cuda)       :: vmold
|
||||
  type(psb_d_elg_sparse_mat) :: aelg
|
||||
  type(psb_ctxt_type) :: ctxt
|
||||
  integer             :: iam, np
|
||||
|
||||
|
||||
  call psb_init(ctxt)
|
||||
  call psb_info(ctxt,iam,np)
|
||||
  call psb_cuda_init(ctxt, iam)
|
||||
|
||||
|
||||
  ! My own home-grown matrix generator
|
||||
  call gen_matrix(ctxt,idim,desc_a,a,x,info)
|
||||
  if (info /= 0) goto 9999
|
||||
|
||||
  call a%cscnv(agpu,info,mold=aelg)
|
||||
  if (info /= 0) goto 9999
|
||||
  xtmp = x%get_vect()
|
||||
  call xg%bld(xtmp,mold=vmold)
|
||||
  call bg%bld(size(xtmp),mold=vmold)
|
||||
|
||||
  ! Do sparse MV
|
||||
  call psb_spmm(done,agpu,xg,dzero,bg,desc_a,info)
|
||||
|
||||
|
||||
9999 continue
|
||||
  if (info == 0) then
|
||||
     write(*,*) ’42’
|
||||
  else
|
||||
     write(*,*) ’Something went wrong ’,info
|
||||
  end if
|
||||
|
||||
|
||||
  call psb_cuda_exit()
|
||||
  call psb_exit(ctxt)
|
||||
  stop
|
||||
end program my_cuda_test
|
||||
</pre>
|
||||
<!--l. 134--><p class="nopar" > </div></div>
|
||||
|
||||
|
||||
|
||||
<!--l. 139--><p class="indent" > A full example of this strategy can be seen in the <span
|
||||
class="cmtt-10">test/ext/kernel </span>and
|
||||
<span
|
||||
class="cmtt-10">test/cuda/kernel </span>subdirectories, where we provide sample programs to test the
|
||||
speed of the sparse matrix-vector product with the various data structures included
|
||||
in the library.
|
||||
<!--l. 146--><p class="noindent" >
|
||||
<h4 class="subsectionHead"><span class="titlemark">12.2 </span> <a
|
||||
id="x19-14600012.2"></a>Extensions’ Data Structures</h4>
|
||||
<!--l. 150--><p class="noindent" >Access to the facilities provided by the EXT library is mainly achieved through
|
||||
the data types that are provided within. The data classes are derived from
|
||||
the base classes in PSBLAS, through the Fortran 2003 mechanism of <span
|
||||
class="cmti-10">type</span>
|
||||
<span
|
||||
class="cmti-10">extension</span> <span class="cite">[<a
|
||||
href="userhtmlli2.html#XMRC:11">17</a>]</span>.
|
||||
<!--l. 155--><p class="indent" > The data classes are divided between the general purpose CPU extensions, the
|
||||
GPU interfaces and the RSB interfaces. In the description we will make use of the
|
||||
notation introduced in Table <a
|
||||
href="#x19-146001r21">21<!--tex4ht:ref: tab:notation --></a>.
|
||||
<div class="table">
|
||||
|
||||
|
||||
|
||||
<!--l. 160--><p class="indent" > <a
|
||||
id="x19-146001r21"></a><hr class="float"><div class="float"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<div class="caption"
|
||||
><span class="id">Table 21: </span><span
|
||||
class="content">Notation for parameters describing a sparse matrix</span></div><!--tex4ht:label?: x19-146001r21 -->
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 162--><p class="noindent" >
|
||||
<div class="tabular"> <table id="TBL-23" class="tabular"
|
||||
|
||||
><colgroup id="TBL-23-1g"><col
|
||||
id="TBL-23-1"><col
|
||||
id="TBL-23-2"></colgroup><tr
|
||||
class="hline"><td><hr></td><td><hr></td></tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-1-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-1-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">Name </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-1-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Description </span></td>
|
||||
</tr><tr
|
||||
class="hline"><td><hr></td><td><hr></td></tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-2-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-2-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">M </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-2-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Number of rows in matrix </span></td></tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-3-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-3-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">N </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-3-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Number of columns in matrix</span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-4-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-4-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">NZ </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-4-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Number of nonzeros in matrix </span></td></tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-5-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-5-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">AVGNZR </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-5-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Average number of nonzeros per row</span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-6-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-6-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">MAXNZR</span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-6-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Maximum number of nonzeros per row</span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-7-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-7-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">NDIAG </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-7-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Numero of nonzero diagonals </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-8-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-8-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">AS </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-8-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Coefficients array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-9-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-9-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">IA </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-9-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Row indices array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-10-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-10-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">JA </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-10-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Column indices array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-11-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-11-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">IRP </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-11-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Row start pointers array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-12-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-12-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">JCP </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-12-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Column start pointers array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-13-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-13-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">NZR </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-13-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Number of nonzeros per row array </span></td>
|
||||
</tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-14-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-14-1"
|
||||
class="td11"><span
|
||||
class="cmr-8">OFFSET </span></td><td style="white-space:nowrap; text-align:left;" id="TBL-23-14-2"
|
||||
class="td11"><span
|
||||
class="cmr-8">Offset for diagonals </span></td>
|
||||
</tr><tr
|
||||
class="hline"><td><hr></td><td><hr></td></tr><tr
|
||||
style="vertical-align:baseline;" id="TBL-23-15-"><td style="white-space:nowrap; text-align:left;" id="TBL-23-15-1"
|
||||
class="td11"> </td></tr></table> </div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
</div><hr class="endfloat" />
|
||||
</div>
|
||||
<!--l. 188--><p class="indent" > <hr class="figure"><div class="figure"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<a
|
||||
id="x19-146002r5"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 192--><p class="noindent" ><img
|
||||
src="mat.png" alt="PIC"
|
||||
width="147" height="147" >
|
||||
<br /> <div class="caption"
|
||||
><span class="id">Figure 5: </span><span
|
||||
class="content">Example of sparse matrix</span></div><!--tex4ht:label?: x19-146002r5 -->
|
||||
|
||||
|
||||
|
||||
<!--l. 198--><p class="indent" > </div><hr class="endfigure">
|
||||
<h4 class="subsectionHead"><span class="titlemark">12.3 </span> <a
|
||||
id="x19-14700012.3"></a>CPU-class extensions</h4>
|
||||
<!--l. 203--><p class="noindent" >
|
||||
<h5 class="likesubsubsectionHead"><a
|
||||
id="x19-148000"></a>ELLPACK</h5>
|
||||
<!--l. 205--><p class="noindent" >The ELLPACK/ITPACK format (shown in Figure <a
|
||||
href="#x19-148001r6">6<!--tex4ht:ref: fig:ell --></a>) comprises two 2-dimensional
|
||||
arrays <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">JA</span></span></span> with <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">M</span></span></span> rows and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">MAXNZR</span></span></span> columns, where <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">MAXNZR</span></span></span> is the maximum
|
||||
number of nonzeros in any row <span class="cite">[<span
|
||||
class="cmbx-10">?</span>]</span>. Each row of the arrays <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">JA</span></span></span> contains the
|
||||
coefficients and column indices; rows shorter than <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">MAXNZR</span></span></span> are padded with zero
|
||||
coefficients and appropriate column indices, e.g. the last valid one found in the same
|
||||
row.
|
||||
<!--l. 215--><p class="indent" > <hr class="figure"><div class="figure"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<a
|
||||
id="x19-148001r6"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 219--><p class="noindent" ><img
|
||||
src="ell.png" alt="PIC"
|
||||
width="233" height="233" >
|
||||
<br /> <div class="caption"
|
||||
><span class="id">Figure 6: </span><span
|
||||
class="content">ELLPACK compression of matrix in Figure <a
|
||||
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-148001r6 -->
|
||||
|
||||
|
||||
|
||||
<!--l. 225--><p class="indent" > </div><hr class="endfigure">
|
||||
<a
|
||||
id="x19-148002r1"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 229--><p class="indent" > <hr class="float"><div class="float"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<!--l. 231-->
|
||||
<pre class="lstlisting" id="listing-168"><span class="label"><a
|
||||
id="x19-148003r1"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">do</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">i</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">=1,</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">n</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148004r2"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">t</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">=0</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148005r3"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">do</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">j</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">=1,</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">maxnzr</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148006r4"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">t</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">=</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">t</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">+</span></span><span style="color:#000000"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">as</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">(</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">i</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">,</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">j</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">)*</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">x</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">(</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">ja</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">(</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">i</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">,</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">j</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">))</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148007r5"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">end</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">do</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148008r6"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">y</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">(</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">i</span></span><span style="color:#000000"><span
|
||||
class="cmtt-9">)</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">=</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">t</span></span>
|
||||
<span class="label"><a
|
||||
id="x19-148009r7"></a></span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span
|
||||
class="cmtt-9"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">end</span></span><span style="color:#000000"> </span><span style="color:#000000"><span
|
||||
class="cmtt-9">do</span></span></pre>
|
||||
|
||||
<a
|
||||
id="x19-148010r1"></a>
|
||||
<a
|
||||
id="x19-148011"></a>
|
||||
<span
|
||||
class="cmbx-10">Algorithm</span><span
|
||||
class="cmbx-10"> 1:</span>  Matrix-Vector product in ELL format
|
||||
|
||||
|
||||
|
||||
</div><hr class="endfloat" />
|
||||
<!--l. 242--><p class="indent" > The matrix-vector product <span
|
||||
class="cmmi-10">y </span>= <span
|
||||
class="cmmi-10">Ax </span>can be computed with the code shown in
|
||||
Alg. <a
|
||||
href="#x19-148010r1">1<!--tex4ht:ref: alg:ell --></a>; it costs one memory write per outer iteration, plus three memory reads and
|
||||
two floating-point operations per inner iteration.
|
||||
<!--l. 247--><p class="indent" > Unless all rows have exactly the same number of nonzeros, some of the coefficients
|
||||
in the <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span> array will be zeros; therefore this data structure will have an overhead both
|
||||
in terms of memory space and redundant operations (multiplications by zero). The
|
||||
overhead can be acceptable if:
|
||||
<ol class="enumerate1" >
|
||||
<li
|
||||
class="enumerate" id="x19-148013x1">
|
||||
<!--l. 253--><p class="noindent" >The maximum number of nonzeros per row is not much larger than the
|
||||
average;
|
||||
</li>
|
||||
<li
|
||||
class="enumerate" id="x19-148015x2">
|
||||
<!--l. 255--><p class="noindent" >The regularity of the data structure allows for faster code, e.g. by allowing
|
||||
vectorization, thereby offsetting the additional storage requirements.</li></ol>
|
||||
<!--l. 259--><p class="noindent" >In the extreme case where the input matrix has one full row, the ELLPACK
|
||||
structure would require more memory than the normal 2D array storage. The
|
||||
ELLPACK storage format was very popular in the vector computing days; in
|
||||
modern CPUs it is not quite as popular, but it is the basis for many GPU
|
||||
formats.
|
||||
<!--l. 265--><p class="indent" > The relevant data type is <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_T_ell_sparse_mat</span></span></span>:
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 281--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-104">
|
||||
  type, extends(psb_d_base_sparse_mat) :: psb_d_ell_sparse_mat
|
||||
    !
|
||||
    ! ITPACK/ELL format, extended.
|
||||
    !
|
||||
|
||||
    integer(psb_ipk_), allocatable :: irn(:), ja(:,:), idiag(:)
|
||||
    real(psb_dpk_), allocatable :: val(:,:)
|
||||
|
||||
  contains
|
||||
    ....
|
||||
  end type psb_d_ell_sparse_mat
|
||||
</pre>
|
||||
<!--l. 295--><p class="nopar" > </div></div>
|
||||
<h5 class="likesubsubsectionHead"><a
|
||||
id="x19-149000"></a>Hacked ELLPACK</h5>
|
||||
<!--l. 303--><p class="noindent" >The <span
|
||||
class="cmti-10">hacked ELLPACK </span>(<span
|
||||
class="cmbx-10">HLL</span>) format alleviates the main problem of the ELLPACK
|
||||
format, that is, the amount of memory required by padding for sparse matrices in
|
||||
which the maximum row length is larger than the average.
|
||||
|
||||
|
||||
|
||||
<!--l. 308--><p class="indent" > The number of elements allocated to padding is
|
||||
[(<span
|
||||
class="cmmi-10">m</span><span
|
||||
class="cmsy-10">*</span><span
|
||||
class="cmmi-10">maxNR</span>) <span
|
||||
class="cmsy-10">- </span>(<span
|
||||
class="cmmi-10">m</span><span
|
||||
class="cmsy-10">*</span><span
|
||||
class="cmmi-10">avgNR</span>) = <span
|
||||
class="cmmi-10">m</span><span
|
||||
class="cmsy-10">* </span>(<span
|
||||
class="cmmi-10">maxNR</span><span
|
||||
class="cmsy-10">-</span><span
|
||||
class="cmmi-10">avgNR</span>)] for both <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">JA</span></span></span> arrays,
|
||||
where <span
|
||||
class="cmmi-10">m </span>is equal to the number of rows of the matrix, <span
|
||||
class="cmmi-10">maxNR </span>is the maximum
|
||||
number of nonzero elements in every row and <span
|
||||
class="cmmi-10">avgNR </span>is the average number of
|
||||
nonzeros. Therefore a single densely populated row can seriously affect the total size
|
||||
of the allocation.
|
||||
<!--l. 317--><p class="indent" > To limit this effect, in the HLL format we break the original matrix into equally
|
||||
sized groups of rows (called <span
|
||||
class="cmti-10">hacks</span>), and then store these groups as independent
|
||||
matrices in ELLPACK format. The groups can be arranged selecting rows in an
|
||||
arbitrarily manner; indeed, if the rows are sorted by decreasing number of nonzeros
|
||||
we obtain essentially the JAgged Diagonals format. If the rows are not in the original
|
||||
order, then an additional vector <span
|
||||
class="cmti-10">rIdx </span>is required, storing the actual row index for
|
||||
each row in the data structure.
|
||||
<!--l. 327--><p class="indent" > The multiple ELLPACK-like buffers are stacked together inside a single, one
|
||||
dimensional array; an additional vector <span
|
||||
class="cmti-10">hackOffsets </span>is provided to keep track of the
|
||||
individual submatrices. All hacks have the same number of rows <span
|
||||
class="cmti-10">hackSize</span>; hence, the
|
||||
<span
|
||||
class="cmti-10">hackOffsets </span>vector is an array of (<span
|
||||
class="cmmi-10">m∕hackSize</span>) + 1 elements, each one pointing to
|
||||
the first index of a submatrix inside the stacked <span
|
||||
class="cmti-10">cM</span>/<span
|
||||
class="cmti-10">rP </span>buffers, plus an additional
|
||||
element pointing past the end of the last block, where the next one would begin. We
|
||||
thus have the property that the elements of the <span
|
||||
class="cmmi-10">k</span>-th <span
|
||||
class="cmti-10">hack </span>are stored between
|
||||
<span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">hackOffsets[k]</span></span></span> and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">hackOffsets[k+1]</span></span></span>, similarly to what happens in the CSR
|
||||
format.
|
||||
<!--l. 342--><p class="indent" > <hr class="figure"><div class="figure"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<a
|
||||
id="x19-149001r7"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 346--><p class="noindent" ><img
|
||||
src="hll.png" alt="PIC"
|
||||
width="248" height="248" >
|
||||
<br /> <div class="caption"
|
||||
><span class="id">Figure 7: </span><span
|
||||
class="content">Hacked ELLPACK compression of matrix in Figure <a
|
||||
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-149001r7 -->
|
||||
|
||||
|
||||
|
||||
<!--l. 352--><p class="indent" > </div><hr class="endfigure">
|
||||
<!--l. 354--><p class="indent" > With this data structure a very long row only affects one hack, and therefore the
|
||||
additional memory is limited to the hack in which the row appears.
|
||||
<!--l. 358--><p class="indent" > The relevant data type is <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_T_hll_sparse_mat</span></span></span>:
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 374--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-105">
|
||||
  type, extends(psb_d_base_sparse_mat) :: psb_d_hll_sparse_mat
|
||||
    !
|
||||
    ! HLL format. (Hacked ELL)
|
||||
    !
|
||||
    integer(psb_ipk_) :: hksz
|
||||
    integer(psb_ipk_), allocatable :: irn(:), ja(:), idiag(:), hkoffs(:)
|
||||
    real(psb_dpk_), allocatable :: val(:)
|
||||
|
||||
  contains
|
||||
   ....
|
||||
  end type
|
||||
</pre>
|
||||
<!--l. 388--><p class="nopar" > </div></div>
|
||||
<h5 class="likesubsubsectionHead"><a
|
||||
id="x19-150000"></a>Diagonal storage</h5>
|
||||
<!--l. 396--><p class="noindent" >The DIAgonal (DIA) format (shown in Figure <a
|
||||
href="#x19-150001r8">8<!--tex4ht:ref: fig:dia --></a>) has a 2-dimensional array <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span>
|
||||
containing in each column the coefficients along a diagonal of the matrix, and an
|
||||
integer array <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">OFFSET</span></span></span> that determines where each diagonal starts. The diagonals in <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span>
|
||||
are padded with zeros as necessary.
|
||||
<!--l. 402--><p class="indent" > The code to compute the matrix-vector product <span
|
||||
class="cmmi-10">y </span>= <span
|
||||
class="cmmi-10">Ax </span>is shown in Alg. <a
|
||||
href="#x19-150003r2">2<!--tex4ht:ref: alg:dia --></a>; it
|
||||
costs one memory read per outer iteration, plus three memory reads, one memory
|
||||
write and two floating-point operations per inner iteration. The accesses to
|
||||
<span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">AS</span></span></span> and <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">x</span></span></span> are in strict sequential order, therefore no indirect addressing is
|
||||
required.
|
||||
<!--l. 409--><p class="indent" > <hr class="figure"><div class="figure"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<a
|
||||
id="x19-150001r8"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 413--><p class="noindent" ><img
|
||||
src="dia.png" alt="PIC"
|
||||
width="248" height="248" >
|
||||
<br /> <div class="caption"
|
||||
><span class="id">Figure 8: </span><span
|
||||
class="content">DIA compression of matrix in Figure <a
|
||||
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-150001r8 -->
|
||||
|
||||
|
||||
|
||||
<!--l. 419--><p class="indent" > </div><hr class="endfigure">
|
||||
<a
|
||||
id="x19-150002r2"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 423--><p class="indent" > <hr class="float"><div class="float"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 437--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-106">
|
||||
    do j=1,ndiag
|
||||
      if (offset(j) > 0) then
|
||||
        ir1 = 1; ir2 = m - offset(j);
|
||||
      else
|
||||
        ir1 = 1 - offset(j); ir2 = m;
|
||||
      end if
|
||||
      do i=ir1,ir2
|
||||
        y(i) = y(i) + alpha*as(i,j)*x(i+offset(j))
|
||||
      end do
|
||||
    end do
|
||||
</pre>
|
||||
<!--l. 450--><p class="nopar" > </div></div>
|
||||
<a
|
||||
id="x19-150003r2"></a>
|
||||
<a
|
||||
id="x19-150004"></a>
|
||||
<span
|
||||
class="cmbx-10">Algorithm</span><span
|
||||
class="cmbx-10"> 2:</span>  Matrix-Vector product in DIA format
|
||||
|
||||
|
||||
|
||||
</div><hr class="endfloat" />
|
||||
<!--l. 458--><p class="indent" > The relevant data type is <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_T_dia_sparse_mat</span></span></span>:
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 473--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-107">
|
||||
  type, extends(psb_d_base_sparse_mat) :: psb_d_dia_sparse_mat
|
||||
    !
|
||||
    ! DIA format, extended.
|
||||
    !
|
||||
|
||||
    integer(psb_ipk_), allocatable :: offset(:)
|
||||
    integer(psb_ipk_) :: nzeros
|
||||
    real(psb_dpk_), allocatable :: data(:,:)
|
||||
|
||||
  end type
|
||||
</pre>
|
||||
<!--l. 486--><p class="nopar" > </div></div>
|
||||
<h5 class="likesubsubsectionHead"><a
|
||||
id="x19-151000"></a>Hacked DIA</h5>
|
||||
<!--l. 495--><p class="noindent" >Storage by DIAgonals is an attractive option for matrices whose coefficients are
|
||||
located on a small set of diagonals, since they do away with storing explicitly the
|
||||
indices and therefore reduce significantly memory traffic. However, having a few
|
||||
coefficients outside of the main set of diagonals may significantly increase the
|
||||
amount of needed padding; moreover, while the DIA code is easily vectorized,
|
||||
it does not necessarily make optimal use of the memory hierarchy. While
|
||||
processing each diagonal we are updating entries in the output vector <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">y</span></span></span>,
|
||||
which is then accessed multiple times; if the vector <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">y</span></span></span> is too large to remain
|
||||
in the cache memory, the associated cache miss penalty is paid multiple
|
||||
times.
|
||||
<!--l. 507--><p class="indent" > The <span
|
||||
class="cmti-10">hacked DIA </span>(<span
|
||||
class="cmbx-10">HDIA</span>) format was designed to contain the amount of padding,
|
||||
by breaking the original matrix into equally sized groups of rows (<span
|
||||
class="cmti-10">hacks</span>), and then
|
||||
storing these groups as independent matrices in DIA format. This approach is similar
|
||||
to that of HLL, and requires using an offset vector for each submatrix. Again,
|
||||
similarly to HLL, the various submatrices are stacked inside a linear array to
|
||||
improve memory management. The fact that the matrix is accessed in slices
|
||||
helps in reducing cache misses, especially regarding accesses to the vector
|
||||
<span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">y</span></span></span>.
|
||||
<!--l. 519--><p class="indent" > An additional vector <span
|
||||
class="cmti-10">hackOffsets </span>is provided to complete the matrix format; given
|
||||
that <span
|
||||
class="cmti-10">hackSize </span>is the number of rows of each hack, the <span
|
||||
class="cmti-10">hackOffsets </span>vector is made by
|
||||
an array of (<span
|
||||
class="cmmi-10">m∕hackSize</span>) + 1 elements, pointing to the first diagonal offset of a
|
||||
submatrix inside the stacked <span
|
||||
class="cmti-10">offsets </span>buffers, plus an additional element equal to the
|
||||
number of nonzero diagonals in the whole matrix. We thus have the property that
|
||||
the number of diagonals of the <span
|
||||
class="cmmi-10">k</span>-th <span
|
||||
class="cmti-10">hack </span>is given by <span
|
||||
class="cmti-10">hackOffsets[k+1] -</span>
|
||||
<span
|
||||
class="cmti-10">hackOffsets[k]</span>.
|
||||
<!--l. 529--><p class="indent" > <hr class="figure"><div class="figure"
|
||||
>
|
||||
|
||||
|
||||
|
||||
<a
|
||||
id="x19-151001r9"></a>
|
||||
|
||||
|
||||
|
||||
<!--l. 533--><p class="noindent" ><img
|
||||
src="hdia.png" alt="PIC"
|
||||
width="248" height="248" >
|
||||
<br /> <div class="caption"
|
||||
><span class="id">Figure 9: </span><span
|
||||
class="content">Hacked DIA compression of matrix in Figure <a
|
||||
href="#x19-146002r5">5<!--tex4ht:ref: fig:dense --></a></span></div><!--tex4ht:label?: x19-151001r9 -->
|
||||
|
||||
|
||||
|
||||
<!--l. 539--><p class="indent" > </div><hr class="endfigure">
|
||||
<!--l. 541--><p class="indent" > The relevant data type is <span class="obeylines-h"><span class="verb"><span
|
||||
class="cmtt-10">psb_T_hdia_sparse_mat</span></span></span>:
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 568--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-108">
|
||||
  type pm
|
||||
     real(psb_dpk_), allocatable  :: data(:,:)
|
||||
  end type pm
|
||||
|
||||
  type po
|
||||
     integer(psb_ipk_), allocatable  :: off(:)
|
||||
  end type po
|
||||
|
||||
  type, extends(psb_d_base_sparse_mat) :: psb_d_hdia_sparse_mat
|
||||
    !
|
||||
    ! HDIA format, extended.
|
||||
    !
|
||||
|
||||
    type(pm), allocatable :: hdia(:)
|
||||
    type(po), allocatable :: offset(:)
|
||||
    integer(psb_ipk_) :: nblocks, nzeros
|
||||
    integer(psb_ipk_) :: hack = 64
|
||||
    integer(psb_long_int_k_) :: dim=0
|
||||
|
||||
  contains
|
||||
   ....
|
||||
  end type
|
||||
</pre>
|
||||
<!--l. 593--><p class="nopar" > </div></div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<h4 class="subsectionHead"><span class="titlemark">12.4 </span> <a
|
||||
id="x19-15200012.4"></a>CUDA-class extensions</h4>
|
||||
<!--l. 4--><p class="noindent" >For computing with CUDA we define a dual memorization strategy in which each
|
||||
variable on the CPU (“host”) side has a GPU (“device”) side. When a GPU-type
|
||||
variable is initialized, the data contained is (usually) the same on both sides. Each
|
||||
operator invoked on the variable may change the data so that only the host side or
|
||||
the device side are up-to-date.
|
||||
<!--l. 11--><p class="indent" > Keeping track of the updates to data in the variables is essential: we want to
|
||||
perform most computations on the GPU, but we cannot afford the time needed to
|
||||
move data between the host memory and the device memory because the bandwidth
|
||||
of the interconnection bus would become the main bottleneck of the computation.
|
||||
Thus, each and every computational routine in the library is built according to the
|
||||
following principles:
|
||||
<ul class="itemize1">
|
||||
<li class="itemize">
|
||||
<!--l. 18--><p class="noindent" >If the data type being handled is GPU-enabled, make sure that its device
|
||||
copy is up to date, perform any arithmetic operation on the GPU, and
|
||||
if the data has been altered as a result, mark the main-memory copy as
|
||||
outdated.
|
||||
</li>
|
||||
<li class="itemize">
|
||||
<!--l. 22--><p class="noindent" >The main-memory copy is never updated unless this is requested by the user
|
||||
either
|
||||
<dl class="description"><dt class="description">
|
||||
<!--l. 25--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">explicitly</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 25--><p class="noindent" >by invoking a synchronization method;
|
||||
</dd><dt class="description">
|
||||
<!--l. 26--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">implicitly</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 26--><p class="noindent" >by invoking a method that involves other data items that are not
|
||||
GPU-enabled, e.g., by assignment ov a vector to a normal array.</dd></dl>
|
||||
</li></ul>
|
||||
<!--l. 31--><p class="noindent" >In this way, data items are put on the GPU memory “on demand” and remain there as
|
||||
long as “normal” computations are carried out. As an example, the following call to a
|
||||
matrix-vector product
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 39--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-109">
|
||||
    call psb_spmm(alpha,a,x,beta,y,desc_a,info)
|
||||
</pre>
|
||||
<!--l. 43--><p class="nopar" > </div></div>
|
||||
|
||||
|
||||
|
||||
<!--l. 47--><p class="noindent" >will transparently and automatically be performed on the GPU whenever all three data
|
||||
inputs <code class="lstinline"><span style="color:#000000">a</span></code>, <code class="lstinline"><span style="color:#000000">x</span></code> and <code class="lstinline"><span style="color:#000000">y</span></code> are GPU-enabled. If a program makes many such calls sequentially,
|
||||
then
|
||||
<ul class="itemize1">
|
||||
<li class="itemize">
|
||||
<!--l. 52--><p class="noindent" >The first kernel invocation will find the data in main memory, and will
|
||||
copy it to the GPU memory, thus incurring a significant overhead; the
|
||||
result is however <span
|
||||
class="cmti-10">not </span>copied back, and therefore:
|
||||
</li>
|
||||
<li class="itemize">
|
||||
<!--l. 56--><p class="noindent" >Subsequent kernel invocations involving the same vector will find the data
|
||||
on the GPU side so that they will run at full speed.</li></ul>
|
||||
<!--l. 60--><p class="noindent" >For all invocations after the first the only data that will have to be transferred to/from
|
||||
the main memory will be the scalars <code class="lstinline"><span style="color:#000000">alpha</span></code> and <code class="lstinline"><span style="color:#000000">beta</span></code>, and the return code
|
||||
<code class="lstinline"><span style="color:#000000">info</span></code>.
|
||||
<!--l. 64--><p class="indent" >
|
||||
<dl class="description"><dt class="description">
|
||||
<!--l. 65--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">Vectors:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 65--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_vect_gpu</span></code> provides a GPU-enabled extension of
|
||||
the inner type <code class="lstinline"><span style="color:#000000">psb_T_base_vect_type</span></code>, and must be used together with
|
||||
the other inner matrix type to make full use of the GPU computational
|
||||
capabilities;
|
||||
</dd><dt class="description">
|
||||
<!--l. 69--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">CSR:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 69--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_csrg_sparse_mat</span></code> provides an interface to the GPU
|
||||
version of CSR available in the NVIDIA CuSPARSE library;
|
||||
</dd><dt class="description">
|
||||
<!--l. 72--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">HYB:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 72--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_hybg_sparse_mat</span></code> provides an interface to the HYB
|
||||
GPU storage available in the NVIDIA CuSPARSE library. The internal
|
||||
structure is opaque, hence the host side is just CSR; the HYB data format
|
||||
is only available up to CUDA version 10.
|
||||
</dd><dt class="description">
|
||||
<!--l. 77--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">ELL:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 77--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_elg_sparse_mat</span></code> provides an interface to the
|
||||
ELLPACK implementation from SPGPU;
|
||||
|
||||
|
||||
|
||||
</dd><dt class="description">
|
||||
<!--l. 80--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">HLL:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 80--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_hlg_sparse_mat</span></code> provides an interface to the Hacked
|
||||
ELLPACK implementation from SPGPU;
|
||||
</dd><dt class="description">
|
||||
<!--l. 82--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">HDIA:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 82--><p class="noindent" >The data type <code class="lstinline"><span style="color:#000000">psb_T_hdiag_sparse_mat</span></code> provides an interface to the
|
||||
Hacked DIAgonals implementation from SPGPU;</dd></dl>
|
||||
|
||||
|
||||
|
||||
<!--l. 87--><div class="crosslinks"><p class="noindent">[<a
|
||||
href="userhtmlse11.html" >prev</a>] [<a
|
||||
href="userhtmlse11.html#tailuserhtmlse11.html" >prev-tail</a>] [<a
|
||||
href="userhtmlse12.html" >front</a>] [<a
|
||||
href="userhtml.html# " >up</a>] </p></div>
|
||||
<!--l. 87--><p class="indent" > <a
|
||||
id="tailuserhtmlse12.html"></a>
|
||||
</body></html>
|
@ -0,0 +1,299 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html >
|
||||
<head><title>CUDA Environment Routines</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="generator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<meta name="originator" content="TeX4ht (https://tug.org/tex4ht/)">
|
||||
<!-- html,3 -->
|
||||
<meta name="src" content="userhtml.tex">
|
||||
<link rel="stylesheet" type="text/css" href="userhtml.css">
|
||||
</head><body
|
||||
>
|
||||
<!--l. 87--><div class="crosslinks"><p class="noindent">[<a
|
||||
href="userhtmlse12.html" >prev</a>] [<a
|
||||
href="userhtmlse12.html#tailuserhtmlse12.html" >prev-tail</a>] [<a
|
||||
href="userhtmlse10.html#tailuserhtmlse13.html">tail</a>] [<a
|
||||
href="userhtml.html# " >up</a>] </p></div>
|
||||
<h3 class="sectionHead"><span class="titlemark">13 </span> <a
|
||||
id="x20-15300013"></a>CUDA Environment Routines</h3>
|
||||
<!--l. 91--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-154000"></a>psb_cuda_init — Initializes PSBLAS-CUDA environment</h4>
|
||||
<a
|
||||
id="Q1-20-191"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 99--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-110">
|
||||
call psb_cuda_init(ctxt [, device])
|
||||
</pre>
|
||||
<!--l. 103--><p class="nopar" > </div></div>
|
||||
<!--l. 108--><p class="noindent" >This subroutine initializes the PSBLAS-CUDA environment.
|
||||
<dl class="description"><dt class="description">
|
||||
<!--l. 110--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">Type:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 110--><p class="noindent" >Synchronous.
|
||||
</dd><dt class="description">
|
||||
<!--l. 111--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">On Entry</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 111--><p class="noindent" >
|
||||
</dd><dt class="description">
|
||||
<!--l. 112--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">device</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 112--><p class="noindent" >ID of CUDA device to attach to.<br
|
||||
class="newline" />Scope: <span
|
||||
class="cmbx-10">local</span>.<br
|
||||
class="newline" />Type: <span
|
||||
class="cmbx-10">optional</span>.<br
|
||||
class="newline" />Intent: <span
|
||||
class="cmbx-10">in</span>.<br
|
||||
class="newline" />Specified as: an integer value.  Default: use <code class="lstinline"><span style="color:#000000">mod</span><span style="color:#000000">(</span><span style="color:#000000">iam</span><span style="color:#000000">,</span><span style="color:#000000">ngpu</span><span style="color:#000000">)</span></code> where <code class="lstinline"><span style="color:#000000">iam</span></code> is
|
||||
the calling process index and <code class="lstinline"><span style="color:#000000">ngpu</span></code> is the total number of CUDA devices
|
||||
available on the current node.</dd></dl>
|
||||
<!--l. 123--><p class="noindent" ><span
|
||||
class="cmbx-12">Notes</span>
|
||||
|
||||
|
||||
|
||||
<ol class="enumerate1" >
|
||||
<li
|
||||
class="enumerate" id="x20-154002x1">
|
||||
<!--l. 125--><p class="noindent" >A call to this routine must precede any other PSBLAS-CUDA call.</li></ol>
|
||||
<!--l. 129--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-155000"></a>psb_cuda_exit — Exit from PSBLAS-CUDA environment</h4>
|
||||
<a
|
||||
id="Q1-20-193"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 137--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-111">
|
||||
call psb_cuda_exit(ctxt)
|
||||
</pre>
|
||||
<!--l. 141--><p class="nopar" > </div></div>
|
||||
<!--l. 146--><p class="noindent" >This subroutine exits from the PSBLAS CUDA context.
|
||||
<dl class="description"><dt class="description">
|
||||
<!--l. 148--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">Type:</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 148--><p class="noindent" >Synchronous.
|
||||
</dd><dt class="description">
|
||||
<!--l. 149--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">On Entry</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 149--><p class="noindent" >
|
||||
</dd><dt class="description">
|
||||
<!--l. 150--><p class="noindent" >
|
||||
<span
|
||||
class="cmbx-10">ctxt</span> </dt><dd
|
||||
class="description">
|
||||
<!--l. 150--><p class="noindent" >the communication context identifying the virtual parallel machine.<br
|
||||
class="newline" />Scope: <span
|
||||
class="cmbx-10">global</span>.<br
|
||||
class="newline" />Type: <span
|
||||
class="cmbx-10">required</span>.<br
|
||||
class="newline" />Intent: <span
|
||||
class="cmbx-10">in</span>.<br
|
||||
class="newline" />Specified as: an integer variable.</dd></dl>
|
||||
<!--l. 161--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-156000"></a>psb_cuda_DeviceSync — Synchronize CUDA device</h4>
|
||||
<a
|
||||
id="Q1-20-195"></a>
|
||||
|
||||
|
||||
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 169--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-112">
|
||||
call psb_cuda_DeviceSync()
|
||||
</pre>
|
||||
<!--l. 173--><p class="nopar" > </div></div>
|
||||
<!--l. 178--><p class="noindent" >This subroutine ensures that all previosly invoked kernels, i.e. all invocation of
|
||||
CUDA-side code, have completed.
|
||||
<!--l. 182--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-157000"></a>psb_cuda_getDeviceCount </h4>
|
||||
<a
|
||||
id="Q1-20-197"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 190--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-113">
|
||||
ngpus =  psb_cuda_getDeviceCount()
|
||||
</pre>
|
||||
<!--l. 194--><p class="nopar" > </div></div>
|
||||
<!--l. 199--><p class="noindent" >Get number of devices available on current computing node.
|
||||
<!--l. 201--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-158000"></a>psb_cuda_getDevice </h4>
|
||||
<a
|
||||
id="Q1-20-199"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 209--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-114">
|
||||
ngpus =  psb_cuda_getDevice()
|
||||
</pre>
|
||||
<!--l. 213--><p class="nopar" > </div></div>
|
||||
<!--l. 218--><p class="noindent" >Get device in use by current process.
|
||||
<!--l. 220--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-159000"></a>psb_cuda_setDevice </h4>
|
||||
<a
|
||||
id="Q1-20-201"></a>
|
||||
|
||||
|
||||
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 228--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-115">
|
||||
info = psb_cuda_setDevice(dev)
|
||||
</pre>
|
||||
<!--l. 232--><p class="nopar" > </div></div>
|
||||
<!--l. 237--><p class="noindent" >Set device to be used by current process.
|
||||
<!--l. 239--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-160000"></a>psb_cuda_DeviceHasUVA </h4>
|
||||
<a
|
||||
id="Q1-20-203"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 247--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-116">
|
||||
hasUva = psb_cuda_DeviceHasUVA()
|
||||
</pre>
|
||||
<!--l. 251--><p class="nopar" > </div></div>
|
||||
<!--l. 256--><p class="noindent" >Returns true if device currently in use supports UVA (Unified Virtual Addressing).
|
||||
<!--l. 259--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-161000"></a>psb_cuda_WarpSize </h4>
|
||||
<a
|
||||
id="Q1-20-205"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 267--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-117">
|
||||
nw = psb_cuda_WarpSize()
|
||||
</pre>
|
||||
<!--l. 271--><p class="nopar" > </div></div>
|
||||
<!--l. 276--><p class="noindent" >Returns the warp size.
|
||||
<!--l. 279--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-162000"></a>psb_cuda_MultiProcessors </h4>
|
||||
<a
|
||||
id="Q1-20-207"></a>
|
||||
|
||||
|
||||
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 287--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-118">
|
||||
nmp = psb_cuda_MultiProcessors()
|
||||
</pre>
|
||||
<!--l. 291--><p class="nopar" > </div></div>
|
||||
<!--l. 296--><p class="noindent" >Returns the number of multiprocessors in the CUDA device.
|
||||
<!--l. 298--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-163000"></a>psb_cuda_MaxThreadsPerMP </h4>
|
||||
<a
|
||||
id="Q1-20-209"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 306--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-119">
|
||||
nt = psb_cuda_MaxThreadsPerMP()
|
||||
</pre>
|
||||
<!--l. 310--><p class="nopar" > </div></div>
|
||||
<!--l. 315--><p class="noindent" >Returns the maximum number of threads per multiprocessor.
|
||||
<!--l. 318--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-164000"></a>psb_cuda_MaxRegistersPerBlock </h4>
|
||||
<a
|
||||
id="Q1-20-211"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 326--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-120">
|
||||
nr = psb_cuda_MaxRegistersPerBlock()
|
||||
</pre>
|
||||
<!--l. 330--><p class="nopar" > </div></div>
|
||||
<!--l. 335--><p class="noindent" >Returns the maximum number of register per thread block.
|
||||
<!--l. 338--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-165000"></a>psb_cuda_MemoryClockRate </h4>
|
||||
<a
|
||||
id="Q1-20-213"></a>
|
||||
|
||||
|
||||
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 346--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-121">
|
||||
cl = psb_cuda_MemoryClockRate()
|
||||
</pre>
|
||||
<!--l. 350--><p class="nopar" > </div></div>
|
||||
<!--l. 355--><p class="noindent" >Returns the memory clock rate in KHz, as an integer.
|
||||
<!--l. 357--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-166000"></a>psb_cuda_MemoryBusWidth </h4>
|
||||
<a
|
||||
id="Q1-20-215"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 365--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-122">
|
||||
nb = psb_cuda_MemoryBusWidth()
|
||||
</pre>
|
||||
<!--l. 369--><p class="nopar" > </div></div>
|
||||
<!--l. 374--><p class="noindent" >Returns the memory bus width in bits.
|
||||
<!--l. 376--><p class="noindent" >
|
||||
<h4 class="likesubsectionHead"><a
|
||||
id="x20-167000"></a>psb_cuda_MemoryPeakBandwidth </h4>
|
||||
<a
|
||||
id="Q1-20-217"></a>
|
||||
<div class="center"
|
||||
>
|
||||
<!--l. 384--><p class="noindent" >
|
||||
<div class="minipage"><pre class="verbatim" id="verbatim-123">
|
||||
bw = psb_cuda_MemoryPeakBandwidth()
|
||||
</pre>
|
||||
<!--l. 388--><p class="nopar" > </div></div>
|
||||
<!--l. 392--><p class="noindent" >Returns the peak memory bandwidth in MB/s (real double precision).
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<!--l. 126--><p class="indent" >
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<!--l. 2--><div class="crosslinks"><p class="noindent">[<a
|
||||
href="userhtmlse12.html" >prev</a>] [<a
|
||||
href="userhtmlse12.html#tailuserhtmlse12.html" >prev-tail</a>] [<a
|
||||
href="userhtmlse13.html" >front</a>] [<a
|
||||
href="userhtml.html# " >up</a>] </p></div>
|
||||
<!--l. 2--><p class="indent" > <a
|
||||
id="tailuserhtmlse13.html"></a>
|
||||
</body></html>
|
@ -0,0 +1,395 @@
|
||||
|
||||
\subsection{CUDA-class extensions}
|
||||
|
||||
For computing with CUDA we define a dual memorization strategy in
|
||||
which each variable on the CPU (``host'') side has a GPU (``device'')
|
||||
side. When a GPU-type variable is initialized, the data contained is
|
||||
(usually) the same on both sides. Each operator invoked on the
|
||||
variable may change the data so that only the host side or the device
|
||||
side are up-to-date.
|
||||
|
||||
Keeping track of the updates to data in the variables is essential: we want
|
||||
to perform most computations on the GPU, but we cannot afford the time
|
||||
needed to move data between the host memory and the device memory
|
||||
because the bandwidth of the interconnection bus would become the main
|
||||
bottleneck of the computation. Thus, each and every computational
|
||||
routine in the library is built according to the following principles:
|
||||
\begin{itemize}
|
||||
\item If the data type being handled is {GPU}-enabled, make sure that
|
||||
its device copy is up to date, perform any arithmetic operation on
|
||||
the {GPU}, and if the data has been altered as a result, mark
|
||||
the main-memory copy as outdated.
|
||||
\item The main-memory copy is never updated unless this is requested
|
||||
by the user either
|
||||
\begin{description}
|
||||
\item[explicitly] by invoking a synchronization method;
|
||||
\item[implicitly] by invoking a method that involves other data items
|
||||
that are not {GPU}-enabled, e.g., by assignment ov a vector to a
|
||||
normal array.
|
||||
\end{description}
|
||||
\end{itemize}
|
||||
In this way, data items are put on the {GPU} memory ``on demand'' and
|
||||
remain there as long as ``normal'' computations are carried out.
|
||||
As an example, the following call to a matrix-vector product
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
|
||||
call psb_spmm(alpha,a,x,beta,y,desc_a,info)
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
call psb_spmm(alpha,a,x,beta,y,desc_a,info)
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
will transparently and automatically be performed on the {GPU} whenever
|
||||
all three data inputs \fortinline|a|, \fortinline|x| and
|
||||
\fortinline|y| are {GPU}-enabled. If a program makes many such calls
|
||||
sequentially, then
|
||||
\begin{itemize}
|
||||
\item The first kernel invocation will find the data in main memory,
|
||||
and will copy it to the {GPU} memory, thus incurring a significant
|
||||
overhead; the result is however \emph{not} copied back, and
|
||||
therefore:
|
||||
\item Subsequent kernel invocations involving the same vector will
|
||||
find the data on the {GPU} side so that they will run at full
|
||||
speed.
|
||||
\end{itemize}
|
||||
For all invocations after the first the only data that will have to be
|
||||
transferred to/from the main memory will be the scalars \fortinline|alpha|
|
||||
and \fortinline|beta|, and the return code \fortinline|info|.
|
||||
|
||||
\begin{description}
|
||||
\item[Vectors:] The data type \fortinline|psb_T_vect_gpu| provides a
|
||||
GPU-enabled extension of the inner type \fortinline|psb_T_base_vect_type|,
|
||||
and must be used together with the other inner matrix type to make
|
||||
full use of the GPU computational capabilities;
|
||||
\item[CSR:] The data type \fortinline|psb_T_csrg_sparse_mat| provides an
|
||||
interface to the GPU version of CSR available in the NVIDIA CuSPARSE
|
||||
library;
|
||||
\item[HYB:] The data type \fortinline|psb_T_hybg_sparse_mat| provides an
|
||||
interface to the HYB GPU storage available in the NVIDIA CuSPARSE
|
||||
library. The internal structure is opaque, hence the host side is
|
||||
just CSR; the HYB data format is only available up to CUDA version
|
||||
10.
|
||||
\item[ELL:] The data type \fortinline|psb_T_elg_sparse_mat| provides an
|
||||
interface to the ELLPACK implementation from SPGPU;
|
||||
|
||||
\item[HLL:] The data type \fortinline|psb_T_hlg_sparse_mat| provides an
|
||||
interface to the Hacked ELLPACK implementation from SPGPU;
|
||||
\item[HDIA:] The data type \fortinline|psb_T_hdiag_sparse_mat| provides an
|
||||
interface to the Hacked DIAgonals implementation from SPGPU;
|
||||
\end{description}
|
||||
|
||||
|
||||
\section{CUDA Environment Routines}
|
||||
\label{sec:cudaenv}
|
||||
|
||||
\subsection*{psb\_cuda\_init --- Initializes PSBLAS-CUDA
|
||||
environment}
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_init}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
call psb_cuda_init(ctxt [, device])
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
call psb_cuda_init(ctxt [, device])
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
This subroutine initializes the PSBLAS-CUDA environment.
|
||||
\begin{description}
|
||||
\item[Type:] Synchronous.
|
||||
\item[\bf On Entry ]
|
||||
\item[device] ID of CUDA device to attach to.\\
|
||||
Scope: {\bf local}.\\
|
||||
Type: {\bf optional}.\\
|
||||
Intent: {\bf in}.\\
|
||||
Specified as: an integer value. \
|
||||
Default: use \fortinline|mod(iam,ngpu)| where \fortinline|iam| is the calling
|
||||
process index and \fortinline|ngpu| is the total number of CUDA devices
|
||||
available on the current node.
|
||||
\end{description}
|
||||
|
||||
|
||||
{\par\noindent\large\bfseries Notes}
|
||||
\begin{enumerate}
|
||||
\item A call to this routine must precede any other PSBLAS-CUDA call.
|
||||
\end{enumerate}
|
||||
|
||||
\subsection*{psb\_cuda\_exit --- Exit from PSBLAS-CUDA
|
||||
environment}
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_exit}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
call psb_cuda_exit(ctxt)
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
call psb_cuda_exit(ctxt)
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
This subroutine exits from the PSBLAS CUDA context.
|
||||
\begin{description}
|
||||
\item[Type:] Synchronous.
|
||||
\item[\bf On Entry ]
|
||||
\item[ctxt] the communication context identifying the virtual
|
||||
parallel machine.\\
|
||||
Scope: {\bf global}.\\
|
||||
Type: {\bf required}.\\
|
||||
Intent: {\bf in}.\\
|
||||
Specified as: an integer variable.
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
|
||||
\subsection*{psb\_cuda\_DeviceSync --- Synchronize CUDA device}
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_DeviceSync}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
call psb_cuda_DeviceSync()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
call psb_cuda_DeviceSync()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
This subroutine ensures that all previosly invoked kernels, i.e. all
|
||||
invocation of CUDA-side code, have completed.
|
||||
|
||||
|
||||
\subsection*{psb\_cuda\_getDeviceCount }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_getDeviceCount}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
ngpus = psb_cuda_getDeviceCount()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
ngpus = psb_cuda_getDeviceCount()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Get number of devices available on current computing node.
|
||||
|
||||
\subsection*{psb\_cuda\_getDevice }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_getDevice}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
ngpus = psb_cuda_getDevice()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
ngpus = psb_cuda_getDevice()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Get device in use by current process.
|
||||
|
||||
\subsection*{psb\_cuda\_setDevice }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_setDevice}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
info = psb_cuda_setDevice(dev)
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
info = psb_cuda_setDevice(dev)
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Set device to be used by current process.
|
||||
|
||||
\subsection*{psb\_cuda\_DeviceHasUVA }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_DeviceHasUVA}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
hasUva = psb_cuda_DeviceHasUVA()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
hasUva = psb_cuda_DeviceHasUVA()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Returns true if device currently in use supports UVA
|
||||
(Unified Virtual Addressing).
|
||||
|
||||
\subsection*{psb\_cuda\_WarpSize }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_WarpSize}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
nw = psb_cuda_WarpSize()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
nw = psb_cuda_WarpSize()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Returns the warp size.
|
||||
|
||||
|
||||
\subsection*{psb\_cuda\_MultiProcessors }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_MultiProcessors}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
nmp = psb_cuda_MultiProcessors()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
nmp = psb_cuda_MultiProcessors()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Returns the number of multiprocessors in the CUDA device.
|
||||
|
||||
\subsection*{psb\_cuda\_MaxThreadsPerMP }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_MaxThreadsPerMP}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
nt = psb_cuda_MaxThreadsPerMP()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
nt = psb_cuda_MaxThreadsPerMP()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Returns the maximum number of threads per multiprocessor.
|
||||
|
||||
|
||||
\subsection*{psb\_cuda\_MaxRegistersPerBlock }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_MaxRegisterPerBlock}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
nr = psb_cuda_MaxRegistersPerBlock()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
nr = psb_cuda_MaxRegistersPerBlock()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Returns the maximum number of register per thread block.
|
||||
|
||||
|
||||
\subsection*{psb\_cuda\_MemoryClockRate }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_MemoryClockRate}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
cl = psb_cuda_MemoryClockRate()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
cl = psb_cuda_MemoryClockRate()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Returns the memory clock rate in KHz, as an integer.
|
||||
|
||||
\subsection*{psb\_cuda\_MemoryBusWidth }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_MemoryBusWidth}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
nb = psb_cuda_MemoryBusWidth()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
nb = psb_cuda_MemoryBusWidth()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
Returns the memory bus width in bits.
|
||||
|
||||
\subsection*{psb\_cuda\_MemoryPeakBandwidth }
|
||||
\addcontentsline{toc}{subsection}{psb\_cuda\_MemoryPeakBandwidth}
|
||||
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true]{fortran}
|
||||
bw = psb_cuda_MemoryPeakBandwidth()
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
bw = psb_cuda_MemoryPeakBandwidth()
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
Returns the peak memory bandwidth in MB/s (real double precision).
|
||||
|
||||
|
||||
|
@ -0,0 +1,598 @@
|
||||
\section{Extensions}\label{sec:ext-intro}
|
||||
|
||||
The EXT, CUDA and RSB subdirectories contains a set of extensions to the base
|
||||
library. The extensions provide additional storage formats beyond the
|
||||
ones already contained in the base library, as well as interfaces
|
||||
to:
|
||||
\begin{description}
|
||||
\item[SPGPU] a CUDA library originally published as
|
||||
\url{https://code.google.com/p/spgpu/} and now included in the
|
||||
\verb|cuda| subdir, for computations on NVIDIA GPUs;
|
||||
\item[LIBRSB] \url{http://sourceforge.net/projects/librsb/}, for
|
||||
computations on multicore parallel machines.
|
||||
\end{description}
|
||||
The infrastructure laid out in the base library to allow for these
|
||||
extensions is detailed in the references~\cite{DesPat:11,CaFiRo:2014,Sparse03};
|
||||
the CUDA-specific data formats are described in~\cite{OurTechRep}.
|
||||
|
||||
|
||||
\subsection{Using the extensions}
|
||||
\label{sec:ext-appstruct}
|
||||
A sample application using the PSBLAS extensions will contain the
|
||||
following steps:
|
||||
\begin{itemize}
|
||||
\item \verb|USE| the appropriat modules (\verb|psb_ext_mod|,
|
||||
\verb|psb_cuda_mod|);
|
||||
\item Declare a \emph{mold} variable of the necessary type
|
||||
(e.g. \verb|psb_d_ell_sparse_mat|, \verb|psb_d_hlg_sparse_mat|,
|
||||
\verb|psb_d_vect_cuda|);
|
||||
\item Pass the mold variable to the base library interface where
|
||||
needed to ensure the appropriate dynamic type.
|
||||
\end{itemize}
|
||||
Suppose you want to use the CUDA-enabled ELLPACK data structure; you
|
||||
would use a piece of code like this (and don't forget, you need
|
||||
CUDA-side vectors along with the matrices):
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
|
||||
program my_cuda_test
|
||||
use psb_base_mod
|
||||
use psb_util_mod
|
||||
use psb_ext_mod
|
||||
use psb_cuda_mod
|
||||
type(psb_dspmat_type) :: a, agpu
|
||||
type(psb_d_vect_type) :: x, xg, bg
|
||||
|
||||
real(psb_dpk_), allocatable :: xtmp(:)
|
||||
type(psb_d_vect_cuda) :: vmold
|
||||
type(psb_d_elg_sparse_mat) :: aelg
|
||||
type(psb_ctxt_type) :: ctxt
|
||||
integer :: iam, np
|
||||
|
||||
|
||||
call psb_init(ctxt)
|
||||
call psb_info(ctxt,iam,np)
|
||||
call psb_cuda_init(ctxt, iam)
|
||||
|
||||
|
||||
! My own home-grown matrix generator
|
||||
call gen_matrix(ctxt,idim,desc_a,a,x,info)
|
||||
if (info /= 0) goto 9999
|
||||
|
||||
call a%cscnv(agpu,info,mold=aelg)
|
||||
if (info /= 0) goto 9999
|
||||
xtmp = x%get_vect()
|
||||
call xg%bld(xtmp,mold=vmold)
|
||||
call bg%bld(size(xtmp),mold=vmold)
|
||||
|
||||
! Do sparse MV
|
||||
call psb_spmm(done,agpu,xg,dzero,bg,desc_a,info)
|
||||
|
||||
|
||||
9999 continue
|
||||
if (info == 0) then
|
||||
write(*,*) '42'
|
||||
else
|
||||
write(*,*) 'Something went wrong ',info
|
||||
end if
|
||||
|
||||
|
||||
call psb_cuda_exit()
|
||||
call psb_exit(ctxt)
|
||||
stop
|
||||
end program my_cuda_test
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
program my_cuda_test
|
||||
use psb_base_mod
|
||||
use psb_util_mod
|
||||
use psb_ext_mod
|
||||
use psb_cuda_mod
|
||||
type(psb_dspmat_type) :: a, agpu
|
||||
type(psb_d_vect_type) :: x, xg, bg
|
||||
|
||||
real(psb_dpk_), allocatable :: xtmp(:)
|
||||
type(psb_d_vect_cuda) :: vmold
|
||||
type(psb_d_elg_sparse_mat) :: aelg
|
||||
type(psb_ctxt_type) :: ctxt
|
||||
integer :: iam, np
|
||||
|
||||
|
||||
call psb_init(ctxt)
|
||||
call psb_info(ctxt,iam,np)
|
||||
call psb_cuda_init(ctxt, iam)
|
||||
|
||||
|
||||
! My own home-grown matrix generator
|
||||
call gen_matrix(ctxt,idim,desc_a,a,x,info)
|
||||
if (info /= 0) goto 9999
|
||||
|
||||
call a%cscnv(agpu,info,mold=aelg)
|
||||
if (info /= 0) goto 9999
|
||||
xtmp = x%get_vect()
|
||||
call xg%bld(xtmp,mold=vmold)
|
||||
call bg%bld(size(xtmp),mold=vmold)
|
||||
|
||||
! Do sparse MV
|
||||
call psb_spmm(done,agpu,xg,dzero,bg,desc_a,info)
|
||||
|
||||
|
||||
9999 continue
|
||||
if (info == 0) then
|
||||
write(*,*) '42'
|
||||
else
|
||||
write(*,*) 'Something went wrong ',info
|
||||
end if
|
||||
|
||||
|
||||
call psb_cuda_exit()
|
||||
call psb_exit(ctxt)
|
||||
stop
|
||||
end program my_cuda_test
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
A full example of this strategy can be seen in the
|
||||
\texttt{test/ext/kernel} and \texttt{test/\-cuda/\-kernel} subdirectories,
|
||||
where we provide sample programs
|
||||
to test the speed of the sparse matrix-vector product with the various
|
||||
data structures included in the library.
|
||||
|
||||
|
||||
\subsection{Extensions' Data Structures}
|
||||
\label{sec:ext-datastruct}
|
||||
%\ifthenelse{\boolean{mtc}}{\minitoc}{}
|
||||
|
||||
Access to the facilities provided by the EXT library is mainly
|
||||
achieved through the data types that are provided within.
|
||||
The data classes are derived from the base classes in PSBLAS, through
|
||||
the Fortran~2003 mechanism of \emph{type extension}~\cite{MRC:11}.
|
||||
|
||||
The data classes are divided between the general purpose CPU
|
||||
extensions, the GPU interfaces and the RSB interfaces.
|
||||
In the description we will make use of the notation introduced in
|
||||
Table~\ref{tab:notation}.
|
||||
|
||||
\begin{table}[ht]
|
||||
\caption{Notation for parameters describing a sparse matrix}
|
||||
\begin{center}
|
||||
{\footnotesize
|
||||
\begin{tabular}{ll}
|
||||
\hline
|
||||
Name & Description \\
|
||||
\hline
|
||||
M & Number of rows in matrix \\
|
||||
N & Number of columns in matrix \\
|
||||
NZ & Number of nonzeros in matrix \\
|
||||
AVGNZR & Average number of nonzeros per row \\
|
||||
MAXNZR & Maximum number of nonzeros per row \\
|
||||
NDIAG & Numero of nonzero diagonals\\
|
||||
AS & Coefficients array \\
|
||||
IA & Row indices array \\
|
||||
JA & Column indices array \\
|
||||
IRP & Row start pointers array \\
|
||||
JCP & Column start pointers array \\
|
||||
NZR & Number of nonzeros per row array \\
|
||||
OFFSET & Offset for diagonals \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
}
|
||||
\end{center}
|
||||
\label{tab:notation}
|
||||
\end{table}
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
% \includegraphics[width=5.2cm]{figures/mat.eps}
|
||||
\ifcase\pdfoutput
|
||||
\includegraphics[width=5.2cm]{mat.png}
|
||||
\or
|
||||
\includegraphics[width=5.2cm]{figures/mat.pdf}
|
||||
\fi
|
||||
\caption{Example of sparse matrix}
|
||||
\label{fig:dense}
|
||||
\end{figure}
|
||||
|
||||
\subsection{CPU-class extensions}
|
||||
|
||||
|
||||
\subsubsection*{ELLPACK}
|
||||
|
||||
The ELLPACK/ITPACK format (shown in Figure~\ref{fig:ell})
|
||||
comprises two 2-dimensional arrays \verb|AS| and
|
||||
\verb|JA| with \verb|M| rows and \verb|MAXNZR| columns, where
|
||||
\verb|MAXNZR| is the maximum
|
||||
number of nonzeros in any row~\cite{ELLPACK}.
|
||||
Each row of the arrays \verb|AS| and \verb|JA| contains the
|
||||
coefficients and column indices; rows shorter than
|
||||
\verb|MAXNZR| are padded with zero coefficients and appropriate column
|
||||
indices, e.g. the last valid one found in the same row.
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
% \includegraphics[width=8.2cm]{figures/ell.eps}
|
||||
\ifcase\pdfoutput
|
||||
\includegraphics[width=8.2cm]{ell.png}
|
||||
\or
|
||||
\includegraphics[width=8.2cm]{figures/ell.pdf}
|
||||
\fi
|
||||
\caption{ELLPACK compression of matrix in Figure~\ref{fig:dense}}
|
||||
\label{fig:ell}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\begin{algorithm}
|
||||
\lstset{language=Fortran}
|
||||
\small
|
||||
\begin{lstlisting}
|
||||
do i=1,n
|
||||
t=0
|
||||
do j=1,maxnzr
|
||||
t = t + as(i,j)*x(ja(i,j))
|
||||
end do
|
||||
y(i) = t
|
||||
end do
|
||||
\end{lstlisting}
|
||||
\caption{\label{alg:ell} Matrix-Vector product in ELL format}
|
||||
\end{algorithm}
|
||||
The matrix-vector product $y=Ax$ can be computed with the code shown in
|
||||
Alg.~\ref{alg:ell}; it costs one memory write per outer iteration,
|
||||
plus three memory reads and two floating-point operations per inner
|
||||
iteration.
|
||||
|
||||
Unless all rows have exactly the same number of nonzeros, some of the
|
||||
coefficients in the \verb|AS| array will be zeros; therefore this
|
||||
data structure will have an overhead both in terms of memory space
|
||||
and redundant operations (multiplications by zero). The overhead can
|
||||
be acceptable if:
|
||||
\begin{enumerate}
|
||||
\item The maximum number of nonzeros per row is not much larger than
|
||||
the average;
|
||||
\item The regularity of the data structure allows for faster code,
|
||||
e.g. by allowing vectorization, thereby offsetting the additional
|
||||
storage requirements.
|
||||
\end{enumerate}
|
||||
In the extreme case where the input matrix has one full row, the
|
||||
ELLPACK structure would require more memory than the normal 2D array
|
||||
storage. The ELLPACK storage format was very popular in the vector
|
||||
computing days; in modern CPUs it is not quite as popular, but it
|
||||
is the basis for many GPU formats.
|
||||
|
||||
The relevant data type is \verb|psb_T_ell_sparse_mat|:
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_ell_sparse_mat
|
||||
!
|
||||
! ITPACK/ELL format, extended.
|
||||
!
|
||||
|
||||
integer(psb_ipk_), allocatable :: irn(:), ja(:,:), idiag(:)
|
||||
real(psb_dpk_), allocatable :: val(:,:)
|
||||
|
||||
contains
|
||||
....
|
||||
end type psb_d_ell_sparse_mat
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_ell_sparse_mat
|
||||
!
|
||||
! ITPACK/ELL format, extended.
|
||||
!
|
||||
|
||||
integer(psb_ipk_), allocatable :: irn(:), ja(:,:), idiag(:)
|
||||
real(psb_dpk_), allocatable :: val(:,:)
|
||||
|
||||
contains
|
||||
....
|
||||
end type psb_d_ell_sparse_mat
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
|
||||
\subsubsection*{Hacked ELLPACK}
|
||||
|
||||
The \textit{hacked ELLPACK} (\textbf{HLL}) format
|
||||
alleviates the main problem of the ELLPACK format, that is,
|
||||
the amount of memory required by padding for sparse matrices in
|
||||
which the maximum row length is larger than the average.
|
||||
|
||||
The number of elements allocated to padding is $[(m*maxNR) -
|
||||
(m*avgNR) = m*(maxNR-avgNR)]$
|
||||
for both \verb|AS| and \verb|JA| arrays,
|
||||
where $m$ is equal to the number of rows of the matrix, $maxNR$ is the
|
||||
maximum number of nonzero elements
|
||||
in every row and $avgNR$ is the average number of nonzeros.
|
||||
Therefore a single densely populated row can seriously affect the
|
||||
total size of the allocation.
|
||||
|
||||
To limit this effect, in the HLL format we break the original matrix
|
||||
into equally sized groups of rows (called \textit{hacks}), and then store
|
||||
these groups as independent matrices in ELLPACK format.
|
||||
The groups can be arranged selecting rows in an arbitrarily manner;
|
||||
indeed, if the rows are sorted by decreasing number of nonzeros we
|
||||
obtain essentially the JAgged Diagonals format.
|
||||
If the rows are not in the original order, then an additional vector
|
||||
\textit{rIdx} is required, storing the actual row index for each row
|
||||
in the data structure.
|
||||
|
||||
The multiple ELLPACK-like buffers are stacked together inside a
|
||||
single, one dimensional array;
|
||||
an additional vector \textit{hackOffsets} is provided to keep track
|
||||
of the individual submatrices.
|
||||
All hacks have the same number of rows \textit{hackSize}; hence,
|
||||
the \textit{hackOffsets} vector is an array of
|
||||
$(m/hackSize)+1$ elements, each one pointing to the first index of a
|
||||
submatrix inside the stacked \textit{cM}/\textit{rP} buffers, plus an
|
||||
additional element pointing past the end of the last block, where the
|
||||
next one would begin.
|
||||
We thus have the property that
|
||||
the elements of the $k$-th \textit{hack} are stored between
|
||||
\verb|hackOffsets[k]| and
|
||||
\verb|hackOffsets[k+1]|, similarly to what happens in the CSR format.
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
% \includegraphics[width=8.2cm]{../figures/hll.eps}
|
||||
\ifcase\pdfoutput
|
||||
\includegraphics[width=.72\textwidth]{hll.png}
|
||||
\or
|
||||
\includegraphics[width=.72\textwidth]{../figures/hll.pdf}
|
||||
\fi
|
||||
\caption{Hacked ELLPACK compression of matrix in Figure~\ref{fig:dense}}
|
||||
\label{fig:hll}
|
||||
\end{figure}
|
||||
|
||||
With this data structure a very long row only affects one hack, and
|
||||
therefore the additional memory is limited to the hack in which the
|
||||
row appears.
|
||||
|
||||
The relevant data type is \verb|psb_T_hll_sparse_mat|:
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_hll_sparse_mat
|
||||
!
|
||||
! HLL format. (Hacked ELL)
|
||||
!
|
||||
integer(psb_ipk_) :: hksz
|
||||
integer(psb_ipk_), allocatable :: irn(:), ja(:), idiag(:), hkoffs(:)
|
||||
real(psb_dpk_), allocatable :: val(:)
|
||||
|
||||
contains
|
||||
....
|
||||
end type
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_hll_sparse_mat
|
||||
!
|
||||
! HLL format. (Hacked ELL)
|
||||
!
|
||||
integer(psb_ipk_) :: hksz
|
||||
integer(psb_ipk_), allocatable :: irn(:), ja(:), idiag(:), hkoffs(:)
|
||||
real(psb_dpk_), allocatable :: val(:)
|
||||
|
||||
contains
|
||||
....
|
||||
end type
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
\subsubsection*{Diagonal storage}
|
||||
|
||||
|
||||
The DIAgonal (DIA) format (shown in Figure~\ref{fig:dia})
|
||||
has a 2-dimensional array \verb|AS| containing in each column the
|
||||
coefficients along a diagonal of the matrix, and an integer array
|
||||
\verb|OFFSET| that determines where each diagonal starts. The
|
||||
diagonals in \verb|AS| are padded with zeros as necessary.
|
||||
|
||||
The code to compute the matrix-vector product $y=Ax$ is shown in Alg.~\ref{alg:dia};
|
||||
it costs one memory read per outer iteration,
|
||||
plus three memory reads, one memory write and two floating-point
|
||||
operations per inner iteration. The accesses to \verb|AS| and
|
||||
\verb|x| are in strict sequential order, therefore no indirect
|
||||
addressing is required.
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
% \includegraphics[width=8.2cm]{figures/dia.eps}
|
||||
\ifcase\pdfoutput
|
||||
\includegraphics[width=.72\textwidth]{dia.png}
|
||||
\or
|
||||
\includegraphics[width=.72\textwidth]{figures/dia.pdf}
|
||||
\fi
|
||||
\caption{DIA compression of matrix in Figure~\ref{fig:dense}}
|
||||
\label{fig:dia}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\begin{algorithm}
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
|
||||
do j=1,ndiag
|
||||
if (offset(j) > 0) then
|
||||
ir1 = 1; ir2 = m - offset(j);
|
||||
else
|
||||
ir1 = 1 - offset(j); ir2 = m;
|
||||
end if
|
||||
do i=ir1,ir2
|
||||
y(i) = y(i) + alpha*as(i,j)*x(i+offset(j))
|
||||
end do
|
||||
end do
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
do j=1,ndiag
|
||||
if (offset(j) > 0) then
|
||||
ir1 = 1; ir2 = m - offset(j);
|
||||
else
|
||||
ir1 = 1 - offset(j); ir2 = m;
|
||||
end if
|
||||
do i=ir1,ir2
|
||||
y(i) = y(i) + alpha*as(i,j)*x(i+offset(j))
|
||||
end do
|
||||
end do
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
\caption{\label{alg:dia} Matrix-Vector product in DIA format}
|
||||
\end{algorithm}
|
||||
|
||||
|
||||
The relevant data type is \verb|psb_T_dia_sparse_mat|:
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_dia_sparse_mat
|
||||
!
|
||||
! DIA format, extended.
|
||||
!
|
||||
|
||||
integer(psb_ipk_), allocatable :: offset(:)
|
||||
integer(psb_ipk_) :: nzeros
|
||||
real(psb_dpk_), allocatable :: data(:,:)
|
||||
|
||||
end type
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_dia_sparse_mat
|
||||
!
|
||||
! DIA format, extended.
|
||||
!
|
||||
|
||||
integer(psb_ipk_), allocatable :: offset(:)
|
||||
integer(psb_ipk_) :: nzeros
|
||||
real(psb_dpk_), allocatable :: data(:,:)
|
||||
|
||||
end type
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
|
||||
|
||||
\subsubsection*{Hacked DIA}
|
||||
|
||||
Storage by DIAgonals is an attractive option for matrices whose
|
||||
coefficients are located on a small set of diagonals, since they do
|
||||
away with storing explicitly the indices and therefore reduce
|
||||
significantly memory traffic. However, having a few coefficients
|
||||
outside of the main set of diagonals may significantly increase the
|
||||
amount of needed padding; moreover, while the DIA code is easily
|
||||
vectorized, it does not necessarily make optimal use of the memory
|
||||
hierarchy. While processing each diagonal we are updating entries in
|
||||
the output vector \verb|y|, which is then accessed multiple times; if
|
||||
the vector \verb|y| is too large to remain in the cache memory, the
|
||||
associated cache miss penalty is paid multiple times.
|
||||
|
||||
The \textit{hacked DIA} (\textbf{HDIA}) format was designed to contain
|
||||
the amount of padding, by breaking the original matrix
|
||||
into equally sized groups of rows (\textit{hacks}), and then storing
|
||||
these groups as independent matrices in DIA format. This approach is
|
||||
similar to that of HLL, and requires using an offset vector for each
|
||||
submatrix. Again, similarly to HLL, the various submatrices are
|
||||
stacked inside a linear array to improve memory management. The fact
|
||||
that the matrix is accessed in slices helps in reducing cache misses,
|
||||
especially regarding accesses to the %output
|
||||
vector \verb|y|.
|
||||
|
||||
|
||||
An additional vector \textit{hackOffsets} is provided to complete
|
||||
the matrix format; given that \textit{hackSize} is the number of rows of each hack,
|
||||
the \textit{hackOffsets} vector is made by an array of
|
||||
$(m/hackSize)+1$ elements, pointing to the first diagonal offset of a
|
||||
submatrix inside the stacked \textit{offsets} buffers, plus an
|
||||
additional element equal to the number of nonzero diagonals in the whole matrix.
|
||||
We thus have the property that
|
||||
the number of diagonals of the $k$-th \textit{hack} is given by
|
||||
\textit{hackOffsets[k+1] - hackOffsets[k]}.
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
% \includegraphics[width=8.2cm]{../figures/hdia.eps}
|
||||
\ifcase\pdfoutput
|
||||
\includegraphics[width=.72\textwidth]{hdia.png}
|
||||
\or
|
||||
\includegraphics[width=.72\textwidth]{../figures/hdia.pdf}
|
||||
\fi
|
||||
\caption{Hacked DIA compression of matrix in Figure~\ref{fig:dense}}
|
||||
\label{fig:hdia}
|
||||
\end{figure}
|
||||
|
||||
The relevant data type is \verb|psb_T_hdia_sparse_mat|:
|
||||
\ifpdf
|
||||
\begin{minted}[breaklines=true,bgcolor=bg,fontsize=\small]{fortran}
|
||||
type pm
|
||||
real(psb_dpk_), allocatable :: data(:,:)
|
||||
end type pm
|
||||
|
||||
type po
|
||||
integer(psb_ipk_), allocatable :: off(:)
|
||||
end type po
|
||||
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_hdia_sparse_mat
|
||||
!
|
||||
! HDIA format, extended.
|
||||
!
|
||||
|
||||
type(pm), allocatable :: hdia(:)
|
||||
type(po), allocatable :: offset(:)
|
||||
integer(psb_ipk_) :: nblocks, nzeros
|
||||
integer(psb_ipk_) :: hack = 64
|
||||
integer(psb_long_int_k_) :: dim=0
|
||||
|
||||
contains
|
||||
....
|
||||
end type
|
||||
\end{minted}
|
||||
\else
|
||||
\begin{center}
|
||||
\begin{minipage}[tl]{0.9\textwidth}
|
||||
\begin{verbatim}
|
||||
type pm
|
||||
real(psb_dpk_), allocatable :: data(:,:)
|
||||
end type pm
|
||||
|
||||
type po
|
||||
integer(psb_ipk_), allocatable :: off(:)
|
||||
end type po
|
||||
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_hdia_sparse_mat
|
||||
!
|
||||
! HDIA format, extended.
|
||||
!
|
||||
|
||||
type(pm), allocatable :: hdia(:)
|
||||
type(po), allocatable :: offset(:)
|
||||
integer(psb_ipk_) :: nblocks, nzeros
|
||||
integer(psb_ipk_) :: hack = 64
|
||||
integer(psb_long_int_k_) :: dim=0
|
||||
|
||||
contains
|
||||
....
|
||||
end type
|
||||
\end{verbatim}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\fi
|
||||
|
||||
|
After Width: | Height: | Size: 50 KiB |
After Width: | Height: | Size: 52 KiB |
After Width: | Height: | Size: 58 KiB |
After Width: | Height: | Size: 61 KiB |
After Width: | Height: | Size: 90 KiB |
After Width: | Height: | Size: 29 KiB |
@ -0,0 +1,53 @@
|
||||
include ../Make.inc
|
||||
#
|
||||
# Libraries used
|
||||
#
|
||||
PSBLIBDIR=$(PSBLASDIR)/lib/
|
||||
PSBINCDIR=$(PSBLASDIR)/include
|
||||
PSBMODDIR=$(PSBLASDIR)/modules
|
||||
LIBDIR=../lib
|
||||
INCDIR=../include
|
||||
MODDIR=../modules
|
||||
PSBLAS_LIB= -L$(PSBLIBDIR) -lpsb_util -lpsb_base
|
||||
#-lpsb_util -lpsb_krylov -lpsb_prec -lpsb_base
|
||||
LDLIBS=$(PSBLDLIBS)
|
||||
#
|
||||
# Compilers and such
|
||||
#
|
||||
#CCOPT= -g
|
||||
FINCLUDES=$(FMFLAG). $(FMFLAG)$(PSBMODDIR) $(FMFLAG)$(PSBINCDIR) $(FIFLAG). $(LIBRSB_INCLUDES) $(LIBRSB_DEFINES)
|
||||
CINCLUDES=-I$(GPU_INCDIR) -I$(CUDA_INCDIR)
|
||||
LIBNAME=libpsb_rsb.a
|
||||
|
||||
|
||||
FOBJS= rsb_mod.o psb_d_rsb_mat_mod.o \
|
||||
psb_rsb_penv_mod.o psb_rsb_mod.o
|
||||
|
||||
COBJS= rsb_int.o
|
||||
|
||||
OBJS=$(COBJS) $(FOBJS)
|
||||
|
||||
lib: objs ilib
|
||||
/bin/cp -p $(LIBNAME) $(LIBDIR)
|
||||
|
||||
objs: $(OBJS) iobjs
|
||||
/bin/cp -p *$(.mod) $(MODDIR)
|
||||
|
||||
iobjs:
|
||||
$(MAKE) -C impl objs
|
||||
ilib: iobjs
|
||||
$(MAKE) -C impl lib LIBNAME=$(LIBNAME)
|
||||
|
||||
clean: cclean iclean
|
||||
/bin/rm -f $(FOBJS) *$(.mod) *.a
|
||||
|
||||
cclean:
|
||||
/bin/rm -f $(COBJS)
|
||||
iclean:
|
||||
cd impl && $(MAKE) clean
|
||||
|
||||
verycleanlib:
|
||||
(cd ../..; make veryclean)
|
||||
|
||||
|
||||
|
@ -0,0 +1,30 @@
|
||||
include ../../Make.inc
|
||||
PSBLIBDIR=$(PSBLASDIR)/lib/
|
||||
PSBINCDIR=$(PSBLASDIR)/include
|
||||
PSBMODDIR=$(PSBLASDIR)/modules
|
||||
LIBDIR=../../lib
|
||||
INCDIR=../../include
|
||||
MODDIR=../../modules
|
||||
PSBLAS_LIB= -L$(PSBLIBDIR) -lpsb_util -lpsb_base
|
||||
#-lpsb_util -lpsb_krylov -lpsb_prec -lpsb_base
|
||||
LDLIBS=$(PSBLDLIBS)
|
||||
#
|
||||
# Compilers and such
|
||||
#
|
||||
#CCOPT= -g
|
||||
FINCLUDES=$(FMFLAG).. $(FMFLAG)$(INCDIR) $(FMFLAG)$(MODDIR) $(FMFLAG)$(PSBMODDIR) $(FMFLAG)$(PSBINCDIR) $(LIBRSB_INCLUDES) $(FIFLAG).. $(LIBRSB_DEFINES)
|
||||
CINCLUDES=
|
||||
LIBNAME=libpsb_rsb.a
|
||||
|
||||
OBJS= \
|
||||
psb_d_cp_rsb_from_coo.o \
|
||||
psb_d_mv_rsb_from_coo.o \
|
||||
psb_d_cp_rsb_to_coo.o psb_d_rsb_csmv.o
|
||||
|
||||
objs: $(OBJS)
|
||||
|
||||
lib: objs
|
||||
ar cur ../$(LIBNAME) $(OBJS)
|
||||
|
||||
clean:
|
||||
/bin/rm -f $(OBJS)
|
@ -0,0 +1,78 @@
|
||||
! Parallel Sparse BLAS GPU plugin
|
||||
! (C) Copyright 2013
|
||||
!
|
||||
! Salvatore Filippone
|
||||
! Alessandro Fanfarillo
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
|
||||
|
||||
subroutine psb_d_cp_rsb_from_coo(a,b,info)
|
||||
|
||||
use psb_base_mod
|
||||
use rsb_mod
|
||||
use psb_d_rsb_mat_mod, psb_protect_name => psb_d_cp_rsb_from_coo
|
||||
implicit none
|
||||
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
class(psb_d_coo_sparse_mat), intent(in) :: b
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
|
||||
!locals
|
||||
type(psb_d_coo_sparse_mat) :: tmp
|
||||
Integer(Psb_ipk_) :: nza, nr, i,j,irw, idl,err_act, nc
|
||||
integer(psb_ipk_) :: nzm, ir, ic, k ,bs
|
||||
integer(psb_ipk_) :: debug_level, debug_unit
|
||||
character(len=20) :: name
|
||||
|
||||
info = psb_success_
|
||||
#ifdef HAVE_RSB
|
||||
! This is to have fix_coo called behind the scenes
|
||||
call b%cp_to_coo(tmp,info)
|
||||
|
||||
call tmp%fix(info)
|
||||
if (info /= psb_success_) return
|
||||
|
||||
nr = tmp%get_nrows()
|
||||
nc = tmp%get_ncols()
|
||||
nza = tmp%get_nzeros()
|
||||
! If it is sorted then we can lessen memory impact
|
||||
a%psb_d_base_sparse_mat = tmp%psb_d_base_sparse_mat
|
||||
|
||||
bs = 1!RSB_DEFAULT_BLOCKING
|
||||
|
||||
info = Rsb_from_coo(a%rsbMat,b%val,b%ia,b%ja,nza,nr,nc,bs,bs)
|
||||
|
||||
call tmp%free()
|
||||
#endif
|
||||
|
||||
return
|
||||
|
||||
9999 continue
|
||||
info = psb_err_alloc_dealloc_
|
||||
return
|
||||
|
||||
end subroutine psb_d_cp_rsb_from_coo
|
@ -0,0 +1,77 @@
|
||||
! Parallel Sparse BLAS GPU plugin
|
||||
! (C) Copyright 2013
|
||||
!
|
||||
! Salvatore Filippone
|
||||
! Alessandro Fanfarillo
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
|
||||
|
||||
subroutine psb_d_cp_rsb_to_coo(a,b,info)
|
||||
|
||||
use psb_base_mod
|
||||
use rsb
|
||||
use psb_d_rsb_mat_mod, psb_protect_name => psb_d_cp_rsb_to_coo
|
||||
implicit none
|
||||
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
class(psb_d_coo_sparse_mat), intent(inout) :: b
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
real(psb_dpk_), pointer :: val_point(:)
|
||||
type(c_ptr) :: t_p,s_p
|
||||
|
||||
!locals
|
||||
integer(psb_ipk_) :: i, j, k,nr,nza,nc
|
||||
|
||||
info = psb_success_
|
||||
|
||||
nr = a%get_nrows()
|
||||
nc = a%get_ncols()
|
||||
nza = a%get_nzeros()
|
||||
|
||||
call b%allocate(nr,nc,nza)
|
||||
b%psb_d_base_sparse_mat = a%psb_d_base_sparse_mat
|
||||
|
||||
allocate(val_point(nza))
|
||||
|
||||
t_p = c_loc(val_point(1))
|
||||
|
||||
info = rsb_mtx_get_coo(a%rsbMat, t_p, b%ia, b%ja,RSB_FLAG_FORTRAN_INDICES_INTERFACE)
|
||||
|
||||
!info = rsb_mtx_switch_to_coo(a%rsbMat,t_p,b%ia,b%ja,RSB_FLAG_FORTRAN_INDICES_INTERFACE)
|
||||
|
||||
k = rsb_perror(s_p,info)
|
||||
|
||||
do i=1,nza
|
||||
b%val(i)=val_point(i)
|
||||
enddo
|
||||
|
||||
deallocate(val_point)
|
||||
|
||||
call b%set_nzeros(nza)
|
||||
call b%fix(info)
|
||||
|
||||
end subroutine psb_d_cp_rsb_to_coo
|
@ -0,0 +1,114 @@
|
||||
! Parallel Sparse BLAS GPU plugin
|
||||
! (C) Copyright 2013
|
||||
!
|
||||
! Salvatore Filippone
|
||||
! Alessandro Fanfarillo
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
|
||||
|
||||
subroutine psb_d_mv_rsb_from_coo(a,b,info)
|
||||
|
||||
use psb_base_mod
|
||||
use psb_d_rsb_mat_mod, psb_protect_name => psb_d_mv_rsb_from_coo
|
||||
implicit none
|
||||
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
class(psb_d_coo_sparse_mat), intent(inout) :: b
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
|
||||
!locals
|
||||
Integer(Psb_ipk_) :: nza, nr, i,j,k, idl,err_act, nc, nzm, ir, ic
|
||||
|
||||
info = psb_success_
|
||||
|
||||
call b%fix(info)
|
||||
if (info /= psb_success_) return
|
||||
|
||||
nr = b%get_nrows()
|
||||
nc = b%get_ncols()
|
||||
nza = b%get_nzeros()
|
||||
! if (b%is_sorted()) then
|
||||
! ! If it is sorted then we can lessen memory impact
|
||||
! a%psb_d_base_sparse_mat = b%psb_d_base_sparse_mat
|
||||
|
||||
! ! First compute the number of nonzeros in each row.
|
||||
! call psb_realloc(nr,a%irn,info)
|
||||
! if (info /= 0) goto 9999
|
||||
! a%irn = 0
|
||||
! do i=1, nza
|
||||
! a%irn(b%ia(i)) = a%irn(b%ia(i)) + 1
|
||||
! end do
|
||||
! nzm = 0
|
||||
! do i=1, nr
|
||||
! nzm = max(nzm,a%irn(i))
|
||||
! a%irn(i) = 0
|
||||
! end do
|
||||
! ! Second: copy the column indices.
|
||||
! call psb_realloc(nr,a%idiag,info)
|
||||
! if (info == 0) call psb_realloc(nr,nzm,a%ja,info)
|
||||
! if (info /= 0) goto 9999
|
||||
! do i=1, nza
|
||||
! ir = b%ia(i)
|
||||
! ic = b%ja(i)
|
||||
! j = a%irn(ir) + 1
|
||||
! a%ja(ir,j) = ic
|
||||
! a%irn(ir) = j
|
||||
! end do
|
||||
! ! Third copy the other stuff
|
||||
! deallocate(b%ia,b%ja,stat=info)
|
||||
! if (info == 0) call psb_realloc(nr,a%idiag,info)
|
||||
! if (info == 0) call psb_realloc(nr,nzm,a%val,info)
|
||||
! if (info /= 0) goto 9999
|
||||
! k = 0
|
||||
! do i=1, nr
|
||||
! a%idiag(i) = 0
|
||||
! do j=1, a%irn(i)
|
||||
! k = k + 1
|
||||
! a%val(i,j) = b%val(k)
|
||||
! if (i==a%ja(i,j)) a%idiag(i)=j
|
||||
! end do
|
||||
! do j=a%irn(i)+1, nzm
|
||||
! a%ja(i,j) = i
|
||||
! a%val(i,j) = dzero
|
||||
! end do
|
||||
! end do
|
||||
|
||||
! else
|
||||
! If b is not sorted, the only way is to copy.
|
||||
call a%cp_from_coo(b,info)
|
||||
if (info /= 0) goto 9999
|
||||
! end if
|
||||
|
||||
call b%free()
|
||||
|
||||
return
|
||||
|
||||
9999 continue
|
||||
info = psb_err_alloc_dealloc_
|
||||
return
|
||||
|
||||
end subroutine psb_d_mv_rsb_from_coo
|
@ -0,0 +1,138 @@
|
||||
! Parallel Sparse BLAS GPU plugin
|
||||
! (C) Copyright 2013
|
||||
!
|
||||
! Salvatore Filippone
|
||||
! Alessandro Fanfarillo
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
|
||||
|
||||
subroutine psb_d_rsb_csmv(alpha,a,x,beta,y,info,trans)
|
||||
|
||||
use psb_base_mod
|
||||
use rsb_mod
|
||||
use psb_d_rsb_mat_mod, psb_protect_name => psb_d_rsb_csmv
|
||||
implicit none
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
real(psb_dpk_), intent(in) :: alpha, beta, x(:)
|
||||
real(psb_dpk_), intent(inout) :: y(:)
|
||||
integer, intent(out) :: info
|
||||
character, optional, intent(in) :: trans
|
||||
|
||||
character :: trans_
|
||||
integer :: i,j,k,m,n, nnz, ir, jc
|
||||
real(psb_dpk_) :: acc
|
||||
type(c_ptr) :: gpX, gpY
|
||||
logical :: tra
|
||||
Integer :: err_act
|
||||
character(len=20) :: name='d_rsb_csmv'
|
||||
logical, parameter :: debug=.false.
|
||||
|
||||
call psb_erractionsave(err_act)
|
||||
info = psb_success_
|
||||
#ifdef HAVE_RSB
|
||||
if (present(trans)) then
|
||||
trans_ = trans
|
||||
else
|
||||
trans_ = 'N'
|
||||
end if
|
||||
|
||||
if (.not.a%is_asb()) then
|
||||
info = psb_err_invalid_mat_state_
|
||||
call psb_errpush(info,name)
|
||||
goto 9999
|
||||
endif
|
||||
|
||||
|
||||
tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')
|
||||
|
||||
if (tra) then
|
||||
m = a%get_ncols()
|
||||
n = a%get_nrows()
|
||||
else
|
||||
n = a%get_ncols()
|
||||
m = a%get_nrows()
|
||||
end if
|
||||
|
||||
if (size(x,1)<n) then
|
||||
info = 36
|
||||
call psb_errpush(info,name,i_err=(/3*ione,n,izero,izero,izero/))
|
||||
goto 9999
|
||||
end if
|
||||
|
||||
if (size(y,1)<m) then
|
||||
info = 36
|
||||
call psb_errpush(info,name,i_err=(/5*ione,m,izero,izero,izero/))
|
||||
goto 9999
|
||||
end if
|
||||
|
||||
! if (tra) then
|
||||
! call a%psb_d_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans)
|
||||
! else
|
||||
|
||||
info = Rsb_spmv(a%rsbMat,x,alpha,y,beta,trans_)
|
||||
|
||||
if (info /= 0) goto 9999
|
||||
|
||||
! if (info == 0) &
|
||||
! & info = FallocMultiVecDevice(gpX,1,size(x,1),spgpu_type_double)
|
||||
! if (alpha /= dzero) then
|
||||
! if (info == 0) &
|
||||
! & info = writeMultiVecDevice(gpX,x)
|
||||
! end if
|
||||
! if (info == 0) &
|
||||
! & info = FallocMultiVecDevice(gpY,1,size(y,1),spgpu_type_double)
|
||||
! if (beta /= dzero) then
|
||||
! if (info == 0) &
|
||||
! & info = writeMultiVecDevice(gpY,y)
|
||||
! end if
|
||||
! if (info == 0) &
|
||||
! & info = spmvHllDevice(a%deviceMat,alpha,gpX,beta,gpY)
|
||||
! if (info == 0) &
|
||||
! & info = readMultiVecDevice(gpY,y)
|
||||
! if (info /= 0) goto 9999
|
||||
! call freeMultiVecDevice(gpX)
|
||||
! call freeMultiVecDevice(gpY)
|
||||
! endif
|
||||
! #else
|
||||
! call a%psb_d_hll_sparse_mat%spmm(alpha,x,beta,y,info,trans)
|
||||
! write(*,*) y
|
||||
#endif
|
||||
|
||||
call psb_erractionrestore(err_act)
|
||||
return
|
||||
|
||||
9999 continue
|
||||
call psb_erractionrestore(err_act)
|
||||
|
||||
if (err_act == psb_act_abort_) then
|
||||
call psb_error()
|
||||
return
|
||||
end if
|
||||
return
|
||||
|
||||
|
||||
end subroutine psb_d_rsb_csmv
|
@ -0,0 +1,487 @@
|
||||
! Parallel Sparse BLAS GPU plugin
|
||||
! (C) Copyright 2013
|
||||
!
|
||||
! Salvatore Filippone
|
||||
! Alessandro Fanfarillo
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
|
||||
module psb_d_rsb_mat_mod
|
||||
|
||||
use psb_d_base_mat_mod
|
||||
use iso_c_binding
|
||||
|
||||
type, extends(psb_d_base_sparse_mat) :: psb_d_rsb_sparse_mat
|
||||
|
||||
type(c_ptr) :: rsbMat = c_null_ptr
|
||||
|
||||
contains
|
||||
! procedure, pass(a) :: get_size => d_rsb_get_size
|
||||
procedure, pass(a) :: get_nzeros => d_rsb_get_nzeros
|
||||
procedure, nopass :: get_fmt => d_rsb_get_fmt
|
||||
procedure, pass(a) :: sizeof => d_rsb_sizeof
|
||||
! procedure, pass(a) :: csmm => psb_d_rsb_csmm
|
||||
procedure, pass(a) :: csmv => psb_d_rsb_csmv
|
||||
! procedure, pass(a) :: inner_cssm => psb_d_rsb_cssm
|
||||
! procedure, pass(a) :: inner_cssv => psb_d_rsb_cssv
|
||||
! procedure, pass(a) :: scals => psb_d_rsb_scals
|
||||
! procedure, pass(a) :: scalv => psb_d_rsb_scal
|
||||
! procedure, pass(a) :: maxval => psb_d_rsb_maxval
|
||||
! procedure, pass(a) :: csnmi => psb_d_rsb_csnmi
|
||||
! procedure, pass(a) :: csnm1 => psb_d_rsb_csnm1
|
||||
! procedure, pass(a) :: rowsum => psb_d_rsb_rowsum
|
||||
! procedure, pass(a) :: arwsum => psb_d_rsb_arwsum
|
||||
! procedure, pass(a) :: colsum => psb_d_rsb_colsum
|
||||
! procedure, pass(a) :: aclsum => psb_d_rsb_aclsum
|
||||
! procedure, pass(a) :: reallocate_nz => psb_d_rsb_reallocate_nz
|
||||
! procedure, pass(a) :: allocate_mnnz => psb_d_rsb_allocate_mnnz
|
||||
procedure, pass(a) :: cp_to_coo => psb_d_cp_rsb_to_coo
|
||||
procedure, pass(a) :: cp_from_coo => psb_d_cp_rsb_from_coo
|
||||
! procedure, pass(a) :: cp_to_fmt => psb_d_cp_rsb_to_fmt
|
||||
! procedure, pass(a) :: cp_from_fmt => psb_d_cp_rsb_from_fmt
|
||||
! procedure, pass(a) :: mv_to_coo => psb_d_mv_rsb_to_coo
|
||||
procedure, pass(a) :: mv_from_coo => psb_d_mv_rsb_from_coo
|
||||
! procedure, pass(a) :: mv_to_fmt => psb_d_mv_rsb_to_fmt
|
||||
! procedure, pass(a) :: mv_from_fmt => psb_d_mv_rsb_from_fmt
|
||||
! procedure, pass(a) :: csput => psb_d_rsb_csput
|
||||
! procedure, pass(a) :: get_diag => psb_d_rsb_get_diag
|
||||
! procedure, pass(a) :: csgetptn => psb_d_rsb_csgetptn
|
||||
! procedure, pass(a) :: csgetrow => psb_d_rsb_csgetrow
|
||||
! procedure, pass(a) :: get_nz_row => d_rsb_get_nz_row
|
||||
! procedure, pass(a) :: reinit => psb_d_rsb_reinit
|
||||
! procedure, pass(a) :: trim => psb_d_rsb_trim
|
||||
! procedure, pass(a) :: print => psb_d_rsb_print
|
||||
procedure, pass(a) :: free => d_rsb_free
|
||||
! procedure, pass(a) :: mold => psb_d_rsb_mold
|
||||
|
||||
end type psb_d_rsb_sparse_mat
|
||||
|
||||
private :: d_rsb_get_nzeros, d_rsb_free, d_rsb_get_fmt, &
|
||||
& d_rsb_get_size, d_rsb_sizeof, d_rsb_get_nz_row
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_reallocate_nz(nz,a)
|
||||
import :: psb_d_rsb_sparse_mat, psb_ipk_
|
||||
integer(psb_ipk_), intent(in) :: nz
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
end subroutine psb_d_rsb_reallocate_nz
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_reinit(a,clear)
|
||||
import :: psb_d_rsb_sparse_mat
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
logical, intent(in), optional :: clear
|
||||
end subroutine psb_d_rsb_reinit
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_trim(a)
|
||||
import :: psb_d_rsb_sparse_mat
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
end subroutine psb_d_rsb_trim
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_mold(a,b,info)
|
||||
import :: psb_d_rsb_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
class(psb_d_base_sparse_mat), intent(inout), allocatable :: b
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
end subroutine psb_d_rsb_mold
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_allocate_mnnz(m,n,a,nz)
|
||||
import :: psb_d_rsb_sparse_mat, psb_ipk_
|
||||
integer(psb_ipk_), intent(in) :: m,n
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
integer(psb_ipk_), intent(in), optional :: nz
|
||||
end subroutine psb_d_rsb_allocate_mnnz
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_print(iout,a,iv,head,ivr,ivc)
|
||||
import :: psb_d_rsb_sparse_mat, psb_ipk_
|
||||
integer(psb_ipk_), intent(in) :: iout
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
integer(psb_ipk_), intent(in), optional :: iv(:)
|
||||
character(len=*), optional :: head
|
||||
integer(psb_ipk_), intent(in), optional :: ivr(:), ivc(:)
|
||||
end subroutine psb_d_rsb_print
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_cp_rsb_to_coo(a,b,info)
|
||||
import :: psb_d_coo_sparse_mat, psb_d_rsb_sparse_mat, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
class(psb_d_coo_sparse_mat), intent(inout) :: b
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
end subroutine psb_d_cp_rsb_to_coo
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_cp_rsb_from_coo(a,b,info)
|
||||
import :: psb_d_rsb_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
class(psb_d_coo_sparse_mat), intent(in) :: b
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
end subroutine psb_d_cp_rsb_from_coo
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_cp_rsb_to_fmt(a,b,info)
|
||||
import :: psb_d_rsb_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
class(psb_d_base_sparse_mat), intent(inout) :: b
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
end subroutine psb_d_cp_rsb_to_fmt
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_cp_rsb_from_fmt(a,b,info)
|
||||
import :: psb_d_rsb_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
class(psb_d_base_sparse_mat), intent(in) :: b
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
end subroutine psb_d_cp_rsb_from_fmt
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_mv_rsb_to_coo(a,b,info)
|
||||
import :: psb_d_rsb_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
class(psb_d_coo_sparse_mat), intent(inout) :: b
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
end subroutine psb_d_mv_rsb_to_coo
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_mv_rsb_from_coo(a,b,info)
|
||||
import :: psb_d_rsb_sparse_mat, psb_d_coo_sparse_mat, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
class(psb_d_coo_sparse_mat), intent(inout) :: b
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
end subroutine psb_d_mv_rsb_from_coo
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_mv_rsb_to_fmt(a,b,info)
|
||||
import :: psb_d_rsb_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
class(psb_d_base_sparse_mat), intent(inout) :: b
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
end subroutine psb_d_mv_rsb_to_fmt
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_mv_rsb_from_fmt(a,b,info)
|
||||
import :: psb_d_rsb_sparse_mat, psb_d_base_sparse_mat, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
class(psb_d_base_sparse_mat), intent(inout) :: b
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
end subroutine psb_d_mv_rsb_from_fmt
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_csput(nz,ia,ja,val,a,imin,imax,jmin,jmax,info,gtl)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
real(psb_dpk_), intent(in) :: val(:)
|
||||
integer(psb_ipk_), intent(in) :: nz,ia(:), ja(:),&
|
||||
& imin,imax,jmin,jmax
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
integer(psb_ipk_), intent(in), optional :: gtl(:)
|
||||
end subroutine psb_d_rsb_csput
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_csgetptn(imin,imax,a,nz,ia,ja,info,&
|
||||
& jmin,jmax,iren,append,nzin,rscale,cscale)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
integer(psb_ipk_), intent(in) :: imin,imax
|
||||
integer(psb_ipk_), intent(out) :: nz
|
||||
integer(psb_ipk_), allocatable, intent(inout) :: ia(:), ja(:)
|
||||
integer(psb_ipk_),intent(out) :: info
|
||||
logical, intent(in), optional :: append
|
||||
integer(psb_ipk_), intent(in), optional :: iren(:)
|
||||
integer(psb_ipk_), intent(in), optional :: jmin,jmax, nzin
|
||||
logical, intent(in), optional :: rscale,cscale
|
||||
end subroutine psb_d_rsb_csgetptn
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_csgetrow(imin,imax,a,nz,ia,ja,val,info,&
|
||||
& jmin,jmax,iren,append,nzin,rscale,cscale)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
integer(psb_ipk_), intent(in) :: imin,imax
|
||||
integer(psb_ipk_), intent(out) :: nz
|
||||
integer(psb_ipk_), allocatable, intent(inout) :: ia(:), ja(:)
|
||||
real(psb_dpk_), allocatable, intent(inout) :: val(:)
|
||||
integer(psb_ipk_),intent(out) :: info
|
||||
logical, intent(in), optional :: append
|
||||
integer(psb_ipk_), intent(in), optional :: iren(:)
|
||||
integer(psb_ipk_), intent(in), optional :: jmin,jmax, nzin
|
||||
logical, intent(in), optional :: rscale,cscale
|
||||
end subroutine psb_d_rsb_csgetrow
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_csgetblk(imin,imax,a,b,info,&
|
||||
& jmin,jmax,iren,append,rscale,cscale)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_, psb_d_coo_sparse_mat, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
class(psb_d_coo_sparse_mat), intent(inout) :: b
|
||||
integer(psb_ipk_), intent(in) :: imin,imax
|
||||
integer(psb_ipk_),intent(out) :: info
|
||||
logical, intent(in), optional :: append
|
||||
integer(psb_ipk_), intent(in), optional :: iren(:)
|
||||
integer(psb_ipk_), intent(in), optional :: jmin,jmax
|
||||
logical, intent(in), optional :: rscale,cscale
|
||||
end subroutine psb_d_rsb_csgetblk
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_cssv(alpha,a,x,beta,y,info,trans)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
real(psb_dpk_), intent(in) :: alpha, beta, x(:)
|
||||
real(psb_dpk_), intent(inout) :: y(:)
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
character, optional, intent(in) :: trans
|
||||
end subroutine psb_d_rsb_cssv
|
||||
subroutine psb_d_rsb_cssm(alpha,a,x,beta,y,info,trans)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
real(psb_dpk_), intent(in) :: alpha, beta, x(:,:)
|
||||
real(psb_dpk_), intent(inout) :: y(:,:)
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
character, optional, intent(in) :: trans
|
||||
end subroutine psb_d_rsb_cssm
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_csmv(alpha,a,x,beta,y,info,trans)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
real(psb_dpk_), intent(in) :: alpha, beta, x(:)
|
||||
real(psb_dpk_), intent(inout) :: y(:)
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
character, optional, intent(in) :: trans
|
||||
end subroutine psb_d_rsb_csmv
|
||||
subroutine psb_d_rsb_csmm(alpha,a,x,beta,y,info,trans)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
real(psb_dpk_), intent(in) :: alpha, beta, x(:,:)
|
||||
real(psb_dpk_), intent(inout) :: y(:,:)
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
character, optional, intent(in) :: trans
|
||||
end subroutine psb_d_rsb_csmm
|
||||
end interface
|
||||
|
||||
|
||||
interface
|
||||
function psb_d_rsb_maxval(a) result(res)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
real(psb_dpk_) :: res
|
||||
end function psb_d_rsb_maxval
|
||||
end interface
|
||||
|
||||
interface
|
||||
function psb_d_rsb_csnmi(a) result(res)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
real(psb_dpk_) :: res
|
||||
end function psb_d_rsb_csnmi
|
||||
end interface
|
||||
|
||||
interface
|
||||
function psb_d_rsb_csnm1(a) result(res)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
real(psb_dpk_) :: res
|
||||
end function psb_d_rsb_csnm1
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_rowsum(d,a)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
real(psb_dpk_), intent(out) :: d(:)
|
||||
end subroutine psb_d_rsb_rowsum
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_arwsum(d,a)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
real(psb_dpk_), intent(out) :: d(:)
|
||||
end subroutine psb_d_rsb_arwsum
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_colsum(d,a)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
real(psb_dpk_), intent(out) :: d(:)
|
||||
end subroutine psb_d_rsb_colsum
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_aclsum(d,a)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
real(psb_dpk_), intent(out) :: d(:)
|
||||
end subroutine psb_d_rsb_aclsum
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_get_diag(a,d,info)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
real(psb_dpk_), intent(out) :: d(:)
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
end subroutine psb_d_rsb_get_diag
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_scal(d,a,info,side)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
real(psb_dpk_), intent(in) :: d(:)
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
character, intent(in), optional :: side
|
||||
end subroutine psb_d_rsb_scal
|
||||
end interface
|
||||
|
||||
interface
|
||||
subroutine psb_d_rsb_scals(d,a,info)
|
||||
import :: psb_d_rsb_sparse_mat, psb_dpk_, psb_ipk_
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
real(psb_dpk_), intent(in) :: d
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
end subroutine psb_d_rsb_scals
|
||||
end interface
|
||||
|
||||
|
||||
|
||||
contains
|
||||
|
||||
! == ===================================
|
||||
!
|
||||
!
|
||||
!
|
||||
! Getters
|
||||
!
|
||||
!
|
||||
!
|
||||
!
|
||||
!
|
||||
! == ===================================
|
||||
|
||||
|
||||
function d_rsb_sizeof(a) result(res)
|
||||
implicit none
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
integer(psb_epk_) :: res
|
||||
|
||||
|
||||
|
||||
|
||||
end function d_rsb_sizeof
|
||||
|
||||
function d_rsb_get_fmt() result(res)
|
||||
implicit none
|
||||
character(len=5) :: res
|
||||
res = 'RSB'
|
||||
end function d_rsb_get_fmt
|
||||
|
||||
function d_rsb_get_nzeros(a) result(res)
|
||||
use rsb_mod
|
||||
implicit none
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
integer(psb_ipk_) :: res
|
||||
|
||||
res = Rsb_get_nzeros(a%rsbMat)
|
||||
|
||||
end function d_rsb_get_nzeros
|
||||
|
||||
function d_rsb_get_size(a) result(res)
|
||||
implicit none
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
integer(psb_ipk_) :: res
|
||||
|
||||
end function d_rsb_get_size
|
||||
|
||||
|
||||
function d_rsb_get_nz_row(idx,a) result(res)
|
||||
|
||||
implicit none
|
||||
|
||||
class(psb_d_rsb_sparse_mat), intent(in) :: a
|
||||
integer(psb_ipk_), intent(in) :: idx
|
||||
integer(psb_ipk_) :: res
|
||||
|
||||
res = 0
|
||||
|
||||
|
||||
end function d_rsb_get_nz_row
|
||||
|
||||
|
||||
|
||||
! == ===================================
|
||||
!
|
||||
!
|
||||
!
|
||||
! Data management
|
||||
!
|
||||
!
|
||||
!
|
||||
!
|
||||
!
|
||||
! == ===================================
|
||||
|
||||
subroutine d_rsb_free(a)
|
||||
use rsb_mod
|
||||
implicit none
|
||||
|
||||
class(psb_d_rsb_sparse_mat), intent(inout) :: a
|
||||
|
||||
call freeRsbMat(a%rsbMat)
|
||||
|
||||
call a%set_null()
|
||||
call a%set_nrows(0)
|
||||
call a%set_ncols(0)
|
||||
|
||||
return
|
||||
|
||||
end subroutine d_rsb_free
|
||||
|
||||
|
||||
end module psb_d_rsb_mat_mod
|
@ -0,0 +1,50 @@
|
||||
! Parallel Sparse BLAS GPU plugin
|
||||
! (C) Copyright 2013
|
||||
!
|
||||
! Salvatore Filippone
|
||||
! Alessandro Fanfarillo
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
|
||||
|
||||
module psb_rsb_mod
|
||||
use psb_const_mod
|
||||
use rsb_mod
|
||||
use psb_rsb_penv_mod
|
||||
! use psb_d_ell_mat_mod
|
||||
! use psb_s_ell_mat_mod
|
||||
! use psb_z_ell_mat_mod
|
||||
! use psb_c_ell_mat_mod
|
||||
|
||||
! use psb_s_hll_mat_mod
|
||||
! use psb_d_hll_mat_mod
|
||||
! use psb_c_hll_mat_mod
|
||||
! use psb_z_hll_mat_mod
|
||||
|
||||
! use psb_d_dia_mat_mod
|
||||
! use psb_d_hdia_mat_mod
|
||||
use psb_d_rsb_mat_mod
|
||||
end module psb_rsb_mod
|
@ -0,0 +1,99 @@
|
||||
! Parallel Sparse BLAS GPU plugin
|
||||
! (C) Copyright 2013
|
||||
!
|
||||
! Salvatore Filippone
|
||||
! Alessandro Fanfarillo
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
|
||||
|
||||
module psb_rsb_penv_mod
|
||||
use psb_const_mod
|
||||
use psb_penv_mod
|
||||
!use psi_comm_buffers_mod, only : psb_buffer_queue
|
||||
use iso_c_binding
|
||||
|
||||
! interface psb_rsb_init
|
||||
! module procedure psb_rsb_init
|
||||
! end interface
|
||||
#if defined(HAVE_RSB)
|
||||
interface
|
||||
function psb_C_rsb_init() &
|
||||
& result(res) bind(c,name='rsbInit')
|
||||
use iso_c_binding
|
||||
integer(c_int) :: res
|
||||
end function psb_C_rsb_init
|
||||
end interface
|
||||
|
||||
interface
|
||||
function psb_C_rsb_exit() &
|
||||
& result(res) bind(c,name='rsbExit')
|
||||
use iso_c_binding
|
||||
integer(c_int) :: res
|
||||
end function psb_C_rsb_exit
|
||||
end interface
|
||||
|
||||
#endif
|
||||
|
||||
contains
|
||||
! !!!!!!!!!!!!!!!!!!!!!!
|
||||
!
|
||||
! Environment handling
|
||||
!
|
||||
! !!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
|
||||
subroutine psb_rsb_init()
|
||||
use psb_penv_mod
|
||||
use psb_const_mod
|
||||
use psb_error_mod
|
||||
! type(psb_ctxt_type), intent(in) :: ctxt
|
||||
! integer, intent(in), optional :: dev
|
||||
|
||||
integer :: info
|
||||
|
||||
#if defined (HAVE_RSB)
|
||||
info = psb_C_rsb_init()
|
||||
if (info/=0) write(*,*) 'error during rsb_init'
|
||||
#endif
|
||||
end subroutine psb_rsb_init
|
||||
|
||||
subroutine psb_rsb_exit()
|
||||
use psb_penv_mod
|
||||
use psb_const_mod
|
||||
use psb_error_mod
|
||||
! type(psb_ctxt_type), intent(in) :: ctxt
|
||||
! integer, intent(in), optional :: dev
|
||||
|
||||
integer :: info
|
||||
|
||||
#if defined (HAVE_RSB)
|
||||
info = psb_C_rsb_exit()
|
||||
if (info/=0) write(*,*) 'error during rsb_exit'
|
||||
#endif
|
||||
end subroutine psb_rsb_exit
|
||||
|
||||
end module psb_rsb_penv_mod
|
@ -0,0 +1,110 @@
|
||||
/* Parallel Sparse BLAS GPU plugin */
|
||||
/* (C) Copyright 2013 */
|
||||
|
||||
/* Salvatore Filippone */
|
||||
/* Alessandro Fanfarillo */
|
||||
|
||||
/* Redistribution and use in source and binary forms, with or without */
|
||||
/* modification, are permitted provided that the following conditions */
|
||||
/* are met: */
|
||||
/* 1. Redistributions of source code must retain the above copyright */
|
||||
/* notice, this list of conditions and the following disclaimer. */
|
||||
/* 2. Redistributions in binary form must reproduce the above copyright */
|
||||
/* notice, this list of conditions, and the following disclaimer in the */
|
||||
/* documentation and/or other materials provided with the distribution. */
|
||||
/* 3. The name of the PSBLAS group or the names of its contributors may */
|
||||
/* not be used to endorse or promote products derived from this */
|
||||
/* software without specific written permission. */
|
||||
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS */
|
||||
/* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED */
|
||||
/* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */
|
||||
/* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS */
|
||||
/* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR */
|
||||
/* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF */
|
||||
/* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS */
|
||||
/* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */
|
||||
/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) */
|
||||
/* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
|
||||
#include <sys/time.h>
|
||||
#if defined(HAVE_RSB)
|
||||
#include "rsb.h"
|
||||
#include "rsb_int.h"
|
||||
|
||||
int rsbInit()
|
||||
{
|
||||
rsb_err_t errval = RSB_ERR_NO_ERROR;
|
||||
|
||||
if((errval = rsb_lib_init(RSB_NULL_INIT_OPTIONS))!=RSB_ERR_NO_ERROR)
|
||||
{
|
||||
printf("Error initializing the library!\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int rsbExit()
|
||||
{
|
||||
rsb_err_t errval = RSB_ERR_NO_ERROR;
|
||||
|
||||
if((errval = rsb_lib_exit(RSB_NULL_INIT_OPTIONS))!=RSB_ERR_NO_ERROR)
|
||||
{
|
||||
printf("Error finalizing the library!\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int Rsb_double_from_coo(void **rsbMat, double *va, int *ia,int *ja,int nnz,int nr,
|
||||
int nc, int br, int bc)
|
||||
{
|
||||
int i=0;
|
||||
rsb_err_t errval = RSB_ERR_NO_ERROR;
|
||||
|
||||
*rsbMat = rsb_mtx_alloc_from_coo_const(va,ia,ja,nnz,RSB_NUMERICAL_TYPE_DOUBLE,nr,nc,br,bc,RSB_FLAG_FORTRAN_INDICES_INTERFACE,&errval);
|
||||
|
||||
if((!*rsbMat) || (errval != RSB_ERR_NO_ERROR))
|
||||
{
|
||||
printf("Error while allocating the matrix!\n");
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
//X is the input and y is the output
|
||||
int Rsb_double_spmv(void *rsbMat, double *x, double alfa, double *y, double beta,char trans)
|
||||
{
|
||||
rsb_err_t errval = RSB_ERR_NO_ERROR;
|
||||
|
||||
if(trans=='N')
|
||||
errval = rsb_spmv(RSB_TRANSPOSITION_N,&alfa,(struct rsb_mtx_t *)rsbMat,x,1,&beta,y,1);
|
||||
else
|
||||
errval = rsb_spmv(RSB_TRANSPOSITION_T,&alfa,(struct rsb_mtx_t *)rsbMat,x,1,&beta,y,1);
|
||||
|
||||
if(errval != RSB_ERR_NO_ERROR)
|
||||
{
|
||||
printf("Error performing a multiplication!\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//Should it return a long instead of integer?
|
||||
int Rsb_getNZeros(void *rsbMat)
|
||||
{
|
||||
int res = 0;
|
||||
rsb_mtx_get_info((struct rsb_mtx_t *)rsbMat,RSB_MIF_MATRIX_NNZ__TO__RSB_NNZ_INDEX_T,(void *)&res);
|
||||
return res;
|
||||
}
|
||||
|
||||
void freeRsbMat(void *rsbMat)
|
||||
{
|
||||
rsb_mtx_free(rsbMat);
|
||||
}
|
||||
|
||||
#endif
|
@ -0,0 +1,2 @@
|
||||
int Rsb_double_from_coo(void **rsbMat,double *va, int *ia,int *ja,int nnz,int nr,
|
||||
int nc, int br, int bc);
|
@ -0,0 +1,235 @@
|
||||
! Parallel Sparse BLAS GPU plugin
|
||||
! (C) Copyright 2013
|
||||
!
|
||||
! Salvatore Filippone
|
||||
! Alessandro Fanfarillo
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
|
||||
|
||||
module rsb_mod
|
||||
use rsb
|
||||
use iso_c_binding
|
||||
|
||||
#ifdef HAVE_RSB
|
||||
|
||||
interface Rsb_from_coo
|
||||
function Rsb_double_from_coo(rsbMat,va,ia,ja,nnz,nr,nc,br,bc) &
|
||||
& result(res) bind(c,name='Rsb_double_from_coo')
|
||||
use iso_c_binding
|
||||
integer(c_int) :: res
|
||||
type(c_ptr) :: rsbMat
|
||||
real(c_double) :: va(*)
|
||||
integer(c_int) :: ia(*),ja(*)
|
||||
integer(c_int),value :: nnz,nr,nc,br,bc
|
||||
end function Rsb_double_from_coo
|
||||
end interface Rsb_from_coo
|
||||
|
||||
interface
|
||||
function Rsb_get_nzeros(rsbMat) &
|
||||
& result(res) bind(c,name='Rsb_getNZeros')
|
||||
use iso_c_binding
|
||||
integer(c_int) :: res
|
||||
type(c_ptr),value :: rsbMat
|
||||
end function Rsb_get_nzeros
|
||||
end interface
|
||||
|
||||
interface Rsb_spmv
|
||||
function Rsb_double_spmv(rsbMat,x,alfa,y,beta,trans) &
|
||||
& result(res) bind(c,name='Rsb_double_spmv')
|
||||
use iso_c_binding
|
||||
integer(c_int) :: res
|
||||
type(c_ptr),value :: rsbMat
|
||||
real(c_double) :: x(*),y(*)
|
||||
real(c_double),value :: alfa,beta
|
||||
character(c_char),value :: trans
|
||||
end function Rsb_double_spmv
|
||||
end interface Rsb_spmv
|
||||
|
||||
interface
|
||||
subroutine freeRsbMat(rsbMat) &
|
||||
& bind(c,name='freeRsbMat')
|
||||
use iso_c_binding
|
||||
type(c_ptr), value :: rsbMat
|
||||
end subroutine freeRsbMat
|
||||
end interface
|
||||
|
||||
! interface writeEllDevice
|
||||
|
||||
! function writeEllDeviceFloat(deviceMat,val,ja,ldj,irn) &
|
||||
! & result(res) bind(c,name='writeEllDeviceFloat')
|
||||
! use iso_c_binding
|
||||
! integer(c_int) :: res
|
||||
! type(c_ptr), value :: deviceMat
|
||||
! integer(c_int), value :: ldj
|
||||
! real(c_float) :: val(ldj,*)
|
||||
! integer(c_int) :: ja(ldj,*),irn(*)
|
||||
! end function writeEllDeviceFloat
|
||||
|
||||
! function writeEllDeviceDouble(deviceMat,val,ja,ldj,irn) &
|
||||
! & result(res) bind(c,name='writeEllDeviceDouble')
|
||||
! use iso_c_binding
|
||||
! integer(c_int) :: res
|
||||
! type(c_ptr), value :: deviceMat
|
||||
! integer(c_int), value :: ldj
|
||||
! real(c_double) :: val(ldj,*)
|
||||
! integer(c_int) :: ja(ldj,*),irn(*)
|
||||
! end function writeEllDeviceDouble
|
||||
|
||||
! function writeEllDeviceFloatComplex(deviceMat,val,ja,ldj,irn) &
|
||||
! & result(res) bind(c,name='writeEllDeviceFloatComplex')
|
||||
! use iso_c_binding
|
||||
! integer(c_int) :: res
|
||||
! type(c_ptr), value :: deviceMat
|
||||
! integer(c_int), value :: ldj
|
||||
! complex(c_float_complex) :: val(ldj,*)
|
||||
! integer(c_int) :: ja(ldj,*),irn(*)
|
||||
! end function writeEllDeviceFloatComplex
|
||||
|
||||
! function writeEllDeviceDoubleComplex(deviceMat,val,ja,ldj,irn) &
|
||||
! & result(res) bind(c,name='writeEllDeviceDoubleComplex')
|
||||
! use iso_c_binding
|
||||
! integer(c_int) :: res
|
||||
! type(c_ptr), value :: deviceMat
|
||||
! integer(c_int), value :: ldj
|
||||
! complex(c_double_complex) :: val(ldj,*)
|
||||
! integer(c_int) :: ja(ldj,*),irn(*)
|
||||
! end function writeEllDeviceDoubleComplex
|
||||
|
||||
! end interface writeEllDevice
|
||||
|
||||
! interface readEllDevice
|
||||
|
||||
! function readEllDeviceFloat(deviceMat,val,ja,ldj,irn) &
|
||||
! & result(res) bind(c,name='readEllDeviceFloat')
|
||||
! use iso_c_binding
|
||||
! integer(c_int) :: res
|
||||
! type(c_ptr), value :: deviceMat
|
||||
! integer(c_int), value :: ldj
|
||||
! real(c_float) :: val(ldj,*)
|
||||
! integer(c_int) :: ja(ldj,*),irn(*)
|
||||
! end function readEllDeviceFloat
|
||||
|
||||
! function readEllDeviceDouble(deviceMat,val,ja,ldj,irn) &
|
||||
! & result(res) bind(c,name='readEllDeviceDouble')
|
||||
! use iso_c_binding
|
||||
! integer(c_int) :: res
|
||||
! type(c_ptr), value :: deviceMat
|
||||
! integer(c_int), value :: ldj
|
||||
! real(c_double) :: val(ldj,*)
|
||||
! integer(c_int) :: ja(ldj,*),irn(*)
|
||||
! end function readEllDeviceDouble
|
||||
|
||||
! function readEllDeviceFloatComplex(deviceMat,val,ja,ldj,irn) &
|
||||
! & result(res) bind(c,name='readEllDeviceFloatComplex')
|
||||
! use iso_c_binding
|
||||
! integer(c_int) :: res
|
||||
! type(c_ptr), value :: deviceMat
|
||||
! integer(c_int), value :: ldj
|
||||
! complex(c_float_complex) :: val(ldj,*)
|
||||
! integer(c_int) :: ja(ldj,*),irn(*)
|
||||
! end function readEllDeviceFloatComplex
|
||||
|
||||
! function readEllDeviceDoubleComplex(deviceMat,val,ja,ldj,irn) &
|
||||
! & result(res) bind(c,name='readEllDeviceDoubleComplex')
|
||||
! use iso_c_binding
|
||||
! integer(c_int) :: res
|
||||
! type(c_ptr), value :: deviceMat
|
||||
! integer(c_int), value :: ldj
|
||||
! complex(c_double_complex) :: val(ldj,*)
|
||||
! integer(c_int) :: ja(ldj,*),irn(*)
|
||||
! end function readEllDeviceDoubleComplex
|
||||
|
||||
! end interface readEllDevice
|
||||
|
||||
! interface
|
||||
! subroutine resetEllTimer() bind(c,name='resetEllTimer')
|
||||
! use iso_c_binding
|
||||
! end subroutine resetEllTimer
|
||||
! end interface
|
||||
! interface
|
||||
! function getEllTimer() &
|
||||
! & bind(c,name='getEllTimer') result(res)
|
||||
! use iso_c_binding
|
||||
! real(c_double) :: res
|
||||
! end function getEllTimer
|
||||
! end interface
|
||||
|
||||
|
||||
! interface
|
||||
! function getEllDevicePitch(deviceMat) &
|
||||
! & bind(c,name='getEllDevicePitch') result(res)
|
||||
! use iso_c_binding
|
||||
! type(c_ptr), value :: deviceMat
|
||||
! integer(c_int) :: res
|
||||
! end function getEllDevicePitch
|
||||
! end interface
|
||||
|
||||
! interface
|
||||
! function getEllDeviceMaxRowSize(deviceMat) &
|
||||
! & bind(c,name='getEllDeviceMaxRowSize') result(res)
|
||||
! use iso_c_binding
|
||||
! type(c_ptr), value :: deviceMat
|
||||
! integer(c_int) :: res
|
||||
! end function getEllDeviceMaxRowSize
|
||||
! end interface
|
||||
|
||||
|
||||
! interface spmvEllDevice
|
||||
! function spmvEllDeviceFloat(deviceMat,alpha,x,beta,y) &
|
||||
! & result(res) bind(c,name='spmvEllDeviceFloat')
|
||||
! use iso_c_binding
|
||||
! integer(c_int) :: res
|
||||
! type(c_ptr), value :: deviceMat, x, y
|
||||
! real(c_float),value :: alpha, beta
|
||||
! end function spmvEllDeviceFloat
|
||||
! function spmvEllDeviceDouble(deviceMat,alpha,x,beta,y) &
|
||||
! & result(res) bind(c,name='spmvEllDeviceDouble')
|
||||
! use iso_c_binding
|
||||
! integer(c_int) :: res
|
||||
! type(c_ptr), value :: deviceMat, x, y
|
||||
! real(c_double),value :: alpha, beta
|
||||
! end function spmvEllDeviceDouble
|
||||
! function spmvEllDeviceFloatComplex(deviceMat,alpha,x,beta,y) &
|
||||
! & result(res) bind(c,name='spmvEllDeviceFloatComplex')
|
||||
! use iso_c_binding
|
||||
! integer(c_int) :: res
|
||||
! type(c_ptr), value :: deviceMat, x, y
|
||||
! complex(c_float_complex),value :: alpha, beta
|
||||
! end function spmvEllDeviceFloatComplex
|
||||
! function spmvEllDeviceDoubleComplex(deviceMat,alpha,x,beta,y) &
|
||||
! & result(res) bind(c,name='spmvEllDeviceDoubleComplex')
|
||||
! use iso_c_binding
|
||||
! integer(c_int) :: res
|
||||
! type(c_ptr), value :: deviceMat, x, y
|
||||
! complex(c_double_complex),value :: alpha, beta
|
||||
! end function spmvEllDeviceDoubleComplex
|
||||
! end interface spmvEllDevice
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
end module rsb_mod
|