diff --git a/cbind/test/pargen/Makefile b/cbind/test/pargen/Makefile
index 2e74497f..e1da6892 100644
--- a/cbind/test/pargen/Makefile
+++ b/cbind/test/pargen/Makefile
@@ -9,7 +9,7 @@ FINCLUDES=$(FMFLAG). $(FMFLAG)$(HERE) $(FMFLAG)$(MODDIR)
CINCLUDES=-I. -I$(HERE) -I$(INCLUDEDIR)
PSBC_LIBS= -L$(LIBDIR) -lpsb_cbind
-PSB_LIBS=-lpsb_util -lpsb_krylov -lpsb_prec -lpsb_base -L$(LIBDIR)
+PSB_LIBS=-lpsb_util -lpsb_linsolve -lpsb_prec -lpsb_base -L$(LIBDIR)
#
# Compilers and such
diff --git a/docs/html/index.html b/docs/html/index.html
index 083bd90a..2ccf61d9 100644
--- a/docs/html/index.html
+++ b/docs/html/index.html
@@ -54,11 +54,11 @@ href="userhtmlse10.html#x15-13500010" id="QQ2-15-165">Preconditioner routines 11 Iterative Methods
12 Extensions
+href="userhtmlse12.html#x19-14500012" id="QQ2-19-175">Extensions
13 CUDA Environment Routines
+href="userhtmlse13.html#x20-15400013" id="QQ2-20-190">CUDA Environment Routines
References
+href="userhtmlli2.html#x21-169000" id="QQ2-21-219">References
diff --git a/docs/html/userhtml.html b/docs/html/userhtml.html
index 083bd90a..2ccf61d9 100644
--- a/docs/html/userhtml.html
+++ b/docs/html/userhtml.html
@@ -54,11 +54,11 @@ href="userhtmlse10.html#x15-13500010" id="QQ2-15-165">Preconditioner routines 11 Iterative Methods
12 Extensions
+href="userhtmlse12.html#x19-14500012" id="QQ2-19-175">Extensions
13 CUDA Environment Routines
+href="userhtmlse13.html#x20-15400013" id="QQ2-20-190">CUDA Environment Routines
References
+href="userhtmlli2.html#x21-169000" id="QQ2-21-219">References
diff --git a/docs/html/userhtmlli1.html b/docs/html/userhtmlli1.html
index e767ddcf..40ffe257 100644
--- a/docs/html/userhtmlli1.html
+++ b/docs/html/userhtmlli1.html
@@ -310,46 +310,48 @@ href="userhtmlse10.html#x15-14100010.6" id="QQ2-15-171">free — Free a prec
href="userhtmlse11.html#x17-14200011">Iterative Methods
11.1 psb_krylov — Krylov Methods Driver Routine
+
11.2 psb_richardson — Richardson Iteration Driver Routine
12 Extensions
+href="userhtmlse12.html#x19-14500012">Extensions
12.1 Using the extensions
+href="userhtmlse12.html#x19-14600012.1" id="QQ2-19-176">Using the extensions
12.2 Extensions’ Data Structures
+href="userhtmlse12.html#x19-14700012.2" id="QQ2-19-177">Extensions’ Data Structures
12.3 CPU-class extensions
+href="userhtmlse12.html#x19-14800012.3" id="QQ2-19-180">CPU-class extensions
12.4 CUDA-class extensions
+href="userhtmlse12.html#x19-15300012.4" id="QQ2-19-189">CUDA-class extensions
13 CUDA Environment Routines
+href="userhtmlse13.html#x20-15400013">CUDA Environment Routines
psb_cuda_init
+href="userhtmlse13.html#Q1-20-192">psb_cuda_init
psb_cuda_exit
+href="userhtmlse13.html#Q1-20-194">psb_cuda_exit
psb_cuda_DeviceSync
+href="userhtmlse13.html#Q1-20-196">psb_cuda_DeviceSync
psb_cuda_getDeviceCount
+href="userhtmlse13.html#Q1-20-198">psb_cuda_getDeviceCount
psb_cuda_getDevice
+href="userhtmlse13.html#Q1-20-200">psb_cuda_getDevice
psb_cuda_setDevice
+href="userhtmlse13.html#Q1-20-202">psb_cuda_setDevice
psb_cuda_DeviceHasUVA
+href="userhtmlse13.html#Q1-20-204">psb_cuda_DeviceHasUVA
psb_cuda_WarpSize
+href="userhtmlse13.html#Q1-20-206">psb_cuda_WarpSize
psb_cuda_MultiProcessors
+href="userhtmlse13.html#Q1-20-208">psb_cuda_MultiProcessors
psb_cuda_MaxThreadsPerMP
+href="userhtmlse13.html#Q1-20-210">psb_cuda_MaxThreadsPerMP
psb_cuda_MaxRegisterPerBlock
+href="userhtmlse13.html#Q1-20-212">psb_cuda_MaxRegisterPerBlock
psb_cuda_MemoryClockRate
+href="userhtmlse13.html#Q1-20-214">psb_cuda_MemoryClockRate
psb_cuda_MemoryBusWidth
+href="userhtmlse13.html#Q1-20-216">psb_cuda_MemoryBusWidth
psb_cuda_MemoryPeakBandwidth
+href="userhtmlse13.html#Q1-20-218">psb_cuda_MemoryPeakBandwidth
diff --git a/docs/html/userhtmlli2.html b/docs/html/userhtmlli2.html
index e486077a..f31a33cf 100644
--- a/docs/html/userhtmlli2.html
+++ b/docs/html/userhtmlli2.html
@@ -16,7 +16,7 @@ href="userhtmlse13.html#tailuserhtmlse13.html" >prev-tail] [tail] [up]
diff --git a/docs/html/userhtmlse11.html b/docs/html/userhtmlse11.html index 41a4cd65..c99d4e40 100644 --- a/docs/html/userhtmlse11.html +++ b/docs/html/userhtmlse11.html @@ -17,10 +17,9 @@ href="userhtmlse8.html#tailuserhtmlse11.html">tail] [up]
In this chapter we provide routines for preconditioners and iterative methods. -The interfaces for Krylov subspace methods are available in the module -psb_krylov_mod. +
In this chapter we provide routines for preconditioners and iterative methods. The +interfaces for iterative methods are available in the module psb_linsolve_mod. @@ -456,6 +455,344 @@ class="newline" />An integer value; 0 means no error has been detected.11.2 psb_richardson — Richardson Iteration Driver Routine +
This subroutine is a driver implementing a Richardson iteration $x_{k+1} = M^{-1}(b - A x_k) + x_k$ +
with the preconditioner operator M defined in the previous section. +
The stopping criterion can take the following values: +
+1
Normwise backward error in the infinity norm; the iteration is stopped + when $err = \frac{\|r_i\|}{\|A\|\,\|x_i\| + \|b\|} < eps$ +
+
+2
Relative residual in the 2-norm; the iteration is stopped when $err = \frac{\|r_i\|}{\|b\|_2} < eps$ +
+
+3
Relative residual reduction in the 2-norm; the iteration is stopped when $err = \frac{\|r_i\|}{\|r_0\|_2} < eps$ +
The behaviour is controlled by the istop argument (see later). In the above formulae, xi +is the tentative solution and ri = b - Axi the corresponding residual at the i-th +iteration. + +
call psb_richardson(a,prec,b,x,eps,desc_a,info,& + & itmax,iter,err,itrace,istop)+ +
+
+Type:
Synchronous. +
+On Entry
+
+a
the local portion of global sparse matrix A.
Scope: local
Type: required
Intent: in.
Specified as: a structured data of type psb_Tspmat_type.
+
+prec
The data structure containing the preconditioner.
Scope: local
Type: required
Intent: in.
Specified as: a structured data of type psb_prec_type.
+
+b
The RHS vector.
Scope: local
Type: required
Intent: in.
Specified as: a rank one array or an object of type psb_T_vect_type.
+
+x
The initial guess.
Scope: local
Type: required
Intent: inout.
Specified as: a rank one array or an object of type psb_T_vect_type.
+
+eps
The stopping tolerance.
Scope: global
Type: required
Intent: in.
Specified as: a real number.
+
+desc_a
contains data structures for communications.
Scope: local
Type: required
Intent: in.
Specified as: a structured data of type psb_desc_type.
+
+itmax
The maximum number of iterations to perform.
Scope: global
Type: optional
Intent: in.
Default: itmax = 1000.
Specified as: an integer variable itmax ≥ 1.
+
+itrace
If > 0 print out an informational message about convergence every itrace
+ iterations. If = 0 print a message in case of convergence failure.
Scope: global
Type: optional
Intent: in.
Default: itrace = -1.
+
+istop
An integer specifying the stopping criterion.
Scope: global
Type: optional.
Intent: in.
Values: 1: use the normwise backward error, 2: use the scaled 2-norm of
+ the residual, 3: use the residual reduction in the 2-norm. Default: 2.
+
+On Return
+
+x
The computed solution.
Scope: local
Type: required
Intent: inout.
Specified as: a rank one array or an object of type psb_T_vect_type.
+
+iter
The number of iterations performed.
Scope: global
Type: optional
Intent: out.
Returned as: an integer variable.
+
+err
The convergence estimate on exit.
Scope: global
Type: optional
Intent: out.
Returned as: a real number.
+
+info
Error code.
Scope: local
Type: required
Intent: out.
An integer value; 0 means no error has been detected.
The EXT, CUDA and RSB subdirectories contain a set of extensions to the base library. The extensions provide additional storage formats beyond the ones already contained in the base library, as well as interfaces to: @@ -49,7 +49,7 @@ in [22].
A sample application using the PSBLAS extensions will contain the following steps:
Access to the facilities provided by the EXT library is mainly achieved through the data types that are provided within. The data classes are derived from the base classes in PSBLAS, through the Fortran 2003 mechanism of type extension [17].
The data classes are divided between the general purpose CPU extensions, the GPU interfaces and the RSB interfaces. In the description we will make use of the notation introduced in Table 21. +href="#x19-147001r21">21.
@@ -274,7 +274,7 @@ class="td11">
+ id="x19-147002r5">
@@ -283,18 +283,18 @@ src="mat.png" alt="PIC"
width="147" height="147" >
The ELLPACK/ITPACK format (shown in Figure 6) comprises two 2-dimensional
+href="#x19-149001r6">6) comprises two 2-dimensional
arrays AS and JA with
+ id="x19-149001r6">
@@ -325,13 +325,13 @@ width="233" height="233" >
+href="#x19-147002r5">5
i=1,n t=0 j=1,maxnzr ,j)) end do ) = t end do+ id="x19-149010r1"> + id="x19-149011"> Algorithm 1: Matrix-Vector product in ELL format @@ -450,7 +450,7 @@ class="cmbx-10"> 1: Matrix-Vector product in ELL format class="cmmi-10">y = Ax can be computed with the code shown in Alg. 1; it costs one memory write per outer iteration, plus three memory reads and +href="#x19-149010r1">1; it costs one memory write per outer iteration, plus three memory reads and two floating-point operations per inner iteration.
Unless all rows have exactly the same number of nonzeros, some of the coefficients
in the
The maximum number of nonzeros per row is not much larger than the
average;
The regularity of the data structure allows for faster code, e.g. by allowing
vectorization, thereby offsetting the additional storage requirements. In the extreme case where the input matrix has one full row, the ELLPACK
@@ -492,7 +492,7 @@ class="cmtt-10">psb_T_ell_sparse_mat
The hacked ELLPACK (HLL) format alleviates the main problem of the ELLPACK
@@ -558,7 +558,7 @@ format.
+ id="x19-150001r7">
@@ -568,7 +568,7 @@ width="248" height="248" >
The DIAgonal (DIA) format (shown in Figure 8) has a 2-dimensional array 8) has a 2-dimensional array AS
containing in each column the coefficients along a diagonal of the matrix, and an
integer array The code to compute the matrix-vector product y = Ax is shown in Alg. 2; it
+href="#x19-151003r2">2; it
costs one memory read per outer iteration, plus three memory reads, one memory
write and two floating-point operations per inner iteration. The accesses to
+ id="x19-151001r8">
@@ -630,13 +630,13 @@ width="248" height="248" >
+ id="x19-151003r2">
+ id="x19-151004">
Algorithm 2: Matrix-Vector product in DIA format
@@ -691,7 +691,7 @@ class="cmtt-10">psb_T_dia_sparse_mat
Storage by DIAgonals is an attractive option for matrices whose coefficients are
located on a small set of diagonals, since they do away with storing explicitly the
indices and therefore reduce significantly memory traffic. However, having a few
@@ -738,7 +738,7 @@ class="cmti-10">hackOffsets[k]
+href="#x19-147002r5">5
+ id="x19-151002r2">
@@ -662,9 +662,9 @@ href="#x19-146002r5">5Hacked DIA
+ id="x19-152000">Hacked DIA
+href="#x19-147002r5">5
@@ -793,7 +793,7 @@ class="cmtt-10">psb_T_hdia_sparse_mat:
For computing with CUDA we define a dual memorization strategy in which each variable on the CPU (“host”) side has a GPU (“device”) side. When a GPU-type variable is initialized, the data contained is (usually) the same on both sides. Each diff --git a/docs/html/userhtmlse13.html b/docs/html/userhtmlse13.html index fa8fed8f..90d399e0 100644 --- a/docs/html/userhtmlse13.html +++ b/docs/html/userhtmlse13.html @@ -16,12 +16,12 @@ href="userhtmlse12.html#tailuserhtmlse12.html" >prev-tail] [tail] [up]
@@ -64,13 +64,13 @@ class="cmbx-12">Notes
A call to this routine must precede any other PSBLAS-CUDA call.
@@ -106,9 +106,9 @@ class="cmbx-10">in.
Specified as: an integer variable.
@@ -136,9 +136,9 @@ ngpus = psb_cuda_getDeviceCount()
Get number of devices available on current computing node.
@@ -149,9 +149,9 @@ ngpus = psb_cuda_getDevice()
Get device in use by current process.
Set device to be used by current process.
@@ -178,9 +178,9 @@ hasUva = psb_cuda_DeviceHasUVA()
Returns true if device currently in use supports UVA (Unified Virtual Addressing).
@@ -191,9 +191,9 @@ nw = psb_cuda_WarpSize()
Returns the warp size.
Returns the number of multiprocessors in the CUDA device.
@@ -220,9 +220,9 @@ nt = psb_cuda_MaxThreadsPerMP()
Returns the maximum number of threads per multiprocessor.
@@ -233,9 +233,9 @@ nr = psb_cuda_MaxRegistersPerBlock()
Returns the maximum number of registers per thread block.
Returns the memory clock rate in KHz, as an integer.
@@ -262,9 +262,9 @@ nb = psb_cuda_MemoryBusWidth()
Returns the memory bus width in bits.
diff --git a/docs/psblas-3.9.pdf b/docs/psblas-3.9.pdf index 6d104b9d..fe0bb60b 100644 --- a/docs/psblas-3.9.pdf +++ b/docs/psblas-3.9.pdf @@ -447,7 +447,7 @@ endobj /Type /ObjStm /N 100 /First 928 -/Length 16303 +/Length 16554 >> stream 403 0 407 48 408 376 411 424 412 766 415 814 416 1030 419 1078 420 1241 423 1283 @@ -458,8 +458,8 @@ stream 504 7435 507 7482 508 7853 511 7900 512 8366 515 8408 516 8489 519 8536 520 8990 523 9037 524 9483 527 9530 528 9986 531 10033 532 10489 535 10536 536 10992 539 11039 540 11505 543 11548 544 11707 547 11755 548 11991 551 12039 552 12260 555 12308 556 12584 559 12632 560 12977 563 13025 -564 13271 567 13319 568 13525 571 13568 572 13697 575 13745 576 14035 579 14078 580 14169 583 14217 -584 14374 587 14422 588 14614 591 14662 592 14816 595 14864 596 15023 599 15066 600 15238 603 15281 +564 13271 567 13319 568 13525 571 13568 572 13697 575 13745 576 14035 579 14083 580 14423 583 14466 +584 14557 587 14605 588 14762 591 14810 592 15002 595 15050 596 15204 599 15252 600 15411 603 15454 % 403 0 obj << /S /GoTo /D (subsection.6.24) >> % 407 0 obj @@ -633,37 +633,37 @@ stream % 575 0 obj (\376\377\0001\0001\000.\0001\000\040\000p\000s\000b\000\137\000k\000r\000y\000l\000o\000v\000\040\000\040\040\024\000\040\000K\000r\000y\000l\000o\000v\000\040\000M\000e\000t\000h\000o\000d\000s\000\040\000D\000r\000i\000v\000e\000r\000\040\000R\000o\000u\000t\000i\000n\000e) % 576 0 obj -<< /S /GoTo /D (section.12) >> +<< /S /GoTo /D (subsection.11.2) >> % 579 0 obj -(\376\377\0001\0002\000\040\000E\000x\000t\000e\000n\000s\000i\000o\000n\000s) +(\376\377\0001\0001\000.\0002\000\040\000p\000s\000b\000\137\000r\000i\000c\000h\000a\000r\000d\000s\000o\000n\000\040\000\040\040\024\000\040\000R\000i\000c\000h\000a\000r\000d\000s\000o\000n\000\040\000I\000t\000e\000r\000a\000t\000i\000o\000n\000\040\000D\000r\000i\000v\000e\000r\000\040\000R\000o\000u\000t\000i\000n\000e) % 580 0 obj -<< /S /GoTo /D (subsection.12.1) >> +<< /S /GoTo /D (section.12) >> 
% 583 0 obj -(\376\377\0001\0002\000.\0001\000\040\000U\000s\000i\000n\000g\000\040\000t\000h\000e\000\040\000e\000x\000t\000e\000n\000s\000i\000o\000n\000s) +(\376\377\0001\0002\000\040\000E\000x\000t\000e\000n\000s\000i\000o\000n\000s) % 584 0 obj -<< /S /GoTo /D (subsection.12.2) >> +<< /S /GoTo /D (subsection.12.1) >> % 587 0 obj -(\376\377\0001\0002\000.\0002\000\040\000E\000x\000t\000e\000n\000s\000i\000o\000n\000s\000'\000\040\000D\000a\000t\000a\000\040\000S\000t\000r\000u\000c\000t\000u\000r\000e\000s) +(\376\377\0001\0002\000.\0001\000\040\000U\000s\000i\000n\000g\000\040\000t\000h\000e\000\040\000e\000x\000t\000e\000n\000s\000i\000o\000n\000s) % 588 0 obj -<< /S /GoTo /D (subsection.12.3) >> +<< /S /GoTo /D (subsection.12.2) >> % 591 0 obj -(\376\377\0001\0002\000.\0003\000\040\000C\000P\000U\000-\000c\000l\000a\000s\000s\000\040\000e\000x\000t\000e\000n\000s\000i\000o\000n\000s) +(\376\377\0001\0002\000.\0002\000\040\000E\000x\000t\000e\000n\000s\000i\000o\000n\000s\000'\000\040\000D\000a\000t\000a\000\040\000S\000t\000r\000u\000c\000t\000u\000r\000e\000s) % 592 0 obj -<< /S /GoTo /D (subsection.12.4) >> +<< /S /GoTo /D (subsection.12.3) >> % 595 0 obj -(\376\377\0001\0002\000.\0004\000\040\000C\000U\000D\000A\000-\000c\000l\000a\000s\000s\000\040\000e\000x\000t\000e\000n\000s\000i\000o\000n\000s) +(\376\377\0001\0002\000.\0003\000\040\000C\000P\000U\000-\000c\000l\000a\000s\000s\000\040\000e\000x\000t\000e\000n\000s\000i\000o\000n\000s) % 596 0 obj -<< /S /GoTo /D (section.13) >> +<< /S /GoTo /D (subsection.12.4) >> % 599 0 obj -(\376\377\0001\0003\000\040\000C\000U\000D\000A\000\040\000E\000n\000v\000i\000r\000o\000n\000m\000e\000n\000t\000\040\000R\000o\000u\000t\000i\000n\000e\000s) +(\376\377\0001\0002\000.\0004\000\040\000C\000U\000D\000A\000-\000c\000l\000a\000s\000s\000\040\000e\000x\000t\000e\000n\000s\000i\000o\000n\000s) % 600 0 obj -<< /S /GoTo /D (section*.6) >> +<< /S /GoTo /D (section.13) >> % 603 0 obj 
-(\376\377\000p\000s\000b\000\137\000c\000u\000d\000a\000\137\000i\000n\000i\000t) +(\376\377\0001\0003\000\040\000C\000U\000D\000A\000\040\000E\000n\000v\000i\000r\000o\000n\000m\000e\000n\000t\000\040\000R\000o\000u\000t\000i\000n\000e\000s) endstream endobj -662 0 obj +666 0 obj << /Length 729 >> @@ -708,7 +708,7 @@ ET endstream endobj -659 0 obj +663 0 obj << /Type /XObject /Subtype /Image @@ -716,14 +716,14 @@ endobj /Height 480 /BitsPerComponent 8 /ColorSpace /DeviceRGB -/SMask 669 0 R +/SMask 673 0 R /Length 921600 >> stream ½ ѳ ȳ 賳 Գ ˳ ³ ⳳ ٳ ϳ ų 糳 ܳ ӳ ɳ 볳 ೳ ׳ ͳ ﳳ ų 䳳 ڳ ѳ ȳ 賳 ߳ ճ ˳ ³ ⳳ ٳ ϳ ų 糳 ܳ ӳ ɳ 볳 ᳳ ׳ ͳ ﳳ ų 䳳¿¿¿ 棟͙ͣͣͣͣȔŔŔŔŔŒą~xvpvpvpvpvpvphbf`f`f`f`f`b[WPWPWPWPWPWPLDH@H@H@H@H@F>9090909090900') ) ) ) ) ) ೳ7.% le ڳ' vpB9 賳 ɳZS) vq ֳA8 ME 䳳,# Ƴ|/&