From 8e02a99a1153701873faaead3123a1a578c34969 Mon Sep 17 00:00:00 2001 From: Stack-1 Date: Fri, 12 Jun 2026 19:50:40 +0200 Subject: [PATCH] [ADD] Stock preconditioners, configurable block format and full base-class contract for the nested matrix Complete the integration of the nested (MATNEST) operator into the standard PSBLAS infrastructure: - Preconditioners: implement get_diag and csgetrow on psb_d_nest_base_mat so the stock one-level preconditioners build directly on the nested operator (DIAG through the concatenated block diagonals, BJAC through the format-agnostic csget path used by the ILU factorizations). - Configurable block storage: psb_d_nest_rect_block and psb_d_nest_matrix%asb accept an optional type ('CSR' default, 'CSC', 'COO') or mold (any class extending psb_d_base_sparse_mat, e.g. the psb_ext ELL/HLL formats); the operator is format-agnostic since every operation delegates to the blocks. - Device-capable matvec: override vect_mv to gather/scatter through the vectors' own gth/sct with encapsulated index vectors (device kernels on device vectors) and to run each block through its vect_mv, so device block formats execute their native kernels; bit-equivalent to csmv on host. - Full psb_d_base_sparse_mat contract by delegation to the blocks: transposed csmv (dedicated kernel, ghost contributions left to the transposed halo exchange), multi-RHS csmm, cp_to_coo/mv_to_coo (unlocking cscnv, csclip, tril/triu through the base generics), rowsum/arwsum/colsum/aclsum, maxval/spnmi/spnm1, scal (left/right) and scals, clone (view semantics: shared blocks, re-owned index maps), mold, sizeof. cp_from_coo/mv_from_coo, csput and cssv/cssm are intentionally left to the base error (meaningless for a block-operator view), documented in the type and in the README. Tests: glob assembles the blocks in HLL (psb_ext) and rect in CSC, both still bit-identical to the monolithic CSR oracle; the CG test solves under NONE, DIAG and BJAC/ILU(0), requiring convergence to the exact solution for all of them and DIAG bit-identical to NONE (exactness check of the nested get_diag). README updated with the user API reference, the preconditioner section and the implemented-contract section. Author: Simone Staccone (Stack-1) --- base/modules/Makefile | 3 +- .../serial/psb_d_nest_base_mat_mod.F90 | 798 +++++++++++++++++- base/modules/tools/psb_d_nest_builder_mod.F90 | 13 +- base/modules/tools/psb_d_nest_tools_mod.F90 | 14 +- test/nested/CMakeLists.txt | 8 +- test/nested/Makefile | 2 +- test/nested/README.md | 135 ++- test/nested/psb_d_nest_cg_test.F90 | 89 +- test/nested/psb_d_nest_glob_test.F90 | 9 +- test/nested/psb_d_nest_rect_test.F90 | 4 +- 10 files changed, 1009 insertions(+), 66 deletions(-) diff --git a/base/modules/Makefile b/base/modules/Makefile index 8040e6a4b..27ded2451 100644 --- a/base/modules/Makefile +++ b/base/modules/Makefile @@ -461,7 +461,8 @@ psblas/psb_s_psblas_mod.o psblas/psb_c_psblas_mod.o psblas/psb_d_psblas_mod.o ps # --- nested mat/desc dependencies (MATNEST) --- desc/psb_desc_nest_mod.o: desc/psb_desc_mod.o serial/psb_d_nest_mat_mod.o: serial/psb_d_mat_mod.o -serial/psb_d_nest_base_mat_mod.o: serial/psb_d_nest_mat_mod.o desc/psb_desc_nest_mod.o serial/psb_d_base_mat_mod.o serial/psb_d_mat_mod.o desc/psb_desc_mod.o +serial/psb_d_nest_base_mat_mod.o: serial/psb_d_nest_mat_mod.o desc/psb_desc_nest_mod.o serial/psb_d_base_mat_mod.o serial/psb_d_mat_mod.o desc/psb_desc_mod.o \ + serial/psb_i_vect_mod.o serial/psb_d_base_vect_mod.o psb_d_nest_mod.o: \ desc/psb_desc_nest_mod.o \ serial/psb_d_nest_mat_mod.o \ diff --git a/base/modules/serial/psb_d_nest_base_mat_mod.F90 b/base/modules/serial/psb_d_nest_base_mat_mod.F90 index af135dcc2..5b48288b5 100644 --- a/base/modules/serial/psb_d_nest_base_mat_mod.F90 +++ b/base/modules/serial/psb_d_nest_base_mat_mod.F90 @@ -52,7 +52,10 @@ module psb_d_nest_base_mat_mod use psb_const_mod use psb_error_mod + use psb_realloc_mod, only : psb_ensure_size use psb_d_base_mat_mod, only : psb_d_base_sparse_mat + use psb_d_base_vect_mod, only : psb_d_base_vect_type + use psb_i_vect_mod, only : psb_i_vect_type use psb_desc_mod, only : psb_desc_type use psb_desc_nest_mod, only : psb_desc_nest_type use psb_d_nest_mat_mod, only : psb_d_nest_sparse_mat @@ -65,6 +68,10 @@ module psb_d_nest_base_mat_mod type :: psb_d_nest_field_map integer(psb_ipk_) :: n_owned = 0 integer(psb_ipk_), allocatable :: global_local_pos(:) + ! same positions as an encapsulated index vector, for the device-capable + ! gather/scatter (gth/sct) used by vect_mv; pointer so that its target can + ! be synced even when the operator dummy argument is intent(in) + type(psb_i_vect_type), pointer :: gather_pos => null() end type psb_d_nest_field_map type, extends(psb_d_base_sparse_mat) :: psb_d_nest_base_mat @@ -77,6 +84,36 @@ module psb_d_nest_base_mat_mod procedure, pass(a) :: get_nzeros => psb_d_nest_base_get_nzeros procedure, nopass :: get_fmt => psb_d_nest_base_get_fmt procedure, pass(a) :: free => psb_d_nest_base_free + ! enable the stock PSBLAS preconditioners on the nested operator: + ! get_diag is used by DIAG/JACOBI, csgetrow by BJAC (ILU factorizations + ! go through the format-agnostic csget path) + procedure, pass(a) :: get_diag => psb_d_nest_base_get_diag + procedure, pass(a) :: csgetrow => psb_d_nest_base_csgetrow + ! device-capable matvec on encapsulated vectors: gathers/scatters through + ! the vectors' own gth/sct and runs each block through its vect_mv, so + ! device block formats execute their device kernels + procedure, pass(a) :: vect_mv => psb_d_nest_base_vect_mv + ! full base-class contract (delegating to the blocks): + procedure, pass(a) :: csmm => psb_d_nest_base_csmm + procedure, pass(a) :: cp_to_coo => psb_d_nest_base_cp_to_coo + procedure, pass(a) :: mv_to_coo => psb_d_nest_base_mv_to_coo + procedure, pass(a) :: rowsum => psb_d_nest_base_rowsum + procedure, pass(a) :: arwsum => psb_d_nest_base_arwsum + procedure, pass(a) :: colsum => psb_d_nest_base_colsum + procedure, pass(a) :: aclsum => psb_d_nest_base_aclsum + procedure, pass(a) :: maxval => psb_d_nest_base_maxval + procedure, pass(a) :: spnmi => psb_d_nest_base_csnmi + procedure, pass(a) :: spnm1 => psb_d_nest_base_csnm1 + procedure, pass(a) :: scals => psb_d_nest_base_scals + procedure, pass(a) :: scalv => psb_d_nest_base_scal + procedure, pass(a) :: clone => psb_d_nest_base_clone + procedure, pass(a) :: mold => psb_d_nest_base_mold + procedure, pass(a) :: sizeof => psb_d_nest_base_sizeof + ! NOT implemented on purpose (base error 700 is the intended behaviour): + ! cp_from_coo / mv_from_coo (a nested operator cannot be built from a flat + ! matrix without the field structure), csput (insertions go to the blocks + ! before assembly), cssv/cssm (triangular solve is undefined for a block + ! operator) end type psb_d_nest_base_mat private @@ -97,9 +134,19 @@ contains ! pointers into the caller), so we only detach them and release the field maps. subroutine psb_d_nest_base_free(a) class(psb_d_nest_base_mat), intent(inout) :: a + integer(psb_ipk_) :: i_field, local_info + if (allocated(a%field_map)) then + do i_field = 1, size(a%field_map) + if (associated(a%field_map(i_field)%gather_pos)) then + call a%field_map(i_field)%gather_pos%free(local_info) + deallocate(a%field_map(i_field)%gather_pos) + a%field_map(i_field)%gather_pos => null() + end if + end do + deallocate(a%field_map) + end if a%block_storage => null() a%grid_desc => null() - if (allocated(a%field_map)) deallocate(a%field_map) a%n_fields = 0 call a%set_null() end subroutine psb_d_nest_base_free @@ -120,6 +167,271 @@ contains end if end function psb_d_nest_base_get_nzeros + ! get_diag: diagonal of the global operator. In the global-local layout the + ! owned entries of field i occupy positions owned_offset+1..owned_offset+n_owned, + ! and for owned indices the field-local column k maps to the same global-local + ! position as row k, so the global diagonal is the concatenation of the + ! diagonals of the diagonal blocks (i,i); absent blocks contribute zeros + ! (e.g. the (2,2) block of a saddle-point operator). + subroutine psb_d_nest_base_get_diag(a, d, info) + class(psb_d_nest_base_mat), intent(in) :: a + real(psb_dpk_), intent(out) :: d(:) + integer(psb_ipk_), intent(out) :: info + + real(psb_dpk_), allocatable :: block_diag(:) + integer(psb_ipk_) :: i_field, n_owned, owned_offset + character(len=24) :: name + + info = psb_success_ + name = 'psb_d_nest_get_diag' + + if (.not. (associated(a%block_storage) .and. allocated(a%field_map))) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info, name, a_err='nested operator not set up') + return + end if + if (size(d) < a%get_nrows()) then + info = psb_err_invalid_input_ + call psb_errpush(info, name, a_err='d too small') + return + end if + + d(1:a%get_nrows()) = dzero + owned_offset = 0 + do i_field = 1, a%n_fields + n_owned = a%field_map(i_field)%n_owned + if (a%block_storage%has_block(i_field, i_field)) then + allocate(block_diag(n_owned), stat=info) + if (info /= 0) then + info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return + end if + call a%block_storage%mats(i_field,i_field)%a%get_diag(block_diag, info) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_, name, a_err='block get_diag') + return + end if + d(owned_offset+1 : owned_offset+n_owned) = block_diag(1:n_owned) + deallocate(block_diag) + end if + owned_offset = owned_offset + n_owned + end do + end subroutine psb_d_nest_base_get_diag + + ! csgetrow: extract local rows imin..imax of the global operator as COO + ! triplets, with columns in the global-local layout (the operator's column + ! space). Each global-local row r belongs to one field i (row k within the + ! field); its entries are the union over j of row k of block (i,j), with the + ! block-local column c remapped through field_map(j)%global_local_pos(c). + ! This is the format-agnostic access path used by the ILU factorizations of + ! the BJAC preconditioner (via csget/csgetblk). + subroutine psb_d_nest_base_csgetrow(imin,imax,a,nz,ia,ja,val,info,& + & jmin,jmax,iren,append,nzin,rscale,cscale,chksz) + class(psb_d_nest_base_mat), intent(in) :: a + integer(psb_ipk_), intent(in) :: imin,imax + integer(psb_ipk_), intent(out) :: nz + integer(psb_ipk_), allocatable, intent(inout) :: ia(:), ja(:) + real(psb_dpk_), allocatable, intent(inout) :: val(:) + integer(psb_ipk_),intent(out) :: info + logical, intent(in), optional :: append + integer(psb_ipk_), intent(in), optional :: iren(:) + integer(psb_ipk_), intent(in), optional :: jmin,jmax, nzin + logical, intent(in), optional :: rscale,cscale,chksz + + integer(psb_ipk_), allocatable :: block_row_ia(:), block_row_ja(:) + real(psb_dpk_), allocatable :: block_row_val(:) + integer(psb_ipk_) :: jmin_, jmax_, nzin_, out_pos + integer(psb_ipk_) :: r_row, i_field, j_field, k_in_field, owned_offset + integer(psb_ipk_) :: block_nz, t_entry, global_local_col + logical :: append_ + character(len=24) :: name + + info = psb_success_ + name = 'psb_d_nest_csgetrow' + + if (.not. (associated(a%block_storage) .and. allocated(a%field_map))) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info, name, a_err='nested operator not set up') + return + end if + if (present(iren)) then + info = psb_err_invalid_input_ + call psb_errpush(info, name, a_err='iren not supported'); return + end if + if (present(rscale)) then + if (rscale) then + info = psb_err_invalid_input_ + call psb_errpush(info, name, a_err='rscale not supported'); return + end if + end if + if (present(cscale)) then + if (cscale) then + info = psb_err_invalid_input_ + call psb_errpush(info, name, a_err='cscale not supported'); return + end if + end if + + jmin_ = 1 + jmax_ = a%get_ncols() + if (present(jmin)) jmin_ = jmin + if (present(jmax)) jmax_ = jmax + append_ = .false. + if (present(append)) append_ = append + nzin_ = 0 + if (append_ .and. present(nzin)) nzin_ = nzin + + nz = 0 + out_pos = nzin_ + + do r_row = max(imin, 1), min(imax, a%get_nrows()) + ! locate the field owning global-local row r_row + owned_offset = 0 + i_field = 0 + do while (i_field < a%n_fields) + i_field = i_field + 1 + if (r_row <= owned_offset + a%field_map(i_field)%n_owned) exit + owned_offset = owned_offset + a%field_map(i_field)%n_owned + end do + k_in_field = r_row - owned_offset + + do j_field = 1, a%n_fields + if (.not. a%block_storage%has_block(i_field, j_field)) cycle + call a%block_storage%mats(i_field,j_field)%a%csgetrow(k_in_field, k_in_field, & + & block_nz, block_row_ia, block_row_ja, block_row_val, info) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_, name, a_err='block csgetrow') + return + end if + do t_entry = 1, block_nz + global_local_col = a%field_map(j_field)%global_local_pos(block_row_ja(t_entry)) + if ((global_local_col < jmin_) .or. (global_local_col > jmax_)) cycle + out_pos = out_pos + 1 + call psb_ensure_size(out_pos, ia, info) + if (info == psb_success_) call psb_ensure_size(out_pos, ja, info) + if (info == psb_success_) call psb_ensure_size(out_pos, val, info) + if (info /= psb_success_) then + info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return + end if + ia(out_pos) = r_row + ja(out_pos) = global_local_col + val(out_pos) = block_row_val(t_entry) + nz = nz + 1 + end do + end do + end do + end subroutine psb_d_nest_base_csgetrow + + ! vect_mv: matvec on encapsulated vectors (the path taken by psb_spmm with + ! psb_d_vect_type). Instead of falling back to the host-array csmv, it + ! (1) gathers each column-field sub-vector through the vector's own gth with + ! an encapsulated index vector (a device kernel on device vectors), + ! (2) runs each block through its vect_mv (device formats execute their own + ! device kernels), with per-field work vectors allocated with mold=x so + ! they share the dynamic type of the incoming vectors, + ! (3) scatters each row-field result back through the vector's own sct. + ! Host/device traffic is limited to the compact field buffers; on plain host + ! vectors this is exactly equivalent to the array csmv. + subroutine psb_d_nest_base_vect_mv(alpha, a, x, beta, y, info, trans) + class(psb_d_nest_base_mat), intent(in) :: a + real(psb_dpk_), intent(in) :: alpha, beta + class(psb_d_base_vect_type), intent(inout) :: x + class(psb_d_base_vect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + + class(psb_d_base_vect_type), allocatable :: x_field_vec, y_field_vec + real(psb_dpk_), allocatable :: x_field_buf(:), y_field_buf(:) + real(psb_dpk_) :: block_beta + integer(psb_ipk_) :: i_field, j_field, n_owned, n_local, local_info + logical :: row_has_blocks + character :: trans_ + character(len=24) :: name + + info = psb_success_ + name = 'psb_d_nest_vect_mv' + + trans_ = 'N' + if (present(trans)) trans_ = trans + if (.not. (associated(a%block_storage) .and. allocated(a%field_map))) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info, name, a_err='nested operator not set up') + return + end if + if (trans_ /= 'N' .and. trans_ /= 'n') then + ! transposed product: fall back to host arrays (rare path) + block + real(psb_dpk_), allocatable :: x_host(:), y_host(:) + x_host = x%get_vect() + y_host = y%get_vect() + call psb_d_nest_base_csmv_t(alpha, a, x_host, beta, y_host, info) + call y%bld(y_host) + end block + return + end if + + ! work vectors share the dynamic type of the incoming vectors + allocate(x_field_vec, mold=x, stat=info) + if (info == 0) allocate(y_field_vec, mold=y, stat=info) + if (info /= 0) then + info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return + end if + + do i_field = 1, a%n_fields + n_owned = a%field_map(i_field)%n_owned + call psb_ensure_size(n_owned, y_field_buf, info) + if (info /= psb_success_) then + info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return + end if + + row_has_blocks = .false. + block_beta = dzero + do j_field = 1, a%n_fields + if (.not. a%block_storage%has_block(i_field, j_field)) cycle + + ! gather the column-field sub-vector (owned + ghosts) from x + n_local = size(a%field_map(j_field)%global_local_pos) + call psb_ensure_size(n_local, x_field_buf, info) + if (info /= psb_success_) then + info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return + end if + call x%gth(ione, int(n_local, psb_mpk_), & + & a%field_map(j_field)%gather_pos%v, x_field_buf) + call x_field_vec%free(local_info) + call x_field_vec%bld(x_field_buf(1:n_local)) + + if (.not. row_has_blocks) then + ! first block of this row field: (re)build the accumulator at the + ! right size, zeroed + y_field_buf(1:n_owned) = dzero + call y_field_vec%free(local_info) + call y_field_vec%bld(y_field_buf(1:n_owned)) + row_has_blocks = .true. + end if + + ! y_field = alpha * A(i,j) * x_field + block_beta * y_field + call a%block_storage%mats(i_field,j_field)%a%spmm(alpha, x_field_vec, & + & block_beta, y_field_vec, info) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_, name, a_err='block vect_mv') + return + end if + block_beta = done + end do + + ! scatter the row-field result into y (beta applied on the owned rows); + ! a row field with no blocks still rescales its rows by beta + if (row_has_blocks) then + y_field_buf(1:n_owned) = y_field_vec%get_vect() + else + y_field_buf(1:n_owned) = dzero + end if + call y%sct(ione, int(n_owned, psb_mpk_), & + & a%field_map(i_field)%gather_pos%v, y_field_buf, beta) + end do + + call x_field_vec%free(local_info) + call y_field_vec%free(local_info) + end subroutine psb_d_nest_base_vect_mv + ! Build the per-field gather maps and set the local dimensions, from the nested ! grid descriptor (per-field distribution desc_grid%descs(1,field)) and the ! composed global descriptor desc_global (produced by psb_cd_nest_compose). @@ -187,6 +499,12 @@ contains end if nest_op%field_map(i_field)%global_local_pos(n_owned + i_entry) = local_pos end do + ! encapsulated copy of the positions for the device-capable gth/sct + allocate(nest_op%field_map(i_field)%gather_pos, stat=info) + if (info /= 0) then + info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return + end if + call nest_op%field_map(i_field)%gather_pos%bld(nest_op%field_map(i_field)%global_local_pos) owned_offset = owned_offset + n_owned end do @@ -217,10 +535,9 @@ contains trans_op = 'N' if (present(trans)) trans_op = trans if (trans_op /= 'N' .and. trans_op /= 'n') then - ! Transposed nested product is not implemented (would swap block indices - ! and need the transposed halo); reject explicitly. See P5. - info = psb_err_transpose_not_n_unsupported_ - call psb_errpush(info, name) + ! transposed product: the block structure of A^T is the transpose of the + ! block grid, handled by the dedicated kernel below + call psb_d_nest_base_csmv_t(alpha, a, x, beta, y, info) return end if if (.not. associated(a%block_storage)) then @@ -277,6 +594,477 @@ contains end subroutine psb_d_nest_base_csmv + ! Transposed matvec kernel: y = alpha * A^T * x + beta * y. + ! The block structure of A^T is the transpose of the block grid: + ! y(cols of field j) += alpha * sum_i A(i,j)^T * x(owned rows of field i). + ! x is read on the owned rows of each row field; the result lands on ALL the + ! local columns of each column field (owned + ghosts); the distributed caller + ! (psb_spmm with trans='T') then accumulates the ghost contributions to their + ! owners through the transposed halo exchange. + subroutine psb_d_nest_base_csmv_t(alpha, a, x, beta, y, info) + real(psb_dpk_), intent(in) :: alpha, beta, x(:) + class(psb_d_nest_base_mat), intent(in) :: a + real(psb_dpk_), intent(inout) :: y(:) + integer(psb_ipk_), intent(out) :: info + + real(psb_dpk_), allocatable :: x_field(:), y_field(:) + integer(psb_ipk_) :: i_block_row, j_block_col, i_entry + integer(psb_ipk_) :: n_local_col_field, n_owned_row_field + character(len=24) :: name + + info = psb_success_ + name = 'psb_d_nest_base_csmv_t' + + if (.not. associated(a%block_storage)) then + info = psb_err_invalid_input_ + call psb_errpush(info, name, a_err='nested operator not set up') + return + end if + + ! y <- beta * y (on the whole column space) + if (beta == dzero) then + y(:) = dzero + else if (beta /= done) then + y(:) = beta * y(:) + end if + + do j_block_col = 1, a%n_fields + n_local_col_field = size(a%field_map(j_block_col)%global_local_pos) + allocate(y_field(n_local_col_field), stat=info) + if (info /= 0) then + info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return + end if + ! current column-field output sub-vector (owned + ghosts) + do i_entry = 1, n_local_col_field + y_field(i_entry) = y(a%field_map(j_block_col)%global_local_pos(i_entry)) + end do + + do i_block_row = 1, a%n_fields + if (a%block_storage%has_block(i_block_row, j_block_col)) then + n_owned_row_field = a%field_map(i_block_row)%n_owned + allocate(x_field(n_owned_row_field), stat=info) + if (info /= 0) then + info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return + end if + ! gather the row-field input sub-vector (owned rows only) + do i_entry = 1, n_owned_row_field + x_field(i_entry) = x(a%field_map(i_block_row)%global_local_pos(i_entry)) + end do + ! y_field <- alpha * A(i,j)^T * x_field + y_field + call a%block_storage%mats(i_block_row, j_block_col)%a%csmv( & + & alpha, x_field, done, y_field, info, 'T') + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_, name, a_err='block csmv T') + return + end if + deallocate(x_field) + end if + end do + + ! scatter the column-field output sub-vector back into y + do i_entry = 1, n_local_col_field + y(a%field_map(j_block_col)%global_local_pos(i_entry)) = y_field(i_entry) + end do + deallocate(y_field) + end do + end subroutine psb_d_nest_base_csmv_t + + ! csmm: multi-RHS product, the 2D analogue of csmv (same gather/scatter + ! per field, the block product is the block's own csmm) + subroutine psb_d_nest_base_csmm(alpha, a, x, beta, y, info, trans) + class(psb_d_nest_base_mat), intent(in) :: a + real(psb_dpk_), intent(in) :: alpha, beta, x(:,:) + real(psb_dpk_), intent(inout) :: y(:,:) + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + + real(psb_dpk_), allocatable :: x_field(:,:), y_field(:,:) + integer(psb_ipk_) :: i_block_row, j_block_col, i_entry + integer(psb_ipk_) :: n_local_col_field, n_owned_row_field, n_rhs + character :: trans_op + character(len=24) :: name + + info = psb_success_ + name = 'psb_d_nest_base_csmm' + trans_op = 'N' + if (present(trans)) trans_op = trans + if (trans_op /= 'N' .and. trans_op /= 'n') then + info = psb_err_transpose_not_n_unsupported_ + call psb_errpush(info, name); return + end if + if (.not. associated(a%block_storage)) then + info = psb_err_invalid_input_ + call psb_errpush(info, name, a_err='nested operator not set up') + return + end if + n_rhs = min(size(x,2), size(y,2)) + + if (beta == dzero) then + y(:,:) = dzero + else if (beta /= done) then + y(:,:) = beta * y(:,:) + end if + + do j_block_col = 1, a%n_fields + n_local_col_field = size(a%field_map(j_block_col)%global_local_pos) + allocate(x_field(n_local_col_field, n_rhs), stat=info) + if (info /= 0) then + info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return + end if + do i_entry = 1, n_local_col_field + x_field(i_entry, 1:n_rhs) = x(a%field_map(j_block_col)%global_local_pos(i_entry), 1:n_rhs) + end do + + do i_block_row = 1, a%n_fields + if (a%block_storage%has_block(i_block_row, j_block_col)) then + n_owned_row_field = a%field_map(i_block_row)%n_owned + allocate(y_field(n_owned_row_field, n_rhs), stat=info) + if (info /= 0) then + info = psb_err_alloc_dealloc_; call psb_errpush(info, name); return + end if + do i_entry = 1, n_owned_row_field + y_field(i_entry, 1:n_rhs) = y(a%field_map(i_block_row)%global_local_pos(i_entry), 1:n_rhs) + end do + call a%block_storage%mats(i_block_row, j_block_col)%a%csmm( & + & alpha, x_field, done, y_field, info, trans_op) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_, name, a_err='block csmm') + return + end if + do i_entry = 1, n_owned_row_field + y(a%field_map(i_block_row)%global_local_pos(i_entry), 1:n_rhs) = y_field(i_entry, 1:n_rhs) + end do + deallocate(y_field) + end if + end do + deallocate(x_field) + end do + end subroutine psb_d_nest_base_csmm + + ! cp_to_coo: assemble all the blocks into a single local COO in the + ! global-local layout (rows = concatenated owned rows, columns = the + ! operator's column space). This is the core conversion hook: the generic + ! base-class machinery builds cscnv, csclip, tril/triu, ... on top of it. + subroutine psb_d_nest_base_cp_to_coo(a, b, info) + use psb_d_base_mat_mod, only : psb_d_coo_sparse_mat + class(psb_d_nest_base_mat), intent(in) :: a + class(psb_d_coo_sparse_mat), intent(inout) :: b + integer(psb_ipk_), intent(out) :: info + + type(psb_d_coo_sparse_mat) :: block_coo + integer(psb_ipk_) :: i_field, j_field, k_entry, n_entries, out_pos, owned_offset + character(len=24) :: name + + info = psb_success_ + name = 'psb_d_nest_cp_to_coo' + + if (.not. (associated(a%block_storage) .and. allocated(a%field_map))) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info, name, a_err='nested operator not set up') + return + end if + + call b%allocate(a%get_nrows(), a%get_ncols(), a%get_nzeros()) + out_pos = 0 + owned_offset = 0 + do i_field = 1, a%n_fields + do j_field = 1, a%n_fields + if (.not. a%block_storage%has_block(i_field, j_field)) cycle + call a%block_storage%mats(i_field,j_field)%a%cp_to_coo(block_coo, info) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_, name, a_err='block cp_to_coo') + return + end if + n_entries = block_coo%get_nzeros() + do k_entry = 1, n_entries + b%ia(out_pos+k_entry) = owned_offset + block_coo%ia(k_entry) + b%ja(out_pos+k_entry) = a%field_map(j_field)%global_local_pos(block_coo%ja(k_entry)) + b%val(out_pos+k_entry) = block_coo%val(k_entry) + end do + out_pos = out_pos + n_entries + call block_coo%free() + end do + owned_offset = owned_offset + a%field_map(i_field)%n_owned + end do + call b%set_nzeros(out_pos) + call b%set_dupl(psb_dupl_add_) + call b%fix(info) + if (info /= psb_success_) & + & call psb_errpush(psb_err_from_subroutine_, name, a_err='coo fix') + end subroutine psb_d_nest_base_cp_to_coo + + ! mv_to_coo: the adapter does not own the blocks, so "move" degenerates to + ! copy + detach of the adapter + subroutine psb_d_nest_base_mv_to_coo(a, b, info) + use psb_d_base_mat_mod, only : psb_d_coo_sparse_mat + class(psb_d_nest_base_mat), intent(inout) :: a + class(psb_d_coo_sparse_mat), intent(inout) :: b + integer(psb_ipk_), intent(out) :: info + + call a%cp_to_coo(b, info) + if (info == psb_success_) call a%free() + end subroutine psb_d_nest_base_mv_to_coo + + ! rowsum/arwsum: (absolute) row sums, accumulated across the blocks of each + ! row field; d is in the global-local row layout + subroutine psb_d_nest_base_rowsum(d, a) + class(psb_d_nest_base_mat), intent(in) :: a + real(psb_dpk_), intent(out) :: d(:) + call psb_d_nest_base_sum_rows(d, a, absolute=.false.) + end subroutine psb_d_nest_base_rowsum + + subroutine psb_d_nest_base_arwsum(d, a) + class(psb_d_nest_base_mat), intent(in) :: a + real(psb_dpk_), intent(out) :: d(:) + call psb_d_nest_base_sum_rows(d, a, absolute=.true.) + end subroutine psb_d_nest_base_arwsum + + subroutine psb_d_nest_base_sum_rows(d, a, absolute) + real(psb_dpk_), intent(out) :: d(:) + class(psb_d_nest_base_mat), intent(in) :: a + logical, intent(in) :: absolute + + real(psb_dpk_), allocatable :: block_sums(:) + integer(psb_ipk_) :: i_field, j_field, k_entry, n_owned, owned_offset + + d(:) = dzero + if (.not. associated(a%block_storage)) return + owned_offset = 0 + do i_field = 1, a%n_fields + n_owned = a%field_map(i_field)%n_owned + allocate(block_sums(n_owned)) + do j_field = 1, a%n_fields + if (.not. a%block_storage%has_block(i_field, j_field)) cycle + if (absolute) then + call a%block_storage%mats(i_field,j_field)%a%arwsum(block_sums) + else + call a%block_storage%mats(i_field,j_field)%a%rowsum(block_sums) + end if + do k_entry = 1, n_owned + d(owned_offset+k_entry) = d(owned_offset+k_entry) + block_sums(k_entry) + end do + end do + deallocate(block_sums) + owned_offset = owned_offset + n_owned + end do + end subroutine psb_d_nest_base_sum_rows + + ! colsum/aclsum: (absolute) column sums in the operator's column space, + ! accumulated across the blocks of each column field + subroutine psb_d_nest_base_colsum(d, a) + class(psb_d_nest_base_mat), intent(in) :: a + real(psb_dpk_), intent(out) :: d(:) + call psb_d_nest_base_sum_cols(d, a, absolute=.false.) + end subroutine psb_d_nest_base_colsum + + subroutine psb_d_nest_base_aclsum(d, a) + class(psb_d_nest_base_mat), intent(in) :: a + real(psb_dpk_), intent(out) :: d(:) + call psb_d_nest_base_sum_cols(d, a, absolute=.true.) + end subroutine psb_d_nest_base_aclsum + + subroutine psb_d_nest_base_sum_cols(d, a, absolute) + real(psb_dpk_), intent(out) :: d(:) + class(psb_d_nest_base_mat), intent(in) :: a + logical, intent(in) :: absolute + + real(psb_dpk_), allocatable :: field_sums(:), block_sums(:) + integer(psb_ipk_) :: i_field, j_field, k_entry, n_local + + d(:) = dzero + if (.not. associated(a%block_storage)) return + do j_field = 1, a%n_fields + n_local = size(a%field_map(j_field)%global_local_pos) + allocate(field_sums(n_local), block_sums(n_local)) + field_sums(:) = dzero + do i_field = 1, a%n_fields + if (.not. a%block_storage%has_block(i_field, j_field)) cycle + if (absolute) then + call a%block_storage%mats(i_field,j_field)%a%aclsum(block_sums) + else + call a%block_storage%mats(i_field,j_field)%a%colsum(block_sums) + end if + field_sums(1:n_local) = field_sums(1:n_local) + block_sums(1:n_local) + end do + do k_entry = 1, n_local + d(a%field_map(j_field)%global_local_pos(k_entry)) = field_sums(k_entry) + end do + deallocate(field_sums, block_sums) + end do + end subroutine psb_d_nest_base_sum_cols + + ! maxval / infinity norm / 1-norm, by delegation/accumulation over blocks + function psb_d_nest_base_maxval(a) result(res) + class(psb_d_nest_base_mat), intent(in) :: a + real(psb_dpk_) :: res + integer(psb_ipk_) :: i_field, j_field + res = dzero + if (.not. associated(a%block_storage)) return + do j_field = 1, a%n_fields + do i_field = 1, a%n_fields + if (a%block_storage%has_block(i_field, j_field)) & + & res = max(res, a%block_storage%mats(i_field,j_field)%a%maxval()) + end do + end do + end function psb_d_nest_base_maxval + + function psb_d_nest_base_csnmi(a) result(res) + class(psb_d_nest_base_mat), intent(in) :: a + real(psb_dpk_) :: res + real(psb_dpk_), allocatable :: row_sums(:) + res = dzero + if (a%get_nrows() <= 0) return + allocate(row_sums(a%get_nrows())) + call psb_d_nest_base_sum_rows(row_sums, a, absolute=.true.) + res = maxval(row_sums) + end function psb_d_nest_base_csnmi + + function psb_d_nest_base_csnm1(a) result(res) + class(psb_d_nest_base_mat), intent(in) :: a + real(psb_dpk_) :: res + real(psb_dpk_), allocatable :: col_sums(:) + res = dzero + if (a%get_ncols() <= 0) return + allocate(col_sums(a%get_ncols())) + call psb_d_nest_base_sum_cols(col_sums, a, absolute=.true.) + res = maxval(col_sums) + end function psb_d_nest_base_csnm1 + + ! scals/scal: scaling acts on the underlying blocks (the operator is a view) + subroutine psb_d_nest_base_scals(d, a, info) + class(psb_d_nest_base_mat), intent(inout) :: a + real(psb_dpk_), intent(in) :: d + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: i_field, j_field + character(len=24) :: name + info = psb_success_ + name = 'psb_d_nest_scals' + if (.not. associated(a%block_storage)) then + info = psb_err_invalid_mat_state_; call psb_errpush(info, name); return + end if + do j_field = 1, a%n_fields + do i_field = 1, a%n_fields + if (.not. a%block_storage%has_block(i_field, j_field)) cycle + call a%block_storage%mats(i_field,j_field)%a%scal(d, info) + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_, name, a_err='block scal'); return + end if + end do + end do + end subroutine psb_d_nest_base_scals + + subroutine psb_d_nest_base_scal(d, a, info, side) + class(psb_d_nest_base_mat), intent(inout) :: a + real(psb_dpk_), intent(in) :: d(:) + integer(psb_ipk_), intent(out) :: info + character, intent(in), optional :: side + + real(psb_dpk_), allocatable :: d_field(:) + integer(psb_ipk_) :: i_field, j_field, k_entry, n_owned, n_local, owned_offset + character :: side_ + character(len=24) :: name + + info = psb_success_ + name = 'psb_d_nest_scal' + side_ = 'L' + if (present(side)) side_ = side + if (.not. associated(a%block_storage)) then + info = psb_err_invalid_mat_state_; call psb_errpush(info, name); return + end if + + if (side_ == 'L' .or. side_ == 'l') then + ! row scaling: each row field uses its owned slice of d + owned_offset = 0 + do i_field = 1, a%n_fields + n_owned = a%field_map(i_field)%n_owned + do j_field = 1, a%n_fields + if (.not. a%block_storage%has_block(i_field, j_field)) cycle + call a%block_storage%mats(i_field,j_field)%a%scal( & + & d(owned_offset+1:owned_offset+n_owned), info, side='L') + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_, name, a_err='block scal L'); return + end if + end do + owned_offset = owned_offset + n_owned + end do + else + ! column scaling: each column field gathers its slice of d + do j_field = 1, a%n_fields + n_local = size(a%field_map(j_field)%global_local_pos) + allocate(d_field(n_local)) + do k_entry = 1, n_local + d_field(k_entry) = d(a%field_map(j_field)%global_local_pos(k_entry)) + end do + do i_field = 1, a%n_fields + if (.not. a%block_storage%has_block(i_field, j_field)) cycle + call a%block_storage%mats(i_field,j_field)%a%scal(d_field, info, side='R') + if (info /= psb_success_) then + call psb_errpush(psb_err_from_subroutine_, name, a_err='block scal R'); return + end if + end do + deallocate(d_field) + end do + end if + end subroutine psb_d_nest_base_scal + + ! clone: the adapter is a view, so the clone shares the blocks and the grid + ! descriptor (pointers) while re-owning its private gather index vectors + subroutine psb_d_nest_base_clone(a, b, info) + class(psb_d_nest_base_mat), intent(inout) :: a + class(psb_d_base_sparse_mat), allocatable, intent(inout) :: b + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: i_field + + info = psb_success_ + if (allocated(b)) deallocate(b) + allocate(b, source=a, stat=info) + if (info /= 0) then + info = psb_err_alloc_dealloc_; call psb_errpush(info, 'psb_d_nest_clone'); return + end if + select type (b_nest => b) + type is (psb_d_nest_base_mat) + if (allocated(b_nest%field_map)) then + do i_field = 1, size(b_nest%field_map) + ! the sourced copy shares a's gather_pos targets: re-own fresh copies + b_nest%field_map(i_field)%gather_pos => null() + allocate(b_nest%field_map(i_field)%gather_pos, stat=info) + if (info /= 0) then + info = psb_err_alloc_dealloc_; call psb_errpush(info, 'psb_d_nest_clone'); return + end if + call b_nest%field_map(i_field)%gather_pos%bld( & + & b_nest%field_map(i_field)%global_local_pos) + end do + end if + end select + end subroutine psb_d_nest_base_clone + + subroutine psb_d_nest_base_mold(a, b, info) + class(psb_d_nest_base_mat), intent(in) :: a + class(psb_d_base_sparse_mat), allocatable, intent(inout) :: b + integer(psb_ipk_), intent(out) :: info + info = psb_success_ + if (allocated(b)) deallocate(b) + allocate(b, mold=a, stat=info) + if (info /= 0) then + info = psb_err_alloc_dealloc_; call psb_errpush(info, 'psb_d_nest_mold') + end if + end subroutine psb_d_nest_base_mold + + ! sizeof: blocks + gather maps (the adapter does not own the descriptors) + function psb_d_nest_base_sizeof(a) result(res) + class(psb_d_nest_base_mat), intent(in) :: a + integer(psb_epk_) :: res + integer(psb_ipk_) :: i_field + res = 8 + if (associated(a%block_storage)) res = res + a%block_storage%sizeof() + if (allocated(a%field_map)) then + do i_field = 1, size(a%field_map) + if (allocated(a%field_map(i_field)%global_local_pos)) & + & res = res + psb_sizeof_ip * size(a%field_map(i_field)%global_local_pos) + end do + end if + end function psb_d_nest_base_sizeof + ! Selective (regime 2) application of a SINGLE block: ! y_field = alpha * A(i_block_row, j_block_col) * x_field + beta * y_field ! x_field is the column-field local vector (owned + ghosts) ALREADY halo-exchanged diff --git a/base/modules/tools/psb_d_nest_builder_mod.F90 b/base/modules/tools/psb_d_nest_builder_mod.F90 index 0fb2851ea..b54782944 100644 --- a/base/modules/tools/psb_d_nest_builder_mod.F90 +++ b/base/modules/tools/psb_d_nest_builder_mod.F90 @@ -67,6 +67,7 @@ module psb_d_nest_builder_mod use psb_penv_mod, only : psb_ctxt_type, psb_info use psb_desc_mod, only : psb_desc_type use psb_d_mat_mod, only : psb_dspmat_type + use psb_d_base_mat_mod, only : psb_d_base_sparse_mat use psb_cd_tools_mod, only : psb_cdall, psb_cdins, psb_cdasb use psb_desc_nest_mod, only : psb_desc_nest_type use psb_d_nest_mat_mod, only : psb_d_nest_sparse_mat @@ -184,10 +185,15 @@ contains end subroutine psb_d_nest_op_ins ! asb: assemble the descriptors, build the blocks, compose the global - ! descriptor, set up the operator and wrap it into a_glob - subroutine psb_d_nest_op_asb(op, info) + ! descriptor, set up the operator and wrap it into a_glob. + ! The optional type ('CSR'/'CSC'/'COO', default 'CSR') or mold (any + ! class extending psb_d_base_sparse_mat, e.g. the psb_ext ELL/HLL or + ! the psb_cuda device formats) selects the storage format of the blocks. + subroutine psb_d_nest_op_asb(op, info, type, mold) class(psb_d_nest_matrix), intent(inout), target :: op integer(psb_ipk_), intent(out) :: info + character(len=*), intent(in), optional :: type + class(psb_d_base_sparse_mat), intent(in), optional :: mold type(psb_d_nest_base_mat) :: nest_operator integer(psb_ipk_) :: n_fields, i_field, j_field @@ -220,7 +226,8 @@ contains & op%block_buffer(i_field,j_field)%entry_rows, & & op%block_buffer(i_field,j_field)%entry_cols, & & op%block_buffer(i_field,j_field)%entry_vals, & - & op%field_desc(i_field), op%field_desc(j_field), info) + & op%field_desc(i_field), op%field_desc(j_field), info, & + & type=type, mold=mold) if (info /= psb_success_) then call psb_errpush(psb_err_from_subroutine_, name, a_err='rect_block'); return end if diff --git a/base/modules/tools/psb_d_nest_tools_mod.F90 b/base/modules/tools/psb_d_nest_tools_mod.F90 index f1e8f2836..da1cc03b8 100644 --- a/base/modules/tools/psb_d_nest_tools_mod.F90 +++ b/base/modules/tools/psb_d_nest_tools_mod.F90 @@ -45,7 +45,7 @@ module psb_d_nest_tools_mod use psb_desc_nest_mod, only : psb_desc_nest_type use psb_d_nest_mat_mod, only : psb_d_nest_sparse_mat use psb_d_mat_mod, only : psb_dspmat_type - use psb_d_base_mat_mod, only : psb_d_coo_sparse_mat + use psb_d_base_mat_mod, only : psb_d_coo_sparse_mat, psb_d_base_sparse_mat use psb_desc_mod, only : psb_desc_type implicit none @@ -304,13 +304,15 @@ contains ! desc_row field-i descriptor (rows) ! desc_col field-j descriptor (columns, with union halo) ! - subroutine psb_d_nest_rect_block(blk, nz, ia_glob, ja_glob, val, desc_row, desc_col, info) + subroutine psb_d_nest_rect_block(blk, nz, ia_glob, ja_glob, val, desc_row, desc_col, info, type, mold) type(psb_dspmat_type), intent(out) :: blk integer(psb_ipk_), intent(in) :: nz integer(psb_lpk_), intent(in) :: ia_glob(:), ja_glob(:) real(psb_dpk_), intent(in) :: val(:) type(psb_desc_type), intent(in) :: desc_row, desc_col integer(psb_ipk_), intent(out) :: info + character(len=*), intent(in), optional :: type ! base storage format (default 'CSR') + class(psb_d_base_sparse_mat), intent(in), optional :: mold ! any format, e.g. psb_ext ELL/HLL type(psb_d_coo_sparse_mat) :: coo_block integer(psb_ipk_) :: k_entry, n_loc_rows, n_loc_cols, loc_row, loc_col @@ -347,7 +349,13 @@ contains call psb_errpush(psb_err_from_subroutine_, name, a_err='coo fix'); return end if call blk%mv_from(coo_block) - call blk%cscnv(info, type='CSR') + if (present(mold)) then + call blk%cscnv(info, mold=mold) + else if (present(type)) then + call blk%cscnv(info, type=type) + else + call blk%cscnv(info, type='CSR') + end if if (info /= 0) then call psb_errpush(psb_err_from_subroutine_, name, a_err='cscnv'); return end if diff --git a/test/nested/CMakeLists.txt b/test/nested/CMakeLists.txt index 95c1bed75..f742a8cf1 100644 --- a/test/nested/CMakeLists.txt +++ b/test/nested/CMakeLists.txt @@ -29,16 +29,16 @@ set(SOURCES_D_NEST_CG_TEST psb_d_nest_cg_test.F90) set(SOURCES_D_NEST_BUILDER_TEST psb_d_nest_builder_test.F90) add_executable(psb_d_nest_glob_test ${SOURCES_D_NEST_GLOB_TEST}) -target_link_libraries(psb_d_nest_glob_test psblas::util psblas::linsolve psblas::prec psblas::base) +target_link_libraries(psb_d_nest_glob_test psblas::util psblas::linsolve psblas::prec psblas::ext psblas::base) add_executable(psb_d_nest_rect_test ${SOURCES_D_NEST_RECT_TEST}) -target_link_libraries(psb_d_nest_rect_test psblas::util psblas::linsolve psblas::prec psblas::base) +target_link_libraries(psb_d_nest_rect_test psblas::util psblas::linsolve psblas::prec psblas::ext psblas::base) add_executable(psb_d_nest_cg_test ${SOURCES_D_NEST_CG_TEST}) -target_link_libraries(psb_d_nest_cg_test psblas::util psblas::linsolve psblas::prec psblas::base) +target_link_libraries(psb_d_nest_cg_test psblas::util psblas::linsolve psblas::prec psblas::ext psblas::base) add_executable(psb_d_nest_builder_test ${SOURCES_D_NEST_BUILDER_TEST}) -target_link_libraries(psb_d_nest_builder_test psblas::util psblas::linsolve psblas::prec psblas::base) +target_link_libraries(psb_d_nest_builder_test psblas::util psblas::linsolve psblas::prec psblas::ext psblas::base) # Set output directory for executables foreach(target psb_d_nest_glob_test psb_d_nest_rect_test psb_d_nest_cg_test psb_d_nest_builder_test) diff --git a/test/nested/Makefile b/test/nested/Makefile index b59501ecd..9790e8f06 100644 --- a/test/nested/Makefile +++ b/test/nested/Makefile @@ -5,7 +5,7 @@ include $(INCDIR)/Make.inc.psblas # # Libraries used LIBDIR=$(INSTALLDIR)/lib -PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_linsolve -lpsb_prec -lpsb_base +PSBLAS_LIB= -L$(LIBDIR) -lpsb_util -lpsb_linsolve -lpsb_prec -lpsb_ext -lpsb_base LDLIBS= $(PSBLDLIBS) # # Compilers and such diff --git a/test/nested/README.md b/test/nested/README.md index a52d2cc5c..b8d17bb06 100644 --- a/test/nested/README.md +++ b/test/nested/README.md @@ -26,14 +26,14 @@ sub-blocks. ## 1. Concepts * **Field** — a contiguous index space (e.g. velocity `V` and pressure `Q` in a saddle-point problem). Each field has its own `psb_desc_type` distribution. -* **Block (i,j)** — the sub-matrix coupling field `i` (rows) with field `j` (columns). It may be rectangular (`|field i| /= |field j|`) and may be absent. +* **Block (i,j)** — the sub-matrix coupling field `i` (rows) with field `j` (columns). It may be rectangular (different field sizes) and may be absent. * **Global operator** — the blocks are concatenated into a single **square** operator `M` of size `sum(field_sizes)`, distributed over one **composed global descriptor** with a **union halo** (one halo exchange per matrix-vector product, covering all blocks of a given column field at once). * **Rectangular blocks** — PSBLAS does not support rectangular *distributed* matrices, but it does support rectangular *local* CSR/COO matrices. The rectangular product therefore happens only in the **local** block `csmv`; the only object carrying a descriptor (and hence communication) is the global operator, which is always square. The global operator (`a_glob`) and global descriptor (`desc_glob`) can be passed unchanged to `psb_spmm`, `psb_krylov`, and the standard preconditioners. -## 2. Recommended API: `psb_d_nest_matrix` +## 2. Quick start: `psb_d_nest_matrix` The easy way to build a nested matrix is the `psb_d_nest_matrix` type (module `psb_d_nest_builder_mod`, re-exported by the umbrella `psb_d_nest_mod`), which follows the usual PSBLAS `init` / `ins` / `asb` pattern and hides all the descriptor / halo / compose / setup boilerplate: @@ -47,8 +47,6 @@ integer(psb_lpk_) :: n1, n2 call nested_matrix%init(ctxt, [n1, n2], info) ! 2) insert the block values, owned rows only (PSBLAS convention). -! ins(block_row, block_col, n_entries, entry_rows, entry_cols, entry_vals, info) -! rows are GLOBAL indices in field block_row, columns in field block_col. call nested_matrix%ins(1, 1, nz_A, iaA, jaA, valA, info) ! A = block (1,1) call nested_matrix%ins(1, 2, nz_Bt, iaBt, jaBt, valBt, info) ! B^T = block (1,2) call nested_matrix%ins(2, 1, nz_B, iaB, jaB, valB, info) ! B = block (2,1) @@ -60,6 +58,8 @@ call nested_matrix%asb(info) ! 4) from here on it is an ordinary distributed matrix/descriptor call psb_geall(x, nested_matrix%desc_glob, info) ... +call prec%init(ctxt, 'BJAC', info) +call prec%build(nested_matrix%a_glob, nested_matrix%desc_glob, info) call psb_krylov('CG', nested_matrix%a_glob, prec, b, x, eps, & & nested_matrix%desc_glob, info, itmax=..., iter=..., err=...) @@ -67,25 +67,124 @@ call psb_krylov('CG', nested_matrix%a_glob, prec, b, x, eps, & call nested_matrix%free(info) ``` -Notes: -* To know which rows it owns in a field, a process can query the per-field descriptor exposed as `nested_matrix%field_desc(i)` (e.g. `nested_matrix%field_desc(1)%get_local_rows()` and `%l2g(...)`), exactly as it would with a plain `psb_cdall` descriptor. -* Off-diagonal blocks may be rectangular: the cross-field column indices are registered into the union halo automatically by `ins`. -* The CG solver requires an SPD operator; a genuine saddle-point operator is indefinite and needs MINRES/GMRES (plus, eventually, a block preconditioner). -* **Do not copy/move** a `psb_d_nest_matrix` after `asb`: the wrapped operator holds internal pointers into the object. +## 3. User API reference +All of the public API is available through the umbrella module: -## 3. Low-level path (advanced) +```fortran +use psb_d_nest_mod +``` + +### 3.1 `type(psb_d_nest_matrix)` — the nested matrix (recommended) + +| Member | Meaning | +|--------|---------| +| `a_glob` | `type(psb_dspmat_type)` — the assembled global operator; pass it to `psb_spmm`, `psb_krylov`, `prec%build` | +| `desc_glob` | `type(psb_desc_type)` — the composed global descriptor; pass it wherever a descriptor is expected | +| `field_desc(i)` | `type(psb_desc_type)` — the descriptor of field `i` (query `%get_local_rows()`, `%l2g(...)` to find the rows owned by this process) | +| `n_fields` | number of fields | + +Methods (collective over the communicator unless noted): + +#### `call nested_matrix%init(ctxt, field_sizes, info)` + +Create the field structure. One descriptor per field is created with a block +row distribution; the total size is independent of the number of processes. + +| Argument | Type | Intent | Meaning | +|----------|------|--------|---------| +| `ctxt` | `type(psb_ctxt_type)` | in | parallel context from `psb_init` | +| `field_sizes(:)` | `integer(psb_lpk_)` | in | global size of each field, e.g. `[n1, n2]` | +| `info` | `integer(psb_ipk_)` | out | return code, `psb_success_` on success | + +#### `call nested_matrix%ins(block_row, block_col, n_entries, entry_rows, entry_cols, entry_vals, info)` + +Insert a batch of coefficients into block `(block_row, block_col)`. May be +called any number of times per block, in any order, before `asb`. Each process +inserts only the rows it owns (PSBLAS convention); cross-field columns are +registered into the union halo automatically. + +| Argument | Type | Intent | Meaning | +|----------|------|--------|---------| +| `block_row` | `integer(psb_ipk_)` | in | row-field index of the block (1..n_fields) | +| `block_col` | `integer(psb_ipk_)` | in | column-field index of the block (1..n_fields) | +| `n_entries` | `integer(psb_ipk_)` | in | number of triplets in this batch | +| `entry_rows(:)` | `integer(psb_lpk_)` | in | GLOBAL row indices in field `block_row` (1..field size) | +| `entry_cols(:)` | `integer(psb_lpk_)` | in | GLOBAL column indices in field `block_col` (1..field size) | +| `entry_vals(:)` | `real(psb_dpk_)` | in | coefficient values | +| `info` | `integer(psb_ipk_)` | out | return code | + +#### `call nested_matrix%asb(info [, type] [, mold])` + +Assemble: builds the per-field halos, the (possibly rectangular) local blocks, +the composed global descriptor `desc_glob` and the global operator `a_glob`. +After `asb` no further `ins` is allowed, and the object must not be +copied/moved (the operator holds internal pointers into it). + +The optional arguments select the **storage format of the blocks**: + +| Argument | Type | Meaning | +|----------|------|---------| +| `type` | `character(len=*)` | a base format name: `'CSR'` (default), `'CSC'`, `'COO'` | +| `mold` | `class(psb_d_base_sparse_mat)` | any format class, e.g. `psb_d_ell_sparse_mat` / `psb_d_hll_sparse_mat` from `psb_ext` | + +The nested operator is format-agnostic: every operation delegates to the +blocks' own methods, so each block runs its native kernels. + +#### `call nested_matrix%free(info)` + +Release every internal object (blocks, descriptors, global operator). + +### 3.2 Solvers and preconditioners + +`a_glob` / `desc_glob` work with the standard PSBLAS infrastructure: + +* **Krylov methods** — `psb_krylov('CG' | 'BICGSTAB' | 'GMRES' | ..., nested_matrix%a_glob, prec, b, x, eps, nested_matrix%desc_glob, info, ...)`. Remember that CG requires an SPD operator; a genuine saddle-point operator is indefinite and needs MINRES/GMRES. +* **Preconditioners** — all the stock PSBLAS one-level preconditioners can be built directly on the nested operator: + * `'NONE'` — identity; + * `'DIAG'` / `'JACOBI'` — diagonal scaling (served by the nested `get_diag`, which concatenates the diagonals of the diagonal blocks; absent blocks contribute zeros); + * `'BJAC'` — block Jacobi with ILU factorization of the local rows (served by the nested `csgetrow`, which extracts the local rows of the global operator across all blocks). + +```fortran +call prec%init(ctxt, 'BJAC', info) +call prec%build(nested_matrix%a_glob, nested_matrix%desc_glob, info) +``` + +### 3.3 Implemented base-class contract + +The nested operator (`psb_d_nest_base_mat`) implements the standard +`psb_d_base_sparse_mat` contract by delegation to the blocks, so it can be used +wherever an assembled PSBLAS matrix is expected: + +* **Products** — `csmv` (also transposed, `trans='T'`), `csmm` (multi-RHS), + `vect_mv` (encapsulated vectors: gathers/scatters through the vectors' own + `gth`/`sct` and runs each block through its `vect_mv`, so device block + formats execute their device kernels). +* **Access/conversions** — `get_diag`, `csgetrow` (and `csget`/`csgetblk` + through the base generics), `cp_to_coo`/`mv_to_coo` (and `cscnv`, `csclip`, + `tril`/`triu`, ... through the base generics built on the COO route). +* **Reductions** — `rowsum`/`arwsum`, `colsum`/`aclsum`, `maxval`, + `spnmi` (infinity norm), `spnm1` (1-norm). +* **Mutation/bookkeeping** — `scal` (left/right) and `scals` (the operator is a + view: scaling acts on the blocks), `clone` (shares the blocks, re-owns the + private index maps), `mold`, `sizeof`, `free`, `get_nzeros`, `get_fmt`. + +Intentionally **not** implemented (they fail with the standard "missing +override" error): `cp_from_coo`/`mv_from_coo` (a nested operator cannot be +built from a flat matrix without the field structure), `csput` (insertions go +to the blocks before assembly), `cssv`/`cssm` (a triangular solve is undefined +for a block operator). + +### 3.4 Low-level API (advanced) -`psb_d_nest_matrix` is built on three lower-level pieces, available directly for advanced use (see `psb_d_nest_cg_test.F90` for an end-to-end example): +`psb_d_nest_matrix` is built on lower-level pieces, available directly (see `psb_d_nest_cg_test.F90` for an end-to-end example): * `psb_cd_nest_compose(grid_desc, desc_glob, info)` — compose the per-field descriptors into the single global descriptor with the union halo. -* `psb_d_nest_base_setup(nest_op, block_storage, grid_desc, desc_glob, info)` — set up the `psb_d_nest_base_mat` operator (implements the local `csmv`). +* `psb_d_nest_base_setup(nest_op, block_storage, grid_desc, desc_glob, info)` — set up the `psb_d_nest_base_mat` operator (implements the local `csmv`, `get_diag`, `csgetrow`). * `psb_d_nest_rect_block(blk, nz, ia, ja, val, desc_row, desc_col, info)` — build a single (possibly rectangular) local block from global triplets, with rows localized against `desc_row` and columns against `desc_col`. -A field-split interface (`psb_d_nest_get_block`, `psb_d_nest_get_field_desc`, -`psb_d_nest_restrict_field`, `psb_d_nest_prolong_field`, -`psb_d_nest_apply_block`) is exposed on `psb_d_nest_base_mat` as the hook for a future block (field-split / Schur) preconditioner. +A field-split interface (`psb_d_nest_get_block`, `psb_d_nest_get_field_desc`, `psb_d_nest_restrict_field`, `psb_d_nest_prolong_field`, `psb_d_nest_apply_block`) is exposed on `psb_d_nest_base_mat` as the hook for a future block (field-split / Schur) preconditioner. ## 4. Tests @@ -93,8 +192,8 @@ A field-split interface (`psb_d_nest_get_block`, `psb_d_nest_get_field_desc`, | Test | What it checks | |------------------------------|----------------| | `psb_d_nest_glob_test` | Square 2×2 operator built with `psb_d_nest_matrix`; the nested `psb_spmm` is compared bit-for-bit against the same matrix assembled monolithically in CSR. | -| `psb_d_nest_rect_test` | Same, with fields of different size (`|V| = 2|Q|`) and genuinely **rectangular** off-diagonal blocks. | -| `psb_d_nest_cg_test` | Standard PSBLAS **CG** on an SPD, ill-conditioned operator (1D Laplacian reordered red-black), built on the **low-level path**; the solution is recovered to machine precision over hundreds of matvecs. | +| `psb_d_nest_rect_test` | Same, with fields of different size (`nV = 2 nQ`) and genuinely **rectangular** off-diagonal blocks. | +| `psb_d_nest_cg_test` | Standard PSBLAS **CG** on an SPD, ill-conditioned operator (1D Laplacian reordered red-black), built on the **low-level path**, solved under every stock preconditioner (`NONE`, `DIAG`, `BJAC`/ILU(0)); requires convergence to machine precision for all of them, and that `DIAG` reproduces the `NONE` iteration count exactly (a bit-precise check of the nested `get_diag`, since the diagonal is the constant `2I`). | | `psb_d_nest_builder_test` | Same CG solve as above but built through the `psb_d_nest_matrix` utility (high-level path). | All tests run both serially and in parallel, and the result is invariant with respect to the number of MPI processes. @@ -126,7 +225,7 @@ Library (under `base/modules/`): * `desc/psb_desc_nest_mod.f90` — `psb_desc_nest_type` (grid of per-field descriptors) * `serial/psb_d_nest_mat_mod.f90` — `psb_d_nest_sparse_mat` (block storage) -* `serial/psb_d_nest_base_mat_mod.F90`— `psb_d_nest_base_mat` (the MATNEST operator + `csmv`) +* `serial/psb_d_nest_base_mat_mod.F90`— `psb_d_nest_base_mat` (the MATNEST operator: `csmv`, `get_diag`, `csgetrow`) * `tools/psb_cd_nest_tools_mod.F90` — descriptor tools (`psb_cd_nest_compose`, ...) * `tools/psb_d_nest_tools_mod.F90` — block tools (`psb_d_nest_rect_block`, ...) * `tools/psb_d_nest_builder_mod.F90` — `psb_d_nest_matrix` frontend (init/ins/asb) diff --git a/test/nested/psb_d_nest_cg_test.F90 b/test/nested/psb_d_nest_cg_test.F90 index d49098fae..1613aeb49 100644 --- a/test/nested/psb_d_nest_cg_test.F90 +++ b/test/nested/psb_d_nest_cg_test.F90 @@ -54,6 +54,13 @@ ! Laplacian up to a permutation: SPD but with lambda_min ~ (pi/m)^2 => cond ~ ! N^2 => CG performs O(N) iterations that GROW with N. ! +! The system is solved under every stock PSBLAS preconditioner: NONE (operator +! only), DIAG (exercises the nested get_diag) and BJAC/ILU(0) (exercises the +! nested csgetrow through the ILU factorization). The test passes if every +! solve converges to the exact solution and DIAG reproduces the NONE iteration +! count exactly (with the constant diagonal 2I, Jacobi is a pure rescaling, so +! any mismatch would expose a wrong nested get_diag). +! ! Run: ./psb_d_nest_cg_test ; mpirun -np 4 ./psb_d_nest_cg_test ! program psb_d_nest_cg_test @@ -93,6 +100,12 @@ program psb_d_nest_cg_test integer(psb_ipk_) :: max_iter, trace_level, n_iter, stop_criterion real(psb_dpk_), parameter :: solution_tol = 1.0e-6_psb_dpk_ + ! stock preconditioners to exercise on the nested operator + integer(psb_ipk_), parameter :: n_precs = 3 + character(len=6), parameter :: prec_names(n_precs) = ['NONE ', 'DIAG ', 'BJAC '] + integer(psb_ipk_) :: i_prec, iter_none, iter_diag + logical :: all_passed + call psb_init(context) call psb_info(context, my_rank, num_procs) @@ -257,41 +270,61 @@ program psb_d_nest_cg_test norm_x_exact = psb_genrm2(x_exact, desc_global, info) !--------------------------------------------------------------- - ! 9) identity preconditioner (NONE): CG exercises only the operator + ! 9) solve with the standard PSBLAS CG under every stock preconditioner: + ! NONE (operator only), DIAG (exercises the nested get_diag), + ! BJAC/ILU(0) (exercises the nested csgetrow through the ILU build) !--------------------------------------------------------------- - call preconditioner%init(context, 'NONE', info) - call preconditioner%build(global_operator, desc_global, info) - if (info /= psb_success_) then - if (my_rank == 0) write(*,*) 'FAIL: preconditioner%build info=', info - goto 9999 - end if + if (my_rank == 0) write(*,'(a,i0,a,i0)') ' np=', num_procs, ' N(global)=', 2*field_size + all_passed = .true. + iter_none = 0 + iter_diag = -1 + do i_prec = 1, n_precs + call preconditioner%init(context, trim(prec_names(i_prec)), info) + call preconditioner%build(global_operator, desc_global, info) + if (info /= psb_success_) then + if (my_rank == 0) write(*,*) 'FAIL: prec%build (', trim(prec_names(i_prec)), ') info=', info + all_passed = .false.; exit + end if - !--------------------------------------------------------------- - ! 10) solve with the standard PSBLAS CG - !--------------------------------------------------------------- - call psb_geall(x_solution, desc_global, info); call psb_geasb(x_solution, desc_global, info) - call psb_krylov('CG', global_operator, preconditioner, rhs, x_solution, stop_tol, desc_global, info, & - & itmax=max_iter, iter=n_iter, err=final_residual, itrace=trace_level, istop=stop_criterion) - if (info /= psb_success_) then - if (my_rank == 0) write(*,*) 'FAIL: psb_krylov(CG) info=', info - goto 9999 - end if + call psb_geall(x_solution, desc_global, info); call psb_geasb(x_solution, desc_global, info) + call psb_krylov('CG', global_operator, preconditioner, rhs, x_solution, stop_tol, desc_global, info, & + & itmax=max_iter, iter=n_iter, err=final_residual, itrace=trace_level, istop=stop_criterion) + if (info /= psb_success_) then + if (my_rank == 0) write(*,*) 'FAIL: psb_krylov(CG,', trim(prec_names(i_prec)), ') info=', info + all_passed = .false.; exit + end if + + ! solution error: || x_solution - x_exact || / || x_exact || + call psb_geaxpby(-done, x_exact, done, x_solution, desc_global, info) + solution_error = psb_genrm2(x_solution, desc_global, info) / norm_x_exact + + if (my_rank == 0) then + write(*,'(a,a6,a,i6,a,es12.4,a,es12.4)') ' prec=', prec_names(i_prec), & + & ' CG iterations=', n_iter, ' residual=', final_residual, & + & ' ||x-x_ex||/||x_ex||=', solution_error + end if + if ((n_iter >= max_iter) .or. (solution_error > solution_tol)) all_passed = .false. + if (trim(prec_names(i_prec)) == 'NONE') iter_none = n_iter + if (trim(prec_names(i_prec)) == 'DIAG') iter_diag = n_iter + + call psb_gefree(x_solution, desc_global, info) + call preconditioner%free(info) + end do !--------------------------------------------------------------- - ! 11) solution error: || x_solution - x_exact || / || x_exact || + ! 10) verdict: every preconditioner converges to the right solution. + ! With the constant diagonal 2I, Jacobi is a pure rescaling, so DIAG + ! must reproduce the unpreconditioned iteration count EXACTLY: this is + ! a bit-precise check that the nested get_diag returns exact values. + ! (BJAC/ILU(0) on a red-black ordering drops all fill, so it cannot + ! reduce the iteration count of this exact-convergence regime; its + ! much smaller final residual shows the ILU factors are consistent.) !--------------------------------------------------------------- - call psb_geaxpby(-done, x_exact, done, x_solution, desc_global, info) ! x_solution <- x_solution - x_exact - solution_error = psb_genrm2(x_solution, desc_global, info) / norm_x_exact - if (my_rank == 0) then - write(*,'(a,i0,a,i0)') ' np=', num_procs, ' N(global)=', 2*field_size - write(*,'(a,i0)') ' CG iterations = ', n_iter - write(*,'(a,es12.4)') ' CG relative residual = ', final_residual - write(*,'(a,es12.4)') ' ||x - x_exact||/||x_ex|| = ', solution_error - if ((n_iter < max_iter) .and. (solution_error <= solution_tol)) then - write(*,*) '[PASS] CG converges on the global nested operator' + if (all_passed .and. (iter_diag == iter_none)) then + write(*,*) '[PASS] CG converges on the global nested operator with NONE/DIAG/BJAC' else - write(*,*) '[FAIL] CG does not converge / wrong solution (tol ', solution_tol, ')' + write(*,*) '[FAIL] preconditioned CG on the nested operator (tol ', solution_tol, ')' end if end if diff --git a/test/nested/psb_d_nest_glob_test.F90 b/test/nested/psb_d_nest_glob_test.F90 index 80054d09e..2b657e180 100644 --- a/test/nested/psb_d_nest_glob_test.F90 +++ b/test/nested/psb_d_nest_glob_test.F90 @@ -53,9 +53,12 @@ program psb_d_nest_glob_test use psb_base_mod use psb_util_mod - use psb_d_nest_mod + use psb_d_nest_mod + use psb_d_hll_mat_mod, only : psb_d_hll_sparse_mat ! psb_ext format for the blocks implicit none + type(psb_d_hll_sparse_mat) :: hll_mold + type(psb_ctxt_type) :: context integer(psb_ipk_) :: my_rank, num_procs, info, i_local_row integer(psb_ipk_) :: entry_idx, field1_local_rows, field2_local_rows @@ -141,7 +144,9 @@ program psb_d_nest_glob_test call nested_matrix%ins(2, 1, entry_idx, entry_rows, entry_cols, entry_vals, info) deallocate(entry_rows, entry_cols, entry_vals) - call nested_matrix%asb(info) + ! assemble with the blocks stored in HLL (psb_ext format): exercises the + ! configurable block storage and the format-agnostic nested matvec + call nested_matrix%asb(info, mold=hll_mold) if (info /= psb_success_) then if (my_rank==0) write(*,*) 'FAIL: nested_matrix%asb info=', info; goto 9999 end if diff --git a/test/nested/psb_d_nest_rect_test.F90 b/test/nested/psb_d_nest_rect_test.F90 index af88d2566..58f0fc29f 100644 --- a/test/nested/psb_d_nest_rect_test.F90 +++ b/test/nested/psb_d_nest_rect_test.F90 @@ -143,7 +143,9 @@ program psb_d_nest_rect_test call nested_matrix%ins(2, 1, entry_idx, entry_rows, entry_cols, entry_vals, info) deallocate(entry_rows, entry_cols, entry_vals) - call nested_matrix%asb(info) + ! assemble with the blocks stored in CSC instead of the CSR default: + ! exercises the configurable block storage on a base format + call nested_matrix%asb(info, type='CSC') if (info /= psb_success_) then if (my_rank==0) write(*,*) 'FAIL: nested_matrix%asb info=', info; goto 9999 end if