From 1af76c067c8cd65f3a77d33459d0e1fc11c59c26 Mon Sep 17 00:00:00 2001 From: wlthr Date: Sat, 26 Aug 2023 10:53:00 +0200 Subject: [PATCH 01/11] added parallel double precision spspmm implementations --- base/modules/Makefile | 6 +- base/modules/serial/psb_d_csr_mat_mod.f90 | 8 + base/modules/serial/psb_d_mat_mod.F90 | 5 +- base/modules/serial/psb_d_rb_idx_tree_mod.f90 | 91 ++++ base/modules/serial/psb_rb_idx_tree_mod.f90 | 5 + base/serial/impl/Makefile | 2 +- base/serial/impl/psb_d_csr_impl.F90 | 424 +++++++++++++++++- base/serial/impl/psb_d_rb_idx_tree_impl.F90 | 284 ++++++++++++ util/Makefile | 2 +- 9 files changed, 815 insertions(+), 12 deletions(-) create mode 100644 base/modules/serial/psb_d_rb_idx_tree_mod.f90 create mode 100644 base/modules/serial/psb_rb_idx_tree_mod.f90 create mode 100644 base/serial/impl/psb_d_rb_idx_tree_impl.F90 diff --git a/base/modules/Makefile b/base/modules/Makefile index 399add1f..d14a24ee 100644 --- a/base/modules/Makefile +++ b/base/modules/Makefile @@ -64,7 +64,8 @@ SERIAL_MODS=serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o \ serial/psb_s_base_mat_mod.o serial/psb_s_csr_mat_mod.o serial/psb_s_csc_mat_mod.o serial/psb_s_mat_mod.o \ serial/psb_d_base_mat_mod.o serial/psb_d_csr_mat_mod.o serial/psb_d_csc_mat_mod.o serial/psb_d_mat_mod.o \ serial/psb_c_base_mat_mod.o serial/psb_c_csr_mat_mod.o serial/psb_c_csc_mat_mod.o serial/psb_c_mat_mod.o \ - serial/psb_z_base_mat_mod.o serial/psb_z_csr_mat_mod.o serial/psb_z_csc_mat_mod.o serial/psb_z_mat_mod.o + serial/psb_z_base_mat_mod.o serial/psb_z_csr_mat_mod.o serial/psb_z_csc_mat_mod.o serial/psb_z_mat_mod.o \ + serial/psb_d_rb_idx_tree_mod.o serial/psb_rb_idx_tree_mod.o #\ # serial/psb_ls_csr_mat_mod.o serial/psb_ld_csr_mat_mod.o serial/psb_lc_csr_mat_mod.o serial/psb_lz_csr_mat_mod.o #\ @@ -270,6 +271,9 @@ serial/psb_z_vect_mod.o: serial/psb_z_base_vect_mod.o serial/psb_i_vect_mod.o serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o serial/psb_c_serial_mod.o serial/psb_z_serial_mod.o: serial/psb_mat_mod.o auxil/psb_string_mod.o auxil/psb_sort_mod.o auxil/psi_serial_mod.o serial/psb_vect_mod.o: serial/psb_i_vect_mod.o serial/psb_l_vect_mod.o serial/psb_d_vect_mod.o serial/psb_s_vect_mod.o serial/psb_c_vect_mod.o serial/psb_z_vect_mod.o +serial/psb_d_rb_idx_tree_mod.o: serial/psb_d_csr_mat_mod.o +serial/psb_rb_idx_tree_mod.o: serial/psb_d_rb_idx_tree_mod.o + error.o psb_realloc_mod.o: psb_error_mod.o psb_error_impl.o: psb_penv_mod.o psb_timers_mod.o: psb_penv_mod.o psb_const_mod.o psb_realloc_mod.o psb_error_mod.o diff --git a/base/modules/serial/psb_d_csr_mat_mod.f90 b/base/modules/serial/psb_d_csr_mat_mod.f90 index d0aa622b..74cf8fff 100644 --- a/base/modules/serial/psb_d_csr_mat_mod.f90 +++ b/base/modules/serial/psb_d_csr_mat_mod.f90 @@ -46,6 +46,14 @@ module psb_d_csr_mat_mod use psb_d_base_mat_mod + integer(psb_ipk_), parameter :: spspmm_serial = 0 + integer(psb_ipk_), parameter :: spspmm_omp_gustavson = 1 + integer(psb_ipk_), parameter :: spspmm_omp_gustavson_1d = 2 + integer(psb_ipk_), parameter :: spspmm_serial_rb_tree = 3 + integer(psb_ipk_), parameter :: spspmm_omp_rb_tree = 4 + integer(psb_ipk_), parameter :: spspmm_omp_two_pass = 5 + integer(psb_ipk_) :: spspmm_impl = spspmm_serial + !> \namespace psb_base_mod \class psb_d_csr_sparse_mat !! \extends psb_d_base_mat_mod::psb_d_base_sparse_mat !! diff --git a/base/modules/serial/psb_d_mat_mod.F90 b/base/modules/serial/psb_d_mat_mod.F90 index 8f967ce1..9e613b29 100644 --- a/base/modules/serial/psb_d_mat_mod.F90 +++ b/base/modules/serial/psb_d_mat_mod.F90 @@ -79,7 +79,10 @@ module psb_d_mat_mod use psb_d_base_mat_mod - use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat, psb_ld_csr_sparse_mat + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat, psb_ld_csr_sparse_mat, & + spspmm_impl, spspmm_serial, spspmm_omp_gustavson, & + spspmm_omp_gustavson_1d, spspmm_serial_rb_tree, & + spspmm_omp_rb_tree, spspmm_omp_two_pass use psb_d_csc_mat_mod, only : psb_d_csc_sparse_mat, psb_ld_csc_sparse_mat type :: psb_dspmat_type diff --git a/base/modules/serial/psb_d_rb_idx_tree_mod.f90 b/base/modules/serial/psb_d_rb_idx_tree_mod.f90 new file mode 100644 index 00000000..afdd831d --- /dev/null +++ b/base/modules/serial/psb_d_rb_idx_tree_mod.f90 @@ -0,0 +1,91 @@ +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +module psb_d_rb_idx_tree_mod + use psb_const_mod + implicit none + type :: psb_d_rb_idx_node + integer(psb_ipk_) :: idx + real(psb_dpk_) :: val + type(psb_d_rb_idx_node), pointer :: left, right, parent + logical :: is_red + end type psb_d_rb_idx_node + + type :: psb_d_rb_idx_tree + type(psb_d_rb_idx_node), pointer :: root + integer(psb_ipk_) :: nnz + + contains + + procedure :: insert => psb_d_rb_idx_tree_insert + end type psb_d_rb_idx_tree + + interface psb_rb_idx_tree_insert + subroutine psb_d_rb_idx_tree_insert(this, idx, val) + import :: psb_ipk_, psb_dpk_, psb_d_rb_idx_tree + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + real(psb_dpk_), intent(in) :: val + end subroutine psb_d_rb_idx_tree_insert + end interface psb_rb_idx_tree_insert + + interface psb_rb_idx_tree_scalar_sparse_row_mul + subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + import :: psb_ipk_, psb_dpk_, psb_d_rb_idx_tree + implicit none + type(psb_d_rb_idx_tree), intent(inout) :: tree + real(psb_dpk_), intent(in) :: scalar + type(psb_d_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + end subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul + end interface psb_rb_idx_tree_scalar_sparse_row_mul + + interface psb_rb_idx_tree_merge + subroutine psb_d_rb_idx_tree_merge(trees, mat) + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + import :: psb_d_rb_idx_tree + type(psb_d_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_d_csr_sparse_mat), intent(inout) :: mat + end subroutine psb_d_rb_idx_tree_merge + end interface psb_rb_idx_tree_merge + + interface psb_rb_idx_tree_fix_insertion + subroutine psb_d_rb_idx_tree_fix_insertion(this, node) + import :: psb_d_rb_idx_tree, psb_d_rb_idx_node + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_d_rb_idx_tree_fix_insertion + end interface psb_rb_idx_tree_fix_insertion + + interface psb_rb_idx_tree_swap_colors + subroutine psb_d_rb_idx_tree_swap_colors(n1, n2) + import :: psb_d_rb_idx_node + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: n1, n2 + end subroutine psb_d_rb_idx_tree_swap_colors + end interface psb_rb_idx_tree_swap_colors + + interface psb_rb_idx_tree_rotate_right + subroutine psb_d_rb_idx_tree_rotate_right(node) + import :: psb_d_rb_idx_node + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_d_rb_idx_tree_rotate_right + end interface psb_rb_idx_tree_rotate_right + + interface psb_rb_idx_tree_rotate_left + subroutine psb_d_rb_idx_tree_rotate_left(node) + import :: psb_d_rb_idx_node + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_d_rb_idx_tree_rotate_left + end interface psb_rb_idx_tree_rotate_left +end module psb_d_rb_idx_tree_mod \ No newline at end of file diff --git a/base/modules/serial/psb_rb_idx_tree_mod.f90 b/base/modules/serial/psb_rb_idx_tree_mod.f90 new file mode 100644 index 00000000..e93554f1 --- /dev/null +++ b/base/modules/serial/psb_rb_idx_tree_mod.f90 @@ -0,0 +1,5 @@ +module psb_rb_idx_tree_mod + use psb_const_mod + + use psb_d_rb_idx_tree_mod +end module psb_rb_idx_tree_mod \ No newline at end of file diff --git a/base/serial/impl/Makefile b/base/serial/impl/Makefile index 00088741..ddcdaf96 100644 --- a/base/serial/impl/Makefile +++ b/base/serial/impl/Makefile @@ -10,7 +10,7 @@ BOBJS=psb_base_mat_impl.o \ SOBJS=psb_s_csr_impl.o psb_s_coo_impl.o psb_s_csc_impl.o psb_s_mat_impl.o #\ psb_s_lcoo_impl.o psb_s_lcsr_impl.o -DOBJS=psb_d_csr_impl.o psb_d_coo_impl.o psb_d_csc_impl.o psb_d_mat_impl.o +DOBJS=psb_d_csr_impl.o psb_d_coo_impl.o psb_d_csc_impl.o psb_d_mat_impl.o psb_d_rb_idx_tree_impl.o #\ psb_d_lcoo_impl.o psb_d_lcsr_impl.o COBJS=psb_c_csr_impl.o psb_c_coo_impl.o psb_c_csc_impl.o psb_c_mat_impl.o diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index 10518a2d..1e39b892 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -3349,13 +3349,30 @@ subroutine psb_dcsrspspmm(a,b,c,info) goto 9999 endif - ! Estimate number of nonzeros on output. - nza = a%get_nzeros() - nzb = b%get_nzeros() - nzc = 2*(nza+nzb) - call c%allocate(ma,nb,nzc) - - call csr_spspmm(a,b,c,info) + select case(spspmm_impl) + case (spspmm_serial) + ! Estimate number of nonzeros on output. + nza = a%get_nzeros() + nzb = b%get_nzeros() + nzc = 2*(nza+nzb) + call c%allocate(ma,nb,nzc) + + call csr_spspmm(a,b,c,info) + case (spspmm_omp_gustavson) + call spmm_omp_gustavson(a,b,c,info) + case (spspmm_omp_gustavson_1d) + call spmm_omp_gustavson_1d(a,b,c,info) + case (spspmm_serial_rb_tree) + call spmm_serial_rb_tree(a,b,c,info) + case (spspmm_omp_rb_tree) + call spmm_omp_rb_tree(a,b,c,info) + case (spspmm_omp_two_pass) + call spmm_omp_two_pass(a,b,c,info) + case default + write(psb_err_unit,*) 'Unknown spspmm implementation' + ! push error + goto 9999 + end select call c%set_asb() call c%set_host() @@ -3434,9 +3451,400 @@ contains end do c%irp(ma+1) = nzc + end subroutine csr_spspmm + ! gustavson's algorithm using perfect hashing + ! and OpenMP parallelisation + subroutine spmm_omp_gustavson(a,b,c,info) + use omp_lib - end subroutine csr_spspmm + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + real(psb_dpk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + + allocate(offsets(omp_get_max_threads())) + !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & + !$omp shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw, jj, j, k) + do irw = 1, ma + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(start_idx) + nnz) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(start_idx) + nnz) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson + + subroutine spmm_omp_gustavson_1d(a,b,c,info) + use omp_lib + + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + real(psb_dpk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, & + start_idx, end_idx , blk, blk_size, rwstart,& + rwblk, rwblkrem, nblks + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + allocate(offsets(omp_get_max_threads())) + + nblks = 4 * omp_get_max_threads() + rwblk = (ma / nblks) + rwblkrem = modulo(ma, nblks) + !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw,jj,j,k,rwnz,blk,blk_size,rwstart) + do blk = 0, nblks - 1 + if (blk < rwblkrem) then + blk_size = rwblk + 1 + rwstart = blk * rwblk + blk + 1 + else + blk_size = rwblk + rwstart = blk * rwblk & + + rwblkrem + 1 + end if + do irw = rwstart, rwstart + blk_size - 1 + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(start_idx) + nnz) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(start_idx) + nnz) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson_1d + + subroutine spmm_serial_rb_tree(a,b,c,info) + use psb_rb_idx_tree_mod + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_d_rb_idx_tree), allocatable :: row_accs(:) + + a_m = a%get_nrows() + b_n = b%get_ncols() + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + + info = 0 +end subroutine spmm_serial_rb_tree + +subroutine spmm_omp_rb_tree(a,b,c,info) + use omp_lib + use psb_rb_idx_tree_mod + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_d_rb_idx_tree), allocatable :: row_accs(:) + real(8) :: tic, toc + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + !$omp parallel do schedule(static) + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + info = 0 +end subroutine spmm_omp_rb_tree + +subroutine compute_indices(a, b, c, info) + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer :: full_mat_bound + integer :: row, col, i, j, k, nnz + + full_mat_bound = 0 + !omp parallel do schedule(static) reduction(+:full_mat_bound) + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + j = a%ja(col) + full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j) + end do + end do + !omp end parallel do + + call psb_realloc(a%get_nrows() + 1, c%irp, info) + call psb_realloc(full_mat_bound, c%ja, info) + c%ja = 0 + c%irp(1) = 1 + + nnz = 0 + + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1 + k = 0 + do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i)) + k = k + 1 + end do + if (c%ja(c%irp(row) + k) == 0) then + c%ja(c%irp(row)+k) = b%ja(i) + nnz = nnz + 1 + end if + end do + end do + c%irp(row + 1) = nnz + 1 + call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1)) + end do + + + call psb_realloc(nnz, c%ja, info) + call psb_realloc(nnz, c%val, info) + + c%val = 0 +end subroutine compute_indices + +subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num) + type(psb_d_csr_sparse_mat), intent(inout) :: out_mat + integer(psb_ipk_), intent(in) :: out_row_num + real(psb_dpk_), intent(in) :: scalar + type(psb_d_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: trgt_row_num + + integer(psb_ipk_) :: i, k, row_start, row_end + + row_start = out_mat%irp(out_row_num) + row_end = out_mat%irp(out_row_num + 1) - 1 + + do i = mat%irp(trgt_row_num), mat%irp(trgt_row_num + 1) - 1 + do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1 + if (out_mat%ja(k) == mat%ja(i)) then + out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i) + exit + end if + end do + end do + +end subroutine direct_scalar_sparse_row_mul + +subroutine spmm_omp_two_pass(a,b,c,info) + use omp_lib + + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n, row, col + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + call compute_indices(a, b, c, info) + + !$omp parallel do schedule(static) + do row = 1, a_m + do col = a%irp(row), a%irp(row + 1) - 1 + call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do +end subroutine spmm_omp_two_pass end subroutine psb_dcsrspspmm diff --git a/base/serial/impl/psb_d_rb_idx_tree_impl.F90 b/base/serial/impl/psb_d_rb_idx_tree_impl.F90 new file mode 100644 index 00000000..8495a813 --- /dev/null +++ b/base/serial/impl/psb_d_rb_idx_tree_impl.F90 @@ -0,0 +1,284 @@ +subroutine psb_d_rb_idx_tree_insert(this, idx, val) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_insert + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + real(psb_dpk_), intent(in) :: val + + character(len=22) :: name + type(psb_d_rb_idx_node), pointer :: new_node + type(psb_d_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_insert' + + allocate(new_node) + new_node%idx = idx + new_node%val = val + nullify(new_node%left) + nullify(new_node%right) + nullify(new_node%parent) + new_node%is_red = .true. + + + if (.not. associated(this%root)) then + this%root => new_node + this%nnz = 1 + new_node%is_red = .false. + return + end if + + current => this%root + + do while (associated(current)) + previous => current + + if (idx == current%idx) then + current%val = current%val + val + deallocate(new_node) + return + else if (idx < current%idx) then + current => current%left + else + + current => current%right + end if + end do + + if (idx < previous%idx) then + new_node%parent => previous + previous%left => new_node + else + new_node%parent => previous + previous%right => new_node + end if + + call psb_d_rb_idx_tree_fix_insertion(this, new_node) + + this%nnz = this%nnz + 1 +end subroutine psb_d_rb_idx_tree_insert + +subroutine psb_d_rb_idx_tree_fix_insertion(this, node) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_fix_insertion + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + + character(len=29) :: name + type(psb_d_rb_idx_node), pointer :: current, parent, grand_parent, uncle + name = 'psb_rb_idx_tree_fix_insertion' + + current => node + parent => current%parent + do while(associated(parent) .and. parent%is_red) + ! grand parent exist because root can't be red + grand_parent => parent%parent + if (parent%idx < grand_parent%idx) then + uncle => grand_parent%right + else + uncle => grand_parent%left + end if + + if (associated(uncle) .and. uncle%is_red) then + parent%is_red = .false. + uncle%is_red = .false. + grand_parent%is_red = .true. + current => grand_parent + parent => current%parent + + ! Left-Left case + else if (current%idx < parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_d_rb_idx_tree_rotate_right(grand_parent) + call psb_d_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Left-Right case + else if (current%idx > parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_d_rb_idx_tree_rotate_left(parent) + call psb_d_rb_idx_tree_rotate_right(grand_parent) + call psb_d_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + ! Right-Right case + else if (current%idx > parent%idx .and. & + parent%idx > grand_parent%idx) then + call psb_d_rb_idx_tree_rotate_left(grand_parent) + call psb_d_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Right-Left case + else + call psb_d_rb_idx_tree_rotate_right(parent) + call psb_d_rb_idx_tree_rotate_left(grand_parent) + call psb_d_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + end if + end do + + this%root%is_red = .false. +end subroutine psb_d_rb_idx_tree_fix_insertion + +subroutine psb_d_rb_idx_tree_swap_colors(n1, n2) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_swap_colors + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: n1, n2 + + character(len=27) :: name + logical :: tmp + name='psb_rb_idx_tree_swap_colors' + + tmp = n1%is_red + n1%is_red = n2%is_red + n2%is_red = tmp +end subroutine psb_d_rb_idx_tree_swap_colors + +subroutine psb_d_rb_idx_tree_rotate_right(node) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_rotate_right + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + + character(len=28) :: name + type(psb_d_rb_idx_node), pointer :: l, lr + name='psb_rb_idx_tree_rotate_right' + + if (.not. associated(node%left)) return + + l => node%left + lr => l%right + node%left => lr + + if (associated(lr)) lr%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => l + else + node%parent%right => l + end if + end if + + l%parent => node%parent + node%parent => l + + l%right => node +end subroutine psb_d_rb_idx_tree_rotate_right + +subroutine psb_d_rb_idx_tree_rotate_left(node) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_rotate_left + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + + character(len=27) :: name + type(psb_d_rb_idx_node), pointer :: r, rl + name='psb_rb_idx_tree_rotate_left' + + if (.not. associated(node%right)) return + + r => node%right + rl => r%left + node%right => rl + + if (associated(rl)) rl%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => r + else + node%parent%right => r + end if + end if + + r%parent => node%parent + node%parent => r + + r%left => node +end subroutine psb_d_rb_idx_tree_rotate_left + +subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_scalar_sparse_row_mul + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + implicit none + type(psb_d_rb_idx_tree), intent(inout) :: tree + real(psb_dpk_), intent(in) :: scalar + type(psb_d_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + + character(len=37) :: name + integer(psb_ipk_) :: i + name='psb_rb_idx_tree_scalar_sparse_row_mul' + + do i = mat%irp(row_num), mat%irp(row_num + 1) - 1 + call tree%insert(mat%ja(i),scalar * mat%val(i)) + end do + +end subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul + +subroutine psb_d_rb_idx_tree_merge(trees, mat) +#if defined(OPENMP) + use omp_lib +#endif + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_merge + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + implicit none + type(psb_d_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_d_csr_sparse_mat), intent(inout) :: mat + + character(len=21) :: name + integer(psb_ipk_) :: i, j, rows, info, nnz + type(psb_d_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_merge' + + rows = size(trees) + + mat%irp(1) = 1 + + do i=1, rows + mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz + end do + + nnz = mat%irp(rows + 1) + call psb_realloc(nnz, mat%val, info) + call psb_realloc(nnz, mat%ja, info) + +#if defined(OPENMP) + !$omp parallel do schedule(static), private(current, previous, j) +#endif + do i = 1, size(trees) + j = 0 + current => trees(i)%root + do while(associated(current)) + ! go to the left-most node + do while(associated(current%left)) + current => current%left + end do + mat%val(j + mat%irp(i)) = current%val + mat%ja(j + mat%irp(i)) = current%idx + j = j + 1 + + previous => current + if (associated(current%right)) then + if (associated(current%parent)) then + current%parent%left => current%right + end if + current%right%parent => current%parent + current => current%right + else + current => current%parent + if (associated(current)) nullify(current%left) + end if + deallocate(previous) + end do + end do +#if defined(OPENMP) + !$omp end parallel do +#endif +end subroutine psb_d_rb_idx_tree_merge \ No newline at end of file diff --git a/util/Makefile b/util/Makefile index 3f087cbd..9b70855f 100644 --- a/util/Makefile +++ b/util/Makefile @@ -11,7 +11,7 @@ BASEOBJS= psb_blockpart_mod.o psb_metispart_mod.o psb_partidx_mod.o \ psb_hbio_mod.o psb_mmio_mod.o psb_mat_dist_mod.o \ psb_s_mat_dist_mod.o psb_d_mat_dist_mod.o psb_c_mat_dist_mod.o psb_z_mat_dist_mod.o \ psb_renum_mod.o psb_gps_mod.o \ - psb_s_renum_mod.o psb_d_renum_mod.o psb_c_renum_mod.o psb_z_renum_mod.o + psb_s_renum_mod.o psb_d_renum_mod.o psb_c_renum_mod.o psb_z_renum_mod.o IMPLOBJS= psb_s_hbio_impl.o psb_d_hbio_impl.o \ psb_c_hbio_impl.o psb_z_hbio_impl.o \ psb_s_mmio_impl.o psb_d_mmio_impl.o \ From 0fe95c3c76881ecd03b1e2170fd1799492115e0d Mon Sep 17 00:00:00 2001 From: wlthr Date: Mon, 28 Aug 2023 10:37:41 +0200 Subject: [PATCH 02/11] added use statement --- base/modules/Makefile | 2 +- base/serial/impl/psb_d_rb_idx_tree_impl.F90 | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/base/modules/Makefile b/base/modules/Makefile index d14a24ee..0d90b9f1 100644 --- a/base/modules/Makefile +++ b/base/modules/Makefile @@ -271,7 +271,7 @@ serial/psb_z_vect_mod.o: serial/psb_z_base_vect_mod.o serial/psb_i_vect_mod.o serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o serial/psb_c_serial_mod.o serial/psb_z_serial_mod.o: serial/psb_mat_mod.o auxil/psb_string_mod.o auxil/psb_sort_mod.o auxil/psi_serial_mod.o serial/psb_vect_mod.o: serial/psb_i_vect_mod.o serial/psb_l_vect_mod.o serial/psb_d_vect_mod.o serial/psb_s_vect_mod.o serial/psb_c_vect_mod.o serial/psb_z_vect_mod.o -serial/psb_d_rb_idx_tree_mod.o: serial/psb_d_csr_mat_mod.o +serial/psb_d_rb_idx_tree_mod.o: serial/psb_d_csr_mat_mod.o psb_realloc_mod.o serial/psb_rb_idx_tree_mod.o: serial/psb_d_rb_idx_tree_mod.o error.o psb_realloc_mod.o: psb_error_mod.o diff --git a/base/serial/impl/psb_d_rb_idx_tree_impl.F90 b/base/serial/impl/psb_d_rb_idx_tree_impl.F90 index 8495a813..a6bdc86f 100644 --- a/base/serial/impl/psb_d_rb_idx_tree_impl.F90 +++ b/base/serial/impl/psb_d_rb_idx_tree_impl.F90 @@ -226,6 +226,7 @@ subroutine psb_d_rb_idx_tree_merge(trees, mat) #if defined(OPENMP) use omp_lib #endif + use psb_realloc_mod use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_merge use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat implicit none From 0185b79b2a0d1decba51d6b75c38c397b7c516cd Mon Sep 17 00:00:00 2001 From: wlthr Date: Mon, 28 Aug 2023 11:04:00 +0200 Subject: [PATCH 03/11] added setter for d_csr_spspmm implementation --- base/modules/serial/psb_d_csr_mat_mod.f90 | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/base/modules/serial/psb_d_csr_mat_mod.f90 b/base/modules/serial/psb_d_csr_mat_mod.f90 index 74cf8fff..cabedb36 100644 --- a/base/modules/serial/psb_d_csr_mat_mod.f90 +++ b/base/modules/serial/psb_d_csr_mat_mod.f90 @@ -1301,4 +1301,15 @@ contains end subroutine ld_csr_free + subroutine set_d_csr_spspmm_impl(impl_id) + integer(psb_ipk_), intent(in) :: impl_id + + if (impl_id < 0 .or. impl_id > 5) then + write (*,*) "Invalid implementation id, impl id set to serial" + spspmm_impl = spspmm_serial + else + spspmm_impl = impl_id + end if + end subroutine + end module psb_d_csr_mat_mod From 2322a9ce6188915d3ffa9377b5c08a8151e42b06 Mon Sep 17 00:00:00 2001 From: wlthr Date: Mon, 28 Aug 2023 11:06:30 +0200 Subject: [PATCH 04/11] using end_idx to copy data from threads in gustavson and gustavson_1d --- base/serial/impl/psb_d_csr_impl.F90 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index 880bebf8..16f5e26e 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -3892,8 +3892,8 @@ contains call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - c%val(c%irp(start_idx):c%irp(start_idx) + nnz) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(start_idx) + nnz) = col_inds(1:nnz) + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) !$omp end parallel end subroutine spmm_omp_gustavson @@ -4011,8 +4011,8 @@ contains call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - c%val(c%irp(start_idx):c%irp(start_idx) + nnz) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(start_idx) + nnz) = col_inds(1:nnz) + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) !$omp end parallel end subroutine spmm_omp_gustavson_1d From 7b45994b70a67f31f4569cb0e751fc0c4b65288d Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Tue, 29 Aug 2023 05:07:55 -0400 Subject: [PATCH 05/11] Setter/getter for SPSPMM algorithm in base_mat_mod --- base/modules/serial/psb_base_mat_mod.F90 | 28 +++++++++++++++++++++++ base/modules/serial/psb_d_csr_mat_mod.f90 | 24 ++----------------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/base/modules/serial/psb_base_mat_mod.F90 b/base/modules/serial/psb_base_mat_mod.F90 index af4e4c6f..07bfbd50 100644 --- a/base/modules/serial/psb_base_mat_mod.F90 +++ b/base/modules/serial/psb_base_mat_mod.F90 @@ -73,6 +73,16 @@ module psb_base_mat_mod use psb_const_mod use psi_serial_mod + ! Algs for SPSPMM + integer(psb_ipk_), parameter :: spspmm_serial = 0 + integer(psb_ipk_), parameter :: spspmm_omp_gustavson = 1 + integer(psb_ipk_), parameter :: spspmm_omp_gustavson_1d = 2 + integer(psb_ipk_), parameter :: spspmm_serial_rb_tree = 3 + integer(psb_ipk_), parameter :: spspmm_omp_rb_tree = 4 + integer(psb_ipk_), parameter :: spspmm_omp_two_pass = 5 + integer(psb_ipk_), save :: spspmm_impl = spspmm_serial + + ! !> \namespace psb_base_mod \class psb_base_sparse_mat !! The basic data about your matrix. @@ -1904,4 +1914,22 @@ contains end subroutine psb_base_from_lbase + function psb_get_spspmm_impl() result(impl_id) + integer(psb_ipk_) :: impl_id + impl_id = spspmm_impl + end function psb_get_spspmm_impl + + subroutine psb_set_spspmm_impl(impl_id) + integer(psb_ipk_), intent(in) :: impl_id + + select case(impl_id) + case (spspmm_serial,spspmm_omp_gustavson,spspmm_omp_gustavson_1d,& + & spspmm_serial_rb_tree,spspmm_omp_rb_tree, spspmm_omp_two_pass) + spspmm_impl = impl_id + case default + + write (*,*) "Invalid implementation id",impl_id + end select + end subroutine psb_set_spspmm_impl + end module psb_base_mat_mod diff --git a/base/modules/serial/psb_d_csr_mat_mod.f90 b/base/modules/serial/psb_d_csr_mat_mod.f90 index cabedb36..21036d5f 100644 --- a/base/modules/serial/psb_d_csr_mat_mod.f90 +++ b/base/modules/serial/psb_d_csr_mat_mod.f90 @@ -45,15 +45,6 @@ module psb_d_csr_mat_mod use psb_d_base_mat_mod - - integer(psb_ipk_), parameter :: spspmm_serial = 0 - integer(psb_ipk_), parameter :: spspmm_omp_gustavson = 1 - integer(psb_ipk_), parameter :: spspmm_omp_gustavson_1d = 2 - integer(psb_ipk_), parameter :: spspmm_serial_rb_tree = 3 - integer(psb_ipk_), parameter :: spspmm_omp_rb_tree = 4 - integer(psb_ipk_), parameter :: spspmm_omp_two_pass = 5 - integer(psb_ipk_) :: spspmm_impl = spspmm_serial - !> \namespace psb_base_mod \class psb_d_csr_sparse_mat !! \extends psb_d_base_mat_mod::psb_d_base_sparse_mat !! @@ -586,8 +577,8 @@ module psb_d_csr_mat_mod integer(psb_ipk_), intent(out) :: info end subroutine psb_d_csr_scals end interface - - !> \namespace psb_base_mod \class psb_ld_csr_sparse_mat + + !> \namespace psb_base_mod \class psb_ld_csr_sparse_mat !! \extends psb_ld_base_mat_mod::psb_ld_base_sparse_mat !! !! psb_ld_csr_sparse_mat type and the related methods. @@ -1301,15 +1292,4 @@ contains end subroutine ld_csr_free - subroutine set_d_csr_spspmm_impl(impl_id) - integer(psb_ipk_), intent(in) :: impl_id - - if (impl_id < 0 .or. impl_id > 5) then - write (*,*) "Invalid implementation id, impl id set to serial" - spspmm_impl = spspmm_serial - else - spspmm_impl = impl_id - end if - end subroutine - end module psb_d_csr_mat_mod From d0cacda995a98e4b7d3e57ca404421d03d2cc192 Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Tue, 29 Aug 2023 16:19:56 +0200 Subject: [PATCH 06/11] Moved various modules related to RB around, into auxil, update Makefile. --- README.md | 1 + base/modules/Makefile | 18 +- base/modules/auxil/psb_d_rb_idx_tree_mod.f90 | 128 +++++ base/modules/auxil/psb_rb_idx_tree_mod.f90 | 49 ++ base/modules/auxil/psb_sort_mod.f90 | 3 +- .../sort => modules/auxil}/psi_acx_mod.f90 | 0 .../sort => modules/auxil}/psi_alcx_mod.f90 | 0 .../sort => modules/auxil}/psi_lcx_mod.f90 | 0 base/modules/serial/psb_d_rb_idx_tree_mod.f90 | 91 ---- base/modules/serial/psb_rb_idx_tree_mod.f90 | 5 - base/serial/impl/psb_d_rb_idx_tree_impl.F90 | 504 ++++++++++-------- base/serial/sort/Makefile | 4 +- 12 files changed, 467 insertions(+), 336 deletions(-) create mode 100644 base/modules/auxil/psb_d_rb_idx_tree_mod.f90 create mode 100644 base/modules/auxil/psb_rb_idx_tree_mod.f90 rename base/{serial/sort => modules/auxil}/psi_acx_mod.f90 (100%) rename base/{serial/sort => modules/auxil}/psi_alcx_mod.f90 (100%) rename base/{serial/sort => modules/auxil}/psi_lcx_mod.f90 (100%) delete mode 100644 base/modules/serial/psb_d_rb_idx_tree_mod.f90 delete mode 100644 base/modules/serial/psb_rb_idx_tree_mod.f90 diff --git a/README.md b/README.md index 86bb2805..a9813f5e 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,7 @@ Salvatore Filippone Contributors (roughly reverse cronological order): +Dimitri Walther Andea Di Iorio Stefano Petrilli Soren Rasmussen diff --git a/base/modules/Makefile b/base/modules/Makefile index 0d90b9f1..8fa63f92 100644 --- a/base/modules/Makefile +++ b/base/modules/Makefile @@ -60,12 +60,12 @@ SERIAL_MODS=serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o \ auxil/psb_d_hsort_x_mod.o \ auxil/psb_c_hsort_x_mod.o \ auxil/psb_z_hsort_x_mod.o \ + auxil/psb_d_rb_idx_tree_mod.o auxil/psb_rb_idx_tree_mod.o \ serial/psb_base_mat_mod.o serial/psb_mat_mod.o\ serial/psb_s_base_mat_mod.o serial/psb_s_csr_mat_mod.o serial/psb_s_csc_mat_mod.o serial/psb_s_mat_mod.o \ serial/psb_d_base_mat_mod.o serial/psb_d_csr_mat_mod.o serial/psb_d_csc_mat_mod.o serial/psb_d_mat_mod.o \ serial/psb_c_base_mat_mod.o serial/psb_c_csr_mat_mod.o serial/psb_c_csc_mat_mod.o serial/psb_c_mat_mod.o \ - serial/psb_z_base_mat_mod.o serial/psb_z_csr_mat_mod.o serial/psb_z_csc_mat_mod.o serial/psb_z_mat_mod.o \ - serial/psb_d_rb_idx_tree_mod.o serial/psb_rb_idx_tree_mod.o + serial/psb_z_base_mat_mod.o serial/psb_z_csr_mat_mod.o serial/psb_z_csc_mat_mod.o serial/psb_z_mat_mod.o #\ # serial/psb_ls_csr_mat_mod.o serial/psb_ld_csr_mat_mod.o serial/psb_lc_csr_mat_mod.o serial/psb_lz_csr_mat_mod.o #\ @@ -166,6 +166,7 @@ penv/psi_d_collective_mod.o penv/psi_c_collective_mod.o penv/psi_z_collective_m penv/psi_d_p2p_mod.o penv/psi_c_p2p_mod.o penv/psi_z_p2p_mod.o +auxil/psi_acx_mod.o auxil/psi_alcx_mod.o auxil/psi_lcx_mod.o \ auxil/psb_string_mod.o auxil/psb_m_realloc_mod.o auxil/psb_e_realloc_mod.o auxil/psb_s_realloc_mod.o \ auxil/psb_d_realloc_mod.o auxil/psb_c_realloc_mod.o auxil/psb_z_realloc_mod.o \ desc/psb_desc_const_mod.o psi_penv_mod.o: psb_const_mod.o @@ -175,6 +176,7 @@ desc/psb_indx_map_mod.o desc/psb_hash_mod.o: psb_realloc_mod.o psb_const_mod.o auxil/psb_i_sort_mod.o auxil/psb_s_sort_mod.o auxil/psb_d_sort_mod.o auxil/psb_c_sort_mod.o auxil/psb_z_sort_mod.o \ auxil/psb_ip_reord_mod.o auxil/psi_serial_mod.o auxil/psb_sort_mod.o: $(BASIC_MODS) + auxil/psb_sort_mod.o: auxil/psb_m_hsort_mod.o auxil/psb_m_isort_mod.o \ auxil/psb_m_msort_mod.o auxil/psb_m_qsort_mod.o \ auxil/psb_e_hsort_mod.o auxil/psb_e_isort_mod.o \ @@ -193,7 +195,8 @@ auxil/psb_sort_mod.o: auxil/psb_m_hsort_mod.o auxil/psb_m_isort_mod.o \ auxil/psb_d_hsort_x_mod.o \ auxil/psb_c_hsort_x_mod.o \ auxil/psb_z_hsort_x_mod.o \ - auxil/psb_ip_reord_mod.o auxil/psi_serial_mod.o + auxil/psb_ip_reord_mod.o \ + auxil/psi_serial_mod.o auxil/psb_m_hsort_mod.o auxil/psb_m_isort_mod.o \ auxil/psb_m_msort_mod.o auxil/psb_m_qsort_mod.o \ @@ -226,7 +229,10 @@ auxil/psb_c_hsort_x_mod.o: auxil/psb_c_hsort_mod.o auxil/psb_z_hsort_x_mod.o: auxil/psb_z_hsort_mod.o auxil/psi_serial_mod.o: auxil/psi_m_serial_mod.o auxil/psi_e_serial_mod.o \ - auxil/psi_s_serial_mod.o auxil/psi_d_serial_mod.o auxil/psi_c_serial_mod.o auxil/psi_z_serial_mod.o + auxil/psi_s_serial_mod.o auxil/psi_d_serial_mod.o\ + auxil/psi_c_serial_mod.o auxil/psi_z_serial_mod.o \ + auxil/psi_acx_mod.o auxil/psi_alcx_mod.o auxil/psi_lcx_mod.o + auxil/psi_m_serial_mod.o auxil/psi_e_serial_mod.o auxil/psi_s_serial_mod.o auxil/psi_d_serial_mod.o auxil/psi_c_serial_mod.o auxil/psi_z_serial_mod.o: psb_const_mod.o auxil/psb_ip_reord_mod.o: auxil/psb_m_ip_reord_mod.o auxil/psb_e_ip_reord_mod.o \ @@ -271,8 +277,8 @@ serial/psb_z_vect_mod.o: serial/psb_z_base_vect_mod.o serial/psb_i_vect_mod.o serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o serial/psb_c_serial_mod.o serial/psb_z_serial_mod.o: serial/psb_mat_mod.o auxil/psb_string_mod.o auxil/psb_sort_mod.o auxil/psi_serial_mod.o serial/psb_vect_mod.o: serial/psb_i_vect_mod.o serial/psb_l_vect_mod.o serial/psb_d_vect_mod.o serial/psb_s_vect_mod.o serial/psb_c_vect_mod.o serial/psb_z_vect_mod.o -serial/psb_d_rb_idx_tree_mod.o: serial/psb_d_csr_mat_mod.o psb_realloc_mod.o -serial/psb_rb_idx_tree_mod.o: serial/psb_d_rb_idx_tree_mod.o +auxil/psb_d_rb_idx_tree_mod.o: serial/psb_d_csr_mat_mod.o psb_realloc_mod.o +auxil/psb_rb_idx_tree_mod.o: auxil/psb_d_rb_idx_tree_mod.o error.o psb_realloc_mod.o: psb_error_mod.o psb_error_impl.o: psb_penv_mod.o diff --git a/base/modules/auxil/psb_d_rb_idx_tree_mod.f90 b/base/modules/auxil/psb_d_rb_idx_tree_mod.f90 new file mode 100644 index 00000000..099d1a30 --- /dev/null +++ b/base/modules/auxil/psb_d_rb_idx_tree_mod.f90 @@ -0,0 +1,128 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_d_rb_idx_tree_mod +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +module psb_d_rb_idx_tree_mod + use psb_const_mod + implicit none + + type :: psb_d_rb_idx_node + integer(psb_ipk_) :: idx + real(psb_dpk_) :: val + type(psb_d_rb_idx_node), pointer :: left, right, parent + logical :: is_red + end type psb_d_rb_idx_node + + type :: psb_d_rb_idx_tree + type(psb_d_rb_idx_node), pointer :: root + integer(psb_ipk_) :: nnz + + contains + + procedure :: insert => psb_d_rb_idx_tree_insert + end type psb_d_rb_idx_tree + + interface psb_rb_idx_tree_insert + subroutine psb_d_rb_idx_tree_insert(this, idx, val) + import :: psb_ipk_, psb_dpk_, psb_d_rb_idx_tree + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + real(psb_dpk_), intent(in) :: val + end subroutine psb_d_rb_idx_tree_insert + end interface psb_rb_idx_tree_insert + + interface psb_rb_idx_tree_scalar_sparse_row_mul + subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + import :: psb_ipk_, psb_dpk_, psb_d_rb_idx_tree + implicit none + type(psb_d_rb_idx_tree), intent(inout) :: tree + real(psb_dpk_), intent(in) :: scalar + type(psb_d_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + end subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul + end interface psb_rb_idx_tree_scalar_sparse_row_mul + + interface psb_rb_idx_tree_merge + subroutine psb_d_rb_idx_tree_merge(trees, mat) + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + import :: psb_d_rb_idx_tree + type(psb_d_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_d_csr_sparse_mat), intent(inout) :: mat + end subroutine psb_d_rb_idx_tree_merge + end interface psb_rb_idx_tree_merge + + interface psb_rb_idx_tree_fix_insertion + subroutine psb_d_rb_idx_tree_fix_insertion(this, node) + import :: psb_d_rb_idx_tree, psb_d_rb_idx_node + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_d_rb_idx_tree_fix_insertion + end interface psb_rb_idx_tree_fix_insertion + + interface psb_rb_idx_tree_swap_colors + subroutine psb_d_rb_idx_tree_swap_colors(n1, n2) + import :: psb_d_rb_idx_node + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: n1, n2 + end subroutine psb_d_rb_idx_tree_swap_colors + end interface psb_rb_idx_tree_swap_colors + + interface psb_rb_idx_tree_rotate_right + subroutine psb_d_rb_idx_tree_rotate_right(node) + import :: psb_d_rb_idx_node + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_d_rb_idx_tree_rotate_right + end interface psb_rb_idx_tree_rotate_right + + interface psb_rb_idx_tree_rotate_left + subroutine psb_d_rb_idx_tree_rotate_left(node) + import :: psb_d_rb_idx_node + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_d_rb_idx_tree_rotate_left + end interface psb_rb_idx_tree_rotate_left +end module psb_d_rb_idx_tree_mod diff --git a/base/modules/auxil/psb_rb_idx_tree_mod.f90 b/base/modules/auxil/psb_rb_idx_tree_mod.f90 new file mode 100644 index 00000000..9d151389 --- /dev/null +++ b/base/modules/auxil/psb_rb_idx_tree_mod.f90 @@ -0,0 +1,49 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_rb_idx_tree_mod +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +module psb_rb_idx_tree_mod + use psb_const_mod + + use psb_d_rb_idx_tree_mod +end module psb_rb_idx_tree_mod diff --git a/base/modules/auxil/psb_sort_mod.f90 b/base/modules/auxil/psb_sort_mod.f90 index 0bd7bdfc..e980dc2b 100644 --- a/base/modules/auxil/psb_sort_mod.f90 +++ b/base/modules/auxil/psb_sort_mod.f90 @@ -45,7 +45,8 @@ module psb_sort_mod use psb_const_mod use psb_ip_reord_mod - + use psi_serial_mod + use psb_m_hsort_mod use psb_m_isort_mod use psb_m_msort_mod diff --git a/base/serial/sort/psi_acx_mod.f90 b/base/modules/auxil/psi_acx_mod.f90 similarity index 100% rename from base/serial/sort/psi_acx_mod.f90 rename to base/modules/auxil/psi_acx_mod.f90 diff --git a/base/serial/sort/psi_alcx_mod.f90 b/base/modules/auxil/psi_alcx_mod.f90 similarity index 100% rename from base/serial/sort/psi_alcx_mod.f90 rename to base/modules/auxil/psi_alcx_mod.f90 diff --git a/base/serial/sort/psi_lcx_mod.f90 b/base/modules/auxil/psi_lcx_mod.f90 similarity index 100% rename from base/serial/sort/psi_lcx_mod.f90 rename to base/modules/auxil/psi_lcx_mod.f90 diff --git a/base/modules/serial/psb_d_rb_idx_tree_mod.f90 b/base/modules/serial/psb_d_rb_idx_tree_mod.f90 deleted file mode 100644 index afdd831d..00000000 --- a/base/modules/serial/psb_d_rb_idx_tree_mod.f90 +++ /dev/null @@ -1,91 +0,0 @@ -! Red black tree implementation ordered by index -! -! Each node contains and index and a double precision value -! -! The tree should always be well balanced -! -! inserting a node with an existing index will -! add up the new value to the old one -module psb_d_rb_idx_tree_mod - use psb_const_mod - implicit none - type :: psb_d_rb_idx_node - integer(psb_ipk_) :: idx - real(psb_dpk_) :: val - type(psb_d_rb_idx_node), pointer :: left, right, parent - logical :: is_red - end type psb_d_rb_idx_node - - type :: psb_d_rb_idx_tree - type(psb_d_rb_idx_node), pointer :: root - integer(psb_ipk_) :: nnz - - contains - - procedure :: insert => psb_d_rb_idx_tree_insert - end type psb_d_rb_idx_tree - - interface psb_rb_idx_tree_insert - subroutine psb_d_rb_idx_tree_insert(this, idx, val) - import :: psb_ipk_, psb_dpk_, psb_d_rb_idx_tree - implicit none - class(psb_d_rb_idx_tree), intent(inout) :: this - integer(psb_ipk_), intent(in) :: idx - real(psb_dpk_), intent(in) :: val - end subroutine psb_d_rb_idx_tree_insert - end interface psb_rb_idx_tree_insert - - interface psb_rb_idx_tree_scalar_sparse_row_mul - subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) - use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat - import :: psb_ipk_, psb_dpk_, psb_d_rb_idx_tree - implicit none - type(psb_d_rb_idx_tree), intent(inout) :: tree - real(psb_dpk_), intent(in) :: scalar - type(psb_d_csr_sparse_mat), intent(in) :: mat - integer(psb_ipk_), intent(in) :: row_num - end subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul - end interface psb_rb_idx_tree_scalar_sparse_row_mul - - interface psb_rb_idx_tree_merge - subroutine psb_d_rb_idx_tree_merge(trees, mat) - use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat - import :: psb_d_rb_idx_tree - type(psb_d_rb_idx_tree), allocatable, intent(inout) :: trees(:) - type(psb_d_csr_sparse_mat), intent(inout) :: mat - end subroutine psb_d_rb_idx_tree_merge - end interface psb_rb_idx_tree_merge - - interface psb_rb_idx_tree_fix_insertion - subroutine psb_d_rb_idx_tree_fix_insertion(this, node) - import :: psb_d_rb_idx_tree, psb_d_rb_idx_node - implicit none - class(psb_d_rb_idx_tree), intent(inout) :: this - type(psb_d_rb_idx_node), pointer, intent(inout) :: node - end subroutine psb_d_rb_idx_tree_fix_insertion - end interface psb_rb_idx_tree_fix_insertion - - interface psb_rb_idx_tree_swap_colors - subroutine psb_d_rb_idx_tree_swap_colors(n1, n2) - import :: psb_d_rb_idx_node - implicit none - type(psb_d_rb_idx_node), pointer, intent(inout) :: n1, n2 - end subroutine psb_d_rb_idx_tree_swap_colors - end interface psb_rb_idx_tree_swap_colors - - interface psb_rb_idx_tree_rotate_right - subroutine psb_d_rb_idx_tree_rotate_right(node) - import :: psb_d_rb_idx_node - implicit none - type(psb_d_rb_idx_node), pointer, intent(inout) :: node - end subroutine psb_d_rb_idx_tree_rotate_right - end interface psb_rb_idx_tree_rotate_right - - interface psb_rb_idx_tree_rotate_left - subroutine psb_d_rb_idx_tree_rotate_left(node) - import :: psb_d_rb_idx_node - implicit none - type(psb_d_rb_idx_node), pointer, intent(inout) :: node - end subroutine psb_d_rb_idx_tree_rotate_left - end interface psb_rb_idx_tree_rotate_left -end module psb_d_rb_idx_tree_mod \ No newline at end of file diff --git a/base/modules/serial/psb_rb_idx_tree_mod.f90 b/base/modules/serial/psb_rb_idx_tree_mod.f90 deleted file mode 100644 index e93554f1..00000000 --- a/base/modules/serial/psb_rb_idx_tree_mod.f90 +++ /dev/null @@ -1,5 +0,0 @@ -module psb_rb_idx_tree_mod - use psb_const_mod - - use psb_d_rb_idx_tree_mod -end module psb_rb_idx_tree_mod \ No newline at end of file diff --git a/base/serial/impl/psb_d_rb_idx_tree_impl.F90 b/base/serial/impl/psb_d_rb_idx_tree_impl.F90 index a6bdc86f..9b63d51c 100644 --- a/base/serial/impl/psb_d_rb_idx_tree_impl.F90 +++ b/base/serial/impl/psb_d_rb_idx_tree_impl.F90 @@ -1,285 +1,329 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_d_rb_idx_tree_impl +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! subroutine psb_d_rb_idx_tree_insert(this, idx, val) - use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_insert - implicit none - class(psb_d_rb_idx_tree), intent(inout) :: this - integer(psb_ipk_), intent(in) :: idx - real(psb_dpk_), intent(in) :: val - - character(len=22) :: name - type(psb_d_rb_idx_node), pointer :: new_node - type(psb_d_rb_idx_node), pointer :: current, previous - name='psb_rb_idx_tree_insert' - - allocate(new_node) - new_node%idx = idx - new_node%val = val - nullify(new_node%left) - nullify(new_node%right) - nullify(new_node%parent) - new_node%is_red = .true. - - - if (.not. associated(this%root)) then - this%root => new_node - this%nnz = 1 - new_node%is_red = .false. - return - end if + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_insert + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + real(psb_dpk_), intent(in) :: val + + character(len=22) :: name + type(psb_d_rb_idx_node), pointer :: new_node + type(psb_d_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_insert' + + allocate(new_node) + new_node%idx = idx + new_node%val = val + nullify(new_node%left) + nullify(new_node%right) + nullify(new_node%parent) + new_node%is_red = .true. + + + if (.not. associated(this%root)) then + this%root => new_node + this%nnz = 1 + new_node%is_red = .false. + return + end if + + current => this%root + + do while (associated(current)) + previous => current + + if (idx == current%idx) then + current%val = current%val + val + deallocate(new_node) + return + else if (idx < current%idx) then + current => current%left + else - current => this%root + current => current%right + end if + end do - do while (associated(current)) - previous => current + if (idx < previous%idx) then + new_node%parent => previous + previous%left => new_node + else + new_node%parent => previous + previous%right => new_node + end if - if (idx == current%idx) then - current%val = current%val + val - deallocate(new_node) - return - else if (idx < current%idx) then - current => current%left - else + call psb_d_rb_idx_tree_fix_insertion(this, new_node) - current => current%right - end if - end do + this%nnz = this%nnz + 1 +end subroutine psb_d_rb_idx_tree_insert - if (idx < previous%idx) then - new_node%parent => previous - previous%left => new_node +subroutine psb_d_rb_idx_tree_fix_insertion(this, node) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_fix_insertion + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + + character(len=29) :: name + type(psb_d_rb_idx_node), pointer :: current, parent, grand_parent, uncle + name = 'psb_rb_idx_tree_fix_insertion' + + current => node + parent => current%parent + do while(associated(parent) .and. parent%is_red) + ! grand parent exist because root can't be red + grand_parent => parent%parent + if (parent%idx < grand_parent%idx) then + uncle => grand_parent%right else - new_node%parent => previous - previous%right => new_node + uncle => grand_parent%left end if - call psb_d_rb_idx_tree_fix_insertion(this, new_node) - - this%nnz = this%nnz + 1 -end subroutine psb_d_rb_idx_tree_insert + if (associated(uncle) .and. uncle%is_red) then + parent%is_red = .false. + uncle%is_red = .false. + grand_parent%is_red = .true. + current => grand_parent + parent => current%parent + + ! Left-Left case + else if (current%idx < parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_d_rb_idx_tree_rotate_right(grand_parent) + call psb_d_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Left-Right case + else if (current%idx > parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_d_rb_idx_tree_rotate_left(parent) + call psb_d_rb_idx_tree_rotate_right(grand_parent) + call psb_d_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + ! Right-Right case + else if (current%idx > parent%idx .and. & + parent%idx > grand_parent%idx) then + call psb_d_rb_idx_tree_rotate_left(grand_parent) + call psb_d_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Right-Left case + else + call psb_d_rb_idx_tree_rotate_right(parent) + call psb_d_rb_idx_tree_rotate_left(grand_parent) + call psb_d_rb_idx_tree_swap_colors(current, grand_parent) -subroutine psb_d_rb_idx_tree_fix_insertion(this, node) - use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_fix_insertion - implicit none - class(psb_d_rb_idx_tree), intent(inout) :: this - type(psb_d_rb_idx_node), pointer, intent(inout) :: node - - character(len=29) :: name - type(psb_d_rb_idx_node), pointer :: current, parent, grand_parent, uncle - name = 'psb_rb_idx_tree_fix_insertion' - - current => node - parent => current%parent - do while(associated(parent) .and. parent%is_red) - ! grand parent exist because root can't be red - grand_parent => parent%parent - if (parent%idx < grand_parent%idx) then - uncle => grand_parent%right - else - uncle => grand_parent%left - end if + if (this%root%idx == grand_parent%idx) this%root => current - if (associated(uncle) .and. uncle%is_red) then - parent%is_red = .false. - uncle%is_red = .false. - grand_parent%is_red = .true. - current => grand_parent - parent => current%parent - - ! Left-Left case - else if (current%idx < parent%idx .and. & - parent%idx < grand_parent%idx) then - call psb_d_rb_idx_tree_rotate_right(grand_parent) - call psb_d_rb_idx_tree_swap_colors(parent, grand_parent) - - if (this%root%idx == grand_parent%idx) this%root => parent - - return - ! Left-Right case - else if (current%idx > parent%idx .and. & - parent%idx < grand_parent%idx) then - call psb_d_rb_idx_tree_rotate_left(parent) - call psb_d_rb_idx_tree_rotate_right(grand_parent) - call psb_d_rb_idx_tree_swap_colors(current, grand_parent) - - if (this%root%idx == grand_parent%idx) this%root => current - - return - ! Right-Right case - else if (current%idx > parent%idx .and. & - parent%idx > grand_parent%idx) then - call psb_d_rb_idx_tree_rotate_left(grand_parent) - call psb_d_rb_idx_tree_swap_colors(parent, grand_parent) - - if (this%root%idx == grand_parent%idx) this%root => parent - - return - ! Right-Left case - else - call psb_d_rb_idx_tree_rotate_right(parent) - call psb_d_rb_idx_tree_rotate_left(grand_parent) - call psb_d_rb_idx_tree_swap_colors(current, grand_parent) - - if (this%root%idx == grand_parent%idx) this%root => current - - return - end if - end do + return + end if + end do - this%root%is_red = .false. + this%root%is_red = .false. end subroutine psb_d_rb_idx_tree_fix_insertion subroutine psb_d_rb_idx_tree_swap_colors(n1, n2) - use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_swap_colors - implicit none - type(psb_d_rb_idx_node), pointer, intent(inout) :: n1, n2 + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_swap_colors + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: n1, n2 - character(len=27) :: name - logical :: tmp - name='psb_rb_idx_tree_swap_colors' + character(len=27) :: name + logical :: tmp + name='psb_rb_idx_tree_swap_colors' - tmp = n1%is_red - n1%is_red = n2%is_red - n2%is_red = tmp + tmp = n1%is_red + n1%is_red = n2%is_red + n2%is_red = tmp end subroutine psb_d_rb_idx_tree_swap_colors subroutine psb_d_rb_idx_tree_rotate_right(node) - use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_rotate_right - implicit none - type(psb_d_rb_idx_node), pointer, intent(inout) :: node + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_rotate_right + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node - character(len=28) :: name - type(psb_d_rb_idx_node), pointer :: l, lr - name='psb_rb_idx_tree_rotate_right' + character(len=28) :: name + type(psb_d_rb_idx_node), pointer :: l, lr + name='psb_rb_idx_tree_rotate_right' - if (.not. associated(node%left)) return + if (.not. associated(node%left)) return - l => node%left - lr => l%right - node%left => lr + l => node%left + lr => l%right + node%left => lr - if (associated(lr)) lr%parent => node + if (associated(lr)) lr%parent => node - if (associated(node%parent)) then - if (node%idx < node%parent%idx) then - node%parent%left => l - else - node%parent%right => l - end if + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => l + else + node%parent%right => l end if + end if - l%parent => node%parent - node%parent => l + l%parent => node%parent + node%parent => l - l%right => node + l%right => node end subroutine psb_d_rb_idx_tree_rotate_right subroutine psb_d_rb_idx_tree_rotate_left(node) - use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_rotate_left - implicit none - type(psb_d_rb_idx_node), pointer, intent(inout) :: node + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_rotate_left + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node - character(len=27) :: name - type(psb_d_rb_idx_node), pointer :: r, rl - name='psb_rb_idx_tree_rotate_left' + character(len=27) :: name + type(psb_d_rb_idx_node), pointer :: r, rl + name='psb_rb_idx_tree_rotate_left' - if (.not. associated(node%right)) return + if (.not. associated(node%right)) return - r => node%right - rl => r%left - node%right => rl + r => node%right + rl => r%left + node%right => rl - if (associated(rl)) rl%parent => node + if (associated(rl)) rl%parent => node - if (associated(node%parent)) then - if (node%idx < node%parent%idx) then - node%parent%left => r - else - node%parent%right => r - end if + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => r + else + node%parent%right => r end if + end if - r%parent => node%parent - node%parent => r + r%parent => node%parent + node%parent => r - r%left => node + r%left => node end subroutine psb_d_rb_idx_tree_rotate_left subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) - use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_scalar_sparse_row_mul - use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat - implicit none - type(psb_d_rb_idx_tree), intent(inout) :: tree - real(psb_dpk_), intent(in) :: scalar - type(psb_d_csr_sparse_mat), intent(in) :: mat - integer(psb_ipk_), intent(in) :: row_num - - character(len=37) :: name - integer(psb_ipk_) :: i - name='psb_rb_idx_tree_scalar_sparse_row_mul' - - do i = mat%irp(row_num), mat%irp(row_num + 1) - 1 - call tree%insert(mat%ja(i),scalar * mat%val(i)) - end do + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_scalar_sparse_row_mul + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + implicit none + type(psb_d_rb_idx_tree), intent(inout) :: tree + real(psb_dpk_), intent(in) :: scalar + type(psb_d_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + + character(len=37) :: name + integer(psb_ipk_) :: i + name='psb_rb_idx_tree_scalar_sparse_row_mul' + + do i = mat%irp(row_num), mat%irp(row_num + 1) - 1 + call tree%insert(mat%ja(i),scalar * mat%val(i)) + end do end subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul subroutine psb_d_rb_idx_tree_merge(trees, mat) #if defined(OPENMP) - use omp_lib + use omp_lib #endif - use psb_realloc_mod - use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_merge - use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat - implicit none - type(psb_d_rb_idx_tree), allocatable, intent(inout) :: trees(:) - type(psb_d_csr_sparse_mat), intent(inout) :: mat - - character(len=21) :: name - integer(psb_ipk_) :: i, j, rows, info, nnz - type(psb_d_rb_idx_node), pointer :: current, previous - name='psb_rb_idx_tree_merge' - - rows = size(trees) - - mat%irp(1) = 1 - - do i=1, rows - mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz - end do + use psb_realloc_mod + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_merge + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + implicit none + type(psb_d_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_d_csr_sparse_mat), intent(inout) :: mat + + character(len=21) :: name + integer(psb_ipk_) :: i, j, rows, info, nnz + type(psb_d_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_merge' + + rows = size(trees) - nnz = mat%irp(rows + 1) - call psb_realloc(nnz, mat%val, info) - call psb_realloc(nnz, mat%ja, info) + mat%irp(1) = 1 + + do i=1, rows + mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz + end do + + nnz = mat%irp(rows + 1) + call psb_realloc(nnz, mat%val, info) + call psb_realloc(nnz, mat%ja, info) #if defined(OPENMP) - !$omp parallel do schedule(static), private(current, previous, j) + !$omp parallel do schedule(static), private(current, previous, j) #endif - do i = 1, size(trees) - j = 0 - current => trees(i)%root - do while(associated(current)) - ! go to the left-most node - do while(associated(current%left)) - current => current%left - end do - mat%val(j + mat%irp(i)) = current%val - mat%ja(j + mat%irp(i)) = current%idx - j = j + 1 - - previous => current - if (associated(current%right)) then - if (associated(current%parent)) then - current%parent%left => current%right - end if - current%right%parent => current%parent - current => current%right - else - current => current%parent - if (associated(current)) nullify(current%left) - end if - deallocate(previous) - end do + do i = 1, size(trees) + j = 0 + current => trees(i)%root + do while(associated(current)) + ! go to the left-most node + do while(associated(current%left)) + current => current%left + end do + mat%val(j + mat%irp(i)) = current%val + mat%ja(j + mat%irp(i)) = current%idx + j = j + 1 + + previous => current + if (associated(current%right)) then + if (associated(current%parent)) then + current%parent%left => current%right + end if + current%right%parent => current%parent + current => current%right + else + current => current%parent + if (associated(current)) nullify(current%left) + end if + deallocate(previous) end do + end do #if defined(OPENMP) - !$omp end parallel do + !$omp end parallel do #endif -end subroutine psb_d_rb_idx_tree_merge \ No newline at end of file +end subroutine psb_d_rb_idx_tree_merge diff --git a/base/serial/sort/Makefile b/base/serial/sort/Makefile index 2ef6420a..ff8fd620 100644 --- a/base/serial/sort/Makefile +++ b/base/serial/sort/Makefile @@ -3,7 +3,6 @@ include ../../../Make.inc # # The object files # -BOBJS=psi_lcx_mod.o psi_alcx_mod.o psi_acx_mod.o IOBJS=psb_m_hsort_impl.o psb_m_isort_impl.o psb_m_msort_impl.o psb_m_qsort_impl.o LOBJS=psb_e_hsort_impl.o psb_e_isort_impl.o psb_e_msort_impl.o psb_e_qsort_impl.o SOBJS=psb_s_hsort_impl.o psb_s_isort_impl.o psb_s_msort_impl.o psb_s_qsort_impl.o @@ -11,7 +10,7 @@ DOBJS=psb_d_hsort_impl.o psb_d_isort_impl.o psb_d_msort_impl.o psb_d_qsort_impl. COBJS=psb_c_hsort_impl.o psb_c_isort_impl.o psb_c_msort_impl.o psb_c_qsort_impl.o ZOBJS=psb_z_hsort_impl.o psb_z_isort_impl.o psb_z_msort_impl.o psb_z_qsort_impl.o -OBJS=$(BOBJS) $(SOBJS) $(DOBJS) $(COBJS) $(ZOBJS) $(IOBJS) $(LOBJS) +OBJS=$(SOBJS) $(DOBJS) $(COBJS) $(ZOBJS) $(IOBJS) $(LOBJS) # # Where the library should go, and how it is called. @@ -35,7 +34,6 @@ lib: objs # A bit excessive, but safe $(OBJS): $(MODDIR)/psb_base_mod.o -$(IOBJS) $(LOBJS) $(SOBJS) $(DOBJS) $(COBJS) $(ZOBJS): $(BOBJS) clean: cleanobjs veryclean: cleanobjs From 0d8a5d3dc2964d7f8e7490fde8021f15d8ae0a8c Mon Sep 17 00:00:00 2001 From: Salvatore Filippone Date: Tue, 29 Aug 2023 19:27:29 +0200 Subject: [PATCH 07/11] New SPSPMM implementation --- base/modules/Makefile | 11 +- base/modules/auxil/psb_c_rb_idx_tree_mod.f90 | 128 +++++ base/modules/auxil/psb_rb_idx_tree_mod.f90 | 9 +- base/modules/auxil/psb_s_rb_idx_tree_mod.f90 | 128 +++++ base/modules/auxil/psb_z_rb_idx_tree_mod.f90 | 128 +++++ base/modules/serial/psb_d_csr_mat_mod.f90 | 5 +- base/modules/serial/psb_d_mat_mod.F90 | 5 +- base/serial/impl/Makefile | 12 +- base/serial/impl/psb_c_csr_impl.F90 | 542 ++++++++++++++++++- base/serial/impl/psb_c_rb_idx_tree_impl.F90 | 329 +++++++++++ base/serial/impl/psb_d_csr_impl.F90 | 400 +++++++++----- base/serial/impl/psb_s_csr_impl.F90 | 542 ++++++++++++++++++- base/serial/impl/psb_s_rb_idx_tree_impl.F90 | 329 +++++++++++ base/serial/impl/psb_z_csr_impl.F90 | 542 ++++++++++++++++++- base/serial/impl/psb_z_rb_idx_tree_impl.F90 | 329 +++++++++++ 15 files changed, 3271 insertions(+), 168 deletions(-) create mode 100644 base/modules/auxil/psb_c_rb_idx_tree_mod.f90 create mode 100644 base/modules/auxil/psb_s_rb_idx_tree_mod.f90 create mode 100644 base/modules/auxil/psb_z_rb_idx_tree_mod.f90 create mode 100644 base/serial/impl/psb_c_rb_idx_tree_impl.F90 create mode 100644 base/serial/impl/psb_s_rb_idx_tree_impl.F90 create mode 100644 base/serial/impl/psb_z_rb_idx_tree_impl.F90 diff --git a/base/modules/Makefile b/base/modules/Makefile index 8fa63f92..f1cb5e53 100644 --- a/base/modules/Makefile +++ b/base/modules/Makefile @@ -60,7 +60,11 @@ SERIAL_MODS=serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o \ auxil/psb_d_hsort_x_mod.o \ auxil/psb_c_hsort_x_mod.o \ auxil/psb_z_hsort_x_mod.o \ - auxil/psb_d_rb_idx_tree_mod.o auxil/psb_rb_idx_tree_mod.o \ + auxil/psb_s_rb_idx_tree_mod.o \ + auxil/psb_d_rb_idx_tree_mod.o \ + auxil/psb_c_rb_idx_tree_mod.o \ + auxil/psb_z_rb_idx_tree_mod.o \ + auxil/psb_rb_idx_tree_mod.o \ serial/psb_base_mat_mod.o serial/psb_mat_mod.o\ serial/psb_s_base_mat_mod.o serial/psb_s_csr_mat_mod.o serial/psb_s_csc_mat_mod.o serial/psb_s_mat_mod.o \ serial/psb_d_base_mat_mod.o serial/psb_d_csr_mat_mod.o serial/psb_d_csc_mat_mod.o serial/psb_d_mat_mod.o \ @@ -277,8 +281,11 @@ serial/psb_z_vect_mod.o: serial/psb_z_base_vect_mod.o serial/psb_i_vect_mod.o serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o serial/psb_c_serial_mod.o serial/psb_z_serial_mod.o: serial/psb_mat_mod.o auxil/psb_string_mod.o auxil/psb_sort_mod.o auxil/psi_serial_mod.o serial/psb_vect_mod.o: serial/psb_i_vect_mod.o serial/psb_l_vect_mod.o serial/psb_d_vect_mod.o serial/psb_s_vect_mod.o serial/psb_c_vect_mod.o serial/psb_z_vect_mod.o +auxil/psb_s_rb_idx_tree_mod.o: serial/psb_s_csr_mat_mod.o psb_realloc_mod.o auxil/psb_d_rb_idx_tree_mod.o: serial/psb_d_csr_mat_mod.o psb_realloc_mod.o -auxil/psb_rb_idx_tree_mod.o: auxil/psb_d_rb_idx_tree_mod.o +auxil/psb_c_rb_idx_tree_mod.o: serial/psb_c_csr_mat_mod.o psb_realloc_mod.o +auxil/psb_z_rb_idx_tree_mod.o: serial/psb_z_csr_mat_mod.o psb_realloc_mod.o +auxil/psb_rb_idx_tree_mod.o: auxil/psb_s_rb_idx_tree_mod.o auxil/psb_d_rb_idx_tree_mod.o auxil/psb_c_rb_idx_tree_mod.o auxil/psb_z_rb_idx_tree_mod.o error.o psb_realloc_mod.o: psb_error_mod.o psb_error_impl.o: psb_penv_mod.o diff --git a/base/modules/auxil/psb_c_rb_idx_tree_mod.f90 b/base/modules/auxil/psb_c_rb_idx_tree_mod.f90 new file mode 100644 index 00000000..9b3ada6e --- /dev/null +++ b/base/modules/auxil/psb_c_rb_idx_tree_mod.f90 @@ -0,0 +1,128 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_c_rb_idx_tree_mod +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +module psb_c_rb_idx_tree_mod + use psb_const_mod + implicit none + + type :: psb_c_rb_idx_node + integer(psb_ipk_) :: idx + complex(psb_spk_) :: val + type(psb_c_rb_idx_node), pointer :: left, right, parent + logical :: is_red + end type psb_c_rb_idx_node + + type :: psb_c_rb_idx_tree + type(psb_c_rb_idx_node), pointer :: root + integer(psb_ipk_) :: nnz + + contains + + procedure :: insert => psb_c_rb_idx_tree_insert + end type psb_c_rb_idx_tree + + interface psb_rb_idx_tree_insert + subroutine psb_c_rb_idx_tree_insert(this, idx, val) + import :: psb_ipk_, psb_spk_, psb_c_rb_idx_tree + implicit none + class(psb_c_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + complex(psb_spk_), intent(in) :: val + end subroutine psb_c_rb_idx_tree_insert + end interface psb_rb_idx_tree_insert + + interface psb_rb_idx_tree_scalar_sparse_row_mul + subroutine psb_c_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_c_csr_mat_mod, only : psb_c_csr_sparse_mat + import :: psb_ipk_, psb_spk_, psb_c_rb_idx_tree + implicit none + type(psb_c_rb_idx_tree), intent(inout) :: tree + complex(psb_spk_), intent(in) :: scalar + type(psb_c_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + end subroutine psb_c_rb_idx_tree_scalar_sparse_row_mul + end interface psb_rb_idx_tree_scalar_sparse_row_mul + + interface psb_rb_idx_tree_merge + subroutine psb_c_rb_idx_tree_merge(trees, mat) + use psb_c_csr_mat_mod, only : psb_c_csr_sparse_mat + import :: psb_c_rb_idx_tree + type(psb_c_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_c_csr_sparse_mat), intent(inout) :: mat + end subroutine psb_c_rb_idx_tree_merge + end interface psb_rb_idx_tree_merge + + interface psb_rb_idx_tree_fix_insertion + subroutine psb_c_rb_idx_tree_fix_insertion(this, node) + import :: psb_c_rb_idx_tree, psb_c_rb_idx_node + implicit none + class(psb_c_rb_idx_tree), intent(inout) :: this + type(psb_c_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_c_rb_idx_tree_fix_insertion + end interface psb_rb_idx_tree_fix_insertion + + interface psb_rb_idx_tree_swap_colors + subroutine psb_c_rb_idx_tree_swap_colors(n1, n2) + import :: psb_c_rb_idx_node + implicit none + type(psb_c_rb_idx_node), pointer, intent(inout) :: n1, n2 + end subroutine psb_c_rb_idx_tree_swap_colors + end interface psb_rb_idx_tree_swap_colors + + interface psb_rb_idx_tree_rotate_right + subroutine psb_c_rb_idx_tree_rotate_right(node) + import :: psb_c_rb_idx_node + implicit none + type(psb_c_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_c_rb_idx_tree_rotate_right + end interface psb_rb_idx_tree_rotate_right + + interface psb_rb_idx_tree_rotate_left + subroutine psb_c_rb_idx_tree_rotate_left(node) + import :: psb_c_rb_idx_node + implicit none + type(psb_c_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_c_rb_idx_tree_rotate_left + end interface psb_rb_idx_tree_rotate_left +end module psb_c_rb_idx_tree_mod diff --git a/base/modules/auxil/psb_rb_idx_tree_mod.f90 b/base/modules/auxil/psb_rb_idx_tree_mod.f90 index 9d151389..074255ea 100644 --- a/base/modules/auxil/psb_rb_idx_tree_mod.f90 +++ b/base/modules/auxil/psb_rb_idx_tree_mod.f90 @@ -43,7 +43,10 @@ ! Contributed by Dimitri Walther ! module psb_rb_idx_tree_mod - use psb_const_mod - - use psb_d_rb_idx_tree_mod + use psb_const_mod + + use psb_s_rb_idx_tree_mod + use psb_d_rb_idx_tree_mod + use psb_c_rb_idx_tree_mod + use psb_z_rb_idx_tree_mod end module psb_rb_idx_tree_mod diff --git a/base/modules/auxil/psb_s_rb_idx_tree_mod.f90 b/base/modules/auxil/psb_s_rb_idx_tree_mod.f90 new file mode 100644 index 00000000..3ed2d60a --- /dev/null +++ b/base/modules/auxil/psb_s_rb_idx_tree_mod.f90 @@ -0,0 +1,128 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_s_rb_idx_tree_mod +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +module psb_s_rb_idx_tree_mod + use psb_const_mod + implicit none + + type :: psb_s_rb_idx_node + integer(psb_ipk_) :: idx + real(psb_spk_) :: val + type(psb_s_rb_idx_node), pointer :: left, right, parent + logical :: is_red + end type psb_s_rb_idx_node + + type :: psb_s_rb_idx_tree + type(psb_s_rb_idx_node), pointer :: root + integer(psb_ipk_) :: nnz + + contains + + procedure :: insert => psb_s_rb_idx_tree_insert + end type psb_s_rb_idx_tree + + interface psb_rb_idx_tree_insert + subroutine psb_s_rb_idx_tree_insert(this, idx, val) + import :: psb_ipk_, psb_spk_, psb_s_rb_idx_tree + implicit none + class(psb_s_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + real(psb_spk_), intent(in) :: val + end subroutine psb_s_rb_idx_tree_insert + end interface psb_rb_idx_tree_insert + + interface psb_rb_idx_tree_scalar_sparse_row_mul + subroutine psb_s_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_s_csr_mat_mod, only : psb_s_csr_sparse_mat + import :: psb_ipk_, psb_spk_, psb_s_rb_idx_tree + implicit none + type(psb_s_rb_idx_tree), intent(inout) :: tree + real(psb_spk_), intent(in) :: scalar + type(psb_s_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + end subroutine psb_s_rb_idx_tree_scalar_sparse_row_mul + end interface psb_rb_idx_tree_scalar_sparse_row_mul + + interface psb_rb_idx_tree_merge + subroutine psb_s_rb_idx_tree_merge(trees, mat) + use psb_s_csr_mat_mod, only : psb_s_csr_sparse_mat + import :: psb_s_rb_idx_tree + type(psb_s_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_s_csr_sparse_mat), intent(inout) :: mat + end subroutine psb_s_rb_idx_tree_merge + end interface psb_rb_idx_tree_merge + + interface psb_rb_idx_tree_fix_insertion + subroutine psb_s_rb_idx_tree_fix_insertion(this, node) + import :: psb_s_rb_idx_tree, psb_s_rb_idx_node + implicit none + class(psb_s_rb_idx_tree), intent(inout) :: this + type(psb_s_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_s_rb_idx_tree_fix_insertion + end interface psb_rb_idx_tree_fix_insertion + + interface psb_rb_idx_tree_swap_colors + subroutine psb_s_rb_idx_tree_swap_colors(n1, n2) + import :: psb_s_rb_idx_node + implicit none + type(psb_s_rb_idx_node), pointer, intent(inout) :: n1, n2 + end subroutine psb_s_rb_idx_tree_swap_colors + end interface psb_rb_idx_tree_swap_colors + + interface psb_rb_idx_tree_rotate_right + subroutine psb_s_rb_idx_tree_rotate_right(node) + import :: psb_s_rb_idx_node + implicit none + type(psb_s_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_s_rb_idx_tree_rotate_right + end interface psb_rb_idx_tree_rotate_right + + interface psb_rb_idx_tree_rotate_left + subroutine psb_s_rb_idx_tree_rotate_left(node) + import :: psb_s_rb_idx_node + implicit none + type(psb_s_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_s_rb_idx_tree_rotate_left + end interface psb_rb_idx_tree_rotate_left +end module psb_s_rb_idx_tree_mod diff --git a/base/modules/auxil/psb_z_rb_idx_tree_mod.f90 b/base/modules/auxil/psb_z_rb_idx_tree_mod.f90 new file mode 100644 index 00000000..7b15f080 --- /dev/null +++ b/base/modules/auxil/psb_z_rb_idx_tree_mod.f90 @@ -0,0 +1,128 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_z_rb_idx_tree_mod +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +module psb_z_rb_idx_tree_mod + use psb_const_mod + implicit none + + type :: psb_z_rb_idx_node + integer(psb_ipk_) :: idx + complex(psb_dpk_) :: val + type(psb_z_rb_idx_node), pointer :: left, right, parent + logical :: is_red + end type psb_z_rb_idx_node + + type :: psb_z_rb_idx_tree + type(psb_z_rb_idx_node), pointer :: root + integer(psb_ipk_) :: nnz + + contains + + procedure :: insert => psb_z_rb_idx_tree_insert + end type psb_z_rb_idx_tree + + interface psb_rb_idx_tree_insert + subroutine psb_z_rb_idx_tree_insert(this, idx, val) + import :: psb_ipk_, psb_dpk_, psb_z_rb_idx_tree + implicit none + class(psb_z_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + complex(psb_dpk_), intent(in) :: val + end subroutine psb_z_rb_idx_tree_insert + end interface psb_rb_idx_tree_insert + + interface psb_rb_idx_tree_scalar_sparse_row_mul + subroutine psb_z_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_z_csr_mat_mod, only : psb_z_csr_sparse_mat + import :: psb_ipk_, psb_dpk_, psb_z_rb_idx_tree + implicit none + type(psb_z_rb_idx_tree), intent(inout) :: tree + complex(psb_dpk_), intent(in) :: scalar + type(psb_z_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + end subroutine psb_z_rb_idx_tree_scalar_sparse_row_mul + end interface psb_rb_idx_tree_scalar_sparse_row_mul + + interface psb_rb_idx_tree_merge + subroutine psb_z_rb_idx_tree_merge(trees, mat) + use psb_z_csr_mat_mod, only : psb_z_csr_sparse_mat + import :: psb_z_rb_idx_tree + type(psb_z_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_z_csr_sparse_mat), intent(inout) :: mat + end subroutine psb_z_rb_idx_tree_merge + end interface psb_rb_idx_tree_merge + + interface psb_rb_idx_tree_fix_insertion + subroutine psb_z_rb_idx_tree_fix_insertion(this, node) + import :: psb_z_rb_idx_tree, psb_z_rb_idx_node + implicit none + class(psb_z_rb_idx_tree), intent(inout) :: this + type(psb_z_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_z_rb_idx_tree_fix_insertion + end interface psb_rb_idx_tree_fix_insertion + + interface psb_rb_idx_tree_swap_colors + subroutine psb_z_rb_idx_tree_swap_colors(n1, n2) + import :: psb_z_rb_idx_node + implicit none + type(psb_z_rb_idx_node), pointer, intent(inout) :: n1, n2 + end subroutine psb_z_rb_idx_tree_swap_colors + end interface psb_rb_idx_tree_swap_colors + + interface psb_rb_idx_tree_rotate_right + subroutine psb_z_rb_idx_tree_rotate_right(node) + import :: psb_z_rb_idx_node + implicit none + type(psb_z_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_z_rb_idx_tree_rotate_right + end interface psb_rb_idx_tree_rotate_right + + interface psb_rb_idx_tree_rotate_left + subroutine psb_z_rb_idx_tree_rotate_left(node) + import :: psb_z_rb_idx_node + implicit none + type(psb_z_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_z_rb_idx_tree_rotate_left + end interface psb_rb_idx_tree_rotate_left +end module psb_z_rb_idx_tree_mod diff --git a/base/modules/serial/psb_d_csr_mat_mod.f90 b/base/modules/serial/psb_d_csr_mat_mod.f90 index 21036d5f..d0aa622b 100644 --- a/base/modules/serial/psb_d_csr_mat_mod.f90 +++ b/base/modules/serial/psb_d_csr_mat_mod.f90 @@ -45,6 +45,7 @@ module psb_d_csr_mat_mod use psb_d_base_mat_mod + !> \namespace psb_base_mod \class psb_d_csr_sparse_mat !! \extends psb_d_base_mat_mod::psb_d_base_sparse_mat !! @@ -577,8 +578,8 @@ module psb_d_csr_mat_mod integer(psb_ipk_), intent(out) :: info end subroutine psb_d_csr_scals end interface - - !> \namespace psb_base_mod \class psb_ld_csr_sparse_mat + + !> \namespace psb_base_mod \class psb_ld_csr_sparse_mat !! \extends psb_ld_base_mat_mod::psb_ld_base_sparse_mat !! !! psb_ld_csr_sparse_mat type and the related methods. diff --git a/base/modules/serial/psb_d_mat_mod.F90 b/base/modules/serial/psb_d_mat_mod.F90 index 9e613b29..8f967ce1 100644 --- a/base/modules/serial/psb_d_mat_mod.F90 +++ b/base/modules/serial/psb_d_mat_mod.F90 @@ -79,10 +79,7 @@ module psb_d_mat_mod use psb_d_base_mat_mod - use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat, psb_ld_csr_sparse_mat, & - spspmm_impl, spspmm_serial, spspmm_omp_gustavson, & - spspmm_omp_gustavson_1d, spspmm_serial_rb_tree, & - spspmm_omp_rb_tree, spspmm_omp_two_pass + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat, psb_ld_csr_sparse_mat use psb_d_csc_mat_mod, only : psb_d_csc_sparse_mat, psb_ld_csc_sparse_mat type :: psb_dspmat_type diff --git a/base/serial/impl/Makefile b/base/serial/impl/Makefile index ddcdaf96..37f400f0 100644 --- a/base/serial/impl/Makefile +++ b/base/serial/impl/Makefile @@ -7,16 +7,20 @@ BOBJS=psb_base_mat_impl.o \ psb_s_base_mat_impl.o psb_d_base_mat_impl.o psb_c_base_mat_impl.o psb_z_base_mat_impl.o #\ psb_s_lbase_mat_impl.o psb_d_lbase_mat_impl.o psb_c_lbase_mat_impl.o psb_z_lbase_mat_impl.o -SOBJS=psb_s_csr_impl.o psb_s_coo_impl.o psb_s_csc_impl.o psb_s_mat_impl.o +SOBJS=psb_s_csr_impl.o psb_s_coo_impl.o psb_s_csc_impl.o psb_s_mat_impl.o\ + psb_s_rb_idx_tree_impl.o #\ psb_s_lcoo_impl.o psb_s_lcsr_impl.o -DOBJS=psb_d_csr_impl.o psb_d_coo_impl.o psb_d_csc_impl.o psb_d_mat_impl.o psb_d_rb_idx_tree_impl.o +DOBJS=psb_d_csr_impl.o psb_d_coo_impl.o psb_d_csc_impl.o psb_d_mat_impl.o\ + psb_d_rb_idx_tree_impl.o #\ psb_d_lcoo_impl.o psb_d_lcsr_impl.o -COBJS=psb_c_csr_impl.o psb_c_coo_impl.o psb_c_csc_impl.o psb_c_mat_impl.o +COBJS=psb_c_csr_impl.o psb_c_coo_impl.o psb_c_csc_impl.o psb_c_mat_impl.o\ + psb_c_rb_idx_tree_impl.o #\ psb_c_lcoo_impl.o psb_c_lcsr_impl.o -ZOBJS=psb_z_csr_impl.o psb_z_coo_impl.o psb_z_csc_impl.o psb_z_mat_impl.o +ZOBJS=psb_z_csr_impl.o psb_z_coo_impl.o psb_z_csc_impl.o psb_z_mat_impl.o\ + psb_z_rb_idx_tree_impl.o #\ psb_z_lcoo_impl.o psb_z_lcsr_impl.o diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index 6c3ecec6..3709ad21 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -3213,10 +3213,10 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) if (info == psb_success_) call psb_safe_ab_cpy(b%ja,a%ja,info) if (info == psb_success_) call psb_safe_ab_cpy(b%val,a%val,info) if (info == psb_success_) call psb_realloc(max(nr+1,nc+1),a%irp,info) - + endif - + #if defined(OPENMP) !$OMP PARALLEL default(shared) reduction(max:info) @@ -3246,7 +3246,7 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) #endif call a%set_host() - + end subroutine psb_c_cp_csr_from_coo @@ -3495,7 +3495,7 @@ subroutine psb_c_cp_csr_to_fmt(a,b,info) if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,b%irp,info) call psb_realloc(nz,b%ja,info) call psb_realloc(nz,b%val,info) @@ -3600,7 +3600,7 @@ subroutine psb_c_cp_csr_from_fmt(a,b,info) if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,a%irp,info) call psb_realloc(nz,a%ja,info) call psb_realloc(nz,a%val,info) @@ -3654,6 +3654,7 @@ subroutine psb_c_csr_clean_zeros(a, info) call a%set_host() end subroutine psb_c_csr_clean_zeros +#if 0 subroutine psb_ccsrspspmm(a,b,c,info) use psb_c_mat_mod use psb_serial_mod, psb_protect_name => psb_ccsrspspmm @@ -3776,7 +3777,538 @@ contains end subroutine csr_spspmm end subroutine psb_ccsrspspmm +#else +subroutine psb_ccsrspspmm(a,b,c,info) + use psb_c_mat_mod + use psb_serial_mod, psb_protect_name => psb_ccsrspspmm + + implicit none + + class(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(out) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb + character(len=20) :: name + integer(psb_ipk_) :: err_act + name='psb_csrspspmm' + call psb_erractionsave(err_act) + info = psb_success_ + + if (a%is_dev()) call a%sync() + if (b%is_dev()) call b%sync() + + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + + if ( mb /= na ) then + write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb + info = psb_err_invalid_matrix_sizes_ + call psb_errpush(info,name) + goto 9999 + endif + + select case(spspmm_impl) + case (spspmm_serial) + ! Estimate number of nonzeros on output. + nza = a%get_nzeros() + nzb = b%get_nzeros() + nzc = 2*(nza+nzb) + call c%allocate(ma,nb,nzc) + + call csr_spspmm(a,b,c,info) + case (spspmm_omp_gustavson) + call spmm_omp_gustavson(a,b,c,info) + case (spspmm_omp_gustavson_1d) + call spmm_omp_gustavson_1d(a,b,c,info) + case (spspmm_serial_rb_tree) + call spmm_serial_rb_tree(a,b,c,info) + case (spspmm_omp_rb_tree) + call spmm_omp_rb_tree(a,b,c,info) + case (spspmm_omp_two_pass) + call spmm_omp_two_pass(a,b,c,info) + case default + write(psb_err_unit,*) 'Unknown spspmm implementation' + ! push error + goto 9999 + end select + + call c%set_asb() + call c%set_host() + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +contains + + subroutine csr_spspmm(a,b,c,info) + implicit none + type(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(inout) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb + integer(psb_ipk_), allocatable :: irow(:), idxs(:) + complex(psb_spk_), allocatable :: row(:) + integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & + & nzc,nnzre, isz, ipb, irwsz, nrc, nze + complex(psb_spk_) :: cfb + + + info = psb_success_ + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + nze = min(size(c%val),size(c%ja)) + isz = max(ma,na,mb,nb) + call psb_realloc(isz,row,info) + if (info == 0) call psb_realloc(isz,idxs,info) + if (info == 0) call psb_realloc(isz,irow,info) + if (info /= 0) return + row = dzero + irow = 0 + nzc = 1 + do j = 1,ma + c%irp(j) = nzc + nrc = 0 + do k = a%irp(j), a%irp(j+1)-1 + irw = a%ja(k) + cfb = a%val(k) + irwsz = b%irp(irw+1)-b%irp(irw) + do i = b%irp(irw),b%irp(irw+1)-1 + icl = b%ja(i) + if (irow(icl) 0 ) then + if ((nzc+nrc)>nze) then + nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) + call psb_realloc(nze,c%val,info) + if (info == 0) call psb_realloc(nze,c%ja,info) + if (info /= 0) return + end if + + call psb_qsort(idxs(1:nrc)) + do i=1, nrc + irw = idxs(i) + c%ja(nzc) = irw + c%val(nzc) = row(irw) + row(irw) = dzero + nzc = nzc + 1 + end do + end if + end do + + c%irp(ma+1) = nzc + end subroutine csr_spspmm + + ! gustavson's algorithm using perfect hashing + ! and OpenMP parallelisation + subroutine spmm_omp_gustavson(a,b,c,info) + use omp_lib + + implicit none + type(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + complex(psb_spk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + + allocate(offsets(omp_get_max_threads())) + !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & + !$omp shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw, jj, j, k) + do irw = 1, ma + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson + + subroutine spmm_omp_gustavson_1d(a,b,c,info) + use omp_lib + + implicit none + type(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + complex(psb_spk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, & + start_idx, end_idx , blk, blk_size, rwstart,& + rwblk, rwblkrem, nblks + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + allocate(offsets(omp_get_max_threads())) + + nblks = 4 * omp_get_max_threads() + rwblk = (ma / nblks) + rwblkrem = modulo(ma, nblks) + !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw,jj,j,k,rwnz,blk,blk_size,rwstart) + do blk = 0, nblks - 1 + if (blk < rwblkrem) then + blk_size = rwblk + 1 + rwstart = blk * rwblk + blk + 1 + else + blk_size = rwblk + rwstart = blk * rwblk & + + rwblkrem + 1 + end if + do irw = rwstart, rwstart + blk_size - 1 + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson_1d + + subroutine spmm_serial_rb_tree(a,b,c,info) + use psb_rb_idx_tree_mod + implicit none + type(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_c_rb_idx_tree), allocatable :: row_accs(:) + + a_m = a%get_nrows() + b_n = b%get_ncols() + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + + info = 0 + end subroutine spmm_serial_rb_tree + + subroutine spmm_omp_rb_tree(a,b,c,info) + use omp_lib + use psb_rb_idx_tree_mod + implicit none + type(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_c_rb_idx_tree), allocatable :: row_accs(:) + real(8) :: tic, toc + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + !$omp parallel do schedule(static) + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + info = 0 + end subroutine spmm_omp_rb_tree + + subroutine compute_indices(a, b, c, info) + implicit none + type(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer :: full_mat_bound + integer :: row, col, i, j, k, nnz + + full_mat_bound = 0 + !omp parallel do schedule(static) reduction(+:full_mat_bound) + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + j = a%ja(col) + full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j) + end do + end do + !omp end parallel do + + call psb_realloc(a%get_nrows() + 1, c%irp, info) + call psb_realloc(full_mat_bound, c%ja, info) + c%ja = 0 + c%irp(1) = 1 + + nnz = 0 + + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1 + k = 0 + do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i)) + k = k + 1 + end do + if (c%ja(c%irp(row) + k) == 0) then + c%ja(c%irp(row)+k) = b%ja(i) + nnz = nnz + 1 + end if + end do + end do + c%irp(row + 1) = nnz + 1 + call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1)) + end do + + + call psb_realloc(nnz, c%ja, info) + call psb_realloc(nnz, c%val, info) + + c%val = 0 + end subroutine compute_indices + + subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num) + type(psb_c_csr_sparse_mat), intent(inout) :: out_mat + integer(psb_ipk_), intent(in) :: out_row_num + complex(psb_spk_), intent(in) :: scalar + type(psb_c_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: trgt_row_num + + integer(psb_ipk_) :: i, k, row_start, row_end + + row_start = out_mat%irp(out_row_num) + row_end = out_mat%irp(out_row_num + 1) - 1 + + do i = mat%irp(trgt_row_num), mat%irp(trgt_row_num + 1) - 1 + do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1 + if (out_mat%ja(k) == mat%ja(i)) then + out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i) + exit + end if + end do + end do + + end subroutine direct_scalar_sparse_row_mul + + subroutine spmm_omp_two_pass(a,b,c,info) + use omp_lib + + implicit none + type(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n, row, col + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + call compute_indices(a, b, c, info) + + !$omp parallel do schedule(static) + do row = 1, a_m + do col = a%irp(row), a%irp(row + 1) - 1 + call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + end subroutine spmm_omp_two_pass + +end subroutine psb_ccsrspspmm +#endif ! ! diff --git a/base/serial/impl/psb_c_rb_idx_tree_impl.F90 b/base/serial/impl/psb_c_rb_idx_tree_impl.F90 new file mode 100644 index 00000000..42704c6e --- /dev/null +++ b/base/serial/impl/psb_c_rb_idx_tree_impl.F90 @@ -0,0 +1,329 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_c_rb_idx_tree_impl +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +subroutine psb_c_rb_idx_tree_insert(this, idx, val) + use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_insert + implicit none + class(psb_c_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + complex(psb_spk_), intent(in) :: val + + character(len=22) :: name + type(psb_c_rb_idx_node), pointer :: new_node + type(psb_c_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_insert' + + allocate(new_node) + new_node%idx = idx + new_node%val = val + nullify(new_node%left) + nullify(new_node%right) + nullify(new_node%parent) + new_node%is_red = .true. + + + if (.not. associated(this%root)) then + this%root => new_node + this%nnz = 1 + new_node%is_red = .false. + return + end if + + current => this%root + + do while (associated(current)) + previous => current + + if (idx == current%idx) then + current%val = current%val + val + deallocate(new_node) + return + else if (idx < current%idx) then + current => current%left + else + + current => current%right + end if + end do + + if (idx < previous%idx) then + new_node%parent => previous + previous%left => new_node + else + new_node%parent => previous + previous%right => new_node + end if + + call psb_c_rb_idx_tree_fix_insertion(this, new_node) + + this%nnz = this%nnz + 1 +end subroutine psb_c_rb_idx_tree_insert + +subroutine psb_c_rb_idx_tree_fix_insertion(this, node) + use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_fix_insertion + implicit none + class(psb_c_rb_idx_tree), intent(inout) :: this + type(psb_c_rb_idx_node), pointer, intent(inout) :: node + + character(len=29) :: name + type(psb_c_rb_idx_node), pointer :: current, parent, grand_parent, uncle + name = 'psb_rb_idx_tree_fix_insertion' + + current => node + parent => current%parent + do while(associated(parent) .and. parent%is_red) + ! grand parent exist because root can't be red + grand_parent => parent%parent + if (parent%idx < grand_parent%idx) then + uncle => grand_parent%right + else + uncle => grand_parent%left + end if + + if (associated(uncle) .and. uncle%is_red) then + parent%is_red = .false. + uncle%is_red = .false. + grand_parent%is_red = .true. + current => grand_parent + parent => current%parent + + ! Left-Left case + else if (current%idx < parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_c_rb_idx_tree_rotate_right(grand_parent) + call psb_c_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Left-Right case + else if (current%idx > parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_c_rb_idx_tree_rotate_left(parent) + call psb_c_rb_idx_tree_rotate_right(grand_parent) + call psb_c_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + ! Right-Right case + else if (current%idx > parent%idx .and. & + parent%idx > grand_parent%idx) then + call psb_c_rb_idx_tree_rotate_left(grand_parent) + call psb_c_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Right-Left case + else + call psb_c_rb_idx_tree_rotate_right(parent) + call psb_c_rb_idx_tree_rotate_left(grand_parent) + call psb_c_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + end if + end do + + this%root%is_red = .false. +end subroutine psb_c_rb_idx_tree_fix_insertion + +subroutine psb_c_rb_idx_tree_swap_colors(n1, n2) + use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_swap_colors + implicit none + type(psb_c_rb_idx_node), pointer, intent(inout) :: n1, n2 + + character(len=27) :: name + logical :: tmp + name='psb_rb_idx_tree_swap_colors' + + tmp = n1%is_red + n1%is_red = n2%is_red + n2%is_red = tmp +end subroutine psb_c_rb_idx_tree_swap_colors + +subroutine psb_c_rb_idx_tree_rotate_right(node) + use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_rotate_right + implicit none + type(psb_c_rb_idx_node), pointer, intent(inout) :: node + + character(len=28) :: name + type(psb_c_rb_idx_node), pointer :: l, lr + name='psb_rb_idx_tree_rotate_right' + + if (.not. associated(node%left)) return + + l => node%left + lr => l%right + node%left => lr + + if (associated(lr)) lr%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => l + else + node%parent%right => l + end if + end if + + l%parent => node%parent + node%parent => l + + l%right => node +end subroutine psb_c_rb_idx_tree_rotate_right + +subroutine psb_c_rb_idx_tree_rotate_left(node) + use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_rotate_left + implicit none + type(psb_c_rb_idx_node), pointer, intent(inout) :: node + + character(len=27) :: name + type(psb_c_rb_idx_node), pointer :: r, rl + name='psb_rb_idx_tree_rotate_left' + + if (.not. associated(node%right)) return + + r => node%right + rl => r%left + node%right => rl + + if (associated(rl)) rl%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => r + else + node%parent%right => r + end if + end if + + r%parent => node%parent + node%parent => r + + r%left => node +end subroutine psb_c_rb_idx_tree_rotate_left + +subroutine psb_c_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_scalar_sparse_row_mul + use psb_c_csr_mat_mod, only : psb_c_csr_sparse_mat + implicit none + type(psb_c_rb_idx_tree), intent(inout) :: tree + complex(psb_spk_), intent(in) :: scalar + type(psb_c_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + + character(len=37) :: name + integer(psb_ipk_) :: i + name='psb_rb_idx_tree_scalar_sparse_row_mul' + + do i = mat%irp(row_num), mat%irp(row_num + 1) - 1 + call tree%insert(mat%ja(i),scalar * mat%val(i)) + end do + +end subroutine psb_c_rb_idx_tree_scalar_sparse_row_mul + +subroutine psb_c_rb_idx_tree_merge(trees, mat) +#if defined(OPENMP) + use omp_lib +#endif + use psb_realloc_mod + use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_merge + use psb_c_csr_mat_mod, only : psb_c_csr_sparse_mat + implicit none + type(psb_c_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_c_csr_sparse_mat), intent(inout) :: mat + + character(len=21) :: name + integer(psb_ipk_) :: i, j, rows, info, nnz + type(psb_c_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_merge' + + rows = size(trees) + + mat%irp(1) = 1 + + do i=1, rows + mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz + end do + + nnz = mat%irp(rows + 1) + call psb_realloc(nnz, mat%val, info) + call psb_realloc(nnz, mat%ja, info) + +#if defined(OPENMP) + !$omp parallel do schedule(static), private(current, previous, j) +#endif + do i = 1, size(trees) + j = 0 + current => trees(i)%root + do while(associated(current)) + ! go to the left-most node + do while(associated(current%left)) + current => current%left + end do + mat%val(j + mat%irp(i)) = current%val + mat%ja(j + mat%irp(i)) = current%idx + j = j + 1 + + previous => current + if (associated(current%right)) then + if (associated(current%parent)) then + current%parent%left => current%right + end if + current%right%parent => current%parent + current => current%right + else + current => current%parent + if (associated(current)) nullify(current%left) + end if + deallocate(previous) + end do + end do +#if defined(OPENMP) + !$omp end parallel do +#endif +end subroutine psb_c_rb_idx_tree_merge diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index 16f5e26e..b8f692fa 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -3213,10 +3213,10 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) if (info == psb_success_) call psb_safe_ab_cpy(b%ja,a%ja,info) if (info == psb_success_) call psb_safe_ab_cpy(b%val,a%val,info) if (info == psb_success_) call psb_realloc(max(nr+1,nc+1),a%irp,info) - + endif - + #if defined(OPENMP) !$OMP PARALLEL default(shared) reduction(max:info) @@ -3246,7 +3246,7 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) #endif call a%set_host() - + end subroutine psb_d_cp_csr_from_coo @@ -3495,7 +3495,7 @@ subroutine psb_d_cp_csr_to_fmt(a,b,info) if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,b%irp,info) call psb_realloc(nz,b%ja,info) call psb_realloc(nz,b%val,info) @@ -3600,7 +3600,7 @@ subroutine psb_d_cp_csr_from_fmt(a,b,info) if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,a%irp,info) call psb_realloc(nz,a%ja,info) call psb_realloc(nz,a%val,info) @@ -3654,6 +3654,130 @@ subroutine psb_d_csr_clean_zeros(a, info) call a%set_host() end subroutine psb_d_csr_clean_zeros +#if 0 +subroutine psb_dcsrspspmm(a,b,c,info) + use psb_d_mat_mod + use psb_serial_mod, psb_protect_name => psb_dcsrspspmm + + implicit none + + class(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb + character(len=20) :: name + integer(psb_ipk_) :: err_act + name='psb_csrspspmm' + call psb_erractionsave(err_act) + info = psb_success_ + + if (a%is_dev()) call a%sync() + if (b%is_dev()) call b%sync() + + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + + if ( mb /= na ) then + write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb + info = psb_err_invalid_matrix_sizes_ + call psb_errpush(info,name) + goto 9999 + endif + + ! Estimate number of nonzeros on output. + nza = a%get_nzeros() + nzb = b%get_nzeros() + nzc = 2*(nza+nzb) + call c%allocate(ma,nb,nzc) + + call csr_spspmm(a,b,c,info) + + call c%set_asb() + call c%set_host() + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +contains + + subroutine csr_spspmm(a,b,c,info) + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(inout) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb + integer(psb_ipk_), allocatable :: irow(:), idxs(:) + real(psb_dpk_), allocatable :: row(:) + integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & + & nzc,nnzre, isz, ipb, irwsz, nrc, nze + real(psb_dpk_) :: cfb + + + info = psb_success_ + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + nze = min(size(c%val),size(c%ja)) + isz = max(ma,na,mb,nb) + call psb_realloc(isz,row,info) + if (info == 0) call psb_realloc(isz,idxs,info) + if (info == 0) call psb_realloc(isz,irow,info) + if (info /= 0) return + row = dzero + irow = 0 + nzc = 1 + do j = 1,ma + c%irp(j) = nzc + nrc = 0 + do k = a%irp(j), a%irp(j+1)-1 + irw = a%ja(k) + cfb = a%val(k) + irwsz = b%irp(irw+1)-b%irp(irw) + do i = b%irp(irw),b%irp(irw+1)-1 + icl = b%ja(i) + if (irow(icl) 0 ) then + if ((nzc+nrc)>nze) then + nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) + call psb_realloc(nze,c%val,info) + if (info == 0) call psb_realloc(nze,c%ja,info) + if (info /= 0) return + end if + + call psb_qsort(idxs(1:nrc)) + do i=1, nrc + irw = idxs(i) + c%ja(nzc) = irw + c%val(nzc) = row(irw) + row(irw) = dzero + nzc = nzc + 1 + end do + end if + end do + + c%irp(ma+1) = nzc + + + end subroutine csr_spspmm + +end subroutine psb_dcsrspspmm +#else subroutine psb_dcsrspspmm(a,b,c,info) use psb_d_mat_mod use psb_serial_mod, psb_protect_name => psb_dcsrspspmm @@ -3791,7 +3915,7 @@ contains end subroutine csr_spspmm ! gustavson's algorithm using perfect hashing - ! and OpenMP parallelisation + ! and OpenMP parallelisation subroutine spmm_omp_gustavson(a,b,c,info) use omp_lib @@ -3799,12 +3923,12 @@ contains type(psb_d_csr_sparse_mat), intent(in) :: a,b type(psb_d_csr_sparse_mat), intent(out):: c integer(psb_ipk_), intent(out) :: info - + real(psb_dpk_), allocatable :: vals(:), acc(:) integer(psb_ipk_) :: ma, nb integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx - + ma = a%get_nrows() nb = b%get_ncols() @@ -3814,7 +3938,7 @@ contains ! dense accumulator ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf call psb_realloc(nb, acc, info) - + allocate(offsets(omp_get_max_threads())) !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & !$omp shared(a,b,c,offsets) @@ -3822,66 +3946,66 @@ contains start_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma - if (start_idx == 0) then - start_idx = irw - end if - end_idx = irw - do jj = a%irp(irw), a%irp(irw + 1) - 1 - j = a%ja(jj) - thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) - end do + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do end do !$omp end do - + call psb_realloc(thread_upperbound, vals, info) call psb_realloc(thread_upperbound, col_inds, info) - + ! possible bottleneck acc = 0 - + nnz = 0 !$omp do schedule(static) private(irw, jj, j, k) do irw = 1, ma - rwnz = 0 - do jj = a%irp(irw), a%irp(irw + 1) - 1 - j = a%ja(jj) - do k = b%irp(j), b%irp(j + 1) - 1 - if (acc(b%ja(k)) == 0) then - nnz = nnz + 1 - rwnz = rwnz + 1 - col_inds(nnz) = b%ja(k) - end if - acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) - end do - end do - call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) - - do k = nnz - rwnz + 1, nnz - vals(k) = acc(col_inds(k)) - acc(col_inds(k)) = 0 + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) end do - c%irp(irw + 1) = rwnz + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz end do !$omp end do - + offsets(omp_get_thread_num() + 1) = nnz !$omp barrier - + ! possible bottleneck !$omp single do k = 1, omp_get_num_threads() - 1 - offsets(k + 1) = offsets(k + 1) + offsets(k) + offsets(k + 1) = offsets(k + 1) + offsets(k) end do !$omp end single !$omp barrier if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 end if do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) end do !$omp barrier @@ -3891,7 +4015,7 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) !$omp end parallel @@ -3904,13 +4028,13 @@ contains type(psb_d_csr_sparse_mat), intent(in) :: a,b type(psb_d_csr_sparse_mat), intent(out):: c integer(psb_ipk_), intent(out) :: info - + real(psb_dpk_), allocatable :: vals(:), acc(:) integer(psb_ipk_) :: ma, nb integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, & - start_idx, end_idx , blk, blk_size, rwstart,& - rwblk, rwblkrem, nblks + start_idx, end_idx , blk, blk_size, rwstart,& + rwblk, rwblkrem, nblks ma = a%get_nrows() nb = b%get_ncols() @@ -3931,76 +4055,76 @@ contains start_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma - do jj = a%irp(irw), a%irp(irw + 1) - 1 - j = a%ja(jj) - thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) - end do + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do end do !$omp end do - + call psb_realloc(thread_upperbound, vals, info) call psb_realloc(thread_upperbound, col_inds, info) - + ! possible bottleneck acc = 0 - + nnz = 0 !$omp do schedule(static) private(irw,jj,j,k,rwnz,blk,blk_size,rwstart) do blk = 0, nblks - 1 - if (blk < rwblkrem) then - blk_size = rwblk + 1 - rwstart = blk * rwblk + blk + 1 - else - blk_size = rwblk - rwstart = blk * rwblk & - + rwblkrem + 1 + if (blk < rwblkrem) then + blk_size = rwblk + 1 + rwstart = blk * rwblk + blk + 1 + else + blk_size = rwblk + rwstart = blk * rwblk & + + rwblkrem + 1 + end if + do irw = rwstart, rwstart + blk_size - 1 + if (start_idx == 0) then + start_idx = irw end if - do irw = rwstart, rwstart + blk_size - 1 - if (start_idx == 0) then - start_idx = irw + end_idx = irw + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) end if - end_idx = irw - rwnz = 0 - do jj = a%irp(irw), a%irp(irw + 1) - 1 - j = a%ja(jj) - do k = b%irp(j), b%irp(j + 1) - 1 - if (acc(b%ja(k)) == 0) then - nnz = nnz + 1 - rwnz = rwnz + 1 - col_inds(nnz) = b%ja(k) - end if - acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) - end do - end do - call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) - - do k = nnz - rwnz + 1, nnz - vals(k) = acc(col_inds(k)) - acc(col_inds(k)) = 0 - end do - c%irp(irw + 1) = rwnz + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 end do + c%irp(irw + 1) = rwnz + end do end do !$omp end do - + offsets(omp_get_thread_num() + 1) = nnz !$omp barrier - + ! possible bottleneck !$omp single do k = 1, omp_get_num_threads() - 1 - offsets(k + 1) = offsets(k + 1) + offsets(k) + offsets(k + 1) = offsets(k + 1) + offsets(k) end do !$omp end single !$omp barrier if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 end if do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) end do !$omp barrier @@ -4010,7 +4134,7 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) !$omp end parallel @@ -4034,20 +4158,20 @@ contains call c%allocate(a_m, b_n) do row = 1, a_m - row_accs(row)%nnz = 0 - nullify(row_accs(row)%root) - do col = a%irp(row), a%irp(row + 1) - 1 - call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) - end do + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do end do call psb_rb_idx_tree_merge(row_accs, c) deallocate(row_accs) info = 0 -end subroutine spmm_serial_rb_tree + end subroutine spmm_serial_rb_tree -subroutine spmm_omp_rb_tree(a,b,c,info) + subroutine spmm_omp_rb_tree(a,b,c,info) use omp_lib use psb_rb_idx_tree_mod implicit none @@ -4070,11 +4194,11 @@ subroutine spmm_omp_rb_tree(a,b,c,info) !$omp parallel do schedule(static) do row = 1, a_m - row_accs(row)%nnz = 0 - nullify(row_accs(row)%root) - do col = a%irp(row), a%irp(row + 1) - 1 - call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) - end do + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do end do !$omp end parallel do @@ -4082,9 +4206,9 @@ subroutine spmm_omp_rb_tree(a,b,c,info) deallocate(row_accs) info = 0 -end subroutine spmm_omp_rb_tree + end subroutine spmm_omp_rb_tree -subroutine compute_indices(a, b, c, info) + subroutine compute_indices(a, b, c, info) implicit none type(psb_d_csr_sparse_mat), intent(in) :: a,b type(psb_d_csr_sparse_mat), intent(out):: c @@ -4096,10 +4220,10 @@ subroutine compute_indices(a, b, c, info) full_mat_bound = 0 !omp parallel do schedule(static) reduction(+:full_mat_bound) do row = 1, a%get_nrows() - do col = a%irp(row), a%irp(row + 1) - 1 - j = a%ja(col) - full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j) - end do + do col = a%irp(row), a%irp(row + 1) - 1 + j = a%ja(col) + full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j) + end do end do !omp end parallel do @@ -4109,22 +4233,22 @@ subroutine compute_indices(a, b, c, info) c%irp(1) = 1 nnz = 0 - + do row = 1, a%get_nrows() - do col = a%irp(row), a%irp(row + 1) - 1 - do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1 - k = 0 - do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i)) - k = k + 1 - end do - if (c%ja(c%irp(row) + k) == 0) then - c%ja(c%irp(row)+k) = b%ja(i) - nnz = nnz + 1 - end if - end do + do col = a%irp(row), a%irp(row + 1) - 1 + do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1 + k = 0 + do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i)) + k = k + 1 + end do + if (c%ja(c%irp(row) + k) == 0) then + c%ja(c%irp(row)+k) = b%ja(i) + nnz = nnz + 1 + end if end do - c%irp(row + 1) = nnz + 1 - call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1)) + end do + c%irp(row + 1) = nnz + 1 + call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1)) end do @@ -4132,9 +4256,9 @@ subroutine compute_indices(a, b, c, info) call psb_realloc(nnz, c%val, info) c%val = 0 -end subroutine compute_indices + end subroutine compute_indices -subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num) + subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num) type(psb_d_csr_sparse_mat), intent(inout) :: out_mat integer(psb_ipk_), intent(in) :: out_row_num real(psb_dpk_), intent(in) :: scalar @@ -4147,17 +4271,17 @@ subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_ row_end = out_mat%irp(out_row_num + 1) - 1 do i = mat%irp(trgt_row_num), mat%irp(trgt_row_num + 1) - 1 - do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1 - if (out_mat%ja(k) == mat%ja(i)) then - out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i) - exit - end if - end do + do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1 + if (out_mat%ja(k) == mat%ja(i)) then + out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i) + exit + end if + end do end do -end subroutine direct_scalar_sparse_row_mul + end subroutine direct_scalar_sparse_row_mul -subroutine spmm_omp_two_pass(a,b,c,info) + subroutine spmm_omp_two_pass(a,b,c,info) use omp_lib implicit none @@ -4171,20 +4295,20 @@ subroutine spmm_omp_two_pass(a,b,c,info) b_n = b%get_ncols() call c%allocate(a_m, b_n) - + call compute_indices(a, b, c, info) !$omp parallel do schedule(static) do row = 1, a_m - do col = a%irp(row), a%irp(row + 1) - 1 - call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col)) - end do + do col = a%irp(row), a%irp(row + 1) - 1 + call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col)) + end do end do !$omp end parallel do -end subroutine spmm_omp_two_pass + end subroutine spmm_omp_two_pass end subroutine psb_dcsrspspmm - +#endif ! ! diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index dbe7a4be..59d73892 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -3213,10 +3213,10 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) if (info == psb_success_) call psb_safe_ab_cpy(b%ja,a%ja,info) if (info == psb_success_) call psb_safe_ab_cpy(b%val,a%val,info) if (info == psb_success_) call psb_realloc(max(nr+1,nc+1),a%irp,info) - + endif - + #if defined(OPENMP) !$OMP PARALLEL default(shared) reduction(max:info) @@ -3246,7 +3246,7 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) #endif call a%set_host() - + end subroutine psb_s_cp_csr_from_coo @@ -3495,7 +3495,7 @@ subroutine psb_s_cp_csr_to_fmt(a,b,info) if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,b%irp,info) call psb_realloc(nz,b%ja,info) call psb_realloc(nz,b%val,info) @@ -3600,7 +3600,7 @@ subroutine psb_s_cp_csr_from_fmt(a,b,info) if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,a%irp,info) call psb_realloc(nz,a%ja,info) call psb_realloc(nz,a%val,info) @@ -3654,6 +3654,7 @@ subroutine psb_s_csr_clean_zeros(a, info) call a%set_host() end subroutine psb_s_csr_clean_zeros +#if 0 subroutine psb_scsrspspmm(a,b,c,info) use psb_s_mat_mod use psb_serial_mod, psb_protect_name => psb_scsrspspmm @@ -3776,7 +3777,538 @@ contains end subroutine csr_spspmm end subroutine psb_scsrspspmm +#else +subroutine psb_scsrspspmm(a,b,c,info) + use psb_s_mat_mod + use psb_serial_mod, psb_protect_name => psb_scsrspspmm + + implicit none + + class(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(out) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb + character(len=20) :: name + integer(psb_ipk_) :: err_act + name='psb_csrspspmm' + call psb_erractionsave(err_act) + info = psb_success_ + + if (a%is_dev()) call a%sync() + if (b%is_dev()) call b%sync() + + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + + if ( mb /= na ) then + write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb + info = psb_err_invalid_matrix_sizes_ + call psb_errpush(info,name) + goto 9999 + endif + + select case(spspmm_impl) + case (spspmm_serial) + ! Estimate number of nonzeros on output. + nza = a%get_nzeros() + nzb = b%get_nzeros() + nzc = 2*(nza+nzb) + call c%allocate(ma,nb,nzc) + + call csr_spspmm(a,b,c,info) + case (spspmm_omp_gustavson) + call spmm_omp_gustavson(a,b,c,info) + case (spspmm_omp_gustavson_1d) + call spmm_omp_gustavson_1d(a,b,c,info) + case (spspmm_serial_rb_tree) + call spmm_serial_rb_tree(a,b,c,info) + case (spspmm_omp_rb_tree) + call spmm_omp_rb_tree(a,b,c,info) + case (spspmm_omp_two_pass) + call spmm_omp_two_pass(a,b,c,info) + case default + write(psb_err_unit,*) 'Unknown spspmm implementation' + ! push error + goto 9999 + end select + + call c%set_asb() + call c%set_host() + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +contains + + subroutine csr_spspmm(a,b,c,info) + implicit none + type(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(inout) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb + integer(psb_ipk_), allocatable :: irow(:), idxs(:) + real(psb_spk_), allocatable :: row(:) + integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & + & nzc,nnzre, isz, ipb, irwsz, nrc, nze + real(psb_spk_) :: cfb + + + info = psb_success_ + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + nze = min(size(c%val),size(c%ja)) + isz = max(ma,na,mb,nb) + call psb_realloc(isz,row,info) + if (info == 0) call psb_realloc(isz,idxs,info) + if (info == 0) call psb_realloc(isz,irow,info) + if (info /= 0) return + row = dzero + irow = 0 + nzc = 1 + do j = 1,ma + c%irp(j) = nzc + nrc = 0 + do k = a%irp(j), a%irp(j+1)-1 + irw = a%ja(k) + cfb = a%val(k) + irwsz = b%irp(irw+1)-b%irp(irw) + do i = b%irp(irw),b%irp(irw+1)-1 + icl = b%ja(i) + if (irow(icl) 0 ) then + if ((nzc+nrc)>nze) then + nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) + call psb_realloc(nze,c%val,info) + if (info == 0) call psb_realloc(nze,c%ja,info) + if (info /= 0) return + end if + + call psb_qsort(idxs(1:nrc)) + do i=1, nrc + irw = idxs(i) + c%ja(nzc) = irw + c%val(nzc) = row(irw) + row(irw) = dzero + nzc = nzc + 1 + end do + end if + end do + + c%irp(ma+1) = nzc + end subroutine csr_spspmm + + ! gustavson's algorithm using perfect hashing + ! and OpenMP parallelisation + subroutine spmm_omp_gustavson(a,b,c,info) + use omp_lib + + implicit none + type(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + real(psb_spk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + + allocate(offsets(omp_get_max_threads())) + !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & + !$omp shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw, jj, j, k) + do irw = 1, ma + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson + + subroutine spmm_omp_gustavson_1d(a,b,c,info) + use omp_lib + + implicit none + type(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + real(psb_spk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, & + start_idx, end_idx , blk, blk_size, rwstart,& + rwblk, rwblkrem, nblks + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + allocate(offsets(omp_get_max_threads())) + + nblks = 4 * omp_get_max_threads() + rwblk = (ma / nblks) + rwblkrem = modulo(ma, nblks) + !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw,jj,j,k,rwnz,blk,blk_size,rwstart) + do blk = 0, nblks - 1 + if (blk < rwblkrem) then + blk_size = rwblk + 1 + rwstart = blk * rwblk + blk + 1 + else + blk_size = rwblk + rwstart = blk * rwblk & + + rwblkrem + 1 + end if + do irw = rwstart, rwstart + blk_size - 1 + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson_1d + + subroutine spmm_serial_rb_tree(a,b,c,info) + use psb_rb_idx_tree_mod + implicit none + type(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_s_rb_idx_tree), allocatable :: row_accs(:) + + a_m = a%get_nrows() + b_n = b%get_ncols() + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + + info = 0 + end subroutine spmm_serial_rb_tree + + subroutine spmm_omp_rb_tree(a,b,c,info) + use omp_lib + use psb_rb_idx_tree_mod + implicit none + type(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_s_rb_idx_tree), allocatable :: row_accs(:) + real(8) :: tic, toc + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + !$omp parallel do schedule(static) + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + info = 0 + end subroutine spmm_omp_rb_tree + + subroutine compute_indices(a, b, c, info) + implicit none + type(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer :: full_mat_bound + integer :: row, col, i, j, k, nnz + + full_mat_bound = 0 + !omp parallel do schedule(static) reduction(+:full_mat_bound) + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + j = a%ja(col) + full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j) + end do + end do + !omp end parallel do + + call psb_realloc(a%get_nrows() + 1, c%irp, info) + call psb_realloc(full_mat_bound, c%ja, info) + c%ja = 0 + c%irp(1) = 1 + + nnz = 0 + + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1 + k = 0 + do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i)) + k = k + 1 + end do + if (c%ja(c%irp(row) + k) == 0) then + c%ja(c%irp(row)+k) = b%ja(i) + nnz = nnz + 1 + end if + end do + end do + c%irp(row + 1) = nnz + 1 + call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1)) + end do + + + call psb_realloc(nnz, c%ja, info) + call psb_realloc(nnz, c%val, info) + + c%val = 0 + end subroutine compute_indices + + subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num) + type(psb_s_csr_sparse_mat), intent(inout) :: out_mat + integer(psb_ipk_), intent(in) :: out_row_num + real(psb_spk_), intent(in) :: scalar + type(psb_s_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: trgt_row_num + + integer(psb_ipk_) :: i, k, row_start, row_end + + row_start = out_mat%irp(out_row_num) + row_end = out_mat%irp(out_row_num + 1) - 1 + + do i = mat%irp(trgt_row_num), mat%irp(trgt_row_num + 1) - 1 + do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1 + if (out_mat%ja(k) == mat%ja(i)) then + out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i) + exit + end if + end do + end do + + end subroutine direct_scalar_sparse_row_mul + + subroutine spmm_omp_two_pass(a,b,c,info) + use omp_lib + + implicit none + type(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n, row, col + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + call compute_indices(a, b, c, info) + + !$omp parallel do schedule(static) + do row = 1, a_m + do col = a%irp(row), a%irp(row + 1) - 1 + call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + end subroutine spmm_omp_two_pass + +end subroutine psb_scsrspspmm +#endif ! ! diff --git a/base/serial/impl/psb_s_rb_idx_tree_impl.F90 b/base/serial/impl/psb_s_rb_idx_tree_impl.F90 new file mode 100644 index 00000000..ae624f72 --- /dev/null +++ b/base/serial/impl/psb_s_rb_idx_tree_impl.F90 @@ -0,0 +1,329 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_s_rb_idx_tree_impl +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +subroutine psb_s_rb_idx_tree_insert(this, idx, val) + use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_insert + implicit none + class(psb_s_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + real(psb_spk_), intent(in) :: val + + character(len=22) :: name + type(psb_s_rb_idx_node), pointer :: new_node + type(psb_s_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_insert' + + allocate(new_node) + new_node%idx = idx + new_node%val = val + nullify(new_node%left) + nullify(new_node%right) + nullify(new_node%parent) + new_node%is_red = .true. + + + if (.not. associated(this%root)) then + this%root => new_node + this%nnz = 1 + new_node%is_red = .false. + return + end if + + current => this%root + + do while (associated(current)) + previous => current + + if (idx == current%idx) then + current%val = current%val + val + deallocate(new_node) + return + else if (idx < current%idx) then + current => current%left + else + + current => current%right + end if + end do + + if (idx < previous%idx) then + new_node%parent => previous + previous%left => new_node + else + new_node%parent => previous + previous%right => new_node + end if + + call psb_s_rb_idx_tree_fix_insertion(this, new_node) + + this%nnz = this%nnz + 1 +end subroutine psb_s_rb_idx_tree_insert + +subroutine psb_s_rb_idx_tree_fix_insertion(this, node) + use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_fix_insertion + implicit none + class(psb_s_rb_idx_tree), intent(inout) :: this + type(psb_s_rb_idx_node), pointer, intent(inout) :: node + + character(len=29) :: name + type(psb_s_rb_idx_node), pointer :: current, parent, grand_parent, uncle + name = 'psb_rb_idx_tree_fix_insertion' + + current => node + parent => current%parent + do while(associated(parent) .and. parent%is_red) + ! grand parent exist because root can't be red + grand_parent => parent%parent + if (parent%idx < grand_parent%idx) then + uncle => grand_parent%right + else + uncle => grand_parent%left + end if + + if (associated(uncle) .and. uncle%is_red) then + parent%is_red = .false. + uncle%is_red = .false. + grand_parent%is_red = .true. + current => grand_parent + parent => current%parent + + ! Left-Left case + else if (current%idx < parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_s_rb_idx_tree_rotate_right(grand_parent) + call psb_s_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Left-Right case + else if (current%idx > parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_s_rb_idx_tree_rotate_left(parent) + call psb_s_rb_idx_tree_rotate_right(grand_parent) + call psb_s_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + ! Right-Right case + else if (current%idx > parent%idx .and. & + parent%idx > grand_parent%idx) then + call psb_s_rb_idx_tree_rotate_left(grand_parent) + call psb_s_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Right-Left case + else + call psb_s_rb_idx_tree_rotate_right(parent) + call psb_s_rb_idx_tree_rotate_left(grand_parent) + call psb_s_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + end if + end do + + this%root%is_red = .false. +end subroutine psb_s_rb_idx_tree_fix_insertion + +subroutine psb_s_rb_idx_tree_swap_colors(n1, n2) + use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_swap_colors + implicit none + type(psb_s_rb_idx_node), pointer, intent(inout) :: n1, n2 + + character(len=27) :: name + logical :: tmp + name='psb_rb_idx_tree_swap_colors' + + tmp = n1%is_red + n1%is_red = n2%is_red + n2%is_red = tmp +end subroutine psb_s_rb_idx_tree_swap_colors + +subroutine psb_s_rb_idx_tree_rotate_right(node) + use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_rotate_right + implicit none + type(psb_s_rb_idx_node), pointer, intent(inout) :: node + + character(len=28) :: name + type(psb_s_rb_idx_node), pointer :: l, lr + name='psb_rb_idx_tree_rotate_right' + + if (.not. associated(node%left)) return + + l => node%left + lr => l%right + node%left => lr + + if (associated(lr)) lr%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => l + else + node%parent%right => l + end if + end if + + l%parent => node%parent + node%parent => l + + l%right => node +end subroutine psb_s_rb_idx_tree_rotate_right + +subroutine psb_s_rb_idx_tree_rotate_left(node) + use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_rotate_left + implicit none + type(psb_s_rb_idx_node), pointer, intent(inout) :: node + + character(len=27) :: name + type(psb_s_rb_idx_node), pointer :: r, rl + name='psb_rb_idx_tree_rotate_left' + + if (.not. associated(node%right)) return + + r => node%right + rl => r%left + node%right => rl + + if (associated(rl)) rl%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => r + else + node%parent%right => r + end if + end if + + r%parent => node%parent + node%parent => r + + r%left => node +end subroutine psb_s_rb_idx_tree_rotate_left + +subroutine psb_s_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_scalar_sparse_row_mul + use psb_s_csr_mat_mod, only : psb_s_csr_sparse_mat + implicit none + type(psb_s_rb_idx_tree), intent(inout) :: tree + real(psb_spk_), intent(in) :: scalar + type(psb_s_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + + character(len=37) :: name + integer(psb_ipk_) :: i + name='psb_rb_idx_tree_scalar_sparse_row_mul' + + do i = mat%irp(row_num), mat%irp(row_num + 1) - 1 + call tree%insert(mat%ja(i),scalar * mat%val(i)) + end do + +end subroutine psb_s_rb_idx_tree_scalar_sparse_row_mul + +subroutine psb_s_rb_idx_tree_merge(trees, mat) +#if defined(OPENMP) + use omp_lib +#endif + use psb_realloc_mod + use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_merge + use psb_s_csr_mat_mod, only : psb_s_csr_sparse_mat + implicit none + type(psb_s_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_s_csr_sparse_mat), intent(inout) :: mat + + character(len=21) :: name + integer(psb_ipk_) :: i, j, rows, info, nnz + type(psb_s_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_merge' + + rows = size(trees) + + mat%irp(1) = 1 + + do i=1, rows + mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz + end do + + nnz = mat%irp(rows + 1) + call psb_realloc(nnz, mat%val, info) + call psb_realloc(nnz, mat%ja, info) + +#if defined(OPENMP) + !$omp parallel do schedule(static), private(current, previous, j) +#endif + do i = 1, size(trees) + j = 0 + current => trees(i)%root + do while(associated(current)) + ! go to the left-most node + do while(associated(current%left)) + current => current%left + end do + mat%val(j + mat%irp(i)) = current%val + mat%ja(j + mat%irp(i)) = current%idx + j = j + 1 + + previous => current + if (associated(current%right)) then + if (associated(current%parent)) then + current%parent%left => current%right + end if + current%right%parent => current%parent + current => current%right + else + current => current%parent + if (associated(current)) nullify(current%left) + end if + deallocate(previous) + end do + end do +#if defined(OPENMP) + !$omp end parallel do +#endif +end subroutine psb_s_rb_idx_tree_merge diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index 9322105e..34cd6fbc 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -3213,10 +3213,10 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) if (info == psb_success_) call psb_safe_ab_cpy(b%ja,a%ja,info) if (info == psb_success_) call psb_safe_ab_cpy(b%val,a%val,info) if (info == psb_success_) call psb_realloc(max(nr+1,nc+1),a%irp,info) - + endif - + #if defined(OPENMP) !$OMP PARALLEL default(shared) reduction(max:info) @@ -3246,7 +3246,7 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) #endif call a%set_host() - + end subroutine psb_z_cp_csr_from_coo @@ -3495,7 +3495,7 @@ subroutine psb_z_cp_csr_to_fmt(a,b,info) if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,b%irp,info) call psb_realloc(nz,b%ja,info) call psb_realloc(nz,b%val,info) @@ -3600,7 +3600,7 @@ subroutine psb_z_cp_csr_from_fmt(a,b,info) if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,a%irp,info) call psb_realloc(nz,a%ja,info) call psb_realloc(nz,a%val,info) @@ -3654,6 +3654,7 @@ subroutine psb_z_csr_clean_zeros(a, info) call a%set_host() end subroutine psb_z_csr_clean_zeros +#if 0 subroutine psb_zcsrspspmm(a,b,c,info) use psb_z_mat_mod use psb_serial_mod, psb_protect_name => psb_zcsrspspmm @@ -3776,7 +3777,538 @@ contains end subroutine csr_spspmm end subroutine psb_zcsrspspmm +#else +subroutine psb_zcsrspspmm(a,b,c,info) + use psb_z_mat_mod + use psb_serial_mod, psb_protect_name => psb_zcsrspspmm + + implicit none + + class(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(out) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb + character(len=20) :: name + integer(psb_ipk_) :: err_act + name='psb_csrspspmm' + call psb_erractionsave(err_act) + info = psb_success_ + + if (a%is_dev()) call a%sync() + if (b%is_dev()) call b%sync() + + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + + if ( mb /= na ) then + write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb + info = psb_err_invalid_matrix_sizes_ + call psb_errpush(info,name) + goto 9999 + endif + + select case(spspmm_impl) + case (spspmm_serial) + ! Estimate number of nonzeros on output. + nza = a%get_nzeros() + nzb = b%get_nzeros() + nzc = 2*(nza+nzb) + call c%allocate(ma,nb,nzc) + + call csr_spspmm(a,b,c,info) + case (spspmm_omp_gustavson) + call spmm_omp_gustavson(a,b,c,info) + case (spspmm_omp_gustavson_1d) + call spmm_omp_gustavson_1d(a,b,c,info) + case (spspmm_serial_rb_tree) + call spmm_serial_rb_tree(a,b,c,info) + case (spspmm_omp_rb_tree) + call spmm_omp_rb_tree(a,b,c,info) + case (spspmm_omp_two_pass) + call spmm_omp_two_pass(a,b,c,info) + case default + write(psb_err_unit,*) 'Unknown spspmm implementation' + ! push error + goto 9999 + end select + + call c%set_asb() + call c%set_host() + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +contains + + subroutine csr_spspmm(a,b,c,info) + implicit none + type(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(inout) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb + integer(psb_ipk_), allocatable :: irow(:), idxs(:) + complex(psb_dpk_), allocatable :: row(:) + integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & + & nzc,nnzre, isz, ipb, irwsz, nrc, nze + complex(psb_dpk_) :: cfb + + + info = psb_success_ + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + nze = min(size(c%val),size(c%ja)) + isz = max(ma,na,mb,nb) + call psb_realloc(isz,row,info) + if (info == 0) call psb_realloc(isz,idxs,info) + if (info == 0) call psb_realloc(isz,irow,info) + if (info /= 0) return + row = dzero + irow = 0 + nzc = 1 + do j = 1,ma + c%irp(j) = nzc + nrc = 0 + do k = a%irp(j), a%irp(j+1)-1 + irw = a%ja(k) + cfb = a%val(k) + irwsz = b%irp(irw+1)-b%irp(irw) + do i = b%irp(irw),b%irp(irw+1)-1 + icl = b%ja(i) + if (irow(icl) 0 ) then + if ((nzc+nrc)>nze) then + nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) + call psb_realloc(nze,c%val,info) + if (info == 0) call psb_realloc(nze,c%ja,info) + if (info /= 0) return + end if + + call psb_qsort(idxs(1:nrc)) + do i=1, nrc + irw = idxs(i) + c%ja(nzc) = irw + c%val(nzc) = row(irw) + row(irw) = dzero + nzc = nzc + 1 + end do + end if + end do + + c%irp(ma+1) = nzc + end subroutine csr_spspmm + + ! gustavson's algorithm using perfect hashing + ! and OpenMP parallelisation + subroutine spmm_omp_gustavson(a,b,c,info) + use omp_lib + + implicit none + type(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + complex(psb_dpk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + + allocate(offsets(omp_get_max_threads())) + !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & + !$omp shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw, jj, j, k) + do irw = 1, ma + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson + + subroutine spmm_omp_gustavson_1d(a,b,c,info) + use omp_lib + + implicit none + type(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + complex(psb_dpk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, & + start_idx, end_idx , blk, blk_size, rwstart,& + rwblk, rwblkrem, nblks + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + allocate(offsets(omp_get_max_threads())) + + nblks = 4 * omp_get_max_threads() + rwblk = (ma / nblks) + rwblkrem = modulo(ma, nblks) + !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw,jj,j,k,rwnz,blk,blk_size,rwstart) + do blk = 0, nblks - 1 + if (blk < rwblkrem) then + blk_size = rwblk + 1 + rwstart = blk * rwblk + blk + 1 + else + blk_size = rwblk + rwstart = blk * rwblk & + + rwblkrem + 1 + end if + do irw = rwstart, rwstart + blk_size - 1 + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson_1d + + subroutine spmm_serial_rb_tree(a,b,c,info) + use psb_rb_idx_tree_mod + implicit none + type(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_z_rb_idx_tree), allocatable :: row_accs(:) + + a_m = a%get_nrows() + b_n = b%get_ncols() + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + + info = 0 + end subroutine spmm_serial_rb_tree + + subroutine spmm_omp_rb_tree(a,b,c,info) + use omp_lib + use psb_rb_idx_tree_mod + implicit none + type(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_z_rb_idx_tree), allocatable :: row_accs(:) + real(8) :: tic, toc + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + !$omp parallel do schedule(static) + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + info = 0 + end subroutine spmm_omp_rb_tree + + subroutine compute_indices(a, b, c, info) + implicit none + type(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer :: full_mat_bound + integer :: row, col, i, j, k, nnz + + full_mat_bound = 0 + !omp parallel do schedule(static) reduction(+:full_mat_bound) + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + j = a%ja(col) + full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j) + end do + end do + !omp end parallel do + + call psb_realloc(a%get_nrows() + 1, c%irp, info) + call psb_realloc(full_mat_bound, c%ja, info) + c%ja = 0 + c%irp(1) = 1 + + nnz = 0 + + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1 + k = 0 + do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i)) + k = k + 1 + end do + if (c%ja(c%irp(row) + k) == 0) then + c%ja(c%irp(row)+k) = b%ja(i) + nnz = nnz + 1 + end if + end do + end do + c%irp(row + 1) = nnz + 1 + call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1)) + end do + + + call psb_realloc(nnz, c%ja, info) + call psb_realloc(nnz, c%val, info) + + c%val = 0 + end subroutine compute_indices + + subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num) + type(psb_z_csr_sparse_mat), intent(inout) :: out_mat + integer(psb_ipk_), intent(in) :: out_row_num + complex(psb_dpk_), intent(in) :: scalar + type(psb_z_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: trgt_row_num + + integer(psb_ipk_) :: i, k, row_start, row_end + + row_start = out_mat%irp(out_row_num) + row_end = out_mat%irp(out_row_num + 1) - 1 + + do i = mat%irp(trgt_row_num), mat%irp(trgt_row_num + 1) - 1 + do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1 + if (out_mat%ja(k) == mat%ja(i)) then + out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i) + exit + end if + end do + end do + + end subroutine direct_scalar_sparse_row_mul + + subroutine spmm_omp_two_pass(a,b,c,info) + use omp_lib + + implicit none + type(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n, row, col + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + call compute_indices(a, b, c, info) + + !$omp parallel do schedule(static) + do row = 1, a_m + do col = a%irp(row), a%irp(row + 1) - 1 + call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + end subroutine spmm_omp_two_pass + +end subroutine psb_zcsrspspmm +#endif ! ! diff --git a/base/serial/impl/psb_z_rb_idx_tree_impl.F90 b/base/serial/impl/psb_z_rb_idx_tree_impl.F90 new file mode 100644 index 00000000..88ab214b --- /dev/null +++ b/base/serial/impl/psb_z_rb_idx_tree_impl.F90 @@ -0,0 +1,329 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_z_rb_idx_tree_impl +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +subroutine psb_z_rb_idx_tree_insert(this, idx, val) + use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_insert + implicit none + class(psb_z_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + complex(psb_dpk_), intent(in) :: val + + character(len=22) :: name + type(psb_z_rb_idx_node), pointer :: new_node + type(psb_z_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_insert' + + allocate(new_node) + new_node%idx = idx + new_node%val = val + nullify(new_node%left) + nullify(new_node%right) + nullify(new_node%parent) + new_node%is_red = .true. + + + if (.not. associated(this%root)) then + this%root => new_node + this%nnz = 1 + new_node%is_red = .false. + return + end if + + current => this%root + + do while (associated(current)) + previous => current + + if (idx == current%idx) then + current%val = current%val + val + deallocate(new_node) + return + else if (idx < current%idx) then + current => current%left + else + + current => current%right + end if + end do + + if (idx < previous%idx) then + new_node%parent => previous + previous%left => new_node + else + new_node%parent => previous + previous%right => new_node + end if + + call psb_z_rb_idx_tree_fix_insertion(this, new_node) + + this%nnz = this%nnz + 1 +end subroutine psb_z_rb_idx_tree_insert + +subroutine psb_z_rb_idx_tree_fix_insertion(this, node) + use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_fix_insertion + implicit none + class(psb_z_rb_idx_tree), intent(inout) :: this + type(psb_z_rb_idx_node), pointer, intent(inout) :: node + + character(len=29) :: name + type(psb_z_rb_idx_node), pointer :: current, parent, grand_parent, uncle + name = 'psb_rb_idx_tree_fix_insertion' + + current => node + parent => current%parent + do while(associated(parent) .and. parent%is_red) + ! grand parent exist because root can't be red + grand_parent => parent%parent + if (parent%idx < grand_parent%idx) then + uncle => grand_parent%right + else + uncle => grand_parent%left + end if + + if (associated(uncle) .and. uncle%is_red) then + parent%is_red = .false. + uncle%is_red = .false. + grand_parent%is_red = .true. + current => grand_parent + parent => current%parent + + ! Left-Left case + else if (current%idx < parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_z_rb_idx_tree_rotate_right(grand_parent) + call psb_z_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Left-Right case + else if (current%idx > parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_z_rb_idx_tree_rotate_left(parent) + call psb_z_rb_idx_tree_rotate_right(grand_parent) + call psb_z_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + ! Right-Right case + else if (current%idx > parent%idx .and. & + parent%idx > grand_parent%idx) then + call psb_z_rb_idx_tree_rotate_left(grand_parent) + call psb_z_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Right-Left case + else + call psb_z_rb_idx_tree_rotate_right(parent) + call psb_z_rb_idx_tree_rotate_left(grand_parent) + call psb_z_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + end if + end do + + this%root%is_red = .false. +end subroutine psb_z_rb_idx_tree_fix_insertion + +subroutine psb_z_rb_idx_tree_swap_colors(n1, n2) + use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_swap_colors + implicit none + type(psb_z_rb_idx_node), pointer, intent(inout) :: n1, n2 + + character(len=27) :: name + logical :: tmp + name='psb_rb_idx_tree_swap_colors' + + tmp = n1%is_red + n1%is_red = n2%is_red + n2%is_red = tmp +end subroutine psb_z_rb_idx_tree_swap_colors + +subroutine psb_z_rb_idx_tree_rotate_right(node) + use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_rotate_right + implicit none + type(psb_z_rb_idx_node), pointer, intent(inout) :: node + + character(len=28) :: name + type(psb_z_rb_idx_node), pointer :: l, lr + name='psb_rb_idx_tree_rotate_right' + + if (.not. associated(node%left)) return + + l => node%left + lr => l%right + node%left => lr + + if (associated(lr)) lr%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => l + else + node%parent%right => l + end if + end if + + l%parent => node%parent + node%parent => l + + l%right => node +end subroutine psb_z_rb_idx_tree_rotate_right + +subroutine psb_z_rb_idx_tree_rotate_left(node) + use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_rotate_left + implicit none + type(psb_z_rb_idx_node), pointer, intent(inout) :: node + + character(len=27) :: name + type(psb_z_rb_idx_node), pointer :: r, rl + name='psb_rb_idx_tree_rotate_left' + + if (.not. associated(node%right)) return + + r => node%right + rl => r%left + node%right => rl + + if (associated(rl)) rl%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => r + else + node%parent%right => r + end if + end if + + r%parent => node%parent + node%parent => r + + r%left => node +end subroutine psb_z_rb_idx_tree_rotate_left + +subroutine psb_z_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_scalar_sparse_row_mul + use psb_z_csr_mat_mod, only : psb_z_csr_sparse_mat + implicit none + type(psb_z_rb_idx_tree), intent(inout) :: tree + complex(psb_dpk_), intent(in) :: scalar + type(psb_z_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + + character(len=37) :: name + integer(psb_ipk_) :: i + name='psb_rb_idx_tree_scalar_sparse_row_mul' + + do i = mat%irp(row_num), mat%irp(row_num + 1) - 1 + call tree%insert(mat%ja(i),scalar * mat%val(i)) + end do + +end subroutine psb_z_rb_idx_tree_scalar_sparse_row_mul + +subroutine psb_z_rb_idx_tree_merge(trees, mat) +#if defined(OPENMP) + use omp_lib +#endif + use psb_realloc_mod + use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_merge + use psb_z_csr_mat_mod, only : psb_z_csr_sparse_mat + implicit none + type(psb_z_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_z_csr_sparse_mat), intent(inout) :: mat + + character(len=21) :: name + integer(psb_ipk_) :: i, j, rows, info, nnz + type(psb_z_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_merge' + + rows = size(trees) + + mat%irp(1) = 1 + + do i=1, rows + mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz + end do + + nnz = mat%irp(rows + 1) + call psb_realloc(nnz, mat%val, info) + call psb_realloc(nnz, mat%ja, info) + +#if defined(OPENMP) + !$omp parallel do schedule(static), private(current, previous, j) +#endif + do i = 1, size(trees) + j = 0 + current => trees(i)%root + do while(associated(current)) + ! go to the left-most node + do while(associated(current%left)) + current => current%left + end do + mat%val(j + mat%irp(i)) = current%val + mat%ja(j + mat%irp(i)) = current%idx + j = j + 1 + + previous => current + if (associated(current%right)) then + if (associated(current%parent)) then + current%parent%left => current%right + end if + current%right%parent => current%parent + current => current%right + else + current => current%parent + if (associated(current)) nullify(current%left) + end if + deallocate(previous) + end do + end do +#if defined(OPENMP) + !$omp end parallel do +#endif +end subroutine psb_z_rb_idx_tree_merge From 41be1357c3af03b6dadb98b1b9647b9fa1f67caf Mon Sep 17 00:00:00 2001 From: sfilippone Date: Wed, 30 Aug 2023 13:45:51 +0200 Subject: [PATCH 08/11] Set defaults for SPSPMM depending on OpenMP compilation. --- base/modules/serial/psb_base_mat_mod.F90 | 5 +- base/serial/impl/psb_c_csr_impl.F90 | 250 ++++++++++++----------- base/serial/impl/psb_d_csr_impl.F90 | 250 ++++++++++++----------- base/serial/impl/psb_s_csr_impl.F90 | 250 ++++++++++++----------- base/serial/impl/psb_z_csr_impl.F90 | 250 ++++++++++++----------- 5 files changed, 508 insertions(+), 497 deletions(-) diff --git a/base/modules/serial/psb_base_mat_mod.F90 b/base/modules/serial/psb_base_mat_mod.F90 index 07bfbd50..2380fcb2 100644 --- a/base/modules/serial/psb_base_mat_mod.F90 +++ b/base/modules/serial/psb_base_mat_mod.F90 @@ -80,8 +80,11 @@ module psb_base_mat_mod integer(psb_ipk_), parameter :: spspmm_serial_rb_tree = 3 integer(psb_ipk_), parameter :: spspmm_omp_rb_tree = 4 integer(psb_ipk_), parameter :: spspmm_omp_two_pass = 5 +#if defined(OPENMP) + integer(psb_ipk_), save :: spspmm_impl = spspmm_omp_gustavson +#else integer(psb_ipk_), save :: spspmm_impl = spspmm_serial - +#endif ! !> \namespace psb_base_mod \class psb_base_sparse_mat diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index 3709ad21..2028300d 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -3654,130 +3654,7 @@ subroutine psb_c_csr_clean_zeros(a, info) call a%set_host() end subroutine psb_c_csr_clean_zeros -#if 0 -subroutine psb_ccsrspspmm(a,b,c,info) - use psb_c_mat_mod - use psb_serial_mod, psb_protect_name => psb_ccsrspspmm - - implicit none - - class(psb_c_csr_sparse_mat), intent(in) :: a,b - type(psb_c_csr_sparse_mat), intent(out) :: c - integer(psb_ipk_), intent(out) :: info - integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb - character(len=20) :: name - integer(psb_ipk_) :: err_act - name='psb_csrspspmm' - call psb_erractionsave(err_act) - info = psb_success_ - - if (a%is_dev()) call a%sync() - if (b%is_dev()) call b%sync() - - ma = a%get_nrows() - na = a%get_ncols() - mb = b%get_nrows() - nb = b%get_ncols() - - - if ( mb /= na ) then - write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb - info = psb_err_invalid_matrix_sizes_ - call psb_errpush(info,name) - goto 9999 - endif - - ! Estimate number of nonzeros on output. - nza = a%get_nzeros() - nzb = b%get_nzeros() - nzc = 2*(nza+nzb) - call c%allocate(ma,nb,nzc) - - call csr_spspmm(a,b,c,info) - - call c%set_asb() - call c%set_host() - - call psb_erractionrestore(err_act) - return - -9999 call psb_error_handler(err_act) - - return - -contains - - subroutine csr_spspmm(a,b,c,info) - implicit none - type(psb_c_csr_sparse_mat), intent(in) :: a,b - type(psb_c_csr_sparse_mat), intent(inout) :: c - integer(psb_ipk_), intent(out) :: info - integer(psb_ipk_) :: ma,na,mb,nb - integer(psb_ipk_), allocatable :: irow(:), idxs(:) - complex(psb_spk_), allocatable :: row(:) - integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & - & nzc,nnzre, isz, ipb, irwsz, nrc, nze - complex(psb_spk_) :: cfb - - - info = psb_success_ - ma = a%get_nrows() - na = a%get_ncols() - mb = b%get_nrows() - nb = b%get_ncols() - - nze = min(size(c%val),size(c%ja)) - isz = max(ma,na,mb,nb) - call psb_realloc(isz,row,info) - if (info == 0) call psb_realloc(isz,idxs,info) - if (info == 0) call psb_realloc(isz,irow,info) - if (info /= 0) return - row = dzero - irow = 0 - nzc = 1 - do j = 1,ma - c%irp(j) = nzc - nrc = 0 - do k = a%irp(j), a%irp(j+1)-1 - irw = a%ja(k) - cfb = a%val(k) - irwsz = b%irp(irw+1)-b%irp(irw) - do i = b%irp(irw),b%irp(irw+1)-1 - icl = b%ja(i) - if (irow(icl) 0 ) then - if ((nzc+nrc)>nze) then - nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) - call psb_realloc(nze,c%val,info) - if (info == 0) call psb_realloc(nze,c%ja,info) - if (info /= 0) return - end if - - call psb_qsort(idxs(1:nrc)) - do i=1, nrc - irw = idxs(i) - c%ja(nzc) = irw - c%val(nzc) = row(irw) - row(irw) = dzero - nzc = nzc + 1 - end do - end if - end do - - c%irp(ma+1) = nzc - - - end subroutine csr_spspmm - -end subroutine psb_ccsrspspmm -#else +#if defined(OPENMP) subroutine psb_ccsrspspmm(a,b,c,info) use psb_c_mat_mod use psb_serial_mod, psb_protect_name => psb_ccsrspspmm @@ -4307,6 +4184,131 @@ contains !$omp end parallel do end subroutine spmm_omp_two_pass +end subroutine psb_ccsrspspmm + +#else + +subroutine psb_ccsrspspmm(a,b,c,info) + use psb_c_mat_mod + use psb_serial_mod, psb_protect_name => psb_ccsrspspmm + + implicit none + + class(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(out) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb + character(len=20) :: name + integer(psb_ipk_) :: err_act + name='psb_csrspspmm' + call psb_erractionsave(err_act) + info = psb_success_ + + if (a%is_dev()) call a%sync() + if (b%is_dev()) call b%sync() + + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + + if ( mb /= na ) then + write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb + info = psb_err_invalid_matrix_sizes_ + call psb_errpush(info,name) + goto 9999 + endif + + ! Estimate number of nonzeros on output. + nza = a%get_nzeros() + nzb = b%get_nzeros() + nzc = 2*(nza+nzb) + call c%allocate(ma,nb,nzc) + + call csr_spspmm(a,b,c,info) + + call c%set_asb() + call c%set_host() + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +contains + + subroutine csr_spspmm(a,b,c,info) + implicit none + type(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(inout) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb + integer(psb_ipk_), allocatable :: irow(:), idxs(:) + complex(psb_spk_), allocatable :: row(:) + integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & + & nzc,nnzre, isz, ipb, irwsz, nrc, nze + complex(psb_spk_) :: cfb + + + info = psb_success_ + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + nze = min(size(c%val),size(c%ja)) + isz = max(ma,na,mb,nb) + call psb_realloc(isz,row,info) + if (info == 0) call psb_realloc(isz,idxs,info) + if (info == 0) call psb_realloc(isz,irow,info) + if (info /= 0) return + row = dzero + irow = 0 + nzc = 1 + do j = 1,ma + c%irp(j) = nzc + nrc = 0 + do k = a%irp(j), a%irp(j+1)-1 + irw = a%ja(k) + cfb = a%val(k) + irwsz = b%irp(irw+1)-b%irp(irw) + do i = b%irp(irw),b%irp(irw+1)-1 + icl = b%ja(i) + if (irow(icl) 0 ) then + if ((nzc+nrc)>nze) then + nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) + call psb_realloc(nze,c%val,info) + if (info == 0) call psb_realloc(nze,c%ja,info) + if (info /= 0) return + end if + + call psb_qsort(idxs(1:nrc)) + do i=1, nrc + irw = idxs(i) + c%ja(nzc) = irw + c%val(nzc) = row(irw) + row(irw) = dzero + nzc = nzc + 1 + end do + end if + end do + + c%irp(ma+1) = nzc + + + end subroutine csr_spspmm + end subroutine psb_ccsrspspmm #endif diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index b8f692fa..10f99bc9 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -3654,130 +3654,7 @@ subroutine psb_d_csr_clean_zeros(a, info) call a%set_host() end subroutine psb_d_csr_clean_zeros -#if 0 -subroutine psb_dcsrspspmm(a,b,c,info) - use psb_d_mat_mod - use psb_serial_mod, psb_protect_name => psb_dcsrspspmm - - implicit none - - class(psb_d_csr_sparse_mat), intent(in) :: a,b - type(psb_d_csr_sparse_mat), intent(out) :: c - integer(psb_ipk_), intent(out) :: info - integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb - character(len=20) :: name - integer(psb_ipk_) :: err_act - name='psb_csrspspmm' - call psb_erractionsave(err_act) - info = psb_success_ - - if (a%is_dev()) call a%sync() - if (b%is_dev()) call b%sync() - - ma = a%get_nrows() - na = a%get_ncols() - mb = b%get_nrows() - nb = b%get_ncols() - - - if ( mb /= na ) then - write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb - info = psb_err_invalid_matrix_sizes_ - call psb_errpush(info,name) - goto 9999 - endif - - ! Estimate number of nonzeros on output. - nza = a%get_nzeros() - nzb = b%get_nzeros() - nzc = 2*(nza+nzb) - call c%allocate(ma,nb,nzc) - - call csr_spspmm(a,b,c,info) - - call c%set_asb() - call c%set_host() - - call psb_erractionrestore(err_act) - return - -9999 call psb_error_handler(err_act) - - return - -contains - - subroutine csr_spspmm(a,b,c,info) - implicit none - type(psb_d_csr_sparse_mat), intent(in) :: a,b - type(psb_d_csr_sparse_mat), intent(inout) :: c - integer(psb_ipk_), intent(out) :: info - integer(psb_ipk_) :: ma,na,mb,nb - integer(psb_ipk_), allocatable :: irow(:), idxs(:) - real(psb_dpk_), allocatable :: row(:) - integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & - & nzc,nnzre, isz, ipb, irwsz, nrc, nze - real(psb_dpk_) :: cfb - - - info = psb_success_ - ma = a%get_nrows() - na = a%get_ncols() - mb = b%get_nrows() - nb = b%get_ncols() - - nze = min(size(c%val),size(c%ja)) - isz = max(ma,na,mb,nb) - call psb_realloc(isz,row,info) - if (info == 0) call psb_realloc(isz,idxs,info) - if (info == 0) call psb_realloc(isz,irow,info) - if (info /= 0) return - row = dzero - irow = 0 - nzc = 1 - do j = 1,ma - c%irp(j) = nzc - nrc = 0 - do k = a%irp(j), a%irp(j+1)-1 - irw = a%ja(k) - cfb = a%val(k) - irwsz = b%irp(irw+1)-b%irp(irw) - do i = b%irp(irw),b%irp(irw+1)-1 - icl = b%ja(i) - if (irow(icl) 0 ) then - if ((nzc+nrc)>nze) then - nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) - call psb_realloc(nze,c%val,info) - if (info == 0) call psb_realloc(nze,c%ja,info) - if (info /= 0) return - end if - - call psb_qsort(idxs(1:nrc)) - do i=1, nrc - irw = idxs(i) - c%ja(nzc) = irw - c%val(nzc) = row(irw) - row(irw) = dzero - nzc = nzc + 1 - end do - end if - end do - - c%irp(ma+1) = nzc - - - end subroutine csr_spspmm - -end subroutine psb_dcsrspspmm -#else +#if defined(OPENMP) subroutine psb_dcsrspspmm(a,b,c,info) use psb_d_mat_mod use psb_serial_mod, psb_protect_name => psb_dcsrspspmm @@ -4307,6 +4184,131 @@ contains !$omp end parallel do end subroutine spmm_omp_two_pass +end subroutine psb_dcsrspspmm + +#else + +subroutine psb_dcsrspspmm(a,b,c,info) + use psb_d_mat_mod + use psb_serial_mod, psb_protect_name => psb_dcsrspspmm + + implicit none + + class(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb + character(len=20) :: name + integer(psb_ipk_) :: err_act + name='psb_csrspspmm' + call psb_erractionsave(err_act) + info = psb_success_ + + if (a%is_dev()) call a%sync() + if (b%is_dev()) call b%sync() + + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + + if ( mb /= na ) then + write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb + info = psb_err_invalid_matrix_sizes_ + call psb_errpush(info,name) + goto 9999 + endif + + ! Estimate number of nonzeros on output. + nza = a%get_nzeros() + nzb = b%get_nzeros() + nzc = 2*(nza+nzb) + call c%allocate(ma,nb,nzc) + + call csr_spspmm(a,b,c,info) + + call c%set_asb() + call c%set_host() + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +contains + + subroutine csr_spspmm(a,b,c,info) + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(inout) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb + integer(psb_ipk_), allocatable :: irow(:), idxs(:) + real(psb_dpk_), allocatable :: row(:) + integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & + & nzc,nnzre, isz, ipb, irwsz, nrc, nze + real(psb_dpk_) :: cfb + + + info = psb_success_ + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + nze = min(size(c%val),size(c%ja)) + isz = max(ma,na,mb,nb) + call psb_realloc(isz,row,info) + if (info == 0) call psb_realloc(isz,idxs,info) + if (info == 0) call psb_realloc(isz,irow,info) + if (info /= 0) return + row = dzero + irow = 0 + nzc = 1 + do j = 1,ma + c%irp(j) = nzc + nrc = 0 + do k = a%irp(j), a%irp(j+1)-1 + irw = a%ja(k) + cfb = a%val(k) + irwsz = b%irp(irw+1)-b%irp(irw) + do i = b%irp(irw),b%irp(irw+1)-1 + icl = b%ja(i) + if (irow(icl) 0 ) then + if ((nzc+nrc)>nze) then + nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) + call psb_realloc(nze,c%val,info) + if (info == 0) call psb_realloc(nze,c%ja,info) + if (info /= 0) return + end if + + call psb_qsort(idxs(1:nrc)) + do i=1, nrc + irw = idxs(i) + c%ja(nzc) = irw + c%val(nzc) = row(irw) + row(irw) = dzero + nzc = nzc + 1 + end do + end if + end do + + c%irp(ma+1) = nzc + + + end subroutine csr_spspmm + end subroutine psb_dcsrspspmm #endif diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index 59d73892..f3d5c669 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -3654,130 +3654,7 @@ subroutine psb_s_csr_clean_zeros(a, info) call a%set_host() end subroutine psb_s_csr_clean_zeros -#if 0 -subroutine psb_scsrspspmm(a,b,c,info) - use psb_s_mat_mod - use psb_serial_mod, psb_protect_name => psb_scsrspspmm - - implicit none - - class(psb_s_csr_sparse_mat), intent(in) :: a,b - type(psb_s_csr_sparse_mat), intent(out) :: c - integer(psb_ipk_), intent(out) :: info - integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb - character(len=20) :: name - integer(psb_ipk_) :: err_act - name='psb_csrspspmm' - call psb_erractionsave(err_act) - info = psb_success_ - - if (a%is_dev()) call a%sync() - if (b%is_dev()) call b%sync() - - ma = a%get_nrows() - na = a%get_ncols() - mb = b%get_nrows() - nb = b%get_ncols() - - - if ( mb /= na ) then - write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb - info = psb_err_invalid_matrix_sizes_ - call psb_errpush(info,name) - goto 9999 - endif - - ! Estimate number of nonzeros on output. - nza = a%get_nzeros() - nzb = b%get_nzeros() - nzc = 2*(nza+nzb) - call c%allocate(ma,nb,nzc) - - call csr_spspmm(a,b,c,info) - - call c%set_asb() - call c%set_host() - - call psb_erractionrestore(err_act) - return - -9999 call psb_error_handler(err_act) - - return - -contains - - subroutine csr_spspmm(a,b,c,info) - implicit none - type(psb_s_csr_sparse_mat), intent(in) :: a,b - type(psb_s_csr_sparse_mat), intent(inout) :: c - integer(psb_ipk_), intent(out) :: info - integer(psb_ipk_) :: ma,na,mb,nb - integer(psb_ipk_), allocatable :: irow(:), idxs(:) - real(psb_spk_), allocatable :: row(:) - integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & - & nzc,nnzre, isz, ipb, irwsz, nrc, nze - real(psb_spk_) :: cfb - - - info = psb_success_ - ma = a%get_nrows() - na = a%get_ncols() - mb = b%get_nrows() - nb = b%get_ncols() - - nze = min(size(c%val),size(c%ja)) - isz = max(ma,na,mb,nb) - call psb_realloc(isz,row,info) - if (info == 0) call psb_realloc(isz,idxs,info) - if (info == 0) call psb_realloc(isz,irow,info) - if (info /= 0) return - row = dzero - irow = 0 - nzc = 1 - do j = 1,ma - c%irp(j) = nzc - nrc = 0 - do k = a%irp(j), a%irp(j+1)-1 - irw = a%ja(k) - cfb = a%val(k) - irwsz = b%irp(irw+1)-b%irp(irw) - do i = b%irp(irw),b%irp(irw+1)-1 - icl = b%ja(i) - if (irow(icl) 0 ) then - if ((nzc+nrc)>nze) then - nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) - call psb_realloc(nze,c%val,info) - if (info == 0) call psb_realloc(nze,c%ja,info) - if (info /= 0) return - end if - - call psb_qsort(idxs(1:nrc)) - do i=1, nrc - irw = idxs(i) - c%ja(nzc) = irw - c%val(nzc) = row(irw) - row(irw) = dzero - nzc = nzc + 1 - end do - end if - end do - - c%irp(ma+1) = nzc - - - end subroutine csr_spspmm - -end subroutine psb_scsrspspmm -#else +#if defined(OPENMP) subroutine psb_scsrspspmm(a,b,c,info) use psb_s_mat_mod use psb_serial_mod, psb_protect_name => psb_scsrspspmm @@ -4307,6 +4184,131 @@ contains !$omp end parallel do end subroutine spmm_omp_two_pass +end subroutine psb_scsrspspmm + +#else + +subroutine psb_scsrspspmm(a,b,c,info) + use psb_s_mat_mod + use psb_serial_mod, psb_protect_name => psb_scsrspspmm + + implicit none + + class(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(out) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb + character(len=20) :: name + integer(psb_ipk_) :: err_act + name='psb_csrspspmm' + call psb_erractionsave(err_act) + info = psb_success_ + + if (a%is_dev()) call a%sync() + if (b%is_dev()) call b%sync() + + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + + if ( mb /= na ) then + write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb + info = psb_err_invalid_matrix_sizes_ + call psb_errpush(info,name) + goto 9999 + endif + + ! Estimate number of nonzeros on output. + nza = a%get_nzeros() + nzb = b%get_nzeros() + nzc = 2*(nza+nzb) + call c%allocate(ma,nb,nzc) + + call csr_spspmm(a,b,c,info) + + call c%set_asb() + call c%set_host() + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +contains + + subroutine csr_spspmm(a,b,c,info) + implicit none + type(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(inout) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb + integer(psb_ipk_), allocatable :: irow(:), idxs(:) + real(psb_spk_), allocatable :: row(:) + integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & + & nzc,nnzre, isz, ipb, irwsz, nrc, nze + real(psb_spk_) :: cfb + + + info = psb_success_ + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + nze = min(size(c%val),size(c%ja)) + isz = max(ma,na,mb,nb) + call psb_realloc(isz,row,info) + if (info == 0) call psb_realloc(isz,idxs,info) + if (info == 0) call psb_realloc(isz,irow,info) + if (info /= 0) return + row = dzero + irow = 0 + nzc = 1 + do j = 1,ma + c%irp(j) = nzc + nrc = 0 + do k = a%irp(j), a%irp(j+1)-1 + irw = a%ja(k) + cfb = a%val(k) + irwsz = b%irp(irw+1)-b%irp(irw) + do i = b%irp(irw),b%irp(irw+1)-1 + icl = b%ja(i) + if (irow(icl) 0 ) then + if ((nzc+nrc)>nze) then + nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) + call psb_realloc(nze,c%val,info) + if (info == 0) call psb_realloc(nze,c%ja,info) + if (info /= 0) return + end if + + call psb_qsort(idxs(1:nrc)) + do i=1, nrc + irw = idxs(i) + c%ja(nzc) = irw + c%val(nzc) = row(irw) + row(irw) = dzero + nzc = nzc + 1 + end do + end if + end do + + c%irp(ma+1) = nzc + + + end subroutine csr_spspmm + end subroutine psb_scsrspspmm #endif diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index 34cd6fbc..5cf1c72d 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -3654,130 +3654,7 @@ subroutine psb_z_csr_clean_zeros(a, info) call a%set_host() end subroutine psb_z_csr_clean_zeros -#if 0 -subroutine psb_zcsrspspmm(a,b,c,info) - use psb_z_mat_mod - use psb_serial_mod, psb_protect_name => psb_zcsrspspmm - - implicit none - - class(psb_z_csr_sparse_mat), intent(in) :: a,b - type(psb_z_csr_sparse_mat), intent(out) :: c - integer(psb_ipk_), intent(out) :: info - integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb - character(len=20) :: name - integer(psb_ipk_) :: err_act - name='psb_csrspspmm' - call psb_erractionsave(err_act) - info = psb_success_ - - if (a%is_dev()) call a%sync() - if (b%is_dev()) call b%sync() - - ma = a%get_nrows() - na = a%get_ncols() - mb = b%get_nrows() - nb = b%get_ncols() - - - if ( mb /= na ) then - write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb - info = psb_err_invalid_matrix_sizes_ - call psb_errpush(info,name) - goto 9999 - endif - - ! Estimate number of nonzeros on output. - nza = a%get_nzeros() - nzb = b%get_nzeros() - nzc = 2*(nza+nzb) - call c%allocate(ma,nb,nzc) - - call csr_spspmm(a,b,c,info) - - call c%set_asb() - call c%set_host() - - call psb_erractionrestore(err_act) - return - -9999 call psb_error_handler(err_act) - - return - -contains - - subroutine csr_spspmm(a,b,c,info) - implicit none - type(psb_z_csr_sparse_mat), intent(in) :: a,b - type(psb_z_csr_sparse_mat), intent(inout) :: c - integer(psb_ipk_), intent(out) :: info - integer(psb_ipk_) :: ma,na,mb,nb - integer(psb_ipk_), allocatable :: irow(:), idxs(:) - complex(psb_dpk_), allocatable :: row(:) - integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & - & nzc,nnzre, isz, ipb, irwsz, nrc, nze - complex(psb_dpk_) :: cfb - - - info = psb_success_ - ma = a%get_nrows() - na = a%get_ncols() - mb = b%get_nrows() - nb = b%get_ncols() - - nze = min(size(c%val),size(c%ja)) - isz = max(ma,na,mb,nb) - call psb_realloc(isz,row,info) - if (info == 0) call psb_realloc(isz,idxs,info) - if (info == 0) call psb_realloc(isz,irow,info) - if (info /= 0) return - row = dzero - irow = 0 - nzc = 1 - do j = 1,ma - c%irp(j) = nzc - nrc = 0 - do k = a%irp(j), a%irp(j+1)-1 - irw = a%ja(k) - cfb = a%val(k) - irwsz = b%irp(irw+1)-b%irp(irw) - do i = b%irp(irw),b%irp(irw+1)-1 - icl = b%ja(i) - if (irow(icl) 0 ) then - if ((nzc+nrc)>nze) then - nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) - call psb_realloc(nze,c%val,info) - if (info == 0) call psb_realloc(nze,c%ja,info) - if (info /= 0) return - end if - - call psb_qsort(idxs(1:nrc)) - do i=1, nrc - irw = idxs(i) - c%ja(nzc) = irw - c%val(nzc) = row(irw) - row(irw) = dzero - nzc = nzc + 1 - end do - end if - end do - - c%irp(ma+1) = nzc - - - end subroutine csr_spspmm - -end subroutine psb_zcsrspspmm -#else +#if defined(OPENMP) subroutine psb_zcsrspspmm(a,b,c,info) use psb_z_mat_mod use psb_serial_mod, psb_protect_name => psb_zcsrspspmm @@ -4307,6 +4184,131 @@ contains !$omp end parallel do end subroutine spmm_omp_two_pass +end subroutine psb_zcsrspspmm + +#else + +subroutine psb_zcsrspspmm(a,b,c,info) + use psb_z_mat_mod + use psb_serial_mod, psb_protect_name => psb_zcsrspspmm + + implicit none + + class(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(out) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb + character(len=20) :: name + integer(psb_ipk_) :: err_act + name='psb_csrspspmm' + call psb_erractionsave(err_act) + info = psb_success_ + + if (a%is_dev()) call a%sync() + if (b%is_dev()) call b%sync() + + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + + if ( mb /= na ) then + write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb + info = psb_err_invalid_matrix_sizes_ + call psb_errpush(info,name) + goto 9999 + endif + + ! Estimate number of nonzeros on output. + nza = a%get_nzeros() + nzb = b%get_nzeros() + nzc = 2*(nza+nzb) + call c%allocate(ma,nb,nzc) + + call csr_spspmm(a,b,c,info) + + call c%set_asb() + call c%set_host() + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +contains + + subroutine csr_spspmm(a,b,c,info) + implicit none + type(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(inout) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb + integer(psb_ipk_), allocatable :: irow(:), idxs(:) + complex(psb_dpk_), allocatable :: row(:) + integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & + & nzc,nnzre, isz, ipb, irwsz, nrc, nze + complex(psb_dpk_) :: cfb + + + info = psb_success_ + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + nze = min(size(c%val),size(c%ja)) + isz = max(ma,na,mb,nb) + call psb_realloc(isz,row,info) + if (info == 0) call psb_realloc(isz,idxs,info) + if (info == 0) call psb_realloc(isz,irow,info) + if (info /= 0) return + row = dzero + irow = 0 + nzc = 1 + do j = 1,ma + c%irp(j) = nzc + nrc = 0 + do k = a%irp(j), a%irp(j+1)-1 + irw = a%ja(k) + cfb = a%val(k) + irwsz = b%irp(irw+1)-b%irp(irw) + do i = b%irp(irw),b%irp(irw+1)-1 + icl = b%ja(i) + if (irow(icl) 0 ) then + if ((nzc+nrc)>nze) then + nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) + call psb_realloc(nze,c%val,info) + if (info == 0) call psb_realloc(nze,c%ja,info) + if (info /= 0) return + end if + + call psb_qsort(idxs(1:nrc)) + do i=1, nrc + irw = idxs(i) + c%ja(nzc) = irw + c%val(nzc) = row(irw) + row(irw) = dzero + nzc = nzc + 1 + end do + end if + end do + + c%irp(ma+1) = nzc + + + end subroutine csr_spspmm + end subroutine psb_zcsrspspmm #endif From efa29bc3ad9e2ee74d3cbfe78dffef63377c6ac4 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Tue, 5 Sep 2023 17:00:49 +0200 Subject: [PATCH 09/11] Fix configure for GCC when using --with-fcopt --- configure | 60 +++++++++++++++++++++++++++------------------------- configure.ac | 12 ++++++----- 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/configure b/configure index 97b3eb79..752fe192 100755 --- a/configure +++ b/configure @@ -7279,6 +7279,37 @@ if test "X$FCOPT" == "X" ; then # note that no space should be placed around the equality symbol in assignations # Note : 'native' is valid _only_ on GCC/x86 (32/64 bits) FCOPT="-g -O3 -frecursive $FCOPT" + elif test "X$psblas_cv_fc" == X"xlf" ; then + # XL compiler : consider using -qarch=auto + FCOPT="-O3 -qarch=auto -qlanglvl=extended -qxlf2003=polymorphic:autorealloc $FCOPT" + FCFLAGS="-qhalt=e -qlanglvl=extended -qxlf2003=polymorphic:autorealloc $FCFLAGS" + elif test "X$psblas_cv_fc" == X"ifc" ; then + # other compilers .. + FCOPT="-O3 -recursive $FCOPT" + elif test "X$psblas_cv_fc" == X"pg" ; then + # other compilers .. + FCOPT="-fast $FCOPT" + # NOTE : PG & Sun use -fast instead -O3 + elif test "X$psblas_cv_fc" == X"sun" ; then + # other compilers .. + FCOPT="-fast $FCOPT" + elif test "X$psblas_cv_fc" == X"cray" ; then + FCOPT="-O3 -em -J. $FCOPT" + elif test "X$psblas_cv_fc" == X"nag" ; then + # NAG compiler .. + FCOPT="-O2 " + # NOTE : PG & Sun use -fast instead -O3 + else + FCOPT="-g -O2 $FCOPT" + fi +fi +if test "X$psblas_cv_fc" == X"nag" ; then + # Add needed options + FCOPT="$FCOPT -dcfuns -f2003 -wmismatch=mpi_scatterv,mpi_alltoallv,mpi_gatherv,mpi_allgatherv" + EXTRA_OPT="-mismatch_all" +fi +if test "X$psblas_cv_fc" == "Xgcc" ; then + FCOPT="-frecursive $FCOPT" { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for version 10 or later of GNU Fortran" >&5 printf %s "checking for version 10 or later of GNU Fortran... " >&6; } ac_ext=${ac_fc_srcext-f} @@ -7317,35 +7348,6 @@ ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $ ac_compiler_gnu=$ac_cv_c_compiler_gnu - - elif test "X$psblas_cv_fc" == X"xlf" ; then - # XL compiler : consider using -qarch=auto - FCOPT="-O3 -qarch=auto -qlanglvl=extended -qxlf2003=polymorphic:autorealloc $FCOPT" - FCFLAGS="-qhalt=e -qlanglvl=extended -qxlf2003=polymorphic:autorealloc $FCFLAGS" - elif test "X$psblas_cv_fc" == X"ifc" ; then - # other compilers .. - FCOPT="-O3 -recursive $FCOPT" - elif test "X$psblas_cv_fc" == X"pg" ; then - # other compilers .. - FCOPT="-fast $FCOPT" - # NOTE : PG & Sun use -fast instead -O3 - elif test "X$psblas_cv_fc" == X"sun" ; then - # other compilers .. - FCOPT="-fast $FCOPT" - elif test "X$psblas_cv_fc" == X"cray" ; then - FCOPT="-O3 -em -J. $FCOPT" - elif test "X$psblas_cv_fc" == X"nag" ; then - # NAG compiler .. - FCOPT="-O2 " - # NOTE : PG & Sun use -fast instead -O3 - else - FCOPT="-g -O2 $FCOPT" - fi -fi -if test "X$psblas_cv_fc" == X"nag" ; then - # Add needed options - FCOPT="$FCOPT -dcfuns -f2003 -wmismatch=mpi_scatterv,mpi_alltoallv,mpi_gatherv,mpi_allgatherv" - EXTRA_OPT="-mismatch_all" fi diff --git a/configure.ac b/configure.ac index d9923a6e..d8a02a50 100755 --- a/configure.ac +++ b/configure.ac @@ -416,11 +416,7 @@ if test "X$FCOPT" == "X" ; then if test "X$psblas_cv_fc" == "Xgcc" ; then # note that no space should be placed around the equality symbol in assignations # Note : 'native' is valid _only_ on GCC/x86 (32/64 bits) - FCOPT="-g -O3 -frecursive $FCOPT" - PAC_HAVE_GFORTRAN_10( - [FCOPT="-fallow-argument-mismatch $FCOPT"], - []) - + FCOPT="-g -O3 -frecursive $FCOPT" elif test "X$psblas_cv_fc" == X"xlf" ; then # XL compiler : consider using -qarch=auto FCOPT="-O3 -qarch=auto -qlanglvl=extended -qxlf2003=polymorphic:autorealloc $FCOPT" @@ -450,6 +446,12 @@ if test "X$psblas_cv_fc" == X"nag" ; then FCOPT="$FCOPT -dcfuns -f2003 -wmismatch=mpi_scatterv,mpi_alltoallv,mpi_gatherv,mpi_allgatherv" EXTRA_OPT="-mismatch_all" fi +if test "X$psblas_cv_fc" == "Xgcc" ; then + FCOPT="-frecursive $FCOPT" + PAC_HAVE_GFORTRAN_10( + [FCOPT="-fallow-argument-mismatch $FCOPT"], + []) +fi # COPT,FCOPT are aliases for CFLAGS,FCFLAGS . From def0635c5304b3bbbdf205f3f788531b12ce8947 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Thu, 7 Sep 2023 09:25:37 +0200 Subject: [PATCH 10/11] More OMP directives in cd_inloc --- base/tools/psb_cd_inloc.f90 | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/base/tools/psb_cd_inloc.f90 b/base/tools/psb_cd_inloc.f90 index e8b9578e..30c3e9b3 100644 --- a/base/tools/psb_cd_inloc.f90 +++ b/base/tools/psb_cd_inloc.f90 @@ -176,7 +176,8 @@ subroutine psb_cd_inloc(v, ctxt, desc, info, globalcheck,idx,usehash) end if tmpgidx = 0 flag_ = 1 - do i=1,loc_row + !$omp parallel do private(i) + do i=1,loc_row if ((v(i)<1).or.(v(i)>m)) then info = psb_err_entry_out_of_bounds_ l_err(1) = i @@ -215,6 +216,7 @@ subroutine psb_cd_inloc(v, ctxt, desc, info, globalcheck,idx,usehash) novrl = 0 norphan = 0 npr_ov = 0 + !$omp parallel do private(i) do i=1,loc_row if ((v(i)<1).or.(v(i)>m)) then info = psb_err_entry_out_of_bounds_ @@ -222,7 +224,6 @@ subroutine psb_cd_inloc(v, ctxt, desc, info, globalcheck,idx,usehash) l_err(2) = v(i) l_err(3) = loc_row l_err(4) = m - exit endif vl(i) = v(i) end do From e31dd52c41e04c82fcbdb8d8a9cb900f62fff2f3 Mon Sep 17 00:00:00 2001 From: sfilippone Date: Thu, 7 Sep 2023 09:48:18 +0200 Subject: [PATCH 11/11] Fixed CRITICAL in hash_mod --- base/modules/desc/psb_hash_mod.F90 | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/base/modules/desc/psb_hash_mod.F90 b/base/modules/desc/psb_hash_mod.F90 index 9e45e1f0..eb5556a2 100644 --- a/base/modules/desc/psb_hash_mod.F90 +++ b/base/modules/desc/psb_hash_mod.F90 @@ -458,7 +458,7 @@ contains integer(psb_ipk_), intent(out) :: val, info integer(psb_ipk_) :: hsize,hmask, hk, hd - + logical :: redo info = HashOK hsize = hash%hsize hmask = hash%hmask @@ -483,6 +483,7 @@ contains info = HashDuplicate return end if + redo = .false. !$OMP CRITICAL if (hash%table(hk,1) == HashFreeEntry) then if (hash%nk == hash%hsize -1) then @@ -497,8 +498,9 @@ contains info = HashOutOfMemory !return else - call psb_hash_searchinskey(key,val,nextval,hash,info) - !return + redo = .true. +!!$ call psb_hash_searchinskey(key,val,nextval,hash,info) +!!$ return end if else hash%nk = hash%nk + 1 @@ -509,6 +511,7 @@ contains end if end if !$OMP END CRITICAL + if (redo) call psb_hash_searchinskey(key,val,nextval,hash,info) if (info /= HashOk) return if (val > 0) return hk = hk - hd