diff --git a/README.md b/README.md index 86bb2805..a9813f5e 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,7 @@ Salvatore Filippone Contributors (roughly reverse cronological order): +Dimitri Walther Andea Di Iorio Stefano Petrilli Soren Rasmussen diff --git a/base/modules/Makefile b/base/modules/Makefile index 399add1f..f1cb5e53 100644 --- a/base/modules/Makefile +++ b/base/modules/Makefile @@ -60,11 +60,16 @@ SERIAL_MODS=serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o \ auxil/psb_d_hsort_x_mod.o \ auxil/psb_c_hsort_x_mod.o \ auxil/psb_z_hsort_x_mod.o \ + auxil/psb_s_rb_idx_tree_mod.o \ + auxil/psb_d_rb_idx_tree_mod.o \ + auxil/psb_c_rb_idx_tree_mod.o \ + auxil/psb_z_rb_idx_tree_mod.o \ + auxil/psb_rb_idx_tree_mod.o \ serial/psb_base_mat_mod.o serial/psb_mat_mod.o\ serial/psb_s_base_mat_mod.o serial/psb_s_csr_mat_mod.o serial/psb_s_csc_mat_mod.o serial/psb_s_mat_mod.o \ serial/psb_d_base_mat_mod.o serial/psb_d_csr_mat_mod.o serial/psb_d_csc_mat_mod.o serial/psb_d_mat_mod.o \ serial/psb_c_base_mat_mod.o serial/psb_c_csr_mat_mod.o serial/psb_c_csc_mat_mod.o serial/psb_c_mat_mod.o \ - serial/psb_z_base_mat_mod.o serial/psb_z_csr_mat_mod.o serial/psb_z_csc_mat_mod.o serial/psb_z_mat_mod.o + serial/psb_z_base_mat_mod.o serial/psb_z_csr_mat_mod.o serial/psb_z_csc_mat_mod.o serial/psb_z_mat_mod.o #\ # serial/psb_ls_csr_mat_mod.o serial/psb_ld_csr_mat_mod.o serial/psb_lc_csr_mat_mod.o serial/psb_lz_csr_mat_mod.o #\ @@ -165,6 +170,7 @@ penv/psi_d_collective_mod.o penv/psi_c_collective_mod.o penv/psi_z_collective_m penv/psi_d_p2p_mod.o penv/psi_c_p2p_mod.o penv/psi_z_p2p_mod.o +auxil/psi_acx_mod.o auxil/psi_alcx_mod.o auxil/psi_lcx_mod.o \ auxil/psb_string_mod.o auxil/psb_m_realloc_mod.o auxil/psb_e_realloc_mod.o auxil/psb_s_realloc_mod.o \ auxil/psb_d_realloc_mod.o auxil/psb_c_realloc_mod.o auxil/psb_z_realloc_mod.o \ desc/psb_desc_const_mod.o psi_penv_mod.o: psb_const_mod.o @@ -174,6 +180,7 @@ desc/psb_indx_map_mod.o desc/psb_hash_mod.o: psb_realloc_mod.o psb_const_mod.o auxil/psb_i_sort_mod.o auxil/psb_s_sort_mod.o auxil/psb_d_sort_mod.o auxil/psb_c_sort_mod.o auxil/psb_z_sort_mod.o \ auxil/psb_ip_reord_mod.o auxil/psi_serial_mod.o auxil/psb_sort_mod.o: $(BASIC_MODS) + auxil/psb_sort_mod.o: auxil/psb_m_hsort_mod.o auxil/psb_m_isort_mod.o \ auxil/psb_m_msort_mod.o auxil/psb_m_qsort_mod.o \ auxil/psb_e_hsort_mod.o auxil/psb_e_isort_mod.o \ @@ -192,7 +199,8 @@ auxil/psb_sort_mod.o: auxil/psb_m_hsort_mod.o auxil/psb_m_isort_mod.o \ auxil/psb_d_hsort_x_mod.o \ auxil/psb_c_hsort_x_mod.o \ auxil/psb_z_hsort_x_mod.o \ - auxil/psb_ip_reord_mod.o auxil/psi_serial_mod.o + auxil/psb_ip_reord_mod.o \ + auxil/psi_serial_mod.o auxil/psb_m_hsort_mod.o auxil/psb_m_isort_mod.o \ auxil/psb_m_msort_mod.o auxil/psb_m_qsort_mod.o \ @@ -225,7 +233,10 @@ auxil/psb_c_hsort_x_mod.o: auxil/psb_c_hsort_mod.o auxil/psb_z_hsort_x_mod.o: auxil/psb_z_hsort_mod.o auxil/psi_serial_mod.o: auxil/psi_m_serial_mod.o auxil/psi_e_serial_mod.o \ - auxil/psi_s_serial_mod.o auxil/psi_d_serial_mod.o auxil/psi_c_serial_mod.o auxil/psi_z_serial_mod.o + auxil/psi_s_serial_mod.o auxil/psi_d_serial_mod.o\ + auxil/psi_c_serial_mod.o auxil/psi_z_serial_mod.o \ + auxil/psi_acx_mod.o auxil/psi_alcx_mod.o auxil/psi_lcx_mod.o + auxil/psi_m_serial_mod.o auxil/psi_e_serial_mod.o auxil/psi_s_serial_mod.o auxil/psi_d_serial_mod.o auxil/psi_c_serial_mod.o auxil/psi_z_serial_mod.o: psb_const_mod.o auxil/psb_ip_reord_mod.o: auxil/psb_m_ip_reord_mod.o auxil/psb_e_ip_reord_mod.o \ @@ -270,6 +281,12 @@ serial/psb_z_vect_mod.o: serial/psb_z_base_vect_mod.o serial/psb_i_vect_mod.o serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o serial/psb_c_serial_mod.o serial/psb_z_serial_mod.o: serial/psb_mat_mod.o auxil/psb_string_mod.o auxil/psb_sort_mod.o auxil/psi_serial_mod.o serial/psb_vect_mod.o: serial/psb_i_vect_mod.o serial/psb_l_vect_mod.o serial/psb_d_vect_mod.o serial/psb_s_vect_mod.o serial/psb_c_vect_mod.o serial/psb_z_vect_mod.o +auxil/psb_s_rb_idx_tree_mod.o: serial/psb_s_csr_mat_mod.o psb_realloc_mod.o +auxil/psb_d_rb_idx_tree_mod.o: serial/psb_d_csr_mat_mod.o psb_realloc_mod.o +auxil/psb_c_rb_idx_tree_mod.o: serial/psb_c_csr_mat_mod.o psb_realloc_mod.o +auxil/psb_z_rb_idx_tree_mod.o: serial/psb_z_csr_mat_mod.o psb_realloc_mod.o +auxil/psb_rb_idx_tree_mod.o: auxil/psb_s_rb_idx_tree_mod.o auxil/psb_d_rb_idx_tree_mod.o auxil/psb_c_rb_idx_tree_mod.o auxil/psb_z_rb_idx_tree_mod.o + error.o psb_realloc_mod.o: psb_error_mod.o psb_error_impl.o: psb_penv_mod.o psb_timers_mod.o: psb_penv_mod.o psb_const_mod.o psb_realloc_mod.o psb_error_mod.o diff --git a/base/modules/auxil/psb_c_rb_idx_tree_mod.f90 b/base/modules/auxil/psb_c_rb_idx_tree_mod.f90 new file mode 100644 index 00000000..9b3ada6e --- /dev/null +++ b/base/modules/auxil/psb_c_rb_idx_tree_mod.f90 @@ -0,0 +1,128 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_c_rb_idx_tree_mod +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +module psb_c_rb_idx_tree_mod + use psb_const_mod + implicit none + + type :: psb_c_rb_idx_node + integer(psb_ipk_) :: idx + complex(psb_spk_) :: val + type(psb_c_rb_idx_node), pointer :: left, right, parent + logical :: is_red + end type psb_c_rb_idx_node + + type :: psb_c_rb_idx_tree + type(psb_c_rb_idx_node), pointer :: root + integer(psb_ipk_) :: nnz + + contains + + procedure :: insert => psb_c_rb_idx_tree_insert + end type psb_c_rb_idx_tree + + interface psb_rb_idx_tree_insert + subroutine psb_c_rb_idx_tree_insert(this, idx, val) + import :: psb_ipk_, psb_spk_, psb_c_rb_idx_tree + implicit none + class(psb_c_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + complex(psb_spk_), intent(in) :: val + end subroutine psb_c_rb_idx_tree_insert + end interface psb_rb_idx_tree_insert + + interface psb_rb_idx_tree_scalar_sparse_row_mul + subroutine psb_c_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_c_csr_mat_mod, only : psb_c_csr_sparse_mat + import :: psb_ipk_, psb_spk_, psb_c_rb_idx_tree + implicit none + type(psb_c_rb_idx_tree), intent(inout) :: tree + complex(psb_spk_), intent(in) :: scalar + type(psb_c_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + end subroutine psb_c_rb_idx_tree_scalar_sparse_row_mul + end interface psb_rb_idx_tree_scalar_sparse_row_mul + + interface psb_rb_idx_tree_merge + subroutine psb_c_rb_idx_tree_merge(trees, mat) + use psb_c_csr_mat_mod, only : psb_c_csr_sparse_mat + import :: psb_c_rb_idx_tree + type(psb_c_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_c_csr_sparse_mat), intent(inout) :: mat + end subroutine psb_c_rb_idx_tree_merge + end interface psb_rb_idx_tree_merge + + interface psb_rb_idx_tree_fix_insertion + subroutine psb_c_rb_idx_tree_fix_insertion(this, node) + import :: psb_c_rb_idx_tree, psb_c_rb_idx_node + implicit none + class(psb_c_rb_idx_tree), intent(inout) :: this + type(psb_c_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_c_rb_idx_tree_fix_insertion + end interface psb_rb_idx_tree_fix_insertion + + interface psb_rb_idx_tree_swap_colors + subroutine psb_c_rb_idx_tree_swap_colors(n1, n2) + import :: psb_c_rb_idx_node + implicit none + type(psb_c_rb_idx_node), pointer, intent(inout) :: n1, n2 + end subroutine psb_c_rb_idx_tree_swap_colors + end interface psb_rb_idx_tree_swap_colors + + interface psb_rb_idx_tree_rotate_right + subroutine psb_c_rb_idx_tree_rotate_right(node) + import :: psb_c_rb_idx_node + implicit none + type(psb_c_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_c_rb_idx_tree_rotate_right + end interface psb_rb_idx_tree_rotate_right + + interface psb_rb_idx_tree_rotate_left + subroutine psb_c_rb_idx_tree_rotate_left(node) + import :: psb_c_rb_idx_node + implicit none + type(psb_c_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_c_rb_idx_tree_rotate_left + end interface psb_rb_idx_tree_rotate_left +end module psb_c_rb_idx_tree_mod diff --git a/base/modules/auxil/psb_d_rb_idx_tree_mod.f90 b/base/modules/auxil/psb_d_rb_idx_tree_mod.f90 new file mode 100644 index 00000000..099d1a30 --- /dev/null +++ b/base/modules/auxil/psb_d_rb_idx_tree_mod.f90 @@ -0,0 +1,128 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_d_rb_idx_tree_mod +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +module psb_d_rb_idx_tree_mod + use psb_const_mod + implicit none + + type :: psb_d_rb_idx_node + integer(psb_ipk_) :: idx + real(psb_dpk_) :: val + type(psb_d_rb_idx_node), pointer :: left, right, parent + logical :: is_red + end type psb_d_rb_idx_node + + type :: psb_d_rb_idx_tree + type(psb_d_rb_idx_node), pointer :: root + integer(psb_ipk_) :: nnz + + contains + + procedure :: insert => psb_d_rb_idx_tree_insert + end type psb_d_rb_idx_tree + + interface psb_rb_idx_tree_insert + subroutine psb_d_rb_idx_tree_insert(this, idx, val) + import :: psb_ipk_, psb_dpk_, psb_d_rb_idx_tree + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + real(psb_dpk_), intent(in) :: val + end subroutine psb_d_rb_idx_tree_insert + end interface psb_rb_idx_tree_insert + + interface psb_rb_idx_tree_scalar_sparse_row_mul + subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + import :: psb_ipk_, psb_dpk_, psb_d_rb_idx_tree + implicit none + type(psb_d_rb_idx_tree), intent(inout) :: tree + real(psb_dpk_), intent(in) :: scalar + type(psb_d_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + end subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul + end interface psb_rb_idx_tree_scalar_sparse_row_mul + + interface psb_rb_idx_tree_merge + subroutine psb_d_rb_idx_tree_merge(trees, mat) + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + import :: psb_d_rb_idx_tree + type(psb_d_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_d_csr_sparse_mat), intent(inout) :: mat + end subroutine psb_d_rb_idx_tree_merge + end interface psb_rb_idx_tree_merge + + interface psb_rb_idx_tree_fix_insertion + subroutine psb_d_rb_idx_tree_fix_insertion(this, node) + import :: psb_d_rb_idx_tree, psb_d_rb_idx_node + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_d_rb_idx_tree_fix_insertion + end interface psb_rb_idx_tree_fix_insertion + + interface psb_rb_idx_tree_swap_colors + subroutine psb_d_rb_idx_tree_swap_colors(n1, n2) + import :: psb_d_rb_idx_node + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: n1, n2 + end subroutine psb_d_rb_idx_tree_swap_colors + end interface psb_rb_idx_tree_swap_colors + + interface psb_rb_idx_tree_rotate_right + subroutine psb_d_rb_idx_tree_rotate_right(node) + import :: psb_d_rb_idx_node + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_d_rb_idx_tree_rotate_right + end interface psb_rb_idx_tree_rotate_right + + interface psb_rb_idx_tree_rotate_left + subroutine psb_d_rb_idx_tree_rotate_left(node) + import :: psb_d_rb_idx_node + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_d_rb_idx_tree_rotate_left + end interface psb_rb_idx_tree_rotate_left +end module psb_d_rb_idx_tree_mod diff --git a/base/modules/auxil/psb_rb_idx_tree_mod.f90 b/base/modules/auxil/psb_rb_idx_tree_mod.f90 new file mode 100644 index 00000000..074255ea --- /dev/null +++ b/base/modules/auxil/psb_rb_idx_tree_mod.f90 @@ -0,0 +1,52 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_rb_idx_tree_mod +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +module psb_rb_idx_tree_mod + use psb_const_mod + + use psb_s_rb_idx_tree_mod + use psb_d_rb_idx_tree_mod + use psb_c_rb_idx_tree_mod + use psb_z_rb_idx_tree_mod +end module psb_rb_idx_tree_mod diff --git a/base/modules/auxil/psb_s_rb_idx_tree_mod.f90 b/base/modules/auxil/psb_s_rb_idx_tree_mod.f90 new file mode 100644 index 00000000..3ed2d60a --- /dev/null +++ b/base/modules/auxil/psb_s_rb_idx_tree_mod.f90 @@ -0,0 +1,128 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_s_rb_idx_tree_mod +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +module psb_s_rb_idx_tree_mod + use psb_const_mod + implicit none + + type :: psb_s_rb_idx_node + integer(psb_ipk_) :: idx + real(psb_spk_) :: val + type(psb_s_rb_idx_node), pointer :: left, right, parent + logical :: is_red + end type psb_s_rb_idx_node + + type :: psb_s_rb_idx_tree + type(psb_s_rb_idx_node), pointer :: root + integer(psb_ipk_) :: nnz + + contains + + procedure :: insert => psb_s_rb_idx_tree_insert + end type psb_s_rb_idx_tree + + interface psb_rb_idx_tree_insert + subroutine psb_s_rb_idx_tree_insert(this, idx, val) + import :: psb_ipk_, psb_spk_, psb_s_rb_idx_tree + implicit none + class(psb_s_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + real(psb_spk_), intent(in) :: val + end subroutine psb_s_rb_idx_tree_insert + end interface psb_rb_idx_tree_insert + + interface psb_rb_idx_tree_scalar_sparse_row_mul + subroutine psb_s_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_s_csr_mat_mod, only : psb_s_csr_sparse_mat + import :: psb_ipk_, psb_spk_, psb_s_rb_idx_tree + implicit none + type(psb_s_rb_idx_tree), intent(inout) :: tree + real(psb_spk_), intent(in) :: scalar + type(psb_s_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + end subroutine psb_s_rb_idx_tree_scalar_sparse_row_mul + end interface psb_rb_idx_tree_scalar_sparse_row_mul + + interface psb_rb_idx_tree_merge + subroutine psb_s_rb_idx_tree_merge(trees, mat) + use psb_s_csr_mat_mod, only : psb_s_csr_sparse_mat + import :: psb_s_rb_idx_tree + type(psb_s_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_s_csr_sparse_mat), intent(inout) :: mat + end subroutine psb_s_rb_idx_tree_merge + end interface psb_rb_idx_tree_merge + + interface psb_rb_idx_tree_fix_insertion + subroutine psb_s_rb_idx_tree_fix_insertion(this, node) + import :: psb_s_rb_idx_tree, psb_s_rb_idx_node + implicit none + class(psb_s_rb_idx_tree), intent(inout) :: this + type(psb_s_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_s_rb_idx_tree_fix_insertion + end interface psb_rb_idx_tree_fix_insertion + + interface psb_rb_idx_tree_swap_colors + subroutine psb_s_rb_idx_tree_swap_colors(n1, n2) + import :: psb_s_rb_idx_node + implicit none + type(psb_s_rb_idx_node), pointer, intent(inout) :: n1, n2 + end subroutine psb_s_rb_idx_tree_swap_colors + end interface psb_rb_idx_tree_swap_colors + + interface psb_rb_idx_tree_rotate_right + subroutine psb_s_rb_idx_tree_rotate_right(node) + import :: psb_s_rb_idx_node + implicit none + type(psb_s_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_s_rb_idx_tree_rotate_right + end interface psb_rb_idx_tree_rotate_right + + interface psb_rb_idx_tree_rotate_left + subroutine psb_s_rb_idx_tree_rotate_left(node) + import :: psb_s_rb_idx_node + implicit none + type(psb_s_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_s_rb_idx_tree_rotate_left + end interface psb_rb_idx_tree_rotate_left +end module psb_s_rb_idx_tree_mod diff --git a/base/modules/auxil/psb_sort_mod.f90 b/base/modules/auxil/psb_sort_mod.f90 index 0bd7bdfc..e980dc2b 100644 --- a/base/modules/auxil/psb_sort_mod.f90 +++ b/base/modules/auxil/psb_sort_mod.f90 @@ -45,7 +45,8 @@ module psb_sort_mod use psb_const_mod use psb_ip_reord_mod - + use psi_serial_mod + use psb_m_hsort_mod use psb_m_isort_mod use psb_m_msort_mod diff --git a/base/modules/auxil/psb_z_rb_idx_tree_mod.f90 b/base/modules/auxil/psb_z_rb_idx_tree_mod.f90 new file mode 100644 index 00000000..7b15f080 --- /dev/null +++ b/base/modules/auxil/psb_z_rb_idx_tree_mod.f90 @@ -0,0 +1,128 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_z_rb_idx_tree_mod +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +module psb_z_rb_idx_tree_mod + use psb_const_mod + implicit none + + type :: psb_z_rb_idx_node + integer(psb_ipk_) :: idx + complex(psb_dpk_) :: val + type(psb_z_rb_idx_node), pointer :: left, right, parent + logical :: is_red + end type psb_z_rb_idx_node + + type :: psb_z_rb_idx_tree + type(psb_z_rb_idx_node), pointer :: root + integer(psb_ipk_) :: nnz + + contains + + procedure :: insert => psb_z_rb_idx_tree_insert + end type psb_z_rb_idx_tree + + interface psb_rb_idx_tree_insert + subroutine psb_z_rb_idx_tree_insert(this, idx, val) + import :: psb_ipk_, psb_dpk_, psb_z_rb_idx_tree + implicit none + class(psb_z_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + complex(psb_dpk_), intent(in) :: val + end subroutine psb_z_rb_idx_tree_insert + end interface psb_rb_idx_tree_insert + + interface psb_rb_idx_tree_scalar_sparse_row_mul + subroutine psb_z_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_z_csr_mat_mod, only : psb_z_csr_sparse_mat + import :: psb_ipk_, psb_dpk_, psb_z_rb_idx_tree + implicit none + type(psb_z_rb_idx_tree), intent(inout) :: tree + complex(psb_dpk_), intent(in) :: scalar + type(psb_z_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + end subroutine psb_z_rb_idx_tree_scalar_sparse_row_mul + end interface psb_rb_idx_tree_scalar_sparse_row_mul + + interface psb_rb_idx_tree_merge + subroutine psb_z_rb_idx_tree_merge(trees, mat) + use psb_z_csr_mat_mod, only : psb_z_csr_sparse_mat + import :: psb_z_rb_idx_tree + type(psb_z_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_z_csr_sparse_mat), intent(inout) :: mat + end subroutine psb_z_rb_idx_tree_merge + end interface psb_rb_idx_tree_merge + + interface psb_rb_idx_tree_fix_insertion + subroutine psb_z_rb_idx_tree_fix_insertion(this, node) + import :: psb_z_rb_idx_tree, psb_z_rb_idx_node + implicit none + class(psb_z_rb_idx_tree), intent(inout) :: this + type(psb_z_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_z_rb_idx_tree_fix_insertion + end interface psb_rb_idx_tree_fix_insertion + + interface psb_rb_idx_tree_swap_colors + subroutine psb_z_rb_idx_tree_swap_colors(n1, n2) + import :: psb_z_rb_idx_node + implicit none + type(psb_z_rb_idx_node), pointer, intent(inout) :: n1, n2 + end subroutine psb_z_rb_idx_tree_swap_colors + end interface psb_rb_idx_tree_swap_colors + + interface psb_rb_idx_tree_rotate_right + subroutine psb_z_rb_idx_tree_rotate_right(node) + import :: psb_z_rb_idx_node + implicit none + type(psb_z_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_z_rb_idx_tree_rotate_right + end interface psb_rb_idx_tree_rotate_right + + interface psb_rb_idx_tree_rotate_left + subroutine psb_z_rb_idx_tree_rotate_left(node) + import :: psb_z_rb_idx_node + implicit none + type(psb_z_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_z_rb_idx_tree_rotate_left + end interface psb_rb_idx_tree_rotate_left +end module psb_z_rb_idx_tree_mod diff --git a/base/serial/sort/psi_acx_mod.f90 b/base/modules/auxil/psi_acx_mod.f90 similarity index 100% rename from base/serial/sort/psi_acx_mod.f90 rename to base/modules/auxil/psi_acx_mod.f90 diff --git a/base/serial/sort/psi_alcx_mod.f90 b/base/modules/auxil/psi_alcx_mod.f90 similarity index 100% rename from base/serial/sort/psi_alcx_mod.f90 rename to base/modules/auxil/psi_alcx_mod.f90 diff --git a/base/serial/sort/psi_lcx_mod.f90 b/base/modules/auxil/psi_lcx_mod.f90 similarity index 100% rename from base/serial/sort/psi_lcx_mod.f90 rename to base/modules/auxil/psi_lcx_mod.f90 diff --git a/base/modules/desc/psb_hash_mod.F90 b/base/modules/desc/psb_hash_mod.F90 index 9e45e1f0..eb5556a2 100644 --- a/base/modules/desc/psb_hash_mod.F90 +++ b/base/modules/desc/psb_hash_mod.F90 @@ -458,7 +458,7 @@ contains integer(psb_ipk_), intent(out) :: val, info integer(psb_ipk_) :: hsize,hmask, hk, hd - + logical :: redo info = HashOK hsize = hash%hsize hmask = hash%hmask @@ -483,6 +483,7 @@ contains info = HashDuplicate return end if + redo = .false. !$OMP CRITICAL if (hash%table(hk,1) == HashFreeEntry) then if (hash%nk == hash%hsize -1) then @@ -497,8 +498,9 @@ contains info = HashOutOfMemory !return else - call psb_hash_searchinskey(key,val,nextval,hash,info) - !return + redo = .true. +!!$ call psb_hash_searchinskey(key,val,nextval,hash,info) +!!$ return end if else hash%nk = hash%nk + 1 @@ -509,6 +511,7 @@ contains end if end if !$OMP END CRITICAL + if (redo) call psb_hash_searchinskey(key,val,nextval,hash,info) if (info /= HashOk) return if (val > 0) return hk = hk - hd diff --git a/base/modules/serial/psb_base_mat_mod.F90 b/base/modules/serial/psb_base_mat_mod.F90 index af4e4c6f..2380fcb2 100644 --- a/base/modules/serial/psb_base_mat_mod.F90 +++ b/base/modules/serial/psb_base_mat_mod.F90 @@ -73,6 +73,19 @@ module psb_base_mat_mod use psb_const_mod use psi_serial_mod + ! Algs for SPSPMM + integer(psb_ipk_), parameter :: spspmm_serial = 0 + integer(psb_ipk_), parameter :: spspmm_omp_gustavson = 1 + integer(psb_ipk_), parameter :: spspmm_omp_gustavson_1d = 2 + integer(psb_ipk_), parameter :: spspmm_serial_rb_tree = 3 + integer(psb_ipk_), parameter :: spspmm_omp_rb_tree = 4 + integer(psb_ipk_), parameter :: spspmm_omp_two_pass = 5 +#if defined(OPENMP) + integer(psb_ipk_), save :: spspmm_impl = spspmm_omp_gustavson +#else + integer(psb_ipk_), save :: spspmm_impl = spspmm_serial +#endif + ! !> \namespace psb_base_mod \class psb_base_sparse_mat !! The basic data about your matrix. @@ -1904,4 +1917,22 @@ contains end subroutine psb_base_from_lbase + function psb_get_spspmm_impl() result(impl_id) + integer(psb_ipk_) :: impl_id + impl_id = spspmm_impl + end function psb_get_spspmm_impl + + subroutine psb_set_spspmm_impl(impl_id) + integer(psb_ipk_), intent(in) :: impl_id + + select case(impl_id) + case (spspmm_serial,spspmm_omp_gustavson,spspmm_omp_gustavson_1d,& + & spspmm_serial_rb_tree,spspmm_omp_rb_tree, spspmm_omp_two_pass) + spspmm_impl = impl_id + case default + + write (*,*) "Invalid implementation id",impl_id + end select + end subroutine psb_set_spspmm_impl + end module psb_base_mat_mod diff --git a/base/serial/impl/Makefile b/base/serial/impl/Makefile index 00088741..37f400f0 100644 --- a/base/serial/impl/Makefile +++ b/base/serial/impl/Makefile @@ -7,16 +7,20 @@ BOBJS=psb_base_mat_impl.o \ psb_s_base_mat_impl.o psb_d_base_mat_impl.o psb_c_base_mat_impl.o psb_z_base_mat_impl.o #\ psb_s_lbase_mat_impl.o psb_d_lbase_mat_impl.o psb_c_lbase_mat_impl.o psb_z_lbase_mat_impl.o -SOBJS=psb_s_csr_impl.o psb_s_coo_impl.o psb_s_csc_impl.o psb_s_mat_impl.o +SOBJS=psb_s_csr_impl.o psb_s_coo_impl.o psb_s_csc_impl.o psb_s_mat_impl.o\ + psb_s_rb_idx_tree_impl.o #\ psb_s_lcoo_impl.o psb_s_lcsr_impl.o -DOBJS=psb_d_csr_impl.o psb_d_coo_impl.o psb_d_csc_impl.o psb_d_mat_impl.o +DOBJS=psb_d_csr_impl.o psb_d_coo_impl.o psb_d_csc_impl.o psb_d_mat_impl.o\ + psb_d_rb_idx_tree_impl.o #\ psb_d_lcoo_impl.o psb_d_lcsr_impl.o -COBJS=psb_c_csr_impl.o psb_c_coo_impl.o psb_c_csc_impl.o psb_c_mat_impl.o +COBJS=psb_c_csr_impl.o psb_c_coo_impl.o psb_c_csc_impl.o psb_c_mat_impl.o\ + psb_c_rb_idx_tree_impl.o #\ psb_c_lcoo_impl.o psb_c_lcsr_impl.o -ZOBJS=psb_z_csr_impl.o psb_z_coo_impl.o psb_z_csc_impl.o psb_z_mat_impl.o +ZOBJS=psb_z_csr_impl.o psb_z_coo_impl.o psb_z_csc_impl.o psb_z_mat_impl.o\ + psb_z_rb_idx_tree_impl.o #\ psb_z_lcoo_impl.o psb_z_lcsr_impl.o diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index 6c3ecec6..2028300d 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -3213,10 +3213,10 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) if (info == psb_success_) call psb_safe_ab_cpy(b%ja,a%ja,info) if (info == psb_success_) call psb_safe_ab_cpy(b%val,a%val,info) if (info == psb_success_) call psb_realloc(max(nr+1,nc+1),a%irp,info) - + endif - + #if defined(OPENMP) !$OMP PARALLEL default(shared) reduction(max:info) @@ -3246,7 +3246,7 @@ subroutine psb_c_cp_csr_from_coo(a,b,info) #endif call a%set_host() - + end subroutine psb_c_cp_csr_from_coo @@ -3495,7 +3495,7 @@ subroutine psb_c_cp_csr_to_fmt(a,b,info) if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,b%irp,info) call psb_realloc(nz,b%ja,info) call psb_realloc(nz,b%val,info) @@ -3600,7 +3600,7 @@ subroutine psb_c_cp_csr_from_fmt(a,b,info) if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,a%irp,info) call psb_realloc(nz,a%ja,info) call psb_realloc(nz,a%val,info) @@ -3654,6 +3654,540 @@ subroutine psb_c_csr_clean_zeros(a, info) call a%set_host() end subroutine psb_c_csr_clean_zeros +#if defined(OPENMP) +subroutine psb_ccsrspspmm(a,b,c,info) + use psb_c_mat_mod + use psb_serial_mod, psb_protect_name => psb_ccsrspspmm + + implicit none + + class(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(out) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb + character(len=20) :: name + integer(psb_ipk_) :: err_act + name='psb_csrspspmm' + call psb_erractionsave(err_act) + info = psb_success_ + + if (a%is_dev()) call a%sync() + if (b%is_dev()) call b%sync() + + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + + if ( mb /= na ) then + write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb + info = psb_err_invalid_matrix_sizes_ + call psb_errpush(info,name) + goto 9999 + endif + + select case(spspmm_impl) + case (spspmm_serial) + ! Estimate number of nonzeros on output. + nza = a%get_nzeros() + nzb = b%get_nzeros() + nzc = 2*(nza+nzb) + call c%allocate(ma,nb,nzc) + + call csr_spspmm(a,b,c,info) + case (spspmm_omp_gustavson) + call spmm_omp_gustavson(a,b,c,info) + case (spspmm_omp_gustavson_1d) + call spmm_omp_gustavson_1d(a,b,c,info) + case (spspmm_serial_rb_tree) + call spmm_serial_rb_tree(a,b,c,info) + case (spspmm_omp_rb_tree) + call spmm_omp_rb_tree(a,b,c,info) + case (spspmm_omp_two_pass) + call spmm_omp_two_pass(a,b,c,info) + case default + write(psb_err_unit,*) 'Unknown spspmm implementation' + ! push error + goto 9999 + end select + + call c%set_asb() + call c%set_host() + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +contains + + subroutine csr_spspmm(a,b,c,info) + implicit none + type(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(inout) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb + integer(psb_ipk_), allocatable :: irow(:), idxs(:) + complex(psb_spk_), allocatable :: row(:) + integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & + & nzc,nnzre, isz, ipb, irwsz, nrc, nze + complex(psb_spk_) :: cfb + + + info = psb_success_ + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + nze = min(size(c%val),size(c%ja)) + isz = max(ma,na,mb,nb) + call psb_realloc(isz,row,info) + if (info == 0) call psb_realloc(isz,idxs,info) + if (info == 0) call psb_realloc(isz,irow,info) + if (info /= 0) return + row = dzero + irow = 0 + nzc = 1 + do j = 1,ma + c%irp(j) = nzc + nrc = 0 + do k = a%irp(j), a%irp(j+1)-1 + irw = a%ja(k) + cfb = a%val(k) + irwsz = b%irp(irw+1)-b%irp(irw) + do i = b%irp(irw),b%irp(irw+1)-1 + icl = b%ja(i) + if (irow(icl) 0 ) then + if ((nzc+nrc)>nze) then + nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) + call psb_realloc(nze,c%val,info) + if (info == 0) call psb_realloc(nze,c%ja,info) + if (info /= 0) return + end if + + call psb_qsort(idxs(1:nrc)) + do i=1, nrc + irw = idxs(i) + c%ja(nzc) = irw + c%val(nzc) = row(irw) + row(irw) = dzero + nzc = nzc + 1 + end do + end if + end do + + c%irp(ma+1) = nzc + end subroutine csr_spspmm + + ! gustavson's algorithm using perfect hashing + ! and OpenMP parallelisation + subroutine spmm_omp_gustavson(a,b,c,info) + use omp_lib + + implicit none + type(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + complex(psb_spk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + + allocate(offsets(omp_get_max_threads())) + !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & + !$omp shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw, jj, j, k) + do irw = 1, ma + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson + + subroutine spmm_omp_gustavson_1d(a,b,c,info) + use omp_lib + + implicit none + type(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + complex(psb_spk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, & + start_idx, end_idx , blk, blk_size, rwstart,& + rwblk, rwblkrem, nblks + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + allocate(offsets(omp_get_max_threads())) + + nblks = 4 * omp_get_max_threads() + rwblk = (ma / nblks) + rwblkrem = modulo(ma, nblks) + !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw,jj,j,k,rwnz,blk,blk_size,rwstart) + do blk = 0, nblks - 1 + if (blk < rwblkrem) then + blk_size = rwblk + 1 + rwstart = blk * rwblk + blk + 1 + else + blk_size = rwblk + rwstart = blk * rwblk & + + rwblkrem + 1 + end if + do irw = rwstart, rwstart + blk_size - 1 + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson_1d + + subroutine spmm_serial_rb_tree(a,b,c,info) + use psb_rb_idx_tree_mod + implicit none + type(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_c_rb_idx_tree), allocatable :: row_accs(:) + + a_m = a%get_nrows() + b_n = b%get_ncols() + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + + info = 0 + end subroutine spmm_serial_rb_tree + + subroutine spmm_omp_rb_tree(a,b,c,info) + use omp_lib + use psb_rb_idx_tree_mod + implicit none + type(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_c_rb_idx_tree), allocatable :: row_accs(:) + real(8) :: tic, toc + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + !$omp parallel do schedule(static) + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + info = 0 + end subroutine spmm_omp_rb_tree + + subroutine compute_indices(a, b, c, info) + implicit none + type(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer :: full_mat_bound + integer :: row, col, i, j, k, nnz + + full_mat_bound = 0 + !omp parallel do schedule(static) reduction(+:full_mat_bound) + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + j = a%ja(col) + full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j) + end do + end do + !omp end parallel do + + call psb_realloc(a%get_nrows() + 1, c%irp, info) + call psb_realloc(full_mat_bound, c%ja, info) + c%ja = 0 + c%irp(1) = 1 + + nnz = 0 + + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1 + k = 0 + do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i)) + k = k + 1 + end do + if (c%ja(c%irp(row) + k) == 0) then + c%ja(c%irp(row)+k) = b%ja(i) + nnz = nnz + 1 + end if + end do + end do + c%irp(row + 1) = nnz + 1 + call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1)) + end do + + + call psb_realloc(nnz, c%ja, info) + call psb_realloc(nnz, c%val, info) + + c%val = 0 + end subroutine compute_indices + + subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num) + type(psb_c_csr_sparse_mat), intent(inout) :: out_mat + integer(psb_ipk_), intent(in) :: out_row_num + complex(psb_spk_), intent(in) :: scalar + type(psb_c_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: trgt_row_num + + integer(psb_ipk_) :: i, k, row_start, row_end + + row_start = out_mat%irp(out_row_num) + row_end = out_mat%irp(out_row_num + 1) - 1 + + do i = mat%irp(trgt_row_num), mat%irp(trgt_row_num + 1) - 1 + do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1 + if (out_mat%ja(k) == mat%ja(i)) then + out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i) + exit + end if + end do + end do + + end subroutine direct_scalar_sparse_row_mul + + subroutine spmm_omp_two_pass(a,b,c,info) + use omp_lib + + implicit none + type(psb_c_csr_sparse_mat), intent(in) :: a,b + type(psb_c_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n, row, col + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + call compute_indices(a, b, c, info) + + !$omp parallel do schedule(static) + do row = 1, a_m + do col = a%irp(row), a%irp(row + 1) - 1 + call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + end subroutine spmm_omp_two_pass + +end subroutine psb_ccsrspspmm + +#else + subroutine psb_ccsrspspmm(a,b,c,info) use psb_c_mat_mod use psb_serial_mod, psb_protect_name => psb_ccsrspspmm @@ -3776,7 +4310,7 @@ contains end subroutine csr_spspmm end subroutine psb_ccsrspspmm - +#endif ! ! diff --git a/base/serial/impl/psb_c_rb_idx_tree_impl.F90 b/base/serial/impl/psb_c_rb_idx_tree_impl.F90 new file mode 100644 index 00000000..42704c6e --- /dev/null +++ b/base/serial/impl/psb_c_rb_idx_tree_impl.F90 @@ -0,0 +1,329 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_c_rb_idx_tree_impl +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +subroutine psb_c_rb_idx_tree_insert(this, idx, val) + use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_insert + implicit none + class(psb_c_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + complex(psb_spk_), intent(in) :: val + + character(len=22) :: name + type(psb_c_rb_idx_node), pointer :: new_node + type(psb_c_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_insert' + + allocate(new_node) + new_node%idx = idx + new_node%val = val + nullify(new_node%left) + nullify(new_node%right) + nullify(new_node%parent) + new_node%is_red = .true. + + + if (.not. associated(this%root)) then + this%root => new_node + this%nnz = 1 + new_node%is_red = .false. + return + end if + + current => this%root + + do while (associated(current)) + previous => current + + if (idx == current%idx) then + current%val = current%val + val + deallocate(new_node) + return + else if (idx < current%idx) then + current => current%left + else + + current => current%right + end if + end do + + if (idx < previous%idx) then + new_node%parent => previous + previous%left => new_node + else + new_node%parent => previous + previous%right => new_node + end if + + call psb_c_rb_idx_tree_fix_insertion(this, new_node) + + this%nnz = this%nnz + 1 +end subroutine psb_c_rb_idx_tree_insert + +subroutine psb_c_rb_idx_tree_fix_insertion(this, node) + use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_fix_insertion + implicit none + class(psb_c_rb_idx_tree), intent(inout) :: this + type(psb_c_rb_idx_node), pointer, intent(inout) :: node + + character(len=29) :: name + type(psb_c_rb_idx_node), pointer :: current, parent, grand_parent, uncle + name = 'psb_rb_idx_tree_fix_insertion' + + current => node + parent => current%parent + do while(associated(parent) .and. parent%is_red) + ! grand parent exist because root can't be red + grand_parent => parent%parent + if (parent%idx < grand_parent%idx) then + uncle => grand_parent%right + else + uncle => grand_parent%left + end if + + if (associated(uncle) .and. uncle%is_red) then + parent%is_red = .false. + uncle%is_red = .false. + grand_parent%is_red = .true. + current => grand_parent + parent => current%parent + + ! Left-Left case + else if (current%idx < parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_c_rb_idx_tree_rotate_right(grand_parent) + call psb_c_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Left-Right case + else if (current%idx > parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_c_rb_idx_tree_rotate_left(parent) + call psb_c_rb_idx_tree_rotate_right(grand_parent) + call psb_c_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + ! Right-Right case + else if (current%idx > parent%idx .and. & + parent%idx > grand_parent%idx) then + call psb_c_rb_idx_tree_rotate_left(grand_parent) + call psb_c_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Right-Left case + else + call psb_c_rb_idx_tree_rotate_right(parent) + call psb_c_rb_idx_tree_rotate_left(grand_parent) + call psb_c_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + end if + end do + + this%root%is_red = .false. +end subroutine psb_c_rb_idx_tree_fix_insertion + +subroutine psb_c_rb_idx_tree_swap_colors(n1, n2) + use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_swap_colors + implicit none + type(psb_c_rb_idx_node), pointer, intent(inout) :: n1, n2 + + character(len=27) :: name + logical :: tmp + name='psb_rb_idx_tree_swap_colors' + + tmp = n1%is_red + n1%is_red = n2%is_red + n2%is_red = tmp +end subroutine psb_c_rb_idx_tree_swap_colors + +subroutine psb_c_rb_idx_tree_rotate_right(node) + use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_rotate_right + implicit none + type(psb_c_rb_idx_node), pointer, intent(inout) :: node + + character(len=28) :: name + type(psb_c_rb_idx_node), pointer :: l, lr + name='psb_rb_idx_tree_rotate_right' + + if (.not. associated(node%left)) return + + l => node%left + lr => l%right + node%left => lr + + if (associated(lr)) lr%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => l + else + node%parent%right => l + end if + end if + + l%parent => node%parent + node%parent => l + + l%right => node +end subroutine psb_c_rb_idx_tree_rotate_right + +subroutine psb_c_rb_idx_tree_rotate_left(node) + use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_rotate_left + implicit none + type(psb_c_rb_idx_node), pointer, intent(inout) :: node + + character(len=27) :: name + type(psb_c_rb_idx_node), pointer :: r, rl + name='psb_rb_idx_tree_rotate_left' + + if (.not. associated(node%right)) return + + r => node%right + rl => r%left + node%right => rl + + if (associated(rl)) rl%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => r + else + node%parent%right => r + end if + end if + + r%parent => node%parent + node%parent => r + + r%left => node +end subroutine psb_c_rb_idx_tree_rotate_left + +subroutine psb_c_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_scalar_sparse_row_mul + use psb_c_csr_mat_mod, only : psb_c_csr_sparse_mat + implicit none + type(psb_c_rb_idx_tree), intent(inout) :: tree + complex(psb_spk_), intent(in) :: scalar + type(psb_c_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + + character(len=37) :: name + integer(psb_ipk_) :: i + name='psb_rb_idx_tree_scalar_sparse_row_mul' + + do i = mat%irp(row_num), mat%irp(row_num + 1) - 1 + call tree%insert(mat%ja(i),scalar * mat%val(i)) + end do + +end subroutine psb_c_rb_idx_tree_scalar_sparse_row_mul + +subroutine psb_c_rb_idx_tree_merge(trees, mat) +#if defined(OPENMP) + use omp_lib +#endif + use psb_realloc_mod + use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_merge + use psb_c_csr_mat_mod, only : psb_c_csr_sparse_mat + implicit none + type(psb_c_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_c_csr_sparse_mat), intent(inout) :: mat + + character(len=21) :: name + integer(psb_ipk_) :: i, j, rows, info, nnz + type(psb_c_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_merge' + + rows = size(trees) + + mat%irp(1) = 1 + + do i=1, rows + mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz + end do + + nnz = mat%irp(rows + 1) + call psb_realloc(nnz, mat%val, info) + call psb_realloc(nnz, mat%ja, info) + +#if defined(OPENMP) + !$omp parallel do schedule(static), private(current, previous, j) +#endif + do i = 1, size(trees) + j = 0 + current => trees(i)%root + do while(associated(current)) + ! go to the left-most node + do while(associated(current%left)) + current => current%left + end do + mat%val(j + mat%irp(i)) = current%val + mat%ja(j + mat%irp(i)) = current%idx + j = j + 1 + + previous => current + if (associated(current%right)) then + if (associated(current%parent)) then + current%parent%left => current%right + end if + current%right%parent => current%parent + current => current%right + else + current => current%parent + if (associated(current)) nullify(current%left) + end if + deallocate(previous) + end do + end do +#if defined(OPENMP) + !$omp end parallel do +#endif +end subroutine psb_c_rb_idx_tree_merge diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index 5b667342..10f99bc9 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -3213,10 +3213,10 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) if (info == psb_success_) call psb_safe_ab_cpy(b%ja,a%ja,info) if (info == psb_success_) call psb_safe_ab_cpy(b%val,a%val,info) if (info == psb_success_) call psb_realloc(max(nr+1,nc+1),a%irp,info) - + endif - + #if defined(OPENMP) !$OMP PARALLEL default(shared) reduction(max:info) @@ -3246,7 +3246,7 @@ subroutine psb_d_cp_csr_from_coo(a,b,info) #endif call a%set_host() - + end subroutine psb_d_cp_csr_from_coo @@ -3495,7 +3495,7 @@ subroutine psb_d_cp_csr_to_fmt(a,b,info) if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,b%irp,info) call psb_realloc(nz,b%ja,info) call psb_realloc(nz,b%val,info) @@ -3600,7 +3600,7 @@ subroutine psb_d_cp_csr_from_fmt(a,b,info) if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,a%irp,info) call psb_realloc(nz,a%ja,info) call psb_realloc(nz,a%val,info) @@ -3654,6 +3654,540 @@ subroutine psb_d_csr_clean_zeros(a, info) call a%set_host() end subroutine psb_d_csr_clean_zeros +#if defined(OPENMP) +subroutine psb_dcsrspspmm(a,b,c,info) + use psb_d_mat_mod + use psb_serial_mod, psb_protect_name => psb_dcsrspspmm + + implicit none + + class(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb + character(len=20) :: name + integer(psb_ipk_) :: err_act + name='psb_csrspspmm' + call psb_erractionsave(err_act) + info = psb_success_ + + if (a%is_dev()) call a%sync() + if (b%is_dev()) call b%sync() + + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + + if ( mb /= na ) then + write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb + info = psb_err_invalid_matrix_sizes_ + call psb_errpush(info,name) + goto 9999 + endif + + select case(spspmm_impl) + case (spspmm_serial) + ! Estimate number of nonzeros on output. + nza = a%get_nzeros() + nzb = b%get_nzeros() + nzc = 2*(nza+nzb) + call c%allocate(ma,nb,nzc) + + call csr_spspmm(a,b,c,info) + case (spspmm_omp_gustavson) + call spmm_omp_gustavson(a,b,c,info) + case (spspmm_omp_gustavson_1d) + call spmm_omp_gustavson_1d(a,b,c,info) + case (spspmm_serial_rb_tree) + call spmm_serial_rb_tree(a,b,c,info) + case (spspmm_omp_rb_tree) + call spmm_omp_rb_tree(a,b,c,info) + case (spspmm_omp_two_pass) + call spmm_omp_two_pass(a,b,c,info) + case default + write(psb_err_unit,*) 'Unknown spspmm implementation' + ! push error + goto 9999 + end select + + call c%set_asb() + call c%set_host() + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +contains + + subroutine csr_spspmm(a,b,c,info) + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(inout) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb + integer(psb_ipk_), allocatable :: irow(:), idxs(:) + real(psb_dpk_), allocatable :: row(:) + integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & + & nzc,nnzre, isz, ipb, irwsz, nrc, nze + real(psb_dpk_) :: cfb + + + info = psb_success_ + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + nze = min(size(c%val),size(c%ja)) + isz = max(ma,na,mb,nb) + call psb_realloc(isz,row,info) + if (info == 0) call psb_realloc(isz,idxs,info) + if (info == 0) call psb_realloc(isz,irow,info) + if (info /= 0) return + row = dzero + irow = 0 + nzc = 1 + do j = 1,ma + c%irp(j) = nzc + nrc = 0 + do k = a%irp(j), a%irp(j+1)-1 + irw = a%ja(k) + cfb = a%val(k) + irwsz = b%irp(irw+1)-b%irp(irw) + do i = b%irp(irw),b%irp(irw+1)-1 + icl = b%ja(i) + if (irow(icl) 0 ) then + if ((nzc+nrc)>nze) then + nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) + call psb_realloc(nze,c%val,info) + if (info == 0) call psb_realloc(nze,c%ja,info) + if (info /= 0) return + end if + + call psb_qsort(idxs(1:nrc)) + do i=1, nrc + irw = idxs(i) + c%ja(nzc) = irw + c%val(nzc) = row(irw) + row(irw) = dzero + nzc = nzc + 1 + end do + end if + end do + + c%irp(ma+1) = nzc + end subroutine csr_spspmm + + ! gustavson's algorithm using perfect hashing + ! and OpenMP parallelisation + subroutine spmm_omp_gustavson(a,b,c,info) + use omp_lib + + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + real(psb_dpk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + + allocate(offsets(omp_get_max_threads())) + !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & + !$omp shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw, jj, j, k) + do irw = 1, ma + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson + + subroutine spmm_omp_gustavson_1d(a,b,c,info) + use omp_lib + + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + real(psb_dpk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, & + start_idx, end_idx , blk, blk_size, rwstart,& + rwblk, rwblkrem, nblks + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + allocate(offsets(omp_get_max_threads())) + + nblks = 4 * omp_get_max_threads() + rwblk = (ma / nblks) + rwblkrem = modulo(ma, nblks) + !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw,jj,j,k,rwnz,blk,blk_size,rwstart) + do blk = 0, nblks - 1 + if (blk < rwblkrem) then + blk_size = rwblk + 1 + rwstart = blk * rwblk + blk + 1 + else + blk_size = rwblk + rwstart = blk * rwblk & + + rwblkrem + 1 + end if + do irw = rwstart, rwstart + blk_size - 1 + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson_1d + + subroutine spmm_serial_rb_tree(a,b,c,info) + use psb_rb_idx_tree_mod + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_d_rb_idx_tree), allocatable :: row_accs(:) + + a_m = a%get_nrows() + b_n = b%get_ncols() + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + + info = 0 + end subroutine spmm_serial_rb_tree + + subroutine spmm_omp_rb_tree(a,b,c,info) + use omp_lib + use psb_rb_idx_tree_mod + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_d_rb_idx_tree), allocatable :: row_accs(:) + real(8) :: tic, toc + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + !$omp parallel do schedule(static) + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + info = 0 + end subroutine spmm_omp_rb_tree + + subroutine compute_indices(a, b, c, info) + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer :: full_mat_bound + integer :: row, col, i, j, k, nnz + + full_mat_bound = 0 + !omp parallel do schedule(static) reduction(+:full_mat_bound) + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + j = a%ja(col) + full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j) + end do + end do + !omp end parallel do + + call psb_realloc(a%get_nrows() + 1, c%irp, info) + call psb_realloc(full_mat_bound, c%ja, info) + c%ja = 0 + c%irp(1) = 1 + + nnz = 0 + + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1 + k = 0 + do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i)) + k = k + 1 + end do + if (c%ja(c%irp(row) + k) == 0) then + c%ja(c%irp(row)+k) = b%ja(i) + nnz = nnz + 1 + end if + end do + end do + c%irp(row + 1) = nnz + 1 + call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1)) + end do + + + call psb_realloc(nnz, c%ja, info) + call psb_realloc(nnz, c%val, info) + + c%val = 0 + end subroutine compute_indices + + subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num) + type(psb_d_csr_sparse_mat), intent(inout) :: out_mat + integer(psb_ipk_), intent(in) :: out_row_num + real(psb_dpk_), intent(in) :: scalar + type(psb_d_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: trgt_row_num + + integer(psb_ipk_) :: i, k, row_start, row_end + + row_start = out_mat%irp(out_row_num) + row_end = out_mat%irp(out_row_num + 1) - 1 + + do i = mat%irp(trgt_row_num), mat%irp(trgt_row_num + 1) - 1 + do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1 + if (out_mat%ja(k) == mat%ja(i)) then + out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i) + exit + end if + end do + end do + + end subroutine direct_scalar_sparse_row_mul + + subroutine spmm_omp_two_pass(a,b,c,info) + use omp_lib + + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n, row, col + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + call compute_indices(a, b, c, info) + + !$omp parallel do schedule(static) + do row = 1, a_m + do col = a%irp(row), a%irp(row + 1) - 1 + call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + end subroutine spmm_omp_two_pass + +end subroutine psb_dcsrspspmm + +#else + subroutine psb_dcsrspspmm(a,b,c,info) use psb_d_mat_mod use psb_serial_mod, psb_protect_name => psb_dcsrspspmm @@ -3776,7 +4310,7 @@ contains end subroutine csr_spspmm end subroutine psb_dcsrspspmm - +#endif ! ! diff --git a/base/serial/impl/psb_d_rb_idx_tree_impl.F90 b/base/serial/impl/psb_d_rb_idx_tree_impl.F90 new file mode 100644 index 00000000..9b63d51c --- /dev/null +++ b/base/serial/impl/psb_d_rb_idx_tree_impl.F90 @@ -0,0 +1,329 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_d_rb_idx_tree_impl +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +subroutine psb_d_rb_idx_tree_insert(this, idx, val) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_insert + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + real(psb_dpk_), intent(in) :: val + + character(len=22) :: name + type(psb_d_rb_idx_node), pointer :: new_node + type(psb_d_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_insert' + + allocate(new_node) + new_node%idx = idx + new_node%val = val + nullify(new_node%left) + nullify(new_node%right) + nullify(new_node%parent) + new_node%is_red = .true. + + + if (.not. associated(this%root)) then + this%root => new_node + this%nnz = 1 + new_node%is_red = .false. + return + end if + + current => this%root + + do while (associated(current)) + previous => current + + if (idx == current%idx) then + current%val = current%val + val + deallocate(new_node) + return + else if (idx < current%idx) then + current => current%left + else + + current => current%right + end if + end do + + if (idx < previous%idx) then + new_node%parent => previous + previous%left => new_node + else + new_node%parent => previous + previous%right => new_node + end if + + call psb_d_rb_idx_tree_fix_insertion(this, new_node) + + this%nnz = this%nnz + 1 +end subroutine psb_d_rb_idx_tree_insert + +subroutine psb_d_rb_idx_tree_fix_insertion(this, node) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_fix_insertion + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + + character(len=29) :: name + type(psb_d_rb_idx_node), pointer :: current, parent, grand_parent, uncle + name = 'psb_rb_idx_tree_fix_insertion' + + current => node + parent => current%parent + do while(associated(parent) .and. parent%is_red) + ! grand parent exist because root can't be red + grand_parent => parent%parent + if (parent%idx < grand_parent%idx) then + uncle => grand_parent%right + else + uncle => grand_parent%left + end if + + if (associated(uncle) .and. uncle%is_red) then + parent%is_red = .false. + uncle%is_red = .false. + grand_parent%is_red = .true. + current => grand_parent + parent => current%parent + + ! Left-Left case + else if (current%idx < parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_d_rb_idx_tree_rotate_right(grand_parent) + call psb_d_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Left-Right case + else if (current%idx > parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_d_rb_idx_tree_rotate_left(parent) + call psb_d_rb_idx_tree_rotate_right(grand_parent) + call psb_d_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + ! Right-Right case + else if (current%idx > parent%idx .and. & + parent%idx > grand_parent%idx) then + call psb_d_rb_idx_tree_rotate_left(grand_parent) + call psb_d_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Right-Left case + else + call psb_d_rb_idx_tree_rotate_right(parent) + call psb_d_rb_idx_tree_rotate_left(grand_parent) + call psb_d_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + end if + end do + + this%root%is_red = .false. +end subroutine psb_d_rb_idx_tree_fix_insertion + +subroutine psb_d_rb_idx_tree_swap_colors(n1, n2) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_swap_colors + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: n1, n2 + + character(len=27) :: name + logical :: tmp + name='psb_rb_idx_tree_swap_colors' + + tmp = n1%is_red + n1%is_red = n2%is_red + n2%is_red = tmp +end subroutine psb_d_rb_idx_tree_swap_colors + +subroutine psb_d_rb_idx_tree_rotate_right(node) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_rotate_right + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + + character(len=28) :: name + type(psb_d_rb_idx_node), pointer :: l, lr + name='psb_rb_idx_tree_rotate_right' + + if (.not. associated(node%left)) return + + l => node%left + lr => l%right + node%left => lr + + if (associated(lr)) lr%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => l + else + node%parent%right => l + end if + end if + + l%parent => node%parent + node%parent => l + + l%right => node +end subroutine psb_d_rb_idx_tree_rotate_right + +subroutine psb_d_rb_idx_tree_rotate_left(node) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_rotate_left + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + + character(len=27) :: name + type(psb_d_rb_idx_node), pointer :: r, rl + name='psb_rb_idx_tree_rotate_left' + + if (.not. associated(node%right)) return + + r => node%right + rl => r%left + node%right => rl + + if (associated(rl)) rl%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => r + else + node%parent%right => r + end if + end if + + r%parent => node%parent + node%parent => r + + r%left => node +end subroutine psb_d_rb_idx_tree_rotate_left + +subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_scalar_sparse_row_mul + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + implicit none + type(psb_d_rb_idx_tree), intent(inout) :: tree + real(psb_dpk_), intent(in) :: scalar + type(psb_d_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + + character(len=37) :: name + integer(psb_ipk_) :: i + name='psb_rb_idx_tree_scalar_sparse_row_mul' + + do i = mat%irp(row_num), mat%irp(row_num + 1) - 1 + call tree%insert(mat%ja(i),scalar * mat%val(i)) + end do + +end subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul + +subroutine psb_d_rb_idx_tree_merge(trees, mat) +#if defined(OPENMP) + use omp_lib +#endif + use psb_realloc_mod + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_merge + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + implicit none + type(psb_d_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_d_csr_sparse_mat), intent(inout) :: mat + + character(len=21) :: name + integer(psb_ipk_) :: i, j, rows, info, nnz + type(psb_d_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_merge' + + rows = size(trees) + + mat%irp(1) = 1 + + do i=1, rows + mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz + end do + + nnz = mat%irp(rows + 1) + call psb_realloc(nnz, mat%val, info) + call psb_realloc(nnz, mat%ja, info) + +#if defined(OPENMP) + !$omp parallel do schedule(static), private(current, previous, j) +#endif + do i = 1, size(trees) + j = 0 + current => trees(i)%root + do while(associated(current)) + ! go to the left-most node + do while(associated(current%left)) + current => current%left + end do + mat%val(j + mat%irp(i)) = current%val + mat%ja(j + mat%irp(i)) = current%idx + j = j + 1 + + previous => current + if (associated(current%right)) then + if (associated(current%parent)) then + current%parent%left => current%right + end if + current%right%parent => current%parent + current => current%right + else + current => current%parent + if (associated(current)) nullify(current%left) + end if + deallocate(previous) + end do + end do +#if defined(OPENMP) + !$omp end parallel do +#endif +end subroutine psb_d_rb_idx_tree_merge diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index dbe7a4be..f3d5c669 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -3213,10 +3213,10 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) if (info == psb_success_) call psb_safe_ab_cpy(b%ja,a%ja,info) if (info == psb_success_) call psb_safe_ab_cpy(b%val,a%val,info) if (info == psb_success_) call psb_realloc(max(nr+1,nc+1),a%irp,info) - + endif - + #if defined(OPENMP) !$OMP PARALLEL default(shared) reduction(max:info) @@ -3246,7 +3246,7 @@ subroutine psb_s_cp_csr_from_coo(a,b,info) #endif call a%set_host() - + end subroutine psb_s_cp_csr_from_coo @@ -3495,7 +3495,7 @@ subroutine psb_s_cp_csr_to_fmt(a,b,info) if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,b%irp,info) call psb_realloc(nz,b%ja,info) call psb_realloc(nz,b%val,info) @@ -3600,7 +3600,7 @@ subroutine psb_s_cp_csr_from_fmt(a,b,info) if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,a%irp,info) call psb_realloc(nz,a%ja,info) call psb_realloc(nz,a%val,info) @@ -3654,6 +3654,540 @@ subroutine psb_s_csr_clean_zeros(a, info) call a%set_host() end subroutine psb_s_csr_clean_zeros +#if defined(OPENMP) +subroutine psb_scsrspspmm(a,b,c,info) + use psb_s_mat_mod + use psb_serial_mod, psb_protect_name => psb_scsrspspmm + + implicit none + + class(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(out) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb + character(len=20) :: name + integer(psb_ipk_) :: err_act + name='psb_csrspspmm' + call psb_erractionsave(err_act) + info = psb_success_ + + if (a%is_dev()) call a%sync() + if (b%is_dev()) call b%sync() + + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + + if ( mb /= na ) then + write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb + info = psb_err_invalid_matrix_sizes_ + call psb_errpush(info,name) + goto 9999 + endif + + select case(spspmm_impl) + case (spspmm_serial) + ! Estimate number of nonzeros on output. + nza = a%get_nzeros() + nzb = b%get_nzeros() + nzc = 2*(nza+nzb) + call c%allocate(ma,nb,nzc) + + call csr_spspmm(a,b,c,info) + case (spspmm_omp_gustavson) + call spmm_omp_gustavson(a,b,c,info) + case (spspmm_omp_gustavson_1d) + call spmm_omp_gustavson_1d(a,b,c,info) + case (spspmm_serial_rb_tree) + call spmm_serial_rb_tree(a,b,c,info) + case (spspmm_omp_rb_tree) + call spmm_omp_rb_tree(a,b,c,info) + case (spspmm_omp_two_pass) + call spmm_omp_two_pass(a,b,c,info) + case default + write(psb_err_unit,*) 'Unknown spspmm implementation' + ! push error + goto 9999 + end select + + call c%set_asb() + call c%set_host() + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +contains + + subroutine csr_spspmm(a,b,c,info) + implicit none + type(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(inout) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb + integer(psb_ipk_), allocatable :: irow(:), idxs(:) + real(psb_spk_), allocatable :: row(:) + integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & + & nzc,nnzre, isz, ipb, irwsz, nrc, nze + real(psb_spk_) :: cfb + + + info = psb_success_ + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + nze = min(size(c%val),size(c%ja)) + isz = max(ma,na,mb,nb) + call psb_realloc(isz,row,info) + if (info == 0) call psb_realloc(isz,idxs,info) + if (info == 0) call psb_realloc(isz,irow,info) + if (info /= 0) return + row = dzero + irow = 0 + nzc = 1 + do j = 1,ma + c%irp(j) = nzc + nrc = 0 + do k = a%irp(j), a%irp(j+1)-1 + irw = a%ja(k) + cfb = a%val(k) + irwsz = b%irp(irw+1)-b%irp(irw) + do i = b%irp(irw),b%irp(irw+1)-1 + icl = b%ja(i) + if (irow(icl) 0 ) then + if ((nzc+nrc)>nze) then + nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) + call psb_realloc(nze,c%val,info) + if (info == 0) call psb_realloc(nze,c%ja,info) + if (info /= 0) return + end if + + call psb_qsort(idxs(1:nrc)) + do i=1, nrc + irw = idxs(i) + c%ja(nzc) = irw + c%val(nzc) = row(irw) + row(irw) = dzero + nzc = nzc + 1 + end do + end if + end do + + c%irp(ma+1) = nzc + end subroutine csr_spspmm + + ! gustavson's algorithm using perfect hashing + ! and OpenMP parallelisation + subroutine spmm_omp_gustavson(a,b,c,info) + use omp_lib + + implicit none + type(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + real(psb_spk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + + allocate(offsets(omp_get_max_threads())) + !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & + !$omp shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw, jj, j, k) + do irw = 1, ma + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson + + subroutine spmm_omp_gustavson_1d(a,b,c,info) + use omp_lib + + implicit none + type(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + real(psb_spk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, & + start_idx, end_idx , blk, blk_size, rwstart,& + rwblk, rwblkrem, nblks + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + allocate(offsets(omp_get_max_threads())) + + nblks = 4 * omp_get_max_threads() + rwblk = (ma / nblks) + rwblkrem = modulo(ma, nblks) + !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw,jj,j,k,rwnz,blk,blk_size,rwstart) + do blk = 0, nblks - 1 + if (blk < rwblkrem) then + blk_size = rwblk + 1 + rwstart = blk * rwblk + blk + 1 + else + blk_size = rwblk + rwstart = blk * rwblk & + + rwblkrem + 1 + end if + do irw = rwstart, rwstart + blk_size - 1 + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson_1d + + subroutine spmm_serial_rb_tree(a,b,c,info) + use psb_rb_idx_tree_mod + implicit none + type(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_s_rb_idx_tree), allocatable :: row_accs(:) + + a_m = a%get_nrows() + b_n = b%get_ncols() + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + + info = 0 + end subroutine spmm_serial_rb_tree + + subroutine spmm_omp_rb_tree(a,b,c,info) + use omp_lib + use psb_rb_idx_tree_mod + implicit none + type(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_s_rb_idx_tree), allocatable :: row_accs(:) + real(8) :: tic, toc + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + !$omp parallel do schedule(static) + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + info = 0 + end subroutine spmm_omp_rb_tree + + subroutine compute_indices(a, b, c, info) + implicit none + type(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer :: full_mat_bound + integer :: row, col, i, j, k, nnz + + full_mat_bound = 0 + !omp parallel do schedule(static) reduction(+:full_mat_bound) + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + j = a%ja(col) + full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j) + end do + end do + !omp end parallel do + + call psb_realloc(a%get_nrows() + 1, c%irp, info) + call psb_realloc(full_mat_bound, c%ja, info) + c%ja = 0 + c%irp(1) = 1 + + nnz = 0 + + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1 + k = 0 + do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i)) + k = k + 1 + end do + if (c%ja(c%irp(row) + k) == 0) then + c%ja(c%irp(row)+k) = b%ja(i) + nnz = nnz + 1 + end if + end do + end do + c%irp(row + 1) = nnz + 1 + call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1)) + end do + + + call psb_realloc(nnz, c%ja, info) + call psb_realloc(nnz, c%val, info) + + c%val = 0 + end subroutine compute_indices + + subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num) + type(psb_s_csr_sparse_mat), intent(inout) :: out_mat + integer(psb_ipk_), intent(in) :: out_row_num + real(psb_spk_), intent(in) :: scalar + type(psb_s_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: trgt_row_num + + integer(psb_ipk_) :: i, k, row_start, row_end + + row_start = out_mat%irp(out_row_num) + row_end = out_mat%irp(out_row_num + 1) - 1 + + do i = mat%irp(trgt_row_num), mat%irp(trgt_row_num + 1) - 1 + do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1 + if (out_mat%ja(k) == mat%ja(i)) then + out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i) + exit + end if + end do + end do + + end subroutine direct_scalar_sparse_row_mul + + subroutine spmm_omp_two_pass(a,b,c,info) + use omp_lib + + implicit none + type(psb_s_csr_sparse_mat), intent(in) :: a,b + type(psb_s_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n, row, col + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + call compute_indices(a, b, c, info) + + !$omp parallel do schedule(static) + do row = 1, a_m + do col = a%irp(row), a%irp(row + 1) - 1 + call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + end subroutine spmm_omp_two_pass + +end subroutine psb_scsrspspmm + +#else + subroutine psb_scsrspspmm(a,b,c,info) use psb_s_mat_mod use psb_serial_mod, psb_protect_name => psb_scsrspspmm @@ -3776,7 +4310,7 @@ contains end subroutine csr_spspmm end subroutine psb_scsrspspmm - +#endif ! ! diff --git a/base/serial/impl/psb_s_rb_idx_tree_impl.F90 b/base/serial/impl/psb_s_rb_idx_tree_impl.F90 new file mode 100644 index 00000000..ae624f72 --- /dev/null +++ b/base/serial/impl/psb_s_rb_idx_tree_impl.F90 @@ -0,0 +1,329 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_s_rb_idx_tree_impl +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +subroutine psb_s_rb_idx_tree_insert(this, idx, val) + use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_insert + implicit none + class(psb_s_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + real(psb_spk_), intent(in) :: val + + character(len=22) :: name + type(psb_s_rb_idx_node), pointer :: new_node + type(psb_s_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_insert' + + allocate(new_node) + new_node%idx = idx + new_node%val = val + nullify(new_node%left) + nullify(new_node%right) + nullify(new_node%parent) + new_node%is_red = .true. + + + if (.not. associated(this%root)) then + this%root => new_node + this%nnz = 1 + new_node%is_red = .false. + return + end if + + current => this%root + + do while (associated(current)) + previous => current + + if (idx == current%idx) then + current%val = current%val + val + deallocate(new_node) + return + else if (idx < current%idx) then + current => current%left + else + + current => current%right + end if + end do + + if (idx < previous%idx) then + new_node%parent => previous + previous%left => new_node + else + new_node%parent => previous + previous%right => new_node + end if + + call psb_s_rb_idx_tree_fix_insertion(this, new_node) + + this%nnz = this%nnz + 1 +end subroutine psb_s_rb_idx_tree_insert + +subroutine psb_s_rb_idx_tree_fix_insertion(this, node) + use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_fix_insertion + implicit none + class(psb_s_rb_idx_tree), intent(inout) :: this + type(psb_s_rb_idx_node), pointer, intent(inout) :: node + + character(len=29) :: name + type(psb_s_rb_idx_node), pointer :: current, parent, grand_parent, uncle + name = 'psb_rb_idx_tree_fix_insertion' + + current => node + parent => current%parent + do while(associated(parent) .and. parent%is_red) + ! grand parent exist because root can't be red + grand_parent => parent%parent + if (parent%idx < grand_parent%idx) then + uncle => grand_parent%right + else + uncle => grand_parent%left + end if + + if (associated(uncle) .and. uncle%is_red) then + parent%is_red = .false. + uncle%is_red = .false. + grand_parent%is_red = .true. + current => grand_parent + parent => current%parent + + ! Left-Left case + else if (current%idx < parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_s_rb_idx_tree_rotate_right(grand_parent) + call psb_s_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Left-Right case + else if (current%idx > parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_s_rb_idx_tree_rotate_left(parent) + call psb_s_rb_idx_tree_rotate_right(grand_parent) + call psb_s_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + ! Right-Right case + else if (current%idx > parent%idx .and. & + parent%idx > grand_parent%idx) then + call psb_s_rb_idx_tree_rotate_left(grand_parent) + call psb_s_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Right-Left case + else + call psb_s_rb_idx_tree_rotate_right(parent) + call psb_s_rb_idx_tree_rotate_left(grand_parent) + call psb_s_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + end if + end do + + this%root%is_red = .false. +end subroutine psb_s_rb_idx_tree_fix_insertion + +subroutine psb_s_rb_idx_tree_swap_colors(n1, n2) + use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_swap_colors + implicit none + type(psb_s_rb_idx_node), pointer, intent(inout) :: n1, n2 + + character(len=27) :: name + logical :: tmp + name='psb_rb_idx_tree_swap_colors' + + tmp = n1%is_red + n1%is_red = n2%is_red + n2%is_red = tmp +end subroutine psb_s_rb_idx_tree_swap_colors + +subroutine psb_s_rb_idx_tree_rotate_right(node) + use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_rotate_right + implicit none + type(psb_s_rb_idx_node), pointer, intent(inout) :: node + + character(len=28) :: name + type(psb_s_rb_idx_node), pointer :: l, lr + name='psb_rb_idx_tree_rotate_right' + + if (.not. associated(node%left)) return + + l => node%left + lr => l%right + node%left => lr + + if (associated(lr)) lr%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => l + else + node%parent%right => l + end if + end if + + l%parent => node%parent + node%parent => l + + l%right => node +end subroutine psb_s_rb_idx_tree_rotate_right + +subroutine psb_s_rb_idx_tree_rotate_left(node) + use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_rotate_left + implicit none + type(psb_s_rb_idx_node), pointer, intent(inout) :: node + + character(len=27) :: name + type(psb_s_rb_idx_node), pointer :: r, rl + name='psb_rb_idx_tree_rotate_left' + + if (.not. associated(node%right)) return + + r => node%right + rl => r%left + node%right => rl + + if (associated(rl)) rl%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => r + else + node%parent%right => r + end if + end if + + r%parent => node%parent + node%parent => r + + r%left => node +end subroutine psb_s_rb_idx_tree_rotate_left + +subroutine psb_s_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_scalar_sparse_row_mul + use psb_s_csr_mat_mod, only : psb_s_csr_sparse_mat + implicit none + type(psb_s_rb_idx_tree), intent(inout) :: tree + real(psb_spk_), intent(in) :: scalar + type(psb_s_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + + character(len=37) :: name + integer(psb_ipk_) :: i + name='psb_rb_idx_tree_scalar_sparse_row_mul' + + do i = mat%irp(row_num), mat%irp(row_num + 1) - 1 + call tree%insert(mat%ja(i),scalar * mat%val(i)) + end do + +end subroutine psb_s_rb_idx_tree_scalar_sparse_row_mul + +subroutine psb_s_rb_idx_tree_merge(trees, mat) +#if defined(OPENMP) + use omp_lib +#endif + use psb_realloc_mod + use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_merge + use psb_s_csr_mat_mod, only : psb_s_csr_sparse_mat + implicit none + type(psb_s_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_s_csr_sparse_mat), intent(inout) :: mat + + character(len=21) :: name + integer(psb_ipk_) :: i, j, rows, info, nnz + type(psb_s_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_merge' + + rows = size(trees) + + mat%irp(1) = 1 + + do i=1, rows + mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz + end do + + nnz = mat%irp(rows + 1) + call psb_realloc(nnz, mat%val, info) + call psb_realloc(nnz, mat%ja, info) + +#if defined(OPENMP) + !$omp parallel do schedule(static), private(current, previous, j) +#endif + do i = 1, size(trees) + j = 0 + current => trees(i)%root + do while(associated(current)) + ! go to the left-most node + do while(associated(current%left)) + current => current%left + end do + mat%val(j + mat%irp(i)) = current%val + mat%ja(j + mat%irp(i)) = current%idx + j = j + 1 + + previous => current + if (associated(current%right)) then + if (associated(current%parent)) then + current%parent%left => current%right + end if + current%right%parent => current%parent + current => current%right + else + current => current%parent + if (associated(current)) nullify(current%left) + end if + deallocate(previous) + end do + end do +#if defined(OPENMP) + !$omp end parallel do +#endif +end subroutine psb_s_rb_idx_tree_merge diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index 9322105e..5cf1c72d 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -3213,10 +3213,10 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) if (info == psb_success_) call psb_safe_ab_cpy(b%ja,a%ja,info) if (info == psb_success_) call psb_safe_ab_cpy(b%val,a%val,info) if (info == psb_success_) call psb_realloc(max(nr+1,nc+1),a%irp,info) - + endif - + #if defined(OPENMP) !$OMP PARALLEL default(shared) reduction(max:info) @@ -3246,7 +3246,7 @@ subroutine psb_z_cp_csr_from_coo(a,b,info) #endif call a%set_host() - + end subroutine psb_z_cp_csr_from_coo @@ -3495,7 +3495,7 @@ subroutine psb_z_cp_csr_to_fmt(a,b,info) if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,b%irp,info) call psb_realloc(nz,b%ja,info) call psb_realloc(nz,b%val,info) @@ -3600,7 +3600,7 @@ subroutine psb_z_cp_csr_from_fmt(a,b,info) if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) else - ! Despite the implementation in safe_cpy, it seems better this way + ! Despite the implementation in safe_cpy, it seems better this way call psb_realloc(nr+1,a%irp,info) call psb_realloc(nz,a%ja,info) call psb_realloc(nz,a%val,info) @@ -3654,6 +3654,540 @@ subroutine psb_z_csr_clean_zeros(a, info) call a%set_host() end subroutine psb_z_csr_clean_zeros +#if defined(OPENMP) +subroutine psb_zcsrspspmm(a,b,c,info) + use psb_z_mat_mod + use psb_serial_mod, psb_protect_name => psb_zcsrspspmm + + implicit none + + class(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(out) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb + character(len=20) :: name + integer(psb_ipk_) :: err_act + name='psb_csrspspmm' + call psb_erractionsave(err_act) + info = psb_success_ + + if (a%is_dev()) call a%sync() + if (b%is_dev()) call b%sync() + + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + + if ( mb /= na ) then + write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb + info = psb_err_invalid_matrix_sizes_ + call psb_errpush(info,name) + goto 9999 + endif + + select case(spspmm_impl) + case (spspmm_serial) + ! Estimate number of nonzeros on output. + nza = a%get_nzeros() + nzb = b%get_nzeros() + nzc = 2*(nza+nzb) + call c%allocate(ma,nb,nzc) + + call csr_spspmm(a,b,c,info) + case (spspmm_omp_gustavson) + call spmm_omp_gustavson(a,b,c,info) + case (spspmm_omp_gustavson_1d) + call spmm_omp_gustavson_1d(a,b,c,info) + case (spspmm_serial_rb_tree) + call spmm_serial_rb_tree(a,b,c,info) + case (spspmm_omp_rb_tree) + call spmm_omp_rb_tree(a,b,c,info) + case (spspmm_omp_two_pass) + call spmm_omp_two_pass(a,b,c,info) + case default + write(psb_err_unit,*) 'Unknown spspmm implementation' + ! push error + goto 9999 + end select + + call c%set_asb() + call c%set_host() + + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +contains + + subroutine csr_spspmm(a,b,c,info) + implicit none + type(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(inout) :: c + integer(psb_ipk_), intent(out) :: info + integer(psb_ipk_) :: ma,na,mb,nb + integer(psb_ipk_), allocatable :: irow(:), idxs(:) + complex(psb_dpk_), allocatable :: row(:) + integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, & + & nzc,nnzre, isz, ipb, irwsz, nrc, nze + complex(psb_dpk_) :: cfb + + + info = psb_success_ + ma = a%get_nrows() + na = a%get_ncols() + mb = b%get_nrows() + nb = b%get_ncols() + + nze = min(size(c%val),size(c%ja)) + isz = max(ma,na,mb,nb) + call psb_realloc(isz,row,info) + if (info == 0) call psb_realloc(isz,idxs,info) + if (info == 0) call psb_realloc(isz,irow,info) + if (info /= 0) return + row = dzero + irow = 0 + nzc = 1 + do j = 1,ma + c%irp(j) = nzc + nrc = 0 + do k = a%irp(j), a%irp(j+1)-1 + irw = a%ja(k) + cfb = a%val(k) + irwsz = b%irp(irw+1)-b%irp(irw) + do i = b%irp(irw),b%irp(irw+1)-1 + icl = b%ja(i) + if (irow(icl) 0 ) then + if ((nzc+nrc)>nze) then + nze = max(ma*((nzc+j-1)/j),nzc+2*nrc) + call psb_realloc(nze,c%val,info) + if (info == 0) call psb_realloc(nze,c%ja,info) + if (info /= 0) return + end if + + call psb_qsort(idxs(1:nrc)) + do i=1, nrc + irw = idxs(i) + c%ja(nzc) = irw + c%val(nzc) = row(irw) + row(irw) = dzero + nzc = nzc + 1 + end do + end if + end do + + c%irp(ma+1) = nzc + end subroutine csr_spspmm + + ! gustavson's algorithm using perfect hashing + ! and OpenMP parallelisation + subroutine spmm_omp_gustavson(a,b,c,info) + use omp_lib + + implicit none + type(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + complex(psb_dpk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + + allocate(offsets(omp_get_max_threads())) + !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & + !$omp shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw, jj, j, k) + do irw = 1, ma + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson + + subroutine spmm_omp_gustavson_1d(a,b,c,info) + use omp_lib + + implicit none + type(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + complex(psb_dpk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, & + start_idx, end_idx , blk, blk_size, rwstart,& + rwblk, rwblkrem, nblks + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + allocate(offsets(omp_get_max_threads())) + + nblks = 4 * omp_get_max_threads() + rwblk = (ma / nblks) + rwblkrem = modulo(ma, nblks) + !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw,jj,j,k,rwnz,blk,blk_size,rwstart) + do blk = 0, nblks - 1 + if (blk < rwblkrem) then + blk_size = rwblk + 1 + rwstart = blk * rwblk + blk + 1 + else + blk_size = rwblk + rwstart = blk * rwblk & + + rwblkrem + 1 + end if + do irw = rwstart, rwstart + blk_size - 1 + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson_1d + + subroutine spmm_serial_rb_tree(a,b,c,info) + use psb_rb_idx_tree_mod + implicit none + type(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_z_rb_idx_tree), allocatable :: row_accs(:) + + a_m = a%get_nrows() + b_n = b%get_ncols() + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + + info = 0 + end subroutine spmm_serial_rb_tree + + subroutine spmm_omp_rb_tree(a,b,c,info) + use omp_lib + use psb_rb_idx_tree_mod + implicit none + type(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_z_rb_idx_tree), allocatable :: row_accs(:) + real(8) :: tic, toc + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + !$omp parallel do schedule(static) + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + info = 0 + end subroutine spmm_omp_rb_tree + + subroutine compute_indices(a, b, c, info) + implicit none + type(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer :: full_mat_bound + integer :: row, col, i, j, k, nnz + + full_mat_bound = 0 + !omp parallel do schedule(static) reduction(+:full_mat_bound) + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + j = a%ja(col) + full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j) + end do + end do + !omp end parallel do + + call psb_realloc(a%get_nrows() + 1, c%irp, info) + call psb_realloc(full_mat_bound, c%ja, info) + c%ja = 0 + c%irp(1) = 1 + + nnz = 0 + + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1 + k = 0 + do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i)) + k = k + 1 + end do + if (c%ja(c%irp(row) + k) == 0) then + c%ja(c%irp(row)+k) = b%ja(i) + nnz = nnz + 1 + end if + end do + end do + c%irp(row + 1) = nnz + 1 + call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1)) + end do + + + call psb_realloc(nnz, c%ja, info) + call psb_realloc(nnz, c%val, info) + + c%val = 0 + end subroutine compute_indices + + subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num) + type(psb_z_csr_sparse_mat), intent(inout) :: out_mat + integer(psb_ipk_), intent(in) :: out_row_num + complex(psb_dpk_), intent(in) :: scalar + type(psb_z_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: trgt_row_num + + integer(psb_ipk_) :: i, k, row_start, row_end + + row_start = out_mat%irp(out_row_num) + row_end = out_mat%irp(out_row_num + 1) - 1 + + do i = mat%irp(trgt_row_num), mat%irp(trgt_row_num + 1) - 1 + do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1 + if (out_mat%ja(k) == mat%ja(i)) then + out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i) + exit + end if + end do + end do + + end subroutine direct_scalar_sparse_row_mul + + subroutine spmm_omp_two_pass(a,b,c,info) + use omp_lib + + implicit none + type(psb_z_csr_sparse_mat), intent(in) :: a,b + type(psb_z_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n, row, col + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + call compute_indices(a, b, c, info) + + !$omp parallel do schedule(static) + do row = 1, a_m + do col = a%irp(row), a%irp(row + 1) - 1 + call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + end subroutine spmm_omp_two_pass + +end subroutine psb_zcsrspspmm + +#else + subroutine psb_zcsrspspmm(a,b,c,info) use psb_z_mat_mod use psb_serial_mod, psb_protect_name => psb_zcsrspspmm @@ -3776,7 +4310,7 @@ contains end subroutine csr_spspmm end subroutine psb_zcsrspspmm - +#endif ! ! diff --git a/base/serial/impl/psb_z_rb_idx_tree_impl.F90 b/base/serial/impl/psb_z_rb_idx_tree_impl.F90 new file mode 100644 index 00000000..88ab214b --- /dev/null +++ b/base/serial/impl/psb_z_rb_idx_tree_impl.F90 @@ -0,0 +1,329 @@ +! +! Parallel Sparse BLAS version 3.5 +! (C) Copyright 2006-2018 +! Salvatore Filippone +! Alfredo Buttari +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! +! +! +! package: psb_z_rb_idx_tree_impl +! +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +! Contributed by Dimitri Walther +! +subroutine psb_z_rb_idx_tree_insert(this, idx, val) + use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_insert + implicit none + class(psb_z_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + complex(psb_dpk_), intent(in) :: val + + character(len=22) :: name + type(psb_z_rb_idx_node), pointer :: new_node + type(psb_z_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_insert' + + allocate(new_node) + new_node%idx = idx + new_node%val = val + nullify(new_node%left) + nullify(new_node%right) + nullify(new_node%parent) + new_node%is_red = .true. + + + if (.not. associated(this%root)) then + this%root => new_node + this%nnz = 1 + new_node%is_red = .false. + return + end if + + current => this%root + + do while (associated(current)) + previous => current + + if (idx == current%idx) then + current%val = current%val + val + deallocate(new_node) + return + else if (idx < current%idx) then + current => current%left + else + + current => current%right + end if + end do + + if (idx < previous%idx) then + new_node%parent => previous + previous%left => new_node + else + new_node%parent => previous + previous%right => new_node + end if + + call psb_z_rb_idx_tree_fix_insertion(this, new_node) + + this%nnz = this%nnz + 1 +end subroutine psb_z_rb_idx_tree_insert + +subroutine psb_z_rb_idx_tree_fix_insertion(this, node) + use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_fix_insertion + implicit none + class(psb_z_rb_idx_tree), intent(inout) :: this + type(psb_z_rb_idx_node), pointer, intent(inout) :: node + + character(len=29) :: name + type(psb_z_rb_idx_node), pointer :: current, parent, grand_parent, uncle + name = 'psb_rb_idx_tree_fix_insertion' + + current => node + parent => current%parent + do while(associated(parent) .and. parent%is_red) + ! grand parent exist because root can't be red + grand_parent => parent%parent + if (parent%idx < grand_parent%idx) then + uncle => grand_parent%right + else + uncle => grand_parent%left + end if + + if (associated(uncle) .and. uncle%is_red) then + parent%is_red = .false. + uncle%is_red = .false. + grand_parent%is_red = .true. + current => grand_parent + parent => current%parent + + ! Left-Left case + else if (current%idx < parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_z_rb_idx_tree_rotate_right(grand_parent) + call psb_z_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Left-Right case + else if (current%idx > parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_z_rb_idx_tree_rotate_left(parent) + call psb_z_rb_idx_tree_rotate_right(grand_parent) + call psb_z_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + ! Right-Right case + else if (current%idx > parent%idx .and. & + parent%idx > grand_parent%idx) then + call psb_z_rb_idx_tree_rotate_left(grand_parent) + call psb_z_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Right-Left case + else + call psb_z_rb_idx_tree_rotate_right(parent) + call psb_z_rb_idx_tree_rotate_left(grand_parent) + call psb_z_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + end if + end do + + this%root%is_red = .false. +end subroutine psb_z_rb_idx_tree_fix_insertion + +subroutine psb_z_rb_idx_tree_swap_colors(n1, n2) + use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_swap_colors + implicit none + type(psb_z_rb_idx_node), pointer, intent(inout) :: n1, n2 + + character(len=27) :: name + logical :: tmp + name='psb_rb_idx_tree_swap_colors' + + tmp = n1%is_red + n1%is_red = n2%is_red + n2%is_red = tmp +end subroutine psb_z_rb_idx_tree_swap_colors + +subroutine psb_z_rb_idx_tree_rotate_right(node) + use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_rotate_right + implicit none + type(psb_z_rb_idx_node), pointer, intent(inout) :: node + + character(len=28) :: name + type(psb_z_rb_idx_node), pointer :: l, lr + name='psb_rb_idx_tree_rotate_right' + + if (.not. associated(node%left)) return + + l => node%left + lr => l%right + node%left => lr + + if (associated(lr)) lr%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => l + else + node%parent%right => l + end if + end if + + l%parent => node%parent + node%parent => l + + l%right => node +end subroutine psb_z_rb_idx_tree_rotate_right + +subroutine psb_z_rb_idx_tree_rotate_left(node) + use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_rotate_left + implicit none + type(psb_z_rb_idx_node), pointer, intent(inout) :: node + + character(len=27) :: name + type(psb_z_rb_idx_node), pointer :: r, rl + name='psb_rb_idx_tree_rotate_left' + + if (.not. associated(node%right)) return + + r => node%right + rl => r%left + node%right => rl + + if (associated(rl)) rl%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => r + else + node%parent%right => r + end if + end if + + r%parent => node%parent + node%parent => r + + r%left => node +end subroutine psb_z_rb_idx_tree_rotate_left + +subroutine psb_z_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_scalar_sparse_row_mul + use psb_z_csr_mat_mod, only : psb_z_csr_sparse_mat + implicit none + type(psb_z_rb_idx_tree), intent(inout) :: tree + complex(psb_dpk_), intent(in) :: scalar + type(psb_z_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + + character(len=37) :: name + integer(psb_ipk_) :: i + name='psb_rb_idx_tree_scalar_sparse_row_mul' + + do i = mat%irp(row_num), mat%irp(row_num + 1) - 1 + call tree%insert(mat%ja(i),scalar * mat%val(i)) + end do + +end subroutine psb_z_rb_idx_tree_scalar_sparse_row_mul + +subroutine psb_z_rb_idx_tree_merge(trees, mat) +#if defined(OPENMP) + use omp_lib +#endif + use psb_realloc_mod + use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_merge + use psb_z_csr_mat_mod, only : psb_z_csr_sparse_mat + implicit none + type(psb_z_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_z_csr_sparse_mat), intent(inout) :: mat + + character(len=21) :: name + integer(psb_ipk_) :: i, j, rows, info, nnz + type(psb_z_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_merge' + + rows = size(trees) + + mat%irp(1) = 1 + + do i=1, rows + mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz + end do + + nnz = mat%irp(rows + 1) + call psb_realloc(nnz, mat%val, info) + call psb_realloc(nnz, mat%ja, info) + +#if defined(OPENMP) + !$omp parallel do schedule(static), private(current, previous, j) +#endif + do i = 1, size(trees) + j = 0 + current => trees(i)%root + do while(associated(current)) + ! go to the left-most node + do while(associated(current%left)) + current => current%left + end do + mat%val(j + mat%irp(i)) = current%val + mat%ja(j + mat%irp(i)) = current%idx + j = j + 1 + + previous => current + if (associated(current%right)) then + if (associated(current%parent)) then + current%parent%left => current%right + end if + current%right%parent => current%parent + current => current%right + else + current => current%parent + if (associated(current)) nullify(current%left) + end if + deallocate(previous) + end do + end do +#if defined(OPENMP) + !$omp end parallel do +#endif +end subroutine psb_z_rb_idx_tree_merge diff --git a/base/serial/sort/Makefile b/base/serial/sort/Makefile index 2ef6420a..ff8fd620 100644 --- a/base/serial/sort/Makefile +++ b/base/serial/sort/Makefile @@ -3,7 +3,6 @@ include ../../../Make.inc # # The object files # -BOBJS=psi_lcx_mod.o psi_alcx_mod.o psi_acx_mod.o IOBJS=psb_m_hsort_impl.o psb_m_isort_impl.o psb_m_msort_impl.o psb_m_qsort_impl.o LOBJS=psb_e_hsort_impl.o psb_e_isort_impl.o psb_e_msort_impl.o psb_e_qsort_impl.o SOBJS=psb_s_hsort_impl.o psb_s_isort_impl.o psb_s_msort_impl.o psb_s_qsort_impl.o @@ -11,7 +10,7 @@ DOBJS=psb_d_hsort_impl.o psb_d_isort_impl.o psb_d_msort_impl.o psb_d_qsort_impl. COBJS=psb_c_hsort_impl.o psb_c_isort_impl.o psb_c_msort_impl.o psb_c_qsort_impl.o ZOBJS=psb_z_hsort_impl.o psb_z_isort_impl.o psb_z_msort_impl.o psb_z_qsort_impl.o -OBJS=$(BOBJS) $(SOBJS) $(DOBJS) $(COBJS) $(ZOBJS) $(IOBJS) $(LOBJS) +OBJS=$(SOBJS) $(DOBJS) $(COBJS) $(ZOBJS) $(IOBJS) $(LOBJS) # # Where the library should go, and how it is called. @@ -35,7 +34,6 @@ lib: objs # A bit excessive, but safe $(OBJS): $(MODDIR)/psb_base_mod.o -$(IOBJS) $(LOBJS) $(SOBJS) $(DOBJS) $(COBJS) $(ZOBJS): $(BOBJS) clean: cleanobjs veryclean: cleanobjs diff --git a/base/tools/psb_cd_inloc.f90 b/base/tools/psb_cd_inloc.f90 index e8b9578e..30c3e9b3 100644 --- a/base/tools/psb_cd_inloc.f90 +++ b/base/tools/psb_cd_inloc.f90 @@ -176,7 +176,8 @@ subroutine psb_cd_inloc(v, ctxt, desc, info, globalcheck,idx,usehash) end if tmpgidx = 0 flag_ = 1 - do i=1,loc_row + !$omp parallel do private(i) + do i=1,loc_row if ((v(i)<1).or.(v(i)>m)) then info = psb_err_entry_out_of_bounds_ l_err(1) = i @@ -215,6 +216,7 @@ subroutine psb_cd_inloc(v, ctxt, desc, info, globalcheck,idx,usehash) novrl = 0 norphan = 0 npr_ov = 0 + !$omp parallel do private(i) do i=1,loc_row if ((v(i)<1).or.(v(i)>m)) then info = psb_err_entry_out_of_bounds_ @@ -222,7 +224,6 @@ subroutine psb_cd_inloc(v, ctxt, desc, info, globalcheck,idx,usehash) l_err(2) = v(i) l_err(3) = loc_row l_err(4) = m - exit endif vl(i) = v(i) end do diff --git a/util/Makefile b/util/Makefile index 3f087cbd..9b70855f 100644 --- a/util/Makefile +++ b/util/Makefile @@ -11,7 +11,7 @@ BASEOBJS= psb_blockpart_mod.o psb_metispart_mod.o psb_partidx_mod.o \ psb_hbio_mod.o psb_mmio_mod.o psb_mat_dist_mod.o \ psb_s_mat_dist_mod.o psb_d_mat_dist_mod.o psb_c_mat_dist_mod.o psb_z_mat_dist_mod.o \ psb_renum_mod.o psb_gps_mod.o \ - psb_s_renum_mod.o psb_d_renum_mod.o psb_c_renum_mod.o psb_z_renum_mod.o + psb_s_renum_mod.o psb_d_renum_mod.o psb_c_renum_mod.o psb_z_renum_mod.o IMPLOBJS= psb_s_hbio_impl.o psb_d_hbio_impl.o \ psb_c_hbio_impl.o psb_z_hbio_impl.o \ psb_s_mmio_impl.o psb_d_mmio_impl.o \