Merge branch 'dev-openmp' into development

master v3.8.1
sfilippone 1 year ago
commit d1bf46b0b1

@ -121,6 +121,7 @@ Salvatore Filippone
Contributors (roughly reverse cronological order): Contributors (roughly reverse cronological order):
Dimitri Walther
Andea Di Iorio Andea Di Iorio
Stefano Petrilli Stefano Petrilli
Soren Rasmussen Soren Rasmussen

@ -60,6 +60,11 @@ SERIAL_MODS=serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o \
auxil/psb_d_hsort_x_mod.o \ auxil/psb_d_hsort_x_mod.o \
auxil/psb_c_hsort_x_mod.o \ auxil/psb_c_hsort_x_mod.o \
auxil/psb_z_hsort_x_mod.o \ auxil/psb_z_hsort_x_mod.o \
auxil/psb_s_rb_idx_tree_mod.o \
auxil/psb_d_rb_idx_tree_mod.o \
auxil/psb_c_rb_idx_tree_mod.o \
auxil/psb_z_rb_idx_tree_mod.o \
auxil/psb_rb_idx_tree_mod.o \
serial/psb_base_mat_mod.o serial/psb_mat_mod.o\ serial/psb_base_mat_mod.o serial/psb_mat_mod.o\
serial/psb_s_base_mat_mod.o serial/psb_s_csr_mat_mod.o serial/psb_s_csc_mat_mod.o serial/psb_s_mat_mod.o \ serial/psb_s_base_mat_mod.o serial/psb_s_csr_mat_mod.o serial/psb_s_csc_mat_mod.o serial/psb_s_mat_mod.o \
serial/psb_d_base_mat_mod.o serial/psb_d_csr_mat_mod.o serial/psb_d_csc_mat_mod.o serial/psb_d_mat_mod.o \ serial/psb_d_base_mat_mod.o serial/psb_d_csr_mat_mod.o serial/psb_d_csc_mat_mod.o serial/psb_d_mat_mod.o \
@ -165,6 +170,7 @@ penv/psi_d_collective_mod.o penv/psi_c_collective_mod.o penv/psi_z_collective_m
penv/psi_d_p2p_mod.o penv/psi_c_p2p_mod.o penv/psi_z_p2p_mod.o penv/psi_d_p2p_mod.o penv/psi_c_p2p_mod.o penv/psi_z_p2p_mod.o
auxil/psi_acx_mod.o auxil/psi_alcx_mod.o auxil/psi_lcx_mod.o \
auxil/psb_string_mod.o auxil/psb_m_realloc_mod.o auxil/psb_e_realloc_mod.o auxil/psb_s_realloc_mod.o \ auxil/psb_string_mod.o auxil/psb_m_realloc_mod.o auxil/psb_e_realloc_mod.o auxil/psb_s_realloc_mod.o \
auxil/psb_d_realloc_mod.o auxil/psb_c_realloc_mod.o auxil/psb_z_realloc_mod.o \ auxil/psb_d_realloc_mod.o auxil/psb_c_realloc_mod.o auxil/psb_z_realloc_mod.o \
desc/psb_desc_const_mod.o psi_penv_mod.o: psb_const_mod.o desc/psb_desc_const_mod.o psi_penv_mod.o: psb_const_mod.o
@ -174,6 +180,7 @@ desc/psb_indx_map_mod.o desc/psb_hash_mod.o: psb_realloc_mod.o psb_const_mod.o
auxil/psb_i_sort_mod.o auxil/psb_s_sort_mod.o auxil/psb_d_sort_mod.o auxil/psb_c_sort_mod.o auxil/psb_z_sort_mod.o \ auxil/psb_i_sort_mod.o auxil/psb_s_sort_mod.o auxil/psb_d_sort_mod.o auxil/psb_c_sort_mod.o auxil/psb_z_sort_mod.o \
auxil/psb_ip_reord_mod.o auxil/psi_serial_mod.o auxil/psb_sort_mod.o: $(BASIC_MODS) auxil/psb_ip_reord_mod.o auxil/psi_serial_mod.o auxil/psb_sort_mod.o: $(BASIC_MODS)
auxil/psb_sort_mod.o: auxil/psb_m_hsort_mod.o auxil/psb_m_isort_mod.o \ auxil/psb_sort_mod.o: auxil/psb_m_hsort_mod.o auxil/psb_m_isort_mod.o \
auxil/psb_m_msort_mod.o auxil/psb_m_qsort_mod.o \ auxil/psb_m_msort_mod.o auxil/psb_m_qsort_mod.o \
auxil/psb_e_hsort_mod.o auxil/psb_e_isort_mod.o \ auxil/psb_e_hsort_mod.o auxil/psb_e_isort_mod.o \
@ -192,7 +199,8 @@ auxil/psb_sort_mod.o: auxil/psb_m_hsort_mod.o auxil/psb_m_isort_mod.o \
auxil/psb_d_hsort_x_mod.o \ auxil/psb_d_hsort_x_mod.o \
auxil/psb_c_hsort_x_mod.o \ auxil/psb_c_hsort_x_mod.o \
auxil/psb_z_hsort_x_mod.o \ auxil/psb_z_hsort_x_mod.o \
auxil/psb_ip_reord_mod.o auxil/psi_serial_mod.o auxil/psb_ip_reord_mod.o \
auxil/psi_serial_mod.o
auxil/psb_m_hsort_mod.o auxil/psb_m_isort_mod.o \ auxil/psb_m_hsort_mod.o auxil/psb_m_isort_mod.o \
auxil/psb_m_msort_mod.o auxil/psb_m_qsort_mod.o \ auxil/psb_m_msort_mod.o auxil/psb_m_qsort_mod.o \
@ -225,7 +233,10 @@ auxil/psb_c_hsort_x_mod.o: auxil/psb_c_hsort_mod.o
auxil/psb_z_hsort_x_mod.o: auxil/psb_z_hsort_mod.o auxil/psb_z_hsort_x_mod.o: auxil/psb_z_hsort_mod.o
auxil/psi_serial_mod.o: auxil/psi_m_serial_mod.o auxil/psi_e_serial_mod.o \ auxil/psi_serial_mod.o: auxil/psi_m_serial_mod.o auxil/psi_e_serial_mod.o \
auxil/psi_s_serial_mod.o auxil/psi_d_serial_mod.o auxil/psi_c_serial_mod.o auxil/psi_z_serial_mod.o auxil/psi_s_serial_mod.o auxil/psi_d_serial_mod.o\
auxil/psi_c_serial_mod.o auxil/psi_z_serial_mod.o \
auxil/psi_acx_mod.o auxil/psi_alcx_mod.o auxil/psi_lcx_mod.o
auxil/psi_m_serial_mod.o auxil/psi_e_serial_mod.o auxil/psi_s_serial_mod.o auxil/psi_d_serial_mod.o auxil/psi_c_serial_mod.o auxil/psi_z_serial_mod.o: psb_const_mod.o auxil/psi_m_serial_mod.o auxil/psi_e_serial_mod.o auxil/psi_s_serial_mod.o auxil/psi_d_serial_mod.o auxil/psi_c_serial_mod.o auxil/psi_z_serial_mod.o: psb_const_mod.o
auxil/psb_ip_reord_mod.o: auxil/psb_m_ip_reord_mod.o auxil/psb_e_ip_reord_mod.o \ auxil/psb_ip_reord_mod.o: auxil/psb_m_ip_reord_mod.o auxil/psb_e_ip_reord_mod.o \
@ -270,6 +281,12 @@ serial/psb_z_vect_mod.o: serial/psb_z_base_vect_mod.o serial/psb_i_vect_mod.o
serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o serial/psb_c_serial_mod.o serial/psb_z_serial_mod.o: serial/psb_mat_mod.o auxil/psb_string_mod.o auxil/psb_sort_mod.o auxil/psi_serial_mod.o serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o serial/psb_c_serial_mod.o serial/psb_z_serial_mod.o: serial/psb_mat_mod.o auxil/psb_string_mod.o auxil/psb_sort_mod.o auxil/psi_serial_mod.o
serial/psb_vect_mod.o: serial/psb_i_vect_mod.o serial/psb_l_vect_mod.o serial/psb_d_vect_mod.o serial/psb_s_vect_mod.o serial/psb_c_vect_mod.o serial/psb_z_vect_mod.o serial/psb_vect_mod.o: serial/psb_i_vect_mod.o serial/psb_l_vect_mod.o serial/psb_d_vect_mod.o serial/psb_s_vect_mod.o serial/psb_c_vect_mod.o serial/psb_z_vect_mod.o
auxil/psb_s_rb_idx_tree_mod.o: serial/psb_s_csr_mat_mod.o psb_realloc_mod.o
auxil/psb_d_rb_idx_tree_mod.o: serial/psb_d_csr_mat_mod.o psb_realloc_mod.o
auxil/psb_c_rb_idx_tree_mod.o: serial/psb_c_csr_mat_mod.o psb_realloc_mod.o
auxil/psb_z_rb_idx_tree_mod.o: serial/psb_z_csr_mat_mod.o psb_realloc_mod.o
auxil/psb_rb_idx_tree_mod.o: auxil/psb_s_rb_idx_tree_mod.o auxil/psb_d_rb_idx_tree_mod.o auxil/psb_c_rb_idx_tree_mod.o auxil/psb_z_rb_idx_tree_mod.o
error.o psb_realloc_mod.o: psb_error_mod.o error.o psb_realloc_mod.o: psb_error_mod.o
psb_error_impl.o: psb_penv_mod.o psb_error_impl.o: psb_penv_mod.o
psb_timers_mod.o: psb_penv_mod.o psb_const_mod.o psb_realloc_mod.o psb_error_mod.o psb_timers_mod.o: psb_penv_mod.o psb_const_mod.o psb_realloc_mod.o psb_error_mod.o

@ -0,0 +1,128 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
!
! package: psb_c_rb_idx_tree_mod
!
! Red black tree implementation ordered by index
!
! Each node contains and index and a double precision value
!
! The tree should always be well balanced
!
! inserting a node with an existing index will
! add up the new value to the old one
! Contributed by Dimitri Walther
!
module psb_c_rb_idx_tree_mod
use psb_const_mod
implicit none
type :: psb_c_rb_idx_node
integer(psb_ipk_) :: idx
complex(psb_spk_) :: val
type(psb_c_rb_idx_node), pointer :: left, right, parent
logical :: is_red
end type psb_c_rb_idx_node
type :: psb_c_rb_idx_tree
type(psb_c_rb_idx_node), pointer :: root
integer(psb_ipk_) :: nnz
contains
procedure :: insert => psb_c_rb_idx_tree_insert
end type psb_c_rb_idx_tree
interface psb_rb_idx_tree_insert
subroutine psb_c_rb_idx_tree_insert(this, idx, val)
import :: psb_ipk_, psb_spk_, psb_c_rb_idx_tree
implicit none
class(psb_c_rb_idx_tree), intent(inout) :: this
integer(psb_ipk_), intent(in) :: idx
complex(psb_spk_), intent(in) :: val
end subroutine psb_c_rb_idx_tree_insert
end interface psb_rb_idx_tree_insert
interface psb_rb_idx_tree_scalar_sparse_row_mul
subroutine psb_c_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num)
use psb_c_csr_mat_mod, only : psb_c_csr_sparse_mat
import :: psb_ipk_, psb_spk_, psb_c_rb_idx_tree
implicit none
type(psb_c_rb_idx_tree), intent(inout) :: tree
complex(psb_spk_), intent(in) :: scalar
type(psb_c_csr_sparse_mat), intent(in) :: mat
integer(psb_ipk_), intent(in) :: row_num
end subroutine psb_c_rb_idx_tree_scalar_sparse_row_mul
end interface psb_rb_idx_tree_scalar_sparse_row_mul
interface psb_rb_idx_tree_merge
subroutine psb_c_rb_idx_tree_merge(trees, mat)
use psb_c_csr_mat_mod, only : psb_c_csr_sparse_mat
import :: psb_c_rb_idx_tree
type(psb_c_rb_idx_tree), allocatable, intent(inout) :: trees(:)
type(psb_c_csr_sparse_mat), intent(inout) :: mat
end subroutine psb_c_rb_idx_tree_merge
end interface psb_rb_idx_tree_merge
interface psb_rb_idx_tree_fix_insertion
subroutine psb_c_rb_idx_tree_fix_insertion(this, node)
import :: psb_c_rb_idx_tree, psb_c_rb_idx_node
implicit none
class(psb_c_rb_idx_tree), intent(inout) :: this
type(psb_c_rb_idx_node), pointer, intent(inout) :: node
end subroutine psb_c_rb_idx_tree_fix_insertion
end interface psb_rb_idx_tree_fix_insertion
interface psb_rb_idx_tree_swap_colors
subroutine psb_c_rb_idx_tree_swap_colors(n1, n2)
import :: psb_c_rb_idx_node
implicit none
type(psb_c_rb_idx_node), pointer, intent(inout) :: n1, n2
end subroutine psb_c_rb_idx_tree_swap_colors
end interface psb_rb_idx_tree_swap_colors
interface psb_rb_idx_tree_rotate_right
subroutine psb_c_rb_idx_tree_rotate_right(node)
import :: psb_c_rb_idx_node
implicit none
type(psb_c_rb_idx_node), pointer, intent(inout) :: node
end subroutine psb_c_rb_idx_tree_rotate_right
end interface psb_rb_idx_tree_rotate_right
interface psb_rb_idx_tree_rotate_left
subroutine psb_c_rb_idx_tree_rotate_left(node)
import :: psb_c_rb_idx_node
implicit none
type(psb_c_rb_idx_node), pointer, intent(inout) :: node
end subroutine psb_c_rb_idx_tree_rotate_left
end interface psb_rb_idx_tree_rotate_left
end module psb_c_rb_idx_tree_mod

@ -0,0 +1,128 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
!
! package: psb_d_rb_idx_tree_mod
!
! Red black tree implementation ordered by index
!
! Each node contains and index and a double precision value
!
! The tree should always be well balanced
!
! inserting a node with an existing index will
! add up the new value to the old one
! Contributed by Dimitri Walther
!
module psb_d_rb_idx_tree_mod
use psb_const_mod
implicit none
type :: psb_d_rb_idx_node
integer(psb_ipk_) :: idx
real(psb_dpk_) :: val
type(psb_d_rb_idx_node), pointer :: left, right, parent
logical :: is_red
end type psb_d_rb_idx_node
type :: psb_d_rb_idx_tree
type(psb_d_rb_idx_node), pointer :: root
integer(psb_ipk_) :: nnz
contains
procedure :: insert => psb_d_rb_idx_tree_insert
end type psb_d_rb_idx_tree
interface psb_rb_idx_tree_insert
subroutine psb_d_rb_idx_tree_insert(this, idx, val)
import :: psb_ipk_, psb_dpk_, psb_d_rb_idx_tree
implicit none
class(psb_d_rb_idx_tree), intent(inout) :: this
integer(psb_ipk_), intent(in) :: idx
real(psb_dpk_), intent(in) :: val
end subroutine psb_d_rb_idx_tree_insert
end interface psb_rb_idx_tree_insert
interface psb_rb_idx_tree_scalar_sparse_row_mul
subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num)
use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat
import :: psb_ipk_, psb_dpk_, psb_d_rb_idx_tree
implicit none
type(psb_d_rb_idx_tree), intent(inout) :: tree
real(psb_dpk_), intent(in) :: scalar
type(psb_d_csr_sparse_mat), intent(in) :: mat
integer(psb_ipk_), intent(in) :: row_num
end subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul
end interface psb_rb_idx_tree_scalar_sparse_row_mul
interface psb_rb_idx_tree_merge
subroutine psb_d_rb_idx_tree_merge(trees, mat)
use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat
import :: psb_d_rb_idx_tree
type(psb_d_rb_idx_tree), allocatable, intent(inout) :: trees(:)
type(psb_d_csr_sparse_mat), intent(inout) :: mat
end subroutine psb_d_rb_idx_tree_merge
end interface psb_rb_idx_tree_merge
interface psb_rb_idx_tree_fix_insertion
subroutine psb_d_rb_idx_tree_fix_insertion(this, node)
import :: psb_d_rb_idx_tree, psb_d_rb_idx_node
implicit none
class(psb_d_rb_idx_tree), intent(inout) :: this
type(psb_d_rb_idx_node), pointer, intent(inout) :: node
end subroutine psb_d_rb_idx_tree_fix_insertion
end interface psb_rb_idx_tree_fix_insertion
interface psb_rb_idx_tree_swap_colors
subroutine psb_d_rb_idx_tree_swap_colors(n1, n2)
import :: psb_d_rb_idx_node
implicit none
type(psb_d_rb_idx_node), pointer, intent(inout) :: n1, n2
end subroutine psb_d_rb_idx_tree_swap_colors
end interface psb_rb_idx_tree_swap_colors
interface psb_rb_idx_tree_rotate_right
subroutine psb_d_rb_idx_tree_rotate_right(node)
import :: psb_d_rb_idx_node
implicit none
type(psb_d_rb_idx_node), pointer, intent(inout) :: node
end subroutine psb_d_rb_idx_tree_rotate_right
end interface psb_rb_idx_tree_rotate_right
interface psb_rb_idx_tree_rotate_left
subroutine psb_d_rb_idx_tree_rotate_left(node)
import :: psb_d_rb_idx_node
implicit none
type(psb_d_rb_idx_node), pointer, intent(inout) :: node
end subroutine psb_d_rb_idx_tree_rotate_left
end interface psb_rb_idx_tree_rotate_left
end module psb_d_rb_idx_tree_mod

@ -0,0 +1,52 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
!
! package: psb_rb_idx_tree_mod
!
! Red black tree implementation ordered by index
!
! Each node contains and index and a double precision value
!
! The tree should always be well balanced
!
! inserting a node with an existing index will
! add up the new value to the old one
! Contributed by Dimitri Walther
!
module psb_rb_idx_tree_mod
use psb_const_mod
use psb_s_rb_idx_tree_mod
use psb_d_rb_idx_tree_mod
use psb_c_rb_idx_tree_mod
use psb_z_rb_idx_tree_mod
end module psb_rb_idx_tree_mod

@ -0,0 +1,128 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
!
! package: psb_s_rb_idx_tree_mod
!
! Red black tree implementation ordered by index
!
! Each node contains and index and a double precision value
!
! The tree should always be well balanced
!
! inserting a node with an existing index will
! add up the new value to the old one
! Contributed by Dimitri Walther
!
module psb_s_rb_idx_tree_mod
use psb_const_mod
implicit none
type :: psb_s_rb_idx_node
integer(psb_ipk_) :: idx
real(psb_spk_) :: val
type(psb_s_rb_idx_node), pointer :: left, right, parent
logical :: is_red
end type psb_s_rb_idx_node
type :: psb_s_rb_idx_tree
type(psb_s_rb_idx_node), pointer :: root
integer(psb_ipk_) :: nnz
contains
procedure :: insert => psb_s_rb_idx_tree_insert
end type psb_s_rb_idx_tree
interface psb_rb_idx_tree_insert
subroutine psb_s_rb_idx_tree_insert(this, idx, val)
import :: psb_ipk_, psb_spk_, psb_s_rb_idx_tree
implicit none
class(psb_s_rb_idx_tree), intent(inout) :: this
integer(psb_ipk_), intent(in) :: idx
real(psb_spk_), intent(in) :: val
end subroutine psb_s_rb_idx_tree_insert
end interface psb_rb_idx_tree_insert
interface psb_rb_idx_tree_scalar_sparse_row_mul
subroutine psb_s_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num)
use psb_s_csr_mat_mod, only : psb_s_csr_sparse_mat
import :: psb_ipk_, psb_spk_, psb_s_rb_idx_tree
implicit none
type(psb_s_rb_idx_tree), intent(inout) :: tree
real(psb_spk_), intent(in) :: scalar
type(psb_s_csr_sparse_mat), intent(in) :: mat
integer(psb_ipk_), intent(in) :: row_num
end subroutine psb_s_rb_idx_tree_scalar_sparse_row_mul
end interface psb_rb_idx_tree_scalar_sparse_row_mul
interface psb_rb_idx_tree_merge
subroutine psb_s_rb_idx_tree_merge(trees, mat)
use psb_s_csr_mat_mod, only : psb_s_csr_sparse_mat
import :: psb_s_rb_idx_tree
type(psb_s_rb_idx_tree), allocatable, intent(inout) :: trees(:)
type(psb_s_csr_sparse_mat), intent(inout) :: mat
end subroutine psb_s_rb_idx_tree_merge
end interface psb_rb_idx_tree_merge
interface psb_rb_idx_tree_fix_insertion
subroutine psb_s_rb_idx_tree_fix_insertion(this, node)
import :: psb_s_rb_idx_tree, psb_s_rb_idx_node
implicit none
class(psb_s_rb_idx_tree), intent(inout) :: this
type(psb_s_rb_idx_node), pointer, intent(inout) :: node
end subroutine psb_s_rb_idx_tree_fix_insertion
end interface psb_rb_idx_tree_fix_insertion
interface psb_rb_idx_tree_swap_colors
subroutine psb_s_rb_idx_tree_swap_colors(n1, n2)
import :: psb_s_rb_idx_node
implicit none
type(psb_s_rb_idx_node), pointer, intent(inout) :: n1, n2
end subroutine psb_s_rb_idx_tree_swap_colors
end interface psb_rb_idx_tree_swap_colors
interface psb_rb_idx_tree_rotate_right
subroutine psb_s_rb_idx_tree_rotate_right(node)
import :: psb_s_rb_idx_node
implicit none
type(psb_s_rb_idx_node), pointer, intent(inout) :: node
end subroutine psb_s_rb_idx_tree_rotate_right
end interface psb_rb_idx_tree_rotate_right
interface psb_rb_idx_tree_rotate_left
subroutine psb_s_rb_idx_tree_rotate_left(node)
import :: psb_s_rb_idx_node
implicit none
type(psb_s_rb_idx_node), pointer, intent(inout) :: node
end subroutine psb_s_rb_idx_tree_rotate_left
end interface psb_rb_idx_tree_rotate_left
end module psb_s_rb_idx_tree_mod

@ -45,6 +45,7 @@
module psb_sort_mod module psb_sort_mod
use psb_const_mod use psb_const_mod
use psb_ip_reord_mod use psb_ip_reord_mod
use psi_serial_mod
use psb_m_hsort_mod use psb_m_hsort_mod
use psb_m_isort_mod use psb_m_isort_mod

@ -0,0 +1,128 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
!
! package: psb_z_rb_idx_tree_mod
!
! Red black tree implementation ordered by index
!
! Each node contains and index and a double precision value
!
! The tree should always be well balanced
!
! inserting a node with an existing index will
! add up the new value to the old one
! Contributed by Dimitri Walther
!
module psb_z_rb_idx_tree_mod
use psb_const_mod
implicit none
type :: psb_z_rb_idx_node
integer(psb_ipk_) :: idx
complex(psb_dpk_) :: val
type(psb_z_rb_idx_node), pointer :: left, right, parent
logical :: is_red
end type psb_z_rb_idx_node
type :: psb_z_rb_idx_tree
type(psb_z_rb_idx_node), pointer :: root
integer(psb_ipk_) :: nnz
contains
procedure :: insert => psb_z_rb_idx_tree_insert
end type psb_z_rb_idx_tree
interface psb_rb_idx_tree_insert
subroutine psb_z_rb_idx_tree_insert(this, idx, val)
import :: psb_ipk_, psb_dpk_, psb_z_rb_idx_tree
implicit none
class(psb_z_rb_idx_tree), intent(inout) :: this
integer(psb_ipk_), intent(in) :: idx
complex(psb_dpk_), intent(in) :: val
end subroutine psb_z_rb_idx_tree_insert
end interface psb_rb_idx_tree_insert
interface psb_rb_idx_tree_scalar_sparse_row_mul
subroutine psb_z_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num)
use psb_z_csr_mat_mod, only : psb_z_csr_sparse_mat
import :: psb_ipk_, psb_dpk_, psb_z_rb_idx_tree
implicit none
type(psb_z_rb_idx_tree), intent(inout) :: tree
complex(psb_dpk_), intent(in) :: scalar
type(psb_z_csr_sparse_mat), intent(in) :: mat
integer(psb_ipk_), intent(in) :: row_num
end subroutine psb_z_rb_idx_tree_scalar_sparse_row_mul
end interface psb_rb_idx_tree_scalar_sparse_row_mul
interface psb_rb_idx_tree_merge
subroutine psb_z_rb_idx_tree_merge(trees, mat)
use psb_z_csr_mat_mod, only : psb_z_csr_sparse_mat
import :: psb_z_rb_idx_tree
type(psb_z_rb_idx_tree), allocatable, intent(inout) :: trees(:)
type(psb_z_csr_sparse_mat), intent(inout) :: mat
end subroutine psb_z_rb_idx_tree_merge
end interface psb_rb_idx_tree_merge
interface psb_rb_idx_tree_fix_insertion
subroutine psb_z_rb_idx_tree_fix_insertion(this, node)
import :: psb_z_rb_idx_tree, psb_z_rb_idx_node
implicit none
class(psb_z_rb_idx_tree), intent(inout) :: this
type(psb_z_rb_idx_node), pointer, intent(inout) :: node
end subroutine psb_z_rb_idx_tree_fix_insertion
end interface psb_rb_idx_tree_fix_insertion
interface psb_rb_idx_tree_swap_colors
subroutine psb_z_rb_idx_tree_swap_colors(n1, n2)
import :: psb_z_rb_idx_node
implicit none
type(psb_z_rb_idx_node), pointer, intent(inout) :: n1, n2
end subroutine psb_z_rb_idx_tree_swap_colors
end interface psb_rb_idx_tree_swap_colors
interface psb_rb_idx_tree_rotate_right
subroutine psb_z_rb_idx_tree_rotate_right(node)
import :: psb_z_rb_idx_node
implicit none
type(psb_z_rb_idx_node), pointer, intent(inout) :: node
end subroutine psb_z_rb_idx_tree_rotate_right
end interface psb_rb_idx_tree_rotate_right
interface psb_rb_idx_tree_rotate_left
subroutine psb_z_rb_idx_tree_rotate_left(node)
import :: psb_z_rb_idx_node
implicit none
type(psb_z_rb_idx_node), pointer, intent(inout) :: node
end subroutine psb_z_rb_idx_tree_rotate_left
end interface psb_rb_idx_tree_rotate_left
end module psb_z_rb_idx_tree_mod

@ -458,7 +458,7 @@ contains
integer(psb_ipk_), intent(out) :: val, info integer(psb_ipk_), intent(out) :: val, info
integer(psb_ipk_) :: hsize,hmask, hk, hd integer(psb_ipk_) :: hsize,hmask, hk, hd
logical :: redo
info = HashOK info = HashOK
hsize = hash%hsize hsize = hash%hsize
hmask = hash%hmask hmask = hash%hmask
@ -483,6 +483,7 @@ contains
info = HashDuplicate info = HashDuplicate
return return
end if end if
redo = .false.
!$OMP CRITICAL !$OMP CRITICAL
if (hash%table(hk,1) == HashFreeEntry) then if (hash%table(hk,1) == HashFreeEntry) then
if (hash%nk == hash%hsize -1) then if (hash%nk == hash%hsize -1) then
@ -497,8 +498,9 @@ contains
info = HashOutOfMemory info = HashOutOfMemory
!return !return
else else
call psb_hash_searchinskey(key,val,nextval,hash,info) redo = .true.
!return !!$ call psb_hash_searchinskey(key,val,nextval,hash,info)
!!$ return
end if end if
else else
hash%nk = hash%nk + 1 hash%nk = hash%nk + 1
@ -509,6 +511,7 @@ contains
end if end if
end if end if
!$OMP END CRITICAL !$OMP END CRITICAL
if (redo) call psb_hash_searchinskey(key,val,nextval,hash,info)
if (info /= HashOk) return if (info /= HashOk) return
if (val > 0) return if (val > 0) return
hk = hk - hd hk = hk - hd

@ -73,6 +73,19 @@ module psb_base_mat_mod
use psb_const_mod use psb_const_mod
use psi_serial_mod use psi_serial_mod
! Algs for SPSPMM
integer(psb_ipk_), parameter :: spspmm_serial = 0
integer(psb_ipk_), parameter :: spspmm_omp_gustavson = 1
integer(psb_ipk_), parameter :: spspmm_omp_gustavson_1d = 2
integer(psb_ipk_), parameter :: spspmm_serial_rb_tree = 3
integer(psb_ipk_), parameter :: spspmm_omp_rb_tree = 4
integer(psb_ipk_), parameter :: spspmm_omp_two_pass = 5
#if defined(OPENMP)
integer(psb_ipk_), save :: spspmm_impl = spspmm_omp_gustavson
#else
integer(psb_ipk_), save :: spspmm_impl = spspmm_serial
#endif
! !
!> \namespace psb_base_mod \class psb_base_sparse_mat !> \namespace psb_base_mod \class psb_base_sparse_mat
!! The basic data about your matrix. !! The basic data about your matrix.
@ -1904,4 +1917,22 @@ contains
end subroutine psb_base_from_lbase end subroutine psb_base_from_lbase
function psb_get_spspmm_impl() result(impl_id)
integer(psb_ipk_) :: impl_id
impl_id = spspmm_impl
end function psb_get_spspmm_impl
subroutine psb_set_spspmm_impl(impl_id)
integer(psb_ipk_), intent(in) :: impl_id
select case(impl_id)
case (spspmm_serial,spspmm_omp_gustavson,spspmm_omp_gustavson_1d,&
& spspmm_serial_rb_tree,spspmm_omp_rb_tree, spspmm_omp_two_pass)
spspmm_impl = impl_id
case default
write (*,*) "Invalid implementation id",impl_id
end select
end subroutine psb_set_spspmm_impl
end module psb_base_mat_mod end module psb_base_mat_mod

@ -7,16 +7,20 @@ BOBJS=psb_base_mat_impl.o \
psb_s_base_mat_impl.o psb_d_base_mat_impl.o psb_c_base_mat_impl.o psb_z_base_mat_impl.o psb_s_base_mat_impl.o psb_d_base_mat_impl.o psb_c_base_mat_impl.o psb_z_base_mat_impl.o
#\ #\
psb_s_lbase_mat_impl.o psb_d_lbase_mat_impl.o psb_c_lbase_mat_impl.o psb_z_lbase_mat_impl.o psb_s_lbase_mat_impl.o psb_d_lbase_mat_impl.o psb_c_lbase_mat_impl.o psb_z_lbase_mat_impl.o
SOBJS=psb_s_csr_impl.o psb_s_coo_impl.o psb_s_csc_impl.o psb_s_mat_impl.o SOBJS=psb_s_csr_impl.o psb_s_coo_impl.o psb_s_csc_impl.o psb_s_mat_impl.o\
psb_s_rb_idx_tree_impl.o
#\ #\
psb_s_lcoo_impl.o psb_s_lcsr_impl.o psb_s_lcoo_impl.o psb_s_lcsr_impl.o
DOBJS=psb_d_csr_impl.o psb_d_coo_impl.o psb_d_csc_impl.o psb_d_mat_impl.o DOBJS=psb_d_csr_impl.o psb_d_coo_impl.o psb_d_csc_impl.o psb_d_mat_impl.o\
psb_d_rb_idx_tree_impl.o
#\ #\
psb_d_lcoo_impl.o psb_d_lcsr_impl.o psb_d_lcoo_impl.o psb_d_lcsr_impl.o
COBJS=psb_c_csr_impl.o psb_c_coo_impl.o psb_c_csc_impl.o psb_c_mat_impl.o COBJS=psb_c_csr_impl.o psb_c_coo_impl.o psb_c_csc_impl.o psb_c_mat_impl.o\
psb_c_rb_idx_tree_impl.o
#\ #\
psb_c_lcoo_impl.o psb_c_lcsr_impl.o psb_c_lcoo_impl.o psb_c_lcsr_impl.o
ZOBJS=psb_z_csr_impl.o psb_z_coo_impl.o psb_z_csc_impl.o psb_z_mat_impl.o ZOBJS=psb_z_csr_impl.o psb_z_coo_impl.o psb_z_csc_impl.o psb_z_mat_impl.o\
psb_z_rb_idx_tree_impl.o
#\ #\
psb_z_lcoo_impl.o psb_z_lcsr_impl.o psb_z_lcoo_impl.o psb_z_lcsr_impl.o

@ -3495,7 +3495,7 @@ subroutine psb_c_cp_csr_to_fmt(a,b,info)
if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info)
if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info)
else else
! Despite the implementation in safe_cpy, it seems better this way ! Despite the implementation in safe_cpy, it seems better this way
call psb_realloc(nr+1,b%irp,info) call psb_realloc(nr+1,b%irp,info)
call psb_realloc(nz,b%ja,info) call psb_realloc(nz,b%ja,info)
call psb_realloc(nz,b%val,info) call psb_realloc(nz,b%val,info)
@ -3600,7 +3600,7 @@ subroutine psb_c_cp_csr_from_fmt(a,b,info)
if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info)
if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info)
else else
! Despite the implementation in safe_cpy, it seems better this way ! Despite the implementation in safe_cpy, it seems better this way
call psb_realloc(nr+1,a%irp,info) call psb_realloc(nr+1,a%irp,info)
call psb_realloc(nz,a%ja,info) call psb_realloc(nz,a%ja,info)
call psb_realloc(nz,a%val,info) call psb_realloc(nz,a%val,info)
@ -3654,6 +3654,540 @@ subroutine psb_c_csr_clean_zeros(a, info)
call a%set_host() call a%set_host()
end subroutine psb_c_csr_clean_zeros end subroutine psb_c_csr_clean_zeros
#if defined(OPENMP)
subroutine psb_ccsrspspmm(a,b,c,info)
use psb_c_mat_mod
use psb_serial_mod, psb_protect_name => psb_ccsrspspmm
implicit none
class(psb_c_csr_sparse_mat), intent(in) :: a,b
type(psb_c_csr_sparse_mat), intent(out) :: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb
character(len=20) :: name
integer(psb_ipk_) :: err_act
name='psb_csrspspmm'
call psb_erractionsave(err_act)
info = psb_success_
if (a%is_dev()) call a%sync()
if (b%is_dev()) call b%sync()
ma = a%get_nrows()
na = a%get_ncols()
mb = b%get_nrows()
nb = b%get_ncols()
if ( mb /= na ) then
write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb
info = psb_err_invalid_matrix_sizes_
call psb_errpush(info,name)
goto 9999
endif
select case(spspmm_impl)
case (spspmm_serial)
! Estimate number of nonzeros on output.
nza = a%get_nzeros()
nzb = b%get_nzeros()
nzc = 2*(nza+nzb)
call c%allocate(ma,nb,nzc)
call csr_spspmm(a,b,c,info)
case (spspmm_omp_gustavson)
call spmm_omp_gustavson(a,b,c,info)
case (spspmm_omp_gustavson_1d)
call spmm_omp_gustavson_1d(a,b,c,info)
case (spspmm_serial_rb_tree)
call spmm_serial_rb_tree(a,b,c,info)
case (spspmm_omp_rb_tree)
call spmm_omp_rb_tree(a,b,c,info)
case (spspmm_omp_two_pass)
call spmm_omp_two_pass(a,b,c,info)
case default
write(psb_err_unit,*) 'Unknown spspmm implementation'
! push error
goto 9999
end select
call c%set_asb()
call c%set_host()
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
contains
subroutine csr_spspmm(a,b,c,info)
implicit none
type(psb_c_csr_sparse_mat), intent(in) :: a,b
type(psb_c_csr_sparse_mat), intent(inout) :: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: ma,na,mb,nb
integer(psb_ipk_), allocatable :: irow(:), idxs(:)
complex(psb_spk_), allocatable :: row(:)
integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, &
& nzc,nnzre, isz, ipb, irwsz, nrc, nze
complex(psb_spk_) :: cfb
info = psb_success_
ma = a%get_nrows()
na = a%get_ncols()
mb = b%get_nrows()
nb = b%get_ncols()
nze = min(size(c%val),size(c%ja))
isz = max(ma,na,mb,nb)
call psb_realloc(isz,row,info)
if (info == 0) call psb_realloc(isz,idxs,info)
if (info == 0) call psb_realloc(isz,irow,info)
if (info /= 0) return
row = dzero
irow = 0
nzc = 1
do j = 1,ma
c%irp(j) = nzc
nrc = 0
do k = a%irp(j), a%irp(j+1)-1
irw = a%ja(k)
cfb = a%val(k)
irwsz = b%irp(irw+1)-b%irp(irw)
do i = b%irp(irw),b%irp(irw+1)-1
icl = b%ja(i)
if (irow(icl)<j) then
nrc = nrc + 1
idxs(nrc) = icl
irow(icl) = j
end if
row(icl) = row(icl) + cfb*b%val(i)
end do
end do
if (nrc > 0 ) then
if ((nzc+nrc)>nze) then
nze = max(ma*((nzc+j-1)/j),nzc+2*nrc)
call psb_realloc(nze,c%val,info)
if (info == 0) call psb_realloc(nze,c%ja,info)
if (info /= 0) return
end if
call psb_qsort(idxs(1:nrc))
do i=1, nrc
irw = idxs(i)
c%ja(nzc) = irw
c%val(nzc) = row(irw)
row(irw) = dzero
nzc = nzc + 1
end do
end if
end do
c%irp(ma+1) = nzc
end subroutine csr_spspmm
! gustavson's algorithm using perfect hashing
! and OpenMP parallelisation
subroutine spmm_omp_gustavson(a,b,c,info)
use omp_lib
implicit none
type(psb_c_csr_sparse_mat), intent(in) :: a,b
type(psb_c_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
complex(psb_spk_), allocatable :: vals(:), acc(:)
integer(psb_ipk_) :: ma, nb
integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
ma = a%get_nrows()
nb = b%get_ncols()
call c%allocate(ma, nb)
c%irp(1) = 1
! dense accumulator
! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
call psb_realloc(nb, acc, info)
allocate(offsets(omp_get_max_threads()))
!$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
!$omp shared(a,b,c,offsets)
thread_upperbound = 0
start_idx = 0
!$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma
if (start_idx == 0) then
start_idx = irw
end if
end_idx = irw
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j)
end do
end do
!$omp end do
call psb_realloc(thread_upperbound, vals, info)
call psb_realloc(thread_upperbound, col_inds, info)
! possible bottleneck
acc = 0
nnz = 0
!$omp do schedule(static) private(irw, jj, j, k)
do irw = 1, ma
rwnz = 0
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
do k = b%irp(j), b%irp(j + 1) - 1
if (acc(b%ja(k)) == 0) then
nnz = nnz + 1
rwnz = rwnz + 1
col_inds(nnz) = b%ja(k)
end if
acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k)
end do
end do
call psb_qsort(col_inds(nnz - rwnz + 1:nnz))
do k = nnz - rwnz + 1, nnz
vals(k) = acc(col_inds(k))
acc(col_inds(k)) = 0
end do
c%irp(irw + 1) = rwnz
end do
!$omp end do
offsets(omp_get_thread_num() + 1) = nnz
!$omp barrier
! possible bottleneck
!$omp single
do k = 1, omp_get_num_threads() - 1
offsets(k + 1) = offsets(k + 1) + offsets(k)
end do
!$omp end single
!$omp barrier
if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if
do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do
!$omp barrier
!$omp single
c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma)
call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
!$omp end parallel
end subroutine spmm_omp_gustavson
subroutine spmm_omp_gustavson_1d(a,b,c,info)
use omp_lib
implicit none
type(psb_c_csr_sparse_mat), intent(in) :: a,b
type(psb_c_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
complex(psb_spk_), allocatable :: vals(:), acc(:)
integer(psb_ipk_) :: ma, nb
integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, &
start_idx, end_idx , blk, blk_size, rwstart,&
rwblk, rwblkrem, nblks
ma = a%get_nrows()
nb = b%get_ncols()
call c%allocate(ma, nb)
c%irp(1) = 1
! dense accumulator
! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
call psb_realloc(nb, acc, info)
allocate(offsets(omp_get_max_threads()))
nblks = 4 * omp_get_max_threads()
rwblk = (ma / nblks)
rwblkrem = modulo(ma, nblks)
!$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
thread_upperbound = 0
start_idx = 0
!$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j)
end do
end do
!$omp end do
call psb_realloc(thread_upperbound, vals, info)
call psb_realloc(thread_upperbound, col_inds, info)
! possible bottleneck
acc = 0
nnz = 0
!$omp do schedule(static) private(irw,jj,j,k,rwnz,blk,blk_size,rwstart)
do blk = 0, nblks - 1
if (blk < rwblkrem) then
blk_size = rwblk + 1
rwstart = blk * rwblk + blk + 1
else
blk_size = rwblk
rwstart = blk * rwblk &
+ rwblkrem + 1
end if
do irw = rwstart, rwstart + blk_size - 1
if (start_idx == 0) then
start_idx = irw
end if
end_idx = irw
rwnz = 0
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
do k = b%irp(j), b%irp(j + 1) - 1
if (acc(b%ja(k)) == 0) then
nnz = nnz + 1
rwnz = rwnz + 1
col_inds(nnz) = b%ja(k)
end if
acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k)
end do
end do
call psb_qsort(col_inds(nnz - rwnz + 1:nnz))
do k = nnz - rwnz + 1, nnz
vals(k) = acc(col_inds(k))
acc(col_inds(k)) = 0
end do
c%irp(irw + 1) = rwnz
end do
end do
!$omp end do
offsets(omp_get_thread_num() + 1) = nnz
!$omp barrier
! possible bottleneck
!$omp single
do k = 1, omp_get_num_threads() - 1
offsets(k + 1) = offsets(k + 1) + offsets(k)
end do
!$omp end single
!$omp barrier
if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if
do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do
!$omp barrier
!$omp single
c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma)
call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
!$omp end parallel
end subroutine spmm_omp_gustavson_1d
subroutine spmm_serial_rb_tree(a,b,c,info)
use psb_rb_idx_tree_mod
implicit none
type(psb_c_csr_sparse_mat), intent(in) :: a,b
type(psb_c_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: a_m, b_n
integer(psb_ipk_) :: row, col
type(psb_c_rb_idx_tree), allocatable :: row_accs(:)
a_m = a%get_nrows()
b_n = b%get_ncols()
allocate(row_accs(a_m))
call c%allocate(a_m, b_n)
do row = 1, a_m
row_accs(row)%nnz = 0
nullify(row_accs(row)%root)
do col = a%irp(row), a%irp(row + 1) - 1
call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col))
end do
end do
call psb_rb_idx_tree_merge(row_accs, c)
deallocate(row_accs)
info = 0
end subroutine spmm_serial_rb_tree
subroutine spmm_omp_rb_tree(a,b,c,info)
use omp_lib
use psb_rb_idx_tree_mod
implicit none
type(psb_c_csr_sparse_mat), intent(in) :: a,b
type(psb_c_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: a_m, b_n
integer(psb_ipk_) :: row, col
type(psb_c_rb_idx_tree), allocatable :: row_accs(:)
real(8) :: tic, toc
a_m = a%get_nrows()
b_n = b%get_ncols()
call c%allocate(a_m, b_n)
allocate(row_accs(a_m))
call c%allocate(a_m, b_n)
!$omp parallel do schedule(static)
do row = 1, a_m
row_accs(row)%nnz = 0
nullify(row_accs(row)%root)
do col = a%irp(row), a%irp(row + 1) - 1
call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col))
end do
end do
!$omp end parallel do
call psb_rb_idx_tree_merge(row_accs, c)
deallocate(row_accs)
info = 0
end subroutine spmm_omp_rb_tree
subroutine compute_indices(a, b, c, info)
implicit none
type(psb_c_csr_sparse_mat), intent(in) :: a,b
type(psb_c_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer :: full_mat_bound
integer :: row, col, i, j, k, nnz
full_mat_bound = 0
!omp parallel do schedule(static) reduction(+:full_mat_bound)
do row = 1, a%get_nrows()
do col = a%irp(row), a%irp(row + 1) - 1
j = a%ja(col)
full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j)
end do
end do
!omp end parallel do
call psb_realloc(a%get_nrows() + 1, c%irp, info)
call psb_realloc(full_mat_bound, c%ja, info)
c%ja = 0
c%irp(1) = 1
nnz = 0
do row = 1, a%get_nrows()
do col = a%irp(row), a%irp(row + 1) - 1
do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1
k = 0
do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i))
k = k + 1
end do
if (c%ja(c%irp(row) + k) == 0) then
c%ja(c%irp(row)+k) = b%ja(i)
nnz = nnz + 1
end if
end do
end do
c%irp(row + 1) = nnz + 1
call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1))
end do
call psb_realloc(nnz, c%ja, info)
call psb_realloc(nnz, c%val, info)
c%val = 0
end subroutine compute_indices
subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num)
type(psb_c_csr_sparse_mat), intent(inout) :: out_mat
integer(psb_ipk_), intent(in) :: out_row_num
complex(psb_spk_), intent(in) :: scalar
type(psb_c_csr_sparse_mat), intent(in) :: mat
integer(psb_ipk_), intent(in) :: trgt_row_num
integer(psb_ipk_) :: i, k, row_start, row_end
row_start = out_mat%irp(out_row_num)
row_end = out_mat%irp(out_row_num + 1) - 1
do i = mat%irp(trgt_row_num), mat%irp(trgt_row_num + 1) - 1
do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1
if (out_mat%ja(k) == mat%ja(i)) then
out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i)
exit
end if
end do
end do
end subroutine direct_scalar_sparse_row_mul
subroutine spmm_omp_two_pass(a,b,c,info)
use omp_lib
implicit none
type(psb_c_csr_sparse_mat), intent(in) :: a,b
type(psb_c_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: a_m, b_n, row, col
a_m = a%get_nrows()
b_n = b%get_ncols()
call c%allocate(a_m, b_n)
call compute_indices(a, b, c, info)
!$omp parallel do schedule(static)
do row = 1, a_m
do col = a%irp(row), a%irp(row + 1) - 1
call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col))
end do
end do
!$omp end parallel do
end subroutine spmm_omp_two_pass
end subroutine psb_ccsrspspmm
#else
subroutine psb_ccsrspspmm(a,b,c,info) subroutine psb_ccsrspspmm(a,b,c,info)
use psb_c_mat_mod use psb_c_mat_mod
use psb_serial_mod, psb_protect_name => psb_ccsrspspmm use psb_serial_mod, psb_protect_name => psb_ccsrspspmm
@ -3776,7 +4310,7 @@ contains
end subroutine csr_spspmm end subroutine csr_spspmm
end subroutine psb_ccsrspspmm end subroutine psb_ccsrspspmm
#endif
! !
! !

@ -0,0 +1,329 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
!
! package: psb_c_rb_idx_tree_impl
!
! Red black tree implementation ordered by index
!
! Each node contains and index and a double precision value
!
! The tree should always be well balanced
!
! inserting a node with an existing index will
! add up the new value to the old one
! Contributed by Dimitri Walther
!
subroutine psb_c_rb_idx_tree_insert(this, idx, val)
use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_insert
implicit none
class(psb_c_rb_idx_tree), intent(inout) :: this
integer(psb_ipk_), intent(in) :: idx
complex(psb_spk_), intent(in) :: val
character(len=22) :: name
type(psb_c_rb_idx_node), pointer :: new_node
type(psb_c_rb_idx_node), pointer :: current, previous
name='psb_rb_idx_tree_insert'
allocate(new_node)
new_node%idx = idx
new_node%val = val
nullify(new_node%left)
nullify(new_node%right)
nullify(new_node%parent)
new_node%is_red = .true.
if (.not. associated(this%root)) then
this%root => new_node
this%nnz = 1
new_node%is_red = .false.
return
end if
current => this%root
do while (associated(current))
previous => current
if (idx == current%idx) then
current%val = current%val + val
deallocate(new_node)
return
else if (idx < current%idx) then
current => current%left
else
current => current%right
end if
end do
if (idx < previous%idx) then
new_node%parent => previous
previous%left => new_node
else
new_node%parent => previous
previous%right => new_node
end if
call psb_c_rb_idx_tree_fix_insertion(this, new_node)
this%nnz = this%nnz + 1
end subroutine psb_c_rb_idx_tree_insert
subroutine psb_c_rb_idx_tree_fix_insertion(this, node)
use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_fix_insertion
implicit none
class(psb_c_rb_idx_tree), intent(inout) :: this
type(psb_c_rb_idx_node), pointer, intent(inout) :: node
character(len=29) :: name
type(psb_c_rb_idx_node), pointer :: current, parent, grand_parent, uncle
name = 'psb_rb_idx_tree_fix_insertion'
current => node
parent => current%parent
do while(associated(parent) .and. parent%is_red)
! grand parent exist because root can't be red
grand_parent => parent%parent
if (parent%idx < grand_parent%idx) then
uncle => grand_parent%right
else
uncle => grand_parent%left
end if
if (associated(uncle) .and. uncle%is_red) then
parent%is_red = .false.
uncle%is_red = .false.
grand_parent%is_red = .true.
current => grand_parent
parent => current%parent
! Left-Left case
else if (current%idx < parent%idx .and. &
parent%idx < grand_parent%idx) then
call psb_c_rb_idx_tree_rotate_right(grand_parent)
call psb_c_rb_idx_tree_swap_colors(parent, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => parent
return
! Left-Right case
else if (current%idx > parent%idx .and. &
parent%idx < grand_parent%idx) then
call psb_c_rb_idx_tree_rotate_left(parent)
call psb_c_rb_idx_tree_rotate_right(grand_parent)
call psb_c_rb_idx_tree_swap_colors(current, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => current
return
! Right-Right case
else if (current%idx > parent%idx .and. &
parent%idx > grand_parent%idx) then
call psb_c_rb_idx_tree_rotate_left(grand_parent)
call psb_c_rb_idx_tree_swap_colors(parent, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => parent
return
! Right-Left case
else
call psb_c_rb_idx_tree_rotate_right(parent)
call psb_c_rb_idx_tree_rotate_left(grand_parent)
call psb_c_rb_idx_tree_swap_colors(current, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => current
return
end if
end do
this%root%is_red = .false.
end subroutine psb_c_rb_idx_tree_fix_insertion
subroutine psb_c_rb_idx_tree_swap_colors(n1, n2)
use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_swap_colors
implicit none
type(psb_c_rb_idx_node), pointer, intent(inout) :: n1, n2
character(len=27) :: name
logical :: tmp
name='psb_rb_idx_tree_swap_colors'
tmp = n1%is_red
n1%is_red = n2%is_red
n2%is_red = tmp
end subroutine psb_c_rb_idx_tree_swap_colors
subroutine psb_c_rb_idx_tree_rotate_right(node)
use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_rotate_right
implicit none
type(psb_c_rb_idx_node), pointer, intent(inout) :: node
character(len=28) :: name
type(psb_c_rb_idx_node), pointer :: l, lr
name='psb_rb_idx_tree_rotate_right'
if (.not. associated(node%left)) return
l => node%left
lr => l%right
node%left => lr
if (associated(lr)) lr%parent => node
if (associated(node%parent)) then
if (node%idx < node%parent%idx) then
node%parent%left => l
else
node%parent%right => l
end if
end if
l%parent => node%parent
node%parent => l
l%right => node
end subroutine psb_c_rb_idx_tree_rotate_right
subroutine psb_c_rb_idx_tree_rotate_left(node)
use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_rotate_left
implicit none
type(psb_c_rb_idx_node), pointer, intent(inout) :: node
character(len=27) :: name
type(psb_c_rb_idx_node), pointer :: r, rl
name='psb_rb_idx_tree_rotate_left'
if (.not. associated(node%right)) return
r => node%right
rl => r%left
node%right => rl
if (associated(rl)) rl%parent => node
if (associated(node%parent)) then
if (node%idx < node%parent%idx) then
node%parent%left => r
else
node%parent%right => r
end if
end if
r%parent => node%parent
node%parent => r
r%left => node
end subroutine psb_c_rb_idx_tree_rotate_left
subroutine psb_c_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num)
use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_scalar_sparse_row_mul
use psb_c_csr_mat_mod, only : psb_c_csr_sparse_mat
implicit none
type(psb_c_rb_idx_tree), intent(inout) :: tree
complex(psb_spk_), intent(in) :: scalar
type(psb_c_csr_sparse_mat), intent(in) :: mat
integer(psb_ipk_), intent(in) :: row_num
character(len=37) :: name
integer(psb_ipk_) :: i
name='psb_rb_idx_tree_scalar_sparse_row_mul'
do i = mat%irp(row_num), mat%irp(row_num + 1) - 1
call tree%insert(mat%ja(i),scalar * mat%val(i))
end do
end subroutine psb_c_rb_idx_tree_scalar_sparse_row_mul
subroutine psb_c_rb_idx_tree_merge(trees, mat)
#if defined(OPENMP)
use omp_lib
#endif
use psb_realloc_mod
use psb_c_rb_idx_tree_mod, psb_protect_name => psb_c_rb_idx_tree_merge
use psb_c_csr_mat_mod, only : psb_c_csr_sparse_mat
implicit none
type(psb_c_rb_idx_tree), allocatable, intent(inout) :: trees(:)
type(psb_c_csr_sparse_mat), intent(inout) :: mat
character(len=21) :: name
integer(psb_ipk_) :: i, j, rows, info, nnz
type(psb_c_rb_idx_node), pointer :: current, previous
name='psb_rb_idx_tree_merge'
rows = size(trees)
mat%irp(1) = 1
do i=1, rows
mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz
end do
nnz = mat%irp(rows + 1)
call psb_realloc(nnz, mat%val, info)
call psb_realloc(nnz, mat%ja, info)
#if defined(OPENMP)
!$omp parallel do schedule(static), private(current, previous, j)
#endif
do i = 1, size(trees)
j = 0
current => trees(i)%root
do while(associated(current))
! go to the left-most node
do while(associated(current%left))
current => current%left
end do
mat%val(j + mat%irp(i)) = current%val
mat%ja(j + mat%irp(i)) = current%idx
j = j + 1
previous => current
if (associated(current%right)) then
if (associated(current%parent)) then
current%parent%left => current%right
end if
current%right%parent => current%parent
current => current%right
else
current => current%parent
if (associated(current)) nullify(current%left)
end if
deallocate(previous)
end do
end do
#if defined(OPENMP)
!$omp end parallel do
#endif
end subroutine psb_c_rb_idx_tree_merge

@ -3495,7 +3495,7 @@ subroutine psb_d_cp_csr_to_fmt(a,b,info)
if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info)
if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info)
else else
! Despite the implementation in safe_cpy, it seems better this way ! Despite the implementation in safe_cpy, it seems better this way
call psb_realloc(nr+1,b%irp,info) call psb_realloc(nr+1,b%irp,info)
call psb_realloc(nz,b%ja,info) call psb_realloc(nz,b%ja,info)
call psb_realloc(nz,b%val,info) call psb_realloc(nz,b%val,info)
@ -3600,7 +3600,7 @@ subroutine psb_d_cp_csr_from_fmt(a,b,info)
if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info)
if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info)
else else
! Despite the implementation in safe_cpy, it seems better this way ! Despite the implementation in safe_cpy, it seems better this way
call psb_realloc(nr+1,a%irp,info) call psb_realloc(nr+1,a%irp,info)
call psb_realloc(nz,a%ja,info) call psb_realloc(nz,a%ja,info)
call psb_realloc(nz,a%val,info) call psb_realloc(nz,a%val,info)
@ -3654,6 +3654,540 @@ subroutine psb_d_csr_clean_zeros(a, info)
call a%set_host() call a%set_host()
end subroutine psb_d_csr_clean_zeros end subroutine psb_d_csr_clean_zeros
#if defined(OPENMP)
subroutine psb_dcsrspspmm(a,b,c,info)
use psb_d_mat_mod
use psb_serial_mod, psb_protect_name => psb_dcsrspspmm
implicit none
class(psb_d_csr_sparse_mat), intent(in) :: a,b
type(psb_d_csr_sparse_mat), intent(out) :: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb
character(len=20) :: name
integer(psb_ipk_) :: err_act
name='psb_csrspspmm'
call psb_erractionsave(err_act)
info = psb_success_
if (a%is_dev()) call a%sync()
if (b%is_dev()) call b%sync()
ma = a%get_nrows()
na = a%get_ncols()
mb = b%get_nrows()
nb = b%get_ncols()
if ( mb /= na ) then
write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb
info = psb_err_invalid_matrix_sizes_
call psb_errpush(info,name)
goto 9999
endif
select case(spspmm_impl)
case (spspmm_serial)
! Estimate number of nonzeros on output.
nza = a%get_nzeros()
nzb = b%get_nzeros()
nzc = 2*(nza+nzb)
call c%allocate(ma,nb,nzc)
call csr_spspmm(a,b,c,info)
case (spspmm_omp_gustavson)
call spmm_omp_gustavson(a,b,c,info)
case (spspmm_omp_gustavson_1d)
call spmm_omp_gustavson_1d(a,b,c,info)
case (spspmm_serial_rb_tree)
call spmm_serial_rb_tree(a,b,c,info)
case (spspmm_omp_rb_tree)
call spmm_omp_rb_tree(a,b,c,info)
case (spspmm_omp_two_pass)
call spmm_omp_two_pass(a,b,c,info)
case default
write(psb_err_unit,*) 'Unknown spspmm implementation'
! push error
goto 9999
end select
call c%set_asb()
call c%set_host()
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
contains
subroutine csr_spspmm(a,b,c,info)
implicit none
type(psb_d_csr_sparse_mat), intent(in) :: a,b
type(psb_d_csr_sparse_mat), intent(inout) :: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: ma,na,mb,nb
integer(psb_ipk_), allocatable :: irow(:), idxs(:)
real(psb_dpk_), allocatable :: row(:)
integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, &
& nzc,nnzre, isz, ipb, irwsz, nrc, nze
real(psb_dpk_) :: cfb
info = psb_success_
ma = a%get_nrows()
na = a%get_ncols()
mb = b%get_nrows()
nb = b%get_ncols()
nze = min(size(c%val),size(c%ja))
isz = max(ma,na,mb,nb)
call psb_realloc(isz,row,info)
if (info == 0) call psb_realloc(isz,idxs,info)
if (info == 0) call psb_realloc(isz,irow,info)
if (info /= 0) return
row = dzero
irow = 0
nzc = 1
do j = 1,ma
c%irp(j) = nzc
nrc = 0
do k = a%irp(j), a%irp(j+1)-1
irw = a%ja(k)
cfb = a%val(k)
irwsz = b%irp(irw+1)-b%irp(irw)
do i = b%irp(irw),b%irp(irw+1)-1
icl = b%ja(i)
if (irow(icl)<j) then
nrc = nrc + 1
idxs(nrc) = icl
irow(icl) = j
end if
row(icl) = row(icl) + cfb*b%val(i)
end do
end do
if (nrc > 0 ) then
if ((nzc+nrc)>nze) then
nze = max(ma*((nzc+j-1)/j),nzc+2*nrc)
call psb_realloc(nze,c%val,info)
if (info == 0) call psb_realloc(nze,c%ja,info)
if (info /= 0) return
end if
call psb_qsort(idxs(1:nrc))
do i=1, nrc
irw = idxs(i)
c%ja(nzc) = irw
c%val(nzc) = row(irw)
row(irw) = dzero
nzc = nzc + 1
end do
end if
end do
c%irp(ma+1) = nzc
end subroutine csr_spspmm
! gustavson's algorithm using perfect hashing
! and OpenMP parallelisation
subroutine spmm_omp_gustavson(a,b,c,info)
use omp_lib
implicit none
type(psb_d_csr_sparse_mat), intent(in) :: a,b
type(psb_d_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
real(psb_dpk_), allocatable :: vals(:), acc(:)
integer(psb_ipk_) :: ma, nb
integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
ma = a%get_nrows()
nb = b%get_ncols()
call c%allocate(ma, nb)
c%irp(1) = 1
! dense accumulator
! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
call psb_realloc(nb, acc, info)
allocate(offsets(omp_get_max_threads()))
!$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
!$omp shared(a,b,c,offsets)
thread_upperbound = 0
start_idx = 0
!$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma
if (start_idx == 0) then
start_idx = irw
end if
end_idx = irw
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j)
end do
end do
!$omp end do
call psb_realloc(thread_upperbound, vals, info)
call psb_realloc(thread_upperbound, col_inds, info)
! possible bottleneck
acc = 0
nnz = 0
!$omp do schedule(static) private(irw, jj, j, k)
do irw = 1, ma
rwnz = 0
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
do k = b%irp(j), b%irp(j + 1) - 1
if (acc(b%ja(k)) == 0) then
nnz = nnz + 1
rwnz = rwnz + 1
col_inds(nnz) = b%ja(k)
end if
acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k)
end do
end do
call psb_qsort(col_inds(nnz - rwnz + 1:nnz))
do k = nnz - rwnz + 1, nnz
vals(k) = acc(col_inds(k))
acc(col_inds(k)) = 0
end do
c%irp(irw + 1) = rwnz
end do
!$omp end do
offsets(omp_get_thread_num() + 1) = nnz
!$omp barrier
! possible bottleneck
!$omp single
do k = 1, omp_get_num_threads() - 1
offsets(k + 1) = offsets(k + 1) + offsets(k)
end do
!$omp end single
!$omp barrier
if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if
do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do
!$omp barrier
!$omp single
c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma)
call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
!$omp end parallel
end subroutine spmm_omp_gustavson
subroutine spmm_omp_gustavson_1d(a,b,c,info)
use omp_lib
implicit none
type(psb_d_csr_sparse_mat), intent(in) :: a,b
type(psb_d_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
real(psb_dpk_), allocatable :: vals(:), acc(:)
integer(psb_ipk_) :: ma, nb
integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, &
start_idx, end_idx , blk, blk_size, rwstart,&
rwblk, rwblkrem, nblks
ma = a%get_nrows()
nb = b%get_ncols()
call c%allocate(ma, nb)
c%irp(1) = 1
! dense accumulator
! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
call psb_realloc(nb, acc, info)
allocate(offsets(omp_get_max_threads()))
nblks = 4 * omp_get_max_threads()
rwblk = (ma / nblks)
rwblkrem = modulo(ma, nblks)
!$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
thread_upperbound = 0
start_idx = 0
!$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j)
end do
end do
!$omp end do
call psb_realloc(thread_upperbound, vals, info)
call psb_realloc(thread_upperbound, col_inds, info)
! possible bottleneck
acc = 0
nnz = 0
!$omp do schedule(static) private(irw,jj,j,k,rwnz,blk,blk_size,rwstart)
do blk = 0, nblks - 1
if (blk < rwblkrem) then
blk_size = rwblk + 1
rwstart = blk * rwblk + blk + 1
else
blk_size = rwblk
rwstart = blk * rwblk &
+ rwblkrem + 1
end if
do irw = rwstart, rwstart + blk_size - 1
if (start_idx == 0) then
start_idx = irw
end if
end_idx = irw
rwnz = 0
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
do k = b%irp(j), b%irp(j + 1) - 1
if (acc(b%ja(k)) == 0) then
nnz = nnz + 1
rwnz = rwnz + 1
col_inds(nnz) = b%ja(k)
end if
acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k)
end do
end do
call psb_qsort(col_inds(nnz - rwnz + 1:nnz))
do k = nnz - rwnz + 1, nnz
vals(k) = acc(col_inds(k))
acc(col_inds(k)) = 0
end do
c%irp(irw + 1) = rwnz
end do
end do
!$omp end do
offsets(omp_get_thread_num() + 1) = nnz
!$omp barrier
! possible bottleneck
!$omp single
do k = 1, omp_get_num_threads() - 1
offsets(k + 1) = offsets(k + 1) + offsets(k)
end do
!$omp end single
!$omp barrier
if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if
do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do
!$omp barrier
!$omp single
c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma)
call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
!$omp end parallel
end subroutine spmm_omp_gustavson_1d
subroutine spmm_serial_rb_tree(a,b,c,info)
use psb_rb_idx_tree_mod
implicit none
type(psb_d_csr_sparse_mat), intent(in) :: a,b
type(psb_d_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: a_m, b_n
integer(psb_ipk_) :: row, col
type(psb_d_rb_idx_tree), allocatable :: row_accs(:)
a_m = a%get_nrows()
b_n = b%get_ncols()
allocate(row_accs(a_m))
call c%allocate(a_m, b_n)
do row = 1, a_m
row_accs(row)%nnz = 0
nullify(row_accs(row)%root)
do col = a%irp(row), a%irp(row + 1) - 1
call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col))
end do
end do
call psb_rb_idx_tree_merge(row_accs, c)
deallocate(row_accs)
info = 0
end subroutine spmm_serial_rb_tree
subroutine spmm_omp_rb_tree(a,b,c,info)
use omp_lib
use psb_rb_idx_tree_mod
implicit none
type(psb_d_csr_sparse_mat), intent(in) :: a,b
type(psb_d_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: a_m, b_n
integer(psb_ipk_) :: row, col
type(psb_d_rb_idx_tree), allocatable :: row_accs(:)
real(8) :: tic, toc
a_m = a%get_nrows()
b_n = b%get_ncols()
call c%allocate(a_m, b_n)
allocate(row_accs(a_m))
call c%allocate(a_m, b_n)
!$omp parallel do schedule(static)
do row = 1, a_m
row_accs(row)%nnz = 0
nullify(row_accs(row)%root)
do col = a%irp(row), a%irp(row + 1) - 1
call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col))
end do
end do
!$omp end parallel do
call psb_rb_idx_tree_merge(row_accs, c)
deallocate(row_accs)
info = 0
end subroutine spmm_omp_rb_tree
subroutine compute_indices(a, b, c, info)
implicit none
type(psb_d_csr_sparse_mat), intent(in) :: a,b
type(psb_d_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer :: full_mat_bound
integer :: row, col, i, j, k, nnz
full_mat_bound = 0
!omp parallel do schedule(static) reduction(+:full_mat_bound)
do row = 1, a%get_nrows()
do col = a%irp(row), a%irp(row + 1) - 1
j = a%ja(col)
full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j)
end do
end do
!omp end parallel do
call psb_realloc(a%get_nrows() + 1, c%irp, info)
call psb_realloc(full_mat_bound, c%ja, info)
c%ja = 0
c%irp(1) = 1
nnz = 0
do row = 1, a%get_nrows()
do col = a%irp(row), a%irp(row + 1) - 1
do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1
k = 0
do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i))
k = k + 1
end do
if (c%ja(c%irp(row) + k) == 0) then
c%ja(c%irp(row)+k) = b%ja(i)
nnz = nnz + 1
end if
end do
end do
c%irp(row + 1) = nnz + 1
call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1))
end do
call psb_realloc(nnz, c%ja, info)
call psb_realloc(nnz, c%val, info)
c%val = 0
end subroutine compute_indices
subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num)
type(psb_d_csr_sparse_mat), intent(inout) :: out_mat
integer(psb_ipk_), intent(in) :: out_row_num
real(psb_dpk_), intent(in) :: scalar
type(psb_d_csr_sparse_mat), intent(in) :: mat
integer(psb_ipk_), intent(in) :: trgt_row_num
integer(psb_ipk_) :: i, k, row_start, row_end
row_start = out_mat%irp(out_row_num)
row_end = out_mat%irp(out_row_num + 1) - 1
do i = mat%irp(trgt_row_num), mat%irp(trgt_row_num + 1) - 1
do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1
if (out_mat%ja(k) == mat%ja(i)) then
out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i)
exit
end if
end do
end do
end subroutine direct_scalar_sparse_row_mul
subroutine spmm_omp_two_pass(a,b,c,info)
use omp_lib
implicit none
type(psb_d_csr_sparse_mat), intent(in) :: a,b
type(psb_d_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: a_m, b_n, row, col
a_m = a%get_nrows()
b_n = b%get_ncols()
call c%allocate(a_m, b_n)
call compute_indices(a, b, c, info)
!$omp parallel do schedule(static)
do row = 1, a_m
do col = a%irp(row), a%irp(row + 1) - 1
call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col))
end do
end do
!$omp end parallel do
end subroutine spmm_omp_two_pass
end subroutine psb_dcsrspspmm
#else
subroutine psb_dcsrspspmm(a,b,c,info) subroutine psb_dcsrspspmm(a,b,c,info)
use psb_d_mat_mod use psb_d_mat_mod
use psb_serial_mod, psb_protect_name => psb_dcsrspspmm use psb_serial_mod, psb_protect_name => psb_dcsrspspmm
@ -3776,7 +4310,7 @@ contains
end subroutine csr_spspmm end subroutine csr_spspmm
end subroutine psb_dcsrspspmm end subroutine psb_dcsrspspmm
#endif
! !
! !

@ -0,0 +1,329 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
!
! package: psb_d_rb_idx_tree_impl
!
! Red black tree implementation ordered by index
!
! Each node contains and index and a double precision value
!
! The tree should always be well balanced
!
! inserting a node with an existing index will
! add up the new value to the old one
! Contributed by Dimitri Walther
!
subroutine psb_d_rb_idx_tree_insert(this, idx, val)
use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_insert
implicit none
class(psb_d_rb_idx_tree), intent(inout) :: this
integer(psb_ipk_), intent(in) :: idx
real(psb_dpk_), intent(in) :: val
character(len=22) :: name
type(psb_d_rb_idx_node), pointer :: new_node
type(psb_d_rb_idx_node), pointer :: current, previous
name='psb_rb_idx_tree_insert'
allocate(new_node)
new_node%idx = idx
new_node%val = val
nullify(new_node%left)
nullify(new_node%right)
nullify(new_node%parent)
new_node%is_red = .true.
if (.not. associated(this%root)) then
this%root => new_node
this%nnz = 1
new_node%is_red = .false.
return
end if
current => this%root
do while (associated(current))
previous => current
if (idx == current%idx) then
current%val = current%val + val
deallocate(new_node)
return
else if (idx < current%idx) then
current => current%left
else
current => current%right
end if
end do
if (idx < previous%idx) then
new_node%parent => previous
previous%left => new_node
else
new_node%parent => previous
previous%right => new_node
end if
call psb_d_rb_idx_tree_fix_insertion(this, new_node)
this%nnz = this%nnz + 1
end subroutine psb_d_rb_idx_tree_insert
subroutine psb_d_rb_idx_tree_fix_insertion(this, node)
use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_fix_insertion
implicit none
class(psb_d_rb_idx_tree), intent(inout) :: this
type(psb_d_rb_idx_node), pointer, intent(inout) :: node
character(len=29) :: name
type(psb_d_rb_idx_node), pointer :: current, parent, grand_parent, uncle
name = 'psb_rb_idx_tree_fix_insertion'
current => node
parent => current%parent
do while(associated(parent) .and. parent%is_red)
! grand parent exist because root can't be red
grand_parent => parent%parent
if (parent%idx < grand_parent%idx) then
uncle => grand_parent%right
else
uncle => grand_parent%left
end if
if (associated(uncle) .and. uncle%is_red) then
parent%is_red = .false.
uncle%is_red = .false.
grand_parent%is_red = .true.
current => grand_parent
parent => current%parent
! Left-Left case
else if (current%idx < parent%idx .and. &
parent%idx < grand_parent%idx) then
call psb_d_rb_idx_tree_rotate_right(grand_parent)
call psb_d_rb_idx_tree_swap_colors(parent, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => parent
return
! Left-Right case
else if (current%idx > parent%idx .and. &
parent%idx < grand_parent%idx) then
call psb_d_rb_idx_tree_rotate_left(parent)
call psb_d_rb_idx_tree_rotate_right(grand_parent)
call psb_d_rb_idx_tree_swap_colors(current, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => current
return
! Right-Right case
else if (current%idx > parent%idx .and. &
parent%idx > grand_parent%idx) then
call psb_d_rb_idx_tree_rotate_left(grand_parent)
call psb_d_rb_idx_tree_swap_colors(parent, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => parent
return
! Right-Left case
else
call psb_d_rb_idx_tree_rotate_right(parent)
call psb_d_rb_idx_tree_rotate_left(grand_parent)
call psb_d_rb_idx_tree_swap_colors(current, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => current
return
end if
end do
this%root%is_red = .false.
end subroutine psb_d_rb_idx_tree_fix_insertion
subroutine psb_d_rb_idx_tree_swap_colors(n1, n2)
use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_swap_colors
implicit none
type(psb_d_rb_idx_node), pointer, intent(inout) :: n1, n2
character(len=27) :: name
logical :: tmp
name='psb_rb_idx_tree_swap_colors'
tmp = n1%is_red
n1%is_red = n2%is_red
n2%is_red = tmp
end subroutine psb_d_rb_idx_tree_swap_colors
subroutine psb_d_rb_idx_tree_rotate_right(node)
use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_rotate_right
implicit none
type(psb_d_rb_idx_node), pointer, intent(inout) :: node
character(len=28) :: name
type(psb_d_rb_idx_node), pointer :: l, lr
name='psb_rb_idx_tree_rotate_right'
if (.not. associated(node%left)) return
l => node%left
lr => l%right
node%left => lr
if (associated(lr)) lr%parent => node
if (associated(node%parent)) then
if (node%idx < node%parent%idx) then
node%parent%left => l
else
node%parent%right => l
end if
end if
l%parent => node%parent
node%parent => l
l%right => node
end subroutine psb_d_rb_idx_tree_rotate_right
subroutine psb_d_rb_idx_tree_rotate_left(node)
use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_rotate_left
implicit none
type(psb_d_rb_idx_node), pointer, intent(inout) :: node
character(len=27) :: name
type(psb_d_rb_idx_node), pointer :: r, rl
name='psb_rb_idx_tree_rotate_left'
if (.not. associated(node%right)) return
r => node%right
rl => r%left
node%right => rl
if (associated(rl)) rl%parent => node
if (associated(node%parent)) then
if (node%idx < node%parent%idx) then
node%parent%left => r
else
node%parent%right => r
end if
end if
r%parent => node%parent
node%parent => r
r%left => node
end subroutine psb_d_rb_idx_tree_rotate_left
subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num)
use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_scalar_sparse_row_mul
use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat
implicit none
type(psb_d_rb_idx_tree), intent(inout) :: tree
real(psb_dpk_), intent(in) :: scalar
type(psb_d_csr_sparse_mat), intent(in) :: mat
integer(psb_ipk_), intent(in) :: row_num
character(len=37) :: name
integer(psb_ipk_) :: i
name='psb_rb_idx_tree_scalar_sparse_row_mul'
do i = mat%irp(row_num), mat%irp(row_num + 1) - 1
call tree%insert(mat%ja(i),scalar * mat%val(i))
end do
end subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul
subroutine psb_d_rb_idx_tree_merge(trees, mat)
#if defined(OPENMP)
use omp_lib
#endif
use psb_realloc_mod
use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_merge
use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat
implicit none
type(psb_d_rb_idx_tree), allocatable, intent(inout) :: trees(:)
type(psb_d_csr_sparse_mat), intent(inout) :: mat
character(len=21) :: name
integer(psb_ipk_) :: i, j, rows, info, nnz
type(psb_d_rb_idx_node), pointer :: current, previous
name='psb_rb_idx_tree_merge'
rows = size(trees)
mat%irp(1) = 1
do i=1, rows
mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz
end do
nnz = mat%irp(rows + 1)
call psb_realloc(nnz, mat%val, info)
call psb_realloc(nnz, mat%ja, info)
#if defined(OPENMP)
!$omp parallel do schedule(static), private(current, previous, j)
#endif
do i = 1, size(trees)
j = 0
current => trees(i)%root
do while(associated(current))
! go to the left-most node
do while(associated(current%left))
current => current%left
end do
mat%val(j + mat%irp(i)) = current%val
mat%ja(j + mat%irp(i)) = current%idx
j = j + 1
previous => current
if (associated(current%right)) then
if (associated(current%parent)) then
current%parent%left => current%right
end if
current%right%parent => current%parent
current => current%right
else
current => current%parent
if (associated(current)) nullify(current%left)
end if
deallocate(previous)
end do
end do
#if defined(OPENMP)
!$omp end parallel do
#endif
end subroutine psb_d_rb_idx_tree_merge

@ -3495,7 +3495,7 @@ subroutine psb_s_cp_csr_to_fmt(a,b,info)
if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info)
if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info)
else else
! Despite the implementation in safe_cpy, it seems better this way ! Despite the implementation in safe_cpy, it seems better this way
call psb_realloc(nr+1,b%irp,info) call psb_realloc(nr+1,b%irp,info)
call psb_realloc(nz,b%ja,info) call psb_realloc(nz,b%ja,info)
call psb_realloc(nz,b%val,info) call psb_realloc(nz,b%val,info)
@ -3600,7 +3600,7 @@ subroutine psb_s_cp_csr_from_fmt(a,b,info)
if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info)
if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info)
else else
! Despite the implementation in safe_cpy, it seems better this way ! Despite the implementation in safe_cpy, it seems better this way
call psb_realloc(nr+1,a%irp,info) call psb_realloc(nr+1,a%irp,info)
call psb_realloc(nz,a%ja,info) call psb_realloc(nz,a%ja,info)
call psb_realloc(nz,a%val,info) call psb_realloc(nz,a%val,info)
@ -3654,6 +3654,540 @@ subroutine psb_s_csr_clean_zeros(a, info)
call a%set_host() call a%set_host()
end subroutine psb_s_csr_clean_zeros end subroutine psb_s_csr_clean_zeros
#if defined(OPENMP)
subroutine psb_scsrspspmm(a,b,c,info)
use psb_s_mat_mod
use psb_serial_mod, psb_protect_name => psb_scsrspspmm
implicit none
class(psb_s_csr_sparse_mat), intent(in) :: a,b
type(psb_s_csr_sparse_mat), intent(out) :: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb
character(len=20) :: name
integer(psb_ipk_) :: err_act
name='psb_csrspspmm'
call psb_erractionsave(err_act)
info = psb_success_
if (a%is_dev()) call a%sync()
if (b%is_dev()) call b%sync()
ma = a%get_nrows()
na = a%get_ncols()
mb = b%get_nrows()
nb = b%get_ncols()
if ( mb /= na ) then
write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb
info = psb_err_invalid_matrix_sizes_
call psb_errpush(info,name)
goto 9999
endif
select case(spspmm_impl)
case (spspmm_serial)
! Estimate number of nonzeros on output.
nza = a%get_nzeros()
nzb = b%get_nzeros()
nzc = 2*(nza+nzb)
call c%allocate(ma,nb,nzc)
call csr_spspmm(a,b,c,info)
case (spspmm_omp_gustavson)
call spmm_omp_gustavson(a,b,c,info)
case (spspmm_omp_gustavson_1d)
call spmm_omp_gustavson_1d(a,b,c,info)
case (spspmm_serial_rb_tree)
call spmm_serial_rb_tree(a,b,c,info)
case (spspmm_omp_rb_tree)
call spmm_omp_rb_tree(a,b,c,info)
case (spspmm_omp_two_pass)
call spmm_omp_two_pass(a,b,c,info)
case default
write(psb_err_unit,*) 'Unknown spspmm implementation'
! push error
goto 9999
end select
call c%set_asb()
call c%set_host()
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
contains
subroutine csr_spspmm(a,b,c,info)
implicit none
type(psb_s_csr_sparse_mat), intent(in) :: a,b
type(psb_s_csr_sparse_mat), intent(inout) :: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: ma,na,mb,nb
integer(psb_ipk_), allocatable :: irow(:), idxs(:)
real(psb_spk_), allocatable :: row(:)
integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, &
& nzc,nnzre, isz, ipb, irwsz, nrc, nze
real(psb_spk_) :: cfb
info = psb_success_
ma = a%get_nrows()
na = a%get_ncols()
mb = b%get_nrows()
nb = b%get_ncols()
nze = min(size(c%val),size(c%ja))
isz = max(ma,na,mb,nb)
call psb_realloc(isz,row,info)
if (info == 0) call psb_realloc(isz,idxs,info)
if (info == 0) call psb_realloc(isz,irow,info)
if (info /= 0) return
row = dzero
irow = 0
nzc = 1
do j = 1,ma
c%irp(j) = nzc
nrc = 0
do k = a%irp(j), a%irp(j+1)-1
irw = a%ja(k)
cfb = a%val(k)
irwsz = b%irp(irw+1)-b%irp(irw)
do i = b%irp(irw),b%irp(irw+1)-1
icl = b%ja(i)
if (irow(icl)<j) then
nrc = nrc + 1
idxs(nrc) = icl
irow(icl) = j
end if
row(icl) = row(icl) + cfb*b%val(i)
end do
end do
if (nrc > 0 ) then
if ((nzc+nrc)>nze) then
nze = max(ma*((nzc+j-1)/j),nzc+2*nrc)
call psb_realloc(nze,c%val,info)
if (info == 0) call psb_realloc(nze,c%ja,info)
if (info /= 0) return
end if
call psb_qsort(idxs(1:nrc))
do i=1, nrc
irw = idxs(i)
c%ja(nzc) = irw
c%val(nzc) = row(irw)
row(irw) = dzero
nzc = nzc + 1
end do
end if
end do
c%irp(ma+1) = nzc
end subroutine csr_spspmm
! gustavson's algorithm using perfect hashing
! and OpenMP parallelisation
subroutine spmm_omp_gustavson(a,b,c,info)
use omp_lib
implicit none
type(psb_s_csr_sparse_mat), intent(in) :: a,b
type(psb_s_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
real(psb_spk_), allocatable :: vals(:), acc(:)
integer(psb_ipk_) :: ma, nb
integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
ma = a%get_nrows()
nb = b%get_ncols()
call c%allocate(ma, nb)
c%irp(1) = 1
! dense accumulator
! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
call psb_realloc(nb, acc, info)
allocate(offsets(omp_get_max_threads()))
!$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
!$omp shared(a,b,c,offsets)
thread_upperbound = 0
start_idx = 0
!$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma
if (start_idx == 0) then
start_idx = irw
end if
end_idx = irw
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j)
end do
end do
!$omp end do
call psb_realloc(thread_upperbound, vals, info)
call psb_realloc(thread_upperbound, col_inds, info)
! possible bottleneck
acc = 0
nnz = 0
!$omp do schedule(static) private(irw, jj, j, k)
do irw = 1, ma
rwnz = 0
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
do k = b%irp(j), b%irp(j + 1) - 1
if (acc(b%ja(k)) == 0) then
nnz = nnz + 1
rwnz = rwnz + 1
col_inds(nnz) = b%ja(k)
end if
acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k)
end do
end do
call psb_qsort(col_inds(nnz - rwnz + 1:nnz))
do k = nnz - rwnz + 1, nnz
vals(k) = acc(col_inds(k))
acc(col_inds(k)) = 0
end do
c%irp(irw + 1) = rwnz
end do
!$omp end do
offsets(omp_get_thread_num() + 1) = nnz
!$omp barrier
! possible bottleneck
!$omp single
do k = 1, omp_get_num_threads() - 1
offsets(k + 1) = offsets(k + 1) + offsets(k)
end do
!$omp end single
!$omp barrier
if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if
do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do
!$omp barrier
!$omp single
c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma)
call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
!$omp end parallel
end subroutine spmm_omp_gustavson
subroutine spmm_omp_gustavson_1d(a,b,c,info)
use omp_lib
implicit none
type(psb_s_csr_sparse_mat), intent(in) :: a,b
type(psb_s_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
real(psb_spk_), allocatable :: vals(:), acc(:)
integer(psb_ipk_) :: ma, nb
integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, &
start_idx, end_idx , blk, blk_size, rwstart,&
rwblk, rwblkrem, nblks
ma = a%get_nrows()
nb = b%get_ncols()
call c%allocate(ma, nb)
c%irp(1) = 1
! dense accumulator
! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
call psb_realloc(nb, acc, info)
allocate(offsets(omp_get_max_threads()))
nblks = 4 * omp_get_max_threads()
rwblk = (ma / nblks)
rwblkrem = modulo(ma, nblks)
!$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
thread_upperbound = 0
start_idx = 0
!$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j)
end do
end do
!$omp end do
call psb_realloc(thread_upperbound, vals, info)
call psb_realloc(thread_upperbound, col_inds, info)
! possible bottleneck
acc = 0
nnz = 0
!$omp do schedule(static) private(irw,jj,j,k,rwnz,blk,blk_size,rwstart)
do blk = 0, nblks - 1
if (blk < rwblkrem) then
blk_size = rwblk + 1
rwstart = blk * rwblk + blk + 1
else
blk_size = rwblk
rwstart = blk * rwblk &
+ rwblkrem + 1
end if
do irw = rwstart, rwstart + blk_size - 1
if (start_idx == 0) then
start_idx = irw
end if
end_idx = irw
rwnz = 0
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
do k = b%irp(j), b%irp(j + 1) - 1
if (acc(b%ja(k)) == 0) then
nnz = nnz + 1
rwnz = rwnz + 1
col_inds(nnz) = b%ja(k)
end if
acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k)
end do
end do
call psb_qsort(col_inds(nnz - rwnz + 1:nnz))
do k = nnz - rwnz + 1, nnz
vals(k) = acc(col_inds(k))
acc(col_inds(k)) = 0
end do
c%irp(irw + 1) = rwnz
end do
end do
!$omp end do
offsets(omp_get_thread_num() + 1) = nnz
!$omp barrier
! possible bottleneck
!$omp single
do k = 1, omp_get_num_threads() - 1
offsets(k + 1) = offsets(k + 1) + offsets(k)
end do
!$omp end single
!$omp barrier
if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if
do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do
!$omp barrier
!$omp single
c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma)
call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
!$omp end parallel
end subroutine spmm_omp_gustavson_1d
subroutine spmm_serial_rb_tree(a,b,c,info)
use psb_rb_idx_tree_mod
implicit none
type(psb_s_csr_sparse_mat), intent(in) :: a,b
type(psb_s_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: a_m, b_n
integer(psb_ipk_) :: row, col
type(psb_s_rb_idx_tree), allocatable :: row_accs(:)
a_m = a%get_nrows()
b_n = b%get_ncols()
allocate(row_accs(a_m))
call c%allocate(a_m, b_n)
do row = 1, a_m
row_accs(row)%nnz = 0
nullify(row_accs(row)%root)
do col = a%irp(row), a%irp(row + 1) - 1
call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col))
end do
end do
call psb_rb_idx_tree_merge(row_accs, c)
deallocate(row_accs)
info = 0
end subroutine spmm_serial_rb_tree
subroutine spmm_omp_rb_tree(a,b,c,info)
use omp_lib
use psb_rb_idx_tree_mod
implicit none
type(psb_s_csr_sparse_mat), intent(in) :: a,b
type(psb_s_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: a_m, b_n
integer(psb_ipk_) :: row, col
type(psb_s_rb_idx_tree), allocatable :: row_accs(:)
real(8) :: tic, toc
a_m = a%get_nrows()
b_n = b%get_ncols()
call c%allocate(a_m, b_n)
allocate(row_accs(a_m))
call c%allocate(a_m, b_n)
!$omp parallel do schedule(static)
do row = 1, a_m
row_accs(row)%nnz = 0
nullify(row_accs(row)%root)
do col = a%irp(row), a%irp(row + 1) - 1
call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col))
end do
end do
!$omp end parallel do
call psb_rb_idx_tree_merge(row_accs, c)
deallocate(row_accs)
info = 0
end subroutine spmm_omp_rb_tree
subroutine compute_indices(a, b, c, info)
implicit none
type(psb_s_csr_sparse_mat), intent(in) :: a,b
type(psb_s_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer :: full_mat_bound
integer :: row, col, i, j, k, nnz
full_mat_bound = 0
!omp parallel do schedule(static) reduction(+:full_mat_bound)
do row = 1, a%get_nrows()
do col = a%irp(row), a%irp(row + 1) - 1
j = a%ja(col)
full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j)
end do
end do
!omp end parallel do
call psb_realloc(a%get_nrows() + 1, c%irp, info)
call psb_realloc(full_mat_bound, c%ja, info)
c%ja = 0
c%irp(1) = 1
nnz = 0
do row = 1, a%get_nrows()
do col = a%irp(row), a%irp(row + 1) - 1
do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1
k = 0
do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i))
k = k + 1
end do
if (c%ja(c%irp(row) + k) == 0) then
c%ja(c%irp(row)+k) = b%ja(i)
nnz = nnz + 1
end if
end do
end do
c%irp(row + 1) = nnz + 1
call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1))
end do
call psb_realloc(nnz, c%ja, info)
call psb_realloc(nnz, c%val, info)
c%val = 0
end subroutine compute_indices
subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num)
type(psb_s_csr_sparse_mat), intent(inout) :: out_mat
integer(psb_ipk_), intent(in) :: out_row_num
real(psb_spk_), intent(in) :: scalar
type(psb_s_csr_sparse_mat), intent(in) :: mat
integer(psb_ipk_), intent(in) :: trgt_row_num
integer(psb_ipk_) :: i, k, row_start, row_end
row_start = out_mat%irp(out_row_num)
row_end = out_mat%irp(out_row_num + 1) - 1
do i = mat%irp(trgt_row_num), mat%irp(trgt_row_num + 1) - 1
do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1
if (out_mat%ja(k) == mat%ja(i)) then
out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i)
exit
end if
end do
end do
end subroutine direct_scalar_sparse_row_mul
subroutine spmm_omp_two_pass(a,b,c,info)
use omp_lib
implicit none
type(psb_s_csr_sparse_mat), intent(in) :: a,b
type(psb_s_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: a_m, b_n, row, col
a_m = a%get_nrows()
b_n = b%get_ncols()
call c%allocate(a_m, b_n)
call compute_indices(a, b, c, info)
!$omp parallel do schedule(static)
do row = 1, a_m
do col = a%irp(row), a%irp(row + 1) - 1
call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col))
end do
end do
!$omp end parallel do
end subroutine spmm_omp_two_pass
end subroutine psb_scsrspspmm
#else
subroutine psb_scsrspspmm(a,b,c,info) subroutine psb_scsrspspmm(a,b,c,info)
use psb_s_mat_mod use psb_s_mat_mod
use psb_serial_mod, psb_protect_name => psb_scsrspspmm use psb_serial_mod, psb_protect_name => psb_scsrspspmm
@ -3776,7 +4310,7 @@ contains
end subroutine csr_spspmm end subroutine csr_spspmm
end subroutine psb_scsrspspmm end subroutine psb_scsrspspmm
#endif
! !
! !

@ -0,0 +1,329 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
!
! package: psb_s_rb_idx_tree_impl
!
! Red black tree implementation ordered by index
!
! Each node contains and index and a double precision value
!
! The tree should always be well balanced
!
! inserting a node with an existing index will
! add up the new value to the old one
! Contributed by Dimitri Walther
!
subroutine psb_s_rb_idx_tree_insert(this, idx, val)
use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_insert
implicit none
class(psb_s_rb_idx_tree), intent(inout) :: this
integer(psb_ipk_), intent(in) :: idx
real(psb_spk_), intent(in) :: val
character(len=22) :: name
type(psb_s_rb_idx_node), pointer :: new_node
type(psb_s_rb_idx_node), pointer :: current, previous
name='psb_rb_idx_tree_insert'
allocate(new_node)
new_node%idx = idx
new_node%val = val
nullify(new_node%left)
nullify(new_node%right)
nullify(new_node%parent)
new_node%is_red = .true.
if (.not. associated(this%root)) then
this%root => new_node
this%nnz = 1
new_node%is_red = .false.
return
end if
current => this%root
do while (associated(current))
previous => current
if (idx == current%idx) then
current%val = current%val + val
deallocate(new_node)
return
else if (idx < current%idx) then
current => current%left
else
current => current%right
end if
end do
if (idx < previous%idx) then
new_node%parent => previous
previous%left => new_node
else
new_node%parent => previous
previous%right => new_node
end if
call psb_s_rb_idx_tree_fix_insertion(this, new_node)
this%nnz = this%nnz + 1
end subroutine psb_s_rb_idx_tree_insert
subroutine psb_s_rb_idx_tree_fix_insertion(this, node)
use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_fix_insertion
implicit none
class(psb_s_rb_idx_tree), intent(inout) :: this
type(psb_s_rb_idx_node), pointer, intent(inout) :: node
character(len=29) :: name
type(psb_s_rb_idx_node), pointer :: current, parent, grand_parent, uncle
name = 'psb_rb_idx_tree_fix_insertion'
current => node
parent => current%parent
do while(associated(parent) .and. parent%is_red)
! grand parent exist because root can't be red
grand_parent => parent%parent
if (parent%idx < grand_parent%idx) then
uncle => grand_parent%right
else
uncle => grand_parent%left
end if
if (associated(uncle) .and. uncle%is_red) then
parent%is_red = .false.
uncle%is_red = .false.
grand_parent%is_red = .true.
current => grand_parent
parent => current%parent
! Left-Left case
else if (current%idx < parent%idx .and. &
parent%idx < grand_parent%idx) then
call psb_s_rb_idx_tree_rotate_right(grand_parent)
call psb_s_rb_idx_tree_swap_colors(parent, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => parent
return
! Left-Right case
else if (current%idx > parent%idx .and. &
parent%idx < grand_parent%idx) then
call psb_s_rb_idx_tree_rotate_left(parent)
call psb_s_rb_idx_tree_rotate_right(grand_parent)
call psb_s_rb_idx_tree_swap_colors(current, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => current
return
! Right-Right case
else if (current%idx > parent%idx .and. &
parent%idx > grand_parent%idx) then
call psb_s_rb_idx_tree_rotate_left(grand_parent)
call psb_s_rb_idx_tree_swap_colors(parent, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => parent
return
! Right-Left case
else
call psb_s_rb_idx_tree_rotate_right(parent)
call psb_s_rb_idx_tree_rotate_left(grand_parent)
call psb_s_rb_idx_tree_swap_colors(current, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => current
return
end if
end do
this%root%is_red = .false.
end subroutine psb_s_rb_idx_tree_fix_insertion
subroutine psb_s_rb_idx_tree_swap_colors(n1, n2)
use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_swap_colors
implicit none
type(psb_s_rb_idx_node), pointer, intent(inout) :: n1, n2
character(len=27) :: name
logical :: tmp
name='psb_rb_idx_tree_swap_colors'
tmp = n1%is_red
n1%is_red = n2%is_red
n2%is_red = tmp
end subroutine psb_s_rb_idx_tree_swap_colors
subroutine psb_s_rb_idx_tree_rotate_right(node)
use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_rotate_right
implicit none
type(psb_s_rb_idx_node), pointer, intent(inout) :: node
character(len=28) :: name
type(psb_s_rb_idx_node), pointer :: l, lr
name='psb_rb_idx_tree_rotate_right'
if (.not. associated(node%left)) return
l => node%left
lr => l%right
node%left => lr
if (associated(lr)) lr%parent => node
if (associated(node%parent)) then
if (node%idx < node%parent%idx) then
node%parent%left => l
else
node%parent%right => l
end if
end if
l%parent => node%parent
node%parent => l
l%right => node
end subroutine psb_s_rb_idx_tree_rotate_right
subroutine psb_s_rb_idx_tree_rotate_left(node)
use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_rotate_left
implicit none
type(psb_s_rb_idx_node), pointer, intent(inout) :: node
character(len=27) :: name
type(psb_s_rb_idx_node), pointer :: r, rl
name='psb_rb_idx_tree_rotate_left'
if (.not. associated(node%right)) return
r => node%right
rl => r%left
node%right => rl
if (associated(rl)) rl%parent => node
if (associated(node%parent)) then
if (node%idx < node%parent%idx) then
node%parent%left => r
else
node%parent%right => r
end if
end if
r%parent => node%parent
node%parent => r
r%left => node
end subroutine psb_s_rb_idx_tree_rotate_left
subroutine psb_s_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num)
use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_scalar_sparse_row_mul
use psb_s_csr_mat_mod, only : psb_s_csr_sparse_mat
implicit none
type(psb_s_rb_idx_tree), intent(inout) :: tree
real(psb_spk_), intent(in) :: scalar
type(psb_s_csr_sparse_mat), intent(in) :: mat
integer(psb_ipk_), intent(in) :: row_num
character(len=37) :: name
integer(psb_ipk_) :: i
name='psb_rb_idx_tree_scalar_sparse_row_mul'
do i = mat%irp(row_num), mat%irp(row_num + 1) - 1
call tree%insert(mat%ja(i),scalar * mat%val(i))
end do
end subroutine psb_s_rb_idx_tree_scalar_sparse_row_mul
subroutine psb_s_rb_idx_tree_merge(trees, mat)
#if defined(OPENMP)
use omp_lib
#endif
use psb_realloc_mod
use psb_s_rb_idx_tree_mod, psb_protect_name => psb_s_rb_idx_tree_merge
use psb_s_csr_mat_mod, only : psb_s_csr_sparse_mat
implicit none
type(psb_s_rb_idx_tree), allocatable, intent(inout) :: trees(:)
type(psb_s_csr_sparse_mat), intent(inout) :: mat
character(len=21) :: name
integer(psb_ipk_) :: i, j, rows, info, nnz
type(psb_s_rb_idx_node), pointer :: current, previous
name='psb_rb_idx_tree_merge'
rows = size(trees)
mat%irp(1) = 1
do i=1, rows
mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz
end do
nnz = mat%irp(rows + 1)
call psb_realloc(nnz, mat%val, info)
call psb_realloc(nnz, mat%ja, info)
#if defined(OPENMP)
!$omp parallel do schedule(static), private(current, previous, j)
#endif
do i = 1, size(trees)
j = 0
current => trees(i)%root
do while(associated(current))
! go to the left-most node
do while(associated(current%left))
current => current%left
end do
mat%val(j + mat%irp(i)) = current%val
mat%ja(j + mat%irp(i)) = current%idx
j = j + 1
previous => current
if (associated(current%right)) then
if (associated(current%parent)) then
current%parent%left => current%right
end if
current%right%parent => current%parent
current => current%right
else
current => current%parent
if (associated(current)) nullify(current%left)
end if
deallocate(previous)
end do
end do
#if defined(OPENMP)
!$omp end parallel do
#endif
end subroutine psb_s_rb_idx_tree_merge

@ -3495,7 +3495,7 @@ subroutine psb_z_cp_csr_to_fmt(a,b,info)
if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info) if (info == 0) call psb_safe_cpy( a%ja(1:nz), b%ja , info)
if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info) if (info == 0) call psb_safe_cpy( a%val(1:nz), b%val , info)
else else
! Despite the implementation in safe_cpy, it seems better this way ! Despite the implementation in safe_cpy, it seems better this way
call psb_realloc(nr+1,b%irp,info) call psb_realloc(nr+1,b%irp,info)
call psb_realloc(nz,b%ja,info) call psb_realloc(nz,b%ja,info)
call psb_realloc(nz,b%val,info) call psb_realloc(nz,b%val,info)
@ -3600,7 +3600,7 @@ subroutine psb_z_cp_csr_from_fmt(a,b,info)
if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info) if (info == 0) call psb_safe_cpy( b%ja(1:nz) , a%ja , info)
if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info) if (info == 0) call psb_safe_cpy( b%val(1:nz) , a%val , info)
else else
! Despite the implementation in safe_cpy, it seems better this way ! Despite the implementation in safe_cpy, it seems better this way
call psb_realloc(nr+1,a%irp,info) call psb_realloc(nr+1,a%irp,info)
call psb_realloc(nz,a%ja,info) call psb_realloc(nz,a%ja,info)
call psb_realloc(nz,a%val,info) call psb_realloc(nz,a%val,info)
@ -3654,6 +3654,540 @@ subroutine psb_z_csr_clean_zeros(a, info)
call a%set_host() call a%set_host()
end subroutine psb_z_csr_clean_zeros end subroutine psb_z_csr_clean_zeros
#if defined(OPENMP)
subroutine psb_zcsrspspmm(a,b,c,info)
use psb_z_mat_mod
use psb_serial_mod, psb_protect_name => psb_zcsrspspmm
implicit none
class(psb_z_csr_sparse_mat), intent(in) :: a,b
type(psb_z_csr_sparse_mat), intent(out) :: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: ma,na,mb,nb, nzc, nza, nzb
character(len=20) :: name
integer(psb_ipk_) :: err_act
name='psb_csrspspmm'
call psb_erractionsave(err_act)
info = psb_success_
if (a%is_dev()) call a%sync()
if (b%is_dev()) call b%sync()
ma = a%get_nrows()
na = a%get_ncols()
mb = b%get_nrows()
nb = b%get_ncols()
if ( mb /= na ) then
write(psb_err_unit,*) 'Mismatch in SPSPMM: ',ma,na,mb,nb
info = psb_err_invalid_matrix_sizes_
call psb_errpush(info,name)
goto 9999
endif
select case(spspmm_impl)
case (spspmm_serial)
! Estimate number of nonzeros on output.
nza = a%get_nzeros()
nzb = b%get_nzeros()
nzc = 2*(nza+nzb)
call c%allocate(ma,nb,nzc)
call csr_spspmm(a,b,c,info)
case (spspmm_omp_gustavson)
call spmm_omp_gustavson(a,b,c,info)
case (spspmm_omp_gustavson_1d)
call spmm_omp_gustavson_1d(a,b,c,info)
case (spspmm_serial_rb_tree)
call spmm_serial_rb_tree(a,b,c,info)
case (spspmm_omp_rb_tree)
call spmm_omp_rb_tree(a,b,c,info)
case (spspmm_omp_two_pass)
call spmm_omp_two_pass(a,b,c,info)
case default
write(psb_err_unit,*) 'Unknown spspmm implementation'
! push error
goto 9999
end select
call c%set_asb()
call c%set_host()
call psb_erractionrestore(err_act)
return
9999 call psb_error_handler(err_act)
return
contains
subroutine csr_spspmm(a,b,c,info)
implicit none
type(psb_z_csr_sparse_mat), intent(in) :: a,b
type(psb_z_csr_sparse_mat), intent(inout) :: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: ma,na,mb,nb
integer(psb_ipk_), allocatable :: irow(:), idxs(:)
complex(psb_dpk_), allocatable :: row(:)
integer(psb_ipk_) :: i,j,k,irw,icl,icf, iret, &
& nzc,nnzre, isz, ipb, irwsz, nrc, nze
complex(psb_dpk_) :: cfb
info = psb_success_
ma = a%get_nrows()
na = a%get_ncols()
mb = b%get_nrows()
nb = b%get_ncols()
nze = min(size(c%val),size(c%ja))
isz = max(ma,na,mb,nb)
call psb_realloc(isz,row,info)
if (info == 0) call psb_realloc(isz,idxs,info)
if (info == 0) call psb_realloc(isz,irow,info)
if (info /= 0) return
row = dzero
irow = 0
nzc = 1
do j = 1,ma
c%irp(j) = nzc
nrc = 0
do k = a%irp(j), a%irp(j+1)-1
irw = a%ja(k)
cfb = a%val(k)
irwsz = b%irp(irw+1)-b%irp(irw)
do i = b%irp(irw),b%irp(irw+1)-1
icl = b%ja(i)
if (irow(icl)<j) then
nrc = nrc + 1
idxs(nrc) = icl
irow(icl) = j
end if
row(icl) = row(icl) + cfb*b%val(i)
end do
end do
if (nrc > 0 ) then
if ((nzc+nrc)>nze) then
nze = max(ma*((nzc+j-1)/j),nzc+2*nrc)
call psb_realloc(nze,c%val,info)
if (info == 0) call psb_realloc(nze,c%ja,info)
if (info /= 0) return
end if
call psb_qsort(idxs(1:nrc))
do i=1, nrc
irw = idxs(i)
c%ja(nzc) = irw
c%val(nzc) = row(irw)
row(irw) = dzero
nzc = nzc + 1
end do
end if
end do
c%irp(ma+1) = nzc
end subroutine csr_spspmm
! gustavson's algorithm using perfect hashing
! and OpenMP parallelisation
subroutine spmm_omp_gustavson(a,b,c,info)
use omp_lib
implicit none
type(psb_z_csr_sparse_mat), intent(in) :: a,b
type(psb_z_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
complex(psb_dpk_), allocatable :: vals(:), acc(:)
integer(psb_ipk_) :: ma, nb
integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
ma = a%get_nrows()
nb = b%get_ncols()
call c%allocate(ma, nb)
c%irp(1) = 1
! dense accumulator
! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
call psb_realloc(nb, acc, info)
allocate(offsets(omp_get_max_threads()))
!$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
!$omp shared(a,b,c,offsets)
thread_upperbound = 0
start_idx = 0
!$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma
if (start_idx == 0) then
start_idx = irw
end if
end_idx = irw
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j)
end do
end do
!$omp end do
call psb_realloc(thread_upperbound, vals, info)
call psb_realloc(thread_upperbound, col_inds, info)
! possible bottleneck
acc = 0
nnz = 0
!$omp do schedule(static) private(irw, jj, j, k)
do irw = 1, ma
rwnz = 0
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
do k = b%irp(j), b%irp(j + 1) - 1
if (acc(b%ja(k)) == 0) then
nnz = nnz + 1
rwnz = rwnz + 1
col_inds(nnz) = b%ja(k)
end if
acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k)
end do
end do
call psb_qsort(col_inds(nnz - rwnz + 1:nnz))
do k = nnz - rwnz + 1, nnz
vals(k) = acc(col_inds(k))
acc(col_inds(k)) = 0
end do
c%irp(irw + 1) = rwnz
end do
!$omp end do
offsets(omp_get_thread_num() + 1) = nnz
!$omp barrier
! possible bottleneck
!$omp single
do k = 1, omp_get_num_threads() - 1
offsets(k + 1) = offsets(k + 1) + offsets(k)
end do
!$omp end single
!$omp barrier
if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if
do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do
!$omp barrier
!$omp single
c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma)
call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
!$omp end parallel
end subroutine spmm_omp_gustavson
subroutine spmm_omp_gustavson_1d(a,b,c,info)
use omp_lib
implicit none
type(psb_z_csr_sparse_mat), intent(in) :: a,b
type(psb_z_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
complex(psb_dpk_), allocatable :: vals(:), acc(:)
integer(psb_ipk_) :: ma, nb
integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, &
start_idx, end_idx , blk, blk_size, rwstart,&
rwblk, rwblkrem, nblks
ma = a%get_nrows()
nb = b%get_ncols()
call c%allocate(ma, nb)
c%irp(1) = 1
! dense accumulator
! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
call psb_realloc(nb, acc, info)
allocate(offsets(omp_get_max_threads()))
nblks = 4 * omp_get_max_threads()
rwblk = (ma / nblks)
rwblkrem = modulo(ma, nblks)
!$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
thread_upperbound = 0
start_idx = 0
!$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j)
end do
end do
!$omp end do
call psb_realloc(thread_upperbound, vals, info)
call psb_realloc(thread_upperbound, col_inds, info)
! possible bottleneck
acc = 0
nnz = 0
!$omp do schedule(static) private(irw,jj,j,k,rwnz,blk,blk_size,rwstart)
do blk = 0, nblks - 1
if (blk < rwblkrem) then
blk_size = rwblk + 1
rwstart = blk * rwblk + blk + 1
else
blk_size = rwblk
rwstart = blk * rwblk &
+ rwblkrem + 1
end if
do irw = rwstart, rwstart + blk_size - 1
if (start_idx == 0) then
start_idx = irw
end if
end_idx = irw
rwnz = 0
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
do k = b%irp(j), b%irp(j + 1) - 1
if (acc(b%ja(k)) == 0) then
nnz = nnz + 1
rwnz = rwnz + 1
col_inds(nnz) = b%ja(k)
end if
acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k)
end do
end do
call psb_qsort(col_inds(nnz - rwnz + 1:nnz))
do k = nnz - rwnz + 1, nnz
vals(k) = acc(col_inds(k))
acc(col_inds(k)) = 0
end do
c%irp(irw + 1) = rwnz
end do
end do
!$omp end do
offsets(omp_get_thread_num() + 1) = nnz
!$omp barrier
! possible bottleneck
!$omp single
do k = 1, omp_get_num_threads() - 1
offsets(k + 1) = offsets(k + 1) + offsets(k)
end do
!$omp end single
!$omp barrier
if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if
do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do
!$omp barrier
!$omp single
c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma)
call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
!$omp end parallel
end subroutine spmm_omp_gustavson_1d
subroutine spmm_serial_rb_tree(a,b,c,info)
use psb_rb_idx_tree_mod
implicit none
type(psb_z_csr_sparse_mat), intent(in) :: a,b
type(psb_z_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: a_m, b_n
integer(psb_ipk_) :: row, col
type(psb_z_rb_idx_tree), allocatable :: row_accs(:)
a_m = a%get_nrows()
b_n = b%get_ncols()
allocate(row_accs(a_m))
call c%allocate(a_m, b_n)
do row = 1, a_m
row_accs(row)%nnz = 0
nullify(row_accs(row)%root)
do col = a%irp(row), a%irp(row + 1) - 1
call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col))
end do
end do
call psb_rb_idx_tree_merge(row_accs, c)
deallocate(row_accs)
info = 0
end subroutine spmm_serial_rb_tree
subroutine spmm_omp_rb_tree(a,b,c,info)
use omp_lib
use psb_rb_idx_tree_mod
implicit none
type(psb_z_csr_sparse_mat), intent(in) :: a,b
type(psb_z_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: a_m, b_n
integer(psb_ipk_) :: row, col
type(psb_z_rb_idx_tree), allocatable :: row_accs(:)
real(8) :: tic, toc
a_m = a%get_nrows()
b_n = b%get_ncols()
call c%allocate(a_m, b_n)
allocate(row_accs(a_m))
call c%allocate(a_m, b_n)
!$omp parallel do schedule(static)
do row = 1, a_m
row_accs(row)%nnz = 0
nullify(row_accs(row)%root)
do col = a%irp(row), a%irp(row + 1) - 1
call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col))
end do
end do
!$omp end parallel do
call psb_rb_idx_tree_merge(row_accs, c)
deallocate(row_accs)
info = 0
end subroutine spmm_omp_rb_tree
subroutine compute_indices(a, b, c, info)
implicit none
type(psb_z_csr_sparse_mat), intent(in) :: a,b
type(psb_z_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer :: full_mat_bound
integer :: row, col, i, j, k, nnz
full_mat_bound = 0
!omp parallel do schedule(static) reduction(+:full_mat_bound)
do row = 1, a%get_nrows()
do col = a%irp(row), a%irp(row + 1) - 1
j = a%ja(col)
full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j)
end do
end do
!omp end parallel do
call psb_realloc(a%get_nrows() + 1, c%irp, info)
call psb_realloc(full_mat_bound, c%ja, info)
c%ja = 0
c%irp(1) = 1
nnz = 0
do row = 1, a%get_nrows()
do col = a%irp(row), a%irp(row + 1) - 1
do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1
k = 0
do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i))
k = k + 1
end do
if (c%ja(c%irp(row) + k) == 0) then
c%ja(c%irp(row)+k) = b%ja(i)
nnz = nnz + 1
end if
end do
end do
c%irp(row + 1) = nnz + 1
call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1))
end do
call psb_realloc(nnz, c%ja, info)
call psb_realloc(nnz, c%val, info)
c%val = 0
end subroutine compute_indices
subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num)
type(psb_z_csr_sparse_mat), intent(inout) :: out_mat
integer(psb_ipk_), intent(in) :: out_row_num
complex(psb_dpk_), intent(in) :: scalar
type(psb_z_csr_sparse_mat), intent(in) :: mat
integer(psb_ipk_), intent(in) :: trgt_row_num
integer(psb_ipk_) :: i, k, row_start, row_end
row_start = out_mat%irp(out_row_num)
row_end = out_mat%irp(out_row_num + 1) - 1
do i = mat%irp(trgt_row_num), mat%irp(trgt_row_num + 1) - 1
do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1
if (out_mat%ja(k) == mat%ja(i)) then
out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i)
exit
end if
end do
end do
end subroutine direct_scalar_sparse_row_mul
subroutine spmm_omp_two_pass(a,b,c,info)
use omp_lib
implicit none
type(psb_z_csr_sparse_mat), intent(in) :: a,b
type(psb_z_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: a_m, b_n, row, col
a_m = a%get_nrows()
b_n = b%get_ncols()
call c%allocate(a_m, b_n)
call compute_indices(a, b, c, info)
!$omp parallel do schedule(static)
do row = 1, a_m
do col = a%irp(row), a%irp(row + 1) - 1
call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col))
end do
end do
!$omp end parallel do
end subroutine spmm_omp_two_pass
end subroutine psb_zcsrspspmm
#else
subroutine psb_zcsrspspmm(a,b,c,info) subroutine psb_zcsrspspmm(a,b,c,info)
use psb_z_mat_mod use psb_z_mat_mod
use psb_serial_mod, psb_protect_name => psb_zcsrspspmm use psb_serial_mod, psb_protect_name => psb_zcsrspspmm
@ -3776,7 +4310,7 @@ contains
end subroutine csr_spspmm end subroutine csr_spspmm
end subroutine psb_zcsrspspmm end subroutine psb_zcsrspspmm
#endif
! !
! !

@ -0,0 +1,329 @@
!
! Parallel Sparse BLAS version 3.5
! (C) Copyright 2006-2018
! Salvatore Filippone
! Alfredo Buttari
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions
! are met:
! 1. Redistributions of source code must retain the above copyright
! notice, this list of conditions and the following disclaimer.
! 2. Redistributions in binary form must reproduce the above copyright
! notice, this list of conditions, and the following disclaimer in the
! documentation and/or other materials provided with the distribution.
! 3. The name of the PSBLAS group or the names of its contributors may
! not be used to endorse or promote products derived from this
! software without specific written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! POSSIBILITY OF SUCH DAMAGE.
!
!
!
! package: psb_z_rb_idx_tree_impl
!
! Red black tree implementation ordered by index
!
! Each node contains and index and a double precision value
!
! The tree should always be well balanced
!
! inserting a node with an existing index will
! add up the new value to the old one
! Contributed by Dimitri Walther
!
subroutine psb_z_rb_idx_tree_insert(this, idx, val)
use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_insert
implicit none
class(psb_z_rb_idx_tree), intent(inout) :: this
integer(psb_ipk_), intent(in) :: idx
complex(psb_dpk_), intent(in) :: val
character(len=22) :: name
type(psb_z_rb_idx_node), pointer :: new_node
type(psb_z_rb_idx_node), pointer :: current, previous
name='psb_rb_idx_tree_insert'
allocate(new_node)
new_node%idx = idx
new_node%val = val
nullify(new_node%left)
nullify(new_node%right)
nullify(new_node%parent)
new_node%is_red = .true.
if (.not. associated(this%root)) then
this%root => new_node
this%nnz = 1
new_node%is_red = .false.
return
end if
current => this%root
do while (associated(current))
previous => current
if (idx == current%idx) then
current%val = current%val + val
deallocate(new_node)
return
else if (idx < current%idx) then
current => current%left
else
current => current%right
end if
end do
if (idx < previous%idx) then
new_node%parent => previous
previous%left => new_node
else
new_node%parent => previous
previous%right => new_node
end if
call psb_z_rb_idx_tree_fix_insertion(this, new_node)
this%nnz = this%nnz + 1
end subroutine psb_z_rb_idx_tree_insert
subroutine psb_z_rb_idx_tree_fix_insertion(this, node)
use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_fix_insertion
implicit none
class(psb_z_rb_idx_tree), intent(inout) :: this
type(psb_z_rb_idx_node), pointer, intent(inout) :: node
character(len=29) :: name
type(psb_z_rb_idx_node), pointer :: current, parent, grand_parent, uncle
name = 'psb_rb_idx_tree_fix_insertion'
current => node
parent => current%parent
do while(associated(parent) .and. parent%is_red)
! grand parent exist because root can't be red
grand_parent => parent%parent
if (parent%idx < grand_parent%idx) then
uncle => grand_parent%right
else
uncle => grand_parent%left
end if
if (associated(uncle) .and. uncle%is_red) then
parent%is_red = .false.
uncle%is_red = .false.
grand_parent%is_red = .true.
current => grand_parent
parent => current%parent
! Left-Left case
else if (current%idx < parent%idx .and. &
parent%idx < grand_parent%idx) then
call psb_z_rb_idx_tree_rotate_right(grand_parent)
call psb_z_rb_idx_tree_swap_colors(parent, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => parent
return
! Left-Right case
else if (current%idx > parent%idx .and. &
parent%idx < grand_parent%idx) then
call psb_z_rb_idx_tree_rotate_left(parent)
call psb_z_rb_idx_tree_rotate_right(grand_parent)
call psb_z_rb_idx_tree_swap_colors(current, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => current
return
! Right-Right case
else if (current%idx > parent%idx .and. &
parent%idx > grand_parent%idx) then
call psb_z_rb_idx_tree_rotate_left(grand_parent)
call psb_z_rb_idx_tree_swap_colors(parent, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => parent
return
! Right-Left case
else
call psb_z_rb_idx_tree_rotate_right(parent)
call psb_z_rb_idx_tree_rotate_left(grand_parent)
call psb_z_rb_idx_tree_swap_colors(current, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => current
return
end if
end do
this%root%is_red = .false.
end subroutine psb_z_rb_idx_tree_fix_insertion
subroutine psb_z_rb_idx_tree_swap_colors(n1, n2)
use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_swap_colors
implicit none
type(psb_z_rb_idx_node), pointer, intent(inout) :: n1, n2
character(len=27) :: name
logical :: tmp
name='psb_rb_idx_tree_swap_colors'
tmp = n1%is_red
n1%is_red = n2%is_red
n2%is_red = tmp
end subroutine psb_z_rb_idx_tree_swap_colors
subroutine psb_z_rb_idx_tree_rotate_right(node)
use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_rotate_right
implicit none
type(psb_z_rb_idx_node), pointer, intent(inout) :: node
character(len=28) :: name
type(psb_z_rb_idx_node), pointer :: l, lr
name='psb_rb_idx_tree_rotate_right'
if (.not. associated(node%left)) return
l => node%left
lr => l%right
node%left => lr
if (associated(lr)) lr%parent => node
if (associated(node%parent)) then
if (node%idx < node%parent%idx) then
node%parent%left => l
else
node%parent%right => l
end if
end if
l%parent => node%parent
node%parent => l
l%right => node
end subroutine psb_z_rb_idx_tree_rotate_right
subroutine psb_z_rb_idx_tree_rotate_left(node)
use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_rotate_left
implicit none
type(psb_z_rb_idx_node), pointer, intent(inout) :: node
character(len=27) :: name
type(psb_z_rb_idx_node), pointer :: r, rl
name='psb_rb_idx_tree_rotate_left'
if (.not. associated(node%right)) return
r => node%right
rl => r%left
node%right => rl
if (associated(rl)) rl%parent => node
if (associated(node%parent)) then
if (node%idx < node%parent%idx) then
node%parent%left => r
else
node%parent%right => r
end if
end if
r%parent => node%parent
node%parent => r
r%left => node
end subroutine psb_z_rb_idx_tree_rotate_left
subroutine psb_z_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num)
use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_scalar_sparse_row_mul
use psb_z_csr_mat_mod, only : psb_z_csr_sparse_mat
implicit none
type(psb_z_rb_idx_tree), intent(inout) :: tree
complex(psb_dpk_), intent(in) :: scalar
type(psb_z_csr_sparse_mat), intent(in) :: mat
integer(psb_ipk_), intent(in) :: row_num
character(len=37) :: name
integer(psb_ipk_) :: i
name='psb_rb_idx_tree_scalar_sparse_row_mul'
do i = mat%irp(row_num), mat%irp(row_num + 1) - 1
call tree%insert(mat%ja(i),scalar * mat%val(i))
end do
end subroutine psb_z_rb_idx_tree_scalar_sparse_row_mul
subroutine psb_z_rb_idx_tree_merge(trees, mat)
#if defined(OPENMP)
use omp_lib
#endif
use psb_realloc_mod
use psb_z_rb_idx_tree_mod, psb_protect_name => psb_z_rb_idx_tree_merge
use psb_z_csr_mat_mod, only : psb_z_csr_sparse_mat
implicit none
type(psb_z_rb_idx_tree), allocatable, intent(inout) :: trees(:)
type(psb_z_csr_sparse_mat), intent(inout) :: mat
character(len=21) :: name
integer(psb_ipk_) :: i, j, rows, info, nnz
type(psb_z_rb_idx_node), pointer :: current, previous
name='psb_rb_idx_tree_merge'
rows = size(trees)
mat%irp(1) = 1
do i=1, rows
mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz
end do
nnz = mat%irp(rows + 1)
call psb_realloc(nnz, mat%val, info)
call psb_realloc(nnz, mat%ja, info)
#if defined(OPENMP)
!$omp parallel do schedule(static), private(current, previous, j)
#endif
do i = 1, size(trees)
j = 0
current => trees(i)%root
do while(associated(current))
! go to the left-most node
do while(associated(current%left))
current => current%left
end do
mat%val(j + mat%irp(i)) = current%val
mat%ja(j + mat%irp(i)) = current%idx
j = j + 1
previous => current
if (associated(current%right)) then
if (associated(current%parent)) then
current%parent%left => current%right
end if
current%right%parent => current%parent
current => current%right
else
current => current%parent
if (associated(current)) nullify(current%left)
end if
deallocate(previous)
end do
end do
#if defined(OPENMP)
!$omp end parallel do
#endif
end subroutine psb_z_rb_idx_tree_merge

@ -3,7 +3,6 @@ include ../../../Make.inc
# #
# The object files # The object files
# #
BOBJS=psi_lcx_mod.o psi_alcx_mod.o psi_acx_mod.o
IOBJS=psb_m_hsort_impl.o psb_m_isort_impl.o psb_m_msort_impl.o psb_m_qsort_impl.o IOBJS=psb_m_hsort_impl.o psb_m_isort_impl.o psb_m_msort_impl.o psb_m_qsort_impl.o
LOBJS=psb_e_hsort_impl.o psb_e_isort_impl.o psb_e_msort_impl.o psb_e_qsort_impl.o LOBJS=psb_e_hsort_impl.o psb_e_isort_impl.o psb_e_msort_impl.o psb_e_qsort_impl.o
SOBJS=psb_s_hsort_impl.o psb_s_isort_impl.o psb_s_msort_impl.o psb_s_qsort_impl.o SOBJS=psb_s_hsort_impl.o psb_s_isort_impl.o psb_s_msort_impl.o psb_s_qsort_impl.o
@ -11,7 +10,7 @@ DOBJS=psb_d_hsort_impl.o psb_d_isort_impl.o psb_d_msort_impl.o psb_d_qsort_impl.
COBJS=psb_c_hsort_impl.o psb_c_isort_impl.o psb_c_msort_impl.o psb_c_qsort_impl.o COBJS=psb_c_hsort_impl.o psb_c_isort_impl.o psb_c_msort_impl.o psb_c_qsort_impl.o
ZOBJS=psb_z_hsort_impl.o psb_z_isort_impl.o psb_z_msort_impl.o psb_z_qsort_impl.o ZOBJS=psb_z_hsort_impl.o psb_z_isort_impl.o psb_z_msort_impl.o psb_z_qsort_impl.o
OBJS=$(BOBJS) $(SOBJS) $(DOBJS) $(COBJS) $(ZOBJS) $(IOBJS) $(LOBJS) OBJS=$(SOBJS) $(DOBJS) $(COBJS) $(ZOBJS) $(IOBJS) $(LOBJS)
# #
# Where the library should go, and how it is called. # Where the library should go, and how it is called.
@ -35,7 +34,6 @@ lib: objs
# A bit excessive, but safe # A bit excessive, but safe
$(OBJS): $(MODDIR)/psb_base_mod.o $(OBJS): $(MODDIR)/psb_base_mod.o
$(IOBJS) $(LOBJS) $(SOBJS) $(DOBJS) $(COBJS) $(ZOBJS): $(BOBJS)
clean: cleanobjs clean: cleanobjs
veryclean: cleanobjs veryclean: cleanobjs

@ -176,6 +176,7 @@ subroutine psb_cd_inloc(v, ctxt, desc, info, globalcheck,idx,usehash)
end if end if
tmpgidx = 0 tmpgidx = 0
flag_ = 1 flag_ = 1
!$omp parallel do private(i)
do i=1,loc_row do i=1,loc_row
if ((v(i)<1).or.(v(i)>m)) then if ((v(i)<1).or.(v(i)>m)) then
info = psb_err_entry_out_of_bounds_ info = psb_err_entry_out_of_bounds_
@ -215,6 +216,7 @@ subroutine psb_cd_inloc(v, ctxt, desc, info, globalcheck,idx,usehash)
novrl = 0 novrl = 0
norphan = 0 norphan = 0
npr_ov = 0 npr_ov = 0
!$omp parallel do private(i)
do i=1,loc_row do i=1,loc_row
if ((v(i)<1).or.(v(i)>m)) then if ((v(i)<1).or.(v(i)>m)) then
info = psb_err_entry_out_of_bounds_ info = psb_err_entry_out_of_bounds_
@ -222,7 +224,6 @@ subroutine psb_cd_inloc(v, ctxt, desc, info, globalcheck,idx,usehash)
l_err(2) = v(i) l_err(2) = v(i)
l_err(3) = loc_row l_err(3) = loc_row
l_err(4) = m l_err(4) = m
exit
endif endif
vl(i) = v(i) vl(i) = v(i)
end do end do

Loading…
Cancel
Save