added parallel double precision spspmm implementations

omp-walther
wlthr 1 year ago
parent 347352fe1e
commit 1af76c067c

@ -64,7 +64,8 @@ SERIAL_MODS=serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o \
serial/psb_s_base_mat_mod.o serial/psb_s_csr_mat_mod.o serial/psb_s_csc_mat_mod.o serial/psb_s_mat_mod.o \ serial/psb_s_base_mat_mod.o serial/psb_s_csr_mat_mod.o serial/psb_s_csc_mat_mod.o serial/psb_s_mat_mod.o \
serial/psb_d_base_mat_mod.o serial/psb_d_csr_mat_mod.o serial/psb_d_csc_mat_mod.o serial/psb_d_mat_mod.o \ serial/psb_d_base_mat_mod.o serial/psb_d_csr_mat_mod.o serial/psb_d_csc_mat_mod.o serial/psb_d_mat_mod.o \
serial/psb_c_base_mat_mod.o serial/psb_c_csr_mat_mod.o serial/psb_c_csc_mat_mod.o serial/psb_c_mat_mod.o \ serial/psb_c_base_mat_mod.o serial/psb_c_csr_mat_mod.o serial/psb_c_csc_mat_mod.o serial/psb_c_mat_mod.o \
serial/psb_z_base_mat_mod.o serial/psb_z_csr_mat_mod.o serial/psb_z_csc_mat_mod.o serial/psb_z_mat_mod.o serial/psb_z_base_mat_mod.o serial/psb_z_csr_mat_mod.o serial/psb_z_csc_mat_mod.o serial/psb_z_mat_mod.o \
serial/psb_d_rb_idx_tree_mod.o serial/psb_rb_idx_tree_mod.o
#\ #\
# serial/psb_ls_csr_mat_mod.o serial/psb_ld_csr_mat_mod.o serial/psb_lc_csr_mat_mod.o serial/psb_lz_csr_mat_mod.o # serial/psb_ls_csr_mat_mod.o serial/psb_ld_csr_mat_mod.o serial/psb_lc_csr_mat_mod.o serial/psb_lz_csr_mat_mod.o
#\ #\
@ -270,6 +271,9 @@ serial/psb_z_vect_mod.o: serial/psb_z_base_vect_mod.o serial/psb_i_vect_mod.o
serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o serial/psb_c_serial_mod.o serial/psb_z_serial_mod.o: serial/psb_mat_mod.o auxil/psb_string_mod.o auxil/psb_sort_mod.o auxil/psi_serial_mod.o serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o serial/psb_c_serial_mod.o serial/psb_z_serial_mod.o: serial/psb_mat_mod.o auxil/psb_string_mod.o auxil/psb_sort_mod.o auxil/psi_serial_mod.o
serial/psb_vect_mod.o: serial/psb_i_vect_mod.o serial/psb_l_vect_mod.o serial/psb_d_vect_mod.o serial/psb_s_vect_mod.o serial/psb_c_vect_mod.o serial/psb_z_vect_mod.o serial/psb_vect_mod.o: serial/psb_i_vect_mod.o serial/psb_l_vect_mod.o serial/psb_d_vect_mod.o serial/psb_s_vect_mod.o serial/psb_c_vect_mod.o serial/psb_z_vect_mod.o
serial/psb_d_rb_idx_tree_mod.o: serial/psb_d_csr_mat_mod.o
serial/psb_rb_idx_tree_mod.o: serial/psb_d_rb_idx_tree_mod.o
error.o psb_realloc_mod.o: psb_error_mod.o error.o psb_realloc_mod.o: psb_error_mod.o
psb_error_impl.o: psb_penv_mod.o psb_error_impl.o: psb_penv_mod.o
psb_timers_mod.o: psb_penv_mod.o psb_const_mod.o psb_realloc_mod.o psb_error_mod.o psb_timers_mod.o: psb_penv_mod.o psb_const_mod.o psb_realloc_mod.o psb_error_mod.o

@ -46,6 +46,14 @@ module psb_d_csr_mat_mod
use psb_d_base_mat_mod use psb_d_base_mat_mod
integer(psb_ipk_), parameter :: spspmm_serial = 0
integer(psb_ipk_), parameter :: spspmm_omp_gustavson = 1
integer(psb_ipk_), parameter :: spspmm_omp_gustavson_1d = 2
integer(psb_ipk_), parameter :: spspmm_serial_rb_tree = 3
integer(psb_ipk_), parameter :: spspmm_omp_rb_tree = 4
integer(psb_ipk_), parameter :: spspmm_omp_two_pass = 5
integer(psb_ipk_) :: spspmm_impl = spspmm_serial
!> \namespace psb_base_mod \class psb_d_csr_sparse_mat !> \namespace psb_base_mod \class psb_d_csr_sparse_mat
!! \extends psb_d_base_mat_mod::psb_d_base_sparse_mat !! \extends psb_d_base_mat_mod::psb_d_base_sparse_mat
!! !!

@ -79,7 +79,10 @@
module psb_d_mat_mod module psb_d_mat_mod
use psb_d_base_mat_mod use psb_d_base_mat_mod
use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat, psb_ld_csr_sparse_mat use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat, psb_ld_csr_sparse_mat, &
spspmm_impl, spspmm_serial, spspmm_omp_gustavson, &
spspmm_omp_gustavson_1d, spspmm_serial_rb_tree, &
spspmm_omp_rb_tree, spspmm_omp_two_pass
use psb_d_csc_mat_mod, only : psb_d_csc_sparse_mat, psb_ld_csc_sparse_mat use psb_d_csc_mat_mod, only : psb_d_csc_sparse_mat, psb_ld_csc_sparse_mat
type :: psb_dspmat_type type :: psb_dspmat_type

@ -0,0 +1,91 @@
! Red black tree implementation ordered by index
!
! Each node contains and index and a double precision value
!
! The tree should always be well balanced
!
! inserting a node with an existing index will
! add up the new value to the old one
module psb_d_rb_idx_tree_mod
use psb_const_mod
implicit none
type :: psb_d_rb_idx_node
integer(psb_ipk_) :: idx
real(psb_dpk_) :: val
type(psb_d_rb_idx_node), pointer :: left, right, parent
logical :: is_red
end type psb_d_rb_idx_node
type :: psb_d_rb_idx_tree
type(psb_d_rb_idx_node), pointer :: root
integer(psb_ipk_) :: nnz
contains
procedure :: insert => psb_d_rb_idx_tree_insert
end type psb_d_rb_idx_tree
interface psb_rb_idx_tree_insert
subroutine psb_d_rb_idx_tree_insert(this, idx, val)
import :: psb_ipk_, psb_dpk_, psb_d_rb_idx_tree
implicit none
class(psb_d_rb_idx_tree), intent(inout) :: this
integer(psb_ipk_), intent(in) :: idx
real(psb_dpk_), intent(in) :: val
end subroutine psb_d_rb_idx_tree_insert
end interface psb_rb_idx_tree_insert
interface psb_rb_idx_tree_scalar_sparse_row_mul
subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num)
use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat
import :: psb_ipk_, psb_dpk_, psb_d_rb_idx_tree
implicit none
type(psb_d_rb_idx_tree), intent(inout) :: tree
real(psb_dpk_), intent(in) :: scalar
type(psb_d_csr_sparse_mat), intent(in) :: mat
integer(psb_ipk_), intent(in) :: row_num
end subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul
end interface psb_rb_idx_tree_scalar_sparse_row_mul
interface psb_rb_idx_tree_merge
subroutine psb_d_rb_idx_tree_merge(trees, mat)
use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat
import :: psb_d_rb_idx_tree
type(psb_d_rb_idx_tree), allocatable, intent(inout) :: trees(:)
type(psb_d_csr_sparse_mat), intent(inout) :: mat
end subroutine psb_d_rb_idx_tree_merge
end interface psb_rb_idx_tree_merge
interface psb_rb_idx_tree_fix_insertion
subroutine psb_d_rb_idx_tree_fix_insertion(this, node)
import :: psb_d_rb_idx_tree, psb_d_rb_idx_node
implicit none
class(psb_d_rb_idx_tree), intent(inout) :: this
type(psb_d_rb_idx_node), pointer, intent(inout) :: node
end subroutine psb_d_rb_idx_tree_fix_insertion
end interface psb_rb_idx_tree_fix_insertion
interface psb_rb_idx_tree_swap_colors
subroutine psb_d_rb_idx_tree_swap_colors(n1, n2)
import :: psb_d_rb_idx_node
implicit none
type(psb_d_rb_idx_node), pointer, intent(inout) :: n1, n2
end subroutine psb_d_rb_idx_tree_swap_colors
end interface psb_rb_idx_tree_swap_colors
interface psb_rb_idx_tree_rotate_right
subroutine psb_d_rb_idx_tree_rotate_right(node)
import :: psb_d_rb_idx_node
implicit none
type(psb_d_rb_idx_node), pointer, intent(inout) :: node
end subroutine psb_d_rb_idx_tree_rotate_right
end interface psb_rb_idx_tree_rotate_right
interface psb_rb_idx_tree_rotate_left
subroutine psb_d_rb_idx_tree_rotate_left(node)
import :: psb_d_rb_idx_node
implicit none
type(psb_d_rb_idx_node), pointer, intent(inout) :: node
end subroutine psb_d_rb_idx_tree_rotate_left
end interface psb_rb_idx_tree_rotate_left
end module psb_d_rb_idx_tree_mod

@ -0,0 +1,5 @@
module psb_rb_idx_tree_mod
use psb_const_mod
use psb_d_rb_idx_tree_mod
end module psb_rb_idx_tree_mod

@ -10,7 +10,7 @@ BOBJS=psb_base_mat_impl.o \
SOBJS=psb_s_csr_impl.o psb_s_coo_impl.o psb_s_csc_impl.o psb_s_mat_impl.o SOBJS=psb_s_csr_impl.o psb_s_coo_impl.o psb_s_csc_impl.o psb_s_mat_impl.o
#\ #\
psb_s_lcoo_impl.o psb_s_lcsr_impl.o psb_s_lcoo_impl.o psb_s_lcsr_impl.o
DOBJS=psb_d_csr_impl.o psb_d_coo_impl.o psb_d_csc_impl.o psb_d_mat_impl.o DOBJS=psb_d_csr_impl.o psb_d_coo_impl.o psb_d_csc_impl.o psb_d_mat_impl.o psb_d_rb_idx_tree_impl.o
#\ #\
psb_d_lcoo_impl.o psb_d_lcsr_impl.o psb_d_lcoo_impl.o psb_d_lcsr_impl.o
COBJS=psb_c_csr_impl.o psb_c_coo_impl.o psb_c_csc_impl.o psb_c_mat_impl.o COBJS=psb_c_csr_impl.o psb_c_coo_impl.o psb_c_csc_impl.o psb_c_mat_impl.o

@ -3349,6 +3349,8 @@ subroutine psb_dcsrspspmm(a,b,c,info)
goto 9999 goto 9999
endif endif
select case(spspmm_impl)
case (spspmm_serial)
! Estimate number of nonzeros on output. ! Estimate number of nonzeros on output.
nza = a%get_nzeros() nza = a%get_nzeros()
nzb = b%get_nzeros() nzb = b%get_nzeros()
@ -3356,6 +3358,21 @@ subroutine psb_dcsrspspmm(a,b,c,info)
call c%allocate(ma,nb,nzc) call c%allocate(ma,nb,nzc)
call csr_spspmm(a,b,c,info) call csr_spspmm(a,b,c,info)
case (spspmm_omp_gustavson)
call spmm_omp_gustavson(a,b,c,info)
case (spspmm_omp_gustavson_1d)
call spmm_omp_gustavson_1d(a,b,c,info)
case (spspmm_serial_rb_tree)
call spmm_serial_rb_tree(a,b,c,info)
case (spspmm_omp_rb_tree)
call spmm_omp_rb_tree(a,b,c,info)
case (spspmm_omp_two_pass)
call spmm_omp_two_pass(a,b,c,info)
case default
write(psb_err_unit,*) 'Unknown spspmm implementation'
! push error
goto 9999
end select
call c%set_asb() call c%set_asb()
call c%set_host() call c%set_host()
@ -3434,9 +3451,400 @@ contains
end do end do
c%irp(ma+1) = nzc c%irp(ma+1) = nzc
end subroutine csr_spspmm
! gustavson's algorithm using perfect hashing
! and OpenMP parallelisation
subroutine spmm_omp_gustavson(a,b,c,info)
use omp_lib
implicit none
type(psb_d_csr_sparse_mat), intent(in) :: a,b
type(psb_d_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
end subroutine csr_spspmm real(psb_dpk_), allocatable :: vals(:), acc(:)
integer(psb_ipk_) :: ma, nb
integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
ma = a%get_nrows()
nb = b%get_ncols()
call c%allocate(ma, nb)
c%irp(1) = 1
! dense accumulator
! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
call psb_realloc(nb, acc, info)
allocate(offsets(omp_get_max_threads()))
!$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
!$omp shared(a,b,c,offsets)
thread_upperbound = 0
start_idx = 0
!$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma
if (start_idx == 0) then
start_idx = irw
end if
end_idx = irw
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j)
end do
end do
!$omp end do
call psb_realloc(thread_upperbound, vals, info)
call psb_realloc(thread_upperbound, col_inds, info)
! possible bottleneck
acc = 0
nnz = 0
!$omp do schedule(static) private(irw, jj, j, k)
do irw = 1, ma
rwnz = 0
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
do k = b%irp(j), b%irp(j + 1) - 1
if (acc(b%ja(k)) == 0) then
nnz = nnz + 1
rwnz = rwnz + 1
col_inds(nnz) = b%ja(k)
end if
acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k)
end do
end do
call psb_qsort(col_inds(nnz - rwnz + 1:nnz))
do k = nnz - rwnz + 1, nnz
vals(k) = acc(col_inds(k))
acc(col_inds(k)) = 0
end do
c%irp(irw + 1) = rwnz
end do
!$omp end do
offsets(omp_get_thread_num() + 1) = nnz
!$omp barrier
! possible bottleneck
!$omp single
do k = 1, omp_get_num_threads() - 1
offsets(k + 1) = offsets(k + 1) + offsets(k)
end do
!$omp end single
!$omp barrier
if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if
do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do
!$omp barrier
!$omp single
c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma)
call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single
c%val(c%irp(start_idx):c%irp(start_idx) + nnz) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(start_idx) + nnz) = col_inds(1:nnz)
!$omp end parallel
end subroutine spmm_omp_gustavson
subroutine spmm_omp_gustavson_1d(a,b,c,info)
use omp_lib
implicit none
type(psb_d_csr_sparse_mat), intent(in) :: a,b
type(psb_d_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
real(psb_dpk_), allocatable :: vals(:), acc(:)
integer(psb_ipk_) :: ma, nb
integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, &
start_idx, end_idx , blk, blk_size, rwstart,&
rwblk, rwblkrem, nblks
ma = a%get_nrows()
nb = b%get_ncols()
call c%allocate(ma, nb)
c%irp(1) = 1
! dense accumulator
! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
call psb_realloc(nb, acc, info)
allocate(offsets(omp_get_max_threads()))
nblks = 4 * omp_get_max_threads()
rwblk = (ma / nblks)
rwblkrem = modulo(ma, nblks)
!$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
thread_upperbound = 0
start_idx = 0
!$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j)
end do
end do
!$omp end do
call psb_realloc(thread_upperbound, vals, info)
call psb_realloc(thread_upperbound, col_inds, info)
! possible bottleneck
acc = 0
nnz = 0
!$omp do schedule(static) private(irw,jj,j,k,rwnz,blk,blk_size,rwstart)
do blk = 0, nblks - 1
if (blk < rwblkrem) then
blk_size = rwblk + 1
rwstart = blk * rwblk + blk + 1
else
blk_size = rwblk
rwstart = blk * rwblk &
+ rwblkrem + 1
end if
do irw = rwstart, rwstart + blk_size - 1
if (start_idx == 0) then
start_idx = irw
end if
end_idx = irw
rwnz = 0
do jj = a%irp(irw), a%irp(irw + 1) - 1
j = a%ja(jj)
do k = b%irp(j), b%irp(j + 1) - 1
if (acc(b%ja(k)) == 0) then
nnz = nnz + 1
rwnz = rwnz + 1
col_inds(nnz) = b%ja(k)
end if
acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k)
end do
end do
call psb_qsort(col_inds(nnz - rwnz + 1:nnz))
do k = nnz - rwnz + 1, nnz
vals(k) = acc(col_inds(k))
acc(col_inds(k)) = 0
end do
c%irp(irw + 1) = rwnz
end do
end do
!$omp end do
offsets(omp_get_thread_num() + 1) = nnz
!$omp barrier
! possible bottleneck
!$omp single
do k = 1, omp_get_num_threads() - 1
offsets(k + 1) = offsets(k + 1) + offsets(k)
end do
!$omp end single
!$omp barrier
if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if
do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do
!$omp barrier
!$omp single
c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma)
call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single
c%val(c%irp(start_idx):c%irp(start_idx) + nnz) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(start_idx) + nnz) = col_inds(1:nnz)
!$omp end parallel
end subroutine spmm_omp_gustavson_1d
subroutine spmm_serial_rb_tree(a,b,c,info)
use psb_rb_idx_tree_mod
implicit none
type(psb_d_csr_sparse_mat), intent(in) :: a,b
type(psb_d_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: a_m, b_n
integer(psb_ipk_) :: row, col
type(psb_d_rb_idx_tree), allocatable :: row_accs(:)
a_m = a%get_nrows()
b_n = b%get_ncols()
allocate(row_accs(a_m))
call c%allocate(a_m, b_n)
do row = 1, a_m
row_accs(row)%nnz = 0
nullify(row_accs(row)%root)
do col = a%irp(row), a%irp(row + 1) - 1
call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col))
end do
end do
call psb_rb_idx_tree_merge(row_accs, c)
deallocate(row_accs)
info = 0
end subroutine spmm_serial_rb_tree
subroutine spmm_omp_rb_tree(a,b,c,info)
use omp_lib
use psb_rb_idx_tree_mod
implicit none
type(psb_d_csr_sparse_mat), intent(in) :: a,b
type(psb_d_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: a_m, b_n
integer(psb_ipk_) :: row, col
type(psb_d_rb_idx_tree), allocatable :: row_accs(:)
real(8) :: tic, toc
a_m = a%get_nrows()
b_n = b%get_ncols()
call c%allocate(a_m, b_n)
allocate(row_accs(a_m))
call c%allocate(a_m, b_n)
!$omp parallel do schedule(static)
do row = 1, a_m
row_accs(row)%nnz = 0
nullify(row_accs(row)%root)
do col = a%irp(row), a%irp(row + 1) - 1
call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col))
end do
end do
!$omp end parallel do
call psb_rb_idx_tree_merge(row_accs, c)
deallocate(row_accs)
info = 0
end subroutine spmm_omp_rb_tree
subroutine compute_indices(a, b, c, info)
implicit none
type(psb_d_csr_sparse_mat), intent(in) :: a,b
type(psb_d_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer :: full_mat_bound
integer :: row, col, i, j, k, nnz
full_mat_bound = 0
!omp parallel do schedule(static) reduction(+:full_mat_bound)
do row = 1, a%get_nrows()
do col = a%irp(row), a%irp(row + 1) - 1
j = a%ja(col)
full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j)
end do
end do
!omp end parallel do
call psb_realloc(a%get_nrows() + 1, c%irp, info)
call psb_realloc(full_mat_bound, c%ja, info)
c%ja = 0
c%irp(1) = 1
nnz = 0
do row = 1, a%get_nrows()
do col = a%irp(row), a%irp(row + 1) - 1
do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1
k = 0
do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i))
k = k + 1
end do
if (c%ja(c%irp(row) + k) == 0) then
c%ja(c%irp(row)+k) = b%ja(i)
nnz = nnz + 1
end if
end do
end do
c%irp(row + 1) = nnz + 1
call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1))
end do
call psb_realloc(nnz, c%ja, info)
call psb_realloc(nnz, c%val, info)
c%val = 0
end subroutine compute_indices
subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num)
type(psb_d_csr_sparse_mat), intent(inout) :: out_mat
integer(psb_ipk_), intent(in) :: out_row_num
real(psb_dpk_), intent(in) :: scalar
type(psb_d_csr_sparse_mat), intent(in) :: mat
integer(psb_ipk_), intent(in) :: trgt_row_num
integer(psb_ipk_) :: i, k, row_start, row_end
row_start = out_mat%irp(out_row_num)
row_end = out_mat%irp(out_row_num + 1) - 1
do i = mat%irp(trgt_row_num), mat%irp(trgt_row_num + 1) - 1
do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1
if (out_mat%ja(k) == mat%ja(i)) then
out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i)
exit
end if
end do
end do
end subroutine direct_scalar_sparse_row_mul
subroutine spmm_omp_two_pass(a,b,c,info)
use omp_lib
implicit none
type(psb_d_csr_sparse_mat), intent(in) :: a,b
type(psb_d_csr_sparse_mat), intent(out):: c
integer(psb_ipk_), intent(out) :: info
integer(psb_ipk_) :: a_m, b_n, row, col
a_m = a%get_nrows()
b_n = b%get_ncols()
call c%allocate(a_m, b_n)
call compute_indices(a, b, c, info)
!$omp parallel do schedule(static)
do row = 1, a_m
do col = a%irp(row), a%irp(row + 1) - 1
call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col))
end do
end do
!$omp end parallel do
end subroutine spmm_omp_two_pass
end subroutine psb_dcsrspspmm end subroutine psb_dcsrspspmm

@ -0,0 +1,284 @@
subroutine psb_d_rb_idx_tree_insert(this, idx, val)
use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_insert
implicit none
class(psb_d_rb_idx_tree), intent(inout) :: this
integer(psb_ipk_), intent(in) :: idx
real(psb_dpk_), intent(in) :: val
character(len=22) :: name
type(psb_d_rb_idx_node), pointer :: new_node
type(psb_d_rb_idx_node), pointer :: current, previous
name='psb_rb_idx_tree_insert'
allocate(new_node)
new_node%idx = idx
new_node%val = val
nullify(new_node%left)
nullify(new_node%right)
nullify(new_node%parent)
new_node%is_red = .true.
if (.not. associated(this%root)) then
this%root => new_node
this%nnz = 1
new_node%is_red = .false.
return
end if
current => this%root
do while (associated(current))
previous => current
if (idx == current%idx) then
current%val = current%val + val
deallocate(new_node)
return
else if (idx < current%idx) then
current => current%left
else
current => current%right
end if
end do
if (idx < previous%idx) then
new_node%parent => previous
previous%left => new_node
else
new_node%parent => previous
previous%right => new_node
end if
call psb_d_rb_idx_tree_fix_insertion(this, new_node)
this%nnz = this%nnz + 1
end subroutine psb_d_rb_idx_tree_insert
subroutine psb_d_rb_idx_tree_fix_insertion(this, node)
use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_fix_insertion
implicit none
class(psb_d_rb_idx_tree), intent(inout) :: this
type(psb_d_rb_idx_node), pointer, intent(inout) :: node
character(len=29) :: name
type(psb_d_rb_idx_node), pointer :: current, parent, grand_parent, uncle
name = 'psb_rb_idx_tree_fix_insertion'
current => node
parent => current%parent
do while(associated(parent) .and. parent%is_red)
! grand parent exist because root can't be red
grand_parent => parent%parent
if (parent%idx < grand_parent%idx) then
uncle => grand_parent%right
else
uncle => grand_parent%left
end if
if (associated(uncle) .and. uncle%is_red) then
parent%is_red = .false.
uncle%is_red = .false.
grand_parent%is_red = .true.
current => grand_parent
parent => current%parent
! Left-Left case
else if (current%idx < parent%idx .and. &
parent%idx < grand_parent%idx) then
call psb_d_rb_idx_tree_rotate_right(grand_parent)
call psb_d_rb_idx_tree_swap_colors(parent, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => parent
return
! Left-Right case
else if (current%idx > parent%idx .and. &
parent%idx < grand_parent%idx) then
call psb_d_rb_idx_tree_rotate_left(parent)
call psb_d_rb_idx_tree_rotate_right(grand_parent)
call psb_d_rb_idx_tree_swap_colors(current, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => current
return
! Right-Right case
else if (current%idx > parent%idx .and. &
parent%idx > grand_parent%idx) then
call psb_d_rb_idx_tree_rotate_left(grand_parent)
call psb_d_rb_idx_tree_swap_colors(parent, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => parent
return
! Right-Left case
else
call psb_d_rb_idx_tree_rotate_right(parent)
call psb_d_rb_idx_tree_rotate_left(grand_parent)
call psb_d_rb_idx_tree_swap_colors(current, grand_parent)
if (this%root%idx == grand_parent%idx) this%root => current
return
end if
end do
this%root%is_red = .false.
end subroutine psb_d_rb_idx_tree_fix_insertion
subroutine psb_d_rb_idx_tree_swap_colors(n1, n2)
use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_swap_colors
implicit none
type(psb_d_rb_idx_node), pointer, intent(inout) :: n1, n2
character(len=27) :: name
logical :: tmp
name='psb_rb_idx_tree_swap_colors'
tmp = n1%is_red
n1%is_red = n2%is_red
n2%is_red = tmp
end subroutine psb_d_rb_idx_tree_swap_colors
subroutine psb_d_rb_idx_tree_rotate_right(node)
use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_rotate_right
implicit none
type(psb_d_rb_idx_node), pointer, intent(inout) :: node
character(len=28) :: name
type(psb_d_rb_idx_node), pointer :: l, lr
name='psb_rb_idx_tree_rotate_right'
if (.not. associated(node%left)) return
l => node%left
lr => l%right
node%left => lr
if (associated(lr)) lr%parent => node
if (associated(node%parent)) then
if (node%idx < node%parent%idx) then
node%parent%left => l
else
node%parent%right => l
end if
end if
l%parent => node%parent
node%parent => l
l%right => node
end subroutine psb_d_rb_idx_tree_rotate_right
subroutine psb_d_rb_idx_tree_rotate_left(node)
use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_rotate_left
implicit none
type(psb_d_rb_idx_node), pointer, intent(inout) :: node
character(len=27) :: name
type(psb_d_rb_idx_node), pointer :: r, rl
name='psb_rb_idx_tree_rotate_left'
if (.not. associated(node%right)) return
r => node%right
rl => r%left
node%right => rl
if (associated(rl)) rl%parent => node
if (associated(node%parent)) then
if (node%idx < node%parent%idx) then
node%parent%left => r
else
node%parent%right => r
end if
end if
r%parent => node%parent
node%parent => r
r%left => node
end subroutine psb_d_rb_idx_tree_rotate_left
subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num)
use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_scalar_sparse_row_mul
use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat
implicit none
type(psb_d_rb_idx_tree), intent(inout) :: tree
real(psb_dpk_), intent(in) :: scalar
type(psb_d_csr_sparse_mat), intent(in) :: mat
integer(psb_ipk_), intent(in) :: row_num
character(len=37) :: name
integer(psb_ipk_) :: i
name='psb_rb_idx_tree_scalar_sparse_row_mul'
do i = mat%irp(row_num), mat%irp(row_num + 1) - 1
call tree%insert(mat%ja(i),scalar * mat%val(i))
end do
end subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul
subroutine psb_d_rb_idx_tree_merge(trees, mat)
#if defined(OPENMP)
use omp_lib
#endif
use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_merge
use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat
implicit none
type(psb_d_rb_idx_tree), allocatable, intent(inout) :: trees(:)
type(psb_d_csr_sparse_mat), intent(inout) :: mat
character(len=21) :: name
integer(psb_ipk_) :: i, j, rows, info, nnz
type(psb_d_rb_idx_node), pointer :: current, previous
name='psb_rb_idx_tree_merge'
rows = size(trees)
mat%irp(1) = 1
do i=1, rows
mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz
end do
nnz = mat%irp(rows + 1)
call psb_realloc(nnz, mat%val, info)
call psb_realloc(nnz, mat%ja, info)
#if defined(OPENMP)
!$omp parallel do schedule(static), private(current, previous, j)
#endif
do i = 1, size(trees)
j = 0
current => trees(i)%root
do while(associated(current))
! go to the left-most node
do while(associated(current%left))
current => current%left
end do
mat%val(j + mat%irp(i)) = current%val
mat%ja(j + mat%irp(i)) = current%idx
j = j + 1
previous => current
if (associated(current%right)) then
if (associated(current%parent)) then
current%parent%left => current%right
end if
current%right%parent => current%parent
current => current%right
else
current => current%parent
if (associated(current)) nullify(current%left)
end if
deallocate(previous)
end do
end do
#if defined(OPENMP)
!$omp end parallel do
#endif
end subroutine psb_d_rb_idx_tree_merge
Loading…
Cancel
Save