diff --git a/base/modules/Makefile b/base/modules/Makefile index 399add1f..d14a24ee 100644 --- a/base/modules/Makefile +++ b/base/modules/Makefile @@ -64,7 +64,8 @@ SERIAL_MODS=serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o \ serial/psb_s_base_mat_mod.o serial/psb_s_csr_mat_mod.o serial/psb_s_csc_mat_mod.o serial/psb_s_mat_mod.o \ serial/psb_d_base_mat_mod.o serial/psb_d_csr_mat_mod.o serial/psb_d_csc_mat_mod.o serial/psb_d_mat_mod.o \ serial/psb_c_base_mat_mod.o serial/psb_c_csr_mat_mod.o serial/psb_c_csc_mat_mod.o serial/psb_c_mat_mod.o \ - serial/psb_z_base_mat_mod.o serial/psb_z_csr_mat_mod.o serial/psb_z_csc_mat_mod.o serial/psb_z_mat_mod.o + serial/psb_z_base_mat_mod.o serial/psb_z_csr_mat_mod.o serial/psb_z_csc_mat_mod.o serial/psb_z_mat_mod.o \ + serial/psb_d_rb_idx_tree_mod.o serial/psb_rb_idx_tree_mod.o #\ # serial/psb_ls_csr_mat_mod.o serial/psb_ld_csr_mat_mod.o serial/psb_lc_csr_mat_mod.o serial/psb_lz_csr_mat_mod.o #\ @@ -270,6 +271,9 @@ serial/psb_z_vect_mod.o: serial/psb_z_base_vect_mod.o serial/psb_i_vect_mod.o serial/psb_s_serial_mod.o serial/psb_d_serial_mod.o serial/psb_c_serial_mod.o serial/psb_z_serial_mod.o: serial/psb_mat_mod.o auxil/psb_string_mod.o auxil/psb_sort_mod.o auxil/psi_serial_mod.o serial/psb_vect_mod.o: serial/psb_i_vect_mod.o serial/psb_l_vect_mod.o serial/psb_d_vect_mod.o serial/psb_s_vect_mod.o serial/psb_c_vect_mod.o serial/psb_z_vect_mod.o +serial/psb_d_rb_idx_tree_mod.o: serial/psb_d_csr_mat_mod.o +serial/psb_rb_idx_tree_mod.o: serial/psb_d_rb_idx_tree_mod.o + error.o psb_realloc_mod.o: psb_error_mod.o psb_error_impl.o: psb_penv_mod.o psb_timers_mod.o: psb_penv_mod.o psb_const_mod.o psb_realloc_mod.o psb_error_mod.o diff --git a/base/modules/serial/psb_d_csr_mat_mod.f90 b/base/modules/serial/psb_d_csr_mat_mod.f90 index d0aa622b..74cf8fff 100644 --- a/base/modules/serial/psb_d_csr_mat_mod.f90 +++ b/base/modules/serial/psb_d_csr_mat_mod.f90 @@ -46,6 +46,14 @@ module psb_d_csr_mat_mod use psb_d_base_mat_mod + integer(psb_ipk_), parameter :: spspmm_serial = 0 + integer(psb_ipk_), parameter :: spspmm_omp_gustavson = 1 + integer(psb_ipk_), parameter :: spspmm_omp_gustavson_1d = 2 + integer(psb_ipk_), parameter :: spspmm_serial_rb_tree = 3 + integer(psb_ipk_), parameter :: spspmm_omp_rb_tree = 4 + integer(psb_ipk_), parameter :: spspmm_omp_two_pass = 5 + integer(psb_ipk_) :: spspmm_impl = spspmm_serial + !> \namespace psb_base_mod \class psb_d_csr_sparse_mat !! \extends psb_d_base_mat_mod::psb_d_base_sparse_mat !! diff --git a/base/modules/serial/psb_d_mat_mod.F90 b/base/modules/serial/psb_d_mat_mod.F90 index 8f967ce1..9e613b29 100644 --- a/base/modules/serial/psb_d_mat_mod.F90 +++ b/base/modules/serial/psb_d_mat_mod.F90 @@ -79,7 +79,10 @@ module psb_d_mat_mod use psb_d_base_mat_mod - use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat, psb_ld_csr_sparse_mat + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat, psb_ld_csr_sparse_mat, & + spspmm_impl, spspmm_serial, spspmm_omp_gustavson, & + spspmm_omp_gustavson_1d, spspmm_serial_rb_tree, & + spspmm_omp_rb_tree, spspmm_omp_two_pass use psb_d_csc_mat_mod, only : psb_d_csc_sparse_mat, psb_ld_csc_sparse_mat type :: psb_dspmat_type diff --git a/base/modules/serial/psb_d_rb_idx_tree_mod.f90 b/base/modules/serial/psb_d_rb_idx_tree_mod.f90 new file mode 100644 index 00000000..afdd831d --- /dev/null +++ b/base/modules/serial/psb_d_rb_idx_tree_mod.f90 @@ -0,0 +1,91 @@ +! Red black tree implementation ordered by index +! +! Each node contains and index and a double precision value +! +! The tree should always be well balanced +! +! inserting a node with an existing index will +! add up the new value to the old one +module psb_d_rb_idx_tree_mod + use psb_const_mod + implicit none + type :: psb_d_rb_idx_node + integer(psb_ipk_) :: idx + real(psb_dpk_) :: val + type(psb_d_rb_idx_node), pointer :: left, right, parent + logical :: is_red + end type psb_d_rb_idx_node + + type :: psb_d_rb_idx_tree + type(psb_d_rb_idx_node), pointer :: root + integer(psb_ipk_) :: nnz + + contains + + procedure :: insert => psb_d_rb_idx_tree_insert + end type psb_d_rb_idx_tree + + interface psb_rb_idx_tree_insert + subroutine psb_d_rb_idx_tree_insert(this, idx, val) + import :: psb_ipk_, psb_dpk_, psb_d_rb_idx_tree + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + real(psb_dpk_), intent(in) :: val + end subroutine psb_d_rb_idx_tree_insert + end interface psb_rb_idx_tree_insert + + interface psb_rb_idx_tree_scalar_sparse_row_mul + subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + import :: psb_ipk_, psb_dpk_, psb_d_rb_idx_tree + implicit none + type(psb_d_rb_idx_tree), intent(inout) :: tree + real(psb_dpk_), intent(in) :: scalar + type(psb_d_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + end subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul + end interface psb_rb_idx_tree_scalar_sparse_row_mul + + interface psb_rb_idx_tree_merge + subroutine psb_d_rb_idx_tree_merge(trees, mat) + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + import :: psb_d_rb_idx_tree + type(psb_d_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_d_csr_sparse_mat), intent(inout) :: mat + end subroutine psb_d_rb_idx_tree_merge + end interface psb_rb_idx_tree_merge + + interface psb_rb_idx_tree_fix_insertion + subroutine psb_d_rb_idx_tree_fix_insertion(this, node) + import :: psb_d_rb_idx_tree, psb_d_rb_idx_node + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_d_rb_idx_tree_fix_insertion + end interface psb_rb_idx_tree_fix_insertion + + interface psb_rb_idx_tree_swap_colors + subroutine psb_d_rb_idx_tree_swap_colors(n1, n2) + import :: psb_d_rb_idx_node + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: n1, n2 + end subroutine psb_d_rb_idx_tree_swap_colors + end interface psb_rb_idx_tree_swap_colors + + interface psb_rb_idx_tree_rotate_right + subroutine psb_d_rb_idx_tree_rotate_right(node) + import :: psb_d_rb_idx_node + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_d_rb_idx_tree_rotate_right + end interface psb_rb_idx_tree_rotate_right + + interface psb_rb_idx_tree_rotate_left + subroutine psb_d_rb_idx_tree_rotate_left(node) + import :: psb_d_rb_idx_node + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + end subroutine psb_d_rb_idx_tree_rotate_left + end interface psb_rb_idx_tree_rotate_left +end module psb_d_rb_idx_tree_mod \ No newline at end of file diff --git a/base/modules/serial/psb_rb_idx_tree_mod.f90 b/base/modules/serial/psb_rb_idx_tree_mod.f90 new file mode 100644 index 00000000..e93554f1 --- /dev/null +++ b/base/modules/serial/psb_rb_idx_tree_mod.f90 @@ -0,0 +1,5 @@ +module psb_rb_idx_tree_mod + use psb_const_mod + + use psb_d_rb_idx_tree_mod +end module psb_rb_idx_tree_mod \ No newline at end of file diff --git a/base/serial/impl/Makefile b/base/serial/impl/Makefile index 00088741..ddcdaf96 100644 --- a/base/serial/impl/Makefile +++ b/base/serial/impl/Makefile @@ -10,7 +10,7 @@ BOBJS=psb_base_mat_impl.o \ SOBJS=psb_s_csr_impl.o psb_s_coo_impl.o psb_s_csc_impl.o psb_s_mat_impl.o #\ psb_s_lcoo_impl.o psb_s_lcsr_impl.o -DOBJS=psb_d_csr_impl.o psb_d_coo_impl.o psb_d_csc_impl.o psb_d_mat_impl.o +DOBJS=psb_d_csr_impl.o psb_d_coo_impl.o psb_d_csc_impl.o psb_d_mat_impl.o psb_d_rb_idx_tree_impl.o #\ psb_d_lcoo_impl.o psb_d_lcsr_impl.o COBJS=psb_c_csr_impl.o psb_c_coo_impl.o psb_c_csc_impl.o psb_c_mat_impl.o diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index 10518a2d..1e39b892 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -3349,13 +3349,30 @@ subroutine psb_dcsrspspmm(a,b,c,info) goto 9999 endif - ! Estimate number of nonzeros on output. - nza = a%get_nzeros() - nzb = b%get_nzeros() - nzc = 2*(nza+nzb) - call c%allocate(ma,nb,nzc) - - call csr_spspmm(a,b,c,info) + select case(spspmm_impl) + case (spspmm_serial) + ! Estimate number of nonzeros on output. + nza = a%get_nzeros() + nzb = b%get_nzeros() + nzc = 2*(nza+nzb) + call c%allocate(ma,nb,nzc) + + call csr_spspmm(a,b,c,info) + case (spspmm_omp_gustavson) + call spmm_omp_gustavson(a,b,c,info) + case (spspmm_omp_gustavson_1d) + call spmm_omp_gustavson_1d(a,b,c,info) + case (spspmm_serial_rb_tree) + call spmm_serial_rb_tree(a,b,c,info) + case (spspmm_omp_rb_tree) + call spmm_omp_rb_tree(a,b,c,info) + case (spspmm_omp_two_pass) + call spmm_omp_two_pass(a,b,c,info) + case default + write(psb_err_unit,*) 'Unknown spspmm implementation' + ! push error + goto 9999 + end select call c%set_asb() call c%set_host() @@ -3434,9 +3451,400 @@ contains end do c%irp(ma+1) = nzc + end subroutine csr_spspmm + ! gustavson's algorithm using perfect hashing + ! and OpenMP parallelisation + subroutine spmm_omp_gustavson(a,b,c,info) + use omp_lib - end subroutine csr_spspmm + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + real(psb_dpk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + + allocate(offsets(omp_get_max_threads())) + !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & + !$omp shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw, jj, j, k) + do irw = 1, ma + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(start_idx) + nnz) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(start_idx) + nnz) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson + + subroutine spmm_omp_gustavson_1d(a,b,c,info) + use omp_lib + + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + real(psb_dpk_), allocatable :: vals(:), acc(:) + integer(psb_ipk_) :: ma, nb + integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) + integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, & + start_idx, end_idx , blk, blk_size, rwstart,& + rwblk, rwblkrem, nblks + + ma = a%get_nrows() + nb = b%get_ncols() + + call c%allocate(ma, nb) + c%irp(1) = 1 + + ! dense accumulator + ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf + call psb_realloc(nb, acc, info) + allocate(offsets(omp_get_max_threads())) + + nblks = 4 * omp_get_max_threads() + rwblk = (ma / nblks) + rwblkrem = modulo(ma, nblks) + !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) + thread_upperbound = 0 + start_idx = 0 + !$omp do schedule(static) private(irw, jj, j) + do irw = 1, ma + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + thread_upperbound = thread_upperbound + b%irp(j+1) - b%irp(j) + end do + end do + !$omp end do + + call psb_realloc(thread_upperbound, vals, info) + call psb_realloc(thread_upperbound, col_inds, info) + + ! possible bottleneck + acc = 0 + + nnz = 0 + !$omp do schedule(static) private(irw,jj,j,k,rwnz,blk,blk_size,rwstart) + do blk = 0, nblks - 1 + if (blk < rwblkrem) then + blk_size = rwblk + 1 + rwstart = blk * rwblk + blk + 1 + else + blk_size = rwblk + rwstart = blk * rwblk & + + rwblkrem + 1 + end if + do irw = rwstart, rwstart + blk_size - 1 + if (start_idx == 0) then + start_idx = irw + end if + end_idx = irw + rwnz = 0 + do jj = a%irp(irw), a%irp(irw + 1) - 1 + j = a%ja(jj) + do k = b%irp(j), b%irp(j + 1) - 1 + if (acc(b%ja(k)) == 0) then + nnz = nnz + 1 + rwnz = rwnz + 1 + col_inds(nnz) = b%ja(k) + end if + acc(b%ja(k)) = acc(b%ja(k)) + a%val(jj) * b%val(k) + end do + end do + call psb_qsort(col_inds(nnz - rwnz + 1:nnz)) + + do k = nnz - rwnz + 1, nnz + vals(k) = acc(col_inds(k)) + acc(col_inds(k)) = 0 + end do + c%irp(irw + 1) = rwnz + end do + end do + !$omp end do + + offsets(omp_get_thread_num() + 1) = nnz + !$omp barrier + + ! possible bottleneck + !$omp single + do k = 1, omp_get_num_threads() - 1 + offsets(k + 1) = offsets(k + 1) + offsets(k) + end do + !$omp end single + + !$omp barrier + + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do + + !$omp barrier + + !$omp single + c%irp(ma + 1) = c%irp(ma + 1) + c%irp(ma) + call psb_realloc(c%irp(ma + 1), c%val, info) + call psb_realloc(c%irp(ma + 1), c%ja, info) + !$omp end single + + c%val(c%irp(start_idx):c%irp(start_idx) + nnz) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(start_idx) + nnz) = col_inds(1:nnz) + !$omp end parallel + end subroutine spmm_omp_gustavson_1d + + subroutine spmm_serial_rb_tree(a,b,c,info) + use psb_rb_idx_tree_mod + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_d_rb_idx_tree), allocatable :: row_accs(:) + + a_m = a%get_nrows() + b_n = b%get_ncols() + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + + info = 0 +end subroutine spmm_serial_rb_tree + +subroutine spmm_omp_rb_tree(a,b,c,info) + use omp_lib + use psb_rb_idx_tree_mod + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n + integer(psb_ipk_) :: row, col + type(psb_d_rb_idx_tree), allocatable :: row_accs(:) + real(8) :: tic, toc + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + allocate(row_accs(a_m)) + call c%allocate(a_m, b_n) + + !$omp parallel do schedule(static) + do row = 1, a_m + row_accs(row)%nnz = 0 + nullify(row_accs(row)%root) + do col = a%irp(row), a%irp(row + 1) - 1 + call psb_rb_idx_tree_scalar_sparse_row_mul(row_accs(row), a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do + + call psb_rb_idx_tree_merge(row_accs, c) + + deallocate(row_accs) + info = 0 +end subroutine spmm_omp_rb_tree + +subroutine compute_indices(a, b, c, info) + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer :: full_mat_bound + integer :: row, col, i, j, k, nnz + + full_mat_bound = 0 + !omp parallel do schedule(static) reduction(+:full_mat_bound) + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + j = a%ja(col) + full_mat_bound = full_mat_bound + b%irp(j+1) - b%irp(j) + end do + end do + !omp end parallel do + + call psb_realloc(a%get_nrows() + 1, c%irp, info) + call psb_realloc(full_mat_bound, c%ja, info) + c%ja = 0 + c%irp(1) = 1 + + nnz = 0 + + do row = 1, a%get_nrows() + do col = a%irp(row), a%irp(row + 1) - 1 + do i = b%irp(a%ja(col)), b%irp(a%ja(col) + 1) - 1 + k = 0 + do while(c%ja(c%irp(row) + k) /= 0 .and. c%ja(c%irp(row) + k) /= b%ja(i)) + k = k + 1 + end do + if (c%ja(c%irp(row) + k) == 0) then + c%ja(c%irp(row)+k) = b%ja(i) + nnz = nnz + 1 + end if + end do + end do + c%irp(row + 1) = nnz + 1 + call psb_qsort(c%ja(c%irp(row):c%irp(row + 1)-1)) + end do + + + call psb_realloc(nnz, c%ja, info) + call psb_realloc(nnz, c%val, info) + + c%val = 0 +end subroutine compute_indices + +subroutine direct_scalar_sparse_row_mul(out_mat, out_row_num, scalar, mat, trgt_row_num) + type(psb_d_csr_sparse_mat), intent(inout) :: out_mat + integer(psb_ipk_), intent(in) :: out_row_num + real(psb_dpk_), intent(in) :: scalar + type(psb_d_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: trgt_row_num + + integer(psb_ipk_) :: i, k, row_start, row_end + + row_start = out_mat%irp(out_row_num) + row_end = out_mat%irp(out_row_num + 1) - 1 + + do i = mat%irp(trgt_row_num), mat%irp(trgt_row_num + 1) - 1 + do k = out_mat%irp(out_row_num), out_mat%irp(out_row_num + 1) - 1 + if (out_mat%ja(k) == mat%ja(i)) then + out_mat%val(k) = out_mat%val(k) + scalar * mat%val(i) + exit + end if + end do + end do + +end subroutine direct_scalar_sparse_row_mul + +subroutine spmm_omp_two_pass(a,b,c,info) + use omp_lib + + implicit none + type(psb_d_csr_sparse_mat), intent(in) :: a,b + type(psb_d_csr_sparse_mat), intent(out):: c + integer(psb_ipk_), intent(out) :: info + + integer(psb_ipk_) :: a_m, b_n, row, col + + a_m = a%get_nrows() + b_n = b%get_ncols() + + call c%allocate(a_m, b_n) + + call compute_indices(a, b, c, info) + + !$omp parallel do schedule(static) + do row = 1, a_m + do col = a%irp(row), a%irp(row + 1) - 1 + call direct_scalar_sparse_row_mul(c, row, a%val(col), b, a%ja(col)) + end do + end do + !$omp end parallel do +end subroutine spmm_omp_two_pass end subroutine psb_dcsrspspmm diff --git a/base/serial/impl/psb_d_rb_idx_tree_impl.F90 b/base/serial/impl/psb_d_rb_idx_tree_impl.F90 new file mode 100644 index 00000000..8495a813 --- /dev/null +++ b/base/serial/impl/psb_d_rb_idx_tree_impl.F90 @@ -0,0 +1,284 @@ +subroutine psb_d_rb_idx_tree_insert(this, idx, val) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_insert + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + integer(psb_ipk_), intent(in) :: idx + real(psb_dpk_), intent(in) :: val + + character(len=22) :: name + type(psb_d_rb_idx_node), pointer :: new_node + type(psb_d_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_insert' + + allocate(new_node) + new_node%idx = idx + new_node%val = val + nullify(new_node%left) + nullify(new_node%right) + nullify(new_node%parent) + new_node%is_red = .true. + + + if (.not. associated(this%root)) then + this%root => new_node + this%nnz = 1 + new_node%is_red = .false. + return + end if + + current => this%root + + do while (associated(current)) + previous => current + + if (idx == current%idx) then + current%val = current%val + val + deallocate(new_node) + return + else if (idx < current%idx) then + current => current%left + else + + current => current%right + end if + end do + + if (idx < previous%idx) then + new_node%parent => previous + previous%left => new_node + else + new_node%parent => previous + previous%right => new_node + end if + + call psb_d_rb_idx_tree_fix_insertion(this, new_node) + + this%nnz = this%nnz + 1 +end subroutine psb_d_rb_idx_tree_insert + +subroutine psb_d_rb_idx_tree_fix_insertion(this, node) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_fix_insertion + implicit none + class(psb_d_rb_idx_tree), intent(inout) :: this + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + + character(len=29) :: name + type(psb_d_rb_idx_node), pointer :: current, parent, grand_parent, uncle + name = 'psb_rb_idx_tree_fix_insertion' + + current => node + parent => current%parent + do while(associated(parent) .and. parent%is_red) + ! grand parent exist because root can't be red + grand_parent => parent%parent + if (parent%idx < grand_parent%idx) then + uncle => grand_parent%right + else + uncle => grand_parent%left + end if + + if (associated(uncle) .and. uncle%is_red) then + parent%is_red = .false. + uncle%is_red = .false. + grand_parent%is_red = .true. + current => grand_parent + parent => current%parent + + ! Left-Left case + else if (current%idx < parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_d_rb_idx_tree_rotate_right(grand_parent) + call psb_d_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Left-Right case + else if (current%idx > parent%idx .and. & + parent%idx < grand_parent%idx) then + call psb_d_rb_idx_tree_rotate_left(parent) + call psb_d_rb_idx_tree_rotate_right(grand_parent) + call psb_d_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + ! Right-Right case + else if (current%idx > parent%idx .and. & + parent%idx > grand_parent%idx) then + call psb_d_rb_idx_tree_rotate_left(grand_parent) + call psb_d_rb_idx_tree_swap_colors(parent, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => parent + + return + ! Right-Left case + else + call psb_d_rb_idx_tree_rotate_right(parent) + call psb_d_rb_idx_tree_rotate_left(grand_parent) + call psb_d_rb_idx_tree_swap_colors(current, grand_parent) + + if (this%root%idx == grand_parent%idx) this%root => current + + return + end if + end do + + this%root%is_red = .false. +end subroutine psb_d_rb_idx_tree_fix_insertion + +subroutine psb_d_rb_idx_tree_swap_colors(n1, n2) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_swap_colors + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: n1, n2 + + character(len=27) :: name + logical :: tmp + name='psb_rb_idx_tree_swap_colors' + + tmp = n1%is_red + n1%is_red = n2%is_red + n2%is_red = tmp +end subroutine psb_d_rb_idx_tree_swap_colors + +subroutine psb_d_rb_idx_tree_rotate_right(node) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_rotate_right + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + + character(len=28) :: name + type(psb_d_rb_idx_node), pointer :: l, lr + name='psb_rb_idx_tree_rotate_right' + + if (.not. associated(node%left)) return + + l => node%left + lr => l%right + node%left => lr + + if (associated(lr)) lr%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => l + else + node%parent%right => l + end if + end if + + l%parent => node%parent + node%parent => l + + l%right => node +end subroutine psb_d_rb_idx_tree_rotate_right + +subroutine psb_d_rb_idx_tree_rotate_left(node) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_rotate_left + implicit none + type(psb_d_rb_idx_node), pointer, intent(inout) :: node + + character(len=27) :: name + type(psb_d_rb_idx_node), pointer :: r, rl + name='psb_rb_idx_tree_rotate_left' + + if (.not. associated(node%right)) return + + r => node%right + rl => r%left + node%right => rl + + if (associated(rl)) rl%parent => node + + if (associated(node%parent)) then + if (node%idx < node%parent%idx) then + node%parent%left => r + else + node%parent%right => r + end if + end if + + r%parent => node%parent + node%parent => r + + r%left => node +end subroutine psb_d_rb_idx_tree_rotate_left + +subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul(tree, scalar, mat, row_num) + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_scalar_sparse_row_mul + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + implicit none + type(psb_d_rb_idx_tree), intent(inout) :: tree + real(psb_dpk_), intent(in) :: scalar + type(psb_d_csr_sparse_mat), intent(in) :: mat + integer(psb_ipk_), intent(in) :: row_num + + character(len=37) :: name + integer(psb_ipk_) :: i + name='psb_rb_idx_tree_scalar_sparse_row_mul' + + do i = mat%irp(row_num), mat%irp(row_num + 1) - 1 + call tree%insert(mat%ja(i),scalar * mat%val(i)) + end do + +end subroutine psb_d_rb_idx_tree_scalar_sparse_row_mul + +subroutine psb_d_rb_idx_tree_merge(trees, mat) +#if defined(OPENMP) + use omp_lib +#endif + use psb_d_rb_idx_tree_mod, psb_protect_name => psb_d_rb_idx_tree_merge + use psb_d_csr_mat_mod, only : psb_d_csr_sparse_mat + implicit none + type(psb_d_rb_idx_tree), allocatable, intent(inout) :: trees(:) + type(psb_d_csr_sparse_mat), intent(inout) :: mat + + character(len=21) :: name + integer(psb_ipk_) :: i, j, rows, info, nnz + type(psb_d_rb_idx_node), pointer :: current, previous + name='psb_rb_idx_tree_merge' + + rows = size(trees) + + mat%irp(1) = 1 + + do i=1, rows + mat%irp(i + 1) = mat%irp(i) + trees(i)%nnz + end do + + nnz = mat%irp(rows + 1) + call psb_realloc(nnz, mat%val, info) + call psb_realloc(nnz, mat%ja, info) + +#if defined(OPENMP) + !$omp parallel do schedule(static), private(current, previous, j) +#endif + do i = 1, size(trees) + j = 0 + current => trees(i)%root + do while(associated(current)) + ! go to the left-most node + do while(associated(current%left)) + current => current%left + end do + mat%val(j + mat%irp(i)) = current%val + mat%ja(j + mat%irp(i)) = current%idx + j = j + 1 + + previous => current + if (associated(current%right)) then + if (associated(current%parent)) then + current%parent%left => current%right + end if + current%right%parent => current%parent + current => current%right + else + current => current%parent + if (associated(current)) nullify(current%left) + end if + deallocate(previous) + end do + end do +#if defined(OPENMP) + !$omp end parallel do +#endif +end subroutine psb_d_rb_idx_tree_merge \ No newline at end of file diff --git a/util/Makefile b/util/Makefile index 3f087cbd..9b70855f 100644 --- a/util/Makefile +++ b/util/Makefile @@ -11,7 +11,7 @@ BASEOBJS= psb_blockpart_mod.o psb_metispart_mod.o psb_partidx_mod.o \ psb_hbio_mod.o psb_mmio_mod.o psb_mat_dist_mod.o \ psb_s_mat_dist_mod.o psb_d_mat_dist_mod.o psb_c_mat_dist_mod.o psb_z_mat_dist_mod.o \ psb_renum_mod.o psb_gps_mod.o \ - psb_s_renum_mod.o psb_d_renum_mod.o psb_c_renum_mod.o psb_z_renum_mod.o + psb_s_renum_mod.o psb_d_renum_mod.o psb_c_renum_mod.o psb_z_renum_mod.o IMPLOBJS= psb_s_hbio_impl.o psb_d_hbio_impl.o \ psb_c_hbio_impl.o psb_z_hbio_impl.o \ psb_s_mmio_impl.o psb_d_mmio_impl.o \