From e72c0f0bf96c3fc56c83620230caac5d1463ee7c Mon Sep 17 00:00:00 2001 From: sfilippone Date: Mon, 29 Apr 2024 15:04:23 +0200 Subject: [PATCH] Fix OMP impl of sparse-sparse product --- base/serial/impl/psb_c_csr_impl.F90 | 54 +++++++++++++++++------------ base/serial/impl/psb_d_csr_impl.F90 | 54 +++++++++++++++++------------ base/serial/impl/psb_s_csr_impl.F90 | 54 +++++++++++++++++------------ base/serial/impl/psb_z_csr_impl.F90 | 54 +++++++++++++++++------------ 4 files changed, 128 insertions(+), 88 deletions(-) diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index 9354098b..276d3d1c 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -3805,6 +3805,7 @@ contains integer(psb_ipk_) :: ma, nb integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + integer(psb_ipk_) :: nth, lth,ith ma = a%get_nrows() nb = b%get_ncols() @@ -3815,12 +3816,19 @@ contains ! dense accumulator ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf call psb_realloc(nb, acc, info) + !$omp parallel shared(nth,lth) + !$omp single + nth = omp_get_num_threads() + lth = min(nth, ma) + !$omp end single + !$omp end parallel allocate(offsets(omp_get_max_threads())) !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & - !$omp shared(a,b,c,offsets) + !$omp num_threads(lth) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma if (start_idx == 0) then @@ -3876,15 +3884,14 @@ contains !$omp end single !$omp barrier - - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -3892,9 +3899,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson @@ -3930,6 +3938,7 @@ contains !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma do jj = a%irp(irw), a%irp(irw + 1) - 1 @@ -3996,14 +4005,14 @@ contains !$omp barrier - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -4011,9 +4020,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson_1d diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index bc0efcc9..a0aaeeee 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -3805,6 +3805,7 @@ contains integer(psb_ipk_) :: ma, nb integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + integer(psb_ipk_) :: nth, lth,ith ma = a%get_nrows() nb = b%get_ncols() @@ -3815,12 +3816,19 @@ contains ! dense accumulator ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf call psb_realloc(nb, acc, info) + !$omp parallel shared(nth,lth) + !$omp single + nth = omp_get_num_threads() + lth = min(nth, ma) + !$omp end single + !$omp end parallel allocate(offsets(omp_get_max_threads())) !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & - !$omp shared(a,b,c,offsets) + !$omp num_threads(lth) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma if (start_idx == 0) then @@ -3876,15 +3884,14 @@ contains !$omp end single !$omp barrier - - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -3892,9 +3899,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson @@ -3930,6 +3938,7 @@ contains !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma do jj = a%irp(irw), a%irp(irw + 1) - 1 @@ -3996,14 +4005,14 @@ contains !$omp barrier - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -4011,9 +4020,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson_1d diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index ba70e021..9d5dc145 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -3805,6 +3805,7 @@ contains integer(psb_ipk_) :: ma, nb integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + integer(psb_ipk_) :: nth, lth,ith ma = a%get_nrows() nb = b%get_ncols() @@ -3815,12 +3816,19 @@ contains ! dense accumulator ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf call psb_realloc(nb, acc, info) + !$omp parallel shared(nth,lth) + !$omp single + nth = omp_get_num_threads() + lth = min(nth, ma) + !$omp end single + !$omp end parallel allocate(offsets(omp_get_max_threads())) !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & - !$omp shared(a,b,c,offsets) + !$omp num_threads(lth) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma if (start_idx == 0) then @@ -3876,15 +3884,14 @@ contains !$omp end single !$omp barrier - - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -3892,9 +3899,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson @@ -3930,6 +3938,7 @@ contains !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma do jj = a%irp(irw), a%irp(irw + 1) - 1 @@ -3996,14 +4005,14 @@ contains !$omp barrier - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -4011,9 +4020,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson_1d diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index 23a4fb84..7f11c3bd 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -3805,6 +3805,7 @@ contains integer(psb_ipk_) :: ma, nb integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + integer(psb_ipk_) :: nth, lth,ith ma = a%get_nrows() nb = b%get_ncols() @@ -3815,12 +3816,19 @@ contains ! dense accumulator ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf call psb_realloc(nb, acc, info) + !$omp parallel shared(nth,lth) + !$omp single + nth = omp_get_num_threads() + lth = min(nth, ma) + !$omp end single + !$omp end parallel allocate(offsets(omp_get_max_threads())) !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & - !$omp shared(a,b,c,offsets) + !$omp num_threads(lth) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma if (start_idx == 0) then @@ -3876,15 +3884,14 @@ contains !$omp end single !$omp barrier - - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -3892,9 +3899,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson @@ -3930,6 +3938,7 @@ contains !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma do jj = a%irp(irw), a%irp(irw + 1) - 1 @@ -3996,14 +4005,14 @@ contains !$omp barrier - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -4011,9 +4020,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson_1d