Fix OMP impl of sparse-sparse product

repack-csga
sfilippone 9 months ago
parent 025350a361
commit e72c0f0bf9

@ -3805,6 +3805,7 @@ contains
integer(psb_ipk_) :: ma, nb integer(psb_ipk_) :: ma, nb
integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
integer(psb_ipk_) :: nth, lth,ith
ma = a%get_nrows() ma = a%get_nrows()
nb = b%get_ncols() nb = b%get_ncols()
@ -3815,12 +3816,19 @@ contains
! dense accumulator ! dense accumulator
! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
call psb_realloc(nb, acc, info) call psb_realloc(nb, acc, info)
!$omp parallel shared(nth,lth)
!$omp single
nth = omp_get_num_threads()
lth = min(nth, ma)
!$omp end single
!$omp end parallel
allocate(offsets(omp_get_max_threads())) allocate(offsets(omp_get_max_threads()))
!$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
!$omp shared(a,b,c,offsets) !$omp num_threads(lth) shared(a,b,c,offsets)
thread_upperbound = 0 thread_upperbound = 0
start_idx = 0 start_idx = 0
end_idx = 0
!$omp do schedule(static) private(irw, jj, j) !$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma do irw = 1, ma
if (start_idx == 0) then if (start_idx == 0) then
@ -3876,15 +3884,14 @@ contains
!$omp end single !$omp end single
!$omp barrier !$omp barrier
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
if (omp_get_thread_num() /= 0) then if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if end if
do irw = start_idx, end_idx - 1 do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do end do
end if
!$omp barrier !$omp barrier
!$omp single !$omp single
@ -3892,9 +3899,10 @@ contains
call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info) call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single !$omp end single
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
end if
!$omp end parallel !$omp end parallel
end subroutine spmm_omp_gustavson end subroutine spmm_omp_gustavson
@ -3930,6 +3938,7 @@ contains
!$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
thread_upperbound = 0 thread_upperbound = 0
start_idx = 0 start_idx = 0
end_idx = 0
!$omp do schedule(static) private(irw, jj, j) !$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma do irw = 1, ma
do jj = a%irp(irw), a%irp(irw + 1) - 1 do jj = a%irp(irw), a%irp(irw + 1) - 1
@ -3996,14 +4005,14 @@ contains
!$omp barrier !$omp barrier
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
if (omp_get_thread_num() /= 0) then if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if end if
do irw = start_idx, end_idx - 1 do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do end do
end if
!$omp barrier !$omp barrier
!$omp single !$omp single
@ -4011,9 +4020,10 @@ contains
call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info) call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single !$omp end single
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
end if
!$omp end parallel !$omp end parallel
end subroutine spmm_omp_gustavson_1d end subroutine spmm_omp_gustavson_1d

@ -3805,6 +3805,7 @@ contains
integer(psb_ipk_) :: ma, nb integer(psb_ipk_) :: ma, nb
integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
integer(psb_ipk_) :: nth, lth,ith
ma = a%get_nrows() ma = a%get_nrows()
nb = b%get_ncols() nb = b%get_ncols()
@ -3815,12 +3816,19 @@ contains
! dense accumulator ! dense accumulator
! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
call psb_realloc(nb, acc, info) call psb_realloc(nb, acc, info)
!$omp parallel shared(nth,lth)
!$omp single
nth = omp_get_num_threads()
lth = min(nth, ma)
!$omp end single
!$omp end parallel
allocate(offsets(omp_get_max_threads())) allocate(offsets(omp_get_max_threads()))
!$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
!$omp shared(a,b,c,offsets) !$omp num_threads(lth) shared(a,b,c,offsets)
thread_upperbound = 0 thread_upperbound = 0
start_idx = 0 start_idx = 0
end_idx = 0
!$omp do schedule(static) private(irw, jj, j) !$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma do irw = 1, ma
if (start_idx == 0) then if (start_idx == 0) then
@ -3876,15 +3884,14 @@ contains
!$omp end single !$omp end single
!$omp barrier !$omp barrier
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
if (omp_get_thread_num() /= 0) then if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if end if
do irw = start_idx, end_idx - 1 do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do end do
end if
!$omp barrier !$omp barrier
!$omp single !$omp single
@ -3892,9 +3899,10 @@ contains
call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info) call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single !$omp end single
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
end if
!$omp end parallel !$omp end parallel
end subroutine spmm_omp_gustavson end subroutine spmm_omp_gustavson
@ -3930,6 +3938,7 @@ contains
!$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
thread_upperbound = 0 thread_upperbound = 0
start_idx = 0 start_idx = 0
end_idx = 0
!$omp do schedule(static) private(irw, jj, j) !$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma do irw = 1, ma
do jj = a%irp(irw), a%irp(irw + 1) - 1 do jj = a%irp(irw), a%irp(irw + 1) - 1
@ -3996,14 +4005,14 @@ contains
!$omp barrier !$omp barrier
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
if (omp_get_thread_num() /= 0) then if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if end if
do irw = start_idx, end_idx - 1 do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do end do
end if
!$omp barrier !$omp barrier
!$omp single !$omp single
@ -4011,9 +4020,10 @@ contains
call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info) call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single !$omp end single
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
end if
!$omp end parallel !$omp end parallel
end subroutine spmm_omp_gustavson_1d end subroutine spmm_omp_gustavson_1d

@ -3805,6 +3805,7 @@ contains
integer(psb_ipk_) :: ma, nb integer(psb_ipk_) :: ma, nb
integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
integer(psb_ipk_) :: nth, lth,ith
ma = a%get_nrows() ma = a%get_nrows()
nb = b%get_ncols() nb = b%get_ncols()
@ -3815,12 +3816,19 @@ contains
! dense accumulator ! dense accumulator
! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
call psb_realloc(nb, acc, info) call psb_realloc(nb, acc, info)
!$omp parallel shared(nth,lth)
!$omp single
nth = omp_get_num_threads()
lth = min(nth, ma)
!$omp end single
!$omp end parallel
allocate(offsets(omp_get_max_threads())) allocate(offsets(omp_get_max_threads()))
!$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
!$omp shared(a,b,c,offsets) !$omp num_threads(lth) shared(a,b,c,offsets)
thread_upperbound = 0 thread_upperbound = 0
start_idx = 0 start_idx = 0
end_idx = 0
!$omp do schedule(static) private(irw, jj, j) !$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma do irw = 1, ma
if (start_idx == 0) then if (start_idx == 0) then
@ -3876,15 +3884,14 @@ contains
!$omp end single !$omp end single
!$omp barrier !$omp barrier
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
if (omp_get_thread_num() /= 0) then if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if end if
do irw = start_idx, end_idx - 1 do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do end do
end if
!$omp barrier !$omp barrier
!$omp single !$omp single
@ -3892,9 +3899,10 @@ contains
call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info) call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single !$omp end single
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
end if
!$omp end parallel !$omp end parallel
end subroutine spmm_omp_gustavson end subroutine spmm_omp_gustavson
@ -3930,6 +3938,7 @@ contains
!$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
thread_upperbound = 0 thread_upperbound = 0
start_idx = 0 start_idx = 0
end_idx = 0
!$omp do schedule(static) private(irw, jj, j) !$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma do irw = 1, ma
do jj = a%irp(irw), a%irp(irw + 1) - 1 do jj = a%irp(irw), a%irp(irw + 1) - 1
@ -3996,14 +4005,14 @@ contains
!$omp barrier !$omp barrier
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
if (omp_get_thread_num() /= 0) then if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if end if
do irw = start_idx, end_idx - 1 do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do end do
end if
!$omp barrier !$omp barrier
!$omp single !$omp single
@ -4011,9 +4020,10 @@ contains
call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info) call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single !$omp end single
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
end if
!$omp end parallel !$omp end parallel
end subroutine spmm_omp_gustavson_1d end subroutine spmm_omp_gustavson_1d

@ -3805,6 +3805,7 @@ contains
integer(psb_ipk_) :: ma, nb integer(psb_ipk_) :: ma, nb
integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
integer(psb_ipk_) :: nth, lth,ith
ma = a%get_nrows() ma = a%get_nrows()
nb = b%get_ncols() nb = b%get_ncols()
@ -3815,12 +3816,19 @@ contains
! dense accumulator ! dense accumulator
! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
call psb_realloc(nb, acc, info) call psb_realloc(nb, acc, info)
!$omp parallel shared(nth,lth)
!$omp single
nth = omp_get_num_threads()
lth = min(nth, ma)
!$omp end single
!$omp end parallel
allocate(offsets(omp_get_max_threads())) allocate(offsets(omp_get_max_threads()))
!$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
!$omp shared(a,b,c,offsets) !$omp num_threads(lth) shared(a,b,c,offsets)
thread_upperbound = 0 thread_upperbound = 0
start_idx = 0 start_idx = 0
end_idx = 0
!$omp do schedule(static) private(irw, jj, j) !$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma do irw = 1, ma
if (start_idx == 0) then if (start_idx == 0) then
@ -3876,15 +3884,14 @@ contains
!$omp end single !$omp end single
!$omp barrier !$omp barrier
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
if (omp_get_thread_num() /= 0) then if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if end if
do irw = start_idx, end_idx - 1 do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do end do
end if
!$omp barrier !$omp barrier
!$omp single !$omp single
@ -3892,9 +3899,10 @@ contains
call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info) call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single !$omp end single
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
end if
!$omp end parallel !$omp end parallel
end subroutine spmm_omp_gustavson end subroutine spmm_omp_gustavson
@ -3930,6 +3938,7 @@ contains
!$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
thread_upperbound = 0 thread_upperbound = 0
start_idx = 0 start_idx = 0
end_idx = 0
!$omp do schedule(static) private(irw, jj, j) !$omp do schedule(static) private(irw, jj, j)
do irw = 1, ma do irw = 1, ma
do jj = a%irp(irw), a%irp(irw + 1) - 1 do jj = a%irp(irw), a%irp(irw + 1) - 1
@ -3996,14 +4005,14 @@ contains
!$omp barrier !$omp barrier
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
if (omp_get_thread_num() /= 0) then if (omp_get_thread_num() /= 0) then
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
end if end if
do irw = start_idx, end_idx - 1 do irw = start_idx, end_idx - 1
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
end do end do
end if
!$omp barrier !$omp barrier
!$omp single !$omp single
@ -4011,9 +4020,10 @@ contains
call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%val, info)
call psb_realloc(c%irp(ma + 1), c%ja, info) call psb_realloc(c%irp(ma + 1), c%ja, info)
!$omp end single !$omp end single
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
end if
!$omp end parallel !$omp end parallel
end subroutine spmm_omp_gustavson_1d end subroutine spmm_omp_gustavson_1d

Loading…
Cancel
Save