|
|
|
@ -3805,6 +3805,7 @@ contains
|
|
|
|
|
integer(psb_ipk_) :: ma, nb
|
|
|
|
|
integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
|
|
|
|
|
integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
|
|
|
|
|
integer(psb_ipk_) :: nth, lth,ith
|
|
|
|
|
|
|
|
|
|
ma = a%get_nrows()
|
|
|
|
|
nb = b%get_ncols()
|
|
|
|
@ -3815,12 +3816,19 @@ contains
|
|
|
|
|
! dense accumulator
|
|
|
|
|
! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
|
|
|
|
|
call psb_realloc(nb, acc, info)
|
|
|
|
|
!$omp parallel shared(nth,lth)
|
|
|
|
|
!$omp single
|
|
|
|
|
nth = omp_get_num_threads()
|
|
|
|
|
lth = min(nth, ma)
|
|
|
|
|
!$omp end single
|
|
|
|
|
!$omp end parallel
|
|
|
|
|
|
|
|
|
|
allocate(offsets(omp_get_max_threads()))
|
|
|
|
|
!$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
|
|
|
|
|
!$omp shared(a,b,c,offsets)
|
|
|
|
|
!$omp num_threads(lth) shared(a,b,c,offsets)
|
|
|
|
|
thread_upperbound = 0
|
|
|
|
|
start_idx = 0
|
|
|
|
|
end_idx = 0
|
|
|
|
|
!$omp do schedule(static) private(irw, jj, j)
|
|
|
|
|
do irw = 1, ma
|
|
|
|
|
if (start_idx == 0) then
|
|
|
|
@ -3876,15 +3884,14 @@ contains
|
|
|
|
|
!$omp end single
|
|
|
|
|
|
|
|
|
|
!$omp barrier
|
|
|
|
|
|
|
|
|
|
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
|
|
|
|
|
if (omp_get_thread_num() /= 0) then
|
|
|
|
|
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
|
|
|
|
|
end if
|
|
|
|
|
|
|
|
|
|
do irw = start_idx, end_idx - 1
|
|
|
|
|
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
|
|
|
|
|
end do
|
|
|
|
|
|
|
|
|
|
end if
|
|
|
|
|
!$omp barrier
|
|
|
|
|
|
|
|
|
|
!$omp single
|
|
|
|
@ -3892,9 +3899,10 @@ contains
|
|
|
|
|
call psb_realloc(c%irp(ma + 1), c%val, info)
|
|
|
|
|
call psb_realloc(c%irp(ma + 1), c%ja, info)
|
|
|
|
|
!$omp end single
|
|
|
|
|
|
|
|
|
|
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
|
|
|
|
|
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
|
|
|
|
|
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
|
|
|
|
|
end if
|
|
|
|
|
!$omp end parallel
|
|
|
|
|
end subroutine spmm_omp_gustavson
|
|
|
|
|
|
|
|
|
@ -3930,6 +3938,7 @@ contains
|
|
|
|
|
!$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
|
|
|
|
|
thread_upperbound = 0
|
|
|
|
|
start_idx = 0
|
|
|
|
|
end_idx = 0
|
|
|
|
|
!$omp do schedule(static) private(irw, jj, j)
|
|
|
|
|
do irw = 1, ma
|
|
|
|
|
do jj = a%irp(irw), a%irp(irw + 1) - 1
|
|
|
|
@ -3996,14 +4005,14 @@ contains
|
|
|
|
|
|
|
|
|
|
!$omp barrier
|
|
|
|
|
|
|
|
|
|
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
|
|
|
|
|
if (omp_get_thread_num() /= 0) then
|
|
|
|
|
c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
|
|
|
|
|
end if
|
|
|
|
|
|
|
|
|
|
do irw = start_idx, end_idx - 1
|
|
|
|
|
c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
|
|
|
|
|
end do
|
|
|
|
|
|
|
|
|
|
end if
|
|
|
|
|
!$omp barrier
|
|
|
|
|
|
|
|
|
|
!$omp single
|
|
|
|
@ -4011,9 +4020,10 @@ contains
|
|
|
|
|
call psb_realloc(c%irp(ma + 1), c%val, info)
|
|
|
|
|
call psb_realloc(c%irp(ma + 1), c%ja, info)
|
|
|
|
|
!$omp end single
|
|
|
|
|
|
|
|
|
|
if ((start_idx /= 0).and.(start_idx <= end_idx) ) then
|
|
|
|
|
c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
|
|
|
|
|
c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
|
|
|
|
|
end if
|
|
|
|
|
!$omp end parallel
|
|
|
|
|
end subroutine spmm_omp_gustavson_1d
|
|
|
|
|
|
|
|
|
|