Fix OMP impl of sparse-sparse product

2 years ago · e72c0f0bf9
parent 025350a361
commit e72c0f0bf9
4 changed files with 128 additions and 88 deletions
--- a/base/serial/impl/psb_c_csr_impl.F90
+++ b/base/serial/impl/psb_c_csr_impl.F90
@ -3805,6 +3805,7 @@ contains
    integer(psb_ipk_)   :: ma, nb
    integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
    integer(psb_ipk_)   :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
+    integer(psb_ipk_) :: nth, lth,ith

    ma = a%get_nrows()
    nb = b%get_ncols()
@ -3815,12 +3816,19 @@ contains
    ! dense accumulator
    ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
    call psb_realloc(nb, acc, info)
+    !$omp parallel shared(nth,lth)
+    !$omp single
+    nth = omp_get_num_threads()
+    lth = min(nth, ma)
+    !$omp end single
+    !$omp end parallel

    allocate(offsets(omp_get_max_threads()))
    !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
-    !$omp shared(a,b,c,offsets) 
+    !$omp num_threads(lth) shared(a,b,c,offsets) 
    thread_upperbound = 0
    start_idx = 0
+    end_idx   = 0
    !$omp do schedule(static) private(irw, jj, j)
    do irw = 1, ma
      if (start_idx == 0) then
@ -3876,15 +3884,14 @@ contains
    !$omp end single

    !$omp barrier
-
-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
    end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
    !$omp barrier

    !$omp single
@ -3892,9 +3899,10 @@ contains
    call psb_realloc(c%irp(ma + 1), c%val, info)        
    call psb_realloc(c%irp(ma + 1), c%ja, info)
    !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
    !$omp end parallel
  end subroutine spmm_omp_gustavson

@ -3930,6 +3938,7 @@ contains
    !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
    thread_upperbound = 0
    start_idx = 0
+    end_idx   = 0 
    !$omp do schedule(static) private(irw, jj, j)
    do irw = 1, ma
      do jj = a%irp(irw), a%irp(irw + 1) - 1
@ -3996,14 +4005,14 @@ contains

    !$omp barrier

-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
    end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
    !$omp barrier

    !$omp single
@ -4011,9 +4020,10 @@ contains
    call psb_realloc(c%irp(ma + 1), c%val, info)        
    call psb_realloc(c%irp(ma + 1), c%ja, info)
    !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
    !$omp end parallel
  end subroutine spmm_omp_gustavson_1d

--- a/base/serial/impl/psb_d_csr_impl.F90
+++ b/base/serial/impl/psb_d_csr_impl.F90
@ -3805,6 +3805,7 @@ contains
    integer(psb_ipk_)   :: ma, nb
    integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
    integer(psb_ipk_)   :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
+    integer(psb_ipk_) :: nth, lth,ith

    ma = a%get_nrows()
    nb = b%get_ncols()
@ -3815,12 +3816,19 @@ contains
    ! dense accumulator
    ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
    call psb_realloc(nb, acc, info)
+    !$omp parallel shared(nth,lth)
+    !$omp single
+    nth = omp_get_num_threads()
+    lth = min(nth, ma)
+    !$omp end single
+    !$omp end parallel

    allocate(offsets(omp_get_max_threads()))
    !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
-    !$omp shared(a,b,c,offsets) 
+    !$omp num_threads(lth) shared(a,b,c,offsets) 
    thread_upperbound = 0
    start_idx = 0
+    end_idx   = 0
    !$omp do schedule(static) private(irw, jj, j)
    do irw = 1, ma
      if (start_idx == 0) then
@ -3876,15 +3884,14 @@ contains
    !$omp end single

    !$omp barrier
-
-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
    end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
    !$omp barrier

    !$omp single
@ -3892,9 +3899,10 @@ contains
    call psb_realloc(c%irp(ma + 1), c%val, info)        
    call psb_realloc(c%irp(ma + 1), c%ja, info)
    !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
    !$omp end parallel
  end subroutine spmm_omp_gustavson

@ -3930,6 +3938,7 @@ contains
    !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
    thread_upperbound = 0
    start_idx = 0
+    end_idx   = 0 
    !$omp do schedule(static) private(irw, jj, j)
    do irw = 1, ma
      do jj = a%irp(irw), a%irp(irw + 1) - 1
@ -3996,14 +4005,14 @@ contains

    !$omp barrier

-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
    end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
    !$omp barrier

    !$omp single
@ -4011,9 +4020,10 @@ contains
    call psb_realloc(c%irp(ma + 1), c%val, info)        
    call psb_realloc(c%irp(ma + 1), c%ja, info)
    !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
    !$omp end parallel
  end subroutine spmm_omp_gustavson_1d

--- a/base/serial/impl/psb_s_csr_impl.F90
+++ b/base/serial/impl/psb_s_csr_impl.F90
@ -3805,6 +3805,7 @@ contains
    integer(psb_ipk_)   :: ma, nb
    integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
    integer(psb_ipk_)   :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
+    integer(psb_ipk_) :: nth, lth,ith

    ma = a%get_nrows()
    nb = b%get_ncols()
@ -3815,12 +3816,19 @@ contains
    ! dense accumulator
    ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
    call psb_realloc(nb, acc, info)
+    !$omp parallel shared(nth,lth)
+    !$omp single
+    nth = omp_get_num_threads()
+    lth = min(nth, ma)
+    !$omp end single
+    !$omp end parallel

    allocate(offsets(omp_get_max_threads()))
    !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
-    !$omp shared(a,b,c,offsets) 
+    !$omp num_threads(lth) shared(a,b,c,offsets) 
    thread_upperbound = 0
    start_idx = 0
+    end_idx   = 0
    !$omp do schedule(static) private(irw, jj, j)
    do irw = 1, ma
      if (start_idx == 0) then
@ -3876,15 +3884,14 @@ contains
    !$omp end single

    !$omp barrier
-
-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
    end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
    !$omp barrier

    !$omp single
@ -3892,9 +3899,10 @@ contains
    call psb_realloc(c%irp(ma + 1), c%val, info)        
    call psb_realloc(c%irp(ma + 1), c%ja, info)
    !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
    !$omp end parallel
  end subroutine spmm_omp_gustavson

@ -3930,6 +3938,7 @@ contains
    !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
    thread_upperbound = 0
    start_idx = 0
+    end_idx   = 0 
    !$omp do schedule(static) private(irw, jj, j)
    do irw = 1, ma
      do jj = a%irp(irw), a%irp(irw + 1) - 1
@ -3996,14 +4005,14 @@ contains

    !$omp barrier

-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
    end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
    !$omp barrier

    !$omp single
@ -4011,9 +4020,10 @@ contains
    call psb_realloc(c%irp(ma + 1), c%val, info)        
    call psb_realloc(c%irp(ma + 1), c%ja, info)
    !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
    !$omp end parallel
  end subroutine spmm_omp_gustavson_1d

--- a/base/serial/impl/psb_z_csr_impl.F90
+++ b/base/serial/impl/psb_z_csr_impl.F90
@ -3805,6 +3805,7 @@ contains
    integer(psb_ipk_)   :: ma, nb
    integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
    integer(psb_ipk_)   :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
+    integer(psb_ipk_) :: nth, lth,ith

    ma = a%get_nrows()
    nb = b%get_ncols()
@ -3815,12 +3816,19 @@ contains
    ! dense accumulator
    ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
    call psb_realloc(nb, acc, info)
+    !$omp parallel shared(nth,lth)
+    !$omp single
+    nth = omp_get_num_threads()
+    lth = min(nth, ma)
+    !$omp end single
+    !$omp end parallel

    allocate(offsets(omp_get_max_threads()))
    !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
-    !$omp shared(a,b,c,offsets) 
+    !$omp num_threads(lth) shared(a,b,c,offsets) 
    thread_upperbound = 0
    start_idx = 0
+    end_idx   = 0
    !$omp do schedule(static) private(irw, jj, j)
    do irw = 1, ma
      if (start_idx == 0) then
@ -3876,15 +3884,14 @@ contains
    !$omp end single

    !$omp barrier
-
-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
    end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
    !$omp barrier

    !$omp single
@ -3892,9 +3899,10 @@ contains
    call psb_realloc(c%irp(ma + 1), c%val, info)        
    call psb_realloc(c%irp(ma + 1), c%ja, info)
    !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
    !$omp end parallel
  end subroutine spmm_omp_gustavson

@ -3930,6 +3938,7 @@ contains
    !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
    thread_upperbound = 0
    start_idx = 0
+    end_idx   = 0 
    !$omp do schedule(static) private(irw, jj, j)
    do irw = 1, ma
      do jj = a%irp(irw), a%irp(irw + 1) - 1
@ -3996,14 +4005,14 @@ contains

    !$omp barrier

-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
    end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
    !$omp barrier

    !$omp single
@ -4011,9 +4020,10 @@ contains
    call psb_realloc(c%irp(ma + 1), c%val, info)        
    call psb_realloc(c%irp(ma + 1), c%ja, info)
    !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
    !$omp end parallel
  end subroutine spmm_omp_gustavson_1d