From e72c0f0bf96c3fc56c83620230caac5d1463ee7c Mon Sep 17 00:00:00 2001
From: sfilippone <filippone.salvatore@gmail.com>
Date: Mon, 29 Apr 2024 15:04:23 +0200
Subject: [PATCH] Fix OMP impl of sparse-sparse product

---
 base/serial/impl/psb_c_csr_impl.F90 | 54 +++++++++++++++++------------
 base/serial/impl/psb_d_csr_impl.F90 | 54 +++++++++++++++++------------
 base/serial/impl/psb_s_csr_impl.F90 | 54 +++++++++++++++++------------
 base/serial/impl/psb_z_csr_impl.F90 | 54 +++++++++++++++++------------
 4 files changed, 128 insertions(+), 88 deletions(-)

diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90
index 9354098b..276d3d1c 100644
--- a/base/serial/impl/psb_c_csr_impl.F90
+++ b/base/serial/impl/psb_c_csr_impl.F90
@@ -3805,6 +3805,7 @@ contains
     integer(psb_ipk_)   :: ma, nb
     integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
     integer(psb_ipk_)   :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
+    integer(psb_ipk_) :: nth, lth,ith
 
     ma = a%get_nrows()
     nb = b%get_ncols()
@@ -3815,12 +3816,19 @@ contains
     ! dense accumulator
     ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
     call psb_realloc(nb, acc, info)
+    !$omp parallel shared(nth,lth)
+    !$omp single
+    nth = omp_get_num_threads()
+    lth = min(nth, ma)
+    !$omp end single
+    !$omp end parallel
 
     allocate(offsets(omp_get_max_threads()))
     !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
-    !$omp shared(a,b,c,offsets) 
+    !$omp num_threads(lth) shared(a,b,c,offsets) 
     thread_upperbound = 0
     start_idx = 0
+    end_idx   = 0
     !$omp do schedule(static) private(irw, jj, j)
     do irw = 1, ma
       if (start_idx == 0) then
@@ -3876,15 +3884,14 @@ contains
     !$omp end single
 
     !$omp barrier
-
-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
     end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
     !$omp barrier
 
     !$omp single
@@ -3892,9 +3899,10 @@ contains
     call psb_realloc(c%irp(ma + 1), c%val, info)        
     call psb_realloc(c%irp(ma + 1), c%ja, info)
     !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
     !$omp end parallel
   end subroutine spmm_omp_gustavson
 
@@ -3930,6 +3938,7 @@ contains
     !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
     thread_upperbound = 0
     start_idx = 0
+    end_idx   = 0 
     !$omp do schedule(static) private(irw, jj, j)
     do irw = 1, ma
       do jj = a%irp(irw), a%irp(irw + 1) - 1
@@ -3996,14 +4005,14 @@ contains
 
     !$omp barrier
 
-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
     end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
     !$omp barrier
 
     !$omp single
@@ -4011,9 +4020,10 @@ contains
     call psb_realloc(c%irp(ma + 1), c%val, info)        
     call psb_realloc(c%irp(ma + 1), c%ja, info)
     !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
     !$omp end parallel
   end subroutine spmm_omp_gustavson_1d
 
diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90
index bc0efcc9..a0aaeeee 100644
--- a/base/serial/impl/psb_d_csr_impl.F90
+++ b/base/serial/impl/psb_d_csr_impl.F90
@@ -3805,6 +3805,7 @@ contains
     integer(psb_ipk_)   :: ma, nb
     integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
     integer(psb_ipk_)   :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
+    integer(psb_ipk_) :: nth, lth,ith
 
     ma = a%get_nrows()
     nb = b%get_ncols()
@@ -3815,12 +3816,19 @@ contains
     ! dense accumulator
     ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
     call psb_realloc(nb, acc, info)
+    !$omp parallel shared(nth,lth)
+    !$omp single
+    nth = omp_get_num_threads()
+    lth = min(nth, ma)
+    !$omp end single
+    !$omp end parallel
 
     allocate(offsets(omp_get_max_threads()))
     !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
-    !$omp shared(a,b,c,offsets) 
+    !$omp num_threads(lth) shared(a,b,c,offsets) 
     thread_upperbound = 0
     start_idx = 0
+    end_idx   = 0
     !$omp do schedule(static) private(irw, jj, j)
     do irw = 1, ma
       if (start_idx == 0) then
@@ -3876,15 +3884,14 @@ contains
     !$omp end single
 
     !$omp barrier
-
-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
     end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
     !$omp barrier
 
     !$omp single
@@ -3892,9 +3899,10 @@ contains
     call psb_realloc(c%irp(ma + 1), c%val, info)        
     call psb_realloc(c%irp(ma + 1), c%ja, info)
     !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
     !$omp end parallel
   end subroutine spmm_omp_gustavson
 
@@ -3930,6 +3938,7 @@ contains
     !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
     thread_upperbound = 0
     start_idx = 0
+    end_idx   = 0 
     !$omp do schedule(static) private(irw, jj, j)
     do irw = 1, ma
       do jj = a%irp(irw), a%irp(irw + 1) - 1
@@ -3996,14 +4005,14 @@ contains
 
     !$omp barrier
 
-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
     end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
     !$omp barrier
 
     !$omp single
@@ -4011,9 +4020,10 @@ contains
     call psb_realloc(c%irp(ma + 1), c%val, info)        
     call psb_realloc(c%irp(ma + 1), c%ja, info)
     !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
     !$omp end parallel
   end subroutine spmm_omp_gustavson_1d
 
diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90
index ba70e021..9d5dc145 100644
--- a/base/serial/impl/psb_s_csr_impl.F90
+++ b/base/serial/impl/psb_s_csr_impl.F90
@@ -3805,6 +3805,7 @@ contains
     integer(psb_ipk_)   :: ma, nb
     integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
     integer(psb_ipk_)   :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
+    integer(psb_ipk_) :: nth, lth,ith
 
     ma = a%get_nrows()
     nb = b%get_ncols()
@@ -3815,12 +3816,19 @@ contains
     ! dense accumulator
     ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
     call psb_realloc(nb, acc, info)
+    !$omp parallel shared(nth,lth)
+    !$omp single
+    nth = omp_get_num_threads()
+    lth = min(nth, ma)
+    !$omp end single
+    !$omp end parallel
 
     allocate(offsets(omp_get_max_threads()))
     !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
-    !$omp shared(a,b,c,offsets) 
+    !$omp num_threads(lth) shared(a,b,c,offsets) 
     thread_upperbound = 0
     start_idx = 0
+    end_idx   = 0
     !$omp do schedule(static) private(irw, jj, j)
     do irw = 1, ma
       if (start_idx == 0) then
@@ -3876,15 +3884,14 @@ contains
     !$omp end single
 
     !$omp barrier
-
-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
     end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
     !$omp barrier
 
     !$omp single
@@ -3892,9 +3899,10 @@ contains
     call psb_realloc(c%irp(ma + 1), c%val, info)        
     call psb_realloc(c%irp(ma + 1), c%ja, info)
     !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
     !$omp end parallel
   end subroutine spmm_omp_gustavson
 
@@ -3930,6 +3938,7 @@ contains
     !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
     thread_upperbound = 0
     start_idx = 0
+    end_idx   = 0 
     !$omp do schedule(static) private(irw, jj, j)
     do irw = 1, ma
       do jj = a%irp(irw), a%irp(irw + 1) - 1
@@ -3996,14 +4005,14 @@ contains
 
     !$omp barrier
 
-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
     end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
     !$omp barrier
 
     !$omp single
@@ -4011,9 +4020,10 @@ contains
     call psb_realloc(c%irp(ma + 1), c%val, info)        
     call psb_realloc(c%irp(ma + 1), c%ja, info)
     !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
     !$omp end parallel
   end subroutine spmm_omp_gustavson_1d
 
diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90
index 23a4fb84..7f11c3bd 100644
--- a/base/serial/impl/psb_z_csr_impl.F90
+++ b/base/serial/impl/psb_z_csr_impl.F90
@@ -3805,6 +3805,7 @@ contains
     integer(psb_ipk_)   :: ma, nb
     integer(psb_ipk_), allocatable :: col_inds(:), offsets(:)
     integer(psb_ipk_)   :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx
+    integer(psb_ipk_) :: nth, lth,ith
 
     ma = a%get_nrows()
     nb = b%get_ncols()
@@ -3815,12 +3816,19 @@ contains
     ! dense accumulator
     ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf
     call psb_realloc(nb, acc, info)
+    !$omp parallel shared(nth,lth)
+    !$omp single
+    nth = omp_get_num_threads()
+    lth = min(nth, ma)
+    !$omp end single
+    !$omp end parallel
 
     allocate(offsets(omp_get_max_threads()))
     !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) &
-    !$omp shared(a,b,c,offsets) 
+    !$omp num_threads(lth) shared(a,b,c,offsets) 
     thread_upperbound = 0
     start_idx = 0
+    end_idx   = 0
     !$omp do schedule(static) private(irw, jj, j)
     do irw = 1, ma
       if (start_idx == 0) then
@@ -3876,15 +3884,14 @@ contains
     !$omp end single
 
     !$omp barrier
-
-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
     end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
     !$omp barrier
 
     !$omp single
@@ -3892,9 +3899,10 @@ contains
     call psb_realloc(c%irp(ma + 1), c%val, info)        
     call psb_realloc(c%irp(ma + 1), c%ja, info)
     !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
     !$omp end parallel
   end subroutine spmm_omp_gustavson
 
@@ -3930,6 +3938,7 @@ contains
     !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets)
     thread_upperbound = 0
     start_idx = 0
+    end_idx   = 0 
     !$omp do schedule(static) private(irw, jj, j)
     do irw = 1, ma
       do jj = a%irp(irw), a%irp(irw + 1) - 1
@@ -3996,14 +4005,14 @@ contains
 
     !$omp barrier
 
-    if (omp_get_thread_num() /= 0) then
-      c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      if (omp_get_thread_num() /= 0) then
+        c%irp(start_idx) = offsets(omp_get_thread_num()) + 1
+      end if
+      do irw = start_idx, end_idx - 1
+        c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
+      end do
     end if
-
-    do irw = start_idx, end_idx - 1
-      c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw)
-    end do
-
     !$omp barrier
 
     !$omp single
@@ -4011,9 +4020,10 @@ contains
     call psb_realloc(c%irp(ma + 1), c%val, info)        
     call psb_realloc(c%irp(ma + 1), c%ja, info)
     !$omp end single
-
-    c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
-    c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    if ((start_idx /= 0).and.(start_idx <= end_idx) ) then 
+      c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz)
+      c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz)
+    end if
     !$omp end parallel
   end subroutine spmm_omp_gustavson_1d