diff --git a/base/modules/desc/psb_gen_block_map_mod.F90 b/base/modules/desc/psb_gen_block_map_mod.F90 index f0c433e0..650bb430 100644 --- a/base/modules/desc/psb_gen_block_map_mod.F90 +++ b/base/modules/desc/psb_gen_block_map_mod.F90 @@ -215,7 +215,9 @@ contains end if if (present(mask)) then - + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,idx,idxmap,owned_,info) & + !$omp private(i) do i=1, size(idx) if (mask(i)) then if ((1<=idx(i)).and.(idx(i) <= idxmap%local_rows)) then @@ -229,9 +231,11 @@ contains end if end if end do - + !$omp end parallel do else if (.not.present(mask)) then - + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(idx,idxmap,owned_,info) & + !$omp private(i) do i=1, size(idx) if ((1<=idx(i)).and.(idx(i) <= idxmap%local_rows)) then idx(i) = idxmap%min_glob_row + idx(i) - 1 @@ -243,7 +247,7 @@ contains info = -1 end if end do - + !$omp end parallel do end if end subroutine block_ll2gv1 @@ -277,7 +281,9 @@ contains end if if (present(mask)) then - + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,idxin,idxout,idxmap,owned_,info) & + !$omp private(i) do i=1, im if (mask(i)) then if ((1<=idxin(i)).and.(idxin(i) <= idxmap%local_rows)) then @@ -291,9 +297,11 @@ contains end if end if end do - + !$omp end parallel do else if (.not.present(mask)) then - + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(idxin,idxout,idxmap,owned_,info) & + !$omp private(i) do i=1, im if ((1<=idxin(i)).and.(idxin(i) <= idxmap%local_rows)) then idxout(i) = idxmap%min_glob_row + idxin(i) - 1 @@ -305,7 +313,7 @@ contains info = -1 end if end do - + !$omp end parallel do end if if (is > im) then @@ -392,6 +400,9 @@ contains if (present(mask)) then if (idxmap%is_asb()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,is,idx,idxmap,owned_) & + !$omp private(i,nv,tidx) do i=1, is if (mask(i)) then if ((idxmap%min_glob_row <= idx(i)).and. & @@ -408,7 +419,11 @@ contains end if end if end do + !$omp end parallel do else if (idxmap%is_valid()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,is,idx,idxmap,owned_) & + !$omp private(i,ip,lip,tidx,info) do i=1,is if (mask(i)) then if ((idxmap%min_glob_row <= idx(i)).and.& @@ -424,8 +439,8 @@ contains end if end if end do + !$omp end parallel do else -!!$ write(0,*) 'Block status: invalid ',idxmap%get_state() idx(1:is) = -1 info = -1 end if @@ -433,6 +448,9 @@ contains else if (.not.present(mask)) then if (idxmap%is_asb()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(is,idx,idxmap,owned_) & + !$omp private(i,nv,tidx) do i=1, is if ((idxmap%min_glob_row <= idx(i)).and.& & (idx(i) <= idxmap%max_glob_row)) then @@ -447,8 +465,11 @@ contains idx(i) = -1 end if end do - + !$omp end parallel do else if (idxmap%is_valid()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(is,idx,idxmap,owned_) & + !$omp private(i,ip,lip,tidx,info) do i=1,is if ((idxmap%min_glob_row <= idx(i)).and.& & (idx(i) <= idxmap%max_glob_row)) then @@ -462,6 +483,7 @@ contains idx(i) = -1 end if end do + !$omp end parallel do else idx(1:is) = -1 info = -1 @@ -953,7 +975,9 @@ contains end if info = psb_success_ else - info = -5 + write(0,*) 'From has_search_ins:',info,ip,lip,nxt,& + & idxmap%min_glob_row,idxmap%max_glob_row + info = -6 return end if idxout(i) = lip + idxmap%local_rows @@ -1131,7 +1155,7 @@ contains idxmap%global_cols = ntot idxmap%local_rows = nl idxmap%local_cols = nl - idxmap%ctxt = ctxt + idxmap%ctxt = ctxt idxmap%state = psb_desc_bld_ idxmap%mpic = psb_get_mpi_comm(ctxt) idxmap%min_glob_row = vnl(iam)+1 diff --git a/base/modules/desc/psb_hash_map_mod.F90 b/base/modules/desc/psb_hash_map_mod.F90 index 058dbb8d..09cde3d4 100644 --- a/base/modules/desc/psb_hash_map_mod.F90 +++ b/base/modules/desc/psb_hash_map_mod.F90 @@ -63,7 +63,7 @@ module psb_hash_map_mod integer(psb_lpk_) :: hashvsize, hashvmask integer(psb_ipk_), allocatable :: hashv(:) integer(psb_lpk_), allocatable :: glb_lc(:,:), loc_to_glob(:) - type(psb_hash_type) :: hash + type(psb_hash_type) :: hash contains @@ -221,6 +221,9 @@ contains if (present(mask)) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,idx,idxmap,owned_) & + !$omp private(i) do i=1, size(idx) if (mask(i)) then if ((1<=idx(i)).and.(idx(i) <= idxmap%local_rows)) then @@ -233,9 +236,12 @@ contains end if end if end do - + !$omp end parallel do else if (.not.present(mask)) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(idx,idxmap,owned_) & + !$omp private(i) do i=1, size(idx) if ((1<=idx(i)).and.(idx(i) <= idxmap%local_rows)) then idx(i) = idxmap%loc_to_glob(idx(i)) @@ -246,7 +252,7 @@ contains idx(i) = -1 end if end do - + !$omp end parallel do end if end subroutine hash_l2gv1 @@ -363,6 +369,9 @@ contains else if (idxmap%is_valid()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,is,idx,mglob,idxmap,nrm,ncol,nrow,owned_) & + !$omp private(i,ip,lip,tlip,info) do i = 1, is if (mask(i)) then ip = idx(i) @@ -388,7 +397,7 @@ contains endif end if enddo - + !$omp end parallel do else write(0,*) 'Hash status: invalid ',idxmap%get_state() idx(1:is) = -1 @@ -404,6 +413,9 @@ contains else if (idxmap%is_valid()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(is,idx,mglob,idxmap,nrm,ncol,nrow,owned_) & + !$omp private(i,ip,lip,tlip,info) do i = 1, is ip = idx(i) if ((ip < 1 ).or.(ip>mglob)) then @@ -427,14 +439,12 @@ contains idx(i) = lip endif enddo - + !$omp end parallel do else write(0,*) 'Hash status: invalid ',idxmap%get_state() idx(1:is) = -1 info = -1 - end if - end if end subroutine hash_g2lv1 @@ -493,6 +503,9 @@ contains else if (idxmap%is_valid()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,is,idxin,idxout,mglob,idxmap,nrm,ncol,nrow,owned_) & + !$omp private(i,ip,lip,tlip,info) do i = 1, is if (mask(i)) then ip = idxin(i) @@ -518,6 +531,7 @@ contains endif end if enddo + !$omp end parallel do else write(0,*) 'Hash status: invalid ',idxmap%get_state() idxout(1:is) = -1 @@ -533,6 +547,9 @@ contains else if (idxmap%is_valid()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(is,idxin,idxout,mglob,idxmap,nrm,ncol,nrow,owned_) & + !$omp private(i,ip,lip,tlip,info) do i = 1, is ip = idxin(i) if ((ip < 1 ).or.(ip>mglob)) then @@ -556,14 +573,12 @@ contains idxout(i) = lip endif enddo - + !$omp end parallel do else write(0,*) 'Hash status: invalid ',idxmap%get_state() idxout(1:is) = -1 info = -1 - end if - end if end subroutine hash_g2lv2 @@ -649,7 +664,7 @@ contains & err_act integer(psb_lpk_) :: mglob, ip, nxt, tlip type(psb_ctxt_type) :: ctxt - integer(psb_ipk_) :: me, np + integer(psb_ipk_) :: me, np,ith character(len=20) :: name,ch_err logical, allocatable :: mask_(:) !!$ logical :: use_openmp = .true. @@ -683,363 +698,243 @@ contains mglob = idxmap%get_gr() nrow = idxmap%get_lr() !write(0,*) me,name,' before loop ',psb_errstatus_fatal() -#ifdef OPENMP - !call OMP_init_lock(ins_lck) - - if (idxmap%is_bld()) then - - isLoopValid = .true. - ncol = idxmap%get_lc() - if (present(mask)) then - mask_ = mask - else - allocate(mask_(size(idx))) - mask_ = .true. - end if +#if defined(OPENMP) + isLoopValid = .true. + if (idxmap%is_bld()) then if (present(lidx)) then - if (present(mask)) then - !$omp critical(hash_g2l_ins) - - ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & - ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz,lidx) & - ! $ OMP private(i,ip,lip,tlip,nxt,info) & - ! $ OMP reduction(.AND.:isLoopValid) + if (present(mask)) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(lidx,mask,name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz) & + !$omp private(i,ip,lip,tlip,nxt,info) & + !$omp reduction(.AND.:isLoopValid) do i = 1, is - info = 0 - if (.not. isLoopValid) cycle - if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then + if (mask(i)) then + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob) ) then idx(i) = -1 cycle endif - !call OMP_set_lock(ins_lck) ncol = idxmap%get_lc() - !call OMP_unset_lock(ins_lck) - - ! At first, we check the index presence in 'idxmap'. Usually - ! the index is found. If it is not found, we repeat the checking, - ! but inside a critical region. call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - !call OMP_set_lock(ins_lck) - - ! We check again if the index is already in 'idxmap', this - ! time inside a critical region (we assume that the index - ! is often already existing). - ncol = idxmap%get_lc() - nxt = lidx(i) - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - - if (lip > 0) then - idx(i) = lip - else if (lip < 0) then - ! Index not found - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - - - if (info >= 0) then - ! 'nxt' is not equal to 'tlip' when the key is already inside - ! the hash map. In that case 'tlip' is the value corresponding - ! to the existing mapping. - if (nxt == tlip) then - - ncol = MAX(ncol,nxt) - - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - - if (info /= psb_success_) then - !write(0,*) 'Error spot 1' - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - - isLoopValid = .false. - idx(i) = -1 - else - idx(i) = lip - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - end if - end if + if (lip > 0) then + idx(i) = lip + info = psb_success_ + else + !$omp critical(hash_g2l_ins) + tlip = lip + nxt = lidx(i) + if (nxt <= nrow) then + idx(i) = -1 + else + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip > 0) then + idx(i) = lip else - idx(i) = -1 + call psb_hash_searchinskey(ip,lip,nxt,idxmap%hash,info) + if (info >=0) then + if (nxt == lip) then + call psb_ensure_size(nxt,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + info=1 + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_ensure_size',i_err=(/info/)) + isLoopValid = .false. + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(max(ncol,nxt)) + endif + idx(i) = lip + info = psb_success_ + else + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + isLoopValid = .false. + end if end if - !call OMP_unset_lock(ins_lck) - end if - else - idx(i) = lip + endif + !$omp end critical(hash_g2l_ins) end if else idx(i) = -1 end if + enddo + !$omp end parallel do - end do - ! $ OMP END PARALLEL DO - !$omp end critical(hash_g2l_ins) - - if (.not. isLoopValid) then - goto 9999 - end if - else - !$omp critical(hash_g2l_ins) + else if (.not.present(mask)) then - ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & - ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz,lidx) & - ! $ OMP private(i,ip,lip,tlip,nxt,info) & - ! $ OMP reduction(.AND.:isLoopValid) + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(lidx,name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz) & + !$omp private(i,ip,lip,tlip,nxt,info) & + !$omp reduction(.AND.:isLoopValid) do i = 1, is - info = 0 - if (.not. isLoopValid) cycle - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob) ) then idx(i) = -1 cycle endif - !call OMP_set_lock(ins_lck) ncol = idxmap%get_lc() - !call OMP_unset_lock(ins_lck) - - ! At first, we check the index presence in 'idxmap'. Usually - ! the index is found. If it is not found, we repeat the checking, - ! but inside a critical region. call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - !call OMP_set_lock(ins_lck) - ! We check again if the index is already in 'idxmap', this - ! time inside a critical region (we assume that the index - ! is often already existing). - ncol = idxmap%get_lc() - nxt = lidx(i) - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - - if (lip > 0) then - idx(i) = lip - else if (lip < 0) then - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - - if (info >= 0) then - ! 'nxt' is not equal to 'tlip' when the key is already inside - ! the hash map. In that case 'tlip' is the value corresponding - ! to the existing mapping. - if (nxt == tlip) then - - ncol = MAX(ncol,nxt) - - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& - & pad=-1_psb_lpk_,addsz=laddsz) - - if (info /= psb_success_) then - !write(0,*) 'Error spot 2' - call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - - isLoopValid = .false. - idx(i) = -1 - else - idx(i) = lip - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) - end if - end if + if (lip > 0) then + idx(i) = lip + info = psb_success_ + else + !$omp critical(hash_g2l_ins) + tlip = lip + nxt = lidx(i) + if (nxt <= nrow) then + idx(i) = -1 + else + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) + if (lip > 0) then + idx(i) = lip else - idx(i) = -1 + call psb_hash_searchinskey(ip,lip,nxt,idxmap%hash,info) + if (info >=0) then + if (nxt == lip) then + call psb_ensure_size(nxt,idxmap%loc_to_glob,info,& + & pad=-1_psb_lpk_,addsz=laddsz) + if (info /= psb_success_) then + info=1 + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='psb_ensure_size',i_err=(/info/)) + isLoopValid = .false. + end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(max(ncol,nxt)) + endif + idx(i) = lip + info = psb_success_ + else + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + isLoopValid = .false. + end if end if - !call OMP_unset_lock(ins_lck) - end if - else - idx(i) = lip + endif + !$omp end critical(hash_g2l_ins) end if - - end do - ! $ OMP END PARALLEL DO - !$omp end critical(hash_g2l_ins) - - if (.not. isLoopValid) then - goto 9999 - end if + enddo + !$omp end parallel do end if + else if (.not.present(lidx)) then - if(present(mask)) then - ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & - ! $ OMP shared(name,me,is,idx,ins_lck,mask,mglob,idxmap,ncol,nrow,laddsz) & - ! $ OMP private(i,ip,lip,tlip,nxt,info) & - ! $ OMP reduction(.AND.:isLoopValid) - !$omp critical(hash_g2l_ins) + if (present(mask)) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz) & + !$omp private(i,ip,lip,tlip,nxt,info) & + !$omp reduction(.AND.:isLoopValid) do i = 1, is - info = 0 - if (.not. isLoopValid) cycle if (mask(i)) then - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 cycle endif - !call OMP_set_lock(ins_lck) - ncol = idxmap%get_lc() - !call OMP_unset_lock(ins_lck) - - ! At first, we check the index presence in 'idxmap'. Usually - ! the index is found. If it is not found, we repeat the checking, - ! but inside a critical region. - !write(0,*) me,name,' b hic 1 ',psb_errstatus_fatal() + ncol = idxmap%get_lc() call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - !write(0,*) me,name,' a hic 1 ',psb_errstatus_fatal() - if (lip < 0) then - !call OMP_set_lock(ins_lck) - ! We check again if the index is already in 'idxmap', this - ! time inside a critical region (we assume that the index - ! is often already existing, so this lock is relatively rare). - ncol = idxmap%get_lc() - nxt = ncol + 1 - !write(0,*) me,name,' b hic 2 ',psb_errstatus_fatal() - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - !write(0,*) me,name,' a hic 2 ',psb_errstatus_fatal() + if (lip > 0) then + idx(i) = lip + info = psb_success_ + else + !$omp critical(hash_g2l_ins) + ncol = idxmap%get_lc() + nxt = ncol + 1 + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) if (lip > 0) then - idx(i) = lip - else if (lip < 0) then - ! Index not found - !write(0,*) me,name,' b hsik ',psb_errstatus_fatal() - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - if (psb_errstatus_fatal()) write(0,*) me,name,' a hsik ',info,omp_get_thread_num() - !write(0,*) me,name,' a hsik ',psb_errstatus_fatal() - lip = tlip - - if (info >= 0) then - !write(0,*) 'Error before spot 3', info - ! 'nxt' is not equal to 'tlip' when the key is already inside - ! the hash map. In that case 'tlip' is the value corresponding - ! to the existing mapping. - if (nxt == tlip) then - - ncol = MAX(ncol,nxt) - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + idx(i) = lip + else + call psb_hash_searchinskey(ip,lip,nxt,idxmap%hash,info) + if (info >=0) then + if (nxt == lip) then + call psb_ensure_size(nxt,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) - if (psb_errstatus_fatal()) write(0,*) me,name,' a esz ',info,omp_get_thread_num() if (info /= psb_success_) then - !write(0,*) 'Error spot 3', info + info=1 call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - + & a_err='psb_ensure_size',i_err=(/info/)) isLoopValid = .false. - idx(i) = -1 - else - idx(i) = lip - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) end if - end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(nxt) + endif + idx(i) = lip + info = psb_success_ else - idx(i) = -1 + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + isLoopValid = .false. end if - !call OMP_unset_lock(ins_lck) end if - else - idx(i) = lip + !$omp end critical(hash_g2l_ins) end if else idx(i) = -1 end if + enddo + !$omp end parallel do - end do - ! $ OMP END PARALLEL DO - !$omp end critical(hash_g2l_ins) + else if (.not.present(mask)) then - if (.not. isLoopValid) then - goto 9999 - end if - else - ! $ OMP PARALLEL DO default(none) schedule(DYNAMIC) & - ! $ OMP shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz) & - ! $ OMP private(i,ip,lip,tlip,nxt,info) & - ! $ OMP reduction(.AND.:isLoopValid) - !$omp critical(hash_g2l_ins) + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(name,me,is,idx,ins_lck,mglob,idxmap,ncol,nrow,laddsz) & + !$omp private(i,ip,lip,tlip,nxt,info) & + !$omp reduction(.AND.:isLoopValid) do i = 1, is - info = 0 - if (.not. isLoopValid) cycle - ip = idx(i) - if ((ip < 1 ).or.(ip>mglob)) then + ip = idx(i) + if ((ip < 1 ).or.(ip>mglob)) then idx(i) = -1 cycle endif - !call OMP_set_lock(ins_lck) - ncol = idxmap%get_lc() - !call OMP_unset_lock(ins_lck) - - ! At first, we check the index presence in 'idxmap'. Usually - ! the index is found. If it is not found, we repeat the checking, - ! but inside a critical region. + ncol = idxmap%get_lc() call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) - if (lip < 0) then - !call OMP_set_lock(ins_lck) - ! We check again if the index is already in 'idxmap', this - ! time inside a critical region (we assume that the index - ! is often already existing). - ncol = idxmap%get_lc() - nxt = ncol + 1 - call hash_inner_cnv(ip,lip,idxmap%hashvmask,& - & idxmap%hashv,idxmap%glb_lc,ncol) - + if (lip > 0) then + idx(i) = lip + info = psb_success_ + else + !$omp critical(hash_g2l_ins) + ncol = idxmap%get_lc() + nxt = ncol + 1 + call hash_inner_cnv(ip,lip,idxmap%hashvmask,idxmap%hashv,& + & idxmap%glb_lc,ncol) if (lip > 0) then - idx(i) = lip - else if (lip < 0) then - ! Index not found - call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) - lip = tlip - - if (info >= 0) then - ! 'nxt' is not equal to 'tlip' when the key is already inside - ! the hash map. In that case 'tlip' is the value corresponding - ! to the existing mapping. - if (nxt == tlip) then - - ncol = MAX(ncol,nxt) - - call psb_ensure_size(ncol,idxmap%loc_to_glob,info,& + idx(i) = lip + else + call psb_hash_searchinskey(ip,lip,nxt,idxmap%hash,info) + if (info >=0) then + if (nxt == lip) then + call psb_ensure_size(nxt,idxmap%loc_to_glob,info,& & pad=-1_psb_lpk_,addsz=laddsz) - if (info /= psb_success_) then - !write(0,*) 'Error spot 4' + info=1 call psb_errpush(psb_err_from_subroutine_ai_,name,& - &a_err='psb_ensure_size',i_err=(/info/)) - + & a_err='psb_ensure_size',i_err=(/info/)) isLoopValid = .false. - idx(i) = -1 - else - idx(i) = lip - idxmap%loc_to_glob(nxt) = ip - call idxmap%set_lc(ncol) end if - end if + idxmap%loc_to_glob(nxt) = ip + call idxmap%set_lc(nxt) + endif + idx(i) = lip + info = psb_success_ else - idx(i) = -1 + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='SearchInsKeyVal',i_err=(/info/)) + isLoopValid = .false. end if - !call OMP_unset_lock(ins_lck) end if - - else - idx(i) = lip + !$omp end critical(hash_g2l_ins) end if - - end do - ! $ OMP END PARALLEL DO - !$omp end critical(hash_g2l_ins) - - if (.not. isLoopValid) then - goto 9999 - end if - + enddo + !$omp end parallel do end if end if else @@ -1047,7 +942,7 @@ contains idx = -1 info = -1 end if - !call OMP_destroy_lock(ins_lck) + if (.not. isLoopValid) goto 9999 #else !!$ else if (.not.use_openmp) then isLoopValid = .true. @@ -1066,13 +961,13 @@ contains call hash_inner_cnv(ip,lip,idxmap%hashvmask,& & idxmap%hashv,idxmap%glb_lc,ncol) if (lip < 0) then - tlip = lip nxt = lidx(i) if (nxt <= nrow) then idx(i) = -1 cycle endif call psb_hash_searchinskey(ip,tlip,nxt,idxmap%hash,info) + lip = tlip if (info >=0) then if (nxt == tlip) then ncol = max(ncol,nxt) @@ -1747,6 +1642,9 @@ contains ! for a width of psb_hash_bits ! if (present(mask)) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(n,hashv,hashmask,x,glb_lc,nrm,mask) & + !$omp private(i,key,idx,ih,nh,tmp,lb,ub,lm) do i=1, n if (mask(i)) then key = x(i) @@ -1784,7 +1682,11 @@ contains end if end if end do + !$omp end parallel do else + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(n,hashv,hashmask,x,glb_lc,nrm) & + !$omp private(i,key,idx,ih,nh,tmp,lb,ub,lm) do i=1, n key = x(i) ih = iand(key,hashmask) @@ -1820,6 +1722,7 @@ contains x(i) = tmp end if end do + !$omp end parallel do end if end subroutine hash_inner_cnv1 @@ -1842,6 +1745,9 @@ contains ! for a width of psb_hash_bits ! if (present(mask)) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(n,hashv,hashmask,x,y,glb_lc,nrm,mask,psb_err_unit) & + !$omp private(i,key,idx,ih,nh,tmp,lb,ub,lm) do i=1, n if (mask(i)) then key = x(i) @@ -1882,9 +1788,12 @@ contains end if end if end do - + !$omp end parallel do else + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(n,hashv,hashmask,x,y,glb_lc,nrm,psb_err_unit) & + !$omp private(i,key,idx,ih,nh,tmp,lb,ub,lm) do i=1, n key = x(i) ih = iand(key,hashmask) @@ -1923,6 +1832,7 @@ contains y(i) = tmp end if end do + !$omp end parallel do end if end subroutine hash_inner_cnv2 diff --git a/base/modules/desc/psb_hash_mod.F90 b/base/modules/desc/psb_hash_mod.F90 index eb5556a2..18b1142d 100644 --- a/base/modules/desc/psb_hash_mod.F90 +++ b/base/modules/desc/psb_hash_mod.F90 @@ -383,12 +383,12 @@ contains integer(psb_lpk_), intent(out) :: val integer(psb_ipk_), intent(out) :: info - integer(psb_ipk_) :: hsize,hmask, hk, hd + integer(psb_ipk_) :: hsize,hmask, hk, hd, i + logical :: redo info = HashOK hsize = hash%hsize hmask = hash%hmask - val = -1 hk = iand(psb_hashval(key),hmask) if (hk == 0) then hd = 1 @@ -400,56 +400,57 @@ contains info = HashOutOfMemory return end if - + val = -1 + !$omp atomic hash%nsrch = hash%nsrch + 1 + !$omp end atomic do + !$omp atomic hash%nacc = hash%nacc + 1 + !$omp end atomic if (hash%table(hk,1) == key) then val = hash%table(hk,2) info = HashDuplicate - !write(0,*) 'In searchinskey 1 : ', info, HashDuplicate return end if + redo = .false. !$omp critical(hashsearchins) - if (hash%table(hk,1) == key) then - val = hash%table(hk,2) - info = HashDuplicate - else - if (hash%table(hk,1) == HashFreeEntry) then - if (hash%nk == hash%hsize -1) then - ! - ! Note: because of the way we allocate things at CDALL - ! time this is really unlikely; if we get here, we - ! have at least as many halo indices as internals, which - ! means we're already in trouble. But we try to keep going. - ! - call psb_hash_realloc(hash,info) - if (info /= HashOk) then - info = HashOutOfMemory - !return - else - call psb_hash_searchinskey(key,val,nextval,hash,info) - !return - end if + if (hash%table(hk,1) == HashFreeEntry) then + if (hash%nk == hash%hsize -1) then + ! + ! Note: because of the way we allocate things at CDALL + ! time this is really unlikely; if we get here, we + ! have at least as many halo indices as internals, which + ! means we're already in trouble. But we try to keep going. + ! + call psb_hash_realloc(hash,info) + if (info /= HashOk) then + info = HashOutOfMemory else - hash%nk = hash%nk + 1 - hash%table(hk,1) = key - hash%table(hk,2) = nextval - val = nextval - !return + redo = .true. end if + else + hash%nk = hash%nk + 1 + hash%table(hk,1) = key + hash%table(hk,2) = nextval + val = nextval + info = HashOk end if + else if (hash%table(hk,1) == key) then + val = hash%table(hk,2) + info = HashDuplicate + else + info = HashNotFound end if !$omp end critical(hashsearchins) - if (info /= HashOk) then - write(0,*) 'In searchinskey 2: ', info + if (redo) then + call psb_hash_searchinskey(key,val,nextval,hash,info) return end if - if (val > 0) return + if (val > 0) exit hk = hk - hd if (hk < 0) hk = hk + hsize end do - !write(0,*) 'In searchinskey 3: ', info end subroutine psb_hash_lsearchinskey recursive subroutine psb_hash_isearchinskey(key,val,nextval,hash,info) @@ -459,10 +460,11 @@ contains integer(psb_ipk_) :: hsize,hmask, hk, hd logical :: redo + info = HashOK hsize = hash%hsize hmask = hash%hmask - + hk = iand(psb_hashval(key),hmask) if (hk == 0) then hd = 1 @@ -475,17 +477,22 @@ contains return end if val = -1 + val = -1 + !$omp atomic hash%nsrch = hash%nsrch + 1 + !$omp end atomic do + !$omp atomic hash%nacc = hash%nacc + 1 + !$omp end atomic if (hash%table(hk,1) == key) then val = hash%table(hk,2) info = HashDuplicate return end if redo = .false. - !$OMP CRITICAL - if (hash%table(hk,1) == HashFreeEntry) then + !$omp critical(hashsearchins) + if (hash%table(hk,1) == HashFreeEntry) then if (hash%nk == hash%hsize -1) then ! ! Note: because of the way we allocate things at CDALL @@ -496,24 +503,28 @@ contains call psb_hash_realloc(hash,info) if (info /= HashOk) then info = HashOutOfMemory - !return else redo = .true. -!!$ call psb_hash_searchinskey(key,val,nextval,hash,info) -!!$ return end if else hash%nk = hash%nk + 1 hash%table(hk,1) = key hash%table(hk,2) = nextval val = nextval - !return + info = HashOk end if + else if (hash%table(hk,1) == key) then + val = hash%table(hk,2) + info = HashDuplicate + else + info = HashNotFound end if - !$OMP END CRITICAL - if (redo) call psb_hash_searchinskey(key,val,nextval,hash,info) - if (info /= HashOk) return - if (val > 0) return + !$omp end critical(hashsearchins) + if (redo) then + call psb_hash_searchinskey(key,val,nextval,hash,info) + return + end if + if (val > 0) exit hk = hk - hd if (hk < 0) hk = hk + hsize end do @@ -551,7 +562,7 @@ contains end if if (hash%table(hk,1) == HashFreeEntry) then val = HashFreeEntry -! !$ info = HashNotFound + info = HashNotFound return end if hk = hk - hd @@ -591,7 +602,7 @@ contains end if if (hash%table(hk,1) == HashFreeEntry) then val = HashFreeEntry -! !$ info = HashNotFound + info = HashNotFound return end if hk = hk - hd diff --git a/base/modules/desc/psb_indx_map_mod.f90 b/base/modules/desc/psb_indx_map_mod.f90 index 7753db23..422be5f3 100644 --- a/base/modules/desc/psb_indx_map_mod.f90 +++ b/base/modules/desc/psb_indx_map_mod.f90 @@ -158,6 +158,7 @@ module psb_indx_map_mod procedure, pass(idxmap) :: set_lr => base_set_lr procedure, pass(idxmap) :: set_lc => base_set_lc + procedure, pass(idxmap) :: inc_lc => base_inc_lc procedure, pass(idxmap) :: set_p_adjcncy => base_set_p_adjcncy procedure, pass(idxmap) :: xtnd_p_adjcncy => base_xtnd_p_adjcncy @@ -235,7 +236,7 @@ module psb_indx_map_mod & base_get_gr, base_get_gc, base_get_lr, base_get_lc, base_get_ctxt,& & base_get_mpic, base_sizeof, base_set_null, & & base_set_grl, base_set_gcl, & - & base_set_lr, base_set_lc, base_set_ctxt,& + & base_set_lr, base_set_lc, base_inc_lc, base_set_ctxt,& & base_set_mpic, base_get_fmt, base_asb, base_free,& & base_l2gs1, base_l2gs2, base_l2gv1, base_l2gv2,& & base_g2ls1, base_g2ls2, base_g2lv1, base_g2lv2,& @@ -573,6 +574,14 @@ contains idxmap%local_cols = val end subroutine base_set_lc + subroutine base_inc_lc(idxmap) + implicit none + class(psb_indx_map), intent(inout) :: idxmap + !$omp atomic + idxmap%local_cols = idxmap%local_cols + 1 + !$omp end atomic + end subroutine base_inc_lc + subroutine base_set_p_adjcncy(idxmap,val) use psb_realloc_mod use psb_sort_mod diff --git a/base/modules/desc/psb_list_map_mod.F90 b/base/modules/desc/psb_list_map_mod.F90 index 6b61cf52..913145da 100644 --- a/base/modules/desc/psb_list_map_mod.F90 +++ b/base/modules/desc/psb_list_map_mod.F90 @@ -178,7 +178,10 @@ contains end if if (present(mask)) then - + + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,idx,idxmap,owned_) & + !$omp private(i) do i=1, size(idx) if (mask(i)) then if ((1<=idx(i)).and.(idx(i) <= idxmap%get_lr())) then @@ -191,9 +194,12 @@ contains end if end if end do - + !$omp end parallel do else if (.not.present(mask)) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(idx,idxmap,owned_) & + !$omp private(i) do i=1, size(idx) if ((1<=idx(i)).and.(idx(i) <= idxmap%get_lr())) then idx(i) = idxmap%loc_to_glob(idx(i)) @@ -204,7 +210,8 @@ contains idx(i) = -1 end if end do - + !$omp end parallel do + end if end subroutine list_ll2gv1 @@ -298,6 +305,9 @@ contains if (present(mask)) then if (idxmap%is_valid()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,is,idx,idxmap,owned_) & + !$omp private(i,ix) do i=1,is if (mask(i)) then if ((1 <= idx(i)).and.(idx(i) <= idxmap%global_rows)) then @@ -309,6 +319,7 @@ contains end if end if end do + !$omp end parallel do else idx(1:is) = -1 info = -1 @@ -317,6 +328,9 @@ contains else if (.not.present(mask)) then if (idxmap%is_valid()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(is,idx,idxmap,owned_) & + !$omp private(i,ix) do i=1, is if ((1 <= idx(i)).and.(idx(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idx(i)) @@ -326,6 +340,7 @@ contains idx(i) = -1 end if end do + !$omp end parallel do else idx(1:is) = -1 info = -1 @@ -365,6 +380,9 @@ contains if (present(mask)) then if (idxmap%is_valid()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,is,idxin,idxout,idxmap,owned_) & + !$omp private(i,ix) do i=1,is if (mask(i)) then if ((1 <= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then @@ -376,6 +394,7 @@ contains end if end if end do + !$omp end parallel do else idxout(1:is) = -1 info = -1 @@ -384,6 +403,9 @@ contains else if (.not.present(mask)) then if (idxmap%is_valid()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(is,idxin,idxout,idxmap,owned_) & + !$omp private(i,ix) do i=1, is if ((1 <= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then ix = idxmap%glob_to_loc(idxin(i)) @@ -393,6 +415,7 @@ contains idxout(i) = -1 end if end do + !$omp end parallel do else idxout(1:is) = -1 info = -1 @@ -541,6 +564,10 @@ contains else if (.not.present(mask)) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,is,idx,idxmap,laddsz,lidx) & + !$omp private(i,ix,info) + ! $ o m p reduction(.AND.:isLoopValid) do i=1, is if (info /= 0) cycle if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then @@ -579,8 +606,8 @@ contains idx(i) = -1 end if end do + !$omp end parallel do end if - else if (.not.present(lidx)) then if (present(mask)) then diff --git a/base/modules/desc/psb_repl_map_mod.F90 b/base/modules/desc/psb_repl_map_mod.F90 index fe51b7b1..738d6de2 100644 --- a/base/modules/desc/psb_repl_map_mod.F90 +++ b/base/modules/desc/psb_repl_map_mod.F90 @@ -169,7 +169,9 @@ contains end if if (present(mask)) then - + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,idx,idxmap) & + !$omp private(i) do i=1, size(idx) if (mask(i)) then if ((1<=idx(i)).and.(idx(i) <= idxmap%local_rows)) then @@ -179,9 +181,11 @@ contains end if end if end do - + !$omp end parallel do else if (.not.present(mask)) then - + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(idx,idxmap) & + !$omp private(i) do i=1, size(idx) if ((1<=idx(i)).and.(idx(i) <= idxmap%local_rows)) then ! do nothing @@ -189,7 +193,7 @@ contains idx(i) = -1 end if end do - + !$omp end parallel do end if end subroutine repl_l2gv1 @@ -223,7 +227,9 @@ contains end if if (present(mask)) then - + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,idxin,idxout,idxmap) & + !$omp private(i) do i=1, im if (mask(i)) then if ((1<=idxin(i)).and.(idxin(i) <= idxmap%local_rows)) then @@ -233,9 +239,11 @@ contains end if end if end do - + !$omp end parallel do else if (.not.present(mask)) then - + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(idxin,idxout,idxmap) & + !$omp private(i) do i=1, im if ((1<=idxin(i)).and.(idxin(i) <= idxmap%local_rows)) then idxout(i) = idxin(i) @@ -243,7 +251,7 @@ contains idxout(i) = -1 end if end do - + !$omp end parallel do end if if (is > im) info = -3 @@ -324,6 +332,9 @@ contains if (present(mask)) then if (idxmap%is_asb()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,idx,idxmap) & + !$omp private(i) do i=1, is if (mask(i)) then if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then @@ -333,7 +344,11 @@ contains end if end if end do + !$omp end parallel do else if (idxmap%is_valid()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,idx,idxmap) & + !$omp private(i) do i=1,is if (mask(i)) then if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then @@ -344,6 +359,7 @@ contains end if end if end do + !$omp end parallel do else idx(1:is) = -1 info = -1 @@ -352,6 +368,9 @@ contains else if (.not.present(mask)) then if (idxmap%is_asb()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(idx,idxmap) & + !$omp private(i) do i=1, is if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then ! do nothing @@ -359,7 +378,11 @@ contains idx(i) = -1 end if end do + !$omp end parallel do else if (idxmap%is_valid()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(idx,idxmap) & + !$omp private(i) do i=1,is if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then ! do nothing @@ -367,6 +390,7 @@ contains idx(i) = -1 end if end do + !$omp end parallel do else idx(1:is) = -1 info = -1 @@ -409,6 +433,9 @@ contains if (present(mask)) then if (idxmap%is_asb()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,idxin,idxout,idxmap) & + !$omp private(i) do i=1, im if (mask(i)) then if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then @@ -418,7 +445,11 @@ contains end if end if end do + !$omp end parallel do else if (idxmap%is_valid()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,idxin,idxout,idxmap) & + !$omp private(i) do i=1,im if (mask(i)) then if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then @@ -428,6 +459,7 @@ contains end if end if end do + !$omp end parallel do else idxout(1:im) = -1 info = -1 @@ -436,6 +468,9 @@ contains else if (.not.present(mask)) then if (idxmap%is_asb()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(idxin,idxout,idxmap) & + !$omp private(i) do i=1, im if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then idxout(i) = idxin(i) @@ -443,7 +478,11 @@ contains idxout(i) = -1 end if end do + !$omp end parallel do else if (idxmap%is_valid()) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(idxin,idxout,idxmap) & + !$omp private(i) do i=1,im if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then idxout(i) = idxin(i) @@ -451,6 +490,7 @@ contains idxout(i) = -1 end if end do + !$omp end parallel do else idxout(1:im) = -1 info = -1 @@ -557,6 +597,9 @@ contains else if (idxmap%is_valid()) then if (present(lidx)) then if (present(mask)) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,idx,lidx,is,idxmap) & + !$omp private(i) do i=1, is if (mask(i)) then if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then @@ -566,9 +609,11 @@ contains end if end if end do - + !$omp end parallel do else if (.not.present(mask)) then - + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,idx,lidx,is,idxmap) & + !$omp private(i) do i=1, is if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then ! do nothing @@ -576,9 +621,13 @@ contains idx(i) = -1 end if end do + !$omp end parallel do end if else if (.not.present(lidx)) then if (present(mask)) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,idx,is,idxmap) & + !$omp private(i) do i=1, is if (mask(i)) then if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then @@ -588,8 +637,11 @@ contains end if end if end do - + !$omp end parallel do else if (.not.present(mask)) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(idx,is,idxmap) & + !$omp private(i) do i=1, is if ((1<= idx(i)).and.(idx(i) <= idxmap%global_rows)) then ! do nothing @@ -597,6 +649,7 @@ contains idx(i) = -1 end if end do + !$omp end parallel do end if end if else @@ -644,6 +697,9 @@ contains else if (idxmap%is_valid()) then if (present(lidx)) then if (present(mask)) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,idxin,idxout,im,idxmap) & + !$omp private(i) do i=1, im if (mask(i)) then if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then @@ -653,9 +709,11 @@ contains end if end if end do - + !$omp end parallel do else if (.not.present(mask)) then - + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(idxin,idxout,im,idxmap) & + !$omp private(i) do i=1, im if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then idxout(i) = idxin(i) @@ -663,9 +721,13 @@ contains idxout(i) = -1 end if end do + !$omp end parallel do end if else if (.not.present(lidx)) then if (present(mask)) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(mask,idxin,idxout,im,idxmap) & + !$omp private(i) do i=1, im if (mask(i)) then if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then @@ -675,8 +737,11 @@ contains end if end if end do - + !$omp end parallel do else if (.not.present(mask)) then + !$omp parallel do default(none) schedule(dynamic) & + !$omp shared(idxin,idxout,im,idxmap) & + !$omp private(i) do i=1, im if ((1<= idxin(i)).and.(idxin(i) <= idxmap%global_rows)) then idxout(i) = idxin(i) @@ -684,6 +749,7 @@ contains idxout(i) = -1 end if end do + !$omp end parallel do end if end if else diff --git a/base/serial/impl/psb_c_base_mat_impl.F90 b/base/serial/impl/psb_c_base_mat_impl.F90 index 1137bbdc..70dabe1e 100644 --- a/base/serial/impl/psb_c_base_mat_impl.F90 +++ b/base/serial/impl/psb_c_base_mat_impl.F90 @@ -2051,8 +2051,8 @@ subroutine psb_c_base_vect_mv(alpha,a,x,beta,y,info,trans) ! For the time being we just throw everything back ! onto the normal routines. - call x%sync() - call y%sync() + if (x%is_dev()) call x%sync() + if (y%is_dev()) call y%sync() call a%spmm(alpha,x%v,beta,y%v,info,trans) call y%set_host() end subroutine psb_c_base_vect_mv @@ -2105,8 +2105,6 @@ subroutine psb_c_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d) goto 9999 end if - call x%sync() - call y%sync() if (present(d)) then call d%sync() if (present(scale)) then @@ -2206,8 +2204,11 @@ subroutine psb_c_base_inner_vect_sv(alpha,a,x,beta,y,info,trans) info = psb_success_ call psb_erractionsave(err_act) + if (x%is_dev()) call x%sync() + if (y%is_dev()) call y%sync() call a%inner_spsm(alpha,x%v,beta,y%v,info,trans) + call y%set_host() if (info /= psb_success_) then info = psb_err_from_subroutine_ diff --git a/base/serial/impl/psb_c_csc_impl.F90 b/base/serial/impl/psb_c_csc_impl.F90 index ad59fcb1..5cb8f87d 100644 --- a/base/serial/impl/psb_c_csc_impl.F90 +++ b/base/serial/impl/psb_c_csc_impl.F90 @@ -2171,7 +2171,7 @@ subroutine psb_c_mv_csc_to_coo(a,b,info) nr = a%get_nrows() nc = a%get_ncols() - nza = a%get_nzeros() + nza = max(a%get_nzeros(),ione) b%psb_c_base_sparse_mat = a%psb_c_base_sparse_mat call b%set_nzeros(a%get_nzeros()) @@ -2336,7 +2336,7 @@ subroutine psb_c_cp_csc_to_fmt(a,b,info) if (a%is_dev()) call a%sync() b%psb_c_base_sparse_mat = a%psb_c_base_sparse_mat nc = a%get_ncols() - nz = a%get_nzeros() + nz = max(a%get_nzeros(),ione) if (.false.) then if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) @@ -2469,7 +2469,7 @@ subroutine psb_c_cp_csc_from_fmt(a,b,info) if (b%is_dev()) call b%sync() a%psb_c_base_sparse_mat = b%psb_c_base_sparse_mat nc = b%get_ncols() - nz = b%get_nzeros() + nz = max(b%get_nzeros(),ione) if (.false.) then if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) @@ -4066,7 +4066,7 @@ subroutine psb_lc_mv_csc_to_coo(a,b,info) nr = a%get_nrows() nc = a%get_ncols() - nza = a%get_nzeros() + nza = max(a%get_nzeros(),ione) b%psb_lc_base_sparse_mat = a%psb_lc_base_sparse_mat call b%set_nzeros(a%get_nzeros()) diff --git a/base/serial/impl/psb_c_csr_impl.F90 b/base/serial/impl/psb_c_csr_impl.F90 index 2f766c93..dccaaf36 100644 --- a/base/serial/impl/psb_c_csr_impl.F90 +++ b/base/serial/impl/psb_c_csr_impl.F90 @@ -3819,6 +3819,7 @@ contains integer(psb_ipk_) :: ma, nb integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + integer(psb_ipk_) :: nth, lth,ith ma = a%get_nrows() nb = b%get_ncols() @@ -3829,12 +3830,19 @@ contains ! dense accumulator ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf call psb_realloc(nb, acc, info) + !$omp parallel shared(nth,lth) + !$omp single + nth = omp_get_num_threads() + lth = min(nth, ma) + !$omp end single + !$omp end parallel allocate(offsets(omp_get_max_threads())) !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & - !$omp shared(a,b,c,offsets) + !$omp num_threads(lth) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma if (start_idx == 0) then @@ -3890,15 +3898,14 @@ contains !$omp end single !$omp barrier - - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -3906,9 +3913,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson @@ -3944,6 +3952,7 @@ contains !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma do jj = a%irp(irw), a%irp(irw + 1) - 1 @@ -4010,14 +4019,14 @@ contains !$omp barrier - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -4025,9 +4034,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson_1d @@ -6296,7 +6306,7 @@ subroutine psb_lc_mv_csr_to_coo(a,b,info) if (a%is_dev()) call a%sync() nr = a%get_nrows() nc = a%get_ncols() - nza = a%get_nzeros() + nza = max(a%get_nzeros(),ione) b%psb_lc_base_sparse_mat = a%psb_lc_base_sparse_mat call b%set_nzeros(a%get_nzeros()) diff --git a/base/serial/impl/psb_d_base_mat_impl.F90 b/base/serial/impl/psb_d_base_mat_impl.F90 index 09732d5b..8fae21a9 100644 --- a/base/serial/impl/psb_d_base_mat_impl.F90 +++ b/base/serial/impl/psb_d_base_mat_impl.F90 @@ -2051,8 +2051,8 @@ subroutine psb_d_base_vect_mv(alpha,a,x,beta,y,info,trans) ! For the time being we just throw everything back ! onto the normal routines. - call x%sync() - call y%sync() + if (x%is_dev()) call x%sync() + if (y%is_dev()) call y%sync() call a%spmm(alpha,x%v,beta,y%v,info,trans) call y%set_host() end subroutine psb_d_base_vect_mv @@ -2105,8 +2105,6 @@ subroutine psb_d_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d) goto 9999 end if - call x%sync() - call y%sync() if (present(d)) then call d%sync() if (present(scale)) then @@ -2206,8 +2204,11 @@ subroutine psb_d_base_inner_vect_sv(alpha,a,x,beta,y,info,trans) info = psb_success_ call psb_erractionsave(err_act) + if (x%is_dev()) call x%sync() + if (y%is_dev()) call y%sync() call a%inner_spsm(alpha,x%v,beta,y%v,info,trans) + call y%set_host() if (info /= psb_success_) then info = psb_err_from_subroutine_ diff --git a/base/serial/impl/psb_d_csc_impl.F90 b/base/serial/impl/psb_d_csc_impl.F90 index 32298d8f..f135bf03 100644 --- a/base/serial/impl/psb_d_csc_impl.F90 +++ b/base/serial/impl/psb_d_csc_impl.F90 @@ -2171,7 +2171,7 @@ subroutine psb_d_mv_csc_to_coo(a,b,info) nr = a%get_nrows() nc = a%get_ncols() - nza = a%get_nzeros() + nza = max(a%get_nzeros(),ione) b%psb_d_base_sparse_mat = a%psb_d_base_sparse_mat call b%set_nzeros(a%get_nzeros()) @@ -2336,7 +2336,7 @@ subroutine psb_d_cp_csc_to_fmt(a,b,info) if (a%is_dev()) call a%sync() b%psb_d_base_sparse_mat = a%psb_d_base_sparse_mat nc = a%get_ncols() - nz = a%get_nzeros() + nz = max(a%get_nzeros(),ione) if (.false.) then if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) @@ -2469,7 +2469,7 @@ subroutine psb_d_cp_csc_from_fmt(a,b,info) if (b%is_dev()) call b%sync() a%psb_d_base_sparse_mat = b%psb_d_base_sparse_mat nc = b%get_ncols() - nz = b%get_nzeros() + nz = max(b%get_nzeros(),ione) if (.false.) then if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) @@ -4066,7 +4066,7 @@ subroutine psb_ld_mv_csc_to_coo(a,b,info) nr = a%get_nrows() nc = a%get_ncols() - nza = a%get_nzeros() + nza = max(a%get_nzeros(),ione) b%psb_ld_base_sparse_mat = a%psb_ld_base_sparse_mat call b%set_nzeros(a%get_nzeros()) diff --git a/base/serial/impl/psb_d_csr_impl.F90 b/base/serial/impl/psb_d_csr_impl.F90 index b6c9da91..46d09c5f 100644 --- a/base/serial/impl/psb_d_csr_impl.F90 +++ b/base/serial/impl/psb_d_csr_impl.F90 @@ -3819,6 +3819,7 @@ contains integer(psb_ipk_) :: ma, nb integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + integer(psb_ipk_) :: nth, lth,ith ma = a%get_nrows() nb = b%get_ncols() @@ -3829,12 +3830,19 @@ contains ! dense accumulator ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf call psb_realloc(nb, acc, info) + !$omp parallel shared(nth,lth) + !$omp single + nth = omp_get_num_threads() + lth = min(nth, ma) + !$omp end single + !$omp end parallel allocate(offsets(omp_get_max_threads())) !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & - !$omp shared(a,b,c,offsets) + !$omp num_threads(lth) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma if (start_idx == 0) then @@ -3890,15 +3898,14 @@ contains !$omp end single !$omp barrier - - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -3906,9 +3913,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson @@ -3944,6 +3952,7 @@ contains !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma do jj = a%irp(irw), a%irp(irw + 1) - 1 @@ -4010,14 +4019,14 @@ contains !$omp barrier - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -4025,9 +4034,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson_1d @@ -6296,7 +6306,7 @@ subroutine psb_ld_mv_csr_to_coo(a,b,info) if (a%is_dev()) call a%sync() nr = a%get_nrows() nc = a%get_ncols() - nza = a%get_nzeros() + nza = max(a%get_nzeros(),ione) b%psb_ld_base_sparse_mat = a%psb_ld_base_sparse_mat call b%set_nzeros(a%get_nzeros()) diff --git a/base/serial/impl/psb_s_base_mat_impl.F90 b/base/serial/impl/psb_s_base_mat_impl.F90 index bcad1fc6..a1b87123 100644 --- a/base/serial/impl/psb_s_base_mat_impl.F90 +++ b/base/serial/impl/psb_s_base_mat_impl.F90 @@ -2051,8 +2051,8 @@ subroutine psb_s_base_vect_mv(alpha,a,x,beta,y,info,trans) ! For the time being we just throw everything back ! onto the normal routines. - call x%sync() - call y%sync() + if (x%is_dev()) call x%sync() + if (y%is_dev()) call y%sync() call a%spmm(alpha,x%v,beta,y%v,info,trans) call y%set_host() end subroutine psb_s_base_vect_mv @@ -2105,8 +2105,6 @@ subroutine psb_s_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d) goto 9999 end if - call x%sync() - call y%sync() if (present(d)) then call d%sync() if (present(scale)) then @@ -2206,8 +2204,11 @@ subroutine psb_s_base_inner_vect_sv(alpha,a,x,beta,y,info,trans) info = psb_success_ call psb_erractionsave(err_act) + if (x%is_dev()) call x%sync() + if (y%is_dev()) call y%sync() call a%inner_spsm(alpha,x%v,beta,y%v,info,trans) + call y%set_host() if (info /= psb_success_) then info = psb_err_from_subroutine_ diff --git a/base/serial/impl/psb_s_csc_impl.F90 b/base/serial/impl/psb_s_csc_impl.F90 index 41c3e3db..9a278bc0 100644 --- a/base/serial/impl/psb_s_csc_impl.F90 +++ b/base/serial/impl/psb_s_csc_impl.F90 @@ -2171,7 +2171,7 @@ subroutine psb_s_mv_csc_to_coo(a,b,info) nr = a%get_nrows() nc = a%get_ncols() - nza = a%get_nzeros() + nza = max(a%get_nzeros(),ione) b%psb_s_base_sparse_mat = a%psb_s_base_sparse_mat call b%set_nzeros(a%get_nzeros()) @@ -2336,7 +2336,7 @@ subroutine psb_s_cp_csc_to_fmt(a,b,info) if (a%is_dev()) call a%sync() b%psb_s_base_sparse_mat = a%psb_s_base_sparse_mat nc = a%get_ncols() - nz = a%get_nzeros() + nz = max(a%get_nzeros(),ione) if (.false.) then if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) @@ -2469,7 +2469,7 @@ subroutine psb_s_cp_csc_from_fmt(a,b,info) if (b%is_dev()) call b%sync() a%psb_s_base_sparse_mat = b%psb_s_base_sparse_mat nc = b%get_ncols() - nz = b%get_nzeros() + nz = max(b%get_nzeros(),ione) if (.false.) then if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) @@ -4066,7 +4066,7 @@ subroutine psb_ls_mv_csc_to_coo(a,b,info) nr = a%get_nrows() nc = a%get_ncols() - nza = a%get_nzeros() + nza = max(a%get_nzeros(),ione) b%psb_ls_base_sparse_mat = a%psb_ls_base_sparse_mat call b%set_nzeros(a%get_nzeros()) diff --git a/base/serial/impl/psb_s_csr_impl.F90 b/base/serial/impl/psb_s_csr_impl.F90 index 393c188c..6f8e5afd 100644 --- a/base/serial/impl/psb_s_csr_impl.F90 +++ b/base/serial/impl/psb_s_csr_impl.F90 @@ -3819,6 +3819,7 @@ contains integer(psb_ipk_) :: ma, nb integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + integer(psb_ipk_) :: nth, lth,ith ma = a%get_nrows() nb = b%get_ncols() @@ -3829,12 +3830,19 @@ contains ! dense accumulator ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf call psb_realloc(nb, acc, info) + !$omp parallel shared(nth,lth) + !$omp single + nth = omp_get_num_threads() + lth = min(nth, ma) + !$omp end single + !$omp end parallel allocate(offsets(omp_get_max_threads())) !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & - !$omp shared(a,b,c,offsets) + !$omp num_threads(lth) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma if (start_idx == 0) then @@ -3890,15 +3898,14 @@ contains !$omp end single !$omp barrier - - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -3906,9 +3913,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson @@ -3944,6 +3952,7 @@ contains !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma do jj = a%irp(irw), a%irp(irw + 1) - 1 @@ -4010,14 +4019,14 @@ contains !$omp barrier - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -4025,9 +4034,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson_1d @@ -6296,7 +6306,7 @@ subroutine psb_ls_mv_csr_to_coo(a,b,info) if (a%is_dev()) call a%sync() nr = a%get_nrows() nc = a%get_ncols() - nza = a%get_nzeros() + nza = max(a%get_nzeros(),ione) b%psb_ls_base_sparse_mat = a%psb_ls_base_sparse_mat call b%set_nzeros(a%get_nzeros()) diff --git a/base/serial/impl/psb_z_base_mat_impl.F90 b/base/serial/impl/psb_z_base_mat_impl.F90 index fc01a605..51ab1bb3 100644 --- a/base/serial/impl/psb_z_base_mat_impl.F90 +++ b/base/serial/impl/psb_z_base_mat_impl.F90 @@ -2051,8 +2051,8 @@ subroutine psb_z_base_vect_mv(alpha,a,x,beta,y,info,trans) ! For the time being we just throw everything back ! onto the normal routines. - call x%sync() - call y%sync() + if (x%is_dev()) call x%sync() + if (y%is_dev()) call y%sync() call a%spmm(alpha,x%v,beta,y%v,info,trans) call y%set_host() end subroutine psb_z_base_vect_mv @@ -2105,8 +2105,6 @@ subroutine psb_z_base_vect_cssv(alpha,a,x,beta,y,info,trans,scale,d) goto 9999 end if - call x%sync() - call y%sync() if (present(d)) then call d%sync() if (present(scale)) then @@ -2206,8 +2204,11 @@ subroutine psb_z_base_inner_vect_sv(alpha,a,x,beta,y,info,trans) info = psb_success_ call psb_erractionsave(err_act) + if (x%is_dev()) call x%sync() + if (y%is_dev()) call y%sync() call a%inner_spsm(alpha,x%v,beta,y%v,info,trans) + call y%set_host() if (info /= psb_success_) then info = psb_err_from_subroutine_ diff --git a/base/serial/impl/psb_z_csc_impl.F90 b/base/serial/impl/psb_z_csc_impl.F90 index 56681a33..71739164 100644 --- a/base/serial/impl/psb_z_csc_impl.F90 +++ b/base/serial/impl/psb_z_csc_impl.F90 @@ -2171,7 +2171,7 @@ subroutine psb_z_mv_csc_to_coo(a,b,info) nr = a%get_nrows() nc = a%get_ncols() - nza = a%get_nzeros() + nza = max(a%get_nzeros(),ione) b%psb_z_base_sparse_mat = a%psb_z_base_sparse_mat call b%set_nzeros(a%get_nzeros()) @@ -2336,7 +2336,7 @@ subroutine psb_z_cp_csc_to_fmt(a,b,info) if (a%is_dev()) call a%sync() b%psb_z_base_sparse_mat = a%psb_z_base_sparse_mat nc = a%get_ncols() - nz = a%get_nzeros() + nz = max(a%get_nzeros(),ione) if (.false.) then if (info == 0) call psb_safe_cpy( a%icp(1:nc+1), b%icp , info) if (info == 0) call psb_safe_cpy( a%ia(1:nz), b%ia , info) @@ -2469,7 +2469,7 @@ subroutine psb_z_cp_csc_from_fmt(a,b,info) if (b%is_dev()) call b%sync() a%psb_z_base_sparse_mat = b%psb_z_base_sparse_mat nc = b%get_ncols() - nz = b%get_nzeros() + nz = max(b%get_nzeros(),ione) if (.false.) then if (info == 0) call psb_safe_cpy( b%icp(1:nc+1), a%icp , info) if (info == 0) call psb_safe_cpy( b%ia(1:nz), a%ia , info) @@ -4066,7 +4066,7 @@ subroutine psb_lz_mv_csc_to_coo(a,b,info) nr = a%get_nrows() nc = a%get_ncols() - nza = a%get_nzeros() + nza = max(a%get_nzeros(),ione) b%psb_lz_base_sparse_mat = a%psb_lz_base_sparse_mat call b%set_nzeros(a%get_nzeros()) diff --git a/base/serial/impl/psb_z_csr_impl.F90 b/base/serial/impl/psb_z_csr_impl.F90 index 4a13a163..76ca427f 100644 --- a/base/serial/impl/psb_z_csr_impl.F90 +++ b/base/serial/impl/psb_z_csr_impl.F90 @@ -3819,6 +3819,7 @@ contains integer(psb_ipk_) :: ma, nb integer(psb_ipk_), allocatable :: col_inds(:), offsets(:) integer(psb_ipk_) :: irw, jj, j, k, nnz, rwnz, thread_upperbound, start_idx, end_idx + integer(psb_ipk_) :: nth, lth,ith ma = a%get_nrows() nb = b%get_ncols() @@ -3829,12 +3830,19 @@ contains ! dense accumulator ! https://sc18.supercomputing.org/proceedings/workshops/workshop_files/ws_lasalss115s2-file1.pdf call psb_realloc(nb, acc, info) + !$omp parallel shared(nth,lth) + !$omp single + nth = omp_get_num_threads() + lth = min(nth, ma) + !$omp end single + !$omp end parallel allocate(offsets(omp_get_max_threads())) !$omp parallel private(vals,col_inds,nnz,rwnz,thread_upperbound,acc,start_idx,end_idx) & - !$omp shared(a,b,c,offsets) + !$omp num_threads(lth) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma if (start_idx == 0) then @@ -3890,15 +3898,14 @@ contains !$omp end single !$omp barrier - - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -3906,9 +3913,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson @@ -3944,6 +3952,7 @@ contains !$omp parallel private(vals,col_inds,nnz,thread_upperbound,acc,start_idx,end_idx) shared(a,b,c,offsets) thread_upperbound = 0 start_idx = 0 + end_idx = 0 !$omp do schedule(static) private(irw, jj, j) do irw = 1, ma do jj = a%irp(irw), a%irp(irw + 1) - 1 @@ -4010,14 +4019,14 @@ contains !$omp barrier - if (omp_get_thread_num() /= 0) then - c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + if (omp_get_thread_num() /= 0) then + c%irp(start_idx) = offsets(omp_get_thread_num()) + 1 + end if + do irw = start_idx, end_idx - 1 + c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) + end do end if - - do irw = start_idx, end_idx - 1 - c%irp(irw + 1) = c%irp(irw + 1) + c%irp(irw) - end do - !$omp barrier !$omp single @@ -4025,9 +4034,10 @@ contains call psb_realloc(c%irp(ma + 1), c%val, info) call psb_realloc(c%irp(ma + 1), c%ja, info) !$omp end single - - c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) - c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + if ((start_idx /= 0).and.(start_idx <= end_idx) ) then + c%val(c%irp(start_idx):c%irp(end_idx + 1) - 1) = vals(1:nnz) + c%ja(c%irp(start_idx):c%irp(end_idx + 1) - 1) = col_inds(1:nnz) + end if !$omp end parallel end subroutine spmm_omp_gustavson_1d @@ -6296,7 +6306,7 @@ subroutine psb_lz_mv_csr_to_coo(a,b,info) if (a%is_dev()) call a%sync() nr = a%get_nrows() nc = a%get_ncols() - nza = a%get_nzeros() + nza = max(a%get_nzeros(),ione) b%psb_lz_base_sparse_mat = a%psb_lz_base_sparse_mat call b%set_nzeros(a%get_nzeros())