Merge remote-tracking branch 'origin/cuda-multivect' into psblas-bgmres
commit
c08431d71e
@ -0,0 +1,123 @@
|
||||
! Parallel Sparse BLAS GPU plugin
|
||||
! (C) Copyright 2013
|
||||
!
|
||||
! Salvatore Filippone
|
||||
! Alessandro Fanfarillo
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
|
||||
!
! Sparse matrix by dense multi-vector product for the CUDA HDIAG storage
! format, single precision complex data:
!
!     Y = alpha*op(A)*X + beta*Y
!
! Arguments:
!   alpha, beta - scalar coefficients.
!   a           - the matrix; must be in the assembled state.
!   x           - input columns; size(x,1) must be at least the number of
!                 columns of op(A).
!   y           - input/output columns; size(y,1) must be at least the
!                 number of rows of op(A).
!   info        - psb_success_ on success, an error code otherwise.
!   trans       - optional; 'N' (default), 'T' or 'C', case insensitive.
!
! The non-transposed product runs on the device on temporary copies of X
! and Y; the transposed cases are delegated to the parent HDIA type.
!
subroutine psb_c_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)

  use psb_base_mod
  use hdiagdev_mod
  use psb_vectordev_mod
  use psb_c_cuda_hdiag_mat_mod, psb_protect_name => psb_c_cuda_hdiag_csmm
  implicit none
  class(psb_c_cuda_hdiag_sparse_mat), intent(in) :: a
  complex(psb_spk_), intent(in)          :: alpha, beta, x(:,:)
  complex(psb_spk_), intent(inout)       :: y(:,:)
  integer(psb_ipk_), intent(out)         :: info
  character, optional, intent(in)        :: trans

  character          :: trans_
  integer(psb_ipk_)  :: m, n, nxy
  type(c_ptr)        :: gpX, gpY
  ! have_gpx/have_gpy record which device buffers were actually allocated,
  ! so that they can be released on every exit path.
  logical            :: tra, have_gpx, have_gpy
  integer(psb_ipk_)  :: err_act
  character(len=20)  :: name='c_cuda_hdiag_csmm'

  info = psb_success_
  call psb_erractionsave(err_act)

  if (present(trans)) then
    trans_ = trans
  else
    trans_ = 'N'
  end if

  if (.not.a%is_asb()) then
    info = psb_err_invalid_mat_state_
    call psb_errpush(info,name)
    goto 9999
  endif

  tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')

  ! m/n are the row/column counts of op(A).
  if (tra) then
    m = a%get_ncols()
    n = a%get_nrows()
  else
    n = a%get_ncols()
    m = a%get_nrows()
  end if

  if (size(x,1)<n) then
    info = 36
    call psb_errpush(info,name,i_err=(/3*ione,n,izero,izero,izero/))
    goto 9999
  end if

  if (size(y,1)<m) then
    info = 36
    call psb_errpush(info,name,i_err=(/5*ione,m,izero,izero,izero/))
    goto 9999
  end if

  if (tra) then
    ! Transposed product: make sure the host copy is current, then use
    ! the implementation of the parent (single precision complex) type.
    if (a%is_dev()) call a%sync()
    call a%psb_c_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
  else
    !
    ! Just to test, move X/Y to/from the GPU.
    !
    nxy      = min(size(x,2),size(y,2))
    have_gpx = .false.
    have_gpy = .false.

    if (info == 0) then
      info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_float)
      have_gpx = (info == 0)
    end if
    if (info == 0) &
         & info = writeMultiVecDevice(gpX,x,size(x,1))
    if (info == 0) then
      info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_float)
      have_gpy = (info == 0)
    end if
    if (info == 0) &
         & info = writeMultiVecDevice(gpY,y,size(y,1))

    if (info == 0) &
         & info = spmmHdiagDevice(a%deviceMat,alpha,gpX,beta,gpY)
    if (info == 0) &
         & info = readMultiVecDevice(gpY,y,size(y,1))

    ! Release the temporaries before looking at the outcome, so that a
    ! failure in any step above does not leak device memory.
    if (have_gpx) call freeMultiVecDevice(gpX)
    if (have_gpy) call freeMultiVecDevice(gpY)
    if (info /= 0) goto 9999
  endif

  call psb_erractionrestore(err_act)
  return

9999 call psb_error_handler(err_act)

  return

end subroutine psb_c_cuda_hdiag_csmm
|
@ -0,0 +1,123 @@
|
||||
! Parallel Sparse BLAS GPU plugin
|
||||
! (C) Copyright 2013
|
||||
!
|
||||
! Salvatore Filippone
|
||||
! Alessandro Fanfarillo
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
|
||||
!
! Sparse matrix by dense multi-vector product for the CUDA HDIAG storage
! format, double precision real data:
!
!     Y = alpha*op(A)*X + beta*Y
!
! Arguments:
!   alpha, beta - scalar coefficients.
!   a           - the matrix; must be in the assembled state.
!   x           - input columns; size(x,1) must be at least the number of
!                 columns of op(A).
!   y           - input/output columns; size(y,1) must be at least the
!                 number of rows of op(A).
!   info        - psb_success_ on success, an error code otherwise.
!   trans       - optional; 'N' (default), 'T' or 'C', case insensitive.
!
! The non-transposed product runs on the device on temporary copies of X
! and Y; the transposed cases are delegated to the parent HDIA type.
!
subroutine psb_d_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)

  use psb_base_mod
  use hdiagdev_mod
  use psb_vectordev_mod
  use psb_d_cuda_hdiag_mat_mod, psb_protect_name => psb_d_cuda_hdiag_csmm
  implicit none
  class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a
  real(psb_dpk_), intent(in)             :: alpha, beta, x(:,:)
  real(psb_dpk_), intent(inout)          :: y(:,:)
  integer(psb_ipk_), intent(out)         :: info
  character, optional, intent(in)        :: trans

  character          :: trans_
  integer(psb_ipk_)  :: m, n, nxy
  type(c_ptr)        :: gpX, gpY
  ! have_gpx/have_gpy record which device buffers were actually allocated,
  ! so that they can be released on every exit path.
  logical            :: tra, have_gpx, have_gpy
  integer(psb_ipk_)  :: err_act
  character(len=20)  :: name='d_cuda_hdiag_csmm'

  info = psb_success_
  call psb_erractionsave(err_act)

  if (present(trans)) then
    trans_ = trans
  else
    trans_ = 'N'
  end if

  if (.not.a%is_asb()) then
    info = psb_err_invalid_mat_state_
    call psb_errpush(info,name)
    goto 9999
  endif

  tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')

  ! m/n are the row/column counts of op(A).
  if (tra) then
    m = a%get_ncols()
    n = a%get_nrows()
  else
    n = a%get_ncols()
    m = a%get_nrows()
  end if

  if (size(x,1)<n) then
    info = 36
    call psb_errpush(info,name,i_err=(/3*ione,n,izero,izero,izero/))
    goto 9999
  end if

  if (size(y,1)<m) then
    info = 36
    call psb_errpush(info,name,i_err=(/5*ione,m,izero,izero,izero/))
    goto 9999
  end if

  if (tra) then
    ! Transposed product: make sure the host copy is current, then use
    ! the implementation of the parent type.
    if (a%is_dev()) call a%sync()
    call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
  else
    !
    ! Just to test, move X/Y to/from the GPU.
    !
    nxy      = min(size(x,2),size(y,2))
    have_gpx = .false.
    have_gpy = .false.

    if (info == 0) then
      info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_double)
      have_gpx = (info == 0)
    end if
    if (info == 0) &
         & info = writeMultiVecDevice(gpX,x,size(x,1))
    if (info == 0) then
      info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_double)
      have_gpy = (info == 0)
    end if
    if (info == 0) &
         & info = writeMultiVecDevice(gpY,y,size(y,1))

    if (info == 0) &
         & info = spmmHdiagDevice(a%deviceMat,alpha,gpX,beta,gpY)
    if (info == 0) &
         & info = readMultiVecDevice(gpY,y,size(y,1))

    ! Release the temporaries before looking at the outcome, so that a
    ! failure in any step above does not leak device memory.
    if (have_gpx) call freeMultiVecDevice(gpX)
    if (have_gpy) call freeMultiVecDevice(gpY)
    if (info /= 0) goto 9999
  endif

  call psb_erractionrestore(err_act)
  return

9999 call psb_error_handler(err_act)

  return

end subroutine psb_d_cuda_hdiag_csmm
|
@ -0,0 +1,123 @@
|
||||
! Parallel Sparse BLAS GPU plugin
|
||||
! (C) Copyright 2013
|
||||
!
|
||||
! Salvatore Filippone
|
||||
! Alessandro Fanfarillo
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
|
||||
!
! Sparse matrix by dense multi-vector product for the CUDA HDIAG storage
! format, single precision real data:
!
!     Y = alpha*op(A)*X + beta*Y
!
! Arguments:
!   alpha, beta - scalar coefficients.
!   a           - the matrix; must be in the assembled state.
!   x           - input columns; size(x,1) must be at least the number of
!                 columns of op(A).
!   y           - input/output columns; size(y,1) must be at least the
!                 number of rows of op(A).
!   info        - psb_success_ on success, an error code otherwise.
!   trans       - optional; 'N' (default), 'T' or 'C', case insensitive.
!
! The non-transposed product runs on the device on temporary copies of X
! and Y; the transposed cases are delegated to the parent HDIA type.
!
subroutine psb_s_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)

  use psb_base_mod
  use hdiagdev_mod
  use psb_vectordev_mod
  use psb_s_cuda_hdiag_mat_mod, psb_protect_name => psb_s_cuda_hdiag_csmm
  implicit none
  class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a
  real(psb_spk_), intent(in)             :: alpha, beta, x(:,:)
  real(psb_spk_), intent(inout)          :: y(:,:)
  integer(psb_ipk_), intent(out)         :: info
  character, optional, intent(in)        :: trans

  character          :: trans_
  integer(psb_ipk_)  :: m, n, nxy
  type(c_ptr)        :: gpX, gpY
  ! have_gpx/have_gpy record which device buffers were actually allocated,
  ! so that they can be released on every exit path.
  logical            :: tra, have_gpx, have_gpy
  integer(psb_ipk_)  :: err_act
  character(len=20)  :: name='s_cuda_hdiag_csmm'

  info = psb_success_
  call psb_erractionsave(err_act)

  if (present(trans)) then
    trans_ = trans
  else
    trans_ = 'N'
  end if

  if (.not.a%is_asb()) then
    info = psb_err_invalid_mat_state_
    call psb_errpush(info,name)
    goto 9999
  endif

  tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')

  ! m/n are the row/column counts of op(A).
  if (tra) then
    m = a%get_ncols()
    n = a%get_nrows()
  else
    n = a%get_ncols()
    m = a%get_nrows()
  end if

  if (size(x,1)<n) then
    info = 36
    call psb_errpush(info,name,i_err=(/3*ione,n,izero,izero,izero/))
    goto 9999
  end if

  if (size(y,1)<m) then
    info = 36
    call psb_errpush(info,name,i_err=(/5*ione,m,izero,izero,izero/))
    goto 9999
  end if

  if (tra) then
    ! Transposed product: make sure the host copy is current, then use
    ! the implementation of the parent type.
    if (a%is_dev()) call a%sync()
    call a%psb_s_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
  else
    !
    ! Just to test, move X/Y to/from the GPU.
    ! The device buffers hold single precision reals, hence
    ! spgpu_type_float.
    !
    nxy      = min(size(x,2),size(y,2))
    have_gpx = .false.
    have_gpy = .false.

    if (info == 0) then
      info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_float)
      have_gpx = (info == 0)
    end if
    if (info == 0) &
         & info = writeMultiVecDevice(gpX,x,size(x,1))
    if (info == 0) then
      info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_float)
      have_gpy = (info == 0)
    end if
    if (info == 0) &
         & info = writeMultiVecDevice(gpY,y,size(y,1))

    if (info == 0) &
         & info = spmmHdiagDevice(a%deviceMat,alpha,gpX,beta,gpY)
    if (info == 0) &
         & info = readMultiVecDevice(gpY,y,size(y,1))

    ! Release the temporaries before looking at the outcome, so that a
    ! failure in any step above does not leak device memory.
    if (have_gpx) call freeMultiVecDevice(gpX)
    if (have_gpy) call freeMultiVecDevice(gpY)
    if (info /= 0) goto 9999
  endif

  call psb_erractionrestore(err_act)
  return

9999 call psb_error_handler(err_act)

  return

end subroutine psb_s_cuda_hdiag_csmm
|
@ -0,0 +1,123 @@
|
||||
! Parallel Sparse BLAS GPU plugin
|
||||
! (C) Copyright 2013
|
||||
!
|
||||
! Salvatore Filippone
|
||||
! Alessandro Fanfarillo
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
|
||||
!
! Sparse matrix by dense multi-vector product for the CUDA HDIAG storage
! format, double precision complex data:
!
!     Y = alpha*op(A)*X + beta*Y
!
! Arguments:
!   alpha, beta - scalar coefficients.
!   a           - the matrix; must be in the assembled state.
!   x           - input columns; size(x,1) must be at least the number of
!                 columns of op(A).
!   y           - input/output columns; size(y,1) must be at least the
!                 number of rows of op(A).
!   info        - psb_success_ on success, an error code otherwise.
!   trans       - optional; 'N' (default), 'T' or 'C', case insensitive.
!
! The non-transposed product runs on the device on temporary copies of X
! and Y; the transposed cases are delegated to the parent HDIA type.
!
subroutine psb_z_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans)

  use psb_base_mod
  use hdiagdev_mod
  use psb_vectordev_mod
  use psb_z_cuda_hdiag_mat_mod, psb_protect_name => psb_z_cuda_hdiag_csmm
  implicit none
  class(psb_z_cuda_hdiag_sparse_mat), intent(in) :: a
  complex(psb_dpk_), intent(in)          :: alpha, beta, x(:,:)
  complex(psb_dpk_), intent(inout)       :: y(:,:)
  integer(psb_ipk_), intent(out)         :: info
  character, optional, intent(in)        :: trans

  character          :: trans_
  integer(psb_ipk_)  :: m, n, nxy
  type(c_ptr)        :: gpX, gpY
  ! have_gpx/have_gpy record which device buffers were actually allocated,
  ! so that they can be released on every exit path.
  logical            :: tra, have_gpx, have_gpy
  integer(psb_ipk_)  :: err_act
  character(len=20)  :: name='z_cuda_hdiag_csmm'

  info = psb_success_
  call psb_erractionsave(err_act)

  if (present(trans)) then
    trans_ = trans
  else
    trans_ = 'N'
  end if

  if (.not.a%is_asb()) then
    info = psb_err_invalid_mat_state_
    call psb_errpush(info,name)
    goto 9999
  endif

  tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C')

  ! m/n are the row/column counts of op(A).
  if (tra) then
    m = a%get_ncols()
    n = a%get_nrows()
  else
    n = a%get_ncols()
    m = a%get_nrows()
  end if

  if (size(x,1)<n) then
    info = 36
    call psb_errpush(info,name,i_err=(/3*ione,n,izero,izero,izero/))
    goto 9999
  end if

  if (size(y,1)<m) then
    info = 36
    call psb_errpush(info,name,i_err=(/5*ione,m,izero,izero,izero/))
    goto 9999
  end if

  if (tra) then
    ! Transposed product: make sure the host copy is current, then use
    ! the implementation of the parent (double precision complex) type.
    if (a%is_dev()) call a%sync()
    call a%psb_z_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans)
  else
    !
    ! Just to test, move X/Y to/from the GPU.
    ! The device buffers hold double precision complex values, hence
    ! spgpu_type_complex_double.
    !
    nxy      = min(size(x,2),size(y,2))
    have_gpx = .false.
    have_gpy = .false.

    if (info == 0) then
      info = FallocMultiVecDevice(gpX,nxy,size(x,1),spgpu_type_complex_double)
      have_gpx = (info == 0)
    end if
    if (info == 0) &
         & info = writeMultiVecDevice(gpX,x,size(x,1))
    if (info == 0) then
      info = FallocMultiVecDevice(gpY,nxy,size(y,1),spgpu_type_complex_double)
      have_gpy = (info == 0)
    end if
    if (info == 0) &
         & info = writeMultiVecDevice(gpY,y,size(y,1))

    if (info == 0) &
         & info = spmmHdiagDevice(a%deviceMat,alpha,gpX,beta,gpY)
    if (info == 0) &
         & info = readMultiVecDevice(gpY,y,size(y,1))

    ! Release the temporaries before looking at the outcome, so that a
    ! failure in any step above does not leak device memory.
    if (have_gpx) call freeMultiVecDevice(gpX)
    if (have_gpy) call freeMultiVecDevice(gpY)
    if (info /= 0) goto 9999
  endif

  call psb_erractionrestore(err_act)
  return

9999 call psb_error_handler(err_act)

  return

end subroutine psb_z_cuda_hdiag_csmm
|
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2014
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#include "cudadebug.h"
|
||||
#include "cudalang.h"
|
||||
#include "cuComplex.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "core.h"
|
||||
#include "ell.h"
|
||||
int getGPUSharedMemPerBlock();
|
||||
int getGPUMultiProcessors();
|
||||
int getGPUMaxThreadsPerMP();
|
||||
}
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
// Instantiate the generic ELL SpMM code for single precision complex data.
// VALUE_TYPE is the element type used throughout the included base file.
#define VALUE_TYPE cuFloatComplex
|
||||
// TYPE_SYMBOL is pasted into the generated function names
// (spgpu<TYPE_SYMBOL>ellspmm...).
#define TYPE_SYMBOL C
|
||||
// TEX_FETCH_TYPE is only referenced by the texture-cache code path of the
// base file.
#define TEX_FETCH_TYPE cuFloatComplex
|
||||
// The three macros above must be defined before this include.
#include "ell_spmm_base.cuh"
|
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2014
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#include "cudadebug.h"
|
||||
#include "cudalang.h"
|
||||
#include <stdio.h>
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "core.h"
|
||||
#include "ell.h"
|
||||
int getGPUSharedMemPerBlock();
|
||||
int getGPUMultiProcessors();
|
||||
int getGPUMaxThreadsPerMP();
|
||||
}
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
// Instantiate the generic ELL SpMM code for double precision real data.
// VALUE_TYPE is the element type used throughout the included base file.
#define VALUE_TYPE double
|
||||
// TYPE_SYMBOL is pasted into the generated function names
// (spgpu<TYPE_SYMBOL>ellspmm...).
#define TYPE_SYMBOL D
|
||||
// TEX_FETCH_TYPE is only referenced by the texture-cache code path of the
// base file; doubles are fetched there as an int2 and reassembled.
#define TEX_FETCH_TYPE int2
|
||||
// The three macros above must be defined before this include.
#include "ell_spmm_base.cuh"
|
@ -0,0 +1,276 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2015
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
|
||||
// Two-level token pasting: CONCAT expands its arguments first (so macros
// such as VALUE_TYPE / TYPE_SYMBOL are replaced by their values) and then
// PRE_CONCAT glues the results together.
#define PRE_CONCAT(A, B) A ## B
|
||||
#define CONCAT(A, B) PRE_CONCAT(A, B)
|
||||
|
||||
// Drop any definition left over from a previous inclusion; the names are
// (re)defined further down.
#undef GEN_SPGPU_ELL_NAME
|
||||
#undef X_TEX
|
||||
// Name of the texture reference used by the texture-cache code path.
// NOTE(review): FUNC_SUFFIX is not defined in the visible sources - confirm
// where it comes from before enabling the texture path.
#define X_TEX CONCAT(x_tex_, FUNC_SUFFIX)
|
||||
|
||||
// ---------------------------------------------------------------------------
// Single precision arithmetic helpers. The generic code selects them by
// pasting VALUE_TYPE into the helper name (zero_<type>, <type>_fma, ...).
// ---------------------------------------------------------------------------

// Additive identity for float.
__device__ __host__ static float
zero_float()
{
  return 0.0f;
}

// Additive identity for single precision complex.
__device__ __host__ static cuFloatComplex
zero_cuFloatComplex()
{
  return make_cuFloatComplex(0.0, 0.0);
}

// True when x differs from zero.
__device__ __host__ static bool
float_isNotZero(float x)
{
  return x != 0.0f;
}

// a*b + c, built from the PREC_FMUL / PREC_FADD primitives.
__device__ static float
float_fma(float a, float b, float c)
{
  const float product = PREC_FMUL (a, b);
  return PREC_FADD(product, c);
}

// a + b.
__device__ static float
float_add(float a, float b)
{
  return PREC_FADD (a, b);
}

// a * b.
__device__ static float
float_mul(float a, float b)
{
  return PREC_FMUL (a, b);
}

// a*b + c for single precision complex.
__device__ static cuFloatComplex
cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c)
{
  return cuCfmaf(a, b, c);
}

// a + b for single precision complex.
__device__ static cuFloatComplex
cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b)
{
  return cuCaddf(a, b);
}

// a * b for single precision complex.
__device__ static cuFloatComplex
cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b)
{
  return cuCmulf(a, b);
}

// Convert a fetched float to a float value (identity).
__device__ static float
readValue_float(float fetch)
{
  return fetch;
}

// Convert a fetched single precision complex to a value (identity).
__device__ static cuFloatComplex
readValue_cuFloatComplex(cuFloatComplex fetch)
{
  return fetch;
}
|
||||
|
||||
// host or c.c >= 1.3
|
||||
#if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__)
// ---------------------------------------------------------------------------
// Double precision counterparts of the single precision helpers; compiled
// only for host code or for devices with compute capability >= 1.3.
// ---------------------------------------------------------------------------

// Additive identity for double.
__device__ __host__ static double
zero_double()
{
  return 0.0;
}

// Additive identity for double precision complex.
__device__ __host__ static cuDoubleComplex
zero_cuDoubleComplex()
{
  return make_cuDoubleComplex(0.0, 0.0);
}

// True when x differs from zero.
__device__ __host__ static bool
double_isNotZero(double x)
{
  return x != 0.0;
}

// a*b + c, built from the PREC_DMUL / PREC_DADD primitives.
__device__ static double
double_fma(double a, double b, double c)
{
  const double product = PREC_DMUL (a, b);
  return PREC_DADD(product, c);
}

// a + b.
__device__ static double
double_add(double a, double b)
{
  return PREC_DADD (a, b);
}

// a * b.
__device__ static double
double_mul(double a, double b)
{
  return PREC_DMUL (a, b);
}

// a*b + c for double precision complex.
__device__ static cuDoubleComplex
cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c)
{
  return cuCfma(a, b, c);
}

// a + b for double precision complex.
__device__ static cuDoubleComplex
cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b)
{
  return cuCadd(a, b);
}

// a * b for double precision complex.
__device__ static cuDoubleComplex
cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b)
{
  return cuCmul(a, b);
}

// Rebuild a double from the two 32-bit halves of an int2 fetch
// (.y is the high word, .x the low word).
__device__ static double
readValue_double(int2 fetch)
{
  const int hi = fetch.y;
  const int lo = fetch.x;
  return __hiloint2double (hi, lo);
}

// Rebuild a double precision complex from an int4 fetch: (.x,.y) carry the
// real part, (.z,.w) the imaginary part, low word first.
__device__ static cuDoubleComplex
readValue_cuDoubleComplex(int4 fetch)
{
  cuDoubleComplex result;
  result.x = __hiloint2double (fetch.y, fetch.x);
  result.y = __hiloint2double (fetch.w, fetch.z);
  return result;
}
#endif
|
||||
|
||||
// The whole texture-cache path below is compiled out.
#if 0
|
||||
// Texture cache management
|
||||
// Texture reference through which x is read, one TEX_FETCH_TYPE per element.
texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX;
|
||||
|
||||
// Bind / unbind the device vector x to the texture reference.
#define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x)
|
||||
#define unbind_tex_x(x) cudaUnbindTexture(X_TEX)
|
||||
|
||||
// Read element `pointer` of x through the texture and convert the raw fetch
// to VALUE_TYPE with the matching readValue_<type> helper.
__device__ static VALUE_TYPE
|
||||
fetchTex (int pointer)
|
||||
{
|
||||
TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer);
|
||||
return CONCAT(readValue_,VALUE_TYPE) (fetch);
|
||||
}
|
||||
#endif
|
||||
// First (and only enabled) instantiation of the template: the "vanilla"
// variant, named spgpu<x>ellspmm_vanilla.
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_vanilla)
|
||||
// Stable alias for the vanilla name; GEN_SPGPU_ELL_NAME itself is redefined
// below.
#define GEN_SPGPU_ELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),ellspmm_vanilla)
|
||||
#include "ell_spmm_base_template.cuh"
|
||||
// The prefetch / texture-cache instantiations below are compiled out.
#if 0
|
||||
#undef GEN_SPGPU_ELL_NAME
|
||||
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_prefetch)
|
||||
#define GEN_SPGPU_ELL_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),ellspmm_prefetch)
|
||||
#undef USE_PREFETCHING
|
||||
#define USE_PREFETCHING
|
||||
#include "ell_spmm_base_template.cuh"
|
||||
#define ENABLE_CACHE
|
||||
#undef GEN_SPGPU_ELL_NAME
|
||||
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache_prefetch)
|
||||
#define GEN_SPGPU_ELL_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache_prefetch)
|
||||
#include "ell_spmm_base_template.cuh"
|
||||
#undef GEN_SPGPU_ELL_NAME
|
||||
#undef USE_PREFETCHING
|
||||
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache)
|
||||
#define GEN_SPGPU_ELL_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),ellspmm_texcache)
|
||||
#include "ell_spmm_base_template.cuh"
|
||||
#endif
|
||||
// From here on GEN_SPGPU_ELL_NAME(x) names the public entry point,
// spgpu<x>ellspmm, defined below.
#undef GEN_SPGPU_ELL_NAME
|
||||
#define GEN_SPGPU_ELL_NAME(x) CONCAT(CONCAT(spgpu,x),ellspmm)
|
||||
|
||||
#if 0

// Disabled (compiled out) alternative host driver: launches the vanilla
// kernel directly on a fixed-size grid, handling the `count` columns of
// x/y/z in groups of at most MMBSZ columns.
//
// Fixes with respect to the previous text of this disabled code:
//  - the diagnostic printed two ints with %ld;
//  - after the half-sized launch the column pointers were not advanced, so
//    the final launch recomputed the same columns and the last ones were
//    never written.
void
GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
 int count,
 VALUE_TYPE* z,
 int zPitch,
 const VALUE_TYPE *y,
 int yPitch,
 VALUE_TYPE alpha,
 const VALUE_TYPE* cM,
 const int* rP,
 int cMPitch,
 int rPPitch,
 const __device int* rS,
 const __device int* rIdx,
 int avgNnzPerRow,
 int maxNnzPerRow,
 int rows,
 const VALUE_TYPE *x,
 int xPitch,
 VALUE_TYPE beta,
 int baseIndex)
{
  VALUE_TYPE *px,*py, *pz;
  int cnt, c1;

  dim3 block (THREAD_BLOCK, 1);
  // dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
  // Should we generalize the code to 1/2/4/8 threads per row?
  // And maybe adjust THREAD_BLOCK size?
  int shrMemSize,maxShmemSz;
  // One grid sized from the device limits: as many blocks as the
  // multiprocessors can hold threads for.
  int numMp=getGPUMultiProcessors();
  int maxThMp=getGPUMaxThreadsPerMP();
  int nmblksMp=maxThMp/THREAD_BLOCK;
  int nmblk=nmblksMp*numMp;
  dim3 grid (nmblk);

  maxShmemSz=getGPUSharedMemPerBlock();
  shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
  if (shrMemSize > maxShmemSz) {
    fprintf(stderr,"Fatal error: SHMEM size too large %d %d\n",shrMemSize,maxShmemSz);
    return;
  }
  cnt = count;
  px = (VALUE_TYPE *) x;
  py = (VALUE_TYPE *) y;
  pz = (VALUE_TYPE *) z;

  // Full groups of MMBSZ columns while more than two groups remain.
  while (cnt > 2*MMBSZ) {
    CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
      <<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch, py, yPitch,
                                                               alpha, cM, rP, cMPitch, rPPitch,
                                                               rS, rows, px, xPitch, beta, baseIndex);
    px += xPitch*MMBSZ;
    py += yPitch*MMBSZ;
    pz += zPitch*MMBSZ;
    cnt -= MMBSZ;
  }

  // Between MMBSZ+1 and 2*MMBSZ columns left: split them in two halves.
  if (cnt > MMBSZ) {
    c1 = cnt/2;
    CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
      <<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch, py, yPitch,
                                                               alpha, cM, rP, cMPitch, rPPitch,
                                                               rS, rows, px, xPitch, beta, baseIndex);
    // Move past the columns just processed.
    px += xPitch*c1;
    py += yPitch*c1;
    pz += zPitch*c1;
    cnt -= c1;
  }

  if (cnt > MMBSZ) {
    fprintf(stderr,"Invalid residual count %d\n",cnt);
  } else if (cnt > 0){
    CONCAT(GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
      <<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch, py, yPitch,
                                                               alpha, cM, rP, cMPitch, rPPitch,
                                                               rS, rows, px, xPitch, beta, baseIndex);
  }
  cudaCheckError("CUDA error on ell_spmm");
}

#endif
|
||||
|
||||
// Host entry point spgpu<TYPE_SYMBOL>ellspmm: sparse matrix (ELL storage)
// times multi-vector product on `count` columns of x, y and z, whose
// consecutive columns are xPitch / yPitch / zPitch elements apart.
//
// The work is split in two directions before being handed to the vanilla
// implementation (_spgpu<TYPE_SYMBOL>ellspmm_vanilla):
//  - rows: at most maxNForACall rows per pass; between passes y, z, cM, rP
//    and rS are advanced by that many elements, while x is left untouched;
//  - columns: at most MMBSZ columns per call.
//
// NOTE(review): rIdx is forwarded unchanged on every row pass, exactly as
// before; confirm this is intended when rIdx is in use.
void
GEN_SPGPU_ELL_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
 int count,
 VALUE_TYPE* z,
 int zPitch,
 const VALUE_TYPE *y,
 int yPitch,
 VALUE_TYPE alpha,
 const VALUE_TYPE* cM,
 const int* rP,
 int cMPitch,
 int rPPitch,
 const __device int* rS,
 const __device int* rIdx,
 int avgNnzPerRow,
 int maxNnzPerRow,
 int rows,
 const VALUE_TYPE *x,
 int xPitch,
 VALUE_TYPE beta,
 int baseIndex)
{
  // Largest number of rows handled by one pass: THREAD_BLOCK rows per block
  // times the maximum grid size. The product is formed in 64 bits and
  // clamped to the int range, because with current grid limits
  // THREAD_BLOCK*maxGridSizeX does not fit in an int (signed overflow).
  const long long wideLimit = (long long) THREAD_BLOCK * (long long) handle->maxGridSizeX;
  const int maxNForACall = (wideLimit > 0x7fffffffLL) ? 0x7fffffff : (int) wideLimit;

  // Queried as before; the value is currently not used.
  int maxShmemSz = getGPUSharedMemPerBlock();
  (void) maxShmemSz;
  //fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz);

  for (;;) {
    // Rows covered by this pass: a full slice while more than maxNForACall
    // rows remain (managing large vectors), otherwise whatever is left.
    const int passRows = (rows > maxNForACall) ? maxNForACall : rows;

    VALUE_TYPE *px = (VALUE_TYPE *) x;
    VALUE_TYPE *py = (VALUE_TYPE *) y;
    VALUE_TYPE *pz = (VALUE_TYPE *) z;

    // Sweep the columns in groups of at most MMBSZ.
    int cnt = count;
    while (cnt > 0) {
      const int ncols = (cnt > MMBSZ) ? MMBSZ : cnt;

      CONCAT(_,GEN_SPGPU_ELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, ncols, pz, zPitch,
                                                         py, yPitch,
                                                         alpha, cM, rP,
                                                         cMPitch, rPPitch,
                                                         rS, rIdx, avgNnzPerRow,
                                                         maxNnzPerRow, passRows,
                                                         px, xPitch, beta, baseIndex);
      cnt -= ncols;
      if (cnt > 0) {
        px += xPitch*ncols;
        py += yPitch*ncols;
        pz += zPitch*ncols;
      }
    }

    if (rows <= maxNForACall)
      break;

    // Move on to the next slice of rows.
    y = y + maxNForACall;
    z = z + maxNForACall;
    cM = cM + maxNForACall;
    rP = rP + maxNForACall;
    rS = rS + maxNForACall;
    rows -= maxNForACall;
  }

  cudaCheckError("CUDA error on ell_spmm");
}
|
@ -0,0 +1,169 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2015
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
// NOTE(review): IDX2 is not referenced in the visible part of this file;
// presumably a feature switch for the kernels - confirm.
#define IDX2
|
||||
// Number of threads per block assumed by the kernels (it sizes the shared
// scratch array of the disabled kernel below).
#define THREAD_BLOCK 128
|
||||
// Maximum number of multi-vector columns processed by a single call.
#define MMBSZ 8
|
||||
|
||||
#if 0

// Disabled (compiled out) kernel: ELL sparse matrix times `count` columns,
//   z(:,k) = beta*y(:,k) + alpha*A*x(:,k),  k = 0 .. count-1.
// Each thread handles rows i, i+gridSize, i+2*gridSize, ...; the entries of
// row i are read at cM[i + j*cMPitch] with column indices rP[i + j*rPPitch],
// and rS[i] is the number of entries in the row.
//
// Requires count <= MMBSZ and blockDim.x <= THREAD_BLOCK (dimensions of the
// shared scratch array).
//
// Fixes with respect to the previous text of this disabled code:
//  - the row length was read as rS[i] after rS had already been advanced by
//    i, i.e. from row 2*i;
//  - rS, rP and cM were advanced in place and never reset, so every row
//    after the first one handled by a thread used wrong addresses.
__global__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
 VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
 int cMPitch, int rPPitch, const int* rS, int rows,
 const VALUE_TYPE *x, int xPitch,
 VALUE_TYPE beta, int baseIndex)
{
  // Per-thread accumulators, one per column.
  __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];

  unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
  unsigned int gridSize = gridDim.x * blockDim.x;

  while (i < rows) {
    // Row-local cursors; the kernel arguments stay untouched so that the
    // next row handled by this thread starts from the right base.
    const int *rowP = rP + i;
    const VALUE_TYPE *rowM = cM + i;
    int rowSize = rS[i];

    for (int k=0; k<count; k++) {
      temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
    }

    for (int j = 0; j < rowSize; j++) {
      int pointer = rowP[0] - baseIndex;
      rowP += rPPitch;

      VALUE_TYPE value = rowM[0];
      rowM += cMPitch;

      // Same matrix entry applied to every column of x.
      const VALUE_TYPE *px = x;
      for (int k=0; k<count; k++) {
        VALUE_TYPE fetch = px[pointer];
        temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
        px = px + xPitch;
      }
    }

    // Since z and y are accessed with the same offset by the same thread,
    // and the write to z follows the y read, y and z can share the same base address (in-place computing).
    const VALUE_TYPE *py = y;
    VALUE_TYPE *pz = z;
    if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
      for (int k=0; k<count; k++) {
        VALUE_TYPE yVal = py[i];
        pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
        py += yPitch;
        pz += zPitch;
      }
    } else {
      // beta == 0: y is not read at all.
      for (int k=0; k<count; k++) {
        pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
        pz += zPitch;
      }
    }

    i += gridSize;
  }
}

#endif
|
||||
|
||||
/*
 * ELL multi-vector SpMM kernel: one thread per matrix row.
 *
 * Thread i accumulates, for each of the `count` columns of x, the product of
 * sparse row i with that column into temp[k][threadIdx.x], and then stores
 *     z = beta*y + alpha*temp   (beta != 0)
 *     z = alpha*temp            (beta == 0, y is not read)
 *
 * count must not exceed MMBSZ (temp has MMBSZ rows), and the block size must
 * be THREAD_BLOCK, which is used both for the row index and for temp's width.
 */
__global__ void
CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
 VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
 int cMPitch, int rPPitch, const int* rS, int rows,
 const VALUE_TYPE *x, int xPitch,
 VALUE_TYPE beta, int baseIndex)
{
  __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];

  const int tid = threadIdx.x;
  const int i = tid + blockIdx.x * (THREAD_BLOCK);

  // Threads past the last row have nothing to do.
  if (i >= rows)
    return;

  const int rowSize = rS[i];
  for (int k = 0; k < count; k++)
    temp[k][tid] = CONCAT(zero_,VALUE_TYPE)();

  // Walk the row: consecutive entries are rPPitch / cMPitch elements apart.
  const int *colCursor = rP + i;
  const VALUE_TYPE *valCursor = cM + i;
  for (int j = 0; j < rowSize; j++, colCursor += rPPitch, valCursor += cMPitch) {
    const int column = colCursor[0] - baseIndex;
    const VALUE_TYPE value = valCursor[0];

    const VALUE_TYPE *xk = x;
    for (int k = 0; k < count; k++, xk += xPitch)
      temp[k][tid] = CONCAT(VALUE_TYPE, _fma)(value, xk[column], temp[k][tid]);
  }

  // Since z and y are accessed with the same offset by the same thread,
  // and the write to z follows the y read, y and z can share the same
  // base address (in-place computing).
  const bool useBeta = CONCAT(VALUE_TYPE, _isNotZero(beta));
  const VALUE_TYPE *yk = y;
  VALUE_TYPE *zk = z;
  for (int k = 0; k < count; k++, zk += zPitch) {
    VALUE_TYPE result = CONCAT(VALUE_TYPE, _mul) (alpha, temp[k][tid]);
    if (useBeta) {
      result = CONCAT(VALUE_TYPE, _fma)(beta, yk[i], result);
      yk += yPitch;
    }
    zk[i] = result;
  }
}
|
||||
|
||||
/*
 * Host-side launcher for the ELL multi-vector SpMM kernel.
 *
 * Launches one thread per row in blocks of THREAD_BLOCK threads on
 * handle->currentStream. rIdx, avgNnzPerRow and maxNnzPerRow are accepted
 * but not used by this launcher.
 *
 * The call is a no-op when rows <= 0 or count <= 0: a grid with zero blocks
 * is an invalid launch configuration, and there is nothing to compute.
 */
void
CONCAT(_,GEN_SPGPU_ELL_NAME(TYPE_SYMBOL))
(spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y, int yPitch,
 VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int cMPitch, int rPPitch,
 const int* rS, const __device int* rIdx, int avgNnzPerRow, int maxNnzPerRow, int rows,
 const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta, int baseIndex)
{
  if (rows <= 0 || count <= 0)
    return;

  dim3 block (THREAD_BLOCK, 1);
  dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
  // Should we generalize the code to 1/2/4/8 threads per row?
  // And maybe adjust THREAD_BLOCK size?

  // The kernel's accumulators live in a statically sized __shared__ array and
  // it declares no extern shared buffer, so no dynamic shared memory is needed.
  CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn)
    <<< grid, block, 0, handle->currentStream >>> (count, z, zPitch, y, yPitch,
                                                   alpha, cM, rP, cMPitch, rPPitch, rS, rows,
                                                   x, xPitch, beta, baseIndex);
}
|
@ -0,0 +1,34 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2014
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#include "cudadebug.h"
|
||||
#include "cudalang.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "core.h"
|
||||
#include "ell.h"
|
||||
int getGPUSharedMemPerBlock();
|
||||
int getGPUMultiProcessors();
|
||||
int getGPUMaxThreadsPerMP();
|
||||
}
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
#define VALUE_TYPE float
|
||||
#define TYPE_SYMBOL S
|
||||
#define TEX_FETCH_TYPE float
|
||||
#include "ell_spmm_base.cuh"
|
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2014
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#include "cudadebug.h"
|
||||
#include "cudalang.h"
|
||||
#include "cuComplex.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "core.h"
|
||||
#include "ell.h"
|
||||
int getGPUSharedMemPerBlock();
|
||||
int getGPUMultiProcessors();
|
||||
int getGPUMaxThreadsPerMP();
|
||||
}
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
#define VALUE_TYPE cuDoubleComplex
|
||||
#define TYPE_SYMBOL Z
|
||||
#define TEX_FETCH_TYPE int4
|
||||
#include "ell_spmm_base.cuh"
|
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2015
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#include "cudadebug.h"
|
||||
#include "cudalang.h"
|
||||
#include "cuComplex.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "core.h"
|
||||
#include "hdia.h"
|
||||
int getGPUSharedMemPerBlock();
|
||||
int getGPUMultiProcessors();
|
||||
int getGPUMaxThreadsPerMP();
|
||||
}
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
#define VALUE_TYPE cuFloatComplex
|
||||
#define TYPE_SYMBOL C
|
||||
#define TEX_FETCH_TYPE cuFloatComplex
|
||||
#include "hdia_spmm_base.cuh"
|
@ -0,0 +1,36 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2014
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#include "cudadebug.h"
|
||||
#include "cudalang.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "core.h"
|
||||
#include "hdia.h"
|
||||
int getGPUSharedMemPerBlock();
|
||||
int getGPUMultiProcessors();
|
||||
int getGPUMaxThreadsPerMP();
|
||||
}
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
//#define ENABLE_CACHE
|
||||
#define VALUE_TYPE double
|
||||
#define TYPE_SYMBOL D
|
||||
//#define TEX_FETCH_TYPE int2
|
||||
#include "hdia_spmm_base.cuh"
|
||||
|
@ -0,0 +1,188 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2015
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
|
||||
#define PRE_CONCAT(A, B) A ## B
|
||||
#define CONCAT(A, B) PRE_CONCAT(A, B)
|
||||
|
||||
#undef GEN_SPGPU_HDIA_NAME
|
||||
#undef X_TEX
|
||||
#define X_TEX CONCAT(x_tex_, FUNC_SUFFIX)
|
||||
|
||||
// ---- single-precision real / complex helpers --------------------------------

// Additive identity for float.
__device__ __host__ static float zero_float()
{
  return 0.0f;
}

// Additive identity for cuFloatComplex.
__device__ __host__ static cuFloatComplex zero_cuFloatComplex()
{
  return make_cuFloatComplex(0.0f, 0.0f);
}

// True when x compares different from zero.
__device__ __host__ static bool float_isNotZero(float x)
{
  return !(x == 0.0f);
}

// a*b + c, built from the PREC_FMUL / PREC_FADD macros.
__device__ static float float_fma(float a, float b, float c)
{
  const float product = PREC_FMUL (a, b);
  return PREC_FADD(product, c);
}

__device__ static float float_add(float a, float b)
{
  return PREC_FADD (a, b);
}

__device__ static float float_mul(float a, float b)
{
  return PREC_FMUL (a, b);
}

// Complex counterparts, forwarded to the cuComplex routines.
__device__ static cuFloatComplex cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c)
{
  return cuCfmaf(a, b, c);
}

__device__ static cuFloatComplex cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b)
{
  return cuCaddf(a, b);
}

__device__ static cuFloatComplex cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b)
{
  return cuCmulf(a, b);
}

// Convert a fetched element to the value type (identity for these types).
__device__ static float readValue_float(float fetch)
{
  return fetch;
}

__device__ static cuFloatComplex readValue_cuFloatComplex(cuFloatComplex fetch)
{
  return fetch;
}

// ---- double-precision real / complex helpers --------------------------------
// host or c.c >= 1.3
#if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__)

// Additive identity for double.
__device__ __host__ static double zero_double()
{
  return 0.0;
}

// Additive identity for cuDoubleComplex.
__device__ __host__ static cuDoubleComplex zero_cuDoubleComplex()
{
  return make_cuDoubleComplex(0.0, 0.0);
}

// True when x compares different from zero.
__device__ __host__ static bool double_isNotZero(double x)
{
  return !(x == 0.0);
}

// a*b + c, built from the PREC_DMUL / PREC_DADD macros.
__device__ static double double_fma(double a, double b, double c)
{
  const double product = PREC_DMUL (a, b);
  return PREC_DADD(product, c);
}

__device__ static double double_add(double a, double b)
{
  return PREC_DADD (a, b);
}

__device__ static double double_mul(double a, double b)
{
  return PREC_DMUL (a, b);
}

// Complex counterparts, forwarded to the cuComplex routines.
__device__ static cuDoubleComplex cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c)
{
  return cuCfma(a, b, c);
}

__device__ static cuDoubleComplex cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b)
{
  return cuCadd(a, b);
}

__device__ static cuDoubleComplex cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b)
{
  return cuCmul(a, b);
}

// Rebuild a double from an int2 fetch: .y is the high word, .x the low word.
__device__ static double readValue_double(int2 fetch)
{
  return __hiloint2double (fetch.y, fetch.x);
}

// Rebuild a double complex from an int4 fetch: (x,y) -> real, (z,w) -> imaginary.
__device__ static cuDoubleComplex readValue_cuDoubleComplex(int4 fetch)
{
  const double re = __hiloint2double (fetch.y, fetch.x);
  const double im = __hiloint2double (fetch.w, fetch.z);
  cuDoubleComplex c;
  c.x = re;
  c.y = im;
  return c;
}
#endif
|
||||
#if 0
|
||||
// Texture cache management
|
||||
texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX;
|
||||
|
||||
#define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x)
|
||||
#define unbind_tex_x(x) cudaUnbindTexture(X_TEX)
|
||||
|
||||
__device__ static VALUE_TYPE
|
||||
fetchTex (int pointer)
|
||||
{
|
||||
TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer);
|
||||
return CONCAT(readValue_,VALUE_TYPE) (fetch);
|
||||
}
|
||||
#endif
|
||||
#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_vanilla)
|
||||
#define GEN_SPGPU_HDIA_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_vanilla)
|
||||
#include "hdia_spmm_base_template.cuh"
|
||||
#if 0
|
||||
#undef GEN_SPGPU_HDIA_NAME
|
||||
#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_prefetch)
|
||||
#define GEN_SPGPU_HDIA_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_prefetch)
|
||||
#undef USE_PREFETCHING
|
||||
#define USE_PREFETCHING
|
||||
#include "hdia_spmm_base_template.cuh"
|
||||
#define ENABLE_CACHE
|
||||
#undef GEN_SPGPU_HDIA_NAME
|
||||
#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache_prefetch)
|
||||
#define GEN_SPGPU_HDIA_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache_prefetch)
|
||||
#include "hdia_spmm_base_template.cuh"
|
||||
#undef GEN_SPGPU_HDIA_NAME
|
||||
#undef USE_PREFETCHING
|
||||
#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache)
|
||||
#define GEN_SPGPU_HDIA_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hdiaspmm_texcache)
|
||||
#include "hdia_spmm_base_template.cuh"
|
||||
#endif
|
||||
#undef GEN_SPGPU_HDIA_NAME
|
||||
#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmm)
|
||||
|
||||
/*
 * Host driver for the HDIA multi-vector product.
 *
 * The work is split along two axes:
 *   - rows: at most maxNForACall rows per kernel launch (rounded down to a
 *     multiple of hackSize); after each full chunk y, z and hackOffsets are
 *     advanced to the next chunk, while x, cM and hdiaOffsets are passed
 *     unchanged;
 *   - columns: the `count` columns of x/y/z are processed in batches of at
 *     most MMBSZ columns, stepping the three pointers by their pitch.
 *
 * The last row chunk (rows <= maxNForACall) is always issued, and any CUDA
 * error is reported through cudaCheckError at the end.
 */
void
GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL)
(spgpuHandle_t handle,
 int count,
 VALUE_TYPE* z,
 int zPitch,
 const VALUE_TYPE *y,
 int yPitch,
 VALUE_TYPE alpha,
 const VALUE_TYPE* cM,
 const int* hdiaOffsets,
 int hackSize,
 const __device int* hackOffsets,
 int rows,
 int cols,
 const VALUE_TYPE *x,
 int xPitch,
 VALUE_TYPE beta)
{
  int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);

  // maxNForACall should be a multiple of hackSize
  maxNForACall = (maxNForACall/hackSize)*hackSize;

  const int maxShmemSz = getGPUSharedMemPerBlock();
  (void) maxShmemSz;

  for (;;) {
    const bool lastChunk = !(rows > maxNForACall);
    const int chunkRows = lastChunk ? rows : maxNForACall;

    // Sweep the columns of this row chunk, MMBSZ at a time.
    const VALUE_TYPE *xCol = x;
    const VALUE_TYPE *yCol = y;
    VALUE_TYPE *zCol = z;
    int remaining = count;
    while (remaining > 0) {
      const int batch = (remaining > MMBSZ) ? MMBSZ : remaining;
      CONCAT(_,GEN_SPGPU_HDIA_NAME_VANILLA(TYPE_SYMBOL)) (handle, batch, zCol, zPitch,
                                                          yCol, yPitch,
                                                          alpha, cM, hdiaOffsets,
                                                          hackSize, hackOffsets,
                                                          chunkRows, cols,
                                                          xCol, xPitch, beta);
      xCol += xPitch*batch;
      yCol += yPitch*batch;
      zCol += zPitch*batch;
      remaining -= batch;
    }

    if (lastChunk)
      break;

    // managing large vectors: move on to the next block of rows
    y = y + maxNForACall;
    z = z + maxNForACall;
    hackOffsets = hackOffsets + maxNForACall/hackSize;
    rows -= maxNForACall;
  }

  cudaCheckError("CUDA error on hdiag_spmm");
}
|
@ -0,0 +1,141 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2015
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#define THREAD_BLOCK 128
|
||||
#define MMBSZ 8
|
||||
|
||||
/*
 * HDIA multi-vector SpMM kernel: one thread per matrix row.
 *
 * Rows are grouped in hacks of hackSize rows; hackOffsets[h] .. hackOffsets[h+1]
 * delimits the diagonals of hack h inside hdiaOffsets and cM. Lane 0 of each
 * warp loads the two offsets and broadcasts them, so all lanes of a warp are
 * treated as belonging to the same hack.
 * NOTE(review): this presumably requires hackSize to be a multiple of the
 * warp size -- confirm against the code that builds the format.
 *
 * For each of the `count` columns of x the thread accumulates its row product
 * into temp[k][threadIdx.x], then stores
 *     z = beta*y + alpha*temp   (beta != 0)
 *     z = alpha*temp            (beta == 0)
 *
 * count must not exceed MMBSZ (temp has MMBSZ rows). The launch must use
 * THREAD_BLOCK threads per block and provide at least THREAD_BLOCK ints of
 * dynamic shared memory for the per-warp offsets staging buffer.
 */
__global__ void
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y,
 int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* hdiaOffsets,
 int hackSize, const int* hackOffsets, int rows, int cols,
 const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta)
{
  VALUE_TYPE *pz,*px,*py;
  VALUE_TYPE yVal;
  __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];

  int hackCount = (rows + hackSize - 1)/hackSize;

  int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);

  int hackId = i / hackSize;
  int hackLaneId = i % hackSize;

  // Per-warp staging area for a chunk of diagonal offsets.
  extern __shared__ int dynShrMem[];

  int hackOffset = 0;
  int nextOffset = 0;

  unsigned int laneId = threadIdx.x % warpSize;
  unsigned int warpId = threadIdx.x / warpSize;

  // Lane 0 reads the hack boundaries; the shuffles broadcast them to the warp.
  // All lanes must still be active here, so the early exits come afterwards.
  if (laneId == 0 && i < rows) {
    hackOffset = hackOffsets[hackId];
    nextOffset = hackOffsets[hackId+1];
  }

  hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0);
  nextOffset = __shfl_sync(0xFFFFFFFF,nextOffset, 0);

  if (hackId >= hackCount)
    return;

  cM += hackOffset*hackSize + hackLaneId;
  hdiaOffsets += hackOffset;

  for (int k=0; k<count; k++) {
    temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
  }

  // diags for this hack is next hackOffset minus current hackOffset
  int diags = nextOffset - hackOffset;

  // Warp oriented
  int rounds = (diags + warpSize - 1)/warpSize;

  volatile int *offsetsChunk = dynShrMem + warpId*warpSize;

  for (int r = 0; r < rounds; r++) {
    // in the last round diags will be <= warpSize
    if (laneId < diags)
      offsetsChunk[laneId] = hdiaOffsets[laneId];

    // Every lane reads entries written by other lanes: make the writes
    // visible warp-wide before anyone consumes them. Lanes of a warp are not
    // guaranteed to run in lockstep, so an explicit barrier is required.
    __syncwarp();

    if (i < rows) {
      int dCount = min(diags, warpSize);

      for (int j = 0; j < dCount; ++j) {
        int column = offsetsChunk[j] + i;

        if(column >= 0 && column < cols) {
          px = (VALUE_TYPE *) x;
          for (int k = 0; k < count; k++) {
            VALUE_TYPE xValue = px[column];
            VALUE_TYPE mValue = cM[0];
            temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, temp[k][threadIdx.x]);
            px = px + xPitch;
          }
        }
        cM += hackSize;
      }
    }

    // Do not let a fast lane overwrite the chunk for the next round while a
    // slower lane is still reading the current one.
    __syncwarp();

    diags -= warpSize;
    hdiaOffsets += warpSize;
  }

  // Since z and y are accessed with the same offset by the same thread,
  // and the write to z follows the y read, y and z can share the same
  // base address (in-place computing).
  if (i >= rows)
    return;

  py = (VALUE_TYPE *) y;
  pz = z;

  if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
    for (int k=0; k<count; k++) {
      yVal = py[i];
      pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
      py += yPitch;
      pz += zPitch;
    }
  else
    for (int k=0; k<count; k++) {
      pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
      pz += zPitch;
    }
}
|
||||
|
||||
/*
 * Host-side launcher for the HDIA multi-vector SpMM kernel.
 *
 * Launches one thread per row in blocks of THREAD_BLOCK threads on
 * handle->currentStream.
 *
 * The call is a no-op when rows <= 0 or count <= 0: a grid with zero blocks
 * is an invalid launch configuration, and there is nothing to compute.
 */
void
CONCAT(_,GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL))
(spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y,
 int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* hdiaOffsets, int hackSize,
 const __device int* hackOffsets, int rows, int cols, const VALUE_TYPE *x,
 int xPitch, VALUE_TYPE beta)
{
  if (rows <= 0 || count <= 0)
    return;

  dim3 block (THREAD_BLOCK, 1);
  dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
  // Should we generalize the code to 1/2/4/8 threads per row?
  // And maybe adjust THREAD_BLOCK size?

  // The kernel's dynamic shared buffer stages one int per thread (each warp
  // uses warpSize ints at offset warpId*warpSize); its accumulators are in a
  // separate statically sized __shared__ array.
  int shrMemSize = THREAD_BLOCK*sizeof(int);

  CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)
    <<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch, y, yPitch,
                                                            alpha, cM, hdiaOffsets, hackSize, hackOffsets, rows, cols,
                                                            x, xPitch, beta);
}
|
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2015
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#include "cudadebug.h"
|
||||
#include "cudalang.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "core.h"
|
||||
#include "hdia.h"
|
||||
int getGPUSharedMemPerBlock();
|
||||
int getGPUMultiProcessors();
|
||||
int getGPUMaxThreadsPerMP();
|
||||
}
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
#define VALUE_TYPE float
|
||||
#define TYPE_SYMBOL S
|
||||
#define TEX_FETCH_TYPE float
|
||||
#include "hdia_spmm_base.cuh"
|
||||
|
@ -0,0 +1,36 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2015
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#include "cudadebug.h"
|
||||
#include "cudalang.h"
|
||||
#include "cuComplex.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "core.h"
|
||||
#include "hdia.h"
|
||||
int getGPUSharedMemPerBlock();
|
||||
int getGPUMultiProcessors();
|
||||
int getGPUMaxThreadsPerMP();
|
||||
}
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
#define VALUE_TYPE cuDoubleComplex
|
||||
#define TYPE_SYMBOL Z
|
||||
#define TEX_FETCH_TYPE int4
|
||||
#include "hdia_spmm_base.cuh"
|
||||
|
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2014
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#include "cudadebug.h"
|
||||
#include "cudalang.h"
|
||||
#include "cuComplex.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "core.h"
|
||||
#include "hell.h"
|
||||
int getGPUSharedMemPerBlock();
|
||||
int getGPUMultiProcessors();
|
||||
int getGPUMaxThreadsPerMP();
|
||||
}
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
#define VALUE_TYPE cuFloatComplex
|
||||
#define TYPE_SYMBOL C
|
||||
#define TEX_FETCH_TYPE cuFloatComplex
|
||||
#include "hell_spmm_base.cuh"
|
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2014
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#include "cudadebug.h"
|
||||
#include "cudalang.h"
|
||||
#include <stdio.h>
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "core.h"
|
||||
#include "hell.h"
|
||||
int getGPUSharedMemPerBlock();
|
||||
int getGPUMultiProcessors();
|
||||
int getGPUMaxThreadsPerMP();
|
||||
}
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
#define VALUE_TYPE double
|
||||
#define TYPE_SYMBOL D
|
||||
#define TEX_FETCH_TYPE int2
|
||||
#include "hell_spmm_base.cuh"
|
@ -0,0 +1,274 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2015
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#define PRE_CONCAT(A, B) A ## B
|
||||
#define CONCAT(A, B) PRE_CONCAT(A, B)
|
||||
|
||||
#undef GEN_SPGPU_HELL_NAME
|
||||
#undef X_TEX
|
||||
#define X_TEX CONCAT(x_tex_, FUNC_SUFFIX)
|
||||
|
||||
// ---- single-precision real / complex helpers --------------------------------

// Additive identity for float.
__device__ __host__ static float zero_float()
{
  return 0.0f;
}

// Additive identity for cuFloatComplex.
__device__ __host__ static cuFloatComplex zero_cuFloatComplex()
{
  return make_cuFloatComplex(0.0f, 0.0f);
}

// True when x compares different from zero.
__device__ __host__ static bool float_isNotZero(float x)
{
  return !(x == 0.0f);
}

// a*b + c, built from the PREC_FMUL / PREC_FADD macros.
__device__ static float float_fma(float a, float b, float c)
{
  const float product = PREC_FMUL (a, b);
  return PREC_FADD(product, c);
}

__device__ static float float_add(float a, float b)
{
  return PREC_FADD (a, b);
}

__device__ static float float_mul(float a, float b)
{
  return PREC_FMUL (a, b);
}

// Complex counterparts, forwarded to the cuComplex routines.
__device__ static cuFloatComplex cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c)
{
  return cuCfmaf(a, b, c);
}

__device__ static cuFloatComplex cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b)
{
  return cuCaddf(a, b);
}

__device__ static cuFloatComplex cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b)
{
  return cuCmulf(a, b);
}

// Convert a fetched element to the value type (identity for these types).
__device__ static float readValue_float(float fetch)
{
  return fetch;
}

__device__ static cuFloatComplex readValue_cuFloatComplex(cuFloatComplex fetch)
{
  return fetch;
}

// ---- double-precision real / complex helpers --------------------------------
// host or c.c >= 1.3
#if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__)

// Additive identity for double.
__device__ __host__ static double zero_double()
{
  return 0.0;
}

// Additive identity for cuDoubleComplex.
__device__ __host__ static cuDoubleComplex zero_cuDoubleComplex()
{
  return make_cuDoubleComplex(0.0, 0.0);
}

// True when x compares different from zero.
__device__ __host__ static bool double_isNotZero(double x)
{
  return !(x == 0.0);
}

// a*b + c, built from the PREC_DMUL / PREC_DADD macros.
__device__ static double double_fma(double a, double b, double c)
{
  const double product = PREC_DMUL (a, b);
  return PREC_DADD(product, c);
}

__device__ static double double_add(double a, double b)
{
  return PREC_DADD (a, b);
}

__device__ static double double_mul(double a, double b)
{
  return PREC_DMUL (a, b);
}

// Complex counterparts, forwarded to the cuComplex routines.
__device__ static cuDoubleComplex cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c)
{
  return cuCfma(a, b, c);
}

__device__ static cuDoubleComplex cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b)
{
  return cuCadd(a, b);
}

__device__ static cuDoubleComplex cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b)
{
  return cuCmul(a, b);
}

// Rebuild a double from an int2 fetch: .y is the high word, .x the low word.
__device__ static double readValue_double(int2 fetch)
{
  return __hiloint2double (fetch.y, fetch.x);
}

// Rebuild a double complex from an int4 fetch: (x,y) -> real, (z,w) -> imaginary.
__device__ static cuDoubleComplex readValue_cuDoubleComplex(int4 fetch)
{
  const double re = __hiloint2double (fetch.y, fetch.x);
  const double im = __hiloint2double (fetch.w, fetch.z);
  cuDoubleComplex c;
  c.x = re;
  c.y = im;
  return c;
}
#endif
|
||||
#if 0
|
||||
// Texture cache management
|
||||
texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX;
|
||||
|
||||
#define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x)
|
||||
#define unbind_tex_x(x) cudaUnbindTexture(X_TEX)
|
||||
|
||||
__device__ static VALUE_TYPE
|
||||
fetchTex (int pointer)
|
||||
{
|
||||
TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer);
|
||||
return CONCAT(readValue_,VALUE_TYPE) (fetch);
|
||||
}
|
||||
#endif
|
||||
#if __CUDA_ARCH__ < 300
|
||||
extern __shared__ int dynShrMem[];
|
||||
#endif
|
||||
|
||||
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
|
||||
#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
|
||||
#include "hell_spmm_base_template.cuh"
|
||||
#undef GEN_SPGPU_HELL_NAME
|
||||
#if 0
|
||||
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_prefetch)
|
||||
#define GEN_SPGPU_HELL_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hellspmm_prefetch)
|
||||
#undef USE_PREFETCHING
|
||||
#define USE_PREFETCHING
|
||||
#include "hell_spmm_base_template.cuh"
|
||||
#define ENABLE_CACHE
|
||||
#undef GEN_SPGPU_HELL_NAME
|
||||
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache_prefetch)
|
||||
#define GEN_SPGPU_HELL_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache_prefetch)
|
||||
#include "hell_spmm_base_template.cuh"
|
||||
#undef GEN_SPGPU_HELL_NAME
|
||||
#undef USE_PREFETCHING
|
||||
#endif
|
||||
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache)
|
||||
#define GEN_SPGPU_HELL_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hellspmm_texcache)
|
||||
#include "hell_spmm_base_template.cuh"
|
||||
#undef GEN_SPGPU_HELL_NAME
|
||||
#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm)
|
||||
|
||||
#if 0
|
||||
|
||||
void
|
||||
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
|
||||
(spgpuHandle_t handle,
|
||||
int count,
|
||||
VALUE_TYPE* z,
|
||||
int zPitch,
|
||||
const VALUE_TYPE *y,
|
||||
int yPitch,
|
||||
VALUE_TYPE alpha,
|
||||
const VALUE_TYPE* cM,
|
||||
const int* rP,
|
||||
int hackSize,
|
||||
const __device int* hackOffsets,
|
||||
const __device int* rS,
|
||||
const __device int* rIdx,
|
||||
int rows,
|
||||
const VALUE_TYPE *x,
|
||||
int xPitch,
|
||||
VALUE_TYPE beta,
|
||||
int baseIndex)
|
||||
{
|
||||
VALUE_TYPE *px,*py, *pz;
|
||||
int cnt, c1;
|
||||
|
||||
dim3 block (THREAD_BLOCK, 1);
|
||||
// dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
|
||||
// Should we generalize the code to 1/2/4/8 threads per row?
|
||||
// And maybe adjust THREAD_BLOCK size?
|
||||
int shrMemSize,maxShmemSz;
|
||||
int numMp=getGPUMultiProcessors();
|
||||
int maxThMp=getGPUMaxThreadsPerMP();
|
||||
int nmblksMp=maxThMp/THREAD_BLOCK;
|
||||
int nmblk=nmblksMp*numMp;
|
||||
dim3 grid (nmblk);
|
||||
|
||||
maxShmemSz=getGPUSharedMemPerBlock();
|
||||
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
|
||||
if (shrMemSize > maxShmemSz) {
|
||||
fprintf(stderr,"Fatal error: SHMEM size too large %ld %ld\n",shrMemSize,maxShmemSz);
|
||||
return;
|
||||
}
|
||||
cnt = count;
|
||||
px = (VALUE_TYPE *) x;
|
||||
py = (VALUE_TYPE *) y;
|
||||
pz = (VALUE_TYPE *) z;
|
||||
while (cnt > 2*MMBSZ) {
|
||||
CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
|
||||
<<< grid, block, shrMemSize, handle->currentStream >>> (MMBSZ, pz, zPitch,py, yPitch,
|
||||
alpha, cM, rP, hackSize, hackOffsets,
|
||||
rS, rows, px, xPitch, beta, baseIndex);
|
||||
px += xPitch*MMBSZ;
|
||||
py += yPitch*MMBSZ;
|
||||
pz += zPitch*MMBSZ;
|
||||
cnt -= MMBSZ;
|
||||
}
|
||||
if (cnt > MMBSZ) {
|
||||
c1 = cnt/2;
|
||||
CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
|
||||
<<< grid, block, shrMemSize, handle->currentStream >>> (c1, pz, zPitch,py, yPitch,
|
||||
alpha, cM, rP, hackSize, hackOffsets,
|
||||
rS, rows, px, xPitch, beta, baseIndex);
|
||||
cnt -= c1;
|
||||
}
|
||||
if (cnt > MMBSZ) {
|
||||
fprintf(stderr,"Invalid residual count %d\n",cnt);
|
||||
} else if (cnt > 0){
|
||||
CONCAT(GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL), _krn)
|
||||
<<< grid, block, shrMemSize, handle->currentStream >>> (cnt, pz, zPitch,py, yPitch,
|
||||
alpha, cM, rP, hackSize, hackOffsets,
|
||||
rS, rows, px, xPitch, beta, baseIndex);
|
||||
}
|
||||
cudaCheckError("CUDA error on hell_spmm");
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void
|
||||
GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
|
||||
(spgpuHandle_t handle,
|
||||
int count,
|
||||
VALUE_TYPE* z,
|
||||
int zPitch,
|
||||
const VALUE_TYPE *y,
|
||||
int yPitch,
|
||||
VALUE_TYPE alpha,
|
||||
const VALUE_TYPE* cM,
|
||||
const int* rP,
|
||||
int hackSize,
|
||||
const __device int* hackOffsets,
|
||||
const __device int* rS,
|
||||
const __device int* rIdx,
|
||||
int rows,
|
||||
const VALUE_TYPE *x,
|
||||
int xPitch,
|
||||
VALUE_TYPE beta,
|
||||
int baseIndex)
|
||||
{
|
||||
VALUE_TYPE *px,*py, *pz;
|
||||
int cnt;
|
||||
int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
|
||||
|
||||
// maxNForACall should be a multiple of hackSize
|
||||
maxNForACall = (maxNForACall/hackSize)*hackSize;
|
||||
int maxShmemSz;
|
||||
maxShmemSz=getGPUSharedMemPerBlock();
|
||||
//fprintf(stderr,"MaxSHmemSz %d \n",maxShmemSz);
|
||||
while (rows > maxNForACall) {//managing large vectors
|
||||
cnt = count;
|
||||
px = (VALUE_TYPE *) x;
|
||||
py = (VALUE_TYPE *) y;
|
||||
pz = (VALUE_TYPE *) z;
|
||||
while (cnt > MMBSZ) {
|
||||
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
|
||||
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
|
||||
py, yPitch,
|
||||
alpha, cM, rP,
|
||||
hackSize, hackOffsets,
|
||||
rS, rIdx,
|
||||
maxNForACall,
|
||||
px, xPitch, beta, baseIndex);
|
||||
px += xPitch*MMBSZ;
|
||||
py += yPitch*MMBSZ;
|
||||
pz += zPitch*MMBSZ;
|
||||
cnt -= MMBSZ;
|
||||
}
|
||||
if (cnt >0) {
|
||||
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
|
||||
py, yPitch,
|
||||
alpha, cM, rP,
|
||||
hackSize, hackOffsets,
|
||||
rS, rIdx,
|
||||
maxNForACall,
|
||||
px, xPitch, beta, baseIndex);
|
||||
}
|
||||
|
||||
y = y + maxNForACall;
|
||||
z = z + maxNForACall;
|
||||
hackOffsets = hackOffsets + maxNForACall/hackSize;
|
||||
rS = rS + maxNForACall;
|
||||
|
||||
rows -= maxNForACall;
|
||||
}
|
||||
cnt = count;
|
||||
px = (VALUE_TYPE *) x;
|
||||
py = (VALUE_TYPE *) y;
|
||||
pz = (VALUE_TYPE *) z;
|
||||
while (cnt > MMBSZ) {
|
||||
//fprintf(stderr,"counts %d %d %d : pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);
|
||||
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, py, yPitch,
|
||||
alpha, cM, rP, hackSize, hackOffsets,
|
||||
rS, rIdx, rows,
|
||||
px, xPitch, beta, baseIndex);
|
||||
px += xPitch*MMBSZ;
|
||||
py += yPitch*MMBSZ;
|
||||
pz += zPitch*MMBSZ;
|
||||
cnt -= MMBSZ;
|
||||
}
|
||||
if (cnt >0) {
|
||||
CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
|
||||
py, yPitch,
|
||||
alpha, cM, rP,
|
||||
hackSize, hackOffsets,
|
||||
rS, rIdx,
|
||||
rows,
|
||||
px, xPitch, beta, baseIndex);
|
||||
}
|
||||
|
||||
cudaCheckError("CUDA error on hell_spmm");
|
||||
}
|
@ -0,0 +1,194 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2015
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
#define IDX2
|
||||
#define THREAD_BLOCK 128
|
||||
#define MMBSZ 8
|
||||
|
||||
#if 0
|
||||
|
||||
/*
 * Disabled (#if 0) grid-stride variant of the HLL multi-vector kernel.
 *
 * Each thread starts at its global index and advances by the total number of
 * threads in the grid until it runs past `rows`.  For every row it
 * accumulates, in shared memory, the products of the row's stored entries
 * with `count` columns of x, then writes
 *   z = beta*y + alpha*acc   (or z = alpha*acc when beta is zero).
 *
 * `count` must not exceed MMBSZ (first dimension of the accumulator).
 */
__global__ void
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
 VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
 int hackSize, const int* hackOffsets, const int* rS, int rows,
 const VALUE_TYPE *x, int xPitch,
 VALUE_TYPE beta, int baseIndex)
{
  __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];

  const unsigned int stride = gridDim.x * blockDim.x;
  const unsigned int tid = threadIdx.x;

  for (unsigned int i = threadIdx.x + blockIdx.x * blockDim.x; i < rows; i += stride) {
    int hackId = i / hackSize;
    int hackLaneId = i % hackSize;

    // Lane 0 of the warp loads the hack offset and broadcasts it.
    int hackOffset;
    if ((threadIdx.x % 32) == 0)
      hackOffset = hackOffsets[hackId];
    hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId;

    // Cursors over this row's column indices and coefficients.
    const int *idxCursor = rP + hackOffset;
    const VALUE_TYPE *valCursor = cM + hackOffset;

    const int nnzInRow = rS[i];

    for (int col = 0; col < count; col++)
      temp[col][tid] = CONCAT(zero_,VALUE_TYPE)();

    for (int e = 0; e < nnzInRow; e++) {
      const int xRow = idxCursor[0] - baseIndex;
      const VALUE_TYPE coeff = valCursor[0];
      idxCursor += hackSize;
      valCursor += hackSize;

      const VALUE_TYPE *xCol = x;
      for (int col = 0; col < count; col++) {
	const VALUE_TYPE xVal = xCol[xRow];
	temp[col][tid] = CONCAT(VALUE_TYPE, _fma)(coeff, xVal, temp[col][tid]);
	xCol += xPitch;
      }
    }

    // y is read before z is written at the same offset by the same thread,
    // so y and z may share a base address (in-place computation).
    const VALUE_TYPE *yCol = y;
    VALUE_TYPE *zCol = z;
    if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
      for (int col = 0; col < count; col++) {
	const VALUE_TYPE yVal = yCol[i];
	zCol[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[col][tid]));
	yCol += yPitch;
	zCol += zPitch;
      }
    } else {
      for (int col = 0; col < count; col++) {
	zCol[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[col][tid]);
	zCol += zPitch;
      }
    }
  }
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * HLL sparse matrix times dense multi-vector kernel:
 *   z(:,k) = beta * y(:,k) + alpha * A * x(:,k),   k = 0 .. count-1.
 *
 * One thread handles one matrix row i = threadIdx.x + blockIdx.x*THREAD_BLOCK
 * (threads with i >= rows do no row work).  For its row the thread walks the
 * rS[i] stored entries, which sit hackSize elements apart in cM (values) and
 * rP (column indices, offset by baseIndex), and accumulates one partial sum
 * per column in the shared array temp.
 *
 * Column k of x / y / z starts at offset k*xPitch / k*yPitch / k*zPitch.
 * When beta is zero y is not read.
 *
 * Requirements visible from the code:
 *   - count <= MMBSZ (first dimension of temp);
 *   - a 1-D block of THREAD_BLOCK threads (second dimension of temp and the
 *     stride used to build i).
 * NOTE(review): the hack offset is loaded by lane 0 and broadcast to the
 * warp, which presumes all rows served by one warp belong to the same hack
 * (hackSize a multiple of 32) -- confirm against the HLL builder.
 */
__global__ void
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
 VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
 int hackSize, const int* hackOffsets, const int* rS, int rows,
 const VALUE_TYPE *x, int xPitch,
 VALUE_TYPE beta, int baseIndex)
{
  __shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];

  int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);

  // Collect, while the whole warp is still executing together, the set of
  // lanes that own a valid row.  The shuffle below sits inside the divergent
  // branch, so its mask must name only the lanes that actually reach it.
  // Row indices grow with the lane id, hence lane 0 is part of the mask
  // whenever any lane is.
  const unsigned int rowMask = __ballot_sync(0xFFFFFFFF, i < rows);

  if (i < rows) {
    int hackId = i / hackSize;
    int hackLaneId = i % hackSize;

    int hackOffset = 0;
    unsigned int laneId = threadIdx.x % 32;
    if (laneId == 0)
      hackOffset = hackOffsets[hackId];
    hackOffset = __shfl_sync(rowMask, hackOffset, 0) + hackLaneId;

    // Position the (by-value) pointers on the first entry of row i.
    rP += hackOffset;
    cM += hackOffset;

    int rowSize = rS[i];
    for (int k=0; k<count; k++) {
      temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
    }

    for (int j = 0; j < rowSize; j++) {
      int pointer;
      VALUE_TYPE value;
      VALUE_TYPE fetch;

      pointer = rP[0] - baseIndex;
      rP += hackSize;

      value = cM[0];
      cM += hackSize;

      // Reuse the same matrix entry for every column of x.
      const VALUE_TYPE *px = x;
      for (int k=0; k<count; k++) {
	fetch = px[pointer];
	temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[k][threadIdx.x]);
	px = px + xPitch;
      }
    }

    // Since z and y are accessed with the same offset by the same thread,
    // and the write to z follows the y read, y and z can share the same base address (in-place computing).
    const VALUE_TYPE *py = y;
    VALUE_TYPE *pz = z;
    if (CONCAT(VALUE_TYPE, _isNotZero(beta))) {
      for (int k=0; k<count; k++) {
	VALUE_TYPE yVal = py[i];
	pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
	py += yPitch;
	pz += zPitch;
      }
    } else {
      for (int k=0; k<count; k++) {
	pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
	pz += zPitch;
      }
    }
  }
}
|
||||
|
||||
|
||||
void
|
||||
CONCAT(_,GEN_SPGPU_HELL_NAME(TYPE_SYMBOL))
|
||||
(spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y, int yPitch,
|
||||
VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int hackSize, const int* hackOffsets,
|
||||
const int* rS, const __device int* rIdx, int rows,
|
||||
const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta, int baseIndex)
|
||||
{
|
||||
dim3 block (THREAD_BLOCK, 1);
|
||||
dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
|
||||
// Should we generalize the code to 1/2/4/8 threads per row?
|
||||
// And maybe adjust THREAD_BLOCK size?
|
||||
int shrMemSize,maxShmemSz;
|
||||
maxShmemSz=getGPUSharedMemPerBlock();
|
||||
shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
|
||||
CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
|
||||
<<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch,y, yPitch,
|
||||
alpha, cM, rP, hackSize, hackOffsets, rS, rows,
|
||||
x, xPitch, beta, baseIndex);
|
||||
}
|
@ -0,0 +1,34 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2014
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#include "cudadebug.h"
|
||||
#include "cudalang.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "core.h"
|
||||
#include "hell.h"
|
||||
int getGPUSharedMemPerBlock();
|
||||
int getGPUMultiProcessors();
|
||||
int getGPUMaxThreadsPerMP();
|
||||
}
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
#define VALUE_TYPE float
|
||||
#define TYPE_SYMBOL S
|
||||
#define TEX_FETCH_TYPE float
|
||||
#include "hell_spmm_base.cuh"
|
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* spGPU - Sparse matrices on GPU library.
|
||||
*
|
||||
* Copyright (C) 2010 - 2014
|
||||
* Davide Barbieri - University of Rome Tor Vergata
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 3 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#include "cudadebug.h"
|
||||
#include "cudalang.h"
|
||||
#include "cuComplex.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "core.h"
|
||||
#include "hell.h"
|
||||
int getGPUSharedMemPerBlock();
|
||||
int getGPUMultiProcessors();
|
||||
int getGPUMaxThreadsPerMP();
|
||||
}
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
#define VALUE_TYPE cuDoubleComplex
|
||||
#define TYPE_SYMBOL Z
|
||||
#define TEX_FETCH_TYPE int4
|
||||
#include "hell_spmm_base.cuh"
|
@ -1,438 +0,0 @@
|
||||
!
|
||||
! Parallel Sparse BLAS version 3.5
|
||||
! (C) Copyright 2006-2018
|
||||
! Salvatore Filippone
|
||||
! Alfredo Buttari
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
!
|
||||
subroutine psb_cilu_fct(a,l,u,d,info,blck)

  !
  ! This routine copies and factors "on the fly" from A and BLCK
  ! into L/D/U.
  !
  ! Arguments:
  !   a     - input sparse matrix; its rows are the first rows of the
  !           matrix being factored.
  !   l, u  - CSR matrices receiving the strictly lower and strictly upper
  !           parts of the factors.  Their val/ja/irp components are written
  !           in place and are not allocated here.
  !   d     - on exit, the reciprocals of the pivots.
  !   info  - psb_success_ on success, an error code otherwise.
  !   blck  - optional matrix whose rows follow those of A.  When absent an
  !           empty matrix is created locally and released before returning.
  !
  use psb_base_mod
  implicit none
  !     .. Scalar Arguments ..
  integer(psb_ipk_), intent(out)                      :: info
  !     .. Array Arguments ..
  type(psb_cspmat_type), intent(in)                   :: a
  type(psb_c_csr_sparse_mat), intent(inout)           :: l,u
  type(psb_cspmat_type), intent(in), optional, target :: blck
  complex(psb_spk_), intent(inout)                    :: d(:)
  !     .. Local Scalars ..
  integer(psb_ipk_) :: l1, l2, m, err_act
  type(psb_cspmat_type), pointer :: blck_
  character(len=20) :: name, ch_err

  name='psb_ilu_fct'
  info = psb_success_
  call psb_erractionsave(err_act)
  ! Give the pointer a defined status so the error path can test it.
  nullify(blck_)

  if (present(blck)) then
    blck_ => blck
  else
    allocate(blck_,stat=info)
    if (info /= psb_success_) then
      call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate')
      goto 9999
    end if

    call blck_%csall(izero,izero,info,ione)
    if (info /= psb_success_) then
      info=psb_err_from_subroutine_
      ch_err='csall'
      call psb_errpush(info,name,a_err=ch_err)
      goto 9999
    end if
  endif

  call psb_cilu_fctint(m,a%get_nrows(),a,blck_%get_nrows(),blck_,&
       & d,l%val,l%ja,l%irp,u%val,u%ja,u%irp,l1,l2,info)
  if (info /= psb_success_) then
    info=psb_err_from_subroutine_
    ch_err='psb_cilu_fctint'
    call psb_errpush(info,name,a_err=ch_err)
    goto 9999
  end if

  call l%set_triangle()
  call l%set_lower()
  call l%set_unit()
  call u%set_triangle()
  call u%set_upper()
  call u%set_unit()
  call l%set_nrows(m)
  call l%set_ncols(m)
  call u%set_nrows(m)
  call u%set_ncols(m)

  if (present(blck)) then
    blck_ => null()
  else
    call blck_%free()
    deallocate(blck_)
  endif

  call psb_erractionrestore(err_act)
  return

9999 continue
  ! Release the locally created block, if any, so it does not leak on error.
  if (.not.present(blck)) then
    if (associated(blck_)) then
      call blck_%free()
      deallocate(blck_)
    end if
  end if
  call psb_erractionrestore(err_act)
  if (err_act == psb_act_abort_) then
    call psb_error()
    return
  end if
  return

contains

  !
  ! Incomplete factorization kernel.  Rows 1..ma are taken from A and rows
  ! ma+1..ma+mb from B; each row is first split into its lower part, its
  ! diagonal entry and its upper part, then eliminated against the rows
  ! already processed.
  !
  !   m            - set to ma+mb.
  !   d            - reciprocals of the pivots.
  !   laspk/lia1/lia2 - values, column indices, row pointers of the lower part.
  !   uaspk/uia1/uia2 - values, column indices, row pointers of the upper part.
  !   l1, l2       - number of entries stored in the lower / upper part.
  !   info         - left untouched when an error is already pending on
  !                  entry; otherwise psb_success_ or an error code.
  !
  subroutine psb_cilu_fctint(m,ma,a,mb,b,&
       & d,laspk,lia1,lia2,uaspk,uia1,uia2,l1,l2,info)
    implicit none

    type(psb_cspmat_type), intent(in) :: a,b
    integer(psb_ipk_), intent(in)     :: ma,mb
    integer(psb_ipk_), intent(out)    :: m,l1,l2
    integer(psb_ipk_), intent(inout)  :: info
    integer(psb_ipk_), dimension(:), intent(inout) :: lia1,lia2,uia1,uia2
    complex(psb_spk_), dimension(:), intent(inout) :: laspk,uaspk,d

    integer(psb_ipk_) :: i,j,k,lcol,low1,low2,kk,jj,ll,ktrw,err_act,nz
    complex(psb_spk_) :: dia,temp
    type(psb_c_coo_sparse_mat) :: trw
    integer(psb_ipk_) :: int_err(5)
    character(len=20) :: name, ch_err

    name='psb_cilu_fctint'
    if(psb_get_errstatus() /= 0) return
    info=psb_success_
    call psb_erractionsave(err_act)
    call trw%allocate(izero,izero,ione)

    int_err = izero
    ktrw    = 1
    nz      = 0
    lia2(1) = 1
    uia2(1) = 1
    l1=0
    l2=0
    m = ma+mb

    do i = 1, m
      d(i) = czero

      ! Split row i into lower part, diagonal and upper part.  Rows beyond
      ! ma come from B and are addressed there by their local index i-ma.
      if (i <= ma) then
        call psb_cilu_copyin(i,ma,a,i,m,l1,lia1,laspk,&
             & d(i),l2,uia1,uaspk,ktrw,nz,trw,info)
      else
        call psb_cilu_copyin(i-ma,mb,b,i,m,l1,lia1,laspk,&
             & d(i),l2,uia1,uaspk,ktrw,nz,trw,info)
      end if
      if (info /= psb_success_) then
        info=psb_err_from_subroutine_
        ch_err='a%csget'
        call psb_errpush(info,name,a_err=ch_err)
        goto 9999
      end if

      lia2(i+1) = l1 + 1
      uia2(i+1) = l2 + 1

      dia = d(i)
      do kk = lia2(i), lia2(i+1) - 1
        !
        !     compute element alo(i,k) of incomplete factorization
        !
        temp      = laspk(kk)
        k         = lia1(kk)
        laspk(kk) = temp*d(k)
        !     update the rest of row i using alo(i,k)
        low1 = kk + 1
        low2 = uia2(i)
        updateloop: do jj = uia2(k), uia2(k+1) - 1
          j = uia1(jj)
          !
          if (j < i) then
            !     search alo(i,*) for matching index J
            do ll = low1, lia2(i+1) - 1
              lcol = lia1(ll)
              if (lcol > j) then
                low1 = ll
                exit
              else if (lcol == j) then
                laspk(ll) = laspk(ll) - temp*uaspk(jj)
                low1 = ll + 1
                cycle updateloop
              end if
            enddo
            !
          else if (j == i) then
            !     j=i update diagonal
            dia = dia - temp*uaspk(jj)
            cycle updateloop
            !
          else if (j > i) then
            !     search aup(i,*) for matching index j
            do ll = low2, uia2(i+1) - 1
              lcol = uia1(ll)
              if (lcol > j) then
                low2 = ll
                exit
              else if (lcol == j) then
                uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
                low2 = ll + 1
                cycle updateloop
              end if
            enddo
          end if
          !
          !     for milu al=1.; for ilu al=0.
          !     al = 1.d0
          !     dia = dia - al*temp*aup(jj)
        enddo updateloop
      enddo
      !
      !     Non singularity
      !
      if (abs(dia) < s_epstol) then
        !
        !     Pivot too small: unstable factorization
        !
        info = psb_err_pivot_too_small_
        int_err(1) = i
        write(ch_err,'(g20.10)') abs(dia)
        call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
        goto 9999
      else
        dia = cone/dia
      end if
      d(i) = dia
      !     Scale row i of upper triangle
      do kk = uia2(i), uia2(i+1) - 1
        uaspk(kk) = uaspk(kk)*dia
      enddo
    enddo

    call trw%free()

    call psb_erractionrestore(err_act)
    return

9999 continue
    call psb_erractionrestore(err_act)
    if (err_act == psb_act_abort_) then
      call psb_error()
      return
    end if
    return
  end subroutine psb_cilu_fctint

  !
  ! Copies row irow of mat (a matrix with nrows rows) into the factors as
  ! global row i of an m x m problem: entries with column index in 1..i-1 are
  ! appended to the lower part, the entry with column index i goes to diag,
  ! entries with column index in i+1..m are appended to the upper part; any
  ! other entry is dropped.
  !
  ! CSR storage is read directly.  Any other storage is read through csget in
  ! blocks of up to nrb rows held in trw; ktrw is the read position in that
  ! buffer and nz the number of entries it holds.  A new block is fetched
  ! whenever irow is the first row of a block, so the caller must present the
  ! rows of a given matrix in increasing order starting from row 1.
  !
  ! info is psb_success_ unless csget reports an error.
  !
  subroutine psb_cilu_copyin(irow,nrows,mat,i,m,l1,lia1,laspk,&
       & diag,l2,uia1,uaspk,ktrw,nz,trw,info)
    implicit none

    integer(psb_ipk_), intent(in)             :: irow,nrows,i,m
    type(psb_cspmat_type), intent(in)         :: mat
    integer(psb_ipk_), intent(inout)          :: l1,l2,ktrw,nz
    integer(psb_ipk_), intent(inout)          :: lia1(:),uia1(:)
    complex(psb_spk_), intent(inout)          :: laspk(:),uaspk(:),diag
    type(psb_c_coo_sparse_mat), intent(inout) :: trw
    integer(psb_ipk_), intent(out)            :: info

    integer(psb_ipk_), parameter :: nrb=60
    integer(psb_ipk_) :: j,k,irb

    info = psb_success_

    select type(aa => mat%a)
    type is (psb_c_csr_sparse_mat)
      do j = aa%irp(irow), aa%irp(irow+1) - 1
        k = aa%ja(j)
        if ((k < i).and.(k >= 1)) then
          l1 = l1 + 1
          laspk(l1) = aa%val(j)
          lia1(l1) = k
        else if (k == i) then
          diag = aa%val(j)
        else if ((k > i).and.(k <= m)) then
          l2 = l2 + 1
          uaspk(l2) = aa%val(j)
          uia1(l2) = k
        end if
      enddo

    class default
      if ((mod(irow,nrb) == 1).or.(nrb == 1)) then
        ! First row of a block: fetch the next irb rows of mat.
        irb = min(nrows-irow+1,nrb)
        call aa%csget(irow,irow+irb-1,trw,info)
        if (info /= psb_success_) return
        nz   = trw%get_nzeros()
        ktrw = 1
      end if

      do
        if (ktrw > nz) exit
        ! Stop at the first buffered entry belonging to a later row.
        if (trw%ia(ktrw) > irow) exit
        k = trw%ja(ktrw)
        if ((k < i).and.(k >= 1)) then
          l1 = l1 + 1
          laspk(l1) = trw%val(ktrw)
          lia1(l1) = k
        else if (k == i) then
          diag = trw%val(ktrw)
        else if ((k > i).and.(k <= m)) then
          l2 = l2 + 1
          uaspk(l2) = trw%val(ktrw)
          uia1(l2) = k
        end if
        ktrw = ktrw + 1
      enddo
    end select

  end subroutine psb_cilu_copyin

end subroutine psb_cilu_fct
|
@ -1,441 +0,0 @@
|
||||
!
|
||||
! Parallel Sparse BLAS version 3.5
|
||||
! (C) Copyright 2006-2018
|
||||
! Salvatore Filippone
|
||||
! Alfredo Buttari
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
!
|
||||
subroutine psb_dilu_fct(a,l,u,d,info,blck)
|
||||
|
||||
!
|
||||
! This routine copies and factors "on the fly" from A and BLCK
|
||||
! into L/D/U.
|
||||
!
|
||||
!
|
||||
use psb_base_mod
|
||||
implicit none
|
||||
! .. Scalar Arguments ..
|
||||
integer(psb_ipk_), intent(out) :: info
|
||||
! .. Array Arguments ..
|
||||
type(psb_dspmat_type),intent(in) :: a
|
||||
type(psb_d_csr_sparse_mat),intent(inout) :: l,u
|
||||
type(psb_dspmat_type),intent(in), optional, target :: blck
|
||||
real(psb_dpk_), intent(inout) :: d(:)
|
||||
! .. Local Scalars ..
|
||||
integer(psb_ipk_) :: l1,l2,m,err_act
|
||||
type(psb_dspmat_type), pointer :: blck_
|
||||
character(len=20) :: name, ch_err
|
||||
name='psb_ilu_fct'
|
||||
info = psb_success_
|
||||
call psb_erractionsave(err_act)
|
||||
! .. Executable Statements ..
|
||||
!
|
||||
|
||||
if (present(blck)) then
|
||||
blck_ => blck
|
||||
else
|
||||
allocate(blck_,stat=info)
|
||||
if (info /= psb_success_) then
|
||||
call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate')
|
||||
goto 9999
|
||||
end if
|
||||
|
||||
call blck_%csall(izero,izero,info,ione)
|
||||
|
||||
endif
|
||||
|
||||
call psb_dilu_fctint(m,a%get_nrows(),a,blck_%get_nrows(),blck_,&
|
||||
& d,l%val,l%ja,l%irp,u%val,u%ja,u%irp,l1,l2,info)
|
||||
if(info /= psb_success_) then
|
||||
info=psb_err_from_subroutine_
|
||||
ch_err='psb_dilu_fctint'
|
||||
call psb_errpush(info,name,a_err=ch_err)
|
||||
goto 9999
|
||||
end if
|
||||
|
||||
call l%set_triangle()
|
||||
call l%set_lower()
|
||||
call l%set_unit()
|
||||
call u%set_triangle()
|
||||
call u%set_upper()
|
||||
call u%set_unit()
|
||||
call l%set_nrows(m)
|
||||
call l%set_ncols(m)
|
||||
call u%set_nrows(m)
|
||||
call u%set_ncols(m)
|
||||
|
||||
if (present(blck)) then
|
||||
blck_ => null()
|
||||
else
|
||||
call blck_%free()
|
||||
if(info /= psb_success_) then
|
||||
info=psb_err_from_subroutine_
|
||||
ch_err='psb_sp_free'
|
||||
call psb_errpush(info,name,a_err=ch_err)
|
||||
goto 9999
|
||||
end if
|
||||
deallocate(blck_)
|
||||
endif
|
||||
|
||||
call psb_erractionrestore(err_act)
|
||||
return
|
||||
|
||||
9999 continue
|
||||
call psb_erractionrestore(err_act)
|
||||
if (err_act == psb_act_abort_) then
|
||||
call psb_error()
|
||||
return
|
||||
end if
|
||||
return
|
||||
|
||||
contains
|
||||
subroutine psb_dilu_fctint(m,ma,a,mb,b,&
     & d,laspk,lia1,lia2,uaspk,uia1,uia2,l1,l2,info)
  !
  ! Incomplete LU factorization "on the fly" of the matrix obtained by
  ! stacking the ma rows of A on top of the mb rows of B.
  !
  ! For every row i the entries are first copied into the L (col < i),
  ! D (col == i) and U (col > i) storage, then the row is eliminated
  ! against the previously factored rows, the pivot is checked against
  ! d_epstol, inverted and stored in d(i), and row i of U is scaled by it.
  !
  ! Arguments:
  !   m            - set to ma+mb
  !   ma, a        - number of rows of A and the matrix itself
  !   mb, b        - number of rows of B and the matrix itself
  !   d            - receives the inverted pivots
  !   laspk,lia1,lia2 - values / column indices / row pointers of L (CSR)
  !   uaspk,uia1,uia2 - values / column indices / row pointers of U (CSR)
  !   l1, l2       - number of entries stored in L and in U
  !   info         - psb_success_ or an error code
  !
  ! The output arrays are written in place and never resized here.
  ! If an error is already pending on entry the routine returns at once
  ! without touching any argument.
  !
  use psb_mat_mod

  implicit none

  type(psb_dspmat_type), intent(in), target :: a
  type(psb_dspmat_type), intent(in), target :: b
  integer(psb_ipk_), intent(in)    :: ma,mb
  integer(psb_ipk_), intent(inout) :: m,l1,l2,info
  integer(psb_ipk_), dimension(:), intent(inout) :: lia1,lia2,uia1,uia2
  real(psb_dpk_), dimension(:), intent(inout)    :: laspk,uaspk,d

  integer(psb_ipk_) :: i, ktrw, err_act, nz
  real(psb_dpk_)    :: dia
  type(psb_d_coo_sparse_mat) :: trw
  integer(psb_ipk_) :: int_err(5)
  character(len=20) :: name, ch_err

  name='psb_dilu_fctint'
  if(psb_get_errstatus() /= 0) return
  info=psb_success_
  call psb_erractionsave(err_act)

  ! Row buffer used only when a matrix is not stored in CSR format
  call trw%allocate(izero,izero,ione)

  int_err = 0
  ktrw    = 1
  nz      = 0
  lia2(1) = 1
  uia2(1) = 1
  l1 = 0
  l2 = 0
  m  = ma+mb

  do i = 1, m
    !
    ! Copy in global row i: rows 1..ma come from A, the others from B,
    ! where they are found at local position i-ma.
    !
    if (i <= ma) then
      call psb_dilu_copyin(i,ma,a,i,m,d,laspk,lia1,uaspk,uia1,&
           & l1,l2,ktrw,nz,trw,info)
    else
      call psb_dilu_copyin(i-ma,mb,b,i,m,d,laspk,lia1,uaspk,uia1,&
           & l1,l2,ktrw,nz,trw,info)
    end if
    if(info /= psb_success_) then
      info=psb_err_from_subroutine_
      ch_err='a%csget'
      call psb_errpush(info,name,a_err=ch_err)
      goto 9999
    end if

    lia2(i+1) = l1 + 1
    uia2(i+1) = l2 + 1

    call psb_dilu_elimrow(i,d,laspk,lia1,lia2,uaspk,uia1,uia2,dia)

    !
    ! Non singularity
    !
    if (abs(dia) < d_epstol) then
      !
      ! Pivot too small: unstable factorization
      !
      info = psb_err_pivot_too_small_
      int_err(1) = i
      write(ch_err,'(g20.10)') dia
      call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
      goto 9999
    else
      dia = done/dia
    end if
    d(i) = dia

    ! Scale row i of upper triangle
    uaspk(uia2(i):uia2(i+1)-1) = uaspk(uia2(i):uia2(i+1)-1)*dia
  enddo

  call trw%free()

  call psb_erractionrestore(err_act)
  return

9999 continue
  call psb_erractionrestore(err_act)
  if (err_act == psb_act_abort_) then
    call psb_error()
    return
  end if
  return
end subroutine psb_dilu_fctint

subroutine psb_dilu_copyin(irow,mrow,mat,i,m,d,laspk,lia1,uaspk,uia1,&
     & l1,l2,ktrw,nz,trw,info)
  !
  ! Copy local row irow of mat (which has mrow rows) into the L/D/U
  ! storage as global row i. Column indices are compared against the
  ! global row index i; columns outside 1..m are dropped.
  !
  ! When mat is not CSR, rows are fetched nrb at a time into trw through
  ! csget; ktrw is the read cursor into trw and nz its number of entries.
  ! Both are kept by the caller across calls. Buffering is driven by the
  ! LOCAL row index, so that it also works for the second matrix.
  ! NOTE(review): the scan assumes csget returns entries grouped by
  ! increasing row - confirm against the csget implementation in use.
  !
  use psb_mat_mod

  implicit none

  integer(psb_ipk_), intent(in)     :: irow,mrow,i,m
  type(psb_dspmat_type), intent(in) :: mat
  real(psb_dpk_), intent(inout)     :: d(:),laspk(:),uaspk(:)
  integer(psb_ipk_), intent(inout)  :: lia1(:),uia1(:)
  integer(psb_ipk_), intent(inout)  :: l1,l2,ktrw,nz
  type(psb_d_coo_sparse_mat), intent(inout) :: trw
  integer(psb_ipk_), intent(out)    :: info

  integer(psb_ipk_), parameter :: nrb=60
  integer(psb_ipk_) :: j,k,irb

  info = psb_success_
  d(i) = dzero

  select type(aa => mat%a)
  type is (psb_d_csr_sparse_mat)
    do j = aa%irp(irow), aa%irp(irow+1) - 1
      k = aa%ja(j)
      if ((k < i).and.(k >= 1)) then
        l1 = l1 + 1
        laspk(l1) = aa%val(j)
        lia1(l1) = k
      else if (k == i) then
        d(i) = aa%val(j)
      else if ((k > i).and.(k <= m)) then
        l2 = l2 + 1
        uaspk(l2) = aa%val(j)
        uia1(l2) = k
      end if
    enddo

  class default

    if ((mod(irow,nrb) == 1).or.(nrb == 1)) then
      ! Refill the buffer with the next block of (at most nrb) rows
      irb = min(mrow-irow+1,nrb)
      call aa%csget(irow,irow+irb-1,trw,info)
      if (info /= psb_success_) return
      nz   = trw%get_nzeros()
      ktrw = 1
    end if

    do
      if (ktrw > nz ) exit
      if (trw%ia(ktrw) > irow) exit
      k = trw%ja(ktrw)
      if ((k < i).and.(k >= 1)) then
        l1 = l1 + 1
        laspk(l1) = trw%val(ktrw)
        lia1(l1) = k
      else if (k == i) then
        d(i) = trw%val(ktrw)
      else if ((k > i).and.(k <= m)) then
        l2 = l2 + 1
        uaspk(l2) = trw%val(ktrw)
        uia1(l2) = k
      end if
      ktrw = ktrw + 1
    enddo
  end select

end subroutine psb_dilu_copyin

subroutine psb_dilu_elimrow(i,d,laspk,lia1,lia2,uaspk,uia1,uia2,dia)
  !
  ! Eliminate row i against the already factored rows k < i.
  ! On entry d(i) is the raw diagonal entry and d(k), k<i, the inverted
  ! pivots; on exit dia is the updated (not yet inverted) pivot of row i
  ! and the entries of row i in laspk/uaspk have been updated in place.
  ! Only positions already present in row i are updated (no fill-in).
  !
  implicit none

  integer(psb_ipk_), intent(in) :: i
  real(psb_dpk_), intent(in)    :: d(:)
  integer(psb_ipk_), intent(in) :: lia1(:),lia2(:),uia1(:),uia2(:)
  real(psb_dpk_), intent(inout) :: laspk(:),uaspk(:)
  real(psb_dpk_), intent(out)   :: dia

  integer(psb_ipk_) :: j,k,lc,low1,low2,kk,jj,ll
  real(psb_dpk_)    :: temp

  dia = d(i)
  do kk = lia2(i), lia2(i+1) - 1
    !
    ! compute element alo(i,k) of incomplete factorization
    !
    temp      = laspk(kk)
    k         = lia1(kk)
    laspk(kk) = temp*d(k)
    ! update the rest of row i using alo(i,k)
    low1 = kk + 1
    low2 = uia2(i)
    updateloop: do jj = uia2(k), uia2(k+1) - 1
      j = uia1(jj)
      if (j < i) then
        ! search alo(i,*) for matching index j
        do ll = low1, lia2(i+1) - 1
          lc = lia1(ll)
          if (lc > j) then
            low1 = ll
            exit
          else if (lc == j) then
            laspk(ll) = laspk(ll) - temp*uaspk(jj)
            low1 = ll + 1
            cycle updateloop
          end if
        enddo
      else if (j == i) then
        ! j=i update diagonal
        dia = dia - temp*uaspk(jj)
        cycle updateloop
      else if (j > i) then
        ! search aup(i,*) for matching index j
        do ll = low2, uia2(i+1) - 1
          lc = uia1(ll)
          if (lc > j) then
            low2 = ll
            exit
          else if (lc == j) then
            uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
            low2 = ll + 1
            cycle updateloop
          end if
        enddo
      end if
      !
      ! for milu al=1.; for ilu al=0.
      ! al = 1.d0
      ! dia = dia - al*temp*aup(jj)
    enddo updateloop
  enddo

end subroutine psb_dilu_elimrow
|
||||
end subroutine psb_dilu_fct
|
@ -1,440 +0,0 @@
|
||||
!
|
||||
! Parallel Sparse BLAS version 3.5
|
||||
! (C) Copyright 2006-2018
|
||||
! Salvatore Filippone
|
||||
! Alfredo Buttari
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
!
|
||||
subroutine psb_silu_fct(a,l,u,d,info,blck)

  !
  ! This routine copies and factors "on the fly" from A and BLCK
  ! into L/D/U.
  !
  ! The rows of A come first, followed by the rows of BLCK when it is
  ! present; the factors have m = nrows(A) + nrows(BLCK) rows and columns.
  !
  ! Arguments:
  !   a     - input matrix
  !   l, u  - CSR storage receiving the strictly lower / strictly upper
  !           factors; their val, ja and irp arrays are written in place
  !           and are not resized here, so they must already be large
  !           enough on entry
  !   d     - receives the inverted pivots
  !   info  - psb_success_ on success, an error code otherwise
  !   blck  - optional additional rows; when absent an empty matrix is
  !           created locally and released before returning
  !
  use psb_base_mod
  implicit none
  !     .. Scalar Arguments ..
  integer(psb_ipk_), intent(out) :: info
  !     .. Array Arguments ..
  type(psb_sspmat_type),intent(in) :: a
  type(psb_s_csr_sparse_mat),intent(inout) :: l,u
  type(psb_sspmat_type),intent(in), optional, target :: blck
  real(psb_spk_), intent(inout) :: d(:)
  !     .. Local Scalars ..
  integer(psb_ipk_) :: l1,l2,m,err_act,ierr
  type(psb_sspmat_type), pointer :: blck_
  character(len=20) :: name, ch_err

  name='psb_ilu_fct'
  info = psb_success_
  call psb_erractionsave(err_act)
  !     .. Executable Statements ..
  !
  ! Start from a defined pointer state so the error exit can test it
  nullify(blck_)

  if (present(blck)) then
    blck_ => blck
  else
    allocate(blck_,stat=info)
    if (info /= psb_success_) then
      call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate')
      goto 9999
    end if

    call blck_%csall(izero,izero,info,ione)
    if (info /= psb_success_) then
      info=psb_err_from_subroutine_
      ch_err='csall'
      call psb_errpush(info,name,a_err=ch_err)
      goto 9999
    end if
  endif

  call psb_silu_fctint(m,a%get_nrows(),a,blck_%get_nrows(),blck_,&
       & d,l%val,l%ja,l%irp,u%val,u%ja,u%irp,l1,l2,info)
  if(info /= psb_success_) then
    info=psb_err_from_subroutine_
    ch_err='psb_silu_fctint'
    call psb_errpush(info,name,a_err=ch_err)
    goto 9999
  end if

  call l%set_triangle()
  call l%set_lower()
  call l%set_unit()
  call u%set_triangle()
  call u%set_upper()
  call u%set_unit()
  call l%set_nrows(m)
  call l%set_ncols(m)
  call u%set_nrows(m)
  call u%set_ncols(m)

  if (present(blck)) then
    blck_ => null()
  else
    ! free() has no status argument, so there is nothing to check here
    call blck_%free()
    deallocate(blck_)
  endif

  call psb_erractionrestore(err_act)
  return

9999 continue
  ! Release the locally created block, if any, before leaving
  if (.not.present(blck)) then
    if (associated(blck_)) then
      call blck_%free()
      deallocate(blck_,stat=ierr)
    end if
  end if
  call psb_erractionrestore(err_act)
  if (err_act == psb_act_abort_) then
    call psb_error()
    return
  end if
  return

contains

  subroutine psb_silu_fctint(m,ma,a,mb,b,&
       & d,laspk,lia1,lia2,uaspk,uia1,uia2,l1,l2,info)
    !
    ! Incomplete LU factorization "on the fly" of the matrix obtained by
    ! stacking the ma rows of A on top of the mb rows of B.
    !
    ! For every row i the entries are first copied into the L (col < i),
    ! D (col == i) and U (col > i) storage, then the row is eliminated
    ! against the previously factored rows, the pivot is checked against
    ! s_epstol, inverted and stored in d(i), and row i of U is scaled.
    !
    ! m is set to ma+mb; l1 and l2 are the number of entries stored in
    ! L and U. If an error is already pending on entry the routine
    ! returns at once without touching any argument.
    !
    use psb_mat_mod

    implicit none

    type(psb_sspmat_type), intent(in) :: a
    type(psb_sspmat_type), intent(in) :: b
    integer(psb_ipk_), intent(in)    :: ma,mb
    integer(psb_ipk_), intent(inout) :: m,l1,l2,info
    integer(psb_ipk_), dimension(:), intent(inout) :: lia1,lia2,uia1,uia2
    real(psb_spk_), dimension(:), intent(inout)    :: laspk,uaspk,d

    integer(psb_ipk_) :: i, ktrw, err_act, nz
    real(psb_spk_)    :: dia
    type(psb_s_coo_sparse_mat) :: trw
    integer(psb_ipk_) :: int_err(5)
    character(len=20) :: name, ch_err

    name='psb_silu_fctint'
    if(psb_get_errstatus() /= 0) return
    info=psb_success_
    call psb_erractionsave(err_act)

    ! Row buffer used only when a matrix is not stored in CSR format
    call trw%allocate(izero,izero,ione)

    int_err = 0
    ktrw    = 1
    nz      = 0
    lia2(1) = 1
    uia2(1) = 1
    l1 = 0
    l2 = 0
    m  = ma+mb

    do i = 1, m
      !
      ! Copy in global row i: rows 1..ma come from A, the others from B,
      ! where they are found at local position i-ma.
      !
      if (i <= ma) then
        call psb_silu_copyin(i,ma,a,i,m,d,laspk,lia1,uaspk,uia1,&
             & l1,l2,ktrw,nz,trw,info)
      else
        call psb_silu_copyin(i-ma,mb,b,i,m,d,laspk,lia1,uaspk,uia1,&
             & l1,l2,ktrw,nz,trw,info)
      end if
      if(info /= psb_success_) then
        info=psb_err_from_subroutine_
        ch_err='a%csget'
        call psb_errpush(info,name,a_err=ch_err)
        goto 9999
      end if

      lia2(i+1) = l1 + 1
      uia2(i+1) = l2 + 1

      call psb_silu_elimrow(i,d,laspk,lia1,lia2,uaspk,uia1,uia2,dia)

      !
      ! Non singularity
      !
      if (abs(dia) < s_epstol) then
        !
        ! Pivot too small: unstable factorization
        !
        info = psb_err_pivot_too_small_
        int_err(1) = i
        write(ch_err,'(g20.10)') dia
        call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
        goto 9999
      else
        dia = sone/dia
      end if
      d(i) = dia

      ! Scale row i of upper triangle
      uaspk(uia2(i):uia2(i+1)-1) = uaspk(uia2(i):uia2(i+1)-1)*dia
    enddo

    call trw%free()

    call psb_erractionrestore(err_act)
    return

9999 continue
    call psb_erractionrestore(err_act)
    if (err_act == psb_act_abort_) then
      call psb_error()
      return
    end if
    return
  end subroutine psb_silu_fctint

  subroutine psb_silu_copyin(irow,mrow,mat,i,m,d,laspk,lia1,uaspk,uia1,&
       & l1,l2,ktrw,nz,trw,info)
    !
    ! Copy local row irow of mat (which has mrow rows) into the L/D/U
    ! storage as global row i. Column indices are compared against the
    ! global row index i; columns outside 1..m are dropped.
    !
    ! When mat is not CSR, rows are fetched nrb at a time into trw through
    ! csget; ktrw is the read cursor into trw and nz its number of entries.
    ! Both are kept by the caller across calls. Buffering is driven by the
    ! LOCAL row index, so that it also works for the second matrix.
    ! NOTE(review): the scan assumes csget returns entries grouped by
    ! increasing row - confirm against the csget implementation in use.
    !
    use psb_mat_mod

    implicit none

    integer(psb_ipk_), intent(in)     :: irow,mrow,i,m
    type(psb_sspmat_type), intent(in) :: mat
    real(psb_spk_), intent(inout)     :: d(:),laspk(:),uaspk(:)
    integer(psb_ipk_), intent(inout)  :: lia1(:),uia1(:)
    integer(psb_ipk_), intent(inout)  :: l1,l2,ktrw,nz
    type(psb_s_coo_sparse_mat), intent(inout) :: trw
    integer(psb_ipk_), intent(out)    :: info

    integer(psb_ipk_), parameter :: nrb=60
    integer(psb_ipk_) :: j,k,irb

    info = psb_success_
    d(i) = szero

    select type(aa => mat%a)
    type is (psb_s_csr_sparse_mat)
      do j = aa%irp(irow), aa%irp(irow+1) - 1
        k = aa%ja(j)
        if ((k < i).and.(k >= 1)) then
          l1 = l1 + 1
          laspk(l1) = aa%val(j)
          lia1(l1) = k
        else if (k == i) then
          d(i) = aa%val(j)
        else if ((k > i).and.(k <= m)) then
          l2 = l2 + 1
          uaspk(l2) = aa%val(j)
          uia1(l2) = k
        end if
      enddo

    class default

      if ((mod(irow,nrb) == 1).or.(nrb == 1)) then
        ! Refill the buffer with the next block of (at most nrb) rows
        irb = min(mrow-irow+1,nrb)
        call aa%csget(irow,irow+irb-1,trw,info)
        if (info /= psb_success_) return
        nz   = trw%get_nzeros()
        ktrw = 1
      end if

      do
        if (ktrw > nz ) exit
        if (trw%ia(ktrw) > irow) exit
        k = trw%ja(ktrw)
        if ((k < i).and.(k >= 1)) then
          l1 = l1 + 1
          laspk(l1) = trw%val(ktrw)
          lia1(l1) = k
        else if (k == i) then
          d(i) = trw%val(ktrw)
        else if ((k > i).and.(k <= m)) then
          l2 = l2 + 1
          uaspk(l2) = trw%val(ktrw)
          uia1(l2) = k
        end if
        ktrw = ktrw + 1
      enddo
    end select

  end subroutine psb_silu_copyin

  subroutine psb_silu_elimrow(i,d,laspk,lia1,lia2,uaspk,uia1,uia2,dia)
    !
    ! Eliminate row i against the already factored rows k < i.
    ! On entry d(i) is the raw diagonal entry and d(k), k<i, the inverted
    ! pivots; on exit dia is the updated (not yet inverted) pivot of row i
    ! and the entries of row i in laspk/uaspk have been updated in place.
    ! Only positions already present in row i are updated (no fill-in).
    !
    implicit none

    integer(psb_ipk_), intent(in) :: i
    real(psb_spk_), intent(in)    :: d(:)
    integer(psb_ipk_), intent(in) :: lia1(:),lia2(:),uia1(:),uia2(:)
    real(psb_spk_), intent(inout) :: laspk(:),uaspk(:)
    real(psb_spk_), intent(out)   :: dia

    integer(psb_ipk_) :: j,k,lc,low1,low2,kk,jj,ll
    real(psb_spk_)    :: temp

    dia = d(i)
    do kk = lia2(i), lia2(i+1) - 1
      !
      ! compute element alo(i,k) of incomplete factorization
      !
      temp      = laspk(kk)
      k         = lia1(kk)
      laspk(kk) = temp*d(k)
      ! update the rest of row i using alo(i,k)
      low1 = kk + 1
      low2 = uia2(i)
      updateloop: do jj = uia2(k), uia2(k+1) - 1
        j = uia1(jj)
        if (j < i) then
          ! search alo(i,*) for matching index j
          do ll = low1, lia2(i+1) - 1
            lc = lia1(ll)
            if (lc > j) then
              low1 = ll
              exit
            else if (lc == j) then
              laspk(ll) = laspk(ll) - temp*uaspk(jj)
              low1 = ll + 1
              cycle updateloop
            end if
          enddo
        else if (j == i) then
          ! j=i update diagonal
          dia = dia - temp*uaspk(jj)
          cycle updateloop
        else if (j > i) then
          ! search aup(i,*) for matching index j
          do ll = low2, uia2(i+1) - 1
            lc = uia1(ll)
            if (lc > j) then
              low2 = ll
              exit
            else if (lc == j) then
              uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
              low2 = ll + 1
              cycle updateloop
            end if
          enddo
        end if
        !
        ! for milu al=1.; for ilu al=0.
        ! al = 1.d0
        ! dia = dia - al*temp*aup(jj)
      enddo updateloop
    enddo

  end subroutine psb_silu_elimrow

end subroutine psb_silu_fct
|
@ -1,438 +0,0 @@
|
||||
!
|
||||
! Parallel Sparse BLAS version 3.5
|
||||
! (C) Copyright 2006-2018
|
||||
! Salvatore Filippone
|
||||
! Alfredo Buttari
|
||||
!
|
||||
! Redistribution and use in source and binary forms, with or without
|
||||
! modification, are permitted provided that the following conditions
|
||||
! are met:
|
||||
! 1. Redistributions of source code must retain the above copyright
|
||||
! notice, this list of conditions and the following disclaimer.
|
||||
! 2. Redistributions in binary form must reproduce the above copyright
|
||||
! notice, this list of conditions, and the following disclaimer in the
|
||||
! documentation and/or other materials provided with the distribution.
|
||||
! 3. The name of the PSBLAS group or the names of its contributors may
|
||||
! not be used to endorse or promote products derived from this
|
||||
! software without specific written permission.
|
||||
!
|
||||
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS
|
||||
! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
! POSSIBILITY OF SUCH DAMAGE.
|
||||
!
|
||||
!
|
||||
subroutine psb_zilu_fct(a,l,u,d,info,blck)

  !
  ! This routine copies and factors "on the fly" from A and BLCK
  ! into L/D/U.
  !
  ! The rows of A come first, followed by the rows of BLCK when it is
  ! present; the factors have m = nrows(A) + nrows(BLCK) rows and columns.
  !
  ! Arguments:
  !   a     - input matrix
  !   l, u  - CSR storage receiving the strictly lower / strictly upper
  !           factors; their val, ja and irp arrays are written in place
  !           and are not resized here, so they must already be large
  !           enough on entry
  !   d     - receives the inverted pivots
  !   info  - psb_success_ on success, an error code otherwise
  !   blck  - optional additional rows; when absent an empty matrix is
  !           created locally and released before returning
  !
  use psb_base_mod
  implicit none
  !     .. Scalar Arguments ..
  integer(psb_ipk_), intent(out) :: info
  !     .. Array Arguments ..
  type(psb_zspmat_type),intent(in) :: a
  type(psb_z_csr_sparse_mat),intent(inout) :: l,u
  type(psb_zspmat_type),intent(in), optional, target :: blck
  complex(psb_dpk_), intent(inout) :: d(:)
  !     .. Local Scalars ..
  integer(psb_ipk_) :: l1, l2,m,err_act,ierr
  type(psb_zspmat_type), pointer :: blck_
  character(len=20) :: name, ch_err

  name='psb_ilu_fct'
  info = psb_success_
  call psb_erractionsave(err_act)
  !     .. Executable Statements ..
  !
  ! Start from a defined pointer state so the error exit can test it
  nullify(blck_)

  if (present(blck)) then
    blck_ => blck
  else
    allocate(blck_,stat=info)
    if (info /= psb_success_) then
      call psb_errpush(psb_err_from_subroutine_,name,a_err='Allocate')
      goto 9999
    end if

    call blck_%csall(izero,izero,info,ione)
    if (info /= psb_success_) then
      info=psb_err_from_subroutine_
      ch_err='csall'
      call psb_errpush(info,name,a_err=ch_err)
      goto 9999
    end if
  endif

  call psb_zilu_fctint(m,a%get_nrows(),a,blck_%get_nrows(),blck_,&
       & d,l%val,l%ja,l%irp,u%val,u%ja,u%irp,l1,l2,info)
  if(info /= psb_success_) then
    info=psb_err_from_subroutine_
    ch_err='psb_zilu_fctint'
    call psb_errpush(info,name,a_err=ch_err)
    goto 9999
  end if

  call l%set_triangle()
  call l%set_lower()
  call l%set_unit()
  call u%set_triangle()
  call u%set_upper()
  call u%set_unit()
  call l%set_nrows(m)
  call l%set_ncols(m)
  call u%set_nrows(m)
  call u%set_ncols(m)

  if (present(blck)) then
    blck_ => null()
  else
    ! free() has no status argument, so there is nothing to check here
    call blck_%free()
    deallocate(blck_)
  endif

  call psb_erractionrestore(err_act)
  return

9999 continue
  ! Release the locally created block, if any, before leaving
  if (.not.present(blck)) then
    if (associated(blck_)) then
      call blck_%free()
      deallocate(blck_,stat=ierr)
    end if
  end if
  call psb_erractionrestore(err_act)
  if (err_act == psb_act_abort_) then
    call psb_error()
    return
  end if
  return

contains

  subroutine psb_zilu_fctint(m,ma,a,mb,b,&
       & d,laspk,lia1,lia2,uaspk,uia1,uia2,l1,l2,info)
    !
    ! Incomplete LU factorization "on the fly" of the matrix obtained by
    ! stacking the ma rows of A on top of the mb rows of B.
    !
    ! For every row i the entries are first copied into the L (col < i),
    ! D (col == i) and U (col > i) storage, then the row is eliminated
    ! against the previously factored rows, the modulus of the pivot is
    ! checked against d_epstol, the pivot is inverted and stored in d(i),
    ! and row i of U is scaled.
    !
    ! m is set to ma+mb; l1 and l2 are the number of entries stored in
    ! L and U. If an error is already pending on entry the routine
    ! returns at once without touching any argument.
    !
    implicit none

    type(psb_zspmat_type), intent(in) :: a,b
    integer(psb_ipk_), intent(in)    :: ma,mb
    integer(psb_ipk_), intent(inout) :: m,l1,l2,info
    integer(psb_ipk_), dimension(:), intent(inout) :: lia1,lia2,uia1,uia2
    complex(psb_dpk_), dimension(:), intent(inout) :: laspk,uaspk,d

    integer(psb_ipk_) :: i, ktrw, err_act, nz
    complex(psb_dpk_) :: dia
    type(psb_z_coo_sparse_mat) :: trw
    integer(psb_ipk_) :: int_err(5)
    character(len=20) :: name, ch_err

    name='psb_zilu_fctint'
    if(psb_get_errstatus() /= 0) return
    info=psb_success_
    call psb_erractionsave(err_act)

    ! Row buffer used only when a matrix is not stored in CSR format
    call trw%allocate(izero,izero,ione)

    int_err = 0
    ktrw    = 1
    nz      = 0
    lia2(1) = 1
    uia2(1) = 1
    l1 = 0
    l2 = 0
    m  = ma+mb

    do i = 1, m
      !
      ! Copy in global row i: rows 1..ma come from A, the others from B,
      ! where they are found at local position i-ma.
      !
      if (i <= ma) then
        call psb_zilu_copyin(i,ma,a,i,m,d,laspk,lia1,uaspk,uia1,&
             & l1,l2,ktrw,nz,trw,info)
      else
        call psb_zilu_copyin(i-ma,mb,b,i,m,d,laspk,lia1,uaspk,uia1,&
             & l1,l2,ktrw,nz,trw,info)
      end if
      if(info /= psb_success_) then
        info=psb_err_from_subroutine_
        ch_err='a%csget'
        call psb_errpush(info,name,a_err=ch_err)
        goto 9999
      end if

      lia2(i+1) = l1 + 1
      uia2(i+1) = l2 + 1

      call psb_zilu_elimrow(i,d,laspk,lia1,lia2,uaspk,uia1,uia2,dia)

      !
      ! Non singularity
      !
      if (abs(dia) < d_epstol) then
        !
        ! Pivot too small: unstable factorization
        !
        info = psb_err_pivot_too_small_
        int_err(1) = i
        write(ch_err,'(g20.10)') abs(dia)
        call psb_errpush(info,name,i_err=int_err,a_err=ch_err)
        goto 9999
      else
        dia = zone/dia
      end if
      d(i) = dia

      ! Scale row i of upper triangle
      uaspk(uia2(i):uia2(i+1)-1) = uaspk(uia2(i):uia2(i+1)-1)*dia
    enddo

    call trw%free()

    call psb_erractionrestore(err_act)
    return

9999 continue
    call psb_erractionrestore(err_act)
    if (err_act == psb_act_abort_) then
      call psb_error()
      return
    end if
    return
  end subroutine psb_zilu_fctint

  subroutine psb_zilu_copyin(irow,mrow,mat,i,m,d,laspk,lia1,uaspk,uia1,&
       & l1,l2,ktrw,nz,trw,info)
    !
    ! Copy local row irow of mat (which has mrow rows) into the L/D/U
    ! storage as global row i. Column indices are compared against the
    ! global row index i; columns outside 1..m are dropped.
    !
    ! When mat is not CSR, rows are fetched nrb at a time into trw through
    ! csget; ktrw is the read cursor into trw and nz its number of entries.
    ! Both are kept by the caller across calls. Buffering is driven by the
    ! LOCAL row index, so that it also works for the second matrix.
    ! NOTE(review): the scan assumes csget returns entries grouped by
    ! increasing row - confirm against the csget implementation in use.
    !
    implicit none

    integer(psb_ipk_), intent(in)     :: irow,mrow,i,m
    type(psb_zspmat_type), intent(in) :: mat
    complex(psb_dpk_), intent(inout)  :: d(:),laspk(:),uaspk(:)
    integer(psb_ipk_), intent(inout)  :: lia1(:),uia1(:)
    integer(psb_ipk_), intent(inout)  :: l1,l2,ktrw,nz
    type(psb_z_coo_sparse_mat), intent(inout) :: trw
    integer(psb_ipk_), intent(out)    :: info

    integer(psb_ipk_), parameter :: nrb=60
    integer(psb_ipk_) :: j,k,irb

    info = psb_success_
    d(i) = zzero

    select type(aa => mat%a)
    type is (psb_z_csr_sparse_mat)
      do j = aa%irp(irow), aa%irp(irow+1) - 1
        k = aa%ja(j)
        if ((k < i).and.(k >= 1)) then
          l1 = l1 + 1
          laspk(l1) = aa%val(j)
          lia1(l1) = k
        else if (k == i) then
          d(i) = aa%val(j)
        else if ((k > i).and.(k <= m)) then
          l2 = l2 + 1
          uaspk(l2) = aa%val(j)
          uia1(l2) = k
        end if
      enddo

    class default

      if ((mod(irow,nrb) == 1).or.(nrb == 1)) then
        ! Refill the buffer with the next block of (at most nrb) rows
        irb = min(mrow-irow+1,nrb)
        call aa%csget(irow,irow+irb-1,trw,info)
        if (info /= psb_success_) return
        nz   = trw%get_nzeros()
        ktrw = 1
      end if

      do
        if (ktrw > nz ) exit
        if (trw%ia(ktrw) > irow) exit
        k = trw%ja(ktrw)
        if ((k < i).and.(k >= 1)) then
          l1 = l1 + 1
          laspk(l1) = trw%val(ktrw)
          lia1(l1) = k
        else if (k == i) then
          d(i) = trw%val(ktrw)
        else if ((k > i).and.(k <= m)) then
          l2 = l2 + 1
          uaspk(l2) = trw%val(ktrw)
          uia1(l2) = k
        end if
        ktrw = ktrw + 1
      enddo
    end select

  end subroutine psb_zilu_copyin

  subroutine psb_zilu_elimrow(i,d,laspk,lia1,lia2,uaspk,uia1,uia2,dia)
    !
    ! Eliminate row i against the already factored rows k < i.
    ! On entry d(i) is the raw diagonal entry and d(k), k<i, the inverted
    ! pivots; on exit dia is the updated (not yet inverted) pivot of row i
    ! and the entries of row i in laspk/uaspk have been updated in place.
    ! Only positions already present in row i are updated (no fill-in).
    !
    implicit none

    integer(psb_ipk_), intent(in)    :: i
    complex(psb_dpk_), intent(in)    :: d(:)
    integer(psb_ipk_), intent(in)    :: lia1(:),lia2(:),uia1(:),uia2(:)
    complex(psb_dpk_), intent(inout) :: laspk(:),uaspk(:)
    complex(psb_dpk_), intent(out)   :: dia

    integer(psb_ipk_) :: j,k,lc,low1,low2,kk,jj,ll
    complex(psb_dpk_) :: temp

    dia = d(i)
    do kk = lia2(i), lia2(i+1) - 1
      !
      ! compute element alo(i,k) of incomplete factorization
      !
      temp      = laspk(kk)
      k         = lia1(kk)
      laspk(kk) = temp*d(k)
      ! update the rest of row i using alo(i,k)
      low1 = kk + 1
      low2 = uia2(i)
      updateloop: do jj = uia2(k), uia2(k+1) - 1
        j = uia1(jj)
        if (j < i) then
          ! search alo(i,*) for matching index j
          do ll = low1, lia2(i+1) - 1
            lc = lia1(ll)
            if (lc > j) then
              low1 = ll
              exit
            else if (lc == j) then
              laspk(ll) = laspk(ll) - temp*uaspk(jj)
              low1 = ll + 1
              cycle updateloop
            end if
          enddo
        else if (j == i) then
          ! j=i update diagonal
          dia = dia - temp*uaspk(jj)
          cycle updateloop
        else if (j > i) then
          ! search aup(i,*) for matching index j
          do ll = low2, uia2(i+1) - 1
            lc = uia1(ll)
            if (lc > j) then
              low2 = ll
              exit
            else if (lc == j) then
              uaspk(ll) = uaspk(ll) - temp*uaspk(jj)
              low2 = ll + 1
              cycle updateloop
            end if
          enddo
        end if
        !
        ! for milu al=1.; for ilu al=0.
        ! al = 1.d0
        ! dia = dia - al*temp*aup(jj)
      enddo updateloop
    enddo

  end subroutine psb_zilu_elimrow

end subroutine psb_zilu_fct
|
Loading…
Reference in New Issue