diff --git a/cuda/impl/Makefile b/cuda/impl/Makefile old mode 100755 new mode 100644 index 12bf0747..0f8536b8 --- a/cuda/impl/Makefile +++ b/cuda/impl/Makefile @@ -275,19 +275,26 @@ psb_d_cuda_cp_hdiag_from_coo.o \ psb_d_cuda_mv_hdiag_from_coo.o \ psb_d_cuda_hdiag_to_gpu.o \ psb_d_cuda_hdiag_csmv.o \ +psb_d_cuda_hdiag_csmm.o \ psb_d_cuda_hdiag_mold.o \ psb_d_cuda_hdiag_vect_mv.o \ psb_s_cuda_cp_hdiag_from_coo.o \ psb_s_cuda_mv_hdiag_from_coo.o \ psb_s_cuda_hdiag_to_gpu.o \ psb_s_cuda_hdiag_csmv.o \ +psb_s_cuda_hdiag_csmm.o \ psb_s_cuda_hdiag_mold.o \ psb_s_cuda_hdiag_vect_mv.o \ psb_s_cuda_dnsg_mat_impl.o \ psb_d_cuda_dnsg_mat_impl.o \ psb_c_cuda_dnsg_mat_impl.o \ -psb_z_cuda_dnsg_mat_impl.o - +psb_z_cuda_dnsg_mat_impl.o \ +psb_z_cuda_hdiag_csmv.o \ +psb_z_cuda_hdiag_csmm.o \ +psb_z_cuda_hdiag_vect_mv.o \ +psb_c_cuda_hdiag_csmv.o \ +psb_c_cuda_hdiag_csmm.o \ +psb_c_cuda_hdiag_vect_mv.o objs: $(OBJS) lib: objs diff --git a/cuda/impl/psb_c_cuda_hdiag_csmm.F90 b/cuda/impl/psb_c_cuda_hdiag_csmm.F90 new file mode 100644 index 00000000..75bd59d9 --- /dev/null +++ b/cuda/impl/psb_c_cuda_hdiag_csmm.F90 @@ -0,0 +1,123 @@ +! Parallel Sparse BLAS GPU plugin +! (C) Copyright 2013 +! +! Salvatore Filippone +! Alessandro Fanfarillo +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! + +subroutine psb_c_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans) + + use psb_base_mod + use hdiagdev_mod + use psb_vectordev_mod + use psb_c_cuda_hdiag_mat_mod, psb_protect_name => psb_c_cuda_hdiag_csmm + implicit none + class(psb_c_cuda_hdiag_sparse_mat), intent(in) :: a + complex(psb_spk_), intent(in) :: alpha, beta, x(:,:) + complex(psb_spk_), intent(inout) :: y(:,:) + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + + character :: trans_ + integer(psb_ipk_) :: i,j,k,m,n, nnz, ir, jc, nxy + complex(psb_spk_), allocatable :: acc(:) + type(c_ptr) :: gpX, gpY + logical :: tra + Integer(Psb_ipk_) :: err_act + character(len=20) :: name='c_cuda_hdiag_csmm' + logical, parameter :: debug=.false. + + info = psb_success_ + call psb_erractionsave(err_act) + + if (present(trans)) then + trans_ = trans + else + trans_ = 'N' + end if + + if (.not.a%is_asb()) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info,name) + goto 9999 + endif + tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C') + + if (tra) then + m = a%get_ncols() + n = a%get_nrows() + else + n = a%get_ncols() + m = a%get_nrows() + end if + + if (size(x,1) psb_c_cuda_hdiag_multivect_mv + use psb_c_cuda_multivect_mod + implicit none + class(psb_c_cuda_hdiag_mat_mod), intent(in) :: a + complex(psb_spk_), intent(in) :: alpha, beta + class(psb_c_base_multivect_type), intent(inout) :: x + class(psb_c_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + complex(psb_spk_), allocatable :: rx(:,:), ry(:,:) + logical :: tra + character :: trans_ + Integer(Psb_ipk_) :: err_act + character(len=20) :: name='c_cuda_hdiag_multivect_mv' + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(trans)) then + trans_ = trans + else + trans_ = 'N' + end if + + if (.not.a%is_asb()) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info,name) + goto 9999 + endif + + + tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C') + if (tra) then + if (a%is_dev()) call a%sync() + if (.not.x%is_host()) call x%sync() + if (beta /= dzero) then + if (.not.y%is_host()) call y%sync() + end if + call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans) + call y%set_host() + else + if (a%is_host()) call a%sync() + select type (xx => x) + type is (psb_c_multivect_cuda) + select type(yy => y) + type is (psb_c_multivect_cuda) + if (a%is_host()) call a%sync() + if (xx%is_host()) call xx%sync() + if (beta /= dzero) then + if (yy%is_host()) call yy%sync() + end if + info = spmmHdiagDevice(a%deviceMat,alpha,xx%deviceVect,& + & beta,yy%deviceVect) + if (info /= 0) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='spmmHDIAGDevice',i_err=(/info,izero,izero,izero,izero/)) + info = psb_err_from_subroutine_ai_ + goto 9999 + end if + call yy%set_dev() + class default + if (a%is_dev()) call a%sync() + rx = xx%get_vect() + ry = y%get_vect() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + class default + if (a%is_dev()) call a%sync() + rx = x%get_vect() + ry = y%get_vect() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + + end if + if (info /= 0) goto 9999 + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_c_cuda_hdiag_multivect_mv diff --git a/cuda/impl/psb_d_cuda_hdiag_csmm.F90 b/cuda/impl/psb_d_cuda_hdiag_csmm.F90 new file mode 100644 index 00000000..2629db74 --- /dev/null +++ b/cuda/impl/psb_d_cuda_hdiag_csmm.F90 @@ -0,0 +1,123 @@ +! Parallel Sparse BLAS GPU plugin +! (C) Copyright 2013 +! +! Salvatore Filippone +! Alessandro Fanfarillo +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! + +subroutine psb_d_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans) + + use psb_base_mod + use hdiagdev_mod + use psb_vectordev_mod + use psb_d_cuda_hdiag_mat_mod, psb_protect_name => psb_d_cuda_hdiag_csmm + implicit none + class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a + real(psb_dpk_), intent(in) :: alpha, beta, x(:,:) + real(psb_dpk_), intent(inout) :: y(:,:) + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + + character :: trans_ + integer(psb_ipk_) :: i,j,k,m,n, nnz, ir, jc, nxy + real(psb_dpk_), allocatable :: acc(:) + type(c_ptr) :: gpX, gpY + logical :: tra + Integer(Psb_ipk_) :: err_act + character(len=20) :: name='d_cuda_hdiag_csmm' + logical, parameter :: debug=.false. + + info = psb_success_ + call psb_erractionsave(err_act) + + if (present(trans)) then + trans_ = trans + else + trans_ = 'N' + end if + + if (.not.a%is_asb()) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info,name) + goto 9999 + endif + tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C') + + if (tra) then + m = a%get_ncols() + n = a%get_nrows() + else + n = a%get_ncols() + m = a%get_nrows() + end if + + if (size(x,1) psb_d_cuda_hdiag_multivect_mv + use psb_d_cuda_multivect_mod + implicit none + class(psb_d_cuda_hdiag_mat_mod), intent(in) :: a + real(psb_dpk_), intent(in) :: alpha, beta + class(psb_d_base_multivect_type), intent(inout) :: x + class(psb_d_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + real(psb_dpk_), allocatable :: rx(:,:), ry(:,:) + logical :: tra + character :: trans_ + Integer(Psb_ipk_) :: err_act + character(len=20) :: name='d_cuda_hdiag_multivect_mv' + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(trans)) then + trans_ = trans + else + trans_ = 'N' + end if + + if (.not.a%is_asb()) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info,name) + goto 9999 + endif + + + tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C') + if (tra) then + if (a%is_dev()) call a%sync() + if (.not.x%is_host()) call x%sync() + if (beta /= dzero) then + if (.not.y%is_host()) call y%sync() + end if + call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans) + call y%set_host() + else + if (a%is_host()) call a%sync() + select type (xx => x) + type is (psb_d_multivect_cuda) + select type(yy => y) + type is (psb_d_multivect_cuda) + if (a%is_host()) call a%sync() + if (xx%is_host()) call xx%sync() + if (beta /= dzero) then + if (yy%is_host()) call yy%sync() + end if + info = spmmHdiagDevice(a%deviceMat,alpha,xx%deviceVect,& + & beta,yy%deviceVect) + if (info /= 0) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='spmmHDIAGDevice',i_err=(/info,izero,izero,izero,izero/)) + info = psb_err_from_subroutine_ai_ + goto 9999 + end if + call yy%set_dev() + class default + if (a%is_dev()) call a%sync() + rx = xx%get_vect() + ry = y%get_vect() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + class default + if (a%is_dev()) call a%sync() + rx = x%get_vect() + ry = y%get_vect() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + + end if + if (info /= 0) goto 9999 + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_d_cuda_hdiag_multivect_mv diff --git a/cuda/impl/psb_s_cuda_hdiag_csmm.F90 b/cuda/impl/psb_s_cuda_hdiag_csmm.F90 new file mode 100644 index 00000000..7066daf4 --- /dev/null +++ b/cuda/impl/psb_s_cuda_hdiag_csmm.F90 @@ -0,0 +1,123 @@ +! Parallel Sparse BLAS GPU plugin +! (C) Copyright 2013 +! +! Salvatore Filippone +! Alessandro Fanfarillo +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! + +subroutine psb_s_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans) + + use psb_base_mod + use hdiagdev_mod + use psb_vectordev_mod + use psb_s_cuda_hdiag_mat_mod, psb_protect_name => psb_s_cuda_hdiag_csmm + implicit none + class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a + real(psb_spk_), intent(in) :: alpha, beta, x(:,:) + real(psb_spk_), intent(inout) :: y(:,:) + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + + character :: trans_ + integer(psb_ipk_) :: i,j,k,m,n, nnz, ir, jc, nxy + real(psb_spk_), allocatable :: acc(:) + type(c_ptr) :: gpX, gpY + logical :: tra + Integer(Psb_ipk_) :: err_act + character(len=20) :: name='s_cuda_hdiag_csmm' + logical, parameter :: debug=.false. + + info = psb_success_ + call psb_erractionsave(err_act) + + if (present(trans)) then + trans_ = trans + else + trans_ = 'N' + end if + + if (.not.a%is_asb()) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info,name) + goto 9999 + endif + tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C') + + if (tra) then + m = a%get_ncols() + n = a%get_nrows() + else + n = a%get_ncols() + m = a%get_nrows() + end if + + if (size(x,1) psb_s_cuda_hdiag_multivect_mv + use psb_s_cuda_multivect_mod + implicit none + class(psb_s_cuda_hdiag_mat_mod), intent(in) :: a + real(psb_spk_), intent(in) :: alpha, beta + class(psb_s_base_multivect_type), intent(inout) :: x + class(psb_s_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + real(psb_spk_), allocatable :: rx(:,:), ry(:,:) + logical :: tra + character :: trans_ + Integer(Psb_ipk_) :: err_act + character(len=20) :: name='s_cuda_hdiag_multivect_mv' + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(trans)) then + trans_ = trans + else + trans_ = 'N' + end if + + if (.not.a%is_asb()) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info,name) + goto 9999 + endif + + + tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C') + if (tra) then + if (a%is_dev()) call a%sync() + if (.not.x%is_host()) call x%sync() + if (beta /= dzero) then + if (.not.y%is_host()) call y%sync() + end if + call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans) + call y%set_host() + else + if (a%is_host()) call a%sync() + select type (xx => x) + type is (psb_s_multivect_cuda) + select type(yy => y) + type is (psb_s_multivect_cuda) + if (a%is_host()) call a%sync() + if (xx%is_host()) call xx%sync() + if (beta /= dzero) then + if (yy%is_host()) call yy%sync() + end if + info = spmmHdiagDevice(a%deviceMat,alpha,xx%deviceVect,& + & beta,yy%deviceVect) + if (info /= 0) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='spmmHDIAGDevice',i_err=(/info,izero,izero,izero,izero/)) + info = psb_err_from_subroutine_ai_ + goto 9999 + end if + call yy%set_dev() + class default + if (a%is_dev()) call a%sync() + rx = xx%get_vect() + ry = y%get_vect() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + class default + if (a%is_dev()) call a%sync() + rx = x%get_vect() + ry = y%get_vect() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + + end if + if (info /= 0) goto 9999 + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_s_cuda_hdiag_multivect_mv diff --git a/cuda/impl/psb_z_cuda_hdiag_csmm.F90 b/cuda/impl/psb_z_cuda_hdiag_csmm.F90 new file mode 100644 index 00000000..5d5dc700 --- /dev/null +++ b/cuda/impl/psb_z_cuda_hdiag_csmm.F90 @@ -0,0 +1,123 @@ +! Parallel Sparse BLAS GPU plugin +! (C) Copyright 2013 +! +! Salvatore Filippone +! Alessandro Fanfarillo +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! 1. Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! 2. Redistributions in binary form must reproduce the above copyright +! notice, this list of conditions, and the following disclaimer in the +! documentation and/or other materials provided with the distribution. +! 3. The name of the PSBLAS group or the names of its contributors may +! not be used to endorse or promote products derived from this +! software without specific written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +! TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +! PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PSBLAS GROUP OR ITS CONTRIBUTORS +! BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. +! + +subroutine psb_z_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans) + + use psb_base_mod + use hdiagdev_mod + use psb_vectordev_mod + use psb_z_cuda_hdiag_mat_mod, psb_protect_name => psb_z_cuda_hdiag_csmm + implicit none + class(psb_z_cuda_hdiag_sparse_mat), intent(in) :: a + complex(psb_dpk_), intent(in) :: alpha, beta, x(:,:) + complex(psb_dpk_), intent(inout) :: y(:,:) + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + + character :: trans_ + integer(psb_ipk_) :: i,j,k,m,n, nnz, ir, jc, nxy + complex(psb_dpk_), allocatable :: acc(:) + type(c_ptr) :: gpX, gpY + logical :: tra + Integer(Psb_ipk_) :: err_act + character(len=20) :: name='z_cuda_hdiag_csmm' + logical, parameter :: debug=.false. + + info = psb_success_ + call psb_erractionsave(err_act) + + if (present(trans)) then + trans_ = trans + else + trans_ = 'N' + end if + + if (.not.a%is_asb()) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info,name) + goto 9999 + endif + tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C') + + if (tra) then + m = a%get_ncols() + n = a%get_nrows() + else + n = a%get_ncols() + m = a%get_nrows() + end if + + if (size(x,1) psb_z_cuda_hdiag_multivect_mv + use psb_z_cuda_multivect_mod + implicit none + class(psb_z_cuda_hdiag_mat_mod), intent(in) :: a + complex(psb_dpk_), intent(in) :: alpha, beta + class(psb_z_base_multivect_type), intent(inout) :: x + class(psb_z_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + complex(psb_dpk_), allocatable :: rx(:,:), ry(:,:) + logical :: tra + character :: trans_ + Integer(Psb_ipk_) :: err_act + character(len=20) :: name='z_cuda_hdiag_multivect_mv' + + call psb_erractionsave(err_act) + info = psb_success_ + + if (present(trans)) then + trans_ = trans + else + trans_ = 'N' + end if + + if (.not.a%is_asb()) then + info = psb_err_invalid_mat_state_ + call psb_errpush(info,name) + goto 9999 + endif + + + tra = (psb_toupper(trans_) == 'T').or.(psb_toupper(trans_)=='C') + if (tra) then + if (a%is_dev()) call a%sync() + if (.not.x%is_host()) call x%sync() + if (beta /= dzero) then + if (.not.y%is_host()) call y%sync() + end if + call a%psb_d_hdia_sparse_mat%spmm(alpha,x,beta,y,info,trans) + call y%set_host() + else + if (a%is_host()) call a%sync() + select type (xx => x) + type is (psb_z_multivect_cuda) + select type(yy => y) + type is (psb_z_multivect_cuda) + if (a%is_host()) call a%sync() + if (xx%is_host()) call xx%sync() + if (beta /= dzero) then + if (yy%is_host()) call yy%sync() + end if + info = spmmHdiagDevice(a%deviceMat,alpha,xx%deviceVect,& + & beta,yy%deviceVect) + if (info /= 0) then + call psb_errpush(psb_err_from_subroutine_ai_,name,& + & a_err='spmmHDIAGDevice',i_err=(/info,izero,izero,izero,izero/)) + info = psb_err_from_subroutine_ai_ + goto 9999 + end if + call yy%set_dev() + class default + if (a%is_dev()) call a%sync() + rx = xx%get_vect() + ry = y%get_vect() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + class default + if (a%is_dev()) call a%sync() + rx = x%get_vect() + ry = y%get_vect() + call a%spmm(alpha,rx,beta,ry,info) + call y%bld(ry) + end select + + end if + if (info /= 0) goto 9999 + call psb_erractionrestore(err_act) + return + +9999 call psb_error_handler(err_act) + + return + +end subroutine psb_z_cuda_hdiag_multivect_mv diff --git a/cuda/psb_c_cuda_hdiag_mat_mod.F90 b/cuda/psb_c_cuda_hdiag_mat_mod.F90 index f06e501e..92539847 100644 --- a/cuda/psb_c_cuda_hdiag_mat_mod.F90 +++ b/cuda/psb_c_cuda_hdiag_mat_mod.F90 @@ -44,7 +44,8 @@ module psb_c_cuda_hdiag_mat_mod procedure, nopass :: get_fmt => c_cuda_hdiag_get_fmt ! procedure, pass(a) :: sizeof => c_cuda_hdiag_sizeof procedure, pass(a) :: vect_mv => psb_c_cuda_hdiag_vect_mv - ! procedure, pass(a) :: csmm => psb_c_cuda_hdiag_csmm + procedure, pass(a) :: multivect_mv => psb_c_cuda_hdiag_multivect_mv + procedure, pass(a) :: csmm => psb_c_cuda_hdiag_csmm procedure, pass(a) :: csmv => psb_c_cuda_hdiag_csmv ! procedure, pass(a) :: in_vect_sv => psb_c_cuda_hdiag_inner_vect_sv ! procedure, pass(a) :: scals => psb_c_cuda_hdiag_scals @@ -77,6 +78,15 @@ module psb_c_cuda_hdiag_mat_mod integer(psb_ipk_), intent(out) :: info character, optional, intent(in) :: trans end subroutine psb_c_cuda_hdiag_vect_mv + subroutine psb_c_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans) + import :: psb_c_cuda_hdiag_sparse_mat, psb_spk_, psb_c_base_multivect_type, psb_ipk_ + class(psb_c_cuda_hdiag_sparse_mat), intent(in) :: a + complex(psb_spk_), intent(in) :: alpha, beta + class(psb_c_base_multivect_type), intent(inout) :: x + class(psb_c_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_c_cuda_hdiag_multivect_mv end interface !!$ interface @@ -172,17 +182,17 @@ module psb_c_cuda_hdiag_mat_mod end subroutine psb_c_cuda_hdiag_csmv end interface -!!$ interface -!!$ subroutine psb_c_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans) -!!$ import :: psb_c_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_ -!!$ class(psb_c_cuda_hdiag_sparse_mat), intent(in) :: a -!!$ complex(psb_spk_), intent(in) :: alpha, beta, x(:,:) -!!$ complex(psb_spk_), intent(inout) :: y(:,:) -!!$ integer(psb_ipk_), intent(out) :: info -!!$ character, optional, intent(in) :: trans -!!$ end subroutine psb_c_cuda_hdiag_csmm -!!$ end interface -!!$ + interface + subroutine psb_c_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans) + import :: psb_c_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_ + class(psb_c_cuda_hdiag_sparse_mat), intent(in) :: a + complex(psb_spk_), intent(in) :: alpha, beta, x(:,:) + complex(psb_spk_), intent(inout) :: y(:,:) + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_c_cuda_hdiag_csmm + end interface + !!$ interface !!$ subroutine psb_c_cuda_hdiag_scal(d,a,info, side) !!$ import :: psb_c_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_ diff --git a/cuda/psb_d_cuda_hdiag_mat_mod.F90 b/cuda/psb_d_cuda_hdiag_mat_mod.F90 index 46b63b43..4862bf7f 100644 --- a/cuda/psb_d_cuda_hdiag_mat_mod.F90 +++ b/cuda/psb_d_cuda_hdiag_mat_mod.F90 @@ -44,7 +44,8 @@ module psb_d_cuda_hdiag_mat_mod procedure, nopass :: get_fmt => d_cuda_hdiag_get_fmt ! procedure, pass(a) :: sizeof => d_cuda_hdiag_sizeof procedure, pass(a) :: vect_mv => psb_d_cuda_hdiag_vect_mv - ! procedure, pass(a) :: csmm => psb_d_cuda_hdiag_csmm + procedure, pass(a) :: multivect_mv => psb_d_cuda_hdiag_multivect_mv + procedure, pass(a) :: csmm => psb_d_cuda_hdiag_csmm procedure, pass(a) :: csmv => psb_d_cuda_hdiag_csmv ! procedure, pass(a) :: in_vect_sv => psb_d_cuda_hdiag_inner_vect_sv ! procedure, pass(a) :: scals => psb_d_cuda_hdiag_scals @@ -77,6 +78,15 @@ module psb_d_cuda_hdiag_mat_mod integer(psb_ipk_), intent(out) :: info character, optional, intent(in) :: trans end subroutine psb_d_cuda_hdiag_vect_mv + subroutine psb_d_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans) + import :: psb_d_cuda_hdiag_sparse_mat, psb_dpk_, psb_d_base_multivect_type, psb_ipk_ + class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a + real(psb_dpk_), intent(in) :: alpha, beta + class(psb_d_base_multivect_type), intent(inout) :: x + class(psb_d_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_d_cuda_hdiag_multivect_mv end interface !!$ interface @@ -172,17 +182,17 @@ module psb_d_cuda_hdiag_mat_mod end subroutine psb_d_cuda_hdiag_csmv end interface -!!$ interface -!!$ subroutine psb_d_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans) -!!$ import :: psb_d_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_ -!!$ class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a -!!$ real(psb_dpk_), intent(in) :: alpha, beta, x(:,:) -!!$ real(psb_dpk_), intent(inout) :: y(:,:) -!!$ integer(psb_ipk_), intent(out) :: info -!!$ character, optional, intent(in) :: trans -!!$ end subroutine psb_d_cuda_hdiag_csmm -!!$ end interface -!!$ + interface + subroutine psb_d_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans) + import :: psb_d_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_ + class(psb_d_cuda_hdiag_sparse_mat), intent(in) :: a + real(psb_dpk_), intent(in) :: alpha, beta, x(:,:) + real(psb_dpk_), intent(inout) :: y(:,:) + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_d_cuda_hdiag_csmm + end interface + !!$ interface !!$ subroutine psb_d_cuda_hdiag_scal(d,a,info, side) !!$ import :: psb_d_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_ diff --git a/cuda/psb_s_cuda_hdiag_mat_mod.F90 b/cuda/psb_s_cuda_hdiag_mat_mod.F90 index cac72c86..8cc373ab 100644 --- a/cuda/psb_s_cuda_hdiag_mat_mod.F90 +++ b/cuda/psb_s_cuda_hdiag_mat_mod.F90 @@ -44,7 +44,8 @@ module psb_s_cuda_hdiag_mat_mod procedure, nopass :: get_fmt => s_cuda_hdiag_get_fmt ! procedure, pass(a) :: sizeof => s_cuda_hdiag_sizeof procedure, pass(a) :: vect_mv => psb_s_cuda_hdiag_vect_mv - ! procedure, pass(a) :: csmm => psb_s_cuda_hdiag_csmm + procedure, pass(a) :: multivect_mv => psb_s_cuda_hdiag_multivect_mv + procedure, pass(a) :: csmm => psb_s_cuda_hdiag_csmm procedure, pass(a) :: csmv => psb_s_cuda_hdiag_csmv ! procedure, pass(a) :: in_vect_sv => psb_s_cuda_hdiag_inner_vect_sv ! procedure, pass(a) :: scals => psb_s_cuda_hdiag_scals @@ -77,6 +78,15 @@ module psb_s_cuda_hdiag_mat_mod integer(psb_ipk_), intent(out) :: info character, optional, intent(in) :: trans end subroutine psb_s_cuda_hdiag_vect_mv + subroutine psb_s_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans) + import :: psb_s_cuda_hdiag_sparse_mat, psb_spk_, psb_s_base_multivect_type, psb_ipk_ + class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a + real(psb_spk_), intent(in) :: alpha, beta + class(psb_s_base_multivect_type), intent(inout) :: x + class(psb_s_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_s_cuda_hdiag_multivect_mv end interface !!$ interface @@ -172,17 +182,17 @@ module psb_s_cuda_hdiag_mat_mod end subroutine psb_s_cuda_hdiag_csmv end interface -!!$ interface -!!$ subroutine psb_s_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans) -!!$ import :: psb_s_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_ -!!$ class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a -!!$ real(psb_spk_), intent(in) :: alpha, beta, x(:,:) -!!$ real(psb_spk_), intent(inout) :: y(:,:) -!!$ integer(psb_ipk_), intent(out) :: info -!!$ character, optional, intent(in) :: trans -!!$ end subroutine psb_s_cuda_hdiag_csmm -!!$ end interface -!!$ + interface + subroutine psb_s_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans) + import :: psb_s_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_ + class(psb_s_cuda_hdiag_sparse_mat), intent(in) :: a + real(psb_spk_), intent(in) :: alpha, beta, x(:,:) + real(psb_spk_), intent(inout) :: y(:,:) + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_s_cuda_hdiag_csmm + end interface + !!$ interface !!$ subroutine psb_s_cuda_hdiag_scal(d,a,info, side) !!$ import :: psb_s_cuda_hdiag_sparse_mat, psb_spk_, psb_ipk_ diff --git a/cuda/psb_z_cuda_hdiag_mat_mod.F90 b/cuda/psb_z_cuda_hdiag_mat_mod.F90 index 70897664..a4a3ea7e 100644 --- a/cuda/psb_z_cuda_hdiag_mat_mod.F90 +++ b/cuda/psb_z_cuda_hdiag_mat_mod.F90 @@ -44,7 +44,8 @@ module psb_z_cuda_hdiag_mat_mod procedure, nopass :: get_fmt => z_cuda_hdiag_get_fmt ! procedure, pass(a) :: sizeof => z_cuda_hdiag_sizeof procedure, pass(a) :: vect_mv => psb_z_cuda_hdiag_vect_mv - ! procedure, pass(a) :: csmm => psb_z_cuda_hdiag_csmm + procedure, pass(a) :: multivect_mv => psb_z_cuda_hdiag_multivect_mv + procedure, pass(a) :: csmm => psb_z_cuda_hdiag_csmm procedure, pass(a) :: csmv => psb_z_cuda_hdiag_csmv ! procedure, pass(a) :: in_vect_sv => psb_z_cuda_hdiag_inner_vect_sv ! procedure, pass(a) :: scals => psb_z_cuda_hdiag_scals @@ -77,6 +78,15 @@ module psb_z_cuda_hdiag_mat_mod integer(psb_ipk_), intent(out) :: info character, optional, intent(in) :: trans end subroutine psb_z_cuda_hdiag_vect_mv + subroutine psb_z_cuda_hdiag_multivect_mv(alpha,a,x,beta,y,info,trans) + import :: psb_z_cuda_hdiag_sparse_mat, psb_dpk_, psb_z_base_multivect_type, psb_ipk_ + class(psb_z_cuda_hdiag_sparse_mat), intent(in) :: a + complex(psb_dpk_), intent(in) :: alpha, beta + class(psb_z_base_multivect_type), intent(inout) :: x + class(psb_z_base_multivect_type), intent(inout) :: y + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_z_cuda_hdiag_multivect_mv end interface !!$ interface @@ -172,17 +182,17 @@ module psb_z_cuda_hdiag_mat_mod end subroutine psb_z_cuda_hdiag_csmv end interface -!!$ interface -!!$ subroutine psb_z_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans) -!!$ import :: psb_z_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_ -!!$ class(psb_z_cuda_hdiag_sparse_mat), intent(in) :: a -!!$ complex(psb_dpk_), intent(in) :: alpha, beta, x(:,:) -!!$ complex(psb_dpk_), intent(inout) :: y(:,:) -!!$ integer(psb_ipk_), intent(out) :: info -!!$ character, optional, intent(in) :: trans -!!$ end subroutine psb_z_cuda_hdiag_csmm -!!$ end interface -!!$ + interface + subroutine psb_z_cuda_hdiag_csmm(alpha,a,x,beta,y,info,trans) + import :: psb_z_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_ + class(psb_z_cuda_hdiag_sparse_mat), intent(in) :: a + complex(psb_dpk_), intent(in) :: alpha, beta, x(:,:) + complex(psb_dpk_), intent(inout) :: y(:,:) + integer(psb_ipk_), intent(out) :: info + character, optional, intent(in) :: trans + end subroutine psb_z_cuda_hdiag_csmm + end interface + !!$ interface !!$ subroutine psb_z_cuda_hdiag_scal(d,a,info, side) !!$ import :: psb_z_cuda_hdiag_sparse_mat, psb_dpk_, psb_ipk_ diff --git a/cuda/spgpu/kernels/Makefile b/cuda/spgpu/kernels/Makefile index 69bccf9e..c59559fa 100644 --- a/cuda/spgpu/kernels/Makefile +++ b/cuda/spgpu/kernels/Makefile @@ -17,8 +17,10 @@ OBJS=cabs.o camax.o casum.o caxpby.o caxy.o cdot.o cgath.o \ dscal.o dscat.o dsetscal.o ell_ccsput.o ell_cspmv.o ell_dcsput.o \ ell_dspmv.o ell_scsput.o ell_sspmv.o ell_zcsput.o ell_zspmv.o \ ell_cspmm.o ell_dspmm.o ell_sspmm.o ell_zspmm.o \ - hdia_cspmv.o hdia_dspmv.o hdia_sspmv.o hdia_zspmv.o hell_cspmv.o hell_dspmv.o \ - hell_sspmv.o hell_zspmv.o hell_cspmm.o hell_dspmm.o hell_sspmm.o hell_zspmm.o \ + hdia_cspmv.o hdia_cspmm.o hdia_dspmv.o hdia_dspmm.o \ + hdia_sspmv.o hdia_sspmm.o hdia_zspmv.o hdia_zspmm.o \ + hell_cspmv.o hell_dspmv.o hell_sspmv.o hell_zspmv.o \ + hell_cspmm.o hell_dspmm.o hell_sspmm.o hell_zspmm.o \ igath.o iscat.o isetscal.o sabs.o samax.o sasum.o \ saxpby.o saxy.o sdot.o sgath.o snrm2.o sscal.o sscat.o ssetscal.o zabs.o zamax.o sabgdxyz.o\ zasum.o zaxpby.o zaxy.o zdot.o zgath.o znrm2.o zscal.o zscat.o zsetscal.o zabgdxyz.o \ diff --git a/cuda/spgpu/kernels/ell_spmm_base_template.cuh b/cuda/spgpu/kernels/ell_spmm_base_template.cuh index 60732acb..712a432a 100644 --- a/cuda/spgpu/kernels/ell_spmm_base_template.cuh +++ b/cuda/spgpu/kernels/ell_spmm_base_template.cuh @@ -25,7 +25,6 @@ CONCAT(GEN_SPGPU_ELL_NAME(TYPE_SYMBOL), _krn) const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta, int baseIndex) { - // TODO VALUE_TYPE *pz,*px,*py; VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)(); VALUE_TYPE yVal; diff --git a/cuda/spgpu/kernels/hdia_cspmm.cu b/cuda/spgpu/kernels/hdia_cspmm.cu new file mode 100644 index 00000000..3462f385 --- /dev/null +++ b/cuda/spgpu/kernels/hdia_cspmm.cu @@ -0,0 +1,35 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2015 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "cudadebug.h" +#include "cudalang.h" +#include "cuComplex.h" + +extern "C" +{ +#include "core.h" +#include "hdia.h" + int getGPUSharedMemPerBlock(); + int getGPUMultiProcessors(); + int getGPUMaxThreadsPerMP(); +} + +#include "debug.h" + +#define VALUE_TYPE cuFloatComplex +#define TYPE_SYMBOL C +#define TEX_FETCH_TYPE cuFloatComplex +#include "hdia_spmm_base.cuh" diff --git a/cuda/spgpu/kernels/hdia_dspmm.cu b/cuda/spgpu/kernels/hdia_dspmm.cu new file mode 100644 index 00000000..9e79084a --- /dev/null +++ b/cuda/spgpu/kernels/hdia_dspmm.cu @@ -0,0 +1,36 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2014 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "cudadebug.h" +#include "cudalang.h" + +extern "C" +{ +#include "core.h" +#include "hdia.h" + int getGPUSharedMemPerBlock(); + int getGPUMultiProcessors(); + int getGPUMaxThreadsPerMP(); +} + +#include "debug.h" + +//#define ENABLE_CACHE +#define VALUE_TYPE double +#define TYPE_SYMBOL D +//#define TEX_FETCH_TYPE int2 +#include "hdia_spmm_base.cuh" + diff --git a/cuda/spgpu/kernels/hdia_spmm_base.cuh b/cuda/spgpu/kernels/hdia_spmm_base.cuh new file mode 100644 index 00000000..96cb7e2c --- /dev/null +++ b/cuda/spgpu/kernels/hdia_spmm_base.cuh @@ -0,0 +1,100 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2015 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + + +#define PRE_CONCAT(A, B) A ## B +#define CONCAT(A, B) PRE_CONCAT(A, B) + +#undef GEN_SPGPU_HDIA_NAME +#undef X_TEX +#define X_TEX CONCAT(x_tex_, FUNC_SUFFIX) + +__device__ __host__ static float zero_float() { return 0.0f; } +__device__ __host__ static cuFloatComplex zero_cuFloatComplex() { return make_cuFloatComplex(0.0, 0.0); } +__device__ __host__ static bool float_isNotZero(float x) { return x != 0.0f; } + +__device__ static float float_fma(float a, float b, float c) { return PREC_FADD(PREC_FMUL (a, b), c); } +__device__ static float float_add(float a, float b) { return PREC_FADD (a, b); } +__device__ static float float_mul(float a, float b) { return PREC_FMUL (a, b); } + +__device__ static cuFloatComplex cuFloatComplex_fma(cuFloatComplex a, cuFloatComplex b, cuFloatComplex c) { return cuCfmaf(a, b, c); } +__device__ static cuFloatComplex cuFloatComplex_add(cuFloatComplex a, cuFloatComplex b) { return cuCaddf(a, b); } +__device__ static cuFloatComplex cuFloatComplex_mul(cuFloatComplex a, cuFloatComplex b) { return cuCmulf(a, b); } + +__device__ static float readValue_float(float fetch) { return fetch; } +__device__ static cuFloatComplex readValue_cuFloatComplex(cuFloatComplex fetch) { return fetch; } + +// host or c.c >= 1.3 +#if (__CUDA_ARCH__ >= 130) || (!__CUDA_ARCH__) +__device__ __host__ static double zero_double() { return 0.0; } +__device__ __host__ static cuDoubleComplex zero_cuDoubleComplex() { return make_cuDoubleComplex(0.0, 0.0); } +__device__ __host__ static bool double_isNotZero(double x) { return x != 0.0; } + +__device__ static double double_fma(double a, double b, double c) { return PREC_DADD(PREC_DMUL (a, b), c); } +__device__ static double double_add(double a, double b) { return PREC_DADD (a, b); } +__device__ static double double_mul(double a, double b) { return PREC_DMUL (a, b); } + +__device__ static cuDoubleComplex cuDoubleComplex_fma(cuDoubleComplex a, cuDoubleComplex b, cuDoubleComplex c) { return cuCfma(a, b, c); } +__device__ static cuDoubleComplex cuDoubleComplex_add(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a, b); } +__device__ static cuDoubleComplex cuDoubleComplex_mul(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a, b); } + +__device__ static double readValue_double(int2 fetch) { return __hiloint2double (fetch.y, fetch.x); } +__device__ static cuDoubleComplex readValue_cuDoubleComplex(int4 fetch) +{ + cuDoubleComplex c; + c.x = __hiloint2double (fetch.y, fetch.x); + c.y = __hiloint2double (fetch.w, fetch.z); + return c; +} +#endif +#if 0 +// Texture cache management +texture < TEX_FETCH_TYPE, 1, cudaReadModeElementType > X_TEX; + +#define bind_tex_x(x) cudaBindTexture(NULL, X_TEX, x) +#define unbind_tex_x(x) cudaUnbindTexture(X_TEX) + +__device__ static VALUE_TYPE +fetchTex (int pointer) +{ + TEX_FETCH_TYPE fetch = tex1Dfetch (X_TEX, pointer); + return CONCAT(readValue_,VALUE_TYPE) (fetch); +} +#endif +#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_vanilla) +#define GEN_SPGPU_HDIA_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_vanilla) +#include "hdia_spmv_base_template.cuh" +#if 0 +#undef GEN_SPGPU_HDIA_NAME +#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_prefetch) +#define GEN_SPGPU_HDIA_NAME_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_prefetch) +#undef USE_PREFETCHING +#define USE_PREFETCHING +#include "hdia_spmv_base_template.cuh" +#define ENABLE_CACHE +#undef GEN_SPGPU_HDIA_NAME +#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache_prefetch) +#define GEN_SPGPU_HDIA_NAME_TEX_PREFETCH(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache_prefetch) +#include "hdia_spmv_base_template.cuh" +#undef GEN_SPGPU_HDIA_NAME +#undef USE_PREFETCHING +#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache) +#define GEN_SPGPU_HDIA_NAME_TEX(x) CONCAT(CONCAT(spgpu,x),hdiaspmv_texcache) +#include "hdia_spmv_base_template.cuh" +#endif +#undef GEN_SPGPU_HDIA_NAME +#define GEN_SPGPU_HDIA_NAME(x) CONCAT(CONCAT(spgpu,x),hdiaspmv) + diff --git a/cuda/spgpu/kernels/hdia_spmm_base_template.cuh b/cuda/spgpu/kernels/hdia_spmm_base_template.cuh new file mode 100644 index 00000000..b1902ab5 --- /dev/null +++ b/cuda/spgpu/kernels/hdia_spmm_base_template.cuh @@ -0,0 +1,18 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2015 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#define THREAD_BLOCK 128 + diff --git a/cuda/spgpu/kernels/hdia_sspmm.cu b/cuda/spgpu/kernels/hdia_sspmm.cu new file mode 100644 index 00000000..67efb4e6 --- /dev/null +++ b/cuda/spgpu/kernels/hdia_sspmm.cu @@ -0,0 +1,35 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2015 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "cudadebug.h" +#include "cudalang.h" + +extern "C" +{ +#include "core.h" +#include "hdia.h" + int getGPUSharedMemPerBlock(); + int getGPUMultiProcessors(); + int getGPUMaxThreadsPerMP(); +} + +#include "debug.h" + +#define VALUE_TYPE float +#define TYPE_SYMBOL S +#define TEX_FETCH_TYPE float +#include "hdia_spmm_base.cuh" + diff --git a/cuda/spgpu/kernels/hdia_zspmm.cu b/cuda/spgpu/kernels/hdia_zspmm.cu new file mode 100644 index 00000000..cab85828 --- /dev/null +++ b/cuda/spgpu/kernels/hdia_zspmm.cu @@ -0,0 +1,36 @@ +/* + * spGPU - Sparse matrices on GPU library. + * + * Copyright (C) 2010 - 2015 + * Davide Barbieri - University of Rome Tor Vergata + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include "cudadebug.h" +#include "cudalang.h" +#include "cuComplex.h" + +extern "C" +{ +#include "core.h" +#include "hdia.h" + int getGPUSharedMemPerBlock(); + int getGPUMultiProcessors(); + int getGPUMaxThreadsPerMP(); +} + +#include "debug.h" + +#define VALUE_TYPE cuDoubleComplex +#define TYPE_SYMBOL Z +#define TEX_FETCH_TYPE int4 +#include "hdia_spmm_base.cuh" +