Fixed documentation. Also going for GetRow in SYMBMM. NUMBMM to be completed.

19 years ago · e98bf6f89c
parent 4a4ad11ca4
commit e98bf6f89c
14 changed files with 6854 additions and 6067 deletions
--- a/docs/pdf/commrout.tex
+++ b/docs/pdf/commrout.tex
@ -10,7 +10,7 @@ routines not tied to a discretization space see~\ref{sec:toolsrout}.

 \subroutine{psb\_halo}{Halo Data Communication}    

-These subroutines restore a consistent status for the halo
+These subroutines gathers the values of the halo
 elements, and  (optionally) scale the result:

 \[ x \leftarrow \alpha x \]
@ -80,8 +80,7 @@ An integer value that contains an error code.

 \subroutine{psb\_ovrl}{Overlap Update}    

-These subroutines restore a consistent status for the overlap
-elements:
+These subroutines applies an overlap operator to the input vector:

 \[ x \leftarrow Q x \]
 where:
@ -152,8 +151,8 @@ An integer value that contains an error code.

 \section*{Usage notes}
 \begin{enumerate}
-\item If there is no overlap in the data distribution, no operations
-are performed;
+\item If there is no overlap in the data distribution associated with
+  the descriptor, no operations are performed;
 \item The operator $P^{T}$ performs the reduction sum of overlap
 elements; it is a ``prolongation'' operator $P^T$ that
 replicates overlap elements, accounting for the physical replication
--- a/docs/pdf/datastruct.tex
+++ b/docs/pdf/datastruct.tex
@ -2,11 +2,11 @@
 \label{sec:datastruct}
 %\ifthenelse{\boolean{mtc}}{\minitoc}{}

-In this chapter are illustrated data structures used for definition of
-routines interfaces. This include data structure for sparse matrix,
-communication descriptor and preconditioner. These data structures are used for
-calling PSBLAS routines in Fortran~90 language and will be used to next
-chapters containing these callings. 
+In this chapter we  illustrate the  data structures used for definition of
+routines interfaces. They  include data structures for sparse matrices,
+communication descriptors and preconditioners.%%  These data structures
+%% are used for calling PSBLAS routines in Fortran~90 language and will
+%% be used to next chapters containing these callings.  

 All the data types and subroutine interfaces are defined in the module
 \verb|psb_sparse_mod|; this will have to be included by every user
@ -23,11 +23,10 @@ Every structure of this type is associated to a sparse matrix, it
 contains data about general matrix informations and elements to be
 exchanged among processes.  

-It is not necessary for the user to
-know the internal structure of \verb|psb_desc_type|, it is set in
-fully-transparent mode by PSBLAS-TOOLS routines when inserting a new
-sparse matrix, however the definition of the descriptor is the
-following.  
+It is not necessary for the user to know the internal structure of
+\verb|psb_desc_type|, it is set in a transparent mode by the tools
+routines of Sec.~\ref{sec:toolsrout} while creating a new sparse
+matrix; nevertheless we include its description for the curious reader:  
 \begin{description}
 \item[{\bf matrix\_data}] includes general information about matrix and
 process grid. More precisely:
@ -90,16 +89,16 @@ process then element $i$ contains local index correpondent to global variable $i
 else element $i$ contains -1 (NULL) value.\\
 Specified as: a pointer to an integer array of rank one.
 \end{description}
-FORTRAN95 interface for \verb|psb_desc_type| structures is therefore defined
+The Fortran95 definition  for \verb|psb_desc_type| structures is 
 as follows:
 \begin{figure}[h!]
  \begin{Sbox}
    \begin{minipage}[tl]{0.9\textwidth}
 \begin{verbatim} 
 type psb_desc_type 
-   integer, pointer :: matrix_data(:), halo_index(:)
-   integer, pointer :: overlap_elem(:), overlap_index(:)
-   integer, pointer :: loc_to_glob(:), glob_to_loc(:)
+   integer, pointer :: matrix_data(:)=>null(), halo_index(:)=>null()
+   integer, pointer :: overlap_elem(:)=>null(), overlap_index(:)=>null()
+   integer, pointer :: loc_to_glob(:)=>null(), glob_to_loc(:)=>null()
 end type psb_desc_type 
 \end{verbatim}
    \end{minipage}
@ -152,10 +151,9 @@ state, which can take the following values:
 \label{sec:spmat}
 The \hypertarget{spdata}{{\tt psb\_spmat\_type}} data structure
 contains all information about local portion of the sparse matrix and   
-its storage mode. Many of this fields are set in fully-transparent
-mode by PSBLAS-TOOLS routines when inserting a new sparse matrix, user
-must set only fields which describe matrix storage mode. \\
-Fields contained in Sparse matrix structures are:
+its storage mode. Most  of these fields are set  by the tools
+routines when inserting a new sparse matrix; the user needs only
+choose, if he/she so whishes, a specific matrix storage mode. \\
 \begin{description}
 \item[{\bf aspk}] Contains values of the local distributed sparse
 matrix.\\
@ -201,8 +199,9 @@ type psb_dspmat_type
   character   :: fida(5)
   character   :: descra(10)
   integer     :: infoa(psb_ifa_size_)
-   real(kind(1.d0)), pointer :: aspk(:)
-   integer, pointer :: ia1(:), ia2(:), pr(:), pl(:)
+   real(kind(1.d0)), pointer :: aspk(:)=>null()
+   integer, pointer :: ia1(:)=>null(), ia2(:)=>null()
+   integer, pointer :: pr(:)=>null(), pl(:)=>null()
 end type psb_dspmat_type
 \end{verbatim}
    \end{minipage}
@ -273,12 +272,16 @@ values:

 \subsection{Preconditioner data structure}
 \label{sec:prec}
-PSBLAS-2.0 offers the possibility to use many different types of
+Our library  offers support for  many different types of
 preconditioning schemes. Besides the simple well known preconditioners
-like Diagonal Scaling or Block Jacobi (with ILU(0) incomplete
-factorization) also more complex preconditioning methods are
-implemented like the Additive Schwarz and Two-Level ones. A
-preconditioner is held in the \hypertarget{precdata}{{\tt
+like Diagonal Scaling or Block Jacobi with either  incomplete
+factorization ILU(0) or complete LU factorization.  We also provide an
+experimental package of  complex 
+preconditioning methods like the Additive Schwarz and Multilevel
+Additive Schwarz; these last preconditioners will be described in a
+separate document. 
+
+ A preconditioner is held in the \hypertarget{precdata}{{\tt
    psb\_prec\_type}} data structure which depends on the
 \verb|psb_base_prec| reported in 
 figure~\ref{fig:prectype}. The \verb|psb_base_prec| 
--- a/docs/pdf/figures/points.pdf
+++ b/docs/pdf/figures/points.pdf
--- a/docs/pdf/intro.tex
+++ b/docs/pdf/intro.tex
@ -26,12 +26,12 @@ proposal for BLAS on dense matrices~\cite{BLAS1,BLAS2,BLAS3}.
 The applicability of sparse iterative solvers to many different areas
 causes some terminology problems because the same concept may be
 denoted through different names depending on the application area. The
-PSBLAS features presented in this section will be discussed mainly in terms of finite
-difference discretizations of Partial Differential Equations (PDEs).
-However, the scope of the library is wider than that: for example, it
-can be applied to finite element discretizations of PDEs, and even to
-different classes of problems such as nonlinear optimization, for
-example in optimal control problems.
+PSBLAS features presented in this document will be discussed referring
+to a   finite difference discretization of a Partial Differential
+Equation (PDE). However, the scope of the library is wider than
+that: for example, it can be applied to finite element discretizations
+of PDEs, and even to different classes of problems such as nonlinear
+optimization, for example in optimal control problems.

 The design of a solver for sparse linear systems is driven by many
 conflicting objectives, such as limiting occupation of storage
@ -75,6 +75,9 @@ Message Passing Interface code is encapsulated within the BLACS
 layer. However, in some cases, MPI routines are directly used either
 to improve efficiency or to implement communication patterns for which
 the BLACS package doesn't provide any method. 
+
+In any case we provide wrappers around the BLACS routines so that the
+user does not need to delve into their details (see Sec.~\ref{sec:toolsrout}).
 %% We assume that the user program has initialized a BLACS process grid
 %% with one column and as many rows as there are processes; the PSBLAS
 %% initialization routines will take the communication context for this
@ -86,6 +89,121 @@ the BLACS package doesn't provide any method.
 \caption{PSBLAS library components hierarchy.\label{fig:psblas}}
 \end{figure}

+
+The type of linear system matrices that we address  typically arise in the
+numerical solution of PDEs;  in such a context,
+it is necessary to pay special attention to the
+structure of the problem from which the application originates. 
+The nonzero pattern of a matrix arising from the
+discretization of a PDE is influenced by various factors, such as the
+shape of the  domain, the discretization strategy, and
+the equation/unknown ordering. The matrix itself can be interpreted as
+the  adjacency matrix of the graph associated with the discretization
+mesh. 
+
+The distribution of the coefficient matrix for the linear system is
+based on the ``owner computes'' rule: 
+the variable associated to each mesh point is assigned to a process
+that will  own the corresponding row in the coefficient matrix and
+will  carry out all related computations. This allocation strategy 
+is equivalent to a partition of the discretization mesh into {\em
+sub-domains}. 
+Our library  supports any distribution that keeps together 
+the coefficients of each matrix row; there are no other constraints on
+the variable assignment. 
+This choice is consistent with  data distributions commonly used in
+ScaLAPACK  such as  \verb|CYCLIC(N)| and \verb|BLOCK|, 
+as well as completely arbitrary assignments of
+equation indices to processes. In particular it is consistent with the
+usage of graph partitioning tools commonly available in the
+literature, e.g. METIS~\cite{METIS}.
+Dense vectors  conform  to sparse
+matrices, that is, the entries of a vector follow the same distribution
+of the matrix rows.  
+
+We assume that the sparse matrix is built in parallel, where each
+process generates its own portion. We never require that the entire
+matrix be available on a single node. However, it is possible
+to hold the entire matrix in one process and distribute it
+explicitly\footnote{In our prototype implementation  we provide 
+sample scatter/gather routines.}, even though  the resulting 
+bottleneck would make this option unattractive in most  cases. 
+
+
+\subsection{Basic Nomenclature}
+
+
+Our computational model implies that the data allocation on the
+parallel distributed memory machine is guided by the structure of the
+physical model, and specifically by the discretization mesh of the
+PDE. 
+
+Each point of the discretization mesh will have (at least) one
+associated equation/variable, and therefore one index. We say that
+point  $i$ {\em depends\/} on point $j$ if the  equation for a
+variable associated with $i$ contains a term in $j$,  or equivalently
+if $a_{ij} \ne0$.  
+After the partition of the discretization mesh into {\em sub-domains\/}
+assigned to the parallel processes,
+we classify the  points of a given sub-domain as following.
+\begin{description}
+\item[Internal.] An internal point of
+ a given domain {\em depends} only on  points of the
+same domain. 
+If all points of a domain are assigned to one
+process, then a computational step (e.g., a
+matrix-vector product) of the 
+equations associated with the internal points  requires no data
+items from other domains and no communications.
+
+\item[Boundary.] A point of
+a given domain is a boundary point if it {\em depends} on  points
+belonging to other domains.
+
+\item[Halo.] A halo point for a given domain is a point belonging to
+another domain such that there is a boundary point which {\em depends\/}
+on it. Whenever performing a computational step, such as a
+matrix-vector product, the values associated with halo points are
+requested from other domains. A boundary point of a given 
+domain is a halo point for (at least) another domain; therefore
+the cardinality of the boundary points set denotes the amount of data
+ sent to other domains. 
+\item[Overlap.] An overlap point is a boundary point assigned to
+multiple domains. Any operation that involves an overlap point
+has to be replicated for each assignment. 
+\end{description}
+Overlap points do not usually exist in the basic data
+distribution, but they are a feature of Domain Decomposition
+Schwarz preconditioners which we are in the process of including in
+our distribution~\cite{PARA04,APNUM06}. 
+
+We denote the sets of  internal, boundary and halo points for a given
+subdomain  by $\cal I$, $\cal B$ and $\cal H$.
+Each subdomain is assigned to one process; each process usually
+owns one subdomain, although the user may choose to assign more than
+one subdomain to a process.  If each process $i$ owns one
+subdomain, the number of rows in the local sparse matrix is
+$|{\cal I}_i| + |{\cal B}_i|$, and the number of local columns
+(i.e. those for which there exists at least one non-zero entry in the
+local rows)  is $|{\cal I}_i| + |{\cal B}_i| +|{\cal H}_i|$.
+
+\begin{figure}[h] 
+\begin{center}
+\leavevmode
+\rotatebox{-90}{\includegraphics[scale=0.45]{figures/points}}
+\end{center}
+\caption{Point classfication.\label{fig:points}}
+\end{figure}
+
+This classification of mesh points guides the naming scheme that we
+adopted in the library internals and in the data structures. We
+explicitly note that ``Halo'' points are also often called ``ghost''
+points in the literature. 
+
+
+
+\subsection{Library contents}
+
 The PSBLAS library consists of various classes of subroutines:
 \begin{description}
 \item[Computational routines] comprising:
@ -231,8 +349,8 @@ multiple time steps, the following structure may be more appropriate:
 \item Call the iterative method of choice, e.g. \verb|psb_bicgstab|
 \end{enumerate}
 \end{enumerate}
-The insertion routines will be called as many times as needed; it is
-clear that they only need be called on the data that is actually
+The insertion routines will be called as many times as needed; 
+they only need to  be called on the data that is actually
 allocated to the current process, i.e. each process generates its own
 data. 

--- a/docs/pdf/precs.tex
+++ b/docs/pdf/precs.tex
@ -9,8 +9,8 @@ many parameters that is possible to adjust to fit the user's needs:
 \begin{itemize}
 \item Diagonal Scaling
 \item Block Jacobi with ILU(0) factorization
-\item Additive Schwarz with the Restricted Additive Schwarz and
-  Additive Schwarz with Harmonic extensions;
+%% \item Additive Schwarz with the Restricted Additive Schwarz and
+%%   Additive Schwarz with Harmonic extensions;
 \end{itemize}
 The PSBLAS library is incorporating a package of two-level Additive
  Schwarz preconditioners called MD2P4; this is actually a family of 
@ -66,21 +66,21 @@ $ptype$ string as follows\footnote{The string is case-insensitive}:
  block-diagonal of matrix $A$, where block boundaries are determined
  by the data allocation boundaries for each process; requires no
  communication. Only $ILU(0)$ is currently implemented. 
-\item[AS] Additive Schwarz preconditioner (see~\cite{PARA04}); in this
-  case the user may specify additional flags through the integer
-  vector \verb|ir| as follows:
-\begin{description}
-\item[$iv(1)$] Number of overlap levels, an integer $novr>=0$, default
-  $novr=1$.  
-\item[$iv(2)$] Restriction operator, legal values: \verb|psb_halo_|,
-  \verb|psb_none_|; default: \verb|psb_halo_|
-\item[$iv(3)$] Prolongation operator, legal values: \verb|psb_none_|,
-  \verb|psb_sum_|, \verb|psb_avg_|, default: \verb|psb_none_|
-\item[$iv(4)$] Factorization type, legal values: \verb|f_ilu_n_|,
-  \verb|f_slu_|, \verb|f_umf_|, default: \verb|f_ilu_n_|.
-\end{description}
-Note that the default corresponds to a Restricted Additive Schwarz
-preconditioner with $ILU(0)$ and 1 level of overlap.
+%% \item[AS] Additive Schwarz preconditioner (see~\cite{PARA04}); in this
+%%   case the user may specify additional flags through the integer
+%%   vector \verb|ir| as follows:
+%% \begin{description}
+%% \item[$iv(1)$] Number of overlap levels, an integer $novr>=0$, default
+%%   $novr=1$.  
+%% \item[$iv(2)$] Restriction operator, legal values: \verb|psb_halo_|,
+%%   \verb|psb_none_|; default: \verb|psb_halo_|
+%% \item[$iv(3)$] Prolongation operator, legal values: \verb|psb_none_|,
+%%   \verb|psb_sum_|, \verb|psb_avg_|, default: \verb|psb_none_|
+%% \item[$iv(4)$] Factorization type, legal values: \verb|f_ilu_n_|,
+%%   \verb|f_slu_|, \verb|f_umf_|, default: \verb|f_ilu_n_|.
+%% \end{description}
+%% Note that the default corresponds to a Restricted Additive Schwarz
+%% preconditioner with $ILU(0)$ and 1 level of overlap.
 %% \item[2L] Second level of a multilevel preconditioner, see below
 %% \end{description}
 %% If a multilevel preconditioner is desired, the user should call
--- a/docs/pdf/title.tex
+++ b/docs/pdf/title.tex
@ -5,7 +5,7 @@

 \ifx\pdfoutput\undefined % We're not running pdftex
 \else
-\pdfbookmark{PSBLAS-v2.0 User's Guide}{title}
+\pdfbookmark{PSBLAS-v2.0.1 User's Guide}{title}
 \fi
 \newlength{\centeroffset}
 \setlength{\centeroffset}{-0.5\oddsidemargin}
@ -15,7 +15,7 @@
 \vspace*{\stretch{1}}
 \noindent\hspace*{\centeroffset}\makebox[0pt][l]{\begin{minipage}{\textwidth}
 \flushright
-{\Huge\bfseries PSBLAS-2.0 User's guide
+{\Huge\bfseries PSBLAS-2.0.1 User's guide
 }
 \noindent\rule[-1ex]{\textwidth}{5pt}\\[2.5ex]
 \hfill\emph{\Large A reference guide for the Parallel Sparse BLAS library}
--- a/docs/pdf/toolsrout.tex
+++ b/docs/pdf/toolsrout.tex
@ -412,7 +412,8 @@ Specified as: an integer variable.
 Scope:{\bf local}.\\
 Type:{\bf required}.\\
 Specified as: a structured data of type \descdata.
-\item[nnz] the number of nonzeroes in the local part of the assembled matrix.\\
+\item[nnz] An estimate of the number of nonzeroes in the local
+  part of the assembled matrix.\\ 
 Scope: {\bf global}.\\
 Type: {\bf optional}.\\
 Specified as: an integer value. 
@ -437,7 +438,7 @@ Specified as: an integer variable.
 \item Providing a good estimate for the number of nonzeroes $nnz$ in
  the assembled matrix may substantially improve performance in the
  matrix build phase, as it will reduce or eliminate the need for
-  multiple data allocations. 
+  (potentially multiple) data reallocations. 
 \end{enumerate}


--- a/docs/pdf/userguide.tex
+++ b/docs/pdf/userguide.tex
@ -24,7 +24,7 @@
 \relax
 \pdfcompresslevel=0             %-- 0 = none, 9 = best
 \pdfinfo{                       %-- Info dictionary of PDF output  /Author (Alfredo Buttari)
-  /Title (Parallel Sparse BLAS V. 2.0)
+  /Title (Parallel Sparse BLAS V. 2.0.1)
  /Subject (Parallel Sparse Basic Linear Algebra Subroutines)
  /Keywords (Computer Science Linear Algebra Fluid Dynamics Parallel Linux MPI PSBLAS Iterative Solvers Preconditioners)
  /Creator (pdfLaTeX)
@ -201,6 +201,13 @@ in G.~Joubert, A.~Murli, F.~Peters, M.~Vanneschi, editors,
 Parallel Computing - Advances \& Current Issues,
 pp.~441--448, Imperial College Press, 2002. 
 %
+\bibitem{METIS}
+Karypis, G. and Kumar, V.,
+{\em {METIS}: Unstructured Graph Partitioning and Sparse Matrix
+  Ordering System}.
+Minneapolis, MN 55455: University of Minnesota, Department of
+  Computer Science, 1995. 
+Internet Address: {\verb|http://www.cs.umn.edu/~karypis|}.
 \bibitem{machiels}
 {Machiels, L. and Deville, M.}
 {\em Fortran 90: An entry to object-oriented programming for the solution
--- a/docs/userguide.pdf
+++ b/docs/userguide.pdf
--- a/src/serial/psb_dnumbmm.f90
+++ b/src/serial/psb_dnumbmm.f90
@ -39,13 +39,49 @@ subroutine psb_dnumbmm(a,b,c)
  type(psb_dspmat_type)         :: a,b,c
  real(kind(1.d0)), allocatable :: temp(:)
  integer                       :: info
+  
+  interface psb_sp_getrow
+    subroutine psb_dspgetrow(irw,a,nz,ia,ja,val,info,iren,lrw)
+      use psb_spmat_type
+      type(psb_dspmat_type), intent(in) :: a
+      integer, intent(in)       :: irw
+      integer, intent(out)      :: nz
+      integer, intent(inout)    :: ia(:), ja(:)
+      real(kind(1.d0)),  intent(inout)    :: val(:)
+      integer, intent(in), target, optional :: iren(:)
+      integer, intent(in), optional :: lrw
+      integer, intent(out)  :: info
+    end subroutine psb_dspgetrow
+  end interface

-  allocate(temp(max(a%m,a%k,b%m,b%k)),stat=info)

+  allocate(temp(max(a%m,a%k,b%m,b%k)),stat=info)
+  if (info /= 0) then
+    return
+  endif
  call psb_realloc(size(c%ia1),c%aspk,info)
-  call numbmm(a%m,a%k,b%k,a%ia2,a%ia1,0,a%aspk,&
-       & b%ia2,b%ia1,0,b%aspk,&
-       & c%ia2,c%ia1,0,c%aspk,temp)
+  if (.true.) then 
+    call numbmm(a%m,a%k,b%k,a%ia2,a%ia1,0,a%aspk,&
+         & b%ia2,b%ia1,0,b%aspk,&
+         & c%ia2,c%ia1,0,c%aspk,temp)
+  else
+    call inner_numbmm(a,b,c,temp,info)
+  end if
  deallocate(temp) 
  return
+
+contains 
+
+  subroutine inner_numbmm(a,b,c,temp,info)
+    type(psb_dspmat_type) :: a,b,c
+    integer               :: info
+    real(kind(1.d0))      :: temp(:)
+    integer, allocatable  :: iarw(:), iacl(:),ibrw(:),ibcl(:)
+    real(kind(1.d0)), allocatable :: aval(:),bval(:)
+    integer  :: maxlmn,i,j,m,n,k,l,istart,length,nazr,nbzr,jj,ii,minlm,minmn
+
+
+  end subroutine inner_numbmm
+
+
 end subroutine psb_dnumbmm
--- a/src/serial/psb_dspgetrow.f90
+++ b/src/serial/psb_dspgetrow.f90
@ -68,7 +68,7 @@ subroutine psb_dspgetrow(irw,a,nz,ia,ja,val,info,iren,lrw)
    end subroutine psb_dspgtblk
  end interface

-  integer               :: lrw_, ierr(2), err_act
+  integer               :: lrw_, ierr(5), err_act
  type(psb_dspmat_type) :: b
  integer, pointer      :: iren_(:)
  character(len=20)     :: name, ch_err
--- a/src/serial/psb_dsymbmm.f90
+++ b/src/serial/psb_dsymbmm.f90
@ -49,16 +49,37 @@ subroutine psb_dsymbmm(a,b,c)
    end subroutine symbmm
  end interface

+  interface psb_sp_getrow
+    subroutine psb_dspgetrow(irw,a,nz,ia,ja,val,info,iren,lrw)
+      use psb_spmat_type
+      type(psb_dspmat_type), intent(in) :: a
+      integer, intent(in)       :: irw
+      integer, intent(out)      :: nz
+      integer, intent(inout)    :: ia(:), ja(:)
+      real(kind(1.d0)),  intent(inout)    :: val(:)
+      integer, intent(in), target, optional :: iren(:)
+      integer, intent(in), optional :: lrw
+      integer, intent(out)  :: info
+    end subroutine psb_dspgetrow
+  end interface
+
  if (b%m /= a%k) then 
    write(0,*) 'Mismatch in SYMBMM: ',a%m,a%k,b%m,b%k
  endif
  allocate(itemp(max(a%m,a%k,b%m,b%k)),stat=info)    
+  if (info /= 0) then 
+    return
+  endif
  nze = max(a%m+1,2*a%m)
  call psb_sp_reall(c,nze,info)
 !!$  write(0,*) 'SYMBMM90 ',size(c%pl),size(c%pr)
-  call symbmm(a%m,a%k,b%k,a%ia2,a%ia1,0,&
-       & b%ia2,b%ia1,0,&
-       & c%ia2,c%ia1,0,itemp)
+  if (.false.) then 
+    call symbmm(a%m,a%k,b%k,a%ia2,a%ia1,0,&
+         & b%ia2,b%ia1,0,&
+         & c%ia2,c%ia1,0,itemp)
+  else 
+    call inner_symbmm(a,b,c,itemp,info)
+  endif
  c%pl(1) = 0
  c%pr(1) = 0
  c%m=a%m
@ -67,4 +88,85 @@ subroutine psb_dsymbmm(a,b,c)
  c%descra='GUN'
  deallocate(itemp) 
  return
+
+contains
+
+  subroutine inner_symbmm(a,b,c,index,info)
+    type(psb_dspmat_type) :: a,b,c
+    integer               :: index(:),info
+    integer, allocatable  :: iarw(:), iacl(:),ibrw(:),ibcl(:)
+    real(kind(1.d0)), allocatable :: aval(:),bval(:)
+    integer  :: maxlmn,i,j,m,n,k,l,istart,length,nazr,nbzr,jj,ii,minlm,minmn
+
+
+    n = a%m
+    m = a%k 
+    l = b%k 
+    maxlmn = max(l,m,n)
+
+    allocate(iarw(maxlmn),iacl(maxlmn),ibrw(maxlmn),ibcl(maxlmn),&
+         & aval(maxlmn),bval(maxlmn), stat=info)
+    if (info /= 0) then 
+      return
+    endif
+
+
+    if (size(c%ia2) < n+1) then 
+
+      call psb_realloc(n+1,c%ia2,info)
+    endif
+    do i=1,maxlmn
+      index(i)=0
+    end do
+
+      c%ia2(1)=1
+      minlm = min(l,m)
+      minmn = min(m,n)
+
+      main: do  i=1,n
+        istart=-1
+        length=0
+        call psb_sp_getrow(i,a,nazr,iarw,iacl,aval,info)
+        do jj=1, nazr
+          
+          j=iacl(jj)
+          
+          if ((j<1).or.(j>m)) then 
+            write(0,*) ' SymbMM: Problem with A ',i,jj,j,m
+          endif
+          call psb_sp_getrow(j,b,nbzr,ibrw,ibcl,bval,info)
+          do k=1,nbzr
+            if ((ibcl(k)<1).or.(ibcl(k)>maxlmn)) then 
+                write(0,*) 'Problem in SYMBMM 1:',j,k,ibcl(k),maxlmn
+            else
+              if(index(ibcl(k)).eq.0) then
+                index(ibcl(k))=istart
+                istart=ibcl(k)
+                length=length+1
+              endif
+            endif
+          end do
+        end do
+
+        c%ia2(i+1)=c%ia2(i)+length
+        
+        if (c%ia2(i+1) > size(c%ia1)) then 
+          if (n > (2*i)) then 
+            nze = max(c%ia2(i+1), c%ia2(i)*((n+i-1)/i))
+          else
+            nze = max(c%ia2(i+1), nint((dble(c%ia2(i))*(dble(n)/i)))   )
+          endif 
+          call psb_realloc(nze,c%ia1,info)
+        end if 
+        do j= c%ia2(i),c%ia2(i+1)-1
+          c%ia1(j)=istart
+          istart=index(istart)
+          index(c%ia1(j))=0
+        end do
+        call isr(length,c%ia1(c%ia2(i)))
+        index(i) = 0
+      end do main
+
+  end subroutine inner_symbmm
+
 end subroutine psb_dsymbmm
--- a/src/serial/psb_zspgetrow.f90
+++ b/src/serial/psb_zspgetrow.f90
@ -68,7 +68,7 @@ subroutine psb_zspgetrow(irw,a,nz,ia,ja,val,info,iren,lrw)
    end subroutine psb_zspgtblk
  end interface

-  integer               :: lrw_, ierr(2), err_act
+  integer               :: lrw_, ierr(5), err_act
  type(psb_zspmat_type) :: b
  integer, pointer      :: iren_(:)
  character(len=20)     :: name, ch_err
--- a/src/serial/psb_zsymbmm.f90
+++ b/src/serial/psb_zsymbmm.f90
@ -49,16 +49,37 @@ subroutine psb_zsymbmm(a,b,c)
    end subroutine symbmm
  end interface

+  interface psb_sp_getrow
+    subroutine psb_zspgetrow(irw,a,nz,ia,ja,val,info,iren,lrw)
+      use psb_spmat_type
+      type(psb_zspmat_type), intent(in) :: a
+      integer, intent(in)       :: irw
+      integer, intent(out)      :: nz
+      integer, intent(inout)    :: ia(:), ja(:)
+      complex(kind(1.d0)),  intent(inout)    :: val(:)
+      integer, intent(in), target, optional :: iren(:)
+      integer, intent(in), optional :: lrw
+      integer, intent(out)  :: info
+    end subroutine psb_zspgetrow
+  end interface
+
  if (b%m /= a%k) then 
    write(0,*) 'Mismatch in SYMBMM: ',a%m,a%k,b%m,b%k
  endif
  allocate(itemp(max(a%m,a%k,b%m,b%k)),stat=info)    
+  if (info /= 0) then 
+    return
+  endif
  nze = max(a%m+1,2*a%m)
  call psb_sp_reall(c,nze,info)
 !!$  write(0,*) 'SYMBMM90 ',size(c%pl),size(c%pr)
-  call symbmm(a%m,a%k,b%k,a%ia2,a%ia1,0,&
-       & b%ia2,b%ia1,0,&
-       & c%ia2,c%ia1,0,itemp)
+  if (.false.) then 
+    call symbmm(a%m,a%k,b%k,a%ia2,a%ia1,0,&
+         & b%ia2,b%ia1,0,&
+         & c%ia2,c%ia1,0,itemp)
+  else 
+    call inner_symbmm(a,b,c,itemp,info)
+  endif
  c%pl(1) = 0
  c%pr(1) = 0
  c%m=a%m
@ -67,4 +88,82 @@ subroutine psb_zsymbmm(a,b,c)
  c%descra='GUN'
  deallocate(itemp) 
  return
+contains
+  subroutine inner_symbmm(a,b,c,index,info)
+    type(psb_zspmat_type) :: a,b,c
+    integer               :: index(:),info
+    integer, allocatable  :: iarw(:), iacl(:),ibrw(:),ibcl(:)
+    complex(kind(1.d0)), allocatable :: aval(:),bval(:)
+    integer  :: maxlmn,i,j,m,n,k,l,istart,length,nazr,nbzr,jj,ii,minlm,minmn
+
+
+    n = a%m
+    m = a%k 
+    l = b%k 
+    maxlmn = max(l,m,n)
+
+    allocate(iarw(maxlmn),iacl(maxlmn),ibrw(maxlmn),ibcl(maxlmn),&
+         & aval(maxlmn),bval(maxlmn), stat=info)
+    if (info /= 0) then 
+      return
+    endif
+
+
+    if (size(c%ia2) < n+1) then 
+
+      call psb_realloc(n+1,c%ia2,info)
+    endif
+    do i=1,maxlmn
+      index(i)=0
+    end do
+
+      c%ia2(1)=1
+      minlm = min(l,m)
+      minmn = min(m,n)
+
+      main: do  i=1,n
+        istart=-1
+        length=0
+        call psb_sp_getrow(i,a,nazr,iarw,iacl,aval,info)
+        do jj=1, nazr
+          
+          j=iacl(jj)
+          
+          if ((j<1).or.(j>m)) then 
+            write(0,*) ' SymbMM: Problem with A ',i,jj,j,m
+          endif
+          call psb_sp_getrow(j,b,nbzr,ibrw,ibcl,bval,info)
+          do k=1,nbzr
+            if ((ibcl(k)<1).or.(ibcl(k)>maxlmn)) then 
+                write(0,*) 'Problem in SYMBMM 1:',j,k,ibcl(k),maxlmn
+            else
+              if(index(ibcl(k)).eq.0) then
+                index(ibcl(k))=istart
+                istart=ibcl(k)
+                length=length+1
+              endif
+            endif
+          end do
+        end do
+
+        c%ia2(i+1)=c%ia2(i)+length
+        
+        if (c%ia2(i+1) > size(c%ia1)) then 
+          if (n > (2*i)) then 
+            nze = max(c%ia2(i+1), c%ia2(i)*((n+i-1)/i))
+          else
+            nze = max(c%ia2(i+1), nint((dble(c%ia2(i))*(dble(n)/i)))   )
+          endif 
+          call psb_realloc(nze,c%ia1,info)
+        end if 
+        do j= c%ia2(i),c%ia2(i+1)-1
+          c%ia1(j)=istart
+          istart=index(istart)
+          index(c%ia1(j))=0
+        end do
+        call isr(length,c%ia1(c%ia2(i)))
+        index(i) = 0
+      end do main
+
+  end subroutine inner_symbmm
 end subroutine psb_zsymbmm