diff --git a/docs/html/img1.png b/docs/html/img1.png
index be2b1fe9..0423c177 100644
Binary files a/docs/html/img1.png and b/docs/html/img1.png differ
diff --git a/docs/html/img32.png b/docs/html/img32.png
index 5ec9d53a..8f2424d5 100644
Binary files a/docs/html/img32.png and b/docs/html/img32.png differ
diff --git a/docs/html/img52.png b/docs/html/img52.png
index bf879aac..a71e3542 100644
Binary files a/docs/html/img52.png and b/docs/html/img52.png differ
diff --git a/docs/html/img58.png b/docs/html/img58.png
index 0e817929..f220846e 100644
Binary files a/docs/html/img58.png and b/docs/html/img58.png differ
diff --git a/docs/html/img71.png b/docs/html/img71.png
index 1fd3d052..b7d510e1 100644
Binary files a/docs/html/img71.png and b/docs/html/img71.png differ
diff --git a/docs/html/img83.png b/docs/html/img83.png
index f2edec00..e2052d1f 100644
Binary files a/docs/html/img83.png and b/docs/html/img83.png differ
diff --git a/docs/html/node11.html b/docs/html/node11.html
index 1a083452..005983e5 100644
--- a/docs/html/node11.html
+++ b/docs/html/node11.html
@@ -276,7 +276,7 @@ three steps:
  ALT="$i=1,\ldots,m$">;
 </LI>
 <LI>prolongation and sum of the <IMG
- WIDTH="22" HEIGHT="31" ALIGN="MIDDLE" BORDER="0"
+ WIDTH="23" HEIGHT="31" ALIGN="MIDDLE" BORDER="0"
  SRC="img32.png"
  ALT="$w_i$">'s, i.e. <!-- MATH
  $w = \sum_{i=1}^m (R_i^{\delta})^T w_i$
@@ -309,7 +309,7 @@ time on parallel distributed-memory computers is the so-called <I>Restricted AS
  HREF="node24.html#CAI_SARKIS">5</A>,<A
  HREF="node24.html#EFSTATHIOU">13</A>]. It
 is obtained by zeroing the components of <IMG
- WIDTH="22" HEIGHT="31" ALIGN="MIDDLE" BORDER="0"
+ WIDTH="23" HEIGHT="31" ALIGN="MIDDLE" BORDER="0"
  SRC="img32.png"
  ALT="$w_i$"> corresponding to the
 overlapping vertices when applying the prolongation. Therefore,
@@ -613,7 +613,7 @@ M_{2LH-POST}^{-1} = M_{1L}^{-1} + \left( I - M_{1L}^{-1}A \right) M_{C}^{-1}.
  -->
 
 <IMG
- WIDTH="316" HEIGHT="33" BORDER="0"
+ WIDTH="317" HEIGHT="33" BORDER="0"
  SRC="img58.png"
  ALT="\begin{displaymath}
 M_{2LH-POST}^{-1} = M_{1L}^{-1} + \left( I - M_{1L}^{-1}A \right) M_{C}^{-1}.
diff --git a/docs/html/node12.html b/docs/html/node12.html
index a7ca5236..42cf3ddc 100644
--- a/docs/html/node12.html
+++ b/docs/html/node12.html
@@ -142,7 +142,7 @@ N_r = \left\{s \in W: |a_{rs}| > \theta \sqrt{|a_{rr}a_{ss}|} \right\}
  -->
 
 <IMG
- WIDTH="320" HEIGHT="38" BORDER="0"
+ WIDTH="319" HEIGHT="38" BORDER="0"
  SRC="img71.png"
  ALT="\begin{displaymath}N_r = \left\{s \in W: \vert a_{rs}\vert &gt; \theta \sqrt{\vert a_{rr}a_{ss}\vert} \right\}
 \cup \left\{ r \right\} ,
@@ -272,7 +272,7 @@ S = I - \omega D^{-1} A ,
  -->
 <TABLE WIDTH="100%" ALIGN="CENTER">
 <TR VALIGN="MIDDLE"><TD ALIGN="CENTER" NOWRAP><A NAME="eq:jac_smoother"></A><IMG
- WIDTH="125" HEIGHT="30" BORDER="0"
+ WIDTH="126" HEIGHT="30" BORDER="0"
  SRC="img83.png"
  ALT="\begin{displaymath}
 S = I - \omega D^{-1} A ,
diff --git a/docs/html/node3.html b/docs/html/node3.html
index 65043f34..2c2e40ac 100644
--- a/docs/html/node3.html
+++ b/docs/html/node3.html
@@ -75,7 +75,7 @@ Ax=b,
  -->
 <TABLE WIDTH="100%" ALIGN="CENTER">
 <TR VALIGN="MIDDLE"><TD ALIGN="CENTER" NOWRAP><A NAME="system1"></A><IMG
- WIDTH="58" HEIGHT="30" BORDER="0"
+ WIDTH="57" HEIGHT="30" BORDER="0"
  SRC="img1.png"
  ALT="\begin{displaymath}
 Ax=b,
diff --git a/docs/src/Makefile b/docs/src/Makefile
new file mode 100644
index 00000000..93b19db5
--- /dev/null
+++ b/docs/src/Makefile
@@ -0,0 +1,386 @@
+## $Id: Makefile 1524 2007-01-17 17:06:06Z sfilippo $
+##---------------------------------------------------------------------------
+## LaTeX Makefile
+## Copyright (C) 1996-2001  Michael Forman	Michael.Forman@Colorado.EDU
+## 
+## This program is free software; you can redistribute it and/or
+## modify it under the terms of the GNU General Public License
+## as published by the Free Software Foundation; either version 2
+## of the License, or (at your option) any later version.
+## 
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+##
+## This copyright applies to this Makefile, and all perl scripts.  
+## The GPL does NOT apply to the actual content of the paper or thesis.  
+##---------------------------------------------------------------------------
+## 01.Dec,1996	forman	 Initial Makefile 
+## 01.Jun,1997	forman	 Added support for print, preview, and bibtex
+## 01.Jan,1998	stones	 tmp and lib directories to reduce clutter
+## 05.Feb,1998	forman	 Added the following functions:
+##			 .PHONY to prevent errors
+##			 generic TEXFILE definition with following patsubst's
+##			 vpath definitions
+##			 gzip, gunzip, tar, ci
+##			 search for \bibliography in tex file
+##			 conditional ifeq omits bibtex if unnecessary
+##			 documentation added
+## 30.Nov,1998	forman	 Added TOPFILE and SECFILE definitions to support
+##        		 texfiles with "input" commands.
+## 23.Dec,1998	marshats Added RCSFILES and ci/co capabilities for multiple files
+##                       Now only runs makeindex if $(IDX) file exists.
+## 15.Mar,1999	forman	 Added 'make wc' for papers with word quotas.
+## 24.Aug,1999	forman	 Converted Makefile to use pdftex as the primary
+##			 compiler.  Now generates true pdf and ps files.
+## 09.Feb,2000	forman	 Simplified the documentation.  Removed direct
+##			 compilation of tex into DVI and PS.  Conversion
+##			 is done with PDF2PS now.
+## 08.Aug,2000	forman	 Added define for figures directories.
+## 18.Aug,2000	forman	 Removed the redundant code in the $(pdflatex) and
+##			 $(pdflatex-bibtex) definitions by splitting them up
+##			 into several smaller definitions which are called
+##			 from a single set of "if-then" statements.
+## 18.Aug,2000	forman	 Added support for glosstex and makeindex.
+## 19.Aug,2000	forman	 To save space, all files in tmp are now links.
+## 19.Aug,2000	forman	 Removed all postscript commands in favor of pdf.
+##---------------------------------------------------------------------------
+##
+## This Makefile expects the following directory structure:
+##
+##     	Makefile	This file
+##     	*.tex		Put latex files in root directory.
+##     	RCS/		Create an RCS directory for "ci" and "co".
+##	doc/		Document directory.  Includes gpl.txt.
+##     	lib/		Put all cls, sty, idx, gdf, and bib files in lib.
+##     	figures/	Put all figures in the figures directory.
+##     	tmp/		Never put anything in tmp -- it gets cleaned out.
+##
+##---------------------------------------------------------------------------
+##
+## Normal Usage:
+##	make 		Run pdflatex
+##	make clean 	Remove all files in tmp and the pdf link in root.
+##	make preview 	Preview the compiled file
+##	make ci 	Check in the RCSFILES
+##	make co 	Check out the RCSFILES
+##
+## Advanced Usage:
+##	make gzip	Recursively gzip all the files in the root directory
+##	make gunzip	Reverse the above process
+##	make tar 	Tar and gzip the working directory
+##	make wc		Count the number of words in your report
+##
+##---------------------------------------------------------------------------
+## WARNING:
+## 	If "make ci" fails, "make co" will nuke your files!
+## 	Don't put anything in tmp, "make clean" will delete everything!
+##---------------------------------------------------------------------------
+#
+
+TOPFILE   = userguide.tex
+HTMLFILE  = userhtml.tex
+SECFILE   = title.tex abstract.tex overview.tex distribution.tex \
+		building.tex background.tex gettingstarted.tex userinterface.tex \
+            errors.tex bibliography.tex license.tex 
+FIGDIR    = figures
+
+XPDFFLAGS = 
+ACROFLAGS = 				#-- macos
+#ACROFLAGS = -geometry 1234x1168+0+0	#-- new-sydney-wide
+#ACROFLAGS = -geometry 1000x1000+0+0	#-- sydney-wide
+#ACROFLAGS = -geometry 750x1000+0+0	#-- sydney
+#ACROFLAGS = -geometry 1200x1200+0+0	#-- home-wide
+#ACROFLAGS = -geometry 900x1200+0+0	#-- home
+
+##---------------------------------------------------------------------------
+## Change nothing below here (unless you're really really good).
+#----------------------------------------------------------------------------
+
+##
+## Accounting
+## (START, WDIR and DATE use "=", so each $(shell ...) runs again on every expansion.)
+START    = $(shell date)
+WDIR     = $(notdir $(shell pwd))
+DATE     = $(shell date +%Y-%m-%d)
+
+##
+## Programs
+## NOTE: FILTER is assigned twice below; the later "cat" assignment is the one in effect.
+LATEX    = latex
+LTX2HTML = latex2html
+PDFLATEX = pdflatex
+ACRO     = evince
+XPDF     = xpdf
+WC       = wc
+PDF2PS   = pdf2ps
+PDF2TEXT = pdftotext
+MAKEIDX  = makeindex
+GLOSSTEX = glosstex
+BIBTEX   = bibtex
+FILTER   = ../bin/texfilter 
+FILTER   = cat
+CLEANIDX = ../bin/cleanidx
+
+##
+## Files
+## (BASEFILE/BASEHTML strip ".tex"; the names derived from them append an extension.)
+TEXFILES = $(TOPFILE) $(SECFILE) $(HTMLFILE)
+RCSFILES = $(TEXFILES) Makefile
+
+BASEFILE = $(patsubst %.tex,%,$(TOPFILE))
+IDX      = $(join $(BASEFILE),.idx)
+PDF      = $(join $(BASEFILE),.pdf)
+PS       = $(join $(BASEFILE),.ps)
+GXS      = $(join $(BASEFILE),.gxs)
+GLX      = $(join $(BASEFILE),.glx)
+BASEHTML = $(patsubst %.tex,%,$(HTMLFILE))
+HTML     = $(join $(BASEHTML),.html)
+HTMLDIR  = ../html
+HTMLFLAGS = -noaddress
+
+FIGURES  = $(sort $(wildcard $(FIGDIR)/*))
+GLOFILES:= $(sort $(wildcard lib/*.gdf))
+GLOFILES:= $(patsubst lib/%,%,$(GLOFILES))
+BIBFILES:= $(shell grep ^.bibliography{ $(TOPFILE)|sed "s/,/ /g"|sed "s/%.*//g")
+BIBFILES:= $(patsubst \bibliography{%,%,$(BIBFILES))
+BIBFILES:= $(patsubst %},%,$(BIBFILES))
+BIBFILES:= $(sort $(addsuffix .bib,$(BIBFILES)))
+
+LIBFILES = $(sort $(wildcard lib/*))
+# CURDIR is set by make itself; PWD comes from the environment and is stale under "make -C".
+TEXLNFIL = $(addprefix $(CURDIR)/,$(TEXFILES))
+
+#============================================================================
+
+all: pdf html
+	
+pdf: $(PDF)
+html: $(HTML)
+
+$(PDF): $(TEXFILES) $(LIBFILES) $(FIGURES) Makefile
+	$(header)
+	$(initialize)
+	$(pdflatex-filter)
+        ifneq ($(BIBFILES),)
+	  $(bibtex)
+	  $(pdflatex-filter)
+        endif
+        ifneq ($(GLOFILES),)
+	  $(glosstex)
+	  $(pdflatex-filter)
+        endif
+#	$(makeindex)
+	$(pdflatex-filter)
+	$(finish)
+
+$(HTML): $(TEXFILES) $(LIBFILES) $(FIGURES) Makefile
+	$(header)
+	$(initialize)
+	$(latex-filter)
+        ifneq ($(BIBFILES),)
+	  $(bibtex)
+	  $(latex-filter)
+        endif
+        ifneq ($(GLOFILES),)
+	  $(glosstex)
+	  $(latex-filter)
+        endif
+#	$(makeindex)
+	$(latex-filter)
+	$(ltx2html-filter)
+
+$(PS): $(PDF)
+	$(PDF2PS) $< $(PS)
+
+ps: $(PS) 
+
+#============================================================================
+# Targets that do not name real files. tmp/* is removed by path so a missing tmp/ cannot hit ./
+.PHONY: all pdf html ps clean clean-tmp preview xpreview wc print gzip gunzip tar ci co info
+
+clean:
+	rm -f $(PDF) $(PS)
+	rm -f tmp/*
+
+clean-tmp:
+	rm -f tmp/*
+
+#----------------------------------------------------------------------------
+
+preview: $(PDF)
+	$(ACRO) $(ACROFLAGS) $< &
+
+xpreview: $(PDF)
+	$(XPDF) $(XPDFFLAGS) $< &
+
+wc: $(PDF)
+	$(PDF2TEXT) $< | $(WC) 
+
+#----------------------------------------------------------------------------
+# Run RCS ci/co over every file in RCSFILES directly in the recipe shell (no $(shell ...)).
+ci: $(RCSFILES)
+	for i in $(RCSFILES) ; do ci -u $$i ; done
+
+co: $(RCSFILES)
+	for i in $(RCSFILES) ; do co -l $$i ; done
+# Archive helpers. The tar steps are chained with && so a failed step stops the rest.
+gzip: clean-tmp
+	gzip -r ./
+	gunzip Makefile.gz
+
+gunzip: 
+	gunzip -r ./
+
+tar: clean-tmp
+	cd ..                               && \
+	tar cvf $(WDIR)-$(DATE).tar $(WDIR) && \
+	gzip $(WDIR)-$(DATE).tar
+
+info: 
+	$(header)
+
+#============================================================================
+# header: banner echoing the main settings. GV and LPR are not defined here and print empty.
+define header
+  @echo
+  @echo "#---------------------------------------------------------------------"
+  @echo "MAKEFILE = LaTeX PDF Makefile"
+  @echo "AUTHOR   = Alfredo Buttari"
+  @echo 'ID       = $$Id: Makefile 1524 2007-01-17 17:06:06Z sfilippo $$ ' 
+  @echo "#---------------------------------------------------------------------"
+  @echo
+  @echo "ACRO     = $(ACRO) $(ACROFLAGS) $(PDF)"
+  @echo "XPDF     = $(XPDF) $(XPDFFLAGS) $(PDF)"
+  @echo "GV       = $(GV) $(GVFLAGS) $(PS)"
+  @echo "LPR      = $(LPR) $(LPRFLAGS) $(PS)"
+  @echo
+  @echo "WDIR     = $(WDIR)"
+  @echo "DATE     = $(DATE)"
+  @echo
+  @echo "TOPFILE  = $(TOPFILE)"
+  @echo "SECFILE  = $(SECFILE)"
+  @echo "TEXFILES = $(TEXFILES)"
+  @echo "PDF      = $(PDF)"
+  @echo "PS       = $(PS)"
+  @echo "BIBFILES = $(BIBFILES)"
+  @echo "GLOFILES = $(GLOFILES)"
+  @echo "IDX      = $(IDX)"
+  @echo
+endef
+# @echo "FIGURES  = $(FIGURES)"
+
+#----------------------------------------------------------------------------
+# initialize: create tmp/ if missing, then symlink the TeX sources, the contents of lib/
+# and the figures directory into it. CURDIR (set by make) replaces the environment's PWD.
+define initialize
+  @if test ! -d tmp; then mkdir tmp; fi
+  @ln -sf $(TEXLNFIL) tmp
+  @ln -sf $(CURDIR)/lib/* tmp
+  @ln -sf $(CURDIR)/$(FIGDIR) tmp
+endef
+
+#----------------------------------------------------------------------------
+
+define pdflatex
+  @echo
+  @echo "----- pdflatex -------------------------------------------------------"
+  @echo -n "Starting: "; date
+  @echo
+  cd tmp; $(PDFLATEX) $(TOPFILE)
+endef
+
+define latex
+  @echo
+  @echo "----- latex -------------------------------------------------------"
+  @echo -n "Starting: "; date
+  @echo
+  cd tmp; $(LATEX) $(HTMLFILE)
+endef
+
+#----------------------------------------------------------------------------
+# Filtered runs: each tool executes inside tmp/ with stdout+stderr piped through $(FILTER).
+define pdflatex-filter
+  @echo
+  @echo "----- pdflatex -------------------------------------------------------"
+  @echo -n "Starting: "; date
+  @echo
+  cd tmp; ($(PDFLATEX) $(TOPFILE) 2>&1) | $(FILTER)
+endef
+
+define latex-filter
+  @echo
+  @echo "----- latex -------------------------------------------------------"
+  @echo -n "Starting: "; date
+  @echo
+  cd tmp; ($(LATEX) $(HTMLFILE) 2>&1) | $(FILTER)
+endef
+
+define ltx2html-filter
+  @echo
+  @echo "----- latex2html -----------------------------------------------------"
+  @echo -n "Starting: "; date
+  @echo
+  cd tmp; ($(LTX2HTML) $(HTMLFLAGS) -dir ../$(HTMLDIR) $(HTMLFILE)  2>&1) | $(FILTER)
+endef
+
+#----------------------------------------------------------------------------
+
+define bibtex
+  @echo
+  @echo "----- bibtex ---------------------------------------------------------"
+  @echo -n "Starting: "; date
+  @echo
+  cd tmp; $(BIBTEX) $(BASEFILE)
+endef
+
+#----------------------------------------------------------------------------
+
+define glosstex
+  @echo
+  @echo "----- glosstex -------------------------------------------------------"
+  @echo -n "Starting: "; date
+  @echo
+  cd tmp; $(GLOSSTEX) $(BASEFILE) $(GLOFILES)
+  cd tmp; $(MAKEIDX) $(GXS) -o $(GLX) -s glosstex.ist
+endef
+
+#----------------------------------------------------------------------------
+
+define makeindex
+  @echo
+  @echo "----- makeindex ------------------------------------------------------"
+  @echo -n "Starting: "; date
+  @echo
+  cd tmp; mv $(IDX) $(IDX)-; $(CLEANIDX) < $(IDX)- > $(IDX)
+  cd tmp; $(MAKEIDX) $(IDX)
+endef
+
+#----------------------------------------------------------------------------
+
+define finish
+  @ln -sf tmp/$@ .
+  @echo
+  @echo "----- finish ---------------------------------------------------------"
+  @echo -n "Start:  "$(START); echo
+  @echo -n "Finish: "; date
+  @echo -n "Output: "; ls -l -o tmp/$@
+  @echo -n "Target: "
+endef
+
+define ltx2html-finish
+  @ln -sf tmp/$@ .
+  @echo
+  @echo "----- finish ---------------------------------------------------------"
+  @echo -n "Start:  "$(START); echo
+  @echo -n "Finish: "; date
+  @echo -n "Output: "; ls -l -o tmp/$@
+  @echo -n "Target: "
+endef
+
+
diff --git a/docs/src/abstract.tex b/docs/src/abstract.tex
new file mode 100644
index 00000000..45856568
--- /dev/null
+++ b/docs/src/abstract.tex
@@ -0,0 +1,27 @@
+\section*{Abstract}
+\addcontentsline{toc}{section}{Abstract}  
+\textsc{MLD2P4 (Multi-Level Domain Decomposition Parallel Preconditioners Package based on
+PSBLAS)} is a package of parallel algebraic multi-level preconditioners.
+It implements various versions of one-level additive and of multi-level additive
+and hybrid Schwarz algorithms. In the multi-level case, a purely algebraic approach
+is applied to generate coarse-level corrections, so that no geometric background is needed
+concerning the matrix to be preconditioned. The matrix is required to be square, real
+or complex, with a symmetric sparsity pattern. 
+
+MLD2P4 has been designed to provide scalable and easy-to-use preconditioners in the
+context of the PSBLAS (Parallel Sparse Basic Linear Algebra Subprograms)
+computational framework and can be used in conjunction with the Krylov solvers
+available in this framework. MLD2P4 enables the user to easily specify different aspects
+of a generic algebraic multilevel Schwarz preconditioner, thus making it possible to search
+for the ``best'' preconditioner for the problem at hand. 
+
+The package has been designed  employing object-oriented techniques,
+using Fortran 95, with interfaces to additional third party libraries
+such as UMFPACK, SuperLU and SuperLU\_Dist, that
+can be exploited in building multi-level preconditioners. The parallel
+implementation is based on a Single Program Multiple Data (SPMD)
+paradigm for distributed-memory architectures; the inter-process data
+communication is based on MPI and is managed mainly through PSBLAS.
+
+This guide provides a brief description of the functionalities and
+the user interface of MLD2P4.
diff --git a/docs/src/background.tex b/docs/src/background.tex
new file mode 100644
index 00000000..52e7674b
--- /dev/null
+++ b/docs/src/background.tex
@@ -0,0 +1,348 @@
+\section{Multi-level Domain Decomposition Background\label{sec:background}}
+\markboth{\textsc{MLD2P4 User's and Reference Guide}}
+         {\textsc{\ref{sec:background} Multi-level Domain Decomposition Background}}
+
+\emph{Domain Decomposition} (DD) preconditioners, coupled with Krylov iterative
+solvers, are widely used in the parallel solution of large and sparse linear systems.
+These preconditioners are based on the divide and conquer technique: the matrix
+to be preconditioned is divided into submatrices, a ``local'' linear system
+involving each submatrix is (approximately) solved, and the local solutions are used
+to build a preconditioner for the whole original matrix. This process
+often corresponds to dividing a physical domain associated to the original matrix
+into subdomains, e.g. in a PDE discretization, to (approximately) solving the
+subproblems corresponding to the subdomains and to building an approximate
+solution of the original problem from the local solutions 
+\cite{Cai_Widlund_92,dd1_94,dd2_96}. 
+
+\emph{Additive Schwarz} preconditioners are DD preconditioners using overlapping
+submatrices, i.e.\ with some common rows, to couple the local information
+related to the submatrices (see, e.g., \cite{dd2_96}).
+The main motivation for choosing Additive Schwarz preconditioners is their
+intrinsic parallelism. A drawback of these
+preconditioners is that the number of iterations of the preconditioned solvers
+generally grows with the number of submatrices. This may be a serious limitation
+on parallel computers, since the number of submatrices usually matches the number
+of available processors. Optimal convergence rates, i.e.\ iteration numbers
+independent of the number of submatrices, can be obtained by correcting the
+preconditioner through a suitable approximation of the original linear system
+in a coarse space, which globally couples the information related to the single
+submatrices. 
+
+\emph{Two-level Schwarz} preconditioners are obtained
+by combining basic (one-level) Sch\-warz preconditioners with a coarse-level
+correction. In this context, the one-level preconditioner is often
+called `smoother'. Different two-level preconditioners are obtained by varying the
+choice of the smoother and of the coarse-level correction, and the
+way they are combined \cite{dd2_96}. The same reasoning can be applied starting
+from the coarse-level system, i.e.\ a coarse-space correction can be built
+from this system, thus obtaining \emph{multi-level} preconditioners.
+
+It is worth noting that optimal preconditioners do not necessarily correspond
+to minimum execution times. Indeed, to obtain effective multi-level preconditioners
+a tradeoff between optimality of convergence and the cost of building and applying
+the coarse-space corrections must be achieved. The choice of the number of levels,
+i.e.\ of the coarse-space corrections, also affects the effectiveness of the
+preconditioners. One more goal is to get convergence rates as insensitive
+as possible to variations in the matrix coefficients.
+
+Two main approaches can be used to build coarse-space corrections. The geometric approach
+applies coarsening strategies based on the knowledge of some physical grid associated
+to the matrix and requires the user to define grid transfer operators from the fine
+to the coarse levels and vice versa. This may prove difficult for complex geometries;
+furthermore, suitable one-level preconditioners may be required to get efficient
+interplay between fine and coarse levels, e.g.\ when matrices with highly varying coefficients
+are considered. The algebraic approach builds coarse-space corrections using only matrix
+information. It performs a fully automatic coarsening and enforces the interplay between
+the fine and coarse levels by suitably choosing the coarse space and the coarse-to-fine
+interpolation \cite{StubenGMD69_99}.
+
+MLD2P4 uses a pure algebraic approach for building the sequence of coarse matrices
+starting from the original matrix. The algebraic approach is based on the \emph{smoothed 
+aggregation} algorithm \cite{BREZINA_VANEK,VANEK_MANDEL_BREZINA}. A decoupled version
+of this algorithm is implemented, where the smoothed aggregation is applied locally
+to each submatrix \cite{TUMINARO_TONG}. In the next two subsections we provide
+a brief description of the multi-level Schwarz preconditioners and of the smoothed
+aggregation technique as implemented in MLD2P4. For further details the user
+is referred to \cite{para_04,aaecc_07,apnum_07,dd2_96}.
+
+
+\subsection{Multi-level Schwarz Preconditioners\label{sec:multilevel}}
+
+The Multilevel preconditioners implemented in MLD2P4 are obtained by combining
+AS preconditioners with coarse-space corrections; therefore
+we first provide a sketch of the AS preconditioners.
+
+Given the linear system \Ref{system1},
+where $A=(a_{ij}) \in \Re^{n \times n}$ is a
+nonsingular sparse matrix with a symmetric nonzero pattern,
+let $G=(W,E)$ be the adjacency graph of $A$, where $W=\{1, 2, \ldots, n\}$
+and $E=\{(i,j) : a_{ij} \neq 0\}$ are the vertex set and the edge set of $G$,
+respectively. Two vertices are called adjacent if there is an edge connecting
+them. For any integer $\delta > 0$, a $\delta$-overlap
+partition of $W$ can be defined recursively as follows.
+Given a 0-overlap (or non-overlapping) partition of $W$,
+i.e.\ a set of $m$ disjoint nonempty sets $W_i^0 \subset W$ such that
+$\cup_{i=1}^m W_i^0 = W$, a $\delta$-overlap
+partition of $W$ is obtained by considering the sets
+$W_i^\delta \supset W_i^{\delta-1}$ obtained by including the vertices that
+are adjacent to any vertex in $W_i^{\delta-1}$.
+
+Let $n_i^\delta$ be the size of $W_i^\delta$ and $R_i^{\delta} \in 
+\Re^{n_i^\delta \times n}$ the restriction operator that maps
+a vector $v \in \Re^n$ onto the vector $v_i^{\delta} \in \Re^{n_i^\delta}$
+containing the components of $v$ corresponding to the vertices in
+$W_i^\delta$. The transpose of $R_i^{\delta}$ is a
+prolongation operator from $\Re^{n_i^\delta}$ to $\Re^n$.
+The matrix $A_i^\delta=R_i^\delta A (R_i^\delta)^T \in
+\Re^{n_i^\delta \times n_i^\delta}$ can be considered
+as a restriction of $A$ corresponding to the set $W_i^{\delta}$.
+
+The \emph{classical one-level AS} preconditioner is defined by
+\[
+M_{AS}^{-1}= \sum_{i=1}^m (R_i^{\delta})^T 
+(A_i^\delta)^{-1} R_i^{\delta},
+\]
+where $A_i^\delta$ is assumed to be nonsingular. Its application
+to a vector $v \in \Re^n$ within a Krylov solver requires the following
+three steps:
+\begin{enumerate}
+	\item restriction of $v$ as $v_i = R_i^{\delta} v$, $i=1,\ldots,m$;
+	\item solution of the linear systems $A_i^\delta w_i = v_i$,
+	      $i=1,\ldots,m$;
+	\item prolongation and sum of the $w_i$'s, i.e. $w = \sum_{i=1}^m (R_i^{\delta})^T w_i$.
+\end{enumerate}
+Note that the linear systems at step 2 are usually solved approximately,
+e.g.\ using incomplete LU factorizations such as ILU($p$), MILU($p$) and
+ILU($p,t$) \cite[Chapter 10]{Saad_book}.
+
+A variant of the classical AS preconditioner that outperforms it
+in terms of convergence rate and of computation and communication
+time on parallel distributed-memory computers is the so-called \emph{Restricted AS
+(RAS)} preconditioner~\cite{CAI_SARKIS,EFSTATHIOU}. It
+is obtained by zeroing the components of $w_i$ corresponding to the
+overlapping vertices when applying the prolongation. Therefore,
+RAS differs from classical AS by the prolongation operators,
+which are substituted by $(\tilde{R}_i^0)^T \in \Re^{n \times n_i^\delta}$,
+where $\tilde{R}_i^0$ is obtained by zeroing the rows of $R_i^\delta$
+corresponding to the vertices in $W_i^\delta \backslash W_i^0$:
+\[
+M_{RAS}^{-1}= \sum_{i=1}^m (\tilde{R}_i^0)^T 
+(A_i^\delta)^{-1} R_i^{\delta}.
+\]
+Analogously, the AS variant called \emph{AS with Harmonic extension (ASH)}
+is defined by
+\[ M_{ASH}^{-1}= \sum_{i=1}^m (R_i^{\delta})^T 
+(A_i^\delta)^{-1} \tilde{R}_i^0.
+\]
+We note that for $\delta=0$ the three variants of the AS preconditioner are
+all equal to the block-Jacobi preconditioner.
+
+As already observed, the convergence rate of the one-level Schwarz
+preconditioned iterative solvers deteriorates as the number $m$ of partitions
+of $W$ increases \cite{dd1_94,dd2_96}. To reduce the dependency
+of the number of iterations on the degree of parallelism we may
+introduce a global coupling among the overlapping partitions by defining 
+a coarse-space approximation $A_C$ of the matrix $A$. 
+In a pure algebraic setting, $A_C$ is usually built with
+a Galerkin approach. Given a set $W_C$ of \emph{coarse vertices},
+with size $n_C$, and a suitable restriction operator
+$R_C \in \Re^{n_C \times n}$, $A_C$ is defined as
+\[
+A_C=R_C A R_C^T
+\]
+and the coarse-level correction matrix to be combined with a generic
+one-level AS preconditioner $M_{1L}$ is obtained as
+\[
+M_{C}^{-1}= R_C^T A_C^{-1} R_C,
+\]
+where $A_C$ is assumed to be nonsingular. The application of $M_{C}^{-1}$
+to a vector $v$ corresponds to a restriction, a solution and
+a prolongation step; the solution step, involving the matrix $A_C$,
+may be carried out also approximately.
+
+The combination of $M_{C}$ and $M_{1L}$ may be
+performed in either an additive or a multiplicative framework.
+In the former case, the \emph{two-level additive} Schwarz preconditioner
+is obtained:
+\[
+M_{2LA}^{-1} = M_{C}^{-1} + M_{1L}^{-1}. 
+\]
+Applying $M_{2LA}^{-1}$ to a vector $v$ within a Krylov solver
+corresponds to applying $M_{C}^{-1}$
+and $M_{1L}^{-1}$ to $v$ independently and then summing up
+the results.
+
+In the multiplicative case, the combination can be
+performed by first applying the smoother $M_{1L}^{-1}$ and then
+the coarse-level correction operator $M_{C}^{-1}$:
+\[
+\begin{array}{l}
+w = M_{1L}^{-1} v, \\
+z = w + M_{C}^{-1} (v-Aw);
+\end{array}
+\]
+this corresponds to the following \emph{two-level hybrid pre-smoothed}
+Schwarz preconditioner:
+\[
+M_{2LH-PRE}^{-1} = M_{C}^{-1} + \left( I - M_{C}^{-1}A \right) M_{1L}^{-1}. 
+\]
+On the other hand, by applying the smoother after the coarse-level correction,
+i.e.\ by computing
+\[
+\begin{array}{l}
+w = M_{C}^{-1} v , \\
+z = w + M_{1L}^{-1} (v-Aw) , 
+\end{array}
+\]
+the \emph{two-level hybrid post-smoothed}
+Schwarz preconditioner is obtained:
+\[
+M_{2LH-POST}^{-1} = M_{1L}^{-1} + \left( I - M_{1L}^{-1}A \right) M_{C}^{-1}. 
+\]
+One more variant of two-level hybrid preconditioner is obtained by applying
+the smoother before and after the coarse-level correction. In this case, the
+preconditioner is symmetric if $A$, $M_{1L}$ and $M_{C}$ are symmetric.
+
+As previously noted, on parallel computers the number of submatrices usually matches
+the number of available processors. When the size of the system to be preconditioned
+is very large, the use of many processors, i.e.\ of many small submatrices, often
+leads to a large coarse-level system, whose solution may be computationally expensive.
+On the other hand, the use of few processors often leads to local submatrices that
+are too expensive to be processed on single processors, because of memory and/or
+computing requirements. Therefore, it seems natural to use a recursive approach,
+in which the coarse-level correction is re-applied starting from the current
+coarse-level system. The corresponding preconditioners, called \emph{multi-level}
+preconditioners, can significantly reduce the computational cost of preconditioning
+with respect to the two-level case (see \cite[Chapter 3]{dd2_96}). 
+Additive and hybrid multilevel preconditioners
+are obtained as direct extensions of the two-level counterparts.
+For a detailed description of them, the reader is
+referred to \cite[Chapter 3]{dd2_96}.
+The algorithm for the application of a multi-level hybrid 
+post-smoothed preconditioner $M$ to a vector $v$, i.e.\ for the
+computation of $w=M^{-1}v$, is reported, for
+example, in Figure~\ref{fig:mlhpost_alg}. Here the number of levels
+is denoted by $nlev$ and the levels are numbered in increasing order starting
+from the finest one, i.e.\ the finest level is level 1; the coarse matrix
+and the corresponding basic preconditioner at each level $l$ are denoted by $A_l$ and
+$M_l$, respectively, with $A_1=A$.
+% 
+\begin{figure}[t]
+\begin{center}
+\framebox{
+\begin{minipage}{.85\textwidth} {\small
+\begin{tabbing}
+\quad \=\quad \=\quad \=\quad \\[-1mm]
+%
+%! assign the finest matrix\\
+%$A_1 \leftarrow A$;\\[1mm]
+%! define the number of levels $nlev$ \\[1mm]
+%! define $nlev-1$ prolongators\\
+%$R_l^T, l=2, \ldots, nlev$;\\[1mm]
+%! define $nlev-1$ coarser matrices\\
+%$A_l \leftarrow R_lA_{l-1}R_l^T, \; l=2, \ldots, nlev$;\\[1mm]
+%! define the $nlev-1$ basic Schwarz preconditioners\\
+%$M_l$, basic preconditioner for $A_l \; l=1, \ldots, nlev-1$;\\[1mm]
+%$! assign a vector $v$\\
+%
+$v_1 = v$; \\[2mm]
+\textbf{for $l=2, nlev$ do}\\[1mm]
+\> ! transfer $v_{l-1}$ to the next coarser level\\
+\>  $v_l = R_lv_{l-1}$ \\[1mm]
+\textbf{endfor} \\[2mm]
+! apply the coarsest-level correction\\[1mm]
+$y_{nlev} = A_{nlev}^{-1} v_{nlev}$\\[2mm]
+\textbf{for $l=nlev -1 , 1, -1$ do}\\[1mm]
+\> ! transfer $y_{l+1}$ to the next finer level\\
+\> $y_l = R_{l+1}^T y_{l+1}$;\\[1mm]
+\> ! compute the residual at the current level\\
+\> $r_l = v_l-A_l y_l$;\\[1mm]
+\> ! apply the basic Schwarz preconditioner to the residual\\
+\> $r_l = M_l^{-1} r_l$\\[1mm]
+\> ! update $y_l$\\
+\> $y_l = y_l+r_l$\\
+\textbf{endfor} \\[1mm]
+$w = y_1$;
+\end{tabbing}
+}
+\end{minipage}
+}
+\caption{Application of the multi-level hybrid post-smoothed preconditioner.\label{fig:mlhpost_alg}}
+\end{center}
+\end{figure}
+%
+
+
+\subsection{Smoothed Aggregation\label{sec:aggregation}}
+
+In order to define the restriction operator $R_C$, which is used to compute
+the coarse-level matrix $A_C$, MLD2P4 uses the \emph{smoothed aggregation}
+algorithm described in \cite{BREZINA_VANEK,VANEK_MANDEL_BREZINA}.
+The basic idea of this algorithm is to build a coarse set of vertices
+$W_C$ by suitably grouping the vertices of $W$ into disjoint subsets
+(aggregates), and to define the coarse-to-fine space transfer operator $R_C^T$ by
+applying a suitable smoother to a simple piecewise constant
+prolongation operator, to improve the quality of the coarse-space correction.
+
+Three main steps can be identified in the smoothed aggregation procedure:
+\begin{enumerate}
+	\item coarsening of the vertex set $W$, to obtain $W_C$;
+	\item construction of the prolongator $R_C^T$;
+	\item application of $R_C$ and $R_C^T$ to build $A_C$.
+\end{enumerate}
+%\textbf{NOTA: Controllare cosa fa trilinos dopo il primo passo.}
+ 
+To perform the coarsening step, we have implemented the aggregation algorithm sketched
+in \cite{apnum_07}. According to \cite{VANEK_MANDEL_BREZINA}, a modification of
+this algorithm has been actually considered,
+in which each aggregate $N_r$ is made of vertices of $W$ that are \emph{strongly coupled}
+to a certain root vertex $r \in W$, i.e.\
+\[  N_r = \left\{s \in W: |a_{rs}| > \theta \sqrt{|a_{rr}a_{ss}|} \right\}
+    \cup \left\{ r \right\} ,
+\]
+for a given $\theta \in [0,1]$.
+Since this algorithm has a sequential nature, a \emph{decoupled} version of
+it has been chosen, where each processor $i$ independently applies the algorithm to
+the set of vertices $W_i^0$ assigned to it in the initial data distribution. This
+version is embarrassingly parallel, since it does not require any data communication.
+On the other hand, it may produce non-uniform aggregates near boundary vertices,
+i.e.\ near vertices adjacent to vertices in other processors, and is strongly
+dependent on the number of processors and on the initial partitioning of the matrix $A$.
+Nevertheless, this algorithm has been chosen for the implementation in MLD2P4,
+since it has been shown to produce good results in practice
+\cite{aaecc_07,apnum_07,TUMINARO_TONG}.
+
+The prolongator $P_C=R_C^T$ is built starting from a \emph{tentative prolongator}
+$P \in \Re^{n \times n_C}$, defined as
+\begin{equation} 
+P=(p_{ij}), \quad  p_{ij}= 
+\left\{ \begin{array}{ll}
+1 & \quad \mbox{if} \; i \in V^j_C \\
+0 & \quad \mbox{otherwise}
+\end{array} \right. .
+\label{eq:tent_prol}
+\end{equation}
+$P_C$ is obtained by
+applying to $P$ a smoother $S \in \Re^{n \times n}$:
+\begin{equation}
+P_C = S P,
+\label{eq:smoothed_prol}
+\end{equation}
+in order to remove oscillatory components from the range of the prolongator
+and hence to improve the convergence properties of the multi-level
+Schwarz method \cite{BREZINA_VANEK,StubenGMD69_99}.
+A simple choice for $S$ is the damped Jacobi smoother:
+\begin{equation}
+S = I - \omega D^{-1} A , 
+\label{eq:jac_smoother}
+\end{equation}
+where the value of $\omega$ can be chosen
+using some estimate of the spectral radius of $D^{-1}A$ \cite{BREZINA_VANEK}.
+%
+%\textbf{NOTA: filtering di $A$ nello smoothing, da implementare?}
+%
+
+%%% Local Variables: 
+%%% mode: latex
+%%% TeX-master: "userguide"
+%%% End: 
diff --git a/docs/src/bibliography.tex b/docs/src/bibliography.tex
new file mode 100644
index 00000000..b68ae5e9
--- /dev/null
+++ b/docs/src/bibliography.tex
@@ -0,0 +1,207 @@
+%\section{Bibliography\label{sec:bib}}
+\begin{thebibliography}{99}
+\addcontentsline{toc}{section}{\refname}  
+\markboth{\textsc{MLD2P4 User's and Reference Guide}}
+         {\textsc{References}}
+
+%\let\refname\relax
+
+%
+%\bibitem{PARA04FOREST}
+%G.~Bella, S.~Filippone, A.~De Maio, A., Testa, M.: 
+%A Simulation Model for Forest Fires.
+%In: Dongarra, J., Madsen, K., Wasniewski, J. (eds.):
+%Proceedings of PARA~04 Workshop on State of the Art
+%in Scientific Computing. Lecture Notes in Computer Science, 3732. Berlin:
+%Springer, 2005
+%
+\bibitem{BREZINA_VANEK}
+M.~Brezina, P.~Van{\v e}k,
+{\em A Black-Box Iterative Solver Based on a Two-Level Schwarz Method},
+Computing, 63, 1999, 233--263.
+%
+\bibitem{para_04}
+A.~Buttari, P.~D'Ambra, D.~di Serafino, S.~Filippone,
+{\em Extending PSBLAS to Build Parallel Schwarz Preconditioners},
+in J.~Dongarra, K.~Madsen, J.~Wasniewski, editors,
+Proceedings of PARA~04 Workshop on State of the Art
+in Scientific Computing, Lecture Notes in Computer Science,
+Springer, 2005, 593--602.
+%
+\bibitem{aaecc_07} A.~Buttari, P.~D'Ambra, D.~di~Serafino, S.~Filippone,
+{\em 2LEV-D2P4: a package of high-performance preconditioners
+for scientific and engineering applications},
+Applicable Algebra in Engineering, Communications and Computing, 
+18, 3, 2007, 223--239.
+%Published online: 13 February 2007, {\tt http://dx.doi.org/10.1007/s00200-007-0035-z}
+%
+\bibitem{apnum_07}  P.~D'Ambra, S.~Filippone,  D.~di~Serafino,
+{\em On the Development of PSBLAS-based Parallel Two-level Schwarz Preconditioners},
+Applied Numerical Mathematics, Elsevier Science, 
+57, 11--12, 2007, 1181--1196.
+%published online 3 February 2007, {\tt
+%  http://dx.doi.org/10.1016/j.apnum.2007.01.006}
+
+%% \bibitem{DOUGLAS}
+%% R.E.~Bank and C.C.~Douglas,
+%% {\em SMMP: Sparse Matrix Multiplication Package}, 
+%% Advances in Computational Mathematics, 1993, 1, 127-137.
+%% (See also {\tt http://www.mgnet.org/~douglas/ccd-codes.html}) 
+%
+%
+%% \bibitem{CAI_SAAD}
+%% X.~C.~Cai and Y.~Saad,
+%% {\em Overlapping Domain Decomposition Algorithms for General Sparse Matrices},
+%% Numerical Linear Algebra with Applications, 3(3), pp.~221--237, 1996.
+%
+\bibitem{CAI_SARKIS}
+X.~C.~Cai, M.~Sarkis,
+{\em A Restricted Additive Schwarz Preconditioner for General Sparse Linear Systems},
+SIAM Journal on Scientific Computing, 21, 2, 1999, 792--797.
+%
+\bibitem{Cai_Widlund_92}
+X.~C.~Cai, O.~B.~Widlund,
+{\em Domain Decomposition Algorithms for Indefinite Elliptic Problems},
+SIAM Journal on Scientific and Statistical Computing, 13, 1, 1992, 243--258.
+%
+\bibitem{dd1_94}
+T.~Chan and T.~Mathew,
+{\em Domain Decomposition Algorithms},
+in A.~Iserles, editor, Acta Numerica 1994, 61--143.
+Cambridge University Press.
+% 
+\bibitem{UMFPACK}
+T.A.~Davis, 
+{\em Algorithm 832: UMFPACK - an Unsymmetric-pattern Multifrontal
+Method with a Column Pre-ordering Strategy},
+ACM Transactions on Mathematical Software, 30, 2004, 196--199.
+(See also {\tt http://www.cise.ufl.edu/~davis/})
+%
+\bibitem{SUPERLU}
+J.W.~Demmel, S.C.~Eisenstat, J.R.~Gilbert, X.S.~Li and J.W.H.~Liu,
+{\em A Supernodal Approach to Sparse Partial Pivoting},
+SIAM Journal on Matrix Analysis and Applications, 20, 3, 1999, 720--755.
+%
+\bibitem{blas3}
+J.~J.~Dongarra, J.~Du Croz, I.~S.~Duff, S.~Hammarling,
+\emph{A set of Level 3 Basic Linear Algebra Subprograms},
+ACM Transactions on Mathematical Software, 16, 1990, 1--17.
+%
+\bibitem{blas2}
+J.~J.~Dongarra, J.~Du Croz, S.~Hammarling, R.~J.~Hanson,
+\emph{An extended set of FORTRAN Basic Linear Algebra Subprograms},
+ACM Transactions on Mathematical Software, 14, 1988, 1--17.
+%
+\bibitem{BLACS}
+J.~J.~Dongarra and R.~C.~Whaley,
+{\em A User's Guide to the BLACS v.~1.1},
+Lapack Working Note 94, Tech.\ Rep.\ UT-CS-95-281, University of
+Tennessee, March 1995 (updated May 1997).
+%
+%\bibitem{sblas_97}
+%I.~Duff, M.~Marrone, G.~Radicati and C.~Vittoli,
+%{\em Level 3 Basic Linear Algebra Subprograms for Sparse Matrices: 
+%a User Level Interface},
+%ACM Transactions on Mathematical Software, 23(3), pp.~379--401, 1997.
+%
+%\bibitem{sblas_02}
+%I.~Duff, M.~Heroux and R.~Pozo,
+%{\em An Overview of the Sparse Basic Linear
+%Algebra Subprograms: the New Standard from the BLAS Technical Forum},
+%ACM Transactions on Mathematical Software, 28(2), pp.~239--267, 2002.
+%
+\bibitem{EFSTATHIOU} 
+E.~Efstathiou, M.~J.~Gander,
+{\em Why Restricted Additive Schwarz Converges Faster than Additive Schwarz},
+BIT Numerical Mathematics, 43, 2003, 945--959.
+%
+\bibitem{PSBLASGUIDE}
+S.~Filippone, A.~Buttari, 
+{\em PSBLAS-2.3 User's Guide. A Reference Guide for the Parallel Sparse BLAS Library}, 2008,
+available from \texttt{http://www.ce.uniroma2.it/psblas/}.
+%
+\bibitem{psblas_00}
+S.~Filippone, M.~Colajanni, 
+{\em PSBLAS: A Library for Parallel Linear Algebra
+Computation on Sparse Matrices},
+ACM Transactions on Mathematical Software, 26, 4, 2000, 527--550.
+%
+\bibitem{MPI2}
+W.~Gropp, S.~Huss-Lederman, A.~Lumsdaine, E.~Lusk, B.~Nitzberg, W.~Saphir, M.~Snir, 
+{\em MPI: The Complete Reference. Volume 2 - The MPI-2 Extensions},
+MIT Press, 1998.
+%
+\bibitem{blas1}
+C.~L.~Lawson, R.~J.~Hanson, D.~Kincaid, F.~T.~Krogh,
+\emph{Basic Linear Algebra Subprograms for FORTRAN usage},
+ACM Transactions on Mathematical Software, 5, 1979, 308--323.
+%
+\bibitem{SUPERLUDIST}
+X.~S.~Li, J.~W.~Demmel, {\em SuperLU\_DIST: A Scalable Distributed-memory
+Sparse Direct Solver for Unsymmetric Linear Systems},
+ACM Transactions on Mathematical Software, 29, 2, 2003, 110--140.
+%
+%\bibitem{KIVA3PSBLAS}
+%S.~Filippone, P.~D'Ambra, M.~Colajanni,
+%{\em Using a Parallel Library of Sparse Linear Algebra in a Fluid Dynamics 
+%Applications Code on Linux Clusters},
+%in G.~Joubert, A.~Murli, F.~Peters, M.~Vanneschi, editors,
+%Parallel Computing - Advances \& Current Issues,
+%pp.~441--448, Imperial College Press, 2002. 
+%
+%\bibitem{METIS}
+%Karypis, G. and Kumar, V.,
+%{\em {METIS}: Unstructured Graph Partitioning and Sparse Matrix
+%  Ordering System}.
+%Minneapolis, MN 55455: University of Minnesota, Department of
+%  Computer Science, 1995. 
+%Internet Address: {\verb|http://www.cs.umn.edu/~karypis|}.
+%\bibitem{BLAS1}
+%Lawson, C.,  Hanson, R., Kincaid, D. and Krogh, F.,
+%   Basic {L}inear {A}lgebra {S}ubprograms for {F}ortran usage,
+%{ACM Trans. Math. Softw.} vol.~{5}, 38--329, 1979.
+%
+%\bibitem{machiels}
+%{Machiels, L. and Deville, M.}
+%{\em Fortran 90: An entry to object-oriented programming for the solution
+%  of partial differential equations.}
+%{ACM Trans. Math. Softw.} vol.~{23}, 32--49.
+%\bibitem{metcalf}
+%{Metcalf, M., Reid, J. and Cohen, M.}
+%{\em Fortran 95/2003 explained.}
+%{Oxford University Press}, 2004.
+%
+\bibitem{Saad_book}
+Y.~Saad,
+\emph{Iterative methods for sparse linear systems}, 2nd edition,
+SIAM, 2003.
+
+\bibitem{dd2_96}
+B.~Smith, P.~Bjorstad, W.~Gropp,
+{\em Domain Decomposition: Parallel Multilevel Methods for Elliptic
+Partial Differential Equations},
+Cambridge University Press, 1996.
+%
+\bibitem{MPI1}
+M.~Snir, S.~Otto, S.~Huss-Lederman, D.~Walker, J.~Dongarra,
+{\em MPI: The Complete Reference. Volume 1 - The MPI Core}, second edition,
+MIT Press, 1998.
+%%
+\bibitem{StubenGMD69_99}
+K.~St\"{u}ben,
+{\em Algebraic Multigrid (AMG): an Introduction with Applications},
+in A.~Sch\"{u}ller, U.~Trottenberg, C.~Oosterlee, editors, Multigrid,
+Academic Press, 2000.
+%
+\bibitem{TUMINARO_TONG}
+R.~S.~Tuminaro, C.~Tong,
+{\em Parallel Smoothed Aggregation Multigrid: Aggregation Strategies on Massively Parallel Machines},
+in J.~Donnelley, editor, Proceedings of SuperComputing 2000, Dallas, 2000.
+%
+\bibitem{VANEK_MANDEL_BREZINA}
+P.~Van{\v e}k, J.~Mandel and M.~Brezina,
+{\em Algebraic Multigrid by Smoothed Aggregation for Second and Fourth Order Elliptic Problems},
+Computing, 56, 1996, 179--196.
+%
+
+\end{thebibliography}
diff --git a/docs/src/building.tex b/docs/src/building.tex
new file mode 100644
index 00000000..7c1d699e
--- /dev/null
+++ b/docs/src/building.tex
@@ -0,0 +1,242 @@
+\section{Configuring and Building MLD2P4\label{sec:building}}
+\markboth{\textsc{MLD2P4 User's and Reference Guide}}
+         {\textsc{\ref{sec:building} Configuring and Building MLD2P4}}
+To build MLD2P4 it is necessary to set up a Makefile with appropriate
+values for your system; this is done by means of the \verb|configure|
+script. The distribution also includes the autoconf and automake
+sources employed to generate the script, but usually this is not needed
+to build the software. 
+
+MLD2P4 is implemented almost entirely in Fortran~95, with some
+interfaces to external libraries in C; the Fortran compiler
+must support the Fortran~95 standard plus the extension TR15581, which
+enhances the usability of \verb|ALLOCATABLE| variables. Most modern
+Fortran compilers support this language level. In particular, this is
+supported by the GNU Fortran compiler as of version 4.2.0; however, we
+recommend using the latest available release (4.3.1 at the time of
+this writing).
+The software defines data types and interfaces for
+real and complex data, in both single and double precision. 
+
+\subsection{Prerequisites}
+
+The following base libraries are needed: 
+\begin{description}
+\item[BLAS] \cite{blas3,blas2,blas1} Many vendors provide optimized versions
+  of the Basic Linear Algebra Subprograms; if no vendor version is
+  available for a given platform, the ATLAS software
+  (\verb!http://math-atlas.sourceforge.net/!)
+  may be employed.  The reference BLAS from Netlib
+  (\verb|http://www.netlib.org/blas|) are meant to define the standard
+  behaviour of the BLAS interface, so they are not optimized for any
+  particular platform, and should only be used as a last
+  resort. Note that BLAS computations form a relatively small part of
+  the MLD2P4/PSBLAS computations; they are however critical when using
+  preconditioners based on the UMFPACK or SuperLU third party
+  libraries.  
+\item[MPI] \cite{MPI2,MPI1} A version of MPI is available on most
+  high-performance computing systems; only version 1.1 is required.
+\item[BLACS] \cite{BLACS} The Basic Linear Algebra Communication Subprograms
+  are available in source form from \verb|http://www.netlib.org/blacs|;
+  some vendors  include them in their parallel computing
+  support libraries.
+ \item[PSBLAS] \cite{PSBLASGUIDE,psblas_00} Parallel Sparse BLAS is
+  available from \\ \verb|http://www.ce.uniroma2.it/psblas|; version 2.3
+  (or later) is required. Indeed, all the prerequisites
+  listed so far are also prerequisites of PSBLAS.
+  To build the MLD2P4 library it is necessary to get access to
+  the source PSBLAS directory employed to build the version under use; after
+  the MLD2P4 build process completes, only the compiled form of the
+  PSBLAS library is necessary to build user applications.
+\end{description}
+
+Please note that the four previous libraries must have Fortran
+interfaces compatible with MLD2P4;
+usually this means that they should all be built with the same
+compiler as MLD2P4.
+
+\subsection{Optional third party libraries}
+
+We provide interfaces to the following third-party software libraries;
+note that these are optional, but if you enable them some defaults
+for multilevel preconditioners may change to reflect their presence. 
+
+\begin{description}
+\item[UMFPACK] \cite{UMFPACK}
+  A sparse direct factorization package available from \\ 
+  \verb|http://www.cise.ufl.edu/research/sparse/umfpack/|; 
+  provides serial factorization and triangular system solution for double
+  precision real and complex data. We have tested
+  versions 4.4 and 5.1. 
+\item[SuperLU] \cite{SUPERLU}
+  A sparse direct factorization package available from \\
+  \verb|http://crd.lbl.gov/~xiaoye/SuperLU/|; provides serial
+  factorization and triangular system solution for single and double precision,
+  real and complex data. We have tested versions 3.0 and 3.1.
+\item[SuperLU\_Dist] \cite{SUPERLUDIST}
+  A sparse direct factorization package available
+  from the same site as SuperLU; provides parallel factorization and
+  triangular system solution for double precision real and complex data.
+  We have tested version 2.1.
+\end{description}
+
+\subsection{Configuration options}
+
+To build  MLD2P4 the first step is to use the \verb|configure| script
+in the main directory to generate the necessary makefile(s). 
+
+As a minimal example consider the following:
+\begin{verbatim}
+./configure --with-psblas=/home/user/PSBLAS/psblas-2.3
+\end{verbatim}
+which assumes that the various MPI compilers and support libraries are
+available in the standard directories on the system, and specifies
+only the PSBLAS build directory (note that the latter directory must
+be specified with an {\em absolute} path).
+The full set of options may be looked at by issuing the command
+\verb|./configure --help|, which produces:
+\begin{verbatim}
+`configure' configures MLD2P4 1.0 to adapt to many kinds of systems.
+
+Usage: ./configure [OPTION]... [VAR=VALUE]...
+
+To assign environment variables (e.g., CC, CFLAGS...), specify them as
+VAR=VALUE.  See below for descriptions of some of the useful variables.
+
+Defaults for the options are specified in brackets.
+
+Configuration:
+  -h, --help              display this help and exit
+      --help=short        display options specific to this package
+      --help=recursive    display the short help of all the included packages
+  -V, --version           display version information and exit
+  -q, --quiet, --silent   do not print `checking...' messages
+      --cache-file=FILE   cache test results in FILE [disabled]
+  -C, --config-cache      alias for `--cache-file=config.cache'
+  -n, --no-create         do not create output files
+      --srcdir=DIR        find the sources in DIR [configure dir or `..']
+
+Installation directories:
+  --prefix=PREFIX         install architecture-independent files in PREFIX
+			  [/usr/local]
+  --exec-prefix=EPREFIX   install architecture-dependent files in EPREFIX
+			  [PREFIX]
+
+By default, `make install' will install all the files in
+`/usr/local/bin', `/usr/local/lib' etc.  You can specify
+an installation prefix other than `/usr/local' using `--prefix',
+for instance `--prefix=$HOME'.
+
+For better control, use the options below.
+
+Fine tuning of the installation directories:
+  --bindir=DIR           user executables [EPREFIX/bin]
+  --sbindir=DIR          system admin executables [EPREFIX/sbin]
+  --libexecdir=DIR       program executables [EPREFIX/libexec]
+  --sysconfdir=DIR       read-only single-machine data [PREFIX/etc]
+  --sharedstatedir=DIR   modifiable architecture-independent data [PREFIX/com]
+  --localstatedir=DIR    modifiable single-machine data [PREFIX/var]
+  --libdir=DIR           object code libraries [EPREFIX/lib]
+  --includedir=DIR       C header files [PREFIX/include]
+  --oldincludedir=DIR    C header files for non-gcc [/usr/include]
+  --datarootdir=DIR      read-only arch.-independent data root [PREFIX/share]
+  --datadir=DIR          read-only architecture-independent data [DATAROOTDIR]
+  --infodir=DIR          info documentation [DATAROOTDIR/info]
+  --localedir=DIR        locale-dependent data [DATAROOTDIR/locale]
+  --mandir=DIR           man documentation [DATAROOTDIR/man]
+  --docdir=DIR           documentation root [DATAROOTDIR/doc/mld2p4]
+  --htmldir=DIR          html documentation [DOCDIR]
+  --dvidir=DIR           dvi documentation [DOCDIR]
+  --pdfdir=DIR           pdf documentation [DOCDIR]
+  --psdir=DIR            ps documentation [DOCDIR]
+
+Optional Packages:
+  --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
+  --without-PACKAGE       do not use PACKAGE (same as --with-PACKAGE=no)
+  --with-psblas           The source directory for PSBLAS, for example,
+                          --with-psblas=/opt/packages/psblas-2.3
+  --with-libs             List additional link flags here. For example,
+                          --with-libs=-lspecial_system_lib or
+                          --with-libs=-L/path/to/libs
+  --with-clibs            additional CLIBS flags to be added: will prepend
+                          to CLIBS
+  --with-flibs            additional FLIBS flags to be added: will prepend
+                          to FLIBS
+  --with-library-path     additional LIBRARYPATH flags to be added: will
+                          prepend to LIBRARYPATH
+  --with-include-path     additional INCLUDEPATH flags to be added: will
+                          prepend to INCLUDEPATH
+  --with-module-path      additional MODULE_PATH flags to be added: will
+                          prepend to MODULE_PATH
+  --with-umfpack=LIBNAME  Specify the library name for UMFPACK library.
+                          Default: "-lumfpack -lamd"
+  --with-umfpackdir=DIR   Specify the directory for UMFPACK library and
+                          includes.
+  --with-superlu=LIBNAME  Specify the library name for SUPERLU library.
+                          Default: "-lslu"
+  --with-superludir=DIR   Specify the directory for SUPERLU library and
+                          includes.
+  --with-superludist=LIBNAME
+                          Specify the libname for SUPERLUDIST library.
+                          Requires you also specify SuperLU. Default: "-lslud"
+  --with-superludistdir=DIR
+                          Specify the directory for SUPERLUDIST library and
+                          includes.
+
+Some influential environment variables:
+  FC          Fortran compiler command
+  FCFLAGS     Fortran compiler flags
+  LDFLAGS     linker flags, e.g. -L<lib dir> if you have libraries in a
+              nonstandard directory <lib dir>
+  LIBS        libraries to pass to the linker, e.g. -l<library>
+  CC          C compiler command
+  CFLAGS      C compiler flags
+  CPPFLAGS    C/C++/Objective C preprocessor flags, e.g. -I<include dir> if
+              you have headers in a nonstandard directory <include dir>
+  CPP         C preprocessor
+  MPICC       MPI C compiler command
+
+Use these variables to override the choices made by `configure' or to help
+it to find libraries and programs with nonstandard names/locations.
+
+Report bugs to <bugreport@mld2p4.it>.
+\end{verbatim}
+Thus, a sample build with libraries in installation
+directories specific to the GNU 4.3 compiler suite might be as
+follows, specifying only the UMFPACK external package: 
+\begin{verbatim}
+ ./configure --with-psblas=/home/user/psblas-2.3/ \
+ --with-libs="-L/usr/local/BLAS/gnu43 -L/usr/local/BLACS/gnu43" \
+ --with-blacs=-lmpiblacs  --with-umfpackdir=/usr/local/UMFPACK/gnu43 
+\end{verbatim}
+Once the configure script has completed execution, it will have
+generated the file \verb|Make.inc| which will then be used by all
+Makefiles in the directory tree. 
+
+To build the library the user will now enter 
+\begin{verbatim}
+make
+\end{verbatim}
+followed (optionally) by 
+\begin{verbatim}
+make install
+\end{verbatim}
+
+\subsection{Example and test programs\label{sec:ex_and_test}}
+The package contains the \verb|examples| and \verb|tests| directories;
+both of them are further divided into \verb|fileread| and
+\verb|pdegen| subdirectories. Their purpose is as follows:
+\begin{description}
+\item[\tt examples] contains a set of simple example programs with a
+  predefined choice of preconditioners, selectable via integer
+  values. These are intended to help the user become acquainted with the
+  multilevel preconditioners.
+\item[\tt tests] contains a set of more sophisticated examples that
+  will allow the user, via the input files in the \verb|runs|
+  subdirectories, to experiment with the full range of preconditioners
+  implemented in the library.
+\end{description}
+The \verb|fileread| directories contain sample programs that read
+sparse matrices from files, according to the Matrix Market or the
+Harwell-Boeing storage format; the \verb|pdegen| directories instead contain
+programs that generate matrices in full parallel mode from the discretization of a sample PDE. 
diff --git a/docs/src/distribution.tex b/docs/src/distribution.tex
new file mode 100644
index 00000000..6541de15
--- /dev/null
+++ b/docs/src/distribution.tex
@@ -0,0 +1,19 @@
+\section{Code Distribution\label{sec:distribution}}
+\markboth{\textsc{MLD2P4 User's and Reference Guide}}
+         {\textsc{\ref{sec:distribution} Code Distribution}}
+
+\noindent
+MLD2P4 is available from the web site 
+\begin{quotation}
+\texttt{http://www.mld2p4.it}
+\end{quotation}
+where contact points for further information can also be found.
+To report bugs or ask general usage questions, please send an email to
+\texttt{bugreport@mld2p4.it}.
+
+
+The software is available under a modified BSD license, as specified
+in Appendix~\ref{sec:license}; please note that some of the optional
+third party libraries may be licensed under a different and more
+stringent license, most notably the GPL, and this should be taken into
+account when treating derived works. 
diff --git a/docs/src/errors.tex b/docs/src/errors.tex
new file mode 100644
index 00000000..375a5f69
--- /dev/null
+++ b/docs/src/errors.tex
@@ -0,0 +1,20 @@
+\section{Error Handling\label{sec:errors}}
+\markboth{\textsc{MLD2P4 User's and Reference Guide}}
+         {\textsc{\ref{sec:errors} Error Handling}}
+
+The error handling in MLD2P4 is based on the PSBLAS (version 2) error
+handling. Error conditions are signaled via an integer argument
+\verb|info|; whenever an error condition is detected, an error trace
+stack is built by the library up to the top-level, user-callable
+routine. This routine will then decide, according to the user
+preferences, whether the error should be handled by terminating the
+program or by returning the error condition to the user code, which
+will then take action, and whether
+an error message should be printed. These options may be set by using
+the PSBLAS error handling routines; for further details see the PSBLAS
+User's Guide \cite{PSBLASGUIDE}. 
+
+%%% Local Variables: 
+%%% mode: latex
+%%% TeX-master: "userguide"
+%%% End: 
diff --git a/docs/src/gettingstarted.tex b/docs/src/gettingstarted.tex
new file mode 100644
index 00000000..df94329c
--- /dev/null
+++ b/docs/src/gettingstarted.tex
@@ -0,0 +1,308 @@
+\section{Getting Started\label{sec:started}}
+\markboth{\textsc{MLD2P4 User's and Reference Guide}}
+         {\textsc{\ref{sec:started} Getting Started}}
+
+We describe the basics for building and applying MLD2P4 one-level and multi-level
+Schwarz preconditioners with the Krylov solvers included in PSBLAS \cite{PSBLASGUIDE}.
+The following steps are required:
+\begin{enumerate} 
+\item \emph{Declare the preconditioner data structure}. It is a derived data type,
+  \verb|mld_|\-\emph{x}\verb|prec_| \verb|type|, where \emph{x} may be \verb|s|, \verb|d|, \verb|c|
+	or \verb|z|, according to the basic data type of the sparse matrix
+	(\verb|s| = real single precision; \verb|d| = real double precision;
+	\verb|c| = complex single precision; \verb|z| = complex double precision).
+	This data structure is accessed by the user only through the MLD2P4 routines,
+	following an object-oriented approach.
+\item \emph{Allocate and initialize the preconditioner data structure, according to
+	a preconditioner type chosen by the user}. This is performed by the routine
+	\verb|mld_precinit|, which also sets defaults for each preconditioner
+	type selected by the user. The defaults associated to each preconditioner
+	type are given in Table~\ref{tab:precinit}, where the strings used by
+	\verb|mld_precinit| to identify the preconditioner types are also given.
+	Note that these strings are valid also if uppercase letters are substituted by
+	corresponding lowercase ones.
+\item \emph{Modify the selected preconditioner type, by properly setting
+  preconditioner parameters.} This is performed by the routine \verb|mld_precset|.
+  This routine must be called only if the user wants to modify the default values
+  of the parameters associated to the selected preconditioner type, to obtain a variant
+  of the preconditioner. Examples of use of \verb|mld_precset| are given in
+  Section~\ref{sec:examples}; a complete list of all the
+  preconditioner parameters and their allowed and default values is provided in 
+  Section~\ref{sec:userinterface}, Tables~\ref{tab:p_type}-\ref{tab:p_coarse}. 
+\item \emph{Build the preconditioner for a given matrix.} This is performed by
+  the routine \verb|mld_precbld|.
+\item \emph{Apply the preconditioner at each iteration of a Krylov solver.}
+  This is performed by the routine \verb|mld_precaply|. When using the PSBLAS Krylov solvers,
+  this step is completely transparent to the user, since \verb|mld_precaply| is called
+  by the PSBLAS routine implementing the Krylov solver (\verb|psb_krylov|).
+\item \emph{Free the preconditioner data structure}. This is performed by
+  the routine \verb|mld_| \verb|precfree|. This step is complementary to step 1 and should
+  be performed when the preconditioner is no longer used.
+\end{enumerate}
+A detailed description of the above routines is given in Section~\ref{sec:userinterface}.
+Examples showing the basic use of MLD2P4 are reported in Section~\ref{sec:examples}.
+
+Note that the Fortran 95 module \verb|mld_prec_mod|, containing the definition of the 
+preconditioner data type and the interfaces to the routines of MLD2P4,
+must be used in any program calling such routines.
+The modules \verb|psb_base_mod|, for the sparse matrix and communication descriptor
+data types, and \verb|psb_krylov_mod|, for interfacing with the
+Krylov solvers, must also be used (see Section~\ref{sec:examples}).
+
+\ \\
+\textbf{Remark 1.} The coarsest-level solver used by the default two-level
+preconditioner has been chosen by taking into account that, on parallel
+machines, it often leads to the smallest execution time when applied to
+linear systems coming from finite-difference discretizations of basic
+elliptic PDE problems, considered as standard tests for multi-level Schwarz
+preconditioners \cite{aaecc_07,apnum_07}. However, this solver does
+not necessarily correspond to the smallest number of iterations of the
+preconditioned Krylov method, which is usually obtained by applying
+a direct solver to the coarsest-level system, e.g.\ based on the LU
+factorization (see Section~\ref{sec:userinterface}
+for the coarsest-level solvers available in MLD2P4). 
+
+\ \\
+\textbf{Remark 2.} The include path for MLD2P4 must override
+those for PSBLAS, e.g.\ the latter must come first in the sequence
+passed to the compiler, as the MLD2P4 version of the Krylov solver
+interfaces must override that of PSBLAS. This will change in the future
+when the support for the \verb|class| statement becomes widespread in Fortran
+compilers. 
+
+
+\begin{table}[th]
+\begin{center}
+%{\small
+\begin{tabular}{|l|l|p{7.8cm}|}
+\hline
+\textsc{type}       & \textsc{string} & \textsc{default preconditioner} \\ \hline
+No preconditioner &\verb|'NOPREC'|& Considered only to use the PSBLAS
+                                    Krylov solvers with no preconditioner. \\ \hline
+Diagonal          & \verb|'DIAG'| & --- \\ \hline
+Block Jacobi      & \verb|'BJAC'| & Block Jacobi with ILU(0) on the local blocks.\\ \hline
+Additive Schwarz  & \verb|'AS'|   & Restricted Additive Schwarz (RAS),
+                                    with overlap 1 and ILU(0) on the local blocks. \\ \hline
+Multilevel        &\verb|'ML'|    & Multi-level hybrid preconditioner (additive on the
+                                    same level and multiplicative through the levels),
+                                    with post-smoothing only.
+                                    Number of levels: 2.
+	                                  Post-smoother: RAS with overlap 1 and ILU(0)
+                                    on the local blocks.
+                                    Aggregation: decoupled smoothed aggregation with
+                                    threshold $\theta = 0$.
+                                    Coarsest matrix: distributed among the processors.
+                                    Coarsest-level solver: 
+                                    4 sweeps of the block-Jacobi solver, 
+                                    with LU (or ILU) factorization of the blocks
+                                    (UMFPACK for the double precision versions and
+                                    SuperLU for the single precision ones, if the packages
+                                    have been installed; ILU(0), otherwise).         \\
+\hline
+\end{tabular}
+%}
+\end{center}
+
+\caption{Preconditioner types, corresponding strings and default choices.
+\label{tab:precinit}}
+\end{table}
+
+\subsection{Examples\label{sec:examples}}
+
+The code reported in Figure~\ref{fig:ex_default} shows how to set and apply the default
+multi-level preconditioner available in the real double precision version
+of MLD2P4 (see Table~\ref{tab:precinit}). This preconditioner is chosen
+by simply specifying \verb|'ML'| as second argument of \verb|mld_precinit|
+(a call to \verb|mld_precset| is not needed) and is applied with the BiCGSTAB
+solver provided by PSBLAS. As previously observed, the modules \verb|psb_base_mod|,
+\verb|mld_prec_mod| and \verb|psb_krylov_mod| must be used by the example program.
+ 
+The part of the code concerning the
+reading and assembling of the sparse matrix and the right-hand side vector, performed
+through the PSBLAS routines for sparse matrix and vector management, is not reported
+here for brevity; the statements concerning the deallocation of the PSBLAS
+data structure are omitted too.
+The complete code can be found in the example program file \verb|mld_dexample_ml.f90|,
+in the directory \verb|examples/fileread| of the MLD2P4 tree (see
+Section~\ref{sec:ex_and_test}).
+For details on the use of the PSBLAS routines, see the PSBLAS User's
+Guide \cite{PSBLASGUIDE}.
+
+The setup and application of the default multi-level
+preconditioners for the real single precision and the complex, single and double
+precision, versions are obtained with straightforward modifications of the previous
+example (see Section~\ref{sec:userinterface} for details). If these versions are installed,
+the corresponding Fortran 95 codes are available in \verb|examples/fileread/|.
+
+\begin{figure}[tbp]
+\begin{center}
+\begin{minipage}{.90\textwidth} 
+{\small
+\begin{verbatim}
+  use psb_base_mod
+  use mld_prec_mod
+  use psb_krylov_mod
+... ...
+!
+! sparse matrix
+  type(psb_dspmat_type) :: A
+! sparse matrix descriptor
+  type(psb_desc_type)   :: desc_A
+! preconditioner
+  type(mld_dprec_type)  :: P
+! right-hand side and solution vectors
+  real(kind(1.d0))      :: b(:), x(:)
+... ...
+!
+! initialize the parallel environment
+  call psb_init(ictxt)
+  call psb_info(ictxt,iam,np)
+... ...
+!
+! read and assemble the matrix A and the right-hand side b 
+! using PSBLAS routines for sparse matrix / vector management 
+... ...
+!
+! initialize the default multi-level preconditioner, i.e. hybrid
+! Schwarz, using RAS (with overlap 1 and ILU(0) on the blocks) 
+! as post-smoother and 4 block-Jacobi sweeps (with UMFPACK LU
+! on the blocks) as distributed coarse-level solver
+  call mld_precinit(P,'ML',info)
+!
+! build the preconditioner
+  call mld_precbld(A,desc_A,P,info)
+!
+! set the solver parameters and the initial guess
+  ... ...
+!
+! solve Ax=b with preconditioned BiCGSTAB
+  call psb_krylov('BICGSTAB',A,P,b,x,tol,desc_A,info)
+  ... ...
+!
+! deallocate the preconditioner
+  call mld_precfree(P,info)
+!
+! deallocate other data structures
+  ... ...
+!
+! exit the parallel environment
+  call psb_exit(ictxt)
+  stop
+\end{verbatim}
+}
+\end{minipage}
+\caption{Setup and application of the default multi-level Schwarz preconditioner.
+\label{fig:ex_default}}
+\end{center}
+\end{figure}
+
+Different versions of multi-level preconditioners can be obtained by changing
+the default values of the preconditioner parameters. The code reported in
+Figure~\ref{fig:ex_3lh} shows how to set a three-level hybrid Schwarz
+preconditioner, which uses block Jacobi with ILU(0) on the
+local blocks as post-smoother, has a coarsest matrix replicated on the processors,
+and solves the coarsest-level system with the LU factorization from UMFPACK~\cite{UMFPACK}.
+The number of levels is specified by using \verb|mld_precinit|; the other
+preconditioner parameters are set by calling \verb|mld_precset|. Note that
+the type of multilevel framework (i.e.\ multiplicative among the levels
+with post-smoothing only) is not specified since it is the default 
+set by \verb|mld_precinit|. 
+
+Figure~\ref{fig:ex_3la} shows how to
+set a three-level additive Schwarz preconditioner,
+which uses RAS, with overlap 1 and ILU(0) on the blocks, 
+as pre- and post-smoother, and applies five block-Jacobi sweeps, with
+the UMFPACK LU factorization on the blocks, as distributed coarsest-level
+solver. Again, \verb|mld_precset| is used only to set
+non-default values of the parameters (see Tables~\ref{tab:p_type}-\ref{tab:p_coarse}).
+In both cases, the construction and the application of the preconditioner
+are carried out as for the default multi-level preconditioner.
+The code fragments shown in Figures~\ref{fig:ex_3lh}-\ref{fig:ex_3la} are
+included in the example program file \verb|mld_dexample_ml.f90| too.
+
+Finally, Figure~\ref{fig:ex_1l} shows the setup of a one-level
+additive Schwarz preconditioner, i.e.\ RAS with overlap 2. The corresponding
+example program is available in \verb|mld_dexample_| \verb|1lev.f90|.
+
+For all the previous preconditioners, example programs where the sparse matrix and
+the right-hand side are generated by discretizing a PDE with Dirichlet
+boundary conditions are also available in the directory \verb|examples/pdegen|.
+
+\ \\
+\textbf{Remark 3.} Any PSBLAS-based program using the basic preconditioners
+implemented in PSBLAS 2.0, i.e.\ the diagonal and block-Jacobi ones,
+can use the diagonal and block-Jacobi preconditioners
+implemented in MLD2P4 without any change in the code.
+The PSBLAS-based program only needs to be recompiled
+and linked to the MLD2P4 library.
+\\
+
+
+\begin{figure}[tbh]
+\begin{center}
+\begin{minipage}{.90\textwidth} 
+{\small
+\begin{verbatim}
+... ...
+! set a three-level hybrid Schwarz preconditioner, which uses 
+! block Jacobi (with ILU(0) on the blocks) as post-smoother,
+! a coarsest matrix replicated on the processors, and the 
+! LU factorization from UMFPACK as coarse-level solver
+  call mld_precinit(P,'ML',info,nlev=3)
+  call mld_precset(P,mld_smoother_type_,'BJAC',info)
+  call mld_precset(P,mld_coarse_mat_,'REPL',info)
+  call mld_precset(P,mld_coarse_solve_,'UMF',info)
+... ...
+\end{verbatim}
+}
+\end{minipage}
+
+\caption{Setup of a hybrid three-level Schwarz preconditioner.\label{fig:ex_3lh}}
+\end{center}
+\end{figure}
+
+\begin{figure}[tbh]
+\begin{center}
+\begin{minipage}{.90\textwidth} 
+{\small
+\begin{verbatim}
+... ...
+! set a three-level additive Schwarz preconditioner, which uses 
+! RAS (with overlap 1 and ILU(0) on the blocks) as pre- and 
+! post-smoother, and 5 block-Jacobi sweeps (with UMFPACK LU
+! on the blocks) as distributed coarsest-level solver
+  call mld_precinit(P,'ML',info,nlev=3)
+  call mld_precset(P,mld_ml_type_,'ADD',info)
+  call mld_precset(P,mld_smoother_pos_,'TWOSIDE',info)
+  call mld_precset(P,mld_coarse_sweeps_,5,info)
+... ...
+\end{verbatim}
+}
+\end{minipage}
+
+\caption{Setup of an additive three-level Schwarz preconditioner.\label{fig:ex_3la}}
+\end{center}
+\end{figure}
+
+\begin{figure}[tbh]
+\begin{center}
+\begin{minipage}{.90\textwidth} 
+{\small
+\begin{verbatim}
+... ...
+! set RAS with overlap 2 and ILU(0) on the local blocks
+  call mld_precinit(P,'AS',info)
+  call mld_precset(P,mld_sub_ovr_,2,info)
+... ...
+\end{verbatim}
+}
+\end{minipage}
+\caption{Setup of a one-level Schwarz preconditioner.\label{fig:ex_1l}}
+\end{center}
+\end{figure}
+
+
+%%% Local Variables: 
+%%% mode: latex
+%%% TeX-master: "userguide"
+%%% End: 
diff --git a/docs/src/intro.tex b/docs/src/intro.tex
new file mode 100644
index 00000000..864ab263
--- /dev/null
+++ b/docs/src/intro.tex
@@ -0,0 +1,34 @@
+\section{Introduction}\label{sec:intro}
+\markboth{\underline{MLD2P4 User's and Reference Guide}}
+         {\underline{\ref{sec:intro} Introduction}}
+
+The MLD2P4 library provides ....
+
+
+\subsection{Programming model}
+
+The MLD2P4 library is based on the Single Program Multiple Data
+(SPMD) programming model: each process participating in the
+computation performs the same actions on a chunk of data. Parallelism
+is thus data-driven. 
+
+Because of this structure, many subroutines coordinate their action
+across the various processes, thus providing an implicit
+synchronization point, and therefore \emph{must} be
+called simultaneously by all processes participating in the
+computation. 
+However there are many cases where no synchronization, and indeed no
+communication among processes, is implied. 
+
+Throughout this user's guide each subroutine will be clearly indicated
+as:
+\begin{description}
+\item[Synchronous:] must be called simultaneously by all the
+  processes in the relevant communication context;
+\item[Asynchronous:] may be called in a totally independent manner.
+\end{description}
+
+%%% Local Variables: 
+%%% mode: latex
+%%% TeX-master: "userguide"
+%%% End: 
diff --git a/docs/src/license.tex b/docs/src/license.tex
new file mode 100644
index 00000000..005ffa08
--- /dev/null
+++ b/docs/src/license.tex
@@ -0,0 +1,44 @@
+\section{License\label{sec:license}}
+\markboth{\textsc{MLD2P4 User's and Reference Guide}}
+         {\textsc{\ref{sec:license} License}}
+
+MLD2P4 is freely distributable under the following copyright
+terms: {\small
+\begin{verbatim} 
+                         MLD2P4  version 1.0
+MultiLevel Domain Decomposition Parallel Preconditioners Package
+           based on PSBLAS (Parallel Sparse BLAS version 2.3)
+
+(C) Copyright 2008
+
+                    Salvatore Filippone  University of Rome Tor Vergata       
+                    Alfredo Buttari      University of Rome Tor Vergata
+                    Pasqua D'Ambra       ICAR-CNR, Naples
+                    Daniela di Serafino  Second University of Naples
+
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+  1. Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+  2. Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions, and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
+  3. The name of the MLD2P4 group or the names of its contributors may
+     not be used to endorse or promote products derived from this
+     software without specific written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE MLD2P4 GROUP OR ITS CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+\end{verbatim}
+}
\ No newline at end of file
diff --git a/docs/src/overview.tex b/docs/src/overview.tex
new file mode 100644
index 00000000..d4537981
--- /dev/null
+++ b/docs/src/overview.tex
@@ -0,0 +1,90 @@
+\section{General Overview\label{sec:overview}}
+\markboth{\textsc{MLD2P4 User's and Reference Guide}}
+         {\textsc{\ref{sec:overview} General Overview}}
+            
+The \textsc{Multi-Level Domain Decomposition Parallel Preconditioners Package based on
+PSBLAS (MLD2P4)} provides \emph{multi-level Schwarz preconditioners}~\cite{dd2_96},
+to be used in the iterative solutions of sparse linear systems:
+\begin{equation} 
+Ax=b, 
+\label{system1}
+\end{equation} 
+where $A$ is a square, real or complex, sparse matrix with a symmetric sparsity pattern.
+%
+%\textbf{NOTA: Caso non simmetrico, aggregazione con $(A+A^T)$ fatta! 
+%Dovremmo implementare uno smoothed prolongator
+%adeguato e fare qualcosa di consistente anche con 1-lev Schwarz.}
+%
+These preconditioners have the following general features:
+\begin{itemize}
+\item both \emph{additive and hybrid multilevel} variants are implemented,
+i.e.\ variants that are additive among the levels and inside each level, and variants
+that are multiplicative among the levels and additive inside each level;
+the basic Additive Schwarz (AS) preconditioners are obtained by considering only one level;
+\item a \emph{purely algebraic} approach is used to
+generate a sequence of coarse-level corrections to a basic AS preconditioner, without
+explicitly using any information on the geometry of the original problem (e.g.\ the
+discretization of a PDE). The \emph{smoothed aggregation} technique is applied
+as algebraic coarsening strategy~\cite{BREZINA_VANEK,VANEK_MANDEL_BREZINA}.
+\end{itemize}
+
+The package is written in \emph{Fortran~95}, following an
+\emph{object-oriented approach} through the exploitation of features
+such as abstract data type creation, functional 
+overloading and dynamic memory management.
+% , while providing a smooth
+% path towards the integration in legacy application codes. 
+The parallel implementation is based
+on a Single Program Multiple Data (SPMD) paradigm for distributed-memory architectures. 
+Single and double precision implementations of MLD2P4 are available for both the
+real and the complex case, that can be used through a single interface.
+
+
+MLD2P4 has been designed to implement scalable and easy-to-use multilevel preconditioners
+in the context of the \emph{PSBLAS (Parallel Sparse BLAS)
+computational framework}~\cite{psblas_00}. 
+PSBLAS is a library originally developed to address the parallel implementation of
+iterative solvers for sparse linear systems, by providing basic linear algebra
+operators and data management facilities for distributed sparse matrices; it
+also includes parallel Krylov solvers, built on the top of the basic PSBLAS kernels.
+The preconditioners available in MLD2P4 can be used with these Krylov solvers.
+The choice of PSBLAS has been mainly motivated by the need of having
+a portable and efficient software infrastructure implementing ``de facto'' standard
+parallel sparse linear algebra kernels, to pursue goals such as performance,
+portability, modularity and extensibility in the development of the preconditioner
+package. On the other hand, the implementation of MLD2P4 has led to some
+revisions and extensions of the PSBLAS kernels, leading to the
+recent PSBLAS 2.0 version~\cite{PSBLASGUIDE}. The inter-process communication required
+by MLD2P4 is encapsulated into the PSBLAS routines, except in a few cases where
+MPI~\cite{MPI1} is explicitly called. Therefore, MLD2P4 can be run on any parallel
+machine where PSBLAS and MPI implementations are available.
+
+MLD2P4 has a layered and modular software architecture where three main layers can be identified. 
+The lower layer consists of the PSBLAS kernels, the middle one implements
+the construction and application phases of the preconditioners, and the upper one
+provides a uniform and easy-to-use interface to all the preconditioners. 
+This architecture allows for different levels of use of the package:
+few black-box routines at the upper layer allow non-expert users to easily
+build any preconditioner available in MLD2P4 and to apply it within a PSBLAS Krylov solver.
+On the other hand, the routines of the middle and lower layer can be used and extended
+by expert users to build new versions of multi-level Schwarz preconditioners.
+We provide here a description of the upper-layer routines, but not of the
+middle-layer ones.
+
+This guide is organized as follows. General information on the distribution of the source code
+is reported in Section~\ref{sec:distribution}, while details on the configuration
+and installation of the package are given in Section~\ref{sec:building}. A description of
+multi-level Schwarz preconditioners based on smoothed aggregation is provided
+in Section~\ref{sec:background}, to help the users in choosing among the different preconditioners
+implemented in MLD2P4. The basics for building and applying the preconditioners
+with the Krylov solvers implemented in PSBLAS are reported in Section~\ref{sec:started}, where the
+Fortran 95 codes of a few sample programs are also shown. A reference guide for
+the upper-layer routines of MLD2P4, that are the user interface, is provided
+in Section~\ref{sec:userinterface}. The error handling mechanism used by the package is briefly described
+in Section~\ref{sec:errors}. The copyright terms concerning the distribution and modification
+of MLD2P4 are reported in Appendix~\ref{sec:license}.
+
+%%% Local Variables: 
+%%% mode: latex
+%%% TeX-master: "userguide"
+%%% End: 
diff --git a/docs/src/precs.tex b/docs/src/precs.tex
new file mode 100644
index 00000000..510606cf
--- /dev/null
+++ b/docs/src/precs.tex
@@ -0,0 +1,280 @@
+\section{Preconditioner routines}
+\label{sec:precs}
+\markboth{\underline{MLD2P4 User's and Reference Guide}}
+         {\underline{\ref{sec:precs} Preconditioners}}
+
+% \section{Preconditioners}
+\label{sec:psprecs}
+The MLD2P4 library  contains the implementation of many preconditioning
+techniques. The preconditioners may be applied as normal ``base'' 
+preconditioners; alternatively multiple ``base'' preconditioners may
+be combined  in a multilevel framework. 
+
+The base (one-level) preconditioners include: 
+\begin{itemize}
+\item Diagonal Scaling
+\item Block Jacobi 
+\item Additive Schwarz, Restricted Additive Schwarz and
+  Additive Schwarz with Harmonic extensions;
+\end{itemize}
+The Jacobi and Additive Schwarz preconditioners can make use of the
+following solvers:
+\begin{itemize}
+\item Level-$p$ Incomplete LU factorization ($ILU(p)$);
+\item Threshold Incomplete LU factorization ($ILU(\tau,p)$);
+\item Complete LU factorization by means of the following optional
+  external   packages: 
+\begin{itemize}
+\item UMFPACK;
+\item SuperLU;
+\item SuperLU\_Dist.
+\end{itemize}
+\end{itemize}
+
+The supporting data type and subroutine interfaces are defined in the
+module  \verb|mld_prec_mod|; the module also overrides the variables
+and type definitions of \verb|psb_prec_mod| so as to function as a
+drop-in replacement for the PSBLAS methods. Thus if the user does not
+wish to employ the additional MLD2P4 capabilities, it is possible to
+migrate an existing PSBLAS program without any source code
+modifications; only a recompilation is needed. 
+
+%% We also provide a companion  package of multi-level Additive
+%%   Schwarz preconditioners called MD2P4; this is actually a family of 
+%%   preconditioners since there is the possibility to choose between
+%%   many variants, and is currently in an experimental stateIts
+%%   documentation is planned to appear after stabilization of the
+%%   package, which will characterize release  2.1 of our library.
+
+
+
+
+\subroutine{mld\_precinit}{Initialize a  preconditioner}
+
+\syntax{call mld\_precinit}{prec, ptype, info}
+\syntax*{call mld\_precinit}{prec, ptype, info, nlev}
+
+\begin{description}
+\item[Type:] Asynchronous.
+\item[\bf On Entry]
+\item[ptype] the type of preconditioner. 
+Scope: {\bf global} \\
+Type: {\bf required}\\
+Intent: {\bf in}.\\
+Specified as: a character string, see usage notes.
+\item[nlev] Number of levels in a multilevel  preconditioner. 
+Scope: {\bf global} \\
+Type: {\bf optional}\\
+Specified as: an integer value, see usage notes. 
+%% \item[rs] 
+%% Scope: {\bf global} \\
+%% Type: {\bf optional}\\
+%% Specified as: a long precision real number.
+\item[\bf On Exit]
+
+\item[prec] 
+Scope: {\bf local} \\
+Type: {\bf required}\\
+Intent: {\bf inout}.\\
+Specified as: a preconditioner data structure \precdata.
+\item[info] 
+Scope: {\bf global} \\
+Type: {\bf required}\\
+Intent: {\bf out}.\\
+Error code: if no error, 0 is returned.
+\end{description}
+\subsection*{Usage Notes}
+%% The PSBLAS 2.0 contains a number of preconditioners, ranging from a
+%% simple diagonal scaling to 2-level domain decomposition. These
+%% preconditioners may use the SuperLU or the UMFPACK software, if
+%% installed; see~\cite{SUPERLU,UMFPACK}. 
+Legal inputs to this subroutine are interpreted depending on the
+$ptype$ string as follows\footnote{The string is case-insensitive}:
+\begin{description}
+\item[NONE] No preconditioning, i.e. the preconditioner is just a copy
+  operator.
+\item[DIAG] Diagonal scaling; each entry of the input vector is
+  multiplied by the reciprocal of the sum of the absolute values of
+  the coefficients in the corresponding row of matrix  $A$;
+\item[BJAC] Precondition by a  factorization of the
+  block-diagonal of matrix $A$, where block boundaries are determined
+  by the data allocation boundaries for each process; requires no
+  communication. 
+\item[AS] Additive Schwarz; default is to apply the Restricted
+  Additive Schwarz variant, with an $ILU(0)$ factorization
+\item[ML] Multilevel preconditioner.
+\end{description}
+
+
+
+\subroutine{mld\_precset}{Set preconditioner features}
+
+\syntax{call mld\_precset}{prec, what, val, info, ilev}
+
+
+\begin{description}
+\item[Type:] Asynchronous.
+\item[\bf On Entry]
+\item[prec] the preconditioner.\\
+Scope: {\bf local} \\
+Type: {\bf required}\\
+Intent: {\bf inout}.\\
+Specified as: an already initialized preconditioner data structure \precdata\\
+\item[what] The feature to be set. \\
+Scope: {\bf local} \\
+Type: {\bf required}\\
+Intent: {\bf in}.\\
+Specified as: an integer constant. Symbolic names are available in
+the library module, see usage notes for legal values.
+\item[val] The value  to set the chosen feature to. \\
+Scope: {\bf local} \\
+Type: {\bf required}\\
+Intent: {\bf in}.\\
+Specified as: an integer, double precision or character variable. 
+Symbolic names for some choices are available in the library module,
+see usage notes for legal values. 
+\item[ilev] The level of a multilevel preconditioner to which the
+  feature choice should apply.\\
+Scope: {\bf global} \\
+Type: {\bf optional}\\
+Specified as: an integer value, see usage notes. 
+\end{description}
+
+\begin{description}
+\item[\bf On Return]
+\item[prec] the preconditioner.\\
+Scope: {\bf local} \\
+Type: {\bf required}\\
+Intent: {\bf inout}.\\
+Specified as: a preconditioner data structure \precdata\\
+\item[info] Error code.\\
+Scope: {\bf local} \\
+Type: {\bf required} \\
+Intent: {\bf out}.\\
+An integer value; 0 means no error has been detected. 
+\end{description}
+
+\subsection*{Usage Notes}
+Legal inputs to this subroutine are interpreted depending on the value
+of \verb|what| input as follows
+\begin{description}
+\item[mld\_coarse\_mat\_] 
+\end{description}
+
+
+\subroutine{mld\_precbld}{Builds a preconditioner}
+
+\syntax{call mld\_precbld}{a, desc\_a, prec, info}
+
+\begin{description}
+\item[Type:] Synchronous.
+\item[\bf On Entry]
+\item[a] the system sparse matrix.
+Scope: {\bf local} \\
+Type: {\bf required}\\
+Intent: {\bf in}, target.\\
+Specified as: a sparse matrix data structure \spdata.
+\item[prec] the preconditioner.\\
+Scope: {\bf local} \\
+Type: {\bf required}\\
+Intent: {\bf inout}.\\
+Specified as: an already initialized preconditioner data structure \precdata\\
+\item[desc\_a] the problem communication descriptor. 
+Scope: {\bf local} \\
+Type: {\bf required}\\
+Intent: {\bf in}, target.\\
+Specified as: a communication descriptor data structure \descdata.
+%% \item[upd] 
+%% Scope: {\bf global} \\
+%% Type: {\bf optional}\\
+%% Intent: {\bf in}.\\
+%% Specified as: a character.
+\end{description}
+
+\begin{description}
+\item[\bf On Return]
+\item[prec] the preconditioner.\\
+Scope: {\bf local} \\
+Type: {\bf required}\\
+Intent: {\bf inout}.\\
+Specified as: a preconditioner data structure \precdata\\
+\item[info] Error code.\\
+Scope: {\bf local} \\
+Type: {\bf required} \\
+Intent: {\bf out}.\\
+An integer value; 0 means no error has been detected. 
+\end{description}
+
+
+
+\subroutine{mld\_precaply}{Preconditioner application routine}
+
+\syntax{call mld\_precaply}{prec,x,y,desc\_a,info,trans,work}
+\syntax*{call mld\_precaply}{prec,x,desc\_a,info,trans}
+
+\begin{description}
+\item[Type:] Synchronous.
+\item[\bf On Entry]
+\item[prec] the preconditioner.
+Scope: {\bf local} \\
+Type: {\bf required}\\
+Intent: {\bf in}.\\
+Specified as: a preconditioner data structure \precdata.
+\item[x] the source vector.
+Scope: {\bf local} \\
+Type: {\bf required}\\
+Intent: {\bf inout}.\\
+Specified as: a double precision array.
+\item[desc\_a] the problem communication descriptor.
+Scope: {\bf local} \\
+Type: {\bf required}\\
+Intent: {\bf in}.\\
+Specified as: a communication data structure \descdata.
+\item[trans] 
+Scope: {\bf } \\
+Type: {\bf optional}\\
+Intent: {\bf in}.\\
+Specified as: a character.
+\item[work] an optional work space
+Scope: {\bf local} \\
+Type: {\bf optional}\\
+Intent: {\bf inout}.\\
+Specified as: a double precision array.
+\end{description}
+
+\begin{description}
+\item[\bf On Return]
+\item[y] the destination vector.
+Scope: {\bf local} \\
+Type: {\bf required}\\
+Intent: {\bf inout}.\\
+Specified as: a double precision array.
+\item[info] Error code.\\
+Scope: {\bf local} \\
+Type: {\bf required} \\
+Intent: {\bf out}.\\
+An integer value; 0 means no error has been detected. 
+\end{description}
+
+
+
+\subroutine{mld\_prec\_descr}{Prints a description of current preconditioner}
+
+\syntax{call mld\_prec\_descr}{prec}
+
+\begin{description}
+\item[Type:] Asynchronous.
+\item[\bf On Entry]
+\item[prec] the preconditioner.
+Scope: {\bf local} \\
+Type: {\bf required}\\
+Intent: {\bf in}.\\
+Specified as: a preconditioner data structure \precdata.
+\end{description}
+
+
+
+%%% Local Variables: 
+%%% mode: latex
+%%% TeX-master: "userguide"
+%%% End: 
diff --git a/docs/src/title.tex b/docs/src/title.tex
new file mode 100644
index 00000000..831759b8
--- /dev/null
+++ b/docs/src/title.tex
@@ -0,0 +1,72 @@
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Contents: The title page
+% $Id: title.tex 1999 2007-10-29 15:25:27Z sfilippo $
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\ifcase\pdfoutput % We're not running pdftex
+{\Large\bfseries MLD2P4\\[.8ex] User's and Reference Guide}\\
+\emph{\large A guide for the Multi-Level Domain Decomposition \\[.6ex]
+Parallel Preconditioners Package
+based on PSBLAS}
+{\bfseries Pasqua D'Ambra}\\
+ ICAR-CNR, Naples, Italy\\[3ex]
+{\bfseries Daniela di Serafino}\\
+ Second University of Naples, Italy\\[3ex]
+{\bfseries Salvatore Filippone} \\
+ University of Rome ``Tor Vergata'', Italy 
+%\\[10ex]
+%\today
+Software version: 1.0\\
+%\today
+July 24, 2008
+\or
+\pdfbookmark{MLD2P4 User's and Reference Guide}{title}
+\newlength{\centeroffset}
+%\setlength{\centeroffset}{-0.5\oddsidemargin}
+%\addtolength{\centeroffset}{0.5\evensidemargin}
+%\addtolength{\textwidth}{-\centeroffset}
+\thispagestyle{empty}
+\vspace*{\stretch{1}}
+\noindent\hspace*{\centeroffset}\makebox[0pt][l]{\begin{minipage}{\textwidth}
+\flushright
+{\Huge\bfseries MLD2P4\\[.8ex] User's and Reference Guide
+}
+\noindent\rule[-1ex]{\textwidth}{5pt}\\[2.5ex]
+\hfill\emph{\Large A guide for the Multi-Level Domain Decomposition \\[.6ex]
+Parallel Preconditioners Package
+based on PSBLAS}
+\end{minipage}}
+
+\vspace{\stretch{1}}
+\noindent\hspace*{\centeroffset}\makebox[0pt][l]{\begin{minipage}{\textwidth}
+\flushright
+{\large\bfseries Pasqua D'Ambra}\\
+\large ICAR-CNR, Naples, Italy\\[3ex]
+{\large\bfseries Daniela di Serafino}\\
+\large Second University of Naples, Italy\\[3ex]
+{\large\bfseries Salvatore Filippone} \\
+\large University of Rome ``Tor Vergata'', Italy 
+%\\[10ex]
+%\today
+\end{minipage}}
+
+\vspace{\stretch{1}}
+\noindent\hspace*{\centeroffset}\makebox[0pt][l]{\begin{minipage}{\textwidth}
+\flushright
+\large Software version: 1.0\\
+%\today
+\large July 24, 2008
+\end{minipage}}
+%\addtolength{\textwidth}{\centeroffset}
+\vspace{\stretch{2}}
+\fi
+
+\endinput
+
+%
+
+% Local Variables:
+% TeX-master: "userguide"
+% mode: latex
+% mode: flyspell
+% End:
diff --git a/docs/src/userguide.tex b/docs/src/userguide.tex
new file mode 100644
index 00000000..b040f2ae
--- /dev/null
+++ b/docs/src/userguide.tex
@@ -0,0 +1,173 @@
+\documentclass[a4paper,twoside,11pt]{article}
+\usepackage{pstricks}
+\usepackage{fancybox}
+\usepackage{amsfonts}
+\usepackage{ifpdf}
+% \usepackage{minitoc}
+% \setcounter{minitocdepth}{2}
+\usepackage[bookmarks=true, 
+            bookmarksnumbered=true, 
+            bookmarksopen=false, 
+            plainpages=false,
+            pdfpagelabels,
+            colorlinks, 
+            citecolor=red,
+            linkcolor=blue]{hyperref}
+\usepackage{ifthen}
+\usepackage{graphicx}
+\newtheorem{theorem}{Theorem}
+\newtheorem{corollary}{Corollary}
+\usepackage{rotating}
+%\newboolean{mtc}
+%\setboolean{mtc}{true}
+
+\pdfoutput=1
+\relax
+\pdfcompresslevel=0             %-- 0 = none, 9 = best
+\pdfinfo{                       %-- Info dictionary of PDF output  /Author (PD, DdS, SF)
+  /Title    (MultiLevel Domain Decomposition Parallel Preconditioners Package
+             based on PSBLAS, V. 1.0)
+  /Subject  (MultiLevel Domain Decomposition Parallel Preconditioners Package)
+  /Keywords (Parallel Numerical Software, Algebraic Multilevel Preconditioners, Sparse Iterative Solvers, PSBLAS, MPI)
+  /Creator  (pdfLaTeX)
+  /Producer ($Id: userguide.tex 2008-04-08 Pasqua D'Ambra, Daniela di Serafino,
+             Salvatore Filippone$)
+}
+\pdfcatalog{ %-- Catalog dictionary of PDF output.
+%  /URI (http://ce.uniroma2.it/psblas)
+} 
+
+\setlength\textwidth{1.15\textwidth}
+\setlength\oddsidemargin{0.3in}
+\setlength\evensidemargin{0.2in}
+% \newlength{\centeroffset}
+% \setlength{\centeroffset}{0.5\oddsidemargin}
+% \addtolength{\centeroffset}{0.5\evensidemargin}
+% \addtolength{\textwidth}{-\centeroffset}
+\pagestyle{myheadings}
+
+\newcounter{subroutine}[subsection]
+\newcounter{example}[subroutine]
+\makeatletter
+\def\subroutine{\@ifstar{\@subroutine}{\clearpage\@subroutine}}%
+\def\@subroutine#1#2{%
+\stepcounter{subroutine}%
+      \section*{\flushleft #1---#2 \endflushleft}%
+      \addcontentsline{toc}{subsection}{#1}%
+      \markright{#1}}%
+\newcommand{\subsubroutine}[2]{%
+\stepcounter{subroutine}%
+      \subsection*{\flushleft #1---#2 \endflushleft}%
+      \addcontentsline{toc}{subsubsection}{#1}%
+      \markright{#1}}%
+\newcommand{\examplename}{Example}
+\newcommand{\syntaxname}{Syntax}
+\def\syntax{\@ifstar{\@ssyntax}{\@syntax}}%
+\def\@syntax{\nobreak\section*{\syntaxname}%
+     \@ssyntax}%
+\def\@ssyntax#1#2{%
+  \nobreak
+   \setbox\@tempboxa\hbox{#1\ {\em $($#2$)$}}%
+   \ifdim \wd\@tempboxa >\hsize
+        \setbox\@tempboxa\hbox{\em $($#2$)$}
+	\ifdim\wd\@tempboxa >\hsize
+          \begin{flushright}#1\ \em$($#2$)$\end{flushright}%
+	\else
+         \hbox to\hsize{#1\hfil}%
+         \hbox to\hsize{\hfil\box\@tempboxa}%
+        \fi
+     \else
+       \hbox to\hsize{\hfil\box\@tempboxa\hfil}%
+   \fi\par\vskip\baselineskip}
+\makeatother
+\newcommand{\example}{\stepcounter{example}%
+\section*{\examplename~\theexample}}
+\def\bsideways{\sidewaystable}
+\def\esideways{\endsidewaystable}
+
+\newcommand{\precdata}{\hyperlink{precdata}{{\tt mld\_prec\_type}}}
+\newcommand{\descdata}{\hyperlink{descdata}{{\tt psb\_desc\_type}}}
+\newcommand{\spdata}{\hyperlink{spdata}{{\tt psb\_spmat\_type}}}
+\newcommand{\Ref}[1]{\mbox{(\ref{#1})}}
+
+\begin{document}
+\pdfbookmark{MLD2P4 User's and Reference Guide}{title}
+\newlength{\centeroffset}
+%\setlength{\centeroffset}{-0.5\oddsidemargin}
+%\addtolength{\centeroffset}{0.5\evensidemargin}
+%\addtolength{\textwidth}{-\centeroffset}
+\thispagestyle{empty}
+\vspace*{\stretch{1}}
+\noindent\hspace*{\centeroffset}\makebox[0pt][l]{\begin{minipage}{\textwidth}
+\flushright
+{\Huge\bfseries MLD2P4\\[.8ex] User's and Reference Guide
+}
+\noindent\rule[-1ex]{\textwidth}{5pt}\\[2.5ex]
+\hfill\emph{\Large A guide for the Multi-Level Domain Decomposition \\[.6ex]
+Parallel Preconditioners Package
+based on PSBLAS}
+\end{minipage}}
+
+\vspace{\stretch{1}}
+\noindent\hspace*{\centeroffset}\makebox[0pt][l]{\begin{minipage}{\textwidth}
+\flushright
+{\large\bfseries Pasqua D'Ambra}\\
+\large ICAR-CNR, Naples, Italy\\[3ex]
+{\large\bfseries Daniela di Serafino}\\
+\large Second University of Naples, Italy\\[3ex]
+{\large\bfseries Salvatore Filippone} \\
+\large University of Rome ``Tor Vergata'', Italy 
+%\\[10ex]
+%\today
+\end{minipage}}
+
+\vspace{\stretch{1}}
+\noindent\hspace*{\centeroffset}\makebox[0pt][l]{\begin{minipage}{\textwidth}
+\flushright
+\large Software version: 1.0\\
+%\today
+\large July 24, 2008
+\end{minipage}}
+%\addtolength{\textwidth}{\centeroffset}
+\vspace{\stretch{2}}
+\clearpage
+\ \\
+\thispagestyle{empty}
+\clearpage
+
+\pagenumbering{roman}   % Roman numbering
+\setcounter{page}{1}    % Abstract start on page i
+
+\include{abstract}
+\cleardoublepage
+
+\begingroup
+  \renewcommand*{\thepage}{toc}
+  %\pagenumbering{roman}   % Roman numbering
+  %\setcounter{page}{1}    % Abstract start on page ii
+  \tableofcontents
+\endgroup  
+\cleardoublepage
+
+\pagenumbering{arabic}  % Arabic numbering
+\setcounter{page}{1}    % Chapters start on page 1
+
+\include{overview}
+\include{distribution}
+\include{building}
+\include{background}
+\include{gettingstarted}
+\include{userinterface}
+\include{errors}
+\clearpage
+\appendix
+\include{license}
+\cleardoublepage
+\include{bibliography}
+
+
+\end{document}
+%%% Local Variables: 
+%%% mode: latex
+%%% TeX-master: "userguide"
+%%% End: 
diff --git a/docs/src/userhtml.tex b/docs/src/userhtml.tex
new file mode 100644
index 00000000..dc1b4784
--- /dev/null
+++ b/docs/src/userhtml.tex
@@ -0,0 +1,149 @@
+\documentclass[a4paper,twoside,11pt]{article}
+\usepackage{pstricks}
+\usepackage{fancybox}
+\usepackage{amsfonts}
+\usepackage{ifpdf}
+% \usepackage{minitoc}
+% \setcounter{minitocdepth}{2}
+\usepackage[bookmarks=true, 
+            bookmarksnumbered=true, 
+            bookmarksopen=false, 
+            plainpages=false,
+            pdfpagelabels,
+            colorlinks, 
+            linkcolor=blue]{hyperref}
+\usepackage{ifthen}
+\usepackage{graphicx}
+\newtheorem{theorem}{Theorem}
+\newtheorem{corollary}{Corollary}
+\usepackage{rotating}
+%\newboolean{mtc}
+%\setboolean{mtc}{true}
+
+\pdfoutput=0
+% \relax
+% \pdfcompresslevel=0             %-- 0 = none, 9 = best
+% \pdfinfo{                       %-- Info dictionary of PDF output  /Author (PD, DdS, SF)
+%   /Title    (MultiLevel Domain Decomposition Parallel Preconditioners Package
+%              based on PSBLAS, V. 1.0)
+%   /Subject  (MultiLevel Domain Decomposition Parallel Preconditioners Package)
+%   /Keywords (Parallel Numerical Software, Algebraic Multilevel Preconditioners, Sparse Iterative Solvers, PSBLAS, MPI)
+%   /Creator  (pdfLaTeX)
+%   /Producer ($Id: userguide.tex 2008-04-08 Pasqua D'Ambra, Daniela di Serafino,
+%              Salvatore Filippone$)
+% }
+% \pdfcatalog{ %-- Catalog dictionary of PDF output.
+% %  /URI (http://ce.uniroma2.it/psblas)
+% } 
+
+\setlength\textwidth{1.15\textwidth}
+% \setlength\evensidemargin{.7in}
+% \newlength{\centeroffset}
+% \setlength{\centeroffset}{0.5\oddsidemargin}
+% \addtolength{\centeroffset}{0.5\evensidemargin}
+% \addtolength{\textwidth}{-\centeroffset}
+\pagestyle{myheadings}
+
+\newcounter{subroutine}[subsection]
+\newcounter{example}[subroutine]
+\makeatletter
+\def\subroutine{\@ifstar{\@subroutine}{\clearpage\@subroutine}}% unstarred form issues \clearpage first, the starred form does not
+\def\@subroutine#1#2{% #1 = text for heading, TOC entry and running head; #2 = text after the dash in the heading
+\stepcounter{subroutine}% also resets the example counter, which is declared within subroutine
+      \section*{\flushleft #1---#2 \endflushleft}% unnumbered, left-aligned heading
+      \addcontentsline{toc}{subsection}{#1}% starred heading, so the TOC line is added by hand at subsection level
+      \markright{#1}}% right-hand running head shows #1
+\newcommand{\subsubroutine}[2]{% same arguments as \@subroutine, one sectioning level lower and never preceded by \clearpage
+\stepcounter{subroutine}% shares the subroutine counter with \subroutine
+      \subsection*{\flushleft #1---#2 \endflushleft}% unnumbered, left-aligned heading
+      \addcontentsline{toc}{subsubsection}{#1}% TOC line added by hand at subsubsection level
+      \markright{#1}}% right-hand running head shows #1
+\newcommand{\examplename}{Example}
+\newcommand{\syntaxname}{Syntax}
+\def\syntax{\@ifstar{\@ssyntax}{\@syntax}}% starred form omits the heading printed by \@syntax
+\def\@syntax{\nobreak\section*{\syntaxname}% unnumbered heading carrying \syntaxname, then the same layout as the starred form
+     \@ssyntax}%
+\def\@ssyntax#1#2{% #1 = leading text; #2 = text set emphasized between math-mode parentheses
+  \nobreak
+   \setbox\@tempboxa\hbox{#1\ {\em $($#2$)$}}% trial box: the whole synopsis on one line
+   \ifdim \wd\@tempboxa >\hsize
+        \setbox\@tempboxa\hbox{\em $($#2$)$}% trial box: the parenthesized part alone; the line end is commented out so no space token enters the macro body
+        \ifdim\wd\@tempboxa >\hsize
+          \begin{flushright}#1\ \em$($#2$)$\end{flushright}% even the parenthesized part is too wide: let it wrap, flush right
+        \else
+         \hbox to\hsize{#1\hfil}% #1 flush left on a line of its own
+         \hbox to\hsize{\hfil\box\@tempboxa}% parenthesized part flush right on the following line
+        \fi
+     \else
+       \hbox to\hsize{\hfil\box\@tempboxa\hfil}% everything fits on one line: centre it
+   \fi\par\vskip\baselineskip}
+\makeatother
+\newcommand{\example}{\stepcounter{example}% advance the example counter, then print an unnumbered heading made of \examplename and the new number
+\section*{\examplename~\theexample}}
+\def\bsideways{\begin{table}} % opens an ordinary table float; NOTE(review): presumably a rotated float in the print master -- confirm
+\def\esideways{\end{table}} % closes the float opened by \bsideways
+
+\newcommand{\precdata}{\hyperlink{precdata}{{\tt mld\_prec\_type}}} % link to the target named precdata, shown as mld_prec_type in typewriter type
+\newcommand{\descdata}{\hyperlink{descdata}{{\tt psb\_desc\_type}}} % link to the target named descdata, shown as psb_desc_type
+\newcommand{\spdata}{\hyperlink{spdata}{{\tt psb\_spmat\_type}}} % link to the target named spdata, shown as psb_spmat_type
+\newcommand{\Ref}[1]{\mbox{(\ref{#1})}} % cross-reference number in parentheses, kept together inside an \mbox
+
+\begin{document}
+{\Large\bfseries MLD2P4\\[.8ex] User's and Reference Guide}\\[\baselineskip]
+\emph{\large A guide for the Multi-Level Domain Decomposition 
+Parallel Preconditioners Package
+based on PSBLAS}\\[3ex]
+{\bfseries Pasqua D'Ambra}\\
+ ICAR-CNR, Naples, Italy\\
+{\bfseries Daniela di Serafino}\\
+ Second University of Naples, Italy\\
+{\bfseries Salvatore Filippone} \\
+ University of Rome ``Tor Vergata'', Italy\\[2ex]
+%\\[10ex]
+%\today
+Software version: 1.0\\
+%\today
+July 24, 2008
+\clearpage
+\ \\
+\thispagestyle{empty}
+\clearpage
+
+\pagenumbering{roman}   % Roman numbering
+\setcounter{page}{1}    % Abstract start on page i
+
+\include{abstract}
+\cleardoublepage
+
+\begingroup
+  \renewcommand*{\thepage}{toc}
+  %\pagenumbering{roman}   % Roman numbering
+  %\setcounter{page}{1}    % Abstract start on page ii
+  \tableofcontents
+\endgroup  
+\cleardoublepage
+
+\pagenumbering{arabic}  % Arabic numbering
+\setcounter{page}{1}    % Chapters start on page 1
+
+\include{overview}
+\include{distribution}
+\include{building}
+\include{background}
+\include{gettingstarted}
+\include{userinterface}
+%\include{advanced}
+\include{errors}
+%\include{listofroutines}
+\cleardoublepage
+\appendix
+\include{license}
+\cleardoublepage
+\include{bibliography}
+
+
+\end{document}
+%%% Local Variables: 
+%%% mode: latex
+%%% TeX-master: 'userguide'
+%%% End: 
diff --git a/docs/src/userinterface.tex b/docs/src/userinterface.tex
new file mode 100644
index 00000000..e3d5cf34
--- /dev/null
+++ b/docs/src/userinterface.tex
@@ -0,0 +1,443 @@
+\section{User Interface\label{sec:userinterface}}
+\markboth{\textsc{MLD2P4 User's and Reference Guide}}
+         {\textsc{\ref{sec:userinterface} User Interface}}
+
+
+The basic user interface of MLD2P4 consists of six routines. The four routines \verb|mld_| \verb|precinit|,
+\verb|mld_precset|, \verb|mld_precbld| and \verb|mld_precaply| encapsulate all the functionalities
+for the setup and the application of any one-level and multi-level
+preconditioner implemented in the package.
+The routine \verb|mld_precfree| deallocates the preconditioner data structure, while
+\verb|mld_precdescr| prints a description of the preconditioner set up by the user.
+
+For each routine, the same user interface is overloaded with
+respect to the real/complex case and the single/double precision;
+arguments with appropriate data types must be passed to the routine,
+i.e.
+\begin{itemize}
+\item the sparse matrix data structure, containing the matrix to be
+  preconditioned, must be of type \verb|psb_|\emph{x}\verb|spmat_type|
+	with \emph{x} = \verb|s| for real single precision, \emph{x} = \verb|d|
+	for real double precision, \emph{x} = \verb|c| for complex single precision,
+	\emph{x} = \verb|z| for complex double precision;
+\item the preconditioner data structure must be of type
+  \verb|mld_|\emph{x}\verb|prec_type|, with \emph{x} =    
+  \verb|s|, \verb|d|, \verb|c|, \verb|z|, according to the sparse
+  matrix data structure;
+\item the arrays containing the vectors $v$ and $w$ involved in
+  the preconditioner application $w=M^{-1}v$ must be of type   
+  \emph{type}\verb|(|\emph{kind\_parameter}\verb|)|, with \emph{type} =
+  \verb|real|, \verb|complex| and \emph{kind\_parameter} = \verb|kind(1.e0)|,
+  \verb|kind(1.d0)|, according to the sparse matrix and preconditioner
+  data structure; note that the PSBLAS module \verb|psb_base_mod|
+  provides the constants \verb|psb_spk_|
+  = \verb|kind(1.e0)| and \verb|psb_dpk_| = \verb|kind(1.d0)|;
+\item real parameters defining the preconditioner must be declared
+  according to the precision of the sparse matrix and preconditioner
+  data structures (see Section~\ref{sec:precset}).
+\end{itemize}
+A description of each routine is given in the remainder of this section.
+
+\clearpage
+
+\subsection{Subroutine mld\_precinit\label{sec:precinit}}
+
+\begin{center}
+\verb|mld_precinit(p,ptype,info)| \\
+\verb|mld_precinit(p,ptype,info,nlev)| \\
+\end{center}
+
+\noindent
+This routine allocates and initializes the preconditioner data structure,
+according to the preconditioner type chosen by the user.
+
+{\vskip2\baselineskip\noindent\large\bfseries Arguments}
+
+\begin{tabular}{p{1.2cm}p{12cm}}
+\verb|p|      & \verb|type(mld_|\emph{x}\verb|prec_type), intent(inout)|.\\
+              & The preconditioner data structure. Note that \emph{x}
+                must be chosen according to the real/complex, single/double
+                precision version of MLD2P4 under use.\\
+\verb|ptype|  & \verb|character(len=*), intent(in)|.\\
+              & The type of preconditioner. Its values are specified
+              in Table~\ref{tab:precinit}.\\
+              & Note that the strings are case insensitive.\\
+\verb|info|   & \verb|integer, intent(out)|.\\
+              & Error code. If no error, 0 is returned. See Section~\ref{sec:errors} for details.\\
+\verb|nlev|   & \verb|integer, optional, intent(in)|.\\
+              & The number of levels of the multilevel preconditioner.
+                If \verb|nlev| is not present and \verb|ptype|=\verb|'ML'|, \verb|'ml'|, 
+                then \verb|nlev|=2 is assumed. Otherwise, \verb|nlev| is ignored.\\
+\end{tabular}
+
+\clearpage
+
+\subsection{Subroutine mld\_precset\label{sec:precset}}
+
+\begin{center}
+\verb|mld_precset(p,what,val,info)|\\
+\end{center}
+
+\noindent
+This routine sets the parameters defining the preconditioner. More
+precisely, the parameter identified by \verb|what| is assigned the value
+contained in \verb|val|.
+
+{\vskip2\baselineskip\noindent\large\bfseries Arguments}
+
+\begin{tabular}{p{1.2cm}p{12cm}}
+\verb|p|      & \verb|type(mld_|\emph{x}\verb|prec_type), intent(inout)|.\\
+              & The preconditioner data structure. Note that \emph{x} must
+                be chosen according to the real/complex, single/double precision
+                 version of MLD2P4 under use.\\
+\verb|what|   & \verb|integer, intent(in)|. \\
+              & The number identifying the parameter to be set.
+                A mnemonic constant has been associated with each of these
+                numbers, as reported in Tables~\ref{tab:p_type}--\ref{tab:p_coarse}.\\
+\verb|val|    & \verb|integer| \emph{or} \verb|character(len=*)| \emph{or}
+                \verb|real(psb_spk_)| \emph{or} \verb|real(psb_dpk_)|,
+                \verb|intent(in)|.\\
+              & The value of the parameter to be set. The list of allowed
+                values and the corresponding data types is given in
+                Tables~\ref{tab:p_type}--\ref{tab:p_coarse}.
+                When the value is of type \verb|character(len=*)|,
+                it is also treated as case insensitive.\\
+\verb|info|   & \verb|integer, intent(out)|.\\
+              & Error code. If no error, 0 is returned. See Section~\ref{sec:errors}
+                for details.\\
+%
+%\verb|ilev|   & \verb|integer, optional, intent(in)|.\\
+%              & For the multilevel preconditioner, the level at which the
+%                preconditioner parameter has to be set.
+%                The levels are numbered in increasing
+%                order starting from the finest one, i.e.\ level 1 is the finest level.
+%                If \verb|ilev| is not present, the parameter identified by \verb|what|
+%                is set at all the appropriate levels (see Table~\ref{tab:params}).
+\end{tabular}
+
+\ \\
+A variety of (one-level and multi-level) preconditioners can be obtained
+by a suitable setting of the preconditioner parameters. These parameters
+can be logically divided into four groups, i.e.\ parameters defining
+\begin{enumerate}
+	\item the type of multi-level preconditioner;
+	\item the one-level preconditioner used as smoother;
+	\item the aggregation algorithm;
+	\item the coarse-space correction at the coarsest level.
+\end{enumerate}
+A list of the parameters that can be set, along with their allowed and
+default values, is given in Tables~\ref{tab:p_type}--\ref{tab:p_coarse}.
+For a detailed description  of the meaning of the parameters, please
+refer to Section~\ref{sec:background}. 
+%
+%Note that the routine allows to set different features of the
+%preconditioner at each level through the use of \verb|ilev|.
+%This should be done by users with experience in the field of
+%multi-level preconditioners. Non-expert users are recommended
+%to call \verb| mld_precset| without specifying \verb|ilev|.
+
+\bsideways
+\begin{center}
+\begin{tabular}{|l|l|p{2cm}|l|p{7cm}|}
+\hline
+\verb|what|              & \textsc{data type}        &  \verb|val|      &  \textsc{default}  &
+\textsc{comments} \\ \hline
+%\multicolumn{5}{|c|}{\emph{type of the multi-level preconditioner}}\\ \hline
+\verb|mld_ml_type_|      & \verb|character(len=*)|
+                         & \texttt{'ADD'} \ \ \ \texttt{'MULT'}   
+                         & \texttt{'MULT'}
+                         & Basic multi-level framework: additive or multiplicative
+                           among the levels (always additive inside a level).         \\ \hline 
+\verb|mld_smoother_type_|& \verb|character(len=*)|
+                         & \texttt{'DIAG'} \ \ \ \texttt{'BJAC'} \ \ \ \texttt{'AS'}
+                         & \texttt{'AS'}
+                         & Basic one-level preconditioner (i.e.\ smoother): diagonal,
+                           block Jacobi, AS. \\ \hline
+\verb|mld_smoother_pos_| & \verb|character(len=*)|
+                         & \texttt{'PRE'} \ \ \ \texttt{'POST'} \ \ \ \texttt{'TWOSIDE'}
+                         & \texttt{'POST'}
+                         & ``Position'' of the smoother: pre-smoother, post-smoother, 
+                           pre- and post-smoother. \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Parameters defining the type of multi-level preconditioner.
+\label{tab:p_type}}                       
+\esideways
+                   
+\bsideways
+\begin{center}
+\begin{tabular}{|l|l|p{3.2cm}|l|p{7cm}|}
+\hline
+\verb|what|              & \textsc{data type}        &  \verb|val|      &  \textsc{default}  &
+\textsc{comments} \\ \hline
+%\multicolumn{5}{|c|}{\emph{basic one-level preconditioner (smoother)}} \\ \hline
+\verb|mld_sub_ovr_|       & \verb|integer|
+                         & Any~int.~num.~$\ge 0$
+                         & 1
+                         & Number of overlap layers. \\ \hline
+\verb|mld_sub_restr_|    & \verb|character(len=*)|
+                         & \texttt{'HALO'} \hspace{2.5cm} \texttt{'NONE'}
+                         & \texttt{'HALO'}
+                         & Type of restriction operator:
+                           \texttt{'HALO'} for taking into account the overlap, \texttt{'NONE'} 
+                           for neglecting it. \\ \hline
+\verb|mld_sub_prol_|     & \verb|character(len=*)|
+                         & \texttt{'SUM'} \hspace{2.5cm} \texttt{'NONE'}
+                         & \texttt{'NONE'}
+                         & Type of prolongation operator:
+                           \texttt{'SUM'} for adding the contributions from the overlap, \texttt{'NONE'}
+                           for neglecting them.   \\ \hline
+\verb|mld_sub_solve_|    & \verb|character(len=*)|
+                         & \texttt{'ILU'} \hspace{2.5cm} \texttt{'MILU'} \hspace{2.5cm} \texttt{'ILUT'} 
+                           \hspace{2.5cm} \texttt{'UMF'} \hspace{2.5cm} \texttt{'SLU'}
+                         & \texttt{'UMF'}
+                         & Local solver: ILU($p$), MILU($p$), ILU($p,t$), LU from UMFPACK, LU from SuperLU
+                           (plus triangular solve). \\ \hline  
+\verb|mld_sub_fillin_|   & \verb|integer|
+                         & Any~int.~num.~$\ge 0$
+                         & 0
+                         & Fill-in level $p$ of the incomplete LU factorizations. \\ \hline
+\verb|mld_sub_iluthrs_|  & \verb|real(|\emph{kind\_parameter}\verb|)|
+                         & Any~real~num.~$\ge 0$
+                         & 0
+                         & Drop tolerance $t$ in the ILU($p,t$) factorization. \\ \hline
+\verb|mld_sub_ren_|      & \verb|character(len=*)|
+                         & \texttt{'RENUM\_NONE'}  \texttt{'RENUM\_GLOBAL'} %, \texttt{'RENUM_GPS'}
+                         & \texttt{'RENUM\_NONE'}
+                         & Row and column reordering of the local submatrices: no reordering,
+                           reordering according to the global numbering of the rows and columns of
+                           the whole matrix. \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Parameters defining the one-level preconditioner used as smoother.
+\label{tab:p_smoother}}  
+\esideways
+                   
+\bsideways
+\begin{center}
+\begin{tabular}{|l|l|p{2.3cm}|p{2.6cm}|p{7cm}|}
+\hline
+\verb|what|              & \textsc{data type}        &  \verb|val|      &  \textsc{default}  &
+\textsc{comments} \\ \hline
+%\multicolumn{5}{|c|}{\emph{aggregation algorithm}} \\ \hline
+\verb|mld_aggr_alg_|     & \verb|character(len=*)|
+                         & \texttt{'DEC'}
+                         & \texttt{'DEC'}
+                         & Aggregation algorithm. Currently, only the decoupled aggregation is available. \\ \hline
+\verb|mld_aggr_kind_|    & \verb|character(len=*)|
+                         & \texttt{'SMOOTH'} \hspace{2.5cm} \texttt{'RAW'}
+                         & \texttt{'SMOOTH'}
+                         & Type of aggregation: smoothed, raw (i.e.\ using the tentative prolongator). \\ \hline
+\verb|mld_aggr_thresh_|  & \verb|real(|\emph{kind\_parameter}\verb|)|
+                         & Any~real~num. $\in [0, 1]$
+                         & 0
+                         & Threshold $\theta$ in the aggregation algorithm. \\ \hline
+\verb|mld_aggr_eig_|     & \verb|character(len=*)|
+                         & \texttt{'A\_NORMI'}
+                         & \texttt{'A\_NORMI'}
+                         & Estimate of the eigenvalue of $D^{-1}A$ with largest modulus,
+                           to build the damping parameter $\omega$ in the smoothed aggregation.
+                           Currently, only the infinity norm of
+                           the matrix is available. \\ \hline
+\verb|mld_aggr_damp_|    & \verb|real(|\emph{kind\_parameter}\verb|)|
+                         & Any~real~num.
+                         & $4/(3\|D^{-1}A\|_\infty)$
+                         & Damping parameter $\omega$ in the smoothed aggregation algorithm. 
+                           If the user specifies a negative value, then $\omega$
+                           is set to its default value;
+                           otherwise, $\omega$ is set to the value provided by the
+                           user. In the latter case no estimate of the eigenvalue of
+                           $D^{-1}A$ with largest modulus is computed.\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Parameters defining the aggregation algorithm.
+\label{tab:p_aggregation}} 
+\esideways
+                     
+\bsideways
+\begin{center}
+\begin{tabular}{|l|l|p{3.2cm}|l|p{7cm}|}
+\hline
+\verb|what|              & \textsc{data type}        &  \verb|val|      &  \textsc{default}  &
+\textsc{comments} \\ \hline
+%\multicolumn{5}{|c|}{\emph{coarse-space correction at the coarsest level}}\\ \hline
+\verb|mld_coarse_mat_|   & \verb|character(len=*)|
+                         & \texttt{'DISTR'} \hspace{2.5cm} \texttt{'REPL'}
+                         & \texttt{'DISTR'}
+                         & Coarsest matrix: distributed among the processors or
+                           replicated on each of them. \\ \hline
+\verb|mld_coarse_solve_| & \verb|character(len=*)|
+                         & \texttt{'BJAC'} \hspace{2.5cm} \texttt{'UMF'} \hspace{2.5cm}
+                           \texttt{'SLU'} \hspace{2.5cm} \texttt{'SLUDIST'}
+                         & \texttt{'BJAC'}
+                         & Solver used at the coarsest level: block Jacobi, sequential
+                           LU from UMFPACK, sequential LU from SuperLU, 
+                           distributed LU from SuperLU\_Dist.
+                           \texttt{'BJAC'} and \texttt{'SLUDIST'} require the coarsest 
+                           matrix to be distributed, while \texttt{'UMF'} and
+                           \texttt{'SLU'} require it to be replicated. \\ \hline
+\verb|mld_coarse_subsolve_| & \verb|character(len=*)|
+                         & \texttt{'ILU'} \hspace{2.5cm} \texttt{'MILU'}
+                           \hspace{2.5cm} \texttt{'ILUT'}
+                           \hspace{2.5cm} \texttt{'UMF'} \hspace{2.5cm} \texttt{'SLU'}
+                         & \texttt{'UMF'}
+                         & Solver for the diagonal blocks of the coarse matrix,
+                           in case the block Jacobi solver
+                           is chosen as coarsest-level solver: ILU($p$), MILU($p$),
+                           ILU($p,t$), LU from UMFPACK,
+                           LU from SuperLU, plus triangular solve. \\ \hline
+\verb|mld_coarse_sweeps_|& \verb|integer|                         
+                         & Any~int.~num.~$> 0$
+                         & 4
+                         & Number of block Jacobi sweeps when \texttt{'BJAC'} is used as
+                           coarsest-level solver. \\ \hline
+\verb|mld_coarse_fillin_| & \verb|integer|
+                         & Any~int.~num.~$\ge 0$
+                         & 0
+                         & Fill-in level $p$ of the incomplete LU factorizations. \\ \hline
+\verb|mld_coarse_iluthrs_| & \verb|real(|\emph{kind\_parameter}\verb|)|
+                         & Any~real~num.~$\ge 0$
+                         & 0
+                         & Drop tolerance $t$ in the ILU($p,t$) factorization. \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Parameters defining the coarse-space correction at the coarsest
+level.\label{tab:p_coarse}} 
+\esideways
+
+
+\clearpage
+
+\subsection{Subroutine mld\_precbld\label{sec:precbld}}
+
+\begin{center}
+\verb|mld_precbld(a,desc_a,p,info)|\\
+\end{center}
+
+\noindent
+This routine builds the preconditioner according to the requirements made by
+the user through the routines \verb|mld_precinit| and \verb|mld_precset|.
+
+{\vskip2\baselineskip\noindent\large\bfseries Arguments}
+
+\begin{tabular}{p{1.2cm}p{12cm}}
+\verb|a|      & \verb|type(psb_|\emph{x}\verb|spmat_type), intent(in)|. \\
+              & The sparse matrix structure containing the local part of the
+                matrix to be preconditioned. Note that \emph{x} must be chosen according
+                to the real/complex, 
+single/double precision version of MLD2P4 under use.
+                See the PSBLAS User's Guide for details \cite{PSBLASGUIDE}.\\
+\verb|desc_a| & \verb|type(psb_desc_type), intent(in)|. \\
+              & The communication descriptor of \verb|a|. See the PSBLAS User's Guide for
+                details \cite{PSBLASGUIDE}.\\
+\verb|p|      & \verb|type(mld_|\emph{x}\verb|prec_type), intent(inout)|.\\
+              & The preconditioner data structure. Note that \emph{x} must be chosen according
+                to the real/complex, single/double precision version of MLD2P4 under use.\\
+\verb|info|   & \verb|integer, intent(out)|.\\
+              & Error code. If no error, 0 is returned. See Section~\ref{sec:errors} for details.\\
+\end{tabular}
+
+\clearpage
+\subsection{Subroutine mld\_precaply\label{sec:precaply}}
+
+\begin{center}
+\verb|mld_precaply(p,x,y,desc_a,info)|\\
+\verb|mld_precaply(p,x,y,desc_a,info,trans,work)|\\
+\end{center}
+
+\noindent
+This routine computes $y = op(M^{-1})\, x$, where $M$ is a previously built
+preconditioner, stored in \verb|p|, and $op$
+denotes the preconditioner itself or its transpose, according to
+the value of \verb|trans|.
+Note that, when MLD2P4 is used with a Krylov solver from PSBLAS,
+\verb|mld_precaply| is called within the PSBLAS routine \verb|psb_krylov|
+and hence it is completely transparent to the user.
+
+{\vskip2\baselineskip\noindent\large\bfseries Arguments}
+
+\begin{tabular}{p{1.2cm}p{12cm}}
+\verb|p|      & \verb|type(mld_|\emph{x}\verb|prec_type), intent(inout)|.\\
+              & The preconditioner data structure, containing the local part of $M$.
+                Note that \emph{x} must be chosen according
+                to the real/complex, single/double precision version of MLD2P4 under use.\\
+\verb|x|      & \emph{type}\verb|(|\emph{kind\_parameter}\verb|), dimension(:), intent(in)|.\\
+              & The local part of the vector $x$. Note that \emph{type} and   
+                \emph{kind\_parameter} must be chosen according
+                to the real/complex, single/double precision version of MLD2P4 under use.\\
+\verb|y|      & \emph{type}\verb|(|\emph{kind\_parameter}\verb|), dimension(:), intent(out)|.\\
+              & The local part of the vector $y$. Note that \emph{type} and
+                \emph{kind\_parameter} must be chosen according
+                to the real/complex, single/double precision version of MLD2P4 under use.\\
+\verb|desc_a| & \verb|type(psb_desc_type), intent(in)|. \\
+              & The communication descriptor associated with the matrix to be
+                preconditioned.\\
+\verb|info|   & \verb|integer, intent(out)|.\\
+              & Error code. If no error, 0 is returned. See Section~\ref{sec:errors} for details.\\
+\verb|trans|  & \verb|character(len=1), optional, intent(in)|.\\
+              & If \verb|trans| = \verb|'N','n'| then $op(M^{-1}) = M^{-1}$;
+                if \verb|trans| = \verb|'T','t'| then $op(M^{-1}) = M^{-T}$
+                (transpose of $M^{-1}$);  if \verb|trans| = \verb|'C','c'| then $op(M^{-1}) = M^{-C}$
+                (conjugate transpose of $M^{-1}$).\\
+\verb|work|  & \emph{type}\verb|(|\emph{kind\_parameter}\verb|), dimension(:), optional, target|.\\
+             & Workspace. Its size should be at
+               least \verb|4 * psb_cd_get_local_| \verb|cols(desc_a)| (see the PSBLAS User's Guide).
+               Note that \emph{type} and \emph{kind\_parameter} must be chosen according
+               to the real/complex, single/double precision version of MLD2P4 under use.\\
+\end{tabular}
+
+\clearpage
+
+\subsection{Subroutine mld\_precfree\label{sec:precfree}}
+
+\begin{center}
+\verb|mld_precfree(p,info)|\\
+\end{center}
+
+\noindent
+This routine deallocates the preconditioner data structure.
+
+{\vskip2\baselineskip\noindent\large\bfseries Arguments}
+
+\begin{tabular}{p{1.2cm}p{10.5cm}}
+\verb|p|      & \verb|type(mld_|\emph{x}\verb|prec_type), intent(inout)|.\\
+              & The preconditioner data structure. Note that \emph{x} must be chosen according
+                to the real/complex, single/double precision version of MLD2P4 under use.\\
+\verb|info|   & \verb|integer, intent(out)|.\\
+              & Error code. If no error, 0 is returned. See Section~\ref{sec:errors} for details.\\
+\end{tabular}
+
+\clearpage
+
+\subsection{Subroutine mld\_precdescr\label{sec:precdescr}}
+
+\begin{center}
+\verb|mld_precdescr(p,info)|\\
+\verb|mld_precdescr(p,info,iout)|\\
+\end{center}
+
+\noindent
+This routine prints a description of the preconditioner to the standard output or
+to a file. It must be called after \verb|mld_precbld| has been called.
+
+{\vskip2\baselineskip\noindent\large\bfseries Arguments}
+
+\begin{tabular}{p{1.2cm}p{12cm}}
+\verb|p|      & \verb|type(mld_|\emph{x}\verb|prec_type), intent(in)|.\\
+              & The preconditioner data structure. Note that \emph{x} must be chosen according
+                to the real/complex, single/double precision version of MLD2P4 under use.\\
+\verb|info|   & \verb|integer, intent(out)|.\\
+              & Error code. If no error, 0 is returned. See Section~\ref{sec:errors} for details.\\
+\verb|iout|   & \verb|integer, optional, intent(in)|.\\
+              & The id of the file where the preconditioner description
+                will be printed; the default is the standard output.\\
+\end{tabular}
+
+%%% Local Variables: 
+%%% mode: latex
+%%% TeX-master: "userguide"
+%%% End: