From 0c883525308de8c268a23fc9236ac464cb9d3ddd Mon Sep 17 00:00:00 2001 From: wlthr Date: Thu, 29 Jun 2023 19:07:23 +0200 Subject: [PATCH] added sp3mm4amg source code to the repository and created C wrapper routine to be used in the fortran interface --- .vscode/settings.json | 7 + base/serial/impl/sp3mm4amg/COPYING | 15 + .../impl/sp3mm4amg/Sp3MM_CSR_OMP_Multi.c | 66 + .../sp3mm4amg/Sp3MM_CSR_OMP_Num_Generic.c | 249 ++ .../Sp3MM_CSR_OMP_SymbStep_Generic.c | 594 ++++ .../sp3mm4amg/Sp3MM_CSR_OMP_SymbStep_Multi.c | 92 + .../impl/sp3mm4amg/Sp3MM_CSR_OMP_UB_Generic.c | 573 +++ .../serial/impl/sp3mm4amg/commons/ompGetICV.c | 86 + .../commons/ompi_dims_create/mpi.h.in | 3141 +++++++++++++++++ .../ompi_dims_create/ompi_config_minimal.h | 193 + .../ompi_dims_create/ompi_dims_create.c | 288 ++ .../sp3mm4amg/commons/sparseUtilsGeneric.c | 499 +++ .../impl/sp3mm4amg/commons/sparseUtilsMulti.c | 38 + base/serial/impl/sp3mm4amg/commons/utils.c | 502 +++ .../impl/sp3mm4amg/fbind/psb_f_spmm_ub.c | 107 + .../sp3mm4amg/include/Sp3MM_CSR_OMP_Multi.h | 94 + .../include/Sp3MM_CSR_OMP_Num_Generic.h | 95 + .../include/Sp3MM_CSR_OMP_SymbStep_Generic.h | 195 + .../include/Sp3MM_CSR_OMP_SymbStep_Multi.h | 80 + .../include/Sp3MM_CSR_OMP_UB_Generic.h | 95 + .../impl/sp3mm4amg/include/SpMMUtilsGeneric.h | 406 +++ .../impl/sp3mm4amg/include/SpMMUtilsMulti.h | 56 + base/serial/impl/sp3mm4amg/include/config.h | 155 + .../impl/sp3mm4amg/include/inlineExports.h | 49 + .../sp3mm4amg/include/inlineExports_Generic.h | 43 + .../include/linuxK_rbtree_minimalized.h | 335 ++ base/serial/impl/sp3mm4amg/include/macros.h | 118 + .../impl/sp3mm4amg/include/macrosLinuxMock.h | 140 + base/serial/impl/sp3mm4amg/include/mmio.h | 133 + .../impl/sp3mm4amg/include/ompChunksDivide.h | 105 + .../serial/impl/sp3mm4amg/include/ompGetICV.h | 41 + .../sp3mm4amg/include/ompi_config_minimal.h | 193 + base/serial/impl/sp3mm4amg/include/parser.h | 90 + .../impl/sp3mm4amg/include/sparseMatrix.h | 216 ++ .../sp3mm4amg/include/sparseUtilsGeneric.h | 138 + .../impl/sp3mm4amg/include/sparseUtilsMulti.h | 52 + base/serial/impl/sp3mm4amg/include/utils.h | 181 + .../sp3mm4amg/lib/linuxK_rbtree_minimalized.c | 414 +++ base/serial/impl/sp3mm4amg/lib/mmio.c | 511 +++ base/serial/impl/sp3mm4amg/lib/parser.c | 391 ++ cbind/base/psb_objhandle_mod.F90 | 33 +- 41 files changed, 10777 insertions(+), 32 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 base/serial/impl/sp3mm4amg/COPYING create mode 100644 base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_Multi.c create mode 100644 base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_Num_Generic.c create mode 100644 base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_SymbStep_Generic.c create mode 100644 base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_SymbStep_Multi.c create mode 100644 base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_UB_Generic.c create mode 100644 base/serial/impl/sp3mm4amg/commons/ompGetICV.c create mode 100644 base/serial/impl/sp3mm4amg/commons/ompi_dims_create/mpi.h.in create mode 100644 base/serial/impl/sp3mm4amg/commons/ompi_dims_create/ompi_config_minimal.h create mode 100644 base/serial/impl/sp3mm4amg/commons/ompi_dims_create/ompi_dims_create.c create mode 100644 base/serial/impl/sp3mm4amg/commons/sparseUtilsGeneric.c create mode 100644 base/serial/impl/sp3mm4amg/commons/sparseUtilsMulti.c create mode 100644 base/serial/impl/sp3mm4amg/commons/utils.c create mode 100644 base/serial/impl/sp3mm4amg/fbind/psb_f_spmm_ub.c create mode 100644 base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_Multi.h create mode 100644 
base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_Num_Generic.h create mode 100644 base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_SymbStep_Generic.h create mode 100644 base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_SymbStep_Multi.h create mode 100644 base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_UB_Generic.h create mode 100644 base/serial/impl/sp3mm4amg/include/SpMMUtilsGeneric.h create mode 100644 base/serial/impl/sp3mm4amg/include/SpMMUtilsMulti.h create mode 100644 base/serial/impl/sp3mm4amg/include/config.h create mode 100644 base/serial/impl/sp3mm4amg/include/inlineExports.h create mode 100644 base/serial/impl/sp3mm4amg/include/inlineExports_Generic.h create mode 100644 base/serial/impl/sp3mm4amg/include/linuxK_rbtree_minimalized.h create mode 100644 base/serial/impl/sp3mm4amg/include/macros.h create mode 100644 base/serial/impl/sp3mm4amg/include/macrosLinuxMock.h create mode 100644 base/serial/impl/sp3mm4amg/include/mmio.h create mode 100644 base/serial/impl/sp3mm4amg/include/ompChunksDivide.h create mode 100644 base/serial/impl/sp3mm4amg/include/ompGetICV.h create mode 100644 base/serial/impl/sp3mm4amg/include/ompi_config_minimal.h create mode 100644 base/serial/impl/sp3mm4amg/include/parser.h create mode 100644 base/serial/impl/sp3mm4amg/include/sparseMatrix.h create mode 100644 base/serial/impl/sp3mm4amg/include/sparseUtilsGeneric.h create mode 100644 base/serial/impl/sp3mm4amg/include/sparseUtilsMulti.h create mode 100644 base/serial/impl/sp3mm4amg/include/utils.h create mode 100644 base/serial/impl/sp3mm4amg/lib/linuxK_rbtree_minimalized.c create mode 100644 base/serial/impl/sp3mm4amg/lib/mmio.c create mode 100644 base/serial/impl/sp3mm4amg/lib/parser.c diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..b7b048ac --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "fortran.fortls.path": "/usr/bin/fortls", + "files.associations": { + "sp3mm_csr_omp_multi.h": "c", + "sp3mm_csr_omp_ub_generic.h": "c" + } +} \ No newline at end of file diff --git a/base/serial/impl/sp3mm4amg/COPYING b/base/serial/impl/sp3mm4amg/COPYING new file mode 100644 index 00000000..b1ed3a7f --- /dev/null +++ b/base/serial/impl/sp3mm4amg/COPYING @@ -0,0 +1,15 @@ +Copyright Andrea Di Iorio 2022 +This file is part of Sp3MM_4_AMG_OMP_CUDA_C_FORTRAN +Sp3MM_4_AMG_OMP_CUDA_C_FORTRAN is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +Sp3MM_4_AMG_OMP_CUDA_C_FORTRAN is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Sp3MM_4_AMG_OMP_CUDA_C_FORTRAN. If not, see . + diff --git a/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_Multi.c b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_Multi.c new file mode 100644 index 00000000..9982caaf --- /dev/null +++ b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_Multi.c @@ -0,0 +1,66 @@ +/* + * Sp3MM_for_AlgebraicMultiGrid + * (C) Copyright 2021-2022 + * Andrea Di Iorio + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +//Get multiple implementation for C-Fortran indexing by re-define & re-include +#include +#include +#include +#include //TODO quick hold few CSR cols partition sizes +#include +//UB version deps +#include "Sp3MM_CSR_OMP_Multi.h" +#include "SpMMUtilsMulti.h" +#include "sparseUtilsMulti.h" +#include "ompChunksDivide.h" +#include "parser.h" +#include "utils.h" +#include "macros.h" +#include "sparseMatrix.h" + +#include "inlineExports.h" + +//Symb version deps +#include "Sp3MM_CSR_OMP_Multi.h" + + +//global vars -> audit +double Start,End,Elapsed,ElapsedInternal; + +#define OFF_F 0 +#include "inlineExports_Generic.h" +#include "Sp3MM_CSR_OMP_UB_Generic.c" +#include "Sp3MM_CSR_OMP_Num_Generic.c" +#undef OFF_F + +#define OFF_F 1 +#include "inlineExports_Generic.h" +#include "Sp3MM_CSR_OMP_UB_Generic.c" +#include "Sp3MM_CSR_OMP_Num_Generic.c" +#undef OFF_F diff --git a/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_Num_Generic.c b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_Num_Generic.c new file mode 100644 index 00000000..fc4306c1 --- /dev/null +++ b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_Num_Generic.c @@ -0,0 +1,249 @@ +/* + * Sp3MM_for_AlgebraicMultiGrid + * (C) Copyright 2021-2022 + * Andrea Di Iorio + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may + * not be used to endorse or promote products derived from this + * software without specific written permission. 
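[Editor's sketch] The double inclusion at the end of Sp3MM_CSR_OMP_Multi.c above is the code-generation mechanism used throughout this patch: OFF_F selects 0-based (C) or 1-based (Fortran) CSR offsets, and the CAT token-pasting macro suffixes every generated symbol with it. A minimal sketch of the presumed pattern follows; CAT is assumed to be the usual two-level paste from macros.h, and firstColOfRow_ is an illustrative name, not part of the patch:

    // generic body, included once per OFF_F value (cf. the #define/#include/#undef blocks above)
    #define _CAT(a,b) a##b
    #define CAT(a,b)  _CAT(a,b)

    idx_t CAT(firstColOfRow_,OFF_F)(spmat *m, idx_t r){
        // strip the Fortran offset (OFF_F==1) or leave C indexes untouched (OFF_F==0)
        return m->JA[ m->IRP[r] - OFF_F ] - OFF_F;
    }
    // #define OFF_F 0 + #include  ->  firstColOfRow_0()   (C indexing)
    // #define OFF_F 1 + #include  ->  firstColOfRow_1()   (Fortran indexing)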
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*#pragma message( "compiling SpMM_CSR_OMP_Generic.c with OFF_F as:" STR(OFF_F) )*/ +#ifndef OFF_F + #error generic implementation requires OFF_F defined +#endif + +#ifndef SP3MM_OMP_SYMB +#define SP3MM_OMP_SYMB + +//allocCSRSpMatSymbStep aux function for IRP set by symb step output +static inline idx_t _setCSR_IRP_1DPartitioing(spmat* m, idx_t* rowSizes){ + idx_t r,cumulSize; + for (r=0,cumulSize=0; rM; cumulSize += rowSizes[r++]) + m->IRP[r] = cumulSize; + DEBUGCHECKS assert(cumulSize == rowSizes[m->M]); + m->IRP[m->M] = cumulSize; + return cumulSize; +} +static inline idx_t _setCSR_IRP_2DPartitioing(spmat* m,idx_t* rowSizes,ushort gridCols){ + idx_t cumulSize=0; + for (idx_t r=0,cumulSizeOld; rM; r++){ + m->IRP[r] = cumulSize; + for (ushort gc=0; gcM*gridCols]); + m->IRP[m->M] = cumulSize; + return cumulSize; +} +/* + * allocate CSR mat @m, setting up correctly IRP and other buffers allocations + * if @gridCols == 1: (1D partitioning of @m) -> @rowsSizes will be an array of row lenghts + * if @gridCols > 1: (2D partitioning of @m) -> @rowsSizes will be a matrix with + * elem i,j = len of j-th colPartition (out of @gridCols) of i-th row + */ +static inline int allocCSRSpMatSymbStep(spmat* m,idx_t* rowSizes,ushort gridCols){ + //setup IRP and get cumul, whole size of @m + if (!gridCols) return EXIT_FAILURE; + idx_t cumulSize; + if (gridCols == 1) cumulSize = _setCSR_IRP_1DPartitioing(m,rowSizes); + /*if (gridCols > 1} -- static analizer miss unsignedness...*/ + else cumulSize = _setCSR_IRP_2DPartitioing(m,rowSizes,gridCols); + m->IRP[m->M] = cumulSize; + m->NZ = cumulSize; + + if (!(m->AS = malloc(rowSizes[m->M] * sizeof(*m->AS))) ){ + ERRPRINT("allocCSRSpMatSymbStep m->AS malloc errd\n"); + return EXIT_FAILURE; + } + if (!(m->JA = malloc(rowSizes[m->M] * sizeof(*m->JA))) ){ + ERRPRINT("allocCSRSpMatSymbStep m->JA malloc errd\n"); + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} +#endif +//////////////////// COMPUTE CORE Sp[3]MM SYMB-NUMB PHASE ////////////////////////// +////////Sp3MM as 2 x SpMM +///1D +spmat* CAT(spmmRowByRow_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){ + ACC_DENSE *accVects = NULL,*acc; + SPMM_ACC* outAccumul=NULL; + idx_t* rowsSizes = NULL; + ///init AB matrix with SPMM heuristic preallocation + spmat* AB = allocSpMatrix(A->M,B->N); + if (!AB) goto _err; + ///aux structures alloc + if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){ + ERRPRINT("accVects init failed\n"); + goto _err; + } + ///SYMBOLIC STEP + if (!(rowsSizes = CAT(SpMM_Symb___,OFF_F) (cfg->symbMMRowImplID, A,B))) + goto _err; + if (allocCSRSpMatSymbStep(AB,rowsSizes,1)) goto _err; + + ///NUMERIC STEP + ((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (AB->M,AB,cfg); + 
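 /* worked example of the symbolic preallocation used here: with M=3 and the
  * symbolic step returning rowSizes={2,0,3} (total rowSizes[M]=5),
  * allocCSRSpMatSymbStep sets IRP={0,2,2,5} and NZ=5, so each thread in the
  * loop below can sparsifyDirect() its row into AS/JA starting at IRP[r],
  * with no atomic space reservation (assumed from the allocation code above). */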
AUDIT_INTERNAL_TIMES Start=omp_get_wtime(); + #pragma omp parallel for schedule(runtime) private(acc) + for (idx_t r=0; rM; r++){ //row-by-row formulation + acc = accVects + omp_get_thread_num(); + _resetAccVect(acc); //rezero for the next A row + //for each A's row nnz, accumulate scalar vector product nnz.val * B[nnz.col] + /* direct use of sparse scalar vector multiplication + for (idx_t ja=A->IRP[r]-OFF_F,ca,jb,bRowLen; jaIRP[r+1]-OFF_F; ja++){ + ca = A->JA[ja] - OFF_F; + jb = B->IRP[ca] - OFF_F; + bRowLen = B->IRP[ca+1] - B->IRP[ca]; + CAT(scSparseVectMul_,OFF_F)(A->AS[ja], B->AS+jb,B->JA+jb,bRowLen,acc); + } */ + for (ulong ja=A->IRP[r]-OFF_F; jaIRP[r+1]-OFF_F; ja++) //row-by-row formul + CAT(scSparseRowMul_,OFF_F)(A->AS[ja], B, A->JA[ja]-OFF_F, acc); + //direct sparsify: trasform accumulated dense vector to a CSR row + sparsifyDirect(acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API + } + #if OFF_F != 0 + C_FortranShiftIdxs(AB); + #endif + AUDIT_INTERNAL_TIMES End=omp_get_wtime(); + DEBUG checkOverallocPercent(rowsSizes,AB); + goto _free; + + _err: + if(AB) freeSpmat(AB); + AB=NULL; //nothing'll be returned + _free: + free(rowsSizes); + if(accVects) freeAccsDense(accVects,cfg->threadNum); + if(outAccumul) freeSpMMAcc(outAccumul); + + return AB; + +} +spmat* CAT(spmmRowByRow1DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){ + ACC_DENSE *accVects = NULL,*acc; + SPMM_ACC* outAccumul=NULL; + idx_t* rowsSizes = NULL; + ///init AB matrix with SPMM heuristic preallocation + spmat* AB = allocSpMatrix(A->M,B->N); + if (!AB) goto _err; + ///aux structures alloc + if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){ + ERRPRINT("accVects init failed\n"); + goto _err; + } + ///SYMBOLIC STEP + if (!(rowsSizes = CAT(SpMM_Symb___,OFF_F) (cfg->symbMMRowImplID, A,B))) + goto _err; + if (allocCSRSpMatSymbStep(AB,rowsSizes,1)) + goto _err; + + ///NUMERIC STEP + //perform Gustavson over rows blocks -> M / @cfg->gridRows + ulong rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows; + ((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (cfg->gridRows,AB,cfg); + AUDIT_INTERNAL_TIMES Start=omp_get_wtime(); + ulong b,startRow,block; //omp for aux vars + #pragma omp parallel for schedule(runtime) private(acc,startRow,block) + for (b=0; b < cfg->gridRows; b++){ + block = UNIF_REMINDER_DISTRI(b,rowBlock,rowBlockRem); + startRow = UNIF_REMINDER_DISTRI_STARTIDX(b,rowBlock,rowBlockRem); + for (idx_t r=startRow; rIRP[r]-OFF_F,ca,jb,bRowLen; jaIRP[r+1]-OFF_F; ja++){ + ca = A->JA[ja] - OFF_F; + jb = B->IRP[ca] - OFF_F; + bRowLen = B->IRP[ca+1] - B->IRP[ca]; + CAT(scSparseVectMul_,OFF_F)(A->AS[ja], B->AS+jb,B->JA+jb,bRowLen,acc); + } */ + for (ulong ja=A->IRP[r]-OFF_F; jaIRP[r+1]-OFF_F; ja++) //row-by-row formul + CAT(scSparseRowMul_,OFF_F)(A->AS[ja], B, A->JA[ja]-OFF_F, acc); + //direct sparsify: trasform accumulated dense vector to a CSR row + sparsifyDirect(acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API + } + } + #if OFF_F != 0 + C_FortranShiftIdxs(AB); + #endif + AUDIT_INTERNAL_TIMES End=omp_get_wtime(); + DEBUG checkOverallocPercent(rowsSizes,AB); + goto _free; + + _err: + if(AB) freeSpmat(AB); + AB=NULL; //nothing'll be returned + _free: + if(rowsSizes) free(rowsSizes); + if(accVects) freeAccsDense(accVects,cfg->threadNum); + if(outAccumul) freeSpMMAcc(outAccumul); + + return AB; + +} +spmat* CAT(spmmRowByRow2DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){ + DEBUG printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%luxN=%lu\n",A->M,B->N); + idx_t* bColOffsets = NULL; //B 
group columns starting offset for each row + ACC_DENSE *accVectors=NULL,*accV; + SPACC* accRowPart; + spmat* AB = allocSpMatrix(A->M,B->N); + SPMM_ACC* outAccumul=NULL; + idx_t *rowsPartsSizes=NULL, *rowSizes=NULL; //for rows' cols partition, correct len + if (!AB) goto _err; + //2D indexing aux vars + ulong gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols; + ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows; + ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols; + ulong startRow,startCol,rowBlock,colBlock; //data division aux variables + ////get bColOffsets for B column groups + if (!(bColOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols))) + goto _err; + ///TODO TODO + _err: + if (AB) freeSpmat(AB); + AB = NULL; + _free: + free(rowsPartsSizes); + free(bColOffsets); + if (accVectors) freeAccsDense(accVectors,gridSize); + if (outAccumul) freeSpMMAcc(outAccumul); + + return AB; +} +/* TODO +///2D +//PARTITIONS NOT ALLOCATED +spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){ +///SP3MM +spmat* CAT(sp3mmRowByRowPair_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,SPMM_INTERF spmm){ +////////Sp3MM direct +///1D +spmat* CAT(sp3mmRowByRowMerged_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,SPMM_INTERF spmm){ +*/ diff --git a/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_SymbStep_Generic.c b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_SymbStep_Generic.c new file mode 100644 index 00000000..c41242a5 --- /dev/null +++ b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_SymbStep_Generic.c @@ -0,0 +1,594 @@ +/* + * Sp3MM_for_AlgebraicMultiGrid + * (C) Copyright 2021-2022 + * Andrea Di Iorio + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
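[Editor's sketch] The 1D-block kernels above (and the 2D tiling variants) carve the iteration space with UNIF_REMINDER_DISTRI / UNIF_REMINDER_DISTRI_STARTIDX. Their definitions live in macros.h, whose contents are not shown in this excerpt, so the following is an assumption about the semantics their uses imply, not the patch's actual code:

    // uniform partition of n items into p blocks, remainder spread over the first blocks
    #define UNIF_REMINDER_DISTRI(i,block,rem)          ((block) + ((i) < (rem) ? 1 : 0))
    #define UNIF_REMINDER_DISTRI_STARTIDX(i,block,rem) ((block)*(i) + ((i) < (rem) ? (i) : (rem)))
    // e.g. M=10 rows, gridRows=4: rowBlock=2, rowBlockRem=2 -> block sizes {3,3,2,2}, start rows {0,3,6,8}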
+ */ + +/* + * CSR Sp[3]MM Symbolic step implementations + * target: compute the output matrix size and the row lens for preallocation + * direct write out partial results + * See interfaces in respective header + */ + + +/*#pragma message( "compiling Sp3MM_CSR_OMP_Symb_Generic.c with config as:" \ + STR(OFF_F) " - " STR(OUT_IDXS) " - " STR(COL_PARTS) )*/ +#ifndef OFF_F + #error generic implementation requires OFF_F defined +#endif + +///setup aux macros for different signatures implementation via #if arith expr +#pragma push_macro("OUT_IDXS") +#pragma push_macro("_OUT_IDXS") +#pragma push_macro("COL_PARTS") +#pragma push_macro("_COL_PARTS") + +#ifdef OUT_IDXS + #define _OUT_IDXS TRUE +#else + #define _OUT_IDXS FALSE + #define OUT_IDXS _UNDEF +#endif +#ifdef COL_PARTS + #define _COL_PARTS TRUE +#else + #define _COL_PARTS FALSE + #define COL_PARTS _UNDEF +#endif +/// + +//////SpMM - rowByrow +///1row->matrix->outRow +//RBTREE based +/* + * Compute symbolic product of (nnz indexes of) row @aRowJA and matrix @b + * insert nnz indexes of the mul. result row as nodes in a rbtree rooted at @root + * with nodes in @nodes which have to be enough for the mul result row (use an UB) + * Retuns: multiplication result row NNZ number,se CONFIG_MACROS below for more + * + * CONFIG_MACROS: + * if _OUT_IDXS == TRUE return mul.result row nnz idxs in @outIdxs + * ifdef: OUT_IDXS_RBTREE_NODES: nnz indexes returned inplace sorting rbtree + * as nnz indexes(JA) of the mul result row + * else: stop at returning the mul. result row lenght + * if _COL_PARTS == TRUE return the number of nonzero elements in + * in each of the @gridCols column partitions inside @rowColPartsLens + * OFF_F: offset back indexes from fortran + * TODO also output indexes are shifted (see c_b ) + */ +static inline idx_t CAT4(SpMM_Row_Symb_Rbtree,OUT_IDXS,COL_PARTS,OFF_F) + ( + idx_t* aRowJA, idx_t aRowLen, spmat* b,rbRoot* root, rbNode* nodes + #if _OUT_IDXS == TRUE && !defined OUT_IDXS_RBTREE_NODES + ,idx_t* outIdxs + #endif + #if _COL_PARTS == TRUE + ,ushort gridCols,idx_t* rowColPartsLens + #endif + ) +{ + //Compute resulting ab's row non zero indexes and total lenght + idx_t abRowLen = 0; //mul.result row len, return value + for ( idx_t i=0,c_a,inserted; i < aRowLen; i++ ){ //for each entry in a's row + c_a = aRowJA[i]-OFF_F; + //gather diffrent nnz indexes in corresponding b's `c_a`th row + for ( idx_t j = b->IRP[c_a]-OFF_F,c_b; j < b->IRP[c_a+1]-OFF_F; j++ ){ + c_b = b->JA[j]-OFF_F; + //check if c_b is nonzero index for mul.result row + inserted = rbInsertNewKey (root, nodes+abRowLen, c_b); + abRowLen += inserted; //inserted needed just after this + /* LESS EFFICIENT THEN BELOW (here no memory of last colPart) + #if _COL_PARTS == TRUE //keep track of which col partition is c_b in + if (inserted) + rowColPartsLens[ matchingUnifRangeIdx(c_b, b->N, gridCols) ]++; + #endif */ + } + } + #if _OUT_IDXS == T && defined OUT_IDXS_RBTREE_NODES + /* return the non zero indexes of the mul.result row + * sorting inplace the nodes inserted in the rbtree */ + sortRbNode(nodes,abRowLen); + #elif _OUT_IDXS == T || _COL_PARTS == T + uint i=0; + idx_t k; + #if _COL_PARTS == T + //colParts aux vars + idx_t _colBlock = abRowLen / gridCols, _colBlockRem = abRowLen % gridCols; + ushort gc=0; + idx_t gcStartCol = unifRemShareStart(gc,_colBlock,_colBlockRem); + idx_t gcEndCol = unifRemShareEnd(gc,_colBlock,_colBlockRem); + #endif //_COL_PARTS == T + for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){ + k = rb_entry(n,rbNode,rb)->key; + #if 
_OUT_IDXS == T + //return the mul.result nnz index inside the rbNodes + outIdxs[ i++ ] = k; + #endif + #if _COL_PARTS == T + while (k >= gcEndCol ){ //see if the idx is in another col partition + // TODO also = since gcEndCol as k is 0based + gcEndCol = unifRemShareEnd(gc ,_colBlock, _colBlockRem); + gc++; + DEBUGCHECKS{ assert( gc < gridCols ); } + } + rowColPartsLens[gc]++; + #endif //_COL_PARTS == T + } + #endif //_OUT_IDXS == T ... _COL_PARTS == T + /*DEBUGCHECKS{ //TODO PRINT NNZ INDEXES FOR MANUAL (PAINFUL CHECK) + idx_t k; + for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){ + k = rb_entry(n,rbNode,rb)->key; + printf("%lu, ",k); + } + printf("\n"); + }*/ + + return abRowLen; +} + +//SPVECT_IDX_DENSE_MAP based TODO double implementation for trick syntax folding here... +/* + * SPVECT_IDX_DENSE_MAP based, as SpMM_Row_Symb_Rbtree but with idxMap aux idx keeping + * CONFIG_MACROS (new) + * IDX_RMUL_SYMB_RBTREE && ( _OUT_IDXS == T || _COL_PARTS == T ): + * (tmp) symb mult out indexes will be kept via a rbtree + * otherwise directly in the out array appending them and then sorting them + * (potentially same n log n) + */ +static inline idx_t CAT4(SpMM_Row_Symb_IdxMap,OUT_IDXS,COL_PARTS,OFF_F) + ( + idx_t* aRowJA, idx_t aRowLen, spmat* b, SPVECT_IDX_DENSE_MAP* idxsMapAcc + #if _OUT_IDXS == TRUE + ,idx_t* outIdxs + #endif + #if ( _OUT_IDXS == TRUE && IDX_RMUL_SYMB_RBTREE == T ) || _COL_PARTS == T + ,rbRoot* root, rbNode* nodes + #endif // _OUT_IDXS == TRUE + #if _COL_PARTS == TRUE + ,ushort gridCols,idx_t* rowColPartsLens + #endif + ) +{ + //Compute resulting ab's row non zero indexes and total lenght + idx_t abRowLen = 0; //mul.result row len, return value + for ( idx_t i=0,c_a,inserted; i < aRowLen; i++ ){ //for each entry in a's row + c_a = aRowJA[i]-OFF_F; + //gather diffrent nnz indexes in corresponding b's `c_a`th row + for ( idx_t j = b->IRP[c_a]-OFF_F,c_b; j < b->IRP[c_a+1]-OFF_F; j++ ){ + c_b = b->JA[j]-OFF_F; + //check if c_b is nonzero index for mul.result row + inserted = spVect_idx_in(c_b,idxsMapAcc); + #if _OUT_IDXS == T || _COL_PARTS == T //idxs HAS TO be accumulated + if (inserted) + #if IDX_RMUL_SYMB_RBTREE == T || _OUT_IDXS == F //add it in a RBTREE struct + rbInsertNewKey (root, nodes+idxsMapAcc->len, c_b); + #else //append it, then sort + outIdxs[idxsMapAcc->len] = c_b; + #endif //IDX_RMUL_SYMB_RBTREE == T //how accumulated key c_b + #endif //#if _OUT_IDXS == T || _COL_PARTS == T + } + } + abRowLen = idxsMapAcc->len; + //gather idxs or their sparsity struct in output row + #if _OUT_IDXS == T || _COL_PARTS == T + idx_t j = 0,k; + #if _COL_PARTS == T + //colParts aux vars + idx_t _colBlock = abRowLen / gridCols, _colBlockRem = abRowLen % gridCols; + ushort gc = 0; + idx_t gcStartCol = unifRemShareStart(gc,_colBlock,_colBlockRem); + idx_t gcEndCol = unifRemShareEnd(gc,_colBlock,_colBlockRem); + #endif + #if IDX_RMUL_SYMB_RBTREE == T || _OUT_IDXS == F ///idxs recorded in a aux rbtree + for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){ + k = rb_entry(n,rbNode,rb)->key; + #if _OUT_IDXS == T + outIdxs[ j++ ] = k; //record ordered key sotred from aux rbtree + #endif + #else ///idxs recorded in aux append array + sort_idx_t(outIdxs,abRowLen); + for (; j < abRowLen; j++){ + k = outIdxs[j]; //(OSS) already ordered in outIndexes arr + #endif //IDX_RMUL_SYMB_RBTREE == T + #if _COL_PARTS == T + while (k >= gcEndCol ){ //see if the idx is in another col partition + // TODO also = since gcEndCol as k is 0based + gcEndCol = unifRemShareEnd(gc 
,_colBlock, _colBlockRem); + gc++; + DEBUGCHECKS{ assert( gc < gridCols ); } + } + rowColPartsLens[gc]++; + #endif //_COL_PARTS == T + } + #endif //_OUT_IDXS == T ... _COL_PARTS == T + return abRowLen; +} + +//switch among 2 row_symb_XXX implemenetation aux +/* + * SpMM single row symbolic computation + * select one implementation via @implID + * among SpMM_Row_Symb_Rbtree or SpMM_Row_Symb_IdxMap + * args will be forwared accordingly + */ +static inline idx_t CAT4(SpMM_Row_Symb_,OUT_IDXS,COL_PARTS,OFF_F) + ( + ROW_MMSYM_IMPL_MODE implID, idx_t* aRowJA, idx_t aRowLen, spmat* b, + rbRoot* root, rbNode* nodes, SPVECT_IDX_DENSE_MAP* idxsMapAcc + #if _OUT_IDXS == TRUE + ,idx_t* outIdxs + #endif + #if _COL_PARTS == TRUE + ,ushort gridCols,idx_t* rowColPartsLens + #endif + ) +{ + if (implID == RBTREE) { + return CAT4(SpMM_Row_Symb_Rbtree,OUT_IDXS,COL_PARTS,OFF_F) + ( + aRowJA,aRowLen,b,root,nodes + #if _OUT_IDXS == TRUE && !defined OUT_IDXS_RBTREE_NODES + ,outIdxs + #endif + #if _COL_PARTS == TRUE + ,gridCols,rowColPartsLens + #endif + ); + } + else { //IDXMAP + return CAT4(SpMM_Row_Symb_IdxMap,OUT_IDXS,COL_PARTS,OFF_F) + ( + aRowJA,aRowLen,b,idxsMapAcc + #if _OUT_IDXS == TRUE + ,outIdxs + #endif + #if ( _OUT_IDXS == TRUE && IDX_RMUL_SYMB_RBTREE == T ) || _COL_PARTS == T + ,root, nodes + #endif + #if _COL_PARTS == TRUE + ,gridCols, rowColPartsLens + #endif + ); + } +} +///SpMM row-by-row +idx_t* CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F) + ( + ROW_MMSYM_IMPL_MODE symbRowImplID, spmat* a, spmat* b + #if _OUT_IDXS == TRUE + ,idx_t*** outIdxs + #endif + #if _COL_PARTS == TRUE + ,ushort gridCols, idx_t** rowColPartsLens + #endif + ) +{ + + ///initial allocations + rbRoot* rbRoots = NULL; + rbNode* rbNodes = NULL; + SPVECT_IDX_DENSE_MAP* idxsMapAccs = NULL; + idx_t *rowLens=NULL,*upperBoundedRowsLens=NULL,*upperBoundedSymMat=NULL; + idx_t maxRowLen=0; + int rbTreeUsed = (symbRowImplID == RBTREE || + (IDX_RMUL_SYMB_RBTREE && (_COL_PARTS || _OUT_IDXS)) ); + + if ( !(rowLens = malloc(sizeof(*rowLens) * (a->M+1))) ){ + ERRPRINT("SpMM_Symb_ rowLens malloc errd\n"); + goto _err; + } + if (_OUT_IDXS == TRUE || rbTreeUsed ){ + if (!(upperBoundedRowsLens = CAT(spMMSizeUpperbound_,OFF_F)(a,b))) + goto _err; + } + #if _OUT_IDXS == TRUE + if (!(*outIdxs = malloc(sizeof(**outIdxs) * a->M))){ + ERRPRINT("SpMM_Symb_ outIdxs malloc errd\n"); + goto _err; + } + if (!(upperBoundedSymMat = malloc( + sizeof(*upperBoundedSymMat)*upperBoundedRowsLens[a->M]))){ + ERRPRINT("SpMM_Symb_ upperBoundedSymMat malloc errd\n"); + goto _err; + } + //write rows' start pointer from full matrix JA allocated + for (idx_t i=0,cumul=0; iM; cumul += upperBoundedRowsLens[i++]) + *outIdxs[i] = upperBoundedSymMat + cumul; + #endif //#if _OUT_IDXS == TRUE + #if _COL_PARTS == TRUE + if (!(*rowColPartsLens = malloc(a->M * gridCols * sizeof(**rowColPartsLens)))){ + ERRPRINT("SpMM_Symb_ rowColPartsLens malloc errd\n"); + goto _err; + } + #endif //_COL_PARTS + uint maxThreads = omp_get_max_threads(); //TODO FROM CFG + //index keeping aux struct + //rbtree implementation or idxMap with aux of symbTree for tmp outIdx keeping + if ( rbTreeUsed ){ + maxRowLen = reductionMaxSeq(upperBoundedRowsLens, a->M); + //rbTrees for index keeping + rbRoots = malloc(maxThreads * sizeof(*rbRoots)); + rbNodes = calloc(maxThreads * maxRowLen, sizeof(*rbNodes)); + if( !rbRoots || !rbNodes ){ + ERRPRINT("SpMM_Symb_ threads' aux rbTree mallocs errd\n"); + goto _err; + } + //init roots + for (uint i=0; iN,idxsMapAccs+i)) goto _err; + } + } + ///rows parallel compute + 
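 /* per-thread scratch wiring set up above: thread t owns rbRoots[t] plus the
  * node slab rbNodes[t*maxRowLen ... (t+1)*maxRowLen-1] on the RBTREE path,
  * or idxsMapAccs[t] on the IDXMAP path; after each row only the rLen entries
  * actually used are reset (see the memset / _resetIdxMap calls below). */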
idx_t* aRow; + idx_t aRowLen,rLen,abCumulLen=0; + int tid; + rbRoot* tRoot; rbNode* tNodes; + SPVECT_IDX_DENSE_MAP* tIdxsMapAcc = NULL; + #pragma omp parallel for schedule(static) \ + private(aRow,aRowLen,rLen, tRoot,tNodes,tid) reduction(+:abCumulLen) + for(idx_t r=0; rM; r++){ + aRow = a->JA + a->IRP[r]-OFF_F; + aRowLen = a->IRP[r+1] - a->IRP[r]; + tid = omp_get_thread_num(); + //TODO low overhead pointer airth can be avoided with if (symbRowImplID .. && ) + tIdxsMapAcc = idxsMapAccs + tid; + tRoot = rbRoots + tid; + tNodes = rbNodes + tid * maxRowLen; + + rLen = CAT4(SpMM_Row_Symb_,OUT_IDXS,COL_PARTS,OFF_F) + ( + symbRowImplID, aRow, aRowLen, b, tRoot,tNodes, tIdxsMapAcc + #if _OUT_IDXS == TRUE + ,*outIdxs[r] + #endif + #if _COL_PARTS == TRUE + ,gridCols, (*rowColPartsLens) + IDX2D(r,0,gridCols) + #endif + ); + rowLens[r] = rLen; + abCumulLen += rLen; + ///reset symb idxs keeping aux structs + if (symbRowImplID==RBTREE || (IDX_RMUL_SYMB_RBTREE && (_COL_PARTS || _OUT_IDXS))){ + *tRoot = RB_ROOT_CACHED; + memset(tNodes,0,rLen * sizeof(*tNodes)); + } + if (symbRowImplID == IDXMAP) _resetIdxMap(tIdxsMapAcc); + + } + rowLens[a->M] = abCumulLen; + goto _free; + + _err: + free(rowLens); + #if _OUT_IDXS == T + if (outIdxs) free(*outIdxs); + #endif + #if _COL_PARTS == T + if (rowColPartsLens) free(*rowColPartsLens); + #endif + rowLens = NULL; + _free: + free(upperBoundedRowsLens); + free(upperBoundedSymMat); + free(rbRoots); + free(rbNodes); + if (idxsMapAccs){ + for (uint i=0; ikey; + for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){ + outIdxs[ i++ ] = rb_entry(n,rbNode,rb)->key; + } + #else + /* return the non zero indexes of the mul.result row + * sorting inplace the nodes inserted in the rbtree */ + sortRbNode(nodes,abcRowLen); + #endif + #endif + return abcRowLen; +} + +idx_t* CAT3(Sp3MM_Symb_,OUT_IDXS,OFF_F) + ( + ROW_MMSYM_IMPL_MODE symbMMRowImplID, spmat* a, spmat* b, spmat* c + #if _OUT_IDXS == TRUE + ,idx_t*** outIdxs + #endif + ) +{ + //idxs keeping aux buffs + idx_t* abRowsJATmp = NULL; + rbRoot* rbRoots = NULL; rbNode* rbNodes = NULL; + SPVECT_IDX_DENSE_MAP* idxsMapAccs = NULL; + + idx_t *abUpperBoundedRowsLens = NULL, *upperBoundedSymMat = NULL; + ///initial allocations + idx_t* rowLens = malloc(sizeof(*rowLens) * (a->M +1) ); //to return + if (!rowLens){ + ERRPRINT("SpMM_Symb_ rowLens malloc errd\n"); + goto _err; + } + + #if _OUT_IDXS == TRUE + if (!(*outIdxs = malloc(sizeof(**outIdxs) * a->M))){ + ERRPRINT("SpMM_Symb_ outIdxs malloc errd\n"); + goto _err; + } + if (!(abUpperBoundedRowsLens = CAT(spMMSizeUpperbound_,OFF_F)(a,b))) + goto _err; + /*TODO TODO instead of doing one sym product first to have a correct UB + * use an heuristics here to get output matrix size + */ + idx_t abcUBSize = abUpperBoundedRowsLens[a->M] * SP3MM_UB_HEURISTIC; + if (!(upperBoundedSymMat=malloc(sizeof(*upperBoundedSymMat)*abcUBSize))){ + ERRPRINT("SpMM_Symb_ upperBoundedSymMat malloc errd\n"); + goto _err; + } + //TODO heuristic TO UB rows bounds ... 
require compacting copy + for (idx_t i=0,cumul=0; iM; + cumul += SP3MM_UB_HEURISTIC * abUpperBoundedRowsLens[i++]) + *outIdxs[i] = upperBoundedSymMat + cumul; + #endif //#if _OUT_IDXS == TRUE + //rbTrees for index keeping + uint maxThreads = omp_get_max_threads(); //TODO FROM CFG + idx_t abMaxRowLen = reductionMaxSeq(abUpperBoundedRowsLens, a->M); + #ifdef HEURISTICS_UB + idx_t maxRowLenUB = abMaxRowLen * SP3MM_UB_HEURISTIC; //TODO UB HEURISTC + #else + idx_t maxRowLenUB = c->N; + #endif //HEURISTICS_UB + + if (!(abRowsJATmp = malloc(maxThreads*maxRowLenUB*sizeof(*abRowsJATmp)) ) ){ + ERRPRINT("Sp3MM_Symb_ abRowsJATmp malloc errd\n"); + goto _err; + } + if (symbMMRowImplID == RBTREE || IDX_RMUL_SYMB_RBTREE ){ + rbNodes = malloc(maxThreads * maxRowLenUB * sizeof(*rbNodes)); + rbRoots = malloc(maxThreads * sizeof(*rbRoots)); + if (!rbRoots || !rbNodes ){ + ERRPRINT("Sp3MM_Symb_ rbRoots || rbNodes malloc errd\n"); + goto _err; + } + //init roots + for (uint i=0; iN,idxsMapAccs+i)) goto _err; + } + } + ///rows parallel compute + idx_t* aRow; + idx_t aRowLen,rLen,outCumulLen=0; + //threads local pointers + int tid; + rbRoot* tRoot; + rbNode* tNodes; + SPVECT_IDX_DENSE_MAP* tIdxsMapAcc; + idx_t* tABRowJATmp; + + #pragma omp parallel for schedule(static) \ + private(aRow,aRowLen,rLen, tRoot,tNodes,tid) reduction(+:outCumulLen) + for(idx_t r=0; rM; r++){ + aRow = a->JA + a->IRP[r]-OFF_F; + aRowLen = a->IRP[r+1] - a->IRP[r]; + tid = omp_get_thread_num(); + tRoot = rbRoots + tid; + tNodes = rbNodes + tid * maxRowLenUB; + tIdxsMapAcc = NULL; //TODO + tABRowJATmp = abRowsJATmp + tid * maxRowLenUB; + rLen = CAT3(Sp3MM_Row_Symb_,OUT_IDXS,OFF_F) + (symbMMRowImplID, aRow,aRowLen,b,c,tRoot,tNodes,tIdxsMapAcc,tABRowJATmp, + #if _OUT_IDXS == TRUE + *outIdxs[r] + #endif + ); + outCumulLen += rLen; + rowLens[r] = rLen; + + } + goto _free; + _err: + free(rowLens); + #if _OUT_IDXS == T + if (*outIdxs) free(*outIdxs); + #endif + rowLens = NULL; + _free: + free(abUpperBoundedRowsLens); + free(upperBoundedSymMat); + free(rbRoots); + free(rbNodes); + free(abRowsJATmp); + + return rowLens; +} + +#endif //#if !defined COL_PARTS && defined OUT_IDXS + + +///restore aux macros entry state +//#undef _OUT_ID +//#undef _COL_PARTS +#pragma pop_macro("OUT_IDXS") +#pragma pop_macro("_OUT_IDXS") +#pragma pop_macro("COL_PARTS") +#pragma pop_macro("_COL_PARTS") + + + + diff --git a/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_SymbStep_Multi.c b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_SymbStep_Multi.c new file mode 100644 index 00000000..9305cfbf --- /dev/null +++ b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_SymbStep_Multi.c @@ -0,0 +1,92 @@ +/* + * Sp3MM_for_AlgebraicMultiGrid + * (C) Copyright 2021-2022 + * Andrea Di Iorio + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may + * not be used to endorse or promote products derived from this + * software without specific written permission. 
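[Editor's sketch] Stripped of the CAT4 plumbing and the OUT_IDXS/COL_PARTS variants, every row-symbolic kernel above computes the cardinality of a union of B's rows selected by one row of A. A self-contained sketch of that core, with a dense boolean map standing in for the rbtree/idx-map accumulators (names hypothetical, 0-based CSR assumed):

    #include <stddef.h>

    // length of the symbolic product row: |union of B rows picked by A's row columns|
    static size_t rowSymbLen(const size_t *aJA, size_t aLen,
                             const size_t *bIRP, const size_t *bJA,
                             unsigned char *seen,   // size B->N, all zeros on entry
                             size_t *touched)       // scratch, >= an upper bound of the row length
    {
        size_t len = 0;
        for (size_t i = 0; i < aLen; i++)                          // each nnz column of A's row...
            for (size_t j = bIRP[aJA[i]]; j < bIRP[aJA[i]+1]; j++) // ...selects one row of B
                if (!seen[bJA[j]]) { seen[bJA[j]] = 1; touched[len++] = bJA[j]; }
        for (size_t k = 0; k < len; k++) seen[touched[k]] = 0;     // O(len) reset, like _resetIdxMap
        return len;
    }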
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include "macros.h" +#include "config.h" +#include "sparseMatrix.h" +#include "linuxK_rbtree_minimalized.h" + +#include "Sp3MM_CSR_OMP_SymbStep_Multi.h" + +#include "sparseUtilsMulti.h" +#include "utils.h" +#include + +//inline exports +//ushort matchingUnifRangeIdx(idx_t idx, idx_t size, ushort rangesN); +idx_t* CAT(spMMSizeUpperbound_,0)(spmat* ,spmat* ); +idx_t* CAT(spMMSizeUpperbound_,1)(spmat* ,spmat* ); +idx_t* CAT(spMMSizeUpperboundColParts_,0)(spmat* ,spmat* ,ushort); +idx_t* CAT(spMMSizeUpperboundColParts_,1)(spmat* ,spmat* ,ushort); + +/* Multi implementation of symbolic product of sparse matrixes, config macros + * OFF_F: C-Fortran spmat indexing + * OUT_IDXS: indexes output + * COL_PARTS: partitioning columns output... + */ + +#define OUT_IDXS_ON OutIdxs_ +#define COL_PARTS_ON ColParts_ +#undef OUT_IDXS +#undef COL_PARTS + +#define OFF_F 0 + ///generate basic versions + #include "Sp3MM_CSR_OMP_SymbStep_Generic.c" + ///generate outIdxs versions + #define OUT_IDXS OUT_IDXS_ON + #include "Sp3MM_CSR_OMP_SymbStep_Generic.c" + #undef OUT_IDXS + ///generate colParts versions + #define COL_PARTS COL_PARTS_ON + #include "Sp3MM_CSR_OMP_SymbStep_Generic.c" + //generate outIdxs AND colParts versions + #define OUT_IDXS OUT_IDXS_ON + #include "Sp3MM_CSR_OMP_SymbStep_Generic.c" + + #undef OUT_IDXS + #undef COL_PARTS +#undef OFF_F +#define OFF_F 1 + ///generate basic versions + #include "Sp3MM_CSR_OMP_SymbStep_Generic.c" + ///generate outIdxs versions + #define OUT_IDXS OUT_IDXS_ON + #include "Sp3MM_CSR_OMP_SymbStep_Generic.c" + #undef OUT_IDXS + ///generate colParts versions + #define COL_PARTS COL_PARTS_ON + #include "Sp3MM_CSR_OMP_SymbStep_Generic.c" + //generate outIdxs AND colParts versions + #define OUT_IDXS OUT_IDXS_ON + #include "Sp3MM_CSR_OMP_SymbStep_Generic.c" + + #undef OUT_IDXS + #undef COL_PARTS +#undef OFF_F diff --git a/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_UB_Generic.c b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_UB_Generic.c new file mode 100644 index 00000000..4cc1cfd4 --- /dev/null +++ b/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_UB_Generic.c @@ -0,0 +1,573 @@ +/* + * Sp3MM_for_AlgebraicMultiGrid + * (C) Copyright 2021-2022 + * Andrea Di Iorio + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*#pragma message( "compiling SpMM_CSR_OMP_Generic.c with OFF_F as:" STR(OFF_F) )*/ +#ifndef OFF_F + #error generic implementation requires OFF_F defined +#endif + + + +//////////////////// COMPUTE CORE Sp[3]MM Upperbound ////////////////////////// +spmat* CAT(spmmSerial_,OFF_F)(spmat* A,spmat* B, CONFIG* _cfg){ //serial implementation + spmat* AB = NULL; + ACC_DENSE acc; + if ( allocAccDense(&acc,B->N) ) goto _free; + if (!(AB = allocSpMatrix(A->M,B->N))) goto _free; + for( idx_t r=0; rM; r++ ){ + for (ulong c=A->IRP[r]-OFF_F; cIRP[r+1]-OFF_F; c++) //row-by-row formul + CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, &acc); + sparsifyDirect(&acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API + } + _free: + freeAccsDense(&acc,1); + + return AB; +} +////////Sp3MM as 2 x SpMM +///1D +spmat* CAT(spmmRowByRow_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){ + DEBUG printf("spmm\trows of A,\tfull B\tM=%lu x N=%lu\n",A->M,B->N); + ///thread aux + ACC_DENSE *accVects = NULL,*acc; + SPMM_ACC* outAccumul=NULL; + idx_t* rowsSizes = NULL; + ///init AB matrix with SPMM heuristic preallocation + spmat* AB = allocSpMatrix(A->M,B->N); + if (!AB) goto _err; + if (!(rowsSizes = CAT(spMMSizeUpperbound_,OFF_F) (A,B))) goto _err; + ///aux structures alloc + if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){ + ERRPRINT("accVects init failed\n"); + goto _err; + } + if (!(outAccumul = initSpMMAcc(rowsSizes[AB->M],AB->M))) goto _err; + #if SPARSIFY_PRE_PARTITIONING == T + //prepare sparse accumulators with U.Bounded rows[parts] starts + SPACC* accSp; + for( idx_t r=0,rSizeCumul=0; rM; rSizeCumul += rowsSizes[r++]){ + accSp = outAccumul->accs+r; + accSp->JA = outAccumul->JA + rSizeCumul; + accSp->AS = outAccumul->AS + rSizeCumul; + //accSp->len = rowsSizes[r]; + } + #endif + + ((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (AB->M,AB,cfg); + AUDIT_INTERNAL_TIMES Start=omp_get_wtime(); + #pragma omp parallel for schedule(runtime) private(acc) + for (ulong r=0; rM; r++){ //row-by-row formulation + //iterate over nz entry index c inside current row r + acc = accVects + omp_get_thread_num(); + /* direct use of sparse scalar vector multiplication + for (idx_t ja=A->IRP[r]-OFF_F,ca,jb,bRowLen; jaIRP[r+1]-OFF_F; ja++){ + ca = A->JA[ja] - OFF_F; + jb = B->IRP[ca] - OFF_F; + bRowLen = B->IRP[ca+1] - B->IRP[ca]; + CAT(scSparseVectMul_,OFF_F)(A->AS[ja],B->AS+jb,B->JA+jb,bRowLen,acc); + }*/ + for (ulong c=A->IRP[r]-OFF_F; cIRP[r+1]-OFF_F; c++) //row-by-row formul + CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, 
A->JA[c]-OFF_F, acc); + //trasform accumulated dense vector to a CSR row + #if SPARSIFY_PRE_PARTITIONING == T + _sparsifyUB(acc,outAccumul->accs+r,0); + #else + sparsifyUBNoPartsBounds(outAccumul,acc,outAccumul->accs + r,0); + #endif + _resetAccVect(acc); //rezero for the next A row + } + ///merge sparse row computed before + if (mergeRows(outAccumul->accs,AB)) goto _err; + #if OFF_F != 0 + C_FortranShiftIdxs(AB); + #endif + AUDIT_INTERNAL_TIMES End=omp_get_wtime(); + DEBUG checkOverallocPercent(rowsSizes,AB); + goto _free; + + _err: + if(AB) freeSpmat(AB); + AB=NULL; //nothing'll be returned + _free: + if(rowsSizes) free(rowsSizes); + if(accVects) freeAccsDense(accVects,cfg->threadNum); + if(outAccumul) freeSpMMAcc(outAccumul); + + return AB; +} + +spmat* CAT(spmmRowByRow1DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){ + DEBUG printf("spmm\trowBlocks of A,\tfull B\tM=%lu x N=%lu\n",A->M,B->N); + DEBUG printf("ompParallelizationGrid:\t%dx%d\n",cfg->gridRows,cfg->gridCols); + ///thread aux + ACC_DENSE *accVects = NULL,*acc; + SPMM_ACC* outAccumul=NULL; + idx_t* rowsSizes = NULL; + ///init AB matrix with SPMM heuristic preallocation + spmat* AB = allocSpMatrix(A->M,B->N); + if (!AB) goto _err; + if (!(rowsSizes = CAT(spMMSizeUpperbound_,OFF_F)(A,B))) goto _err; + ///aux structures alloc + if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){ + ERRPRINT("accVects init failed\n"); + goto _err; + } + if (!(outAccumul = initSpMMAcc(rowsSizes[AB->M],AB->M))) goto _err; + #if SPARSIFY_PRE_PARTITIONING == T + //prepare sparse accumulators with U.Bounded rows[parts] starts + SPACC* accSp; + for( idx_t r=0,rSizeCumul=0; rM; rSizeCumul += rowsSizes[r++]){ + accSp = outAccumul->accs+r; + accSp->JA = outAccumul->JA + rSizeCumul; + accSp->AS = outAccumul->AS + rSizeCumul; + } + #endif + + //perform Gustavson over rows blocks -> M / @cfg->gridRows + ulong rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows; + ((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (cfg->gridRows,AB,cfg); + AUDIT_INTERNAL_TIMES Start=omp_get_wtime(); + ulong b,startRow,block; //omp for aux vars + #pragma omp parallel for schedule(runtime) private(acc,startRow,block) + for (b=0; b < cfg->gridRows; b++){ + block = UNIF_REMINDER_DISTRI(b,rowBlock,rowBlockRem); + startRow= UNIF_REMINDER_DISTRI_STARTIDX(b,rowBlock,rowBlockRem); + acc = accVects + omp_get_thread_num(); + + DEBUGPRINT{ + fflush(NULL); + printf("block %lu\t%lu:%lu(%lu)\n",b,startRow,startRow+block-1,block); + fflush(NULL); + } + //row-by-row formulation in the given row block + for (ulong r=startRow; rIRP[r]-OFF_F; cIRP[r+1]-OFF_F; c++) + CAT(scSparseRowMul_,OFF_F)(A->AS[c], B, A->JA[c]-OFF_F, acc); + //trasform accumulated dense vector to a CSR row + #if SPARSIFY_PRE_PARTITIONING == T + _sparsifyUB(acc,outAccumul->accs+r,0); + #else + sparsifyUBNoPartsBounds(outAccumul,acc,outAccumul->accs + r,0); + #endif + _resetAccVect(acc); //rezero for the next A row + } + } + ///merge sparse row computed before + if (mergeRows(outAccumul->accs,AB)) goto _err; + #if OFF_F != 0 + C_FortranShiftIdxs(AB); + #endif + AUDIT_INTERNAL_TIMES End=omp_get_wtime(); + DEBUG checkOverallocPercent(rowsSizes,AB); + goto _free; + + _err: + if(AB) freeSpmat(AB); + AB=NULL; //nothing'll be returned + _free: + if(rowsSizes) free(rowsSizes); + if(accVects) freeAccsDense(accVects,cfg->threadNum); + if(outAccumul) freeSpMMAcc(outAccumul); + + return AB; +} + +///2D +//PARTITIONS NOT ALLOCATED +spmat* CAT(spmmRowByRow2DBlocks_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){ + DEBUG 
printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%luxN=%lu\n",A->M,B->N); + DEBUG printf("ompParallelizationGrid:\t%dx%d\n",cfg->gridRows,cfg->gridCols); + idx_t* bColOffsets = NULL; //B group columns starting offset for each row + ACC_DENSE *accVectors=NULL,*accV; + SPACC* accRowPart; + spmat* AB = allocSpMatrix(A->M,B->N); + SPMM_ACC* outAccumul=NULL; + idx_t* rowsPartsSizes=NULL; + if (!AB) goto _err; + //2D indexing aux vars + ulong gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols; + ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows; + ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols; + ulong startRow,startCol,rowBlock,colBlock; //data division aux variables + ////get bColOffsets for B column groups + if (!(bColOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols))) + goto _err; + #if SPARSIFY_PRE_PARTITIONING == T + uint rowsPartsSizesN = aSubRowsN; + if (!(rowsPartsSizes = CAT(spMMSizeUpperboundColParts_,OFF_F) + (A,B,cfg->gridCols,bColOffsets))) + #else + uint rowsPartsSizesN = AB->M; + if (!(rowsPartsSizes = CAT(spMMSizeUpperbound_,OFF_F)(A,B))) + #endif + goto _err; + + //aux vectors + + ///other AUX struct alloc + if (!(accVectors = _initAccVectors(gridSize,_colBlock+(_colBlockRem?1:0)))){ + ERRPRINT("accVectors calloc failed\n"); + goto _err; + } + if (!(outAccumul = initSpMMAcc(rowsPartsSizes[rowsPartsSizesN],aSubRowsN))) + goto _err; + #if SPARSIFY_PRE_PARTITIONING == T + //prepare sparse accumulators with U.Bounded rows[parts] starts + SPACC* accSp; + for( idx_t i=0,rSizeCumul=0; iaccs+i; + accSp->JA = outAccumul->JA + rSizeCumul; + accSp->AS = outAccumul->AS + rSizeCumul; + } + //memset(outAccumul->AS,0,sizeof(double)*rowsSizes[AB->M]);memset(outAccumul->JA,0,sizeof(idx_t)*rowsSizes[AB->M]); + #endif + + ((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (gridSize,AB,cfg); + AUDIT_INTERNAL_TIMES Start=omp_get_wtime(); + ulong tileID,t_i,t_j; //for aux vars + ulong bPartLen,bPartID,bPartOffset;//B partition acces aux vars + #pragma omp parallel for schedule(runtime) \ + private(accV,accRowPart,rowBlock,colBlock,startRow,startCol,\ + bPartLen,bPartID,bPartOffset,t_i,t_j) + for (tileID = 0; tileID < gridSize; tileID++){ + ///get iteration's indexing variables + //tile index in the 2D grid of AB computation TODO OMP HOW TO PARALLELIZE 2 FOR + t_i = tileID/cfg->gridCols; //i-th row block + t_j = tileID%cfg->gridCols; //j-th col block + //get tile row-cols group FAIR sizes + rowBlock = UNIF_REMINDER_DISTRI(t_i,_rowBlock,_rowBlockRem); + startRow = UNIF_REMINDER_DISTRI_STARTIDX(t_i,_rowBlock,_rowBlockRem); + startCol = UNIF_REMINDER_DISTRI_STARTIDX(t_j,_colBlock,_colBlockRem); + + accV = accVectors + tileID; + + DEBUGPRINT{ + fflush(NULL); + colBlock = UNIF_REMINDER_DISTRI(t_j,_colBlock,_colBlockRem); + printf("rowBlock [%lu\t%lu:%lu(%lu)]\t",t_i,startRow,startRow+rowBlock-1,rowBlock); + printf("colBlock [%lu\t%lu:%lu(%lu)]\n",t_j,startCol,startCol+colBlock-1,colBlock); + fflush(NULL); + } + ///AB[t_i][t_j] block compute + for (ulong r=startRow; rIRP[r]-OFF_F,c; jIRP[r+1]-OFF_F; j++){ + //get start of B[A->JA[j]][:colBlock:] + c = A->JA[j]-OFF_F; // col of nnz in A[r][:] <-> target B row + bPartID = IDX2D(c,t_j,cfg->gridCols); + bPartOffset = bColOffsets[ bPartID ]; + bPartLen = bColOffsets[ bPartID + 1 ] - bPartOffset; + + CAT(scSparseVectMulPart_,OFF_F)(A->AS[j],B->AS+bPartOffset, + B->JA+bPartOffset,bPartLen,startCol,accV); + } + + accRowPart = outAccumul->accs + IDX2D(r,t_j,cfg->gridCols); + #if 
SPARSIFY_PRE_PARTITIONING == T + _sparsifyUB(accV,accRowPart,startCol); + #else + sparsifyUBNoPartsBounds(outAccumul,accV,accRowPart,startCol); + #endif + _resetAccVect(accV); + } + } + if (mergeRowsPartitions(outAccumul->accs,AB,cfg)) goto _err; + #if OFF_F != 0 + C_FortranShiftIdxs(AB); + #endif + AUDIT_INTERNAL_TIMES End=omp_get_wtime(); + DEBUG + CAT(checkOverallocRowPartsPercent_, OFF_F)(rowsPartsSizes, AB, cfg->gridCols, bColOffsets); + goto _free; + + _err: + if (AB) freeSpmat(AB); + AB = NULL; + _free: + free(rowsPartsSizes); + free(bColOffsets); + if (accVectors) freeAccsDense(accVectors,gridSize); + if (outAccumul) freeSpMMAcc(outAccumul); + + return AB; + +} + +spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){ + DEBUG printf("spmm\trowBlocks of A,\tcolBlocks (allcd) of B\tM=%luxN=%lu\n",A->M,B->N); + DEBUG printf("ompParallelizationGrid:\t%dx%d\n",cfg->gridRows,cfg->gridCols); + spmat *AB = NULL, *colPartsB = NULL, *colPart; + idx_t* rowsPartsSizes=NULL; + //aux vectors + SPMM_ACC* outAccumul=NULL; + ACC_DENSE *accVectors=NULL,*accV; + SPACC* accRowPart; + ulong startRow,startCol,rowBlock,colBlock; //data division aux variables + //2D indexing aux vars + idx_t gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols; + idx_t* bColOffsets = NULL; + + if (!(AB = allocSpMatrix(A->M,B->N))) goto _err; + ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows; + ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols; + + ////B cols partition in CSRs + //if (!(colPartsB = CAT(colsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols))) goto _err; + if (!(colPartsB = CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F) + (B, cfg->gridCols, &bColOffsets))) + goto _err; + #if SPARSIFY_PRE_PARTITIONING == T + uint rowsPartsSizesN = aSubRowsN; + if (!(rowsPartsSizes = CAT(spMMSizeUpperboundColParts_,OFF_F) + (A, B, cfg->gridCols, bColOffsets))) + #else + uint rowsPartsSizesN = AB->M; + if (!(rowsPartsSizes = CAT(spMMSizeUpperbound_,OFF_F)(A,B))) + #endif + goto _err; + ///other AUX struct alloc + if (!(accVectors = _initAccVectors(gridSize,_colBlock+(_colBlockRem?1:0)))){ + ERRPRINT("accVectors calloc failed\n"); + goto _err; + } + if (!(outAccumul = initSpMMAcc(rowsPartsSizes[rowsPartsSizesN],aSubRowsN))) + goto _err; + #if SPARSIFY_PRE_PARTITIONING == T + //prepare sparse accumulators with U.Bounded rows[parts] starts + SPACC* accSp; + for( idx_t i=0,rLenCumul=0; iaccs+i; + accSp->JA = outAccumul->JA + rLenCumul; + accSp->AS = outAccumul->AS + rLenCumul; + } + #endif + + ((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (gridSize,AB,cfg); + AUDIT_INTERNAL_TIMES Start=omp_get_wtime(); + ulong tileID,t_i,t_j; //for aux vars + #pragma omp parallel for schedule(runtime) \ + private(accV,accRowPart,colPart,rowBlock,colBlock,startRow,startCol,t_i,t_j) + for (tileID = 0; tileID < gridSize; tileID++){ + ///get iteration's indexing variables + //tile index in the 2D grid of AB computation TODO OMP HOW TO PARALLELIZE 2 FOR + t_i = tileID/cfg->gridCols; //i-th row block + t_j = tileID%cfg->gridCols; //j-th col block + //get tile row-cols group FAIR sizes + rowBlock = UNIF_REMINDER_DISTRI(t_i,_rowBlock,_rowBlockRem); + colBlock = UNIF_REMINDER_DISTRI(t_j,_colBlock,_colBlockRem); + startRow = UNIF_REMINDER_DISTRI_STARTIDX(t_i,_rowBlock,_rowBlockRem); + startCol = UNIF_REMINDER_DISTRI_STARTIDX(t_j,_colBlock,_colBlockRem); + + colPart = colPartsB + t_j; + accV = accVectors + tileID; + + DEBUGPRINT{ + fflush(NULL); + printf("rowBlock 
[%lu\t%lu:%lu(%lu)]\t", + t_i,startRow,startRow+rowBlock-1,rowBlock); + printf("colBlock [%lu\t%lu:%lu(%lu)]\n", + t_j,startCol,startCol+colBlock-1,colBlock); + fflush(NULL); + } + ///AB[t_i][t_j] block compute + for (ulong r=startRow; rIRP[r]-OFF_F,c,bRowStart,bRowLen; jIRP[r+1]-OFF_F; j++){ + //get start of B[A->JA[j]][:colBlock:] + c = A->JA[j]-OFF_F; // column of nnz entry in A[r][:] <-> target B row + bRowStart = colPart->IRP[c]; + #ifdef ROWLENS + bRowLen = colPart->RL[c]; + #else + bRowLen = colPart->IRP[c+1] - bRowStart; + #endif + CAT(scSparseVectMulPart_,OFF_F)(A->AS[j], + colPart->AS+bRowStart,colPart->JA+bRowStart, + bRowLen,startCol,accV); + } + + accRowPart = outAccumul->accs + IDX2D(r,t_j,cfg->gridCols); + #if SPARSIFY_PRE_PARTITIONING == T + _sparsifyUB(accV,accRowPart,startCol); + #else + sparsifyUBNoPartsBounds(outAccumul,accV,accRowPart,startCol); + #endif + _resetAccVect(accV); + } + } + if (mergeRowsPartitions(outAccumul->accs,AB,cfg)) goto _err; + #if OFF_F != 0 + C_FortranShiftIdxs(AB); + #endif + AUDIT_INTERNAL_TIMES End=omp_get_wtime(); + DEBUG + CAT(checkOverallocRowPartsPercent_, OFF_F)(rowsPartsSizes,AB,cfg->gridCols,bColOffsets); + goto _free; + + _err: + ERRPRINT("spmmRowByRow2DBlocksAllocated failed\n"); + if (AB) freeSpmat(AB); + AB = NULL; + _free: + if (colPartsB){ + for (ulong i=0; igridCols; i++) + freeSpmatInternal(colPartsB+i); + free(colPartsB); + } + free(rowsPartsSizes); + free(bColOffsets); + if (accVectors) freeAccsDense(accVectors,gridSize); + if (outAccumul) freeSpMMAcc(outAccumul); + + return AB; + +} +///SP3MM +spmat* CAT(sp3mmRowByRowPair_,OFF_F)(spmat* R,spmat* AC,spmat* P, + CONFIG* cfg,SPMM_INTERF spmm){ + + double end,start,elapsed,partial,flops; + spmat *RAC = NULL, *out = NULL; + + if (!spmm){ + //TODO runtime on sizes decide witch spmm implementation to use if not given + spmm = &CAT(spmmRowByRow2DBlocks_,OFF_F); + } + /* TODO + alloc dense aux vector, reusable over 3 product + TODO arrays sovrallocati per poter essere riusati nelle 2 SpMM + ulong auxVectSize = MAX(R->N,AC->N); + auxVectSize = MAX(auxVectSize,P->N); + */ + + start = omp_get_wtime(); + /// triple product as a pair of spmm + if (!(RAC = spmm(R,AC,cfg))) goto _free; + AUDIT_INTERNAL_TIMES partial = End - Start; + if (!(out = spmm(RAC,P,cfg))) goto _free; + // + end = omp_get_wtime(); + ElapsedInternal = End - Start + partial; + VERBOSE { + elapsed = end - start; + flops = (2 * R->NZ * P->NZ * AC->NZ) / (elapsed); + printf("elapsed %le - flops %le",elapsed,flops); + AUDIT_INTERNAL_TIMES + printf("\tinternalTime: %le",ElapsedInternal); + printf("\n"); + } + _free: + zeroSpmat(RAC); + freeSpmat(RAC); + + return out; +} + +////////Sp3MM direct +///1D +spmat* CAT(sp3mmRowByRowMerged_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg, + SPMM_INTERF spmm){ + + ulong* rowSizes = NULL; + SPMM_ACC* outAccumul=NULL; + ACC_DENSE *accVectorsR_AC=NULL,*accVectorsRAC_P=NULL,*accRAC,*accRACP; + + ///init AB matrix with SPMM heuristic preallocation + spmat* out = allocSpMatrix(R->M,P->N); + if (!out) goto _err; + /*TODO 3MM VERSION COMPUTE OUT ALLOC : + -> \forall RAC.row -> hashmap{col=True}->(AVL||RBTHREE); upperBound std col RAC.rows.cols in hashmap || SYM_bis + * NB: UP per RACP => NN note dimensioni righe precise => stesso approccio riservazione spazio di spmm ( fetch_and_add ) + * SYM_BIS ==> note dimensioni righe => + * 1) pre riservazione spazio per righe -> cache allignement per threads + -(sc. 
+////////Sp3MM direct
+///1D
+spmat* CAT(sp3mmRowByRowMerged_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,
+	SPMM_INTERF spmm){
+
+	ulong* rowSizes = NULL;
+	SPMM_ACC* outAccumul=NULL;
+	ACC_DENSE *accVectorsR_AC=NULL,*accVectorsRAC_P=NULL,*accRAC,*accRACP;
+
+	///init output matrix with SPMM heuristic preallocation
+	spmat* out = allocSpMatrix(R->M,P->N);
+	if (!out)	goto _err;
+	/*TODO 3MM VERSION COMPUTE OUT ALLOC :
+		-> \forall RAC.row -> hashmap{col=True}->(AVL||RBTREE); upper bound each col of RAC.rows.cols in the hashmap || SYM_bis
+	 * NB: UB for RACP => exact row sizes NOT known => same space reservation approach as spmm ( fetch_and_add )
+	 * SYM_BIS ==> row sizes known =>
+	 *	1) pre-reserve row space -> per-thread cache alignment
+	 *		-(sched. static:  each block of rows aligned to the cache block following the previous rows' block)
+	 *		-(sched. dynamic: every row aligned to a cache block (NO OVERLAPS!) -> huge overhead ?)
+	 *	2) pre-reserve the rows' space directly in the out CSR
+	 *		-> probable cache block overlaps; but saves the cost of a P.M memcpy
+	*/
+	if (!(rowSizes = CAT(spMMSizeUpperbound_,OFF_F)(R,AC)))	goto _err;	///TODO TOO LOOSE UB...INTEGRATE RBTREE FOR SYM->PRECISE
+	///aux structures alloc
+	if (!(outAccumul = initSpMMAcc(rowSizes[R->M],P->M)))	goto _err;	//TODO size estimated with RAC mat
+	if (!(accVectorsR_AC = _initAccVectors(cfg->threadNum,AC->N))){	//TODO LESS || REUSE
+		ERRPRINT("accVectorsR_AC init failed\n");
+		goto _err;
+	}
+	if (!(accVectorsRAC_P = _initAccVectors(cfg->threadNum,R->N))){	//TODO LESS || REUSE
+		ERRPRINT("accVectorsRAC_P init failed\n");
+		goto _err;
+	}
+
+	ulong c;
+	((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (R->M,R,cfg);
+	AUDIT_INTERNAL_TIMES	Start=omp_get_wtime();
+	#pragma omp parallel for schedule(runtime) private(accRAC,accRACP,c)
+	for (ulong r=0; r<R->M; r++){	//row-by-row formulation
+		//iterate over nz entry index c inside current row r
+		accRAC  = accVectorsR_AC  + omp_get_thread_num();
+		accRACP = accVectorsRAC_P + omp_get_thread_num();
+		//computing (tmp) R*AC r-th row
+		for (idx_t j=R->IRP[r]-OFF_F; j<R->IRP[r+1]-OFF_F; j++)
+			CAT(scSparseRowMul_,OFF_F)(R->AS[j], AC, R->JA[j]-OFF_F, accRAC);
+		//forward the computed row
+		for (idx_t j=0; j<accRAC->nnzIdxMap.len; j++){
+			c = accRAC->nnzIdx[j];
+			CAT(scSparseRowMul_,OFF_F)(accRAC->v[c],P,c,accRACP);
+		}
+		//transform the accumulated dense vector into a CSR row TODO in UB buff
+		sparsifyUBNoPartsBounds(outAccumul,accRACP,outAccumul->accs+r,0);
+		_resetAccVect(accRAC);
+		_resetAccVect(accRACP);
+	}
+	///merge sparse row computed before
+	if (mergeRows(outAccumul->accs,out))	goto _err;
+	#if OFF_F != 0
+	C_FortranShiftIdxs(out);
+	#endif
+	AUDIT_INTERNAL_TIMES{
+		End=omp_get_wtime();
+		ElapsedInternal = End-Start;
+	}
+	DEBUG	checkOverallocPercent(rowSizes,out);
+	goto _free;
+
+	_err:
+	if(out)	freeSpmat(out);
+	out = NULL;
+	_free:
+	if(rowSizes)		free(rowSizes);
+	if(accVectorsR_AC)	freeAccsDense(accVectorsR_AC,cfg->threadNum);
+	if(accVectorsRAC_P)	freeAccsDense(accVectorsRAC_P,cfg->threadNum);
+	if(outAccumul)		freeSpMMAcc(outAccumul);
+
+	return out;
+}
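These kernels cycle a per-thread dense accumulator: scatter each scaled sparse row into a dense value array while recording first-touched columns, then sparsify only the touched entries into a CSR row and reset. A self-contained sketch of that accumulate/sparsify cycle follows (not part of the patch; dense_acc, acc_scatter, acc_sparsify are hypothetical names, while the patch's ACC_DENSE and nnzIdxMap carry the same information):

/* Hypothetical dense accumulator: a value per column, a touched flag per
 * column, and the list of columns hit so far. */
typedef struct { double *v; char *touched; long *nnzIdx; long len; } dense_acc;

/* accumulate scale * sparse_row(js,as,len) into acc */
static void acc_scatter(dense_acc *acc, double scale,
                        const long *js, const double *as, long len){
	for (long k = 0; k < len; k++){
		long c = js[k];
		if (!acc->touched[c]){            /* first touch of column c */
			acc->touched[c] = 1;
			acc->nnzIdx[acc->len++] = c;
		}
		acc->v[c] += scale * as[k];
	}
}

/* compact touched entries into CSR-row arrays, resetting acc for the next row */
static long acc_sparsify(dense_acc *acc, long *ja, double *as){
	long nnz = acc->len;
	for (long k = 0; k < nnz; k++){
		long c = acc->nnzIdx[k];
		ja[k] = c;  as[k] = acc->v[c];
		acc->v[c] = 0.0;  acc->touched[c] = 0;
	}
	acc->len = 0;
	return nnz;
}

The separate touched flags (rather than testing v[c] == 0) keep the column list exact even when partial sums cancel to zero mid-accumulation.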
diff --git a/base/serial/impl/sp3mm4amg/commons/ompGetICV.c b/base/serial/impl/sp3mm4amg/commons/ompGetICV.c
new file mode 100644
index 00000000..1df55dde
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/commons/ompGetICV.c
@@ -0,0 +1,86 @@
+/*
+ * Sp3MM_for_AlgebraicMultiGrid
+ * (C) Copyright 2021-2022
+ *     Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions, and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
+ *    not be used to endorse or promote products derived from this
+ *    software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <config.h>
+
+char* SCHEDULES[]={"OMP_SCHED_STATIC","OMP_SCHED_DYNAMIC","OMP_SCHED_GUIDED","OMP_SCHED_AUTO"};
+void ompGetRuntimeSchedule(int* kind_chunk_monotonic){
+	/*
+	 * export OMP_SCHEDULE="[modifier:]kind[, chunk]"
+	 * typedef enum omp_sched_t {
+	 *	//schedule kinds
+	 *	omp_sched_static	= 0x1,
+	 *	omp_sched_dynamic	= 0x2,
+	 *	omp_sched_guided	= 0x3,
+	 *	omp_sched_auto		= 0x4,
+	 *	//schedule modifier	//TODO API>=5.0
+	 *	omp_sched_monotonic	= 0x80000000u	//TODO OMP >=5
+	 * } omp_sched_t;
+	 */
+	omp_sched_t k,kind; int chunk_size,monotonic=0;
+	omp_get_schedule(&kind,&chunk_size);
+	k=kind; //[monotonic OFF]
+	#if _OPENMP >= 201811	//OMP_SCHED_SCHEDULE modifier from 5.0
+	monotonic = omp_sched_monotonic & kind;
+	if(monotonic)	k = kind - omp_sched_monotonic;
+	#endif
+	printf("omp sched gather:\tkind:%s\tomp chunkSize:%d\tmonotonic:%s\tfairChunkFolding:%d\n",
+		SCHEDULES[k-1],chunk_size,monotonic?"Y":"N",FAIR_CHUNKS_FOLDING);
+	if (kind_chunk_monotonic){
+		kind_chunk_monotonic[0] = k;
+		kind_chunk_monotonic[1] = chunk_size;
+		kind_chunk_monotonic[2] = monotonic;
+	}
+}
+
+float ompVersionMacroMap(){
+	switch ( _OPENMP ){
+		case 200505: return 2.5;
+		case 200805: return 3.0;
+		case 201107: return 3.1;
+		case 201307: return 4.0;
+		case 201511: return 4.5;
+		case 201811: return 5.0;
+		case 202011: return 5.1;
+		default:     return 0;	//unmapped _OPENMP date macro
+	}
+}
+//WRAPPER to print all ICV vars
+#ifdef OMP_GET_ICV_MAIN
+int main(){
+#else
+void ompGetAllICV(){
+#endif
+	printf("export OMP_DISPLAY_ENV=VERBOSE for full ICV details\n");
+	printf("omp MAX THREADS USABLE\t%d\n",omp_get_max_threads());
+	ompGetRuntimeSchedule(NULL);
+	printf("omp API version:\t %1.1f\n",ompVersionMacroMap());
+}
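ompGetRuntimeSchedule decodes omp_get_schedule()'s kind by masking off the OpenMP 5.0 monotonic modifier bit before indexing the kind table. A minimal standalone probe of the same ICV follows (not part of the patch; it assumes only the standard omp_get_schedule / omp_sched_monotonic API):

/* Standalone sketch: read the runtime schedule ICV and strip the
 * OpenMP 5.0 monotonic modifier bit when present. Build with -fopenmp. */
#include <omp.h>
#include <stdio.h>

int main(void){
	omp_sched_t kind; int chunk;
	omp_get_schedule(&kind, &chunk);      /* honors the OMP_SCHEDULE env var */
	int monotonic = 0;
#if _OPENMP >= 201811                     /* modifier exists from OpenMP 5.0 */
	monotonic = (kind & omp_sched_monotonic) != 0;
	kind = (omp_sched_t)(kind & ~omp_sched_monotonic);
#endif
	printf("kind=%d chunk=%d monotonic=%d\n", (int)kind, chunk, monotonic);
	return 0;
}

Run it e.g. as OMP_SCHEDULE="monotonic:dynamic,4" ./a.out to see the modifier reported alongside the base kind.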
diff --git a/base/serial/impl/sp3mm4amg/commons/ompi_dims_create/mpi.h.in b/base/serial/impl/sp3mm4amg/commons/ompi_dims_create/mpi.h.in
new file mode 100644
index 00000000..76154a5b
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/commons/ompi_dims_create/mpi.h.in
@@ -0,0 +1,3141 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2021 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2007-2021 Cisco Systems, Inc.  All rights reserved
+ * Copyright (c) 2008-2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright (c) 2009-2012 Oak Rigde National Laboratory.  All rights reserved.
+ * Copyright (c) 2011-2020 Sandia National Laboratories. All rights reserved.
+ * Copyright (c) 2012-2018 Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2011-2013 INRIA.  All rights reserved.
+ * Copyright (c) 2015      University of Houston. All rights reserved.
+ * Copyright (c) 2015-2021 Research Organization for Information Science
+ *                         and Technology (RIST).  All rights reserved.
+ * Copyright (c) 2017-2019 IBM Corporation.  All rights reserved.
+ * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
+ * Copyright (c) 2021-2022 Google, LLC. All rights reserved.
+ * Copyright (c) 2021-2022 Amazon.com, Inc. or its affiliates.  All Rights
+ *                         reserved.
+ * Copyright (c) 2021      Bull S.A.S. All rights reserved.
+ * Copyright (c) 2018      Triad National Security, LLC. All rights
+ * Copyright (c) 2018-2021 Triad National Security, LLC. All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef OMPI_MPI_H
+#define OMPI_MPI_H
+
+/* The comment below (and the ending partner) are for building fat
+   distributions on platforms that support it.  Please do not remove */
+
+/* @OMPI_BEGIN_CONFIGURE_SECTION@ */
+
+#ifndef OMPI_CONFIG_H
+
+/* Only include these if OMPI_CONFIG_H isn't defined (meaning if
+   ompi_config.h hasn't already been included).  Otherwise, we'll
+   duplicate all those symbols.  OMPI coding standards say that
+   ompi_config.h must be included before all other files, so this
+   should be good enough */
+
+/* The compiler id which OMPI was built with */
+#undef OPAL_BUILD_PLATFORM_COMPILER_FAMILYID
+
+/* The compiler version which OMPI was built with */
+#undef OPAL_BUILD_PLATFORM_COMPILER_VERSION
+
+/* Define to 1 if you have the ANSI C header files. */
+#undef OPAL_STDC_HEADERS
+
+/* Whether your compiler has __attribute__ deprecated or not */
+#undef OPAL_HAVE_ATTRIBUTE_DEPRECATED
+
+/* Whether your compiler has __attribute__ deprecated with the optional argument */
+#undef OPAL_HAVE_ATTRIBUTE_DEPRECATED_ARGUMENT
+
+/* Whether you compiler has __attribute__ error or not */
+#undef OPAL_HAVE_ATTRIBUTE_ERROR
+
+/* Define to 1 if you have the <sys/time.h> header file. */
+#undef OPAL_HAVE_SYS_TIME_H
+
+/* Define to 1 if you have the <sys/synch.h> header file. */
+#undef OPAL_HAVE_SYS_SYNCH_H
+
+/* Define to 1 if the system has the type `long long'. */
+#undef OPAL_HAVE_LONG_LONG
+
+/* The size of a `bool', as computed by sizeof. */
+#undef OPAL_SIZEOF_BOOL
+
+/* The size of a `int', as computed by sizeof. */
+#undef OPAL_SIZEOF_INT
+
+/* The size of a `void*', as computed by sizeof.
*/ +#undef OPAL_SIZEOF_VOID_P + +/* Maximum length of datarep string (default is 128) */ +#undef OPAL_MAX_DATAREP_STRING + +/* Maximum length of error strings (default is 256) */ +#undef OPAL_MAX_ERROR_STRING + +/* Maximum length of info keys (default is 36) */ +#undef OPAL_MAX_INFO_KEY + +/* Maximum length of info vals (default is 256) */ +#undef OPAL_MAX_INFO_VAL + +/* Maximum length of object names (default is 64) */ +#undef OPAL_MAX_OBJECT_NAME + +/* Maximum length of port names (default is 1024) */ +#undef OPAL_MAX_PORT_NAME + +/* Maximum length of processor names (default is 256) */ +#undef OPAL_MAX_PROCESSOR_NAME + +/* Maximum length of processor names (default is 1024) */ +#undef OPAL_MAX_PSET_NAME_LEN + +/* Maximum length of from group tag (default is 256) */ +#undef OPAL_MAX_STRINGTAG_LEN + +/* The number or Fortran INTEGER in MPI Status */ +#undef OMPI_FORTRAN_STATUS_SIZE + +/* Whether we have FORTRAN LOGICAL*1 or not */ +#undef OMPI_HAVE_FORTRAN_LOGICAL1 + +/* Whether we have FORTRAN LOGICAL*2 or not */ +#undef OMPI_HAVE_FORTRAN_LOGICAL2 + +/* Whether we have FORTRAN LOGICAL*4 or not */ +#undef OMPI_HAVE_FORTRAN_LOGICAL4 + +/* Whether we have FORTRAN LOGICAL*8 or not */ +#undef OMPI_HAVE_FORTRAN_LOGICAL8 + +/* Whether we have FORTRAN INTEGER*1 or not */ +#undef OMPI_HAVE_FORTRAN_INTEGER1 + +/* Whether we have FORTRAN INTEGER*16 or not */ +#undef OMPI_HAVE_FORTRAN_INTEGER16 + +/* Whether we have FORTRAN INTEGER*2 or not */ +#undef OMPI_HAVE_FORTRAN_INTEGER2 + +/* Whether we have FORTRAN INTEGER*4 or not */ +#undef OMPI_HAVE_FORTRAN_INTEGER4 + +/* Whether we have FORTRAN INTEGER*8 or not */ +#undef OMPI_HAVE_FORTRAN_INTEGER8 + +/* Whether we have FORTRAN REAL*16 or not */ +#undef OMPI_HAVE_FORTRAN_REAL16 + +/* Whether we have FORTRAN REAL*2 or not */ +#undef OMPI_HAVE_FORTRAN_REAL2 + +/* Whether we have FORTRAN REAL*4 or not */ +#undef OMPI_HAVE_FORTRAN_REAL4 + +/* Whether we have FORTRAN REAL*8 or not */ +#undef OMPI_HAVE_FORTRAN_REAL8 + +/* Whether in include MPI-1 compatibility */ +#undef OMPI_ENABLE_MPI1_COMPAT + +/* Whether we have float _Complex or not */ +#undef HAVE_FLOAT__COMPLEX + +/* Whether we have double _Complex or not */ +#undef HAVE_DOUBLE__COMPLEX + +/* Whether we have long double _Complex or not */ +#undef HAVE_LONG_DOUBLE__COMPLEX + +/* Type of MPI_Aint */ +#undef OMPI_MPI_AINT_TYPE + +/* Type of MPI_Offset */ +#undef OMPI_MPI_OFFSET_TYPE + +/* MPI datatype corresponding to MPI_Offset */ +#undef OMPI_OFFSET_DATATYPE + +/* Size of the MPI_Offset corresponding type */ +#undef OMPI_MPI_OFFSET_SIZE + +/* Type of MPI_Count */ +#undef OMPI_MPI_COUNT_TYPE + +/* type to use for ptrdiff_t, if it does not exist, set to ptrdiff_t if it does exist */ +#undef ptrdiff_t + +/* Whether OMPI was built with parameter checking or not */ +#undef OMPI_PARAM_CHECK + +/* Enable warnings in wrong usage (e.g. 
deprecated) in user-level code */
+#ifndef OMPI_WANT_MPI_INTERFACE_WARNING
+#undef OMPI_WANT_MPI_INTERFACE_WARNING
+#endif
+
+/* Major, minor, and release version of Open MPI */
+#undef OMPI_MAJOR_VERSION
+#undef OMPI_MINOR_VERSION
+#undef OMPI_RELEASE_VERSION
+
+/* A type that allows us to have sentinel type values that are still
+   valid */
+#undef ompi_fortran_bogus_type_t
+
+/* C type corresponding to FORTRAN INTEGER */
+#undef ompi_fortran_integer_t
+
+/* Whether C compiler supports -fvisibility */
+#undef OPAL_C_HAVE_VISIBILITY
+
+#ifndef OMPI_DECLSPEC
+#  if defined(WIN32) || defined(_WIN32)
+#    if defined(OMPI_IMPORTS)
+#      define OMPI_DECLSPEC __declspec(dllimport)
+#    else
+#      define OMPI_DECLSPEC
+#    endif  /* defined(OMPI_IMPORTS) */
+#  else
+#    if OPAL_C_HAVE_VISIBILITY == 1
+#      define OMPI_DECLSPEC __attribute__((visibility("default")))
+#    else
+#      define OMPI_DECLSPEC
+#    endif
+#  endif
+#endif
+
+#ifndef MPI_Fint
+/* MPI_Fint is the same as ompi_fortran_INTEGER_t */
+#define MPI_Fint ompi_fortran_integer_t
+#endif
+
+#endif /* #ifndef OMPI_CONFIG_H */
+
+/* @OMPI_END_CONFIGURE_SECTION@ */
+
+/* include for ptrdiff_t */
+#ifdef OPAL_STDC_HEADERS
+#include <stddef.h>
+#endif
+
+#ifndef OMPI_BUILDING
+#define OMPI_BUILDING 0
+#endif
+
+
+/*
+ * Just in case you need it.  :-)
+ */
+#define OPEN_MPI 1
+
+/*
+ * MPI version
+ */
+#define MPI_VERSION 3
+#define MPI_SUBVERSION 1
+
+
+/*
+ * Do we want MPI interface deprecated function warnings?  This is
+ * only relevant if we're not building Open MPI (i.e., we're compiling an
+ * MPI application).
+ */
+#if !OMPI_BUILDING
+   /*
+    * Figure out which compiler is being invoked (in order to compare if
+    * it was different than what OMPI was built with).  Do some preprocessor
+    * hacks to eliminate warnings in the portable_platform.h file.
+    */
+#  ifndef SIZEOF_VOID_P
+#    define CLEANUP_SIZEOF_VOID_P 1
+#    define SIZEOF_VOID_P OPAL_SIZEOF_VOID_P
+#  else
+#    define CLEANUP_SIZEOF_VOID_P 0
+#  endif
+
+#  ifndef _PORTABLE_PLATFORM_H
+#    define _PORTABLE_PLATFORM_H 0
+#  endif
+#  ifndef PLATFORM_HEADER_VERSION
+#    define PLATFORM_HEADER_VERSION 0
+#  endif
+
+#  include "mpi_portable_platform.h"
+
+#if CLEANUP_SIZEOF_VOID_P
+#undef SIZEOF_VOID_P
+#endif
+
+   /*
+    * If we're currently using the same compiler that was used to
+    * build Open MPI, enable compile-time warning of user-level code
+    * (e.g. usage of deprecated functions).
+    */
+#  if (OPAL_BUILD_PLATFORM_COMPILER_FAMILYID == PLATFORM_COMPILER_FAMILYID) && \
+      (OPAL_BUILD_PLATFORM_COMPILER_VERSION == PLATFORM_COMPILER_VERSION)
+
+#    if OMPI_WANT_MPI_INTERFACE_WARNING
+#      if OPAL_HAVE_ATTRIBUTE_DEPRECATED
+#        if OPAL_HAVE_ATTRIBUTE_DEPRECATED_ARGUMENT
+#          define __mpi_interface_deprecated__(msg) __attribute__((__deprecated__(msg)))
+#        else
+#          define __mpi_interface_deprecated__(msg) __attribute__((__deprecated__))
+#        endif
+#      endif
+#    endif
+
+     /* For MPI removed APIs, there is no generally portable way to cause
+      * the C compiler to error with a nice message, on the _usage_ of
+      * one of these symbols.  We've gone with tiered approach:
+      *
+      * If the user configured with --enable-mpi1-compatibility,
+      * just emit a compiletime warning (via the deprecation function
+      * attribute) that they're using an MPI1 removed function.
+      *
+      * Otherwise, we'd like to issue a fatal error directing the
+      * user that they've used an MPI1 removed function.
If the + * user's compiler supports C11 _Static_assert() or + * C++11 static_assert(), we #define the MPI routines to + * instead be a call to an assert with an appropriate message + * suggesting the new MPI3 equivalent. + * + * Otherwise, if the user's compiler supports the error function + * attribute, define the MPI routines with that error attribute. + * This is supported by most modern GNU compilers. + * + * Finally if the compiler doesn't support any of those, just + * Don't declare those MPI routines at all in mpi.h + * + * Don't do MACRO magic for building Profiling library as it + * interferes with the above. + */ +# if defined(OMPI_OMIT_MPI1_COMPAT_DECLS) + /* The user set OMPI_OMIT_MPI1_COMPAT_DECLS, do what he commands */ +# elif OMPI_ENABLE_MPI1_COMPAT +# define OMPI_OMIT_MPI1_COMPAT_DECLS 0 +# define OMPI_REMOVED_USE_STATIC_ASSERT 0 +# define __mpi_interface_removed__(func, newfunc) __mpi_interface_deprecated__(#func " was removed in MPI-3.0. Use " #newfunc " instead. continuing...") +# elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) + /* This is the C11 (or later) case, which uses + _Static_assert() */ +# define OMPI_OMIT_MPI1_COMPAT_DECLS 1 +# define OMPI_REMOVED_USE_STATIC_ASSERT 1 +/* This macro definition may show up in compiler output. So we both + * outdent it back to column 0 and give it a user-friendly name to + * help users grok what we are trying to tell them here. + */ +#define THIS_SYMBOL_WAS_REMOVED_IN_MPI30(symbol, new_symbol) 0; _Static_assert(0, #symbol " was removed in MPI-3.0. Use " #new_symbol " instead.") +#define THIS_FUNCTION_WAS_REMOVED_IN_MPI30(func, newfunc) _Static_assert(0, #func " was removed in MPI-3.0. Use " #newfunc " instead.") +# elif defined(__cplusplus) && (__cplusplus >= 201103L) + /* This is the C++11 (or later) case, which uses + static_assert() */ +# define OMPI_OMIT_MPI1_COMPAT_DECLS 1 +# define OMPI_REMOVED_USE_STATIC_ASSERT 1 +/* This macro definition may show up in compiler output. So we both + * outdent it back to column 0 and give it a user-friendly name to + * help users grok what we are trying to tell them here. + */ +#define THIS_SYMBOL_WAS_REMOVED_IN_MPI30(symbol, new_symbol) 0; static_assert(0, #symbol " was removed in MPI-3.0. Use " #new_symbol " instead.") +#define THIS_FUNCTION_WAS_REMOVED_IN_MPI30(func, newfunc) static_assert(0, #func " was removed in MPI-3.0. Use " #newfunc " instead.") +# elif OPAL_HAVE_ATTRIBUTE_ERROR +# define OMPI_OMIT_MPI1_COMPAT_DECLS 0 +# define OMPI_REMOVED_USE_STATIC_ASSERT 0 +# define __mpi_interface_removed__(func, newfunc) __attribute__((__error__(#func " was removed in MPI-3.0. Use " #newfunc " instead."))) +# else +# define OMPI_OMIT_MPI1_COMPAT_DECLS 1 +# define OMPI_REMOVED_USE_STATIC_ASSERT 0 +# endif +# endif +#endif + +/* + * If we didn't define __mpi_interface_deprecated__ above, then we + * don't want it, so define it to empty (can't use #undef in the logic + * above because autoconf will comment it out). 
+ */
+#if !defined(__mpi_interface_deprecated__)
+#  define __mpi_interface_deprecated__(msg)
+#endif
+
+#if !defined(__mpi_interface_removed__)
+#  define __mpi_interface_removed__(A,B)
+#endif
+
+#if !defined(THIS_SYMBOL_WAS_REMOVED_IN_MPI30)
+#  define THIS_SYMBOL_WAS_REMOVED_IN_MPI30(symbol, newsymbol)
+#endif
+#if !defined(THIS_FUNCTION_WAS_REMOVED_IN_MPI30)
+#  define THIS_FUNCTION_WAS_REMOVED_IN_MPI30(func, newfunc)
+#endif
+
+#if !defined(OMPI_REMOVED_USE_STATIC_ASSERT)
+#  define OMPI_REMOVED_USE_STATIC_ASSERT 0
+#endif
+
+#if !defined(OMPI_OMIT_MPI1_COMPAT_DECLS)
+#  define OMPI_OMIT_MPI1_COMPAT_DECLS (!(OMPI_ENABLE_MPI1_COMPAT || OMPI_BUILDING))
+#endif
+
+/*
+ * To accommodate programs written for MPI implementations that use a
+ * straight ROMIO import
+ */
+#if !OMPI_BUILDING
+#define MPIO_Request MPI_Request
+#define MPIO_Test MPI_Test
+#define MPIO_Wait MPI_Wait
+#endif
+
+/*
+ * When initializing global pointers to Open MPI internally-defined
+ * structs, some compilers warn about type-punning to incomplete
+ * types.  Therefore, when full struct definitions are unavailable
+ * (when not building Open MPI), cast to an opaque (void *) pointer to
+ * disable any strict-aliasing optimizations.  Don't cast to (void *)
+ * when building Open MPI so that we actually get the benefit of type
+ * checking (because we *do* have the full type definitions available
+ * when building OMPI).
+ */
+#if !OMPI_BUILDING
+#if defined(c_plusplus) || defined(__cplusplus)
+#define OMPI_PREDEFINED_GLOBAL(type, global) (static_cast<type> (static_cast<void *> (&(global))))
+#else
+#define OMPI_PREDEFINED_GLOBAL(type, global) ((type) ((void *) &(global)))
+#endif
+#else
+#define OMPI_PREDEFINED_GLOBAL(type, global) ((type) &(global))
+#endif
+
+#if defined(c_plusplus) || defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Typedefs
+ */
+
+typedef OMPI_MPI_AINT_TYPE MPI_Aint;
+typedef OMPI_MPI_OFFSET_TYPE MPI_Offset;
+typedef OMPI_MPI_COUNT_TYPE MPI_Count;
+typedef struct ompi_communicator_t *MPI_Comm;
+typedef struct ompi_datatype_t *MPI_Datatype;
+typedef struct ompi_errhandler_t *MPI_Errhandler;
+typedef struct ompi_file_t *MPI_File;
+typedef struct ompi_group_t *MPI_Group;
+typedef struct ompi_info_t *MPI_Info;
+typedef struct ompi_op_t *MPI_Op;
+typedef struct ompi_request_t *MPI_Request;
+typedef struct ompi_message_t *MPI_Message;
+typedef struct ompi_status_public_t MPI_Status;
+typedef struct ompi_f08_status_public_t MPI_F08_status;
+typedef struct ompi_win_t *MPI_Win;
+typedef struct mca_base_var_enum_t *MPI_T_enum;
+typedef struct ompi_mpit_cvar_handle_t *MPI_T_cvar_handle;
+typedef struct mca_base_pvar_handle_t *MPI_T_pvar_handle;
+typedef struct mca_base_pvar_session_t *MPI_T_pvar_session;
+typedef struct ompi_instance_t *MPI_Session;
+
+/*
+ * MPI_Status
+ */
+struct ompi_status_public_t {
+    /* These fields are publicly defined in the MPI specification.
+       User applications may freely read from these fields. */
+    int MPI_SOURCE;
+    int MPI_TAG;
+    int MPI_ERROR;
+    /* The following two fields are internal to the Open MPI
+       implementation and should not be accessed by MPI applications.
+       They are subject to change at any time.  These are not the
+       droids you're looking for. */
+    int _cancelled;
+    size_t _ucount;
+};
+typedef struct ompi_status_public_t ompi_status_public_t;
+
+/*
+ * MPI_F08_status
+ */
+struct ompi_f08_status_public_t {
+    /* These fields are publicly defined in the MPI specification.
+       User applications may freely read from these fields.
*/ + MPI_Fint MPI_SOURCE; + MPI_Fint MPI_TAG; + MPI_Fint MPI_ERROR; + MPI_Fint internal[OMPI_FORTRAN_STATUS_SIZE - 3]; +}; +typedef struct ompi_f08_status_public_t ompi_f08_status_public_t; + +/* + * User typedefs + */ +typedef int (MPI_Datarep_extent_function)(MPI_Datatype, MPI_Aint *, void *); +typedef int (MPI_Datarep_conversion_function)(void *, MPI_Datatype, + int, void *, MPI_Offset, void *); +typedef void (MPI_Comm_errhandler_function)(MPI_Comm *, int *, ...); +typedef void (MPI_Session_errhandler_function) (MPI_Session *, int *, ...); + + /* This is a little hackish, but errhandler.h needs space for a + MPI_File_errhandler_function. While it could just be removed, this + allows us to maintain a stable ABI within OMPI, at least for + apps that don't use MPI I/O. */ +typedef void (ompi_file_errhandler_function)(MPI_File *, int *, ...); +typedef void (MPI_Win_errhandler_function)(MPI_Win *, int *, ...); +typedef void (MPI_User_function)(void *, void *, int *, MPI_Datatype *); +typedef int (MPI_Comm_copy_attr_function)(MPI_Comm, int, void *, + void *, void *, int *); +typedef int (MPI_Comm_delete_attr_function)(MPI_Comm, int, void *, void *); +typedef int (MPI_Type_copy_attr_function)(MPI_Datatype, int, void *, + void *, void *, int *); +typedef int (MPI_Type_delete_attr_function)(MPI_Datatype, int, + void *, void *); +typedef int (MPI_Win_copy_attr_function)(MPI_Win, int, void *, + void *, void *, int *); +typedef int (MPI_Win_delete_attr_function)(MPI_Win, int, void *, void *); +typedef int (MPI_Session_delete_attr_function)(MPI_Session, int, void *, void *); +typedef int (MPI_Grequest_query_function)(void *, MPI_Status *); +typedef int (MPI_Grequest_free_function)(void *); +typedef int (MPI_Grequest_cancel_function)(void *, int); + +/* + * Deprecated typedefs. Usage is discouraged, as these may be deleted + * in future versions of the MPI Standard. + */ +typedef MPI_Comm_errhandler_function MPI_Comm_errhandler_fn + __mpi_interface_deprecated__("MPI_Comm_errhandler_fn was deprecated in MPI-2.2; use MPI_Comm_errhandler_function instead"); +typedef ompi_file_errhandler_function MPI_File_errhandler_fn + __mpi_interface_deprecated__("MPI_File_errhandler_fn was deprecated in MPI-2.2; use MPI_File_errhandler_function instead"); +typedef ompi_file_errhandler_function MPI_File_errhandler_function; +typedef MPI_Win_errhandler_function MPI_Win_errhandler_fn + __mpi_interface_deprecated__("MPI_Win_errhandler_fn was deprecated in MPI-2.2; use MPI_Win_errhandler_function instead"); + + +/* + * Miscellaneous constants + */ +#define MPI_ANY_SOURCE -1 /* match any source rank */ +#define MPI_PROC_NULL -2 /* rank of null process */ +#define MPI_ROOT -4 /* special value for intercomms */ +#define MPI_ANY_TAG -1 /* match any message tag */ +#define MPI_MAX_PROCESSOR_NAME OPAL_MAX_PROCESSOR_NAME /* max proc. 
name length */ +#define MPI_MAX_ERROR_STRING OPAL_MAX_ERROR_STRING /* max error message length */ +#define MPI_MAX_OBJECT_NAME OPAL_MAX_OBJECT_NAME /* max object name length */ +#define MPI_MAX_LIBRARY_VERSION_STRING 256 /* max length of library version string */ +#define MPI_UNDEFINED -32766 /* undefined stuff */ +#define MPI_DIST_GRAPH 3 /* dist graph topology */ +#define MPI_CART 1 /* cartesian topology */ +#define MPI_GRAPH 2 /* graph topology */ +#define MPI_KEYVAL_INVALID -1 /* invalid key value */ + +/* + * More constants + */ +#define MPI_UNWEIGHTED ((int *) 2) /* unweighted graph */ +#define MPI_WEIGHTS_EMPTY ((int *) 3) /* empty weights */ +#define MPI_BOTTOM ((void *) 0) /* base reference address */ +#define MPI_IN_PLACE ((void *) 1) /* in place buffer */ +#define MPI_BSEND_OVERHEAD 128 /* size of bsend header + ptr */ +#define MPI_MAX_INFO_KEY OPAL_MAX_INFO_KEY /* max info key length */ +#define MPI_MAX_INFO_VAL OPAL_MAX_INFO_VAL /* max info value length */ +#define MPI_ARGV_NULL ((char **) 0) /* NULL argument vector */ +#define MPI_ARGVS_NULL ((char ***) 0) /* NULL argument vectors */ +#define MPI_ERRCODES_IGNORE ((int *) 0) /* don't return error codes */ +#define MPI_MAX_PORT_NAME OPAL_MAX_PORT_NAME /* max port name length */ +#define MPI_ORDER_C 0 /* C row major order */ +#define MPI_ORDER_FORTRAN 1 /* Fortran column major order */ +#define MPI_DISTRIBUTE_BLOCK 0 /* block distribution */ +#define MPI_DISTRIBUTE_CYCLIC 1 /* cyclic distribution */ +#define MPI_DISTRIBUTE_NONE 2 /* not distributed */ +#define MPI_DISTRIBUTE_DFLT_DARG (-1) /* default distribution arg */ +#define MPI_MAX_PSET_NAME_LEN OPAL_MAX_PSET_NAME_LEN /* max pset name len */ +#define MPI_MAX_STRINGTAG_LEN OPAL_MAX_STRINGTAG_LEN /* max length of string arg to comm from group funcs*/ + +/* + * Since these values are arbitrary to Open MPI, we might as well make + * them the same as ROMIO for ease of mapping. These values taken + * from ROMIO's mpio.h file. + */ +#define MPI_MODE_CREATE 1 /* ADIO_CREATE */ +#define MPI_MODE_RDONLY 2 /* ADIO_RDONLY */ +#define MPI_MODE_WRONLY 4 /* ADIO_WRONLY */ +#define MPI_MODE_RDWR 8 /* ADIO_RDWR */ +#define MPI_MODE_DELETE_ON_CLOSE 16 /* ADIO_DELETE_ON_CLOSE */ +#define MPI_MODE_UNIQUE_OPEN 32 /* ADIO_UNIQUE_OPEN */ +#define MPI_MODE_EXCL 64 /* ADIO_EXCL */ +#define MPI_MODE_APPEND 128 /* ADIO_APPEND */ +#define MPI_MODE_SEQUENTIAL 256 /* ADIO_SEQUENTIAL */ + +#define MPI_DISPLACEMENT_CURRENT -54278278 + +#define MPI_SEEK_SET 600 +#define MPI_SEEK_CUR 602 +#define MPI_SEEK_END 604 + +/* Max data representation length */ +#define MPI_MAX_DATAREP_STRING OPAL_MAX_DATAREP_STRING + +/* + * MPI-2 One-Sided Communications asserts + */ +#define MPI_MODE_NOCHECK 1 +#define MPI_MODE_NOPRECEDE 2 +#define MPI_MODE_NOPUT 4 +#define MPI_MODE_NOSTORE 8 +#define MPI_MODE_NOSUCCEED 16 + +#define MPI_LOCK_EXCLUSIVE 1 +#define MPI_LOCK_SHARED 2 + +#define MPI_WIN_FLAVOR_CREATE 1 +#define MPI_WIN_FLAVOR_ALLOCATE 2 +#define MPI_WIN_FLAVOR_DYNAMIC 3 +#define MPI_WIN_FLAVOR_SHARED 4 + +#define MPI_WIN_UNIFIED 0 +#define MPI_WIN_SEPARATE 1 + +/* + * Predefined attribute keyvals + * + * DO NOT CHANGE THE ORDER WITHOUT ALSO CHANGING THE ORDER IN + * src/attribute/attribute_predefined.c and mpif.h.in. 
+ */ +enum { + /* MPI-1 */ + MPI_TAG_UB, + MPI_HOST, + MPI_IO, + MPI_WTIME_IS_GLOBAL, + + /* MPI-2 */ + MPI_APPNUM, + MPI_LASTUSEDCODE, + MPI_UNIVERSE_SIZE, + MPI_WIN_BASE, + MPI_WIN_SIZE, + MPI_WIN_DISP_UNIT, + MPI_WIN_CREATE_FLAVOR, + MPI_WIN_MODEL, + + /* MPI-4 */ + MPI_FT, /* used by OPAL_ENABLE_FT_MPI */ + MPI_ATTR_PREDEFINED_KEY_MAX, +}; + +/* + * Error classes and codes + * Do not change the values of these without also modifying mpif.h.in. + */ +#define MPI_SUCCESS 0 +#define MPI_ERR_BUFFER 1 +#define MPI_ERR_COUNT 2 +#define MPI_ERR_TYPE 3 +#define MPI_ERR_TAG 4 +#define MPI_ERR_COMM 5 +#define MPI_ERR_RANK 6 +#define MPI_ERR_REQUEST 7 +#define MPI_ERR_ROOT 8 +#define MPI_ERR_GROUP 9 +#define MPI_ERR_OP 10 +#define MPI_ERR_TOPOLOGY 11 +#define MPI_ERR_DIMS 12 +#define MPI_ERR_ARG 13 +#define MPI_ERR_UNKNOWN 14 +#define MPI_ERR_TRUNCATE 15 +#define MPI_ERR_OTHER 16 +#define MPI_ERR_INTERN 17 +#define MPI_ERR_IN_STATUS 18 +#define MPI_ERR_PENDING 19 +#define MPI_ERR_ACCESS 20 +#define MPI_ERR_AMODE 21 +#define MPI_ERR_ASSERT 22 +#define MPI_ERR_BAD_FILE 23 +#define MPI_ERR_BASE 24 +#define MPI_ERR_CONVERSION 25 +#define MPI_ERR_DISP 26 +#define MPI_ERR_DUP_DATAREP 27 +#define MPI_ERR_FILE_EXISTS 28 +#define MPI_ERR_FILE_IN_USE 29 +#define MPI_ERR_FILE 30 +#define MPI_ERR_INFO_KEY 31 +#define MPI_ERR_INFO_NOKEY 32 +#define MPI_ERR_INFO_VALUE 33 +#define MPI_ERR_INFO 34 +#define MPI_ERR_IO 35 +#define MPI_ERR_KEYVAL 36 +#define MPI_ERR_LOCKTYPE 37 +#define MPI_ERR_NAME 38 +#define MPI_ERR_NO_MEM 39 +#define MPI_ERR_NOT_SAME 40 +#define MPI_ERR_NO_SPACE 41 +#define MPI_ERR_NO_SUCH_FILE 42 +#define MPI_ERR_PORT 43 +#define MPI_ERR_QUOTA 44 +#define MPI_ERR_READ_ONLY 45 +#define MPI_ERR_RMA_CONFLICT 46 +#define MPI_ERR_RMA_SYNC 47 +#define MPI_ERR_SERVICE 48 +#define MPI_ERR_SIZE 49 +#define MPI_ERR_SPAWN 50 +#define MPI_ERR_UNSUPPORTED_DATAREP 51 +#define MPI_ERR_UNSUPPORTED_OPERATION 52 +#define MPI_ERR_WIN 53 + +#define MPI_T_ERR_MEMORY 54 +#define MPI_T_ERR_NOT_INITIALIZED 55 +#define MPI_T_ERR_CANNOT_INIT 56 +#define MPI_T_ERR_INVALID_INDEX 57 +#define MPI_T_ERR_INVALID_ITEM 58 +#define MPI_T_ERR_INVALID_HANDLE 59 +#define MPI_T_ERR_OUT_OF_HANDLES 60 +#define MPI_T_ERR_OUT_OF_SESSIONS 61 +#define MPI_T_ERR_INVALID_SESSION 62 +#define MPI_T_ERR_CVAR_SET_NOT_NOW 63 +#define MPI_T_ERR_CVAR_SET_NEVER 64 +#define MPI_T_ERR_PVAR_NO_STARTSTOP 65 +#define MPI_T_ERR_PVAR_NO_WRITE 66 +#define MPI_T_ERR_PVAR_NO_ATOMIC 67 +#define MPI_ERR_RMA_RANGE 68 +#define MPI_ERR_RMA_ATTACH 69 +#define MPI_ERR_RMA_FLAVOR 70 +#define MPI_ERR_RMA_SHARED 71 +#define MPI_T_ERR_INVALID 72 +#define MPI_T_ERR_INVALID_NAME 73 +#define MPI_ERR_PROC_ABORTED 74 + +/* not #if conditional on OPAL_ENABLE_FT_MPI for ABI */ +#define MPI_ERR_PROC_FAILED 75 +#define MPI_ERR_PROC_FAILED_PENDING 76 +#define MPI_ERR_REVOKED 77 + +/* Per MPI-3 p349 47, MPI_ERR_LASTCODE must be >= the last predefined + MPI_ERR_ code. Set the last code to allow some room for adding + error codes without breaking ABI. */ +#define MPI_ERR_LASTCODE 92 + +/* + * Comparison results. Don't change the order of these, the group + * comparison functions rely on it. + * Do not change the order of these without also modifying mpif.h.in. + */ +enum { + MPI_IDENT, + MPI_CONGRUENT, + MPI_SIMILAR, + MPI_UNEQUAL +}; + +/* + * MPI_Init_thread constants + * Do not change the order of these without also modifying mpif.h.in. + */ +enum { + MPI_THREAD_SINGLE, + MPI_THREAD_FUNNELED, + MPI_THREAD_SERIALIZED, + MPI_THREAD_MULTIPLE +}; + +/* + * Datatype combiners. 
+ * Do not change the order of these without also modifying mpif.h.in. + * (see also mpif-common.h.fin). + */ +enum { + MPI_COMBINER_NAMED, + MPI_COMBINER_DUP, + MPI_COMBINER_CONTIGUOUS, + MPI_COMBINER_VECTOR, +#if (!OMPI_OMIT_MPI1_COMPAT_DECLS) + MPI_COMBINER_HVECTOR_INTEGER, +#else + OMPI_WAS_MPI_COMBINER_HVECTOR_INTEGER, /* preserve ABI compatibility */ +#endif + MPI_COMBINER_HVECTOR, + MPI_COMBINER_INDEXED, +#if (!OMPI_OMIT_MPI1_COMPAT_DECLS) + MPI_COMBINER_HINDEXED_INTEGER, +#else + OMPI_WAS_MPI_COMBINER_HINDEXED_INTEGER, /* preserve ABI compatibility */ +#endif + MPI_COMBINER_HINDEXED, + MPI_COMBINER_INDEXED_BLOCK, +#if (!OMPI_OMIT_MPI1_COMPAT_DECLS) + MPI_COMBINER_STRUCT_INTEGER, +#else + OMPI_WAS_MPI_COMBINER_STRUCT_INTEGER, /* preserve ABI compatibility */ +#endif + MPI_COMBINER_STRUCT, + MPI_COMBINER_SUBARRAY, + MPI_COMBINER_DARRAY, + MPI_COMBINER_F90_REAL, + MPI_COMBINER_F90_COMPLEX, + MPI_COMBINER_F90_INTEGER, + MPI_COMBINER_RESIZED, + MPI_COMBINER_HINDEXED_BLOCK +}; + +#if (OMPI_OMIT_MPI1_COMPAT_DECLS) +/* If not building or configured --enable-mpi1-compatibility, then + * we don't want these datatypes, instead we define MPI_COMBINER_*_INTEGER + * to our Static Assert message if the compiler supports + * that staticly assert with a nice message. + */ +# if (OMPI_REMOVED_USE_STATIC_ASSERT) +# define MPI_COMBINER_HVECTOR_INTEGER THIS_SYMBOL_WAS_REMOVED_IN_MPI30(MPI_COMBINER_HVECTOR_INTEGER, MPI_COMBINER_HVECTOR); +# define MPI_COMBINER_HINDEXED_INTEGER THIS_SYMBOL_WAS_REMOVED_IN_MPI30(MPI_COMBINER_HINDEXED_INTEGER, MPI_COMBINER_HINDEXED); +# define MPI_COMBINER_STRUCT_INTEGER THIS_SYMBOL_WAS_REMOVED_IN_MPI30(MPI_COMBINER_STRUCT_INTEGER, MPI_COMBINER_STRUCT); +# endif /* OMPI_REMOVED_USE_STATIC_ASSERT */ +#endif /* Removed datatypes */ + +/* + * Communicator split type constants. + * Do not change the order of these without also modifying mpif.h.in + * (see also mpif-common.h.fin). 
+ */
+enum {
+  MPI_COMM_TYPE_SHARED,
+  OMPI_COMM_TYPE_HWTHREAD,
+  OMPI_COMM_TYPE_CORE,
+  OMPI_COMM_TYPE_L1CACHE,
+  OMPI_COMM_TYPE_L2CACHE,
+  OMPI_COMM_TYPE_L3CACHE,
+  OMPI_COMM_TYPE_SOCKET,
+  OMPI_COMM_TYPE_NUMA,
+  OMPI_COMM_TYPE_BOARD,
+  OMPI_COMM_TYPE_HOST,
+  OMPI_COMM_TYPE_CU,
+  OMPI_COMM_TYPE_CLUSTER
+};
+#define OMPI_COMM_TYPE_NODE MPI_COMM_TYPE_SHARED
+
+/*
+ * MPIT Verbosity Levels
+ */
+enum {
+  MPI_T_VERBOSITY_USER_BASIC,
+  MPI_T_VERBOSITY_USER_DETAIL,
+  MPI_T_VERBOSITY_USER_ALL,
+  MPI_T_VERBOSITY_TUNER_BASIC,
+  MPI_T_VERBOSITY_TUNER_DETAIL,
+  MPI_T_VERBOSITY_TUNER_ALL,
+  MPI_T_VERBOSITY_MPIDEV_BASIC,
+  MPI_T_VERBOSITY_MPIDEV_DETAIL,
+  MPI_T_VERBOSITY_MPIDEV_ALL
+};
+
+/*
+ * MPIT Scopes
+ */
+enum {
+  MPI_T_SCOPE_CONSTANT,
+  MPI_T_SCOPE_READONLY,
+  MPI_T_SCOPE_LOCAL,
+  MPI_T_SCOPE_GROUP,
+  MPI_T_SCOPE_GROUP_EQ,
+  MPI_T_SCOPE_ALL,
+  MPI_T_SCOPE_ALL_EQ
+};
+
+/*
+ * MPIT Object Binding
+ */
+enum {
+  MPI_T_BIND_NO_OBJECT,
+  MPI_T_BIND_MPI_COMM,
+  MPI_T_BIND_MPI_DATATYPE,
+  MPI_T_BIND_MPI_ERRHANDLER,
+  MPI_T_BIND_MPI_FILE,
+  MPI_T_BIND_MPI_GROUP,
+  MPI_T_BIND_MPI_OP,
+  MPI_T_BIND_MPI_REQUEST,
+  MPI_T_BIND_MPI_WIN,
+  MPI_T_BIND_MPI_MESSAGE,
+  MPI_T_BIND_MPI_INFO
+};
+
+/*
+ * MPIT pvar classes
+ */
+enum {
+  MPI_T_PVAR_CLASS_STATE,
+  MPI_T_PVAR_CLASS_LEVEL,
+  MPI_T_PVAR_CLASS_SIZE,
+  MPI_T_PVAR_CLASS_PERCENTAGE,
+  MPI_T_PVAR_CLASS_HIGHWATERMARK,
+  MPI_T_PVAR_CLASS_LOWWATERMARK,
+  MPI_T_PVAR_CLASS_COUNTER,
+  MPI_T_PVAR_CLASS_AGGREGATE,
+  MPI_T_PVAR_CLASS_TIMER,
+  MPI_T_PVAR_CLASS_GENERIC
+};
+
+/*
+ * NULL handles
+ */
+#define MPI_SESSION_NULL OMPI_PREDEFINED_GLOBAL(MPI_Session, ompi_mpi_instance_null)
+#define MPI_GROUP_NULL OMPI_PREDEFINED_GLOBAL(MPI_Group, ompi_mpi_group_null)
+#define MPI_COMM_NULL OMPI_PREDEFINED_GLOBAL(MPI_Comm, ompi_mpi_comm_null)
+#define MPI_REQUEST_NULL OMPI_PREDEFINED_GLOBAL(MPI_Request, ompi_request_null)
+#define MPI_MESSAGE_NULL OMPI_PREDEFINED_GLOBAL(MPI_Message, ompi_message_null)
+#define MPI_OP_NULL OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_null)
+#define MPI_ERRHANDLER_NULL OMPI_PREDEFINED_GLOBAL(MPI_Errhandler, ompi_mpi_errhandler_null)
+#define MPI_INFO_NULL OMPI_PREDEFINED_GLOBAL(MPI_Info, ompi_mpi_info_null)
+#define MPI_WIN_NULL OMPI_PREDEFINED_GLOBAL(MPI_Win, ompi_mpi_win_null)
+#define MPI_FILE_NULL OMPI_PREDEFINED_GLOBAL(MPI_File, ompi_mpi_file_null)
+#define MPI_T_ENUM_NULL ((MPI_T_enum) NULL)
+
+/*
+ * MPI_INFO_ENV handle
+ */
+#define MPI_INFO_ENV OMPI_PREDEFINED_GLOBAL(MPI_Info, ompi_mpi_info_env)
+
+#if defined(c_plusplus) || defined(__cplusplus)
+#define MPI_STATUS_IGNORE (static_cast<MPI_Status *> (0))
+#define MPI_STATUSES_IGNORE (static_cast<MPI_Status *> (0))
+#else
+#define MPI_STATUS_IGNORE ((MPI_Status *) 0)
+#define MPI_STATUSES_IGNORE ((MPI_Status *) 0)
+#endif
+
+/*
+ * Special MPI_T handles
+ */
+#define MPI_T_PVAR_ALL_HANDLES ((MPI_T_pvar_handle) -1)
+#define MPI_T_PVAR_HANDLE_NULL ((MPI_T_pvar_handle) 0)
+#define MPI_T_PVAR_SESSION_NULL ((MPI_T_pvar_session) 0)
+#define MPI_T_CVAR_HANDLE_NULL ((MPI_T_cvar_handle) 0)
+
+/* MPI-2 specifies that the name "MPI_TYPE_NULL_DELETE_FN" (and all
+   related friends) must be accessible in C, C++, and Fortran. This is
+   unworkable if the back-end Fortran compiler uses all caps for its
+   linker symbol convention -- it results in two functions with
+   different signatures that have the same name (i.e., both C and
+   Fortran use the symbol MPI_TYPE_NULL_DELETE_FN).
So we have to + #define the C names to be something else, so that they names are + *accessed* through MPI_TYPE_NULL_DELETE_FN, but their actual symbol + name is different. + + However, this file is included when the fortran wrapper functions + are compiled in Open MPI, so we do *not* want these #defines in + this case (i.e., we need the Fortran wrapper function to be + compiled as MPI_TYPE_NULL_DELETE_FN). So add some #if kinds of + protection for this case. */ + +#if !defined(OMPI_COMPILING_FORTRAN_WRAPPERS) + +#define MPI_TYPE_NULL_DELETE_FN OMPI_C_MPI_TYPE_NULL_DELETE_FN +#define MPI_TYPE_NULL_COPY_FN OMPI_C_MPI_TYPE_NULL_COPY_FN +#define MPI_TYPE_DUP_FN OMPI_C_MPI_TYPE_DUP_FN + +#define MPI_COMM_NULL_DELETE_FN OMPI_C_MPI_COMM_NULL_DELETE_FN +#define MPI_COMM_NULL_COPY_FN OMPI_C_MPI_COMM_NULL_COPY_FN +#define MPI_COMM_DUP_FN OMPI_C_MPI_COMM_DUP_FN + +#define MPI_WIN_NULL_DELETE_FN OMPI_C_MPI_WIN_NULL_DELETE_FN +#define MPI_WIN_NULL_COPY_FN OMPI_C_MPI_WIN_NULL_COPY_FN +#define MPI_WIN_DUP_FN OMPI_C_MPI_WIN_DUP_FN + +/* MPI_CONVERSION_FN_NULL is a sentinel value, but it has to be large + enough to be the same size as a valid function pointer. It + therefore shares many characteristics between Fortran constants and + Fortran sentinel functions. For example, it shares the problem of + having Fortran compilers have all-caps versions of the symbols that + must be able to be present, and therefore has to be in this + conditional block in mpi.h. */ +#define MPI_CONVERSION_FN_NULL ((MPI_Datarep_conversion_function*) 0) +#endif + +OMPI_DECLSPEC int OMPI_C_MPI_TYPE_NULL_DELETE_FN( MPI_Datatype datatype, + int type_keyval, + void* attribute_val_out, + void* extra_state ); +OMPI_DECLSPEC int OMPI_C_MPI_TYPE_NULL_COPY_FN( MPI_Datatype datatype, + int type_keyval, + void* extra_state, + void* attribute_val_in, + void* attribute_val_out, + int* flag ); +OMPI_DECLSPEC int OMPI_C_MPI_TYPE_DUP_FN( MPI_Datatype datatype, + int type_keyval, + void* extra_state, + void* attribute_val_in, + void* attribute_val_out, + int* flag ); +OMPI_DECLSPEC int OMPI_C_MPI_COMM_NULL_DELETE_FN( MPI_Comm comm, + int comm_keyval, + void* attribute_val_out, + void* extra_state ); +OMPI_DECLSPEC int OMPI_C_MPI_COMM_NULL_COPY_FN( MPI_Comm comm, + int comm_keyval, + void* extra_state, + void* attribute_val_in, + void* attribute_val_out, + int* flag ); +OMPI_DECLSPEC int OMPI_C_MPI_COMM_DUP_FN( MPI_Comm comm, int comm_keyval, + void* extra_state, + void* attribute_val_in, + void* attribute_val_out, + int* flag ); +OMPI_DECLSPEC int OMPI_C_MPI_WIN_NULL_DELETE_FN( MPI_Win window, + int win_keyval, + void* attribute_val_out, + void* extra_state ); +OMPI_DECLSPEC int OMPI_C_MPI_WIN_NULL_COPY_FN( MPI_Win window, int win_keyval, + void* extra_state, + void* attribute_val_in, + void* attribute_val_out, + int* flag ); +OMPI_DECLSPEC int OMPI_C_MPI_WIN_DUP_FN( MPI_Win window, int win_keyval, + void* extra_state, + void* attribute_val_in, + void* attribute_val_out, + int* flag ); + +/* + * External variables + * + * The below externs use the ompi_predefined_xxx_t structures to maintain + * back compatibility between MPI library versions. + * See ompi/communicator/communicator.h comments with struct ompi_communicator_t + * for full explanation why we chose to use the ompi_predefined_xxx_t structure. 
+ */ +OMPI_DECLSPEC extern struct ompi_predefined_communicator_t ompi_mpi_comm_world; +OMPI_DECLSPEC extern struct ompi_predefined_communicator_t ompi_mpi_comm_self; +OMPI_DECLSPEC extern struct ompi_predefined_communicator_t ompi_mpi_comm_null; + +OMPI_DECLSPEC extern struct ompi_predefined_group_t ompi_mpi_group_empty; +OMPI_DECLSPEC extern struct ompi_predefined_group_t ompi_mpi_group_null; + +OMPI_DECLSPEC extern struct ompi_predefined_instance_t ompi_mpi_instance_null; + +OMPI_DECLSPEC extern struct ompi_predefined_request_t ompi_request_null; + +OMPI_DECLSPEC extern struct ompi_predefined_message_t ompi_message_null; +OMPI_DECLSPEC extern struct ompi_predefined_message_t ompi_message_no_proc; + +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_null; +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_min; +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_max; +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_sum; +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_prod; +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_land; +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_band; +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_lor; +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_bor; +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_lxor; +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_bxor; +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_maxloc; +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_minloc; +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_replace; +OMPI_DECLSPEC extern struct ompi_predefined_op_t ompi_mpi_op_no_op; + + +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_datatype_null; + +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_char; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_signed_char; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_unsigned_char; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_byte; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_short; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_unsigned_short; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_int; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_unsigned; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_long; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_unsigned_long; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_long_long_int; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_unsigned_long_long; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_float; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_double; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_long_double; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_wchar; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_packed; + +/* + * Following are the C++/C99 datatypes + */ +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_cxx_bool; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_cxx_cplex; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_cxx_dblcplex; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_cxx_ldblcplex; + +/* + * Following are the Fortran datatypes + */ +OMPI_DECLSPEC extern struct 
ompi_predefined_datatype_t ompi_mpi_logical; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_character; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_integer; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_real; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_dblprec; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_cplex; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_dblcplex; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_ldblcplex; + +/* Aggregate struct datatypes are not const */ +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_2int; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_2integer; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_2real; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_2dblprec; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_2cplex; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_2dblcplex; + +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_float_int; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_double_int; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_longdbl_int; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_short_int; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_long_int; + +/* Optional MPI2 datatypes, always declared and defined, but not "exported" as MPI_LOGICAL1 */ +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_logical1; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_logical2; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_logical4; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_logical8; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_integer1; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_integer2; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_integer4; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_integer8; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_integer16; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_real2; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_real4; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_real8; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_real16; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_complex4; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_complex8; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_complex16; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_complex32; + +/* New datatypes from the MPI 2.2 standard */ +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_int8_t; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_uint8_t; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_int16_t; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_uint16_t; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_int32_t; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_uint32_t; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_int64_t; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_uint64_t; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_aint; +OMPI_DECLSPEC extern struct 
ompi_predefined_datatype_t ompi_mpi_offset; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_count; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_c_bool; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_c_float_complex; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_c_double_complex; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_c_long_double_complex; + +OMPI_DECLSPEC extern struct ompi_predefined_errhandler_t ompi_mpi_errhandler_null; +OMPI_DECLSPEC extern struct ompi_predefined_errhandler_t ompi_mpi_errors_are_fatal; +OMPI_DECLSPEC extern struct ompi_predefined_errhandler_t ompi_mpi_errors_abort; +OMPI_DECLSPEC extern struct ompi_predefined_errhandler_t ompi_mpi_errors_return; + +OMPI_DECLSPEC extern struct ompi_predefined_win_t ompi_mpi_win_null; +OMPI_DECLSPEC extern struct ompi_predefined_file_t ompi_mpi_file_null; + +OMPI_DECLSPEC extern struct ompi_predefined_info_t ompi_mpi_info_null; +OMPI_DECLSPEC extern struct ompi_predefined_info_t ompi_mpi_info_env; + +OMPI_DECLSPEC extern MPI_Fint *MPI_F_STATUS_IGNORE; +OMPI_DECLSPEC extern MPI_Fint *MPI_F_STATUSES_IGNORE; + +OMPI_DECLSPEC extern MPI_Fint *MPI_F08_STATUS_IGNORE; +OMPI_DECLSPEC extern MPI_Fint *MPI_F08_STATUSES_IGNORE; + +/* + * Removed datatypes. These datatypes are only available if Open MPI + * was configured with --enable-mpi1-compatibility. + * + * These datatypes were formally removed from the MPI specification + * and should no longer be used in MPI applications. + */ +#if (!OMPI_OMIT_MPI1_COMPAT_DECLS) +# define MPI_UB OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_ub) +# define MPI_LB OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_lb) + +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_lb; +OMPI_DECLSPEC extern struct ompi_predefined_datatype_t ompi_mpi_ub; + +#else +/* If not building or configured --enable-mpi1-compatibility, then + * we don't want these datatypes, instead we define MPI_UB and + * MPI_LB to our Static Assert message if the compiler supports + * that staticly assert with a nice message. 
+ */ +# if (OMPI_REMOVED_USE_STATIC_ASSERT) +# define MPI_UB THIS_SYMBOL_WAS_REMOVED_IN_MPI30(MPI_UB, MPI_Type_create_resized); +# define MPI_LB THIS_SYMBOL_WAS_REMOVED_IN_MPI30(MPI_LB, MPI_Type_create_resized); +# endif /* OMPI_REMOVED_USE_STATIC_ASSERT */ +#endif /* Removed datatypes */ + + +/* + * MPI predefined handles + */ +#define MPI_COMM_WORLD OMPI_PREDEFINED_GLOBAL( MPI_Comm, ompi_mpi_comm_world) +#define MPI_COMM_SELF OMPI_PREDEFINED_GLOBAL(MPI_Comm, ompi_mpi_comm_self) + +#define MPI_GROUP_EMPTY OMPI_PREDEFINED_GLOBAL(MPI_Group, ompi_mpi_group_empty) + +#define MPI_MESSAGE_NO_PROC OMPI_PREDEFINED_GLOBAL(MPI_Message, ompi_message_no_proc) + +#define MPI_MAX OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_max) +#define MPI_MIN OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_min) +#define MPI_SUM OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_sum) +#define MPI_PROD OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_prod) +#define MPI_LAND OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_land) +#define MPI_BAND OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_band) +#define MPI_LOR OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_lor) +#define MPI_BOR OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_bor) +#define MPI_LXOR OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_lxor) +#define MPI_BXOR OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_bxor) +#define MPI_MAXLOC OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_maxloc) +#define MPI_MINLOC OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_minloc) +#define MPI_REPLACE OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_replace) +#define MPI_NO_OP OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_mpi_op_no_op) + +/* C datatypes */ +#define MPI_DATATYPE_NULL OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_datatype_null) +#define MPI_BYTE OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_byte) +#define MPI_PACKED OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_packed) +#define MPI_CHAR OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_char) +#define MPI_SHORT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_short) +#define MPI_INT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_int) +#define MPI_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_long) +#define MPI_FLOAT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_float) +#define MPI_DOUBLE OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_double) +#define MPI_LONG_DOUBLE OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_long_double) +#define MPI_UNSIGNED_CHAR OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_unsigned_char) +#define MPI_SIGNED_CHAR OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_signed_char) +#define MPI_UNSIGNED_SHORT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_unsigned_short) +#define MPI_UNSIGNED_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_unsigned_long) +#define MPI_UNSIGNED OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_unsigned) +#define MPI_FLOAT_INT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_float_int) +#define MPI_DOUBLE_INT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_double_int) +#define MPI_LONG_DOUBLE_INT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_longdbl_int) +#define MPI_LONG_INT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_long_int) +#define MPI_SHORT_INT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_short_int) +#define MPI_2INT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_2int) + +#define MPI_WCHAR OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_wchar) +#if OPAL_HAVE_LONG_LONG +#define MPI_LONG_LONG_INT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_long_long_int) +#define MPI_LONG_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_long_long_int) +#define 
MPI_UNSIGNED_LONG_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_unsigned_long_long)
+#endif /* OPAL_HAVE_LONG_LONG */
+#define MPI_2COMPLEX OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_2cplex)
+#define MPI_2DOUBLE_COMPLEX OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_2dblcplex)
+
+/* Fortran datatype bindings */
+#define MPI_CHARACTER OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_character)
+#define MPI_LOGICAL OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_logical)
+#if OMPI_HAVE_FORTRAN_LOGICAL1
+#define MPI_LOGICAL1 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_logical1)
+#endif
+#if OMPI_HAVE_FORTRAN_LOGICAL2
+#define MPI_LOGICAL2 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_logical2)
+#endif
+#if OMPI_HAVE_FORTRAN_LOGICAL4
+#define MPI_LOGICAL4 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_logical4)
+#endif
+#if OMPI_HAVE_FORTRAN_LOGICAL8
+#define MPI_LOGICAL8 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_logical8)
+#endif
+#define MPI_INTEGER OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_integer)
+#if OMPI_HAVE_FORTRAN_INTEGER1
+#define MPI_INTEGER1 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_integer1)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER2
+#define MPI_INTEGER2 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_integer2)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER4
+#define MPI_INTEGER4 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_integer4)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER8
+#define MPI_INTEGER8 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_integer8)
+#endif
+#if OMPI_HAVE_FORTRAN_INTEGER16
+#define MPI_INTEGER16 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_integer16)
+#endif
+#define MPI_REAL OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_real)
+#if OMPI_HAVE_FORTRAN_REAL2
+#define MPI_REAL2 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_real2)
+#endif
+#if OMPI_HAVE_FORTRAN_REAL4
+#define MPI_REAL4 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_real4)
+#endif
+#if OMPI_HAVE_FORTRAN_REAL8
+#define MPI_REAL8 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_real8)
+#endif
+#if OMPI_HAVE_FORTRAN_REAL16
+#define MPI_REAL16 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_real16)
+#endif
+#define MPI_DOUBLE_PRECISION OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_dblprec)
+#define MPI_COMPLEX OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_cplex)
+#if OMPI_HAVE_FORTRAN_REAL2
+#define MPI_COMPLEX4 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_complex4)
+#endif
+#if OMPI_HAVE_FORTRAN_REAL4
+#define MPI_COMPLEX8 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_complex8)
+#endif
+#if OMPI_HAVE_FORTRAN_REAL8
+#define MPI_COMPLEX16 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_complex16)
+#endif
+#if OMPI_HAVE_FORTRAN_REAL16
+#define MPI_COMPLEX32 OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_complex32)
+#endif
+#define MPI_DOUBLE_COMPLEX OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_dblcplex)
+#define MPI_2REAL OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_2real)
+#define MPI_2DOUBLE_PRECISION OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_2dblprec)
+#define MPI_2INTEGER OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_2integer)
+
+/* New datatypes from the MPI 2.2 standard */
+#define MPI_INT8_T OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_int8_t)
+#define MPI_UINT8_T OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_uint8_t)
+#define MPI_INT16_T OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_int16_t)
+#define MPI_UINT16_T OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_uint16_t)
+#define MPI_INT32_T OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_int32_t)
+#define MPI_UINT32_T OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_uint32_t)
+#define MPI_INT64_T OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_int64_t)
+#define MPI_UINT64_T OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_uint64_t)
+#define MPI_AINT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_aint)
+#define MPI_OFFSET OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_offset)
+#define MPI_C_BOOL OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_c_bool)
+#if HAVE_FLOAT__COMPLEX
+#define MPI_C_COMPLEX OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_c_float_complex)
+#define MPI_C_FLOAT_COMPLEX OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_c_float_complex)
+#endif
+#if HAVE_DOUBLE__COMPLEX
+#define MPI_C_DOUBLE_COMPLEX OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_c_double_complex)
+#endif
+#if HAVE_LONG_DOUBLE__COMPLEX
+#define MPI_C_LONG_DOUBLE_COMPLEX OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_c_long_double_complex)
+#endif
+#define MPI_CXX_BOOL OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_cxx_bool)
+#define MPI_CXX_COMPLEX OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_cxx_cplex)
+#define MPI_CXX_FLOAT_COMPLEX OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_cxx_cplex)
+#define MPI_CXX_DOUBLE_COMPLEX OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_cxx_dblcplex)
+#define MPI_CXX_LONG_DOUBLE_COMPLEX OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_cxx_ldblcplex)
+
+/* New datatypes from the 3.0 standard */
+#define MPI_COUNT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_mpi_count)
+
+#define MPI_ERRORS_ARE_FATAL OMPI_PREDEFINED_GLOBAL(MPI_Errhandler, ompi_mpi_errors_are_fatal)
+#define MPI_ERRORS_ABORT OMPI_PREDEFINED_GLOBAL(MPI_Errhandler, ompi_mpi_errors_abort)
+#define MPI_ERRORS_RETURN OMPI_PREDEFINED_GLOBAL(MPI_Errhandler, ompi_mpi_errors_return)
+
+/* Typeclass definition for MPI_Type_match_size */
+#define MPI_TYPECLASS_INTEGER 1
+#define MPI_TYPECLASS_REAL 2
+#define MPI_TYPECLASS_COMPLEX 3
+
+/* Aint helper macros (MPI-3.1) */
+#define MPI_Aint_add(base, disp) ((MPI_Aint) ((char *) (base) + (disp)))
+#define MPI_Aint_diff(addr1, addr2) ((MPI_Aint) ((char *) (addr1) - (char *) (addr2)))
+#define PMPI_Aint_add(base, disp) MPI_Aint_add(base, disp)
+#define PMPI_Aint_diff(addr1, addr2) MPI_Aint_diff(addr1, addr2)
+
+/*
+ * Predefined info keys
+ */
+#define MPI_INFO_KEY_SESSION_PSET_SIZE "size"
+
+/*
+ * MPI API
+ */
+
+OMPI_DECLSPEC int MPI_Abort(MPI_Comm comm, int errorcode);
+OMPI_DECLSPEC int MPI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+        int target_rank, MPI_Aint target_disp, int target_count,
+        MPI_Datatype target_datatype, MPI_Op op, MPI_Win win);
+OMPI_DECLSPEC int MPI_Add_error_class(int *errorclass);
+OMPI_DECLSPEC int MPI_Add_error_code(int errorclass, int *errorcode);
+OMPI_DECLSPEC int MPI_Add_error_string(int errorcode, const char *string);
+OMPI_DECLSPEC int MPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount,
+        MPI_Datatype recvtype, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount,
+        MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Allgather_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype,
+        MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Allgatherv_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Alloc_mem(MPI_Aint size, MPI_Info info,
+        void *baseptr);
+OMPI_DECLSPEC int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Iallreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Allreduce_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount,
+        MPI_Datatype recvtype, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Alltoall_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype,
+        MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Ialltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Alltoallv_init(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Alltoallw(const void *sendbuf, const int sendcounts[], const int sdispls[], const MPI_Datatype sendtypes[],
+        void *recvbuf, const int recvcounts[], const int rdispls[], const MPI_Datatype recvtypes[],
+        MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Ialltoallw(const void *sendbuf, const int sendcounts[], const int sdispls[], const MPI_Datatype sendtypes[],
+        void *recvbuf, const int recvcounts[], const int rdispls[], const MPI_Datatype recvtypes[],
+        MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Alltoallw_init(const void *sendbuf, const int sendcounts[], const int sdispls[], const MPI_Datatype sendtypes[],
+        void *recvbuf, const int recvcounts[], const int rdispls[], const MPI_Datatype recvtypes[],
+        MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Barrier(MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Ibarrier(MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Barrier_init(MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Bcast(void *buffer, int count, MPI_Datatype datatype,
+        int root, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Ibcast(void *buffer, int count, MPI_Datatype datatype,
+        int root, MPI_Comm comm, MPI_Request *request);
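+/*
+ * Usage sketch: the collectives above come in a blocking form, a
+ * nonblocking form (MPI_I..., returning an MPI_Request), and a
+ * persistent form (..._init, taking an MPI_Info). A minimal
+ * nonblocking pattern, assuming an already-initialized MPI
+ * environment, would be:
+ *
+ *   int data = 0;
+ *   MPI_Request req;
+ *   MPI_Ibcast(&data, 1, MPI_INT, 0, MPI_COMM_WORLD, &req);
+ *   // ...independent computation overlapped with the broadcast...
+ *   MPI_Wait(&req, MPI_STATUS_IGNORE);
+ */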
+OMPI_DECLSPEC int MPI_Bcast_init(void *buffer, int count, MPI_Datatype datatype,
+        int root, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Bsend(const void *buf, int count, MPI_Datatype datatype,
+        int dest, int tag, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Bsend_init(const void *buf, int count, MPI_Datatype datatype,
+        int dest, int tag, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Buffer_attach(void *buffer, int size);
+OMPI_DECLSPEC int MPI_Buffer_detach(void *buffer, int *size);
+OMPI_DECLSPEC int MPI_Cancel(MPI_Request *request);
+OMPI_DECLSPEC int MPI_Cart_coords(MPI_Comm comm, int rank, int maxdims, int coords[]);
+OMPI_DECLSPEC int MPI_Cart_create(MPI_Comm old_comm, int ndims, const int dims[],
+        const int periods[], int reorder, MPI_Comm *comm_cart);
+OMPI_DECLSPEC int MPI_Cart_get(MPI_Comm comm, int maxdims, int dims[],
+        int periods[], int coords[]);
+OMPI_DECLSPEC int MPI_Cart_map(MPI_Comm comm, int ndims, const int dims[],
+        const int periods[], int *newrank);
+OMPI_DECLSPEC int MPI_Cart_rank(MPI_Comm comm, const int coords[], int *rank);
+OMPI_DECLSPEC int MPI_Cart_shift(MPI_Comm comm, int direction, int disp,
+        int *rank_source, int *rank_dest);
+OMPI_DECLSPEC int MPI_Cart_sub(MPI_Comm comm, const int remain_dims[], MPI_Comm *new_comm);
+OMPI_DECLSPEC int MPI_Cartdim_get(MPI_Comm comm, int *ndims);
+OMPI_DECLSPEC int MPI_Close_port(const char *port_name);
+OMPI_DECLSPEC int MPI_Comm_accept(const char *port_name, MPI_Info info, int root,
+        MPI_Comm comm, MPI_Comm *newcomm);
+OMPI_DECLSPEC MPI_Fint MPI_Comm_c2f(MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Comm_call_errhandler(MPI_Comm comm, int errorcode);
+OMPI_DECLSPEC int MPI_Comm_compare(MPI_Comm comm1, MPI_Comm comm2, int *result);
+OMPI_DECLSPEC int MPI_Comm_connect(const char *port_name, MPI_Info info, int root,
+        MPI_Comm comm, MPI_Comm *newcomm);
+OMPI_DECLSPEC int MPI_Comm_create_errhandler(MPI_Comm_errhandler_function *function,
+        MPI_Errhandler *errhandler);
+OMPI_DECLSPEC int MPI_Comm_create_keyval(MPI_Comm_copy_attr_function *comm_copy_attr_fn,
+        MPI_Comm_delete_attr_function *comm_delete_attr_fn,
+        int *comm_keyval, void *extra_state);
+OMPI_DECLSPEC int MPI_Comm_create_group(MPI_Comm comm, MPI_Group group, int tag, MPI_Comm *newcomm);
+OMPI_DECLSPEC int MPI_Comm_create_from_group(MPI_Group group, const char *tag, MPI_Info info,
+        MPI_Errhandler errhandler, MPI_Comm *newcomm);
+OMPI_DECLSPEC int MPI_Comm_create(MPI_Comm comm, MPI_Group group, MPI_Comm *newcomm);
+OMPI_DECLSPEC int MPI_Comm_delete_attr(MPI_Comm comm, int comm_keyval);
+OMPI_DECLSPEC int MPI_Comm_disconnect(MPI_Comm *comm);
+OMPI_DECLSPEC int MPI_Comm_dup(MPI_Comm comm, MPI_Comm *newcomm);
+OMPI_DECLSPEC int MPI_Comm_idup(MPI_Comm comm, MPI_Comm *newcomm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Comm_dup_with_info(MPI_Comm comm, MPI_Info info, MPI_Comm *newcomm);
+OMPI_DECLSPEC int MPI_Comm_idup_with_info(MPI_Comm comm, MPI_Info info, MPI_Comm *newcomm,
+        MPI_Request *request);
+OMPI_DECLSPEC MPI_Comm MPI_Comm_f2c(MPI_Fint comm);
+OMPI_DECLSPEC int MPI_Comm_free_keyval(int *comm_keyval);
+OMPI_DECLSPEC int MPI_Comm_free(MPI_Comm *comm);
+OMPI_DECLSPEC int MPI_Comm_get_attr(MPI_Comm comm, int comm_keyval,
+        void *attribute_val, int *flag);
+OMPI_DECLSPEC int MPI_Dist_graph_create(MPI_Comm comm_old, int n, const int nodes[],
+        const int degrees[], const int targets[],
+        const int weights[], MPI_Info info,
+        int reorder, MPI_Comm * newcomm);
+OMPI_DECLSPEC int MPI_Dist_graph_create_adjacent(MPI_Comm comm_old,
+        int indegree, const int sources[],
+        const int sourceweights[],
+        int outdegree,
+        const int destinations[],
+        const int destweights[],
+        MPI_Info info, int reorder,
+        MPI_Comm *comm_dist_graph);
+OMPI_DECLSPEC int MPI_Dist_graph_neighbors(MPI_Comm comm, int maxindegree,
+        int sources[], int sourceweights[],
+        int maxoutdegree,
+        int destinations[],
+        int destweights[]);
+OMPI_DECLSPEC int MPI_Dist_graph_neighbors_count(MPI_Comm comm,
+        int *inneighbors,
+        int *outneighbors,
+        int *weighted);
+OMPI_DECLSPEC int MPI_Comm_get_errhandler(MPI_Comm comm, MPI_Errhandler *erhandler);
+OMPI_DECLSPEC int MPI_Comm_get_info(MPI_Comm comm, MPI_Info *info_used);
+OMPI_DECLSPEC int MPI_Comm_get_name(MPI_Comm comm, char *comm_name, int *resultlen);
+OMPI_DECLSPEC int MPI_Comm_get_parent(MPI_Comm *parent);
+OMPI_DECLSPEC int MPI_Comm_group(MPI_Comm comm, MPI_Group *group);
+OMPI_DECLSPEC int MPI_Comm_join(int fd, MPI_Comm *intercomm);
+OMPI_DECLSPEC int MPI_Comm_rank(MPI_Comm comm, int *rank);
+OMPI_DECLSPEC int MPI_Comm_remote_group(MPI_Comm comm, MPI_Group *group);
+OMPI_DECLSPEC int MPI_Comm_remote_size(MPI_Comm comm, int *size);
+OMPI_DECLSPEC int MPI_Comm_set_attr(MPI_Comm comm, int comm_keyval, void *attribute_val);
+OMPI_DECLSPEC int MPI_Comm_set_errhandler(MPI_Comm comm, MPI_Errhandler errhandler);
+OMPI_DECLSPEC int MPI_Comm_set_info(MPI_Comm comm, MPI_Info info);
+OMPI_DECLSPEC int MPI_Comm_set_name(MPI_Comm comm, const char *comm_name);
+OMPI_DECLSPEC int MPI_Comm_size(MPI_Comm comm, int *size);
+OMPI_DECLSPEC int MPI_Comm_spawn(const char *command, char *argv[], int maxprocs, MPI_Info info,
+        int root, MPI_Comm comm, MPI_Comm *intercomm,
+        int array_of_errcodes[]);
+OMPI_DECLSPEC int MPI_Comm_spawn_multiple(int count, char *array_of_commands[], char **array_of_argv[],
+        const int array_of_maxprocs[], const MPI_Info array_of_info[],
+        int root, MPI_Comm comm, MPI_Comm *intercomm,
+        int array_of_errcodes[]);
+OMPI_DECLSPEC int MPI_Comm_split(MPI_Comm comm, int color, int key, MPI_Comm *newcomm);
+OMPI_DECLSPEC int MPI_Comm_split_type(MPI_Comm comm, int split_type, int key, MPI_Info info, MPI_Comm *newcomm);
+OMPI_DECLSPEC int MPI_Comm_test_inter(MPI_Comm comm, int *flag);
+OMPI_DECLSPEC int MPI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
+        void *result_addr, MPI_Datatype datatype, int target_rank,
+        MPI_Aint target_disp, MPI_Win win);
+OMPI_DECLSPEC int MPI_Dims_create(int nnodes, int ndims, int dims[]);
+OMPI_DECLSPEC MPI_Fint MPI_Errhandler_c2f(MPI_Errhandler errhandler);
+OMPI_DECLSPEC MPI_Errhandler MPI_Errhandler_f2c(MPI_Fint errhandler);
+OMPI_DECLSPEC int MPI_Errhandler_free(MPI_Errhandler *errhandler);
+OMPI_DECLSPEC int MPI_Error_class(int errorcode, int *errorclass);
+OMPI_DECLSPEC int MPI_Error_string(int errorcode, char *string, int *resultlen);
+OMPI_DECLSPEC int MPI_Exscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Exscan_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Fetch_and_op(const void *origin_addr, void *result_addr, MPI_Datatype datatype,
+        int target_rank, MPI_Aint target_disp, MPI_Op op, MPI_Win win);
+OMPI_DECLSPEC MPI_Fint MPI_File_c2f(MPI_File file);
+OMPI_DECLSPEC MPI_File MPI_File_f2c(MPI_Fint file);
+OMPI_DECLSPEC int MPI_File_call_errhandler(MPI_File fh, int errorcode);
+OMPI_DECLSPEC int MPI_File_create_errhandler(MPI_File_errhandler_function *function,
+        MPI_Errhandler *errhandler);
+OMPI_DECLSPEC int MPI_File_set_errhandler( MPI_File file, MPI_Errhandler errhandler);
+OMPI_DECLSPEC int MPI_File_get_errhandler( MPI_File file, MPI_Errhandler *errhandler);
+OMPI_DECLSPEC int MPI_File_open(MPI_Comm comm, const char *filename, int amode,
+        MPI_Info info, MPI_File *fh);
+OMPI_DECLSPEC int MPI_File_close(MPI_File *fh);
+OMPI_DECLSPEC int MPI_File_delete(const char *filename, MPI_Info info);
+OMPI_DECLSPEC int MPI_File_set_size(MPI_File fh, MPI_Offset size);
+OMPI_DECLSPEC int MPI_File_preallocate(MPI_File fh, MPI_Offset size);
+OMPI_DECLSPEC int MPI_File_get_size(MPI_File fh, MPI_Offset *size);
+OMPI_DECLSPEC int MPI_File_get_group(MPI_File fh, MPI_Group *group);
+OMPI_DECLSPEC int MPI_File_get_amode(MPI_File fh, int *amode);
+OMPI_DECLSPEC int MPI_File_set_info(MPI_File fh, MPI_Info info);
+OMPI_DECLSPEC int MPI_File_get_info(MPI_File fh, MPI_Info *info_used);
+OMPI_DECLSPEC int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype,
+        MPI_Datatype filetype, const char *datarep, MPI_Info info);
+OMPI_DECLSPEC int MPI_File_get_view(MPI_File fh, MPI_Offset *disp,
+        MPI_Datatype *etype,
+        MPI_Datatype *filetype, char *datarep);
+OMPI_DECLSPEC int MPI_File_read_at(MPI_File fh, MPI_Offset offset, void *buf,
+        int count, MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_read_at_all(MPI_File fh, MPI_Offset offset, void *buf,
+        int count, MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_write_at(MPI_File fh, MPI_Offset offset, const void *buf,
+        int count, MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_write_at_all(MPI_File fh, MPI_Offset offset, const void *buf,
+        int count, MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_iread_at(MPI_File fh, MPI_Offset offset, void *buf,
+        int count, MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int MPI_File_iwrite_at(MPI_File fh, MPI_Offset offset, const void *buf,
+        int count, MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int MPI_File_iread_at_all(MPI_File fh, MPI_Offset offset, void *buf,
+        int count, MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int MPI_File_iwrite_at_all(MPI_File fh, MPI_Offset offset, const void *buf,
+        int count, MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int MPI_File_read(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_read_all(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_write(MPI_File fh, const void *buf, int count,
+        MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_write_all(MPI_File fh, const void *buf, int count,
+        MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_iread(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int MPI_File_iwrite(MPI_File fh, const void *buf, int count,
+        MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int MPI_File_iread_all(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int MPI_File_iwrite_all(MPI_File fh, const void *buf, int count,
+        MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int MPI_File_seek(MPI_File fh, MPI_Offset offset, int whence);
+OMPI_DECLSPEC int MPI_File_get_position(MPI_File fh, MPI_Offset *offset);
+OMPI_DECLSPEC int MPI_File_get_byte_offset(MPI_File fh, MPI_Offset offset,
+        MPI_Offset *disp);
+OMPI_DECLSPEC int MPI_File_read_shared(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_write_shared(MPI_File fh, const void *buf, int count,
+        MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_iread_shared(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int MPI_File_iwrite_shared(MPI_File fh, const void *buf, int count,
+        MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int MPI_File_read_ordered(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_write_ordered(MPI_File fh, const void *buf, int count,
+        MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_seek_shared(MPI_File fh, MPI_Offset offset, int whence);
+OMPI_DECLSPEC int MPI_File_get_position_shared(MPI_File fh, MPI_Offset *offset);
+OMPI_DECLSPEC int MPI_File_read_at_all_begin(MPI_File fh, MPI_Offset offset, void *buf,
+        int count, MPI_Datatype datatype);
+OMPI_DECLSPEC int MPI_File_read_at_all_end(MPI_File fh, void *buf, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_write_at_all_begin(MPI_File fh, MPI_Offset offset, const void *buf,
+        int count, MPI_Datatype datatype);
+OMPI_DECLSPEC int MPI_File_write_at_all_end(MPI_File fh, const void *buf, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_read_all_begin(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype);
+OMPI_DECLSPEC int MPI_File_read_all_end(MPI_File fh, void *buf, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_write_all_begin(MPI_File fh, const void *buf, int count,
+        MPI_Datatype datatype);
+OMPI_DECLSPEC int MPI_File_write_all_end(MPI_File fh, const void *buf, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_read_ordered_begin(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype);
+OMPI_DECLSPEC int MPI_File_read_ordered_end(MPI_File fh, void *buf, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_write_ordered_begin(MPI_File fh, const void *buf, int count,
+        MPI_Datatype datatype);
+OMPI_DECLSPEC int MPI_File_write_ordered_end(MPI_File fh, const void *buf, MPI_Status *status);
+OMPI_DECLSPEC int MPI_File_get_type_extent(MPI_File fh, MPI_Datatype datatype,
+        MPI_Aint *extent);
+OMPI_DECLSPEC int MPI_File_set_atomicity(MPI_File fh, int flag);
+OMPI_DECLSPEC int MPI_File_get_atomicity(MPI_File fh, int *flag);
+OMPI_DECLSPEC int MPI_File_sync(MPI_File fh);
+OMPI_DECLSPEC int MPI_Finalize(void);
+OMPI_DECLSPEC int MPI_Finalized(int *flag);
+OMPI_DECLSPEC int MPI_Free_mem(void *base);
+OMPI_DECLSPEC int MPI_Gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        int root, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        int root, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Gather_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        int root, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int displs[],
+        MPI_Datatype recvtype, int root, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype,
+        int root, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Gatherv_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype,
+        int root, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Get_address(const void *location, MPI_Aint *address);
+OMPI_DECLSPEC int MPI_Get_count(const MPI_Status *status, MPI_Datatype datatype, int *count);
+OMPI_DECLSPEC int MPI_Get_elements(const MPI_Status *status, MPI_Datatype datatype, int *count);
+OMPI_DECLSPEC int MPI_Get_elements_x(const MPI_Status *status, MPI_Datatype datatype, MPI_Count *count);
+OMPI_DECLSPEC int MPI_Get(void *origin_addr, int origin_count,
+        MPI_Datatype origin_datatype, int target_rank,
+        MPI_Aint target_disp, int target_count,
+        MPI_Datatype target_datatype, MPI_Win win);
+OMPI_DECLSPEC int MPI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+        void *result_addr, int result_count, MPI_Datatype result_datatype,
+        int target_rank, MPI_Aint target_disp, int target_count,
+        MPI_Datatype target_datatype, MPI_Op op, MPI_Win win);
+OMPI_DECLSPEC int MPI_Get_library_version(char *version, int *resultlen);
+OMPI_DECLSPEC int MPI_Get_processor_name(char *name, int *resultlen);
+OMPI_DECLSPEC int MPI_Get_version(int *version, int *subversion);
+OMPI_DECLSPEC int MPI_Graph_create(MPI_Comm comm_old, int nnodes, const int index[],
+        const int edges[], int reorder, MPI_Comm *comm_graph);
+OMPI_DECLSPEC int MPI_Graph_get(MPI_Comm comm, int maxindex, int maxedges,
+        int index[], int edges[]);
+OMPI_DECLSPEC int MPI_Graph_map(MPI_Comm comm, int nnodes, const int index[], const int edges[],
+        int *newrank);
+OMPI_DECLSPEC int MPI_Graph_neighbors_count(MPI_Comm comm, int rank, int *nneighbors);
+OMPI_DECLSPEC int MPI_Graph_neighbors(MPI_Comm comm, int rank, int maxneighbors,
+        int neighbors[]);
+OMPI_DECLSPEC int MPI_Graphdims_get(MPI_Comm comm, int *nnodes, int *nedges);
+OMPI_DECLSPEC int MPI_Grequest_complete(MPI_Request request);
+OMPI_DECLSPEC int MPI_Grequest_start(MPI_Grequest_query_function *query_fn,
+        MPI_Grequest_free_function *free_fn,
+        MPI_Grequest_cancel_function *cancel_fn,
+        void *extra_state, MPI_Request *request);
+OMPI_DECLSPEC MPI_Fint MPI_Group_c2f(MPI_Group group);
+OMPI_DECLSPEC int MPI_Group_compare(MPI_Group group1, MPI_Group group2, int *result);
+OMPI_DECLSPEC int MPI_Group_difference(MPI_Group group1, MPI_Group group2,
+        MPI_Group *newgroup);
+OMPI_DECLSPEC int MPI_Group_excl(MPI_Group group, int n, const int ranks[],
+        MPI_Group *newgroup);
+OMPI_DECLSPEC MPI_Group MPI_Group_f2c(MPI_Fint group);
+OMPI_DECLSPEC int MPI_Group_free(MPI_Group *group);
+OMPI_DECLSPEC int MPI_Group_from_session_pset (MPI_Session session, const char *pset_name, MPI_Group *newgroup);
+OMPI_DECLSPEC int MPI_Group_incl(MPI_Group group, int n, const int ranks[],
+        MPI_Group *newgroup);
+OMPI_DECLSPEC int MPI_Group_intersection(MPI_Group group1, MPI_Group group2,
+        MPI_Group *newgroup);
+OMPI_DECLSPEC int MPI_Group_range_excl(MPI_Group group, int n, int ranges[][3],
+        MPI_Group *newgroup);
+OMPI_DECLSPEC int MPI_Group_range_incl(MPI_Group group, int n, int ranges[][3],
+        MPI_Group *newgroup);
+OMPI_DECLSPEC int MPI_Group_rank(MPI_Group group, int *rank);
+OMPI_DECLSPEC int MPI_Group_size(MPI_Group group, int *size);
+OMPI_DECLSPEC int MPI_Group_translate_ranks(MPI_Group group1, int n, const int ranks1[],
+        MPI_Group group2, int ranks2[]);
+OMPI_DECLSPEC int MPI_Group_union(MPI_Group group1, MPI_Group group2,
+        MPI_Group *newgroup);
+OMPI_DECLSPEC int MPI_Ibsend(const void *buf, int count, MPI_Datatype datatype, int dest,
+        int tag, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Improbe(int source, int tag, MPI_Comm comm,
+        int *flag, MPI_Message *message,
+        MPI_Status *status);
+OMPI_DECLSPEC int MPI_Imrecv(void *buf, int count, MPI_Datatype type,
+        MPI_Message *message, MPI_Request *request);
+OMPI_DECLSPEC MPI_Fint MPI_Info_c2f(MPI_Info info);
+OMPI_DECLSPEC int MPI_Info_create(MPI_Info *info);
+OMPI_DECLSPEC int MPI_Info_delete(MPI_Info info, const char *key);
+OMPI_DECLSPEC int MPI_Info_dup(MPI_Info info, MPI_Info *newinfo);
+OMPI_DECLSPEC MPI_Info MPI_Info_f2c(MPI_Fint info);
+OMPI_DECLSPEC int MPI_Info_free(MPI_Info *info);
+OMPI_DECLSPEC int MPI_Info_get(MPI_Info info, const char *key, int valuelen,
+        char *value, int *flag);
+OMPI_DECLSPEC int MPI_Info_get_nkeys(MPI_Info info, int *nkeys);
+OMPI_DECLSPEC int MPI_Info_get_nthkey(MPI_Info info, int n, char *key);
+OMPI_DECLSPEC int MPI_Info_get_valuelen(MPI_Info info, const char *key, int *valuelen,
+        int *flag);
+OMPI_DECLSPEC int MPI_Info_get_string(MPI_Info info, const char *key, int *buflen,
+        char *value, int *flag);
+OMPI_DECLSPEC int MPI_Info_set(MPI_Info info, const char *key, const char *value);
+OMPI_DECLSPEC int MPI_Init(int *argc, char ***argv);
+OMPI_DECLSPEC int MPI_Initialized(int *flag);
+OMPI_DECLSPEC int MPI_Init_thread(int *argc, char ***argv, int required,
+        int *provided);
+OMPI_DECLSPEC int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader,
+        MPI_Comm bridge_comm, int remote_leader,
+        int tag, MPI_Comm *newintercomm);
+OMPI_DECLSPEC int MPI_Intercomm_create_from_groups (MPI_Group local_group, int local_leader, MPI_Group remote_group,
+        int remote_leader, const char *tag, MPI_Info info, MPI_Errhandler errhandler,
+        MPI_Comm *newintercomm);
+OMPI_DECLSPEC int MPI_Intercomm_merge(MPI_Comm intercomm, int high,
+        MPI_Comm *newintercomm);
+OMPI_DECLSPEC int MPI_Iprobe(int source, int tag, MPI_Comm comm, int *flag,
+        MPI_Status *status);
+OMPI_DECLSPEC int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source,
+        int tag, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Irsend(const void *buf, int count, MPI_Datatype datatype, int dest,
+        int tag, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest,
+        int tag, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Isendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        int dest, int sendtag, void *recvbuf, int recvcount,
+        MPI_Datatype recvtype, int source, int recvtag,
+        MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Isendrecv_replace(void * buf, int count, MPI_Datatype datatype,
+        int dest, int sendtag, int source, int recvtag,
+        MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Issend(const void *buf, int count, MPI_Datatype datatype, int dest,
+        int tag, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Is_thread_main(int *flag);
+OMPI_DECLSPEC int MPI_Lookup_name(const char *service_name, MPI_Info info, char *port_name);
+OMPI_DECLSPEC MPI_Fint MPI_Message_c2f(MPI_Message message);
+OMPI_DECLSPEC MPI_Message MPI_Message_f2c(MPI_Fint message);
+OMPI_DECLSPEC int MPI_Mprobe(int source, int tag, MPI_Comm comm,
+        MPI_Message *message,
+        MPI_Status *status);
+OMPI_DECLSPEC int MPI_Mrecv(void *buf, int count, MPI_Datatype type,
+        MPI_Message *message, MPI_Status *status);
+OMPI_DECLSPEC int MPI_Neighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Ineighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Neighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int displs[],
+        MPI_Datatype recvtype, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Neighbor_allgather_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Ineighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Neighbor_allgatherv_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Neighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Ineighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Neighbor_alltoall_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Neighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype,
+        MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Ineighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Neighbor_alltoallv_init(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Neighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], const MPI_Datatype sendtypes[],
+        void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], const MPI_Datatype recvtypes[],
+        MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Ineighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], const MPI_Datatype sendtypes[],
+        void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], const MPI_Datatype recvtypes[],
+        MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Neighbor_alltoallw_init(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], const MPI_Datatype sendtypes[],
+        void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], const MPI_Datatype recvtypes[],
+        MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC MPI_Fint MPI_Op_c2f(MPI_Op op);
+OMPI_DECLSPEC int MPI_Op_commutative(MPI_Op op, int *commute);
+OMPI_DECLSPEC int MPI_Op_create(MPI_User_function *function, int commute, MPI_Op *op);
+OMPI_DECLSPEC int MPI_Open_port(MPI_Info info, char *port_name);
+OMPI_DECLSPEC MPI_Op MPI_Op_f2c(MPI_Fint op);
+OMPI_DECLSPEC int MPI_Op_free(MPI_Op *op);
+OMPI_DECLSPEC int MPI_Pack_external(const char datarep[], const void *inbuf, int incount,
+        MPI_Datatype datatype, void *outbuf,
+        MPI_Aint outsize, MPI_Aint *position);
+OMPI_DECLSPEC int MPI_Pack_external_size(const char datarep[], int incount,
+        MPI_Datatype datatype, MPI_Aint *size);
+OMPI_DECLSPEC int MPI_Pack(const void *inbuf, int incount, MPI_Datatype datatype,
+        void *outbuf, int outsize, int *position, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Pack_size(int incount, MPI_Datatype datatype, MPI_Comm comm,
+        int *size);
+OMPI_DECLSPEC int MPI_Parrived(MPI_Request request, MPI_Count partition, int *flag);
+OMPI_DECLSPEC int MPI_Pcontrol(const int level, ...);
+OMPI_DECLSPEC int MPI_Pready(int partitions, MPI_Request request);
+OMPI_DECLSPEC int MPI_Pready_range(int partition_low, int partition_high,
+        MPI_Request request);
+OMPI_DECLSPEC int MPI_Pready_list(int length, int partition_list[], MPI_Request request);
+OMPI_DECLSPEC int MPI_Precv_init(void* buf, int partitions, MPI_Count count,
+        MPI_Datatype datatype, int source, int tag, MPI_Comm comm,
+        MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Probe(int source, int tag, MPI_Comm comm, MPI_Status *status);
+OMPI_DECLSPEC int MPI_Psend_init(const void* buf, int partitions, MPI_Count count,
+        MPI_Datatype datatype, int dest, int tag, MPI_Comm comm,
+        MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Publish_name(const char *service_name, MPI_Info info,
+        const char *port_name);
+OMPI_DECLSPEC int MPI_Put(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+        int target_rank, MPI_Aint target_disp, int target_count,
+        MPI_Datatype target_datatype, MPI_Win win);
+OMPI_DECLSPEC int MPI_Query_thread(int *provided);
+OMPI_DECLSPEC int MPI_Raccumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+        int target_rank, MPI_Aint target_disp, int target_count,
+        MPI_Datatype target_datatype, MPI_Op op, MPI_Win win, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Recv_init(void *buf, int count, MPI_Datatype datatype, int source,
+        int tag, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source,
+        int tag, MPI_Comm comm, MPI_Status *status);
+OMPI_DECLSPEC int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, int root, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Ireduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, int root, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Reduce_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, int root, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Reduce_local(const void *inbuf, void *inoutbuf, int count,
+        MPI_Datatype datatype, MPI_Op op);
+OMPI_DECLSPEC int MPI_Reduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[], MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Ireduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[], MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Reduce_scatter_init(const void *sendbuf, void *recvbuf, const int recvcounts[], MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Reduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Ireduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Reduce_scatter_block_init(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Register_datarep(const char *datarep,
+        MPI_Datarep_conversion_function *read_conversion_fn,
+        MPI_Datarep_conversion_function *write_conversion_fn,
+        MPI_Datarep_extent_function *dtype_file_extent_fn,
+        void *extra_state);
+OMPI_DECLSPEC MPI_Fint MPI_Request_c2f(MPI_Request request);
+OMPI_DECLSPEC MPI_Request MPI_Request_f2c(MPI_Fint request);
+OMPI_DECLSPEC int MPI_Request_free(MPI_Request *request);
+OMPI_DECLSPEC int MPI_Request_get_status(MPI_Request request, int *flag,
+        MPI_Status *status);
+OMPI_DECLSPEC int MPI_Rget(void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+        int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype,
+        MPI_Win win, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Rget_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+        void *result_addr, int result_count, MPI_Datatype result_datatype,
+        int target_rank, MPI_Aint target_disp, int target_count,
+        MPI_Datatype target_datatype, MPI_Op op,
+        MPI_Win win, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Rput(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+        int target_rank, MPI_Aint target_disp, int target_cout,
+        MPI_Datatype target_datatype, MPI_Win win, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Rsend(const void *ibuf, int count, MPI_Datatype datatype, int dest,
+        int tag, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Rsend_init(const void *buf, int count, MPI_Datatype datatype,
+        int dest, int tag, MPI_Comm comm,
+        MPI_Request *request);
+OMPI_DECLSPEC int MPI_Scan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Iscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Scan_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        int root, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Iscatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        int root, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Scatter_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        int root, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Scatterv(const void *sendbuf, const int sendcounts[], const int displs[], MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        int root, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Iscatterv(const void *sendbuf, const int sendcounts[], const int displs[], MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        int root, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Scatterv_init(const void *sendbuf, const int sendcounts[], const int displs[], MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        int root, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int MPI_Send_init(const void *buf, int count, MPI_Datatype datatype,
+        int dest, int tag, MPI_Comm comm,
+        MPI_Request *request);
+OMPI_DECLSPEC int MPI_Send(const void *buf, int count, MPI_Datatype datatype, int dest,
+        int tag, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        int dest, int sendtag, void *recvbuf, int recvcount,
+        MPI_Datatype recvtype, int source, int recvtag,
+        MPI_Comm comm, MPI_Status *status);
+OMPI_DECLSPEC int MPI_Sendrecv_replace(void * buf, int count, MPI_Datatype datatype,
+        int dest, int sendtag, int source, int recvtag,
+        MPI_Comm comm, MPI_Status *status);
+OMPI_DECLSPEC MPI_Fint MPI_Session_c2f (const MPI_Session session);
+OMPI_DECLSPEC int MPI_Session_create_errhandler (MPI_Session_errhandler_function *session_errhandler_fn,
+        MPI_Errhandler *errhandler);
+OMPI_DECLSPEC int MPI_Session_finalize (MPI_Session *session);
+OMPI_DECLSPEC int MPI_Session_get_info (MPI_Session session, MPI_Info *info_used);
+OMPI_DECLSPEC int MPI_Session_get_num_psets (MPI_Session session, MPI_Info info, int *npset_names);
+OMPI_DECLSPEC int MPI_Session_get_nth_pset (MPI_Session session, MPI_Info info, int n, int *len, char *pset_name);
+OMPI_DECLSPEC int MPI_Session_get_pset_info (MPI_Session session, const char *pset_name, MPI_Info *info_used);
+OMPI_DECLSPEC int MPI_Session_init (MPI_Info info, MPI_Errhandler errhandler,
+        MPI_Session *session);
+OMPI_DECLSPEC MPI_Session MPI_Session_f2c (MPI_Fint session);
+OMPI_DECLSPEC int MPI_Session_set_info (MPI_Session session, MPI_Info info);
+OMPI_DECLSPEC int MPI_Ssend_init(const void *buf, int count, MPI_Datatype datatype,
+        int dest, int tag, MPI_Comm comm,
+        MPI_Request *request);
+OMPI_DECLSPEC int MPI_Ssend(const void *buf, int count, MPI_Datatype datatype, int dest,
+        int tag, MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Start(MPI_Request *request);
+OMPI_DECLSPEC int MPI_Startall(int count, MPI_Request array_of_requests[]);
+OMPI_DECLSPEC int MPI_Status_c2f(const MPI_Status *c_status, MPI_Fint *f_status);
+OMPI_DECLSPEC int MPI_Status_c2f08(const MPI_Status *c_status, MPI_F08_status *f08_status);
+OMPI_DECLSPEC int MPI_Status_f082c(const MPI_F08_status *f08_status, MPI_Status *c_status);
+OMPI_DECLSPEC int MPI_Status_f082f(const MPI_F08_status *f08_status, MPI_Fint *f_status);
+OMPI_DECLSPEC int MPI_Status_f2c(const MPI_Fint *f_status, MPI_Status *c_status);
+OMPI_DECLSPEC int MPI_Status_f2f08(const MPI_Fint *f_status, MPI_F08_status *f08_status);
+OMPI_DECLSPEC int MPI_Status_set_cancelled(MPI_Status *status, int flag);
+OMPI_DECLSPEC int MPI_Status_set_elements(MPI_Status *status, MPI_Datatype datatype,
+        int count);
+OMPI_DECLSPEC int MPI_Status_set_elements_x(MPI_Status *status, MPI_Datatype datatype,
+        MPI_Count count);
+OMPI_DECLSPEC int MPI_Testall(int count, MPI_Request array_of_requests[], int *flag,
+        MPI_Status array_of_statuses[]);
+OMPI_DECLSPEC int MPI_Testany(int count, MPI_Request array_of_requests[], int *index,
+        int *flag, MPI_Status *status);
+OMPI_DECLSPEC int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status);
+OMPI_DECLSPEC int MPI_Test_cancelled(const MPI_Status *status, int *flag);
+OMPI_DECLSPEC int MPI_Testsome(int incount, MPI_Request array_of_requests[],
+        int *outcount, int array_of_indices[],
+        MPI_Status array_of_statuses[]);
+OMPI_DECLSPEC int MPI_Topo_test(MPI_Comm comm, int *status);
+OMPI_DECLSPEC MPI_Fint MPI_Type_c2f(MPI_Datatype datatype);
+OMPI_DECLSPEC int MPI_Type_commit(MPI_Datatype *type);
+OMPI_DECLSPEC int MPI_Type_contiguous(int count, MPI_Datatype oldtype,
+        MPI_Datatype *newtype);
+OMPI_DECLSPEC int MPI_Type_create_darray(int size, int rank, int ndims,
+        const int gsize_array[], const int distrib_array[],
+        const int darg_array[], const int psize_array[],
+        int order, MPI_Datatype oldtype,
+        MPI_Datatype *newtype);
+OMPI_DECLSPEC int MPI_Type_create_f90_complex(int p, int r, MPI_Datatype *newtype);
+OMPI_DECLSPEC int MPI_Type_create_f90_integer(int r, MPI_Datatype *newtype);
+OMPI_DECLSPEC int MPI_Type_create_f90_real(int p, int r, MPI_Datatype *newtype);
+OMPI_DECLSPEC int MPI_Type_create_hindexed_block(int count, int blocklength,
+        const MPI_Aint array_of_displacements[],
+        MPI_Datatype oldtype,
+        MPI_Datatype *newtype);
+OMPI_DECLSPEC int MPI_Type_create_hindexed(int count, const int array_of_blocklengths[],
+        const MPI_Aint array_of_displacements[],
+        MPI_Datatype oldtype,
+        MPI_Datatype *newtype);
+OMPI_DECLSPEC int MPI_Type_create_hvector(int count, int blocklength, MPI_Aint stride,
+        MPI_Datatype oldtype,
+        MPI_Datatype *newtype);
+OMPI_DECLSPEC int MPI_Type_create_keyval(MPI_Type_copy_attr_function *type_copy_attr_fn,
+        MPI_Type_delete_attr_function *type_delete_attr_fn,
+        int *type_keyval, void *extra_state);
+OMPI_DECLSPEC int MPI_Type_create_indexed_block(int count, int blocklength,
+        const int array_of_displacements[],
+        MPI_Datatype oldtype,
+        MPI_Datatype *newtype);
+OMPI_DECLSPEC int MPI_Type_create_struct(int count, const int array_of_block_lengths[],
+        const MPI_Aint array_of_displacements[],
+        const MPI_Datatype array_of_types[],
+        MPI_Datatype *newtype);
+OMPI_DECLSPEC int MPI_Type_create_subarray(int ndims, const int size_array[], const int subsize_array[],
+        const int start_array[], int order,
+        MPI_Datatype oldtype, MPI_Datatype *newtype);
+OMPI_DECLSPEC int MPI_Type_create_resized(MPI_Datatype oldtype, MPI_Aint lb,
+        MPI_Aint extent, MPI_Datatype *newtype);
+OMPI_DECLSPEC int MPI_Type_delete_attr(MPI_Datatype type, int type_keyval);
+OMPI_DECLSPEC int MPI_Type_dup(MPI_Datatype type, MPI_Datatype *newtype);
+OMPI_DECLSPEC int MPI_Type_free(MPI_Datatype *type);
+OMPI_DECLSPEC int MPI_Type_free_keyval(int *type_keyval);
+OMPI_DECLSPEC MPI_Datatype MPI_Type_f2c(MPI_Fint datatype);
+OMPI_DECLSPEC int MPI_Type_get_attr(MPI_Datatype type, int type_keyval,
+        void *attribute_val, int *flag);
+OMPI_DECLSPEC int MPI_Type_get_contents(MPI_Datatype mtype, int max_integers,
+        int max_addresses, int max_datatypes,
+        int array_of_integers[],
+        MPI_Aint array_of_addresses[],
+        MPI_Datatype array_of_datatypes[]);
+OMPI_DECLSPEC int MPI_Type_get_envelope(MPI_Datatype type, int *num_integers,
+        int *num_addresses, int *num_datatypes,
+        int *combiner);
+OMPI_DECLSPEC int MPI_Type_get_extent(MPI_Datatype type, MPI_Aint *lb,
+        MPI_Aint *extent);
+OMPI_DECLSPEC int MPI_Type_get_extent_x(MPI_Datatype type, MPI_Count *lb,
+        MPI_Count *extent);
+OMPI_DECLSPEC int MPI_Type_get_name(MPI_Datatype type, char *type_name,
+        int *resultlen);
+OMPI_DECLSPEC int MPI_Type_get_true_extent(MPI_Datatype datatype, MPI_Aint *true_lb,
+        MPI_Aint *true_extent);
+OMPI_DECLSPEC int MPI_Type_get_true_extent_x(MPI_Datatype datatype, MPI_Count *true_lb,
+        MPI_Count *true_extent);
+OMPI_DECLSPEC int MPI_Type_indexed(int count, const int array_of_blocklengths[],
+        const int array_of_displacements[],
+        MPI_Datatype oldtype, MPI_Datatype *newtype);
+OMPI_DECLSPEC int MPI_Type_match_size(int typeclass, int size, MPI_Datatype *type);
+OMPI_DECLSPEC int MPI_Type_set_attr(MPI_Datatype type, int type_keyval,
+        void *attr_val);
+OMPI_DECLSPEC int MPI_Type_set_name(MPI_Datatype type, const char *type_name);
+OMPI_DECLSPEC int MPI_Type_size(MPI_Datatype type, int *size);
+OMPI_DECLSPEC int MPI_Type_size_x(MPI_Datatype type, MPI_Count *size);
+OMPI_DECLSPEC int MPI_Type_vector(int count, int blocklength, int stride,
+        MPI_Datatype oldtype, MPI_Datatype *newtype);
+OMPI_DECLSPEC int MPI_Unpack(const void *inbuf, int insize, int *position,
+        void *outbuf, int outcount, MPI_Datatype datatype,
+        MPI_Comm comm);
+OMPI_DECLSPEC int MPI_Unpublish_name(const char *service_name, MPI_Info info, const char *port_name);
+OMPI_DECLSPEC int MPI_Unpack_external (const char datarep[], const void *inbuf, MPI_Aint insize,
+        MPI_Aint *position, void *outbuf, int outcount,
+        MPI_Datatype datatype);
+OMPI_DECLSPEC int MPI_Waitall(int count, MPI_Request array_of_requests[],
+        MPI_Status *array_of_statuses);
+OMPI_DECLSPEC int MPI_Waitany(int count, MPI_Request array_of_requests[],
+        int *index, MPI_Status *status);
+OMPI_DECLSPEC int MPI_Wait(MPI_Request *request, MPI_Status *status);
+OMPI_DECLSPEC int MPI_Waitsome(int incount, MPI_Request array_of_requests[],
+        int *outcount, int array_of_indices[],
+        MPI_Status array_of_statuses[]);
+OMPI_DECLSPEC int MPI_Win_allocate(MPI_Aint size, int disp_unit, MPI_Info info,
+        MPI_Comm comm, void *baseptr, MPI_Win *win);
+OMPI_DECLSPEC int MPI_Win_allocate_shared(MPI_Aint size, int disp_unit, MPI_Info info,
+        MPI_Comm comm, void *baseptr, MPI_Win *win);
+OMPI_DECLSPEC int MPI_Win_attach(MPI_Win win, void *base, MPI_Aint size);
+OMPI_DECLSPEC MPI_Fint MPI_Win_c2f(MPI_Win win);
+OMPI_DECLSPEC int MPI_Win_call_errhandler(MPI_Win win, int errorcode);
+OMPI_DECLSPEC int MPI_Win_complete(MPI_Win win);
+OMPI_DECLSPEC int MPI_Win_create(void *base, MPI_Aint size, int disp_unit,
+        MPI_Info info, MPI_Comm comm, MPI_Win *win);
+OMPI_DECLSPEC int MPI_Win_create_dynamic(MPI_Info info, MPI_Comm comm, MPI_Win *win);
+OMPI_DECLSPEC int MPI_Win_create_errhandler(MPI_Win_errhandler_function *function,
+        MPI_Errhandler *errhandler);
+OMPI_DECLSPEC int MPI_Win_create_keyval(MPI_Win_copy_attr_function *win_copy_attr_fn,
+        MPI_Win_delete_attr_function *win_delete_attr_fn,
+        int *win_keyval, void *extra_state);
+OMPI_DECLSPEC int MPI_Win_delete_attr(MPI_Win win, int win_keyval);
+OMPI_DECLSPEC int MPI_Win_detach(MPI_Win win, const void *base);
+OMPI_DECLSPEC MPI_Win MPI_Win_f2c(MPI_Fint win);
+OMPI_DECLSPEC int MPI_Win_fence(int mpi_assert, MPI_Win win);
+OMPI_DECLSPEC int MPI_Win_flush(int rank, MPI_Win win);
+OMPI_DECLSPEC int MPI_Win_flush_all(MPI_Win win);
+OMPI_DECLSPEC int MPI_Win_flush_local(int rank, MPI_Win win);
+OMPI_DECLSPEC int MPI_Win_flush_local_all(MPI_Win win);
+OMPI_DECLSPEC int MPI_Win_free(MPI_Win *win);
+OMPI_DECLSPEC int MPI_Win_free_keyval(int *win_keyval);
+OMPI_DECLSPEC int MPI_Win_get_attr(MPI_Win win, int win_keyval,
+        void *attribute_val, int *flag);
+OMPI_DECLSPEC int MPI_Win_get_errhandler(MPI_Win win, MPI_Errhandler *errhandler);
+OMPI_DECLSPEC int MPI_Win_get_group(MPI_Win win, MPI_Group *group);
+OMPI_DECLSPEC int MPI_Win_get_info(MPI_Win win, MPI_Info *info_used);
+OMPI_DECLSPEC int MPI_Win_get_name(MPI_Win win, char *win_name, int *resultlen);
+OMPI_DECLSPEC int MPI_Win_lock(int lock_type, int rank, int mpi_assert, MPI_Win win);
+OMPI_DECLSPEC int MPI_Win_lock_all(int mpi_assert, MPI_Win win);
+OMPI_DECLSPEC int MPI_Win_post(MPI_Group group, int mpi_assert, MPI_Win win);
+OMPI_DECLSPEC int MPI_Win_set_attr(MPI_Win win, int win_keyval, void *attribute_val);
+OMPI_DECLSPEC int MPI_Win_set_errhandler(MPI_Win win, MPI_Errhandler errhandler);
+OMPI_DECLSPEC int MPI_Win_set_info(MPI_Win win, MPI_Info info);
+OMPI_DECLSPEC int MPI_Win_set_name(MPI_Win win, const char *win_name);
+OMPI_DECLSPEC int MPI_Win_shared_query(MPI_Win win, int rank, MPI_Aint *size, int *disp_unit, void *baseptr);
+OMPI_DECLSPEC int MPI_Win_start(MPI_Group group, int mpi_assert, MPI_Win win);
+OMPI_DECLSPEC int MPI_Win_sync(MPI_Win win);
+OMPI_DECLSPEC int MPI_Win_test(MPI_Win win, int *flag);
+OMPI_DECLSPEC int MPI_Win_unlock(int rank, MPI_Win win);
+OMPI_DECLSPEC int MPI_Win_unlock_all(MPI_Win win);
+OMPI_DECLSPEC int MPI_Win_wait(MPI_Win win);
+OMPI_DECLSPEC double MPI_Wtick(void);
+OMPI_DECLSPEC double MPI_Wtime(void);
+
+ /*
+  * Profiling MPI API
+  */
+OMPI_DECLSPEC int PMPI_Abort(MPI_Comm comm, int errorcode);
+OMPI_DECLSPEC int PMPI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+        int target_rank, MPI_Aint target_disp, int target_count,
+        MPI_Datatype target_datatype, MPI_Op op, MPI_Win win);
+OMPI_DECLSPEC int PMPI_Add_error_class(int *errorclass);
+OMPI_DECLSPEC int PMPI_Add_error_code(int errorclass, int *errorcode);
+OMPI_DECLSPEC int PMPI_Add_error_string(int errorcode, const char *string);
+OMPI_DECLSPEC int PMPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount,
+        MPI_Datatype recvtype, MPI_Comm comm);
+OMPI_DECLSPEC int PMPI_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount,
+        MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Allgather_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[],
+        const int displs[], MPI_Datatype recvtype, MPI_Comm comm);
+OMPI_DECLSPEC int PMPI_Iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Allgatherv_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Alloc_mem(MPI_Aint size, MPI_Info info,
+        void *baseptr);
+OMPI_DECLSPEC int PMPI_Allreduce(const void *sendbuf, void *recvbuf, int count,
+        MPI_Datatype datatype, MPI_Op op, MPI_Comm comm);
+OMPI_DECLSPEC int PMPI_Iallreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Allreduce_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount,
+        MPI_Datatype recvtype, MPI_Comm comm);
+OMPI_DECLSPEC int PMPI_Ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Alltoall_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+        void *recvbuf, int recvcount, MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype,
+        MPI_Comm comm);
+OMPI_DECLSPEC int PMPI_Ialltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Alltoallv_init(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype,
+        void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype,
+        MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Alltoallw(const void *sendbuf, const int sendcounts[], const int sdispls[], const MPI_Datatype sendtypes[],
+        void *recvbuf, const int recvcounts[], const int rdispls[], const MPI_Datatype recvtypes[],
+        MPI_Comm comm);
+OMPI_DECLSPEC int PMPI_Ialltoallw(const void *sendbuf, const int sendcounts[], const int sdispls[], const MPI_Datatype sendtypes[],
+        void *recvbuf, const int recvcounts[], const int rdispls[], const MPI_Datatype recvtypes[],
+        MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Alltoallw_init(const void *sendbuf, const int sendcounts[], const int sdispls[], const MPI_Datatype sendtypes[],
+        void *recvbuf, const int recvcounts[], const int rdispls[], const MPI_Datatype recvtypes[],
+        MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Dist_graph_create(MPI_Comm comm_old, int n, const int nodes[],
+        const int degrees[], const int targets[],
+        const int weights[], MPI_Info info,
+        int reorder, MPI_Comm * newcomm);
+OMPI_DECLSPEC int PMPI_Dist_graph_create_adjacent(MPI_Comm comm_old,
+        int indegree, const int sources[],
+        const int sourceweights[],
+        int outdegree,
+        const int destinations[],
+        const int destweights[],
+        MPI_Info info, int reorder,
+        MPI_Comm *comm_dist_graph);
+OMPI_DECLSPEC int PMPI_Dist_graph_neighbors(MPI_Comm comm, int maxindegree,
+        int sources[], int sourceweights[],
+        int maxoutdegree,
+        int destinations[],
+        int destweights[]);
+OMPI_DECLSPEC int PMPI_Dist_graph_neighbors_count(MPI_Comm comm,
+        int *inneighbors,
+        int *outneighbors,
+        int *weighted);
+OMPI_DECLSPEC int PMPI_Barrier(MPI_Comm comm);
+OMPI_DECLSPEC int PMPI_Ibarrier(MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Barrier_init(MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Bcast(void *buffer, int count, MPI_Datatype datatype,
+        int root, MPI_Comm comm);
+OMPI_DECLSPEC int PMPI_Ibcast(void *buffer, int count, MPI_Datatype datatype,
+        int root, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Bcast_init(void *buffer, int count, MPI_Datatype datatype,
+        int root, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Bsend(const void *buf, int count, MPI_Datatype datatype,
+        int dest, int tag, MPI_Comm comm);
+OMPI_DECLSPEC int PMPI_Bsend_init(const void *buf, int count, MPI_Datatype datatype,
+        int dest, int tag, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Buffer_attach(void *buffer, int size);
+OMPI_DECLSPEC int PMPI_Buffer_detach(void *buffer, int *size);
+OMPI_DECLSPEC int PMPI_Cancel(MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Cart_coords(MPI_Comm comm, int rank, int maxdims, int coords[]);
+OMPI_DECLSPEC int PMPI_Cart_create(MPI_Comm old_comm, int ndims, const int dims[],
+        const int periods[], int reorder, MPI_Comm *comm_cart);
+OMPI_DECLSPEC int PMPI_Cart_get(MPI_Comm comm, int maxdims, int dims[],
+        int periods[], int coords[]);
+OMPI_DECLSPEC int PMPI_Cart_map(MPI_Comm comm, int ndims, const int dims[],
+        const int periods[], int *newrank);
+OMPI_DECLSPEC int PMPI_Cart_rank(MPI_Comm comm, const int coords[], int *rank);
+OMPI_DECLSPEC int PMPI_Cart_shift(MPI_Comm comm, int direction, int disp,
+        int *rank_source, int *rank_dest);
+OMPI_DECLSPEC int PMPI_Cart_sub(MPI_Comm comm, const int remain_dims[], MPI_Comm *new_comm);
+OMPI_DECLSPEC int PMPI_Cartdim_get(MPI_Comm comm, int *ndims);
+OMPI_DECLSPEC int PMPI_Close_port(const char *port_name);
+OMPI_DECLSPEC int PMPI_Comm_accept(const char *port_name, MPI_Info info, int root,
+        MPI_Comm comm, MPI_Comm *newcomm);
+OMPI_DECLSPEC MPI_Fint PMPI_Comm_c2f(MPI_Comm comm);
+OMPI_DECLSPEC int PMPI_Comm_call_errhandler(MPI_Comm comm, int errorcode);
+OMPI_DECLSPEC int PMPI_Comm_compare(MPI_Comm comm1, MPI_Comm comm2, int *result);
+OMPI_DECLSPEC int PMPI_Comm_connect(const char *port_name, MPI_Info info, int root,
+        MPI_Comm comm, MPI_Comm *newcomm);
+OMPI_DECLSPEC int PMPI_Comm_create_errhandler(MPI_Comm_errhandler_function *function,
+        MPI_Errhandler *errhandler);
+OMPI_DECLSPEC int PMPI_Comm_create_keyval(MPI_Comm_copy_attr_function *comm_copy_attr_fn,
+        MPI_Comm_delete_attr_function *comm_delete_attr_fn,
+        int *comm_keyval, void *extra_state);
+OMPI_DECLSPEC int PMPI_Comm_create_group(MPI_Comm comm, MPI_Group group, int tag, MPI_Comm *newcomm);
+OMPI_DECLSPEC int PMPI_Comm_create_from_group(MPI_Group group, const char *tag, MPI_Info info,
+        MPI_Errhandler errhandler, MPI_Comm *newcomm);
+OMPI_DECLSPEC int PMPI_Comm_create(MPI_Comm comm, MPI_Group group, MPI_Comm *newcomm);
+OMPI_DECLSPEC int PMPI_Comm_delete_attr(MPI_Comm comm, int comm_keyval);
+OMPI_DECLSPEC int PMPI_Comm_disconnect(MPI_Comm *comm);
+OMPI_DECLSPEC int PMPI_Comm_dup(MPI_Comm comm, MPI_Comm *newcomm);
+OMPI_DECLSPEC int PMPI_Comm_idup(MPI_Comm comm, MPI_Comm *newcomm, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Comm_dup_with_info(MPI_Comm comm, MPI_Info info, MPI_Comm *newcomm);
+OMPI_DECLSPEC int PMPI_Comm_idup_with_info(MPI_Comm comm, MPI_Info info, MPI_Comm *newcomm,
+        MPI_Request *request);
+OMPI_DECLSPEC MPI_Comm PMPI_Comm_f2c(MPI_Fint comm);
+OMPI_DECLSPEC int PMPI_Comm_free_keyval(int *comm_keyval);
+OMPI_DECLSPEC int PMPI_Comm_free(MPI_Comm *comm);
+OMPI_DECLSPEC int PMPI_Comm_get_attr(MPI_Comm comm, int comm_keyval,
+        void *attribute_val, int *flag);
+OMPI_DECLSPEC int PMPI_Comm_get_errhandler(MPI_Comm comm, MPI_Errhandler *erhandler);
+OMPI_DECLSPEC int PMPI_Comm_get_info(MPI_Comm comm, MPI_Info *info_used);
+OMPI_DECLSPEC int PMPI_Comm_get_name(MPI_Comm comm, char *comm_name, int *resultlen);
+OMPI_DECLSPEC int PMPI_Comm_get_parent(MPI_Comm *parent);
+OMPI_DECLSPEC int PMPI_Comm_group(MPI_Comm comm, MPI_Group *group);
+OMPI_DECLSPEC int PMPI_Comm_join(int fd, MPI_Comm *intercomm);
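+/*
+ * Profiling-interface sketch: every MPI_ entry point declared in this
+ * header has a PMPI_ twin, so a tool can interpose on the MPI_ name
+ * and forward to the PMPI_ implementation. A minimal interposer,
+ * assuming it is linked ahead of the MPI library, could look like:
+ *
+ *   int MPI_Comm_rank(MPI_Comm comm, int *rank) {
+ *       // ...record the call for profiling purposes...
+ *       return PMPI_Comm_rank(comm, rank);  // defer to the real implementation
+ *   }
+ */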
+OMPI_DECLSPEC int PMPI_Comm_rank(MPI_Comm comm, int *rank);
+OMPI_DECLSPEC int PMPI_Comm_remote_group(MPI_Comm comm, MPI_Group *group);
+OMPI_DECLSPEC int PMPI_Comm_remote_size(MPI_Comm comm, int *size);
+OMPI_DECLSPEC int PMPI_Comm_set_attr(MPI_Comm comm, int comm_keyval, void *attribute_val);
+OMPI_DECLSPEC int PMPI_Comm_set_errhandler(MPI_Comm comm, MPI_Errhandler errhandler);
+OMPI_DECLSPEC int PMPI_Comm_set_info(MPI_Comm comm, MPI_Info info);
+OMPI_DECLSPEC int PMPI_Comm_set_name(MPI_Comm comm, const char *comm_name);
+OMPI_DECLSPEC int PMPI_Comm_size(MPI_Comm comm, int *size);
+OMPI_DECLSPEC int PMPI_Comm_spawn(const char *command, char *argv[], int maxprocs, MPI_Info info,
+        int root, MPI_Comm comm, MPI_Comm *intercomm,
+        int array_of_errcodes[]);
+OMPI_DECLSPEC int PMPI_Comm_spawn_multiple(int count, char *array_of_commands[], char **array_of_argv[],
+        const int array_of_maxprocs[], const MPI_Info array_of_info[],
+        int root, MPI_Comm comm, MPI_Comm *intercomm,
+        int array_of_errcodes[]);
+OMPI_DECLSPEC int PMPI_Comm_split(MPI_Comm comm, int color, int key, MPI_Comm *newcomm);
+OMPI_DECLSPEC int PMPI_Comm_split_type(MPI_Comm comm, int split_type, int key, MPI_Info info, MPI_Comm *newcomm);
+OMPI_DECLSPEC int PMPI_Comm_test_inter(MPI_Comm comm, int *flag);
+OMPI_DECLSPEC int PMPI_Compare_and_swap(const void *origin_addr, const void *compare_addr,
+        void *result_addr, MPI_Datatype datatype, int target_rank,
+        MPI_Aint target_disp, MPI_Win win);
+OMPI_DECLSPEC int PMPI_Dims_create(int nnodes, int ndims, int dims[]);
+OMPI_DECLSPEC MPI_Fint PMPI_Errhandler_c2f(MPI_Errhandler errhandler);
+OMPI_DECLSPEC MPI_Errhandler PMPI_Errhandler_f2c(MPI_Fint errhandler);
+OMPI_DECLSPEC int PMPI_Errhandler_free(MPI_Errhandler *errhandler);
+OMPI_DECLSPEC int PMPI_Error_class(int errorcode, int *errorclass);
+OMPI_DECLSPEC int PMPI_Error_string(int errorcode, char *string, int *resultlen);
+OMPI_DECLSPEC int PMPI_Exscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm);
+OMPI_DECLSPEC int PMPI_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Exscan_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+        MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Fetch_and_op(const void *origin_addr, void *result_addr, MPI_Datatype datatype,
+        int target_rank, MPI_Aint target_disp, MPI_Op op, MPI_Win win);
+OMPI_DECLSPEC MPI_Fint PMPI_File_c2f(MPI_File file);
+OMPI_DECLSPEC MPI_File PMPI_File_f2c(MPI_Fint file);
+OMPI_DECLSPEC int PMPI_File_call_errhandler(MPI_File fh, int errorcode);
+OMPI_DECLSPEC int PMPI_File_create_errhandler(MPI_File_errhandler_function *function,
+        MPI_Errhandler *errhandler);
+OMPI_DECLSPEC int PMPI_File_set_errhandler( MPI_File file, MPI_Errhandler errhandler);
+OMPI_DECLSPEC int PMPI_File_get_errhandler( MPI_File file, MPI_Errhandler *errhandler);
+OMPI_DECLSPEC int PMPI_File_open(MPI_Comm comm, const char *filename, int amode,
+        MPI_Info info, MPI_File *fh);
+OMPI_DECLSPEC int PMPI_File_close(MPI_File *fh);
+OMPI_DECLSPEC int PMPI_File_delete(const char *filename, MPI_Info info);
+OMPI_DECLSPEC int PMPI_File_set_size(MPI_File fh, MPI_Offset size);
+OMPI_DECLSPEC int PMPI_File_preallocate(MPI_File fh, MPI_Offset size);
+OMPI_DECLSPEC int PMPI_File_get_size(MPI_File fh, MPI_Offset *size);
+OMPI_DECLSPEC int PMPI_File_get_group(MPI_File fh, MPI_Group *group);
+OMPI_DECLSPEC int PMPI_File_get_amode(MPI_File fh, int *amode);
+OMPI_DECLSPEC int PMPI_File_set_info(MPI_File fh, MPI_Info info);
+OMPI_DECLSPEC int PMPI_File_get_info(MPI_File fh, MPI_Info *info_used);
+OMPI_DECLSPEC int PMPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype,
+        MPI_Datatype filetype, const char *datarep, MPI_Info info);
+OMPI_DECLSPEC int PMPI_File_get_view(MPI_File fh, MPI_Offset *disp,
+        MPI_Datatype *etype,
+        MPI_Datatype *filetype, char *datarep);
+OMPI_DECLSPEC int PMPI_File_read_at(MPI_File fh, MPI_Offset offset, void *buf,
+        int count, MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int PMPI_File_read_at_all(MPI_File fh, MPI_Offset offset, void *buf,
+        int count, MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int PMPI_File_write_at(MPI_File fh, MPI_Offset offset, const void *buf,
+        int count, MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int PMPI_File_write_at_all(MPI_File fh, MPI_Offset offset, const void *buf,
+        int count, MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int PMPI_File_iread_at(MPI_File fh, MPI_Offset offset, void *buf,
+        int count, MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_File_iwrite_at(MPI_File fh, MPI_Offset offset, const void *buf,
+        int count, MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_File_iread_at_all(MPI_File fh, MPI_Offset offset, void *buf,
+        int count, MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_File_iwrite_at_all(MPI_File fh, MPI_Offset offset, const void *buf,
+        int count, MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_File_read(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int PMPI_File_read_all(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int PMPI_File_write(MPI_File fh, const void *buf, int count,
+        MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int PMPI_File_write_all(MPI_File fh, const void *buf, int count,
+        MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int PMPI_File_iread(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_File_iwrite(MPI_File fh, const void *buf, int count,
+        MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_File_iread_all(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_File_iwrite_all(MPI_File fh, const void *buf, int count,
+        MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_File_seek(MPI_File fh, MPI_Offset offset, int whence);
+OMPI_DECLSPEC int PMPI_File_get_position(MPI_File fh, MPI_Offset *offset);
+OMPI_DECLSPEC int PMPI_File_get_byte_offset(MPI_File fh, MPI_Offset offset,
+        MPI_Offset *disp);
+OMPI_DECLSPEC int PMPI_File_read_shared(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int PMPI_File_write_shared(MPI_File fh, const void *buf, int count,
+        MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int PMPI_File_iread_shared(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_File_iwrite_shared(MPI_File fh, const void *buf, int count,
+        MPI_Datatype datatype, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_File_read_ordered(MPI_File fh, void *buf, int count,
+        MPI_Datatype datatype, MPI_Status *status);
+OMPI_DECLSPEC int PMPI_File_write_ordered(MPI_File fh, const void *buf,
int count, + MPI_Datatype datatype, MPI_Status *status); +OMPI_DECLSPEC int PMPI_File_seek_shared(MPI_File fh, MPI_Offset offset, int whence); +OMPI_DECLSPEC int PMPI_File_get_position_shared(MPI_File fh, MPI_Offset *offset); +OMPI_DECLSPEC int PMPI_File_read_at_all_begin(MPI_File fh, MPI_Offset offset, void *buf, + int count, MPI_Datatype datatype); +OMPI_DECLSPEC int PMPI_File_read_at_all_end(MPI_File fh, void *buf, MPI_Status *status); +OMPI_DECLSPEC int PMPI_File_write_at_all_begin(MPI_File fh, MPI_Offset offset, const void *buf, + int count, MPI_Datatype datatype); +OMPI_DECLSPEC int PMPI_File_write_at_all_end(MPI_File fh, const void *buf, MPI_Status *status); +OMPI_DECLSPEC int PMPI_File_read_all_begin(MPI_File fh, void *buf, int count, + MPI_Datatype datatype); +OMPI_DECLSPEC int PMPI_File_read_all_end(MPI_File fh, void *buf, MPI_Status *status); +OMPI_DECLSPEC int PMPI_File_write_all_begin(MPI_File fh, const void *buf, int count, + MPI_Datatype datatype); +OMPI_DECLSPEC int PMPI_File_write_all_end(MPI_File fh, const void *buf, MPI_Status *status); +OMPI_DECLSPEC int PMPI_File_read_ordered_begin(MPI_File fh, void *buf, int count, + MPI_Datatype datatype); +OMPI_DECLSPEC int PMPI_File_read_ordered_end(MPI_File fh, void *buf, MPI_Status *status); +OMPI_DECLSPEC int PMPI_File_write_ordered_begin(MPI_File fh, const void *buf, int count, + MPI_Datatype datatype); +OMPI_DECLSPEC int PMPI_File_write_ordered_end(MPI_File fh, const void *buf, MPI_Status *status); +OMPI_DECLSPEC int PMPI_File_get_type_extent(MPI_File fh, MPI_Datatype datatype, + MPI_Aint *extent); +OMPI_DECLSPEC int PMPI_File_set_atomicity(MPI_File fh, int flag); +OMPI_DECLSPEC int PMPI_File_get_atomicity(MPI_File fh, int *flag); +OMPI_DECLSPEC int PMPI_File_sync(MPI_File fh); +OMPI_DECLSPEC int PMPI_Finalize(void); +OMPI_DECLSPEC int PMPI_Finalized(int *flag); +OMPI_DECLSPEC int PMPI_Free_mem(void *base); +OMPI_DECLSPEC int PMPI_Gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, + int root, MPI_Comm comm); +OMPI_DECLSPEC int PMPI_Igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, + int root, MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Gather_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, + int root, MPI_Comm comm, MPI_Info info, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], const int displs[], + MPI_Datatype recvtype, int root, MPI_Comm comm); +OMPI_DECLSPEC int PMPI_Igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, + int root, MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Gatherv_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, + int root, MPI_Comm comm, MPI_Info info, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Get_address(const void *location, MPI_Aint *address); +OMPI_DECLSPEC int PMPI_Get_count(const MPI_Status *status, MPI_Datatype datatype, int *count); +OMPI_DECLSPEC int PMPI_Get_elements(const MPI_Status *status, MPI_Datatype datatype, + int *count); +OMPI_DECLSPEC int PMPI_Get_elements_x(const MPI_Status *status, MPI_Datatype datatype, + MPI_Count *count); 
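+/*
+ * Illustrative sketch, added in this patch and not part of the upstream
+ * Open MPI header: a minimal explicit-offset MPI-IO round trip with the
+ * MPI_File_* routines declared above. `rank` is assumed to hold the
+ * result of MPI_Comm_rank on MPI_COMM_WORLD.
+ *
+ *   MPI_File fh;
+ *   MPI_File_open(MPI_COMM_WORLD, "out.dat",
+ *                 MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
+ *   MPI_File_write_at(fh, (MPI_Offset)(rank * sizeof(int)), &rank, 1,
+ *                     MPI_INT, MPI_STATUS_IGNORE);
+ *   MPI_File_close(&fh);
+ */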
+OMPI_DECLSPEC int PMPI_Get(void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, int target_rank, + MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Win win); +OMPI_DECLSPEC int PMPI_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + void *result_addr, int result_count, MPI_Datatype result_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Op op, MPI_Win win); +OMPI_DECLSPEC int PMPI_Get_library_version(char *version, int *resultlen); +OMPI_DECLSPEC int PMPI_Get_processor_name(char *name, int *resultlen); +OMPI_DECLSPEC int PMPI_Get_version(int *version, int *subversion); +OMPI_DECLSPEC int PMPI_Graph_create(MPI_Comm comm_old, int nnodes, const int index[], + const int edges[], int reorder, MPI_Comm *comm_graph); +OMPI_DECLSPEC int PMPI_Graph_get(MPI_Comm comm, int maxindex, int maxedges, + int index[], int edges[]); +OMPI_DECLSPEC int PMPI_Graph_map(MPI_Comm comm, int nnodes, const int index[], const int edges[], + int *newrank); +OMPI_DECLSPEC int PMPI_Graph_neighbors_count(MPI_Comm comm, int rank, int *nneighbors); +OMPI_DECLSPEC int PMPI_Graph_neighbors(MPI_Comm comm, int rank, int maxneighbors, + int neighbors[]); +OMPI_DECLSPEC int PMPI_Graphdims_get(MPI_Comm comm, int *nnodes, int *nedges); +OMPI_DECLSPEC int PMPI_Grequest_complete(MPI_Request request); +OMPI_DECLSPEC int PMPI_Grequest_start(MPI_Grequest_query_function *query_fn, + MPI_Grequest_free_function *free_fn, + MPI_Grequest_cancel_function *cancel_fn, + void *extra_state, MPI_Request *request); +OMPI_DECLSPEC MPI_Fint PMPI_Group_c2f(MPI_Group group); +OMPI_DECLSPEC int PMPI_Group_compare(MPI_Group group1, MPI_Group group2, int *result); +OMPI_DECLSPEC int PMPI_Group_difference(MPI_Group group1, MPI_Group group2, + MPI_Group *newgroup); +OMPI_DECLSPEC int PMPI_Group_excl(MPI_Group group, int n, const int ranks[], + MPI_Group *newgroup); +OMPI_DECLSPEC MPI_Group PMPI_Group_f2c(MPI_Fint group); +OMPI_DECLSPEC int PMPI_Group_free(MPI_Group *group); +OMPI_DECLSPEC int PMPI_Group_from_session_pset (MPI_Session session, const char *pset_name, MPI_Group *newgroup); +OMPI_DECLSPEC int PMPI_Group_incl(MPI_Group group, int n, const int ranks[], + MPI_Group *newgroup); +OMPI_DECLSPEC int PMPI_Group_intersection(MPI_Group group1, MPI_Group group2, + MPI_Group *newgroup); +OMPI_DECLSPEC int PMPI_Group_range_excl(MPI_Group group, int n, int ranges[][3], + MPI_Group *newgroup); +OMPI_DECLSPEC int PMPI_Group_range_incl(MPI_Group group, int n, int ranges[][3], + MPI_Group *newgroup); +OMPI_DECLSPEC int PMPI_Group_rank(MPI_Group group, int *rank); +OMPI_DECLSPEC int PMPI_Group_size(MPI_Group group, int *size); +OMPI_DECLSPEC int PMPI_Group_translate_ranks(MPI_Group group1, int n, const int ranks1[], + MPI_Group group2, int ranks2[]); +OMPI_DECLSPEC int PMPI_Group_union(MPI_Group group1, MPI_Group group2, + MPI_Group *newgroup); +OMPI_DECLSPEC int PMPI_Ibsend(const void *buf, int count, MPI_Datatype datatype, int dest, + int tag, MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Improbe(int source, int tag, MPI_Comm comm, + int *flag, MPI_Message *message, + MPI_Status *status); +OMPI_DECLSPEC int PMPI_Imrecv(void *buf, int count, MPI_Datatype type, + MPI_Message *message, MPI_Request *request); +OMPI_DECLSPEC MPI_Fint PMPI_Info_c2f(MPI_Info info); +OMPI_DECLSPEC int PMPI_Info_create(MPI_Info *info); +OMPI_DECLSPEC int PMPI_Info_delete(MPI_Info info, const char *key); +OMPI_DECLSPEC int 
PMPI_Info_dup(MPI_Info info, MPI_Info *newinfo); +OMPI_DECLSPEC MPI_Info PMPI_Info_f2c(MPI_Fint info); +OMPI_DECLSPEC int PMPI_Info_free(MPI_Info *info); +OMPI_DECLSPEC int PMPI_Info_get(MPI_Info info, const char *key, int valuelen, + char *value, int *flag); +OMPI_DECLSPEC int PMPI_Info_get_nkeys(MPI_Info info, int *nkeys); +OMPI_DECLSPEC int PMPI_Info_get_nthkey(MPI_Info info, int n, char *key); +OMPI_DECLSPEC int PMPI_Info_get_string(MPI_Info info, const char *key, int *buflen, + char *value, int *flag); +OMPI_DECLSPEC int PMPI_Info_get_valuelen(MPI_Info info, const char *key, int *valuelen, + int *flag); +OMPI_DECLSPEC int PMPI_Info_set(MPI_Info info, const char *key, const char *value); +OMPI_DECLSPEC int PMPI_Init(int *argc, char ***argv); +OMPI_DECLSPEC int PMPI_Initialized(int *flag); +OMPI_DECLSPEC int PMPI_Init_thread(int *argc, char ***argv, int required, + int *provided); +OMPI_DECLSPEC int PMPI_Intercomm_create(MPI_Comm local_comm, int local_leader, + MPI_Comm bridge_comm, int remote_leader, + int tag, MPI_Comm *newintercomm); +OMPI_DECLSPEC int PMPI_Intercomm_create_from_groups (MPI_Group local_group, int local_leader, MPI_Group remote_group, + int remote_leader, const char *tag, MPI_Info info, MPI_Errhandler errhandler, + MPI_Comm *newintercomm); +OMPI_DECLSPEC int PMPI_Intercomm_merge(MPI_Comm intercomm, int high, + MPI_Comm *newintercomm); +OMPI_DECLSPEC int PMPI_Iprobe(int source, int tag, MPI_Comm comm, int *flag, + MPI_Status *status); +OMPI_DECLSPEC int PMPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source, + int tag, MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Irsend(const void *buf, int count, MPI_Datatype datatype, int dest, + int tag, MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, + int tag, MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Isendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + int dest, int sendtag, void *recvbuf, int recvcount, + MPI_Datatype recvtype, int source, int recvtag, + MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Isendrecv_replace(void * buf, int count, MPI_Datatype datatype, + int dest, int sendtag, int source, int recvtag, + MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Issend(const void *buf, int count, MPI_Datatype datatype, int dest, + int tag, MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Precv_init(void* buf, int partitions, MPI_Count count, + MPI_Datatype datatype, int source, int tag, MPI_Comm comm, + MPI_Info info, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Psend_init(const void* buf, int partitions, MPI_Count count, + MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, + MPI_Info info, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Pready(int partitions, MPI_Request request); +OMPI_DECLSPEC int PMPI_Pready_range(int partition_low, int partition_high, + MPI_Request request); +OMPI_DECLSPEC int PMPI_Pready_list(int length, int partition_list[], MPI_Request request); +OMPI_DECLSPEC int PMPI_Parrived(MPI_Request request, MPI_Count partition, int *flag); +OMPI_DECLSPEC int PMPI_Is_thread_main(int *flag); +OMPI_DECLSPEC int PMPI_Lookup_name(const char *service_name, MPI_Info info, char *port_name); +OMPI_DECLSPEC MPI_Fint PMPI_Message_c2f(MPI_Message message); +OMPI_DECLSPEC MPI_Message PMPI_Message_f2c(MPI_Fint message); +OMPI_DECLSPEC int PMPI_Mprobe(int source, int tag, MPI_Comm comm, + MPI_Message *message, + MPI_Status *status); 
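+/*
+ * Illustrative sketch, added in this patch and not part of the upstream
+ * Open MPI header: sizing a receive with the matched probe declared just
+ * above and the matched receive declared next, so the probed message
+ * cannot be intercepted by another thread between the two calls.
+ *
+ *   MPI_Message msg; MPI_Status st; int n;
+ *   MPI_Mprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &msg, &st);
+ *   MPI_Get_count(&st, MPI_INT, &n);
+ *   int *buf = malloc(n * sizeof *buf);
+ *   MPI_Mrecv(buf, n, MPI_INT, &msg, MPI_STATUS_IGNORE);
+ */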
+OMPI_DECLSPEC int PMPI_Mrecv(void *buf, int count, MPI_Datatype type, + MPI_Message *message, MPI_Status *status); +OMPI_DECLSPEC int PMPI_Neighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, + MPI_Comm comm); +OMPI_DECLSPEC int PMPI_Ineighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, + MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Neighbor_allgather_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, + MPI_Comm comm, MPI_Info info, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Neighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], const int displs[], + MPI_Datatype recvtype, MPI_Comm comm); +OMPI_DECLSPEC int PMPI_Ineighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, + MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Neighbor_allgatherv_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, + MPI_Comm comm, MPI_Info info, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Neighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, + MPI_Comm comm); +OMPI_DECLSPEC int PMPI_Ineighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, + MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Neighbor_alltoall_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, + MPI_Comm comm, MPI_Info info, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Neighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype, + MPI_Comm comm); +OMPI_DECLSPEC int PMPI_Ineighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype, + MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Neighbor_alltoallv_init(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype, + MPI_Comm comm, MPI_Info info, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Neighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], + void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], + MPI_Comm comm); +OMPI_DECLSPEC int PMPI_Ineighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], + void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], + MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Neighbor_alltoallw_init(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], + void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], + MPI_Comm 
comm, MPI_Info info, MPI_Request *request); +OMPI_DECLSPEC MPI_Fint PMPI_Op_c2f(MPI_Op op); +OMPI_DECLSPEC int PMPI_Op_commutative(MPI_Op op, int *commute); +OMPI_DECLSPEC int PMPI_Op_create(MPI_User_function *function, int commute, MPI_Op *op); +OMPI_DECLSPEC int PMPI_Open_port(MPI_Info info, char *port_name); +OMPI_DECLSPEC MPI_Op PMPI_Op_f2c(MPI_Fint op); +OMPI_DECLSPEC int PMPI_Op_free(MPI_Op *op); +OMPI_DECLSPEC int PMPI_Pack_external(const char datarep[], const void *inbuf, int incount, + MPI_Datatype datatype, void *outbuf, + MPI_Aint outsize, MPI_Aint *position); +OMPI_DECLSPEC int PMPI_Pack_external_size(const char datarep[], int incount, + MPI_Datatype datatype, MPI_Aint *size); +OMPI_DECLSPEC int PMPI_Pack(const void *inbuf, int incount, MPI_Datatype datatype, + void *outbuf, int outsize, int *position, MPI_Comm comm); +OMPI_DECLSPEC int PMPI_Pack_size(int incount, MPI_Datatype datatype, MPI_Comm comm, + int *size); +OMPI_DECLSPEC int PMPI_Pcontrol(const int level, ...); +OMPI_DECLSPEC int PMPI_Probe(int source, int tag, MPI_Comm comm, MPI_Status *status); +OMPI_DECLSPEC int PMPI_Publish_name(const char *service_name, MPI_Info info, + const char *port_name); +OMPI_DECLSPEC int PMPI_Put(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Win win); +OMPI_DECLSPEC int PMPI_Query_thread(int *provided); +OMPI_DECLSPEC int PMPI_Raccumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Op op, MPI_Win win, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Recv_init(void *buf, int count, MPI_Datatype datatype, int source, + int tag, MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, + int tag, MPI_Comm comm, MPI_Status *status); +OMPI_DECLSPEC int PMPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, int root, MPI_Comm comm); +OMPI_DECLSPEC int PMPI_Ireduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, int root, MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Reduce_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, int root, MPI_Comm comm, MPI_Info info, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Reduce_local(const void *inbuf, void *inoutbuf, int count, + MPI_Datatype datatype, MPI_Op op); +OMPI_DECLSPEC int PMPI_Reduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[], MPI_Datatype datatype, + MPI_Op op, MPI_Comm comm); +OMPI_DECLSPEC int PMPI_Ireduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[], MPI_Datatype datatype, + MPI_Op op, MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Reduce_scatter_init(const void *sendbuf, void *recvbuf, const int recvcounts[], MPI_Datatype datatype, + MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Reduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, + MPI_Op op, MPI_Comm comm); +OMPI_DECLSPEC int PMPI_Ireduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, + MPI_Op op, MPI_Comm comm, MPI_Request *request); +OMPI_DECLSPEC int PMPI_Reduce_scatter_block_init(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, + MPI_Op op, MPI_Comm 
comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Register_datarep(const char *datarep,
+                                        MPI_Datarep_conversion_function *read_conversion_fn,
+                                        MPI_Datarep_conversion_function *write_conversion_fn,
+                                        MPI_Datarep_extent_function *dtype_file_extent_fn,
+                                        void *extra_state);
+OMPI_DECLSPEC MPI_Fint PMPI_Request_c2f(MPI_Request request);
+OMPI_DECLSPEC MPI_Request PMPI_Request_f2c(MPI_Fint request);
+OMPI_DECLSPEC int PMPI_Request_free(MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Request_get_status(MPI_Request request, int *flag,
+                                          MPI_Status *status);
+OMPI_DECLSPEC int PMPI_Rget(void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+                            int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype,
+                            MPI_Win win, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Rget_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+                                       void *result_addr, int result_count, MPI_Datatype result_datatype,
+                                       int target_rank, MPI_Aint target_disp, int target_count,
+                                       MPI_Datatype target_datatype, MPI_Op op,
+                                       MPI_Win win, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Rput(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+                            int target_rank, MPI_Aint target_disp, int target_count,
+                            MPI_Datatype target_datatype, MPI_Win win, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Rsend(const void *ibuf, int count, MPI_Datatype datatype, int dest,
+                             int tag, MPI_Comm comm);
+OMPI_DECLSPEC int PMPI_Rsend_init(const void *buf, int count, MPI_Datatype datatype,
+                                  int dest, int tag, MPI_Comm comm,
+                                  MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Scan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+                            MPI_Op op, MPI_Comm comm);
+OMPI_DECLSPEC int PMPI_Iscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+                             MPI_Op op, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Scan_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+                                 MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                               void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                               int root, MPI_Comm comm);
+OMPI_DECLSPEC int PMPI_Iscatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                int root, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Scatter_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                    int root, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Scatterv(const void *sendbuf, const int sendcounts[], const int displs[], MPI_Datatype sendtype,
+                                void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                int root, MPI_Comm comm);
+OMPI_DECLSPEC int PMPI_Iscatterv(const void *sendbuf, const int sendcounts[], const int displs[], MPI_Datatype sendtype,
+                                 void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                 int root, MPI_Comm comm, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Scatterv_init(const void *sendbuf, const int sendcounts[], const int displs[], MPI_Datatype sendtype,
+                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                                     int root, MPI_Comm comm, MPI_Info info, MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Send_init(const void *buf, int count, MPI_Datatype datatype,
+                                 int dest, int tag, MPI_Comm comm,
+                                 MPI_Request *request);
+OMPI_DECLSPEC int PMPI_Send(const void *buf, int count, MPI_Datatype datatype, 
int dest, + int tag, MPI_Comm comm); +OMPI_DECLSPEC int PMPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + int dest, int sendtag, void *recvbuf, int recvcount, + MPI_Datatype recvtype, int source, int recvtag, + MPI_Comm comm, MPI_Status *status); +OMPI_DECLSPEC int PMPI_Sendrecv_replace(void * buf, int count, MPI_Datatype datatype, + int dest, int sendtag, int source, int recvtag, + MPI_Comm comm, MPI_Status *status); +OMPI_DECLSPEC MPI_Fint PMPI_Session_c2f (const MPI_Session session); +OMPI_DECLSPEC int PMPI_Session_create_errhandler (MPI_Session_errhandler_function *session_errhandler_fn, + MPI_Errhandler *errhandler); +OMPI_DECLSPEC int PMPI_Session_finalize (MPI_Session *session); +OMPI_DECLSPEC int PMPI_Session_get_info (MPI_Session session, MPI_Info *info_used); +OMPI_DECLSPEC int PMPI_Session_get_num_psets (MPI_Session session, MPI_Info info, int *npset_names); +OMPI_DECLSPEC int PMPI_Session_get_nth_pset (MPI_Session session, MPI_Info info, int n, int *len, char *pset_name); +OMPI_DECLSPEC int PMPI_Session_get_pset_info (MPI_Session session, const char *pset_name, MPI_Info *info_used); +OMPI_DECLSPEC int PMPI_Session_init (MPI_Info info, MPI_Errhandler errhandler, + MPI_Session *session); +OMPI_DECLSPEC MPI_Session PMPI_Session_f2c (MPI_Fint session); +OMPI_DECLSPEC int PMPI_Session_set_info (MPI_Session session, MPI_Info info); +OMPI_DECLSPEC int PMPI_Ssend_init(const void *buf, int count, MPI_Datatype datatype, + int dest, int tag, MPI_Comm comm, + MPI_Request *request); +OMPI_DECLSPEC int PMPI_Ssend(const void *buf, int count, MPI_Datatype datatype, int dest, + int tag, MPI_Comm comm); +OMPI_DECLSPEC int PMPI_Start(MPI_Request *request); +OMPI_DECLSPEC int PMPI_Startall(int count, MPI_Request array_of_requests[]); +OMPI_DECLSPEC int PMPI_Status_c2f(const MPI_Status *c_status, MPI_Fint *f_status); +OMPI_DECLSPEC int PMPI_Status_c2f08(const MPI_Status *c_status, MPI_F08_status *f08_status); +OMPI_DECLSPEC int PMPI_Status_f082f(const MPI_F08_status *f08_status, MPI_Fint *f_status); +OMPI_DECLSPEC int PMPI_Status_f082c(const MPI_F08_status *f08_status, MPI_Status *c_status); +OMPI_DECLSPEC int PMPI_Status_f2c(const MPI_Fint *f_status, MPI_Status *c_status); +OMPI_DECLSPEC int PMPI_Status_f2f08(const MPI_Fint *f_status, MPI_F08_status *f08_status); +OMPI_DECLSPEC int PMPI_Status_set_cancelled(MPI_Status *status, int flag); +OMPI_DECLSPEC int PMPI_Status_set_elements(MPI_Status *status, MPI_Datatype datatype, + int count); +OMPI_DECLSPEC int PMPI_Status_set_elements_x(MPI_Status *status, MPI_Datatype datatype, + MPI_Count count); +OMPI_DECLSPEC int PMPI_Testall(int count, MPI_Request array_of_requests[], int *flag, + MPI_Status array_of_statuses[]); +OMPI_DECLSPEC int PMPI_Testany(int count, MPI_Request array_of_requests[], int *index, int *flag, MPI_Status *status); +OMPI_DECLSPEC int PMPI_Test(MPI_Request *request, int *flag, MPI_Status *status); +OMPI_DECLSPEC int PMPI_Test_cancelled(const MPI_Status *status, int *flag); +OMPI_DECLSPEC int PMPI_Testsome(int incount, MPI_Request array_of_requests[], + int *outcount, int array_of_indices[], + MPI_Status array_of_statuses[]); +OMPI_DECLSPEC int PMPI_Topo_test(MPI_Comm comm, int *status); +OMPI_DECLSPEC MPI_Fint PMPI_Type_c2f(MPI_Datatype datatype); +OMPI_DECLSPEC int PMPI_Type_commit(MPI_Datatype *type); +OMPI_DECLSPEC int PMPI_Type_contiguous(int count, MPI_Datatype oldtype, + MPI_Datatype *newtype); +OMPI_DECLSPEC int PMPI_Type_create_darray(int size, int rank, int ndims, + const int gsize_array[], const 
int distrib_array[], + const int darg_array[], const int psize_array[], + int order, MPI_Datatype oldtype, + MPI_Datatype *newtype); +OMPI_DECLSPEC int PMPI_Type_create_f90_complex(int p, int r, MPI_Datatype *newtype); +OMPI_DECLSPEC int PMPI_Type_create_f90_integer(int r, MPI_Datatype *newtype); +OMPI_DECLSPEC int PMPI_Type_create_f90_real(int p, int r, MPI_Datatype *newtype); +OMPI_DECLSPEC int PMPI_Type_create_hindexed(int count, const int array_of_blocklengths[], + const MPI_Aint array_of_displacements[], + MPI_Datatype oldtype, + MPI_Datatype *newtype); +OMPI_DECLSPEC int PMPI_Type_create_hvector(int count, int blocklength, MPI_Aint stride, + MPI_Datatype oldtype, + MPI_Datatype *newtype); +OMPI_DECLSPEC int PMPI_Type_create_keyval(MPI_Type_copy_attr_function *type_copy_attr_fn, + MPI_Type_delete_attr_function *type_delete_attr_fn, + int *type_keyval, void *extra_state); +OMPI_DECLSPEC int PMPI_Type_create_hindexed_block(int count, int blocklength, + const MPI_Aint array_of_displacements[], + MPI_Datatype oldtype, + MPI_Datatype *newtype); +OMPI_DECLSPEC int PMPI_Type_create_indexed_block(int count, int blocklength, + const int array_of_displacements[], + MPI_Datatype oldtype, + MPI_Datatype *newtype); +OMPI_DECLSPEC int PMPI_Type_create_struct(int count, const int array_of_block_lengths[], + const MPI_Aint array_of_displacements[], + const MPI_Datatype array_of_types[], + MPI_Datatype *newtype); +OMPI_DECLSPEC int PMPI_Type_create_subarray(int ndims, const int size_array[], const int subsize_array[], + const int start_array[], int order, + MPI_Datatype oldtype, MPI_Datatype *newtype); +OMPI_DECLSPEC int PMPI_Type_create_resized(MPI_Datatype oldtype, MPI_Aint lb, + MPI_Aint extent, MPI_Datatype *newtype); +OMPI_DECLSPEC int PMPI_Type_delete_attr(MPI_Datatype type, int type_keyval); +OMPI_DECLSPEC int PMPI_Type_dup(MPI_Datatype type, MPI_Datatype *newtype); +OMPI_DECLSPEC int PMPI_Type_free(MPI_Datatype *type); +OMPI_DECLSPEC int PMPI_Type_free_keyval(int *type_keyval); +OMPI_DECLSPEC MPI_Datatype PMPI_Type_f2c(MPI_Fint datatype); +OMPI_DECLSPEC int PMPI_Type_get_attr(MPI_Datatype type, int type_keyval, + void *attribute_val, int *flag); +OMPI_DECLSPEC int PMPI_Type_get_contents(MPI_Datatype mtype, int max_integers, + int max_addresses, int max_datatypes, + int array_of_integers[], + MPI_Aint array_of_addresses[], + MPI_Datatype array_of_datatypes[]); +OMPI_DECLSPEC int PMPI_Type_get_envelope(MPI_Datatype type, int *num_integers, + int *num_addresses, int *num_datatypes, + int *combiner); +OMPI_DECLSPEC int PMPI_Type_get_extent(MPI_Datatype type, MPI_Aint *lb, + MPI_Aint *extent); +OMPI_DECLSPEC int PMPI_Type_get_extent_x(MPI_Datatype type, MPI_Count *lb, + MPI_Count *extent); +OMPI_DECLSPEC int PMPI_Type_get_name(MPI_Datatype type, char *type_name, + int *resultlen); +OMPI_DECLSPEC int PMPI_Type_get_true_extent(MPI_Datatype datatype, MPI_Aint *true_lb, + MPI_Aint *true_extent); +OMPI_DECLSPEC int PMPI_Type_get_true_extent_x(MPI_Datatype datatype, MPI_Count *true_lb, + MPI_Count *true_extent); +OMPI_DECLSPEC int PMPI_Type_indexed(int count, const int array_of_blocklengths[], + const int array_of_displacements[], + MPI_Datatype oldtype, MPI_Datatype *newtype); +OMPI_DECLSPEC int PMPI_Type_match_size(int typeclass, int size, MPI_Datatype *type); +OMPI_DECLSPEC int PMPI_Type_set_attr(MPI_Datatype type, int type_keyval, + void *attr_val); +OMPI_DECLSPEC int PMPI_Type_set_name(MPI_Datatype type, const char *type_name); +OMPI_DECLSPEC int PMPI_Type_size(MPI_Datatype type, int *size); 
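+/*
+ * Illustrative sketch, added in this patch and not part of the upstream
+ * Open MPI header: typical lifecycle of a derived datatype built with the
+ * constructors declared above. The vector selects one column of a
+ * row-major 4x8 double matrix (count 4, blocklength 1, stride 8);
+ * `matrix`, `dest` and `tag` are assumed caller-provided.
+ *
+ *   MPI_Datatype col;
+ *   MPI_Type_vector(4, 1, 8, MPI_DOUBLE, &col);
+ *   MPI_Type_commit(&col);
+ *   MPI_Send(matrix, 1, col, dest, tag, MPI_COMM_WORLD);
+ *   MPI_Type_free(&col);
+ */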
+OMPI_DECLSPEC int PMPI_Type_size_x(MPI_Datatype type, MPI_Count *size); +OMPI_DECLSPEC int PMPI_Type_vector(int count, int blocklength, int stride, + MPI_Datatype oldtype, MPI_Datatype *newtype); +OMPI_DECLSPEC int PMPI_Unpack(const void *inbuf, int insize, int *position, + void *outbuf, int outcount, MPI_Datatype datatype, + MPI_Comm comm); +OMPI_DECLSPEC int PMPI_Unpublish_name(const char *service_name, MPI_Info info, + const char *port_name); +OMPI_DECLSPEC int PMPI_Unpack_external (const char datarep[], const void *inbuf, MPI_Aint insize, + MPI_Aint *position, void *outbuf, int outcount, + MPI_Datatype datatype); +OMPI_DECLSPEC int PMPI_Waitall(int count, MPI_Request array_of_requests[], + MPI_Status array_of_statuses[]); +OMPI_DECLSPEC int PMPI_Waitany(int count, MPI_Request array_of_requests[], + int *index, MPI_Status *status); +OMPI_DECLSPEC int PMPI_Wait(MPI_Request *request, MPI_Status *status); +OMPI_DECLSPEC int PMPI_Waitsome(int incount, MPI_Request array_of_requests[], + int *outcount, int array_of_indices[], + MPI_Status array_of_statuses[]); +OMPI_DECLSPEC int PMPI_Win_allocate(MPI_Aint size, int disp_unit, MPI_Info info, + MPI_Comm comm, void *baseptr, MPI_Win *win); +OMPI_DECLSPEC int PMPI_Win_allocate_shared(MPI_Aint size, int disp_unit, MPI_Info info, + MPI_Comm comm, void *baseptr, MPI_Win *win); +OMPI_DECLSPEC int PMPI_Win_attach(MPI_Win win, void *base, MPI_Aint size); +OMPI_DECLSPEC MPI_Fint PMPI_Win_c2f(MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_call_errhandler(MPI_Win win, int errorcode); +OMPI_DECLSPEC int PMPI_Win_complete(MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_create(void *base, MPI_Aint size, int disp_unit, + MPI_Info info, MPI_Comm comm, MPI_Win *win); +OMPI_DECLSPEC int PMPI_Win_create_dynamic(MPI_Info info, MPI_Comm comm, MPI_Win *win); +OMPI_DECLSPEC int PMPI_Win_create_errhandler(MPI_Win_errhandler_function *function, + MPI_Errhandler *errhandler); +OMPI_DECLSPEC int PMPI_Win_create_keyval(MPI_Win_copy_attr_function *win_copy_attr_fn, + MPI_Win_delete_attr_function *win_delete_attr_fn, + int *win_keyval, void *extra_state); +OMPI_DECLSPEC int PMPI_Win_delete_attr(MPI_Win win, int win_keyval); +OMPI_DECLSPEC int PMPI_Win_detach(MPI_Win win, const void *base); +OMPI_DECLSPEC MPI_Win PMPI_Win_f2c(MPI_Fint win); +OMPI_DECLSPEC int PMPI_Win_fence(int mpi_assert, MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_flush(int rank, MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_flush_all(MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_flush_local(int rank, MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_flush_local_all(MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_free(MPI_Win *win); +OMPI_DECLSPEC int PMPI_Win_free_keyval(int *win_keyval); +OMPI_DECLSPEC int PMPI_Win_get_attr(MPI_Win win, int win_keyval, + void *attribute_val, int *flag); +OMPI_DECLSPEC int PMPI_Win_get_errhandler(MPI_Win win, MPI_Errhandler *errhandler); +OMPI_DECLSPEC int PMPI_Win_get_group(MPI_Win win, MPI_Group *group); +OMPI_DECLSPEC int PMPI_Win_get_info(MPI_Win win, MPI_Info *info_used); +OMPI_DECLSPEC int PMPI_Win_get_name(MPI_Win win, char *win_name, int *resultlen); +OMPI_DECLSPEC int PMPI_Win_lock(int lock_type, int rank, int mpi_assert, MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_lock_all(int mpi_assert, MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_post(MPI_Group group, int mpi_assert, MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_set_attr(MPI_Win win, int win_keyval, void *attribute_val); +OMPI_DECLSPEC int PMPI_Win_set_errhandler(MPI_Win win, MPI_Errhandler errhandler); +OMPI_DECLSPEC int PMPI_Win_set_info(MPI_Win 
win, MPI_Info info); +OMPI_DECLSPEC int PMPI_Win_set_name(MPI_Win win, const char *win_name); +OMPI_DECLSPEC int PMPI_Win_shared_query(MPI_Win win, int rank, MPI_Aint *size, int *disp_unit, void *baseptr); +OMPI_DECLSPEC int PMPI_Win_start(MPI_Group group, int mpi_assert, MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_sync(MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_test(MPI_Win win, int *flag); +OMPI_DECLSPEC int PMPI_Win_unlock(int rank, MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_unlock_all(MPI_Win win); +OMPI_DECLSPEC int PMPI_Win_wait(MPI_Win win); +OMPI_DECLSPEC double PMPI_Wtick(void); +OMPI_DECLSPEC double PMPI_Wtime(void); +OMPI_DECLSPEC int PMPI_T_init_thread (int required, int *provided); +OMPI_DECLSPEC int PMPI_T_finalize (void); +OMPI_DECLSPEC int PMPI_T_cvar_get_num (int *num_cvar); +OMPI_DECLSPEC int PMPI_T_cvar_get_info (int cvar_index, char *name, int *name_len, + int *verbosity, MPI_Datatype *datatype, + MPI_T_enum *enumtype, char *desc, + int *desc_len, int *bind, int *scope); +OMPI_DECLSPEC int PMPI_T_cvar_get_index (const char *name, int *cvar_index); +OMPI_DECLSPEC int PMPI_T_cvar_handle_alloc (int cvar_index, void *obj_handle, + MPI_T_cvar_handle *handle, int *count); +OMPI_DECLSPEC int PMPI_T_cvar_handle_free (MPI_T_cvar_handle *handle); +OMPI_DECLSPEC int PMPI_T_cvar_read (MPI_T_cvar_handle handle, void *buf); +OMPI_DECLSPEC int PMPI_T_cvar_write (MPI_T_cvar_handle handle, const void *buf); +OMPI_DECLSPEC int PMPI_T_category_get_num(int *num_cat); +OMPI_DECLSPEC int PMPI_T_category_get_info(int cat_index, char *name, int *name_len, + char *desc, int *desc_len, int *num_cvars, + int *num_pvars, int *num_categories); +OMPI_DECLSPEC int PMPI_T_category_get_index (const char *name, int *category_index); +OMPI_DECLSPEC int PMPI_T_category_get_cvars(int cat_index, int len, int indices[]); +OMPI_DECLSPEC int PMPI_T_category_get_pvars(int cat_index, int len, int indices[]); +OMPI_DECLSPEC int PMPI_T_category_get_categories(int cat_index, int len, int indices[]); +OMPI_DECLSPEC int PMPI_T_category_changed(int *stamp); + +OMPI_DECLSPEC int PMPI_T_pvar_get_num(int *num_pvar); +OMPI_DECLSPEC int PMPI_T_pvar_get_info(int pvar_index, char *name, int *name_len, + int *verbosity, int *var_class, MPI_Datatype *datatype, + MPI_T_enum *enumtype, char *desc, int *desc_len, int *bind, + int *readonly, int *continuous, int *atomic); +OMPI_DECLSPEC int PMPI_T_pvar_get_index (const char *name, int var_class, int *pvar_index); +OMPI_DECLSPEC int PMPI_T_pvar_session_create(MPI_T_pvar_session *session); +OMPI_DECLSPEC int PMPI_T_pvar_session_free(MPI_T_pvar_session *session); +OMPI_DECLSPEC int PMPI_T_pvar_handle_alloc(MPI_T_pvar_session session, int pvar_index, + void *obj_handle, MPI_T_pvar_handle *handle, int *count); +OMPI_DECLSPEC int PMPI_T_pvar_handle_free(MPI_T_pvar_session session, MPI_T_pvar_handle *handle); +OMPI_DECLSPEC int PMPI_T_pvar_start(MPI_T_pvar_session session, MPI_T_pvar_handle handle); +OMPI_DECLSPEC int PMPI_T_pvar_stop(MPI_T_pvar_session session, MPI_T_pvar_handle handle); +OMPI_DECLSPEC int PMPI_T_pvar_read(MPI_T_pvar_session session, MPI_T_pvar_handle handle, + void *buf); +OMPI_DECLSPEC int PMPI_T_pvar_write(MPI_T_pvar_session session, MPI_T_pvar_handle handle, + const void *buf); +OMPI_DECLSPEC int PMPI_T_pvar_reset(MPI_T_pvar_session session, MPI_T_pvar_handle handle); +OMPI_DECLSPEC int PMPI_T_pvar_readreset(MPI_T_pvar_session session, MPI_T_pvar_handle handle, + void *buf); +OMPI_DECLSPEC int PMPI_T_enum_get_info(MPI_T_enum enumtype, int *num, char *name, int 
*name_len); +OMPI_DECLSPEC int PMPI_T_enum_get_item(MPI_T_enum enumtype, int index, int *value, char *name, + int *name_len); + + /* + * Tool MPI API + */ +OMPI_DECLSPEC int MPI_T_init_thread (int required, int *provided); +OMPI_DECLSPEC int MPI_T_finalize (void); +OMPI_DECLSPEC int MPI_T_cvar_get_num (int *num_cvar); +OMPI_DECLSPEC int MPI_T_cvar_get_info (int cvar_index, char *name, int *name_len, + int *verbosity, MPI_Datatype *datatype, + MPI_T_enum *enumtype, char *desc, + int *desc_len, int *bind, int *scope); +OMPI_DECLSPEC int MPI_T_cvar_get_index (const char *name, int *cvar_index); +OMPI_DECLSPEC int MPI_T_cvar_handle_alloc (int cvar_index, void *obj_handle, + MPI_T_cvar_handle *handle, int *count); +OMPI_DECLSPEC int MPI_T_cvar_handle_free (MPI_T_cvar_handle *handle); +OMPI_DECLSPEC int MPI_T_cvar_read (MPI_T_cvar_handle handle, void *buf); +OMPI_DECLSPEC int MPI_T_cvar_write (MPI_T_cvar_handle handle, const void *buf); +OMPI_DECLSPEC int MPI_T_category_get_num(int *num_cat); +OMPI_DECLSPEC int MPI_T_category_get_info(int cat_index, char *name, int *name_len, + char *desc, int *desc_len, int *num_cvars, + int *num_pvars, int *num_categories); +OMPI_DECLSPEC int MPI_T_category_get_index (const char *name, int *category_index); +OMPI_DECLSPEC int MPI_T_category_get_cvars(int cat_index, int len, int indices[]); +OMPI_DECLSPEC int MPI_T_category_get_pvars(int cat_index, int len, int indices[]); +OMPI_DECLSPEC int MPI_T_category_get_categories(int cat_index, int len, int indices[]); +OMPI_DECLSPEC int MPI_T_category_changed(int *stamp); + +OMPI_DECLSPEC int MPI_T_pvar_get_num(int *num_pvar); +OMPI_DECLSPEC int MPI_T_pvar_get_info(int pvar_index, char *name, int *name_len, + int *verbosity, int *var_class, MPI_Datatype *datatype, + MPI_T_enum *enumtype, char *desc, int *desc_len, int *bind, + int *readonly, int *continuous, int *atomic); +OMPI_DECLSPEC int MPI_T_pvar_get_index (const char *name, int var_class, int *pvar_index); +OMPI_DECLSPEC int MPI_T_pvar_session_create(MPI_T_pvar_session *session); +OMPI_DECLSPEC int MPI_T_pvar_session_free(MPI_T_pvar_session *session); +OMPI_DECLSPEC int MPI_T_pvar_handle_alloc(MPI_T_pvar_session session, int pvar_index, + void *obj_handle, MPI_T_pvar_handle *handle, int *count); +OMPI_DECLSPEC int MPI_T_pvar_handle_free(MPI_T_pvar_session session, MPI_T_pvar_handle *handle); +OMPI_DECLSPEC int MPI_T_pvar_start(MPI_T_pvar_session session, MPI_T_pvar_handle handle); +OMPI_DECLSPEC int MPI_T_pvar_stop(MPI_T_pvar_session session, MPI_T_pvar_handle handle); +OMPI_DECLSPEC int MPI_T_pvar_read(MPI_T_pvar_session session, MPI_T_pvar_handle handle, + void *buf); +OMPI_DECLSPEC int MPI_T_pvar_write(MPI_T_pvar_session session, MPI_T_pvar_handle handle, + const void *buf); +OMPI_DECLSPEC int MPI_T_pvar_reset(MPI_T_pvar_session session, MPI_T_pvar_handle handle); +OMPI_DECLSPEC int MPI_T_pvar_readreset(MPI_T_pvar_session session, MPI_T_pvar_handle handle, + void *buf); +OMPI_DECLSPEC int MPI_T_enum_get_info(MPI_T_enum enumtype, int *num, char *name, int *name_len); +OMPI_DECLSPEC int MPI_T_enum_get_item(MPI_T_enum enumtype, int index, int *value, char *name, + int *name_len); +/* + * Deprecated prototypes. Usage is discouraged, as these may be + * deleted in future versions of the MPI Standard. 
+ */
+OMPI_DECLSPEC int MPI_Attr_delete(MPI_Comm comm, int keyval)
+    __mpi_interface_deprecated__("MPI_Attr_delete was deprecated in MPI-2.0; use MPI_Comm_delete_attr instead");
+OMPI_DECLSPEC int PMPI_Attr_delete(MPI_Comm comm, int keyval)
+    __mpi_interface_deprecated__("PMPI_Attr_delete was deprecated in MPI-2.0; use PMPI_Comm_delete_attr instead");
+OMPI_DECLSPEC int MPI_Attr_get(MPI_Comm comm, int keyval, void *attribute_val, int *flag)
+    __mpi_interface_deprecated__("MPI_Attr_get was deprecated in MPI-2.0; use MPI_Comm_get_attr instead");
+OMPI_DECLSPEC int PMPI_Attr_get(MPI_Comm comm, int keyval, void *attribute_val, int *flag)
+    __mpi_interface_deprecated__("PMPI_Attr_get was deprecated in MPI-2.0; use PMPI_Comm_get_attr instead");
+OMPI_DECLSPEC int MPI_Attr_put(MPI_Comm comm, int keyval, void *attribute_val)
+    __mpi_interface_deprecated__("MPI_Attr_put was deprecated in MPI-2.0; use MPI_Comm_set_attr instead");
+OMPI_DECLSPEC int PMPI_Attr_put(MPI_Comm comm, int keyval, void *attribute_val)
+    __mpi_interface_deprecated__("PMPI_Attr_put was deprecated in MPI-2.0; use PMPI_Comm_set_attr instead");
+
+/*
+ * Even though MPI_Copy_function and MPI_Delete_function are
+ * deprecated, we do not use the attributes marking them as such,
+ * because otherwise the compiler will warn for all the functions that
+ * are declared using them (e.g., MPI_Keyval_create).
+ */
+typedef int (MPI_Copy_function)(MPI_Comm, int, void *,
+                                void *, void *, int *);
+/* MPI_Copy_function was deprecated in MPI-2.0; use MPI_Comm_copy_attr_function instead. */
+typedef int (MPI_Delete_function)(MPI_Comm, int, void *, void *);
+/* MPI_Delete_function was deprecated in MPI-2.0; use MPI_Comm_delete_attr_function instead. */
+OMPI_DECLSPEC int MPI_Keyval_create(MPI_Copy_function *copy_fn,
+                                    MPI_Delete_function *delete_fn,
+                                    int *keyval, void *extra_state)
+    __mpi_interface_deprecated__("MPI_Keyval_create was deprecated in MPI-2.0; use MPI_Comm_create_keyval instead.");
+OMPI_DECLSPEC int PMPI_Keyval_create(MPI_Copy_function *copy_fn,
+                                     MPI_Delete_function *delete_fn,
+                                     int *keyval, void *extra_state)
+    __mpi_interface_deprecated__("PMPI_Keyval_create was deprecated in MPI-2.0; use PMPI_Comm_create_keyval instead.");
+OMPI_DECLSPEC int MPI_Keyval_free(int *keyval)
+    __mpi_interface_deprecated__("MPI_Keyval_free was deprecated in MPI-2.0; use MPI_Comm_free_keyval instead.");
+OMPI_DECLSPEC int PMPI_Keyval_free(int *keyval)
+    __mpi_interface_deprecated__("PMPI_Keyval_free was deprecated in MPI-2.0; use PMPI_Comm_free_keyval instead.");
+
+#if !defined(OMPI_COMPILING_FORTRAN_WRAPPERS)
+#define MPI_DUP_FN OMPI_C_MPI_DUP_FN
+#endif
+OMPI_DECLSPEC int OMPI_C_MPI_DUP_FN( MPI_Comm comm, int comm_keyval,
+                                     void* extra_state,
+                                     void* attribute_val_in,
+                                     void* attribute_val_out,
+                                     int* flag )
+    __mpi_interface_deprecated__("MPI_DUP_FN was deprecated in MPI-2.0; use MPI_COMM_DUP_FN instead.");
+
+#if !defined(OMPI_COMPILING_FORTRAN_WRAPPERS)
+#define MPI_NULL_COPY_FN OMPI_C_MPI_NULL_COPY_FN
+#endif
+OMPI_DECLSPEC int OMPI_C_MPI_NULL_COPY_FN( MPI_Comm comm, int comm_keyval,
+                                           void* extra_state,
+                                           void* attribute_val_in,
+                                           void* attribute_val_out,
+                                           int* flag )
+    __mpi_interface_deprecated__("MPI_NULL_COPY_FN was deprecated in MPI-2.0; use MPI_COMM_NULL_COPY_FN instead.");
+
+#if !defined(OMPI_COMPILING_FORTRAN_WRAPPERS)
+#define MPI_NULL_DELETE_FN OMPI_C_MPI_NULL_DELETE_FN
+#endif
+OMPI_DECLSPEC int OMPI_C_MPI_NULL_DELETE_FN( MPI_Comm comm, int comm_keyval,
+                                             void* attribute_val_out,
+                                             void* extra_state )
+    
__mpi_interface_deprecated__("MPI_NULL_DELETE_FN was deprecated in MPI-2.0; use MPI_COMM_NULL_DELETE_FN instead.");
+
+#if (!OMPI_OMIT_MPI1_COMPAT_DECLS)
+/*
+ * Removed typedefs. These typedefs are only available if Open MPI
+ * was configured with --enable-mpi1-compatibility.
+ *
+ * These typedefs were formally removed from the MPI specification
+ * and should no longer be used in MPI applications.
+ *
+ * Even though MPI_Handler_function is removed, we do not use the
+ * attributes marking it as such, because otherwise the compiler
+ * will warn for all the functions that are declared using them
+ * (e.g., MPI_Errhandler_create).
+ */
+typedef void (MPI_Handler_function)(MPI_Comm *, int *, ...);
+/* MPI_Handler_function was removed in MPI-3.0; use MPI_Comm_errhandler_function instead. */
+
+/*
+ * Removed prototypes. These prototypes are only available if Open
+ * MPI was configured with --enable-mpi1-compatibility.
+ *
+ * These functions were formally removed from the MPI specification
+ * and should no longer be used in MPI applications.
+ */
+OMPI_DECLSPEC int MPI_Address(void *location, MPI_Aint *address)
+    __mpi_interface_removed__(MPI_Address, MPI_Get_address);
+OMPI_DECLSPEC int PMPI_Address(void *location, MPI_Aint *address)
+    __mpi_interface_removed__(PMPI_Address, PMPI_Get_address);
+OMPI_DECLSPEC int MPI_Errhandler_create(MPI_Handler_function *function,
+                                        MPI_Errhandler *errhandler)
+    __mpi_interface_removed__(MPI_Errhandler_create, MPI_Comm_create_errhandler);
+OMPI_DECLSPEC int PMPI_Errhandler_create(MPI_Handler_function *function,
+                                         MPI_Errhandler *errhandler)
+    __mpi_interface_removed__(PMPI_Errhandler_create, PMPI_Comm_create_errhandler);
+OMPI_DECLSPEC int MPI_Errhandler_get(MPI_Comm comm, MPI_Errhandler *errhandler)
+    __mpi_interface_removed__(MPI_Errhandler_get, MPI_Comm_get_errhandler);
+OMPI_DECLSPEC int PMPI_Errhandler_get(MPI_Comm comm, MPI_Errhandler *errhandler)
+    __mpi_interface_removed__(PMPI_Errhandler_get, PMPI_Comm_get_errhandler);
+OMPI_DECLSPEC int MPI_Errhandler_set(MPI_Comm comm, MPI_Errhandler errhandler)
+    __mpi_interface_removed__(MPI_Errhandler_set, MPI_Comm_set_errhandler);
+OMPI_DECLSPEC int PMPI_Errhandler_set(MPI_Comm comm, MPI_Errhandler errhandler)
+    __mpi_interface_removed__(PMPI_Errhandler_set, PMPI_Comm_set_errhandler);
+OMPI_DECLSPEC int MPI_Type_extent(MPI_Datatype type, MPI_Aint *extent)
+    __mpi_interface_removed__(MPI_Type_extent, MPI_Type_get_extent);
+OMPI_DECLSPEC int PMPI_Type_extent(MPI_Datatype type, MPI_Aint *extent)
+    __mpi_interface_removed__(PMPI_Type_extent, PMPI_Type_get_extent);
+OMPI_DECLSPEC int MPI_Type_hindexed(int count, int array_of_blocklengths[],
+                                    MPI_Aint array_of_displacements[],
+                                    MPI_Datatype oldtype, MPI_Datatype *newtype)
+    __mpi_interface_removed__(MPI_Type_hindexed, MPI_Type_create_hindexed);
+OMPI_DECLSPEC int PMPI_Type_hindexed(int count, int array_of_blocklengths[],
+                                     MPI_Aint array_of_displacements[],
+                                     MPI_Datatype oldtype, MPI_Datatype *newtype)
+    __mpi_interface_removed__(PMPI_Type_hindexed, PMPI_Type_create_hindexed);
+OMPI_DECLSPEC int MPI_Type_hvector(int count, int blocklength, MPI_Aint stride,
+                                   MPI_Datatype oldtype, MPI_Datatype *newtype)
+    __mpi_interface_removed__(MPI_Type_hvector, MPI_Type_create_hvector);
+OMPI_DECLSPEC int PMPI_Type_hvector(int count, int blocklength, MPI_Aint stride,
+                                    MPI_Datatype oldtype, MPI_Datatype *newtype)
+    __mpi_interface_removed__(PMPI_Type_hvector, PMPI_Type_create_hvector);
+OMPI_DECLSPEC int MPI_Type_lb(MPI_Datatype type, MPI_Aint *lb)
+    
__mpi_interface_removed__(MPI_Type_lb, MPI_Type_get_extent);
+OMPI_DECLSPEC int PMPI_Type_lb(MPI_Datatype type, MPI_Aint *lb)
+    __mpi_interface_removed__(PMPI_Type_lb, PMPI_Type_get_extent);
+OMPI_DECLSPEC int MPI_Type_struct(int count, int array_of_blocklengths[],
+                                  MPI_Aint array_of_displacements[],
+                                  MPI_Datatype array_of_types[],
+                                  MPI_Datatype *newtype)
+    __mpi_interface_removed__(MPI_Type_struct, MPI_Type_create_struct);
+OMPI_DECLSPEC int PMPI_Type_struct(int count, int array_of_blocklengths[],
+                                   MPI_Aint array_of_displacements[],
+                                   MPI_Datatype array_of_types[],
+                                   MPI_Datatype *newtype)
+    __mpi_interface_removed__(PMPI_Type_struct, PMPI_Type_create_struct);
+OMPI_DECLSPEC int MPI_Type_ub(MPI_Datatype mtype, MPI_Aint *ub)
+    __mpi_interface_removed__(MPI_Type_ub, MPI_Type_get_extent);
+OMPI_DECLSPEC int PMPI_Type_ub(MPI_Datatype mtype, MPI_Aint *ub)
+    __mpi_interface_removed__(PMPI_Type_ub, PMPI_Type_get_extent);
+#endif /* !OMPI_OMIT_MPI1_COMPAT_DECLS */
+
+#if OMPI_REMOVED_USE_STATIC_ASSERT
+#define MPI_Address(...) THIS_FUNCTION_WAS_REMOVED_IN_MPI30(MPI_Address, MPI_Get_address)
+#define MPI_Errhandler_create(...) THIS_FUNCTION_WAS_REMOVED_IN_MPI30(MPI_Errhandler_create, MPI_Comm_create_errhandler)
+#define MPI_Errhandler_get(...) THIS_FUNCTION_WAS_REMOVED_IN_MPI30(MPI_Errhandler_get, MPI_Comm_get_errhandler)
+#define MPI_Errhandler_set(...) THIS_FUNCTION_WAS_REMOVED_IN_MPI30(MPI_Errhandler_set, MPI_Comm_set_errhandler)
+#define MPI_Type_extent(...) THIS_FUNCTION_WAS_REMOVED_IN_MPI30(MPI_Type_extent, MPI_Type_get_extent)
+#define MPI_Type_hindexed(...) THIS_FUNCTION_WAS_REMOVED_IN_MPI30(MPI_Type_hindexed, MPI_Type_create_hindexed)
+#define MPI_Type_hvector(...) THIS_FUNCTION_WAS_REMOVED_IN_MPI30(MPI_Type_hvector, MPI_Type_create_hvector)
+#define MPI_Type_lb(...) THIS_FUNCTION_WAS_REMOVED_IN_MPI30(MPI_Type_lb, MPI_Type_get_extent)
+#define MPI_Type_struct(...) THIS_FUNCTION_WAS_REMOVED_IN_MPI30(MPI_Type_struct, MPI_Type_create_struct)
+#define MPI_Type_ub(...) THIS_FUNCTION_WAS_REMOVED_IN_MPI30(MPI_Type_ub, MPI_Type_get_extent)
+#endif
+
+#if defined(c_plusplus) || defined(__cplusplus)
+}
+#endif
+
+#endif /* OMPI_MPI_H */
diff --git a/base/serial/impl/sp3mm4amg/commons/ompi_dims_create/ompi_config_minimal.h b/base/serial/impl/sp3mm4amg/commons/ompi_dims_create/ompi_config_minimal.h
new file mode 100644
index 00000000..88e4ccb9
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/commons/ompi_dims_create/ompi_config_minimal.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2021 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2007-2021 Cisco Systems, Inc. All rights reserved
+ * Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
+ * Copyright (c) 2011-2020 Sandia National Laboratories. All rights reserved.
+ * Copyright (c) 2012-2018 Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2011-2013 INRIA. All rights reserved.
+ * Copyright (c) 2015      University of Houston. All rights reserved. 
+ * Copyright (c) 2015-2021 Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2017-2019 IBM Corporation. All rights reserved.
+ * Copyright (c) 2018      FUJITSU LIMITED. All rights reserved.
+ * Copyright (c) 2021-2022 Google, LLC. All rights reserved.
+ * Copyright (c) 2021-2022 Amazon.com, Inc. or its affiliates. All Rights
+ *                         reserved.
+ * Copyright (c) 2021      Bull S.A.S. All rights reserved.
+ * Copyright (c) 2018-2021 Triad National Security, LLC. All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/*
+ * Error classes and codes
+ * Do not change the values of these without also modifying mpif.h.in.
+ */
+#define MPI_SUCCESS                   0
+#define MPI_ERR_BUFFER                1
+#define MPI_ERR_COUNT                 2
+#define MPI_ERR_TYPE                  3
+#define MPI_ERR_TAG                   4
+#define MPI_ERR_COMM                  5
+#define MPI_ERR_RANK                  6
+#define MPI_ERR_REQUEST               7
+#define MPI_ERR_ROOT                  8
+#define MPI_ERR_GROUP                 9
+#define MPI_ERR_OP                    10
+#define MPI_ERR_TOPOLOGY              11
+#define MPI_ERR_DIMS                  12
+#define MPI_ERR_ARG                   13
+#define MPI_ERR_UNKNOWN               14
+#define MPI_ERR_TRUNCATE              15
+#define MPI_ERR_OTHER                 16
+#define MPI_ERR_INTERN                17
+#define MPI_ERR_IN_STATUS             18
+#define MPI_ERR_PENDING               19
+#define MPI_ERR_ACCESS                20
+#define MPI_ERR_AMODE                 21
+#define MPI_ERR_ASSERT                22
+#define MPI_ERR_BAD_FILE              23
+#define MPI_ERR_BASE                  24
+#define MPI_ERR_CONVERSION            25
+#define MPI_ERR_DISP                  26
+#define MPI_ERR_DUP_DATAREP           27
+#define MPI_ERR_FILE_EXISTS           28
+#define MPI_ERR_FILE_IN_USE           29
+#define MPI_ERR_FILE                  30
+#define MPI_ERR_INFO_KEY              31
+#define MPI_ERR_INFO_NOKEY            32
+#define MPI_ERR_INFO_VALUE            33
+#define MPI_ERR_INFO                  34
+#define MPI_ERR_IO                    35
+#define MPI_ERR_KEYVAL                36
+#define MPI_ERR_LOCKTYPE              37
+#define MPI_ERR_NAME                  38
+#define MPI_ERR_NO_MEM                39
+#define MPI_ERR_NOT_SAME              40
+#define MPI_ERR_NO_SPACE              41
+#define MPI_ERR_NO_SUCH_FILE          42
+#define MPI_ERR_PORT                  43
+#define MPI_ERR_QUOTA                 44
+#define MPI_ERR_READ_ONLY             45
+#define MPI_ERR_RMA_CONFLICT          46
+#define MPI_ERR_RMA_SYNC              47
+#define MPI_ERR_SERVICE               48
+#define MPI_ERR_SIZE                  49
+#define MPI_ERR_SPAWN                 50
+#define MPI_ERR_UNSUPPORTED_DATAREP   51
+#define MPI_ERR_UNSUPPORTED_OPERATION 52
+#define MPI_ERR_WIN                   53
+
+#define MPI_T_ERR_MEMORY              54
+#define MPI_T_ERR_NOT_INITIALIZED     55
+#define MPI_T_ERR_CANNOT_INIT         56
+#define MPI_T_ERR_INVALID_INDEX       57
+#define MPI_T_ERR_INVALID_ITEM        58
+#define MPI_T_ERR_INVALID_HANDLE      59
+#define MPI_T_ERR_OUT_OF_HANDLES      60
+#define MPI_T_ERR_OUT_OF_SESSIONS     61
+#define MPI_T_ERR_INVALID_SESSION     62
+#define MPI_T_ERR_CVAR_SET_NOT_NOW    63
+#define MPI_T_ERR_CVAR_SET_NEVER      64
+#define MPI_T_ERR_PVAR_NO_STARTSTOP   65
+#define MPI_T_ERR_PVAR_NO_WRITE       66
+#define MPI_T_ERR_PVAR_NO_ATOMIC      67
+#define MPI_ERR_RMA_RANGE             68
+#define MPI_ERR_RMA_ATTACH            69
+#define MPI_ERR_RMA_FLAVOR            70
+#define MPI_ERR_RMA_SHARED            71
+#define MPI_T_ERR_INVALID             72
+#define MPI_T_ERR_INVALID_NAME        73
+#define MPI_ERR_PROC_ABORTED          74
+
+/* not #if conditional on OPAL_ENABLE_FT_MPI for ABI */
+#define MPI_ERR_PROC_FAILED           75
+#define MPI_ERR_PROC_FAILED_PENDING   76
+#define MPI_ERR_REVOKED               77
+
+/* Per MPI-3 p349 47, MPI_ERR_LASTCODE must be >= the last predefined
+   MPI_ERR_ code. Set the last code to allow some room for adding
+   error codes without breaking ABI. */
+#define MPI_ERR_LASTCODE              92
+
+/*
+ * Comparison results. Don't change the order of these, the group
+ * comparison functions rely on it. 
+ * Do not change the order of these without also modifying mpif.h.in.
+ */
+enum {
+  MPI_IDENT,
+  MPI_CONGRUENT,
+  MPI_SIMILAR,
+  MPI_UNEQUAL
+};
+
+/*
+ * MPI_Init_thread constants
+ * Do not change the order of these without also modifying mpif.h.in.
+ */
+enum {
+  MPI_THREAD_SINGLE,
+  MPI_THREAD_FUNNELED,
+  MPI_THREAD_SERIALIZED,
+  MPI_THREAD_MULTIPLE
+};
+
+/*
+ * Datatype combiners.
+ * Do not change the order of these without also modifying mpif.h.in.
+ * (see also mpif-common.h.fin).
+ */
+enum {
+  MPI_COMBINER_NAMED,
+  MPI_COMBINER_DUP,
+  MPI_COMBINER_CONTIGUOUS,
+  MPI_COMBINER_VECTOR,
+#if (!OMPI_OMIT_MPI1_COMPAT_DECLS)
+  MPI_COMBINER_HVECTOR_INTEGER,
+#else
+  OMPI_WAS_MPI_COMBINER_HVECTOR_INTEGER, /* preserve ABI compatibility */
+#endif
+  MPI_COMBINER_HVECTOR,
+  MPI_COMBINER_INDEXED,
+#if (!OMPI_OMIT_MPI1_COMPAT_DECLS)
+  MPI_COMBINER_HINDEXED_INTEGER,
+#else
+  OMPI_WAS_MPI_COMBINER_HINDEXED_INTEGER, /* preserve ABI compatibility */
+#endif
+  MPI_COMBINER_HINDEXED,
+  MPI_COMBINER_INDEXED_BLOCK,
+#if (!OMPI_OMIT_MPI1_COMPAT_DECLS)
+  MPI_COMBINER_STRUCT_INTEGER,
+#else
+  OMPI_WAS_MPI_COMBINER_STRUCT_INTEGER, /* preserve ABI compatibility */
+#endif
+  MPI_COMBINER_STRUCT,
+  MPI_COMBINER_SUBARRAY,
+  MPI_COMBINER_DARRAY,
+  MPI_COMBINER_F90_REAL,
+  MPI_COMBINER_F90_COMPLEX,
+  MPI_COMBINER_F90_INTEGER,
+  MPI_COMBINER_RESIZED,
+  MPI_COMBINER_HINDEXED_BLOCK
+};
+
+// MOCK ERRHANDLERS
+#define OMPI_ERRHANDLER_INVOKE(comm,err,fname) err
+#define OMPI_ERRHANDLER_NOHANDLE_INVOKE(err,fname) err
+
diff --git a/base/serial/impl/sp3mm4amg/commons/ompi_dims_create/ompi_dims_create.c b/base/serial/impl/sp3mm4amg/commons/ompi_dims_create/ompi_dims_create.c
new file mode 100644
index 00000000..7353a4fb
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/commons/ompi_dims_create/ompi_dims_create.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation. All rights reserved.
+ * Copyright (c) 2004-2020 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2004-2014 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart. All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2012      Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2014      Intel, Inc. All rights reserved
+ * Copyright (c) 2015      Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2015-2016 Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ * extracted from Open MPI, mocking the dependencies that are not needed, in order to export the MPI_Dims_create function:
+ * Copyright (c) 2022 Andrea Di Iorio
+ */
+
+//#include "ompi_config.h"
+#include "ompi_config_minimal.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <assert.h>
+
+//#include "ompi/mpi/c/bindings.h"
+//#include "ompi/runtime/params.h"
+//#include "ompi/communicator/communicator.h"
+//#include "ompi/errhandler/errhandler.h"
+
+
+static const char FUNC_NAME[] = "MPI_Dims_create";
+
+/* static functions */
+static int assignnodes(int ndim, int nfactor, int *pfacts, int **pdims);
+static int getfactors(int num, int *nfactors, int **factors);
+
+
+/*
+ * This is a utility function, no need to have anything in the lower
+ * layer for this at all
+ * ****
+ * original manpage ... 
+ * https://www.open-mpi.org/doc/v3.1/man3/MPI_Dims_create.3.php
+ * For Cartesian topologies, the function MPI_Dims_create helps the user select a balanced distribution of processes per coordinate direction,
+ * depending on the number of processes in the group to be balanced and optional constraints that can be specified by the user.
+ * One use is to partition all the processes (the size of MPI_COMM_WORLD's group) into an n-dimensional topology.
+ * The entries in the array dims are set to describe a Cartesian grid with ndims dimensions and a total of nnodes nodes.
+ * The dimensions are set to be as close to each other as possible, using an appropriate divisibility algorithm.
+ *
+ * The caller may further constrain the operation of this routine by specifying elements of array dims.
+ * If dims[i] is set to a positive number, the routine will not modify the number of nodes in dimension i; only those entries where dims[i] = 0 are modified by the call.
+ * Negative input values of dims[i] are erroneous. An error will occur if nnodes is not a multiple of the product of all fixed entries, i.e. prod_{i : dims[i] != 0} dims[i].
+ * For dims[i] set by the call, dims[i] will be ordered in nonincreasing order. Array dims is suitable for use as input to routine MPI_Cart_create. MPI_Dims_create is local.
+ * Example:
+ *
+ *    dims
+ *    before          dims
+ *    call        function call               on return
+ * -----------------------------------------------------
+ *    (0,0)    MPI_Dims_create(6, 2, dims)    (3,2)
+ *    (0,0)    MPI_Dims_create(7, 2, dims)    (7,1)
+ *    (0,3,0)  MPI_Dims_create(6, 3, dims)    (2,3,1)
+ *    (0,3,0)  MPI_Dims_create(7, 3, dims)    erroneous call
+ * ------------------------------------------------------
+ */
+int MPI_Dims_create(int nnodes, int ndims, int dims[])
+{
+    int i;
+    int freeprocs;
+    int freedims;
+    int nfactors;
+    int *factors;
+    int *procs;
+    int *p;
+    int err;
+
+    /*if (MPI_PARAM_CHECK) {
+        OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
+
+        if (0 > ndims) {
+            return OMPI_ERRHANDLER_INVOKE (MPI_COMM_WORLD,
+                                           MPI_ERR_DIMS, FUNC_NAME);
+        }
+
+        if ((0 != ndims) && (NULL == dims)) {
+            return OMPI_ERRHANDLER_INVOKE (MPI_COMM_WORLD,
+                                           MPI_ERR_ARG, FUNC_NAME);
+        }
+
+        if (1 > nnodes) {
+            return OMPI_ERRHANDLER_INVOKE (MPI_COMM_WORLD,
+                                           MPI_ERR_DIMS, FUNC_NAME);
+        }
+    }*/
+    assert( ndims > 0 && nnodes > 1 && NULL != dims );
+    /* Get # of free-to-be-assigned processes and # of free dimensions */
+    freeprocs = nnodes;
+    freedims = 0;
+    for (i = 0, p = dims; i < ndims; ++i,++p) {
+        if (*p == 0) {
+            ++freedims;
+        } else if ((*p < 0) || ((nnodes % *p) != 0)) {
+            return OMPI_ERRHANDLER_INVOKE (MPI_COMM_WORLD, MPI_ERR_DIMS,
+                                           FUNC_NAME);
+        } else {
+            freeprocs /= *p;
+        }
+    }
+
+    if (freedims == 0) {
+        if (freeprocs == 1) {
+            return MPI_SUCCESS;
+        }
+        return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_DIMS,
+                                               FUNC_NAME);
+    }
+
+    if (freeprocs == 1) {
+        for (i = 0; i < ndims; ++i, ++dims) {
+            if (*dims == 0) {
+                *dims = 1;
+            }
+        }
+        return MPI_SUCCESS;
+    }
+
+    /* Factor the number of free processes */
+    if (MPI_SUCCESS != (err = getfactors(freeprocs, &nfactors, &factors))) {
+        return OMPI_ERRHANDLER_NOHANDLE_INVOKE(err,
+                                               FUNC_NAME);
+    }
+
+    /* Assign free processes to free dimensions */
+    if (MPI_SUCCESS != (err = assignnodes(freedims, nfactors, factors, &procs))) {
+        free(factors);
+        return OMPI_ERRHANDLER_NOHANDLE_INVOKE(err,
+                                               FUNC_NAME);
+    }
+
+    /* Return assignment results */
+    p = procs;
+    for (i = 0; i < ndims; ++i, ++dims) {
+        if (*dims == 0) {
+            *dims = *p++;
+        }
+    }
+
+    free((char *) factors);
+    free((char *) procs);
+
+    /* all done */
+    return MPI_SUCCESS;
+}
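+
+/*
+ * Worked example of the two helpers below (illustrative note, not upstream
+ * text): MPI_Dims_create(12, 2, {0,0}):
+ *   getfactors(12) -> {2,2,3}  (prime factors, ascending)
+ *   assignnodes    -> bins start as (1,1); factors are taken largest first
+ *                     and multiplied into the currently smallest bin:
+ *                     3 -> (3,1), 2 -> (3,2), 2 -> (3,4)
+ *   sorted in nonincreasing order -> dims = (4,3)
+ */
+
+/*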
+ * assignnodes + * + * Function: - assign processes to dimensions + * - get "best-balanced" grid + * - greedy bin-packing algorithm used + * - sort dimensions in decreasing order + * - dimensions array dynamically allocated + * Accepts: - # of dimensions + * - # of prime factors + * - array of prime factors + * - ptr to array of dimensions (returned value) + * Returns: - 0 or ERROR + */ +static int +assignnodes(int ndim, int nfactor, int *pfacts, int **pdims) +{ + int *bins; + int i, j; + int n; + int f; + int *p; + int *pmin; + + if (0 >= ndim) { + return MPI_ERR_DIMS; + } + + /* Allocate and initialize the bins */ + bins = (int *) malloc((unsigned) ndim * sizeof(int)); + if (NULL == bins) { + return MPI_ERR_NO_MEM; + } + *pdims = bins; + + for (i = 0, p = bins; i < ndim; ++i, ++p) { + *p = 1; + } + + /* Loop assigning factors from the highest to the lowest */ + for (j = nfactor - 1; j >= 0; --j) { + f = pfacts[j]; + /* Assign a factor to the smallest bin */ + pmin = bins; + for (i = 1, p = pmin + 1; i < ndim; ++i, ++p) { + if (*p < *pmin) { + pmin = p; + } + } + *pmin *= f; + } + + /* Sort dimensions in decreasing order (O(n^2) for now) */ + for (i = 0, pmin = bins; i < ndim - 1; ++i, ++pmin) { + for (j = i + 1, p = pmin + 1; j < ndim; ++j, ++p) { + if (*p > *pmin) { + n = *p; + *p = *pmin; + *pmin = n; + } + } + } + + return MPI_SUCCESS; +} + +/* + * getfactors + * + * Function: - factorize a number + * Accepts: - number + * - # prime factors + * - array of prime factors + * Returns: - MPI_SUCCESS or ERROR + */ +static int +getfactors(int num, int *nfactors, int **factors) { + int size; + int d; + int i; + int sqrtnum; + + if(num < 2) { + (*nfactors) = 0; + (*factors) = NULL; + return MPI_SUCCESS; + } + /* Allocate the array of prime factors which cannot exceed log_2(num) entries */ + sqrtnum = ceil(sqrt(num)); + size = ceil(log(num) / log(2)); + *factors = (int *) malloc((unsigned) size * sizeof(int)); + + i = 0; + /* determine all occurences of factor 2 */ + while((num % 2) == 0) { + num /= 2; + (*factors)[i++] = 2; + } + /* determine all occurences of uneven prime numbers up to sqrt(num) */ + d = 3; + for(d = 3; (num > 1) && (d <= sqrtnum); d += 2) { + while((num % d) == 0) { + num /= d; + (*factors)[i++] = d; + } + } + /* as we looped only up to sqrt(num) one factor > sqrt(num) may be left over */ + if(num != 1) { + (*factors)[i++] = num; + } + (*nfactors) = i; + return MPI_SUCCESS; +} + + +#ifdef TEST_MAIN +int main(int argc,char** argv){ + if (argc != 2) {fprintf(stderr,"usage: -> to fit in a 2D grid");return EXIT_FAILURE;} + int dims2[2] = {0,0}; + int n = atoi(argv[1]); + if (MPI_Dims_create(n,2,dims2)){return EXIT_FAILURE;} printf("%d 2D division:\t%d-%d\n",n,dims2[0],dims2[1]); +} +#endif //TEST_MAIN diff --git a/base/serial/impl/sp3mm4amg/commons/sparseUtilsGeneric.c b/base/serial/impl/sp3mm4amg/commons/sparseUtilsGeneric.c new file mode 100644 index 00000000..e3cea72b --- /dev/null +++ b/base/serial/impl/sp3mm4amg/commons/sparseUtilsGeneric.c @@ -0,0 +1,499 @@ +/* + * Sp3MM_for_AlgebraicMultiGrid + * (C) Copyright 2021-2022 + * Andrea Di Iorio + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions, and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
+ *    not be used to endorse or promote products derived from this
+ *    software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <omp.h>
+
+#include "macros.h"
+#include "sparseMatrix.h"
+#include "utils.h"
+
+/////////////// no offset --> SINGLE implementation functions
+#ifndef SPARSE_UTILS_C
+#define SPARSE_UTILS_C
+///DENSE accumulator utils
+/*
+ * alloc threads' aux arrays once and split them in threads' structures
+ * so free them once from the first thread struct, with the original pointers returned from the alloc
+ */
+ACC_DENSE* _initAccVectors_monoalloc(ulong num,ulong size){ //TODO PERF WITH NEXT
+	ACC_DENSE* out   = NULL;
+	double* vAll     = NULL;
+	ulong* vAllNzIdx = NULL;
+	if (!(out = calloc(num,sizeof(*out)))){
+		ERRPRINT("_initAccVectors aux struct alloc failed\n");
+		return NULL;
+	}
+	if (!(vAll = calloc(num*size,sizeof(*vAll)))) {
+		ERRPRINT("_initAccVectors aux dense vectors alloc failed\n");
+		goto err;
+	}
+	if (!(vAllNzIdx = calloc(num*size,sizeof(*vAllNzIdx)))) {
+		ERRPRINT("_initAccVectors aux dense vectors' idx alloc failed\n");
+		goto err;
+	}
+	for (ulong i=0; i<num; i++){
+		out[i].vLen   = size;
+		out[i].v      = vAll      + i*size;
+		out[i].nnzIdx = vAllNzIdx + i*size;
+		if (initSpVectIdxDenseAcc(size,&out[i].nnzIdxMap))	goto err;
+	}
+	return out;
+
+	err:
+	free(out);
+	free(vAll);
+	free(vAllNzIdx);
+	return NULL;
+}
+
+int _initAccVector(ACC_DENSE* v,ulong size){
+	v->vLen = size;
+	if (!(v->v = calloc(size,sizeof(*(v->v))))) {
+		ERRPRINT("_initAccVectors aux dense vector alloc failed\n");
+		return EXIT_FAILURE;
+	}
+	if (!(v->nnzIdx = calloc(size,sizeof(*(v->nnzIdx))))) {
+		ERRPRINT("_initAccVectors aux dense vector' idx alloc failed\n");
+		return EXIT_FAILURE;
+	}
+	if (initSpVectIdxDenseAcc(size, &v->nnzIdxMap)) return EXIT_FAILURE;
+
+	return EXIT_SUCCESS;
+}
+
+ACC_DENSE* _initAccVectors(ulong num,ulong size){
+	ACC_DENSE* out = NULL;
+	if (!(out = calloc(num,sizeof(*out)))){
+		ERRPRINT("_initAccVectors aux struct alloc failed\n");
+		return NULL;
+	}
+	for (ulong i=0; i<num; i++){
+		if (_initAccVector(out+i,size))	goto err;
+	}
+	return out;
+
+	err:
+	for (ulong i=0; i<num; i++){
+		free(out[i].v);
+		free(out[i].nnzIdx);
+	}
+	free(out);
+	return NULL;
+}
+
+int initSpVectIdxDenseAcc(idx_t idxMax, SPVECT_IDX_DENSE_MAP* vectIdxsMap){
+	vectIdxsMap->len = 0;
+	nnz_idxs_flags_t* idxMaps = &vectIdxsMap->idxsMap;
+	#if SPVECT_IDX_BITWISE == TRUE	//nnz presence flags as bitflags in limbs
+	vectIdxsMap->idxsMapN = INT_DIV_CEIL(idxMax, LIMB_SIZE_BIT);
+	#else							//nnz presence flags in a single array
+	vectIdxsMap->idxsMapN = idxMax;
+	#endif //SPVECT_IDX_BITWISE == TRUE
+	if (!(*idxMaps = calloc(vectIdxsMap->idxsMapN, sizeof(**idxMaps)))) {
+		ERRPRINT("initSpVectIdxDenseAcc\tidxMaps SPVECT_IDX_BITWISE calloc err\n");
+		return EXIT_FAILURE;
+	}
+	return EXIT_SUCCESS;
+}
+
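+/*
+ * Usage sketch of the dense accumulator above (illustrative only; A, B and j
+ * are assumed to be caller-provided, see scSparseRowMul_* in SpMMUtilsGeneric.h):
+ *	ACC_DENSE acc;
+ *	if (_initAccVector(&acc,B->N))	return EXIT_FAILURE;
+ *	//scatter-accumulate one nnz of A's row against the matching row of B
+ *	scSparseRowMul_0(A->AS[j], B, A->JA[j], &acc);
+ *	//...repeat over the nnz of A's row, then sparsify acc into C's row
+ */
+
+void checkOverallocPercent(ulong* forecastedSizes,spmat* AB){
+	for (ulong r=0,rSize,forecastedSize; r < AB->M; r++){
+		forecastedSize = forecastedSizes[r];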
+		#ifdef ROWLENS
+		rSize = AB->RL[r];
+		#else
+		rSize = AB->IRP[r+1] - AB->IRP[r];
+		#endif
+		DEBUGCHECKS{
+			if ( forecastedSize < rSize ){
+				ERRPRINT("BAD FORECASTING\n");
+				assert(forecastedSize >= rSize );
+			}
+		}
+		DEBUGPRINT
+		printf("extra forecastedSize of row: %lu\t=\t%lf %% \n",
+		  r,100*(forecastedSize-rSize) / (double) forecastedSize);
+	}
+	idx_t extraMatrix = forecastedSizes[AB->M] - AB->NZ;
+	printf("extra forecastedSize of the matrix: \t%lu\t = %lf %% \n",
+	  extraMatrix, 100*extraMatrix /(double) forecastedSizes[AB->M]);
+}
+
+int spmatDiff(spmat* A, spmat* B){
+	if (A->NZ != B->NZ){
+		ERRPRINT("NZ differ\n");
+		return EXIT_FAILURE;
+	}
+	if (memcmp(A->IRP,B->IRP,(A->M+1)*sizeof(*A->IRP))){ //IRP has M+1 entries, compare bytes
+		ERRPRINT("IRP differ\n");
+		return EXIT_FAILURE;
+	}
+	if (doubleVectorsDiff(A->AS,B->AS,A->NZ,NULL)){
+		ERRPRINT("AS DIFFER\n");
+		return EXIT_FAILURE;
+	}
+	if (memcmp(A->JA,B->JA,A->NZ*sizeof(*A->JA))){ //compare bytes, not elements
+		ERRPRINT("JA differ\n");
+		return EXIT_FAILURE;
+	}
+	return EXIT_SUCCESS;
+}
+
+double* CSRToDense(spmat* sparseMat){
+	double* denseMat;
+	ulong i,j,idxNZ,denseSize;
+	if (__builtin_umull_overflow(sparseMat->M,sparseMat->N,&denseSize)){
+		ERRPRINT("overflow in dense allocation\n");
+		return NULL;
+	}
+	if (!(denseMat = calloc(denseSize, sizeof(*denseMat)))){
+		ERRPRINT("dense matrix alloc failed\n");
+		return NULL;
+	}
+	for (i=0; i<sparseMat->M; i++){
+		for (idxNZ=sparseMat->IRP[i]; idxNZ<sparseMat->IRP[i+1]; ++idxNZ){
+			j = sparseMat->JA[idxNZ];
+			//converting sparse item into dense entry
+			denseMat[(ulong) IDX2D(i,j,sparseMat->N)] = sparseMat->AS[idxNZ];
+		}
+	}
+	return denseMat;
+}
+void printSparseMatrix(spmat* spMatrix,char justNZMarkers){
+	double* denseMat = CSRToDense(spMatrix);
+	if (!denseMat)	return;
+	printMatrix(denseMat,spMatrix->M,spMatrix->N,justNZMarkers);
+	free(denseMat);
+}
+
+
+
+static inline int _colsPartitioningUnifRanges_init(spmat* A,uint gridCols,
+  spmat** colParts,idx_t** colPartsLens){
+
+	spmat* colPart;
+	ulong _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols, *tmpJA;
+	///alloc/init partitions structures
+	if (!(*colParts = calloc(gridCols, sizeof(**colParts)))){
+		ERRPRINT("colsPartitioningUnifRanges\tcolumns partitions of A calloc fail\n");
+		return EXIT_FAILURE;
+	}
+	for (ulong i=0,colBlock; i<gridCols; i++){
+		colBlock = UNIF_REMINDER_DISTRI(i,_colBlock,_colBlockRem);
+		colPart  = *colParts + i;
+		if (allocSpMatrixInternal(A->M,colBlock,colPart)){
+			ERRPRINT("colsPartitioningUnifRanges\tallocSpMatrixInternal partition err\n");
+			return EXIT_FAILURE;
+		}
+		//TODO TODO overalloc A cols partitions NZ arrays, then realloc
+		if (!(colPart->AS = malloc(A->NZ * sizeof(*A->AS)))){
+			ERRPRINT("colPart of A overalloc of AS errd\n");
+			return EXIT_FAILURE;
+		}
+		if (!(colPart->JA = malloc(A->NZ * sizeof(*A->JA)))){
+			ERRPRINT("colPart of A overalloc of JA errd\n");
+			return EXIT_FAILURE;
+		}
+	}
+	//for each A col partition -> last copied nz index = nnz copied amount
+	if (! (*colPartsLens = calloc(gridCols, sizeof(**colPartsLens))) ) {
+		ERRPRINT("colsPartitioningUnifRanges: colPartsLens calloc errd\n");
+		return EXIT_FAILURE;
+	}
+	return EXIT_SUCCESS;
+}
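+
+/*
+ * CSR layout recap for the helpers above and below (illustrative note):
+ * A = | 5 0 0 |        AS  = {5,8,3,6}   //nnz values
+ *     | 0 8 3 |  --->  JA  = {0,1,2,1}   //their column indexes
+ *     | 0 6 0 |        IRP = {0,1,3,4}   //row r spans [IRP[r],IRP[r+1])
+ * (0-based; the OFF_F generic variants below also accept 1-based Fortran indexes)
+ */
+
+static inline int _colsPartitioningUnifRanges_finalRealloc(spmat* A,uint gridCols,
+  spmat* colParts,idx_t* colPartsLens){
+
+	spmat* colPart;
+	double* tmpAS;	idx_t* tmpJA;
+	//realloc overallcd A parts NZ arrays (TODO -> downsizing -> nofails?)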
+ for (ulong i=0,partLen; iAS,partLen*sizeof(*(colPart->AS))))){ + ERRPRINT("realloc overallocated cols partition AS array\n"); + return EXIT_FAILURE; + } + colPart->AS = tmpAS; + if (!(tmpJA = realloc(colPart->JA,partLen*sizeof(*(colPart->JA))))){ + ERRPRINT("realloc overallocated cols partition JA array\n"); + return EXIT_FAILURE; + } + colPart->JA = tmpJA; + colPart->NZ = partLen; + colPart->IRP[A->M] = partLen; + } + return EXIT_SUCCESS; +} +#endif + + +#ifndef OFF_F + #error generic implementation requires OFF_F defined +#endif +//////////////////////// CSR SPECIFIC -- TODO RENAME ////////////////// +///SPARSE MATRIX PARTITIONING +idx_t* CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols){ + ulong subRowsN = A->M * gridCols; + ulong _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols; + ulong* offsets = malloc( (subRowsN+1) * sizeof(*offsets) ); + if (!offsets) { + ERRPRINT("colsOffsetsPartitioningUnifRanges:\toffsets malloc errd\n"); + return NULL; + } + ///OFFSETS COMPUTE FOR COL GROUPS -> O( A.NZ ) + for (ulong r=0, j=0; rM; j=A->IRP[++r]-OFF_F){ + offsets[ IDX2D(r,0,gridCols) ] = j; //row's first gc start is costrained + //navigate column groups inside current row + for (ulong gc=1,gcStartCol; gcJA[j]-OFF_F; cIRP[r+1]-OFF_F; c=A->JA[++j]-OFF_F); + while (j < A->IRP[r+1]-OFF_F && A->JA[j]-OFF_F < gcStartCol) + j++; + offsets[ IDX2D(r,gc,gridCols) ] = j; //row's gc group startIdx + } + } + offsets[subRowsN] = A->NZ; //last row's partition end costrained + return offsets; +} + +spmat* CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F)(spmat* A,uint gridCols, + idx_t** colPartsOffsets) +{ + spmat *colParts = NULL, *colPart; + ulong _colBlock = A->N/gridCols, _colBlockRem = A->N%gridCols; + ulong *colPartsLens=NULL, *tmpJA; + double* tmpAS; + + ///alloc/init partitions structures + idx_t* colOffsets = NULL; + if (!(colOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(A,gridCols))) + goto _err; + if (_colsPartitioningUnifRanges_init(A,gridCols,&colParts,&colPartsLens)) + goto _err; + //OFFSET BASED COPY OF A.COL_GROUPS -> O( A.NZ ) + for (idx_t r=0,gcId=0; rM; r++){ + for (idx_t gc=0,gcStartIdx=0,gLen=0; gcIRP[r] = colPartsLens[gc]; //new line for the col partition + //actual copy of nnz entries to colPartitions + memcpy(colPart->AS+colPart->IRP[r], A->AS+gcStartIdx, gLen*sizeof(*A->AS)); + memcpy(colPart->JA+colPart->IRP[r], A->JA+gcStartIdx, gLen*sizeof(*A->JA)); + colPartsLens[gc] += gLen; + #ifdef ROWLENS + colPart->RL[r] = i; + #endif + } + } + + //realloc overallcd A parts NZ arrays + if(_colsPartitioningUnifRanges_finalRealloc(A,gridCols,colParts,colPartsLens)) + goto _err; + + free(colPartsLens); + if (colPartsOffsets) *colPartsOffsets = colOffsets; //save for the caller + else free(colOffsets); + + return colParts; + _err: + free(*colPartsOffsets); + for (ulong i=0; iN/gridCols, _colBlockRem = A->N%gridCols, *colPartsLens=NULL, *tmpJA; + double* tmpAS; + ///alloc/init partitions structures + if (_colsPartitioningUnifRanges_init(A,gridCols,&colParts,&colPartsLens)) + goto _err; + /* TODO + * Parallelize: 2for collapse OMP, gcEndCol -> startIdxize, ... 
+ * oppure wrappare cio in static inline + */ + for (ulong r=0, j=0; rM; j=A->IRP[++r]-OFF_F){ + //navigate column groups inside current row + for (ulong gc=0,gcEndCol=0,i; gc kept as they were originally, handled with shift in functions + colPart->IRP[r] = colPartsLens[gc]; + gcEndCol += UNIF_REMINDER_DISTRI(gc,_colBlock,_colBlockRem); + //goto next GroupCols,keeping A's nnz entries navigation ( index j+i ) + //for (ulong c=A->JA[j+i]-OFF_F; cIRP[r+1]-OFF_F; c=A->JA[j+ ++i]-OFF_F); + while ( j+i < A->IRP[r+1]-OFF_F && A->JA[j+i]-OFF_F < gcEndCol ) i++; + memcpy(colPart->AS+colPart->IRP[r], A->AS+j, i*sizeof(*A->AS)); + memcpy(colPart->JA+colPart->IRP[r], A->JA+j, i*sizeof(*A->JA)); + + colPartsLens[gc] += i; + #ifdef ROWLENS + colPart->RL[r] = i; + #endif + } + } + //realloc overallcd A parts NZ arrays + if(_colsPartitioningUnifRanges_finalRealloc(A,gridCols,colParts,colPartsLens)) + goto _err; + free(colPartsLens); + return colParts; + _err: + for (ulong i=0; iM*gridCols; partId++){ + forecast = forecastedSizes[partId]; + rLen = abColOffsets[partId+1] - abColOffsets[partId]; + DEBUGCHECKS assert(forecast >= rLen); + } + idx_t extraMatrix = forecastedSizes[AB->M] - AB->NZ; + printf("extra forecastedSize of the matrix: \t%lu\t = %lf %% \n", + extraMatrix, 100*extraMatrix /(double) forecastedSizes[AB->M]); + + free(abColOffsets); + +} + + +#ifdef SPARSEUTILS_MAIN_TEST ///unit test embbeded + +///inline export here +//SPMV_CHUNKS_DISTR spmvChunksFair; +spmat* allocSpMatrix(ulong rows, ulong cols); +int allocSpMatrixInternal(ulong rows, ulong cols, spmat* mat); +void freeSpmatInternal(spmat* mat); +void freeSpmat(spmat* mat); + +////INTERNAL TEST FUNCTIONS +//test that each row's partition from colsOffsetsPartitioningUnifRanges is in the correct index range +#include +int testColsOffsetsPartitioningUnifRanges(spmat* mat,ulong gridCols,ulong* partsOffs){ + ulong _colBlock = mat->N/gridCols, _colBlockRem = mat->N%gridCols; + ulong j=0; //CSR scanning nnz idx + ulong* colPartsPopulations = alloca(gridCols * sizeof(*colPartsPopulations)); + memset(colPartsPopulations,0,gridCols * sizeof(*colPartsPopulations)); + for (ulong r=0,pId=0; rM; r++){ + for (ulong gc=0,pStartIdx,pEndIdx; gcJA[idx]; + assert(j == idx); //consecutive index in partitioning + assert(pStartIdx <= c && c <= pEndIdx); //colRange + assert(mat->IRP[r] <= idx && idx <= mat->IRP[r+1] ); //rowRange + } + colPartsPopulations[gc] += partsOffs[pId+1] - partsOffs[pId]; + } + } + assert(j == mat->NZ); + ulong s=0; + for (ulong gc=0,partSize; gc < gridCols; gc++,s+=partSize){ + partSize = colPartsPopulations[gc]; + double partShare=partSize/(double)mat->NZ,partsAvg=1/(double)gridCols; + double partShareAvgDiff = partShare - partsAvg; + printf("colPartition %lu has:\t%lu = %lf of NNZ\t\t .. 
%lf\tAVG diff\n", + gc,partSize,partShare,partShareAvgDiff); + } + assert(s == mat->NZ); //TODO DUPLICATED + return EXIT_SUCCESS; +} + +CONFIG Conf = { + .gridRows = 8, + .gridCols = 8, +}; + +#include "parser.h" +int main(int argc, char** argv){ + int out=EXIT_FAILURE; + if (init_urndfd()) return out; + if (argc < 2 ) {ERRPRINT("COO MATRIX FOR TEST"); return out;} + ////parse sparse matrix and dense vector + spmat* mat; + char* trgtMatrix = TMP_EXTRACTED_MARTIX; + if (extractInTmpFS(argv[1],TMP_EXTRACTED_MARTIX) < 0) + trgtMatrix = argv[1]; + if (!(mat = MMtoCSR(trgtMatrix))){ + ERRPRINT("err during conversion MM -> CSR\n"); + return out; + } + ////partitioning test + ulong* colsPartitions = colsOffsetsPartitioningUnifRanges_0(mat,Conf.gridCols); + if (!colsPartitions) goto _free; + if (testColsOffsetsPartitioningUnifRanges(mat,Conf.gridCols,colsPartitions)) + goto _free; + + out=EXIT_SUCCESS; + printf("testColsOffsetsPartitioningUnifRanges passed with " + "mat: %lux%lu-%luNNZ\tgrid: %dx%d\n", + mat->M,mat->N,mat->NZ,Conf.gridRows,Conf.gridCols); + _free: + if (colsPartitions) free(colsPartitions); + + return out; +} +#endif //SPARSEUTILS_MAIN_TEST diff --git a/base/serial/impl/sp3mm4amg/commons/sparseUtilsMulti.c b/base/serial/impl/sp3mm4amg/commons/sparseUtilsMulti.c new file mode 100644 index 00000000..c7ba2c84 --- /dev/null +++ b/base/serial/impl/sp3mm4amg/commons/sparseUtilsMulti.c @@ -0,0 +1,38 @@ +/* + * Sp3MM_for_AlgebraicMultiGrid + * (C) Copyright 2021-2022 + * Andrea Di Iorio + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include "sparseUtilsMulti.h" + +#define OFF_F 0 +#include "sparseUtilsGeneric.c" +#undef OFF_F + +#define OFF_F 1 +#include "sparseUtilsGeneric.c" +#undef OFF_F diff --git a/base/serial/impl/sp3mm4amg/commons/utils.c b/base/serial/impl/sp3mm4amg/commons/utils.c new file mode 100644 index 00000000..d969bf82 --- /dev/null +++ b/base/serial/impl/sp3mm4amg/commons/utils.c @@ -0,0 +1,502 @@ +/* + * Sp3MM_for_AlgebraicMultiGrid + * (C) Copyright 2021-2022 + * Andrea Di Iorio + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+//various aux functions
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+//#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <limits.h>
+#include <assert.h>
+#include <alloca.h>
+#include <math.h>
+
+#include "macros.h"
+#include "sparseMatrix.h"
+#include "utils.h"
+
+
+int urndFd; //will point to urandom device file
+
+///IO
+//UNBUFFERED IO
+int write_wrap(int fd,void* src,size_t count){
+	ssize_t wr;
+	size_t written=0;
+	while (written < count){
+		wr=write(fd,src+written,count-written);
+		if (wr<0){
+			perror("write");
+			return wr;
+		}
+		written+=wr;
+	}
+	return 0;
+}
+
+int read_wrap(int fd,void* dst,size_t count){
+	ssize_t rd;
+	size_t readed=0;
+	while (readed < count){
+		rd=read(fd,dst+readed,count-readed);
+		if (rd<0){
+			perror("read");
+			return rd;
+		}
+		readed+=rd;
+	}
+	return 0;
+}
+
+int readALL_wrap(int fd,char** dst,size_t* count){
+	ssize_t rd=!0; //to allow count > fsize
+	size_t readed=0;
+	char allocated=0; //flag if required *dst allocation
+	if (!(*dst)){ //allocate dst buffer of same size of file
+		off_t seekCurr=lseek(fd,0,SEEK_CUR);
+		off_t fsize=lseek(fd,0,SEEK_END);
+		if( seekCurr==-1 || fsize==-1 || lseek(fd,seekCurr,SEEK_SET)==-1){
+			perror("lseek");
+			return EXIT_FAILURE;
+		}
+		*count=fsize;
+		if (!(*dst=malloc(fsize))){
+			fprintf(stderr,"malloc read_wrap file size buf error\n");
+			return EXIT_FAILURE;
+		}
+		allocated=!0;
+	}
+	//read loop
+	while (readed < *count && rd > 0){
+		rd=read(fd,(*dst)+readed,*count-readed);
+		if (rd<0){
+			perror("read");
+			if (allocated) free(*dst);
+			return rd;
+		}
+		readed+=rd;
+	}
+	if (readed < *count) (*dst)[readed]='\0'; //TODO NEEDED?
+	return EXIT_SUCCESS;
+}
+int init_urndfd(){ // wrap init urndFd
+	if((urndFd=open(DRNG_DEVFILE,O_RDONLY))<0){
+		perror("open DRNG_DEVFILE");
+		return EXIT_FAILURE;
+	}
+	return EXIT_SUCCESS;
+}
+int createNewFile(char* const outFpath){
+	int mode=S_IRWXU;
+	errno = 0;
+	int outFd=open(outFpath, O_WRONLY | O_CREAT | O_TRUNC, mode);
+	if (errno==EEXIST)	outFd=open(outFpath, O_WRONLY | O_TRUNC, mode);
+	if (outFd<0)		perror("open outFd failed ");
+	return outFd;
+}
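+
+///example (illustrative only, not used by the library): round-trip a small
+///buffer through a file with the unbuffered wrappers above
+#ifdef UTILS_IO_EXAMPLE
+static int ioRoundTripExample(char* path){
+	double out[4] = {1,2,3,4}, in[4];
+	int fd = createNewFile(path);
+	if (fd < 0 || write_wrap(fd,out,sizeof(out)))	return EXIT_FAILURE;
+	if (close(fd))	return EXIT_FAILURE;
+	//createNewFile opens write-only: reopen the file for reading
+	if ((fd = open(path,O_RDONLY)) < 0 || read_wrap(fd,in,sizeof(in)))
+		return EXIT_FAILURE;
+	close(fd);
+	return memcmp(out,in,sizeof(out)) ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+#endif //UTILS_IO_EXAMPLE
+
+///BUFFERED IO
+//TODO series of 0 returns....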
fix +int fread_wrap(FILE* fp,void* dst,size_t count){ + int rd=0,toRead; + size_t readed=0; + while (readed < count){ + toRead = count - readed; + rd=fread(dst+readed,1,toRead,fp); + if (rd != toRead){ //TODO SHORT ITEM COUNT + if (feof(fp)) return EOF; + else if (ferror(fp)){ + ERRPRINT("fread_wrap errd\n"); + return -2; + } + //else ERRPRINT("W** SHORT ITEM COUNT RETURN MEANS!?"); //TODO OO + } + readed+=rd; + } + return rd; +} +///STRUCTURED DATA IO +int writeDoubleVector(char* fpath,double* v,ulong size){ + int fd,out=EXIT_FAILURE; + if ( (fd = createNewFile(fpath) ) < 0 ) goto _end; + write_wrap(fd,v,size * sizeof(*v)); //raw write of vector + out = EXIT_SUCCESS; + DEBUG printf("written double vector into %s RAW of %lu elements\n",fpath,size); + + _end: + if (close(fd) == EOF) perror("close errd\n"); + return out; +} + +///STRUCTURED DATA IO -- BUFFERED: FSCANF - FPRINTF +int writeDoubleVectorAsStr(char* fpath,double* v,ulong size){ + int out=EXIT_FAILURE; + FILE* fp = fopen(fpath,"w"); + if (!fp){ + perror("fopen vector file write"); + return EXIT_FAILURE; + } + for (ulong i=0; i= vectorSize ){ //realloc the array + vectorSize *= VECTOR_STEP_REALLOC; + if (!(tmp=realloc(out,vectorSize*sizeof(*out)))){ + ERRPRINTS("realloc errd to ~~ %lu MB\n",vectorSize >> 20); + goto _err; + } + out = tmp; + DEBUG printf("reallocd to ~~ %lu MB\n",vectorSize >> 20); + } + ulong toRead = MIN(VECTOR_READ_BLOCK,(vectorSize-i)); + if((rd = fread_wrap(fp,out + i,sizeof(*out)*toRead)) == -2){ + ERRPRINT("fread_wrap errd\n"); + goto _err; + } + if(feof(fp)) //TODO rd == EOF not work always != W*F??? + break; + i += rd/sizeof(out); + if( (hleft = rd % sizeof(out)) ){ + DEBUG hprintf("half left double read... rounding down fptr\n"); + if(fseek(fp,-hleft,SEEK_CUR)){ + perror("fseek in readDoubleVector"); + goto _err; + } + } + } + //REALLOC THE ARRAY TO THE FINAL SIZE + assert( i > 0 ); + if (!(tmp = realloc(out,*size*sizeof(*out)))){ + ERRPRINT("realloc errd\n"); + goto _err; + } + out = tmp; + DEBUG printf("readed vector from %s of %lu elements\n",fpath,*size); + goto _free; + + _err: + free(out); + out = NULL; + _free: + if (fclose(fp) == EOF) perror("fclose errd\n"); + return out; +} + +double* readDoubleVectorStr(char* fpath,ulong* size){ + int fscanfOut; + double *out,*tmp; + ulong i=0,vectorSize=RNDVECTORSIZE; + if (*size) vectorSize = *size; + FILE* fp = fopen(fpath,"r"); + if (!fp){ + perror("fopen vector file"); + return NULL; + } + if (!(out = malloc(vectorSize * sizeof(*out)))){ + ERRPRINT("vector read malloc fail for file\n"); + goto _err; + } + while (1){ + if (i >= vectorSize ){ //realloc the array + vectorSize *= VECTOR_STEP_REALLOC; + if (!(tmp=realloc(out,vectorSize*sizeof(*out)))){ + ERRPRINTS("realloc errd to ~~ %lu MB\n",vectorSize >> 20); + goto _err; + } + out = tmp; + DEBUG printf("reallocd to ~~ %lu MB\n",vectorSize >> 20); + } + fscanfOut = fscanf(fp,DOUBLE_STR_FORMAT,out + i++ ); + if ( fscanfOut == EOF && ferror(fp)){ + perror("invalid fscanf"); + goto _err; + } + if ( fscanfOut != 1 || fscanfOut == EOF ) break; //end of vector + } + //REALLOC THE ARRAY TO THE FINAL SIZE + assert( i > 0 ); + *size = --i; + if (!(tmp = realloc(out,*size*sizeof(*out)))){ + ERRPRINT("realloc errd\n"); + goto _err; + } + out = tmp; + DEBUG printf("readed vector from %s of %lu elements\n",fpath,*size); + goto _free; + + _err: + free(out); + out = NULL; + _free: + if (fclose(fp) == EOF) perror("fclose errd\n"); + return out; +} + +///CFG-AUX +int getConfig(CONFIG* conf){ + int changes=EXIT_FAILURE; + char 
*varVal,*ptr; + ulong val; + if ((varVal = getenv(GRID_ROWS))){ + val=strtoul(varVal,&ptr,10); + if (ptr==varVal || val>= UINT_MAX){ + perror("strtol errd"); + } else { + conf->gridRows = val; + } + changes = EXIT_SUCCESS; + } + if ((varVal = getenv(GRID_COLS))){ + val=strtoul(varVal,&ptr,10); + if (ptr==varVal || val>= UINT_MAX){ + perror("strtol errd"); + } else { + conf->gridCols = val; + } + changes = EXIT_SUCCESS; + } + return changes; +} + +/////LIB-SORTING -- WRAPPERS +//comparing functions +static inline int cmp_idx_t(const void* a, const void*b){ + idx_t aa=*((ulong*) a), bb = *((ulong*) b); + return aa==bb?0:aa>bb?1:-1; +} +static inline int cmpulong(const void* a, const void*b){ + ulong aa=*((ulong*) a), bb = *((ulong*) b); + return aa==bb?0:aa>bb?1:-1; +} +static inline int cmpuint(const void* a, const void*b){ + uint aa=*((uint*) a), bb = *((uint*) b); + return aa==bb?0:aa>bb?1:-1; +} +static inline int cmpRbNode(const void* a, const void* b){ + rbNode *aa=(rbNode*) a, *bb = (rbNode*) b; + return cmp_idx_t(&aa->key,&bb->key); +} +//sorting functions +void sort_idx_t(idx_t* arr, idx_t len){ + qsort(arr,len,sizeof(*arr),cmp_idx_t); +} +void sortulong(ulong* arr, ulong len){ + qsort(arr,len,sizeof(*arr),cmpulong); +} +void sortuint(uint* arr, uint len){ + qsort(arr,len,sizeof(*arr),cmpuint); +} +void sortRbNode(rbNode* arr, idx_t len){ + qsort(arr,len,sizeof(*arr),cmpRbNode); +} + +////Tests AUX +inline void assertArrNoRepetitions(idx_t* arrSorted, idx_t arrLen){ + if (arrLen > 0 ) return; + for (idx_t i=1,last=arrSorted[0]; i DOUBLE_DIFF_THREASH ){ + out = EXIT_FAILURE; + ERRPRINTS("DOUBLE VECTORS DIFF: DOUBLE_DIFF_THREASH=%lf\t<\t" + "|%13lg| = %lf %% of @a[%lu]\n", + DOUBLE_DIFF_THREASH,diff,100*diffAbs/ABS(a[i]),i); + #ifdef DOUBLE_VECT_DIFF_EARLY_EXIT + /*#pragma message("DOUBLE_VECT_DIFF_EARLY_EXIT: only first diff double reported")*/ + return EXIT_FAILURE; + #endif + } + if ( ABS(*diffMax) < diffAbs ) *diffMax = diff; + } + DEBUG{ + printf("\nchecked diff %s"CEND" between 2 double vector of %lu elements" + "\tmax diff: %le %s threash: %le\n", !out?CCC"OK":CCCERR"ERR!", + n,*diffMax,*diffMax=", + DOUBLE_DIFF_THREASH); + if (!*diffMax){ //self diff check uselss TODO REMOVE + if (!memcpy(a,b,n*sizeof(*a))) + printf("EXACT MATCHING AMONG THE 2 DOUBLE VECTORS\n!"); + } + } + return out; +} + +void printMatrix(double* mat,ulong m,ulong n,char justNZMarkers){ + printf("printing matrix: %lu x %lu\n",m,n); + ulong i,j; + for (i=0;i TRY REGULAR PATH + //search first 2 char after dot of ext in DECOMPRESS_CMDS to get the decompress cmd + if (strlen(ext) < 3 ){ + ERRPRINTS("NOT SUPPORTED DECOMPRESSION:\textension %s too short to be matched",ext); + return -1; + } + char extStart[3]; + extStart[0] = ext[1]; + extStart[1] = ext[2]; + extStart[2] = '\0'; + char* decompressCmdBase = searchPatternInStrs(extStart,DECOMPRESS_CMDS); + if (!decompressCmdBase){ + ERRPRINTS("NOT SUPPORTED DECOMPRESS FOR %s\n",ext); + return -1; + } + uint cmdLen = strlen(decompressCmdBase) + strlen(path) + 4 + strlen(tmpFsDecompressPath); + char* decompressCmd = alloca(cmdLen+1); + if (snprintf(decompressCmd,cmdLen+1,"%s %s > %s", + decompressCmdBase,path,tmpFsDecompressPath) < 0){ + ERRPRINT("extractInTmpFS, snprintf errd\n"); + } + VERBOSE printf("decompressing %s --> %s\ncmd:\t%s\n",path,tmpFsDecompressPath,decompressCmd); + return system(decompressCmd); +} diff --git a/base/serial/impl/sp3mm4amg/fbind/psb_f_spmm_ub.c b/base/serial/impl/sp3mm4amg/fbind/psb_f_spmm_ub.c new file mode 100644 index 
00000000..ae302346
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/fbind/psb_f_spmm_ub.c
@@ -0,0 +1,107 @@
+#include <stdlib.h>
+
+#include "../include/Sp3MM_CSR_OMP_Multi.h"
+#include "../include/utils.h"
+
+/**
+ * @brief performs the multiplication of two sparse matrices
+ * A and B stored in the CSR format. The resulting matrix is C
+ *
+ * @param[in] a_m number of rows of A
+ * @param[in] a_n number of columns of A
+ * @param[in] a_nz number of non zero elements of A
+ * @param[in] a_as array with the non zero coefficients of A
+ * @param[in] a_ja array with the column indices of the non
+ *                 zero coefficients of A
+ * @param[in] a_irp array with the indices of the beginning of
+ *                  each row in a_as and a_ja
+ * @param[in] a_rl array with the length of each row of A
+ * @param[in] a_max_row_nz maximum number of coefficients in a row of A
+ * @param[in] b_m number of rows of B
+ * @param[in] b_n number of columns of B
+ * @param[in] b_nz number of non zero elements of B
+ * @param[in] b_as array with the non zero coefficients of B
+ * @param[in] b_ja array with the column indices of the non
+ *                 zero coefficients of B
+ * @param[in] b_irp array with the indices of the beginning of
+ *                  each row in b_as and b_ja
+ * @param[in] b_rl array with the length of each row of B
+ * @param[in] b_max_row_nz maximum number of coefficients in a row of B
+ * @param[out] c_m number of rows of C
+ * @param[out] c_n number of columns of C
+ * @param[out] c_nz number of non zero elements in C
+ * @param[out] c_as array with the non zero coefficients of C
+ * @param[out] c_ja array with the column indices of the non
+ *                  zero coefficients of C
+ * @param[out] c_irp array with the indices of the beginning of
+ *                   each row in c_as and c_ja
+ * @param[out] c_rl array with the length of each row of C
+ * @param[out] c_max_row_nz maximum number of coefficients in a row of C
+ * @param[out] info return value to check if the operation was successful,
+ *                  passed by reference so the Fortran side can inspect it
+ */
+#ifdef ROWLENS
+void psb_f_spmm_row_by_row_ub_0(idx_t a_m, idx_t a_n, idx_t a_nz,
+				double *a_as, idx_t *a_ja,
+				idx_t *a_irp, idx_t *a_rl, idx_t a_max_row_nz,
+				idx_t b_m, idx_t b_n, idx_t b_nz,
+				double *b_as, idx_t *b_ja,
+				idx_t *b_irp, idx_t *b_rl, idx_t b_max_row_nz,
+				idx_t *c_m, idx_t *c_n, idx_t *c_nz,
+				double **c_as, idx_t **c_ja,
+				idx_t **c_irp, idx_t **c_rl, idx_t *c_max_row_nz,
+				int *info)
+#else
+void psb_f_spmm_row_by_row_ub_0(idx_t a_m, idx_t a_n, idx_t a_nz,
+				double *a_as, idx_t *a_ja,
+				idx_t *a_irp, idx_t a_max_row_nz,
+				idx_t b_m, idx_t b_n, idx_t b_nz,
+				double *b_as, idx_t *b_ja,
+				idx_t *b_irp, idx_t b_max_row_nz,
+				idx_t *c_m, idx_t *c_n, idx_t *c_nz,
+				double **c_as, idx_t **c_ja,
+				idx_t **c_irp, idx_t *c_max_row_nz,
+				int *info)
+#endif
+{
+	spmat a, b, *c;
+	CONFIG cfg = { .gridRows = 1, .gridCols = 1 }; //assumed sane defaults
+
+	// setting up cfg from the environment, if the related variables are set
+	// TODO : CHECK THAT THIS IS COMPATIBLE WITH PSB
+	getConfig(&cfg);
+
+	// wrapping the input arrays into spmat type matrices (no copies)
+	a.M  = a_m;
+	a.N  = a_n;
+	a.NZ = a_nz;
+	a.AS = a_as;
+	a.JA = a_ja;
+	a.IRP = a_irp;
+	a.MAX_ROW_NZ = a_max_row_nz;
+
+	b.M  = b_m;
+	b.N  = b_n;
+	b.NZ = b_nz;
+	b.AS = b_as;
+	b.JA = b_ja;
+	b.IRP = b_irp;
+	b.MAX_ROW_NZ = b_max_row_nz;
+
+	#ifdef ROWLENS
+	a.RL = a_rl;
+	b.RL = b_rl;
+	#endif //ROWLENS
+
+	// performing spmm
+	if (!(c = spmmRowByRow_0(&a, &b, &cfg))){
+		*info = EXIT_FAILURE;
+		return;
+	}
+
+	// output result, handing the ownership of C's arrays to the caller
+	*(c_m) = c->M;
+	*(c_n) = c->N;
+	*(c_nz)= c->NZ;
+	*(c_as)= c->AS;
+	*(c_ja)= c->JA;
+	*(c_irp)=c->IRP;
+	*(c_max_row_nz) = c->MAX_ROW_NZ;
+	#ifdef ROWLENS
+	*(c_rl)= c->RL;
+	#endif
+	free(c);	//only the spmat shell: its arrays now belong to the caller
+	*info = EXIT_SUCCESS;
+}
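+
+#if defined PSB_F_SPMM_UB_EXAMPLE && !defined ROWLENS
+/* Minimal sketch of a C-side call (illustrative only; PSBLAS reaches this
+ * wrapper through its Fortran interface): 2x2 identity times itself,
+ * 0-based CSR indexes. */
+int main(void){
+	double as[2] = {1.0, 1.0};
+	idx_t  ja[2] = {0, 1}, irp[3] = {0, 1, 2};
+	idx_t  cm, cn, cnz, cmaxrnz, *cja, *cirp;
+	double *cas;
+	int    info;
+	psb_f_spmm_row_by_row_ub_0(2,2,2, as,ja,irp,1,
+	                           2,2,2, as,ja,irp,1,
+	                           &cm,&cn,&cnz, &cas,&cja,&cirp, &cmaxrnz, &info);
+	return info;
+}
+#endif //PSB_F_SPMM_UB_EXAMPLE
diff --git a/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_Multi.h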
b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_Multi.h new file mode 100644 index 00000000..57676b59 --- /dev/null +++ b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_Multi.h @@ -0,0 +1,94 @@ +/* + * Sp3MM_for_AlgebraicMultiGrid + * (C) Copyright 2021-2022 + * Andrea Di Iorio + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +//CORE IMPLEMENTATIONS HEADER +#ifndef SP3MM_CSR_OMP_MULTI_H +#define SP3MM_CSR_OMP_MULTI_H + +///commons single implementation stuff +#include "macros.h" +#include "sparseMatrix.h" + +///aux structures +//hold SPMM result over a unpartitionated space among threads-row[s' blocks] +typedef struct{ + //space to hold SPMM output + ulong* JA; + double* AS; + ulong size; //num of entries allocated -> only dbg checks + ulong lastAssigned; //last JA&AS assigned index to an accumulator(atom) + SPACC* accs; //SPARSIFIED ACC POINTERS + uint accsNum; +} SPMM_ACC; //accumulator for SPMM +///compute function interface and its pointer definitions +typedef spmat* ( SPMM) (spmat*,spmat*,CONFIG*); +typedef spmat* (*SPMM_INTERF) (spmat*,spmat*,CONFIG*); +typedef spmat* ( SP3MM) (spmat*,spmat*,spmat*,CONFIG*,SPMM_INTERF); +typedef spmat* (*SP3MM_INTERF) (spmat*,spmat*,spmat*,CONFIG*,SPMM_INTERF); + +typedef enum { + _1D_DIRECT, + _1D_BLOCKS, + _2D_OFFSET, + _2D_ALLOCD +} SPMM_IMPL_TYPE; +///-- commons single implementation stuff + +///includes +#include "linuxK_rbtree_minimalized.h" +#include "Sp3MM_CSR_OMP_SymbStep_Multi.h" + +//extern char TRGT_IMPL_START_IDX; //multi implementation switch +#include "sparseUtilsMulti.h" +#ifdef OFF_F //save "includer" OFF_F value before overwriting it + #pragma push_macro("OFF_F") + #define _OFF_F_OLD + #undef OFF_F +#endif + + +#define OFF_F 0 +#include "Sp3MM_CSR_OMP_UB_Generic.h" +#include "Sp3MM_CSR_OMP_Num_Generic.h" +#undef OFF_F + +#define OFF_F 1 +#include "Sp3MM_CSR_OMP_UB_Generic.h" +#include "Sp3MM_CSR_OMP_Num_Generic.h" +#undef OFF_F + + + +#ifdef _OFF_F_OLD + #pragma pop_macro("OFF_F") + #undef _OFF_F_OLD +#endif + + +#endif //SP3MM_CSR_OMP_MULTI_H diff --git a/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_Num_Generic.h b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_Num_Generic.h new file mode 100644 index 00000000..91c20449 --- /dev/null +++ b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_Num_Generic.h @@ -0,0 +1,95 @@ +/* + * Sp3MM_for_AlgebraicMultiGrid + * (C) Copyright 2021-2022 + * Andrea Di Iorio + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef OFF_F + /*#pragma message("generic implementation requires OFF_F defined")*/ + #error generic implementation requires OFF_F defined +#endif + +/////SYMBOLIC - NUMERIC IMPLEMENTATIONS +///SP3MM FUNCTIONS +/* + * triple matrix multiplication among @R * @AC * @P using gustavson parallel implementation + * implmented as a pair of subsequent spmm operations + * if @conf->spmm != NULL, it will be used as spmm function, otherwise euristics will be + * used to decide wich implementation to use + */ +SP3MM CAT(sp3mmRowByRowPair_SymbNum_,OFF_F); + +/* + * row-by-row-by-row implementation: forwarding @R*@AC rth row to P for row-by-row + * accumulation in preallocated space, TODO exactly determined + * basic parallelization: 1thread per @R's rows that will also forward the result to P + */ +SP3MM CAT(sp3mmRowByRowMerged_SymbNum_,OFF_F); + +///SUB FUNCTIONS +///SPMM FUNCTIONS +/* + * sparse parallel implementation of @A * @B parallelizing Gustavson row-by-row + * formulation using an aux dense vector @_auxDense + * return resulting product matrix + */ +SPMM CAT(spmmRowByRow_SymbNum_,OFF_F); +/* + * sparse parallel implementation of @A * @B parallelizing Gustavson + * with partitioning of @A in @conf->gridRows blocks of rows + * return resulting product matrix + */ +SPMM CAT(spmmRowByRow1DBlocks_SymbNum_,OFF_F); + +/* + * sparse parallel implementation of @A * @B as Gustavson parallelizzed in 2D + * with partitioning of + * @A into rows groups, uniform rows division + * @B into cols groups, uniform cols division, accessed by aux offsets + */ +SPMM CAT(spmmRowByRow2DBlocks_SymbNum_,OFF_F); + +/* + * sparse parallel implementation of @A * @B as Gustavson parallelizzed in 2D + * with partitioning of + * @A into rows groups, uniform rows division + * @B into cols groups, uniform cols division, ALLOCATED as CSR submatrixes + */ +SPMM CAT(spmmRowByRow2DBlocksAllocated_SymbNum_,OFF_F); + +///implementation wrappers as static array of function pointers +//sp3mm as pair of spmm +static SPMM_INTERF CAT(Spmm_SymbNum_Funcs_,OFF_F)[] = { + & CAT(spmmRowByRow_SymbNum_,OFF_F), + & CAT(spmmRowByRow1DBlocks_SymbNum_,OFF_F), + //& CAT(spmmRowByRow2DBlocks_SymbNum_,OFF_F), + //& CAT(spmmRowByRow2DBlocksAllocated_SymbNum_,OFF_F) +}; +//sp3mm as pair of spmm +static SP3MM_INTERF CAT(Sp3mm_SymbNum_Funcs_,OFF_F)[] = { + //& CAT(sp3mmRowByRowMerged_SymbNum_,OFF_F) +}; diff --git a/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_SymbStep_Generic.h b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_SymbStep_Generic.h new file mode 100644 index 00000000..017257eb --- /dev/null +++ b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_SymbStep_Generic.h @@ -0,0 +1,195 @@ +/* + * Sp3MM_for_AlgebraicMultiGrid + * (C) Copyright 2021-2022 + * Andrea Di Iorio + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * CSR Sp[3]MM Symbolic step implementations + * target: compute the output matrix size and the row lens for preallocation + * direct write out partial results + * See interfaces in respective header + * + * MultiImplementations functions with all parameters + * several functions (lower level) has config based on + * OUT_IDXS for returing or using also symbMul Output idxs somehow + * COL_PARTS for returing or using also distribution of symb out idxs in col partition + */ + +///MultiImplementations +///setup aux macros for different signatures implementation via #if arith expr +#ifdef OUT_IDXS + #define _OUT_IDXS TRUE +#else + #define _OUT_IDXS FALSE + #define OUT_IDXS _UNDEF +#endif +#ifdef COL_PARTS + #define _COL_PARTS TRUE +#else + #define _COL_PARTS FALSE + #define COL_PARTS _UNDEF +#endif +/// +#if !defined OFF_F + #error generic implementations requires OFF_F +#endif + + + + +/* + * Compute symbolic product of (nnz indexes of) row @aRowJA and matrix @b + * insert nnz indexes of the mul. result row as nodes in a rbtree rooted at @root + * with nodes in @nodes which have to be enough for the mul result row (use an UB) + * Retuns: multiplication result row NNZ number,se CONFIG_MACROS below for more + * + * CONFIG_MACROS: + * if _OUT_IDXS == TRUE return mul.result row nnz idxs in @outIdxs + * ifdef: OUT_IDXS_RBTREE_NODES: nnz indexes returned inplace sorting rbtree + * as nnz indexes(JA) of the mul result row + * else: stop at returning the mul. 
result row lenght + * if _COL_PARTS == TRUE return the number of nonzero elements in + * in each of the @gridCols column partitions inside @rowColPartsLens + * OFF_F: offset back indexes from fortran + * TODO also output indexes are shifted (see c_b ) + +idx_t CAT4(SpMM_Row_Symb_Rbtree,OUT_IDXS,COL_PARTS,OFF_F) (idx_t* aRowJA,idx_t aRowLen, + spmat* b,rbRoot* root,rbNode* nodes + #if _OUT_IDXS == TRUE + #ifndef OUT_IDXS_RBTREE_NODES + ,idx_t* outIdxs + #endif + #endif + #if _COL_PARTS == TRUE + ,ushort gridCols,idx_t* rowColPartsOffsets + #endif + ); + */ + +/* + * SPVECT_IDX_DENSE_MAP based, as SpMM_Row_Symb_Rbtree but with idxMap aux idx keeping + * CONFIG_MACROS (new) + * IDX_RMUL_SYMB_RBTREE && ( _OUT_IDXS == T || _COL_PARTS == T ): + * (tmp) symb mult out indexes will be kept via a rbtree + * otherwise directly in the out array appending them and then sorting them + * (potentially same n log n) + +idx_t CAT4(SpMM_Row_Symb_IdxMap,OUT_IDXS,COL_PARTS,OFF_F) + ( + idx_t* aRowJA, idx_t aRowLen, spmat* b, SPVECT_IDX_DENSE_MAP* idxsMapAcc + #if _OUT_IDXS == TRUE + ,idx_t* outIdxs + #if IDX_RMUL_SYMB_RBTREE == TRUE + ,rbRoot* root, rbNode* nodes + #endif + #endif // _OUT_IDXS == TRUE + #if _COL_PARTS == TRUE + ,ushort gridCols,idx_t* rowColPartsLens + #endif + ); + */ + +/* + * Compute symbolic product of sparse matrixes @a * @b + * Alloc aux structures based on a upper bounded allocation + * Returns array of exact row lens of the result mul matrix c=@a*@b + * plus an extra entry for the cumulative num of matrix nnz + * (interoperability with upper bound implementations) + * + * CONFIG_MACROS: + * if _OUT_IDXS == TRUE: + * return in *@outIdxs pointers to overallocd JAs nnz indexes + * *outIdxs[0] --> start of first row (its size in 0th of returned array) + * also is the malloc returned address for later free + * [NB: rows are not contiguos in the allocation] + * if _COL_PARTS == TRUE: return in *@rowColPartsLens + * a matrix @a->M * @gridCols of offsets + * for each column partition in each row + */ +idx_t* CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F) + ( + ROW_MMSYM_IMPL_MODE symbRowImplID, spmat* a, spmat* b + #if _OUT_IDXS == TRUE + ,idx_t*** outIdxs + #endif + #if _COL_PARTS == TRUE + ,ushort gridCols, idx_t** rowColPartsLens + #endif + ); + +//////Sp3MM - rowByrowByrow +#define SP3MM_UB_HEURISTIC 2 +#if !defined COL_PARTS && defined OUT_IDXS +///MultiImplementations functions without COL_PARTS +//NB required OUT_IDXS for initial row-by-row step... 
+/* + * as earlier but meant for work with + * triple product as rob-by-row-by-row forwarding: + * @abRowJATmp is a tmp storage for first row-by-row sym product + * has to be big enough to store all the possible nonzero elements + * @nodes has to be big enough to store all possible nnz of ab and abc row + * (not only ab row as earlier version) + * + */ +idx_t CAT3(Sp3MM_Row_Symb_,OUT_IDXS,OFF_F) + ( + ROW_MMSYM_IMPL_MODE symbMMRowImplID, idx_t* aRowJA,idx_t aRowLen, + spmat* b,spmat* c, rbRoot* root,rbNode* nodes, idx_t* abRowJATmp + #if _OUT_IDXS == TRUE + #ifndef OUT_IDXS_RBTREE_NODES + ,idx_t* outIdxs + #endif + #endif + ); + +/* + * triple product @a*@b*@c as rob-by-row-by-row forwarding: + * Returns: resulting matrix rows' sizes exactly in an array + * plus an extra element for the res.matrix's total size + * CONFIG_MACROS: + * if _OUT_IDXS == TRUE: + * return in *@outIdxs pointers to overallocd JAs nnz indexes + * *outIdxs[0] --> start of first row (its size in 0th of returned array) + * also is the malloc returned address for later free + * [NB: rows are not contiguos in the allocation] + */ +//TODO HEURISTICS IN _OUT_IDXS to avoid serializing after first sym.product +// see HEURISTICS_UB +idx_t* CAT3(Sp3MM_Symb_,OUT_IDXS,OFF_F) + ( + ROW_MMSYM_IMPL_MODE symbMMRowImplID, spmat* a, spmat* b, spmat* c + #if _OUT_IDXS == TRUE + ,idx_t*** outIdxs + #endif + ); + +#endif + +#undef OUT_IDXS +#undef _OUT_IDXS +#undef COL_PARTS +#undef _COL_PARTS diff --git a/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_SymbStep_Multi.h b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_SymbStep_Multi.h new file mode 100644 index 00000000..f46a2779 --- /dev/null +++ b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_SymbStep_Multi.h @@ -0,0 +1,80 @@ +/* + * Sp3MM_for_AlgebraicMultiGrid + * (C) Copyright 2021-2022 + * Andrea Di Iorio + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SP3MM_CSR_OMP_SYMB_MULTI +#define SP3MM_CSR_OMP_SYMB_MULTI + +/* Multi implementation of symbolic product of sparse matrixes, config macros + * OFF_F: C-Fortran spmat indexing + * OUT_IDXS: indexes output + * COL_PARTS: partitioning columns output... + */ + +#define OUT_IDXS_ON OutIdxs_ +#define COL_PARTS_ON ColParts_ +#undef OUT_IDXS +#undef COL_PARTS + +#define OFF_F 0 + ///generate basic versions + #include "Sp3MM_CSR_OMP_SymbStep_Generic.h" + ///generate outIdxs versions + #define OUT_IDXS OUT_IDXS_ON + #include "Sp3MM_CSR_OMP_SymbStep_Generic.h" + #undef OUT_IDXS + ///generate colParts versions + #define COL_PARTS COL_PARTS_ON + #include "Sp3MM_CSR_OMP_SymbStep_Generic.h" + //generate outIdxs AND colParts ve sions + #define OUT_IDXS OUT_IDXS_ON + #include "Sp3MM_CSR_OMP_SymbStep_Generic.h" + + #undef OUT_IDXS + #undef COL_PARTS +#undef OFF_F +#define OFF_F 1 + ///generate basic versions + #include "Sp3MM_CSR_OMP_SymbStep_Generic.h" + ///generate outIdxs versions + #define OUT_IDXS OUT_IDXS_ON + #include "Sp3MM_CSR_OMP_SymbStep_Generic.h" + #undef OUT_IDXS + ///generate colParts versions + #define COL_PARTS COL_PARTS_ON + #include "Sp3MM_CSR_OMP_SymbStep_Generic.h" + //generate outIdxs AND colParts ve sions + #define OUT_IDXS OUT_IDXS_ON + #include "Sp3MM_CSR_OMP_SymbStep_Generic.h" + + #undef OUT_IDXS + #undef COL_PARTS +#undef OFF_F + +#endif diff --git a/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_UB_Generic.h b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_UB_Generic.h new file mode 100644 index 00000000..dc59a6e7 --- /dev/null +++ b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_UB_Generic.h @@ -0,0 +1,95 @@ +/* + * Sp3MM_for_AlgebraicMultiGrid + * (C) Copyright 2021-2022 + * Andrea Di Iorio + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
diff --git a/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_UB_Generic.h b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_UB_Generic.h
new file mode 100644
index 00000000..dc59a6e7
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/Sp3MM_CSR_OMP_UB_Generic.h
@@ -0,0 +1,95 @@
+/*
+ * Sp3MM_for_AlgebraicMultiGrid
+ * (C) Copyright 2021-2022
+ * Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef OFF_F
+ /*#pragma message("generic implementation requires OFF_F defined")*/
+ #error generic implementation requires OFF_F defined
+#endif
+
+///SP3MM FUNCTIONS
+/*
+ * triple matrix multiplication among @R * @AC * @P using Gustavson's parallel implementation,
+ * implemented as a pair of subsequent spmm operations
+ * if @conf->spmm != NULL, it will be used as the spmm function, otherwise heuristics will be
+ * used to decide which implementation to use
+ */
+SP3MM CAT(sp3mmRowByRowPair_,OFF_F);
+
+/*
+ * row-by-row-by-row implementation: forwarding the rth row of @R*@AC to P for row-by-row
+ * accumulation in preallocated space (TODO: exactly determined)
+ * basic parallelization: one thread per @R row, which also forwards the result to P
+ */
+SP3MM CAT(sp3mmRowByRowMerged_,OFF_F);
+
+///SUB FUNCTIONS
+///SPMM FUNCTIONS
+SPMM CAT(spmmSerial_,OFF_F); //single-threaded version for debugging (oracle-less)
+/*
+ * sparse parallel implementation of @A * @B parallelizing Gustavson's row-by-row
+ * formulation using an aux dense vector @_auxDense
+ * return resulting product matrix
+ */
+SPMM CAT(spmmRowByRow_,OFF_F);
+/*
+ * sparse parallel implementation of @A * @B parallelizing Gustavson
+ * with partitioning of @A in @conf->gridRows blocks of rows
+ * return resulting product matrix
+ */
+SPMM CAT(spmmRowByRow1DBlocks_,OFF_F);
+
+/*
+ * sparse parallel implementation of @A * @B as Gustavson parallelized in 2D
+ * with partitioning of
+ * @A into rows groups, uniform rows division
+ * @B into cols groups, uniform cols division, accessed by aux offsets
+ */
+SPMM CAT(spmmRowByRow2DBlocks_,OFF_F);
+
+/*
+ * sparse parallel implementation of @A * @B as Gustavson parallelized in 2D
+ * with partitioning of
+ * @A into rows groups, uniform rows division
+ * @B into cols groups, uniform cols division, ALLOCATED as CSR submatrices
+ */
+SPMM CAT(spmmRowByRow2DBlocksAllocated_,OFF_F);
+
+///implementation wrappers as static array of function pointers
+//sp3mm as pair of spmm
+static SPMM_INTERF CAT(Spmm_UB_Funcs_,OFF_F)[] = {
+ & CAT(spmmRowByRow_,OFF_F),
+ & CAT(spmmRowByRow1DBlocks_,OFF_F),
+ & CAT(spmmRowByRow2DBlocks_,OFF_F),
+ & CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)
+};
+//sp3mm as direct product
+static SP3MM_INTERF CAT(Sp3mm_UB_Funcs_,OFF_F)[] = {
+ & CAT(sp3mmRowByRowMerged_,OFF_F)
+};
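+
+/*
+ * usage sketch (illustration only; the exact SPMM signature is assumed):
+ * with OFF_F == 0 this header generates the C-indexed tables, so an
+ * implementation can be picked by index at runtime, e.g.:
+ * SPMM_INTERF spmm = Spmm_UB_Funcs_0[2]; // -> spmmRowByRow2DBlocks_0
+ * spmat* AB = spmm(A, B, &conf);
+ */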
diff --git a/base/serial/impl/sp3mm4amg/include/SpMMUtilsGeneric.h b/base/serial/impl/sp3mm4amg/include/SpMMUtilsGeneric.h
new file mode 100644
index 00000000..1eb40410
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/SpMMUtilsGeneric.h
@@ -0,0 +1,406 @@
+/*
+ * Sp3MM_for_AlgebraicMultiGrid
+ * (C) Copyright 2021-2022
+ * Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+///scalar-vector multiply
+//TODO what gain is there in using only the generic version of the next 2 functions?
+/*
+ * Sparse vector part <-> scalar multiplication in a dense output
+ * the sparse vector part will hold nnz values in @vectVals (AS subvector)
+ * with corresponding indexes in @vectIdxs in range [0,@aux->vLen) (JA subvector)
+ * resulting vector accumulated in a dense array in @aux->v, along with nnzIdx
+ * both the accumulator's dense array and nnzIdx in @aux have to be big enough for the index range
+ */
+inline void CAT(scSparseVectMul_,OFF_F)
+ (double scalar,double* vectVals,ulong* vectIdxs,ulong vectLen, ACC_DENSE* aux){
+ for (ulong i=0,j; i<vectLen; i++){
+ j = vectIdxs[i]-OFF_F;
+ DEBUGCHECKS{
+ if (j>=aux->vLen){
+ fprintf(stderr,"index %lu outside vLen %lu\n",j,aux->vLen);
+ assert(j < aux->vLen);
+ }
+ }
+ aux->v[j] += vectVals[i] * scalar; //accumulate
+ //append new nonzero index to auxVNNZeroIdxs for quick sparsify
+ //if (!(aux->v[j])) aux->nnzIdx[ aux->nnzIdxMap.len++ ] = j; //TODO numerical zero may then cause readd
+ if(!spVect_idx_in(j,&aux->nnzIdxMap))
+ aux->nnzIdx[ aux->nnzIdxMap.len-1 ] = j;
+ }
+}
+
+/* Same as scSparseVectMul_ above, but considering an initial offset to remove from each idx
+ * Sparse vector part <-> scalar multiplication in a dense output
+ * @vectVals: sparse vector part values ( from spmat.AS )
+ * with [at least] @vectLen corresponding
+ * target idxs in @vectIdxs (from spmat.JA ) starting from @startIdx
+ *
+ * Resulting vector accumulated in a dense array in @aux->v, along with nnzIdx
+ * all nnz values' indexes will be shifted back by @startIdx in @aux
+ * both the accumulator's dense array and nnzIdx in @aux have to be big enough for the index range
+ */
+inline void CAT(scSparseVectMulPart_,OFF_F)(double scalar,double* vectVals,
+ ulong* vectIdxs,ulong vectLen,ulong startIdx,ACC_DENSE* aux){
+ for (ulong i=0,j; i<vectLen; i++){
+ j = vectIdxs[i]-OFF_F-startIdx; //shift back the nnz index by @startIdx
+ DEBUGCHECKS{
+ if (j>=aux->vLen){
+ fprintf(stderr,"index %lu outside vLen %lu\n",j,aux->vLen);
+ assert(j < aux->vLen);
+ }
+ }
+ aux->v[j] += vectVals[i] * scalar; //accumulate
+ //append new nonzero index to auxVNNZeroIdxs for quick sparsify
+ //if (!(aux->v[j])) aux->nnzIdx[ aux->nnzIdxMap.len++ ] = j;
+ ////TODO numerical zero may then cause readd
+ if(!spVect_idx_in(j,&aux->nnzIdxMap))
+ aux->nnzIdx[ aux->nnzIdxMap.len-1 ] = j;
+ }
+}
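+
+//a minimal standalone sketch (NOT built) of the dense-accumulator pattern
+//implemented above, using hypothetical plain CSR arrays instead of
+//ACC_DENSE/spmat; the nnzIdx gathering here ignores the numerical-zero
+//caveat that spVect_idx_in handles in the real code
+#if 0
+static void spaRowSketch(ulong aLen,double* aVal,ulong* aCol,
+ ulong* bIRP,ulong* bJA,double* bAS, double* v,ulong* nnzIdx,ulong* nnz){
+ for (ulong k=0; k<aLen; k++){ //for each nnz a_ik of A.row(i)
+ double scalar = aVal[k];
+ for (ulong jj=bIRP[aCol[k]]; jj<bIRP[aCol[k]+1]; jj++){
+ ulong j = bJA[jj];
+ if (!v[j]) nnzIdx[(*nnz)++] = j; //first touch of column j
+ v[j] += scalar * bAS[jj]; //accumulate a_ik * b_kj
+ }
+ } //then: sort nnzIdx, gather v[] into JA/AS, reset v for the next row
+}
+#endif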
+
+
+///MultiImplementations functions
+#ifndef OFF_F
+ #error generic implementation requires OFF_F defined
+#endif
+
+///TODO IMPLEMENT SCALAR<->ROW MUL AS GENERIC SPARSE VECTOR<->SCALAR MUL
+//inline void scSparseRowMul(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux){
+inline void CAT(scSparseRowMul_,OFF_F)(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux){
+ ulong rowStartIdx = mat->IRP[trgtR]-OFF_F,rowLen;
+ #ifdef ROWLENS
+ rowLen = mat->RL[trgtR];
+ #else
+ rowLen = mat->IRP[trgtR+1] - mat->IRP[trgtR];
+ #endif
+ CAT(scSparseVectMul_,OFF_F)(scalar,mat->AS+rowStartIdx,mat->JA+rowStartIdx,rowLen,aux);
+ //TODO check impact of generic version use
+}
+
+
+///OUTPUT SIZE PREDICTION
+
+/* O(A.NZ [+B.M] )
+ * return array of upper-bounded row sizes of the product @A * @B,
+ * with an extra element appended at the end for the cumulative total size of the matrix AB = A*B
+ */
+inline idx_t* CAT(spMMSizeUpperbound_,OFF_F)(spmat* A,spmat* B){
+ AUDIT_INTERNAL_TIMES Start = omp_get_wtime();
+ idx_t* rowSizes = calloc((A->M+1), sizeof(*rowSizes));
+ if (!rowSizes){
+ ERRPRINT("spMMSizeUpperbound: rowSizes calloc errd\n");
+ return NULL;
+ }
+ idx_t fullMatBound = 0;
+ #pragma omp parallel for schedule(static) reduction(+:fullMatBound)
+ for (idx_t r=0; r<A->M; r++){
+ for (idx_t jj=A->IRP[r] - OFF_F,j,rlen; jj<A->IRP[r+1] - OFF_F; jj++){
+ j = A->JA[jj] - OFF_F;
+ #ifdef ROWLENS
+ rlen = B->RL[j];
+ #else
+ rlen = B->IRP[j+1] - B->IRP[j]; //OFF_F: delta is offset independent
+ #endif
+ rowSizes[r] += rlen;
+ fullMatBound += rlen;
+ //rowSizes[A->M] += rlen; //just below omp reduction sum
+ }
+ }
+ rowSizes[A->M] = fullMatBound;
+ AUDIT_INTERNAL_TIMES End= omp_get_wtime();
+ VERBOSE
+ printf("spMMSizeUpperbound:%lu\t%le s\n",rowSizes[A->M],End-Start);
+ return rowSizes;
+}
+
+/* O(A.NZ * gridCols)
+ * return matrix @A.M x gridCols of upper bounds,
+ * one for each of the @gridCols col partitions of the output matrix AB = @A * @B,
+ * with an extra element appended at the end for the cumulative total size of the matrix AB
+ */
+inline idx_t* CAT(spMMSizeUpperboundColParts_,OFF_F)
+ (spmat* A,spmat* B,ushort gridCols,idx_t* bColPartOffsets){
+ AUDIT_INTERNAL_TIMES Start = omp_get_wtime();
+
+ idx_t* rowPartsSizes = calloc((A->M*gridCols +1), sizeof(*rowPartsSizes));
+ if (!rowPartsSizes){
+ ERRPRINT("spMMSizeUpperboundColParts: rowPartsSizes calloc errd\n");
+ return NULL;
+ }
+
+ idx_t fullMatBound = 0;
+ #pragma omp parallel for schedule(static) reduction(+:fullMatBound)
+ for (idx_t r=0; r<A->M; r++){
+ //for each A.row -> sum B.colParts lens
+ for (idx_t jj=A->IRP[r]-OFF_F,j,rlen; jj<A->IRP[r+1]-OFF_F; jj++){
+ j = A->JA[jj] - OFF_F;
+ for (idx_t gc=0,bPartID=IDX2D(j,gc,gridCols); gc < gridCols; gc++,bPartID++){
+ rlen = bColPartOffsets[bPartID+1] - bColPartOffsets[bPartID];
+ rowPartsSizes[ IDX2D(r,gc,gridCols) ] += rlen;
+ fullMatBound += rlen;
+ }
+ }
+ }
+ rowPartsSizes[ A->M*gridCols ] = fullMatBound;
+ AUDIT_INTERNAL_TIMES End= omp_get_wtime();
+ VERBOSE
+ printf("spMMSizeUpperboundColParts_:%lu\t%le s\n",rowPartsSizes[A->M*gridCols],End-Start);
+ return rowPartsSizes;
+}
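+
+/* worked example (illustration only): if A.row(i) has nnz at columns {0,2},
+ * B.row(0) has 3 nnz and B.row(2) has 2 nnz, then rowSizes[i] = 5; the true
+ * AB.row(i) size can only be <= 5, since products colliding on the same
+ * output column merge into a single nonzero. */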
+
+
+////Single implementations funcs
+#ifndef SPMMUTILS_H_SINGLE_IMPLEMENTATION
+#define SPMMUTILS_H_SINGLE_IMPLEMENTATION
+
+#include "utils.h"
+///Allocs - Free
+//SpMM holder of accumulators
+inline SPMM_ACC* initSpMMAcc(ulong entriesNum, ulong accumulatorsNum){
+ SPMM_ACC* out = calloc(1,sizeof(*out));
+ if (!out){
+ ERRPRINT("initSpMMAcc: out calloc errd\n");
+ return NULL;
+ }
+ out->size = entriesNum;
+ if (!(out->JA = malloc(entriesNum * sizeof(*(out->JA))))){
+ ERRPRINT("initSpMMAcc: JA malloc errd\n");
+ goto _err;
+ }
+ if (!(out->AS = malloc(entriesNum * sizeof(*(out->AS))))){
+ ERRPRINT("initSpMMAcc: AS malloc errd\n");
+ goto _err;
+ }
+ if (!(out->accs = malloc(accumulatorsNum * sizeof(*(out->accs))))){
+ ERRPRINT("initSpMMAcc: accs malloc errd\n");
+ goto _err;
+ }
+ return out;
+
+ _err:
+ if (out->JA) free(out->JA);
+ if (out->AS) free(out->AS);
+ if (out->accs) free(out->accs);
+ if (out) free(out);
+ return NULL;
+}
+inline void freeSpMMAcc(SPMM_ACC* acc){
+ free(acc->JA);
+ free(acc->AS);
+ free(acc->accs);
+ free(acc);
+}
+///// dense acc sparsify functions
+/*
+ * sparsify dense accumulated vector @accV (with indexes shifted by @startColAcc)
+ * into sparse accumulator @accSparse that'll use space for nnz entries from @acc
+*/
+///DIRECT OUT MATRIX SPARSIFY
+/*
+ * sparsify @accV directly inside row @row of matrix @m
+ * considering, if given not NULL, 2D partitioning with
+ * @gridCols cols groups and colGroups offsets per row matrix @colPartsOffsets
+ * :Returns inplace modify of @m
+ */
+static inline void sparsifyDirect(ACC_DENSE* accV,spmat* m,idx_t row){
+ idx_t nnz = accV->nnzIdxMap.len, sparsifyStart = m->IRP[row], sparsifyEnd = m->IRP[row+1];
+ sort_idx_t(accV->nnzIdx,nnz); //sort nnz idx for ordered write
+ DEBUGCHECKS assertArrNoRepetitions(accV->nnzIdx,nnz);
+ DEBUGCHECKS assert(nnz <= (sparsifyEnd - sparsifyStart));
+
+ for (idx_t i=0,j; i < nnz; i++){
+ j = accV->nnzIdx[i];
+ m -> JA[sparsifyStart + i] = j; //+ startColAcc;
+ m -> AS[sparsifyStart + i] = accV->v[j];
+ }
+}
+//TODO 2D partitioning - colParts version of above...best to use multi impl trick of symbStep
+static inline void sparsifyDirectColParts(ACC_DENSE* accV,spmat* m,idx_t row,
+ ushort colGroupId,ushort gridCols, idx_t* colPartsOffsets,idx_t startCol){
+ idx_t nnz = accV->nnzIdxMap.len;
+ ushort partID = IDX2D(row,colGroupId,gridCols);
+ idx_t sparsifyStart = colPartsOffsets[partID], sparsifyEnd = colPartsOffsets[partID+1];
+ sort_idx_t(accV->nnzIdx,nnz); //sort nnz idx for ordered write
+ DEBUGCHECKS assertArrNoRepetitions(accV->nnzIdx,nnz);
+ DEBUGCHECKS assert( nnz == sparsifyEnd - sparsifyStart );
+
+ for (idx_t i=0,j; i < nnz; i++){
+ j = accV->nnzIdx[i];
+ m -> JA[sparsifyStart + i] = j + startCol;
+ m -> AS[sparsifyStart + i] = accV->v[j];
+ }
+
+}
+
+///UB SPACE SPARSIFY
+//internal sparsify of dense acc inside a (prepared) sparse acc struct
+static inline void _sparsifyUB(ACC_DENSE* accV,SPACC* accSparse,idx_t startColAcc){
+ idx_t nnz = accV->nnzIdxMap.len;
+ sort_idx_t(accV->nnzIdx,nnz); //sort nnz idx for ordered write
+ DEBUGCHECKS assertArrNoRepetitions(accV->nnzIdx,nnz);
+ for (idx_t i=0,j; i < nnz; i++){
+ j = accV->nnzIdx[i];
+ accSparse -> JA[i] = j + startColAcc;
+ accSparse -> AS[i] = accV->v[j];
+ }
+ accSparse -> len = nnz;
+}
+
+//row[Part] sparsified in a thread-safe, exactly-sized reserved area using atomics
+static inline void sparsifyUBNoPartsBounds
+ (SPMM_ACC* acc,ACC_DENSE* accV,SPACC* accSparse, ulong startColAcc){
+ //sort nnz indexes of dense accumulator
+ idx_t nnz = accV->nnzIdxMap.len;
+ idx_t sparsifyStartV; //start index(inside @accSparse) of @accV to sparsify
+ //sparsifyStartV = __atomic_fetch_add(&(acc->lastAssigned),nnz,__ATOMIC_ACQ_REL);
+ #pragma omp atomic capture
+ { //fetch-and-add
+ sparsifyStartV = acc->lastAssigned;
+ acc->lastAssigned += nnz;
+ }
+ DEBUGCHECKS{
+ if (acc->lastAssigned >= acc->size){
+ ERRPRINT("OMP ATOMIC OR SG ERRD IN SPACE ASSIGNMENTS...\n");
+ assert(acc->lastAssigned < acc->size);
+ }
+ }
+ //
+ accSparse -> AS = acc->AS + sparsifyStartV;
+ accSparse -> JA = acc->JA + sparsifyStartV;
+ _sparsifyUB(accV,accSparse,startColAcc);
+}
+////output-gather functions
+/*
+ * merge @conf->gridCols*@mat->M sparse rows partitions into @mat
+ * EXPECTED @rowsParts to be sorted according to the
+ * 2D rowMajor computing grid given in @conf
+ * allocates arrays to hold nonzero values and indexes into @mat
+ */
+inline int mergeRowsPartitions(SPACC* rowsParts,spmat* mat,
+ CONFIG* conf){
+ ulong nzNum=0,j,rLen,idx,partsNum = mat->M * conf->gridCols;
+ //TODO PARALLEL MERGE - SAVE STARTING offset OF EACH PARTITION IN THE OUT MATRIX
+ ulong* rowsPartsOffsets=alloca(partsNum*sizeof(*rowsPartsOffsets));
+ ///count nnz entries and alloc arrays for them
+ for (ulong r=0; r<mat->M; r++){
+ //for each partition ->get len -> outMat.IRP and aux offsets
+ for (j=0,rLen=0; j<conf->gridCols; j++){
+ idx = IDX2D(r,j,conf->gridCols);
+ rowsPartsOffsets[idx]=nzNum+rLen;//part start=prev accumulated end
+ rLen += rowsParts[idx].len;
+ }
+ nzNum += rLen;
+ mat->IRP[r+1] = nzNum;
+ #ifdef ROWLENS
+ mat->RL[r] = rLen;
+ #endif
+ }
+ mat->NZ = nzNum;
+ if (!(mat->AS = malloc(nzNum * sizeof(*(mat->AS))))){
+ ERRPRINT("merged sparse matrix AS alloc errd\n");
+ return EXIT_FAILURE;
+ }
+ if (!(mat->JA = malloc(nzNum * sizeof(*(mat->JA))))){
+ ERRPRINT("merged sparse matrix JA alloc errd\n");
+ return EXIT_FAILURE;
+ }
+ ///populate with rows nnz values and indexes
+ ulong pLen; //omp for aux vars
+ #pragma omp parallel for schedule(static) private(pLen)
+ for (ulong i=0; i<partsNum; i++){
+ pLen = rowsParts[i].len;
+ memcpy(mat->AS + rowsPartsOffsets[i],rowsParts[i].AS,pLen*sizeof(*(mat->AS)));
+ memcpy(mat->JA + rowsPartsOffsets[i],rowsParts[i].JA,pLen*sizeof(*(mat->JA)));
+ }
+ CONSISTENCY_CHECKS{ //TODO REMOVE written nnz check manually
+ for (ulong i=0,w=0; i<mat->M; i++){
+ if (mat->IRP[i] != w)
+ {ERRPRINT("MERGE ROW ERR IRP\n");return -1;}
+ for (j=0; j<conf->gridCols; j++){
+ SPACC r = rowsParts[IDX2D(i,j,conf->gridCols)];
+ for (ulong jj=0; jj<r.len; jj++,w++){
+ if (mat->AS[w]!= r.AS[jj]){
+ ERRPRINT("MERGE ROW ERR AS\n"); return -1;}
+ if (mat->JA[w]!= r.JA[jj]){
+ ERRPRINT("MERGE ROW ERR JA\n"); return -1;}
+ }
+ }
+ }
+ }
+ return EXIT_SUCCESS;
+}
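+
+/* worked example (illustration only): with gridCols=2 and row r split into
+ * partitions of len 3 and 2, rowsPartsOffsets = {IRP[r], IRP[r]+3} and
+ * IRP[r+1] = IRP[r]+5; each parallel memcpy then lands its partition at a
+ * precomputed, disjoint offset, so no synchronization is needed. */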
+
+/*
+ * merge @mat->M sparse rows @rows in sparse matrix @mat
+ * EXPECTED @rows to be sorted according to the trgt matrix row index @r
+ * allocates arrays to hold nonzero values and indexes into @mat
+ */
+inline int mergeRows(SPACC* rows,spmat* mat){
+ ulong nzNum=0;
+ //count nnz entries and alloc arrays for them
+ for (ulong r=0; r<mat->M; ++r){
+ nzNum += rows[r].len;
+ mat->IRP[r+1] = nzNum;
+ #ifdef ROWLENS
+ mat->RL[r] = rows[r].len;
+ #endif
+ }
+ mat->NZ = nzNum;
+ if (!(mat->AS = malloc(nzNum * sizeof(*(mat->AS))))){
+ ERRPRINT("merged sparse matrix AS alloc errd\n");
+ return EXIT_FAILURE;
+ }
+ if (!(mat->JA = malloc(nzNum * sizeof(*(mat->JA))))){
+ ERRPRINT("merged sparse matrix JA alloc errd\n");
+ return EXIT_FAILURE;
+ }
+ ///POPULATE WITH ROWS NNZ VALUES AND INDEXES
+ //TODO PARALLEL COPY
+ #pragma omp parallel for schedule(static)
+ for (ulong r=0; r<mat->M; r++){
+ memcpy(mat->AS+mat->IRP[r], rows[r].AS, rows[r].len*sizeof(*(mat->AS)));
+ memcpy(mat->JA+mat->IRP[r], rows[r].JA, rows[r].len*sizeof(*(mat->JA)));
+ }
+ CONSISTENCY_CHECKS{ //TODO REMOVE written nnz check manually
+ for (ulong r=0,i=0; r<mat->M; r++){
+ if (i != mat->IRP[r])
+ {ERRPRINT("MERGE ROW ERR IRP\n");return -1;}
+ for (ulong j=0; j<rows[r].len; j++,i++){
+ if (mat->AS[i]!= rows[r].AS[j]){
+ ERRPRINT("MERGE ROW ERR AS\n"); return -1;}
+ if (mat->JA[i] != rows[r].JA[j]){
+ ERRPRINT("MERGE ROW ERR JA\n"); return -1;}
+ }
+ }
+ }
+ return EXIT_SUCCESS;
+}
+
+#endif //SPMMUTILS_H_SINGLE_IMPLEMENTATION
+
diff --git a/base/serial/impl/sp3mm4amg/include/SpMMUtilsMulti.h b/base/serial/impl/sp3mm4amg/include/SpMMUtilsMulti.h
new file mode 100644
index 00000000..cd40c494
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/SpMMUtilsMulti.h
@@ -0,0 +1,56 @@
+/*
+ * Sp3MM_for_AlgebraicMultiGrid
+ * (C) Copyright 2021-2022
+ * Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPMMUTILSMULTI_H
+#define SPMMUTILSMULTI_H
+
+extern char TRGT_IMPL_START_IDX; //multi implementation switch
+
+#ifdef OFF_F //save "includer" OFF_F value before overwriting it
+ #pragma push_macro("OFF_F")
+ #define _OFF_F_OLD
+ #undef OFF_F
+#endif
+
+#define OFF_F 0
+#include "SpMMUtilsGeneric.h"
+#undef OFF_F
+
+#define OFF_F 1
+#include "SpMMUtilsGeneric.h"
+#undef OFF_F
+
+#ifdef _OFF_F_OLD
+ #pragma pop_macro("OFF_F")
+ #undef _OFF_F_OLD
+#endif
+
+#endif
+
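+/* illustration (not part of the build) of the push/pop_macro save-restore
+ * used above, with a hypothetical macro X:
+ * #define X 7
+ * #pragma push_macro("X") // saves the current definition X=7
+ * #undef X
+ * #define X 0 // temporary redefinition for the re-include
+ * #pragma pop_macro("X") // X expands to 7 again
+ */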
diff --git a/base/serial/impl/sp3mm4amg/include/config.h b/base/serial/impl/sp3mm4amg/include/config.h
new file mode 100644
index 00000000..a11f6d21
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/config.h
@@ -0,0 +1,155 @@
+/*
+ * Sp3MM_for_AlgebraicMultiGrid
+ * (C) Copyright 2021-2022
+ * Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef CONFIG_H
+#define CONFIG_H
+//CONFIG STRUCT DEPENDENCIES
+//switch among different symb row -> rows implementations
+typedef enum{
+ RBTREE,
+ IDXMAP
+} ROW_MMSYM_IMPL_MODE;
+
+//
+typedef struct{
+ ushort gridRows;
+ ushort gridCols;
+ //TODO FULL CONFIG DOCCED HERE
+ ROW_MMSYM_IMPL_MODE symbMMRowImplID; //how to compute symb mul (if required)
+ uint threadNum; //thread num to use in an OMP parallel region ...
+ void* chunkDistrbFunc; //CHUNKS_DISTR_INTERF func pntr
+} CONFIG;
+///Smart controls
+typedef size_t idx_t; //spmat indexes
+typedef unsigned __int128 uint128;
+
+#include "macros.h"
+#ifndef SPARSIFY_PRE_PARTITIONING
+ #define SPARSIFY_PRE_PARTITIONING TRUE //u.b. implementation will sparsify dense acc in a pre-split mem area
+#endif
+///AUDIT&CHECKS
+//debug checks and tmp stuff
+#ifndef DEBUG
+ #define DEBUG if( TRUE )
+#endif
+//long prints
+#ifndef DEBUGPRINT
+ #define DEBUGPRINT if( FALSE )
+#endif
+//heavy impact debug checks
+#ifndef DEBUGCHECKS
+ #define DEBUGCHECKS if( FALSE )
+#endif
+//extra print in the normal output
+#ifndef AUDIT_INTERNAL_TIMES
+ #define AUDIT_INTERNAL_TIMES if( TRUE )
+#endif
+#ifndef VERBOSE
+ #define VERBOSE if( FALSE )
+#endif
+//extra checks over the input and correct partials
+#ifndef CONSISTENCY_CHECKS
+ #define CONSISTENCY_CHECKS if( TRUE )
+#endif
+#ifndef SPVECT_IDX_BITWISE //SPVECT_IDX_DENSE_ACC.nnzIdxsFlags will be an array of bitflags
+ #define SPVECT_IDX_BITWISE TRUE
+#endif
+#if SPVECT_IDX_BITWISE == TRUE
+ #ifndef LIMB_T
+ #define LIMB_T uint128
+ #endif
+ typedef LIMB_T limb_t;
+ typedef limb_t* nnz_idxs_flags_t;
+ #define LIMB_SIZE_BIT ( sizeof(limb_t) * 8 )
+#else //nnz idxs are flags in a byte array
+ typedef uchar* nnz_idxs_flags_t;
+ #define LIMB_SIZE_BIT ( sizeof(uchar) * 8 )
+#endif
+
+///AUDIT extra configuration
+//#define ROWLENS
+#ifdef ROWLENS
+/*#pragma message("ROW_LENS ARRAY ENABLED")*/
+#endif
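+
+/* illustration (assumed semantics, not part of the build): with
+ * SPVECT_IDX_BITWISE == TRUE above, presence of a column index j is
+ * tracked as one bit inside an array of limb_t limbs:
+ * limb_t* flags; // ceil(vLen/LIMB_SIZE_BIT) limbs, zero-initialized
+ * int seen = ( flags[j/LIMB_SIZE_BIT] >> (j%LIMB_SIZE_BIT) ) & 1;
+ * flags[j/LIMB_SIZE_BIT] |= ((limb_t)1) << (j%LIMB_SIZE_BIT);
+ */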
+
+/*
+ * idxsMapAcc based symb row*rows: outIdxs and colParts carried with an aux rbtree,
+ * otherwise carried through an append array
+ * (hypothesis: same computational cost, ~n + n log n)
+ */
+#ifndef IDX_RMUL_SYMB_RBTREE
+ #define IDX_RMUL_SYMB_RBTREE FALSE
+#endif
+#ifndef RB_CACHED_INSERT
+ #define RB_CACHED_INSERT TRUE //use cached insert
+#endif
+//#define USE_RB_ROOT_CACHE_LMOST //use leftmost leaf cached in rbtree in sym mul
+///CONSTS
+#define ELL_MAX_ENTRIES ( 6l << 27 ) //2*6GB of ell (padded) entries maxSupport in a matrix
+#define LIMIT_ELL_SIZE //enable above threshold
+#define ELL_AS_FILLER (0 ) //handled with calloc
+//TODO NOW FILLED WITH LAST NNPADDED COL #define ELL_JA_FILLER (-1)
+//#define DOUBLE_VECT_DIFF_EARLY_EXIT 1
+//#define RNDVECTMIN 222222
+#define VECTOR_STEP_REALLOC 25
+#define VECTOR_READ_BLOCK 50 //file (raw) vector read block
+#define RNDVECTORSIZE 100000
+#define RNDVECTORDUMP TMPDIR "rndVectorDump"
+#define RNDVECTORDUMPRAW TMPDIR "rndVectorDumpRaw"
+#define OUTVECTORDUMP TMPDIR "outVectorDump"
+#define OUTVECTORDUMPRAW TMPDIR "outVectorDumpRaw"
+//#define FLOAT_DIFF_ABS
+#ifndef AVG_TIMES_ITERATION
+ #define AVG_TIMES_ITERATION 5
+#endif
+//ompChunksDivide.h -> chunksFairFolded()
+#ifndef FAIR_CHUNKS_FOLDING
+ #define FAIR_CHUNKS_FOLDING 4
+#endif
+//SPMV specific
+//rows partitions for dotProduct SIMD reduction enable
+#ifndef SIMD_ROWS_REDUCTION
+ #define SIMD_ROWS_REDUCTION TRUE
+#endif
+/*#if SIMD_ROWS_REDUCTION == TRUE
+ #pragma message("SIMD_ROWS_REDUCTION enabled")
+ //TODO SOME TRICK TO HAVE 1! PRINT
+#endif*/
+extern double Start,End,Elapsed,ElapsedInternal;
+#define DOUBLE_DIFF_THREASH 7e-4
+#define DRNG_DEVFILE "/dev/urandom"
+#define MAXRND 3e-5
+#ifndef TMPDIR
+ #define TMPDIR "/tmp/"
+#endif
+#define TMP_EXTRACTED_MARTIX TMPDIR "extractedMatrix"
+
+#endif //CONFIG_H
diff --git a/base/serial/impl/sp3mm4amg/include/inlineExports.h b/base/serial/impl/sp3mm4amg/include/inlineExports.h
new file mode 100644
index 00000000..cc3e0a83
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/inlineExports.h
@@ -0,0 +1,49 @@
+/*
+ * Sp3MM_for_AlgebraicMultiGrid
+ * (C) Copyright 2021-2022
+ * Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//inline export - single implementation functions
+void cleanRbNodes(rbRoot* root,rbNode* nodes,idx_t nodesNum);
+int rbInsertNewKey(rbRoot *root,rbNode *node, idx_t key);
+//void C_FortranShiftIdxs(spmat* outMat);
+//void Fortran_C_ShiftIdxs(spmat* m);
+ACC_DENSE* _initAccVectors(ulong num,ulong size);
+ACC_DENSE* _initAccVectors_monoalloc(ulong num,ulong size); //TODO PERF WITH NEXT
+SPMM_ACC* initSpMMAcc(ulong entriesNum, ulong accumulatorsNum);
+idx_t reductionMaxSeq(idx_t* arr,idx_t arrLen);
+int _allocAccDense(ACC_DENSE* v,ulong size);
+int mergeRows(SPACC* rows,spmat* mat);
+int mergeRowsPartitions(SPACC* rowsParts,spmat* mat,CONFIG* conf);
+void _freeAccsDenseChecks(ACC_DENSE* vectors,ulong num);
+void _resetAccVect(ACC_DENSE* acc);
+void _resetIdxMap(SPVECT_IDX_DENSE_MAP* acc);
+void assertArrNoRepetitions(idx_t* arrSorted, idx_t arrLen);
+void freeAccsDense(ACC_DENSE* vectors,ulong num);
+void freeSpMMAcc(SPMM_ACC* acc);
+void sparsifyDenseVect(SPMM_ACC* acc,ACC_DENSE* accV,SPACC* accSparse, ulong startColAcc);
diff --git a/base/serial/impl/sp3mm4amg/include/inlineExports_Generic.h b/base/serial/impl/sp3mm4amg/include/inlineExports_Generic.h
new file mode 100644
index 00000000..eba852f2
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/inlineExports_Generic.h
@@ -0,0 +1,43 @@
+/*
+ * Sp3MM_for_AlgebraicMultiGrid
+ * (C) Copyright 2021-2022
+ * Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "macros.h"
+
+#ifndef OFF_F
+ #error generic implementation requires OFF_F defined
+#endif
+
+////inline exports
+//multi implementation functions
+void CAT(scSparseVectMul_,OFF_F)(double scalar,double* vectVals,ulong* vectIdxs,ulong vectLen, ACC_DENSE* aux);
+void CAT(scSparseVectMulPart_,OFF_F)(double scalar,double* vectVals,ulong* vectIdxs,ulong vectLen,ulong startIdx,ACC_DENSE* aux);
+void CAT(_scRowMul_,OFF_F)(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux);
+void CAT(scSparseRowMul_,OFF_F)(double scalar,spmat* mat,ulong trgtR, ACC_DENSE* aux);
+idx_t* CAT(spMMSizeUpperbound_,OFF_F)(spmat* A,spmat* B);
+idx_t* CAT(spMMSizeUpperboundColParts_,OFF_F)(spmat* A,spmat* B,ushort gridCols,idx_t* bColPartOffsets);
diff --git a/base/serial/impl/sp3mm4amg/include/linuxK_rbtree_minimalized.h b/base/serial/impl/sp3mm4amg/include/linuxK_rbtree_minimalized.h
new file mode 100644
index 00000000..187bfe28
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/linuxK_rbtree_minimalized.h
@@ -0,0 +1,335 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ Red Black Trees
+ (C) 1999 Andrea Arcangeli
+
+
+ Userspace GNUC porting, removal of augmented deps and minimization to a few OPs only: Andrea Di Iorio
+ linux/include/linux/rbtree.h
+
+ To use rbtrees you'll have to implement your own insert and search cores.
+ This will avoid us to use callbacks and to drop drammatically performances.
+ I know it's not the cleaner way, but in C (not in C++) to get
+ performances and genericity...
+
+ See Documentation/core-api/rbtree.rst for documentation and samples.
+*/
+/*
+ * RedBlackTree_linux_userspace
+ * (C) Copyright 2021-2022
+ * Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the RedBlackTree_linux_userspace or the names of its contributors may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE RedBlackTree_linux_userspace GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+ /*
+ * https://bitbucket.org/andysnake96/redblacktree_linux_userspace
+ */
+
+
+#ifndef _LINUX_RBTREE_H
+#define _LINUX_RBTREE_H
+
+//#include <linux/kernel.h> //TODO LESS_DEPENDENCIES
+//#include <linux/stddef.h> //TODO LESS_DEPENDENCIES
+//#include <linux/rcupdate.h> //TODO LESS_DEPENDENCIES
+#include <stddef.h>
+#include <assert.h>
+
+#include <macros.h> ///my usefull macros :)
+#include <macrosLinuxMock.h> ///my usefull macros :)
+
+struct rb_node {
+ unsigned long __rb_parent_color;
+ struct rb_node *rb_right;
+ struct rb_node *rb_left;
+} __attribute__((aligned(sizeof(long))));
+ /* The alignment might seem pointless, but allegedly CRIS needs it */
+
+struct rb_root {
+ struct rb_node *rb_node;
+};
+/////MINIMALIZE fulfilling rbtree_augmented deps TODO
+///rb_node colors labels
+#define RB_RED 0
+#define RB_BLACK 1
+#define __rb_parent(pc) ((struct rb_node *)(pc & ~3))
+
+#define __rb_color(pc) ((pc) & 1)
+#define __rb_is_black(pc) __rb_color(pc)
+#define __rb_is_red(pc) (!__rb_color(pc))
+#define rb_color(rb) __rb_color((rb)->__rb_parent_color)
+#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color)
+#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color)
+
+
+static inline void rb_set_parent_color(struct rb_node *rb,
+ struct rb_node *p, int color)
+{
+ rb->__rb_parent_color = (unsigned long)p | color;
+}
+///rb_set_parent //TODO GENERIC ADDONLY
+
+static inline void
+__rb_change_child(struct rb_node *old, struct rb_node *new,
+ struct rb_node *parent, struct rb_root *root)
+{
+ if (parent) {
+ if (parent->rb_left == old)
+ WRITE_ONCE(parent->rb_left, new);
+ else
+ WRITE_ONCE(parent->rb_right, new);
+ } else
+ WRITE_ONCE(root->rb_node, new);
+}
+
+/////////////////////////////
+
+#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
+
+#define RB_ROOT (struct rb_root) { NULL, }
+#define rb_entry(ptr, type, member) container_of(ptr, type, member)
+
+#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL)
+
+/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
+#define RB_EMPTY_NODE(node) \
+ ((node)->__rb_parent_color == (unsigned long)(node))
+#define RB_CLEAR_NODE(node) \
+ ((node)->__rb_parent_color = (unsigned long)(node))
+
+
+extern void rb_insert_color(struct rb_node *, struct rb_root *);
+extern void rb_erase(struct rb_node *, struct rb_root *);
+
+
+/* Find logical next and previous nodes in a tree */
+extern struct rb_node *rb_next(const struct rb_node *);
+extern struct rb_node *rb_prev(const struct rb_node *);
+extern struct rb_node *rb_first(const struct rb_root *);
+extern struct rb_node *rb_last(const struct rb_root *);
+
+/* Postorder iteration - always visit the parent after its children */
+extern struct rb_node *rb_first_postorder(const struct rb_root *);
+extern struct rb_node *rb_next_postorder(const struct rb_node *);
+
+/* Fast replacement of a single node without remove/rebalance/add/rebalance */
+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+ struct rb_root *root);
+extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
+ struct rb_root *root);
+
+
+static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
+ struct rb_node **rb_link)
+{
+ node->__rb_parent_color = (unsigned long)parent;
+ node->rb_left = node->rb_right = NULL;
+
+ *rb_link = node;
+}
+
+/*static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
+ struct rb_node **rb_link)
+{
+ node->__rb_parent_color = (unsigned long)parent;
+ node->rb_left = node->rb_right = NULL;
+
+ rcu_assign_pointer(*rb_link, node);
+}**/ //TODO LESS_DEPENDENCIES
+
+#define rb_entry_safe(ptr, type, member) \
+ ({ typeof(ptr) ____ptr = (ptr); \
+ ____ptr ? rb_entry(____ptr, type, member) : NULL; \
+ })
+
+/**
+ * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
+ * given type allowing the backing memory of @pos to be invalidated
+ *
+ * @pos: the 'type *' to use as a loop cursor.
+ * @n: another 'type *' to use as temporary storage
+ * @root: 'rb_root *' of the rbtree.
+ * @field: the name of the rb_node field within 'type'.
+ *
+ * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
+ * list_for_each_entry_safe() and allows the iteration to continue independent
+ * of changes to @pos by the body of the loop.
+ *
+ * Note, however, that it cannot handle other modifications that re-order the
+ * rbtree it is iterating over. This includes calling rb_erase() on @pos, as
+ * rb_erase() may rebalance the tree, causing us to miss some nodes.
+ */
+#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
+ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
+ pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
+ typeof(*pos), field); 1; }); \
+ pos = n)
+
+/*
+ * Leftmost-cached rbtrees.
+ *
+ * We do not cache the rightmost node based on footprint
+ * size vs number of potential users that could benefit
+ * from O(1) rb_last(). Just not worth it, users that want
+ * this feature can always implement the logic explicitly.
+ * Furthermore, users that want to cache both pointers may
+ * find it a bit asymmetric, but that's ok.
+ */
+struct rb_root_cached {
+ struct rb_root rb_root;
+ struct rb_node *rb_leftmost;
+};
+
+#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
+
+/* Same as rb_first(), but O(1) */
+#define rb_first_cached(root) (root)->rb_leftmost
+
+static inline void rb_insert_color_cached(struct rb_node *node,
+ struct rb_root_cached *root,
+ bool leftmost)
+{
+ if (leftmost)
+ root->rb_leftmost = node;
+ rb_insert_color(node, &root->rb_root);
+}
+
+static inline void rb_erase_cached(struct rb_node *node,
+ struct rb_root_cached *root)
+{
+ if (root->rb_leftmost == node)
+ root->rb_leftmost = rb_next(node);
+ rb_erase(node, &root->rb_root);
+}
+
+static inline void rb_replace_node_cached(struct rb_node *victim,
+ struct rb_node *new,
+ struct rb_root_cached *root)
+{
+ if (root->rb_leftmost == victim)
+ root->rb_leftmost = new;
+ rb_replace_node(victim, new, &root->rb_root);
+}
+///////////////////////////////////////////////////////////////////////////////
+/***END OF ORIGINAL LINUX KERNEL HEADER MINIMALIZED
+ ***FEW AUX FUNCS TO SUPPORT SpMM-Symbolic
+ ***/
+#include <string.h>
+#include "config.h"
+typedef struct{
+ idx_t key;
+ struct rb_node rb;
+
+ /* following fields used for testing augmented rbtree functionality
+ u32 val;
+ u32 augmented; ///only for AUGMENTED_TEST
+ */
+} rbNode;
+//static struct rb_root_cached root = RB_ROOT_CACHED;
+
+typedef struct rb_root_cached rbRoot;
+
+
+/*
+ * return 1 if @node with the given key @key
+ * has been inserted in rbtree rooted at @root; 0 otherwise
+ */
+static inline int rbInsertStdNewKey(rbRoot *root,rbNode *node, idx_t key)
+{
+ struct rb_node **new = &root->rb_root.rb_node, *parent = NULL;
+ idx_t parentK;
+
+ while (*new) {
+ parent = *new;
+ parentK = rb_entry(parent, rbNode, rb)->key;
+ if (key < parentK) new = &parent->rb_left;
+ else if (key > parentK) new = &parent->rb_right;
+ else return 0; //already in
+ DEBUGCHECKS assert( *new != parent );
+ }
+ //insert the node in the correct position, assigning the key
+ /*DEBUGCHECKS{ //check for double insertion
+ rbNode testNode;
+ memset(&testNode,0,sizeof(testNode));
+ assert( !memcmp(node,&testNode,sizeof(*node)) );
+ for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n))
+ assert( n != *new );
+ }*/
+
+ node->key = key;
+ rb_link_node(&node->rb, parent, new);
+ rb_insert_color(&node->rb, &root->rb_root);
+ return 1;
+}
+
+static inline int rbInsertCachedNewKey(rbRoot *root,rbNode *node, idx_t key)
+{
+ struct rb_node **new = &root->rb_root.rb_node, *parent = NULL;
+ idx_t parentK;
+ bool leftmost = true;
+
+ while (*new) {
+ parent = *new;
+ parentK = rb_entry(parent, rbNode, rb)->key;
+ if (key < parentK) new = &parent->rb_left;
+ else if (key > parentK) {
+ new = &parent->rb_right;
+ leftmost = false;
+ }
+ else return 0;
+ }
+
+ //insert the node in the correct position, assigning the key
+ /*DEBUGCHECKS{ //check for double insertion
+ rbNode testNode;
+ memset(&testNode,0,sizeof(testNode));
+ assert( !memcmp(node,&testNode,sizeof(*node)) );
+ for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n))
+ assert( n != *new );
+ }*/
+
+ node->key = key;
+ rb_link_node(&node->rb, parent, new);
+ rb_insert_color_cached(&node->rb, root, leftmost);
+ return 1;
+}
+
+static inline int rbInsertNewKey(rbRoot *root,rbNode *node, idx_t key){
+ #if RB_CACHED_INSERT == TRUE
+ return rbInsertCachedNewKey(root,node,key);
+ #else
+ return rbInsertStdNewKey(root,node,key);
+ #endif
+}
+
+#define rbNodeOrderedVisit(n,root) \
+ for (n = rb_first(&root->rb_root); n; n = rb_next(n));
+
+inline void cleanRbNodes(rbRoot* root,rbNode* nodes,idx_t nodesNum){
+ memset(nodes,0,nodesNum * sizeof(*nodes));
+ memset(root,0,sizeof(*root));
+}
+#endif /* _LINUX_RBTREE_H */
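+
+/* minimal usage sketch (illustration only, not part of this header):
+ * rbRoot root = RB_ROOT_CACHED;
+ * rbNode nodes[8] = {0}; idx_t added = 0;
+ * idx_t keys[] = {5,3,5,9}; // the duplicate 5 is rejected
+ * for (int i = 0; i < 4; i++)
+ * added += rbInsertNewKey(&root, nodes + added, keys[i]);
+ * // added == 3; rb_first()/rb_next() then visit keys in order: 3,5,9
+ * cleanRbNodes(&root, nodes, 8); // memset-reset root and pool for reuse
+ */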
diff --git a/base/serial/impl/sp3mm4amg/include/macros.h b/base/serial/impl/sp3mm4amg/include/macros.h
new file mode 100644
index 00000000..c75017ee
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/macros.h
@@ -0,0 +1,118 @@
+/*
+ * Sp3MM_for_AlgebraicMultiGrid
+ * (C) Copyright 2021-2022
+ * Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef MACROS
+#define MACROS
+
+#include <stdio.h>
+#include <stdlib.h>
+
+///aux macro-functions
+#define ABS(a) ((a) > 0 ? (a) : -(a))
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#define AVG(a,b) ((a)/2 + (b)/2 + ((a)%2+(b)%2)/2)
+#define SWAP(a,b) (a)=(a)^(b);(b)=(b)^(a);(a)=(a)^(b)
+#define IN_RANGE(i,start,end) ( (start) <= (i) && (i) <= (end) )
+
+//ceil(x/y) with integers
+#define INT_DIV_CEIL(x,y) ( ( (x) - 1) / (y) + 1 )
+//2D ROW MAJOR indexing wrap compute
+#define IDX2D(i,j,nCols) ((j) + (i)*(nCols))
+
+///distribute remainder @rem in groups, giving an extra +1 to the first @rem of them
+#define UNIF_REMINDER_DISTRI(i,div,rem) \
+ ( (div) + ( (i) < (rem) ? 1 : 0 ) )
+#define UNIF_REMINDER_DISTRI_STARTIDX(i,div,rem) \
+ ( (i) * (div) + MIN( (i),(rem) ) )
+#define UNIF_REMINDER_DISTRI_ENDIDX(i,div,rem) \
+ ( ((i)+1) * (div) + MIN( (i)+1,(rem) ) )
+//shorter name alias
+#define unifRemShare(i,div,rem) UNIF_REMINDER_DISTRI(i,div,rem)
+#define unifRemShareStart(i,div,rem) UNIF_REMINDER_DISTRI_STARTIDX(i,div,rem)
+#define unifRemShareEnd(i,div,rem) UNIF_REMINDER_DISTRI_ENDIDX( (i) , (div) , (rem) )
+#define unifRemShareBlock(i,div,rem) unifRemShareStart(i,div,rem), unifRemShare(i,div,rem)
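+
+/* worked example (illustration only): distributing 11 items over 3 groups,
+ * div = 11/3 = 3, rem = 11%3 = 2 -> shares 4,4,3 (group i gets div+1 iff
+ * i < rem); unifRemShareStart yields 0,4,8 and unifRemShareEnd 4,8,11,
+ * so each group's end matches the next group's start. */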
+
+#define STATIC_ARR_ELEMENTS_N(arr) (sizeof( (arr) ) / (sizeof(*(arr))))
+////STRING UTILS
+#define _STR(s) #s
+#define STR(s) _STR(s)
+///CONCATENATE
+//Concatenate preprocessor tokens A and B WITHOUT expanding macro definitions
+#define _CAT(a,b) a ## b
+#define _CAT3(a,b,c) a ## b ## c
+#define _CAT4(a,b,c,d) a ## b ## c ## d
+//Concatenate preprocessor tokens A and B EXPANDING macro definitions
+#define CAT(a,b) _CAT(a,b)
+#define CAT3(a,b,c) _CAT3(a,b,c)
+#define CAT4(a,b,c,d) _CAT4(a,b,c,d)
+
+#define _UNDEF _
+////PRINTS
+#define CHIGHLIGHT "\33[1m\33[92m"
+#define CCC CHIGHLIGHT
+#define CHIGHLIGHTERR "\33[31m\33[1m\33[44m"
+#define CCCERR CHIGHLIGHTERR
+#define CEND "\33[0m"
+#define hprintsf(str,...) printf( CHIGHLIGHT str CEND,__VA_ARGS__ )
+#define hprintf(str) printf( CHIGHLIGHT str CEND)
+#define ERRPRINTS(str,...) fprintf( stderr, CHIGHLIGHTERR str CEND,__VA_ARGS__ )
+#define ERRPRINT(str) fprintf( stderr, CHIGHLIGHTERR str CEND )
+
+#include
+
+///aux types
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+typedef char bool;
+#define TRUE (!0)
+#define FALSE 0
+#define true TRUE
+#define false FALSE
+#define T TRUE
+#define F FALSE
+//smart decimal type custom precision def build macro _DECIMAL_TRGT_PREC
+#ifndef _DECIMAL_TRGT_PREC
+//dflt floating point precision & formatting chars
+ #define _DECIMAL_TRGT_PREC double
+ #define _DECIMAL_TRGT_PREC_PR "%lf"
+#else
+ //TODO SELECT WITH RESPECT TO THE EXPORTED TARGET DECIMAL TYPE
+ #define _DECIMAL_TRGT_PREC_PR "%f"
+#endif
+typedef _DECIMAL_TRGT_PREC decimal;
+
+
+///EXTRA INCLUDE --- cuda
+///assertions are disabled at compile time by defining the NDEBUG preprocessor macro before including assert.h
+//#ifdef ASSERT #include <assert.h> #endif
+
+#endif //MACROS
diff --git a/base/serial/impl/sp3mm4amg/include/macrosLinuxMock.h b/base/serial/impl/sp3mm4amg/include/macrosLinuxMock.h
new file mode 100644
index 00000000..6b323330
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/macrosLinuxMock.h
@@ -0,0 +1,140 @@
+/*
+ * RedBlackTree_linux_userspace
+ * (C) Copyright 2021-2022
+ * Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the RedBlackTree_linux_userspace or the names of its contributors may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE RedBlackTree_linux_userspace GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MACROSLINUXMOCK
+#define MACROSLINUXMOCK
+
+////////////////////////// LINUX KERNEL Userspace RBTree //////////////////////////////
+typedef unsigned int u32;
+typedef unsigned int cycles_t;
+
+#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER)
+
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr: the pointer to the member.
+ * @type: the type of the container struct this is embedded in.
+ * @member: the name of the member within the struct.
+ * TODO LESS_DEPENDENCIES
+ removed:
+ BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
+ !__same_type(*(ptr), void), \
+ "pointer type mismatch in container_of()"); \
+
+ */
+#define container_of(ptr, type, member) ({ \
+ void *__mptr = (void *)(ptr); \
+ ((type *)(__mptr - offsetof(type, member))); })
+
+#define NOOP(x) x
+//#define NOOP do {} while(0)
+#undef unlikely
+#define unlikely(x) NOOP(x)
+/*
+ * Yes, this permits 64-bit accesses on 32-bit architectures. These will
+ * actually be atomic in some cases (namely Armv7 + LPAE), but for others we
+ * rely on the access being split into 2x32-bit accesses for a 32-bit quantity
+ * (e.g. a virtual address) and a strong prevailing wind.
+ */
+#define compiletime_assert_rwonce_type(t) \
+ compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \
+ "Unsupported access size for {READ,WRITE}_ONCE().")
+///TODO OVERWRITTEN FOR PORTING
+#undef compiletime_assert_rwonce_type
+#define compiletime_assert_rwonce_type(t) NOOP(t)
+/*
+ * Use __READ_ONCE() instead of READ_ONCE() if you do not require any
+ * atomicity. Note that this may result in tears!
+ */
+#ifndef __READ_ONCE
+#define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x))
+#endif
+
+#define READ_ONCE(x) \
+({ \
+ compiletime_assert_rwonce_type(x); \
+ __READ_ONCE(x); \
+})
+
+#define __WRITE_ONCE(x, val) \
+do { \
+ *(volatile typeof(x) *)&(x) = (val); \
+} while (0)
+
+/*//TODO ORIGINAL VERSION
+#define WRITE_ONCE(x, val) \
+do { \
+ compiletime_assert_rwonce_type(x); \
+ __WRITE_ONCE(x, val); \
+} while (0)
+*/
+#define WRITE_ONCE(x,val) x = val
+
+/*
+ * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a
+ * word from memory atomically but without telling KASAN/KCSAN. This is
+ * usually used by unwinding code when walking the stack of a running process.
+ */
+#define READ_ONCE_NOCHECK(x) \
+({ \
+ compiletime_assert(sizeof(x) == sizeof(unsigned long), \
+ "Unsupported access size for READ_ONCE_NOCHECK()."); \
+ (typeof(x))__read_once_word_nocheck(&(x)); \
+})
+
+#include <x86intrin.h> //rdtsc
+static inline cycles_t get_cycles(void)
+{
+
+ /**#ifndef CONFIG_X86_TSC
+ if (!boot_cpu_has(X86_FEATURE_TSC))
+ return 0;
+ #endif */ ///TODO LESS_DEPENDENCIES
+
+ return __rdtsc();
+}
+
+/**#define WARN_ON_ONCE(condition) ({ \
+ static bool __section(".data.once") __warned; \
+ int __ret_warn_once = !!(condition); \
+ \
+ if (unlikely(__ret_warn_once && !__warned)) { \
+ __warned = true; \
+ WARN_ON(1); \
+ } \
+ unlikely(__ret_warn_once); \
+}) */ //TODO LESS_DEPENDENCIES
+#include <assert.h>
+#define WARN_ON_ONCE(condition) assert( !(condition) )
+
+#define div_u64(a,b) ( a / b )
+#define kfree(x) free(x)
+
+#endif //MACROSLINUXMOCK
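+
+/* usage sketch (illustration only): container_of recovers the enclosing
+ * struct from a pointer to one of its members, which is exactly how
+ * rb_entry is used with the rbNode type of linuxK_rbtree_minimalized.h:
+ * struct rb_node* n = rb_first(&root->rb_root);
+ * rbNode* entry = container_of(n, rbNode, rb); // entry->key is usable
+ */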
diff --git a/base/serial/impl/sp3mm4amg/include/mmio.h b/base/serial/impl/sp3mm4amg/include/mmio.h
new file mode 100644
index 00000000..7cfd0a1b
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/mmio.h
@@ -0,0 +1,133 @@
+/*
+* Matrix Market I/O library for ANSI C
+*
+* See http://math.nist.gov/MatrixMarket for details.
+*
+*
+*/
+
+#ifndef MM_IO_H
+#define MM_IO_H
+
+#define MM_MAX_LINE_LENGTH 1025
+#define MatrixMarketBanner "%%MatrixMarket"
+#define MM_MAX_TOKEN_LENGTH 64
+
+typedef char MM_typecode[4];
+
+char *mm_typecode_to_str(MM_typecode matcode);
+
+int mm_read_banner(FILE *f, MM_typecode *matcode);
+int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
+int mm_read_mtx_array_size(FILE *f, int *M, int *N);
+
+int mm_write_banner(FILE *f, MM_typecode matcode);
+int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
+int mm_write_mtx_array_size(FILE *f, int M, int N);
+
+
+/********************* MM_typecode query functions ***************************/
+
+#define mm_is_matrix(typecode) ((typecode)[0]=='M')
+
+#define mm_is_sparse(typecode) ((typecode)[1]=='C')
+#define mm_is_coordinate(typecode)((typecode)[1]=='C')
+#define mm_is_dense(typecode) ((typecode)[1]=='A')
+#define mm_is_array(typecode) ((typecode)[1]=='A')
+
+#define mm_is_complex(typecode) ((typecode)[2]=='C')
+#define mm_is_real(typecode) ((typecode)[2]=='R')
+#define mm_is_pattern(typecode) ((typecode)[2]=='P')
+#define mm_is_integer(typecode) ((typecode)[2]=='I')
+
+#define mm_is_symmetric(typecode)((typecode)[3]=='S')
+#define mm_is_general(typecode) ((typecode)[3]=='G')
+#define mm_is_skew(typecode) ((typecode)[3]=='K')
+#define mm_is_hermitian(typecode)((typecode)[3]=='H')
+
+int mm_is_valid(MM_typecode matcode); /* too complex for a macro */
+
+
+/********************* MM_typecode modify functions ***************************/
+
+#define mm_set_matrix(typecode) ((*typecode)[0]='M')
+#define mm_set_coordinate(typecode) ((*typecode)[1]='C')
+#define mm_set_array(typecode) ((*typecode)[1]='A')
+#define mm_set_dense(typecode) mm_set_array(typecode)
+#define mm_set_sparse(typecode) mm_set_coordinate(typecode)
+
+#define mm_set_complex(typecode)((*typecode)[2]='C')
+#define mm_set_real(typecode) ((*typecode)[2]='R')
+#define mm_set_pattern(typecode)((*typecode)[2]='P')
+#define mm_set_integer(typecode)((*typecode)[2]='I')
+
+
+#define mm_set_symmetric(typecode)((*typecode)[3]='S')
+#define mm_set_general(typecode)((*typecode)[3]='G')
+#define mm_set_skew(typecode) ((*typecode)[3]='K')
+#define mm_set_hermitian(typecode)((*typecode)[3]='H')
+
+#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
+ (*typecode)[2]=' ',(*typecode)[3]='G')
+
+#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
+
+
+/********************* Matrix Market error codes ***************************/
+
+
+#define MM_COULD_NOT_READ_FILE 11
+#define MM_PREMATURE_EOF 12
+#define MM_NOT_MTX 13
+#define MM_NO_HEADER 14
+#define MM_UNSUPPORTED_TYPE 15
+#define MM_LINE_TOO_LONG 16
+#define MM_COULD_NOT_WRITE_FILE 17
+
+
+/******************** Matrix Market internal definitions ********************
+
+ MM_matrix_typecode: 4-character sequence
+
+ object sparse/ data storage
+ dense type scheme
+
+ string position: [0] [1] [2] [3]
+
+ Matrix typecode: M(atrix) C(oord) R(eal) G(eneral)
+ A(array) C(omplex) H(ermitian) P(attern)
+ S(ymmetric)
+ I(nteger) K(skew)
+
+ ***********************************************************************/
+
+#define MM_MTX_STR "matrix"
+#define MM_ARRAY_STR "array"
+#define MM_DENSE_STR "array"
+#define MM_COORDINATE_STR "coordinate"
+#define MM_SPARSE_STR "coordinate"
+#define MM_COMPLEX_STR "complex"
+#define MM_REAL_STR "real"
+#define MM_INT_STR "integer"
+#define MM_GENERAL_STR "general"
+#define MM_SYMM_STR "symmetric"
+#define MM_HERM_STR "hermitian"
+#define MM_SKEW_STR "skew-symmetric"
+#define MM_PATTERN_STR "pattern"
+
+
+/* high level routines */
+
+int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
+ double val[], MM_typecode matcode);
+int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
+ double val[], MM_typecode matcode);
+int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
+ MM_typecode matcode);
+
+int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
+ double **val_, int **I_, int **J_);
+
+
+
+#endif
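+
+/* usage sketch (illustration only): reading a coordinate-format header
+ * with the routines declared above:
+ * MM_typecode tc; int m, n, nz;
+ * FILE* f = fopen("A.mtx", "r"); // hypothetical file name
+ * if (mm_read_banner(f, &tc) || !mm_is_coordinate(tc)) { fail } ;
+ * mm_read_mtx_crd_size(f, &m, &n, &nz); // nz (i,j[,val]) entries follow
+ */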
+ * configuration is expected to have a valid threadNum set
+ */
+#include "config.h"
+//distribution of @rows|blocks of @matrix, exploiting @config
+typedef void (CHUNKS_DISTR )         (ulong,spmat*,CONFIG*);
+typedef void (*CHUNKS_DISTR_INTERF ) (ulong,spmat*,CONFIG*);
+
+//NOOP chunks division for manual configuration via export OMP_SCHEDULE
+inline void chunksNOOP(ulong r,spmat* mat,CONFIG* cfg){ return; }
+#include "ompGetICV.h"
+//fair division of @r elements from matrix @mat with threads num in @cfg
+inline void chunksFair(ulong r,spmat* mat,CONFIG* cfg){
+    assert(cfg->threadNum > 0); //configured target thread num
+    omp_sched_t k,kind; int chunk_size,chunk_size_new=0,monotonic;
+    omp_get_schedule(&kind,&chunk_size);
+    k = kind;
+    #if _OPENMP >= 201811   //OMP_SCHED_SCHEDULE modifier from 5.0
+    monotonic = omp_sched_monotonic & kind;
+    if(monotonic)   k = kind - omp_sched_monotonic;
+    #endif
+    (void) monotonic;   //else no unused warning
+    switch(k){
+        case omp_sched_static :
+            DEBUG   printf("static is already fair\n");
+            return;
+        case omp_sched_dynamic:
+            chunk_size_new = MAX(1,r/cfg->threadNum);
+            break;
+        //case omp_sched_guided :
+        //case omp_sched_auto   :
+        //default:
+    }
+    if(chunk_size == chunk_size_new)    return;
+    omp_set_schedule(kind,chunk_size_new);
+    VERBOSE printf("chunksFair:\tchunk adapted to %d\n",chunk_size_new);
+    DEBUG   ompGetRuntimeSchedule(NULL);
+}
+//fair division of @r elements from matrix @mat with threads num in @cfg,
+//subdividing the fair share by a factor of @FAIR_CHUNKS_FOLDING
+inline void chunksFairFolded(ulong r,spmat* mat,CONFIG* cfg){
+    assert(cfg->threadNum > 0); //configured target thread num
+    omp_sched_t k,kind; int chunk_size,chunk_size_new=0,monotonic;
+    omp_get_schedule(&kind,&chunk_size);
+    k = kind;
+    #if _OPENMP >= 201811   //OMP_SCHED_SCHEDULE modifier from 5.0
+    monotonic = omp_sched_monotonic & kind;
+    if(monotonic)   k = kind - omp_sched_monotonic;
+    #endif
+    (void) monotonic;   //else no unused warning
+    switch(k){
+        case omp_sched_static :
+            DEBUG   printf("static is already fair\n");
+            return;
+        case omp_sched_dynamic:
+            chunk_size_new = MAX(1,r/(cfg->threadNum*FAIR_CHUNKS_FOLDING));
+            break;
+        //case omp_sched_guided :
+        //case omp_sched_auto   :
+        //default:
+    }
+    if(chunk_size == chunk_size_new)    return;
+    omp_set_schedule(kind,chunk_size_new);
+    DEBUG{  //check with ICV get
+        printf("chunksFairFolded:\tchunk adapted to %d\n",chunk_size_new);
+        ompGetRuntimeSchedule(NULL);
+    }
+}
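+
+/*
+ * Illustrative sketch (not in the original header): how a caller is expected
+ * to plug one of the distribution functions above in front of a
+ * runtime-scheduled loop; the helper name is hypothetical.
+ */
+static inline void chunksDistrExample(spmat* mat, CONFIG* cfg){
+    CHUNKS_DISTR_INTERF distr = &chunksFair;  //or chunksNOOP / chunksFairFolded
+    distr(mat->M, mat, cfg);                  //retunes omp_set_schedule() if needed
+    #pragma omp parallel for schedule(runtime)
+    for (ulong r = 0; r < mat->M; r++){
+        (void)r;    //per-row work would go here
+    }
+}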
+#endif
diff --git a/base/serial/impl/sp3mm4amg/include/ompGetICV.h b/base/serial/impl/sp3mm4amg/include/ompGetICV.h
new file mode 100644
index 00000000..6e545a1c
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/ompGetICV.h
@@ -0,0 +1,41 @@
+/*
+ * Sp3MM_for_AlgebraicMultiGrid
+ * (C) Copyright 2021-2022
+ *     Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions, and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
+ *    not be used to endorse or promote products derived from this
+ *    software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef OMPGETICV_H
+#define OMPGETICV_H
+//only header definitions
+/*
+ * log sched configuration on stdout
+ * return kind,monotonic,chunkSize if arg0 not NULL
+ */
+void ompGetRuntimeSchedule(int* );
+void ompGetAllICV();        //only if not OMP_GET_ICV_MAIN
+float ompVersionMacroMap(); //version number as float using API dates mappings
+#endif
diff --git a/base/serial/impl/sp3mm4amg/include/ompi_config_minimal.h b/base/serial/impl/sp3mm4amg/include/ompi_config_minimal.h
new file mode 100644
index 00000000..88e4ccb9
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/ompi_config_minimal.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+ *                         University Research and Technology
+ *                         Corporation.  All rights reserved.
+ * Copyright (c) 2004-2021 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
+ *                         University of Stuttgart.  All rights reserved.
+ * Copyright (c) 2004-2005 The Regents of the University of California.
+ *                         All rights reserved.
+ * Copyright (c) 2007-2021 Cisco Systems, Inc.  All rights reserved
+ * Copyright (c) 2008-2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright (c) 2009-2012 Oak Rigde National Laboratory.  All rights reserved.
+ * Copyright (c) 2011-2020 Sandia National Laboratories. All rights reserved.
+ * Copyright (c) 2012-2018 Los Alamos National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2011-2013 INRIA.  All rights reserved.
+ * Copyright (c) 2015      University of Houston. All rights reserved.
+ * Copyright (c) 2015-2021 Research Organization for Information Science
+ *                         and Technology (RIST).  All rights reserved.
+ * Copyright (c) 2017-2019 IBM Corporation.  All rights reserved.
+ * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
+ * Copyright (c) 2021-2022 Google, LLC. All rights reserved.
+ * Copyright (c) 2021-2022 Amazon.com, Inc. or its affiliates.  All Rights
+ *                         reserved.
+ * Copyright (c) 2021      Bull S.A.S. All rights reserved.
+ * Copyright (c) 2018      Triad National Security, LLC. All rights
+ * Copyright (c) 2018-2021 Triad National Security, LLC. All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+/*
+ * Error classes and codes
+ * Do not change the values of these without also modifying mpif.h.in.
+ */
+#define MPI_SUCCESS                   0
+#define MPI_ERR_BUFFER                1
+#define MPI_ERR_COUNT                 2
+#define MPI_ERR_TYPE                  3
+#define MPI_ERR_TAG                   4
+#define MPI_ERR_COMM                  5
+#define MPI_ERR_RANK                  6
+#define MPI_ERR_REQUEST               7
+#define MPI_ERR_ROOT                  8
+#define MPI_ERR_GROUP                 9
+#define MPI_ERR_OP                    10
+#define MPI_ERR_TOPOLOGY              11
+#define MPI_ERR_DIMS                  12
+#define MPI_ERR_ARG                   13
+#define MPI_ERR_UNKNOWN               14
+#define MPI_ERR_TRUNCATE              15
+#define MPI_ERR_OTHER                 16
+#define MPI_ERR_INTERN                17
+#define MPI_ERR_IN_STATUS             18
+#define MPI_ERR_PENDING               19
+#define MPI_ERR_ACCESS                20
+#define MPI_ERR_AMODE                 21
+#define MPI_ERR_ASSERT                22
+#define MPI_ERR_BAD_FILE              23
+#define MPI_ERR_BASE                  24
+#define MPI_ERR_CONVERSION            25
+#define MPI_ERR_DISP                  26
+#define MPI_ERR_DUP_DATAREP           27
+#define MPI_ERR_FILE_EXISTS           28
+#define MPI_ERR_FILE_IN_USE           29
+#define MPI_ERR_FILE                  30
+#define MPI_ERR_INFO_KEY              31
+#define MPI_ERR_INFO_NOKEY            32
+#define MPI_ERR_INFO_VALUE            33
+#define MPI_ERR_INFO                  34
+#define MPI_ERR_IO                    35
+#define MPI_ERR_KEYVAL                36
+#define MPI_ERR_LOCKTYPE              37
+#define MPI_ERR_NAME                  38
+#define MPI_ERR_NO_MEM                39
+#define MPI_ERR_NOT_SAME              40
+#define MPI_ERR_NO_SPACE              41
+#define MPI_ERR_NO_SUCH_FILE          42
+#define MPI_ERR_PORT                  43
+#define MPI_ERR_QUOTA                 44
+#define MPI_ERR_READ_ONLY             45
+#define MPI_ERR_RMA_CONFLICT          46
+#define MPI_ERR_RMA_SYNC              47
+#define MPI_ERR_SERVICE               48
+#define MPI_ERR_SIZE                  49
+#define MPI_ERR_SPAWN                 50
+#define MPI_ERR_UNSUPPORTED_DATAREP   51
+#define MPI_ERR_UNSUPPORTED_OPERATION 52
+#define MPI_ERR_WIN                   53
+
+#define MPI_T_ERR_MEMORY              54
+#define MPI_T_ERR_NOT_INITIALIZED     55
+#define MPI_T_ERR_CANNOT_INIT         56
+#define MPI_T_ERR_INVALID_INDEX       57
+#define MPI_T_ERR_INVALID_ITEM        58
+#define MPI_T_ERR_INVALID_HANDLE      59
+#define MPI_T_ERR_OUT_OF_HANDLES      60
+#define MPI_T_ERR_OUT_OF_SESSIONS     61
+#define MPI_T_ERR_INVALID_SESSION     62
+#define MPI_T_ERR_CVAR_SET_NOT_NOW    63
+#define MPI_T_ERR_CVAR_SET_NEVER      64
+#define MPI_T_ERR_PVAR_NO_STARTSTOP   65
+#define MPI_T_ERR_PVAR_NO_WRITE       66
+#define MPI_T_ERR_PVAR_NO_ATOMIC      67
+#define MPI_ERR_RMA_RANGE             68
+#define MPI_ERR_RMA_ATTACH            69
+#define MPI_ERR_RMA_FLAVOR            70
+#define MPI_ERR_RMA_SHARED            71
+#define MPI_T_ERR_INVALID             72
+#define MPI_T_ERR_INVALID_NAME        73
+#define MPI_ERR_PROC_ABORTED          74
+
+/* not #if conditional on OPAL_ENABLE_FT_MPI for ABI */
+#define MPI_ERR_PROC_FAILED           75
+#define MPI_ERR_PROC_FAILED_PENDING   76
+#define MPI_ERR_REVOKED               77
+
+/* Per MPI-3 p349 47, MPI_ERR_LASTCODE must be >= the last predefined
+   MPI_ERR_ code. Set the last code to allow some room for adding
+   error codes without breaking ABI. */
+#define MPI_ERR_LASTCODE              92
+
+/*
+ * Comparison results.  Don't change the order of these, the group
+ * comparison functions rely on it.
+ * Do not change the order of these without also modifying mpif.h.in.
+ */
+enum {
+  MPI_IDENT,
+  MPI_CONGRUENT,
+  MPI_SIMILAR,
+  MPI_UNEQUAL
+};
+
+/*
+ * MPI_Init_thread constants
+ * Do not change the order of these without also modifying mpif.h.in.
+ */
+enum {
+  MPI_THREAD_SINGLE,
+  MPI_THREAD_FUNNELED,
+  MPI_THREAD_SERIALIZED,
+  MPI_THREAD_MULTIPLE
+};
+
+/*
+ * Datatype combiners.
+ * Do not change the order of these without also modifying mpif.h.in.
+ * (see also mpif-common.h.fin).
+ */
+enum {
+  MPI_COMBINER_NAMED,
+  MPI_COMBINER_DUP,
+  MPI_COMBINER_CONTIGUOUS,
+  MPI_COMBINER_VECTOR,
+#if (!OMPI_OMIT_MPI1_COMPAT_DECLS)
+  MPI_COMBINER_HVECTOR_INTEGER,
+#else
+  OMPI_WAS_MPI_COMBINER_HVECTOR_INTEGER, /* preserve ABI compatibility */
+#endif
+  MPI_COMBINER_HVECTOR,
+  MPI_COMBINER_INDEXED,
+#if (!OMPI_OMIT_MPI1_COMPAT_DECLS)
+  MPI_COMBINER_HINDEXED_INTEGER,
+#else
+  OMPI_WAS_MPI_COMBINER_HINDEXED_INTEGER, /* preserve ABI compatibility */
+#endif
+  MPI_COMBINER_HINDEXED,
+  MPI_COMBINER_INDEXED_BLOCK,
+#if (!OMPI_OMIT_MPI1_COMPAT_DECLS)
+  MPI_COMBINER_STRUCT_INTEGER,
+#else
+  OMPI_WAS_MPI_COMBINER_STRUCT_INTEGER, /* preserve ABI compatibility */
+#endif
+  MPI_COMBINER_STRUCT,
+  MPI_COMBINER_SUBARRAY,
+  MPI_COMBINER_DARRAY,
+  MPI_COMBINER_F90_REAL,
+  MPI_COMBINER_F90_COMPLEX,
+  MPI_COMBINER_F90_INTEGER,
+  MPI_COMBINER_RESIZED,
+  MPI_COMBINER_HINDEXED_BLOCK
+};
+
+////MOCK ERRHANDLERS
+#define OMPI_ERRHANDLER_INVOKE(comm,err,fname)     err
+#define OMPI_ERRHANDLER_NOHANDLE_INVOKE(err,fname) err
+
diff --git a/base/serial/impl/sp3mm4amg/include/parser.h b/base/serial/impl/sp3mm4amg/include/parser.h
new file mode 100644
index 00000000..c482f7e1
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/parser.h
@@ -0,0 +1,90 @@
+/*
+ * Sp3MM_for_AlgebraicMultiGrid
+ * (C) Copyright 2021-2022
+ *     Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions, and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
+ *    not be used to endorse or promote products derived from this
+ *    software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef PARSER
+#define PARSER
+
+#include "mmio.h"
+#include "sparseMatrix.h"
+
+typedef struct{
+    ulong row;
+    ulong col;
+    double val;
+} entry;    //MatrixMarket COO entry
+
+typedef struct{
+    MM_typecode mcode;
+    entry* entries;
+    ulong* rowLens;
+    ulong M,N,NZ;   //spmat sizes
+} MatrixMarket;
+
+////COO PARSE
+//parse and check MatrixMarket mat in @matPath file
+MatrixMarket* MMRead(char* matPath);
+void freeMatrixMarket(MatrixMarket* mm);
+//basic check for sparse matrix compliance to the app, return posix bool
+int MMCheck(MM_typecode typecode);
+/*
+ * parse MatrixMarket matrix entries in @fp, of type @mcode,
+ * into a COOrdinate list of entries
+ * -> expand symmetric matrices into a normal matrix with both parts,
+ *    so NZ will be doubled in place
+ * return allocated and filled COO entries, with the NNZ number written into @NZ
+ * NO SORT CHECKING HERE
+ */
+entry* MMtoCOO(ulong* NZ, FILE *fp, MM_typecode mcode,ulong* rowLens);
+
+////COO -> ANYTHING ELSE CONVERSION
+/*
+ * write COO entries in @entries inside sparse matrix @mat in CSR format
+ * EXPECTED: CSR arrays allocated, @entries column sorted within (not necessarily consecutive) rows
+ * [symmetric parts explicitly represented --> not important here]
+ */
+int COOtoCSR(entry* entries, spmat* mat,ulong* rowLens);
+/*
+ * write COO entries in @entries inside sparse matrix @mat in ELL format
+ * EXPECTED: @entries column sorted within (not necessarily consecutive) rows
+ * ELL internal arrays allocated in this function, not freed in case of error
+ */
+int COOtoELL(entry* entries, spmat* mat, ulong* rowLens);
+////wrapper MM -> specialized target
+/*
+ * Parse MatrixMarket matrix stored in file at @matPath
+ * IMPLEMENTED WRAPPING: MMtoCOO -> COOtoCSR
+ * Returns: allocated spmat sparse matrix with all fields allocated
+ * symmetric matrices are expanded into a full matrix
+ */
+spmat* MMtoCSR(char* matPath);
+spmat* MMtoELL(char* matPath);
+
+
+#endif
diff --git a/base/serial/impl/sp3mm4amg/include/sparseMatrix.h b/base/serial/impl/sp3mm4amg/include/sparseMatrix.h
new file mode 100644
index 00000000..8fc769e8
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/sparseMatrix.h
@@ -0,0 +1,216 @@
+/*
+ * Sp3MM_for_AlgebraicMultiGrid
+ * (C) Copyright 2021-2022
+ *     Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions, and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
+ *    not be used to endorse or promote products derived from this
+ *    software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+//sparse matrix def & aux
+#ifndef SPARSEMATRIX
+#define SPARSEMATRIX
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "macros.h"
+#include "config.h"
+
+typedef struct {
+    idx_t NZ,M,N;
+    double *AS;
+    idx_t* JA;
+    //CSR SPECIFIC
+    idx_t* IRP;
+    #ifdef ROWLENS
+    idx_t* RL;      //row lengths
+    #endif
+    //CUDA SPECIFIC
+    idx_t MAX_ROW_NZ;
+
+} spmat;    //describe a sparse matrix
+
+//smart index keeping in a dense map
+typedef struct{
+    idx_t len;      //num of nnz idx accumulated
+    /* nnz index presence packing, implicit space enough for all possible indexes*/
+    nnz_idxs_flags_t idxsMap;
+    uint idxsMapN;  //either num of limbs or len of char flag array
+} SPVECT_IDX_DENSE_MAP;
+int initSpVectIdxDenseAcc(idx_t idxMax, SPVECT_IDX_DENSE_MAP* );
+inline void _resetIdxMap(SPVECT_IDX_DENSE_MAP* acc){
+    acc->len = 0;
+    memset(acc->idxsMap,0,sizeof(*acc->idxsMap)*acc->idxsMapN);
+}
+inline void _freeIdxMap(SPVECT_IDX_DENSE_MAP* acc){
+    free(acc->idxsMap);
+    free(acc);
+}
+//aux struct for sparse vector-scalar product accumulation
+typedef struct{
+    double* v;          //aux accumulating dense vector (sparse)
+    idx_t*  nnzIdx;     //v nnz value's indexes (contiguous)
+    idx_t   vLen;       //size of the aux dense vector //TODO USELESS?
+    SPVECT_IDX_DENSE_MAP nnzIdxMap;
+} ACC_DENSE;
+int allocAccDense(ACC_DENSE* v, ulong size);
+
+/*
+ **** NNZ IDXS PRESENCE FLAGS ACCESS INTERFACE: ***
+ * spVect_idx_in(idx,SPVECT_IDX_DENSE_MAP*)
+ * -> return 0 if idx isn't already set, and in that case set it
+ */
+inline int spVect_idx_in(idx_t idx, SPVECT_IDX_DENSE_MAP* idxsMapAcc){
+    #if SPVECT_IDX_BITWISE == TRUE
+    uint limbID     = idx / LIMB_SIZE_BIT;  //idx's limb id
+    uint limbIdxID  = idx % LIMB_SIZE_BIT;  //idx's pos in limb
+    DEBUGCHECKS assert( limbID < idxsMapAcc->idxsMapN );
+    limb_t idxPos = ((limb_t) 1) << limbIdxID;
+    if (!(idxsMapAcc->idxsMap[limbID] & idxPos) ){
+        idxsMapAcc->idxsMap[limbID] |= idxPos;
+        idxsMapAcc->len++;
+        return 0;
+    }
+    #else
+    //assert( idx < idxsMapAcc->idxsMapN );     //TODO
+    if (!( idxsMapAcc->idxsMap[idx] )){ //TODO an atomic cmp&swap primitive could be used, merging ops at HW level?
+        idxsMapAcc->idxsMap[idx] = 1;
+        idxsMapAcc->len++;
+        return 0;
+    }
+    #endif //SPVECT_IDX_BITWISE == TRUE
+    return 1;
+}
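+
+/*
+ * Illustrative sketch (not in the original header): deduplicate candidate
+ * column indexes of a symbolic product row; @candCols and @n are caller
+ * data, and the helper name is hypothetical.
+ */
+static inline idx_t spVectIdxDedupExample(idx_t* candCols, idx_t n,
+                                          SPVECT_IDX_DENSE_MAP* acc){
+    for (idx_t k = 0; k < n; k++)
+        spVect_idx_in(candCols[k], acc);    //returns 0 on first occurrence only
+    idx_t distinct = acc->len;              //symbolic row length
+    _resetIdxMap(acc);                      //ready for the next row
+    return distinct;
+}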
+inline void _resetAccVect(ACC_DENSE* acc){
+    memset(acc->v, 0, acc->vLen * sizeof(*(acc->v)));
+    memset(acc->nnzIdx, 0, acc->vLen * sizeof(*(acc->nnzIdx)));
+    _resetIdxMap(&acc->nnzIdxMap);
+}
+////Sparse vector accumulator -- corresponding to a matrix portion
+typedef struct{
+    //idx_t r;  //row index in the corresponding matrix
+    //idx_t c;  //col index in the corresponding matrix
+    idx_t   len;    //rowLen
+    double* AS;     //row nnz values
+    idx_t*  JA;     //row nnz colIndexes
+} SPACC;
+
+
+/*
+ * ARRAY BISECTION - RECURSIVE VERSION
+ * TODO ASSERT LEN>0 omitted
+ */
+inline int BISECT_ARRAY(idx_t target, idx_t* arr, idx_t len){
+    //if (len == 0)   return FALSE;
+    if (len <= 1)     return *arr == target;
+    idx_t middleIdx = (len-1) / 2;  //len=5-->2, len=4-->1
+    idx_t middle    = arr[ middleIdx ];
+    if      (target == middle)  return TRUE;
+    else if (target <  middle)  return BISECT_ARRAY(target,arr,middleIdx);
+    else    return BISECT_ARRAY(target,arr+middleIdx+1,middleIdx + (len-1)%2);
+}
+
+/*
+ * return !0 if col @j idx is in row @i of sparse mat @smat
+ * bisection used --> O(log_2(ROW LENGTH))
+ */
+inline int IS_NNZ(spmat* smat,idx_t i,idx_t j){
+    idx_t rStart = smat->IRP[i];
+    idx_t rLen   = smat->IRP[i+1] - rStart;
+    if (!rLen)  return FALSE;
+    return BISECT_ARRAY(j,smat->JA + rStart,rLen);
+}
+inline int IS_NNZ_linear(spmat* smat,idx_t i,idx_t j){  //linear -> O(ROW LENGTH)
+    int out = 0;
+    for (idx_t x=smat->IRP[i]; x<smat->IRP[i+1] && !out; x++){
+        out = (j == smat->JA[x]);
+    }
+    return out;
+}
+////aux functions
+//free sparse matrix
+inline void freeSpmatInternal(spmat* mat){
+    if(!mat)    return;
+    free(mat->AS);
+    free(mat->JA);
+    free(mat->IRP);
+#ifdef ROWLENS
+    free(mat->RL);
+#endif
+}
+
+inline void freeSpmat(spmat* mat){
+    if (!mat)   return;
+    freeSpmatInternal(mat);
+    free(mat);
+}
+
+static inline void zeroSpmat(spmat* m) {
+    memset(m->AS, 0, sizeof(*(m->AS)) * m->NZ);
+    memset(m->JA, 0, sizeof(*(m->JA)) * m->NZ);
+    memset(m->IRP, 0, sizeof(*(m->IRP)) * (m->M + 1));
+}
+
+//free max aux structs not NULL pointed
+inline void freeSpAcc(SPACC* r){
+    free(r->AS);
+    free(r->JA);
+}
+////alloc&init functions
+//alloc&init internal structures only dependent of dimensions @rows,@cols
+inline int allocSpMatrixInternal(idx_t rows, idx_t cols, spmat* mat){
+    mat -> M = rows;
+    mat -> N = cols;
+    if (!(mat->IRP=calloc(mat->M+1,sizeof(*(mat->IRP))))){ //calloc only for 0th
+        ERRPRINT("IRP calloc err\n");
+        freeSpmatInternal(mat);
+        return EXIT_FAILURE;
+    }
+#ifdef ROWLENS
+    if (!(mat->RL = malloc(mat->M*sizeof(*(mat->RL))))){
+        ERRPRINT("RL malloc err\n");
+        freeSpmatInternal(mat);
+        return EXIT_FAILURE;
+    }
+#endif
+    return EXIT_SUCCESS;
+}
+
+//alloc a sparse matrix of @rows rows and @cols cols
+inline spmat* allocSpMatrix(idx_t rows, idx_t cols){
+
+    spmat* mat;
+    if (!(mat = calloc(1,sizeof(*mat)))) {
+        ERRPRINT("mat calloc failed\n");
+        return NULL;
+    }
+    if (allocSpMatrixInternal(rows,cols,mat)){
+        free(mat);
+        return NULL;
+    }
+    return mat;
+}
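+
+/*
+ * Illustrative sketch (not in the original header): allocate an MxN CSR
+ * shell, then probe (i,j) membership once IRP/JA have been filled; the
+ * helper name is hypothetical.
+ */
+static inline int spmatProbeExample(idx_t rows, idx_t cols, idx_t i, idx_t j){
+    spmat* m = allocSpMatrix(rows, cols);
+    if (!m) return EXIT_FAILURE;
+    //... fill m->NZ, m->IRP, m->JA, m->AS here ...
+    int present = IS_NNZ(m, i, j);  //O(log rowLen) via BISECT_ARRAY
+    freeSpmat(m);
+    return present;
+}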
+
+#endif
diff --git a/base/serial/impl/sp3mm4amg/include/sparseUtilsGeneric.h b/base/serial/impl/sp3mm4amg/include/sparseUtilsGeneric.h
new file mode 100644
index 00000000..fbcab700
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/sparseUtilsGeneric.h
@@ -0,0 +1,138 @@
+/*
+ * Sp3MM_for_AlgebraicMultiGrid
+ * (C) Copyright 2021-2022
+ *     Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions, and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
+ *    not be used to endorse or promote products derived from this
+ *    software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "sparseMatrix.h"
+#include "config.h"
+
+#ifndef OFF_F
+    #pragma error OFF_F required
+#endif
+//////////////////////////////// CSR SPECIFIC /////////////////////////////////
+///SPARSE MATRIX PARTITIONING
+/*
+ * partition CSR sparse matrix @A in @gridCols columns partitions,
+ * returning an offsets matrix out[i][j] = start of jth colPartition of row i
+ * subdivide @A columns in uniform cols ranges in the output
+ */
+idx_t* CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols);
+
+/*
+ * partition CSR sparse matrix @A in @gridCols columns partitions as
+ * independent allocated sparse matrices and return them
+ * subdivide @A columns in uniform cols ranges in the output
+ */
+spmat* CAT(colsPartitioningUnifRanges_,OFF_F)(spmat* A,uint gridCols);
+//same as above but with (aux) use of offsets partitioning (also returned if colPartsOffsets!=NULL)
+spmat* CAT(colsPartitioningUnifRangesOffsetsAux_,OFF_F)(spmat* A,uint gridCols,idx_t** colPartsOffsets);
+
+//same as checkOverallocPercent but with 2D partitioning - CSR col partitioning
+void CAT(checkOverallocRowPartsPercent_,OFF_F)(ulong* forecastedSizes,spmat* AB,
+          idx_t gridCols,idx_t* bColOffsets);
+///////////////////////////////////////////////////////////////////////////////
+
+///Single implementations headers
+#ifndef SPARSEUTILS_H_COMMON_IDX_IMPLS
+#define SPARSEUTILS_H_COMMON_IDX_IMPLS
+
+//shift every index in the sparse data to use the matrix in a Fortran app
+inline void C_FortranShiftIdxs(spmat* m){
+    for(ulong r=0; r<m->M+1; m -> IRP[r]++,r++);
+    for(ulong i=0; i<m->NZ;  m -> JA[i]++, i++);
+}
+//shift every index in the sparse data to use the matrix in a C app
+inline void Fortran_C_ShiftIdxs(spmat* m){  //TODO DBG ONLY and completeness
+    for(ulong r=0; r<m->M+1; m -> IRP[r]--,r++);
+    for(ulong i=0; i<m->NZ;  m -> JA[i]--, i++);
+}
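+
+/*
+ * Round-trip sketch (illustrative, not in the original header): a CSR built
+ * by this C code is 0-based; shift it before handing it to a 1-based Fortran
+ * routine, and back afterwards. The helper name is hypothetical.
+ */
+static inline void shiftRoundTripExample(spmat* m){
+    C_FortranShiftIdxs(m);      //IRP/JA now 1-based for the Fortran side
+    //... call the Fortran routine here ...
+    Fortran_C_ShiftIdxs(m);     //back to 0-based for C use
+}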
+
+/*
+ * check SpMM resulting matrix @AB = A * B nnz distribution in rows
+ * against the preallocated, forecasted size in @forecastedSizes
+ * in @forecastedSizes there's for each row -> forecasted size
+ * and in the last entry the cumulative of the whole matrix
+ */
+void checkOverallocPercent(ulong* forecastedSizes,spmat* AB);
+/*
+ check if sparse matrices A<->B differ up to
+ DOUBLE_DIFF_THREASH per element
+*/
+int spmatDiff(spmat* A, spmat* B);
+////dyn alloc of spMM output matrix
+/*
+///size prediction of AB = @A * @B
+inline ulong SpMMPreAlloc(spmat* A,spmat* B){
+    //TODO BETTER PREALLOC HEURISTICS HERE
+    return MAX(A->NZ,B->NZ);
+}
+//init a sparse matrix AB=@A * @B with an initial allocated space by a heuristic
+inline spmat* initSpMatrixSpMM(spmat* A, spmat* B){
+    spmat* out;
+    if (!(out = allocSpMatrix(A->M,B->N)))  return NULL;
+    out -> NZ = SpMMPreAlloc(A,B);
+    if (!(out->AS = malloc(out->NZ*sizeof(*(out->AS))))){
+        ERRPRINT("initSpMatrix: out->AS malloc errd\n");
+        free(out);
+        return NULL;
+    }
+    if (!(out->JA = malloc(out->NZ*sizeof(*(out->JA))))){
+        ERRPRINT("initSpMatrix: out->JA malloc errd\n");
+        freeSpmat(out);
+        return NULL;
+    }
+    return out;
+}
+
+#define REALLOC_FACTOR  1.5
+//realloc sparse matrix NZ arrays
+inline int reallocSpMatrix(spmat* mat,ulong newSize){
+    mat->NZ *= newSize;
+    void* tmp;
+    if (!(tmp = realloc(mat->AS,mat->NZ * sizeof(*(mat->AS))))){
+        ERRPRINT("reallocSpMatrix: realloc AS errd\n");
+        return EXIT_FAILURE;
+    }
+    mat->AS = tmp;
+    if (!(tmp = realloc(mat->JA,mat->NZ * sizeof(*(mat->JA))))){
+        ERRPRINT("reallocSpMatrix: realloc JA errd\n");
+        return EXIT_FAILURE;
+    }
+    mat->JA = tmp;
+    return EXIT_SUCCESS;
+}
+*/
+////MISC
+//print useful information about the 3SPMM about to be computed
+void print3SPMMCore(spmat* R,spmat* AC,spmat* P,CONFIG* conf);
+void printSparseMatrix(spmat* sparseMat,char justNZMarkers);
+/*convert @sparseMat sparse matrix in dense matrix returned*/
+double* CSRToDense(spmat* sparseMat);
+
+void freeAccsDense(ACC_DENSE* vectors,ulong num);
+
+#endif //SPARSEUTILS_H_COMMON_IDX_IMPLS
diff --git a/base/serial/impl/sp3mm4amg/include/sparseUtilsMulti.h b/base/serial/impl/sp3mm4amg/include/sparseUtilsMulti.h
new file mode 100644
index 00000000..13478f35
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/sparseUtilsMulti.h
@@ -0,0 +1,52 @@
+/*
+ * Sp3MM_for_AlgebraicMultiGrid
+ * (C) Copyright 2021-2022
+ *     Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions, and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
+ *    not be used to endorse or promote products derived from this
+ *    software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPARSEUTILSMULTI_H
+#define SPARSEUTILSMULTI_H
+
+#ifdef OFF_F    //save "includer" OFF_F value before overwriting it
+    #pragma push_macro("OFF_F")
+    #define _OFF_F_OLD
+    #undef OFF_F
+#endif
+
+#define OFF_F 0
+#include "sparseUtilsGeneric.h"
+#undef OFF_F
+
+#define OFF_F 1
+#include "sparseUtilsGeneric.h"
+#undef OFF_F
+
+#ifdef _OFF_F_OLD
+    #pragma pop_macro("OFF_F")
+    #undef _OFF_F_OLD
+#endif
+
+#endif
diff --git a/base/serial/impl/sp3mm4amg/include/utils.h b/base/serial/impl/sp3mm4amg/include/utils.h
new file mode 100644
index 00000000..8ef47492
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/include/utils.h
@@ -0,0 +1,181 @@
+/*
+ * Sp3MM_for_AlgebraicMultiGrid
+ * (C) Copyright 2021-2022
+ *     Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions, and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
+ *    not be used to endorse or promote products derived from this
+ *    software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef UTILS
+#define UTILS
+
+//#include
+#include "macros.h"
+
+#include "linuxK_rbtree_minimalized.h"
+extern int urndFd;  //file descriptor to DRNG_DEVFILE, opened O_RDONLY
+int init_urndfd();  // wrap init urndFd
+
+///IO
+//UNBUFFERED IO
+/*
+ * urndFd usage template to populate random timeout
+    if(_read_wrap(urndFd,(char*)&timeout,sizeof(timeout))<0){
+        fprintf(stderr,"rnd read for thread's timeout failed");
+        ret=EXIT_FAILURE;
+        goto end;
+    }
+ */
+//wrap read cycle over @fd
+int read_wrap(int fd,void* dst,size_t count);
+int fread_wrap(FILE* fp,void* dst,size_t count);
+//dual of read_wrap
+int write_wrap(int fd,void* src,size_t count);
+//create or open file at @outFpath for write
+int createNewFile(char* const outFpath);
+///STRUCTURED DATA IO
+#define DOUBLE_STR_FORMAT "%25le\n"
+//write double vector @v as row sequence of double at @fpath
+//e.g. read with od -tf8 -w8 fpath : OCTALOFFSET: DOUBLE FULL DIGITS
+int writeDoubleVector(char* fpath,double* v,ulong size);
+/*
+ * read vector of double [Str] of arbitrary size from @fpath, true length in *size
+ * if *size points to a nonzero value, the initial allocation will be of *size
+ * any successive reallocation is done multiplying *size by VECTOR_STEP_REALLOC
+ */
+double* readDoubleVector(char* fpath,ulong* size);
+double* readDoubleVectorStr(char* fpath,ulong* size);
+
+///STRUCTURED DATA IO -- BUFFERED: FSCANF - FPRINTF
+//dual of readDoubleVectorStr
+int writeDoubleVectorAsStr(char* fpath,double* v,ulong size);
+int MPI_Dims_create(int nnodes, int ndims, int dims[]); //commons/ompi_dims_create/ompi_dims_create.c
+
+#include "config.h"
+///config from ENV
+#define GRID_ROWS   "GRID_ROWS"
+#define GRID_COLS   "GRID_COLS"
+//parse configuration from env
+int getConfig(CONFIG* conf);
+
+//append only list implemented with a reallocated array
+typedef struct{
+    ulong* a;
+    ulong  size;
+    ulong  lastIdx;
+} APPENDARRAY;
+//append @val to @list, reallocating if reached end
+//TODO inline int appendArr(ulong val,APPENDARRAY* list);
+
+void sortuint(uint* arr, uint len); //sort uint array @arr of @len elements
+void sort_idx_t(idx_t* arr, idx_t len);
+void sortulong(ulong* arr, ulong len);
+void sortRbNode(rbNode* arr,idx_t len);
+
+///ranges functions
+/*
+ * considering @rangesN uniform ranges covering [0,rangesEnd)
+ * return the rangeId that element @i is in
+ */
+inline ushort matchingUnifRangeIdxLinear(idx_t i,idx_t rangesEnd,ushort rangesN){
+    double rangeW = rangesEnd / (double) rangesN;
+    for(ushort j=0; j < rangesN; j++){
+        if( i < rangeW * (j+1) )    return j;
+    }
+    assert( FALSE );    //i should be in a range!
+    return EXIT_FAILURE;
+}
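+
+/*
+ * Worked example (illustrative): rangesEnd=10, rangesN=4 gives ranges
+ * [0,2.5) [2.5,5) [5,7.5) [7.5,10),
+ * so matchingUnifRangeIdxLinear(6,10,4) == 2.
+ */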
+/*
+ * find which range @idx matches among
+ * a uniform range division of @size elements in @rangesN ranges with carried remainder
+ * return 0based idx of matched range
+ */
+inline ushort matchingUnifRangeIdx(idx_t idx, idx_t size, ushort rangesN){
+    idx_t rangeW = size/rangesN, rangeRem = size%rangesN;
+    idx_t searchStart,searchEnd;
+    //!IN_RANGE(idx,unifRemShareBlock(idx,rangeW,rangeRem)) && IN_RANGE(r,0,rangesN);){
+    for(ushort r=rangesN/2-1, hMoveWidth=rangesN/2; hMoveWidth>0;hMoveWidth/=2){
+        //bounds of the candidate range @r
+        searchStart = unifRemShareStart(r,rangeW,rangeRem);
+        searchEnd   = unifRemShareEnd(r,rangeW,rangeRem);
+        if (IN_RANGE(idx,searchStart,searchEnd))    return r;
+        else if (idx < searchStart) r = AVG(0,r);
+        else                        r = AVG(r,rangesN-1);
+    }
+    assert(FALSE);
+    return EXIT_FAILURE;
+}
+
+///reductionUtils
+inline idx_t reductionSumSeq(idx_t* arr,idx_t arrLen){
+    idx_t i,out;
+    for(i=0,out=0; i<arrLen; out += arr[i++]);
+    return out;
+}
+/*
+ * decompress file in @path -> @tmpFsDecompressPath
+ * e.g. decompress  xz -d -c @path > @tmpFsDecompressPath
+ * Returns: -1 if decompression wasn't possible otherwise decompress command exit status
+ */
+int extractInTmpFS(char* path, char* tmpFsDecompressPath);
+//compute E[@values] in @out[0] and VAR[@values] in @out[1] of @numVals values
+void statsAvgVar(double* values,uint numVals, double* out);
+void printMatrix(double* mat,ulong m,ulong n,char justNZMarkers);
+void printVector(double* v,ulong size);
+
+void assertArrNoRepetitions(idx_t* arrSorted, idx_t arrLen);
+#endif
diff --git a/base/serial/impl/sp3mm4amg/lib/linuxK_rbtree_minimalized.c b/base/serial/impl/sp3mm4amg/lib/linuxK_rbtree_minimalized.c
new file mode 100644
index 00000000..7fe3bd58
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/lib/linuxK_rbtree_minimalized.c
@@ -0,0 +1,414 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli
+  (C) 2002  David Woodhouse
+  (C) 2012  Michel Lespinasse
+
+
+  linux/lib/rbtree.c
+*/
+/*
+ * RedBlackTree_linux_userspace
+ * (C) Copyright 2021-2022
+ *     Andrea Di Iorio
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions, and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the RedBlackTree_linux_userspace or the names of its contributors may
+ *    not be used to endorse or promote products derived from this
+ *    software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE RedBlackTree_linux_userspace GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+///#include <linux/rbtree_augmented.h> //TODO LESS_DEPENDENCIES
+#include "linuxK_rbtree_minimalized.h" ///fulfill embedded deps needed //TODO LESS_DEPENDENCIES
+//#include
+
+/* MINIMALIZED VERSION WITHOUT THE IMPORT OF rbtree_augmented
+ * red-black trees properties:  https://en.wikipedia.org/wiki/Rbtree
+ *
+ *  1) A node is either red or black
+ *  2) The root is black
+ *  3) All leaves (NULL) are black
+ *  4) Both children of every red node are black
+ *  5) Every simple path from root to leaves contains the same number
+ *     of black nodes.
+ *
+ *  4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
+ *  consecutive red nodes in a path and every red node is therefore followed by
+ *  a black. So if B is the number of black nodes on every simple path (as per
+ *  5), then the longest possible path due to 4 is 2B.
+ *
+ *  We shall indicate color with case, where black nodes are uppercase and red
+ *  nodes will be lowercase. Unknown color nodes shall be drawn as red within
+ *  parentheses and have some accompanying text comment.
+ */
+
+/*
+ * Notes on lockless lookups:
+ *
+ * All stores to the tree structure (rb_left and rb_right) must be done using
+ * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
+ * tree structure as seen in program order.
+ *
+ * These two requirements will allow lockless iteration of the tree -- not
+ * correct iteration mind you, tree rotations are not atomic so a lookup might
+ * miss entire subtrees.
+ *
+ * But they do guarantee that any such traversal will only see valid elements
+ * and that it will indeed complete -- does not get stuck in a loop.
+ *
+ * It also guarantees that if the lookup returns an element it is the 'correct'
+ * one. But not returning an element does _NOT_ mean it's not present.
+ *
+ * NOTE:
+ *
+ * Stores to __rb_parent_color are not important for simple lookups so those
+ * are left undone as of now. Nor did I check for loops involving parent
+ * pointers.
+ */
+
+///rb_set_black   //TODO ADDONLY_GENERIC
+
+static inline struct rb_node *rb_red_parent(struct rb_node *red)
+{
+    return (struct rb_node *)red->__rb_parent_color;
+}
+
+/*
+ * Helper function for rotations:
+ * - old's parent and color get assigned to new
+ * - old gets assigned new as a parent and 'color' as a color.
+ */
+static inline void
+__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
+            struct rb_root *root, int color)
+{
+    struct rb_node *parent = rb_parent(old);
+    new->__rb_parent_color = old->__rb_parent_color;
+    rb_set_parent_color(old, new, color);
+    __rb_change_child(old, new, parent, root);
+}
+
+///TODO REMOVE AUGMENT
+static __always_inline void
+__rb_insert(struct rb_node *node, struct rb_root *root)
+{
+    struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
+
+    while (true) {
+        /*
+         * Loop invariant: node is red.
+         */
+        if (unlikely(!parent)) {
+            /*
+             * The inserted node is root. Either this is the
+             * first node, or we recursed at Case 1 below and
+             * are no longer violating 4).
+             */
+            rb_set_parent_color(node, NULL, RB_BLACK);
+            break;
+        }
+
+        /*
+         * If there is a black parent, we are done.
+         * Otherwise, take some corrective action as,
+         * per 4), we don't want a red root or two
+         * consecutive red nodes.
+         */
+        if(rb_is_black(parent))
+            break;
+
+        gparent = rb_red_parent(parent);
+
+        tmp = gparent->rb_right;
+        if (parent != tmp) {    /* parent == gparent->rb_left */
+            if (tmp && rb_is_red(tmp)) {
+                /*
+                 * Case 1 - node's uncle is red (color flips).
+                 *
+                 *       G            g
+                 *      / \          / \
+                 *     p   u  -->   P   U
+                 *    /            /
+                 *   n            n
+                 *
+                 * However, since g's parent might be red, and
+                 * 4) does not allow this, we need to recurse
+                 * at g.
+                 */
+                rb_set_parent_color(tmp, gparent, RB_BLACK);
+                rb_set_parent_color(parent, gparent, RB_BLACK);
+                node = gparent;
+                parent = rb_parent(node);
+                rb_set_parent_color(node, parent, RB_RED);
+                continue;
+            }
+
+            tmp = parent->rb_right;
+            if (node == tmp) {
+                /*
+                 * Case 2 - node's uncle is black and node is
+                 * the parent's right child (left rotate at parent).
+                 *
+                 *      G             G
+                 *     / \           / \
+                 *    p   U  -->    n   U
+                 *     \           /
+                 *      n         p
+                 *
+                 * This still leaves us in violation of 4), the
+                 * continuation into Case 3 will fix that.
+                 */
+                tmp = node->rb_left;
+                WRITE_ONCE(parent->rb_right, tmp);
+                WRITE_ONCE(node->rb_left, parent);
+                if (tmp)
+                    rb_set_parent_color(tmp, parent,
+                                RB_BLACK);
+                rb_set_parent_color(parent, node, RB_RED);
+                ///augment_rotate(parent, node);    //TODO DEL AUGMENTED DEP
+                parent = node;
+                tmp = node->rb_right;
+            }
+
+            /*
+             * Case 3 - node's uncle is black and node is
+             * the parent's left child (right rotate at gparent).
+             *
+             *        G           P
+             *       / \         / \
+             *      p   U  -->  n   g
+             *     /                 \
+             *    n                   U
+             */
+            WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
+            WRITE_ONCE(parent->rb_right, gparent);
+            if (tmp)
+                rb_set_parent_color(tmp, gparent, RB_BLACK);
+            __rb_rotate_set_parents(gparent, parent, root, RB_RED);
+            ///augment_rotate(gparent, parent);    //TODO DEL_AUGMENTED_DEP
+            break;
+        } else {
+            tmp = gparent->rb_left;
+            if (tmp && rb_is_red(tmp)) {
+                /* Case 1 - color flips */
+                rb_set_parent_color(tmp, gparent, RB_BLACK);
+                rb_set_parent_color(parent, gparent, RB_BLACK);
+                node = gparent;
+                parent = rb_parent(node);
+                rb_set_parent_color(node, parent, RB_RED);
+                continue;
+            }
+
+            tmp = parent->rb_left;
+            if (node == tmp) {
+                /* Case 2 - right rotate at parent */
+                tmp = node->rb_right;
+                WRITE_ONCE(parent->rb_left, tmp);
+                WRITE_ONCE(node->rb_right, parent);
+                if (tmp)
+                    rb_set_parent_color(tmp, parent,
+                                RB_BLACK);
+                rb_set_parent_color(parent, node, RB_RED);
+                ///augment_rotate(parent, node);    //TODO DEL_AUGMENTED_DEP
+                parent = node;
+                tmp = node->rb_left;
+            }
+
+            /* Case 3 - left rotate at gparent */
+            WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
+            WRITE_ONCE(parent->rb_left, gparent);
+            if (tmp)
+                rb_set_parent_color(tmp, gparent, RB_BLACK);
+            __rb_rotate_set_parents(gparent, parent, root, RB_RED);
+            ///augment_rotate(gparent, parent);    //TODO DEL_AUGMENTED_DEP
+            break;
+        }
+    }
+}
+
+/*
+ * Inline version for rb_erase() use - we want to be able to inline
+ * and eliminate the dummy_rotate callback there
+ */
+///____rb_erase_color    //TODO ADDONLY_GENERIC
+
+/* Non-inline version for rb_erase_augmented() use */
+///__rb_erase_color    //TODO ADDONLY_GENERIC
+
+/* TODO LESS_DEPENDENCIES MINIMALIZED
+ * Non-augmented rbtree manipulation functions.
+ *
+ * We use dummy augmented callbacks here, and have the compiler optimize them
+ * out of the rb_insert_color() and rb_erase() function definitions.
+
+static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
+static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
+static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
+
+static const struct rb_augment_callbacks dummy_callbacks = {
+    .propagate = dummy_propagate,
+    .copy = dummy_copy,
+    .rotate = dummy_rotate
+};*/
+void rb_insert_color(struct rb_node *node, struct rb_root *root)
+{
+    __rb_insert(node, root);
+}
+
+///rb_erase    //TODO ADDONLY_GENERIC
+
+/** TODO LESS_DEPENDENCIES - AUGMENTED REMOVE
+ * Augmented rbtree manipulation functions.
+ *
+ * This instantiates the same __always_inline functions as in the non-augmented
+ * case, but this time with user-defined callbacks.
+ */
+///__rb_insert_augmented    todo LESS_DEPENDENCIES - AUGMENTED REMOVE
+
+//////  Traversing
+/*
+ * This function returns the first node (in sort order) of the tree.
+ */
+struct rb_node *rb_first(const struct rb_root *root)
+{
+    struct rb_node  *n;
+
+    n = root->rb_node;
+    if (!n)
+        return NULL;
+    while (n->rb_left)
+        n = n->rb_left;
+    return n;
+}
+
+struct rb_node *rb_last(const struct rb_root *root)
+{
+    struct rb_node  *n;
+
+    n = root->rb_node;
+    if (!n)
+        return NULL;
+    while (n->rb_right)
+        n = n->rb_right;
+    return n;
+}
+
+struct rb_node *rb_next(const struct rb_node *node)
+{
+    struct rb_node *parent;
+
+    if (RB_EMPTY_NODE(node))
+        return NULL;
+
+    /*
+     * If we have a right-hand child, go down and then left as far
+     * as we can.
+     */
+    if (node->rb_right) {
+        node = node->rb_right;
+        while (node->rb_left)
+            node = node->rb_left;
+        return (struct rb_node *)node;
+    }
+
+    /*
+     * No right-hand children. Everything down and left is smaller than us,
+     * so any 'next' node must be in the general direction of our parent.
+     * Go up the tree; any time the ancestor is a right-hand child of its
+     * parent, keep going up. First time it's a left-hand child of its
+     * parent, said parent is our 'next' node.
+     */
+    while ((parent = rb_parent(node)) && node == parent->rb_right)
+        node = parent;
+
+    return parent;
+}
+
+struct rb_node *rb_prev(const struct rb_node *node)
+{
+    struct rb_node *parent;
+
+    if (RB_EMPTY_NODE(node))
+        return NULL;
+
+    /*
+     * If we have a left-hand child, go down and then right as far
+     * as we can.
+     */
+    if (node->rb_left) {
+        node = node->rb_left;
+        while (node->rb_right)
+            node = node->rb_right;
+        return (struct rb_node *)node;
+    }
+
+    /*
+     * No left-hand children. Go up till we find an ancestor which
+     * is a right-hand child of its parent.
+     */
+    while ((parent = rb_parent(node)) && node == parent->rb_left)
+        node = parent;
+
+    return parent;
+}
+
+///rb_replace_node        //TODO ADDONLY_GENERIC
+///rb_replace_node_rcu    //TODO LESS_DEPENDENCIES
+
+static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
+{
+    for (;;) {
+        if (node->rb_left)
+            node = node->rb_left;
+        else if (node->rb_right)
+            node = node->rb_right;
+        else
+            return (struct rb_node *)node;
+    }
+}
+
+struct rb_node *rb_next_postorder(const struct rb_node *node)
+{
+    const struct rb_node *parent;
+    if (!node)
+        return NULL;
+    parent = rb_parent(node);
+
+    /* If we're sitting on node, we've already seen our children */
+    if (parent && node == parent->rb_left && parent->rb_right) {
+        /* If we are the parent's left node, go to the parent's right
+         * node then all the way down to the left */
+        return rb_left_deepest_node(parent->rb_right);
+    } else
+        /* Otherwise we are the parent's right node, and the parent
+         * should be next */
+        return (struct rb_node *)parent;
+}
+
+struct rb_node *rb_first_postorder(const struct rb_root *root)
+{
+    if (!root->rb_node)
+        return NULL;
+
+    return rb_left_deepest_node(root->rb_node);
+}
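+
+/*
+ * Traversal sketch (illustrative, not part of the kernel sources): in-order
+ * walk of a tree rooted at 'root', using the kernel's usual container_of
+ * pattern; the container struct and rb_entry() availability are assumptions.
+ *
+ *    struct myItem { struct rb_node node; idx_t key; };  //hypothetical container
+ *    for (struct rb_node *n = rb_first(&root); n; n = rb_next(n))
+ *        visit(rb_entry(n, struct myItem, node));
+ */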
diff --git a/base/serial/impl/sp3mm4amg/lib/mmio.c b/base/serial/impl/sp3mm4amg/lib/mmio.c
new file mode 100644
index 00000000..c250ff2a
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/lib/mmio.c
@@ -0,0 +1,511 @@
+/*
+*   Matrix Market I/O library for ANSI C
+*
+*   See http://math.nist.gov/MatrixMarket for details.
+*
+*
+*/
+
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+#include "mmio.h"
+
+int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
+                double **val_, int **I_, int **J_)
+{
+    FILE *f;
+    MM_typecode matcode;
+    int M, N, nz;
+    int i;
+    double *val;
+    int *I, *J;
+
+    if ((f = fopen(fname, "r")) == NULL)
+            return -1;
+
+
+    if (mm_read_banner(f, &matcode) != 0)
+    {
+        printf("mm_read_unsymmetric: Could not process Matrix Market banner ");
+        printf(" in file [%s]\n", fname);
+        return -1;
+    }
+
+
+
+    if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
+            mm_is_sparse(matcode)))
+    {
+        fprintf(stderr, "Sorry, this application does not support ");
+        fprintf(stderr, "Matrix Market type: [%s]\n",
+                mm_typecode_to_str(matcode));
+        return -1;
+    }
+
+    /* find out size of sparse matrix: M, N, nz .... */
+
+    if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0)
+    {
+        fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
+        return -1;
+    }
+
+    *M_ = M;
+    *N_ = N;
+    *nz_ = nz;
+
+    /* reserve memory for matrices */
+
+    I = (int *) malloc(nz * sizeof(int));
+    J = (int *) malloc(nz * sizeof(int));
+    val = (double *) malloc(nz * sizeof(double));
+
+    *val_ = val;
+    *I_ = I;
+    *J_ = J;
+
+    /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
+    /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
+    /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
+
+    for (i=0; i<nz; i++)
+    {
+        fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]);
+        I[i]--;  /* adjust from 1-based to 0-based */
+        J[i]--;
+    }
+    fclose(f);
+
+    return 0;
+}
diff --git a/base/serial/impl/sp3mm4amg/lib/parser.c b/base/serial/impl/sp3mm4amg/lib/parser.c
new file mode 100644
--- /dev/null
+++ b/base/serial/impl/sp3mm4amg/lib/parser.c
@@ -0,0 +1,391 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "sparseMatrix.h"
+#include "mmio.h"
+#include "parser.h"
+#include "macros.h"
+#include "utils.h"
+
+////COO PARSE
+int MMCheck(MM_typecode mcode) {
+    if (!mm_is_matrix(mcode)){  //consistency checks among flags in @mcode
+        ERRPRINT("invalid matrix: not a matrix\n");
+        return EXIT_FAILURE;
+    }
+    if (mm_is_dense(mcode) ){   //|| mm_is_array(mcode) ){
+        ERRPRINT("invalid matrix: not a supported sparse matrix\tDENSE MAT\n");
+        return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
+}
+
+entry* MMtoCOO(ulong* NZ, FILE *fp, MM_typecode mcode,ulong* rowLens){
+    int scanndRet=0;
+    ulong nzTrgt=*NZ,nzIdx=0;   //expanded num of nz (in case of sym matrix)
+    ulong diagEntries=0, row = 0, col = 0;  //current entry's row,col from MM -> 1 based
+    double val = 0;
+    entry* entries = NULL;      //COO parsed entries
+    ///init
+    if (mm_is_symmetric(mcode)){
+        nzTrgt = 2* (*NZ);      //upscale max num of nz in the matrix
+        VERBOSE printf("MMtoCOO:\tparsing a symmetric matrix\n");
+    }
+    if (!(entries = malloc(nzTrgt * sizeof(*entries)))){
+        ERRPRINT("MMtoCOO: entries malloc errd\n");
+        return NULL;
+    }
+    ///parse MM fp lines into COOrdinate entries
+    while (1) { // Reading the fp until EOF
+        if (mm_is_pattern(mcode)){
+            scanndRet = fscanf(fp, "%lu %lu\n", &row, &col);
+            val = 1.0;
+        } else if (mm_is_real(mcode) || (mm_is_integer(mcode))){
+            scanndRet = fscanf(fp, "%lu %lu %lf\n", &row, &col, &val);
+        }
+
+
+        if (scanndRet == EOF){  //TODO more strict check with type&ret?
+            if (ferror(fp)){
+                perror("fscanf EOF");
+                goto _err;
+            } else  break;
+        }
+        CONSISTENCY_CHECKS{
+            //TODO USELESS ? ? ?
+            if ((mm_is_pattern(mcode)  && scanndRet != 2) ||
+                (!mm_is_pattern(mcode) && scanndRet != 3)){
+                ERRPRINT("invalid matrix: entry with inconsistent number of fields\n");
+                goto _err;
+            }
+        }
+        ////ADD THE CURRENT MATRIX ENTRY
+        rowLens[row-1]++;
+        entries[nzIdx++]=(entry) { .row=row-1, .col=col-1, .val=val };
+        //also mirrored entry if sym.matrix with reflected idx inside matrix limits
+        if (mm_is_symmetric(mcode) && row != col ){
+            //TODO CONSTRAINED FORMAT ?&& row <= mat->N && col <= mat->M ){
+            SWAP(row,col);
+            rowLens[row-1]++;
+            entries[nzIdx++]=(entry) { .row=row-1, .col=col-1, .val=val };
+        }
+        else    diagEntries++;  //for CONSISTENCY_CHECKS only
+    }
+
+    //CONSISTENCY_CHECKS
+    nzTrgt = *NZ;
+    if(mm_is_symmetric(mcode))  nzTrgt = 2*(*NZ) - diagEntries;
+    assert( nzIdx == nzTrgt );
+
+    //update NZ
+    *NZ = nzIdx;
+    return entries;
+
+    _err:
+    free(entries);  return NULL;
+}
+
+void freeMatrixMarket(MatrixMarket* mm){
+    if (!mm)    return;
+    free(mm->entries);
+    free(mm->rowLens);
+    free(mm);
+}
+
+MatrixMarket* MMRead(char* matPath){
+    FILE* fp = fopen(matPath, "r");
+    if (!fp){
+        perror("fopen");
+        return NULL;
+    }
+    MatrixMarket* out = calloc(1,sizeof(*out));
+    if (!out){
+        ERRPRINT("MMRead out malloc errd\n");
+        goto err;
+    }
+    //banner -> parse matrix specs
+    if (mm_read_banner(fp, &out->mcode) != 0) {
+        fprintf(stderr,"mm_read_banner err at:%s\n",matPath);
+        goto err;
+    }
+    //assert matrix is compatible with this app scope
+    if (MMCheck(out->mcode))    goto err;
+
+    //parse sizes
+    //TODO OVERCOME uint limitation?
+    if(mm_read_mtx_crd_size(fp,(uint*) &out->M, (uint*) &out->N, (uint*) &out->NZ)){
+        fprintf(stderr,"mm_read_mtx_crd_size err at %s:\n",matPath);
+        goto err;
+    }
+    if (!(out->rowLens = calloc(out->M,sizeof(*(out->rowLens))))){
+        ERRPRINT("MMRead:\trowLens calloc errd\n");
+        goto err;
+    }
+    if (!(out->entries = MMtoCOO(&out->NZ, fp, out->mcode,out->rowLens))){
+        ERRPRINTS("MAT PARSE TO CSR ERR at:%s\n",matPath);
+        goto err;
+    }
+    goto _end;
+
+    err:
+    freeMatrixMarket(out);
+    out = NULL;
+    _end:
+    fclose(fp);
+    return out;
+}
+
+
+////COO -> ANYTHING ELSE CONVERSION
+int COOtoCSR(entry* entries, spmat* mat,ulong* rowLens){
+    int out = EXIT_FAILURE;
+    ulong idx;
+    long*  _rowsLastCol = NULL; //for each row -> last added entry's columnIdx
+    ulong* rowsNextIdx  = NULL; //for each row -> next entry progressive idx
+    if (!(rowsNextIdx = calloc(mat->M,sizeof(*rowsNextIdx)))){
+        ERRPRINT("COOtoCSR: rowsNextIdx calloc errd\n");
+        goto _end;
+    }
+    CONSISTENCY_CHECKS{ //alloc and init aux arr for entries sort check
+        if (!(_rowsLastCol = malloc(mat->M*sizeof(*_rowsLastCol)))){
+            ERRPRINT("COOtoCSR: _rowsLastCol malloc errd\n");
+            goto _end;
+        }
+        memset(_rowsLastCol,-1,mat->M*sizeof(*_rowsLastCol));
+    }
+    /*TODO OLD
+     * //get rowLens->IRP (partial), TODO moved MMtoCOO to avoid FULL rescan entries
+     * for (ulong i=0; i<mat->NZ; i++)  mat->IRP[entries[i].row+1]++;
+     * memcpy(mat->RL,mat->IRP + 1,sizeof(*mat->IRP) * mat->M); //TODO in next ifdef
+     * for (ulong i=2; i<mat->M+1; i++) mat->IRP[i] += mat->IRP[i-1];
+     * OLD2: rowLens memcpy ... no just moved the pointer
+     * #ifdef ROWLENS
+     * memcpy(mat->RL,rowLens,sizeof(*rowLens) * mat->M);   //TODO in next ifdef
+     * #endif
+     */
+    //IRP: transform row lens into increments to build the row index "pointer"
+    //0th -> 0 mandatory; 1th = 0th row len, ...., M+1th = end of Mth row
+    memcpy(mat->IRP+1,rowLens,sizeof(*rowLens) * mat->M);   //init IRP with rows lens
+    for (ulong i=2; i<mat->M+1; i++)    mat->IRP[i] += mat->IRP[i-1];
+    CONSISTENCY_CHECKS  assert(mat->IRP[mat->M] == mat->NZ);
+    ///FILL
+    //TODO EXPECTED entries with .col entries -> CONSISTENCY_CHECKS
+    //sorted for each row (even if not sequential in @entries)
+    //entries written in CSR format
+    entry* e;
+    for (ulong i=0; i<mat->NZ; i++) {
+        e = entries+i;
+        CONSISTENCY_CHECKS{ //TODO CHECK IF COO ENTRIES ARE SORTED
+            /*#pragma message("COO sorting check enabled")*/
+            if (_rowsLastCol[e->row] >= (long) e->col){
+                ERRPRINTS("not sorted entry:%ld,%ld,%lf",e->row,e->col,e->val);
+                goto _end;
+            }
+            _rowsLastCol[e->row] = e->col;
+        }
+        idx = mat -> IRP[e->row] + rowsNextIdx[e->row]++;
+        mat -> AS[idx] = e->val;
+        mat -> JA[idx] = e->col;
+    }
+
+    out = EXIT_SUCCESS;
+
+    _end:
+    if(rowsNextIdx)     free(rowsNextIdx);
+    if(_rowsLastCol)    free(_rowsLastCol);
+
+    return out;
+}
+
+int COOtoELL(entry* entries, spmat* mat, ulong* rowLens){
+    int out=EXIT_FAILURE;
+    ulong maxRow = 0, col, _ellEntriesTot, *rowsNextCol;
+    long* _rowsLastCol=NULL;
+    entry* e;
+    for (ulong i=0; i<mat->M; i++)  maxRow = MAX(maxRow,rowLens[i]);
+    _ellEntriesTot = 2*mat->M*maxRow;
+    #ifdef LIMIT_ELL_SIZE
+    if ( _ellEntriesTot > ELL_MAX_ENTRIES ){
+        ERRPRINTS("Required entries %lu -> %lu MB for the matrix exceed the "
+          "designated threshold of: %lu -> %lu MB for ellpack\n",
+          _ellEntriesTot,(sizeof(double)*_ellEntriesTot) >> 20,
+          ELL_MAX_ENTRIES,(sizeof(double)*ELL_MAX_ENTRIES) >> 20);
+        return EXIT_FAILURE;
+    }
+    #endif
+    //malloc aux vects
+    if (!(rowsNextCol = calloc(mat->M,sizeof(*rowsNextCol)))){
+        ERRPRINT("COOtoELL:\trowsNextCol calloc errd\n");
+        goto _end;
+    }
+    CONSISTENCY_CHECKS{ //alloc and init aux arr for entries SORT CHECK
+        if (!(_rowsLastCol = malloc(mat->M*sizeof(*_rowsLastCol)))){
+            ERRPRINT("COOtoELL:\t_rowsLastCol malloc errd\n");
+            goto _end;
+        }
+        memset(_rowsLastCol,-1,mat->M*sizeof(*_rowsLastCol));
+    }
+    ///malloc dependent on MAX ROW LEN, err free in the caller
+    if (!(mat->AS = calloc(mat->M * maxRow, sizeof(*(mat->AS))))){
+        ERRPRINT("COOtoELL:\tELL->AS calloc errd\n");
+        goto _end;
+    }   //zero init for auto rows residual fill with 0
+    if (!(mat->JA = calloc(mat->M * maxRow, sizeof(*(mat->JA))))){
+        ERRPRINT("COOtoELL:\tELL->JA calloc errd\n");
+        goto _end;
+    }
+
+    mat->MAX_ROW_NZ = maxRow;
+    /*#ifdef ROWLENS
+     *memcpy(mat->RL,rowLens,sizeof(*rowLens) * mat->M);    //TODO in next ifdef
+     *#endif
+     */
+    ///FILL NZ
+    //TODO EXPECTED entries with .col entries -> CONSISTENCY_CHECKS
+    //sorted for each row (even if not sequential in @entries)
+    for (ulong i=0; i<mat->NZ; i++){
+        e = entries + i;
+        CONSISTENCY_CHECKS{ //TODO CHECK IF COO ENTRIES ARE COL SORTED for successive rows
+            /*#pragma message("COO sorting check enabled")*/
+            if (_rowsLastCol[e->row] >= (long) e->col){
+                ERRPRINTS("not sorted entry:%ld,%ld,%lf",
+                  e->row,e->col,e->val);
+                goto _end;
+            }
+            _rowsLastCol[e->row] = e->col;
+        }
+        col = rowsNextCol[e->row]++;    //place entry in its row's sequent spot
+        mat->AS[ IDX2D(e->row,col,maxRow) ] = e->val;
+        mat->JA[ IDX2D(e->row,col,maxRow) ] = e->col;
+
+    }
+    ///FILL PAD
+    ulong padded = 0,paddedEntries = mat->M*mat->MAX_ROW_NZ;
+    for (ulong r=0; r<mat->M; r++){
+        for (ulong c=rowLens[r],j=IDX2D(r,c,maxRow); c<maxRow; c++,j++){
+            padded++;
+            mat->AS[j] = ELL_AS_FILLER; //TODO ALREADY DONE IN CALLOC
+            //mat->JA[j] = mat->JA[rowLens[r]-1]; //ELL_JA_FILLER; //TODO calloc CUDA benefit?
+        }
+    }
+    VERBOSE{
+        printf("padded %lu entries = %lf%% of NZ\n",padded,100*padded/(double) mat->NZ);
+        printf("ELL matrix of: %lu paddedEntries -> %lu MB of JA+AS\n",
+          paddedEntries,(paddedEntries*sizeof(*(mat->JA))+paddedEntries*sizeof(*(mat->AS))) >> 20);
+    }
+    out = EXIT_SUCCESS;
+    _end:
+    if(rowsNextCol)     free(rowsNextCol);
+    if(_rowsLastCol)    free(_rowsLastCol);
+    return out;
+}
+////wrapper MM -> specialized target
+spmat* MMtoCSR(char* matPath){
+    spmat* mat = NULL;
+    MatrixMarket* mm = MMRead(matPath);
+    if (!mm){
+        ERRPRINT("MMtoCSR parse err\n");
+        return NULL;
+    }
+    if (!(mat = calloc(1,sizeof(*mat)))){
+        ERRPRINT("MMtoCSR: mat struct alloc errd");
+        goto err;
+    }
+    mat -> M = mm->M;
+    mat -> N = mm->N;
+    mat -> NZ= mm->NZ;
+    //alloc sparse matrix components
+    if (!(mat->IRP = calloc(mat->M+1,sizeof(*(mat->IRP))))){
+        ERRPRINT("MMtoCSR: IRP calloc err\n");
+        goto err;
+    }
+    ////alloc core struct of CSR
+    if(!(mat->JA = malloc(mat->NZ*sizeof(*(mat->JA))))){
+        ERRPRINT("MMtoCSR: JA malloc err\n");
+        goto err;
+    }
+    if(!(mat->AS = malloc(mat->NZ*sizeof(*(mat->AS))))){
+        ERRPRINT("MMtoCSR: AS malloc err\n");
+        goto err;
+    }
+    if (COOtoCSR(mm->entries,mat,mm->rowLens))  goto err;
+    #ifdef ROWLENS
+    mat->RL = mm->rowLens;
+    mm->rowLens = NULL; //avoid free in @freeMatrixMarket
+    #endif
+
+    VERBOSE
+      printf("MMtoCSR: %lu NZ entries-> %lu MB of AS+JA+IRP\n",mat->NZ,
+        (mat->NZ*sizeof(*(mat->AS))+mat->NZ*sizeof(*(mat->JA))+((1+mat->M)*sizeof(*(mat->IRP))))>>20);
+    goto _free;
+
+
+    err:
+    if (mat)    freeSpmat(mat);
+    mat = NULL;
+    _free:
+    freeMatrixMarket(mm);
+    return mat;
+}
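+
+/*
+ * End-to-end sketch (illustrative, not part of the original parser): load a
+ * MatrixMarket file straight into CSR; the helper name is hypothetical and
+ * the path is caller-supplied.
+ */
+static int MMtoCSRExample(char* matPath){
+    spmat* a = MMtoCSR(matPath);
+    if (!a) return EXIT_FAILURE;
+    //... use a->M, a->IRP, a->JA, a->AS ...
+    freeSpmat(a);
+    return EXIT_SUCCESS;
+}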
+
+spmat* MMtoELL(char* matPath){
+    spmat* mat = NULL;
+    MatrixMarket* mm = MMRead(matPath);
+    if (!mm){
+        ERRPRINT("MMtoELL: parse err\n");
+        return NULL;
+    }
+    if (!(mat = calloc(1,sizeof(*mat)))){
+        ERRPRINT("MMtoELL: mat struct alloc errd");
+        goto err;
+    }
+    ////alloc core struct of ELL
+    mat -> M = mm->M;
+    mat -> N = mm->N;
+    mat -> NZ= mm->NZ;
+    if (COOtoELL(mm->entries,mat,mm->rowLens))  goto err;
+    #ifdef ROWLENS
+    mat->RL = mm->rowLens;
+    mm->rowLens = NULL; //avoid free in @freeMatrixMarket
+    #endif
+
+    goto _free;
+
+    err:
+    if(mat) freeSpmat(mat);
+    mat = NULL;
+    _free:
+    freeMatrixMarket(mm);
+    return mat;
+}
diff --git a/cbind/base/psb_objhandle_mod.F90 b/cbind/base/psb_objhandle_mod.F90
index d6200294..69ce1d0b 100644
--- a/cbind/base/psb_objhandle_mod.F90
+++ b/cbind/base/psb_objhandle_mod.F90
@@ -40,35 +40,4 @@ module psb_objhandle_mod
 
   type, bind(c) :: psb_c_zspmat
     type(c_ptr) :: item = c_null_ptr
-  end type psb_c_zspmat
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-! sp3mm c code structs
-! TODO : rename to conventions
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  type, bind(C, name=spmat) :: spmat_t
-    ! number of non zeros and dimensions
-    integer(c_size_t) :: nz, m, n
-    ! value array
-    real(c_double), allocatable :: as(:)
-    ! columns array
-    integer(c_size_t), allocatable :: ja(:)
-    ! row index pointers array
-    integer(c_size_t), allocatable :: irp(:)
-    ! lengths of the rows
-    integer(c_size_t), allocatable :: rl(:)
-    ! max value of rl
-    integer(c_size_t) :: max_row_nz
-  end type spmat_t
-
-  type, bind(C, name=CONFIG) :: config_t
-    ! dimensions of the grid
-    integer(c_short) :: grid_rows, grid_cols
-    ! how to compute symb mul (if required)
-    integer(c_int) :: symb_mm_row_impl_id
-    ! thread num to use in OMP parallel region
-    integer(c_int) :: thread_num
-    ! CHUNKS_DISTR_INTERF func pntr
-    type(c_ptr) :: chunk_distrb_func
-  end type config_t
-end module psb_objhandle_mod
+  end type psb_c_zspmat
\ No newline at end of file