/*
 *              Sp3MM_for_AlgebraicMultiGrid
 *    (C) Copyright 2021-2022
 *        Andrea Di Iorio      
 * 
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *    1. Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *    2. Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions, and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *    3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
 *       not be used to endorse or promote products derived from this
 *       software without specific written permission.
 * 
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 *  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 *  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
 *  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 */ 

/*
 * CSR Sp[3]MM Symbolic step implementations
 * target: 	compute the output matrix size and the row lens for preallocation
 * 			direct write out partial results
 * See interfaces in respective header
 */


/*#pragma message( "compiling Sp3MM_CSR_OMP_Symb_Generic.c with config as:" \
	STR(OFF_F) " - " STR(OUT_IDXS) " - " STR(COL_PARTS) )*/
#ifndef OFF_F
    #error generic implementation requires OFF_F defined
#endif

///setup aux macros for different signatures implementation via #if arith expr
#pragma push_macro("OUT_IDXS")
#pragma push_macro("_OUT_IDXS")
#pragma push_macro("COL_PARTS")
#pragma push_macro("_COL_PARTS")

#ifdef OUT_IDXS 
	#define _OUT_IDXS  	TRUE
#else   
	#define _OUT_IDXS 	FALSE
	#define OUT_IDXS	_UNDEF
#endif
#ifdef COL_PARTS
	#define _COL_PARTS	TRUE
#else   
	#define _COL_PARTS	FALSE
	#define COL_PARTS	_UNDEF
#endif
///

//////SpMM - rowByrow
///1row->matrix->outRow
//RBTREE based
/*
 * Compute symbolic product of (nnz indexes of) row @aRowJA and matrix @b
 * insert nnz indexes of the mul. result row as nodes in a rbtree rooted at @root
 * with nodes in @nodes which have to be enough for the mul result row (use an UB)
 * Retuns: multiplication result row NNZ number,se CONFIG_MACROS below for more 
 *
 * CONFIG_MACROS:
 * if _OUT_IDXS == TRUE	return mul.result row nnz idxs in @outIdxs
 *	  ifdef: OUT_IDXS_RBTREE_NODES: nnz indexes returned inplace sorting rbtree
 * 								as nnz indexes(JA) of the mul result row
 * else:							stop at returning the mul. result row lenght
 * if _COL_PARTS == TRUE	return the number of nonzero elements in
 *		in each of the @gridCols column partitions inside @rowColPartsLens
 * OFF_F:	offset back indexes from fortran
 * 		TODO also output indexes are shifted (see c_b )
 */
static inline idx_t CAT4(SpMM_Row_Symb_Rbtree,OUT_IDXS,COL_PARTS,OFF_F)  
  (
   idx_t* aRowJA, idx_t aRowLen, spmat* b,rbRoot* root, rbNode* nodes
   #if _OUT_IDXS  == TRUE && !defined OUT_IDXS_RBTREE_NODES 
   ,idx_t* outIdxs
   #endif
   #if _COL_PARTS == TRUE
   ,ushort gridCols,idx_t* rowColPartsLens
   #endif
  )
{
	//Compute resulting ab's row non zero indexes and total lenght
	idx_t abRowLen = 0;		//mul.result row len,	return value
	for ( idx_t i=0,c_a,inserted; i < aRowLen; i++ ){	//for each entry in a's row
		c_a = aRowJA[i]-OFF_F;
		//gather diffrent nnz indexes in corresponding b's `c_a`th row 
		for ( idx_t j = b->IRP[c_a]-OFF_F,c_b; j < b->IRP[c_a+1]-OFF_F; j++ ){ 
			c_b = b->JA[j]-OFF_F;
			//check if c_b is nonzero index for mul.result row
			inserted 	 =  rbInsertNewKey (root, nodes+abRowLen, c_b);
			abRowLen	+=	inserted; //inserted needed just after this
			/*	LESS EFFICIENT THEN BELOW (here no memory of last colPart)
			#if _COL_PARTS == TRUE //keep track of which col partition is c_b in
			if (inserted)
			 rowColPartsLens[ matchingUnifRangeIdx(c_b, b->N, gridCols) ]++;
			#endif */
		}
	}
	#if 	_OUT_IDXS == T  && defined OUT_IDXS_RBTREE_NODES
	/* return the non zero indexes of the mul.result row
 	 * sorting inplace the nodes inserted in the rbtree */
	sortRbNode(nodes,abRowLen);
	#elif	_OUT_IDXS == T || _COL_PARTS == T
	uint i=0;
	idx_t k;
	#if _COL_PARTS == T
	//colParts aux vars
	idx_t _colBlock = abRowLen / gridCols, _colBlockRem = abRowLen % gridCols;
	ushort gc=0;
	idx_t gcStartCol = unifRemShareStart(gc,_colBlock,_colBlockRem);
	idx_t gcEndCol = unifRemShareEnd(gc,_colBlock,_colBlockRem);
	#endif	//_COL_PARTS == T
  	for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
		k = rb_entry(n,rbNode,rb)->key;
		#if _OUT_IDXS == T
		//return the mul.result nnz index inside the rbNodes
		outIdxs[ i++ ] = k;
		#endif
		#if _COL_PARTS == T
		while (k >= gcEndCol ){	//see if the idx is in another col partition
								//	TODO also = since gcEndCol as k is 0based
			gcEndCol = 	unifRemShareEnd(gc ,_colBlock, _colBlockRem);
			gc++;
			DEBUGCHECKS{ assert( gc < gridCols ); }
		}
		rowColPartsLens[gc]++;
		#endif //_COL_PARTS == T
	}
	#endif	//_OUT_IDXS == T ... _COL_PARTS == T
	/*DEBUGCHECKS{	//TODO PRINT NNZ INDEXES FOR MANUAL (PAINFUL CHECK)
		idx_t k;
  		for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
			k = rb_entry(n,rbNode,rb)->key;
			printf("%lu, ",k);
		}
		printf("\n");
	}*/

	return abRowLen;
}

//SPVECT_IDX_DENSE_MAP based	TODO double implementation for trick syntax folding here...
/*
 * SPVECT_IDX_DENSE_MAP based, as SpMM_Row_Symb_Rbtree but with idxMap aux idx keeping
 * CONFIG_MACROS (new)
 * IDX_RMUL_SYMB_RBTREE && ( _OUT_IDXS == T || _COL_PARTS == T ):
 * 	 (tmp) symb mult out indexes will be kept via a rbtree
 * 	 otherwise directly in the out array appending them and then sorting them
 * 	 		(potentially same n log n)
 */
static inline idx_t CAT4(SpMM_Row_Symb_IdxMap,OUT_IDXS,COL_PARTS,OFF_F)  
  (
   idx_t* aRowJA, idx_t aRowLen, spmat* b, SPVECT_IDX_DENSE_MAP* idxsMapAcc
   #if _OUT_IDXS  == TRUE 
   ,idx_t* outIdxs
   #endif
   #if ( _OUT_IDXS == TRUE && IDX_RMUL_SYMB_RBTREE == T ) || _COL_PARTS == T
   ,rbRoot* root, rbNode* nodes
   #endif	// _OUT_IDXS == TRUE
   #if _COL_PARTS == TRUE
   ,ushort gridCols,idx_t* rowColPartsLens
   #endif
  )
{
	//Compute resulting ab's row non zero indexes and total lenght
	idx_t abRowLen = 0;		//mul.result row len,	return value
	for ( idx_t i=0,c_a,inserted; i < aRowLen; i++ ){	//for each entry in a's row
		c_a = aRowJA[i]-OFF_F;
		//gather diffrent nnz indexes in corresponding b's `c_a`th row 
		for ( idx_t j = b->IRP[c_a]-OFF_F,c_b; j < b->IRP[c_a+1]-OFF_F; j++ ){ 
			c_b = b->JA[j]-OFF_F;
			//check if c_b is nonzero index for mul.result row
			inserted  = spVect_idx_in(c_b,idxsMapAcc);
			#if	_OUT_IDXS == T || _COL_PARTS == T //idxs HAS TO be accumulated
			if (inserted)
				#if IDX_RMUL_SYMB_RBTREE == T || _OUT_IDXS == F 	//add it in a RBTREE struct
				rbInsertNewKey (root, nodes+idxsMapAcc->len, c_b);
				#else						//append it, then sort
				outIdxs[idxsMapAcc->len] = c_b;
				#endif	//IDX_RMUL_SYMB_RBTREE == T //how accumulated key c_b
			#endif	//#if	_OUT_IDXS == T || _COL_PARTS == T
		}
	}
	abRowLen = idxsMapAcc->len;
	//gather idxs or their sparsity struct in output row
	#if	_OUT_IDXS == T || _COL_PARTS == T 
	idx_t j = 0,k;
	#if _COL_PARTS == T 
	//colParts aux vars
	idx_t _colBlock = abRowLen / gridCols, _colBlockRem = abRowLen % gridCols;
	ushort gc = 0;
	idx_t  gcStartCol = unifRemShareStart(gc,_colBlock,_colBlockRem);
	idx_t  gcEndCol = unifRemShareEnd(gc,_colBlock,_colBlockRem);
	#endif
	#if IDX_RMUL_SYMB_RBTREE == T || _OUT_IDXS == F	///idxs recorded in a aux rbtree
  	for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
		k = rb_entry(n,rbNode,rb)->key;
		#if _OUT_IDXS == T
		outIdxs[ j++ ] = k;	//record ordered key sotred from aux rbtree
		#endif
	#else				///idxs recorded in aux append array
	sort_idx_t(outIdxs,abRowLen);
  	for (; j < abRowLen; j++){
		k = outIdxs[j];		//(OSS) already ordered in outIndexes arr
	#endif	//IDX_RMUL_SYMB_RBTREE == T
		#if _COL_PARTS == T
		while (k >= gcEndCol ){	//see if the idx is in another col partition
				//	TODO also = since gcEndCol as k is 0based
			gcEndCol = unifRemShareEnd(gc ,_colBlock, _colBlockRem);
			gc++;
			DEBUGCHECKS{ assert( gc < gridCols ); }
		}
		rowColPartsLens[gc]++;
		#endif //_COL_PARTS == T
	}
	#endif	//_OUT_IDXS == T ... _COL_PARTS == T
	return abRowLen;
}

//switch among 2 row_symb_XXX implemenetation aux
/*
 * SpMM single row symbolic computation
 * select one implementation via @implID 
 * among SpMM_Row_Symb_Rbtree or SpMM_Row_Symb_IdxMap
 * args will be forwared accordingly
 */
static inline idx_t CAT4(SpMM_Row_Symb_,OUT_IDXS,COL_PARTS,OFF_F)  
  (
   ROW_MMSYM_IMPL_MODE implID, idx_t* aRowJA, idx_t aRowLen, spmat* b,
   rbRoot* root, rbNode* nodes, SPVECT_IDX_DENSE_MAP* idxsMapAcc
   #if _OUT_IDXS  == TRUE 
   ,idx_t* outIdxs
   #endif
   #if _COL_PARTS == TRUE
   ,ushort gridCols,idx_t* rowColPartsLens
   #endif
  )
{
	if (implID == RBTREE)	{
		return CAT4(SpMM_Row_Symb_Rbtree,OUT_IDXS,COL_PARTS,OFF_F)  
			(
				aRowJA,aRowLen,b,root,nodes
   				#if _OUT_IDXS  == TRUE && !defined OUT_IDXS_RBTREE_NODES 
				,outIdxs
				#endif
				#if _COL_PARTS == TRUE
				,gridCols,rowColPartsLens
				#endif
			);
	}
	else { //IDXMAP
		return CAT4(SpMM_Row_Symb_IdxMap,OUT_IDXS,COL_PARTS,OFF_F)  
			(
				aRowJA,aRowLen,b,idxsMapAcc
   				#if		 _OUT_IDXS  == TRUE 
				,outIdxs
				#endif
   				#if ( _OUT_IDXS == TRUE && IDX_RMUL_SYMB_RBTREE == T ) || _COL_PARTS == T
   				,root, nodes
				#endif
				#if _COL_PARTS == TRUE
				,gridCols, rowColPartsLens
				#endif
			);
	}
}
///SpMM row-by-row
idx_t* CAT4(SpMM_Symb_,OUT_IDXS,COL_PARTS,OFF_F) 
  (
   ROW_MMSYM_IMPL_MODE symbRowImplID, spmat* a, spmat* b
   #if _OUT_IDXS  == TRUE
   ,idx_t*** outIdxs
   #endif
   #if _COL_PARTS == TRUE
   ,ushort gridCols, idx_t** rowColPartsLens
   #endif
  )
{

	///initial allocations
	rbRoot* rbRoots = NULL;	
	rbNode* rbNodes	= NULL; 
	SPVECT_IDX_DENSE_MAP* idxsMapAccs = NULL;
	idx_t  *rowLens=NULL,*upperBoundedRowsLens=NULL,*upperBoundedSymMat=NULL;
	idx_t  maxRowLen=0;
	int rbTreeUsed = (symbRowImplID == RBTREE || 
			  (IDX_RMUL_SYMB_RBTREE && (_COL_PARTS || _OUT_IDXS)) );
	
	if ( !(rowLens = malloc(sizeof(*rowLens) * (a->M+1))) ){ 
		ERRPRINT("SpMM_Symb_ rowLens malloc errd\n");
		goto _err;
	}
	if (_OUT_IDXS == TRUE || rbTreeUsed ){
		if (!(upperBoundedRowsLens = CAT(spMMSizeUpperbound_,OFF_F)(a,b)))
			goto _err;
	}
	#if _OUT_IDXS  == TRUE
	if (!(*outIdxs = malloc(sizeof(**outIdxs) * a->M))){
		ERRPRINT("SpMM_Symb_ outIdxs malloc errd\n");
		goto _err;
	}
	if (!(upperBoundedSymMat   = malloc(
	  sizeof(*upperBoundedSymMat)*upperBoundedRowsLens[a->M]))){
		ERRPRINT("SpMM_Symb_ upperBoundedSymMat malloc errd\n");
		goto _err;
	}
	//write rows' start pointer from full matrix JA allocated
	for (idx_t i=0,cumul=0; i<a->M; cumul += upperBoundedRowsLens[i++])
		*outIdxs[i] = upperBoundedSymMat + cumul; 
	#endif	//#if _OUT_IDXS  == TRUE
	#if _COL_PARTS == TRUE
	if (!(*rowColPartsLens = malloc(a->M * gridCols * sizeof(**rowColPartsLens)))){
		ERRPRINT("SpMM_Symb_ rowColPartsLens malloc errd\n");
		goto _err;
	}
	#endif //_COL_PARTS
	uint maxThreads	= omp_get_max_threads();	//TODO FROM CFG
	//index keeping aux struct
	//rbtree implementation or idxMap with aux of symbTree for tmp outIdx keeping
	if ( rbTreeUsed ){
		maxRowLen = reductionMaxSeq(upperBoundedRowsLens, a->M);
		//rbTrees for index keeping
		rbRoots = malloc(maxThreads * sizeof(*rbRoots));
		rbNodes	= calloc(maxThreads * maxRowLen, sizeof(*rbNodes));
		if( !rbRoots || !rbNodes ){
			ERRPRINT("SpMM_Symb_ threads' aux rbTree mallocs errd\n");
			goto _err;
		}
		//init roots
		for (uint i=0; i<maxThreads; i++)
			rbRoots[i] = RB_ROOT_CACHED;
	} 
	if (symbRowImplID == IDXMAP){  //idxMap implementation && not outIdxs via rbtree
		idxsMapAccs 	= calloc(maxThreads, sizeof(*idxsMapAccs));
		if (!idxsMapAccs){
			ERRPRINT("SpMM_Symb_ idxsMapAccs calloc errd\n");
			goto _err;
		}
		//init idxs maps
		for (uint i=0; i<maxThreads; i++){
			if(initSpVectIdxDenseAcc(b->N,idxsMapAccs+i))	goto _err;
		}
	}
	///rows parallel compute
	idx_t* aRow;
	idx_t  aRowLen,rLen,abCumulLen=0;
	int tid;
	rbRoot* tRoot;	rbNode* tNodes; 
	SPVECT_IDX_DENSE_MAP* tIdxsMapAcc = NULL;
	#pragma omp parallel for schedule(static) \
	  private(aRow,aRowLen,rLen, tRoot,tNodes,tid) reduction(+:abCumulLen)
	for(idx_t r=0; r<a->M; r++){
		aRow		= a->JA + a->IRP[r]-OFF_F;
		aRowLen		= a->IRP[r+1] - a->IRP[r];
		tid 		= omp_get_thread_num();
		//TODO low overhead pointer airth can be avoided with if (symbRowImplID .. && )
		tIdxsMapAcc	= idxsMapAccs + tid;
		tRoot 		= rbRoots + tid;
		tNodes  	= rbNodes + tid * maxRowLen;

		rLen = CAT4(SpMM_Row_Symb_,OUT_IDXS,COL_PARTS,OFF_F)  
		(
			symbRowImplID, aRow, aRowLen, b, tRoot,tNodes, tIdxsMapAcc
   			#if _OUT_IDXS  == TRUE
			,*outIdxs[r]
			#endif
			#if _COL_PARTS == TRUE
			,gridCols, (*rowColPartsLens) + IDX2D(r,0,gridCols)
			#endif
		);
		rowLens[r]  = rLen;
		abCumulLen += rLen;
		///reset symb idxs keeping aux structs
		if (symbRowImplID==RBTREE || (IDX_RMUL_SYMB_RBTREE && (_COL_PARTS || _OUT_IDXS))){
			*tRoot = RB_ROOT_CACHED;
			memset(tNodes,0,rLen * sizeof(*tNodes));
		}
		if (symbRowImplID == IDXMAP)	_resetIdxMap(tIdxsMapAcc);
		
	}
	rowLens[a->M]	= abCumulLen;
	goto _free;

	_err:
	free(rowLens);
	#if _OUT_IDXS  == T
	if  (outIdxs)			free(*outIdxs);
	#endif
	#if _COL_PARTS == T
	if  (rowColPartsLens)	free(*rowColPartsLens);
	#endif
	rowLens = NULL;
	_free:
	free(upperBoundedRowsLens);
	free(upperBoundedSymMat);
	free(rbRoots);
	free(rbNodes);
	if (idxsMapAccs){
		for (uint i=0; i<maxThreads; i++)	free(idxsMapAccs[i].idxsMap);
		free(idxsMapAccs);
	}

	return rowLens;
}


///Sp3MM - rowByrowByrow	////////////////////////////////////////////////////////
//TODO COL_PARTS VERSION?
#if _OUT_IDXS == TRUE && _COL_PARTS == FALSE
idx_t CAT3(Sp3MM_Row_Symb_,OUT_IDXS,OFF_F) 
  (
	ROW_MMSYM_IMPL_MODE symbMMRowImplID,idx_t* aRowJA,idx_t aRowLen,spmat* b,spmat* c, 
	rbRoot* root,rbNode* nodes, SPVECT_IDX_DENSE_MAP* idxsMapAcc,idx_t* abRowJATmp
	#if _OUT_IDXS  == TRUE && !defined  OUT_IDXS_RBTREE_NODES 
	,idx_t* outIdxs
	#endif
  )
{
	struct rb_node* n;
	idx_t abRowLen	= CAT4(SpMM_Row_Symb_,OUT_IDXS,COL_PARTS,OFF_F) 
	  (symbMMRowImplID,aRowJA,aRowLen,b,root,nodes,idxsMapAcc,abRowJATmp);
	cleanRbNodes(root, nodes, abRowLen);
	idx_t abcRowLen	= CAT4(SpMM_Row_Symb_,OUT_IDXS,COL_PARTS,OFF_F)
	  (symbMMRowImplID,abRowJATmp,abRowLen,c,root,nodes,idxsMapAcc,abRowJATmp);

	#if 	_OUT_IDXS == TRUE
	#ifndef OUT_IDXS_RBTREE_NODES	
	//return the mul.result nnz index inside the rbNodes
	uint i=0;
	//rbNodeOrderedVisit(n,root)	outIdxs[ i++ ] = rb_entry(n,rbNode,rb)->key;
  	for (struct rb_node* n = rb_first(&root->rb_root); n; n = rb_next(n)){
		outIdxs[ i++ ] = rb_entry(n,rbNode,rb)->key;
	}
	#else
	/* return the non zero indexes of the mul.result row
 	 * sorting inplace the nodes inserted in the rbtree */
	sortRbNode(nodes,abcRowLen);
	#endif
	#endif
	return abcRowLen;
}

idx_t* CAT3(Sp3MM_Symb_,OUT_IDXS,OFF_F) 
  (
	ROW_MMSYM_IMPL_MODE symbMMRowImplID, spmat* a, spmat* b, spmat* c
	#if _OUT_IDXS  == TRUE
	,idx_t*** outIdxs
	#endif
  )
{
	//idxs keeping aux buffs
	idx_t*  abRowsJATmp	= NULL;
	rbRoot* rbRoots 	= NULL; rbNode* rbNodes	= NULL; 
	SPVECT_IDX_DENSE_MAP* idxsMapAccs = NULL;
	
	idx_t *abUpperBoundedRowsLens = NULL, *upperBoundedSymMat = NULL;
	///initial allocations
	idx_t* rowLens = malloc(sizeof(*rowLens) * (a->M +1) ); //to return
	if (!rowLens){
		ERRPRINT("SpMM_Symb_ rowLens malloc errd\n");
		goto _err;
	}
	
	#if _OUT_IDXS  == TRUE
	if (!(*outIdxs = malloc(sizeof(**outIdxs) * a->M))){
		ERRPRINT("SpMM_Symb_ outIdxs malloc errd\n");
		goto _err;
	}
	if (!(abUpperBoundedRowsLens = CAT(spMMSizeUpperbound_,OFF_F)(a,b)))	
		goto _err;
  	/*TODO TODO instead of doing one sym product first to have a correct UB
	 *	use an heuristics here to get output matrix size
	 */
	idx_t abcUBSize = abUpperBoundedRowsLens[a->M] * SP3MM_UB_HEURISTIC;
	if (!(upperBoundedSymMat=malloc(sizeof(*upperBoundedSymMat)*abcUBSize))){
		ERRPRINT("SpMM_Symb_ upperBoundedSymMat malloc errd\n");
		goto _err;
	}
	//TODO heuristic TO UB rows bounds ... require compacting copy
	for (idx_t i=0,cumul=0; i<a->M; 
	  cumul += SP3MM_UB_HEURISTIC * abUpperBoundedRowsLens[i++])
		*outIdxs[i] = upperBoundedSymMat + cumul; 
  	#endif	//#if _OUT_IDXS  == TRUE
	//rbTrees for index keeping
	uint maxThreads 	= omp_get_max_threads(); //TODO FROM CFG
	idx_t abMaxRowLen 	= reductionMaxSeq(abUpperBoundedRowsLens, a->M);
	#ifdef 	  HEURISTICS_UB
	idx_t maxRowLenUB 	= abMaxRowLen * SP3MM_UB_HEURISTIC; //TODO UB HEURISTC
	#else
	idx_t maxRowLenUB 	= c->N;
	#endif	//HEURISTICS_UB

	if (!(abRowsJATmp	= malloc(maxThreads*maxRowLenUB*sizeof(*abRowsJATmp)) ) ){
		ERRPRINT("Sp3MM_Symb_ abRowsJATmp malloc errd\n");
		goto _err;
	}
	if (symbMMRowImplID == RBTREE || IDX_RMUL_SYMB_RBTREE ){
		rbNodes	= malloc(maxThreads * maxRowLenUB * sizeof(*rbNodes));
		rbRoots = malloc(maxThreads * sizeof(*rbRoots));
		if (!rbRoots || !rbNodes ){
			ERRPRINT("Sp3MM_Symb_ rbRoots || rbNodes malloc errd\n");
			goto _err;
		}
		//init roots
		for (uint i=0; i<maxThreads; i++)	rbRoots[i] = RB_ROOT_CACHED;
		if (!( rbNodes	= malloc(maxThreads * maxRowLenUB * sizeof(*rbNodes)) )){
			ERRPRINT("Sp3MM_Symb_ rbNodes malloc errdn\n");
			goto _err;
		}
	}
	if (symbMMRowImplID == IDXMAP){
		if (!( idxsMapAccs = calloc(maxThreads,sizeof(*idxsMapAccs)) )){
			ERRPRINT("Sp3MM_Symb_ idxsMapAccs calloc errd\n");
			goto _err;
		}	
		//init idxs maps
		for (uint i=0; i<maxThreads; i++){
			if(initSpVectIdxDenseAcc(c->N,idxsMapAccs+i))	goto _err;
		}
	}
	///rows parallel compute
	idx_t* aRow;
	idx_t  aRowLen,rLen,outCumulLen=0;
	//threads local pointers
	int tid;
	rbRoot* tRoot;
	rbNode* tNodes;
	SPVECT_IDX_DENSE_MAP* tIdxsMapAcc;
	idx_t* tABRowJATmp;

	#pragma omp parallel for schedule(static) \
	private(aRow,aRowLen,rLen, tRoot,tNodes,tid) reduction(+:outCumulLen)
	for(idx_t r=0; r<a->M; r++){
		aRow		= a->JA + a->IRP[r]-OFF_F;
		aRowLen		= a->IRP[r+1] - a->IRP[r];
		tid 		= omp_get_thread_num();
		tRoot 		= rbRoots + tid;
		tNodes  	= rbNodes + tid * maxRowLenUB;
		tIdxsMapAcc = NULL;	//TODO
		tABRowJATmp	= abRowsJATmp + tid * maxRowLenUB;
		rLen = CAT3(Sp3MM_Row_Symb_,OUT_IDXS,OFF_F)
		  (symbMMRowImplID, aRow,aRowLen,b,c,tRoot,tNodes,tIdxsMapAcc,tABRowJATmp,
		  	#if _OUT_IDXS == TRUE
			*outIdxs[r]
			#endif
		  );
		outCumulLen += rLen;
		rowLens[r] = rLen;
		
	}
	goto _free;
	_err:
	free(rowLens);
	#if _OUT_IDXS == T
	if (*outIdxs)	free(*outIdxs);
	#endif
	rowLens = NULL;
	_free:
	free(abUpperBoundedRowsLens);
	free(upperBoundedSymMat);
	free(rbRoots);
	free(rbNodes);
	free(abRowsJATmp);

	return rowLens;
}

#endif	//#if !defined COL_PARTS && defined OUT_IDXS


///restore aux macros entry state
//#undef _OUT_ID
//#undef _COL_PARTS
#pragma pop_macro("OUT_IDXS")
#pragma pop_macro("_OUT_IDXS")
#pragma pop_macro("COL_PARTS")
#pragma pop_macro("_COL_PARTS")