psblas3/base/serial/impl/sp3mm4amg/Sp3MM_CSR_OMP_Num_Generic.c

/*
 *              Sp3MM_for_AlgebraicMultiGrid
 *    (C) Copyright 2021-2022
 *        Andrea Di Iorio      
 * 
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *    1. Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *    2. Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions, and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *    3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may
 *       not be used to endorse or promote products derived from this
 *       software without specific written permission.
 * 
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 *  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 *  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS
 *  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 */ 

/*#pragma message( "compiling SpMM_CSR_OMP_Generic.c with OFF_F as:" STR(OFF_F) )*/
#ifndef OFF_F
    #error generic implementation requires OFF_F defined
#endif

#ifndef SP3MM_OMP_SYMB
#define SP3MM_OMP_SYMB

//allocCSRSpMatSymbStep aux function for IRP set by symb step output
static inline idx_t _setCSR_IRP_1DPartitioing(spmat* m, idx_t* rowSizes){
	idx_t r,cumulSize;
	for (r=0,cumulSize=0; r<m->M; cumulSize += rowSizes[r++])
		m->IRP[r] = cumulSize;
	DEBUGCHECKS	assert(cumulSize == rowSizes[m->M]);
	m->IRP[m->M] = cumulSize;
	return cumulSize;
}
static inline idx_t _setCSR_IRP_2DPartitioing(spmat* m,idx_t* rowSizes,ushort gridCols){
	idx_t cumulSize=0;
	for (idx_t r=0,cumulSizeOld; r<m->M; r++){
		m->IRP[r] = cumulSize;
		for (ushort gc=0; gc<gridCols; gc++){
			cumulSizeOld = cumulSize;
			cumulSize += rowSizes[ IDX2D(r,gc,gridCols) ];
			//inplace update 2D rowParts len matrix as IRP like	start offsets
			rowSizes[ IDX2D(r,gc,gridCols) ] = cumulSizeOld; 
		}
	}
	DEBUGCHECKS		assert(cumulSize == rowSizes[m->M*gridCols]);
	m->IRP[m->M] = cumulSize;
	return cumulSize;
}
/*
 * allocate CSR mat @m, setting up correctly IRP and other buffers allocations
 * if @gridCols == 1: (1D partitioning of @m) -> @rowsSizes will be an array of row lenghts
 * if @gridCols >  1: (2D partitioning of @m) -> @rowsSizes will be a matrix with
 * 		elem i,j = len of j-th colPartition (out of @gridCols) of i-th row
 */
static inline int allocCSRSpMatSymbStep(spmat* m,idx_t* rowSizes,ushort gridCols){
	//setup IRP and get cumul, whole size of @m
	if (!gridCols)	   return EXIT_FAILURE;
	idx_t cumulSize;
	if (gridCols == 1) cumulSize = _setCSR_IRP_1DPartitioing(m,rowSizes);
	/*if (gridCols > 1} -- static analizer miss unsignedness...*/
	else		   cumulSize = _setCSR_IRP_2DPartitioing(m,rowSizes,gridCols);
	m->IRP[m->M] 	=  cumulSize;
	m->NZ		=  cumulSize;

	if (!(m->AS = malloc(rowSizes[m->M] * sizeof(*m->AS))) ){
		ERRPRINT("allocCSRSpMatSymbStep m->AS malloc errd\n");
		return EXIT_FAILURE;
	}
	if (!(m->JA = malloc(rowSizes[m->M] * sizeof(*m->JA))) ){
		ERRPRINT("allocCSRSpMatSymbStep m->JA malloc errd\n");
		return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}
#endif
//////////////////// COMPUTE CORE Sp[3]MM SYMB-NUMB PHASE //////////////////////////
////////Sp3MM as 2 x SpMM
///1D
spmat* CAT(spmmRowByRow_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
	ACC_DENSE *accVects = NULL,*acc;
	SPMM_ACC* outAccumul=NULL;
	idx_t* rowsSizes = NULL;
	///init AB matrix with SPMM heuristic preallocation
	spmat* AB = allocSpMatrix(A->M,B->N);
	if (!AB)	goto _err;
	///aux structures alloc 
	if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){
		ERRPRINT("accVects init failed\n");
		goto _err;
	}
	///SYMBOLIC STEP
	if (!(rowsSizes = CAT(SpMM_Symb___,OFF_F) (cfg->symbMMRowImplID, A,B)))
		goto _err;
	if (allocCSRSpMatSymbStep(AB,rowsSizes,1))	goto _err;
	
	///NUMERIC STEP
	((CHUNKS_DISTR_INTERF)	cfg->chunkDistrbFunc) (AB->M,AB,cfg);
	AUDIT_INTERNAL_TIMES	Start=omp_get_wtime();
	#pragma omp parallel for schedule(runtime) private(acc)
	for (idx_t r=0;  r<A->M; r++){	//row-by-row formulation
		acc = accVects + omp_get_thread_num();
		_resetAccVect(acc);   //rezero for the next A row
		//for each A's row nnz, accumulate scalar vector product nnz.val * B[nnz.col]
		/*	direct use of sparse scalar vector multiplication
		for (idx_t ja=A->IRP[r]-OFF_F,ca,jb,bRowLen;  ja<A->IRP[r+1]-OFF_F;  ja++){
			ca 		= A->JA[ja]		- OFF_F;
			jb 		= B->IRP[ca]	- OFF_F;
			bRowLen = B->IRP[ca+1] 	- B->IRP[ca];
			CAT(scSparseVectMul_,OFF_F)(A->AS[ja], B->AS+jb,B->JA+jb,bRowLen,acc);
		} */
		for (ulong ja=A->IRP[r]-OFF_F;	ja<A->IRP[r+1]-OFF_F;	ja++) //row-by-row formul
			CAT(scSparseRowMul_,OFF_F)(A->AS[ja], B, A->JA[ja]-OFF_F, acc);
		//direct sparsify: trasform accumulated dense vector to a CSR row
		sparsifyDirect(acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API
	}
	#if OFF_F != 0
	C_FortranShiftIdxs(AB);
	#endif
	AUDIT_INTERNAL_TIMES	End=omp_get_wtime();
	DEBUG                   checkOverallocPercent(rowsSizes,AB);
	goto _free;

	_err:
	if(AB)  freeSpmat(AB);
	AB=NULL;	//nothing'll be returned
	_free:
	free(rowsSizes);
	if(accVects)	freeAccsDense(accVects,cfg->threadNum);
	if(outAccumul)  freeSpMMAcc(outAccumul);

	return AB;

}
spmat* CAT(spmmRowByRow1DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
	ACC_DENSE *accVects = NULL,*acc;
	SPMM_ACC* outAccumul=NULL;
	idx_t* rowsSizes = NULL;
	///init AB matrix with SPMM heuristic preallocation
	spmat* AB = allocSpMatrix(A->M,B->N);
	if (!AB)	goto _err;
	///aux structures alloc 
	if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){
		ERRPRINT("accVects init failed\n");
		goto _err;
	}
	///SYMBOLIC STEP
	if (!(rowsSizes = CAT(SpMM_Symb___,OFF_F) (cfg->symbMMRowImplID, A,B)))
		goto _err;
	if (allocCSRSpMatSymbStep(AB,rowsSizes,1))
		goto _err;
	
	///NUMERIC STEP
	//perform Gustavson over rows blocks -> M / @cfg->gridRows
	ulong rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows;
	((CHUNKS_DISTR_INTERF)	cfg->chunkDistrbFunc) (cfg->gridRows,AB,cfg);
	AUDIT_INTERNAL_TIMES	Start=omp_get_wtime();
	ulong b,startRow,block; //omp for aux vars
	#pragma omp parallel for schedule(runtime) private(acc,startRow,block)
	for (b=0;   b < cfg->gridRows; b++){
		block	  = UNIF_REMINDER_DISTRI(b,rowBlock,rowBlockRem);
		startRow   = UNIF_REMINDER_DISTRI_STARTIDX(b,rowBlock,rowBlockRem);
		for (idx_t r=startRow;  r<startRow+block; r++){
			acc = accVects + omp_get_thread_num();
			_resetAccVect(acc);   //rezero for the next A row
			//for each A's row nnz, accumulate scalar vector product nnz.val * B[nnz.col]
			/*	direct use of sparse scalar vector multiplication
			for (idx_t ja=A->IRP[r]-OFF_F,ca,jb,bRowLen;  ja<A->IRP[r+1]-OFF_F;  ja++){
				ca 		= A->JA[ja]		- OFF_F;
				jb 		= B->IRP[ca]	- OFF_F;
				bRowLen = B->IRP[ca+1] 	- B->IRP[ca];
				CAT(scSparseVectMul_,OFF_F)(A->AS[ja], B->AS+jb,B->JA+jb,bRowLen,acc);
			} */
			for (ulong ja=A->IRP[r]-OFF_F;	ja<A->IRP[r+1]-OFF_F;	ja++) //row-by-row formul
				CAT(scSparseRowMul_,OFF_F)(A->AS[ja], B, A->JA[ja]-OFF_F, acc);
			//direct sparsify: trasform accumulated dense vector to a CSR row
			sparsifyDirect(acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API
		}
	}
	#if OFF_F != 0
	C_FortranShiftIdxs(AB);
	#endif
	AUDIT_INTERNAL_TIMES	End=omp_get_wtime();
	DEBUG			checkOverallocPercent(rowsSizes,AB);
	goto _free;

	_err:
	if(AB)  freeSpmat(AB);
	AB=NULL;	//nothing'll be returned
	_free:
	if(rowsSizes)   free(rowsSizes);
	if(accVects)	freeAccsDense(accVects,cfg->threadNum);
	if(outAccumul)  freeSpMMAcc(outAccumul);

	return AB;

}
spmat* CAT(spmmRowByRow2DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){ 
	DEBUG printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%luxN=%lu\n",A->M,B->N);
	idx_t* bColOffsets = NULL;   //B group columns starting offset for each row
	ACC_DENSE *accVectors=NULL,*accV;
	SPACC* accRowPart;
	spmat* AB = allocSpMatrix(A->M,B->N);
	SPMM_ACC* outAccumul=NULL;
	idx_t	*rowsPartsSizes=NULL, *rowSizes=NULL;	//for rows' cols partition, correct len
	if (!AB)	goto _err;
	//2D indexing aux vars
	ulong gridSize=cfg->gridRows*cfg->gridCols, aSubRowsN=A->M*cfg->gridCols;
	ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;
	ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;
	ulong startRow,startCol,rowBlock,colBlock; //data division aux variables
	////get bColOffsets for B column groups 
	if (!(bColOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols)))
		goto _err;
	///TODO TODO
	_err:
	if (AB) freeSpmat(AB);
	AB = NULL; 
	_free:
	free(rowsPartsSizes);
	free(bColOffsets);
	if (accVectors)  freeAccsDense(accVectors,gridSize);
	if (outAccumul)  freeSpMMAcc(outAccumul);

	return AB;
}
/*	TODO
///2D
//PARTITIONS NOT ALLOCATED
spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){
///SP3MM
spmat* CAT(sp3mmRowByRowPair_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,SPMM_INTERF spmm){
////////Sp3MM direct
///1D
spmat* CAT(sp3mmRowByRowMerged_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,SPMM_INTERF spmm){
*/
added sp3mm4amg source code to the repository and created C wrapper routine to be used in the fortran interface 2 years ago			`/*`
			`* Sp3MM_for_AlgebraicMultiGrid`
			`* (C) Copyright 2021-2022`
			`* Andrea Di Iorio`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* 1. Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* 2. Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions, and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* 3. The name of the Sp3MM_for_AlgebraicMultiGrid or the names of its contributors may`
			`* not be used to endorse or promote products derived from this`
			`* software without specific written permission.`
			`*`
			`* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS`
			* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
			`* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Sp3MM_for_AlgebraicMultiGrid GROUP OR ITS CONTRIBUTORS`
			`* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR`
			`* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF`
			`* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS`
			`* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN`
			`* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)`
			`* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE`
			`* POSSIBILITY OF SUCH DAMAGE.`
			`*/`

			`/#pragma message( "compiling SpMM_CSR_OMP_Generic.c with OFF_F as:" STR(OFF_F) )/`
			`#ifndef OFF_F`
			`#error generic implementation requires OFF_F defined`
			`#endif`

			`#ifndef SP3MM_OMP_SYMB`
			`#define SP3MM_OMP_SYMB`

			`//allocCSRSpMatSymbStep aux function for IRP set by symb step output`
			`static inline idx_t _setCSR_IRP_1DPartitioing(spmat* m, idx_t* rowSizes){`
			`idx_t r,cumulSize;`
			`for (r=0,cumulSize=0; r<m->M; cumulSize += rowSizes[r++])`
			`m->IRP[r] = cumulSize;`
			`DEBUGCHECKS assert(cumulSize == rowSizes[m->M]);`
			`m->IRP[m->M] = cumulSize;`
			`return cumulSize;`
			`}`
			`static inline idx_t _setCSR_IRP_2DPartitioing(spmat* m,idx_t* rowSizes,ushort gridCols){`
			`idx_t cumulSize=0;`
			`for (idx_t r=0,cumulSizeOld; r<m->M; r++){`
			`m->IRP[r] = cumulSize;`
			`for (ushort gc=0; gc<gridCols; gc++){`
			`cumulSizeOld = cumulSize;`
			`cumulSize += rowSizes[ IDX2D(r,gc,gridCols) ];`
			`//inplace update 2D rowParts len matrix as IRP like start offsets`
			`rowSizes[ IDX2D(r,gc,gridCols) ] = cumulSizeOld;`
			`}`
			`}`
			`DEBUGCHECKS assert(cumulSize == rowSizes[m->M*gridCols]);`
			`m->IRP[m->M] = cumulSize;`
			`return cumulSize;`
			`}`
			`/*`
			`* allocate CSR mat @m, setting up correctly IRP and other buffers allocations`
			`* if @gridCols == 1: (1D partitioning of @m) -> @rowsSizes will be an array of row lenghts`
			`* if @gridCols > 1: (2D partitioning of @m) -> @rowsSizes will be a matrix with`
			`* elem i,j = len of j-th colPartition (out of @gridCols) of i-th row`
			`*/`
			`static inline int allocCSRSpMatSymbStep(spmat* m,idx_t* rowSizes,ushort gridCols){`
			`//setup IRP and get cumul, whole size of @m`
			`if (!gridCols) return EXIT_FAILURE;`
			`idx_t cumulSize;`
			`if (gridCols == 1) cumulSize = _setCSR_IRP_1DPartitioing(m,rowSizes);`
			`/if (gridCols > 1} -- static analizer miss unsignedness.../`
			`else cumulSize = _setCSR_IRP_2DPartitioing(m,rowSizes,gridCols);`
			`m->IRP[m->M] = cumulSize;`
			`m->NZ = cumulSize;`

			`if (!(m->AS = malloc(rowSizes[m->M] * sizeof(*m->AS))) ){`
			`ERRPRINT("allocCSRSpMatSymbStep m->AS malloc errd\n");`
			`return EXIT_FAILURE;`
			`}`
			`if (!(m->JA = malloc(rowSizes[m->M] * sizeof(*m->JA))) ){`
			`ERRPRINT("allocCSRSpMatSymbStep m->JA malloc errd\n");`
			`return EXIT_FAILURE;`
			`}`
			`return EXIT_SUCCESS;`
			`}`
			`#endif`
			`//////////////////// COMPUTE CORE Sp[3]MM SYMB-NUMB PHASE //////////////////////////`
			`////////Sp3MM as 2 x SpMM`
			`///1D`
			`spmat* CAT(spmmRowByRow_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){`
			`ACC_DENSE accVects = NULL,acc;`
			`SPMM_ACC* outAccumul=NULL;`
			`idx_t* rowsSizes = NULL;`
			`///init AB matrix with SPMM heuristic preallocation`
			`spmat* AB = allocSpMatrix(A->M,B->N);`
			`if (!AB) goto _err;`
			`///aux structures alloc`
			`if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){`
			`ERRPRINT("accVects init failed\n");`
			`goto _err;`
			`}`
			`///SYMBOLIC STEP`
			`if (!(rowsSizes = CAT(SpMM_Symb___,OFF_F) (cfg->symbMMRowImplID, A,B)))`
			`goto _err;`
			`if (allocCSRSpMatSymbStep(AB,rowsSizes,1)) goto _err;`

			`///NUMERIC STEP`
			`((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (AB->M,AB,cfg);`
			`AUDIT_INTERNAL_TIMES Start=omp_get_wtime();`
			`#pragma omp parallel for schedule(runtime) private(acc)`
			`for (idx_t r=0; r<A->M; r++){ //row-by-row formulation`
			`acc = accVects + omp_get_thread_num();`
			`_resetAccVect(acc); //rezero for the next A row`
			`//for each A's row nnz, accumulate scalar vector product nnz.val * B[nnz.col]`
			`/* direct use of sparse scalar vector multiplication`
			`for (idx_t ja=A->IRP[r]-OFF_F,ca,jb,bRowLen; ja<A->IRP[r+1]-OFF_F; ja++){`
			`ca = A->JA[ja] - OFF_F;`
			`jb = B->IRP[ca] - OFF_F;`
			`bRowLen = B->IRP[ca+1] - B->IRP[ca];`
			`CAT(scSparseVectMul_,OFF_F)(A->AS[ja], B->AS+jb,B->JA+jb,bRowLen,acc);`
			`} */`
			`for (ulong ja=A->IRP[r]-OFF_F; ja<A->IRP[r+1]-OFF_F; ja++) //row-by-row formul`
			`CAT(scSparseRowMul_,OFF_F)(A->AS[ja], B, A->JA[ja]-OFF_F, acc);`
			`//direct sparsify: trasform accumulated dense vector to a CSR row`
			`sparsifyDirect(acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API`
			`}`
			`#if OFF_F != 0`
			`C_FortranShiftIdxs(AB);`
			`#endif`
			`AUDIT_INTERNAL_TIMES End=omp_get_wtime();`
			`DEBUG checkOverallocPercent(rowsSizes,AB);`
			`goto _free;`

			`_err:`
			`if(AB) freeSpmat(AB);`
			`AB=NULL; //nothing'll be returned`
			`_free:`
			`free(rowsSizes);`
			`if(accVects) freeAccsDense(accVects,cfg->threadNum);`
			`if(outAccumul) freeSpMMAcc(outAccumul);`

			`return AB;`

			`}`
			`spmat* CAT(spmmRowByRow1DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){`
			`ACC_DENSE accVects = NULL,acc;`
			`SPMM_ACC* outAccumul=NULL;`
			`idx_t* rowsSizes = NULL;`
			`///init AB matrix with SPMM heuristic preallocation`
			`spmat* AB = allocSpMatrix(A->M,B->N);`
			`if (!AB) goto _err;`
			`///aux structures alloc`
			`if (!(accVects = _initAccVectors(cfg->threadNum,AB->N))){`
			`ERRPRINT("accVects init failed\n");`
			`goto _err;`
			`}`
			`///SYMBOLIC STEP`
			`if (!(rowsSizes = CAT(SpMM_Symb___,OFF_F) (cfg->symbMMRowImplID, A,B)))`
			`goto _err;`
			`if (allocCSRSpMatSymbStep(AB,rowsSizes,1))`
			`goto _err;`

			`///NUMERIC STEP`
			`//perform Gustavson over rows blocks -> M / @cfg->gridRows`
			`ulong rowBlock = AB->M/cfg->gridRows, rowBlockRem = AB->M%cfg->gridRows;`
			`((CHUNKS_DISTR_INTERF) cfg->chunkDistrbFunc) (cfg->gridRows,AB,cfg);`
			`AUDIT_INTERNAL_TIMES Start=omp_get_wtime();`
			`ulong b,startRow,block; //omp for aux vars`
			`#pragma omp parallel for schedule(runtime) private(acc,startRow,block)`
			`for (b=0; b < cfg->gridRows; b++){`
			`block = UNIF_REMINDER_DISTRI(b,rowBlock,rowBlockRem);`
			`startRow = UNIF_REMINDER_DISTRI_STARTIDX(b,rowBlock,rowBlockRem);`
			`for (idx_t r=startRow; r<startRow+block; r++){`
			`acc = accVects + omp_get_thread_num();`
			`_resetAccVect(acc); //rezero for the next A row`
			`//for each A's row nnz, accumulate scalar vector product nnz.val * B[nnz.col]`
			`/* direct use of sparse scalar vector multiplication`
			`for (idx_t ja=A->IRP[r]-OFF_F,ca,jb,bRowLen; ja<A->IRP[r+1]-OFF_F; ja++){`
			`ca = A->JA[ja] - OFF_F;`
			`jb = B->IRP[ca] - OFF_F;`
			`bRowLen = B->IRP[ca+1] - B->IRP[ca];`
			`CAT(scSparseVectMul_,OFF_F)(A->AS[ja], B->AS+jb,B->JA+jb,bRowLen,acc);`
			`} */`
			`for (ulong ja=A->IRP[r]-OFF_F; ja<A->IRP[r+1]-OFF_F; ja++) //row-by-row formul`
			`CAT(scSparseRowMul_,OFF_F)(A->AS[ja], B, A->JA[ja]-OFF_F, acc);`
			`//direct sparsify: trasform accumulated dense vector to a CSR row`
			`sparsifyDirect(acc,AB,r); //0,NULL);TODO COL PARTITIONING COMMON API`
			`}`
			`}`
			`#if OFF_F != 0`
			`C_FortranShiftIdxs(AB);`
			`#endif`
			`AUDIT_INTERNAL_TIMES End=omp_get_wtime();`
			`DEBUG checkOverallocPercent(rowsSizes,AB);`
			`goto _free;`

			`_err:`
			`if(AB) freeSpmat(AB);`
			`AB=NULL; //nothing'll be returned`
			`_free:`
			`if(rowsSizes) free(rowsSizes);`
			`if(accVects) freeAccsDense(accVects,cfg->threadNum);`
			`if(outAccumul) freeSpMMAcc(outAccumul);`

			`return AB;`

			`}`
			`spmat* CAT(spmmRowByRow2DBlocks_SymbNum_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){`
			`DEBUG printf("spmm\trowBlocks of A ,\tcolBlocks of B\tM=%luxN=%lu\n",A->M,B->N);`
			`idx_t* bColOffsets = NULL; //B group columns starting offset for each row`
			`ACC_DENSE accVectors=NULL,accV;`
			`SPACC* accRowPart;`
			`spmat* AB = allocSpMatrix(A->M,B->N);`
			`SPMM_ACC* outAccumul=NULL;`
			`idx_t rowsPartsSizes=NULL, rowSizes=NULL; //for rows' cols partition, correct len`
			`if (!AB) goto _err;`
			`//2D indexing aux vars`
			`ulong gridSize=cfg->gridRowscfg->gridCols, aSubRowsN=A->Mcfg->gridCols;`
			`ulong _rowBlock = AB->M/cfg->gridRows, _rowBlockRem = AB->M%cfg->gridRows;`
			`ulong _colBlock = AB->N/cfg->gridCols, _colBlockRem = AB->N%cfg->gridCols;`
			`ulong startRow,startCol,rowBlock,colBlock; //data division aux variables`
			`////get bColOffsets for B column groups`
			`if (!(bColOffsets = CAT(colsOffsetsPartitioningUnifRanges_,OFF_F)(B,cfg->gridCols)))`
			`goto _err;`
			`///TODO TODO`
			`_err:`
			`if (AB) freeSpmat(AB);`
			`AB = NULL;`
			`_free:`
			`free(rowsPartsSizes);`
			`free(bColOffsets);`
			`if (accVectors) freeAccsDense(accVectors,gridSize);`
			`if (outAccumul) freeSpMMAcc(outAccumul);`

			`return AB;`
			`}`
			`/* TODO`
			`///2D`
			`//PARTITIONS NOT ALLOCATED`
			`spmat* CAT(spmmRowByRow2DBlocksAllocated_,OFF_F)(spmat* A,spmat* B, CONFIG* cfg){`
			`///SP3MM`
			`spmat* CAT(sp3mmRowByRowPair_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,SPMM_INTERF spmm){`
			`////////Sp3MM direct`
			`///1D`
			`spmat* CAT(sp3mmRowByRowMerged_,OFF_F)(spmat* R,spmat* AC,spmat* P,CONFIG* cfg,SPMM_INTERF spmm){`
			`*/`