diff --git a/cuda/hlldev.c b/cuda/hlldev.c
index f9359bf7..eb78c16c 100644
--- a/cuda/hlldev.c
+++ b/cuda/hlldev.c
@@ -182,20 +182,27 @@ int spmvHllDeviceFloat(void *deviceMat, float alpha, void* deviceX,
 }
 
 void
-dspmdmmhll_gpu (double *z, int s, int vPitch, double *y, double alpha, double* cM, int* rP,
-	     int* rS, int hackSize, int* hackOffs, int avgNnzPerRow, int rows, double *x, double beta, int firstIndex)
+dspmdmmhll_gpu (double *z, int count, int zPitch, double alpha, double* cM, int* rP,
+		int* rS, int hackSize, int* hackOffs, int avgNnzPerRow, int rows,
+		double *x, int xPitch, double beta, int firstIndex)
 {
   int i=0;
   spgpuHandle_t handle=psb_cudaGetHandle();
-  for (i=0; i<s; i++)
-    {
-      spgpuDhellspmv (handle, (double*) z, (double*)y, alpha, (double*) cM, rP,
-		       hackSize, hackOffs, rS, NULL,
-               avgNnzPerRow, rows, (double*)x, beta, firstIndex);
-      z += vPitch;
-      y += vPitch;
-      x += vPitch;		
-    }
+#if defined(NEW_MM)
+  spgpuDhellspmm(handle, count, (double*) z, zPitch, (double*)z, zPitch,
+		  alpha, (double*) cM, rP,hackSize, hackOffs, rS, NULL,
+		  rows, (double*)x, xPitch, beta, firstIndex);
+#else
+  
+  for (i=0; i<count; i++) {
+    spgpuDhellspmv (handle, (double*) z, (double*)z, alpha,
+		    (double*) cM, rP,
+		    hackSize, hackOffs, rS, NULL,
+		    avgNnzPerRow, rows, (double*)x, beta, firstIndex);
+    z += zPitch;
+    x += xPitch;		
+  }
+#endif
 }
 
 //new
@@ -211,11 +218,11 @@ int spmvHllDeviceDouble(void *deviceMat, double alpha, void* deviceX,
   /*__assert(x->size_ >= devMat->columns, "ERROR: x vector's size is not >= to matrix size (columns)");*/
   /*__assert(y->size_ >= devMat->rows, "ERROR: y vector's size is not >= to matrix size (rows)");*/
 #endif
-  dspmdmmhll_gpu ((double *)y->v_, y->count_, y->pitch_, (double *)y->v_,
-           alpha, (double *)devMat->cM, 
-	       devMat->rP, devMat->rS, devMat->hackSize, devMat->hackOffs,
-           devMat->avgNzr, devMat->rows,
-           (double *)x->v_, beta, devMat->baseIndex);
+  dspmdmmhll_gpu ((double *)y->v_, y->count_, y->pitch_,
+		  alpha, (double *)devMat->cM, 
+		  devMat->rP, devMat->rS, devMat->hackSize, devMat->hackOffs,
+		  devMat->avgNzr, devMat->rows,
+		  (double *)x->v_, x->pitch_, beta, devMat->baseIndex);
 
   return SPGPU_SUCCESS;
 }
diff --git a/cuda/spgpu/hell.h b/cuda/spgpu/hell.h
index d177686c..e62f9ae0 100644
--- a/cuda/spgpu/hell.h
+++ b/cuda/spgpu/hell.h
@@ -104,7 +104,26 @@ void spgpuDhellspmv (spgpuHandle_t handle,
 	const __device double *x, 
 	double beta,
 	int baseIndex);
-
+#if defined(NEW_MM)
+void spgpuDhellspmm(spgpuHandle_t handle,
+		     int count,
+		     __device double *z,
+		     int zpitch,
+		     const __device double *y,
+		     int ypitch,
+		     double alpha, 
+		     const __device double* cM, 
+		     const __device int* rP,
+		     int hackSize,
+		     const __device int* hackOffsets, 
+		     const __device int* rS,
+		     const __device int* rIdx, 
+		     int rows, 
+		     const __device double *x,
+		     int xpitch,
+		     double beta,
+		     int baseIndex);
+#endif
 
 /** 
 * \fn void spgpuChellspmv (spgpuHandle_t handle,__device cuFloatComplex *z,const __device cuFloatComplex *y, cuFloatComplex alpha, const __device cuFloatComplex* cM, const __device int* rP,int hackSize,const __device int* hackOffsets, const __device int* rS, const __device int* rIdx, int avgNnzPerRow, int rows, const __device cuFloatComplex *x, cuFloatComplex beta, int baseIndex)
diff --git a/cuda/spgpu/kernels/hell_dspmv.cu b/cuda/spgpu/kernels/hell_dspmv.cu
index b06f8cfb..dc99e842 100644
--- a/cuda/spgpu/kernels/hell_dspmv.cu
+++ b/cuda/spgpu/kernels/hell_dspmv.cu
@@ -16,7 +16,7 @@
 
 #include "cudadebug.h"
 #include "cudalang.h"
-
+#include <stdio.h>
 extern "C"
 {
 #include "core.h"
@@ -30,3 +30,202 @@ extern "C"
 #define TEX_FETCH_TYPE int2
 #include "hell_spmv_base.cuh"
 
+
+
+#if defined(NEW_MM)
+
+#define MMBSZ   12
+
+#undef GEN_SPGPU_HELL_NAME
+#define GEN_SPGPU_HELL_NAME(x) CONCAT(CONCAT(spgpu,x),hellspmm)
+#undef GEN_SPGPU_HELL_NAME_VANILLA
+#define GEN_SPGPU_HELL_NAME_VANILLA(x) CONCAT(CONCAT(spgpu,x),hellspmm_vanilla)
+
+
+__global__ void
+CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn)
+  (int count, VALUE_TYPE *z, int zPitch, const VALUE_TYPE *y, int yPitch,
+   VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP,
+   int hackSize, const int* hackOffsets, const int* rS, int rows,
+   const VALUE_TYPE *x, int xPitch, 
+   VALUE_TYPE beta, int baseIndex)
+{
+  VALUE_TYPE *pz,*px,*py;
+  VALUE_TYPE zProd = CONCAT(zero_,VALUE_TYPE)();
+  VALUE_TYPE yVal; 
+  __shared__ VALUE_TYPE temp[THREAD_BLOCK][MMBSZ];
+        
+  int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);
+    
+  if (i < rows) {
+    int j; 
+    int hackId = i / hackSize;
+    int hackLaneId = i % hackSize;
+    
+    int hackOffset;
+    unsigned int laneId = threadIdx.x % 32;
+    if (laneId == 0)
+      hackOffset = hackOffsets[hackId];
+    //__syncthreads();
+    hackOffset = __shfl_sync(0xFFFFFFFF,hackOffset, 0) + hackLaneId;		
+    
+    rP += hackOffset; 
+    cM += hackOffset; 
+    
+    int rowSize = rS[i];
+    for (int k=0; k<count; k++) {
+      temp[threadIdx.x][k] = CONCAT(zero_,VALUE_TYPE)();
+    }      
+    
+    for (int j = 0; j < rowSize; j++) {
+      int pointer;
+      VALUE_TYPE value;
+      VALUE_TYPE fetch;
+      
+      pointer = rP[0] - baseIndex;
+      rP += hackSize;
+      
+      value = cM[0];
+      cM += hackSize;
+
+      px = (VALUE_TYPE *) x;
+      for (int k=0; k<count; k++) {
+	fetch = px[pointer]; 
+	temp[threadIdx.x][k] =
+	  CONCAT(VALUE_TYPE, _fma)(value, fetch, temp[threadIdx.x][k]);
+	px = px + xPitch;
+      }
+    }
+    // Since z and y are accessed with the same offset by the same thread,
+    // and the write to z follows the y read, y and z can share the same base address (in-place computing).
+    py = (VALUE_TYPE *) y;
+    pz = z;
+    if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
+      for (int k=0; k<count; k++) {
+	yVal = py[i];
+	pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul) (alpha,  temp[threadIdx.x][k]));
+	py += yPitch;
+	pz += zPitch;
+      }
+    else
+      for (int k=0; k<count; k++) {
+	pz[i] = CONCAT(VALUE_TYPE, _mul) (alpha,  temp[threadIdx.x][k]);
+	pz += zPitch;
+      }
+  }
+}
+
+
+void
+CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL))
+  (spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y, int yPitch,
+   VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* rP, int hackSize, const int* hackOffsets,
+   const int* rS,  const __device int* rIdx, int rows,
+   const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta, int baseIndex)
+{
+  dim3 block (THREAD_BLOCK, 1);
+  dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
+  // Should we generalize the code to 1/2/4/8 threads per row?
+  // And maybe adjust THREAD_BLOCK size? 
+  int shrMemSize;
+  shrMemSize=MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
+  CONCAT(GEN_SPGPU_HELL_NAME(TYPE_SYMBOL), _krn) 
+    <<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch,y, yPitch,
+							    alpha, cM, rP, hackSize, hackOffsets, rS, rows,
+							    x, xPitch, beta, baseIndex);
+}
+
+void
+GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
+     (spgpuHandle_t handle,
+        int count, 
+        VALUE_TYPE* z,
+        int zPitch,
+	const VALUE_TYPE *y,
+        int  yPitch,
+	VALUE_TYPE alpha, 
+	const VALUE_TYPE* cM, 
+	const int* rP, 
+	int hackSize,
+	const __device int* hackOffsets, 
+	const __device int* rS,
+	const __device int* rIdx, 
+	int rows, 
+	const VALUE_TYPE *x,
+        int xPitch,
+	VALUE_TYPE beta, 
+      int baseIndex)
+{
+  VALUE_TYPE *px,*py, *pz;
+  int cnt;
+  int maxNForACall = max(handle->maxGridSizeX, THREAD_BLOCK*handle->maxGridSizeX);
+  
+  // maxNForACall should be a multiple of hackSize
+  maxNForACall = (maxNForACall/hackSize)*hackSize;
+  //fprintf(stderr,"Entering kernel  %d maxNForACall\n",maxNForACall);
+  while (rows > maxNForACall) {//managing large vectors
+    cnt = count;
+    px = (VALUE_TYPE *) x;
+    py = (VALUE_TYPE *) y;
+    pz = (VALUE_TYPE *) z;	  
+    while (cnt > MMBSZ) {
+      //fprintf(stderr,"counts %d %d %d :  pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);    
+      CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch,
+							  py, yPitch,
+							  alpha, cM, rP,
+							  hackSize, hackOffsets,
+							  rS, rIdx,
+							  maxNForACall,
+							  px, xPitch, beta, baseIndex);
+      px += xPitch*MMBSZ;
+      py += yPitch*MMBSZ;
+      pz += zPitch*MMBSZ;
+      cnt -= MMBSZ;
+    }
+    if (cnt >0) {
+      CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
+							  py, yPitch,
+							  alpha, cM, rP,
+							  hackSize, hackOffsets,
+							  rS, rIdx,
+							  maxNForACall,
+							  px, xPitch, beta, baseIndex);
+    }
+
+    y = y + maxNForACall;
+    z = z + maxNForACall;
+    hackOffsets = hackOffsets + maxNForACall/hackSize;
+    rS = rS + maxNForACall;
+    
+    rows -= maxNForACall;
+  }
+  cnt = count;
+  px = (VALUE_TYPE *) x;
+  py = (VALUE_TYPE *) y;
+  pz = (VALUE_TYPE *) z;	  
+  while (cnt > MMBSZ) {
+    //fprintf(stderr,"counts %d %d %d :  pointers: %p %p %p\n",rows,cnt,MMBSZ,px,py,pz);    
+    CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, MMBSZ, pz, zPitch, py, yPitch,
+							alpha, cM, rP, hackSize, hackOffsets,
+							rS, rIdx, rows,
+							px, xPitch, beta, baseIndex);
+    px += xPitch*MMBSZ;
+    py += yPitch*MMBSZ;
+    pz += zPitch*MMBSZ;
+    cnt -= MMBSZ;
+  }
+  if (cnt >0) {
+    CONCAT(_,GEN_SPGPU_HELL_NAME_VANILLA(TYPE_SYMBOL)) (handle, cnt, pz, zPitch,
+							py, yPitch,
+							alpha, cM, rP,
+							hackSize, hackOffsets,
+							rS, rIdx,
+							rows,
+							px, xPitch, beta, baseIndex);
+  }
+  
+  
+  cudaCheckError("CUDA error on hell_spmm");
+}
+
+#endif
diff --git a/cuda/spgpu/kernels/hell_spmv_base.cuh b/cuda/spgpu/kernels/hell_spmv_base.cuh
index ca074d33..b2f9b053 100644
--- a/cuda/spgpu/kernels/hell_spmv_base.cuh
+++ b/cuda/spgpu/kernels/hell_spmv_base.cuh
@@ -14,7 +14,6 @@
  * GNU General Public License for more details.
  */
 
-
 #define PRE_CONCAT(A, B) A ## B
 #define CONCAT(A, B) PRE_CONCAT(A, B)
 
@@ -156,4 +155,3 @@ GEN_SPGPU_HELL_NAME(TYPE_SYMBOL)
 	
 	cudaCheckError("CUDA error on hell_spmv");
 }
-
diff --git a/cuda/vectordev.c b/cuda/vectordev.c
index 22039b5c..877be2d2 100644
--- a/cuda/vectordev.c
+++ b/cuda/vectordev.c
@@ -38,8 +38,10 @@
 #include "core.h"
 
 //new
-MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count, unsigned int size, 
-						   unsigned int elementType)
+MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count,
+						   unsigned int size, 
+						   unsigned int elementType,
+						   unsigned int storage2d)
 {
   struct MultiVectorDeviceParams params;
 
@@ -75,6 +77,7 @@ MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count, unsigned
 	
   params.count = count;
   params.size = size;
+  params.storage2d = storage2d ;
 
   return params;
 
@@ -87,8 +90,9 @@ int allocMultiVecDevice(void ** remoteMultiVec, struct MultiVectorDeviceParams *
   
   struct MultiVectDevice *tmp = (struct MultiVectDevice *)malloc(sizeof(struct MultiVectDevice));
   *remoteMultiVec = (void *)tmp;
-  tmp->size_ = params->size;
-  tmp->count_ = params->count;
+  tmp->size_      = params->size;
+  tmp->count_     = params->count;
+  tmp->storage2d_ = params->storage2d;
 
   if (params->elementType == SPGPU_TYPE_INT)
     {
@@ -163,9 +167,9 @@ int FallocMultiVecDevice(void** deviceMultiVec, unsigned int count,
 { int i;
   struct MultiVectorDeviceParams p;
 
-  p = getMultiVectorDeviceParams(count, size, elementType);
+  p = getMultiVectorDeviceParams(count, size, elementType, PSB_VECT_STOR_BY_COLS);
   i = allocMultiVecDevice(deviceMultiVec, &p);
-  //fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.size);
+  //fprintf(stderr,"From ALLOC: %d %d \n", p.pitch, p.count);
   //cudaSync();
   if (i != 0) {
     fprintf(stderr,"From routine : %s : %d, %d %d \n","FallocMultiVecDevice",i, count, size);
@@ -194,3 +198,9 @@ int getMultiVecDevicePitch(void* deviceVec)
   return(i);
 }
 
+int getMultiVecDeviceStorage2d(void* deviceVec)
+{ int i;
+  struct MultiVectDevice *dev = (struct MultiVectDevice *) deviceVec;
+  i = dev->storage2d_;
+  return(i);
+}
diff --git a/cuda/vectordev.h b/cuda/vectordev.h
index 6d5a16b0..80fc1678 100644
--- a/cuda/vectordev.h
+++ b/cuda/vectordev.h
@@ -37,8 +37,14 @@
 #include "cintrf.h"
 #include <complex.h>
 
+#define PSB_VECT_STOR_BY_COLS  0
+#define PSB_VECT_STOR_BY_ROWS  1
+
 struct MultiVectDevice
 {
+  // storage of 2d data
+  int storage2d_;
+  
   // number of vectors
   int count_;
 
@@ -54,17 +60,19 @@ struct MultiVectDevice
 
 typedef struct MultiVectorDeviceParams
 {			
-	// number on vectors
-	unsigned int count; //1 for a simple vector
-
-	// The resulting allocation will be pitch*s*(size of the elementType)
-	unsigned int elementType;
-	
-	// Pitch (in number of elements)
-	unsigned int pitch;
-
-	// Size of a single vector (in number of elements).
-	unsigned int size; 
+  // storage of 2d data
+  unsigned int storage2d;
+  // number on vectors
+  unsigned int count; //1 for a simple vector
+  
+  // The resulting allocation will be pitch*s*(size of the elementType)
+  unsigned int elementType;
+  
+  // Pitch (in number of elements)
+  unsigned int pitch;
+  
+  // Size of a single vector (in number of elements).
+  unsigned int size; 
 } MultiVectorDeviceParams;
 
 
@@ -76,7 +84,8 @@ int unregisterMapped(void *);
 
 MultiVectorDeviceParams getMultiVectorDeviceParams(unsigned int count,
 						   unsigned int size,  
-						   unsigned int elementType);
+						   unsigned int elementType,
+						   unsigned int storage2d);
 
 int FallocMultiVecDevice(void** deviceMultiVec, unsigned count, 
 			 unsigned int size, unsigned int elementType);
@@ -85,3 +94,4 @@ int allocMultiVecDevice(void ** remoteMultiVec, struct MultiVectorDeviceParams *
 int getMultiVecDeviceSize(void* deviceVec);
 int getMultiVecDeviceCount(void* deviceVec);
 int getMultiVecDevicePitch(void* deviceVec);
+int getMultiVecDeviceStorage2d(void* deviceVec);
diff --git a/test/block_krylov/kernel/dpdegenmm.F90 b/test/block_krylov/kernel/dpdegenmm.F90
index cece5417..2b0db219 100644
--- a/test/block_krylov/kernel/dpdegenmm.F90
+++ b/test/block_krylov/kernel/dpdegenmm.F90
@@ -596,7 +596,7 @@ program pdegenmm
   ! solver parameters
   integer(psb_epk_)  :: amatsize, precsize, descsize, annz, nbytes
   real(psb_dpk_)     :: err, eps
-  integer, parameter :: ntests=50, ngpu=50, ncnv=20
+  integer, parameter :: ntests=50, ngpu=10, ncnv=20
   type(psb_d_coo_sparse_mat), target  :: acoo
   type(psb_d_csr_sparse_mat), target  :: acsr
   type(psb_d_ell_sparse_mat), target  :: aell
@@ -659,12 +659,12 @@ program pdegenmm
   !
   !  get parameters
   !
-  !call get_parms(ctxt,nrhs,acfmt,agfmt,idim,tnd)
-  nrhs=2
-  acfmt='CSR'
-  agfmt='HLG'
-  idim=2
-  tnd=.false.
+  call get_parms(ctxt,nrhs,acfmt,agfmt,idim,tnd)
+  !nrhs=8
+  !acfmt='CSR'
+  !agfmt='CSRG'
+  !idim=100
+  !tnd=.false.
   call psb_init_timers()
   !
   !  allocate and fill in the coefficient matrix and initial vectors
@@ -911,7 +911,7 @@ program pdegenmm
     write(psb_out_unit,'("Size of matrix:     ",i20)') nr
     write(psb_out_unit,'("Number of nonzeros: ",i20)') annz
     write(psb_out_unit,'("Memory occupation:  ",i20)') amatsize
-    flops  = ntests*(2.d0*annz)
+    flops  = ntests*(2.d0*annz)*nrhs
     tflops = flops
     gflops = flops * ngpu
     write(psb_out_unit,'("Storage type for A:    ",a)') a%get_fmt()
@@ -970,7 +970,7 @@ program pdegenmm
     write(psb_out_unit,*)
     write(psb_out_unit,'("MBYTES/S sust. effective bandwidth  (CPU)  : ",F20.3)') bdwdth
 #ifdef HAVE_CUDA
-    bdwdth = ngpu*ntests*nbytes/(gt2*1.d6)
+    bdwdth = nrhs*ngpu*ntests*nbytes/(gt2*1.d6)
     write(psb_out_unit,'("MBYTES/S sust. effective bandwidth  (GPU)  : ",F20.3)') bdwdth
     bdwdth = psb_cuda_MemoryPeakBandwidth()
     write(psb_out_unit,'("MBYTES/S peak bandwidth             (GPU)  : ",F20.3)') bdwdth
@@ -1053,4 +1053,4 @@ contains
 
   end subroutine get_parms
 
-end program pdegenmm
\ No newline at end of file
+end program pdegenmm