/*
 * spGPU - Sparse matrices on GPU library.
 *
 * Copyright (C) 2010 - 2015
 *     Davide Barbieri - University of Rome Tor Vergata
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 3 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

#define THREAD_BLOCK 128
#define MMBSZ 8

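/*
 * Computes, for each of the count right-hand sides,
 *     z = alpha * A * x + beta * y
 * where A is stored in HDIA (hacked diagonal) format: rows are grouped
 * into "hacks" of hackSize rows; hackOffsets[h]..hackOffsets[h+1]-1
 * index the diagonals of hack h inside hdiaOffsets and cM; each diagonal
 * contributes hackSize consecutive values to cM (one per row of the hack),
 * and the element of global row i on diagonal d lies on column
 * hdiaOffsets[d] + i.
 *
 * THREAD_BLOCK is the number of threads (one row each) per block; MMBSZ
 * bounds the number of right-hand sides handled in one launch, since the
 * per-thread accumulators form a static temp[MMBSZ][THREAD_BLOCK] array:
 * callers must pass count <= MMBSZ.
 */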
__global__ void
CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)
(int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y,
 int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* hdiaOffsets,
 int hackSize, const int* hackOffsets, int rows, int cols,
 const VALUE_TYPE *x, int xPitch, VALUE_TYPE beta)
{
	VALUE_TYPE *pz, *px, *py;
	VALUE_TYPE yVal;

	// Per-thread accumulators, one row per right-hand side (count <= MMBSZ).
	__shared__ VALUE_TYPE temp[MMBSZ][THREAD_BLOCK];

	int hackCount = (rows + hackSize - 1)/hackSize;

	// Global row index: one thread per row.
	int i = threadIdx.x + blockIdx.x * (THREAD_BLOCK);

	int hackId = i / hackSize;
	int hackLaneId = i % hackSize;

	// Dynamically sized shared memory, used below as a per-warp scratch
	// area (offsetsChunk) for the current chunk of diagonal offsets.
	extern __shared__ int dynShrMem[];

	int hackOffset = 0;
	int nextOffset = 0;

	unsigned int laneId = threadIdx.x % warpSize;
	unsigned int warpId = threadIdx.x / warpSize;
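
	// Lane 0 reads this hack's diagonal range and broadcasts it below; the
	// broadcast takes lane 0's hack boundaries for the whole warp, so a warp
	// must never straddle a hack boundary, i.e. hackSize is assumed to be a
	// multiple of warpSize.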
	if (laneId == 0 && i < rows) {
		hackOffset = hackOffsets[hackId];
		nextOffset = hackOffsets[hackId+1];
	}

	hackOffset = __shfl_sync(0xFFFFFFFF, hackOffset, 0);
	nextOffset = __shfl_sync(0xFFFFFFFF, nextOffset, 0);

	if (hackId >= hackCount)
		return;

	// Point cM at this row's first coefficient within the hack, and
	// hdiaOffsets at the hack's first diagonal offset.
	cM += hackOffset*hackSize + hackLaneId;
	hdiaOffsets += hackOffset;
for (int k=0; k<count; k++) {
|
|
temp[k][threadIdx.x] = CONCAT(zero_,VALUE_TYPE)();
|
|
}
|
|
|
|
// diags for this hack is next hackOffset minus current hackOffset
|
|
int diags = nextOffset - hackOffset;
|
|
|
|
// Warp oriented
|
|
int rounds = (diags + warpSize - 1)/warpSize;
|
|
|
|
volatile int *offsetsChunk = dynShrMem + warpId*warpSize;
|
|
|
|
for (int r = 0; r < rounds; r++) {
|
|
// in the last round diags will be <= warpSize
|
|
if (laneId < diags)
|
|
offsetsChunk[laneId] = hdiaOffsets[laneId];
|
|
|
|
if (i < rows) {
|
|
int dCount = min(diags, warpSize);
|
|
|
|
for (int j = 0; j < dCount; ++j) {
|
|
int column = offsetsChunk[j] + i;
|
|
|
|
if(column >= 0 && column < cols) {
|
|
px = (VALUE_TYPE *) x;
|
|
for (int k = 0; k < count; k++) {
|
|
VALUE_TYPE xValue = px[column];
|
|
VALUE_TYPE mValue = cM[0];
|
|
temp[k][threadIdx.x] = CONCAT(VALUE_TYPE, _fma)(mValue, xValue, temp[k][threadIdx.x]);
|
|
px = px + xPitch;
|
|
}
|
|
}
|
|
cM += hackSize;
|
|
}
|
|
}
|
|
diags -= warpSize;
|
|
hdiaOffsets += warpSize;
|
|
}
|
|
|
|

	// Since z and y are accessed with the same offset by the same thread,
	// and the write to z follows the y read, y and z can share the same
	// base address (in-place computing).
	if (i >= rows)
		return;

	py = (VALUE_TYPE *) y;
	pz = z;
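
	// Write-back: z = alpha*(A*x) + beta*y for each right-hand side; when
	// beta is zero, y is never read, so it may be left uninitialized.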
	if (CONCAT(VALUE_TYPE, _isNotZero(beta)))
		for (int k=0; k<count; k++) {
			yVal = py[i];
			pz[i] = CONCAT(VALUE_TYPE, _fma)(beta, yVal, CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]));
			py += yPitch;
			pz += zPitch;
		}
	else
		for (int k=0; k<count; k++) {
			pz[i] = CONCAT(VALUE_TYPE, _mul)(alpha, temp[k][threadIdx.x]);
			pz += zPitch;
		}
}

void
CONCAT(_,GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL))
(spgpuHandle_t handle, int count, VALUE_TYPE* z, int zPitch, const VALUE_TYPE *y,
 int yPitch, VALUE_TYPE alpha, const VALUE_TYPE* cM, const int* hdiaOffsets, int hackSize,
 const __device int* hackOffsets, int rows, int cols, const VALUE_TYPE *x,
 int xPitch, VALUE_TYPE beta)
{
	dim3 block (THREAD_BLOCK, 1);
	dim3 grid ((rows + THREAD_BLOCK - 1) / THREAD_BLOCK);
	// Should we generalize the code to 1/2/4/8 threads per row?
	// And maybe adjust THREAD_BLOCK size?
	int shrMemSize, maxShmemSz;
	maxShmemSz = getGPUSharedMemPerBlock();
	// This request matches the size of the kernel's static temp accumulator;
	// the extern dynShrMem scratch only needs THREAD_BLOCK ints per block,
	// so the allocation is conservative.
	shrMemSize = MMBSZ*THREAD_BLOCK*sizeof(VALUE_TYPE);
	// Guard: the dynamic shared memory request must fit the device limit,
	// otherwise the kernel launch would fail.
	if (shrMemSize > maxShmemSz)
		return;
	CONCAT(GEN_SPGPU_HDIA_NAME(TYPE_SYMBOL), _krn)
		<<< grid, block, shrMemSize, handle->currentStream >>> (count, z, zPitch, y, yPitch,
			alpha, cM, hdiaOffsets, hackSize, hackOffsets, rows, cols,
			x, xPitch, beta);
}
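
/*
 * Illustrative expansion (an assumption, not part of this file): the
 * template is compiled once per value type, with VALUE_TYPE, TYPE_SYMBOL,
 * CONCAT and GEN_SPGPU_HDIA_NAME supplied by the including translation
 * unit. For TYPE_SYMBOL = D and VALUE_TYPE = double, the two definitions
 * above would expand along the lines of
 *
 *     __global__ void spgpuDhdiaspmm_krn(int count, double* z, int zPitch, ...);
 *     void _spgpuDhdiaspmm(spgpuHandle_t handle, int count, double* z, ...);
 *
 * with the exact names determined by GEN_SPGPU_HDIA_NAME.
 */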