00 |
__shared__ inElem sharedLocalBin[/*max size*/]; |
01 |
outputIdx index = computeOutputIndex(blockIdx, threadIdx); |
02 |
outElem myOutElem = initOutElem(index); |
03 |
int zLo = z0 - cutoff; |
04 |
int zHi = z0 + blockDim.z + cutoff; |
05 |
// compute yLo, yHi, xLo, xHi similarly |
06 |
for z = [zLo:zHi]{ |
07 |
for y = [yLo:yHi]{ |
08 |
for x = [xLo:xHi]{ |
09 |
int start = binOffsetArray[z][y][x]; |
10 |
int end = binOffsetArray[z][y][x+1]; |
11 |
if(threadIdx < end-start){ |
12 |
sharedLocalBin[threadIdx] = globalBinArray[start+threadIdx]; |
13 |
} |
14 |
__syncthreads(); |
15 |
for i=[0:end-start]{ |
16 |
if(|sharedLocalBin[i].coords - myOutElem.coords| < kernel-width){ |
17 |
/*compute the contribution of this input onto the output*/ |
18 |
} } } } } |
19 |
globalOutputGrid[index] = myOutElem; |