Author manuscript; available in PMC: 2014 May 1.

Published in final edited form as: J Parallel Distrib Comput. 2013 Jan 16;73(5):10.1016/j.jpdc.2013.01.001. doi: 10.1016/j.jpdc.2013.01.001

Algorithm 2.

Pseudo code for parallel gridding operation on GPU with the compact binning.

00	__shared__ inElem sharedLocalBin[/*max size*/];
01	outputIdx index = computeOutputIndex(blockIdx, threadIdx);
02	outElem myOutElem = initOutElem(index);
03	int zLo = z0 - cutoff;
04	int zHi = z0 + blockDim.z + cutoff;
05	// compute yLo, yHi, xLo, xHi similarly
06	for z = [zLo:zHi]{
07	for y = [yLo:yHi]{
08	for x = [xLo:xHi]{
09	int start = binOffsetArray[z][y][x];
10	int end = binOffsetArray[z][y][x+1];
11	if(threadIdx < end-start){
12	sharedLocalBin[threadIdx] = globalBinArray[start+threadIdx];
13	}
14	__syncthreads();
15	for i=[0:end-start]{
16	if(|sharedLocalBin[i].coords - myOutElem.coords| < kernel-width){
17	/* compute the contribution of this input onto the output */
18	} } } } }
19	globalOutputGrid[index] = myOutElem;