Optimizing Parallel Reduction in CUDA
Mark Harris NVIDIA Developer Technology
http://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
Tuesday, September 11, 12
Optimizing Parallel Reduction in CUDA — Mark Harris, NVIDIA Developer Technology — PowerPoint PPT Presentation
Optimizing Parallel Reduction in CUDA Mark Harris NVIDIA Developer Technology http://developer.download.nvidia.com/assets/cuda/files/reduction.pdf Tuesday, September 11, 12 Parallel Reduction Tree-based approach used within each thread block
Tuesday, September 11, 12
3
4 7 5 9 11 14 25 3 1 7 4 1 6 3
Tuesday, September 11, 12
5
4 7 5 9 11 14 25 3 1 7 0 4 1 6 3 4 7 5 9 11 14 25 3 1 7 0 4 1 6 3 4 7 5 9 11 14 25 3 1 7 0 4 1 6 3 4 7 5 9 11 14 25 3 1 7 0 4 1 6 3 4 7 5 9 11 14 25 3 1 7 0 4 1 6 3 4 7 5 9 11 14 25 3 1 7 0 4 1 6 3 4 7 5 9 11 14 25 3 1 7 0 4 1 6 3 4 7 5 9 11 14 25 3 1 7 0 4 1 6 3 4 7 5 9 11 14 25 3 1 7 0 4 1 6 3
Level 0: 8 blocks Level 1: 1 block
Tuesday, September 11, 12
6
Tuesday, September 11, 12
10 1 8
3 5
2 7 11 2
Values (shared memory)
Tuesday, September 11, 12
8
10 1 8
3 5
2 7 11 2
Values (shared memory)
2 4 6 8 10 12 14
11 1 7
8 5
9 7 11 11 2 2
Values
4 8 12
18 1 7
6
8 5 4
9 7 13 11 2 2
Values
8
24 1 7
6
8 5 17
9 7 13 11 2 2
Values
41 1 7
6
8 5 17
9 7 13 11 2 2
Values Thread IDs Step 1 Stride 1 Step 2 Stride 2 Step 3 Stride 4 Step 4 Stride 8 Thread IDs Thread IDs Thread IDs
Tuesday, September 11, 12
7
// Reduction kernel #1: interleaved addressing with a divergent modulo branch.
// Produces one partial sum per block in g_odata[blockIdx.x]; launch with
// dynamic shared memory sized to blockDim.x * sizeof(int).
// NOTE(review): no bounds check on i — assumes gridDim.x*blockDim.x elements
// exist in g_idata; confirm against the host launch code.
__global__ void reduce0(int *g_idata, int *g_odata) {
    extern __shared__ int sdata[];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem
    // tid % (2*s) == 0 makes adjacent lanes in a warp take different paths
    // (highly divergent), and % is an expensive operation
    for(unsigned int s=1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();  // barrier kept outside the divergent branch — all threads reach it
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
Tuesday, September 11, 12
10
interleaved addressing with divergent branching
Note: Block Size = 128 threads for all tests Bandwidth Time (2^22 ints)
Jinx: Kernel ~ Bandwidth
Tuesday, September 11, 12
9
// Reduction kernel (slide comparison copy).
// NOTE(review): this body is byte-identical to reduce0 in the extracted text —
// presumably the "before" half of the before/after slide; the strided-index
// replacement loop appears on a later slide. Verify against the original deck.
// One partial sum per block; dynamic shared memory = blockDim.x * sizeof(int).
__global__ void reduce1(int *g_idata, int *g_odata) {
    extern __shared__ int sdata[];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem
    // the tid % (2*s) test causes divergent branching within each warp
    for (unsigned int s=1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();  // all threads reach the barrier — not inside the branch
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
Tuesday, September 11, 12
8
10 1 8
3 5
2 7 11 2
Values (shared memory)
2 4 6 8 10 12 14
11 1 7
8 5
9 7 11 11 2 2
Values
4 8 12
18 1 7
6
8 5 4
9 7 13 11 2 2
Values
8
24 1 7
6
8 5 17
9 7 13 11 2 2
Values
41 1 7
6
8 5 17
9 7 13 11 2 2
Values Thread IDs Step 1 Stride 1 Step 2 Stride 2 Step 3 Stride 4 Step 4 Stride 8 Thread IDs Thread IDs Thread IDs
Tuesday, September 11, 12
12
10 1 8
3 5
2 7 11 2
Values (shared memory)
1 2 3 4 5 6 7
11 1 7
8 5
9 7 11 11 2 2
Values
1 2 3
18 1 7
6
8 5 4
9 7 13 11 2 2
Values
1
24 1 7
6
8 5 17
9 7 13 11 2 2
Values
41 1 7
6
8 5 17
9 7 13 11 2 2
Values Thread IDs Step 1 Stride 1 Step 2 Stride 2 Step 3 Stride 4 Step 4 Stride 8 Thread IDs Thread IDs Thread IDs
Tuesday, September 11, 12
11
for (unsigned int s=1; s < blockDim.x; s *= 2) { if (tid % (2*s) == 0) { sdata[tid] += sdata[tid + s]; } __syncthreads(); } for (unsigned int s=1; s < blockDim.x; s *= 2) { int index = 2 * s * tid; if (index < blockDim.x) { sdata[index] += sdata[index + s]; } __syncthreads(); }
Tuesday, September 11, 12
13
interleaved addressing with divergent branching
interleaved addressing with bank conflicts
Step Speedup Bandwidth Time (2^22 ints) Cumulative Speedup
Jinx: Kernel ~ Bandwidth → Speedup
Tuesday, September 11, 12
12
10 1 8
3 5
2 7 11 2
Values (shared memory)
1 2 3 4 5 6 7
11 1 7
8 5
9 7 11 11 2 2
Values
1 2 3
18 1 7
6
8 5 4
9 7 13 11 2 2
Values
1
24 1 7
6
8 5 17
9 7 13 11 2 2
Values
41 1 7
6
8 5 17
9 7 13 11 2 2
Values Thread IDs Step 1 Stride 1 Step 2 Stride 2 Step 3 Stride 4 Step 4 Stride 8 Thread IDs Thread IDs Thread IDs
Tuesday, September 11, 12
1 2 3 4 5 6 7 8 9
Tuesday, September 11, 12
14
10 1 8
3 5
2 7 11 2
Values (shared memory)
1 2 3 4 5 6 7
8
10 6 9 3 7
2 7 11 2
Values
1 2 3
8 7 13 13 9 3 7
2 7 11 2
Values
1
21 20 13 13 9 3 7
2 7 11 2
Values
41 20 13 13 9 3 7
2 7 11 2
Values Thread IDs Step 1 Stride 8 Step 2 Stride 4 Step 3 Stride 2 Step 4 Stride 1 Thread IDs Thread IDs Thread IDs
Tuesday, September 11, 12
15
for (unsigned int s=1; s < blockDim.x; s *= 2) { int index = 2 * s * tid; if (index < blockDim.x) { sdata[index] += sdata[index + s]; } __syncthreads(); } for (unsigned int s=blockDim.x/2; s>0; s>>=1) { if (tid < s) { sdata[tid] += sdata[tid + s]; } __syncthreads(); }
Tuesday, September 11, 12
16
interleaved addressing with divergent branching
interleaved addressing with bank conflicts
sequential addressing
Step Speedup Bandwidth Time (2^22 ints) Cumulative Speedup
Jinx: Kernel ~ Bandwidth → Speedup (Cumulative speedup)
Tuesday, September 11, 12
17
for (unsigned int s=blockDim.x/2; s>0; s>>=1) { if (tid < s) { sdata[tid] += sdata[tid + s]; } __syncthreads(); }
Tuesday, September 11, 12
18
// each thread loads one element from global to shared mem unsigned int tid = threadIdx.x; unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; sdata[tid] = g_idata[i]; __syncthreads(); // perform first level of reduction, // reading from global memory, writing to shared memory unsigned int tid = threadIdx.x; unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x; sdata[tid] = g_idata[i] + g_idata[i+blockDim.x]; __syncthreads();
Tuesday, September 11, 12
19
interleaved addressing with divergent branching
interleaved addressing with bank conflicts
sequential addressing
first add during global load
Step Speedup Bandwidth Time (2^22 ints) Cumulative Speedup
Jinx: Kernel ~ Bandwidth → Speedup (Cumulative speedup)
Tuesday, September 11, 12
20
Tuesday, September 11, 12
Tuesday, September 11, 12
21
Tuesday, September 11, 12
// Unroll the last 6 iterations of the reduction: once the active region
// is <= 32 elements, only one warp does work, so the loop and branch
// overhead can be dropped.
// "volatile" forces each += to re-read sdata from shared memory rather
// than caching partial sums in registers (see the note on the next slide).
// NOTE(review): this relies on implicit warp-synchronous execution —
// valid on the pre-Volta GPUs of this 2012 deck; Volta+ independent
// thread scheduling requires __syncwarp() between the steps. Confirm target arch.
__device__ void warpReduce(volatile int* sdata, int tid) {
    sdata[tid] += sdata[tid + 32];
    sdata[tid] += sdata[tid + 16];
    sdata[tid] += sdata[tid + 8];
    sdata[tid] += sdata[tid + 4];
    sdata[tid] += sdata[tid + 2];
    sdata[tid] += sdata[tid + 1];
}
// later...
// the main loop now stops at s == 32; the final warp finishes unrolled
for (unsigned int s=blockDim.x/2; s>32; s>>=1) {
    if (tid < s)
        sdata[tid] += sdata[tid + s];
    __syncthreads();
}
if (tid < 32) warpReduce(sdata, tid);
22
Without unrolling, all warps execute every iteration of the for loop and if statement. IMPORTANT: For this to be correct, we must use the "volatile" keyword!
Tuesday, September 11, 12
23
interleaved addressing with divergent branching
interleaved addressing with bank conflicts
sequential addressing
first add during global load
unroll last warp
Step Speedup Bandwidth Time (2^22 ints) Cumulative Speedup
Jinx: Kernel ~ Bandwidth → Speedup (Cumulative speedup)
…
Tuesday, September 11, 12
24
Tuesday, September 11, 12
28
interleaved addressing with divergent branching
interleaved addressing with bank conflicts
sequential addressing
first add during global load
unroll last warp
completely unrolled
Step Speedup Bandwidth Time (2^22 ints) Cumulative Speedup
…
Tuesday, September 11, 12
Tuesday, September 11, 12
Tuesday, September 11, 12
Tuesday, September 11, 12
Tuesday, September 11, 12
Tuesday, September 11, 12
32
unsigned int tid = threadIdx.x; unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x; sdata[tid] = g_idata[i] + g_idata[i+blockDim.x]; __syncthreads();
unsigned int tid = threadIdx.x; unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x; unsigned int gridSize = blockSize*2*gridDim.x; sdata[tid] = 0; while (i < n) { sdata[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; } __syncthreads();
Tuesday, September 11, 12
33
unsigned int tid = threadIdx.x; unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x; sdata[tid] = g_idata[i] + g_idata[i+blockDim.x]; __syncthreads();
unsigned int tid = threadIdx.x; unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x; unsigned int gridSize = blockSize*2*gridDim.x; sdata[tid] = 0; while (i < n) { sdata[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; } __syncthreads();
Tuesday, September 11, 12
34
interleaved addressing with divergent branching
interleaved addressing with bank conflicts
sequential addressing
first add during global load
unroll last warp
completely unrolled
multiple elements per thread
Step Speedup Bandwidth Time (2^22 ints) Cumulative Speedup
Tuesday, September 11, 12
34
interleaved addressing with divergent branching
interleaved addressing with bank conflicts
sequential addressing
first add during global load
unroll last warp
completely unrolled
multiple elements per thread
Step Speedup Bandwidth Time (2^22 ints) Cumulative Speedup
…
Tuesday, September 11, 12
35
// Final optimized reduction: completely unrolled (blockSize is a template
// parameter, so every `if (blockSize >= ...)` is resolved at compile time)
// with multiple elements per thread via a grid-stride-style accumulation loop.
// One partial sum per block in g_odata[blockIdx.x]; launch with dynamic
// shared memory sized to blockSize * sizeof(int).
template <unsigned int blockSize>
__device__ void warpReduce(volatile int *sdata, unsigned int tid) {
    // volatile forces each partial sum to be re-read from shared memory;
    // NOTE(review): implicit warp synchrony assumed (pre-Volta era code) —
    // Volta+ needs __syncwarp() between steps. Confirm target architecture.
    if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
    if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
    if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
    if (blockSize >= 8)  sdata[tid] += sdata[tid + 4];
    if (blockSize >= 4)  sdata[tid] += sdata[tid + 2];
    if (blockSize >= 2)  sdata[tid] += sdata[tid + 1];
}

template <unsigned int blockSize>
__global__ void reduce6(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockSize*2) + tid;
    unsigned int gridSize = blockSize*2*gridDim.x;
    sdata[tid] = 0;
    // Each thread sums many elements before the tree reduction
    // (algorithm cascading keeps all of the work O(n) while the
    // tree step stays O(log n)).
    while (i < n) {
        sdata[tid] += g_idata[i];
        // BUGFIX: guard the second load — i+blockSize can run past the
        // end of g_idata when n is not a multiple of blockSize*2.
        if (i + blockSize < n)
            sdata[tid] += g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();
    // Fully unrolled tree reduction; dead branches are compiled away.
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid < 64)  { sdata[tid] += sdata[tid + 64];  } __syncthreads(); }
    // BUGFIX: blockSize cannot be deduced from the call arguments, so the
    // template argument must be passed explicitly (the original
    // `warpReduce(sdata, tid)` does not compile against the template above).
    if (tid < 32) warpReduce<blockSize>(sdata, tid);
    // write this block's result to global memory
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
Tuesday, September 11, 12
37
Tuesday, September 11, 12