Memory Access Patterns: The Missing Piece of the Multi-GPU Puzzle
Tal Ben-Nun, Ely Levy, Amnon Barak and Eri Rubin
The Hebrew University of Jerusalem, Israel Supercomputing ‘15, November 2015
Memory Access Patterns: The Missing Piece of the Multi-GPU Puzzle - - PowerPoint PPT Presentation
Memory Access Patterns: The Missing Piece of the Multi-GPU Puzzle Tal Ben-Nun , Ely Levy, Amnon Barak and Eri Rubin The Hebrew University of Jerusalem, Israel Supercomputing 15, November 2015 Introduction Developing efficient parallel
Tal Ben-Nun, Ely Levy, Amnon Barak and Eri Rubin
The Hebrew University of Jerusalem, Israel Supercomputing ‘15, November 2015
PCI-Express
GPU 2 GPU 1 GPU 4 GPU 3
Input Output
Input Output
Input Output
Input Output
Input Output
Input Output
Input Output
Input Output
Input Output
// Load data to shared memory smem[tidy * BW + tidx] = M[(bidy * BH + tidy) * STRIDE + (bidx * BW + tidx)]; // Wrap coords // ... __syncthreads(); neighbors = 0; current_gen = smem[tidy * BW + tidx]; for (int ly = -1; ly <= 1; ++ly) { for (int lx = -1; lx <= 1; ++lx) { if (lx == 0 && ly == 0) continue; neighbors += smem[(ly + tidy) * BW + (lx + tidx)]; } }
(bidx * BW + tidx)] = ...; Window Width Block Height Wrapped Boundaries
Single GPU
Boundary Exchanges Multi-GPU
M1(0,0) = M[STRIDE] M1(x,-1) = M4(x, HEIGHT/4) = = M[(HEIGHT-1)*STRIDE+x] M3(0,0) = M2(0,HEIGHT/4+1) = = M[((HEIGHT/4)*2+1)*STRIDE] Multi-GPU
Multi-GPU
Each thread requires an entire dimension of a buffer Matrix multiplication, Exact N-body simulation, Matrix transposition
Each thread-block requires a spatially-local N-dimensional window ND convolution, Jacobi method, Stencil operators
Sporadic access of a dense data structure with a fixed pattern SpMV, Cloth simulation
Thread operates on neighbors of a vertex Barnes-Hut N-body algorithm
Thread-block operates on a permutation of the
Fast Fourier transform
Patterns that cannot be determined in advance Finite state machines
n O(n) n m < n n Unpredictable Structured Injective Unstructured Injective Reductive Static Reductive Dynamic Irregular
Scheduler
GPU 1 GPU 2 GPU n
Segmenters Allocator Segment Location Monitor Memory Analyzer
Invoker Thread Invoker Thread Invoker Thread
Task
Kernel Output Container
Datum Access Pattern Aggregator
Input Container
Datum Access Pattern
Input Container
Datum Access Pattern
Input Container
Datum Access Pattern
Output Container
Datum Access Pattern Aggregator
Output Container
Datum Access Pattern Aggregator
Dimensions Constants
Thread Warp Block Device Multi-Device
Multiple Device Abstraction
Input Controller
Global Memory 1
Output Controller Iterators Iterators Iterators Shared Container Iterators Iterators Iterators Device-level Aggregator Input Controller
Global Memory N
Output Controller Iterators Iterators Iterators Shared Container Iterators Iterators Iterators Device-level Aggregator
Host-Level Infrastructure Device-Level Infrastructure
Scheduler sched; typedef Window2D<T,1,WRAP,ILPX,ILPY> Win2D; typedef StructuredInjective<T,2,ILPX,ILPY> SMat; // Define data structures to be used Matrix<T> A (width, height), B (width, height); // Use existing host buffers as matrices A.Bind(hbuffer_A); B.Bind(hbuffer_B); // Analyze memory access patterns for allocation sched.AnalyzeCall(Win2D(A), SMat(B)); sched.AnalyzeCall(Win2D(B), SMat(A)); // Invoke the kernels for (int i = 0; i < iterations; ++i) sched.Invoke(GameOfLifeTick, Win2D((i % 2) ? B : A), SMat((i % 2) ? A : B)); // Gather processed data back to host if ((iterations % 2) == 0) sched.Gather(A); else sched.Gather(B); template <typename T, int ILPX, int ILPY> __global__ void GameOfLifeTick MAPS_MULTIDEF( Window2D<T,1,WRAP,ILPX,ILPY> current_gen, StructuredInjective<T,2,ILPX,ILPY> next_gen) { MAPS_MULTI_INIT(current_gen, next_gen); #pragma unroll MAPS_FOREACH(nextgen_iter, next_gen) { int live_neighbors = 0, is_live = 0; #pragma unroll MAPS_FOREACH_ALIGNED(iter, current_gen, nextgen_iter) { // Set variables according to the rules if (iter.index() == 4) is_live = *iter; else live_neighbors += *iter; } int result = GameOfLifeConditions(...); *nextgen_iter = result; } next_gen.commit(); }
Scheduler sched; typedef Window2D<T,1,WRAP,ILPX,ILPY> Win2D; typedef StructuredInjective<T,2,ILPX,ILPY> SMat; // Define data structures to be used Matrix<T> A (width, height), B (width, height); // Use existing host buffers as matrices A.Bind(hbuffer_A); B.Bind(hbuffer_B); // Analyze memory access patterns for allocation sched.AnalyzeCall(Win2D(A), SMat(B)); sched.AnalyzeCall(Win2D(B), SMat(A)); // Invoke the kernels for (int i = 0; i < iterations; ++i) sched.Invoke(GameOfLifeTick, Win2D((i % 2) ? B : A), SMat((i % 2) ? A : B)); // Gather processed data back to host if ((iterations % 2) == 0) sched.Gather(A); else sched.Gather(B);
Scheduler sched; typedef Window2D<T,1,WRAP,ILPX,ILPY> Win2D; typedef StructuredInjective<T,2,ILPX,ILPY> SMat; // Define data structures to be used Matrix<T> A (width, height), B (width, height); // Use existing host buffers as matrices A.Bind(hbuffer_A); B.Bind(hbuffer_B); // Analyze memory access patterns for allocation sched.AnalyzeCall(Win2D(A), SMat(B)); sched.AnalyzeCall(Win2D(B), SMat(A)); // Invoke the kernels for (int i = 0; i < iterations; ++i) sched.Invoke(GameOfLifeTick, Win2D((i % 2) ? B : A), SMat((i % 2) ? A : B)); // Gather processed data back to host if ((iterations % 2) == 0) sched.Gather(A); else sched.Gather(B);
GPU 1 GPU 2 GPU 3 GPU 4
Scheduler sched; typedef Window2D<T,1,WRAP,ILPX,ILPY> Win2D; typedef StructuredInjective<T,2,ILPX,ILPY> SMat; // Define data structures to be used Matrix<T> A (width, height), B (width, height); // Use existing host buffers as matrices A.Bind(hbuffer_A); B.Bind(hbuffer_B); // Analyze memory access patterns for allocation sched.AnalyzeCall(Win2D(A), SMat(B)); sched.AnalyzeCall(Win2D(B), SMat(A)); // Invoke the kernels for (int i = 0; i < iterations; ++i) sched.Invoke(GameOfLifeTick, Win2D((i % 2) ? B : A), SMat((i % 2) ? A : B)); // Gather processed data back to host if ((iterations % 2) == 0) sched.Gather(A); else sched.Gather(B);
A B
GPU 1 GPU 2 GPU 3 GPU 4
Scheduler sched; typedef Window2D<T,1,WRAP,ILPX,ILPY> Win2D; typedef StructuredInjective<T,2,ILPX,ILPY> SMat; // Define data structures to be used Matrix<T> A (width, height), B (width, height); // Use existing host buffers as matrices A.Bind(hbuffer_A); B.Bind(hbuffer_B); // Analyze memory access patterns for allocation sched.AnalyzeCall(Win2D(A), SMat(B)); sched.AnalyzeCall(Win2D(B), SMat(A)); // Invoke the kernels for (int i = 0; i < iterations; ++i) sched.Invoke(GameOfLifeTick, Win2D((i % 2) ? B : A), SMat((i % 2) ? A : B)); // Gather processed data back to host if ((iterations % 2) == 0) sched.Gather(A); else sched.Gather(B);
hbuffer_A hbuffer_B
GPU 1 GPU 2 GPU 3 GPU 4
A B
hbuffer_A hbuffer_B
GPU 1 GPU 2 GPU 3 GPU 4
A B
GPU 1 GPU 2 GPU 3 GPU 4 GPU 1 GPU 2 GPU 3 GPU 4
Scheduler sched; typedef Window2D<T,1,WRAP,ILPX,ILPY> Win2D; typedef StructuredInjective<T,2,ILPX,ILPY> SMat; // Define data structures to be used Matrix<T> A (width, height), B (width, height); // Use existing host buffers as matrices A.Bind(hbuffer_A); B.Bind(hbuffer_B); // Analyze memory access patterns for allocation sched.AnalyzeCall(Win2D(A), SMat(B)); sched.AnalyzeCall(Win2D(B), SMat(A)); // Invoke the kernels for (int i = 0; i < iterations; ++i) sched.Invoke(GameOfLifeTick, Win2D((i % 2) ? B : A), SMat((i % 2) ? A : B)); // Gather processed data back to host if ((iterations % 2) == 0) sched.Gather(A); else sched.Gather(B);
hbuffer_A hbuffer_B
GPU 1 GPU 2 GPU 3 GPU 4
A B
GPU 1 GPU 2 GPU 3 GPU 4
Scheduler sched; typedef Window2D<T,1,WRAP,ILPX,ILPY> Win2D; typedef StructuredInjective<T,2,ILPX,ILPY> SMat; // Define data structures to be used Matrix<T> A (width, height), B (width, height); // Use existing host buffers as matrices A.Bind(hbuffer_A); B.Bind(hbuffer_B); // Analyze memory access patterns for allocation sched.AnalyzeCall(Win2D(A), SMat(B)); sched.AnalyzeCall(Win2D(B), SMat(A)); // Invoke the kernels for (int i = 0; i < iterations; ++i) sched.Invoke(GameOfLifeTick, Win2D((i % 2) ? B : A), SMat((i % 2) ? A : B)); // Gather processed data back to host if ((iterations % 2) == 0) sched.Gather(A); else sched.Gather(B);
GPU 1 GPU 2 GPU 3 GPU 4
Scheduler sched; typedef Window2D<T,1,WRAP,ILPX,ILPY> Win2D; typedef StructuredInjective<T,2,ILPX,ILPY> SMat; // Define data structures to be used Matrix<T> A (width, height), B (width, height); // Use existing host buffers as matrices A.Bind(hbuffer_A); B.Bind(hbuffer_B); // Analyze memory access patterns for allocation sched.AnalyzeCall(Win2D(A), SMat(B)); sched.AnalyzeCall(Win2D(B), SMat(A)); // Invoke the kernels for (int i = 0; i < iterations; ++i) sched.Invoke(GameOfLifeTick, Win2D((i % 2) ? B : A), SMat((i % 2) ? A : B)); // Gather processed data back to host if ((iterations % 2) == 0) sched.Gather(A); else sched.Gather(B);
GPU 1 GPU 2 GPU 3 GPU 4
hbuffer_A hbuffer_B A B A B A B A B A B
Scheduler sched; typedef Window2D<T,1,WRAP,ILPX,ILPY> Win2D; typedef StructuredInjective<T,2,ILPX,ILPY> SMat; // Define data structures to be used Matrix<T> A (width, height), B (width, height); // Use existing host buffers as matrices A.Bind(hbuffer_A); B.Bind(hbuffer_B); // Analyze memory access patterns for allocation sched.AnalyzeCall(Win2D(A), SMat(B)); sched.AnalyzeCall(Win2D(B), SMat(A)); // Invoke the kernels for (int i = 0; i < iterations; ++i) sched.Invoke(GameOfLifeTick, Win2D((i % 2) ? B : A), SMat((i % 2) ? A : B)); // Gather processed data back to host if ((iterations % 2) == 0) sched.Gather(A); else sched.Gather(B);
GPU 1 GPU 2 GPU 3 GPU 4
hbuffer_A hbuffer_B A B A B A B A B A B
Scheduler sched; typedef Window2D<T,1,WRAP,ILPX,ILPY> Win2D; typedef StructuredInjective<T,2,ILPX,ILPY> SMat; // Define data structures to be used Matrix<T> A (width, height), B (width, height); // Use existing host buffers as matrices A.Bind(hbuffer_A); B.Bind(hbuffer_B); // Analyze memory access patterns for allocation sched.AnalyzeCall(Win2D(A), SMat(B)); sched.AnalyzeCall(Win2D(B), SMat(A)); // Invoke the kernels for (int i = 0; i < iterations; ++i) sched.Invoke(GameOfLifeTick, Win2D((i % 2) ? B : A), SMat((i % 2) ? A : B)); // Gather processed data back to host if ((iterations % 2) == 0) sched.Gather(A); else sched.Gather(B);
GPU 1 GPU 2 GPU 3 GPU 4
hbuffer_A hbuffer_B A B A B A B A B A B
template <typename T, int ILPX, int ILPY> __global__ void GameOfLifeTick MAPS_MULTIDEF( Window2D<T,1,WRAP,ILPX,ILPY> current_gen, StructuredInjective<T,2,ILPX,ILPY> next_gen) { MAPS_MULTI_INIT(current_gen, next_gen); #pragma unroll MAPS_FOREACH(nextgen_iter, next_gen) { int live_neighbors = 0, is_live = 0; #pragma unroll MAPS_FOREACH_ALIGNED(iter, current_gen, nextgen_iter) { // Set variables according to the rules if (iter.index() == 4) is_live = *iter; else live_neighbors += *iter; } int result = GameOfLifeConditions(...); *nextgen_iter = result; } next_gen.commit(); }
template <typename T, int ILPX, int ILPY> __global__ void GameOfLifeTick MAPS_MULTIDEF( Window2D<T,1,WRAP,ILPX,ILPY> current_gen, StructuredInjective<T,2,ILPX,ILPY> next_gen) { MAPS_MULTI_INIT(current_gen, next_gen); #pragma unroll MAPS_FOREACH(nextgen_iter, next_gen) { int live_neighbors = 0, is_live = 0; #pragma unroll MAPS_FOREACH_ALIGNED(iter, current_gen, nextgen_iter) { // Set variables according to the rules if (iter.index() == 4) is_live = *iter; else live_neighbors += *iter; } int result = GameOfLifeConditions(...); *nextgen_iter = result; } next_gen.commit(); }
1,1 1,2 1,3 1,4 2,1 2,2 2,3 2,4 3,1 3,2 3,4 GPU 1 GPU 2 Block (3,3) Threads
template <typename T, int ILPX, int ILPY> __global__ void GameOfLifeTick MAPS_MULTIDEF( Window2D<T,1,WRAP,ILPX,ILPY> current_gen, StructuredInjective<T,2,ILPX,ILPY> next_gen) { MAPS_MULTI_INIT(current_gen, next_gen); #pragma unroll MAPS_FOREACH(nextgen_iter, next_gen) { int live_neighbors = 0, is_live = 0; #pragma unroll MAPS_FOREACH_ALIGNED(iter, current_gen, nextgen_iter) { // Set variables according to the rules if (iter.index() == 4) is_live = *iter; else live_neighbors += *iter; } int result = GameOfLifeConditions(...); *nextgen_iter = result; } next_gen.commit(); }
Global Memory
Window Width Block Height
Coalesced Global-to-Shared/Registers
template <typename T, int ILPX, int ILPY> __global__ void GameOfLifeTick MAPS_MULTIDEF( Window2D<T,1,WRAP,ILPX,ILPY> current_gen, StructuredInjective<T,2,ILPX,ILPY> next_gen) { MAPS_MULTI_INIT(current_gen, next_gen); #pragma unroll MAPS_FOREACH(nextgen_iter, next_gen) { int live_neighbors = 0, is_live = 0; #pragma unroll MAPS_FOREACH_ALIGNED(iter, current_gen, nextgen_iter) { // Set variables according to the rules if (iter.index() == 4) is_live = *iter; else live_neighbors += *iter; } int result = GameOfLifeConditions(...); *nextgen_iter = result; } next_gen.commit(); }
Global Memory Thread (1,1)
template <typename T, int ILPX, int ILPY> __global__ void GameOfLifeTick MAPS_MULTIDEF( Window2D<T,1,WRAP,ILPX,ILPY> current_gen, StructuredInjective<T,2,ILPX,ILPY> next_gen) { MAPS_MULTI_INIT(current_gen, next_gen); #pragma unroll MAPS_FOREACH(nextgen_iter, next_gen) { int live_neighbors = 0, is_live = 0; #pragma unroll MAPS_FOREACH_ALIGNED(iter, current_gen, nextgen_iter) { // Set variables according to the rules if (iter.index() == 4) is_live = *iter; else live_neighbors += *iter; } int result = GameOfLifeConditions(...); *nextgen_iter = result; } next_gen.commit(); }
Global Memory
template <typename T, int ILPX, int ILPY> __global__ void GameOfLifeTick MAPS_MULTIDEF( Window2D<T,1,WRAP,ILPX,ILPY> current_gen, StructuredInjective<T,2,ILPX,ILPY> next_gen) { MAPS_MULTI_INIT(current_gen, next_gen); #pragma unroll MAPS_FOREACH(nextgen_iter, next_gen) { int live_neighbors = 0, is_live = 0; #pragma unroll MAPS_FOREACH_ALIGNED(iter, current_gen, nextgen_iter) { // Set variables according to the rules if (iter.index() == 4) is_live = *iter; else live_neighbors += *iter; } int result = GameOfLifeConditions(...); *nextgen_iter = result; } next_gen.commit(); }
Global Memory t1,1
template <typename T, int ILPX, int ILPY> __global__ void GameOfLifeTick MAPS_MULTIDEF( Window2D<T,1,WRAP,ILPX,ILPY> current_gen, StructuredInjective<T,2,ILPX,ILPY> next_gen) { MAPS_MULTI_INIT(current_gen, next_gen); #pragma unroll MAPS_FOREACH(nextgen_iter, next_gen) { int live_neighbors = 0, is_live = 0; #pragma unroll MAPS_FOREACH_ALIGNED(iter, current_gen, nextgen_iter) { // Set variables according to the rules if (iter.index() == 4) is_live = *iter; else live_neighbors += *iter; } int result = GameOfLifeConditions(...); *nextgen_iter = result; } next_gen.commit(); }
Global Memory t0,1 t1,1 t2,1
0.5 1 1.5 2 2.5 3 3.5 4 780 Titan K40 980 780 Titan K40 980 780 Titan K40 980 Game of Life Histogram SGEMM
Speedup
1 GPU 2 GPUs 3 GPUs 4 GPUs
0.5 1 1.5 2 2.5 3 3.5 4
CUBLAS- XT MAPS- Multi CUBLAS- XT MAPS- Multi CUBLAS- XT MAPS- Multi GTX 780 Titan Black GTX 980
Speedup
1 GPU 2 GPUs 3 GPUs 4 GPUs
1000 2000 3000 4000 5000 6000
Naive CUB MAPS Naive CUB MAPS Naive CUB MAPS GTX 780 Titan Black GTX 980
Throughput [images / s]
1 GPU 2 GPUs 3 GPUs 4 GPUs
(All multi-GPU results over MAPS-Multi)
GPU 3 GPU 1
=1 =7 =5 =3 =5 =3 =9 =2 =4 =5 =0 =0 Forward Propagation – Compute 𝒈 Backpropagation – Compute 𝜶𝒈
… … … … … …
GPU 3 GPU 1
20,000 40,000 60,000 80,000 100,000 120,000
Caffe Torch MAPS- Multi Caffe Torch MAPS- Multi Caffe Torch MAPS- Multi GTX 780 Titan Black GTX 980
Trained Images / Second
1 GPU 2 GPUs 4 GPUs
𝑩𝒅𝒅 = ∑𝑿𝒔∗
𝑾 = 𝑿𝑰
𝑩𝒗𝒚 = 𝑾/𝑾 𝑮 = 𝑿𝑼 ⋅ 𝑩𝒗𝒚
𝑰𝒔𝒇𝒕 = 𝑰 ∗ 𝑮/𝑩𝒅𝒅
Update H Update W
𝑩𝒅𝒅 = ∑𝑰∗𝒒 𝑾 𝑼 = 𝑰𝑼𝑿𝑼
𝑩𝒗𝒚𝑼 = 𝑾𝑼/𝑾 𝑼
𝑮𝑼 = 𝑰 ⋅ 𝑩𝒗𝒚𝑼
𝑿𝒔𝒇𝒕 = 𝑿𝑼 ∗ 𝑮𝑼/𝑩𝒅𝒅𝑼
𝑿𝑼
𝑿 = 𝑿𝒔𝒇𝒕
𝑼
Window2D Block2D-T, Block1D Block2D-T, Block2D Block2D-T
* E. Mejía-Roa, D. Tabas-Madrid, J. Setoain, C. Garca, F. Tirado, and A. Pascual-Montano. NMF-mGPU: non-negative matrix factorization on multi-GPU systems. BMC Bioinformatics, 2015.
20 25 30 35 40 45 50 55 60 65 1 2 3 4
Throughput [iterations / s]
GPUs
20 30 40 50 60 70 1 2 3 4
Throughput [iterations / s]
GPUs
25 35 45 55 65 75 85 95 1 2 3 4
Throughput [iterations / s]
GPUs
NMF-mGPU MAPS-Multi
This research was supported by the Ministry of Science and Technology, Israel and by the German Research Foundation (DFG) Priority Program “Software for Exascale Computing” (SPP-EXA), research project FFMK.