San Jose 2018
MultiGPU Made Easy by OmpSs + CUDA/OpenACC
Antonio J. Peña
- Sr. Researcher & Activity Lead
MultiGPU Made Easy by OmpSs + CUDA/OpenACC — Antonio J. Peña, Sr. — PowerPoint PPT Presentation
www.bsc.es MultiGPU Made Easy by OmpSs + CUDA/OpenACC Antonio J. Peña Sr. Researcher & Activity Lead Manager, BSC/UPC NVIDIA GCoE San Jose 2018 Introduction: Programming Models for GPU Computing CUDA (Compute Unified Device Architecture)
San Jose 2018
– Runtime & Driver APIs (high-level / low-level) – Specific for NVIDIA GPUs: best performance & control
– Open Standard – Higher-level, pragma-based – Aiming at portability – heterogeneous hardware – For NVIDIA GPUs, implemented on top of CUDA
– Open Standard – Low-level – similar to CUDA Driver API – Multi-target, portable*
2
3
– Subtopic: “High productivity programming environments for exascale”
– Develop & deploy a production-ready parallel programming environment – Advance and integrate existing state-of-the-art European technology – High coding productivity, high performance, energy awareness
5
Automatic Code Annotation Satisfactory Performance? Profile Code Low-Level Accelerator Kernels Start Deploy Yes No Satisfactory Performance? Yes No Tune/Insert Directives Manually Directive Optimisation Possible? No Yes Update code patterns Satisfactory Code Patterns? No Yes
– Defines a single address space – Executes sequential code that
– in, out, inout – Used
– Basis for memory management functionalities (replication, locality, movement, …)
7
8
// Tiled Cholesky factorization, sequential version.
// A is an NT x NT array of pointers to TS x TS tiles. Calls the tile
// kernels spotrf (factorize diagonal tile), strsm (triangular solve),
// sgemm (update) and ssyrk (symmetric rank-k update), declared elsewhere.
void Cholesky( float *A[NT][NT] )
{
    int i, j, k;
    for (k = 0; k < NT; k++) {
        // Factorize the diagonal tile.
        // Fixed: the original read A[k*NT+k], which indexes the 2-D array
        // of tile pointers as if it were flat (type-wrong, out of range for
        // k > 0); the tasked versions of this routine use A[k][k].
        spotrf (A[k][k]);
        // Triangular solves against the freshly factorized diagonal tile.
        for (i = k+1; i < NT; i++) {
            strsm (A[k][k], A[k][i]);
        }
        // Trailing-submatrix update.
        for (i = k+1; i < NT; i++) {
            for (j = k+1; j < i; j++) {
                sgemm (A[k][i], A[k][j], A[j][i]);
            }
            ssyrk (A[k][i], A[i][i]);
        }
    }
}
TS TS NT NT TS TS
9
// Tiled Cholesky factorization, OmpSs/OpenMP tasked version.
// Each tile kernel becomes a task; the in/out/inout clauses express the
// tile-level data dependencies from which the runtime builds the task graph.
// (Reconstructed from a flattened slide line: each #pragma must occupy its
// own line, and the final closing brace was lost in extraction.)
void Cholesky( float *A[NT][NT] )
{
    int i, j, k;
    for (k = 0; k < NT; k++) {
        // Factorize the diagonal tile (reads and writes A[k][k]).
        #pragma omp task inout (A[k][k])
        spotrf (A[k][k]);
        // Triangular solves: each depends on the factorized diagonal tile.
        for (i = k+1; i < NT; i++) {
            #pragma omp task in (A[k][k]) inout (A[k][i])
            strsm (A[k][k], A[k][i]);
        }
        // Trailing-submatrix update.
        for (i = k+1; i < NT; i++) {
            for (j = k+1; j < i; j++) {
                #pragma omp task in (A[k][i], A[k][j]) inout (A[j][i])
                sgemm (A[k][i], A[k][j], A[j][i]);
            }
            #pragma omp task in (A[k][i]) inout (A[i][i])
            ssyrk (A[k][i], A[i][i]);
        }
    }
}
TS TS NT NT TS TS
10
// Tiled Cholesky factorization, OmpSs/OpenMP tasked version.
// Each tile kernel becomes a task; the in/out/inout clauses express the
// tile-level data dependencies from which the runtime builds the task graph.
// (Reconstructed from a flattened slide line: each #pragma must occupy its
// own line, and the final closing brace was lost in extraction.)
void Cholesky( float *A[NT][NT] )
{
    int i, j, k;
    for (k = 0; k < NT; k++) {
        // Factorize the diagonal tile (reads and writes A[k][k]).
        #pragma omp task inout (A[k][k])
        spotrf (A[k][k]);
        // Triangular solves: each depends on the factorized diagonal tile.
        for (i = k+1; i < NT; i++) {
            #pragma omp task in (A[k][k]) inout (A[k][i])
            strsm (A[k][k], A[k][i]);
        }
        // Trailing-submatrix update.
        for (i = k+1; i < NT; i++) {
            for (j = k+1; j < i; j++) {
                #pragma omp task in (A[k][i], A[k][j]) inout (A[j][i])
                sgemm (A[k][i], A[k][j], A[j][i]);
            }
            #pragma omp task in (A[k][i]) inout (A[i][i])
            ssyrk (A[k][i], A[i][i]);
        }
    }
}
TS TS NT NT TS TS
11
// OmpSs slide example (single line flattened by extraction; `...` marks code
// elided on the slide). Two task implementations of the same scale operation:
// scale_task_cuda (device(cuda), uses blockIdx/blockDim/threadIdx) and
// scale_task_host (device(smp), plain loop). The out([N] b)/in([N] c) clauses
// declare array-region dependencies, giving the task graph T1 -> {T2, T3};
// the runtime performs the host/device copies those dependencies require.
// NOTE(review): `void main` would be `int main` in compilable code.
#pragma omp target device (cuda) #pragma omp task out([N] b) in([N] c) void scale_task_cuda(double *b, double *c, double a, int N) { int j = blockIdx.x * blockDim.x + threadIdx.x; if (j < N) b[j] = a * c[j]; } #pragma omp target device (smp) #pragma omp task out([N] b) in([N] c) void scale_task_host(double *b, double *c, double a, int N) { for (int j=0; j < N; j++) b[j] = a*c[j]; } void main(int argc, char *argv[]) { ... scale_task_cuda (B, A, 10.0, 1024); //T1 scale_task_cuda (A, B, 0.01, 1024); //T2 scale_task_host (C, B, 2.00, 1024); //T3 #pragma omp taskwait // can access any of A,B,C ...
Memory Transfers
HOST MEMORY DEVICE MEMORY
A A B T1 No need to copy
Task Dependency Graph
T1 needs a valid copy
B Also it allocates array B in the device (no copy needed), and invalidates other B’s C T1 T2 T3
12
// Same OmpSs example repeated for this slide's animation step (line flattened
// by extraction; `...` marks elided slide code). The surrounding slide text
// discusses T2 reusing the device copies of A and B produced by T1.
// NOTE(review): `void main` would be `int main` in compilable code.
#pragma omp target device (cuda) #pragma omp task out([N] b) in([N] c) void scale_task_cuda(double *b, double *c, double a, int N) { int j = blockIdx.x * blockDim.x + threadIdx.x; if (j < N) b[j] = a * c[j]; } #pragma omp target device (smp) #pragma omp task out([N] b) in([N] c) void scale_task_host(double *b, double *c, double a, int N) { for (int j=0; j < N; j++) b[j] = a*c[j]; } void main(int argc, char *argv[]) { ... scale_task_cuda (B, A, 10.0, 1024); //T1 scale_task_cuda (A, B, 0.01, 1024); //T2 scale_task_host (C, B, 2.00, 1024); //T3 #pragma omp taskwait // can access any of A,B,C ...
Memory Transfers
HOST MEMORY DEVICE MEMORY
A A B T2
Task Dependency Graph
T2 can reuse arrays A and B, since they have been used by the previous task (T1) B C Additionally, it also invalidates other copies of A T1 T2 T3
13
// Same OmpSs example repeated for this slide's animation step (line flattened
// by extraction; `...` marks elided slide code). The surrounding slide text
// discusses T3 (an smp task) forcing B to be copied back to the host.
// NOTE(review): `void main` would be `int main` in compilable code.
#pragma omp target device (cuda) #pragma omp task out([N] b) in([N] c) void scale_task_cuda(double *b, double *c, double a, int N) { int j = blockIdx.x * blockDim.x + threadIdx.x; if (j < N) b[j] = a * c[j]; } #pragma omp target device (smp) #pragma omp task out([N] b) in([N] c) void scale_task_host(double *b, double *c, double a, int N) { for (int j=0; j < N; j++) b[j] = a*c[j]; } void main(int argc, char *argv[]) { ... scale_task_cuda (B, A, 10.0, 1024); //T1 scale_task_cuda (A, B, 0.01, 1024); //T2 scale_task_host (C, B, 2.00, 1024); //T3 #pragma omp taskwait // can access any of A,B,C ...
Memory Transfers
HOST MEMORY DEVICE MEMORY
A A B T3
Task Dependency Graph
T3 needs to copy array B back to the host B C It does not invalidate the existing copy on the device T1 T2 T3 B
14
// Same OmpSs example repeated for this slide's animation step (line flattened
// by extraction; `...` marks elided slide code). The surrounding slide text
// discusses the plain taskwait requiring full memory consistency in the host.
// NOTE(review): `void main` would be `int main` in compilable code.
#pragma omp target device (cuda) #pragma omp task out([N] b) in([N] c) void scale_task_cuda(double *b, double *c, double a, int N) { int j = blockIdx.x * blockDim.x + threadIdx.x; if (j < N) b[j] = a * c[j]; } #pragma omp target device (smp) #pragma omp task out([N] b) in([N] c) void scale_task_host(double *b, double *c, double a, int N) { for (int j=0; j < N; j++) b[j] = a*c[j]; } void main(int argc, char *argv[]) { ... scale_task_cuda (B, A, 10.0, 1024); //T1 scale_task_cuda (A, B, 0.01, 1024); //T2 scale_task_host (C, B, 2.00, 1024); //T3 #pragma omp taskwait // can access any of A,B,C ...
Memory Transfers
HOST MEMORY DEVICE MEMORY
A A B
Task Dependency Graph
B C Taskwait requires full memory consistency in the host A T1 T2 T3
T1 T2 T3
B
TW
15
// OmpSs `taskwait noflush` variant of the scale example (line flattened by
// extraction; `...` marks elided slide code). The noflush taskwait waits for
// T1-T3 but, per its own comment, does not flush device data back to the
// host, so T4 (scale_task_cuda on B, C) can follow; the final plain taskwait
// restores full host consistency.
// NOTE(review): `void main` would be `int main` in compilable code.
#pragma omp target device (cuda) #pragma omp task out([N] b) in([N] c) void scale_task_cuda(double *b, double *c, double a, int N) { int j = blockIdx.x * blockDim.x + threadIdx.x; if (j < N) b[j] = a * c[j]; } #pragma omp target device (smp) #pragma omp task out([N] b) in([N] c) void scale_task_host(double *b, double *c, double a, int N) { for (int j=0; j < N; j++) b[j] = a*c[j]; } void main(int argc, char *argv[]) { ... scale_task_cuda (B, A, 10.0, 1024); //T1 scale_task_cuda (A, B, 0.01, 1024); //T2 scale_task_host (C, B, 2.00, 1024); //T3 #pragma omp taskwait noflush // does not flush data dev -> host scale_task_cuda (B, C, 3.00, 1024); //T4 #pragma omp taskwait // can access any of A,B,C ...
T1 T2 T3 T4
Task Dependency Graph
The taskwait waits for task finalization, but does not copy memory back to the host (nor invalidate it)
noflush Memory Transfers
HOST MEMORY DEVICE MEMORY
A A B C
nf T1 T2 T3
B B
16
// Same `taskwait noflush` example repeated for this slide's animation step
// (line flattened by extraction; `...` marks elided slide code). The
// surrounding slide text discusses T4 needing a consistent copy of C.
// NOTE(review): `void main` would be `int main` in compilable code.
#pragma omp target device (cuda) #pragma omp task out([N] b) in([N] c) void scale_task_cuda(double *b, double *c, double a, int N) { int j = blockIdx.x * blockDim.x + threadIdx.x; if (j < N) b[j] = a * c[j]; } #pragma omp target device (smp) #pragma omp task out([N] b) in([N] c) void scale_task_host(double *b, double *c, double a, int N) { for (int j=0; j < N; j++) b[j] = a*c[j]; } void main(int argc, char *argv[]) { ... scale_task_cuda (B, A, 10.0, 1024); //T1 scale_task_cuda (A, B, 0.01, 1024); //T2 scale_task_host (C, B, 2.00, 1024); //T3 #pragma omp taskwait noflush // does not flush data dev -> host scale_task_cuda (B, C, 3.00, 1024); //T4 #pragma omp taskwait // can access any of A,B,C ...
T1 T2 T3 T4
Task Dependency Graph
Before executing T4 it will need a consistent copy of C and it will also invalidate all previous versions of B
noflush Memory Transfers
HOST MEMORY DEVICE MEMORY
A A B C
nf T1 T2 T3
B B C B
T4
17
// Same `taskwait noflush` example repeated for this slide's animation step
// (line flattened by extraction; `...` marks elided slide code). The
// surrounding slide text discusses the final taskwait forcing consistency.
// NOTE(review): `void main` would be `int main` in compilable code.
#pragma omp target device (cuda) #pragma omp task out([N] b) in([N] c) void scale_task_cuda(double *b, double *c, double a, int N) { int j = blockIdx.x * blockDim.x + threadIdx.x; if (j < N) b[j] = a * c[j]; } #pragma omp target device (smp) #pragma omp task out([N] b) in([N] c) void scale_task_host(double *b, double *c, double a, int N) { for (int j=0; j < N; j++) b[j] = a*c[j]; } void main(int argc, char *argv[]) { ... scale_task_cuda (B, A, 10.0, 1024); //T1 scale_task_cuda (A, B, 0.01, 1024); //T2 scale_task_host (C, B, 2.00, 1024); //T3 #pragma omp taskwait noflush // does not flush data dev -> host scale_task_cuda (B, C, 3.00, 1024); //T4 #pragma omp taskwait // can access any of A,B,C ...
T1 T2 T3 T4
Task Dependency Graph
The taskwait waits for task finalization; it will invalidate all data versions and force memory consistency
noflush Memory Transfers
HOST MEMORY DEVICE MEMORY
A A B C
nf T1 T2 T3
B B C B
T4
A B
TW
// Slide 18 of the saxpy walkthrough (line flattened by extraction; the
// leading "18" is the slide number fused into the text). main.c initializes
// x and y, invokes the saxpy task once, and checks results after the
// taskwait; kernel.c holds the SMP saxpy body and kernel.h declares it as an
// OmpSs task (device(smp) copy_deps, in([n]x) inout([n]y)).
// NOTE(review): as flattened, `perror("Error\n")` is missing its trailing
// semicolon, and the check `y[i]!=a*i+i` is an exact float comparison.
18 #include <kernel.h> int main(int argc, char *argv[]) { float a=5, x[N], y[N]; // Initialize values for (int i=0; i<N; ++i) x[i] = y[i] = i; // Compute saxpy algorithm (1 task) saxpy(N, a, x, y); #pragma omp taskwait //Check results for (int i=0; i<N; ++i){ if (y[i]!=a*i+i) perror("Error\n") } message("Results are correct\n"); } void saxpy(int n, float a, float *X, float *Y) { for (int i=0; i<n; ++i) Y[i] = X[i] * a + Y[i]; } #pragma omp target device(smp) copy_deps #pragma omp task in([n]x) inout([n]y) void saxpy(int n, float a, float* x, float* y);
kernel.c kernel.h main.c
#pragma omp target device(cuda) copy_deps ndrange(1,n,128) #pragma omp task in([n]x) inout([n]y) __global__ void saxpy(int n, float a, float* x, float* y);
kernel.cuh
2
// SAXPY kernel: y[i] = a * x[i] + y[i] for every i in [0, n).
// Expects a 1-D launch; threads past the array tail exit via the guard.
__global__ void saxpy(int n, float a, float* x, float* y)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= n)
        return;
    y[idx] = a * x[idx] + y[idx];
}
kernel.cu
1
3
– Instead of porting & optimizing many CUDA tasks… – …port every GPU-accelerated task using OpenACC… – …and only use CUDA where the OpenACC compiler doesn’t provide the required efficiency
19
// main.c of the saxpy example (line flattened by extraction): initializes
// x and y, launches one saxpy task, and validates y after the taskwait.
// NOTE(review): as flattened, `perror("Error\n")` is missing its trailing
// semicolon, and the check `y[i]!=a*i+i` is an exact float comparison.
#include <kernel.h> int main(int argc, char *argv[]) { float a=5, x[N], y[N]; // Initialize values for (int i=0; i<N; ++i) x[i] = y[i] = i; // Compute saxpy algorithm (1 task) saxpy(N, a, x, y); #pragma omp taskwait //Check results for (int i=0; i<N; ++i){ if (y[i]!=a*i+i) perror("Error\n") } message("Results are correct\n"); }
/* Host SAXPY: accumulates a * x[i] into y[i] for the first n elements.
 * Does nothing when n <= 0. */
void saxpy(int n, float a, float *x, float *y)
{
    int idx = 0;
    while (idx < n) {
        y[idx] = x[idx] * a + y[idx];
        ++idx;
    }
}
#pragma omp target device(smp) copy_deps #pragma omp task in([n]x) inout([n]y) void saxpy (int n, float a, float* x, float* y);
kernel.c kernel.h main.c
3
20 #pragma omp target device(openacc) copy_deps #pragma omp task in([n]x) inout([n]y) void saxpy (int n, float a, float* x, float* y);
kernel.h
// SAXPY offloaded via OpenACC: y[i] = x[i] * a + y[i] for i in [0, n).
// The `kernels` directive asks the compiler to generate device code for
// the loop; data movement is handled by the enclosing OmpSs task
// (copy_deps on the declaration in kernel.h).
// Fixed: extraction had fused the #pragma into the middle of the line;
// a #pragma directive must occupy its own line.
void saxpy(int n, float a, float *x, float *y)
{
    #pragma acc kernels
    for (int i = 0; i < n; ++i)
        y[i] = x[i] * a + y[i];
}
kernel.c
2
– Six different stress components – Finite differences (FD) method with a Fully Staggered Grid (FSG)
21
22
– Featuring Wen-mei Hwu & David Kirk – Advanced CUDA + Brand-new “+AI” format!
– Core: Pau Farré, Marc Jordà, Kyunghun Kim, Mohammad Owais – Collaborators: Pedro Valero, Aimar Rodríguez, Jan Ciesko
– Xavier Martorell, Vicenç Beltran, Xavier Teruel, Sergi Mateo, JM Perez, …
– Maurizio Hanzich, Samuel Rodríguez, …
– Wen-mei Hwu & TAs: Simón García de Gonzalo, Abdul Dakkak, Carl Pearson, Mert Hiyadetoglu – David Kirk, Juan Gómez-Luna
24 Pau Farré