[PPT] - Everywhere Blocks for SIMD Programming Authors: Rubens E. A. PowerPoint Presentation

SLIDE 1

Everywhere Blocks for SIMD Programming

Authors: ¡Rubens ¡E. ¡A. ¡Moreira, ¡Sylvain ¡Collange, ¡Fernando ¡M. ¡Q. ¡Pereira ¡ Speaker: ¡Breno ¡Campos ¡Ferreira ¡Guimarães ¡

SLIDE 2

Trends ¡in ¡Massively ¡Parallel ¡Processing ¡ Simple ¡ andalso ¡ efficient ¡

Source: ¡hCp://on-‑demand.gputechconf.com/gtc/2016/presentaMon/s6224-‑mark-‑harris.pdf ¡

SLIDE 3

Explicit, ¡yet ¡safe ¡ programming! ¡

Source: ¡hCp://on-‑demand.gputechconf.com/gtc/2016/presentaMon/s6224-‑mark-‑harris.pdf ¡

Trends ¡in ¡Massively ¡Parallel ¡Processing ¡

SLIDE 4

Source: ¡hCp://on-‑demand.gputechconf.com/gtc/2016/presentaMon/s6224-‑mark-‑harris.pdf ¡

Trends ¡in ¡Massively ¡Parallel ¡Processing ¡

SLIDE 5

DEPARTMENT ¡OF ¡COMPUTER ¡SCIENCE ¡ UNIVERSIDADE ¡FEDERAL ¡DE ¡MINAS ¡GERAIS ¡ FEDERAL ¡UNIVERSITY ¡OF ¡MINAS ¡GERAIS, ¡BRAZIL ¡

DIVERGENCES ¡

SLIDE 6

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡

Divergences ¡

SLIDE 7

Divergences ¡

then memcpy(A, B, N); ¡ else memcpy(B, A, N); ¡

Control ¡flow ¡graph ¡for ¡kernel. ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡

if (threadId.x > N) ¡

SLIDE 8

Divergences ¡

then memcpy(A, B, N); ¡ else memcpy(B, A, N); ¡

Control ¡flow ¡graph ¡for ¡kernel. ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡

if (threadId.x > N) ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡ SIMD: ¡LOCKSTEP ¡EXECUTION! ¡

SLIDE 9

Divergences ¡

then memcpy(A, B, N); ¡ else memcpy(B, A, N); ¡

Control ¡flow ¡graph ¡for ¡kernel. ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡

if (threadId.x > N) ¡ SIMD: ¡LOCKSTEP ¡EXECUTION! ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡ DIVERGENCE! ¡

SLIDE 10

Divergences ¡

then memcpy(A, B, N); ¡ else memcpy(B, A, N); ¡

Control ¡flow ¡graph ¡for ¡kernel. ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡

if (threadId.x > N) ¡ SIMD: ¡LOCKSTEP ¡EXECUTION! ¡ DIVERGENCE! ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡

SLIDE 11

Divergences ¡

then memcpy(A, B, N); ¡ else memcpy(B, A, N); ¡

Control ¡flow ¡graph ¡for ¡kernel. ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡

if (threadId.x > N) ¡ SIMD: ¡LOCKSTEP ¡EXECUTION! ¡ DIVERGENCE! ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡

SLIDE 12

Divergences ¡

then memcpy(A, B, N); ¡ else memcpy(B, A, N); ¡

Control ¡flow ¡graph ¡for ¡kernel. ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡

if (threadId.x > N) ¡ SIMD: ¡LOCKSTEP ¡EXECUTION! ¡ DIVERGENCE! ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡

SLIDE 13

Divergences ¡

then memcpy(A, B, N); ¡ else memcpy(B, A, N); ¡

Control ¡flow ¡graph ¡for ¡kernel. ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡

if (threadId.x > N) ¡ SIMD: ¡LOCKSTEP ¡EXECUTION! ¡ DIVERGENCE! ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡

And ¡waiMng ¡to ¡process ¡ can ¡be ¡quite ¡costly! ¡

SLIDE 14

Interlude: ¡The ¡Kernels ¡of ¡Samuel ¡

void F(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size - i + 1; } } Source: ¡hCp://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf ¡ int idx = threadId.x; int dimx = threadDim.x;

F ¡assigns ¡the ¡result ¡of ¡ (size ¡-‑ ¡i ¡+ ¡1) ¡to ¡data[i]

SLIDE 15

Interlude: ¡The ¡Kernels ¡of ¡Samuel ¡

void F(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size - i + 1; } } Source: ¡hCp://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf ¡ void M(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size; } } int idx = threadId.x; int dimx = threadDim.x;

M ¡assigns ¡the ¡constant ¡ value ¡size ¡to ¡data[i]

SLIDE 16

Interlude: ¡The ¡Kernels ¡of ¡Samuel ¡

void F(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size - i + 1; } } Source: ¡hCp://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf ¡ void M(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size; } } void Q(int *data, int size) { for (int i = idx; i < size; i += dimx) { if (i % 2) data[i] = size; } } int idx = threadId.x; int dimx = threadDim.x;

Q ¡does ¡also ¡assign ¡size ¡to ¡ data[i], ¡but ¡only ¡for ¡ threads ¡with ¡odd ¡index ¡i

SLIDE 17

Interlude: ¡The ¡Kernels ¡of ¡Samuel ¡

void F(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size - i + 1; } } Source: ¡hCp://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf ¡ void M(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size; } } void Q(int *data, int size) { for (int i = idx; i < size; i += dimx) { if (i % 2) data[i] = size; } } void P(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = random() % size; } } int idx = threadId.x; int dimx = threadDim.x;

P ¡calls ¡funcMon ¡random ¡ and ¡assigns ¡its ¡value, ¡ modulo ¡size, ¡to ¡data[i]

SLIDE 18

Interlude: ¡The ¡Kernels ¡of ¡Samuel ¡

void F(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size - i + 1; } } Source: ¡hCp://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf ¡ void M(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size; } } void Q(int *data, int size) { for (int i = idx; i < size; i += dimx) { if (i % 2) data[i] = size; } } void P(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = random() % size; } } int idx = threadId.x; int dimx = threadDim.x;

SLIDE 19

Interlude: ¡The ¡Kernels ¡of ¡Samuel ¡

void F(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size - i + 1; } } Source: ¡hCp://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf ¡ void M(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size; } } void Q(int *data, int size) { for (int i = idx; i < size; i += dimx) { if (i % 2) data[i] = size; } } void P(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = random() % size; } } 16153µs: constant assignment int idx = threadId.x; int dimx = threadDim.x;

SLIDE 20

Interlude: ¡The ¡Kernels ¡of ¡Samuel ¡

void F(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size - i + 1; } } Source: ¡hCp://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf ¡ void M(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size; } } void Q(int *data, int size) { for (int i = idx; i < size; i += dimx) { if (i % 2) data[i] = size; } } void P(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = random() % size; } } 16153µs: constant assignment 16250µs: few operations and assignment int idx = threadId.x; int dimx = threadDim.x;

SLIDE 21

Interlude: ¡The ¡Kernels ¡of ¡Samuel ¡

void F(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size - i + 1; } } Source: ¡hCp://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf ¡ void M(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size; } } void Q(int *data, int size) { for (int i = idx; i < size; i += dimx) { if (i % 2) data[i] = size; } } void P(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = random() % size; } } 16153µs: constant assignment 16250µs: few operations and assignment 30210µs: function call and assignment int idx = threadId.x; int dimx = threadDim.x;

SLIDE 22

Interlude: ¡The ¡Kernels ¡of ¡Samuel ¡

void F(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size - i + 1; } } Source: ¡hCp://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf ¡ void M(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size; } } void Q(int *data, int size) { for (int i = idx; i < size; i += dimx) { if (i % 2) data[i] = size; } } void P(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = random() % size; } } 16153µs: constant assignment 16250µs: few operations and assignment 32193µs: constant assignment BUT within divergent region! 30210µs: function call and assignment int idx = threadId.x; int dimx = threadDim.x;

SLIDE 23

Interlude: ¡The ¡Kernels ¡of ¡Samuel ¡

void F(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size - i + 1; } } Source: ¡hCp://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf ¡ void M(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = size; } } void Q(int *data, int size) { for (int i = idx; i < size; i += dimx) { if (i % 2) data[i] = size; } } void P(int *data, int size) { for (int i = idx; i < size; i += dimx) { data[i] = random() % size; } } 16153µs: constant assignment 16250µs: few operations and assignment 32193µs: constant assignment BUT within divergent region! 30210µs: function call and assignment int idx = threadId.x; int dimx = threadDim.x;

Divergence ¡is ¡ harmful ¡to ¡ performance! ¡

SLIDE 24

Divergences: ¡Coda ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡ Divergent ¡region: ¡

nly ¡acMve ¡threads ¡

run ¡memcpy ¡

SLIDE 25

Divergences: ¡Coda ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡ Divergent ¡region: ¡

nly ¡acMve ¡threads ¡

run ¡memcpy ¡ Control ¡flow ¡graph ¡for ¡memcpy. ¡

FUNCTION memcpy ¡ DIVERGENCE! ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡

SLIDE 26

Divergences: ¡Coda ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡ Divergent ¡region: ¡

nly ¡acMve ¡threads ¡

run ¡memcpy ¡ Control ¡flow ¡graph ¡for ¡memcpy. ¡

FUNCTION memcpy ¡ DIVERGENCE! ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡

SLIDE 27

Divergences: ¡Coda ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡ Divergent ¡region: ¡

nly ¡acMve ¡threads ¡

run ¡memcpy ¡ Control ¡flow ¡graph ¡for ¡memcpy. ¡

FUNCTION memcpy ¡ DIVERGENCE! ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡

SLIDE 28

Divergences: ¡Coda ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡ Divergent ¡region: ¡

nly ¡acMve ¡threads ¡

run ¡memcpy ¡ Control ¡flow ¡graph ¡for ¡memcpy. ¡

FUNCTION memcpy ¡ DIVERGENCE! ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡

t2 c1 c0 c2 c3 c4 t3 c5 c6 a1 a0 a2 a3 a4 t0 b1 b0 b2 b3 b4 t1 b5 b6 b7 t t t t

Threads

Tempo c1 c0 c2 c3 c4 t c5 c6 Time ¡ Observed ¡behavior: ¡

SubopMmal ¡behavior: ¡ thread ¡T3 ¡is ¡inacYve. ¡ Right? ¡

SLIDE 29

Divergences: ¡Coda ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡ Divergent ¡region: ¡

nly ¡acMve ¡threads ¡

run ¡memcpy ¡ Control ¡flow ¡graph ¡for ¡memcpy. ¡

FUNCTION memcpy ¡ DIVERGENCE! ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡

t2 c1 c0 c2 c3 c4 t3 c5 c6 a1 a0 a2 a3 a4 t0 b1 b0 b2 b3 b4 t1 b5 b6 b7 t t t t

Threads

Tempo c1 c0 c2 c3 c4 t c5 c6 Time ¡ Observed ¡behavior: ¡

Not ¡really! ¡We ¡are ¡using ¡ Dynamic ¡Parallelism ¡

SLIDE 30

DEPARTMENT ¡OF ¡COMPUTER ¡SCIENCE ¡ UNIVERSIDADE ¡FEDERAL ¡DE ¡MINAS ¡GERAIS ¡ FEDERAL ¡UNIVERSITY ¡OF ¡MINAS ¡GERAIS, ¡BRAZIL ¡

DYNAMIC ¡PARALLELISM ¡

SLIDE 31

Dynamic ¡Parallelism ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡ CUDA’s ¡special ¡syntax ¡for ¡dynamic ¡parallelism: ¡

kernel<<<#warps, ¡#threads>>>(args…) ¡

SLIDE 32

t2 c1 c0 c2 c3 c4 t3 c5 c6 a1 a0 a2 a3 a4 t0 b1 b0 b2 b3 b4 t1 b5 b6 b7 a2 b2 b6 t2 a3 b3 b7 t3 a4 a0 b0 b4 t0 a1 b1 b5 t1 c2 c6 c3 c0 c4 c1 c5

j = 0 j = 1 j = 3

Threads

Tempo Tempo c1 c0 c2 c3 c4 t c5 c6 Time ¡ Actual ¡behavior ¡with ¡CUDA’s ¡dynamic ¡parallelism: ¡

Dynamic ¡Parallelism ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡

Time ¡

From ¡T2 ¡ From ¡T1 ¡ From ¡T0 ¡

memcpy ¡runs ¡once ¡ per ¡acMve ¡thread ¡at ¡

memcpy<<<1,4>>>

call ¡site! ¡

SLIDE 33

t2 c1 c0 c2 c3 c4 t3 c5 c6 a1 a0 a2 a3 a4 t0 b1 b0 b2 b3 b4 t1 b5 b6 b7 a2 b2 b6 t2 a3 b3 b7 t3 a4 a0 b0 b4 t0 a1 b1 b5 t1 c2 c6 c3 c0 c4 c1 c5

j = 0 j = 1 j = 3

Threads

Tempo Tempo c1 c0 c2 c3 c4 t c5 c6 Time ¡ Actual ¡behavior ¡with ¡CUDA’s ¡dynamic ¡parallelism: ¡

Dynamic ¡Parallelism ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡

Time ¡

From ¡T2 ¡ From ¡T1 ¡ From ¡T0 ¡

SIMD ¡ kernels! ¡

SLIDE 34

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

Dynamic ¡Parallelism ¡

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡

t2 c1 c0 c2 c3 c4 t3 c5 c6 a1 a0 a2 a3 a4 t0 b1 b0 b2 b3 b4 t1 b5 b6 b7 a2 b2 b6 t2 a3 b3 b7 t3 a4 a0 b0 b4 t0 a1 b1 b5 t1 c2 c6 c3 c0 c4 c1 c5

j = 0 j = 1 j = 3

Threads

Tempo Tempo c1 c0 c2 c3 c4 t c5 c6 Time ¡ Actual ¡behavior ¡with ¡CUDA’s ¡dynamic ¡parallelism: ¡ Time ¡

From ¡T2 ¡ From ¡T1 ¡ From ¡T0 ¡

SLIDE 35

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

Dynamic ¡Parallelism ¡

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡

t2 c1 c0 c2 c3 c4 t3 c5 c6 a1 a0 a2 a3 a4 t0 b1 b0 b2 b3 b4 t1 b5 b6 b7 a2 b2 b6 t2 a3 b3 b7 t3 a4 a0 b0 b4 t0 a1 b1 b5 t1 c2 c6 c3 c0 c4 c1 c5

j = 0 j = 1 j = 3

Threads

Tempo Tempo c1 c0 c2 c3 c4 t c5 c6 Time ¡ Actual ¡behavior ¡with ¡CUDA’s ¡dynamic ¡parallelism: ¡ Time ¡

From ¡T2 ¡ From ¡T1 ¡ From ¡T0 ¡

All ¡threads ¡ work ¡on ¡a ¡ single ¡vector! ¡

SLIDE 36

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

Dynamic ¡Parallelism ¡

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡

t2 c1 c0 c2 c3 c4 t3 c5 c6 a1 a0 a2 a3 a4 t0 b1 b0 b2 b3 b4 t1 b5 b6 b7 a2 b2 b6 t2 a3 b3 b7 t3 a4 a0 b0 b4 t0 a1 b1 b5 t1 c2 c6 c3 c0 c4 c1 c5

j = 0 j = 1 j = 3

Threads

Tempo Tempo c1 c0 c2 c3 c4 t c5 c6 Time ¡ Actual ¡behavior ¡with ¡CUDA’s ¡dynamic ¡parallelism: ¡ Time ¡

From ¡T2 ¡ From ¡T1 ¡ From ¡T0 ¡

All ¡threads ¡ work ¡on ¡a ¡ single ¡vector! ¡ Dynamic ¡parallelism ¡changes ¡the ¡ dimension ¡of ¡the ¡parallelism ¡

SLIDE 37

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

Dynamic ¡Parallelism ¡

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡

t2 c1 c0 c2 c3 c4 t3 c5 c6 a1 a0 a2 a3 a4 t0 b1 b0 b2 b3 b4 t1 b5 b6 b7 a2 b2 b6 t2 a3 b3 b7 t3 a4 a0 b0 b4 t0 a1 b1 b5 t1 c2 c6 c3 c0 c4 c1 c5

j = 0 j = 1 j = 3

Threads

Tempo Tempo c1 c0 c2 c3 c4 t c5 c6 Time ¡ Actual ¡behavior ¡with ¡CUDA’s ¡dynamic ¡parallelism: ¡ Time ¡

From ¡T2 ¡ From ¡T1 ¡ From ¡T0 ¡

All ¡threads ¡ work ¡on ¡a ¡ single ¡vector! ¡ Dynamic ¡parallelism ¡changes ¡the ¡ dimension ¡of ¡the ¡parallelism ¡ All ¡threads ¡must ¡ be ¡acMve! ¡

SLIDE 38

Dynamic ¡Parallelism ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡

CUDA’s ¡Dynamic ¡ Parallelism: ¡ Nested ¡kernel ¡calls ¡

SLIDE 39

Dynamic ¡Parallelism ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡

CUDA’s ¡Dynamic ¡ Parallelism: ¡ Nested ¡kernel ¡calls ¡ Has ¡the ¡overhead ¡of ¡ allocaYng ¡and ¡scheduling ¡ a ¡new ¡kernel ¡

SLIDE 40

Dynamic ¡Parallelism ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡

CUDA’s ¡Dynamic ¡ Parallelism: ¡ Nested ¡kernel ¡calls ¡ Has ¡the ¡overhead ¡of ¡ allocaYng ¡and ¡scheduling ¡ a ¡new ¡kernel ¡ kernel<<<#warps, #threads>>>(args...); ¡

SLIDE 41

Dynamic ¡Parallelism ¡

void kernel(int **A, int **B, int *N) { int tid(threadId.x); if (tid > N) { memcpy<<<1, 4>>>(A[tid], B[tid], N[tid]); } else { memcpy<<<1, 4>>>(B[tid], A[tid], N[tid]); } }

Kernel ¡for ¡parallel ¡execuMon ¡(CUDA). ¡

CUDA’s ¡Dynamic ¡ Parallelism: ¡ Nested ¡kernel ¡calls ¡ Has ¡the ¡overhead ¡of ¡ allocaYng ¡and ¡scheduling ¡ a ¡new ¡kernel ¡ kernel<<<#warps, #threads>>>(args...); ¡ Parallel ¡Time ¡~ ¡Kernel ¡Launching ¡Overhead ¡+ ¡SequenMal ¡Time ¡ #warps ¡x ¡#threads ¡

SLIDE 42

DEPARTMENT ¡OF ¡COMPUTER ¡SCIENCE ¡ UNIVERSIDADE ¡FEDERAL ¡DE ¡MINAS ¡GERAIS ¡ FEDERAL ¡UNIVERSITY ¡OF ¡MINAS ¡GERAIS, ¡BRAZIL ¡

WARP-‑SYNCHRONOUS ¡PROGRAMMING ¡

SLIDE 43

Warp-‑Synchronous ¡Programming ¡

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡

SLIDE 44

Warp-‑Synchronous ¡Programming ¡

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡

Mappings: ¡ int ¡value ¡= ¡[10 ¡20 ¡30 ¡10] ¡ int ¡value ¡= ¡[11 ¡21 ¡31 ¡11] ¡ increment ¡ T0 ¡ ¡ ¡T1 ¡ ¡ ¡T2 ¡ ¡ ¡T3 ¡ T0 ¡ ¡ ¡T1 ¡ ¡ ¡T2 ¡ ¡ ¡T3 ¡

SLIDE 45

Warp-‑Synchronous ¡Programming ¡

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡

Mappings: ¡ int ¡value ¡= ¡[10 ¡20 ¡30 ¡10] ¡ int ¡value ¡= ¡[11 ¡21 ¡31 ¡11] ¡ increment ¡ T0 ¡ ¡ ¡T1 ¡ ¡ ¡T2 ¡ ¡ ¡T3 ¡ T0 ¡ ¡ ¡T1 ¡ ¡ ¡T2 ¡ ¡ ¡T3 ¡ ReducYons: ¡ int ¡value ¡= ¡[10 ¡20 ¡30 ¡10] ¡ int ¡scalar ¡= ¡(70 ¡70 ¡70 ¡70) ¡ sum ¡ T0 ¡ ¡ ¡T1 ¡ ¡ ¡T2 ¡ ¡ ¡T3 ¡ T0 ¡ ¡ ¡T1 ¡ ¡ ¡T2 ¡ ¡ ¡T3 ¡

SLIDE 46

Warp-‑Synchronous ¡Programming ¡

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡

Mappings: ¡ int ¡value ¡= ¡[10 ¡20 ¡30 ¡10] ¡ int ¡value ¡= ¡[11 ¡21 ¡31 ¡11] ¡ increment ¡ T0 ¡ ¡ ¡T1 ¡ ¡ ¡T2 ¡ ¡ ¡T3 ¡ T0 ¡ ¡ ¡T1 ¡ ¡ ¡T2 ¡ ¡ ¡T3 ¡ ReducYons: ¡ int ¡value ¡= ¡[10 ¡20 ¡30 ¡10] ¡ int ¡scalar ¡= ¡(70 ¡70 ¡70 ¡70) ¡ sum ¡ T0 ¡ ¡ ¡T1 ¡ ¡ ¡T2 ¡ ¡ ¡T3 ¡ T0 ¡ ¡ ¡T1 ¡ ¡ ¡T2 ¡ ¡ ¡T3 ¡ Warp-‑level ¡parallelism! ¡

SLIDE 47

Warp-‑Synchronous ¡Programming: ¡Everywhere ¡blocks ¡

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡

Everywhere ¡blocks: ¡

SLIDE 48

Warp-‑Synchronous ¡Programming: ¡Everywhere ¡blocks ¡

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡

Everywhere ¡blocks: ¡ DIVERGENCE: ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡

SLIDE 49

Warp-‑Synchronous ¡Programming: ¡Everywhere ¡blocks ¡

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡

Everywhere ¡blocks: ¡ EVERYWHERE { code... } DIVERGENCE: ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡ EVERYWHERE: ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡ All ¡threads ¡are ¡temporarily ¡ re-‑enabled ¡to ¡process ¡code ¡ within ¡EVERYWHERE ¡block! ¡

SLIDE 50

Warp-‑Synchronous ¡Programming: ¡Everywhere ¡blocks ¡

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡

Everywhere ¡blocks: ¡ EVERYWHERE { code... } DIVERGENCE: ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡ EVERYWHERE: ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡ DIVERGENCE: ¡ T0 ¡ T1 ¡ T2 ¡ T3 ¡ All ¡threads ¡are ¡temporarily ¡ re-‑enabled ¡to ¡process ¡code ¡ within ¡EVERYWHERE ¡block! ¡ Divergences ¡restored! ¡

SLIDE 51

Warp-‑Synchronous ¡Programming: ¡Everywhere ¡blocks ¡

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡

void memcpy_wrapper(int **dest, int **src, int *N, int mask) { EVERYWHERE { for (int i=0; i < threadDim.x; ++i) { if (not (mask & (1 << i))) continue; // skip thread “i” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } }

Warp-‑synchronous ¡wrapper ¡for ¡SIMD ¡memory ¡copy. ¡

SLIDE 52

void memcpy_wrapper(int **dest, int **src, int *N, int mask) { EVERYWHERE { for (int i=0; i < threadDim.x; ++i) { if (not (mask & (1 << i))) continue; // skip thread “i” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } }

Warp-‑Synchronous ¡Programming: ¡Everywhere ¡blocks ¡

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡ Warp-‑synchronous ¡wrapper ¡for ¡SIMD ¡memory ¡copy. ¡

1. ¡everywhere ¡re-‑enables ¡all ¡threads! ¡

SLIDE 53

void memcpy_wrapper(int **dest, int **src, int *N, int mask) { EVERYWHERE { for (int i=0; i < threadDim.x; ++i) { if (not (mask & (1 << i))) continue; // skip thread “i” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } }

Warp-‑Synchronous ¡Programming: ¡Everywhere ¡blocks ¡

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡ Warp-‑synchronous ¡wrapper ¡for ¡SIMD ¡memory ¡copy. ¡

1. ¡everywhere ¡re-‑enables ¡all ¡threads! ¡
2. ¡Skip ¡formerly ¡divergent ¡threads! ¡

SLIDE 54

void memcpy_wrapper(int **dest, int **src, int *N, int mask) { EVERYWHERE { for (int i=0; i < threadDim.x; ++i) { if (not (mask & (1 << i))) continue; // skip thread “i” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } }

Warp-‑Synchronous ¡Programming: ¡Everywhere ¡blocks ¡

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡ Warp-‑synchronous ¡wrapper ¡for ¡SIMD ¡memory ¡copy. ¡

1. ¡everywhere ¡re-‑enables ¡all ¡threads! ¡
2. ¡Skip ¡formerly ¡divergent ¡threads! ¡
3. ¡Extracts ¡values ¡for ¡current ¡thread ¡“i”. ¡

SLIDE 55

void memcpy_wrapper(int **dest, int **src, int *N, int mask) { EVERYWHERE { for (int i=0; i < threadDim.x; ++i) { if (not (mask & (1 << i))) continue; // skip thread “i” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } }

Warp-‑Synchronous ¡Programming: ¡Everywhere ¡blocks ¡

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡ Warp-‑synchronous ¡wrapper ¡for ¡SIMD ¡memory ¡copy. ¡

1. ¡everywhere ¡re-‑enables ¡all ¡threads! ¡
2. ¡Skip ¡formerly ¡divergent ¡threads! ¡
3. ¡Extracts ¡values ¡for ¡current ¡thread ¡“i”. ¡
4. ¡We ¡then ¡call ¡our ¡SIMD ¡kernel ¡memcpy. ¡

SLIDE 56

void memcpy_wrapper(int **dest, int **src, int *N, int mask) { EVERYWHERE { for (int i=0; i < threadDim.x; ++i) { if (not (mask & (1 << i))) continue; // skip thread “i” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } }

Warp-‑Synchronous ¡Programming: ¡Everywhere ¡blocks ¡

void memcpy(int *dest, int *src, int N) { for (int i=threadId.x; i < N; i+=threadDim.x) { dest[i] = src[i]; } }

SIMD ¡implementaMon ¡of ¡memory ¡copy. ¡ Warp-‑synchronous ¡wrapper ¡for ¡SIMD ¡memory ¡copy. ¡

1. ¡everywhere ¡re-‑enables ¡all ¡threads! ¡
2. ¡Skip ¡formerly ¡divergent ¡threads! ¡
3. ¡Extracts ¡values ¡for ¡current ¡thread ¡“i”. ¡
4. ¡We ¡then ¡call ¡our ¡SIMD ¡kernel ¡memcpy. ¡

NVIDIA ¡hardware ¡does ¡not ¡support ¡ re-‑enabling ¡of ¡threads ¡within ¡warp! ¡

SLIDE 57

Warp-‑Synchronous ¡Programming: ¡Everywhere ¡blocks ¡

SPMD ¡

¡ handle ¡divergences ¡

SIMD ¡

¡ all ¡threads ¡must ¡be ¡acMve ¡ at ¡the ¡call ¡site ¡

everywhere

¡ temporarily ¡re-‑enables ¡all ¡threads ¡within ¡the ¡warp ¡

SLIDE 58

Warp-‑Synchronous ¡Programming: ¡Everywhere ¡blocks ¡

We ¡have ¡defined ¡the ¡semanMcs ¡of ¡EVERYWHERE ¡in ¡the ¡SIMD ¡world: ¡

SemanYcs ¡of ¡everywhere ¡in ¡SIMD: ¡ encode ¡the ¡building ¡blocks ¡to ¡implement ¡this ¡construct ¡

SLIDE 59

Warp-‑Synchronous ¡Programming: ¡Everywhere ¡blocks ¡

We ¡have ¡defined ¡the ¡semanMcs ¡of ¡EVERYWHERE ¡in ¡the ¡SIMD ¡world: ¡

Implemented ¡an ¡abstract ¡SIMD ¡ machine ¡in ¡Prolog, ¡with ¡ support ¡to ¡everywhere ¡blocks. ¡ Extended ¡Intel's ¡SPMD ¡compiler ¡ with ¡a ¡new ¡idiom, ¡funcYon ¡call ¡ re-‑vectorizaYon, ¡that ¡enhances ¡ naMve ¡dynamic ¡parallelism. ¡

SLIDE 60

DEPARTMENT ¡OF ¡COMPUTER ¡SCIENCE ¡ UNIVERSIDADE ¡FEDERAL ¡DE ¡MINAS ¡GERAIS ¡ FEDERAL ¡UNIVERSITY ¡OF ¡MINAS ¡GERAIS, ¡BRAZIL ¡

FUNCTION ¡CALL ¡RE-‑VECTORIZATION ¡

SLIDE 61

Programmability ¡ Efficiency ¡ FuncMon ¡Call ¡Re-‑VectorizaMon ¡

SLIDE 62

Programmability ¡ Efficiency ¡

Dynamic ¡ Parallelism ¡

FuncMon ¡Call ¡Re-‑VectorizaMon ¡

SLIDE 63

Programmability ¡ Efficiency ¡

Dynamic ¡ Parallelism ¡ Shuffle ¡ Nightmare ¡

FuncMon ¡Call ¡Re-‑VectorizaMon ¡

SLIDE 64

Programmability ¡ Efficiency ¡

Dynamic ¡ Parallelism ¡ Shuffle ¡ Nightmare ¡ FuncYon ¡Call ¡ Re-‑VectorizaYon ¡

FuncMon ¡Call ¡Re-‑VectorizaMon ¡

SLIDE 65

Programmability ¡ Efficiency ¡

Dynamic ¡ Parallelism ¡ Shuffle ¡ Nightmare ¡

FuncMon ¡Call ¡Re-‑VectorizaMon ¡

Re-‑enable ¡all ¡threads ¡within ¡warp, ¡ avoiding ¡kernel ¡allocaMon ¡and ¡scheduling ¡ FuncYon ¡Call ¡ Re-‑VectorizaYon ¡

SLIDE 66

Programmability ¡ Efficiency ¡

Dynamic ¡ Parallelism ¡ Shuffle ¡ Nightmare ¡

FuncMon ¡Call ¡Re-‑VectorizaMon ¡

Re-‑enable ¡all ¡threads ¡within ¡warp, ¡ avoiding ¡kernel ¡allocaMon ¡and ¡scheduling ¡ Allowing ¡SIMD ¡ funcMons ¡to ¡be ¡executed, ¡ without ¡diving ¡into ¡ warp-‑synchronous ¡coding! ¡ FuncYon ¡Call ¡ Re-‑VectorizaYon ¡

SLIDE 67

FuncMon ¡Call ¡Re-‑VectorizaMon: ¡CREV ¡

string T = text, P = pattern; void memcmp(int offset) { bool m = true; for (int i=threadId.x; i < |P|; ++i) if (P[i] != T[i + offset]) m = false; if (all(m == true)) Found(k); } void StringMatch() { for (int i=threadId.x; i < (|T| - |P|); i+=threadDim.x) if (P[0] == T[i]) crev memcmp(i); } void NaiveStringMatch() { for (int i=threadId.x; i < (|T| - |P|); i+=threadDim.x) { int j = 0, k = i; while (j < |P| and P[j] == T[k]) j = j + 1; k = k + 1; if (j == |P|) Found(k); } }

SIMD ¡funcMon ¡ SPMD ¡funcMon ¡ with ¡crev ¡call ¡ within ¡divergent ¡ region ¡

if (P[0] == T[i]) crev memcmp(i); ¡

Naïve ¡parallel ¡approach ¡ CREV ¡

SLIDE 68

FuncMon ¡Call ¡Re-‑VectorizaMon: ¡CREV ¡

2000 4000 6000 8000 4 8 12 16 20 24 28 32

KMP PAR CREV

+42.5 +55.2 +17.0 +25.2 +10.1 +28.5 +5.2 +42.2

17.1

+59.7 +0.0 +8.1

1.8

+33.4

10.2

+50.9

Text ¡length ¡ PaCern ¡length ¡

SLIDE 69

FuncMon ¡Call ¡Re-‑VectorizaMon: ¡CREV ¡

ProperYes ¡of ¡CREV: ¡ ¡

‑

Composability ¡ ¡We ¡are ¡able ¡to ¡nest ¡everywhere ¡blocks: ¡crev ¡can ¡be ¡called ¡recursively! ¡ ¡

‑

MulYplicaYve ¡composiYon ¡ ¡The ¡target ¡crev ¡funcMon ¡runs ¡once ¡per ¡acMve ¡thread. ¡ ¡In ¡a ¡warp ¡of ¡W ¡threads, ¡the ¡funcMon ¡may ¡run ¡up ¡to ¡W ¡Mmes. ¡ ¡If ¡the ¡call ¡is ¡recursive, ¡up ¡to ¡WN ¡Mmes. ¡ ¡

‑

CommutaYvity ¡ ¡There ¡is ¡no ¡predefined ¡order ¡between ¡execuMon ¡of ¡crev’s ¡target ¡funcMon. ¡ ¡

‑

SynchronizaYon ¡parity ¡ ¡SynchronizaMon ¡primiMves ¡remain ¡correct, ¡regardless ¡of ¡the ¡crev ¡nested ¡level. ¡ ¡crev ¡uses ¡a ¡context ¡stack ¡to ¡keep ¡track ¡of ¡divergences. ¡

SLIDE 70

FuncMon ¡Call ¡Re-‑VectorizaMon: ¡CREV ¡

Sequential Parallel Launch CREV Dataset BookFilter

not implemented

8530.990 7857.980 7405.175 bin-‑L20K-‑P16 ¡ String Matching 6649.279

KMP Algorithm

3576.143 393166.268 2737.939 txt-‑256MB-‑P16 ¡ Bellman-Ford 141088.730 493619.688

not implemented

529856.065 erdos-‑renyi ¡ Depth-First Search 3754.101 3786.263

not implemented

3790.444 ¡

ctree-‑D5 ¡

Connected- Component Leader 4054.658 3983.088 5272.919 3984.795

ctree-‑D5 ¡

Quicksort-bitonic 2.871

not implemented

204.278 2.878 int-‑16K ¡ Mergesort-bitonic 7.302

not implemented

104.985 4.114 int-‑16K ¡

ExecuYon ¡Ymes ¡(in ¡millions ¡of ¡cycles): ¡ Datasets: ¡

¡

‑

bin-‑L20K-‑P16: ¡10K ¡strings ¡of ¡0s ¡and ¡1s, ¡each ¡of ¡length ¡20K, ¡and ¡target ¡paCern ¡of ¡length ¡16. ¡

‑

txt-‑256MB-‑P16: ¡256MB ¡in ¡5bi ¡lines ¡from ¡books ¡from ¡Project ¡Gutenberg; ¡target ¡paCern ¡has ¡length ¡16. ¡

‑

erdos-‑renyi: ¡random ¡Erdos-‑Renyi ¡graph ¡with ¡2048 ¡nodes ¡and ¡80% ¡probability ¡of ¡edges. ¡

‑
ctree-‑D5: ¡8-‑ary ¡complete ¡tree ¡of ¡depth ¡5 ¡(root ¡+ ¡five ¡full ¡levels ¡of ¡nodes). ¡
‑

int-‑16K: ¡16K ¡random ¡integers ¡in ¡the ¡range ¡[0, ¡100000). ¡

✪ Fastest; ¡✪ 1st ¡runner ¡up; ¡✪ 2nd ¡runner ¡up. ¡

SLIDE 71

DEPARTMENT ¡OF ¡COMPUTER ¡SCIENCE ¡ UNIVERSIDADE ¡FEDERAL ¡DE ¡MINAS ¡GERAIS ¡ FEDERAL ¡UNIVERSITY ¡OF ¡MINAS ¡GERAIS, ¡BRAZIL ¡

QUESTIONS? ¡

Sergey ¡Prokofiev ¡– ¡Piano ¡sonata ¡no. ¡7 ¡op. ¡83 ¡

Email ¡us: ¡ Rubens ¡Emilio ¡Alves ¡Moreira ¡[rubens@dcc.ufmg.br] ¡ Sylvain ¡Collange ¡[sylvain.collange@inria.fr] ¡ Fernando ¡Magno ¡Quintão ¡Pereira ¡[fernando@dcc.ufmg.br] ¡ ¡ Check ¡our ¡website: ¡ hCp://cuda.dcc.ufmg.br/~swan ¡

Everywhere Blocks for SIMD Programming

DIVERGENCES ¡

And ¡waiMng ¡to ¡process ¡ can ¡be ¡quite ¡costly! ¡

Divergence ¡is ¡ harmful ¡to ¡ performance! ¡

DYNAMIC ¡PARALLELISM ¡

WARP-­‑SYNCHRONOUS ¡PROGRAMMING ¡

SPMD ¡

SIMD ¡

everywhere

FUNCTION ¡CALL ¡RE-­‑VECTORIZATION ¡

QUESTIONS? ¡

WARP-‑SYNCHRONOUS ¡PROGRAMMING ¡

FUNCTION ¡CALL ¡RE-‑VECTORIZATION ¡