Improving Performance of OpenCL on CPUs Ralf Karrenberg - - PowerPoint PPT Presentation

improving performance of opencl on cpus
SMART_READER_LITE
LIVE PREVIEW

Improving Performance of OpenCL on CPUs Ralf Karrenberg - - PowerPoint PPT Presentation

Improving Performance of OpenCL on CPUs Ralf Karrenberg karrenberg@cs.uni-saarland.de Sebastian Hack hack@cs.uni-saarland.de European LLVM Conference, London April 12-13, 2012 1 Data-Parallel Languages: OpenCL __kernel void


slide-1
SLIDE 1

Improving Performance of OpenCL on CPUs

Ralf Karrenberg

karrenberg@cs.uni-saarland.de

Sebastian Hack

hack@cs.uni-saarland.de

European LLVM Conference, London April 12-13, 2012

1

slide-2
SLIDE 2

Data-Parallel Languages: OpenCL

✞ ☎

__kernel void DCT(__global float * output , __global float * input , __global float * dct8x8 , __local float * inter , const uint width , const uint blockWidth , const uint inverse) { uint tidX = get_global_id (0); ... inter[lidY*blockWidth + lidX] = ... barrier( CLK_LOCAL_MEM_FENCE ); float acc = 0.0f; for(uint k=0; k < blockWidth ; k++) { uint index1 = lidX* blockWidth + k; uint index2 = (inverse) ? lidY* blockWidth + k : k* blockWidth + lidY; acc += inter[index1] * dct8x8[index2 ]; }

output[tidY*width + tidX] = acc;

}

✝ ✆

2

slide-3
SLIDE 3

OpenCL: Execution Model

3

slide-4
SLIDE 4

CPU Driver Implementation (2D, Naïve)

✞ ☎

cl_int clEnqueueNDRangeKernel(Kernel scalarKernel, TA argStruct, int* globalSizes, int* localSizes) {
  int groupSizeX = globalSizes[0] / localSizes[0];
  int groupSizeY = globalSizes[1] / localSizes[1];
  // Loop over groups.
  for (int groupX = 0; groupX < groupSizeX; ++groupX) {
    for (int groupY = 0; groupY < groupSizeY; ++groupY) {
      // Loop over threads in group.
      for (int lidY = 0; lidY < localSizes[1]; ++lidY) {
        for (int lidX = 0; lidX < localSizes[0]; ++lidX) {
          scalarKernel(argStruct, lidX, lidY, groupX, groupY, globalSizes, localSizes);
        }
      }
    }
  }
}

✝ ✆

4

slide-5
SLIDE 5

CPU Driver Implementation (2D, Group Kernel)

✞ ☎

cl_int clEnqueueNDRangeKernel(Kernel groupKernel, TA argStruct, int* globalSizes, int* localSizes) {
  int groupSizeX = globalSizes[0] / localSizes[0];
  int groupSizeY = globalSizes[1] / localSizes[1];
  // Loop over groups.
  for (int groupX = 0; groupX < groupSizeX; ++groupX) {
    for (int groupY = 0; groupY < groupSizeY; ++groupY) {
      // Loop over threads in group.
      groupKernel(argStruct, groupX, groupY, globalSizes, localSizes);
    }
  }
}

✝ ✆

5

slide-6
SLIDE 6

CPU Driver Implementation (2D, Group Kernel, OpenMP)

✞ ☎

cl_int clEnqueueNDRangeKernel(Kernel groupKernel, TA argStruct, int* globalSizes, int* localSizes) {
  int groupSizeX = globalSizes[0] / localSizes[0];
  int groupSizeY = globalSizes[1] / localSizes[1];
  #pragma omp parallel for
  for (int groupX = 0; groupX < groupSizeX; ++groupX) {
    for (int groupY = 0; groupY < groupSizeY; ++groupY) {
      // Loop over threads in group.
      groupKernel(argStruct, groupX, groupY, globalSizes, localSizes);
    }
  }
}

✝ ✆

6

slide-7
SLIDE 7

Group Kernel (2D, Scalar)

✞ ☎

void groupKernel (TA argStruct , int* groupIDs , int* globalSizes , int* localSizes) { for (int lidY =0; lidY <localSizes [1]; ++ lidY) { for (int lidX =0; lidX <localSizes [0]; ++ lidX) { scalarKernel (argStruct , lidX , lidY , groupIDs , globalSizes , localSizes ); // to be inlined } } }

✝ ✆

7

slide-8
SLIDE 8

Group Kernel (2D, Scalar, Inlined)

✞ ☎

void groupKernel (TA argStruct , int* groupIDs , int* globalSizes , int* localSizes) { for (int lidY =0; lidY <localSizes [1]; ++ lidY) { for (int lidX =0; lidX <localSizes [0]; ++ lidX) { uint tidX = get_global_id (0); ... inter[lidY*blockWidth + lidX] = ... barrier( CLK_LOCAL_MEM_FENCE ); float acc = 0.0f; for(uint k=0; k < blockWidth ; k++) { uint index1 = lidX* blockWidth + k; uint index2 = (inverse) ? lidY* blockWidth + k : k* blockWidth + lidY; acc += inter[index1] * dct8x8[index2 ]; }

output[tidY*width + tidX] = acc;

} } }

✝ ✆

8

slide-9
SLIDE 9

Group Kernel (2D, Scalar, Inlined, Optimized (1))

✞ ☎

void groupKernel (TA argStruct , int* groupIDs , int* globalSizes , int* localSizes) { for (int lidY =0; lidY <localSizes [1]; ++ lidY) { for (int lidX =0; lidX <localSizes [0]; ++ lidX) { uint tidX = localSizes [0] * groupIDs [0] + lidX; ... inter[lidY*blockWidth + lidX] = ... barrier( CLK_LOCAL_MEM_FENCE ); float acc = 0.0f; for(uint k=0; k < blockWidth ; k++) { uint index1 = lidX* blockWidth + k; uint index2 = (inverse) ? lidY* blockWidth + k : k* blockWidth + lidY; acc += inter[index1] * dct8x8[index2 ]; }

output[tidY*width + tidX] = acc;

} } }

✝ ✆

9

slide-10
SLIDE 10

Group Kernel (2D, Scalar, Inlined, Optimized (1))

✞ ☎

void groupKernel (TA argStruct , int* groupIDs , int* globalSizes , int* localSizes) { for (int lidY =0; lidY <localSizes [1]; ++ lidY) { for (int lidX =0; lidX <localSizes [0]; ++ lidX) { uint tidX = localSizes [0] * groupIDs [0] + lidX; ... inter[lidY*blockWidth + lidX] = ... barrier( CLK_LOCAL_MEM_FENCE ); float acc = 0.0f; for(uint k=0; k < blockWidth ; k++) { uint index1 = lidX* blockWidth + k; uint index2 = (inverse) ? lidY* blockWidth + k : k* blockWidth + lidY; acc += inter[index1] * dct8x8[index2 ]; }

output[tidY*width + tidX] = acc;

} } }

✝ ✆

10

slide-11
SLIDE 11

Group Kernel (2D, Scalar, Inlined, Optimized (2))

✞ ☎

void groupKernel (TA argStruct , int* groupIDs , int* globalSizes , int* localSizes) { for (int lidY =0; lidY <localSizes [1]; ++ lidY) { uint LIC = lidY* blockWidth ; for (int lidX =0; lidX <localSizes [0]; ++ lidX) { uint tidX = localSizes [0] * groupIDs [0] + lidX; ... inter[LIC + lidX] = ... barrier( CLK_LOCAL_MEM_FENCE ); float acc = 0.0f; for(uint k=0; k < blockWidth ; k++) { uint index1 = lidX* blockWidth + k; uint index2 = (inverse) ? LIC + k : k* blockWidth + lidY; acc += inter[index1] * dct8x8[index2 ]; }

output[tidY*width + tidX] = acc;

} } }

✝ ✆

11

slide-12
SLIDE 12

Barrier Synchronization

✞ ☎

void groupKernel (TA argStruct , int* groupIDs , int* globalSizes , int* localSizes) { for (int lidY =0; lidY <localSizes [1]; ++ lidY) { uint LIC = lidY* blockWidth ; for (int lidX =0; lidX <localSizes [0]; ++ lidX) { uint tidX = localSizes [0] * groupIDs [0] + lidX; ... inter[LIC + lidX] = ... barrier( CLK_LOCAL_MEM_FENCE ); float acc = 0.0f; for(uint k=0; k < blockWidth ; k++) { uint index1 = lidX* blockWidth + k; uint index2 = (inverse) ? LIC + k : k* blockWidth + lidY; acc += inter[index1] * dct8x8[index2 ]; }

output[tidY*width + tidX] = acc;

} } }

✝ ✆

12

slide-13
SLIDE 13

Barrier Synchronization: Example

a b c d e

13

slide-14
SLIDE 14

Barrier Synchronization: Example

a b c d e a1 a2 b c1 c2 d1 d2 e

13

slide-15
SLIDE 15

Barrier Synchronization: Example

a b c d e a1 a2 b c1 c2 d1 d2 e a1

F1

next: F2

13

slide-16
SLIDE 16

Barrier Synchronization: Example

a b c d e a1 a2 b c1 c2 d1 d2 e a1 a2 b c1

F1

next: F2

F2

next: F3

13

slide-17
SLIDE 17

Barrier Synchronization: Example

a b c d e a1 a2 b c1 c2 d1 d2 e a1 a2 b c1 c2 d1

F1

next: F2

F2

next: F3

F3

next: F4

13

slide-18
SLIDE 18

Barrier Synchronization: Example

a b c d e a1 a2 b c1 c2 d1 d2 e a1 a2 b c1 c2 d2 d1 b e c1

F1

next: F2

F2

next: F3

F3

next: F4

F4

next: F3 return

13

slide-19
SLIDE 19

Group Kernel (1D, Scalar, Barrier Synchronization)

✞ ☎

void groupKernel (TA argStruct , int groupID , int globalSizes , int localSize , ...) { void* data[localSize] = alloc(localSize* liveValSize ); int next = BARRIER_BEGIN ; while (true) { switch (next) { case BARRIER_BEGIN : for (int i=0; i<localSize; ++i) next = F1(argStruct , tid , ..., &data[i]); // B2 break; ... case B4: for (int i=0; i<localSize; ++i) next = F4(tid , ..., &data[i]); // B3 or END break; case BARRIER_END : return; } } }

✝ ✆

14

slide-20
SLIDE 20

OpenCL: Exploiting Parallelism on CPUs

CPU (1 core): All threads run sequentially 1 . . . 14 15 CPU (4 cores): Each core executes 1 thread 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15

15

slide-21
SLIDE 21

OpenCL: Exploiting Parallelism on CPUs

CPU (1 core): All threads run sequentially 1 . . . 14 15 CPU (4 cores): Each core executes 1 thread 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 CPU (4 cores, SIMD width 4): Each core executes 4 threads 1 2 3 4 5 6 7 8 . . . 11 12 . . . 15

15

slide-22
SLIDE 22

OpenCL: Exploiting Parallelism on CPUs

CPU (1 core): All threads run sequentially 1 . . . 14 15 CPU (4 cores): Each core executes 1 thread 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 CPU (4 cores, SIMD width 4): Each core executes 4 threads 1 2 3 4 5 6 7 8 . . . 11 12 . . . 15

15

slide-23
SLIDE 23

Group Kernel (2D, SIMD)

✞ ☎

void groupKernel (TA argStruct , int* groupIDs , int* globalSizes , int* localSizes) { for (int lidY =0; lidY <localSizes [1]; ++ lidY) { for (int lidX =0; lidX <localSizes [0]; lidX +=4) { __m128i lidXV = <lidX ,lidX+1,lidX+2,lidX +3>; simdKernel (argStruct , lidXV , lidY , groupIDs , globalSizes , localSizes ); // to be inlined } } }

✝ ✆

Whole-Function Vectorization (WFV) of kernel code New kernel computes 4 “threads” at once using SIMD instruction set Challenge: diverging control flow

16

slide-24
SLIDE 24

Diverging Control Flow

a b c d e f Thread Trace 1 a b c e f 2 a b d e f 3 a b c e b c e f 4 a b c e b d e f Different threads execute different code paths

17

slide-25
SLIDE 25

Diverging Control Flow

a b c d e f a b c d e f Thread Trace 1 a b c d e b c d e f 2 a b c d e b c d e f 3 a b c d e b c d e f 4 a b c d e b c d e f Different threads execute different code paths Execute everything, mask out results of inactive threads (using predication, blending) Control flow to data flow conversion on ASTs [Allen et al. POPL’83] Whole-Function Vectorization on SSA CFGs [K & H CGO’11]

17

slide-26
SLIDE 26

Diverging Control Flow

a b c d e f a b c d e f Thread Trace 1 a b c d e b c d e f 2 a b c d e b c d e f 3 a b c d e b c d e f 4 a b c d e b c d e f Overhead for maintaining & updating of predicates Overhead for operations with side-effects (e.g. load/store/call) Expensive but rarely executed paths are now always executed Linearization increases register pressure ☞ more spilling Works well for kernels with mostly straight-line code

17

slide-27
SLIDE 27

DCT Kernel: Non-Divergent Control Flow

✞ ☎

__kernel void DCT(__global float * output , __global float * input , __global float * dct8x8 , __local float * inter , const uint width , const uint blockWidth , const uint inverse) { uint tidX = get_global_id (0); ... inter[lidY*blockWidth + lidX] = ... barrier( CLK_LOCAL_MEM_FENCE ); float acc = 0.0f; for(uint k=0; k < blockWidth ; k++) { uint index1 = lidX* blockWidth + k; uint index2 = (inverse) ? lidY* blockWidth + k : k* blockWidth + lidY; acc += inter[index1] * dct8x8[index2 ]; }

output[tidY*width + tidX] = acc;

} // Compiled to LLVM bitcode.

✝ ✆

18

slide-28
SLIDE 28

Non-Divergent Control Flow

Idea: optimize cases where threads do not diverge a b c d e f a b c d e f Thread Trace 1 a b c e f 2 a b c e f 3 a b c e b d e f 4 a b c e b d e f

19

slide-29
SLIDE 29

Non-Divergent Control Flow

Idea: optimize cases where threads do not diverge a b c d e f a b c d e f Thread Trace 1 a b c e b d e f 2 a b c e b d e f 3 a b c e b d e f 4 a b c e b d e f

19

slide-30
SLIDE 30

Non-Divergent Control Flow

Idea: optimize cases where threads do not diverge a b c d e f a b c d e f Thread Trace 1 a b c e b d e f 2 a b c e b d e f 3 a b c e b d e f 4 a b c e b d e f Option 1: Insert dynamic predicate-tests & branches to skip paths

◮ “Branch on superword condition code” (BOSCC) [Shin et al. PACT’07] ◮ Additional overhead for dynamic test ◮ Does not help against increased register pressure 19

slide-31
SLIDE 31

Non-Divergent Control Flow

Idea: optimize cases where threads do not diverge a b c d e f a b c d e f

u v

Thread Trace 1 a b c e b d e f 2 a b c e b d e f 3 a b c e b d e f 4 a b c e b d e f Option 2: Statically prove non-divergence of certain blocks

◮ Non-divergent blocks can be excluded from linearization ◮ Less executed code, less register pressure ◮ More conservative than dynamic test ☞ exploit both! 19

slide-32
SLIDE 32

Uniform/Varying Branches

b u Either all threads entering b go left or right

if ( blockWidth % 2 == 0) { ... } for(uint k=0; k < blockWidth ; k++) { ... }

b v From the p+q threads entering b, p go left, q go right

if (tid % 2 == 0) { ... } for(uint k=0; k < tid; k++) { ... }

20

slide-33
SLIDE 33

When does a block diverge?

Informally

a b c d e f g

v v u

A block b is divergent if: b might execute less (not provably 0) threads than its predecessor. That is: it is a successor of a varying branch Two disjoint paths from the same varying branch rejoin at b (Additional criterion for loops)

21

slide-34
SLIDE 34

CFG Linearization w/ Non-Divergent Blocks: Example

a b c d e f g

v v u (a)

(a) Original CFG

22

slide-35
SLIDE 35

CFG Linearization w/ Non-Divergent Blocks: Example

a b c d e f g

v v u (a)

a c e b d f g

(b)

(a) Original CFG (b) Topological order (by data dependencies)

22

slide-36
SLIDE 36

CFG Linearization w/ Non-Divergent Blocks: Example

a b c d e f g

v v u (a)

a c e b d f g

(b)

a c e b d f g

(c)

(a) Original CFG (b) Topological order (by data dependencies) (c) Naïve: Rewire all edges to next block ☞ all blocks are always executed

22

slide-37
SLIDE 37

CFG Linearization w/ Non-Divergent Blocks: Example

a b c d e f g

v v u (a)

a c e b d f g

(b)

a c e b d f g

(c)

a c b e d f g

(d)

(a) Original CFG (b) Topological order (by data dependencies) (c) Naïve: Rewire all edges to next block ☞ all blocks are always executed (d) Invalid: Edges to/from non-divergent block remain ☞ b and d can be skipped

22

slide-38
SLIDE 38

CFG Linearization w/ Non-Divergent Blocks: Example

a b c d e f g

v v u (a)

a c e b d f g

(b)

a c e b d f g

(c)

a c b e d f g

(d)

a c b e d f g

(e)

(a) Original CFG (b) Topological order (by data dependencies) (c) Naïve: Rewire all edges to next block ☞ all blocks are always executed (d) Invalid: Edges to/from non-divergent block remain ☞ b and d can be skipped (e) Valid: Rewire edges to divergent blocks to next in list ☞ only e can be skipped

22

slide-39
SLIDE 39

Evaluation I: WFV vs. Sequential Execution Comparison

Application Naïve UniVal BOSCC UniCF BitonicSort 3.0 3.2 3.3 3.2 BlackScholes 3.9 4.1 4.1 4.1 DCT 0.67 0.85 0.85 1.78 FastWalshTransform 0.74 0.73 0.73 0.73 FloydWarshall 0.11 0.12 0.13 0.12 Histogram 0.92 1.08 1.07 1.24 Mandelbrot 0.51 2.4 2.4 2.4 MatrixTranspose 0.97 1.44 1.44 1.44 NBody 1.8 2.67 2.67 3.64 AVG 1.4 1.84 1.85 2.07

SIMD width 4, median of 100 iterations, no warm-up, confidence level 95%

23

slide-40
SLIDE 40

Evaluation I: WFV vs. Sequential Execution Comparison

Application Naïve UniVal BOSCC UniCF BitonicSort 3.0 3.2 3.3 3.2 BlackScholes 3.9 4.1 4.1 4.1 DCT 0.67 0.85 0.85 1.78 FastWalshTransform 0.74 0.73 0.73 0.73 FloydWarshall 0.11 0.12 0.13 0.12 Histogram 0.92 1.08 1.07 1.24 Mandelbrot 0.51 2.4 2.4 2.4 MatrixTranspose 0.97 1.44 1.44 1.44 NBody 1.8 2.67 2.67 3.64 AVG 1.4 1.84 1.85 2.07

SIMD width 4, median of 100 iterations, no warm-up, confidence level 95% Naïve WFV is often inferior to sequential execution

23

slide-41
SLIDE 41

Evaluation I: WFV vs. Sequential Execution Comparison

Application Naïve UniVal BOSCC UniCF BitonicSort 3.0 3.2 3.3 3.2 BlackScholes 3.9 4.1 4.1 4.1 DCT 0.67 0.85 0.85 1.78 FastWalshTransform 0.74 0.73 0.73 0.73 FloydWarshall 0.11 0.12 0.13 0.12 Histogram 0.92 1.08 1.07 1.24 Mandelbrot 0.51 2.4 2.4 2.4 MatrixTranspose 0.97 1.44 1.44 1.44 NBody 1.8 2.67 2.67 3.64 AVG 1.4 1.84 1.85 2.07

SIMD width 4, median of 100 iterations, no warm-up, confidence level 95% Naïve WFV is often inferior to sequential execution Dynamic analysis (BOSCC) has almost no effect for these benchmarks

23

slide-42
SLIDE 42

Evaluation I: WFV vs. Sequential Execution Comparison

Application Naïve UniVal BOSCC UniCF BitonicSort 3.0 3.2 3.3 3.2 BlackScholes 3.9 4.1 4.1 4.1 DCT 0.67 0.85 0.85 1.78 FastWalshTransform 0.74 0.73 0.73 0.73 FloydWarshall 0.11 0.12 0.13 0.12 Histogram 0.92 1.08 1.07 1.24 Mandelbrot 0.51 2.4 2.4 2.4 MatrixTranspose 0.97 1.44 1.44 1.44 NBody 1.8 2.67 2.67 3.64 AVG 1.4 1.84 1.85 2.07

SIMD width 4, median of 100 iterations, no warm-up, confidence level 95% Naïve WFV is often inferior to sequential execution Dynamic analysis (BOSCC) has almost no effect for these benchmarks Static analysis (UniCF) is beneficial for suitable kernels

23

slide-43
SLIDE 43

Evaluation I: WFV vs. Sequential Execution Comparison

Application Naïve UniVal BOSCC UniCF BitonicSort 3.0 3.2 3.3 3.2 BlackScholes 3.9 4.1 4.1 4.1 DCT 0.67 0.85 0.85 1.78 FastWalshTransform 0.74 0.73 0.73 0.73 FloydWarshall 0.11 0.12 0.13 0.12 Histogram 0.92 1.08 1.07 1.24 Mandelbrot 0.51 2.4 2.4 2.4 MatrixTranspose 0.97 1.44 1.44 1.44 NBody 1.8 2.67 2.67 3.64 AVG 1.4 1.84 1.85 2.07

SIMD width 4, median of 100 iterations, no warm-up, confidence level 95% Naïve WFV is often inferior to sequential execution Dynamic analysis (BOSCC) has almost no effect for these benchmarks Static analysis (UniCF) is beneficial for suitable kernels Kernels dominated by random memory access are not suited for WFV

23

slide-44
SLIDE 44

Evaluation II: WFVOpenCL vs. Intel/AMD (milliseconds)

Application WFVOpenCL Intel AMD Speedup vs Intel BitonicSort 164 1,170 47,271 7.13× BlackScholes 241 329 717 1.37× DCT 201 350 693 1.74× FastWalshTransform 4,944 6,661 8,601 1.35× FloydWarshall 934(148*) 525* 471 0.56×(3.55×*) Histogram 387 1,178 527 3.07× Mandelbrot 632 1,930 29,045 3.05× MatrixTranspose 1,072 2,933 10,748 2.74× NBody 343 676 1,253 1.97×

4 cores, SIMD width 4, median of 100 iterations, no warm-up, confidence level 95% Intel OpenCL SDK v1.1 / AMD APP SDK v2.5 Average speedup: 2.5× (Intel), 40× (AMD) *WFV disabled – Intel driver does not vectorize FloydWarshall

24

slide-45
SLIDE 45

LLVM: Benefits and Drawbacks

We heavily rely on JIT code generator ☞ no disappointment! LLVM IR allows convenient expression of vector computations

◮ Vector-select and type legalization

MOVMASK still requires an intrinsic Would be great: a way to express predication in IR

25

slide-46
SLIDE 46

Outlook

More optimizations for WFV Integration of WFV into LLVM mainline?

◮ Should integrate nicely with Hal’s BasicBlock vectorization ◮ Combine with loop dependency analysis / Polly for “classic” loop

vectorization

Support for architectures w/ predicated execution (e.g. LRBni)

26

slide-47
SLIDE 47

Conclusion

OpenCL benefits from “group kernel”-based implementation:

◮ Optimize uniform expressions & access to tid etc. ◮ Enable continuation-based barrier synchronization

OpenCL benefits from both multi-threading and WFV on CPUs Divergence analysis improves WFV:

◮ Reduce amount of executed code ◮ Reduce register pressure ◮ Reduce overhead for maintaining & updating of predicates

Evaluation shows importance of advanced vectorization techniques Sources available: https://github.com/karrenberg

27

slide-48
SLIDE 48

Conclusion

OpenCL benefits from “group kernel”-based implementation:

◮ Optimize uniform expressions & access to tid etc. ◮ Enable continuation-based barrier synchronization

OpenCL benefits from both multi-threading and WFV on CPUs Divergence analysis improves WFV:

◮ Reduce amount of executed code ◮ Reduce register pressure ◮ Reduce overhead for maintaining & updating of predicates

Evaluation shows importance of advanced vectorization techniques Sources available: https://github.com/karrenberg

Thank You!

Questions?

27

slide-49
SLIDE 49

28

slide-50
SLIDE 50

CFG Linearization w/ Non-Divergent Blocks

Combine divergent blocks to divergent regions with DFS:

◮ Non-uniform branch found: create new region, set as active ◮ Post-dominator of region found: finish region, set last unfinished one as

active

◮ Add divergent blocks to active region ◮ Merge overlapping regions

Linearize regions recursively (inner before outer regions):

◮ Order blocks topologically by data dependencies (inner regions treated

as single blocks)

◮ Schedule blocks in this order by visiting all outgoing edges: ⋆ Rewire all edges that target a divergent block ⋆ New target: next divergent, unscheduled block of region 29

slide-51
SLIDE 51

CFG Linearization w/ Non-Divergent Blocks: Example

a b c d e f g

v v u (a)

(a) Original CFG

30

slide-52
SLIDE 52

CFG Linearization w/ Non-Divergent Blocks: Example

a b c d e f g

v v u (a)

a c e b d f g

(b)

(a) Original CFG (b) Topological order (by data dependencies)

30

slide-53
SLIDE 53

CFG Linearization w/ Non-Divergent Blocks: Example

a b c d e f g

v v u (a)

a c e b d f g

(b)

a c e b d f g

(c)

(a) Original CFG (b) Topological order (by data dependencies) (c) Naïve: Rewire all edges to next block ☞ all blocks are always executed

30

slide-54
SLIDE 54

CFG Linearization w/ Non-Divergent Blocks: Example

a b c d e f g

v v u (a)

a c e b d f g

(b)

a c e b d f g

(c)

a c b e d f g

(d)

(a) Original CFG (b) Topological order (by data dependencies) (c) Naïve: Rewire all edges to next block ☞ all blocks are always executed (d) Invalid: Edges to/from non-divergent block remain ☞ b and d can be skipped

30

slide-55
SLIDE 55

CFG Linearization w/ Non-Divergent Blocks: Example

a b c d e f g

v v u (a)

a c e b d f g

(b)

a c e b d f g

(c)

a c b e d f g

(d)

a c b e d f g

(e)

(a) Original CFG (b) Topological order (by data dependencies) (c) Naïve: Rewire all edges to next block ☞ all blocks are always executed (d) Invalid: Edges to/from non-divergent block remain ☞ b and d can be skipped (e) Valid: Rewire edges to divergent blocks to next in list ☞ only e can be skipped

30

slide-56
SLIDE 56

Examples

a b c d e f g h i

u v u u

a b c e f j

v u u u u

31

slide-57
SLIDE 57

Retaining Control Flow: Complex Example

a b c d e f g h i

u v u u

32

slide-58
SLIDE 58

Retaining Control Flow: Complex Example

a b c d e f g h i

u v u u

a b c d e f g h i

u v u u

32

slide-59
SLIDE 59

Retaining Control Flow: Complex Example

a b c d e f g h i

u v u u

a b c d e f g h i

u v u u

a b c d e f g h i

32

slide-60
SLIDE 60

Retaining Control Flow: Loop Example

a b c d e f h i

u u v

33

slide-61
SLIDE 61

Retaining Control Flow: Loop Example

a b c d e f h i

u u v

a b c d e f h i

u u v

33

slide-62
SLIDE 62

Retaining Control Flow: Loop Example

a b c d e f h i

u u v

a b c d e f h i

u u v

a b c d e f h i

u u v

33