Advances in Loop Analysis Frameworks and Optimizations
Adam Nemet & Michael Zolotukhin Apple
Advances in Loop Analysis Frameworks and Optimizations Adam Nemet - - PowerPoint PPT Presentation
Advances in Loop Analysis Frameworks and Optimizations Adam Nemet & Michael Zolotukhin Apple Loop Unrolling for (x = 0; x < 6; x++) { foo(x); } Loop Unrolling for (x = 0; x < 6; x += 2) { for (x = 0; x < 6; x++) { foo(x);
Adam Nemet & Michael Zolotukhin Apple
foo(x); } for (x = 0; x < 6; x++) {
foo(x); foo(x + 1); } for (x = 0; x < 6; x += 2) { for (x = 0; x < 6; x++) {
foo(x); foo(x + 1); } for (x = 0; x < 6; x += 2) { for (x = 0; x < 6; x++) { foo(x + 2); foo(x + 3); foo(x + 5); foo(x + 4); {
+ Removes loop overhead + Enables other optimizations – Increases code size – Increases compile time – Might regress performance
r += a[i] * b[i]; r += a[i] * b[i]; r += a[i] * b[i];
const int b[50] = {1, 0, 0, …, 0, 0}; int foo(int *a) { int r = 0; for (int i = 0; i < 50; i++) { r += a[i] * b[i]; } return r; }
const int b[50] = {1, 0, 0, …, 0, 0}; int foo(int *a) { int r = 0; r += a[0] * b[0]; r += a[1] * b[1]; … r += a[48] * b[48]; r += a[49] * b[49]; return r; }
const int b[50] = {1, 0, 0, …, 0, 0}; int foo(int *a) { int r = 0; r += a[0] * 1; r += a[1] * 0; … r += a[48] * 0; r += a[49] * 0; return r; }
const int b[50] = {1, 0, 0, …, 0, 0}; int foo(int *a) { return a[0]; }
instruction, iteration by iteration
instruction
its unrolled version
Iteration 0
%r = 0 loop: %y = b[i] %x = a[i] %t = %x * %y %r = %r + %t %i = %i + 1 %cmp = %i < 50 br %cmp, loop, exit exit: ret %r
Original loop cost Unrolled loop cost
Iteration 0
%r = 0 loop: %y = b[i] %x = a[i] %t = %x * %y %r = %r + %t %i = %i + 1 %cmp = %i < 50 br %cmp, loop, exit exit: ret %r = 1
Original loop cost Unrolled loop cost
Iteration 0
%r = 0 loop: %y = b[i] %x = a[i] %t = %x * %y %r = %r + %t %i = %i + 1 %cmp = %i < 50 br %cmp, loop, exit exit: ret %r = 1
Original loop cost Unrolled loop cost
Iteration 0
%r = 0 loop: %y = b[i] %x = a[i] %t = %x * %y %r = %r + %t %i = %i + 1 %cmp = %i < 50 br %cmp, loop, exit exit: ret %r = 1 = %x
Original loop cost Unrolled loop cost
Iteration 0
%r = 0 loop: %y = b[i] %x = a[i] %t = %x * %y %r = %r + %t %i = %i + 1 %cmp = %i < 50 br %cmp, loop, exit exit: ret %r = 1 = %x = %t
Original loop cost Unrolled loop cost
Iteration 0
%r = 0 loop: %y = b[i] %x = a[i] %t = %x * %y %r = %r + %t %i = %i + 1 %cmp = %i < 50 br %cmp, loop, exit exit: ret %r = 1 = %x = %t = 1
Original loop cost Unrolled loop cost
Iteration 0
%r = 0 loop: %y = b[i] %x = a[i] %t = %x * %y %r = %r + %t %i = %i + 1 %cmp = %i < 50 br %cmp, loop, exit exit: ret %r = 1 = %x = %t = 1 = true
Original loop cost Unrolled loop cost
Iteration 0
%r = 0 loop: %y = b[i] %x = a[i] %t = %x * %y %r = %r + %t %i = %i + 1 %cmp = %i < 50 br %cmp, loop, exit exit: ret %r = 1 = %x = %t = 1 = true
Original loop cost Unrolled loop cost
Iteration 1
%r = 0 loop: %y = b[i] %x = a[i] %t = %x * %y %r = %r + %t %i = %i + 1 %cmp = %i < 50 br %cmp, loop, exit exit: ret %r
Original loop cost Unrolled loop cost
Iteration 1
%r = 0 loop: %y = b[i] %x = a[i] %t = %x * %y %r = %r + %t %i = %i + 1 %cmp = %i < 50 br %cmp, loop, exit exit: ret %r = 0 = 0 = 2 = true = %r
Original loop cost Unrolled loop cost
%r = 0 loop: %y = b[i] %x = a[i] %t = %x * %y %r = %r + %t %i = %i + 1 %cmp = %i < 50 br %cmp, loop, exit exit: ret %r
Original loop cost Unrolled loop cost
Iteration 49
Original loop cost Unrolled loop cost
Execution speed-up
Unroll Do not unroll Unroll Do not unroll
investigating compile time regressions
General Optimizations
Loop Transformations Case Study
General Optimizations
Loop Transformations 456.hmmer from SPECint 2006
for (k = 1; k <= M; k++) { mc[k] = mpp[k-1] + tpmm[k-1]; if ((sc = ip[k-1] + tpim[k-1]) > mc[k]) mc[k] = sc; if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k]) mc[k] = sc; if ((sc = xmb + bp[k]) > mc[k]) mc[k] = sc; mc[k] += ms[k]; if (mc[k] < -INFTY) mc[k] = -INFTY; dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY; if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; } }
for (k = 1; k <= M; k++) { mc[k] = mpp[k-1] + tpmm[k-1]; if ((sc = ip[k-1] + tpim[k-1]) > mc[k]) mc[k] = sc; if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k]) mc[k] = sc; if ((sc = xmb + bp[k]) > mc[k]) mc[k] = sc; mc[k] += ms[k]; if (mc[k] < -INFTY) mc[k] = -INFTY; dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY; if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; } }
mc[k] = mpp[k-1] + tpmm[k-1]; if ((sc = ip[k-1] + tpim[k-1]) > mc[k]) mc[k] = sc; if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k]) mc[k] = sc; if ((sc = xmb + bp[k]) > mc[k]) mc[k] = sc; mc[k] += ms[k]; if (mc[k] < -INFTY) mc[k] = -INFTY;
mc[k] = mpp[k-1] + tpmm[k-1]; if ((sc = ip[k-1] + tpim[k-1]) > mc[k]) mc[k] = sc; if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k]) mc[k] = sc; if ((sc = xmb + bp[k]) > mc[k]) mc[k] = sc; mc[k] += ms[k]; if (mc[k] < -INFTY) mc[k] = -INFTY;
dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY;
dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY;
if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; }
if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; }
for (k = 1; k <= M; k++) { mc[k] = mpp[k-1] + tpmm[k-1]; if ((sc = ip[k-1] + tpim[k-1]) > mc[k]) mc[k] = sc; if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k]) mc[k] = sc; if ((sc = xmb + bp[k]) > mc[k]) mc[k] = sc; mc[k] += ms[k]; if (mc[k] < -INFTY) mc[k] = -INFTY; dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY; } if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; sk}
for (k = 1; k <= M; k++) { mc[k] = mpp[k-1] + tpmm[k-1]; if ((sc = ip[k-1] + tpim[k-1]) > mc[k]) mc[k] = sc; if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k]) mc[k] = sc; if ((sc = xmb + bp[k]) > mc[k]) mc[k] = sc; mc[k] += ms[k]; if (mc[k] < -INFTY) mc[k] = -INFTY; dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY; } if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; sk}
dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY;
dc[k] = dc[k-1] + tpdd[k-1];
t = dc[k-1] + tpdd[k-1]; dc[k] = t;
t = dc[k-1] + tpdd[k-1]; dc[k] = t;
t = dc[k-1] + tpdd[k-1]; dc[k] = t;
Iteration K+1: Iteration K:
t2 = dc[k] + tpdd[k]; dc[k+1] = t2; dc[k] = t; t = dc[k-1] + tpdd[k-1];
Iteration K+1: Iteration K:
t2 = dc[k] + tpdd[k]; dc[k+1] = t2; dc[k] = t; t = dc[k-1] + tpdd[k-1];
if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY; dc[k] = = dc[k-1] + tpdd[k-1];
for (k = 1; k <= M; k++) { mc[k] = mpp[k-1] + tpmm[k-1]; if ((sc = ip[k-1] + tpim[k-1]) > mc[k]) mc[k] = sc; if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k]) mc[k] = sc; if ((sc = xmb + bp[k]) > mc[k]) mc[k] = sc; mc[k] += ms[k]; if (mc[k] < -INFTY) mc[k] = -INFTY; dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY; } if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; sk}
for (k = 1; k <= M; k++) { mc[k] = mpp[k-1] + tpmm[k-1]; if ((sc = ip[k-1] + tpim[k-1]) > mc[k]) mc[k] = sc; if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k]) mc[k] = sc; if ((sc = xmb + bp[k]) > mc[k]) mc[k] = sc; mc[k] += ms[k]; if (mc[k] < -INFTY) mc[k] = -INFTY; dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY; } if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; sk}
if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; }
if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; }
for (k = 1; k <= M; k++) { mc[k] = mpp[k-1] + tpmm[k-1]; if ((sc = ip[k-1] + tpim[k-1]) > mc[k]) mc[k] = sc; if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k]) mc[k] = sc; if ((sc = xmb + bp[k]) > mc[k]) mc[k] = sc; mc[k] += ms[k]; if (mc[k] < -INFTY) mc[k] = -INFTY; dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY; } if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; sk}
for (k = 1; k <= M; k++) { mc[k] = mpp[k-1] + tpmm[k-1]; if ((sc = ip[k-1] + tpim[k-1]) > mc[k]) mc[k] = sc; if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k]) mc[k] = sc; if ((sc = xmb + bp[k]) > mc[k]) mc[k] = sc; mc[k] += ms[k]; if (mc[k] < -INFTY) mc[k] = -INFTY; dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY; } if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; sk}
} for (k = 1; k <= M; k++) {
+ Partial loop vectorization + Improve memory access pattern:
+ Reduce spilling
for (k = 1; k <= M; k++) { mc[k] = mpp[k-1] + tpmm[k-1]; if ((sc = ip[k-1] + tpim[k-1]) > mc[k]) mc[k] = sc; if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k]) mc[k] = sc; if ((sc = xmb + bp[k]) > mc[k]) mc[k] = sc; mc[k] += ms[k]; if (mc[k] < -INFTY) mc[k] = -INFTY; } for (k = 1; k <= M; k++) { dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY; if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; } }
Loop Dependence Analysis Run-time Alias Checks
st 2 ld 8 st 10 mul 1 mul 9 ld 3 st 4 st 7 ld 5 add 6
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6 dup of mul 1
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6 dup of mul 1
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6 dup of mul 1
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6 dup of mul 1
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6 dup of mul 1
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6 dup of mul 1 dup of ld 3
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6 dup of mul 1
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6 dup of mul 1
st 2
ld 3 st 4 ld 8 st 10 mul 1 st 7 ld 5 mul 9 add 6 dup of mul 1
for (k = 1; k <= M; k++) { mc[k] = mpp[k-1] + tpmm[k-1]; if ((sc = ip[k-1] + tpim[k-1]) > mc[k]) mc[k] = sc; if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k]) mc[k] = sc; if ((sc = xmb + bp[k]) > mc[k]) mc[k] = sc; mc[k] += ms[k]; if (mc[k] < -INFTY) mc[k] = -INFTY; dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY; } if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; sk}
} for (k = 1; k <= M; k++) {
} for (k = 1; k <= M; k++) { dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY; if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; sk}
dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY;
Load Load Add Cmp Csel Cmp Csel Load Load Add Store DC[k] DC[k-1] —>
Load Load Add Cmp Csel Cmp Csel Load Load Add HW st -> ld forwarding Store
Load Load Add Cmp Csel Cmp Csel Load Load Add HW st -> ld forwarding SW st -> ld forwarding Store
dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY;
distance of one
= sc;
} for (k = 1; k <= M; k++) { if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; sk} dc[k] = = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = if (dc[k] < -INFTY) dc[k] = = -INFTY;
} for (k = 1; k <= M; k++) { if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; sk} dc[k] = = dc[k-1] + tpdd[k-1]; = sc; if (dc[k] < -INFTY) dc[k] = = -INFTY; T if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = T T
} for (k = 1; k <= M; k++) { if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; sk} dc[k] = = T + tpdd[k-1]; = sc; if (dc[k] < -INFTY) dc[k] = = -INFTY; T if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = T T
} for (k = 1; k <= M; k++) { if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; sk} dc[k] = = T + tpdd[k-1]; = sc; if (dc[k] < -INFTY) dc[k] = = -INFTY; T if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = T T T = dc[0];
} if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; sk} for (k = 1; k <= M; k++) { dc[k] = = T + tpdd[k-1]; = sc; if (dc[k] < -INFTY) dc[k] = = -INFTY; T if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = T T T = dc[0];
about loop cases
Graph
InnerLoopVectorizer::canVectorizeMemory()
Memory Dependence Checker
Access Analysis Runtime Pointer Check
areDepsSafe(CheckDeps, DepCands) Memory Accesses processMemAccesses() canCheckPtrAtRT() Pointers
LoopAccessInfo
Memory Dependence Checker
Access Analysis Runtime Pointer Check
areDepsSafe(CheckDeps, DepCands) Memory Accesses processMemAccesses() canCheckPtrAtRT() Pointers
getChecks() getDependences() canVectorizeMemory()
LoopAccessAnalysis
LoopAccessInfo
Mem
Acc Run
areDepsSafe( Memo process canCh P
n t e r get getDe canVect
LoopAccessInfo
Mem
Acc Run
areDepsSafe( Memo process canCh P
n t e r get getDe canVect
LoopAccessInfo
Mem
Acc Run
areDepsSafe( Memo process canCh P
n t e r get getDe canVect
LoopAccessInfo
Mem
Acc Run
areDepsSafe( Memo process canCh P
n t e r get getDe canVect getInfo(Loop)
Loop Alias Checks Original Loop
LoopAccessInfo
Me
Acc Run
areDepsSafe( Memo process canCh P
n t e r
getChecks()
Loop
Loop 2 Loop 1
Loop 2 Distribution Checks Undistributed Loop Loop 1
Vectorized Loop 2 Distribution Checks Undistributed Loop Vectorization Check Scalar Loop 2 Loop 1
Vectorized Loop 2 Distribution Checks Undistributed Loop Vectorization Check Scalar Loop 2 Loop 1
+ Metadata
Vectorized Loop 2 Distribution Checks Undistributed Loop Vectorization Check Scalar Loop 2 Loop 1
+ Metadata
Vectorized Loop 2 Distribution Checks Undistributed Loop Vectorization Check Loop 1
+ Metadata
Vectorized Loop 2 Distribution Checks Undistributed Loop Vectorization Check Loop 1
+ Metadata
Vectorized Loop 2 Distribution + Vectorization Checks Undistributed Loop Loop 1
+ Metadata