. .
E0358
Uday Kumar Reddy B
uday@csa.iisc.ernet.in Dept of CSA, Indian Institute of Science, Bangalore, India
A course on advanced compilation at Dept of CSA IISc
1/104
E0358 . Uday Kumar Reddy B uday@csa.iisc.ernet.in Dept of CSA, - - PowerPoint PPT Presentation
. E0358 . Uday Kumar Reddy B uday@csa.iisc.ernet.in Dept of CSA, Indian Institute of Science, Bangalore, India A course on advanced compilation at Dept of CSA IISc 1/104 RESEARCH IN PROGRAMMING AND COMPILER TECHNOLOGIES Current: C,
1/104
2/104
2/104
1
2
3/104
4/104
5/104
6/104
7/104
8/104
8/104
1
9/104
1
2
9/104
1
2
3
9/104
1
2
3
9/104
(C) Bernie Saunders, CC BY-NC-ND 3.0
for (i = 0; i <= 2; i++) for (j = 2; j <= (R + 1); j++) for (k = 0; (k <= (C + 3)); k++)
blurx[i][j-2][k] = img[i][j-2][k]*0.0625f + img[i][j-1][k]*0.25f + img[i][j][k]*0.375f + img[i][j+1][k]*0.25f + img[i][j+2][k]*0.0625f;
for (i = 0; (i <= 2); i++) for (j = 2; (j <= (R + 1)); j++) for (k = 2; (k <= (C + 1)); k++)
blury[i][j][k-2] = blurx[i][j-2][k-2]*0.0625f + blurx[i][j-2][k-1]*0.25f + blurx[i][j-2][k]*0.375f + blurx[i][j-2][k+1]*0.25f + blurx[i][j-2][k+2]*0.0625f;
for (i = 0; (i <= 2); i++) for (j = 2; (j <= (R + 1)); j++) for (k = 2; (k <= (C + 1)); k++)
sharpen[i][j][k-2] = img[i][j][k]*(1 + weight) + blury[i][j-2][k-2]*(-weight);
for (i = 0; i <= 2; i++) for (j = 2; j <= R + 1; j++) for (k = 2; k <= C + 1; k++) {
_ct0 = img[i][j][k]; _ct1 = sharpen[i][j-2][k-2]; _ct2 = (std::abs((img[i][j][k] - blury[i][j-2][k-2])) < threshold)? _ct0: _ct1; mask[i][j-2][k-2] = _ct2; }
11/104
for (i = 0; i <= 2; i++) #pragma omp parallel for for (j = 2; j <= (R + 1); j++) #pragma ivdep for (k = 0; k <= C + 3; k++) blurx[i][j-2][k] = img[i][j-2][k]*0.0625f + img[i][j-1][k]*0.25f + img[i][j][k]*0.375f + img[i][j+1][k]*0.25f + img[i][j+2][k]*0.0625f; for (i = 0; i <= 2; i++) #pragma omp parallel for for (j = 2; j <= R + 1; j++) #pragma ivdep for (k = 2; k <= C + 1; k++) blury[i][j][k-2] = blurx[i][j-2][k-2]*0.0625f + blurx[i][j-2][k-1]*0.25f + blurx[i][j-2][k]*0.375f + blurx[i][j-2][k+1]*0.25f + blurx[i][j-2][k+2]*0.0625f; for (i = 0; i <= 2; i++) #pragma omp parallel for for (j = 2; j <= R + 1; j++) #pragma ivdep for (k = 2; k <= C + 1; k++) sharpen[i][j][k-2] = img[i][j][k]*(1 + weight) + blury[i][j-2][k-2]*(-weight); for (i = 0; i <= 2; i++) #pragma omp parallel for private(_ct0,_ct1,_ct2) for (j = 2; j <= R + 1; j++) #pragma ivdep for (k = 2; k <= C + 1; k++) { _ct0 = img[i][j][k]; _ct1 = sharpen[i][j-2][k-2]; _ct2 = (std::abs((img[i][j][k] - blury[i][j-2][k-2])) < threshold)? _ct0: _ct1; mask[i][j-2][k-2] = _ct2; }
12/104
#pragma omp parallel for for (j = 2; j <= (R + 1); j++) for (i = 0; i <= 2; i++) #pragma ivdep for (k = 0; (k <= (C + 3)); k++) blurx[i][j-2][k] = img[i][j-2][k]*0.0625f + img[i][j-1][k]*0.25f + img[i][j][k]*0.375f + img[i][j+1][k]*0.25f + img[i][j+2][k]*0.0625f; #pragma omp parallel for for (j = 2; (j <= (R + 1)); j++) for (i = 0; i <= 2; i++) #pragma ivdep for (k = 2; (k <= (C + 1)); k++) blury[i][j][k-2] = blurx[i][j-2][k-2]*0.0625f + blurx[i][j-2][k-1]*0.25f + blurx[i][j-2][k]*0.375f + blurx[i][j-2][k+1]*0.25f + blurx[i][j-2][k+2]*0.0625f; #pragma omp parallel for for (j = 2; (j <= (R + 1)); j++) for (i = 0; i <= 2; i++) #pragma ivdep for (k = 2; (k <= (C + 1)); k++) sharpen[i][j][k-2] = img[i][j][k]*(1 + weight) + blury[i][j-2][k-2]*(-weight); #pragma omp parallel for private(_ct0,_ct1,_ct2) for (j = 2; j <= R + 1; j++) for (i = 0; i <= 2; i++) #pragma ivdep for (k = 2; k <= C + 1; k++) { _ct0 = img[i][j][k]; _ct1 = sharpen[i][j-2][k-2]; _ct2 = (std::abs((img[i][j][k] - blury[i][j-2][k-2])) < threshold)? _ct0: _ct1; mask[i][j-2][k-2] = _ct2; }
13/104
1
@jit("float32[::](uint8[::], int64)", cache=True, nogil=True)
def unsharp_cv(frame, lib_func):
    """Apply an unsharp mask to ``frame`` using OpenCV-style routines.

    The frame is converted to float32 and divided by 255.0, blurred with a
    separable 5-tap kernel ([1, 4, 6, 4, 1] / 16 along each axis), and a
    sharpened image ``frame * (1 + weight) - blur * weight`` is formed.
    Sharpened values are then copied into the result only where the
    threshold mask is set; all other pixels keep the scaled input value.

    ``lib_func`` is accepted but not used in this body.

    NOTE(review): ``weight`` and ``thresh`` are not defined here; they are
    presumably module-level globals. ``sepFilter2D``, ``addWeighted``,
    ``threshold``, ``absdiff`` and ``THRESH_BINARY`` look like the OpenCV
    (cv2) names of the same spelling -- confirm against the file's imports.
    """
    frame_f = np.float32(frame) / 255.0
    # res is the same array object as frame_f, so the final copyto updates
    # frame_f in place (blury and sharpen have been computed by then).
    res = frame_f
    kernelx = np.array([1, 4, 6, 4, 1], np.float32) / 16
    kernely = np.array([[1], [4], [6], [4], [1]], np.float32) / 16
    blury = sepFilter2D(frame_f, -1, kernelx, kernely)
    sharpen = addWeighted(frame_f, (1 + weight), blury, (-weight), 0)
    th, choose = threshold(absdiff(frame_f, blury), thresh, 1, THRESH_BINARY)
    # Boolean mask selecting the pixels that receive the sharpened value.
    choose = choose.astype(bool)
    np.copyto(res, sharpen, 'same_kind', choose)
    return res
14/104
1
@jit("float32[::](uint8[::], int64)", cache=True, nogil=True)
def unsharp_cv(frame, lib_func):
    """Apply an unsharp mask to ``frame`` using OpenCV-style routines.

    The frame is converted to float32 and divided by 255.0, blurred with a
    separable 5-tap kernel ([1, 4, 6, 4, 1] / 16 along each axis), and a
    sharpened image ``frame * (1 + weight) - blur * weight`` is formed.
    Sharpened values are then copied into the result only where the
    threshold mask is set; all other pixels keep the scaled input value.

    ``lib_func`` is accepted but not used in this body.

    NOTE(review): ``weight`` and ``thresh`` are not defined here; they are
    presumably module-level globals. ``sepFilter2D``, ``addWeighted``,
    ``threshold``, ``absdiff`` and ``THRESH_BINARY`` look like the OpenCV
    (cv2) names of the same spelling -- confirm against the file's imports.
    """
    frame_f = np.float32(frame) / 255.0
    # res is the same array object as frame_f, so the final copyto updates
    # frame_f in place (blury and sharpen have been computed by then).
    res = frame_f
    kernelx = np.array([1, 4, 6, 4, 1], np.float32) / 16
    kernely = np.array([[1], [4], [6], [4], [1]], np.float32) / 16
    blury = sepFilter2D(frame_f, -1, kernelx, kernely)
    sharpen = addWeighted(frame_f, (1 + weight), blury, (-weight), 0)
    th, choose = threshold(absdiff(frame_f, blury), thresh, 1, THRESH_BINARY)
    # Boolean mask selecting the pixels that receive the sharpened value.
    choose = choose.astype(bool)
    np.copyto(res, sharpen, 'same_kind', choose)
    return res
2
14/104
1
@jit("float32[::](uint8[::], int64)", cache=True, nogil=True)
def unsharp_cv(frame, lib_func):
    """Apply an unsharp mask to ``frame`` using OpenCV-style routines.

    The frame is converted to float32 and divided by 255.0, blurred with a
    separable 5-tap kernel ([1, 4, 6, 4, 1] / 16 along each axis), and a
    sharpened image ``frame * (1 + weight) - blur * weight`` is formed.
    Sharpened values are then copied into the result only where the
    threshold mask is set; all other pixels keep the scaled input value.

    ``lib_func`` is accepted but not used in this body.

    NOTE(review): ``weight`` and ``thresh`` are not defined here; they are
    presumably module-level globals. ``sepFilter2D``, ``addWeighted``,
    ``threshold``, ``absdiff`` and ``THRESH_BINARY`` look like the OpenCV
    (cv2) names of the same spelling -- confirm against the file's imports.
    """
    frame_f = np.float32(frame) / 255.0
    # res is the same array object as frame_f, so the final copyto updates
    # frame_f in place (blury and sharpen have been computed by then).
    res = frame_f
    kernelx = np.array([1, 4, 6, 4, 1], np.float32) / 16
    kernely = np.array([[1], [4], [6], [4], [1]], np.float32) / 16
    blury = sepFilter2D(frame_f, -1, kernelx, kernely)
    sharpen = addWeighted(frame_f, (1 + weight), blury, (-weight), 0)
    th, choose = threshold(absdiff(frame_f, blury), thresh, 1, THRESH_BINARY)
    # Boolean mask selecting the pixels that receive the sharpened value.
    choose = choose.astype(bool)
    np.copyto(res, sharpen, 'same_kind', choose)
    return res
2
3
14/104
1
@jit("float32[::](uint8[::], int64)", cache=True, nogil=True)
def unsharp_cv(frame, lib_func):
    """Apply an unsharp mask to ``frame`` using OpenCV-style routines.

    The frame is converted to float32 and divided by 255.0, blurred with a
    separable 5-tap kernel ([1, 4, 6, 4, 1] / 16 along each axis), and a
    sharpened image ``frame * (1 + weight) - blur * weight`` is formed.
    Sharpened values are then copied into the result only where the
    threshold mask is set; all other pixels keep the scaled input value.

    ``lib_func`` is accepted but not used in this body.

    NOTE(review): ``weight`` and ``thresh`` are not defined here; they are
    presumably module-level globals. ``sepFilter2D``, ``addWeighted``,
    ``threshold``, ``absdiff`` and ``THRESH_BINARY`` look like the OpenCV
    (cv2) names of the same spelling -- confirm against the file's imports.
    """
    frame_f = np.float32(frame) / 255.0
    # res is the same array object as frame_f, so the final copyto updates
    # frame_f in place (blury and sharpen have been computed by then).
    res = frame_f
    kernelx = np.array([1, 4, 6, 4, 1], np.float32) / 16
    kernely = np.array([[1], [4], [6], [4], [1]], np.float32) / 16
    blury = sepFilter2D(frame_f, -1, kernelx, kernely)
    sharpen = addWeighted(frame_f, (1 + weight), blury, (-weight), 0)
    th, choose = threshold(absdiff(frame_f, blury), thresh, 1, THRESH_BINARY)
    # Boolean mask selecting the pixels that receive the sharpened value.
    choose = choose.astype(bool)
    np.copyto(res, sharpen, 'same_kind', choose)
    return res
2
3
4
14/104
#pragma omp parallel for schedule(static) for (int _T_i1 = 0; (_T_i1 <= ((R + 1) / 32)); _T_i1 = (_T_i1 + 1)) { int _ct0 = (((R + 1) < ((32 * _T_i1) + 31))? (R + 1): ((32 * _T_i1) + 31)); int _ct1 = ((2 > (32 * _T_i1))? 2: (32 * _T_i1)); int _ct4 = (((R + 1) < ((32 * _T_i1) + 31))? (R + 1): ((32 * _T_i1) + 31)); int _ct5 = ((2 > (32 * _T_i1))? 2: (32 * _T_i1)); int _ct8 = (((R + 1) < ((32 * _T_i1) + 31))? (R + 1): ((32 * _T_i1) + 31)); int _ct9 = ((2 > (32 * _T_i1))? 2: (32 * _T_i1)); int _ct12 = (((R + 1) < ((32 * _T_i1) + 31))? (R + 1): ((32 * _T_i1) + 31)); int _ct13 = ((2 > (32 * _T_i1))? 2: (32 * _T_i1)); for (int _T_i2 = -1; (_T_i2 <= ((C + 3) / 256)); _T_i2 = (_T_i2 + 1)) { int _ct2 = (((C + 3) < ((256 * _T_i2) + 261))? (C + 3): ((256 * _T_i2) + 261)); int _ct3 = ((0 > (256 * _T_i2))? 0: (256 * _T_i2)); int _ct6 = (((C + 1) < ((256 * _T_i2) + 260))? (C + 1): ((256 * _T_i2) + 260)); int _ct7 = ((2 > ((256 * _T_i2) + 1))? 2: ((256 * _T_i2) + 1)); int _ct10 = (((C + 1) < ((256 * _T_i2) + 259))? (C + 1): ((256 * _T_i2) + 259)); int _ct11 = ((2 > ((256 * _T_i2) + 2))? 2: ((256 * _T_i2) + 2)); int _ct14 = (((C + 1) < ((256 * _T_i2) + 258))? (C + 1): ((256 * _T_i2) + 258)); int _ct15 = ((2 > ((256 * _T_i2) + 3))? 
2: ((256 * _T_i2) + 3)); for (int _i0 = 0; (_i0 <= 2); _i0 = (_i0 + 1)) { for (int _i1 = _ct1; (_i1 <= _ct0); _i1 = (_i1 + 1)) { #pragma ivdep for (int _i2 = _ct3; (_i2 <= _ct2); _i2 = (_i2 + 1)) { blurx[_i0][((-32 * _T_i1) + _i1)][((-256 * _T_i2) + _i2)] = (((((img[(((_i0 * ((R + 4) * (C + 4))) + ((-2 + _i1) * (C + 4))) + _i2)] * 0.0625f) + (img[(((_i0 * ((R + 4) * (C + 4))) + ((-1 + _i1) * (C + 4))) + _i2)] * 0.25f)) + (img[(((_i0 * ((R + 4) * (C + 4))) + (_i1 * (C + 4))) + _i2)] * 0.375f)) + (img[(((_i0 * ((R + 4) * (C + 4))) + ((1 + _i1) * (C + 4))) + _i2)] * 0.25f)) + (img[(((_i0 * ((R + 4) * (C + 4))) + ((2 + _i1) * (C + 4))) + _i2)] * 0.0625f)); } } } for (int _i0 = 0; (_i0 <= 2); _i0 = (_i0 + 1)) { for (int _i1 = _ct5; (_i1 <= _ct4); _i1 = (_i1 + 1)) { #pragma ivdep for (int _i2 = _ct7; (_i2 <= _ct6); _i2 = (_i2 + 1)) { blury[_i0][((-32 * _T_i1) + _i1)][((-256 * _T_i2) + _i2)] = (((((blurx[_i0][((-32 * _T_i1) + _i1)][(-2 + ((-256 * _T_i2) + _i2))] * 0.0625f) + (blurx[_i0][((-32 * _T_i1) + _i1)][(-1 + ((-256 * _T_i2) + _i2))] * 0.25f)) + (blurx[_i0][((-32 * _T_i1) + _i1)][((-256 * _T_i2) + _i2)] * 0.375f)) + (blurx[_i0][((-32 * _T_i1) + _i1)][(1 + ((-256 * _T_i2) + _i2))] * 0.25f)) + (blurx[_i0][((-32 * _T_i1) + _i1)][(2 + ((-256 * _T_i2) + _i2))] * 0.0625f)); } } } for (int _i0 = 0; (_i0 <= 2); _i0 = (_i0 + 1)) { for (int _i1 = _ct9; (_i1 <= _ct8); _i1 = (_i1 + 1)) { #pragma ivdep for (int _i2 = _ct11; (_i2 <= _ct10); _i2 = (_i2 + 1)) { sharpen[_i0][((-32 * _T_i1) + _i1)][((-256 * _T_i2) + _i2)] = ((img[(((_i0 * ((R + 4) * (C + 4))) + (_i1 * (C + 4))) + _i2)] * (1 + weight)) + (blury[_i0][((-32 * _T_i1) + _i1)][((-256 * _T_i2) + _i2)] * -(weight))); } } } for (int _i0 = 0; (_i0 <= 2); _i0 = (_i0 + 1)) { for (int _i1 = _ct13; (_i1 <= _ct12); _i1 = (_i1 + 1)) { #pragma ivdep for (int _i2 = _ct15; (_i2 <= _ct14); _i2 = (_i2 + 1)) { float _ct16 = img[(((_i0 * ((R + 4) * (C + 4))) + (_i1 * (C + 4))) + _i2)]; float _ct17 = sharpen[_i0][((-32 * _T_i1) + 
_i1)][((-256 * _T_i2) + _i2)]; float _ct18 = ((std::abs((img[(((_i0 * ((R + 4) * (C + 4))) + (_i1 * (C + 4))) + _i2)] - blury[_i0][((-32 * _T_i1) + _i1)][((-256 * _T_i2) + _i2)])) < threshold)? _ct16: _ct17); mask_flip[((((_i1-2) * (3 * C)) + ((_i2 - 2) * 3)) + (_i0))] = _ct18; } } } } }
15/104
for (i=0; i<N; i++) for (j=0; j<N; j++)
B[i][j] = A[i][j] + u1[i]*v1[j] + u2[i]*v2[j];
for (i=0; i<N; i++) for (j=0; j<N; j++)
x[i] = x[i] + beta* B[j][i]*y[j];
for (i=0; i<N; i++)
x[i] = x[i] + z[i];
for (i=0; i<N; i++) for (j=0; j<N; j++)
w[i] = w[i] + alpha* B[i][j]*x[j];
16/104
17/104
18/104
19/104
5 10 15 20 1 2 4 9 16 25 32 Execution time in seconds Number of processors scalapack pluto-data-tile-gp (sudoku) pluto-data-tile-block-cyclic
20/104
21/104
21/104
22/104
22/104
23/104
23/104
23/104
23/104
24/104
25/104
26/104
1
2
3
27/104
1
28/104
1
2
28/104
1
2
3
28/104
29/104
29/104
29/104
30/104
for (t = 0; t < T; t++) for (i = 1; i < N+1; i++) for (j = 1; j < N+1; j++)
A[(t+1)%2][i][j] = f(A[t%2][i+1][j], A[t%2][i][j], A[t%2][i-1][j], A[t%2][i][j+1], A[t%2][i][j-1]);
1
2
3
31/104
for (i=1; i<=N-1; i++) for (j=1; j<=N-1; j++)
A[i][j] = f(A[i-1][j], A[i][j-1]);
N-1 N-1
b b b b b b b b b b b b b b b b b b b b b b b b b
1 2 3 1 2 3
32/104
for (i=1; i<=N-1; i++) for (j=1; j<=N-1; j++)
A[i][j] = f(A[i-1][j], A[i][j-1]);
N-1 N-1
b b b b b b b b b b b b b b b b b b b b b b b b b
1 2 3 1 2 3
1
2
32/104
for (i=1; i<=N-1; i++) for (j=1; j<=N-1; j++)
A[i][j] = f(A[i-1][j], A[i][j-1]);
N-1 N-1
b b b b b b b b b b b b b b b b b b b b b b b b b
1 2 3 1 2 3
1
2
32/104
for (i=1; i<=N-1; i++) for (j=1; j<=N-1; j++)
A[i][j] = f(A[i-1][j], A[i][j-1]);
N-1 N-1
b b b b b b b b b b b b b b b b b b b b b b b b b
1 2 3 1 2 3
2N-2 N-1
b b b b b b b b b b b b b b b b b b b b b b b b b
1 2 3 4 5 6 7 8 1 2 3
32/104
for (i=1; i<=N-1; i++) for (j=1; j<=N-1; j++)
A[i][j] = f(A[i-1][j], A[i][j-1]);
N-1 N-1
b b b b b b b b b b b b b b b b b b b b b b b b b
1 2 3 1 2 3
2N-2 N-1
b b b b b b b b b b b b b b b b b b b b b b b b b
1 2 3 4 5 6 7 8 1 2 3
32/104
33/104
for (i=1; i<N; i++)
P(i); /* Produces B[i] using another array A */
for (i=1; i<N; i++)
C(i); /* Consumes B[i] and B[i-1] to create D[i] */
34/104
for (i=1; i<N; i++)
P(i); /* Produces B[i] using another array A */
for (i=1; i<N; i++)
C(i); /* Consumes B[i] and B[i-1] to create D[i] */
for (t1=1; t1<N; t1++) {
P(t1); C(t1); }
34/104
for (i=1; i<N; i++)
P(i); /* Produces A[i] */
for (i=1; i<N; i++)
C(i); /* Consumes A[i] and A[i-1] */
for (t1=0;t1<=floord(N-1,32);t1++) { for (t3=max(1,32*t1);t3<=min(N-1,32*t1+31);t3++) {
P(t3); C(t3); } }
35/104
for (i=1; i<N; i++)
P(i); /* Produces A[i] */
for (i=1; i<N; i++)
C(i); /* Consumes A[i] and A[i-1] */
for (t1=0;t1<=floord(N-1,32);t1++) { for (t3=max(1,32*t1);t3<=min(N-1,32*t1+31);t3++)
P(t3);
for (t3=max(1,32*t1);t3<=min(N-1,32*t1+31);t3++)
C(t3); }
36/104
37/104
38/104
i
1
2
3
. . .
N − 1
j
1
2
3
. . .
M − 1
for (i = 0; i < N; i++) for (j = 0; j < M; j++) A[i+1][j+1] = f(A[i][j]) /* O(N) synchronization if j is parallelized */
t1 = i − j
1
2
3
−3
−2
−1
. . .
. . .
N − 1
−M + 1
t2 = j
1
2
3
. . .
M − 1
#pragma omp parallel for private(t2) for (t1=-M+1; t1<=N-1; t1++) for (t2=max(0,-t1); t2<=min(M-1,N-1-t1); t2++) A[t1+t2+1][t2+1] = f(A[t1+t2][t2]); /* Synchronization-free */
38/104
i
1
2
3
. . .
N − 1
j
1
2
3
. . .
M − 1
t1 = i − j
1
2
3
−3
−2
−1
. . .
. . .
N − 1
−M + 1
t2 = j
1
2
3
. . .
M − 1
39/104
40/104
40/104
for (t = 0; t < T; t++) for (i = 1; i < N+1; i++) for (j = 1; j < N+1; j++)
A[(t+1)%2][i][j] = f(A[t%2][i+1][j], A[t%2][i][j], A[t%2][i-1][j], A[t%2][i][j+1], A[t%2][i][j-1]);
41/104
for (t = 0; t < T; t++) for (i = 1; i < N+1; i++) for (j = 1; j < N+1; j++)
A[(t+1)%2][i][j] = f(A[t%2][i+1][j], A[t%2][i][j], A[t%2][i-1][j], A[t%2][i][j+1], A[t%2][i][j-1]);
42/104
for (t = 0; t < T; t++) for (i = 1; i < N+1; i++) for (j = 1; j < N+1; j++)
A[(t+1)%2][i][j] = f(A[t%2][i+1][j], A[t%2][i][j], A[t%2][i-1][j], A[t%2][i][j+1], A[t%2][i][j-1]);
42/104
for (t = 0; t < T; t++) for (i = 1; i < N+1; i++) for (j = 1; j < N+1; j++)
A[(t+1)%2][i][j] = f(A[t%2][i+1][j], A[t%2][i][j], A[t%2][i-1][j], A[t%2][i][j+1], A[t%2][i][j-1]);
43/104
44/104
N-2 T-1
b b b b b b b b b b b b b b b b b b b b b b b b b
1 2 3 1 2 3
N-2 T-1
b b b b b b b b b b b b b b b b b b b b b b b b b
1 2 3 1 2 3
45/104
46/104
46/104
for (i=1; i<T; i++) for (j=1; j<N-1; j++)
A[(i+1)%2][j] = f(A[i%2][j-1], A[i%2][j], A[i%2][j+1]);
46/104
for (i=1; i<T; i++) for (j=1; j<N-1; j++)
A[(i+1)%2][j] = f(A[i%2][j-1], A[i%2][j], A[i%2][j+1]);
46/104
N-2 T-1
b b b b b b b b b b b b b b b b b b b b b b b b b
1 2 3 1 2 3
N+T-3 T-1
b b b b b b b b b b b b b b b b b b b b b b b b b
1 2 3 4 5 6 7 1 2 3
47/104
N-2 T-1
b b b b b b b b b b b b b b b b b b b b b b b b b
1 2 3 1 2 3
N+T-3 T-1
b b b b b b b b b b b b b b b b b b b b b b b b b
1 2 3 4 5 6 7 1 2 3
47/104
48/104
for (t = 0; t < T; t++) for (i = 1; i < N+1; i++) for (j = 1; j < N+1; j++)
A[(t+1)%2][i][j] = f(A[t%2][i+1][j], A[t%2][i][j], A[t%2][i-1][j], A[t%2][i][j+1], A[t%2][i][j-1]);
49/104
for (t = 0; t < T; t++) for (i = 1; i < N+1; i++) for (j = 1; j < N+1; j++)
A[(t+1)%2][i][j] = f(A[t%2][i+1][j], A[t%2][i][j], A[t%2][i-1][j], A[t%2][i][j+1], A[t%2][i][j-1]);
49/104
for (t = 0; t < T; t++) for (i = 1; i < N+1; i++) for (j = 1; j < N+1; j++)
A[(t+1)%2][i][j] = f(A[t%2][i+1][j], A[t%2][i][j], A[t%2][i-1][j], A[t%2][i][j+1], A[t%2][i][j-1]);
49/104
for (t = 0; t < T; t++) for (i = 1; i < N+1; i++) for (j = 1; j < N+1; j++)
A[(t+1)%2][i][j] = f(A[t%2][i+1][j], A[t%2][i][j], A[t%2][i-1][j], A[t%2][i][j+1], A[t%2][i][j-1]);
49/104
50/104
(1,0) (2,1)
(1,1) (1,0) (1,1) (1,0) P1 P2 P0 P3 P1 P1 P0 P2 P2 P2 P1 P1 P1 P0
P0
space space
(1,0) (2,1)
P1 P0 P3 (1,1) (1,0) (1,1) (1,0) space P1 P2 P0 space time space time
time P2 P2
One line
Two lines of
Three lines
51/104
52/104
N-2 T-1
b b b b b b b b b b b b b b b b b b b b b b b b b
1 2 3 1 2 3 for (t = 0; t <= T-1; t++) for (i = 1; i <= N-2; i++)
A[(t+1)%2][i] = 0.125 * (A[t%2][i+1]
53/104
N-2 T-1
1 2 3 1 2 3
53/104
N-2 T-1
1 2 3 1 2 3
N-2 T-1
b b b b b b b b b b b b b b b b b b b b b b b b b
1 2 3 1 2 3
53/104
N-2 T-1
1 2 3 1 2 3
N-2 T-1
1 2 3 1 2 3
53/104
54/104
for (t = 0; t < T; t++) for (i = 1; i < N+1; i++) for (j = 1; j < N+1; j++)
A[(t+1)%2][i][j] = f(A[t%2][i+1][j], A[t%2][i][j], A[t%2][i-1][j], A[t%2][i][j+1], A[t%2][i][j-1], A[t%2][i][j]);
1
55/104
for (t = 0; t < T; t++) for (i = 1; i < N+1; i++) for (j = 1; j < N+1; j++)
A[(t+1)%2][i][j] = f(A[t%2][i+1][j], A[t%2][i][j], A[t%2][i-1][j], A[t%2][i][j+1], A[t%2][i][j-1], A[t%2][i][j]);
1
2
55/104
for (t = 0; t < T; t++) for (i = 1; i < N+1; i++) for (j = 1; j < N+1; j++)
A[(t+1)%2][i][j] = f(A[t%2][i+1][j], A[t%2][i][j], A[t%2][i-1][j], A[t%2][i][j+1], A[t%2][i][j-1], A[t%2][i][j]);
1
2
3
4
55/104
/* Start of CLooG code */
for (t1=-1; t1<=31; t1++) { int lbp=ceild(t1,2), ubp=floord(t1+125,2);
#pragma omp parallel for private(lbv,ubv,t3,t4,t5,t6)
for (t2=lbp; t2<=ubp; t2++) for (t3=max(0,ceild(t1-1,2)); t3<=floord(t1+126,2); t3++) for (t4=max(max(max(0,32*t1),64*t3-4000),64*t1-64*t2+1);
t4<=min(min(min(999,32*t1+63),64*t2+62),64*t3+62); t4++)
for (t5=max(max(64*t2,t4+1),-64*t1+64*t2+2*t4-63);
t5<=min(min(64*t2+63,t4+4000),-64*t1+64*t2+2*t4); t5++)
#pragma ivdep #pragma vector always for (t6=max(64*t3,t4+1); t6<=min(64*t3+63,t4+4000); t6++)
A[( t4 + 1) % 2][ (-t4+t5)][ (-t4+t6)] = (((0.125 * ((A[ t4 % 2][ (-t4+t5) + 1][ (-t4+t6)]
+ (0.125 * ((A[ t4 % 2][ (-t4+t5)][ (-t4+t6) + 1] - (2.0 * A[ t4 % 2][ (-t4+t5)][ (-t4+t6)])) + A[ t4 % 2][ (-t4+t5)][ (-t4+t6) - 1]))) + A[ t4 % 2][ (-t4+t5)][ (-t4+t6)]); } /* End of CLooG code */
56/104
/* Start of CLooG code */
for (t1=-1; t1<=31; t1++) { int lbp=ceild(t1,2), ubp=floord(t1+125,2);
#pragma omp parallel for private(lbv,ubv,t3,t4,t5,t6)
for (t2=lbp; t2<=ubp; t2++) for (t3=max(0,ceild(t1-1,2)); t3<=floord(t1+126,2); t3++) for (t4=max(max(max(0,32*t1),64*t3-4000),64*t1-64*t2+1);
t4<=min(min(min(999,32*t1+63),64*t2+62),64*t3+62); t4++)
for (t5=max(max(64*t2,t4+1),-64*t1+64*t2+2*t4-63);
t5<=min(min(64*t2+63,t4+4000),-64*t1+64*t2+2*t4); t5++)
#pragma ivdep #pragma vector always for (t6=max(64*t3,t4+1); t6<=min(64*t3+63,t4+4000); t6++)
A[( t4 + 1) % 2][ (-t4+t5)][ (-t4+t6)] = (((0.125 * ((A[ t4 % 2][ (-t4+t5) + 1][ (-t4+t6)]
+ (0.125 * ((A[ t4 % 2][ (-t4+t5)][ (-t4+t6) + 1] - (2.0 * A[ t4 % 2][ (-t4+t5)][ (-t4+t6)])) + A[ t4 % 2][ (-t4+t5)][ (-t4+t6) - 1]))) + A[ t4 % 2][ (-t4+t5)][ (-t4+t6)]); } /* End of CLooG code */
56/104
/* Start of CLooG code */
for (t1=-1; t1<=31; t1++) { int lbp=ceild(t1,2), ubp=floord(t1+125,2);
#pragma omp parallel for private(lbv,ubv,t3,t4,t5,t6)
for (t2=lbp; t2<=ubp; t2++) for (t3=max(0,ceild(t1-1,2)); t3<=floord(t1+126,2); t3++) for (t4=max(max(max(0,32*t1),64*t3-4000),64*t1-64*t2+1);
t4<=min(min(min(999,32*t1+63),64*t2+62),64*t3+62); t4++)
for (t5=max(max(64*t2,t4+1),-64*t1+64*t2+2*t4-63);
t5<=min(min(64*t2+63,t4+4000),-64*t1+64*t2+2*t4); t5++)
#pragma ivdep #pragma vector always for (t6=max(64*t3,t4+1); t6<=min(64*t3+63,t4+4000); t6++)
A[( t4 + 1) % 2][ (-t4+t5)][ (-t4+t6)] = (((0.125 * ((A[ t4 % 2][ (-t4+t5) + 1][ (-t4+t6)]
+ (0.125 * ((A[ t4 % 2][ (-t4+t5)][ (-t4+t6) + 1] - (2.0 * A[ t4 % 2][ (-t4+t5)][ (-t4+t6)])) + A[ t4 % 2][ (-t4+t5)][ (-t4+t6) - 1]))) + A[ t4 % 2][ (-t4+t5)][ (-t4+t6)]); } /* End of CLooG code */
56/104
/* Start of CLooG code */
for (t1=-1; t1<=31; t1++) { int lbp=ceild(t1,2), ubp=floord(t1+125,2);
#pragma omp parallel for private(lbv,ubv,t3,t4,t5,t6)
for (t2=lbp; t2<=ubp; t2++) for (t3=max(0,ceild(t1-1,2)); t3<=floord(t1+126,2); t3++) for (t4=max(max(max(0,32*t1),64*t3-4000),64*t1-64*t2+1);
t4<=min(min(min(999,32*t1+63),64*t2+62),64*t3+62); t4++)
for (t5=max(max(64*t2,t4+1),-64*t1+64*t2+2*t4-63);
t5<=min(min(64*t2+63,t4+4000),-64*t1+64*t2+2*t4); t5++)
#pragma ivdep #pragma vector always for (t6=max(64*t3,t4+1); t6<=min(64*t3+63,t4+4000); t6++)
A[( t4 + 1) % 2][ (-t4+t5)][ (-t4+t6)] = (((0.125 * ((A[ t4 % 2][ (-t4+t5) + 1][ (-t4+t6)]
+ (0.125 * ((A[ t4 % 2][ (-t4+t5)][ (-t4+t6) + 1] - (2.0 * A[ t4 % 2][ (-t4+t5)][ (-t4+t6)])) + A[ t4 % 2][ (-t4+t5)][ (-t4+t6) - 1]))) + A[ t4 % 2][ (-t4+t5)][ (-t4+t6)]); } /* End of CLooG code */
56/104
57/104
58/104
59/104
60/104
61/104
62/104
62/104
62/104
63/104
64/104
64/104
65/104
66/104
67/104
68/104
+1
σx=−1 +1
σy=−1
68/104
+1
σx=−1 +1
σy=−1
68/104
+1
σx=−1 +1
σy=−1
68/104
69/104
70/104
71/104
# Image-processing pipeline written in a Python-embedded DSL. It computes a
# Harris-style corner response (det - 0.04 * trace^2) from 3x3 gradient
# stencils of the input image.
# NOTE(review): the constructs (Parameter, Image, Variable, Interval,
# Condition, Case, Stencil, Function) look like the PolyMage DSL -- confirm
# against the file's imports.

# Image dimensions and the input image with a one-pixel border on each side.
R, C = Parameter(Int), Parameter(Int)
I = Image(Float, [R+2, C+2])

# Iteration variables and the intervals they range over.
x, y = Variable(), Variable()
row, col = Interval(0, R+1, 1), Interval(0, C+1, 1)

# c: points 1..R x 1..C; cb: the same region shrunk by one more pixel.
c = (Condition(x, '>=', 1) & Condition(x, '<=', R) &
     Condition(y, '>=', 1) & Condition(y, '<=', C))
cb = (Condition(x, '>=', 2) & Condition(x, '<=', R-1) &
      Condition(y, '>=', 2) & Condition(y, '<=', C-1))

# The variable domain is passed positionally: the original slide wrote
# Function(varDom = (...), Float), which is a SyntaxError in Python
# (positional argument after keyword argument).
# NOTE(review): assumes varDom is Function's first parameter -- confirm.

# Image gradients from 3x3 stencils scaled by 1/12.
Iy = Function(([x, y], [row, col]), Float)
Iy.defn = [Case(c, Stencil(I(x, y), 1.0/12,
                           [[-1, -2, -1],
                            [ 0,  0,  0],
                            [ 1,  2,  1]]))]

Ix = Function(([x, y], [row, col]), Float)
Ix.defn = [Case(c, Stencil(I(x, y), 1.0/12,
                           [[-1, 0, 1],
                            [-2, 0, 2],
                            [-1, 0, 1]]))]

# Pointwise products of the gradients.
Ixx = Function(([x, y], [row, col]), Float)
Ixx.defn = [Case(c, Ix(x, y) * Ix(x, y))]

Iyy = Function(([x, y], [row, col]), Float)
Iyy.defn = [Case(c, Iy(x, y) * Iy(x, y))]

Ixy = Function(([x, y], [row, col]), Float)
Ixy.defn = [Case(c, Ix(x, y) * Iy(x, y))]

# 3x3 box sums of the gradient products.
Sxx = Function(([x, y], [row, col]), Float)
Syy = Function(([x, y], [row, col]), Float)
Sxy = Function(([x, y], [row, col]), Float)
for pair in [(Sxx, Ixx), (Syy, Iyy), (Sxy, Ixy)]:
    pair[0].defn = [Case(cb, Stencil(pair[1], 1,
                                     [[1, 1, 1],
                                      [1, 1, 1],
                                      [1, 1, 1]]))]

# Determinant and trace built from the summed products.
det = Function(([x, y], [row, col]), Float)
d = Sxx(x, y) * Syy(x, y) - Sxy(x, y) * Sxy(x, y)
det.defn = [Case(cb, d)]

trace = Function(([x, y], [row, col]), Float)
trace.defn = [Case(cb, Sxx(x, y) + Syy(x, y))]

# Final response: det - 0.04 * trace^2.
harris = Function(([x, y], [row, col]), Float)
coarsity = det(x, y) - .04 * trace(x, y) * trace(x, y)
harris.defn = [Case(cb, coarsity)]
72/104
f1
f2
fout
x = Variable()
73/104
f1
f2
fout
x = Variable()
73/104
f1
f2
fout
73/104
f1
f2
fout
73/104
f1
f2
fout
74/104
f1
f2
fout
74/104
f1
f2
fout
74/104
f1
f2
fout
74/104
f1
f2
fout
74/104
f1
f2
fout
74/104
f1
f2
fout
74/104
f1
f2
fout
74/104
75/104
75/104
75/104
75/104
75/104
75/104
75/104
75/104
75/104
75/104
75/104
76/104
77/104
3.74 7.35 12.85 24.02 46.78 1.12 2.24 4.03 7.64 15.18 2.47 4.31 7.83 12.22 16.22 1 1.94 3.47 6.18 10.3
PolyOpt(opt+vec) PolyOpt(opt) PolyOpt(base+vec) PolyOpt(base)
78/104
79/104
80/104
1
2
3
81/104
82/104
82/104
82/104
83/104
84/104
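1 Relax $v^h$ for $n_1$ iterations: $v^h \leftarrow (1 - \omega D^{-1} A^h)\, v^h + \omega D^{-1} f^h$
2 if coarsest level then
3
4 $r^h \leftarrow f^h - A^h v^h$
5 $r^{2h} \leftarrow I_h^{2h}\, r^h$
6 $e^{2h} \leftarrow 0$
7 $e^{2h} \leftarrow \text{V-cycle}^{2h}(e^{2h}, r^{2h})$
8 $e^h \leftarrow I_{2h}^{h}\, e^{2h}$
9 $v^h \leftarrow v^h + e^h$
10 Relax $v^h$ for $n_3$ iterations
11 return $v^h$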
85/104
86/104
87/104
Smoother
Defect/Residual
Restrict/Reciprocate
Interpolate/Prolongation
Correction
Input
88/104
89/104
90/104
91/104
92/104
93/104
94/104
95/104
95/104
95/104
95/104
for (n = 0; n < N; n++) /* Samples in a batch */
  for (o = 0; o < Oc; o++) /* Output feature channels */
    for (i = 0; i < Ic; i++) /* Input feature channels */
      for (y = 0; y < Y; y++) /* Layer height */
        for (x = 0; x < X; x++) /* Layer width */
          for (kh = 0; kh < Kh; kh++) /* Convolution kernel height */
            for (kw = 0; kw < Kw; kw++) /* Convolution kernel width */
X Y X Y Kw Ic Kh Oc Oc 96/104
for (n = 0; n < N; n++) /* Samples in a batch */
  for (o = 0; o < Oc; o++) /* Output feature channels */
    for (i = 0; i < Ic; i++) /* Input feature channels */
      for (y = 0; y < Y; y++) /* Layer height */
        for (x = 0; x < X; x++) /* Layer width */
          for (kh = 0; kh < Kh; kh++) /* Convolution kernel height */
            for (kw = 0; kw < Kw; kw++) /* Convolution kernel width */
1
2
3
96/104
97/104
98/104
98/104
1
2
3
99/104
100/104
101/104
1
2
3
102/104
103/104
104/104