Do we need dataflow programming?
Anthony Danalis
Innovative Computing Laboratory University of Tennessee
Do we need dataflow programming? Anthony Danalis - PowerPoint PPT Presentation
Do we need dataflow programming? Anthony Danalis, Innovative Computing Laboratory, University of Tennessee. CCDSC16, Chateau des Contes. Programming vs Execution. Dataflow based execution
Innovative Computing Laboratory University of Tennessee
3 3
4 4
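P: nodes; N: number of kernel executions; Tk: kernel execution time; To: overhead of discovery. $T_o \cdot N \ll T_k \cdot N / P \;\Rightarrow\; T_o \cdot N \le 0.1 \cdot T_k \cdot N / P \;\Rightarrow\; P \le 0.1 \cdot T_k / T_o$. With $T_o = 100\,\mathrm{ns}$ and $T_k = 100\,\mu\mathrm{s}$: $P \le 100$.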
5 5
6 6
7 7
DO {x4} CALL nxt_ctx_next(ctx, icounter, next) IF ( (int_mb(…)+...).ne.8 ) THEN CALL MA_PUSH_GET() CALL DFILL() DO {x2} IF ( (int_mb(…)+… .eq. int_mb(…) ) THEN CALL MA_PUSH_GET(…,k_a) CALL GET_HASH_BLOCK(d_a, dbl_mb(k_a), …) CALL DGEMM(…) END IF END DO CALL TCE_SORT_4(dbl_mb(k_c), …) CALL ADD_HASH_BLOCK(d_c, dbl_mb(k_c), …) END DO
Allocate and initialize C Allocate and fetch A
(same for B, not shown)
Global work stealing Actual work Push C back
500 1000 1500 2000 2500 3000 3500 4000 4500 5000 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 Execution Time (sec) Cores/Node
Original 32 PaRSEC 32
DO {x4} CALL nxt_ctx_next(ctx, icounter, next) IF ( (int_mb(…)+...).ne.8 ) THEN CALL MA_PUSH_GET() CALL DFILL() DO {x2} IF ( (int_mb(…)+… .eq. int_mb(…) ) THEN CALL MA_PUSH_GET(…,k_a) CALL GET_HASH_BLOCK(d_a, dbl_mb(k_a), …) CALL DGEMM(…) END IF END DO CALL TCE_SORT_4(dbl_mb(k_c), …) CALL ADD_HASH_BLOCK(d_c, dbl_mb(k_c), …) END DO
15 15
16 16
MPI has a simple and an advanced API and many developers use only the simple one.
17 17
18 18
19 19
20 20
21 21
DO {x4} CALL nxt_ctx_next(ctx, icounter, next) IF ( (int_mb(…)+...).ne.8 ) THEN CALL MA_PUSH_GET() CALL DFILL() DO {x2} IF ( (int_mb(…)+… .eq. int_mb(…) ) THEN CALL MA_PUSH_GET(…,k_a) CALL GET_HASH_BLOCK(d_a, dbl_mb(k_a), …) CALL DGEMM(…) END IF END DO CALL TCE_SORT_4(dbl_mb(k_c), …) CALL ADD_HASH_BLOCK(d_c, dbl_mb(k_c), …) END DO
22 22
DO {x4} CALL nxt_ctx_next(ctx, icounter, next) IF ( (int_mb(…)+...).ne.8 ) THEN CALL MA_PUSH_GET() CALL DFILL() DO {x2} IF ( (int_mb(…)+… .eq. int_mb(…) ) THEN CALL MA_PUSH_GET(…,k_a) CALL GET_HASH_BLOCK(d_a, dbl_mb(k_a), …) CALL DGEMM(…) END IF END DO CALL TCE_SORT_4(dbl_mb(k_c), …) CALL ADD_HASH_BLOCK(d_c, dbl_mb(k_c), …) END DO
23 23
DO {x4} CALL nxt_ctx_next(ctx, icounter, next) IF ( (int_mb(…)+...).ne.8 ) THEN CALL MA_PUSH_GET() CALL DFILL() DO {x2} IF ( (int_mb(…)+… .eq. int_mb(…) ) THEN CALL MA_PUSH_GET(…,k_a) CALL GET_HASH_BLOCK(d_a, dbl_mb(k_a), …) CALL DGEMM(…) END IF END DO CALL TCE_SORT_4(dbl_mb(k_c), …) CALL ADD_HASH_BLOCK(d_c, dbl_mb(k_c), …) END DO
26 26
27 27
for (k = 0; k < MT; k++) { Insert_Task( zgeqrt, A[k][k], INOUT, T[k][k], OUTPUT); for (m = k+1; m < MT; m++) { Insert_Task( ztsqrt, A[k][k], INOUT | REGION_D|REGION_U, A[m][k], INOUT | LOCALITY, T[m][k], OUTPUT); } for (n = k+1; n < NT; n++) { Insert_Task( zunmqr, A[k][k], INPUT | REGION_L, T[k][k], INPUT, A[k][n], INOUT); for (m = k+1; m < MT; m++) { Insert_Task( ztsmqr, A[k][n], INOUT, A[m][n], INOUT | LOCALITY, A[m][k], INPUT, T[m][k], INPUT);
28 28
POTRF TRSM T=>T TRSM T=>T TRSM T=>T TRSM T=>T GEMM C=>B GEMM C=>B GEMM C=>B SYRK C=>A C=>A SYRK C=>A GEMM C=>B GEMM C=>B GEMM C=>C SYRK T=>T C=>A C=>A SYRK C=>A GEMM C=>B GEMM C=>C GEMM C=>C SYRK T=>T C=>A C=>A C=>A SYRK C=>A TRSM C=>C TRSM C=>C TRSM C=>C POTRF T=>T T=>T T=>T T=>T C=>B SYRK C=>A C=>B C=>A C=>A C=>B T=>T GEMM C=>C SYRK T=>T SYRK T=>T C=>A C=>A C=>A TRSM C=>C POTRF T=>T T=>T TRSM T=>T C=>C C=>A C=>B SYRK T=>T C=>A C=>A POTRF T=>T TRSM C=>C T=>T C=>A POTRF T=>T
{ [k] -> [k, k+1] : k <= mt }
GEQRT(k)
k = 0 .. mt-1
TSMQR(k,m,n)
k = 0 .. mt-1 m = k+1 .. mt-1 n = k+1 .. mt-1
TSQRT(k,m)
k = 0 .. mt-1 m = k+1 .. mt-1
UNMQR(k,n)
k = 0 .. mt-1 n = k+1 .. mt-1 {[k,m,n]->[n]:k+1==n && k+1==m} {[k,m]->[k,m,n]:k<nt-1 && k<n<nt} {[k,m,n]->[n,m]:k+1==n && m>n} {[k,m,n]->[k+1,n]:k+1==m && n>m} {[k]->[k,n]:k<n<nt && k<nt-1} {[k,n]->[k,k+1,n]:k<mt-1} {[k,m,n]->[k+1,m,n]:n>k+1 && m>k+1} {[k,m,n]->[k,m+1,n]:m<mt-1} {[k,m]->[k,m+1]:m<mt-1}
30 30
for (k = 0; k < MT; k++) { Insert_Task( zgeqrt, A[k][k], INOUT, T[k][k], OUTPUT); for (m = k+1; m < MT; m++) { Insert_Task( ztsqrt, A[k][k], INOUT | REGION_D|REGION_U, A[m][k], INOUT | LOCALITY, T[m][k], OUTPUT); } for (n = k+1; n < NT; n++) { Insert_Task( zunmqr, A[k][k], INPUT | REGION_L, T[k][k], INPUT, A[k][n], INOUT); for (m = k+1; m < MT; m++) { Insert_Task( ztsmqr, A[k][n], INOUT, A[m][n], INOUT | LOCALITY, A[m][k], INPUT, T[m][k], INPUT);
31 31
for (k = 0; k < MT; k++) { Insert_Task( zgeqrt, A[k][k], INOUT, T[k][k], OUTPUT); for (m = k+1; m < MT; m++) { Insert_Task( ztsqrt, A[k][k], INOUT | REGION_D|REGION_U, A[m][k], INOUT | LOCALITY, T[m][k], OUTPUT); } for (n = k+1; n < NT; n++) { Insert_Task( zunmqr, A[k][k], INPUT | REGION_L, T[k][k], INPUT, A[k][n], INOUT); for (m = k+1; m < MT; m++) { Insert_Task( ztsmqr, A[k][n], INOUT, A[m][n], INOUT | LOCALITY, A[m][k], INPUT, T[m][k], INPUT);
32 32
for (k = 0; k < MT; k++) { Insert_Task( zgeqrt, A[k][k], INOUT, T[k][k], OUTPUT); for (m = k+1; m < MT; m++) { Insert_Task( ztsqrt, A[k][k], INOUT | REGION_D|REGION_U, A[m][k], INOUT | LOCALITY, T[m][k], OUTPUT); } for (n = k+1; n < NT; n++) { Insert_Task( zunmqr, A[k][k], INPUT | REGION_L, T[k][k], INPUT, A[k][n], INOUT); for (m = k+1; m < MT; m++) { Insert_Task( ztsmqr, A[k][n], INOUT, A[m][n], INOUT | LOCALITY, A[m][k], INPUT, T[m][k], INPUT);
33 33
for (k = 0; k < MT; k++) { Insert_Task( zgeqrt, A[k][k], INOUT, T[k][k], OUTPUT); for (m = k+1; m < MT; m++) { Insert_Task( ztsqrt, A[k][k], INOUT | REGION_D|REGION_U, A[m][k], INOUT | LOCALITY, T[m][k], OUTPUT); } for (n = k+1; n < NT; n++) { Insert_Task( zunmqr, A[k][k], INPUT | REGION_L, T[k][k], INPUT, A[k][n], INOUT); for (m = k+1; m < MT; m++) { Insert_Task( ztsmqr, A[k][n], INOUT, A[m][n], INOUT | LOCALITY, A[m][k], INPUT, T[m][k], INPUT);
Iteration_vector(k,n,m) Indices(A[k][n],k,n)
34 34
for (k = 0; k < MT; k++) { Insert_Task( zgeqrt, A[k][k], INOUT, T[k][k], OUTPUT); for (m = k+1; m < MT; m++) { Insert_Task( ztsqrt, A[k][k], INOUT | REGION_D|REGION_U, A[m][k], INOUT | LOCALITY, T[m][k], OUTPUT); } for (n = k+1; n < NT; n++) { Insert_Task( zunmqr, A[k][k], INPUT | REGION_L, T[k][k], INPUT, A[k][n], INOUT); for (m = k+1; m < MT; m++) { Insert_Task( ztsmqr, A[k][n], INOUT, A[m][n], INOUT | LOCALITY, A[m][k], INPUT, T[m][k], INPUT);
35 35
for (k = 0; k < MT; k++) { Insert_Task( zgeqrt, A[k][k], INOUT, T[k][k], OUTPUT); for (m = k+1; m < MT; m++) { Insert_Task( ztsqrt, A[k][k], INOUT | REGION_D|REGION_U, A[m][k], INOUT | LOCALITY, T[m][k], OUTPUT); } for (n = k+1; n < NT; n++) { Insert_Task( zunmqr, A[k][k], INPUT | REGION_L, T[k][k], INPUT, A[k][n], INOUT); for (m = k+1; m < MT; m++) { Insert_Task( ztsmqr, A[k][n], INOUT, A[m][n], INOUT | LOCALITY, A[m][k], INPUT, T[m][k], INPUT);
k<m<MT
36 36
37 37