
md_poly: A Performance-Portable Polyhedral Compiler based on Multi-Dimensional Homomorphisms


  1. md_poly: A Performance-Portable Polyhedral Compiler based on Multi-Dimensional Homomorphisms. Ari Rasch, Richard Schulze, Sergei Gorlatch. University of Münster, Germany.

  2. <latexit sha1_base64="yONhjet6jUxuF8XYlEGXPrQ7wns=">ACzXicbVHLbtswEKTV6q+nPbYC1EngNOogRSgaI9Bc+ktKVAnASxDoKiVTZgPgVylNlT12m/p1/Ta/E1oxwfb6QIEBrMzuyQnr6RwGMc3neDBw0ePn+w8DZ89f/HyVXf39YUzteUw4EYae5UzB1JoGKBACVeVBaZyCZf59HTRv7wG64TR3FewUixsRal4Aw9lXVP91KEGSI2qkiziVFtP41oGdE06qdcWC6hYA6zJEoLg45H6+T0wMsO9rJuLz6Kl0Xvg2QFemRV59lux/pvFagkUvm3DCJKxw1zKLws9twP60dVIxP2RiGHmqmwI2a5XNbu+ZgpbG+qORLtlw3VFci8otPdG2eV3XMOXcXOV+omI4cdu9Bfm/3rDG8vOoEbqETS/u1BZS4qGLv6YFsICRzn3gHEr/KMonzDLOPokNrbkyrtpbmRBhaLTVSZAjY0Dj2goMc+4s37u7lGNvOch/cKMV0kRaVrF1zmE6cV8H75sPHatYetmHoU0q2M7kPLo6PEo+/HfdOvqzy2iFvyTvSJwn5RE7IV3JOBoSTP+Qv+UdugrOgDn4Gv+6kQWfleUM2Kvh9C+A74Ws=</latexit> <latexit sha1_base64="yONhjet6jUxuF8XYlEGXPrQ7wns=">ACzXicbVHLbtswEKTV6q+nPbYC1EngNOogRSgaI9Bc+ktKVAnASxDoKiVTZgPgVylNlT12m/p1/Ta/E1oxwfb6QIEBrMzuyQnr6RwGMc3neDBw0ePn+w8DZ89f/HyVXf39YUzteUw4EYae5UzB1JoGKBACVeVBaZyCZf59HTRv7wG64TR3FewUixsRal4Aw9lXVP91KEGSI2qkiziVFtP41oGdE06qdcWC6hYA6zJEoLg45H6+T0wMsO9rJuLz6Kl0Xvg2QFemRV59lux/pvFagkUvm3DCJKxw1zKLws9twP60dVIxP2RiGHmqmwI2a5XNbu+ZgpbG+qORLtlw3VFci8otPdG2eV3XMOXcXOV+omI4cdu9Bfm/3rDG8vOoEbqETS/u1BZS4qGLv6YFsICRzn3gHEr/KMonzDLOPokNrbkyrtpbmRBhaLTVSZAjY0Dj2goMc+4s37u7lGNvOch/cKMV0kRaVrF1zmE6cV8H75sPHatYetmHoU0q2M7kPLo6PEo+/HfdOvqzy2iFvyTvSJwn5RE7IV3JOBoSTP+Qv+UdugrOgDn4Gv+6kQWfleUM2Kvh9C+A74Ws=</latexit> <latexit sha1_base64="yONhjet6jUxuF8XYlEGXPrQ7wns=">ACzXicbVHLbtswEKTV6q+nPbYC1EngNOogRSgaI9Bc+ktKVAnASxDoKiVTZgPgVylNlT12m/p1/Ta/E1oxwfb6QIEBrMzuyQnr6RwGMc3neDBw0ePn+w8DZ89f/HyVXf39YUzteUw4EYae5UzB1JoGKBACVeVBaZyCZf59HTRv7wG64TR3FewUixsRal4Aw9lXVP91KEGSI2qkiziVFtP41oGdE06qdcWC6hYA6zJEoLg45H6+T0wMsO9rJuLz6Kl0Xvg2QFemRV59lux/pvFagkUvm3DCJKxw1zKLws9twP60dVIxP2RiGHmqmwI2a5XNbu+ZgpbG+qORLtlw3VFci8otPdG2eV3XMOXcXOV+omI4cdu9Bfm/3rDG8vOoEbqETS/u1BZS4qGLv6YFsICRzn3gHEr/KMonzDLOPokNrbkyrtpbmRBhaLTVSZAjY0Dj2goMc+4s37u7lGNvOch/cKMV0kRaVrF1zmE6cV8H75sPHatYetmHoU0q2M7kPLo6PEo+/HfdOvqzy2iFvyTvSJwn5RE7IV3JOBoSTP+Qv+UdugrOgDn4Gv+6kQWfleUM2Kvh9C+A74Ws=</latexit> <latexit sha1_base64="yONhjet6jUxuF8XYlEGXPrQ7wns=">ACzXicbVHLbtswEKTV6q+nPbYC1EngNOogRSgaI9Bc+ktKVAnASxDoKiVTZgPgVylNlT12m/p1/Ta/E1oxwfb6QIEBrMzuyQnr6RwGMc3neDBw0ePn+w8DZ89f/HyVXf39YUzteUw4EYae5UzB1JoGKBACVeVBaZyCZf59HTRv7wG64TR3FewUixsRal4Aw9lXVP91KEGSI2qkiziVFtP41oGdE06qdcWC6hYA6zJEoLg45H6+T0wMsO9rJuLz6Kl0Xvg2QFemRV59lux/pvFagkUvm3DCJKxw1zKLws9twP60dVIxP2RiGHmqmwI2a5XNbu+ZgpbG+qORLtlw3VFci8otPdG2eV3XMOXcXOV+omI4cdu9Bfm/3rDG8vOoEbqETS/u1BZS4qGLv6YFsICRzn3gHEr/KMonzDLOPokNrbkyrtpbmRBhaLTVSZAjY0Dj2goMc+4s37u7lGNvOch/cKMV0kRaVrF1zmE6cV8H75sPHatYetmHoU0q2M7kPLo6PEo+/HfdOvqzy2iFvyTvSJwn5RE7IV3JOBoSTP+Qv+UdugrOgDn4Gv+6kQWfleUM2Kvh9C+A74Ws=</latexit> Our Background We are the developers of the MDH code generation approach: Executable Generic Different architectures program code program code and input sizes High-level parallel __kernel void gemv_fst( __global float* in_matrix, __global float* in_vector, __global float* out_vector, { // private memory for a WI's computation __private float res_prv = 0.0f; // local memory for a WG's computation __local float res_lcl[ NUM_WI_1 ][ NUM_WI_2 ]; __kernel void gemv_fst( __global float* in_matrix, programming abstractions __global float* in_vector, // iteration over P_sq blocks for( int i_sq = 1 ; i_sq <= NUM_SQ_1 ; ++i_sq ) { __global float* out_vector, for( int j_sq = 1 ; j_sq <= NUM_SQ_2 ; ++j_sq ) { { res_prv = 0.0f; // sequential computation on a P_wi partition // private memory for a WI's computation for( int i = 1 ; i <= WI_PART_SIZE_1 ; ++i ) for( int j = 1 ; j <= WI_PART_SIZE_2 ; ++j ) __private float res_prv = 0.0f; 
res_prv += my_p_wi( i, j, 0 ) * my_p_wi( i, j, 1 ); // store result in local memory // local memory for a WG's computation res_lcl[ WI_ID_1 ][ WI_ID_2 ] = res_prv; __local float res_lcl[ NUM_WI_1 ][ NUM_WI_2 ]; barrier( CLK_LOCAL_MEM_FENCE ); // iteration over P_sq blocks // combine the WIs' results in dimension x for( int stride = NUM_WI_2 / 2 ; stride > 0 ; stride /= 2) for( int i_sq = 1 ; i_sq <= NUM_SQ_1 ; ++i_sq ) { { for( int j_sq = 1 ; j_sq <= NUM_SQ_2 ; ++j_sq ) { if( WI_ID_2 < stride) res_lcl[ WI_ID_1 ][ WI_ID_2 ] += res_lcl[ WI_ID_1 ][ WI_ID_2 + stride ]; res_prv = 0.0f; barrier( CLK_LOCAL_MEM_FENCE ); } // sequential computation on a P_wi partition for( int i = 1 ; i <= WI_PART_SIZE_1 ; ++i ) // store WGs' results in global memory if( WI_ID_2 == 0 ) md hom ( f, ( ~ 1 , . . . , ~ k ) ) for( int j = 1 ; j <= WI_PART_SIZE_2 ; ++j ) my_res( i_sq ) = res_lcl[ WI_ID_1 ][0]; res_prv += my_p_wi( i, j, 0 ) * my_p_wi( i, j, 1 ); barrier( CLK_LOCAL_MEM_FENCE ); // store result in local memory } // end of for-loop j_sq } // end of for-loop i_sq res_lcl[ WI_ID_1 ][ WI_ID_2 ] = res_prv; } // end of kernel barrier( CLK_LOCAL_MEM_FENCE ); // combine the WIs' results in dimension x … for( int stride = NUM_WI_2 / 2 ; stride > 0 ; stride /= 2) { if( WI_ID_2 < stride) res_lcl[ WI_ID_1 ][ WI_ID_2 ] += res_lcl[ WI_ID_1 ][ WI_ID_2 + stride ]; barrier( CLK_LOCAL_MEM_FENCE ); } // store WGs' results in global memory if( WI_ID_2 == 0 ) my_res( i_sq ) = res_lcl[ WI_ID_1 ][0]; __kernel void gemv_fst( __global float* in_matrix, barrier( CLK_LOCAL_MEM_FENCE ); __global float* in_vector, __global float* out_vector, { } // end of for-loop j_sq // private memory for a WI's computation } // end of for-loop i_sq __private float res_prv = 0.0f; } // end of kernel // local memory for a WG's computation __local float res_lcl[ NUM_WI_1 ][ NUM_WI_2 ]; // iteration over P_sq blocks for( int i_sq = 1 ; i_sq <= NUM_SQ_1 ; ++i_sq ) { for( int j_sq = 1 ; j_sq <= NUM_SQ_2 ; ++j_sq ) { res_prv = 0.0f; // sequential computation on a P_wi partition (1) (2) (3) for( int i = 1 ; i <= WI_PART_SIZE_1 ; ++i ) for( int j = 1 ; j <= WI_PART_SIZE_2 ; ++j ) res_prv += my_p_wi( i, j, 0 ) * my_p_wi( i, j, 1 ); // store result in local memory res_lcl[ WI_ID_1 ][ WI_ID_2 ] = res_prv; barrier( CLK_LOCAL_MEM_FENCE ); // combine the WIs' results in dimension x for( int stride = NUM_WI_2 / 2 ; stride > 0 ; stride /= 2) { if( WI_ID_2 < stride) res_lcl[ WI_ID_1 ][ WI_ID_2 ] += res_lcl[ WI_ID_1 ][ WI_ID_2 + stride ]; barrier( CLK_LOCAL_MEM_FENCE ); Generation Optimization Execution } // store WGs' results in global memory if( WI_ID_2 == 0 ) my_res( i_sq ) = res_lcl[ WI_ID_1 ][0]; barrier( CLK_LOCAL_MEM_FENCE ); } // end of for-loop j_sq } // end of for-loop i_sq } // end of kernel [PACT’19, IJPP’18] [CCPE’18, HPCC’17] [JOS’19, ICPADS’18] • Multi-Dimensional Homomorphisms (MDHs) are a formally defined class of functions that cover 
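     The identifiers NUM_WI_*, NUM_SQ_*, and WI_PART_SIZE_* in the kernel above
     are the performance-critical parameters in which the generated code is
     parametrized. As a minimal sketch of how one tuning configuration could be
     instantiated via standard OpenCL build options (the function name, the
     concrete values, and the WI_ID_* macro mapping are illustrative
     assumptions, not the actual md_poly tool flow):

       #include <CL/cl.h>

       /* Sketch: build the generated kernel with one concrete (assumed) tuning
          configuration; an auto-tuner would search this parameter space per
          device and per input size. */
       cl_int build_tuned( cl_program program, cl_device_id device )
       {
           const char* flags =
               "-D NUM_WI_1=16 -D NUM_WI_2=16 "            /* work-items per WG     */
               "-D NUM_SQ_1=4 -D NUM_SQ_2=4 "              /* sequential blocks     */
               "-D WI_PART_SIZE_1=8 -D WI_PART_SIZE_2=8 "  /* per-WI partition size */
               "-D WI_ID_1=get_local_id(0) "               /* assumed WI_ID_*       */
               "-D WI_ID_2=get_local_id(1)";               /* macro definitions     */
           return clBuildProgram( program, 1, &device, flags, NULL, NULL );
       }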
     • Multi-Dimensional Homomorphisms (MDHs) are a formally defined class of
       functions that cover important data-parallel computations, e.g., linear
       algebra routines (BLAS) and stencil computations (example instances are
       sketched after this list).
     • We enable conveniently implementing MDHs by providing a high-level DSL
       for them.
     • We provide a DSL compiler that automatically generates OpenCL code, the
       standard for uniformly programming different parallel architectures
       (e.g., CPU and GPU).
     • Our OpenCL code is fully automatically optimizable (auto-tunable) for
       each combination of target architecture and input size: it is generated
       as targeted to OpenCL's abstract device models and parametrized in these
       models' performance-critical parameters.
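     To make the md_hom notation concrete, here is a sketch of example MDH
     instances (a paraphrase based on the MDH publications cited above, not the
     DSL's exact syntax): ++ denotes concatenation along a dimension, and +
     denotes point-wise addition of the scalar results of f.

       % Sketch (assumed notation): example MDH instances.
       \begin{align*}
         \mathit{dot}  &= \mathrm{md\_hom}(*,\ (+))
           && \text{1D: multiply point-wise, reduce by } + \\
         \mathit{gemv} &= \mathrm{md\_hom}(*,\ (+\!\!+,\ +))
           && \text{2D: keep the row dimension, reduce columns} \\
         \mathit{gemm} &= \mathrm{md\_hom}(*,\ (+\!\!+,\ +\!\!+,\ +))
           && \text{3D: reduce only the shared dimension}
       \end{align*}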

  3. Experimental Results

     Numbers report how much faster our MDH-generated code runs than the
     respective competitor (speedup factors; values below 1 favor the
     competitor).

     Stencils, CPU:        Gaussian (2D)      Jacobi (3D)
                           RW       PC        RW       PC
       vs. Lift [2]        4.90     5.96      1.94     2.49
       vs. MKL-DNN         6.99     14.31     N/A      N/A

     Stencils, GPU:        Gaussian (2D)      Jacobi (3D)
                           RW       PC        RW       PC
       vs. Lift [2]        2.33     1.09      1.14     1.02
       vs. cuDNN           3.78     19.11     N/A      N/A

     Data Mining (Probabilistic Record Linkage), CPU:
                           2¹⁵      2¹⁶      2¹⁷      2¹⁸      2¹⁹      2²⁰
       vs. EKR [5]         1.87     2.06     4.98     13.86    28.34    39.36

     Linear Algebra, CPU:  GEMM               GEMV
                           RW       PC        RW       PC
       vs. Lift [1]        fails    3.04      1.51     1.99
       vs. MKL             4.22     0.74      1.05     0.87

     Linear Algebra, GPU:  GEMM               GEMV
                           RW       PC        RW       PC
       vs. Lift [1]        4.33     1.17      3.52     2.98
       vs. cuBLAS          2.91     0.83      1.03     1.00

     Tensor Contractions, GPU:
                           RW 1   RW 2   RW 3   RW 4   RW 5   RW 6   RW 7   RW 8   RW 9
       vs. COGENT [3]      1.26   1.16   2.12   1.24   1.18   1.36   1.48   1.44   1.85
       vs. F-TC [4]        1.19   2.00   1.43   2.89   1.35   1.54   1.25   2.02   1.49

     Our MDH approach often achieves better performance than these
     well-performing competitors; the results are from Rasch, Schulze, Gorlatch,
     "Generating Portable High-Performance Code via Multi-Dimensional
     Homomorphisms", PACT'19.

     [1] Steuwer et al., "Lift: A Functional Data-Parallel IR for
         High-Performance GPU Code Generation", CGO'17.
     [2] Hagedorn et al., "High Performance Stencil Code Generation with Lift",
         CGO'18 (Best Paper Award).
     [3] Kim et al., "A Code Generator for High-Performance Tensor Contractions
         on GPUs", CGO'19.
     [4] Vasilache et al., "The Next 700 Accelerated Layers: From Mathematical
         Expressions of Network Computation Graphs to Accelerated GPU Kernels,
         Automatically", TACO, 2019.
     [5] Forchhammer et al., "Duplicate Detection on GPUs", HFSL'13.

  4. Observation

     Comparison: MDH approach vs. polyhedral approaches (e.g., PPCG)

     • Polyhedral approaches often provide better productivity: they
       automatically parallelize sequential program code (see the sketch after
       this list) rather than relying on a DSL.
     • The MDH approach often achieves higher performance than polyhedral
       compilers, and its generated code is portable across different
       architectures (e.g., GPU and CPU).

     Goal of this work: combining the advantages of both approaches.
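     For contrast with the DSL-based MDH front end, a polyhedral compiler such
     as PPCG consumes plain sequential C code in which the region to
     parallelize is marked as a static control part (SCoP). A minimal sketch of
     such an input for GEMV (the function signature is illustrative; the scop
     pragmas are PPCG's standard markers):

       // Sequential GEMV as a polyhedral compiler like PPCG would consume it.
       void gemv( int n, int m,
                  const float in_matrix[n][m],
                  const float in_vector[m],
                  float out_vector[n] )
       {
       #pragma scop
         for( int i = 0 ; i < n ; ++i ) {
           out_vector[i] = 0.0f;
           for( int j = 0 ; j < m ; ++j )
             out_vector[i] += in_matrix[i][j] * in_vector[j];
         }
       #pragma endscop
       }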

