
md_poly: A Performance-Portable Polyhedral Compiler based on Multi-Dimensional Homomorphisms


  1. md_poly: A Performance-Portable Polyhedral Compiler based on Multi-Dimensional Homomorphisms. Ari Rasch, Richard Schulze, Sergei Gorlatch. University of Münster, Germany.

  2. <latexit sha1_base64="yONhjet6jUxuF8XYlEGXPrQ7wns=">ACzXicbVHLbtswEKTV6q+nPbYC1EngNOogRSgaI9Bc+ktKVAnASxDoKiVTZgPgVylNlT12m/p1/Ta/E1oxwfb6QIEBrMzuyQnr6RwGMc3neDBw0ePn+w8DZ89f/HyVXf39YUzteUw4EYae5UzB1JoGKBACVeVBaZyCZf59HTRv7wG64TR3FewUixsRal4Aw9lXVP91KEGSI2qkiziVFtP41oGdE06qdcWC6hYA6zJEoLg45H6+T0wMsO9rJuLz6Kl0Xvg2QFemRV59lux/pvFagkUvm3DCJKxw1zKLws9twP60dVIxP2RiGHmqmwI2a5XNbu+ZgpbG+qORLtlw3VFci8otPdG2eV3XMOXcXOV+omI4cdu9Bfm/3rDG8vOoEbqETS/u1BZS4qGLv6YFsICRzn3gHEr/KMonzDLOPokNrbkyrtpbmRBhaLTVSZAjY0Dj2goMc+4s37u7lGNvOch/cKMV0kRaVrF1zmE6cV8H75sPHatYetmHoU0q2M7kPLo6PEo+/HfdOvqzy2iFvyTvSJwn5RE7IV3JOBoSTP+Qv+UdugrOgDn4Gv+6kQWfleUM2Kvh9C+A74Ws=</latexit> <latexit sha1_base64="yONhjet6jUxuF8XYlEGXPrQ7wns=">ACzXicbVHLbtswEKTV6q+nPbYC1EngNOogRSgaI9Bc+ktKVAnASxDoKiVTZgPgVylNlT12m/p1/Ta/E1oxwfb6QIEBrMzuyQnr6RwGMc3neDBw0ePn+w8DZ89f/HyVXf39YUzteUw4EYae5UzB1JoGKBACVeVBaZyCZf59HTRv7wG64TR3FewUixsRal4Aw9lXVP91KEGSI2qkiziVFtP41oGdE06qdcWC6hYA6zJEoLg45H6+T0wMsO9rJuLz6Kl0Xvg2QFemRV59lux/pvFagkUvm3DCJKxw1zKLws9twP60dVIxP2RiGHmqmwI2a5XNbu+ZgpbG+qORLtlw3VFci8otPdG2eV3XMOXcXOV+omI4cdu9Bfm/3rDG8vOoEbqETS/u1BZS4qGLv6YFsICRzn3gHEr/KMonzDLOPokNrbkyrtpbmRBhaLTVSZAjY0Dj2goMc+4s37u7lGNvOch/cKMV0kRaVrF1zmE6cV8H75sPHatYetmHoU0q2M7kPLo6PEo+/HfdOvqzy2iFvyTvSJwn5RE7IV3JOBoSTP+Qv+UdugrOgDn4Gv+6kQWfleUM2Kvh9C+A74Ws=</latexit> <latexit sha1_base64="yONhjet6jUxuF8XYlEGXPrQ7wns=">ACzXicbVHLbtswEKTV6q+nPbYC1EngNOogRSgaI9Bc+ktKVAnASxDoKiVTZgPgVylNlT12m/p1/Ta/E1oxwfb6QIEBrMzuyQnr6RwGMc3neDBw0ePn+w8DZ89f/HyVXf39YUzteUw4EYae5UzB1JoGKBACVeVBaZyCZf59HTRv7wG64TR3FewUixsRal4Aw9lXVP91KEGSI2qkiziVFtP41oGdE06qdcWC6hYA6zJEoLg45H6+T0wMsO9rJuLz6Kl0Xvg2QFemRV59lux/pvFagkUvm3DCJKxw1zKLws9twP60dVIxP2RiGHmqmwI2a5XNbu+ZgpbG+qORLtlw3VFci8otPdG2eV3XMOXcXOV+omI4cdu9Bfm/3rDG8vOoEbqETS/u1BZS4qGLv6YFsICRzn3gHEr/KMonzDLOPokNrbkyrtpbmRBhaLTVSZAjY0Dj2goMc+4s37u7lGNvOch/cKMV0kRaVrF1zmE6cV8H75sPHatYetmHoU0q2M7kPLo6PEo+/HfdOvqzy2iFvyTvSJwn5RE7IV3JOBoSTP+Qv+UdugrOgDn4Gv+6kQWfleUM2Kvh9C+A74Ws=</latexit> <latexit sha1_base64="yONhjet6jUxuF8XYlEGXPrQ7wns=">ACzXicbVHLbtswEKTV6q+nPbYC1EngNOogRSgaI9Bc+ktKVAnASxDoKiVTZgPgVylNlT12m/p1/Ta/E1oxwfb6QIEBrMzuyQnr6RwGMc3neDBw0ePn+w8DZ89f/HyVXf39YUzteUw4EYae5UzB1JoGKBACVeVBaZyCZf59HTRv7wG64TR3FewUixsRal4Aw9lXVP91KEGSI2qkiziVFtP41oGdE06qdcWC6hYA6zJEoLg45H6+T0wMsO9rJuLz6Kl0Xvg2QFemRV59lux/pvFagkUvm3DCJKxw1zKLws9twP60dVIxP2RiGHmqmwI2a5XNbu+ZgpbG+qORLtlw3VFci8otPdG2eV3XMOXcXOV+omI4cdu9Bfm/3rDG8vOoEbqETS/u1BZS4qGLv6YFsICRzn3gHEr/KMonzDLOPokNrbkyrtpbmRBhaLTVSZAjY0Dj2goMc+4s37u7lGNvOch/cKMV0kRaVrF1zmE6cV8H75sPHatYetmHoU0q2M7kPLo6PEo+/HfdOvqzy2iFvyTvSJwn5RE7IV3JOBoSTP+Qv+UdugrOgDn4Gv+6kQWfleUM2Kvh9C+A74Ws=</latexit> Our Background We are the developers of the MDH code generation approach: Executable Generic Different architectures program code program code and input sizes High-level parallel __kernel void gemv_fst( __global float* in_matrix, __global float* in_vector, __global float* out_vector, { // private memory for a WI's computation __private float res_prv = 0.0f; // local memory for a WG's computation __local float res_lcl[ NUM_WI_1 ][ NUM_WI_2 ]; __kernel void gemv_fst( __global float* in_matrix, programming abstractions __global float* in_vector, // iteration over P_sq blocks for( int i_sq = 1 ; i_sq <= NUM_SQ_1 ; ++i_sq ) { __global float* out_vector, for( int j_sq = 1 ; j_sq <= NUM_SQ_2 ; ++j_sq ) { { res_prv = 0.0f; // sequential computation on a P_wi partition // private memory for a WI's computation for( int i = 1 ; i <= WI_PART_SIZE_1 ; ++i ) for( int j = 1 ; j <= WI_PART_SIZE_2 ; ++j ) __private float res_prv = 0.0f; 
res_prv += my_p_wi( i, j, 0 ) * my_p_wi( i, j, 1 ); // store result in local memory // local memory for a WG's computation res_lcl[ WI_ID_1 ][ WI_ID_2 ] = res_prv; __local float res_lcl[ NUM_WI_1 ][ NUM_WI_2 ]; barrier( CLK_LOCAL_MEM_FENCE ); // iteration over P_sq blocks // combine the WIs' results in dimension x for( int stride = NUM_WI_2 / 2 ; stride > 0 ; stride /= 2) for( int i_sq = 1 ; i_sq <= NUM_SQ_1 ; ++i_sq ) { { for( int j_sq = 1 ; j_sq <= NUM_SQ_2 ; ++j_sq ) { if( WI_ID_2 < stride) res_lcl[ WI_ID_1 ][ WI_ID_2 ] += res_lcl[ WI_ID_1 ][ WI_ID_2 + stride ]; res_prv = 0.0f; barrier( CLK_LOCAL_MEM_FENCE ); } // sequential computation on a P_wi partition for( int i = 1 ; i <= WI_PART_SIZE_1 ; ++i ) // store WGs' results in global memory if( WI_ID_2 == 0 ) md hom ( f, ( ~ 1 , . . . , ~ k ) ) for( int j = 1 ; j <= WI_PART_SIZE_2 ; ++j ) my_res( i_sq ) = res_lcl[ WI_ID_1 ][0]; res_prv += my_p_wi( i, j, 0 ) * my_p_wi( i, j, 1 ); barrier( CLK_LOCAL_MEM_FENCE ); // store result in local memory } // end of for-loop j_sq } // end of for-loop i_sq res_lcl[ WI_ID_1 ][ WI_ID_2 ] = res_prv; } // end of kernel barrier( CLK_LOCAL_MEM_FENCE ); // combine the WIs' results in dimension x … for( int stride = NUM_WI_2 / 2 ; stride > 0 ; stride /= 2) { if( WI_ID_2 < stride) res_lcl[ WI_ID_1 ][ WI_ID_2 ] += res_lcl[ WI_ID_1 ][ WI_ID_2 + stride ]; barrier( CLK_LOCAL_MEM_FENCE ); } // store WGs' results in global memory if( WI_ID_2 == 0 ) my_res( i_sq ) = res_lcl[ WI_ID_1 ][0]; __kernel void gemv_fst( __global float* in_matrix, barrier( CLK_LOCAL_MEM_FENCE ); __global float* in_vector, __global float* out_vector, { } // end of for-loop j_sq // private memory for a WI's computation } // end of for-loop i_sq __private float res_prv = 0.0f; } // end of kernel // local memory for a WG's computation __local float res_lcl[ NUM_WI_1 ][ NUM_WI_2 ]; // iteration over P_sq blocks for( int i_sq = 1 ; i_sq <= NUM_SQ_1 ; ++i_sq ) { for( int j_sq = 1 ; j_sq <= NUM_SQ_2 ; ++j_sq ) { res_prv = 0.0f; // sequential computation on a P_wi partition (1) (2) (3) for( int i = 1 ; i <= WI_PART_SIZE_1 ; ++i ) for( int j = 1 ; j <= WI_PART_SIZE_2 ; ++j ) res_prv += my_p_wi( i, j, 0 ) * my_p_wi( i, j, 1 ); // store result in local memory res_lcl[ WI_ID_1 ][ WI_ID_2 ] = res_prv; barrier( CLK_LOCAL_MEM_FENCE ); // combine the WIs' results in dimension x for( int stride = NUM_WI_2 / 2 ; stride > 0 ; stride /= 2) { if( WI_ID_2 < stride) res_lcl[ WI_ID_1 ][ WI_ID_2 ] += res_lcl[ WI_ID_1 ][ WI_ID_2 + stride ]; barrier( CLK_LOCAL_MEM_FENCE ); Generation Optimization Execution } // store WGs' results in global memory if( WI_ID_2 == 0 ) my_res( i_sq ) = res_lcl[ WI_ID_1 ][0]; barrier( CLK_LOCAL_MEM_FENCE ); } // end of for-loop j_sq } // end of for-loop i_sq } // end of kernel [PACT’19, IJPP’18] [CCPE’18, HPCC’17] [JOS’19, ICPADS’18] • Multi-Dimensional Homomorphisms (MDHs) are a formally defined class of functions that cover 
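     The identifiers NUM_WI_*, NUM_SQ_*, and WI_PART_SIZE_* in the kernel above
     are the performance-critical parameters in which the generated code is
     parametrized. As a minimal sketch of how one tuning configuration could be
     instantiated via standard OpenCL build options (the function name, the
     concrete values, and the WI_ID_* macro mapping are illustrative
     assumptions, not the actual md_poly tool flow):

       #include <CL/cl.h>

       /* Sketch: build the generated kernel with one concrete (assumed) tuning
          configuration; an auto-tuner would search this parameter space per
          device and per input size. */
       cl_int build_tuned( cl_program program, cl_device_id device )
       {
           const char* flags =
               "-D NUM_WI_1=16 -D NUM_WI_2=16 "            /* work-items per WG     */
               "-D NUM_SQ_1=4 -D NUM_SQ_2=4 "              /* sequential blocks     */
               "-D WI_PART_SIZE_1=8 -D WI_PART_SIZE_2=8 "  /* per-WI partition size */
               "-D WI_ID_1=get_local_id(0) "               /* assumed WI_ID_*       */
               "-D WI_ID_2=get_local_id(1)";               /* macro definitions     */
           return clBuildProgram( program, 1, &device, flags, NULL, NULL );
       }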
     • Multi-Dimensional Homomorphisms (MDHs) are a formally defined class of
       functions that cover important data-parallel computations, e.g., linear
       algebra routines (BLAS) and stencil computations (example instances are
       sketched after this list).
     • We enable conveniently implementing MDHs by providing a high-level DSL
       for them.
     • We provide a DSL compiler that automatically generates OpenCL code, the
       standard for uniformly programming different parallel architectures
       (e.g., CPU and GPU).
     • Our OpenCL code is fully automatically optimizable (auto-tunable) for
       each combination of target architecture and input size: it is generated
       as targeted to OpenCL's abstract device models and parametrized in these
       models' performance-critical parameters.
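     To make the md_hom notation concrete, here is a sketch of example MDH
     instances (a paraphrase based on the MDH publications cited above, not the
     DSL's exact syntax): ++ denotes concatenation along a dimension, and +
     denotes point-wise addition of the scalar results of f.

       % Sketch (assumed notation): example MDH instances.
       \begin{align*}
         \mathit{dot}  &= \mathrm{md\_hom}(*,\ (+))
           && \text{1D: multiply point-wise, reduce by } + \\
         \mathit{gemv} &= \mathrm{md\_hom}(*,\ (+\!\!+,\ +))
           && \text{2D: keep the row dimension, reduce columns} \\
         \mathit{gemm} &= \mathrm{md\_hom}(*,\ (+\!\!+,\ +\!\!+,\ +))
           && \text{3D: reduce only the shared dimension}
       \end{align*}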

  3. Experimental Results

     Numbers report how much faster our MDH-generated code runs than the
     respective competitor (speedup factors; values below 1 favor the
     competitor).

     Stencils, CPU:        Gaussian (2D)      Jacobi (3D)
                           RW       PC        RW       PC
       vs. Lift [2]        4.90     5.96      1.94     2.49
       vs. MKL-DNN         6.99     14.31     N/A      N/A

     Stencils, GPU:        Gaussian (2D)      Jacobi (3D)
                           RW       PC        RW       PC
       vs. Lift [2]        2.33     1.09      1.14     1.02
       vs. cuDNN           3.78     19.11     N/A      N/A

     Data Mining (Probabilistic Record Linkage), CPU:
                           2¹⁵      2¹⁶      2¹⁷      2¹⁸      2¹⁹      2²⁰
       vs. EKR [5]         1.87     2.06     4.98     13.86    28.34    39.36

     Linear Algebra, CPU:  GEMM               GEMV
                           RW       PC        RW       PC
       vs. Lift [1]        fails    3.04      1.51     1.99
       vs. MKL             4.22     0.74      1.05     0.87

     Linear Algebra, GPU:  GEMM               GEMV
                           RW       PC        RW       PC
       vs. Lift [1]        4.33     1.17      3.52     2.98
       vs. cuBLAS          2.91     0.83      1.03     1.00

     Tensor Contractions, GPU:
                           RW 1   RW 2   RW 3   RW 4   RW 5   RW 6   RW 7   RW 8   RW 9
       vs. COGENT [3]      1.26   1.16   2.12   1.24   1.18   1.36   1.48   1.44   1.85
       vs. F-TC [4]        1.19   2.00   1.43   2.89   1.35   1.54   1.25   2.02   1.49

     Our MDH approach often achieves better performance than these
     well-performing competitors; the results are from Rasch, Schulze, Gorlatch,
     "Generating Portable High-Performance Code via Multi-Dimensional
     Homomorphisms", PACT'19.

     [1] Steuwer et al., "Lift: A Functional Data-Parallel IR for
         High-Performance GPU Code Generation", CGO'17.
     [2] Hagedorn et al., "High Performance Stencil Code Generation with Lift",
         CGO'18 (Best Paper Award).
     [3] Kim et al., "A Code Generator for High-Performance Tensor Contractions
         on GPUs", CGO'19.
     [4] Vasilache et al., "The Next 700 Accelerated Layers: From Mathematical
         Expressions of Network Computation Graphs to Accelerated GPU Kernels,
         Automatically", TACO, 2019.
     [5] Forchhammer et al., "Duplicate Detection on GPUs", HFSL'13.

  4. Observation

     Comparison: MDH approach vs. polyhedral approaches (e.g., PPCG)

     • Polyhedral approaches often provide better productivity: they
       automatically parallelize sequential program code (see the sketch after
       this list) rather than relying on a DSL.
     • The MDH approach often achieves higher performance than polyhedral
       compilers, and its generated code is portable across different
       architectures (e.g., GPU and CPU).

     Goal of this work: combining the advantages of both approaches.
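     For contrast with the DSL-based MDH front end, a polyhedral compiler such
     as PPCG consumes plain sequential C code in which the region to
     parallelize is marked as a static control part (SCoP). A minimal sketch of
     such an input for GEMV (the function signature is illustrative; the scop
     pragmas are PPCG's standard markers):

       // Sequential GEMV as a polyhedral compiler like PPCG would consume it.
       void gemv( int n, int m,
                  const float in_matrix[n][m],
                  const float in_vector[m],
                  float out_vector[n] )
       {
       #pragma scop
         for( int i = 0 ; i < n ; ++i ) {
           out_vector[i] = 0.0f;
           for( int j = 0 ; j < m ; ++j )
             out_vector[i] += in_matrix[i][j] * in_vector[j];
         }
       #pragma endscop
       }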

