GraphIt: A DSL for High-Performance Graph Analytics (PowerPoint presentation)



slide-1
SLIDE 1

Yunming Zhang, Mengjiao Yang, Riyadh Baghdadi, Shoaib Kamil, Julian Shun, and Saman Amarasinghe

GraphIt: A DSL for High-Performance Graph Analytics

1
slide-2
SLIDE 2

void pagerank(Graph &graph, double *new_rank, double *old_rank,
              int *out_degree, int max_iter) {
    for (int i = 0; i < max_iter; i++) {
        for (src : graph.vertices()) {
            for (dst : graph.getOutgoingNeighbors(src)) {
                new_rank[dst] += old_rank[src] / out_degree[src];
            }
        }
        for (node : graph.vertices()) {
            new_rank[node] = base_score + damping * new_rank[node];
        }
        swap(old_rank, new_rank);
    }
}

PageRank Example in C++

2
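The code on the slide is pseudocode (the Graph type and rank setup are left implicit). A self-contained runnable sketch of the same push-style PageRank on a toy adjacency-list graph, where the Graph struct, damping constant, and initialization are illustrative stand-ins rather than the talk's actual types, could look like:

```cpp
#include <algorithm>
#include <utility>
#include <vector>

// Hypothetical minimal graph: out[v] holds the out-neighbors of vertex v.
struct Graph {
    std::vector<std::vector<int>> out;
    int num_vertices() const { return (int)out.size(); }
};

// Serial PageRank mirroring the slide's loop structure: push contributions
// along out-edges, then apply the damping step, then swap rank arrays.
std::vector<double> pagerank(const Graph &g, int max_iter,
                             double damping = 0.85) {
    int n = g.num_vertices();
    double base_score = (1.0 - damping) / n;
    std::vector<double> old_rank(n, 1.0 / n), new_rank(n, 0.0);
    for (int i = 0; i < max_iter; i++) {
        for (int src = 0; src < n; src++)      // push along out-edges
            for (int dst : g.out[src])
                new_rank[dst] += old_rank[src] / g.out[src].size();
        for (int v = 0; v < n; v++)            // damping step
            new_rank[v] = base_score + damping * new_rank[v];
        std::swap(old_rank, new_rank);
        std::fill(new_rank.begin(), new_rank.end(), 0.0);
    }
    return old_rank;
}
```

Since every vertex in the toy graph has at least one out-edge, the ranks remain a probability distribution (they sum to 1) across iterations.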

slide-3
SLIDE 3

(Same PageRank C++ code as Slide 2.)

PageRank Example in C++

3

slide-4
SLIDE 4

(Same PageRank C++ code as Slide 2.)

PageRank Example in C++

4

slide-5
SLIDE 5

5

Hand-Optimized C++

More than 23x faster

Intel Xeon E5-2695 v3 CPUs with 12 cores each for a total of 24 cores.

template<typename APPLY_FUNC>
void edgeset_apply_pull_parallel(Graph &g, APPLY_FUNC apply_func) {
    int64_t numVertices = g.num_nodes(), numEdges = g.num_edges();
    parallel_for(int n = 0; n < numVertices; n++) {
        for (int socketId = 0; socketId < omp_get_num_places(); socketId++) {
            local_new_rank[socketId][n] = new_rank[n];
        }
    }
    int numPlaces = omp_get_num_places();
    int numSegments = g.getNumSegments("s1");
    int segmentsPerSocket = (numSegments + numPlaces - 1) / numPlaces;
    #pragma omp parallel num_threads(numPlaces) proc_bind(spread)
    {
        int socketId = omp_get_place_num();
        for (int i = 0; i < segmentsPerSocket; i++) {
            int segmentId = socketId + i * numPlaces;
            if (segmentId >= numSegments) break;
            auto sg = g.getSegmentedGraph(std::string("s1"), segmentId);
            #pragma omp parallel num_threads(omp_get_place_num_procs(socketId)) proc_bind(close)
            {
                #pragma omp for schedule(dynamic, 1024)
                for (NodeID localId = 0; localId < sg->numVertices; localId++) {
                    NodeID d = sg->graphId[localId];
                    for (int64_t ngh = sg->vertexArray[localId];
                         ngh < sg->vertexArray[localId + 1]; ngh++) {
                        NodeID s = sg->edgeArray[ngh];
                        local_new_rank[socketId][d] += contrib[s];
                    }
                }
            }
        }
    }
    parallel_for(int n = 0; n < numVertices; n++) {
        for (int socketId = 0; socketId < omp_get_num_places(); socketId++) {
            new_rank[n] += local_new_rank[socketId][n];
        }
    }
}

struct updateVertex {
    void operator()(NodeID v) {
        double old_score = old_rank[v];
        new_rank[v] = beta_score + (damp * new_rank[v]);
        error[v] = fabs(new_rank[v] - old_rank[v]);
        old_rank[v] = new_rank[v];
        new_rank[v] = 0;
    }
};

void pagerank(Graph &g, double *new_rank, double *old_rank,
              int *out_degree, int max_iter) {
    for (int i = 0; i < max_iter; i++) {
        parallel_for(int v = 0; v < builtin_getVertices(edges); v++) {
            contrib[v] = old_rank[v] / out_degree[v];
        }
        edgeset_apply_pull_parallel(edges, updateEdge());
        parallel_for(int v = 0; v < builtin_getVertices(edges); v++) {
            updateVertex()(v);
        }
    }
}

slide-6
SLIDE 6

6

Hand-Optimized C++

More than 23x faster

Intel Xeon E5-2695 v3 CPUs with 12 cores each for a total of 24 cores.

NUMA Optimized, Cache Optimized, Multi-Threaded, Load Balanced

(Same hand-optimized C++ code as Slide 5.)

slide-7
SLIDE 7

7

Hand-Optimized C++

More than 23x faster

Intel Xeon E5-2695 v3 CPUs with 12 cores each for a total of 24 cores.

NUMA Optimized, Cache Optimized, Multi-Threaded, Load Balanced
(1) Hard to write correctly
(2) Extremely difficult to experiment with different combinations of optimizations

(Same hand-optimized C++ code as Slide 5.)

slide-8
SLIDE 8

8

Optimization Tradeoff Space: Locality, Parallelism, Work-Efficiency

Example techniques: Push, Pull, Partitioning, Vertex-Parallel, Edge-Aware Vertex-Parallel, Bitvector, ….

slide-9
SLIDE 9

9

Optimizations

slide-10
SLIDE 10

10

Graphs Optimizations

slide-11
SLIDE 11

11

Graphs Algorithms Optimizations

slide-12
SLIDE 12

12

Hardware Algorithms Graphs Optimizations

slide-13
SLIDE 13

13

Hardware Algorithms Graphs Optimizations

Bad sets of optimizations can be > 100x slower

slide-14
SLIDE 14

GraphIt

14

A Domain-Specific Language for Graph Analytics

  • Decouple algorithm from optimization for graph programs
  • Algorithm: What to Compute
  • Optimization (schedule): How to Compute
  • Optimization (schedule) representation
  • Easy for users to try different combinations
  • Powerful enough to beat hand-optimized libraries by up to 4.8x
slide-15
SLIDE 15

GraphIt

15

A Domain-Specific Language for Graph Analytics

(Same points as Slide 14.)
slide-16
SLIDE 16

GraphIt DSL

16

Algorithm Representation (Algorithm Language)
Optimization Representation:
  • Scheduling Language
  • Schedule Representation (e.g., Graph Iteration Space)
Autotuner

slide-17
SLIDE 17

GraphIt DSL

17

(GraphIt DSL overview, same as previous slide.)

slide-18
SLIDE 18

Algorithm Language

18

slide-19
SLIDE 19

Algorithm Language

19

edges.apply(func)

slide-20
SLIDE 20

Algorithm Language

20

edges.apply(func) edges.from(vertexset) .to(vertexset) .srcFilter(filtF) .dstFilter(filtF)
 .apply(func)

slide-21
SLIDE 21

Algorithm Language

21

edges.apply(func) edges.from(vertexset) .to(vertexset) .srcFilter(filtF) .dstFilter(filtF)
 .apply(func) vertices.apply(func)
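The chained edgeset operators above compose endpoint filters with an apply function. A plain C++ sketch of the same idea (this is an illustration of the semantics, not GraphIt's actual implementation) is a traversal that visits only edges whose endpoints pass both predicates:

```cpp
#include <functional>
#include <utility>
#include <vector>

using NodeID = int;
using Edges = std::vector<std::pair<NodeID, NodeID>>;

// Sketch of edges.srcFilter(filtF).dstFilter(filtF).apply(func):
// visit each (src, dst) edge whose endpoints pass both filters.
void edgeset_apply(const Edges &edges,
                   std::function<bool(NodeID)> srcFilter,
                   std::function<bool(NodeID)> dstFilter,
                   std::function<void(NodeID, NodeID)> apply) {
    for (const auto &[src, dst] : edges)
        if (srcFilter(src) && dstFilter(dst))
            apply(src, dst);
}
```

`from(vertexset)` / `to(vertexset)` behave analogously, with set-membership tests in place of arbitrary predicates; keeping these operators declarative is what lets GraphIt pick the traversal direction and data layout later.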

slide-22
SLIDE 22

PageRank Example

22

func updateEdge(src: Vertex, dst: Vertex)
    new_rank[dst] += old_rank[src] / out_degree[src]
end

slide-23
SLIDE 23

PageRank Example

23

func updateEdge(src: Vertex, dst: Vertex)
    new_rank[dst] += old_rank[src] / out_degree[src]
end

func updateVertex(v: Vertex)
    new_rank[v] = beta_score + 0.85 * new_rank[v]
    old_rank[v] = new_rank[v]
    new_rank[v] = 0
end

slide-24
SLIDE 24

PageRank Example

24

func updateEdge(src: Vertex, dst: Vertex)
    new_rank[dst] += old_rank[src] / out_degree[src]
end

func updateVertex(v: Vertex)
    new_rank[v] = beta_score + 0.85 * new_rank[v]
    old_rank[v] = new_rank[v]
    new_rank[v] = 0
end

func main()
    for i in 1:max_iter
        #s1# edges.apply(updateEdge)
        vertices.apply(updateVertex)
    end
end

slide-25
SLIDE 25

PageRank Example

25

(Same GraphIt program as Slide 24.)

slide-26
SLIDE 26

PageRank Example

26

(Same GraphIt program as Slide 24.)

slide-27
SLIDE 27

GraphIt DSL

27

Algorithm Representation (Algorithm Language)
Optimization Representation:
  • Scheduling Language
  • Schedule Representation (e.g., Graph Iteration Space)
Autotuner

slide-28
SLIDE 28

GraphIt DSL

28

(GraphIt DSL overview, same as previous slide.)

slide-29
SLIDE 29

Scheduling Language

29

(PageRank algorithm specification, same as Slide 24.)

slide-30
SLIDE 30

Scheduling Language

30

Algorithm Specification

(PageRank algorithm specification, same as Slide 24.)

slide-31
SLIDE 31

Scheduling Functions

Schedule 1

31

schedule: program->configApplyDirection("s1", "SparsePush");

Algorithm Specification

(PageRank algorithm specification, same as Slide 24.)

slide-32
SLIDE 32

Algorithm Specification

(PageRank algorithm specification, same as Slide 24.)

Scheduling Functions

Schedule 1

32

schedule: program->configApplyDirection("s1", "SparsePush");

slide-33
SLIDE 33

Algorithm Specification

(PageRank algorithm specification, same as Slide 24.)

Scheduling Functions

Schedule 1

33

schedule: program->configApplyDirection("s1", "SparsePush");

Pseudo Generated Code

double *new_rank = new double[num_verts];
double *old_rank = new double[num_verts];
int *out_degree = new int[num_verts];
…
for (NodeID src : vertices) {
    for (NodeID dst : G.getOutNgh(src)) {
        new_rank[dst] += old_rank[src] / out_degree[src];
    }
}
….

slide-34
SLIDE 34

Pseudo Generated Code Scheduling Functions

Schedule 2

34

schedule:
program->configApplyDirection("s1", "SparsePush");
program->configApplyParallelization("s1", "dynamic-vertex-parallel");

double *new_rank = new double[num_verts];
double *old_rank = new double[num_verts];
int *out_degree = new int[num_verts];
…
parallel_for (NodeID src : vertices) {
    for (NodeID dst : G.getOutNgh(src)) {
        atomic_add(new_rank[dst], old_rank[src] / out_degree[src]);
    }
}
….
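The parallel push schedule needs atomic_add because multiple sources may update the same destination concurrently. Standard C++ before C++20 has no fetch_add on atomic<double>, so one common sketch (illustrative; GraphIt's generated code uses its own atomic_add intrinsic) emulates it with a compare-exchange loop:

```cpp
#include <atomic>
#include <thread>
#include <vector>

// CAS loop emulating atomic addition on a double.
void atomic_add(std::atomic<double> &target, double delta) {
    double old_val = target.load();
    // On failure, compare_exchange_weak refreshes old_val with the
    // current value, so we simply retry with the updated base.
    while (!target.compare_exchange_weak(old_val, old_val + delta)) {
    }
}
```

This per-update synchronization is exactly the cost the pull direction avoids, which is why the direction choice is exposed as a schedule rather than hard-coded.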

Algorithm Specification

(PageRank algorithm specification, same as Slide 24.)

slide-35
SLIDE 35

Pseudo Generated Code

double *new_rank = new double[num_verts];
double *old_rank = new double[num_verts];
int *out_degree = new int[num_verts];
…
parallel_for (NodeID dst : vertices) {
    for (NodeID src : G.getInNgh(dst)) {
        new_rank[dst] += old_rank[src] / out_degree[src];
    }
}
….

Scheduling Functions

Schedule 3

35

schedule:
program->configApplyDirection("s1", "DensePull");
program->configApplyParallelization("s1", "dynamic-vertex-parallel");

Algorithm Specification

(PageRank algorithm specification, same as Slide 24.)

slide-36
SLIDE 36

Pseudo Generated Code

double *new_rank = new double[num_verts];
double *old_rank = new double[num_verts];
int *out_degree = new int[num_verts];
…
for (Subgraph sg : G.subgraphs) {
    parallel_for (NodeID dst : sg.vertices) {
        for (NodeID src : sg.getInNgh(dst)) {
            new_rank[dst] += old_rank[src] / out_degree[src];
        }
    }
}
….

Scheduling Functions

Schedule 4

36

schedule:
program->configApplyDirection("s1", "DensePull");
program->configApplyParallelization("s1", "dynamic-vertex-parallel");
program->configApplyNumSSG("s1", "fixed-vertex-count", 10);

Algorithm Specification

(PageRank algorithm specification, same as Slide 24.)
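configApplyNumSSG splits the graph into segments so each segment's working set fits in cache. A sketch of fixed-vertex-count partitioning over an edge list (GraphIt actually builds segmented CSR structures; the edge-list form and names here are simplifications) could be:

```cpp
#include <utility>
#include <vector>

using NodeID = int;
using Edge = std::pair<NodeID, NodeID>;  // (src, dst)

// Partition edges into segments by which fixed-size vertex range the
// SOURCE falls into, so processing one segment only reads a bounded
// slice of the source-data array (the cache optimization's goal).
std::vector<std::vector<Edge>> segment_by_source(
        const std::vector<Edge> &edges, int num_vertices,
        int vertices_per_segment) {
    int num_segments =
        (num_vertices + vertices_per_segment - 1) / vertices_per_segment;
    std::vector<std::vector<Edge>> segments(num_segments);
    for (const Edge &e : edges)
        segments[e.first / vertices_per_segment].push_back(e);
    return segments;
}
```

Processing segments one at a time trades a full pass over the destinations per segment for much better hit rates on the source array, which is why the segment count is a tunable schedule parameter.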

slide-37
SLIDE 37

Speedups of Schedules

37

Chart: speedups of Schedule 1 through Schedule 4 (y-axis: speedup, up to 25x)

Twitter graph with 41M vertices and 1.47B edges Intel Xeon E5-2695 v3 CPUs with 12 cores each for a total of 24 cores

slide-38
SLIDE 38

38

  • Direction optimizations (configApplyDirection): SparsePush, DensePush, DensePull, DensePull-SparsePush, DensePush-SparsePush
  • Parallelization strategies (configApplyParallelization): serial, dynamic-vertex-parallel, static-vertex-parallel, edge-aware-dynamic-vertex-parallel, edge-parallel
  • Cache (configApplyNumSSG): fixed-vertex-count, edge-aware-vertex-count
  • NUMA (configApplyNUMA): serial, static-parallel, dynamic-parallel
  • Data layout (fuseFields): AoS, SoA
  • Vertexset data layout (configApplyDenseVertexSet): bitvector, boolean array

Many More Optimizations
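The vertexset layout choice above (bitvector vs. boolean array) trades space for per-access cost: a boolean array spends one byte per vertex but has trivial reads and writes, while a bitvector is 8x denser, which helps cache behavior for large frontiers. A sketch of the bitvector side (an illustration, not GraphIt's internal type):

```cpp
#include <cstdint>
#include <vector>

// Dense vertexset as a bitvector: one bit per vertex. Compare with a
// boolean array (std::vector<char> of size n), which uses 8x the space
// but needs no bit arithmetic per access.
struct BitvectorSet {
    std::vector<uint64_t> words;
    explicit BitvectorSet(int n) : words((n + 63) / 64, 0) {}
    void insert(int v) { words[v / 64] |= (uint64_t(1) << (v % 64)); }
    bool contains(int v) const {
        return (words[v / 64] >> (v % 64)) & 1;
    }
};
```

Which layout wins depends on frontier density, which varies across algorithms and even across iterations of one algorithm, so GraphIt leaves it to the schedule.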

slide-39
SLIDE 39

GraphIt DSL

39

Algorithm Representation (Algorithm Language)
Optimization Representation:
  • Scheduling Language
  • Schedule Representation (e.g., Graph Iteration Space)
Autotuner

slide-40
SLIDE 40

GraphIt DSL

40

(GraphIt DSL overview, same as previous slide.)

slide-41
SLIDE 41

State of the Art and GraphIt

41

Intel Xeon E5-2695 v3 CPUs with 12 cores each for a total of 24 cores

Chart: performance of GraphIt vs. state-of-the-art frameworks (PPoPP13, SOSP13, PPoPP18, ASPLOS12, OSDI16, VLDB15, OOPSLA18)

slide-42
SLIDE 42

42

State of the Art and GraphIt

(Same chart as Slide 41.)

slide-43
SLIDE 43

43

State of the Art and GraphIt

(Same chart as Slide 41.)

slide-44
SLIDE 44

44

Ease-of-Use

Reduces the lines of code by up to an order of magnitude compared to the next fastest framework

(Same chart as Slide 41.)

slide-45
SLIDE 45

Summary

  • Performance of graph programs depends highly on data, algorithm, and hardware
  • GraphIt decouples algorithm from optimization to achieve high performance and programmability
  • Results are for multicore so far, but we are working on a GPU backend
  • Open source (graphit-lang.org)

45