Toward a Core Design to Distribute an Execution on a Manycore Processor



SLIDE 1

PaCT’2015, Petrozavodsk, August 31 - September 4, 2015

Toward a Core Design to Distribute an Execution on a Manycore Processor.

Bernard Goossens, David Parello, Katarzyna Porada, Djallal Rahmoune
Université de Perpignan Via Domitia, DALI-LIRMM

SLIDE 2

Summary.

1. Parallelization of a C Code.
2. Automatic Hardware Parallelization.
3. Determinism.
4. Conclusion.

SLIDE 3

Parallelization of a C Code.

SLIDE 4

Example : a sum reduction.

long sum(long t[], unsigned int n) {
    if (n == 1) return t[0];
    if (n == 2) return t[0] + t[1];
    return sum(t, n/2) + sum(&(t[n/2]), n - n/2);
}

This code looks sequential. Let us parallelize it.
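Aside (not on the slides): a minimal driver sketch for checking the sequential reduction above; the test array and its values are arbitrary choices.

#include <stdio.h>

long sum(long t[], unsigned int n);   /* the reduction above */

int main(void) {
    long t[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    printf("%ld\n", sum(t, 8));        /* prints 36 */
    return 0;
}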

SLIDE 5

What we do today : e.g. using pthreads.

typedef struct { unsigned long *p; unsigned long i; } ST;

void *sum(void *st) {
    ST str1, str2;
    unsigned long s, s1, s2;
    pthread_t tid1, tid2;
    if (((ST *)st)->i > 2) {
        str1.p = ((ST *)st)->p;
        str1.i = ((ST *)st)->i / 2;
        pthread_create(&tid1, NULL, sum, (void *)&str1);
        str2.p = ((ST *)st)->p + ((ST *)st)->i / 2;
        str2.i = ((ST *)st)->i - ((ST *)st)->i / 2;
        pthread_create(&tid2, NULL, sum, (void *)&str2);
    }
    else if (((ST *)st)->i == 1) { s1 = ((ST *)st)->p[0]; s2 = 0; }
    else { s1 = ((ST *)st)->p[0]; s2 = ((ST *)st)->p[1]; }
    s = s1 + s2;
    pthread_exit((void *)s);
}

SLIDE 6

What we do today : e.g. using pthreads.

typedef struct { unsigned long *p; unsigned long i; } ST;

void *sum(void *st) {
    ST str1, str2;
    unsigned long s, s1, s2;
    pthread_t tid1, tid2;
    if (((ST *)st)->i > 2) {
        str1.p = ((ST *)st)->p;
        str1.i = ((ST *)st)->i / 2;
        pthread_create(&tid1, NULL, sum, (void *)&str1);
        str2.p = ((ST *)st)->p + ((ST *)st)->i / 2;
        str2.i = ((ST *)st)->i - ((ST *)st)->i / 2;
        pthread_create(&tid2, NULL, sum, (void *)&str2);
    }
    else if (((ST *)st)->i == 1) { s1 = ((ST *)st)->p[0]; s2 = 0; }
    else { s1 = ((ST *)st)->p[0]; s2 = ((ST *)st)->p[1]; }
    s = s1 + s2;
    pthread_exit((void *)s);
}

The code is multithreaded.

SLIDE 7

What we do today : e.g. using pthreads.

typedef struct { unsigned long *p; unsigned long i; } ST;

void *sum(void *st) {
    ST str1, str2;
    unsigned long s, s1, s2;
    pthread_t tid1, tid2;
    if (((ST *)st)->i > 2) {
        str1.p = ((ST *)st)->p;
        str1.i = ((ST *)st)->i / 2;
        pthread_create(&tid1, NULL, sum, (void *)&str1);
        str2.p = ((ST *)st)->p + ((ST *)st)->i / 2;
        str2.i = ((ST *)st)->i - ((ST *)st)->i / 2;
        pthread_create(&tid2, NULL, sum, (void *)&str2);
    }
    else if (((ST *)st)->i == 1) { s1 = ((ST *)st)->p[0]; s2 = 0; }
    else { s1 = ((ST *)st)->p[0]; s2 = ((ST *)st)->p[1]; }
    s = s1 + s2;
    pthread_exit((void *)s);
}

The code is multithreaded. Thread executions are non-deterministically ordered.

SLIDE 8

What we do today : e.g. using pthreads.

typedef struct { unsigned long *p; unsigned long i; } ST;

void *sum(void *st) {
    ST str1, str2;
    unsigned long s, s1, s2;
    pthread_t tid1, tid2;
    if (((ST *)st)->i > 2) {
        str1.p = ((ST *)st)->p;
        str1.i = ((ST *)st)->i / 2;
        pthread_create(&tid1, NULL, sum, (void *)&str1);
        str2.p = ((ST *)st)->p + ((ST *)st)->i / 2;
        str2.i = ((ST *)st)->i - ((ST *)st)->i / 2;
        pthread_create(&tid2, NULL, sum, (void *)&str2);
    }
    else if (((ST *)st)->i == 1) { s1 = ((ST *)st)->p[0]; s2 = 0; }
    else { s1 = ((ST *)st)->p[0]; s2 = ((ST *)st)->p[1]; }
    s = s1 + s2;
    pthread_exit((void *)s);
}

The code is multithreaded. Thread executions are non-deterministically ordered. Too little synchronization => the result is not deterministic.
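Aside (not on the slides): a minimal sketch of what too little synchronization means in practice. Two threads update a shared counter without any synchronization, so the printed value varies from run to run; the counter example is chosen here for brevity and is not the paper's code.

#include <pthread.h>
#include <stdio.h>

static long counter = 0;

static void *work(void *arg) {
    (void)arg;
    for (int i = 0; i < 1000000; i++)
        counter++;                /* unsynchronized read-modify-write: a data race */
    return NULL;
}

int main(void) {
    pthread_t t1, t2;
    pthread_create(&t1, NULL, work, NULL);
    pthread_create(&t2, NULL, work, NULL);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    printf("%ld\n", counter);     /* rarely 2000000; the result is not deterministic */
    return 0;
}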

SLIDE 9

Synchronized threads.

typedef struct { unsigned long *p; unsigned long i; } ST;

void *sum(void *st) {
    ST str1, str2;
    unsigned long s, s1, s2;
    pthread_t tid1, tid2;
    if (((ST *)st)->i > 2) {
        str1.p = ((ST *)st)->p;
        str1.i = ((ST *)st)->i / 2;
        pthread_create(&tid1, NULL, sum, (void *)&str1);
        pthread_join(tid1, (void *)&s1);
        str2.p = ((ST *)st)->p + ((ST *)st)->i / 2;
        str2.i = ((ST *)st)->i - ((ST *)st)->i / 2;
        pthread_create(&tid2, NULL, sum, (void *)&str2);
        pthread_join(tid2, (void *)&s2);
    }
    else if (((ST *)st)->i == 1) { s1 = ((ST *)st)->p[0]; s2 = 0; }
    else { s1 = ((ST *)st)->p[0]; s2 = ((ST *)st)->p[1]; }
    s = s1 + s2;
    pthread_exit((void *)s);
}

SLIDE 10

Synchronized threads.

typedef struct { unsigned long *p; unsigned long i; } ST;

void *sum(void *st) {
    ST str1, str2;
    unsigned long s, s1, s2;
    pthread_t tid1, tid2;
    if (((ST *)st)->i > 2) {
        str1.p = ((ST *)st)->p;
        str1.i = ((ST *)st)->i / 2;
        pthread_create(&tid1, NULL, sum, (void *)&str1);
        pthread_join(tid1, (void *)&s1);
        str2.p = ((ST *)st)->p + ((ST *)st)->i / 2;
        str2.i = ((ST *)st)->i - ((ST *)st)->i / 2;
        pthread_create(&tid2, NULL, sum, (void *)&str2);
        pthread_join(tid2, (void *)&s2);
    }
    else if (((ST *)st)->i == 1) { s1 = ((ST *)st)->p[0]; s2 = 0; }
    else { s1 = ((ST *)st)->p[0]; s2 = ((ST *)st)->p[1]; }
    s = s1 + s2;
    pthread_exit((void *)s);
}

Among all the possible run orderings, the synchronization keeps only the good ones (i.e., those that compute the same result as a sequential execution).

SLIDE 11

Synchronized threads.

typedef struct { unsigned long *p; unsigned long i; } ST;

void *sum(void *st) {
    ST str1, str2;
    unsigned long s, s1, s2;
    pthread_t tid1, tid2;
    if (((ST *)st)->i > 2) {
        str1.p = ((ST *)st)->p;
        str1.i = ((ST *)st)->i / 2;
        pthread_create(&tid1, NULL, sum, (void *)&str1);
        pthread_join(tid1, (void *)&s1);
        str2.p = ((ST *)st)->p + ((ST *)st)->i / 2;
        str2.i = ((ST *)st)->i - ((ST *)st)->i / 2;
        pthread_create(&tid2, NULL, sum, (void *)&str2);
        pthread_join(tid2, (void *)&s2);
    }
    else if (((ST *)st)->i == 1) { s1 = ((ST *)st)->p[0]; s2 = 0; }
    else { s1 = ((ST *)st)->p[0]; s2 = ((ST *)st)->p[1]; }
    s = s1 + s2;
    pthread_exit((void *)s);
}

Among all the possible run orderings, the synchronization keeps only the good ones (i.e., those that compute the same result as a sequential execution). Too much synchronization => not parallel enough.

SLIDE 12

Correctly synchronized threads.

typedef struct { unsigned long *p; unsigned long i; } ST;

void *sum(void *st) {
    ST str1, str2;
    unsigned long s, s1, s2;
    pthread_t tid1, tid2;
    if (((ST *)st)->i > 2) {
        str1.p = ((ST *)st)->p;
        str1.i = ((ST *)st)->i / 2;
        pthread_create(&tid1, NULL, sum, (void *)&str1);
        str2.p = ((ST *)st)->p + ((ST *)st)->i / 2;
        str2.i = ((ST *)st)->i - ((ST *)st)->i / 2;
        pthread_create(&tid2, NULL, sum, (void *)&str2);
        pthread_join(tid1, (void *)&s1);
        pthread_join(tid2, (void *)&s2);
    }
    else if (((ST *)st)->i == 1) { s1 = ((ST *)st)->p[0]; s2 = 0; }
    else { s1 = ((ST *)st)->p[0]; s2 = ((ST *)st)->p[1]; }
    s = s1 + s2;
    pthread_exit((void *)s);
}
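Aside (not on the slides): a minimal driver sketch for the threaded sum above, written as a separate file linked with the worker (ST is redeclared here). The root call runs in its own thread so that its result can be collected through pthread_exit/pthread_join; the array values and variable names are assumptions.

#include <pthread.h>
#include <stdio.h>

typedef struct { unsigned long *p; unsigned long i; } ST;
void *sum(void *st);                     /* the worker defined above */

int main(void) {
    unsigned long t[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    ST root = { t, 8 };
    pthread_t tid;
    void *ret;
    pthread_create(&tid, NULL, sum, (void *)&root);
    pthread_join(tid, &ret);             /* the sum comes back as the exit value */
    printf("%lu\n", (unsigned long)ret); /* prints 36 */
    return 0;
}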

SLIDE 13

What we propose to do.

long sum(long t[], unsigned int n) {
    if (n == 1) return t[0];
    if (n == 2) return t[0] + t[1];
    return sum(t, n/2) + sum(&(t[n/2]), n - n/2);
}

SLIDE 14

What we propose to do.

long sum(long t[], unsigned int n) {
    if (n == 1) return t[0];
    if (n == 2) return t[0] + t[1];
    return sum(t, n/2) + sum(&(t[n/2]), n - n/2);
}

This code is usually understood as sequential.

SLIDE 15

What we propose to do.

long sum(long t[], unsigned int n) {
    if (n == 1) return t[0];
    if (n == 2) return t[0] + t[1];
    return sum(t, n/2) + sum(&(t[n/2]), n - n/2);
}

This code is usually understood as sequential. The order of instructions and expressions matches the order of execution.

SLIDE 16

What we propose to do.

long sum(long t[], unsigned int n) {
    if (n == 1) return t[0];
    if (n == 2) return t[0] + t[1];
    return sum(t, n/2) + sum(&(t[n/2]), n - n/2);
}

This code is usually understood as sequential. The order of instructions and expressions matches the order of execution. Left half sum is computed before right half sum.

SLIDE 17

What we propose to do : nothing.

long sum(long t[], unsigned int n) {
    if (n == 1) return t[0];
    if (n == 2) return t[0] + t[1];
    return sum(t, n/2) + sum(&(t[n/2]), n - n/2);
}

SLIDE 18

What we propose to do : nothing.

long sum(long t[], unsigned int n) {
    if (n == 1) return t[0];
    if (n == 2) return t[0] + t[1];
    return sum(t, n/2) + sum(&(t[n/2]), n - n/2);
}

This code can be understood as parallel.

SLIDE 19

What we propose to do : nothing.

long sum(long t[], unsigned int n) {
    if (n == 1) return t[0];
    if (n == 2) return t[0] + t[1];
    return sum(t, n/2) + sum(&(t[n/2]), n - n/2);
}

This code can be understood as parallel. Just change the compiler and the processor hardware.

SLIDE 20

What we propose to do : nothing.

long sum(long t[], unsigned int n) {
    if (n == 1) return t[0];
    if (n == 2) return t[0] + t[1];
    return sum(t, n/2) + sum(&(t[n/2]), n - n/2);
}

This code can be understood as parallel. Just change the compiler and the processor hardware: rename and run in parallel, with the hardware synchronizing each reader with its writer.

SLIDE 21

The compiled sum reduction (actual compiler).

sum:    cmpq $2, %rsi                 ; if (n>2) goto .L1
        ja .L1
        movq (%rdi), %rax             ; rax = t[0]
        jb .L2                        ; if (n<2) goto .L2
        addq 8(%rdi), %rax            ; rax = t[0] + t[1]
.L2:    ret                           ; return (rax)
.L1:    pushq %rsi                    ; save n
        pushq %rdi                    ; save t
        pushq %rbp                    ; save rbp
        subq $8, %rsp                 ; alloc(temp)
        movq %rsi, %rbp               ; rbp = n
        shrq %rsi                     ; n = n/2
        call sum                      ; rax = sum(t, n/2)
        movq %rax, 0(%rsp)            ; temp = t[0] + ... + t[n/2-1]
        leaq (%rdi, %rsi, 8), %rdi    ; t = t + n/2*8 = &(t[n/2])
        subq %rsi, %rbp               ; rbp = n - n/2
        movq %rbp, %rsi               ; n = n - n/2
        call sum                      ; rax = sum(&(t[n/2]), n-n/2)
        addq 0(%rsp), %rax            ; rax = t[0] + ... + t[n/2-1]
                                      ;     + t[n/2] + ... + t[n-1]
        addq $8, %rsp                 ; free(temp)
        popq %rbp                     ; restore rbp
        popq %rdi                     ; restore t
        popq %rsi                     ; restore n
        ret                           ; return (rax)

SLIDE 22

The compiled sum reduction (parallelizing compiler).

sum:    cmpq $2, %rsi                 ; if (n>2) goto .L1
        ja .L1
        movq (%rdi), %rax             ; rax = t[0]
        jb .L2                        ; if (n<2) goto .L2
        addq 8(%rdi), %rax            ; rax = t[0] + t[1]
.L2:    endfork                       ; return (rax)
.L1:                                  ; at fork, rsp, rbp, rdi, rsi and rbx are copied
        subq $8, %rsp                 ; alloc(temp)
        movq %rsi, %rbp               ; rbp = n
        shrq %rsi                     ; n = n/2
        fork sum                      ; rax = sum(t, n/2)
        movq %rax, 0(%rsp)            ; temp = t[0] + ... + t[n/2-1]
        leaq (%rdi, %rsi, 8), %rdi    ; t = t + n/2*8 = &(t[n/2])
        subq %rsi, %rbp               ; rbp = n - n/2
        movq %rbp, %rsi               ; n = n - n/2
        fork sum                      ; rax = sum(&(t[n/2]), n-n/2)
        addq 0(%rsp), %rax            ; rax = t[0] + ... + t[n/2-1]
                                      ;     + t[n/2] + ... + t[n-1]
        addq $8, %rsp                 ; free(temp)
        endfork                       ; return (rax)

SLIDE 23

The parallel run steps.

Fetch the trace as fast as possible, relying on the fork machine instruction.

SLIDE 24

The parallel run steps.

Fetch the trace as fast as possible, relying on the fork machine instruction. Rename destinations and match readers with writers from the trace total order.

SLIDE 25

The parallel run steps.

Fetch the trace as fast as possible, relying on the fork machine instruction. Rename destinations and match readers with writers from the trace total order. Run in the partial order of the reader-to-writer dependencies.

SLIDE 26

The parallel run steps.

Fetch the trace as fast as possible, relying on the fork machine instruction. Rename destinations and match readers with writers from the trace total order. Run in the partial order of the reader-to-writer dependencies. Discard intermediate storage resources when overwritten or freed (dynamic regions : stack, heap).

SLIDE 27

The parallel run steps.

Fetch the trace as fast as possible, relying on the fork machine instruction.
Rename destinations and match readers with writers from the trace total order.
Run in the partial order of the reader-to-writer dependencies.
Discard intermediate storage resources when overwritten or freed (dynamic regions : stack, heap).
Dump final results to physical memory (single writer => trivial coherency).
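Aside (not from the slides): a rough sketch that only counts sections, assuming the run starts with one section and each executed fork opens one more (two forks per recursive step of sum, none for the n <= 2 leaves). For n = 8 this gives seven sections, which matches the sections c0 to c6 on the diagram slides later in the deck.

#include <stdio.h>

/* Number of fork instructions executed by sum(t, n), per the parallelized code. */
static unsigned long forks(unsigned int n) {
    if (n <= 2) return 0;                        /* leaf: no fork */
    return 2 + forks(n / 2) + forks(n - n / 2);  /* one fork per recursive call */
}

int main(void) {
    for (unsigned int n = 2; n <= 1024; n *= 2)
        printf("n = %4u -> %4lu sections\n", n, 1 + forks(n));
    return 0;
}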

SLIDE 28

Automatic Hardware Parallelization.

SLIDE 29

Fetching = Reading IP-addressed code and computing the control flow.

Fork :

SLIDE 30

Fetching = Reading IP-addressed code and computing the control flow.

Fork : 2 sections : current IP continues to callee + new IP resumes after callee return.

SLIDE 31

Fetching = Reading IP-addressed code and computing the control flow.

Fork : 2 sections : current IP continues to callee + new IP resumes after callee return. Registers rsp, rbp, rdi, rsi, rbx are copied from current section to new section.

SLIDE 32

Fetching = Reading IP-addressed code and computing the control flow.

Fork : 2 sections : current IP continues to callee + new IP resumes after callee return. Registers rsp, rbp, rdi, rsi, rbx are copied from current section to new section. 2 copies of the stack : callee stack + resume stack.

SLIDE 33

Fetching = Reading IP-addressed code and computing the control flow.

Fork : 2 sections : current IP continues to callee + new IP resumes after callee return.
Registers rsp, rbp, rdi, rsi, rbx are copied from current section to new section.
2 copies of the stack : callee stack + resume stack.
End of section = endfork.
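Aside (not from the paper): a schematic data-structure sketch of what a fork copies into the new section, based only on the register list and the two-IP description on this slide; the type and field names are invented for illustration.

#include <stdint.h>

typedef struct section {
    uint64_t ip;                        /* where this section fetches next */
    uint64_t rsp, rbp, rdi, rsi, rbx;   /* registers copied at fork */
    struct section *predecessor;        /* previous section in trace order */
} section_t;

/* At "fork callee": the current section keeps fetching into the callee,
   and a new section is created that resumes just after the fork. */
static section_t make_resume_section(section_t *cur, uint64_t resume_ip) {
    section_t s = *cur;     /* copy rsp, rbp, rdi, rsi, rbx */
    s.ip = resume_ip;       /* new IP: the instruction following the fork */
    s.predecessor = cur;    /* link used later for reader-to-writer imports */
    return s;
}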

SLIDE 34

Synchronizing reader with writer.

SLIDE 35

Synchronizing reader with writer.

Reader and writer in the same section : Tomasulo's algorithm (out-of-order execution).

SLIDE 36

Synchronizing reader with writer.

Reader and writer in the same section : Tomasulo's algorithm (out-of-order execution). Reader and writer of a register in different sections : the reader sends an import request to its predecessor section when known; the writer sends the value to the reader when computed.

SLIDE 37

Synchronizing reader with writer.

Reader and writer in the same section : Tomasulo's algorithm (out-of-order execution). Reader and writer of a register in different sections : the reader sends an import request to its predecessor section when known; the writer sends the value to the reader when computed. Reader and writer of a memory location : the reader imports the value from the writer of the same address, following the predecessor links.

SLIDE 38

Synchronizing reader with writer.

Reader and writer in the same section : Tomasulo's algorithm (out-of-order execution).
Reader and writer of a register in different sections : the reader sends an import request to its predecessor section when known; the writer sends the value to the reader when computed.
Reader and writer of a memory location : the reader imports the value from the writer of the same address, following the predecessor links.
Reader and writer of an rsp-based stack location : the reader imports the value from the writer of the same address, following the level predecessor links.
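Aside (not the paper's hardware): a software sketch of the lookup a memory reader performs, walking the predecessor links from its own section until it finds the closest writer of the same address. The per-section store lists, their size, and all names are invented for illustration.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define MAX_STORES 64

typedef struct section {
    struct section *predecessor;      /* previous section in trace order */
    uint64_t store_addr[MAX_STORES];  /* addresses written in this section, in order */
    uint64_t store_val[MAX_STORES];
    int nstores;
} section_t;

/* Return the value of the closest earlier store to 'addr'; *found == false
   means no section wrote it and the value must come from memory. */
static uint64_t import_value(const section_t *sec, uint64_t addr, bool *found) {
    for (; sec != NULL; sec = sec->predecessor)        /* walk predecessor links */
        for (int i = sec->nstores - 1; i >= 0; i--)    /* latest store first */
            if (sec->store_addr[i] == addr) {
                *found = true;
                return sec->store_val[i];
            }
    *found = false;
    return 0;
}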

SLIDE 39

Retiring = Exporting results.

A computed value is exported to (if not overwritten, freed or consumed by a successor) :

SLIDE 40

Retiring = Exporting results.

A computed value is exported to (if not overwritten, freed or consumed by a successor) : The next section (register, stack, heap).

SLIDE 41

Retiring = Exporting results.

A computed value is exported to (if not overwritten, freed or consumed by a successor) : The next section (register, stack, heap). The next section at the same level (rsp-based stack).

SLIDE 42

Retiring = Exporting results.

A computed value is exported to (if not overwritten, freed or consumed by a successor) :
The next section (register, stack, heap).
The next section at the same level (rsp-based stack).
The previous section (static memory).

SLIDE 43

rsi = 8, rdi = t, rsp = SP, sum(8,t)

[Diagram: the parallelized sum code divided into numbered instruction blocks 1-6; the slides that follow step through the sections (c0, c1, ...) created while running sum on an 8-element array.]

SLIDE 44

One section, block 4 fetched+decoded, block 1 executed.


SLIDE 45

Second section, rbp=n,rsi=n/2,rdi=t copied, blk1 retired.


SLIDE 46

Third section, blk5 renames rax (when predecessor known).


SLIDE 47

Blk6 renames rax (?) and 0(rsp) (level predecessor).


SLIDE 48

B5,c3 renames rax, predecessor sends rax when computed.


SLIDE 49

B4,c3 retires : nothing to export, ooo retirement.


SLIDE 50

B3,c0 computes rax, value sent to renamer b5,c1.


SLIDE 51

B5+b3,c1 send rax and 0(rsp) to renamer b6,c2.


SLIDE 52

B6,c2 sends half sum rax to b5,c3.


SLIDE 53

B5,c3 sends stack saved half sum 0(rsp) to b6,c6.


SLIDE 54

B6,c6 computes final sum into rax.


SLIDE 55

B6,c6 sends rax to renamer (main).


SLIDE 56

Determinism.

SLIDE 57

Hardware parallelization ensures determinism.

SLIDE 58

Hardware parallelization ensures determinism.

The parallel fetch builds a total order of the trace, which associates each reader with its closest writer.

SLIDE 59

Hardware parallelization ensures determinism.

The parallel fetch builds a total order of the trace, which associates each reader with its closest writer. The renaming builds a deterministic partial order (each reader related to its writer and synchronized with it).

SLIDE 60

Hardware parallelization ensures determinism.

The parallel fetch builds a total order of the trace, which associates each reader with its closest writer. The renaming builds a deterministic partial order (each reader related to its writer and synchronized with it). The reader/writer synchronization and the partial-order run ensure determinism.

SLIDE 61

Hardware parallelization ensures determinism.

The parallel fetch builds a total order of the trace, which associates each reader with its closest writer. The renaming builds a deterministic partial order (each reader related to its writer and synchronized with it). The reader/writer synchronization and the partial-order run ensure determinism. This contrasts with pthread non-determinism and complex software synchronization.

SLIDE 62

Conclusion.

SLIDE 63

Conclusion : what message is being sent out ?

Total order of execution : sequential + deterministic.

SLIDE 64

Conclusion : what message is being sent out ?

Total order of execution : sequential + deterministic. Partial order + OS scheduling : parallel + non-deterministic.

SLIDE 65

Conclusion : what message is being sent out ?

Total order of execution : sequential + deterministic. Partial order + OS scheduling : parallel + non-deterministic. Total order of trace + dataflow order of execution : parallel + deterministic.

SLIDE 66

Conclusion : what message is being sent out ?

Total order of execution : sequential + deterministic.
Partial order + OS scheduling : parallel + non-deterministic.
Total order of trace + dataflow order of execution : parallel + deterministic.
http://perso.numericable.fr/bernard.goossens/i_want_a_fork.html
