PARALLEL PROGRAMMING ON MULTICORES Threading Subroutines
Bo Zhang zhangb@cs.duke.edu Nikos P. Pitsianis Xiaobai Sun
DEPARTMENT OF COMPUTER SCIENCE DUKE UNIVERSITY
09/29/2010
1 / 38
Threading Subroutines Bo Zhang zhangb@cs.duke.edu Nikos P. - - PowerPoint PPT Presentation
PARALLEL PROGRAMMING ON MULTICORES Threading Subroutines Bo Zhang zhangb@cs.duke.edu Nikos P. Pitsianis Xiaobai Sun DEPARTMENT OF COMPUTER SCIENCE DUKE UNIVERSITY 09/29/2010 1 / 38 MY EXPERIENCE BACKGROUND Ph.D. in Mathematics;
1 / 38
2 / 38
3 / 38
4 / 38
double balance; void deposit (double s) { balance += s; } void withdraw (double s) { balance -= s; } int main () { balance = 0.0; deposit (s1); deposit (s2); withdraw (s3); . . . printf("%f\n",balance); return 0; } 5 / 38
double balance; void deposit (double s) { balance += s; } void withdraw (double s) { balance -= s; } int main () { balance = 0.0; deposit (s1); deposit (s2); withdraw (s3); . . . printf("%f\n",balance); return 0; }
6 / 38
double balance; void deposit (double s) { balance += s; } void withdraw (double s) { balance -= s; } int main () { balance = 0.0; deposit (s1); deposit (s2); withdraw (s3); . . . printf("%f\n",balance); return 0; } double balance; pthread_mutex_t balance_mutex; void deposit (double s) { pthread_mutex_lock (&balance_mutex); balance += s; pthread_mutex_unlock (&balance_mutex); } void withdraw (double s) { pthread_mutex_lock (&balance_mutex); balance -= s; pthread_mutex_unlock (&balance_mutex); } int main () { balance = 0.0; . . . return 0; }
7 / 38
int main () { int s1, s2, s, work[100]; s1 = task1(work); s2 = task2(work); s = s1+s2; printf("%d\n", s); return 0; } 8 / 38
int main () { int s1, s2, s, work[100]; s1 = task1(work); s2 = task2(work); s = s1+s2; printf("%d\n", s); return 0; }
9 / 38
int main () { int s1, s2, s, work[100]; s1 = task1(work); s2 = task2(work); s = s1+s2; printf("%d\n", s); return 0; } int main () { int s1, s2, s, work1[100], work2[100]; s1 = task1(work1); s2 = task2(work2); s = s1+s2; printf("%d\n", s); return 0; }
10 / 38
void foo (int i, int *s) { int ia[100]; if ( i > 0) foo(i-1, s); *s += compute(ia, i); } int main () { int s, i = 2; foo(i, &s); return 0; } 11 / 38
void foo (int i, int *s) { int ia[100]; if ( i > 0) foo(i-1, s); *s += compute(ia, i); } int main () { int s, i = 2; foo(i, &s); return 0; } int i, s
12 / 38
void foo (int i, int *s) { int ia[100]; if ( i > 0) foo(i-1, s); *s += compute(ia, i); } int main () { int s, i = 2; foo(i, &s); return 0; } int i, s
int ia[100]
13 / 38
void foo (int i, int *s) { int ia[100]; if ( i > 0) foo(i-1, s); *s += compute(ia, i); } int main () { int s, i = 2; foo(i, &s); return 0; } int i, s
int ia[100]
int ia[100]
14 / 38
void foo (int i, int *s) { int ia[100]; if ( i > 0) foo(i-1, s); *s += compute(ia, i); } int main () { int s, i = 2; foo(i, &s); return 0; } int i, s
int ia[100]
int ia[100]
int ia[100]
15 / 38
void foo (int i, int *s) { int ia[100]; if ( i > 0) foo(i-1, s); *s += compute(ia, i); } int main () { int s, i = 2; foo(i, &s); return 0; }
int main () { int i, s, ia[100]; for ( i = 0; i < 2; i++ ) s += compute(ia,i); return 0; } int i, s ia[100] 16 / 38
int ps[3]; void *foo (void *threadid) { long tid = (long)threadid; int ia[100]; ps[tid] = compute(ia,tid); pthread_exit(NULL); } int main () { long i; int s; pthread_t threads[3]; for ( i = 0; i < 3; i++ ) pthread_create(&threads[i], NULL, foo, (void *)i); for ( i = 0; i < 3; i++ ) pthread_join (threads[i], NULL); s = ps[0]+ps[1]+ps[2]; return 0; } 17 / 38
18 / 38
int ps[3]; int *gia; void *foo (void *threadid) { long tid = (long)threadid; int *ia = &gia[tid*100]; ps[tid] = compute(ia,tid); pthread_exit(NULL); } int main () { long i; int s; pthread_t threads[3]; gia = (int *)malloc(sizeof(int)*3*100); for ( i = 0; i < 3; i++ ) pthread_create(&threads[i], NULL, foo, (void *)i); for ( i = 0; i < 3; i++ ) pthread_join (threads[i], NULL); s = ps[0]+ps[1]+ps[2]; free(gia); return 0; } 19 / 38
20 / 38
22 / 38
23 / 38
24 / 38
25 / 38
26 / 38
27 / 38
28 / 38
29 / 38
lmax lmax − 1 lmax lmax − 2 lmax − 1 lmax − 3 lmax − 2 lmax − 2 . . . . . . . . . . . . . . . . . . . . . . . . . . . 2 3 3 1 2 2 1 2 . . . . . . . . . lmax − 3 lmax − 2 lmax − 1 lmax All l’s All l’s All Particles
OPERATIONS Upward Pass: TSM, TMM List 2: TME, TEE, TEL List 3: TMT or TST List 5 & Evaluate Local: TLL & TLT List 4: TSL or TST List 1: TST Sum Local & Direct TIME STEP
30 / 38
31 / 38
32 / 38
33 / 38
34 / 38
35 / 38
36 / 38
#! /bin/bash for flag in -O -O2 -O4 do make clean make CFLAGS=$flag fFFLAGS=$flag sFFLAGS=-O for (( t=2; t<=128; t=t*2 )) do for (( s=50; s<=90; s=s+5 )) do for (( count=1; count<=10; count++ )) do ./fmm -n 10000 -t $t -s $s -d 2 done done done done
37 / 38
38 / 38