Center for Information Services and High Performance Computing (ZIH)
Analysis and Optimization of a Molecular Dynamics Code using PAPI and the Vampir Toolchain
May 2, 2012
Analysis and Optimization of a Molecular Dynamics Code using PAPI and the Vampir Toolchain - PowerPoint PPT Presentation
Center for Information Services and High Performance Computing (ZIH): Analysis and Optimization of a Molecular Dynamics Code using PAPI and the Vampir Toolchain. May 2, 2012. Thomas William, Zellescher Weg 12, Willers-Bau A 34, +49 351 - 463 32446
Center for Information Services and High Performance Computing (ZIH)
May 2, 2012
1
2
3
4
5
6
7
1/34
1
1/34
1/34
2/34
3/34
4/34
5/34
6/34
7/34
8/34
2
9/34
9/34
10/34
0" 10000" 20000" 30000" 40000" 50000" 60000" P P 1 " A " B " P P 2 " A 1 " B 2 " P P 3 " A " B " 1 6 " P P 3 " A " B 1 " 8 " P P 3 " A " B 2 " 2 " P P 3 " A " B 2 " 2 5 6 " P P 3 " A " B 3 " 1 2 8 " P P 3 " A " B 4 " 6 4 " P P 3 " A " B 5 " 3 2 " P P 3 " A 1 " B " 2 " P P 3 " A 1 " B " 2 5 6 " P P 3 " A 1 " B 1 " 1 2 8 " P P 3 " A 1 " B 2 " 6 4 " P P 3 " A 1 " B 3 " 3 2 " P P 3 " A 1 " B 4 " 1 6 " P P 3 " A 1 " B 5 " 8 " P P 3 " A 1 " B 6 " 2 " P P 3 " A 2 " B " 6 4 " P P 3 " A 2 " B 1 " 3 2 " P P 3 " A 2 " B 2 " 1 6 " P P 3 " A 2 " B 3 " 8 " P P 3 " A 2 " B 4 " 2 " P P 3 " A 2 " B 4 " 2 5 6 " P P 3 " A 2 " B 5 " 1 2 8 "
Runtime in seconds
Runtime for all code combinations
O3 O2 FASTSSE
11/34
0 5000 10000 15000 20000 25000 30000 35000 40000 45000 50000 PP1 A B, PP2 A B, PP2 A B1, PP2 A B2, PP2 A1 B, PP2 A1 B1, PP2 A1 B2, PP2 A2 B, PP2 A2 B1, PP2 A2 B2 Runtime in seconds Source code file and code block version
O3 O2 FASTSSE
12/34
13/34
14/34
3
15/34
0 2E+09 4E+09 6E+09 8E+09 1E+10 1.2E+10 A0_B0 A0_B1 A0_B2 A1_B0 A1_B1 A1_B2 A2_B0 A2_B1 A2_B2 # of instructions code block
PAPI_FAD_INS
O2 O3 FAST 0 2E+09 4E+09 6E+09 8E+09 1E+10 1.2E+10 1.4E+10 A0_B0 A0_B1 A0_B2 A1_B0 A1_B1 A1_B2 A2_B0 A2_B1 A2_B2 # of instructions code block
PAPI_FML_INS
O2 O3 FAST
0 5E+09 1E+10 1.5E+10 2E+10 2.5E+10 A0_B0 A0_B1 A0_B2 A1_B0 A1_B1 A1_B2 A2_B0 A2_B1 A2_B2 # of instructions code block
O2 O3 FAST 15/34
0.00% 1.00% 2.00% 3.00% 4.00% 5.00% 6.00% 7.00% 8.00% A0_B0 A0_B1 A0_B2 A1_B0 A1_B1 A1_B2 A2_B0 A2_B1 A2_B2 idle time in % code block
O2 O3 FAST
16/34
0 2E+09 4E+09 6E+09 8E+09 1E+10 1.2E+10 A0_B0 A0_B1 A0_B2 A1_B0 A1_B1 A1_B2 A2_B0 A2_B1 A2_B2 # of instructions code block
PAPI_BR_INS
O2 O3 FAST 0.E+00 1.E+08 2.E+08 3.E+08 4.E+08 5.E+08 6.E+08 A0_B0 A0_B1 A0_B2 A1_B0 A1_B1 A1_B2 A2_B0 A2_B1 A2_B2 # of instructions code block
PAPI_BR_MSP
O2 O3 FAST
0.00% 2.00% 4.00% 6.00% 8.00% 10.00% 12.00% 14.00% 16.00% A0_B0 A0_B1 A0_B2 A1_B0 A1_B1 A1_B2 A2_B0 A2_B1 A2_B2 miss rate in % code block
O2 O3 FAST 17/34
4
18/34
#if defined(A0)
      r2=0.0d0
      do k=1,3
        xx(k)=x(k,i)-x(k,j)
        if (xx(k).gt.+halfl(k)) xx(k)=xx(k)-xl(k)
        if (xx(k).lt.-halfl(k)) xx(k)=xx(k)+xl(k)
        r2=r2+xx(k)*xx(k)
      enddo
#elif defined(A1)
      r2=0.0d0
      do k=1,3
        xx(k)=x(k,i)-x(k,j)
        xx(k)=xx(k)-aint(xx(k)*halfli(k))*xl(k)
        r2=r2+xx(k)*xx(k)
      enddo
#elif defined(A2)
      xx(:)=x(:,i)-x(:,j)
      xx=xx-aint(xx*halfli)*xl
      r2=xx(1)*xx(1)+xx(2)*xx(2)+xx(3)*xx(3)
#else
18/34
#if defined(B0)
      r=sqrt(r2)
      fc=exp(-xmuc*r)*(1./r+xmuc)/r2
      do k=1,3
        fi(k)=fi(k)+zii(j)*fc*xx(k)
        fj(k,j)=fj(k,j)-zii(i)*fc*xx(k)
      enddo
#elif defined(B1)
      r=sqrt(r2)
      fc=exp(-xmuc*r)*(1./r+xmuc)/r2
      fi(:)=fi(:)+zii(j)*fc*xx(:)
      fj(:,j)=fj(:,j)-zii(i)*fc*xx(:)
#elif defined(B2)
      if (r2.le.rcutoff2) then
        r=sqrt(r2)
        fc=exp(-xmuc*r)*(1./r+xmuc)/r2
        fi(:)=fi(:)+zii(j)*fc*xx(:)
        fj(:,j)=fj(:,j)-zii(i)*fc*xx(:)
      endif
#else
19/34
5
20/34
20/34
      allocate(fj(3,0:n-1))
      do 100 i=myrank,n-2,nprocs
        fi(:)=0.0d0
!$omp parallel do private(r2,k,xx,r,fc), \
        reduction(+:fi), schedule(runtime)
        do 90 j=i+1,n-1
! --- A-Block ---
        ......
!$omp parallel private(nthrd,nj,nr,j0,j1,xx,r2,r,fc,fi)
      nthrd=omp_get_num_threads()
      allocate(fi(3,0:n-1))
!$omp do schedule(static,1)
      do 110 ithrd=0,nthrd-1
        do 100 i=myrank,n-2,nprocs
          fi(:,i)=0.0d0
          nj=(n-i-1)/nthrd
          nr=mod(n-i-1,nthrd)
          j0=(i+1)+ithrd*nj+min(ithrd,nr)
          j1=(i+1)+(ithrd+1)*nj+min(ithrd+1,nr)-1
          do 90 j=j0,j1
21/34
22/34
80 90 100 1 2 3 4 5 6 7 8 Efficiency in % # of threads
Old Code New Code
23/34
0% 20% 40% 60% 80% 100% 120% 8 8 16 32 64 128 256 512 672 Efficiency in % MPI cores
Parallel Efficiency
24/34
0% 20% 40% 60% 80% 100% 120% 8 64 128 256 512 672 Efficiency in % # of cores (MPI+OpenMP)
Parallel Efficiency
25/34
6
26/34
26/34
27/34
28/34
29/34
30/34
31/34
32/34
7
33/34
33/34
34/34
0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1 1.1 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63 Parallel Efficiency Number of OpenMP Threads
Parallel Efficiency
34/34
34/34
34/34
34/34
34/34
34/34