How to get peak FLOPS (CPU) — What I wish I knew when I was twenty about CPU —
Kenjiro Taura
Contents
1 Introduction
2 An endeavor to nearly peak FLOPS
3 Latency
4 Instruction Level Parallelism (ILP)
5 Analyzing throughput
6 A simple yet fairly fast single-core matrix multiply
#if __AVX512F__
const int vwidth = 64;
#elif __AVX__
const int vwidth = 32;
#else
#error "you'd better have a better machine"
#endif

const int valign = sizeof(float);
typedef float floatv __attribute__((vector_size(vwidth), aligned(valign)));
/* SIMD lanes */
const int L = sizeof(floatv) / sizeof(float);
floatv a, x, c;
for (i = 0; i < n; i++) {
  x = a * x + c;
}
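Putting the two fragments above together, a compilable version of the kernel could look like the sketch below. The function name axpy_simd and the idea of returning the flop count follow the later slides; the rest of the scaffolding is an assumption (compile with GCC and -march=native -O3, as on the following slides).

/* sketch: n iterations of x = a * x + c on one vector variable,
   using the floatv type and lane count L defined above;
   2 * L flops per iteration */
long axpy_simd(long n, floatv a, floatv* X, floatv c) {
  floatv x = *X;                /* keep x in a register across the loop */
  for (long i = 0; i < n; i++) {
    x = a * x + c;              /* one vfmadd132ps per iteration, as the assembly on the later slides shows */
  }
  *X = x;                       /* write the result back so the loop is not optimized away */
  return 2 * L * n;             /* total flops performed */
}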
$ sudo sysctl -w kernel.perf_event_paranoid=-1
$ cat /proc/cpuinfo
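The kernel.perf_event_paranoid setting matters because the "CPU clocks" numbers in the outputs below presumably come from a hardware cycle counter read through perf_event_open, while the "REF clocks" and "ns" values can be read directly. The slides do not show the measurement code, so the following is only a sketch of one way to obtain the three time bases:

/* sketch only: the slides' own timing harness is not shown */
#include <linux/perf_event.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>
#include <x86intrin.h>

/* CPU clocks: core cycles via perf_event_open
   (counting in user space may require lowering kernel.perf_event_paranoid) */
int open_cycle_counter(void) {
  struct perf_event_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.size = sizeof(attr);
  attr.type = PERF_TYPE_HARDWARE;
  attr.config = PERF_COUNT_HW_CPU_CYCLES;
  attr.exclude_kernel = 1;
  return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
}
/* after opening: read(fd, &cycles, sizeof(cycles)) returns the cycle count */

/* REF clocks: the time stamp counter, which ticks at a fixed reference frequency */
uint64_t ref_clock(void) { return __rdtsc(); }

/* absolute time in nanoseconds */
uint64_t wallclock_ns(void) {
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}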
$ gcc -o axpy -march=native
$ ./axpy simd
algo = simd
m = 8
n = 1000000000
flops = 32000000000
4000530984 CPU clocks, 2967749142 REF clocks, 1144168403 ns
4.000531 CPU clocks/iter, 2.967749 REF clocks/iter, 1.144168 ns/iter
7.998938 flops/CPU clock, 10.782583 flops/REF clock, 27.967911 GFLOPS
asm volatile ("# axpy_simd: ax+c loop begin");
for (i = 0; i < n; i++) {
  x = a * x + c;
}
asm volatile ("# axpy_simd: ax+c loop end");
$ gcc -S -march=native -O3 axpy.c
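The marker strings given to asm volatile are copied verbatim into the generated assembly as comments, so the loop body is easy to locate in the .s output, for example:

$ grep -n "axpy_simd" axpy.s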
# axpy_simd: ax+c loop begin
# 0 "" 2
#NO_APP
        testq   %rdi, %rdi
        jle     .L659
        xorl    %edx, %edx
        .p2align 4,,10
        .p2align 3
.L660:
        addq    $1, %rdx
        vfmadd132ps %zmm0, %zmm1, %zmm2
        cmpq    %rdx, %rdi
        jne     .L660
.L659:
#APP
# 63 "axpy.cc" 1
# axpy_simd: ax+c loop end
#pragma GCC optimize("unroll-loops", 8)
long axpy_simd(long n, floatv a,
               floatv* X, floatv c) {
  ...
  for (i = 0; i < n; i++) {
    x = a * x + c;
  }
}

.L1662:
        addq    $8, %rdx
        vfmadd132ps %zmm0, %zmm1, %zmm2
        vfmadd132ps %zmm0, %zmm1, %zmm2
        cmpq    %rdx, %rdi
        vfmadd132ps %zmm0, %zmm1, %zmm2
        vfmadd132ps %zmm0, %zmm1, %zmm2
        vfmadd132ps %zmm0, %zmm1, %zmm2
        vfmadd132ps %zmm0, %zmm1, %zmm2
        vfmadd132ps %zmm0, %zmm1, %zmm2
        vfmadd132ps %zmm0, %zmm1, %zmm2
        jne     .L1662
1 Introduction
2 An endeavor to nearly peak FLOPS
3 Latency
4 Instruction Level Parallelism (ILP)
5 Analyzing throughput
6 A simple yet fairly fast single-core matrix multiply
.L1662:
        addq    $8, %rdx
        vfmadd132ps %zmm0, %zmm1, %zmm2
        vfmadd132ps %zmm0, %zmm1, %zmm2
        cmpq    %rdx, %rdi
        vfmadd132ps %zmm0, %zmm1, %zmm2
        ...
        vfmadd132ps %zmm0, %zmm1, %zmm2
        jne     .L1662
for (i = 0; i < n; i++) {
  x = a * x + c;
}

[Figure: the vfmaddps instructions all read and write zmm2, forming a single dependency chain across iterations]
[Figure: the three time bases reported by the benchmark (CPU clock, reference clock, absolute time) and a chain of dependent vfmaddps instructions]
for (i = 0; i < n; i++) {
  x0 = a * x0 + c;
  x1 = a * x1 + c;
}
template<int nv>
long axpy_simd_c( ... ) {
  for (long i = 0; i < n; i++) {
    for (long j = 0; j < nv; j++) {
      X[j] = a * X[j] + c;
    }
  }
}
[Plot: CPU cycles/iter and flops/CPU cycle vs. number of variables (chains)]

chains  clocks/iter  flops/clock
 1      4.0           7.999
 2      4.001        15.998
 3      4.001        23.996
 4      4.001        31.995
 5      4.049        39.517
 6      4.001        47.994
 7      4.113        54.458
 8      4.001        63.991
 9      4.567        63.059
10      5.001        63.982
11      5.501        63.991
12      6.001        63.988
13      6.502        63.981
14      7.001        63.989
15      7.501        63.99
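A quick sanity check of these numbers: one vfmadd132ps on a zmm register performs 16 multiplies and 16 adds, i.e. 32 flops, so the 4.0 clocks/iter measured for a single chain gives 32 / 4 = 8 flops/clock. The 4.0 clocks/iter suggests an FMA latency of 4 cycles, and the saturation near 64 flops/clock suggests 2 FMAs starting per cycle; with those numbers, 4 x 2 = 8 independent chains are needed to keep both FMA units busy, which is exactly where the table saturates at the peak of 2 x 32 = 64 flops/clock.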
1 Introduction
2 An endeavor to nearly peak FLOPS
3 Latency
4 Instruction Level Parallelism (ILP)
5 Analyzing throughput
6 A simple yet fairly fast single-core matrix multiply
[Plot (repeated from above): CPU cycles/iter and flops/CPU cycle vs. number of variables]
1 Introduction
2 An endeavor to nearly peak FLOPS
3 Latency
4 Instruction Level Parallelism (ILP)
5 Analyzing throughput
6 A simple yet fairly fast single-core matrix multiply
void axpy_simd_m(..., long m) {
  for (long i = 0; i < n; i++) {
    for (long j = 0; j < m; j++) {
      X[j] = a * X[j] + c;
    }
  }
}
[Plot: CPU cycles/iter and flops/CPU cycle vs. number of variables]

chains  clocks/iter  flops/clock
 1      11.002        2.909
 2      11.037        5.799
 3      11.028        8.705
 4      11.131       11.499
 5      12.021       13.31
 6      14.018       13.696
 7      16.013       13.989
 8      18.006       14.218
 9      20.57        14.001
10      22.017       14.535
11      24.008       14.662
12      26.024       14.756
13      28.011       14.851
14      30.022       14.923
15      32.653       14.7
.L1811:
        vmovaps %zmm0, %zmm2
        addq    $64, %rcx
        vfmadd132ps -64(%rcx), %zmm1, %zmm2
        vmovups %zmm2, -64(%rcx)
        cmpq    %rcx, %r8
        jne     .L1811

.L1800:
        addq    $1, %rdx
        vfmadd132ps %zmm0, %zmm1, %zmm2
        cmpq    %rdx, %rdi
        jne     .L1800
instruction                          type            1/throughput
vmovaps %zmm0,%zmm2                  register move   0.33
addq $64,%rcx                        int op          0.25
vfmadd132ps -64(%rcx),%zmm1,%zmm2    load + FMA      0.5, 0.5
vmovups %zmm2,-64(%rcx)              store           1.0
cmpq %rcx,%r8                        compare         0.25
jne .L1811                           jump            1-2
source: https://en.wikichip.org/wiki/intel/microarchitectures/skylak
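Reading the loop at .L1811 against this table: each iteration issues one load + FMA (0.5 each), one store (1.0), one register move (0.33), and three cheap scalar/branch operations, so throughput alone would allow roughly one iteration per cycle. The roughly 11 CPU clocks/iter measured for a single variable in the memory version therefore cannot be a throughput limit; it is better explained by latency, since each iteration's load of X[j] depends on the store from the previous iteration.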
for (i = 0; i < n; i++) {
  for (j = 0; j < nv; j++) {
    x[j] = a * x[j] + c; // load; fmadd; store
  }
}

load x[j] to a register;
do "a * x + c" several times on the register;
store the result to x[j];
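As a sketch of what "keep x[j] in a register" means in code (the step count of 3 is just an example, and n is assumed to be a multiple of 3; the strategies on the next slides use the same structure):

/* sketch: hoist the load and store of x[j] out of a short run of updates */
for (long i = 0; i < n; i += 3) {
  for (long j = 0; j < nv; j++) {
    floatv t = x[j];          /* one load                           */
    t = a * t + c;
    t = a * t + c;
    t = a * t + c;            /* three dependent FMAs on a register */
    x[j] = t;                 /* one store                          */
  }
}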
[Figure: two ways to order the i and j loops so that the number of loads and stores per update stays small and constant]
for (j = 0; j < nv; j += b) {
  /* run b variables until the end */
  for (i = 0; i < n; i++) {
    for (jj = j; jj < j + b; jj++) {
      xx[jj] = a * xx[jj] + c;
    }
  }
}
for (i = 0; i < n; i += 3) {
  /* run all variables 3 steps */
  for (j = 0; j < m; j++) {
    for (ii = 0; ii < 3; ii++) {
      x[j] = a * x[j] + c;
    }
  }
}
[Plot: CPU cycles/iter and flops/CPU cycle vs. number of variables]

chains  clocks/iter  flops/clock
 1      4.0           7.999
 2      4.0          15.998
 3      4.001        23.996
 4      4.001        31.996
 5      4.018        39.818
 6      4.001        47.991
 7      4.114        54.447
 8      4.001        63.99
 9      4.583        62.834
10      5.001        63.986
11      5.501        63.991
12      6.001        63.993
13      6.501        63.988
14      7.001        63.991
15      7.501        63.987
1 Introduction
2 An endeavor to nearly peak FLOPS
3 Latency
4 Instruction Level Parallelism (ILP)
5 Analyzing throughput
6 A simple yet fairly fast single-core matrix multiply
[Figure: C (M x N) += A (M x K) * B (K x N)]
$ ./mmc00
M = 12, N = 32, K = 192
A : 12 x 192 (ld=192) 9216 bytes
B : 192 x 32 (ld=32) 24576 bytes
C : 12 x 32 (ld=32) 1536 bytes
total = 35328 bytes
...
3.456 CPU clocks/iter
2.520 REF clocks/iter
0.579 flops/CPU clock
0.794 flops/REF clock
2.058 GFLOPS

for (i = 0; i < M; i++)
  for (j = 0; j < N; j++)
    for (k = 0; k < K; k++)
      C(i,j) += A(i,k) * B(k,j);
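The C(i,j), A(i,k), and B(k,j) macros are not shown; judging from the ld= values in the output, they presumably index row-major arrays with a leading dimension, something like the hypothetical definitions below:

/* sketch: hypothetical macro definitions consistent with the printed ld= values */
#define A(i,k) A[(i) * lda + (k)]   /* lda = 192 */
#define B(k,j) B[(k) * ldb + (j)]   /* ldb = 32  */
#define C(i,j) C[(i) * ldc + (j)]   /* ldc = 32  */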
M = 12, N = 32, K = 192
A : 12 x 192 (ld=192) 9216 bytes
B : 192 x 32 (ld=32) 24576 bytes
C : 12 x 32 (ld=32) 1536 bytes
total = 35328 bytes
repeat : 100000 times
...
3.555 CPU clocks/iter
2.681 REF clocks/iter
9.002 flops/CPU clock
11.936 flops/REF clock
30.960 GFLOPS

for (i = 0; i < M; i++)
  for (j = 0; j < N; j += L)
    for (k = 0; k < K; k++)
      C(i,j:j+L) += A(i,k) * B(k,j:j+L);
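The C(i,j:j+L) notation stands for an L-wide vector slice. One way to express it with the floatv type from the axpy example is sketched below; the real code is not shown, the macros are the hypothetical ones above, and the scalar * vector product relies on GCC's vector extensions allowing a scalar operand:

/* sketch: C(i, j:j+L) += A(i,k) * B(k, j:j+L) with GCC vector types */
for (long i = 0; i < M; i++)
  for (long j = 0; j < N; j += L)
    for (long k = 0; k < K; k++) {
      floatv  b  = *(floatv *)&B(k, j);   /* B(k, j:j+L); floatv is only float-aligned, so the load may be unaligned */
      floatv *cv =  (floatv *)&C(i, j);   /* C(i, j:j+L) */
      *cv = *cv + A(i, k) * b;            /* scalar * vector + vector: one FMA per k */
    }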
login000:07mm$ ./mmc02
M = 8, N = 32, K = 192
A : 8 x 192 (ld=192) 6144 bytes
B : 192 x 32 (ld=32) 24576 bytes
C : 8 x 32 (ld=32) 1024 bytes
...
5.451 CPU clocks/iter
4.387 REF clocks/iter
46.966 flops/CPU clock
58.349 flops/REF clock
151.341 GFLOPS

for (i = 0; i < M; i += bM)
  for (j = 0; j < N; j += L)
    for (k = 0; k < K; k++)
      for (di = 0; di < bM; di++)
        C(i+di,j:j+L) += A(i+di,k) * B(k,j:j+L);
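Why a block of rows helps: with the FMA latency of about 4 cycles and 2 FMAs per cycle observed earlier, roughly 4 x 2 = 8 independent accumulators are needed to hide the latency. Updating bM rows of C(i+di,j:j+L) concurrently (here M = 8, so all of them) provides exactly that, provided all the accumulators stay in registers across the k loop.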
$ ./mmc03
M = 8, N = 32, K = 192
A : 8 x 192 (ld=192) 6144 bytes
B : 192 x 32 (ld=32) 24576 bytes
C : 8 x 32 (ld=32) 1024 bytes
...
4.926 CPU clocks/iter
4.938 REF clocks/iter
51.969 flops/CPU clock
51.846 flops/REF clock
134.474 GFLOPS

for (i = 0; i < M; i += bM')
  for (j = 0; j < N; j += bN * L)
    for (k = 0; k < K; k++)
      for (di = 0; di < bM'; di++)
        for (dj = 0; dj < bN * L; dj += L)
          C(i+di,j+dj:j+dj+L) += A(i+di,k) * B(k,j+dj:j+dj+L);
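One constraint to keep in mind when choosing bM' and bN: AVX-512 provides 32 zmm registers, so the bM' x bN vector accumulators of C, together with the registers holding slices of B and the broadcast elements of A, must fit within 32, or the compiler starts spilling accumulators to memory and the benefit of register blocking is lost.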
[Figure: hardware hierarchy: a chip (socket, node, CPU) contains (physical) cores, each running hardware threads (virtual cores, CPUs), with L1, L2, and L3 caches]