What You Must Know about Memory, Caches, and Shared Memory
Kenjiro Taura
1 / 105
What You Must Know about Memory, Caches, and Shared Memory Kenjiro - - PowerPoint PPT Presentation
What You Must Know about Memory, Caches, and Shared Memory — Kenjiro Taura — PowerPoint PPT Presentation
What You Must Know about Memory, Caches, and Shared Memory Kenjiro Taura 1 / 105 Contents 1 Introduction 2 Many algorithms are bounded by memory not CPU 3 Organization of processors, caches, and memory 4 So how costly is it to access data?
1 / 105
1 Introduction 2 Many algorithms are bounded by memory not CPU 3 Organization of processors, caches, and memory 4 So how costly is it to access data?
5 Other ways to get more bandwidth
6 How costly is it to communicate between threads?
2 / 105
1 Introduction 2 Many algorithms are bounded by memory not CPU 3 Organization of processors, caches, and memory 4 So how costly is it to access data?
5 Other ways to get more bandwidth
6 How costly is it to communicate between threads?
3 / 105
4 / 105
1
for (k = 0; k < A.nnz; k++) {
2
i,j,Aij = A.elems[k];
3
y[i] += Aij * x[j];
4
}
1
for (i = 0; i < M; i++)
2
for (j = 0; j < N; j++)
3
for (k = 0; k < K; k++)
4
C(i,j) += A(i,k) * B(k,j);
5 / 105
1
for (k = 0; k < A.nnz; k++) {
2
i,j,Aij = A.elems[k];
3
y[i] += Aij * x[j];
4
}
1
for (i = 0; i < M; i++)
2
for (j = 0; j < N; j++)
3
for (k = 0; k < K; k++)
4
C(i,j) += A(i,k) * B(k,j);
5 / 105
1
for (k = 0; k < A.nnz; k++) {
2
i,j,Aij = A.elems[k];
3
y[i] += Aij * x[j];
4
}
1
for (i = 0; i < M; i++)
2
for (j = 0; j < N; j++)
3
for (k = 0; k < K; k++)
4
C(i,j) += A(i,k) * B(k,j);
5 / 105
6 / 105
7 / 105
7 / 105
http://ark.intel.com/products/81060/Intel-Xeon-Processor-E5-2698-v3-40M-Cache-2_30-GHz
7 / 105
http://ark.intel.com/products/81060/Intel-Xeon-Processor-E5-2698-v3-40M-Cache-2_30-GHz
7 / 105
1 Introduction 2 Many algorithms are bounded by memory not CPU 3 Organization of processors, caches, and memory 4 So how costly is it to access data?
5 Other ways to get more bandwidth
6 How costly is it to communicate between threads?
8 / 105
9 / 105
9 / 105
9 / 105
1
for (k = 0; k < A.nnz; k++) {
2
i,j,Aij = A.elems[k];
3
y[i] += Aij * x[j];
4
}
y = x
M N 10 / 105
1
for (k = 0; k < A.nnz; k++) {
2
i,j,Aij = A.elems[k];
3
y[i] += Aij * x[j];
4
}
y = x
M N
10 / 105
1
for (k = 0; k < A.nnz; k++) {
2
i,j,Aij = A.elems[k];
3
y[i] += Aij * x[j];
4
}
y = x
M N
10 / 105
1
for (k = 0; k < A.nnz; k++) {
2
i,j,Aij = A.elems[k];
3
y[i] += Aij * x[j];
4
}
y = x
M N
10 / 105
11 / 105
11 / 105
11 / 105
12 / 105
1
for (i = 0; i < M; i++)
2
for (j = 0; j < N; j++)
3
y[i] += a[i][j] * x[j];
y = x
M N 13 / 105
1
for (i = 0; i < M; i++)
2
for (j = 0; j < N; j++)
3
y[i] += a[i][j] * x[j];
y = x
M N
13 / 105
1
for (i = 0; i < M; i++)
2
for (j = 0; j < N; j++)
3
y[i] += a[i][j] * x[j];
y = x
M N
13 / 105
+= * M N K K N C A B
14 / 105
+= * M N K K N C A B
14 / 105
1
for (i = 0; i < M; i++)
2
for (j = 0; j < N; j++)
3
for (k = 0; k < K; k++)
4
C(i,j) += A(i,k) * B(k,j);
1
C(i,j) += A(i,k) * B(k,j)
15 / 105
1
double t0 = cur_time();
2
memcpy(a, b, nb);
3
double t1 = cur_time();
16 / 105
1
double t0 = cur_time();
2
memcpy(a, b, nb);
3
double t1 = cur_time();
1
$ gcc -O3 memcpy.c
2
$ ./a.out $((1 << 26)) # 64M long elements = 512MB
3
536870912 bytes copied in 0.117333 sec 4.575611 GB/sec
16 / 105
1
double t0 = cur_time();
2
memcpy(a, b, nb);
3
double t1 = cur_time();
1
$ gcc -O3 memcpy.c
2
$ ./a.out $((1 << 26)) # 64M long elements = 512MB
3
536870912 bytes copied in 0.117333 sec 4.575611 GB/sec
16 / 105
1 Introduction 2 Many algorithms are bounded by memory not CPU 3 Organization of processors, caches, and memory 4 So how costly is it to access data?
5 Other ways to get more bandwidth
6 How costly is it to communicate between threads?
17 / 105
memory controller
L3 cache
(physical) core
cache
18 / 105
(physical) core
L2 cache
L1 cache
multi-level caches
19 / 105
memory controller
L3 cache
hardware thread (virtual core, CPU)
(physical) core
L2 cache
L1 cache
chip (socket, node, CPU)
20 / 105
memory controller
L3 cache
hardware thread (virtual core, CPU)
(physical) core
L2 cache
L1 cache
chip (socket, node, CPU)
21 / 105
memory controller
L3 cache
hardware thread (virtual core, CPU)(physical) core
L2 cache
L1 cachechip (socket, node, CPU) interconnect
22 / 105
23 / 105
24 / 105
24 / 105
24 / 105
24 / 105
24 / 105
25 / 105
25 / 105
25 / 105
25 / 105
a 32KB cache with 64 bytes lines (holds most recently accessed 512 distinct blocks)
26 / 105
a 32KB cache with 64 bytes lines (holds most recently accessed 512 distinct blocks)
26 / 105
a 32KB cache with 64 bytes lines (holds most recently accessed 512 distinct blocks)
26 / 105
27 / 105
28 / 105
29 / 105
5 6 11 12 a address within a line ($2^6 = 64$ bytes) index the set in the cache (among $2^6 = 64$ sets)
30 / 105
S sets K ways a line Cache Size
31 / 105
S sets K ways a line Cache Size
32 / 105
stride → number of sets they are mapped to, utilization: 2048 → 2 sets, 1/32; 1024 → 4 sets, 1/16; 512 → 8 sets, 1/8; 256 → 16 sets, 1/4; 128 → 32 sets, 1/2; 64 → 64 sets, 1
S sets K ways a line Cache Size
33 / 105
a address within a line ($2^6 = 64$ bytes) index the set in the cache
34 / 105
a address within a line ($2^6 = 64$ bytes) index the set in the cache (among $2^9 = 512$ sets) 14 15 intact with address translation changed by address translation 256KB/8way 5 6 11 12 35 / 105
stride → utilization: 4096 → 1/64; 2048 → 1/32; 1024 → 1/16; 512 → 1/8; 256 → 1/4; 128 → 1/2; 64 → 1
a address within a line ($2^6 = 64$ bytes) index the set in the cache (among $2^9 = 512$ sets) 14 15 intact with address translation changed by address translation 256KB/8way 5 6 11 12 36 / 105
stride → utilization: … → ∼1/64; 16384 → ∼1/64; 8192 → ∼1/64; 4096 → 1/64; 2048 → 1/32; 1024 → 1/16; 512 → 1/8; 256 → 1/4; 128 → 1/2; 64 → 1
a address within a line ($2^6 = 64$ bytes) index the set in the cache (among $2^6 = 64$ sets) intact with address translation 32KB/8way 5 6 11 12 a address within a line ($2^6 = 64$ bytes) index the set in the cache (among $2^9 = 512$ sets) 14 15 intact with address translation changed by address translation 256KB/8way 5 6 11 12 37 / 105
1
float a[100][1024];
1
float a[100][1024+16];
38 / 105
39 / 105
39 / 105
39 / 105
39 / 105
1 Introduction 2 Many algorithms are bounded by memory not CPU 3 Organization of processors, caches, and memory 4 So how costly is it to access data?
5 Other ways to get more bandwidth
6 How costly is it to communicate between threads?
40 / 105
41 / 105
1 Introduction 2 Many algorithms are bounded by memory not CPU 3 Organization of processors, caches, and memory 4 So how costly is it to access data?
5 Other ways to get more bandwidth
6 How costly is it to communicate between threads?
42 / 105
43 / 105
1
for (N times) {
2
p = p->next;
3
}
43 / 105
1
for (N times) {
2
p = p->next;
3
}
cache line size next pointers N elements (link all elements in a random order)
43 / 105
1
$ numactl --cpunodebind 0 --interleave 0 ./mem
2
$ numactl -N 0 -i 0 ./mem # abbreviation 50 100 150 200 250 300 350 400 450 16384 65536 262144 1.04858 × 10^6 4.1943 × 10^6 1.67772 × 10^7 6.71089 × 10^7 2.68435 × 10^8 latency/load (CPU cycles) size of the region (bytes) latency per load in a random list traversal [0,1073741824] local
memory controller L3 cache hardware thread (virtual core, CPU) (physical) core L2 cache L1 cache chip (socket, node, CPU) interconnect44 / 105
size (bytes) | level | latency (cycles) | latency (ns): 12,736 → L1, 4.004 cycles, 1.31 ns; 103,616 → L2, 13.80 cycles, 4.16 ns; 2,964,928 → L3, 77.40 cycles, 24.24 ns; 301,307,584 → main, 377.60 cycles, 115.45 ns
L1 L2 L3 main memory
50 100 150 200 250 300 350 400 450 10000 100000 1×10^6 1×10^7 1×10^8 latency/load size of the region (bytes) latency per load in a random list traversal [0,1073741824] local
45 / 105
C cache miss rate 1 C + 1 fully associative size to repeatedly scan
L1 L2 L3 main memory
50 100 150 200 250 300 350 400 450 10000 100000 1×10^6 1×10^7 1×10^8 latency/load size of the region (bytes) latency per load in a random list traversal [0,1073741824] local
46 / 105
C cache miss rate 1 C + 1 2C fully associative d i r e c t m a p size to repeatedly scan
L1 L2 L3 main memory
50 100 150 200 250 300 350 400 450 10000 100000 1×10^6 1×10^7 1×10^8 latency/load size of the region (bytes) latency per load in a random list traversal [0,1073741824] local
46 / 105
C cache miss rate 1 C + 1 2C C(1 + 1/K) fully associative K-way set associative d i r e c t m a p size to repeatedly scan
L1 L2 L3 main memory
50 100 150 200 250 300 350 400 450 10000 100000 1×10^6 1×10^7 1×10^8 latency/load size of the region (bytes) latency per load in a random list traversal [0,1073741824] local
46 / 105
C cache miss rate 1 C + 1 2C C(1 + 1/K) fully associative K-way set associative d i r e c t m a p size to repeatedly scan
L1 L2 L3 main memory
50 100 150 200 250 300 350 400 450 10000 100000 1×10^6 1×10^7 1×10^8 latency/load size of the region (bytes) latency per load in a random list traversal [0,1073741824] local
47 / 105
1
$ numactl -N 0 -i 1 ./mem 100 200 300 400 500 600 700 800 900 16384 65536 262144 1.04858 × 10^6 4.1943 × 10^6 1.67772 × 10^7 6.71089 × 10^7 2.68435 × 10^8 latency/load (CPU cycles) size of the region (bytes) latency per load in a random list traversal [0,1073741824] local remote
memory controller L3 cache hardware thread (virtual core, CPU) (physical) core L2 cache L1 cache chip (socket, node, CPU) interconnect48 / 105
1 Introduction 2 Many algorithms are bounded by memory not CPU 3 Organization of processors, caches, and memory 4 So how costly is it to access data?
5 Other ways to get more bandwidth
6 How costly is it to communicate between threads?
49 / 105
5 10 15 20 25 30 35 40 45 50 10000 100000 1 × 10^6 1 × 10^7 1 × 10^8 bandwidth (GB/sec) size of the region (bytes) bandwidth of list traversal [0,1073741824] local remote
memory controller L3 cache hardware thread (virtual core, CPU) (physical) core L2 cache L1 cache chip (socket, node, CPU) interconnect50 / 105
0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1 × 10^8 bandwidth (GB/sec) size of the region (bytes) bandwidth of list traversal [33554432,1073741824] local remote
51 / 105
cache line size next pointers N elements (link all elements in a random order)
memory controller
L3 cache
(physical) core
cache
52 / 105
memory controller
L3 cache
(physical) core
cache
53 / 105
memory controller
L3 cache
(physical) core
cache
1
for (N times) {
2
p1 = p1->next;
3
p2 = p2->next;
4
...
5
}
53 / 105
1 Introduction 2 Many algorithms are bounded by memory not CPU 3 Organization of processors, caches, and memory 4 So how costly is it to access data?
5 Other ways to get more bandwidth
6 How costly is it to communicate between threads?
54 / 105
20 40 60 80 100 120 140 160 180 10000 100000 1 × 10^6 1 × 10^7 1 × 10^8 bandwidth (GB/sec) size of the region (bytes) bandwidth with a number of chains [0,1073741824] 1 chains 2 chains 4 chains 5 chains 8 chains 10 chains 12 chains 14 chains
55 / 105
1 2 3 4 5 6 7 1 × 10^8 bandwidth (GB/sec) size of the region (bytes) bandwidth with a number of chains [33554432,1073741824] 1 chains 2 chains 4 chains 5 chains 8 chains 10 chains 12 chains 14 chains
56 / 105
0.5 1 1.5 2 2.5 3 3.5 4 1 × 10^8 bandwidth (GB/sec) size of the region (bytes) bandwidth with a number of chains [33554432,1073741824] 1 chains 2 chains 4 chains 8 chains 10 chains 12 chains 14 chains
57 / 105
memory controller
L3 cache
(physical) core
cache
58 / 105
memory controller
L3 cache
(physical) core
cache
58 / 105
memory controller
L3 cache
(physical) core
cache
58 / 105
59 / 105
59 / 105
59 / 105
59 / 105
1 Introduction 2 Many algorithms are bounded by memory not CPU 3 Organization of processors, caches, and memory 4 So how costly is it to access data?
5 Other ways to get more bandwidth
6 How costly is it to communicate between threads?
60 / 105
61 / 105
61 / 105
1 make addresses sequential 2 make address generations independent 3 prefetch by software (make address generations go ahead) 4 use multiple threads/cores 61 / 105
1 make addresses sequential 2 make address generations independent 3 prefetch by software (make address generations go ahead) 4 use multiple threads/cores
61 / 105
1 Introduction 2 Many algorithms are bounded by memory not CPU 3 Organization of processors, caches, and memory 4 So how costly is it to access data?
5 Other ways to get more bandwidth
6 How costly is it to communicate between threads?
62 / 105
cache line size next pointers N elements (link all elements in a random order)
63 / 105
5 10 15 20 25 30 35 40 45 50 10000 100000 1 × 106 1 × 107 1 × 108 bandwidth (GB/sec) size of the region (bytes) bandwidth of random list traversal vs address-ordered list traversal [0,1073741824] address-sorted list random list
64 / 105
65 / 105
1 Introduction 2 Many algorithms are bounded by memory not CPU 3 Organization of processors, caches, and memory 4 So how costly is it to access data?
5 Other ways to get more bandwidth
6 How costly is it to communicate between threads?
66 / 105
1
for (N times) {
2
j = ... /∗ not use a[·] ∗/
3
a[j];
4
}
memory controller
L3 cache
(physical) core
cache
67 / 105
50 100 150 200 250 300 350 10000 100000 1 × 10^6 1 × 10^7 1 × 10^8 bandwidth (GB/sec) size of the region (bytes) list traversal vs random access vs sequential access [0,1073741824] ptrchase random sequential
68 / 105
2 4 6 8 10 12 14 16 1 × 10^8 bandwidth (GB/sec) size of the region (bytes) list traversal vs random access vs sequential access [33554432,1073741824] ptrchase random sequential
69 / 105
2 4 6 8 10 12 14 16 1 × 10^8 bandwidth (GB/sec) size of the region (bytes) list traversal vs random access vs sequential access [33554432,1073741824] ptrchase random sequential
70 / 105
71 / 105
72 / 105
1 Introduction 2 Many algorithms are bounded by memory not CPU 3 Organization of processors, caches, and memory 4 So how costly is it to access data?
5 Other ways to get more bandwidth
6 How costly is it to communicate between threads?
73 / 105
74 / 105
74 / 105
74 / 105
1
__builtin_prefetch(a [, rw, hint ])
74 / 105
75 / 105
75 / 105
75 / 105
75 / 105
75 / 105
75 / 105
1
for (N times) {
2
p = p->next;
3
prefetch(p->prefetch);
4
}
”prefetch pointers” pointing to several elements ahead
76 / 105
0.5 1 1.5 2 2.5 3 3.5 4 1 × 10^8 bandwidth (GB/sec) size of the region (bytes) bandwidth w/ and w/o prefetch [33554432,1073741824] prefetch=0 prefetch=10
77 / 105
2 4 6 8 10 12 14 16 1 × 10^8 bandwidth (GB/sec) size of the region (bytes) summary of various access patterns [33554432,1073741824] ptrchase (sorted) ptrchase random sequential ptrchase (prefetch) ptrchase (x 10)
78 / 105
1 Introduction 2 Many algorithms are bounded by memory not CPU 3 Organization of processors, caches, and memory 4 So how costly is it to access data?
5 Other ways to get more bandwidth
6 How costly is it to communicate between threads?
79 / 105
80 / 105
20 40 60 80 100 120 1 × 10^8 bandwidth (GB/sec) size of the region (bytes) bandwidth with a number of threads [33554432,1073741824] 1 chains, 1 threads 10 chains, 1 threads 1 chains, 4 threads 10 chains, 4 threads 1 chains, 8 threads 10 chains, 8 threads 1 chains, 16 threads 10 chains, 16 threads
81 / 105
20 40 60 80 100 120 140 160 180 1 × 10^8 bandwidth (GB/sec) size of the region (bytes) bandwidth with various methods and number of threads [33554432,1073741824] random 1 threads sequential 1 threads random 8 threads sequential 8 threads random 12 threads sequential 12 threads random 16 threads sequential 16 threads
82 / 105
L3 cache
hardware thread (virtual core, CPU) (physical) coreL2 cache
L1 cachechip (socket, node, CPU) interconnect
83 / 105
1
$ numactl options command
1
$ numactl -N 0 command # threads on CPU 0
1
# threads on cores 0-11 and 16-27
2
$ numactl --physcpubind 0-11,16-27 command
84 / 105
1
$ numactl -i 0,1 command # data on CPU 0 or 1
2
$ numactl -i all command # data on all CPUs
1
$ numactl -l command
85 / 105
86 / 105
1
$ OMP_NUM_THREADS=48 OMP_PROC_BIND=true numactl --physcpubind 0-11,16-27,32-43,48-59 -l command
87 / 105
88 / 105
89 / 105
1 Introduction 2 Many algorithms are bounded by memory not CPU 3 Organization of processors, caches, and memory 4 So how costly is it to access data?
5 Other ways to get more bandwidth
6 How costly is it to communicate between threads?
90 / 105
x
x = 100; ... = x;
91 / 105
x
x = 100; ... = x;
91 / 105
92 / 105
1 a write to an address by a processor “invalidates” all other
1 a write to an address by a processor “invalidates” all other
2 a read to an invalid line causes a miss and searches for a
93 / 105
93 / 105
L3 cache
hardware thread (virtual core, CPU) (physical) coreL2 cache
L1 cachechip (socket, node, CPU) interconnect
94 / 105
L3 cache
hardware thread (virtual core, CPU) (physical) coreL2 cache
L1 cachechip (socket, node, CPU) interconnect
94 / 105
1 one Modified (owner) + others Invalid ( ,
L3 cache
hardware thread (virtual core, CPU) (physical) coreL2 cache
L1 cachechip (socket, node, CPU) interconnect
94 / 1051 one Modified (owner) + others Invalid ( ,
2 no Modified ( ,
L3 cache
hardware thread (virtual core, CPU) (physical) coreL2 cache
L1 cachechip (socket, node, CPU) interconnect
94 / 10595 / 105
95 / 105
95 / 105
96 / 105
96 / 105
96 / 105
1
volatile long x = 0;
2
volatile long y = 0;
1
(ping thread)
2
for (i = 0; i < n; i++) {
3
x = i + 1;
4
while (y <= i) ;
5
}
1
(pong thread)
2
for (i = 0; i < n; i++) {
3
while (x <= i) ;
4
y = i + 1;
5
}
97 / 105
98 / 105
’-’ matrix 16 32 48 64 80 96 112 16 32 48 64 80 96 112 500 1000 1500 2000 2500
99 / 105
’-’ matrix 16 32 48 64 80 96 112 16 32 48 64 80 96 112 500 1000 1500 2000 2500
100 / 105
1
// assume inside #pragma omp parallel
2
...
3
#pragma omp for
4
for (k = 0; k < A.nnz; k++) {
5
i,j,Aij = A.elems[k];
6
#pragma omp atomic
7
y[i] += Aij * x[j];
8
}
101 / 105
102 / 105
103 / 105
104 / 105
105 / 105