last time

  1. last time
     - the data flow model: graph of operations linked by dependencies
     - out-of-order execution and instruction queues
     - latency bound: need to finish the longest dependency chain
     - multiple accumulators: expose more parallelism
     - divide by constant
     - reusing address calculations in loops

     vector instructions
     - modern processors have registers that hold a "vector" of values
     - example: x86-64 has 128-bit registers named %xmm0 through %xmm15, each holding 4 ints or 4 floats or 2 doubles or …
     - instructions that act on all values in a register: vector instructions or SIMD (single instruction, multiple data)
     - 3 extra copies of the ALUs, only accessed by vector instructions

     example vector instruction: paddd %xmm0, %xmm1 (packed add dword (32-bit))
     - suppose the registers contain (interpreted as 4 ints): %xmm0: [1, 2, 3, 4] and %xmm1: [5, 6, 7, 8]
     - the result will be %xmm1: [6, 8, 10, 12]
     (a scalar sketch of these semantics follows below)
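     To make the lane-by-lane behavior concrete, here is a minimal scalar sketch of what paddd computes; the function name is hypothetical, and this models only the semantics, not the hardware:

         #include <stdint.h>

         /* paddd %xmm0, %xmm1: four independent 32-bit adds, one per lane;
            overflow wraps within a lane and never carries into the next. */
         void paddd_semantics(uint32_t xmm1[4], const uint32_t xmm0[4]) {
             for (int lane = 0; lane < 4; ++lane)
                 xmm1[lane] += xmm0[lane];
         }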

  2. vector add picture
     [diagram: elements A[0]…A[9] and B[0]…B[9] feeding a 4-wide vector add; one paddd produces A[0]+B[0] through A[3]+B[3] at once]

     one view of vector functional units
     [diagram: a vector ALU built from 4 lanes (lane 1 through lane 4), each a 3-stage pipelined ALU; movdqu delivers input values from %xmm0 and %xmm1 one set per cycle, paddd drives all lanes, and output values emerge one set per cycle]

     the add loop, vectorized (a scalar sketch of this transformation follows below):

         void add(int * restrict a, int * restrict b) {
             for (int i = 0; i < 128; ++i)
                 a[i] += b[i];
         }

         add:
             xorl   %eax, %eax            // init. loop counter
         the_loop:
             movdqu (%rdi,%rax), %xmm0    // load 4 from A
             movdqu (%rsi,%rax), %xmm1    // load 4 from B
             paddd  %xmm1, %xmm0          // add 4 elements!
             movups %xmm0, (%rdi,%rax)    // store 4 in A
             addq   $16, %rax             // + 4 ints = + 16
             cmpq   $512, %rax            // 512 = 4 * 128
             jne    the_loop
             rep ret

     variance from this optimization: the wiggles on the prior graphs (8 elements in a vector, so multiples of 8 are easier)
     [plot: cycles per multiply/add for the optimized loop vs. N (0 to 1000), blocked and unblocked curves, y-axis 0.0 to 0.5]
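     The assembly processes 16 bytes per iteration. As a hedged illustration (the function name is hypothetical), the compiler has effectively strip-mined the loop by 4 before mapping each group onto one vector instruction:

         /* Equivalent strip-mined form of add(): 32 iterations of 4 adds each.
            Each group of 4 becomes one movdqu/movdqu/paddd/movups sequence. */
         void add_strip_mined(int * restrict a, int * restrict b) {
             for (int i = 0; i < 128; i += 4) {
                 a[i]     += b[i];
                 a[i + 1] += b[i + 1];
                 a[i + 2] += b[i + 2];
                 a[i + 3] += b[i + 3];
             }
         }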

  3. why vector instructions?
     - lots of logic not dedicated to computation: instruction fetch, branch prediction, instruction queue, reorder buffer, …
     - adding vector instructions: little extra control logic
     - …but a lot more computational capacity

     vector instructions and compilers
     - compilers can sometimes figure out how to use vector instructions (and have gotten much, much better at it over the past decade)
     - but easily messed up: by aliasing, by conditionals, by some operation with no vector instruction, … (see the aliasing sketch after this slide)

     fickle compiler vectorization (1)

         #define N 1024
         // GCC 7.2 and Clang 5.0 generate vector instructions for this:
         void foo(unsigned int *A, unsigned int *B) {
             for (int k = 0; k < N; ++k)
                 for (int i = 0; i < N; ++i)
                     for (int j = 0; j < N; ++j)
                         B[i * N + j] += A[i * N + k] * A[k * N + j];
         }

     but not: (probably bug?)

         void foo(long N, unsigned int *A, unsigned int *B) {
             for (long k = 0; k < N; ++k)
                 for (long i = 0; i < N; ++i)
                     for (long j = 0; j < N; ++j)
                         B[i * N + j] += A[i * N + k] * A[k * N + j];
         }

     fickle compiler vectorization (2)

         // Clang 5.0.0 generates vector instructions for this:
         void foo(int N, unsigned int *A, unsigned int *B) {
             for (int i = 0; i < N; ++i)
                 for (int k = 0; k < N; ++k)
                     for (int j = 0; j < N; ++j)
                         B[i * N + j] += A[i * N + k] * A[k * N + j];
         }

     but not:

         void foo(unsigned int *A, unsigned int *B) {
             for (int i = 0; i < N; ++i)
                 for (int j = 0; j < N; ++j)
                     for (int k = 0; k < N; ++k)
                         B[i * N + j] += A[i * N + k] * A[j * N + k];
         }
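     Aliasing is the most common blocker in practice: if a and b could overlap, vectorizing a[i] += b[i] might read elements the loop already wrote. A minimal sketch of the usual fix (function names hypothetical); restrict promises the compiler the arrays do not overlap:

         /* Without restrict the compiler must assume a and b may overlap,
            so it emits scalar code or guards the loop with an overlap check. */
         void add_may_alias(int *a, int *b) {
             for (int i = 0; i < 128; ++i)
                 a[i] += b[i];
         }

         /* With restrict, overlap is ruled out and vectorizing is clearly safe. */
         void add_no_alias(int * restrict a, int * restrict b) {
             for (int i = 0; i < 128; ++i)
                 a[i] += b[i];
         }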

  4. vector intrinsics
     - if the compiler doesn't work…
     - could write vector instruction assembly by hand
     - second option: "intrinsic functions": C functions that compile to particular instructions
     - special type __m128i: "128 bits of integers"; other types: __m128 (floats), __m128d (doubles)

     vector intrinsics: add example (a self-contained, runnable version follows below)

         void vectorized_add(int *a, int *b) {
             for (int i = 0; i < 128; i += 4) {
                 // a_values = {a[i], a[i+1], a[i+2], a[i+3]}
                 // "si128" --> 128-bit integer
                 __m128i a_values = _mm_loadu_si128((__m128i *) &a[i]);
                 // b_values = {b[i], b[i+1], b[i+2], b[i+3]}
                 __m128i b_values = _mm_loadu_si128((__m128i *) &b[i]);
                 // add four 32-bit integers:
                 // sums = {a[i] + b[i], a[i+1] + b[i+1], ...}
                 __m128i sums = _mm_add_epi32(a_values, b_values);
                 // {a[i], a[i+1], a[i+2], a[i+3]} = sums
                 _mm_storeu_si128((__m128i *) &a[i], sums);
             }
         }

     - _mm_loadu_si128 / _mm_storeu_si128: functions to load/store; u is for "unaligned" (otherwise, the pointer address must be a multiple of 16); si128 means "128-bit integer value"
     - _mm_add_epi32: function to add; epi32 means "4 32-bit integers"
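     For reference, here is a self-contained version of the same example. The slides do not show headers; these SSE2 intrinsics are declared in emmintrin.h, and the main harness below is purely illustrative:

         #include <stdio.h>
         #include <emmintrin.h>  /* SSE2: _mm_loadu_si128, _mm_add_epi32, _mm_storeu_si128 */

         void vectorized_add(int *a, int *b) {
             for (int i = 0; i < 128; i += 4) {
                 __m128i a_values = _mm_loadu_si128((__m128i *) &a[i]);
                 __m128i b_values = _mm_loadu_si128((__m128i *) &b[i]);
                 __m128i sums = _mm_add_epi32(a_values, b_values);
                 _mm_storeu_si128((__m128i *) &a[i], sums);
             }
         }

         int main(void) {
             int a[128], b[128];
             for (int i = 0; i < 128; ++i) { a[i] = i; b[i] = 2 * i; }
             vectorized_add(a, b);
             printf("a[5] = %d\n", a[5]);  /* prints 15 (5 + 10) */
             return 0;
         }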

  5. vector intrinsics: different size (a 16-bit variant follows below)

         void vectorized_add_64bit(long *a, long *b) {
             for (int i = 0; i < 128; i += 2) {
                 // a_values = {a[i], a[i+1]} (2 x 64 bits)
                 __m128i a_values = _mm_loadu_si128((__m128i *) &a[i]);
                 // b_values = {b[i], b[i+1]} (2 x 64 bits)
                 __m128i b_values = _mm_loadu_si128((__m128i *) &b[i]);
                 // add two 64-bit integers: paddq %xmm0, %xmm1
                 // sums = {a[i] + b[i], a[i+1] + b[i+1]}
                 __m128i sums = _mm_add_epi64(a_values, b_values);
                 // {a[i], a[i+1]} = sums
                 _mm_storeu_si128((__m128i *) &a[i], sums);
             }
         }

     recall: square

         void square(unsigned int *A, unsigned int *B) {
             for (int k = 0; k < N; ++k)
                 for (int i = 0; i < N; ++i)
                     for (int j = 0; j < N; ++j)
                         B[i * N + j] += A[i * N + k] * A[k * N + j];
         }
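     The same pattern works at any element width the ISA supports. As a hedged sketch (the function name is hypothetical), _mm_add_epi16, also SSE2, performs eight 16-bit adds per instruction:

         #include <stdint.h>
         #include <emmintrin.h>

         /* Same 128-bit registers, now treated as 8 lanes of 16 bits each. */
         void vectorized_add_16bit(int16_t *a, int16_t *b) {
             for (int i = 0; i < 128; i += 8) {
                 __m128i a_values = _mm_loadu_si128((__m128i *) &a[i]);
                 __m128i b_values = _mm_loadu_si128((__m128i *) &b[i]);
                 __m128i sums = _mm_add_epi16(a_values, b_values);  /* paddw */
                 _mm_storeu_si128((__m128i *) &a[i], sums);
             }
         }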
