memory c and x86 assembly
play

Memory: C and x86 assembly 1 Loop Refresher mem ops Optimized or - PowerPoint PPT Presentation

Memory: C and x86 assembly 1 Loop Refresher mem ops Optimized or sum: .LFB2: .loc 1 2 0 unoptimized? .LVL0: .loc 1 4 0 eax == s 0 movl $0, %eax .LVL1: ??? 0 int sum(int count) testl %edi, %edi ??? 0 { int s = 0; jle


  1. Memory: C and x86 assembly 1

  2. Loop Refresher mem ops Optimized or sum: .LFB2: .loc 1 2 0 unoptimized? .LVL0: .loc 1 4 0 eax == s 0 movl $0, %eax .LVL1: ??? 0 int sum(int count) testl %edi, %edi ??? 0 { int s = 0; jle .L4 s = 0 0 int i; movl $0, %eax i = 0 0 for(i = 0; i < count; i++) { movl $0, %edx s+= i; .LVL2: } .L5: return s; .loc 1 5 0 s+=i 0 } addl %edx, %eax .loc 1 4 0 i++ 0 addl $1, %edx i < count 0 cmpl %edi, %edx go again 0 jne .L5 .L4: .LVL3: .loc 1 8 0 Done 0 rep ; ret 2

  3. Loop Refresher mem ops Optimized or sum: .LFB2: .loc 1 2 0 unoptimized? .LVL0: .loc 1 4 0 eax == s 0 movl $0, %eax .LVL1: ??? 0 int sum(int count) testl %edi, %edi ??? 0 { int s = 0; jle .L4 Optimized s = 0 0 int i; movl $0, %eax i = 0 0 for(i = 0; i < count; i++) { movl $0, %edx s+= i; .LVL2: } .L5: return s; .loc 1 5 0 s+=i 0 } addl %edx, %eax .loc 1 4 0 i++ 0 addl $1, %edx i < count 0 cmpl %edi, %edx go again 0 jne .L5 .L4: .LVL3: .loc 1 8 0 Done 0 rep ; ret 2

  4. Array Access in a Loop sum: mem ops .LFB2: int array[10]; .loc 1 5 0 .LVL0: int sum(int count) .loc 1 8 0 s = 0 0 { movl $0, %eax access memory at int s = 0; .LVL1: count <= 0? 0 int i; testl %edi, %edi yes? skip everything 0 for(i = 0; i < count; i++) { jle .L4 array + (long)i * 4 s = 0 0 s+= array[i]; movl $0, %eax i = 0; is in a 32 bit number 0 } movl $0, %ecx return s; .LVL2: t1 = 0, this is a 64-bit 0 } movl $0, %edx version of i, for address calc purposes. .L5: .loc 1 9 0 addl array(,%rdx,4), %eax s += array[i] 1 .loc 1 8 0 i++ 0 addl $1, %ecx t1++ 0 addq $1, %rdx i < count 0 cmpl %edi, %ecx 0 jne .L5 .L4: .LVL3: .loc 1 12 0 0 rep ; ret .LFE2: The array is statically .size sum, .-sum allocate 40 bytes for array .comm array,40,32 aligned at 32 byte boundary declared here 3

  5. Array Access in a Loop sum: mem ops .LFB2: int array[10]; .loc 1 5 0 .LVL0: int sum(int count) .loc 1 8 0 s = 0 0 { movl $0, %eax int s = 0; .LVL1: ??? 0 int i; testl %edi, %edi ??? 0 for(i = 0; i < count; i++) { jle .L4 s = 0 0 s+= array[i]; movl $0, %eax i = 0; is in a 32 bit number 0 } movl $0, %ecx return s; .LVL2: t1 = 0, this is 0 } movl $0, %edx an address (64 bits) .L5: .loc 1 9 0 addl array(,%rdx,4), %eax s += array[i] 1 .loc 1 8 0 i++ 0 addl $1, %ecx t1++ 0 addq $1, %rdx i < count 0 cmpl %edi, %ecx 0 jne .L5 .L4: .LVL3: .loc 1 12 0 Good Spatial 0 rep ; ret .LFE2: .size sum, .-sum Locality allocate 40 .comm array,40,32 bytes for array aligned at 32 byte boundary byte array array +1 ... first access Second access Third access 4

  6. Long long int instead arayLoop2.c .globl sum .type sum, @function sum: .LFB2: .loc 1 5 0 .LVL0: .loc 1 8 0 s = 0 movl $0, %eax .LVL1: ??? long long int array[10]; testl %edi, %edi ??? jle .L4 s = 0 int sum(int count) movl $0, %eax i = 0 { movl $0, %edx int s = 0; .LVL2: cast count long long int i; movslq %edi,%rcx to a long long int for(i = 0; i < count; i++) { .LVL3: s+= array[i]; .L5: } .loc 1 9 0 addl array(,%rdx,8),%eax now x8 1 return s; instead x4 } .loc 1 8 0 i++ addq $1, %rdx i < count cmpq %rcx, %rdx jne .L5 .LVL4: .L4: .LVL5: .loc 1 12 0 rep ; ret .LFE2: .size sum, .-sum 2x the 5 .comm array,80,32 bytes

  7. Structs How big is aStruct? 24 bytes! me struct.c m 0 a ops 1 a First .globl sum 2 a struct aStruct{ .type sum, @function 3 a int a; sum: 4 b int b; .LFB2: 5 b Second 6 b char c; .loc 1 9 0 7 b s == rdi long long int d; .LVL0: 8 c Third }; .loc 1 13 0 9 padding for alignment t = 0; t += s->a 1 movl (%rdi), %eax 10 padding for alignment 11 padding for alignment t += s->b 1 int sum(struct aStruct * s) { addl 4(%rdi), %eax 12 padding for alignment .LVL1: 13 padding for alignment 14 padding for alignment cast s->c to 1 int t = 0; movsbl 8(%rdi),%edx long 15 padding for alignment t+= s->c t += s->a; addl %edx, %eax 16 d 17 d t += s->b; .LVL2: 18 d t+= s->d 1 t += s->c; addl 16(%rdi), %eax 19 d Fourth t += s->d; .loc 1 19 0 20 d 21 d ret 22 d 23 d return t; Spatial locality? } Some good some bad 6

  8. Structs How big is aStruct? 24 bytes! me struct.c m 0 a ops 1 a First .globl sum 2 a struct aStruct{ .type sum, @function 3 a int a; sum: 4 b int b; .LFB2: 5 b Second 6 b char c; .loc 1 9 0 7 b s == rdi long long int d; .LVL0: 8 c Third }; .loc 1 13 0 9 padding for alignment t = 0; t += s->a 1 movl (%rdi), %eax 10 padding for alignment 11 padding for alignment t += s->b 1 int sum(struct aStruct * s) { addl 4(%rdi), %eax 12 padding for alignment .LVL1: 13 padding for alignment 14 padding for alignment cast s->c to 1 int t = 0; movsbl 8(%rdi),%edx long 15 padding for alignment t+= s->c t += s->a; addl %edx, %eax 16 d 17 d t += s->b; .LVL2: 18 d t+= s->d 1 t += s->c; addl 16(%rdi), %eax 19 d Fourth t += s->d; .loc 1 19 0 20 d 21 d ret 22 d 23 d return t; Spatial locality? } Note the usefulness of the Some good some bad immediate for mem ops. 6

  9. 2D Array sum: long long int array[10][10]; .LFB2: .loc 1 5 0 int sum(int x, int count) .LVL0: { .loc 1 8 0 r8 == s int s = 0; movl $0, %r8d long long int i; .LVL1: ??? for(i = 0; i < count; i++) { testl %esi, %esi ??? s+= array[x][i]; jle .L4 cast x to long } movslq %edi,%rax long x = x + x*4 return s; leaq (%rax,%rax,4), %rax x *= 16, so x = } salq $4, %rax 16x + x*64 array+x addq $array, %rax s = 0 movl $0, %r8d i = 0 movl $0, %edx .LVL2: cast count to a movslq %esi,%rcx long long int .LVL3: .L5: .loc 1 9 0 s += array[x][i] addl (%rax), %r8d .loc 1 8 0 Step one entry in the array i ++ addq $1, %rdx addr += 8 addq $8, %rax cmpq %rcx, %rdx jne .L5 .LVL4: .L4: .loc 1 12 0 movl %r8d, %eax ret The array is a contiguous .LFE2: .size sum, .-sum .comm array,800,32 chunk of 800 bytes array + x*80 array + (x+10)*80 Good Spatial Locality 7

  10. 2D Array sum: long long int array[10][10]; .LFB2: .loc 1 5 0 int sum(int x, int count) .LVL0: { .loc 1 8 0 r8 == s int s = 0; movl $0, %r8d long long int i; .LVL1: ??? for(i = 0; i < count; i++) { testl %esi, %esi ??? s+= array[x][i]; jle .L4 cast x to long } movslq %edi,%rax long x = x + x*4 return s; leaq (%rax,%rax,4), %rax x *= 16, so x = } salq $4, %rax 16x + x*64 array+x addq $array, %rax s = 0 movl $0, %r8d i = 0 movl $0, %edx .LVL2: cast count to a movslq %esi,%rcx long long int .LVL3: .L5: .loc 1 9 0 s += array[x][i] addl (%rax), %r8d .loc 1 8 0 Step one entry in the array i ++ addq $1, %rdx addr += 8 addq $8, %rax cmpq %rcx, %rdx jne .L5 .LVL4: .L4: .loc 1 12 0 movl %r8d, %eax ret The array is a contiguous .LFE2: .size sum, .-sum .comm array,800,32 chunk of 800 bytes array + x*80 array + (x+10)*80 Good Spatial Locality 7

  11. 2D Array sum: long long int array[10][10]; .LFB2: x = (x + 4x)*16 .loc 1 5 0 int sum(int x, int count) .LVL0: { .loc 1 8 0 r8 == s int s = 0; movl $0, %r8d x = 16x+64x long long int i; .LVL1: ??? for(i = 0; i < count; i++) { testl %esi, %esi ??? s+= array[x][i]; jle .L4 x*=80 cast x to long } movslq %edi,%rax long x = x + x*4 return s; leaq (%rax,%rax,4), %rax 80 = 10 x sizeof(long long) x *= 16, so x = } salq $4, %rax 16x + x*64 array+x addq $array, %rax s = 0 movl $0, %r8d i = 0 movl $0, %edx .LVL2: cast count to a movslq %esi,%rcx long long int .LVL3: .L5: .loc 1 9 0 s += array[x][i] addl (%rax), %r8d .loc 1 8 0 Step one entry in the array i ++ addq $1, %rdx addr += 8 addq $8, %rax cmpq %rcx, %rdx jne .L5 .LVL4: .L4: .loc 1 12 0 movl %r8d, %eax ret The array is a contiguous .LFE2: .size sum, .-sum .comm array,800,32 chunk of 800 bytes array + x*80 array + (x+10)*80 Good Spatial Locality 7

  12. 2D Array sum: long long int array[10][10]; .LFB2: x = (x + 4x)*16 .loc 1 5 0 int sum(int x, int count) .LVL0: { .loc 1 8 0 r8 == s int s = 0; movl $0, %r8d x = 16x+64x long long int i; .LVL1: ??? for(i = 0; i < count; i++) { testl %esi, %esi ??? s+= array[x][i]; jle .L4 x*=80 cast x to long } movslq %edi,%rax long x = x + x*4 return s; leaq (%rax,%rax,4), %rax 80 = 10 x sizeof(long long) x *= 16, so x = } salq $4, %rax 16x + x*64 array+x addq $array, %rax s = 0 movl $0, %r8d i = 0 movl $0, %edx .LVL2: cast count to a movslq %esi,%rcx long long int .LVL3: .L5: .loc 1 9 0 s += array[x][i] addl (%rax), %r8d .loc 1 8 0 Step one entry in the array i ++ addq $1, %rdx addr += 8 addq $8, %rax cmpq %rcx, %rdx jne .L5 .LVL4: .L4: .loc 1 12 0 movl %r8d, %eax ret The array is a contiguous .LFE2: .size sum, .-sum .comm array,800,32 chunk of 800 bytes array + x*80 array + (x+10)*80 Good Spatial Locality 7

Download Presentation
Download Policy: The content available on the website is offered to you 'AS IS' for your personal information and use only. It cannot be commercialized, licensed, or distributed on other websites without prior consent from the author. To download a presentation, simply click this link. If you encounter any difficulties during the download process, it's possible that the publisher has removed the file from their server.

Recommend


More recommendations