/home/ytang/slides http://docs.nvidia.com/cuda/index.html - - PowerPoint PPT Presentation

▶

Jun 06, 2023 325 likes •546 views

/home/ytang/slides http://docs.nvidia.com/cuda/index.html __global__ void foo( ... ) { __global__ void foo( int *bar ) { if (

SLIDE 1

SLIDE 2

/home/ytang/slides
http://docs.nvidia.com/cuda/index.html

SLIDE 3

SLIDE 4

SLIDE 5

SLIDE 6

SLIDE 7

__global__ void foo( ... ) { if ( threadIdx.x % 2 ) { ... } else { ... } }

__global__ void foo( ... ) { if ( ( threadIdx.x / warpSize ) % 2 ) { ... } else { ... } }

__global__ void foo( int *bar ) { if ( bar[threadIdx.x] ) { ... } else { ... } }

__global__ void foo( int *bar ) { int tid = threadIdx.x; for( int i = 0; i < bar[tid]; i++ ) { ... } }

SLIDE 8

GPU & On-chip memory

Off-chip GRAM

SLIDE 9

SLIDE 10

cudaError_t cudaMalloc ( void** devPtr, size_t size );
cudaError_t cudaFree ( void* devPtr ) ;
device-side malloc/new/free/delete
cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind );
cudaError_t cudaMemset ( void* devPtr, int value, size_t count );
ptr[ index ] = value;

SLIDE 11

Coalesced, Aligned

Coalesced, Unaligned

Uncoalesced, Unaligned

Strided

SLIDE 12

__global__ void foo( int *bar ) { bar[thread_id()] = ...; }

__global__ void foo( int *bar ) { bar[thread_id()+8] = ...; }

__global__ void foo( int *bar ) { bar[thread_id()+13] = ...; }

__global__ void foo( int *bar ) { int e = bar[thread_id()+16]; }

__global__ void foo( double *bar ) { double e = bar[thread_id()+16]; }

__global__ void foo( int2 *bar ) { int e = bar[thread_id()].x; }

__global__ void foo( float4 *bar ) { float e = bar[thread_id()].z; }

__global__ void foo( int *map, int *bar ) { int e = bar[ map[thread_id()] ]; }

SLIDE 13

SLIDE 14

SLIDE 15

__global__ void foo( int *bar, int *map ) { int x = __ldg( bar + map[ threadIdx.x ] ); }

__global__ void foo2( const int* __restrict__ bar, int *map ) { int x = bar[ map[ threadIdx.x ] ]; }

SLIDE 16

SLIDE 17

modify = add, sub, exchange, etc...
float

__shared__ int sum; int b = ...; sum += b;

__shared__ int sum; int b = ...; register r = sum; r += b; sum = r;

__shared__ int sum; int b0 = ...; register r0 = sum; r0 += b0; int b1 = ...; register r1 = sum; sum = r0; r1 += b1; sum = r1;

SLIDE 18

type __shfl(type var, int srcLane, int width=warpSize);
__shfl()

__shfl_up() __shfl_down() __shfl_xor()

SLIDE 19

$S_n = \sum_{i=0}^{n-1} a_i$

for(int i = 0 ; i < n ; i++) sum += a[i];

SLIDE 20