nvidia gpus with pgi c++
play

NVIDIA GPUS WITH PGI C++ David Olsen GTC S9770 March 20, 2019 - PowerPoint PPT Presentation

C++17 PARALLEL ALGORITHMS ON NVIDIA GPUS WITH PGI C++ David Olsen GTC S9770 March 20, 2019 __global__ void saxpy_kernel(float* x, float* y, float* z, float a, int N) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i +=


  1. C++17 PARALLEL ALGORITHMS ON NVIDIA GPUS WITH PGI C++ David Olsen GTC S9770 March 20, 2019

  2. __global__ void saxpy_kernel(float* x, float* y, float* z, float a, int N) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += gridDim.x * blockDim.x) { z[i] = x[i] * a + y[i]; } } void saxpy(float* x, float* y, float* z, float a, int N) { size_t size = N * sizeof(float); float *d_x, *d_y, *d_z; cudaMalloc(&d_x, size); cudaMalloc(&d_y, size); cudaMalloc(&d_z, size); cudaMemcpy(d_x, x, size, cudaMemcpyHostToDevice); cudaMemcpy(d_y, y, size, cudaMemcpyHostToDevice); saxpy_kernel<<<64,256>>>(d_x, d_y, d_z, a, N); cudaMemcpy(z, d_z, size, cudaMemcpyDeviceToHost); cudaFree(d_x); cudaFree(d_y); cudaFree(d_z); } 2

  3. void saxpy(float* x, float* y, float* z, float a, int N) { for (int i = 0; i < N; ++i) { z[i] = x[i] * a + y[i]; } } 3

  4. GPU C++ PROGRAMMING TODAY KOKKOS #pragmas Language Extensions Libraries 4

  5. C++17 PARALLEL ALGORITHMS Parallelism in Standard C++ Execution policies can be applied to most standard algorithms std::execution::seq = sequential std::execution::par = parallel std::execution::par_unseq = parallel + vectorized Several existing algorithms were renamed accumulate => reduce inner_product => transform_reduce partial_sum => inclusive_scan 5

  6. C++17 PARALLEL ALGORITHMS Example C++98: std::sort(c.begin(), c.end()); C++17: std::sort(std::execution::par, c.begin(), c.end()); C++98: double prod = std::accumulate( first, last, 1.0, std::multiplies<double>()); C++17: double prod = std::reduce(std::execution::par, first, last, 1.0, std::multiplies<double>()); 6

  7. THE FUTURE OF GPU PROGRAMMING Standard C++ | Directives | CUDA — Standard C++ (GPU Accelerated Standard C++): std::transform(par, x, x+n, y, y, [=](float x, float y) { return y + a*x; }); — Directives (Incremental Performance Optimization with OpenACC): #pragma acc data copy(x,y) { ... std::transform(par, x, x+n, y, y, [=](float x, float y) { return y + a*x; }); } — CUDA (Maximize GPU Performance with CUDA C++): __global__ void saxpy(int n, float a, float *x, float *y) { int i = blockIdx.x*blockDim.x + threadIdx.x; if (i < n) y[i] += a*x[i]; } int main(void) { ... cudaMemcpy(d_x, x, ...); cudaMemcpy(d_y, y, ...); ... saxpy<<<(N+255)/256,256>>>(...); cudaMemcpy(y, d_y, ...); } 7

  8. THE FUTURE OF GPU PROGRAMMING Standard C++ | Directives | CUDA Coming soon to a PGI C++ compiler near you — Standard C++ (GPU Accelerated Standard C++): std::transform(par, x, x+n, y, y, [=](float x, float y) { return y + a*x; }); — Directives (Incremental Performance Optimization with OpenACC): #pragma acc data copy(x,y) { ... std::transform(par, x, x+n, y, y, [=](float x, float y) { return y + a*x; }); } — CUDA (Maximize GPU Performance with CUDA C++): __global__ void saxpy(int n, float a, float *x, float *y) { int i = blockIdx.x*blockDim.x + threadIdx.x; if (i < n) y[i] += a*x[i]; } int main(void) { ... cudaMemcpy(d_x, x, ...); cudaMemcpy(d_y, y, ...); ... saxpy<<<(N+255)/256,256>>>(...); cudaMemcpy(y, d_y, ...); } 8

  9. PGI — THE NVIDIA HPC SDK Fortran, C & C++ Compilers Optimizing, SIMD Vectorizing, OpenMP Accelerated Computing Features CUDA Fortran, OpenACC Directives Multi-Platform Solution X86-64 and OpenPOWER Multicore CPUs NVIDIA Tesla GPUs Supported on Linux, macOS, Windows MPI/OpenMP/OpenACC Tools Debugger Performance Profiler Interoperable with DDT , TotalView 9

  10. PGI Compilers, The NVIDIA HPC SDK: Updates for 2019 Michael Wolfe (NVIDIA, PGI Compiler Engineer) Thursday, 10:00am, Room 211A 10

  11. CODE EXAMPLES 11

  12. TRAVELING SALESMAN Find the shortest route that visits every city 12

  13. TRAVELING SALESMAN Sequential code route_cost find_best_route(int const* distances, int N) { long num_routes = factorial(N); route_cost best_route; for (long i = 0; i < num_routes; ++i) { int cost = 0; route_iterator it(i, N); int from = it.first(); while (!it.done()) { int to = it.next(); cost += distances[from*N + to]; from = to; } best_route = route_cost::min(best_route, route_cost(i, cost)); } return best_route; } 13

  14. TRAVELING SALESMAN Helper code route_cost is a (route ID, cost) pair with a min function that returns the less costly of two routes struct route_cost { long route; int cost; route_cost() : route(-1), cost(std::numeric_limits<int>::max()) { } route_cost(long route, int cost) : route(route), cost(cost) { } static route_cost min(route_cost const& x, route_cost const& y) { if (x.cost < y.cost) { return x; } return y; } }; 14

  15. TRAVELING SALESMAN Helper code route_iterator calculates a route, given a route ID and the number of cities struct route_iterator { route_iterator(long route_id, int num_hops); bool done() const; // at the end of the route? int first(); // first city of the route int next(); // next city of the route }; 15

  16. TRAVELING SALESMAN Sequential code route_cost find_best_route(int const* distances, int N) { long num_routes = factorial(N); route_cost best_route; for (long i = 0; i < num_routes; ++i) { int cost = 0; route_iterator it(i, N); int from = it.first(); while (!it.done()) { int to = it.next(); cost += distances[from*N + to]; from = to; } best_route = route_cost::min(best_route, route_cost(i, cost)); } return best_route; } 16

  17. TRAVELING SALESMAN Sequential code route_cost find_best_route(int const* distances, int N) { long num_routes = factorial(N); route_cost best_route; for (long i = 0; i < num_routes; ++i) { int cost = 0; route_iterator it(i, N); int from = it.first(); while (!it.done()) { int to = it.next(); cost += distances[from*N + to]; from = to; } best_route = route_cost::min(best_route, route_cost(i, cost)); } return best_route; } 17

  18. TRAVELING SALESMAN Sequential code route_cost find_best_route(int const* distances, int N) { long num_routes = factorial(N); route_cost best_route; for (long i = 0; i < num_routes; ++i) { int cost = 0; route_iterator it(i, N); int from = it.first(); while (!it.done()) { int to = it.next(); cost += distances[from*N + to]; from = to; } best_route = route_cost::min(best_route, route_cost(i, cost)); } return best_route; } 18

  19. TRAVELING SALESMAN Analysis route_cost find_best_route(int const* distances, int N) { long num_routes = factorial(N); route_cost best_route; for (long i = 0; i < num_routes; ++i) { int cost = 0; route_iterator it(i, N); int from = it.first(); while (!it.done()) { int to = it.next(); cost += distances[from*N + to]; from = to; } best_route = route_cost::min(best_route, route_cost(i, cost)); } return best_route; } 19

  20. TRAVELING SALESMAN Manual threading route_cost find_best_route(int const* distances, int N) { long num_routes = factorial(N); route_cost best_route; std::mutex route_mutex; int num_threads = std::thread::hardware_concurrency(); if (num_threads == 0) num_threads = 4; std::vector<std::thread> threads; threads.reserve(num_threads); for (int t = 0; t < num_threads; ++t) { threads.push_back(std::thread( [=, &best_route, &route_mutex](int chunk) { route_cost local_best; for (long i = chunk; i < num_routes; i += num_threads) { int cost = 0; route_iterator it(i, N); int from = it.first(); while (!it.done()) { int to = it.next(); cost += distances[from*N + to]; from = to; } local_best = route_cost::min(local_best, route_cost(i, cost)); } std::lock_guard<std::mutex> lck(route_mutex); best_route = route_cost::min(best_route, local_best); }, t)); } for (std::thread& th : threads) { th.join(); } return best_route; } 20

  21. TRAVELING SALESMAN Manual threading route_cost find_best_route(int const* distances, int N) { long num_routes = factorial(N); route_cost best_route; std::mutex route_mutex; int num_threads = std::thread::hardware_concurrency(); if (num_threads == 0) num_threads = 4; std::vector<std::thread> threads; threads.reserve(num_threads); for (int t = 0; t < num_threads; ++t) { threads.push_back(std::thread( [=, &best_route, &route_mutex](int chunk) { route_cost local_best; for (long i = chunk; i < num_routes; i += num_threads) { int cost = 0; route_iterator it(i, N); int from = it.first(); while (!it.done()) { int to = it.next(); cost += distances[from*N + to]; from = to; } local_best = route_cost::min(local_best, route_cost(i, cost)); } std::lock_guard<std::mutex> lck(route_mutex); best_route = route_cost::min(best_route, local_best); }, t)); } for (std::thread& th : threads) { th.join(); } return best_route; } 21

  22. TRAVELING SALESMAN Manual threading route_cost find_best_route(int const* distances, int N) { long num_routes = factorial(N); route_cost best_route; std::mutex route_mutex; int num_threads = std::thread::hardware_concurrency(); if (num_threads == 0) num_threads = 4; std::vector<std::thread> threads; threads.reserve(num_threads); for (int t = 0; t < num_threads; ++t) { threads.push_back(std::thread( [=, &best_route, &route_mutex](int chunk) { route_cost local_best; for (long i = chunk; i < num_routes; i += num_threads) { int cost = 0; route_iterator it(i, N); int from = it.first(); while (!it.done()) { int to = it.next(); cost += distances[from*N + to]; from = to; } local_best = route_cost::min(local_best, route_cost(i, cost)); } std::lock_guard<std::mutex> lck(route_mutex); best_route = route_cost::min(best_route, local_best); }, t)); } for (std::thread& th : threads) { th.join(); } return best_route; } 22

Download Presentation
Download Policy: The content available on the website is offered to you 'AS IS' for your personal information and use only. It cannot be commercialized, licensed, or distributed on other websites without prior consent from the author. To download a presentation, simply click this link. If you encounter any difficulties during the download process, it's possible that the publisher has removed the file from their server.

Recommend


More recommend