Interoperability of Shared Memory Parallel Programming Models with Charm++
Jaemin Choi
University of Illinois Urbana-Champaign
May 2, 2019
1 / 16
Interoperability of Shared Memory Parallel Programming Models with Charm++ - PowerPoint PPT Presentation
Interoperability of Shared Memory Parallel Programming Models with Charm++ Jaemin Choi University of Illinois Urbana-Champaign May 2, 2019 1 / 16 Overview 2. Compiling Libraries 3. Creating Hybrid Programs 4. Vector Addition Example 5.
1 / 16
2 / 16
▶ ’Performance portability’ ▶ Abstractions for parallel execution and data management
▶ Support for wide variety of architectures ▶ Load balancing 3 / 16
▶ OpenMP backend for CPU ▶ CUDA backend for GPU
4 / 16
mkdir build && cd build
../generate_makefile.bash --prefix=<absolute path to build> \
    --with-cuda=<path to CUDA toolkit> --with-cuda-options=enable_lambda \
    --with-openmp --arch=<CPU arch>,<GPU arch> --compiler=<path to included NVCC wrapper>
make -j kokkoslib
make install
5 / 16
mkdir build && mkdir install && cd build
cmake -DENABLE_CUDA=On -DCMAKE_INSTALL_PREFIX=<path to RAJA install folder> ../
make -j
make install
6 / 16
▶ Can be put in the same file as Charm++ if GPU is not used
▶ A nodegroup chare for each Kokkos/RAJA instance
▶ Additional options needed (e.g. -fopenmp) ▶ Use NVCC wrapper with Kokkos
▶ Need to link Kokkos/RAJA library
7 / 16
mainmodule vecadd {
  ...
  mainchare Main {
    ...
  }
  // Encapsulate a Kokkos instance/process
  nodegroup Process {
    entry Process();
    entry void run();
  }
}
8 / 16
...
class Process : public CBase_Process {
public:
  Process() {
    kokkosInit(); // Calls Kokkos::initialize() internally
  }
  void run() {
    // Execute vector addition
    // Uses OpenMP by default, uses CUDA if use_gpu
    vecadd(n, CkMyNode(), use_gpu);
    kokkosFinalize(); // Calls Kokkos::finalize() internally
    // Contribute to Main to end the program
    ...
  }
}
9 / 16
#include <Kokkos_Core.hpp>
...
// Views
typedef Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> CudaView;
typedef Kokkos::View<double*, Kokkos::LayoutRight, Kokkos::CudaHostPinnedSpace> HostView;
// Functors
template <typename ViewType>
struct Compute {
  ViewType a, b;
  Compute(const ViewType& d_a, const ViewType& d_b) : a(d_a), b(d_b) {}
  KOKKOS_INLINE_FUNCTION
  void operator()(const int& i) const { a(i) += b(i); }
}
...
void vecadd(const uint64_t n, int process, bool use_gpu) {
  HostView h_a("Host A", n);
  CudaView d_a("Device A", n);
  CudaView d_b("Device B", n);
  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0, n), Compute<CudaView>(d_a, d_b));
  Kokkos::deep_copy(h_a, d_a);
}
10 / 16
mainmodule vecadd {
  ...
  mainchare Main {
    ...
  }
  // Encapsulate a RAJA instance/process
  nodegroup Process {
    entry Process();
    entry void run();
  }
}
11 / 16
...
class Process : public CBase_Process {
public:
  Process() {
    // No initialization/cleanup needed
  }
  void run() {
    // Execute vector addition
    // Uses OpenMP by default, uses CUDA if use_gpu
    vecadd(n, CkMyNode(), use_gpu);
    // Contribute to Main to end the program
    ...
  }
}
12 / 16
void vecadd(const uint64_t n, int process, bool use_gpu) {
  double *h_a, *d_a, *d_b;
  cudaErrchk(cudaMallocHost((void**)&h_a, n * sizeof(double)));
  cudaErrchk(cudaMalloc((void**)&d_a, n * sizeof(double)));
  cudaErrchk(cudaMalloc((void**)&d_b, n * sizeof(double)));
  RAJA::forall<RAJA::cuda_exec<256>>(RAJA::RangeSegment(0, n),
    [=] RAJA_DEVICE (int i) { d_a[i] += d_b[i]; });
  cudaErrchk(cudaMemcpy(h_a, d_a, n * sizeof(double), cudaMemcpyDeviceToHost));
}
13 / 16
14 / 16
▶ In NUMA environments, etc. ▶ Should be able to pin Charm++ processes to a set of cores
15 / 16
16 / 16