Programming Models and Runtime Systems for Heterogeneous Architectures
Sylvain Henry
sylvain.henry@inria.fr
Advisors: Denis Barthou and Alexandre Denis
November 14, 2013
1
Programming Models and Runtime Systems for Heterogeneous Architectures — PowerPoint PPT Presentation
Programming Models and Runtime Systems for Heterogeneous Architectures Sylvain Henry sylvain.henry@inria.fr Advisors: Denis Barthou and Alexandre Denis November 14, 2013 1 High-Performance Computing Sources: Dassault aviation, BMW,
sylvain.henry@inria.fr
Advisors: Denis Barthou and Alexandre Denis
1
2
Sources: Dassault aviation, BMW, Larousse, Interstices
3
Parallel architectures
exponentially increases power consumption
processor speeds
3
Parallel architectures
exponentially increases power consumption
processor speeds
3
Parallel architectures
exponentially increases power consumption
processor speeds
4
Specialized parallel architectures
4
Specialized parallel architectures
4
Specialized parallel architectures
4
Specialized parallel architectures
5
5
6
... ... ...
Memory Memory Memory Memory Memory
6
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
cuda
Memory PU
mic
Memory Memory PU
OpenCL
PU
mic
PU
mic
...
7
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
cuda
Memory PU
mic
Memory Memory PU
OpenCL
PU
mic
PU
mic
...
7
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
cuda
Memory PU
mic
Memory Memory PU
OpenCL
PU
mic
PU
mic
...
7
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
cuda
Memory PU
mic
Memory Memory PU
OpenCL
PU
mic
PU
mic
... B A
7
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
cuda
Memory PU
mic
Memory Memory PU
OpenCL
PU
mic
PU
mic
... B A
7
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
cuda
Memory PU
mic
Memory Memory PU
OpenCL
PU
mic
PU
mic
... B A
7
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
cuda
Memory PU
mic
Memory Memory PU
OpenCL
PU
mic
PU
mic
... B A A B
7
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
cuda
Memory PU
mic
Memory Memory PU
OpenCL
PU
mic
PU
mic
... B A A B
7
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
cuda
Memory PU
mic
Memory Memory PU
OpenCL
PU
mic
PU
mic
... B A A B C
7
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
cuda
Memory PU
mic
Memory Memory PU
OpenCL
PU
mic
PU
mic
... B A A B C
7
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
cuda
Memory PU
mic
Memory Memory PU
OpenCL
PU
mic
PU
mic
... B A A B C C
7
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
cuda
Memory PU
mic
Memory Memory PU
OpenCL
PU
mic
PU
mic
... B A C
8
Low-level approach (e.g. OpenCL, CUDA. . . )
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
gpu
Memory PU
gpu
Memory PU
acc
Memory Device 1 Device 2 Device 3 Context 1 Context 2
8
Low-level approach (e.g. OpenCL, CUDA. . . )
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
gpu
Memory PU
gpu
Memory PU
acc
Memory Device 1 Device 2 Device 3 Context 1 Context 2
Per device command queues Command submission
8
Low-level approach (e.g. OpenCL, CUDA. . . )
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
gpu
Memory PU
gpu
Memory PU
acc
Memory Device 1 Device 2 Device 3 Context 1 Context 2
Per device command queues Command submission
Host
Device 1 Tr Tr Tr K
... ...
Device N
...
OpenCL Callback
Command graph
9
C ← A + B
float A[256], B[256], C[256]; clGetPlatformIDs(&platforms ...); clGetDeviceIDs(platforms [0], &devices ...); cl_context context = clCreateContext(devices ...); cl_command_queue cq = clCreateCommandQueue(context, devices[0]...); cl_mem bufA = clCreateBuffer(context, 1024...); cl_mem bufB = clCreateBuffer(context, 1024...); cl_mem bufC = clCreateBuffer(context, 1024...); clEnqueueWriteBuffer(cq, bufA, 0, 1024, A, NULL, &event1...); clEnqueueWriteBuffer(cq, bufB, 0, 1024, B, NULL, &event2...); clSetKernelArg(kernelAdd, 0, sizeof (cl_mem), &bufA); clSetKernelArg(kernelAdd, 1, sizeof (cl_mem), &bufB); clSetKernelArg(kernelAdd, 2, sizeof (cl_mem), &bufC); cl_event deps[] = {event1,event2}; clEnqueueNDRangeKernel(cq, kernelAdd, deps, &event3...); clEnqueueReadBuffer(cq, bufC, 0, 1024, C, &event3, &event4); clWaitForEvents(event4); clReleaseMemObject(bufA); clReleaseMemObject(bufB); clReleaseMemObject(bufC);
10
∗
*argv) { cl_context cxGPUContext; cl_device_id cdDevice; int deviceNr[MAX_GPU_COUNT]; cl_command_queue commandQueue[MAX_GPU_COUNT]; cl_mem d_Data[MAX_GPU_COUNT]; cl_mem d_Result[MAX_GPU_COUNT]; cl_program cpProgram; cl_kernel reduceKernel[MAX_GPU_COUNT]; cl_event GPUDone[MAX_GPU_COUNT]; cl_event GPUExecution[MAX_GPU_COUNT]; cl_uint ciDeviceCount = 0; size_t programLength; cl_int ciErrNum; char cDeviceName[256]; cl_mem h_DataBuffer; float h_SumGPU[MAX_GPU_COUNT*ACCUM_N]; float *h_Data; double sumGPU; double sumCPU, dRelError; h_Data = (float *) malloc(DATA_N * sizeof(float)); shrFillArray(h_Data, DATA_N); cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &ciErrNum); if (shrCheckCmdLineFlag(argc, argv, "device")) { // User specified GPUs char *deviceList;
char *deviceStr;
char *next_token;
// Create command queues for all requested GPUs while (deviceStr != NULL) { // get & log device index # and name deviceNr[ciDeviceCount] = atoi(deviceStr); cdDevice = oclGetDev(cxGPUContext, deviceNr[ciDeviceCount]); ciErrNum = clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cDeviceName), cDeviceName, NULL); shrCheckError(ciErrNum, CL_SUCCESS); // create a command queue commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); #ifdef GPU_PROFILING ciErrNum = clSetCommandQueueProperty(commandQueue[ciDeviceCount], CL_QUEUE_PROFILING_ENABLE, CL_TRUE, NULL); shrCheckError(ciErrNum, CL_SUCCESS); #endif ++ciDeviceCount; deviceStr = strtok(NULL, " ,.-"); } free(deviceList); } else { // Find ... char *source_path = shrFindFilePath("simpleMultiGPU.cl",
argv[0]); char *source = oclLoadProgSource(source_path, "", &programLength); shrCheckError(source != NULL, shrTRUE); // Create the program for all GPUs in the context cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **
) &source, &programLength, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-mad-enable", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write ...
11
(for each architecture)
12
13
14
15
16
17
18
18
19
Vendor A OpenCL Vendor Z OpenCL SOCL Installable Client Driver (libOpenCL) Application Installable Client Driver (libOpenCL)
Vendor A OpenCL Vendor Z OpenCL SOCL GPU GPU MIC
GPU GPU MIC
...
20
PU
cpu
PU
cpu
PU
cpu
PU
cpu
PU
gpu
PU
gpu
PU
acc
Memory Memory Memory Memory
20
PU
cpu
PU
cpu
PU
cpu
PU
cpu
PU
gpu
PU
gpu
PU
acc
A B
20
PU
cpu
PU
cpu
PU
cpu
PU
cpu
PU
gpu
PU
gpu
PU
acc
A B C A B r w r
20
PU
cpu
PU
cpu
PU
cpu
PU
cpu
PU
gpu
PU
gpu
PU
acc
A B C A B C C rw
20
PU
cpu
PU
cpu
PU
cpu
PU
cpu
PU
gpu
PU
gpu
PU
acc
A B C A B C C
20
PU
cpu
PU
cpu
PU
cpu
PU
cpu
PU
gpu
PU
gpu
PU
acc
A B C A B C C
21
float A[256], B[256], C[256]; clGetPlatformIDs(&platforms ...); clGetDeviceIDs(platforms [0], &devices ...); cl_context context = clCreateContext(devices ...); cl_command_queue cq1 = clCreateCommandQueue(context, devices[0]...); cl_command_queue cq2 = clCreateCommandQueue(context, devices[1]...); cl_mem bufA = clCreateBuffer(context, 1024...); cl_mem bufB = clCreateBuffer(context, 1024...); cl_mem bufC = clCreateBuffer(context, 1024...); cl_mem bufC2 = clCreateBuffer(context, 1024...); clEnqueueWriteBuffer(cq1, bufA, 0, 1024, A, NULL, &event1...); clEnqueueWriteBuffer(cq1, bufB, 0, 1024, B, NULL, &event2...); clSetKernelArg(kernelAdd, 0, sizeof (cl_mem), &bufA); clSetKernelArg(kernelAdd, 1, sizeof (cl_mem), &bufB); clSetKernelArg(kernelAdd, 2, sizeof (cl_mem), &bufC); cl_event deps[] = {event1,event2}; clEnqueueNDRangeKernel(cq1, kernelAdd, deps, &event3...); clEnqueueReadBuffer(cq1, bufC, 0, 1024, C, &event3, &event4); clEnqueueWriteBuffer(cq2, bufC2, 0, 1024, C, &event3, &event5 ...); clSetKernelArg( kernelPotrf , 0, sizeof (cl_mem), &bufC2); clEnqueueNDRangeKernel(cq2, kernelPotrf, &event5, &event6...); clWaitForEvents(event6); clReleaseMemObject(bufA); clReleaseMemObject(bufB); clReleaseMemObject(bufC); clReleaseMemObject(bufC2);
22
float A[256], B[256], C[256]; clGetPlatformIDs(&platforms ...); clGetDeviceIDs(platforms [0], &devices ...); cl_context context = clCreateContext(devices ...); cl_command_queue cq1 = clCreateCommandQueue(context, devices[0]...); cl_command_queue cq2 = clCreateCommandQueue(context, devices[1]...); cl_mem bufA = clCreateBuffer(context, 1024, CL_MEM_USE_HOST_PTR, A...); cl_mem bufB = clCreateBuffer(context, 1024, CL_MEM_USE_HOST_PTR, B...); cl_mem bufC = clCreateBuffer(context, 1024...); clSetKernelArg(kernelAdd, 0, sizeof (cl_mem), &bufA); clSetKernelArg(kernelAdd, 1, sizeof (cl_mem), &bufB); clSetKernelArg(kernelAdd, 2, sizeof (cl_mem), &bufC); clEnqueueNDRangeKernel(cq1, kernelAdd, NULL, &event1 ...); clSetKernelArg( kernelPotrf , 0, sizeof (cl_mem), &bufC); clEnqueueNDRangeKernel(cq2, kernelPotrf, &event1 , &event2 ...); clWaitForEvents( event2 ); clReleaseMemObject(bufA); clReleaseMemObject(bufB); clReleaseMemObject(bufC);
23
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
gpu
Memory PU
gpu
Memory PU
acc
Memory Device 1 Device 2 Device 3 Context 2
Per device command queues
Context 1
23
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
gpu
Memory PU
gpu
Memory PU
acc
Memory Device 1 Device 2 Device 3 Context 2
Per context command queues
Scheduling Context 1
23
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
gpu
Memory PU
gpu
Memory PU
acc
Memory Device 1 Device 2 Device 3 Context 2
Per context command queues
Scheduling Context 1
23
PU
cpu
PU
cpu
PU
cpu
PU
cpu
Memory PU
gpu
Memory PU
gpu
Memory PU
acc
Memory Device 1 Device 2 Device 3 Context 2
Per context command queues
Scheduling Context 1
24
float A[256], B[256], C[256]; clGetPlatformIDs(&platforms ...); clGetDeviceIDs(platforms [0], &devices ...); cl_context context = clCreateContext(devices ...); cl_command_queue cq1 = clCreateCommandQueue(context, devices[0]...); cl_command_queue cq2 = clCreateCommandQueue(context, devices[1]...); cl_mem bufA = clCreateBuffer(context, 1024, CL_MEM_USE_HOST_PTR, A...); cl_mem bufB = clCreateBuffer(context, 1024, CL_MEM_USE_HOST_PTR, B...); cl_mem bufC = clCreateBuffer(context, 1024...); clSetKernelArg(kernelAdd, 0, sizeof (cl_mem), &bufA); clSetKernelArg(kernelAdd, 1, sizeof (cl_mem), &bufB); clSetKernelArg(kernelAdd, 2, sizeof (cl_mem), &bufC); clEnqueueNDRangeKernel(cq1, kernelAdd, NULL, &event1...); clSetKernelArg( kernelPotrf , 0, sizeof (cl_mem), &bufC); clEnqueueNDRangeKernel(cq2, kernelPotrf, &event1, &event2...); clWaitForEvents(event2); clReleaseMemObject(bufA); clReleaseMemObject(bufB); clReleaseMemObject(bufC);
25
float A[256], B[256], C[256]; clGetPlatformIDs(&platforms ...); clGetDeviceIDs(platforms [0], &devices ...); cl_context context = clCreateContext(devices ...); cl_command_queue cq = clCreateCommandQueue(context, NULL...); cl_mem bufA = clCreateBuffer(context, 1024, CL_MEM_USE_HOST_PTR, A...); cl_mem bufB = clCreateBuffer(context, 1024, CL_MEM_USE_HOST_PTR, B...); cl_mem bufC = clCreateBuffer(context, 1024...); clSetKernelArg(kernelAdd, 0, sizeof (cl_mem), &bufA); clSetKernelArg(kernelAdd, 1, sizeof (cl_mem), &bufB); clSetKernelArg(kernelAdd, 2, sizeof (cl_mem), &bufC); clEnqueueNDRangeKernel(cq, kernelAdd, NULL, &event1...); clSetKernelArg( kernelPotrf , 0, sizeof (cl_mem), &bufC); clEnqueueNDRangeKernel(cq, kernelPotrf , &event1, &event2 ...); clWaitForEvents(event2); clReleaseMemObject(bufA); clReleaseMemObject(bufB); clReleaseMemObject(bufC);
26
LuxRender (rendering software)
CPU CPU 1 GPU CPU 2 GPUs 2 GPUs 1 GPU Nvidia OpenCL Intel SDK SOCL
Samples/sec (average in millions) 0.0 0.5 1.0 1.5 2.0 2.5
Averell1: Intel Xeon E5-2650 2.00GHz with 64GB, 2 NVidia Tesla M2075
27
Black Scholes - blocks of 5M options
10 25 50 100 150 200 250
Intel NVidia SOCL
# Blocks GOptions/s 0.0 0.2 0.4 0.6
Hannibal: Intel Xeon X5550 2.67GHz with 24GB, 3 NVidia Quadro FX 5800
28
29
NBody (OTOO) - 20 iterations - 4000k particles
2 4 8 16 32 64 Adaptive SOCL (heft)
# Blocks Speed−up compared to one kernel 0.0 0.5 1.0 1.5 2.0 2.5
Hannibal: Intel Xeon X5550 2.67GHz with 24GB, 3 NVidia Quadro FX 5800
30
31
(2012)
Support - Inria Research Report (2013)
32
33
33
34
f a b @ f @ a b
d = c + c c = f a b @ f @ a b c: @ + @ c c d: @ + @ d: @ f @ a b
34
f a b @ f @ a b
d = c + c c = f a b @ f @ a b c: @ + @ c c d: @ + @ d: @ f @ a b
35
@ f4 @ result: @ f3 @ c @ f2 @ b a @ f1
f1 f2 f3 f4
Shared-Object Memory
w
T ask Graph a b c
r
t1 t1:
r r w r
t2 t3 result t2: t3:
r w w r r
result = f4 (f3 t1 c) (f2 t1 b)   t1 = f1 a
35
@ f4 @ result: @ f3 @ c @ f2 @ b a @ f1
f1 f2 f3 f4
Shared-Object Memory
w
T ask Graph a b c
r
t1 t1:
r r w r
t2 t3 result t2: t3:
r w w r r
result = f4 (f3 t1 c) (f2 t1 b)   t1 = f1 a
35
@ f4 @ result: @ f3 @ c @ f2 @ b a @ f1
f1 f2 f3 f4
Shared-Object Memory
w
T ask Graph a b c
r
t1 t1:
r r w r
t2 t3 result t2: t3:
r w w r r
result = f4 (f3 t1 c) (f2 t1 b)   t1 = f1 a
36
@ + @ x f: @ * @ x y @ * @ y λ.y λ.x
f x y = (x ∗ x) + (y ∗ y)
36
@ + @ x f: @ * @ x y @ * @ y λ.y λ.x
f x y = (x ∗ x) + (y ∗ y)
if x == 0 then f y else g z @ if @ x @ == @ @ g @ z f @ y
36
@ + @ x f: @ * @ x y @ * @ y λ.y λ.x
f x y = (x ∗ x) + (y ∗ y)
if x == 0 then f y else g z @ if @ x @ == @ @ g @ z f @ y
while test f x = if test x then (while test f (f x)) else x
36
@ + @ x f: @ * @ x y @ * @ y λ.y λ.x
f x y = (x ∗ x) + (y ∗ y)
if x == 0 then f y else g z @ if @ x @ == @ @ g @ z f @ y
while test f x = if test x then (while test f (f x)) else x
37
38
Tiled matrix addition
addTiled a b = unsplit w h (zipWith2D (+) (split w h a) (split w h b))
split 2 3 split 2 3 a b a00 a01 a02 a12 a11 a10 b00 b10 b11 b01 b02 b12 a00+b00 a01+b01 a02+b02a12+b12 a11+b11 a10+b10 unsplit 2 3 zipWith2D (+)
39
"+" OpenCL kernel CPU kernel CUDA kernel
addTiled a b = unsplit w h (zipWith2D (+) (split w h a) (split w h b))
39
"+" OpenCL kernel CPU kernel CUDA kernel "addTiled"
addTiled a b = unsplit w h (zipWith2D (+) (split w h a) (split w h b))
39
"+" OpenCL kernel CPU kernel CUDA kernel "addTiled"
addTiled a b = unsplit w h (zipWith2D (+) (split w h a) (split w h b))
40
r = a + b + c r = (unsplit w h (zipWith2D (+) (split w h a) ( split w h b))) + c r = unsplit w h (zipWith2D (+) ( split w h ( unsplit w h (zipWith2D (+) (split w h a) ( split w h b)))) ( split w h c)) r = unsplit w h (zipWith2D (+) (zipWith2D (+) (split w h a) ( split w h b))) ( split w h c))
41
a b r square x = x * x r = let a' = square a b' = square b in (a'-b') * (a'+b')
Runtime System
Input data Output data Program "*" = {matrixMulOpenCL, matrixMulCUDA, matrixMulCPU...} "-" = {...} "+" = {...} Kernels
42
pf <- initPlatform $ Configuration { libraryOpenCL = "libOpenCL.so" }
rt <- initRuntime pf eagerScheduler
a <- initFloatMatrix rt [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
b <- initFloatMatrix rt [[1.0, 4.0, 7.0], [2.0, 5.0, 8.0], [3.0, 6.0, 9.0]]
builtins <- loadBuiltins rt [("+", floatMatrixAddBuiltin), ("-", floatMatrixSubBuiltin), ("*", floatMatrixMulBuiltin), ("a", dataBuiltin a), ("b", dataBuiltin b)]
prog <- readFile "example.vvm"
r <- eval builtins prog
printFloatMatrix rt r
-- File: example.vvm
square x = x * x
main = let a' = square a
           b' = square b
       in (a'-b') * (a'+b')
__kernel void floatMatrixAdd(uint width, uint height,
                             __global float *A, __global float *B, __global float *C) {
    int gx = get_global_id(0);
    int gy = get_global_id(1);
    if (gx < width && gy < height) {
        C[gy*width+gx] = A[gy*width+gx] + B[gy*width+gx];
    }
}
Configuration
Host code (mostly imperative)
Coordination
Parallel functional code
Computation
Kernels (C, Fortran, CUDA, OpenCL...)
43
Tiled matrix addition example
/* StarPU */
struct starpu_data_filter f = {
    .filter_func = starpu_matrix_filter_vertical_block,
    .nchildren = w
};
struct starpu_data_filter f2 = {
    .filter_func = starpu_matrix_filter_block,
    .nchildren = h
};
starpu_data_map_filters(a, 2, &f, &f2);
starpu_data_map_filters(b, 2, &f, &f2);
starpu_data_map_filters(c, 2, &f, &f2);
for (i = 0; i < nw; i++) {
    for (j = 0; j < nh; j++) {
        starpu_data_handle_t sa = starpu_data_get_sub_data(a, 2, i, j);
        starpu_data_handle_t sb = starpu_data_get_sub_data(b, 2, i, j);
        starpu_data_handle_t sc = starpu_data_get_sub_data(c, 2, i, j);
        starpu_insert_task(&add, STARPU_R, sa, STARPU_R, sb, STARPU_W, sc, 0);
    }
}
starpu_task_wait_for_all();
starpu_data_unpartition(c, 0);
-- ViperVM: explicit
c = unsplit (zipWith2D (+) (split w h a) (split w h b))
-- ViperVM: with automatic granularity adaptation
c = a + b
44
Dimensions ViperVM ViperVM StarPU 3 GPUs+CPU 3 GPUs 3 GPUs 16K x 16K 1.9s 2.1s 1.4s 24K x 24K 4.0s 4.4s 2.9s
w x h 1024x1024 4096x1024 1024x4096 GPU (1x) 4.5s 4.4s 4.3s GPU (2x) 3.6s 2.9s 3.2s GPU (3x) 3.1s 2.5s 3.3s CPU 31s 36s 35s GPU (3x) + CPU 3.3s 3.7s 10s
45
46
Conclusion
Computing on Heterogeneous Architectures - FHPC workshop (2013)
47
48