PROGRAMMING AND SIMULATING HETEROGENEOUS DEVICES - OPENCL AND MULTI2SIM
Rafael Ubal, Dana Schaa, Perhaad Mistry, David Kaeli Department of Electrical and Computer Engineering Northeastern University Boston, MA
PROGRAMMING AND SIMULATING HETEROGENEOUS DEVICES - OPENCL AND - - PowerPoint PPT Presentation
PROGRAMMING AND SIMULATING HETEROGENEOUS DEVICES - OPENCL AND MULTI2SIM Rafael Ubal, Dana Schaa, Perhaad Mistry, David Kaeli Department of Electrical and Computer Engineering Northeastern University Boston, MA ICPE 2012 Boston, MA AGENDA
Rafael Ubal, Dana Schaa, Perhaad Mistry, David Kaeli Department of Electrical and Computer Engineering Northeastern University Boston, MA
2 | ICPE Tutorial | April 2012
3 | ICPE Tutorial | April 2012
Multiple cores driving performance increases
Increasingly general purpose data-parallel computing Improving numerical precision
4 | ICPE Tutorial | April 2012
5 | ICPE Tutorial | April 2012
6 | ICPE Tutorial | April 2012
Get Platform Information Get Device Information
7 | ICPE Tutorial | April 2012
8 | ICPE Tutorial | April 2012
9 | ICPE Tutorial | April 2012
10 | ICPE Tutorial | April 2012
11 | ICPE Tutorial | April 2012
12 | ICPE Tutorial | April 2012
13 | ICPE Tutorial | April 2012
14 | ICPE Tutorial | April 2012
15 | ICPE Tutorial | April 2012
16 | ICPE Tutorial | April 2012
17 | ICPE Tutorial | April 2012
18 | ICPE Tutorial | April 2012
19 | ICPE Tutorial | April 2012
20 | ICPE Tutorial | April 2012
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1 4 1 5 1 2 1 3 1 1 1 8 9 6 7 4 5 2 3 1 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 10 10 11 11 1 1 12 12 2 13 3 3 14 4 4 15 5 5
21 | ICPE Tutorial | April 2012
22 | ICPE Tutorial | April 2012
Memory Description Global Accessible by all work-items Constant Read-only, global Local Local to a work-group Private Private to a work-item
23 | ICPE Tutorial | April 2012
24 | ICPE Tutorial | April 2012
25 | ICPE Tutorial | April 2012
Kernels args are set * Step 8
26 | ICPE Tutorial | April 2012
Context
27 | ICPE Tutorial | April 2012
Context
28 | ICPE Tutorial | April 2012
29 | ICPE Tutorial | April 2012
x2 = cos(θ) * (x1) − sin(θ) * (y1) y2 = sin(θ) * (x1) + cos(θ) * (y1)
30 | ICPE Tutorial | April 2012
31 | ICPE Tutorial | April 2012
32 | ICPE Tutorial | April 2012
__kernel void image_rotate( __global float * src_data, __global float * dest_data, //Data in global memory int W, int H, //Image Dimensions float sinTheta, float cosTheta ) //Rotation Parameters { //Thread gets its index within index space const int ix = get_global_id(0); const int iy = get_global_id(1); //Calculate location of data to move into ix and iy – Output decomposition as mentioned float xpos = ( ((float) ix)*cosTheta + ((float)iy )*sinTheta); float ypos = ( ((float) iy)*cosTheta - ((float)ix)*sinTheta); if (( ((int)xpos>=0) && ((int)xpos< W))) //Bound Checking && (((int)ypos>=0) && ((int)ypos< H))) { //Read (xpos,ypos) src_data and store at (ix,iy) in dest_data dest_data[iy*W+ix] = src_data[(int)(floor(ypos*W+xpos))]; } }
33 | ICPE Tutorial | April 2012
34 | ICPE Tutorial | April 2012
35 | ICPE Tutorial | April 2012
36 | ICPE Tutorial | April 2012
37 | ICPE Tutorial | April 2012
38 | ICPE Tutorial | April 2012
clGetEventProfilingInfo( event_time, CL_PROFILING_COMMAND_START, sizeof(cl_ulong),&starttime, NULL); clGetEventProfilingInfo(event_time, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endtime, NULL); unsigned long elapsed = (unsigned long)(endtime - starttime); cl_event event_timer; clEnqueueNDRangeKernel(myqueue , myKernel,2, 0, globalws, localws, 0, NULL, &event_timer); unsigned long starttime, endtime;
39 | ICPE Tutorial | April 2012
1 2 3 4 1 5 2 6 3 7 4 8 5 9 6 10 7 11 8 12 9 13 10 14 11 15
int tid = get_global_id(1) * get_global_size(0) + get_global_id(0); 4 8 12 1 4 5 8 9 12 13 1 2 5 6 9 10 13 14 2 3 6 7 10 11 14 15 int tid = get_global_id(0) * get_global_size(1) + get_global_id(1); 1 4 5 2 1 3 4 6 5 7 8 9 12 13 8 10 9 11 12 14 13 15 int group_size = get_local_size(0) * get_local_size(1); int tid = get_group_id(1) * get_num_groups(0) * group_size + get_group_id(0) * group_size + get_local_id(1) * get_local_size(0) + get_local_id(0) *assuming 2x2 groups
40 | ICPE Tutorial | April 2012
41 | ICPE Tutorial | April 2012
42 | ICPE Tutorial | April 2012
43 | ICPE Tutorial | April 2012
44 | ICPE Tutorial | April 2012
F = G * mi * m j || r
ij ||2
* r
ij
|| r
ij ||
F = Resultant Force Vector between particles i and j G = Gravitational Constant mi = Mass of particle i m j = Mass of particle j rij = Distance of particle i and j For each particle this becomes Fi = (G * mi) * m j || r
ij ||2 *
r
ij
|| r
ij ||
j =1→ N
for(i=0; i<n; i++) { ax = ay = az = 0; // Loop over all particles "j” for (j=0; j<n; j++) { //Calculate Displacement dx=x[j]-x[i]; dy=y[j]-y[i]; dz=z[j]-z[i]; // small eps is delta added for dx,dy,dz = 0 invr= 1.0/sqrt(dx*dx+dy*dy+dz*dz +eps); invr3 = invr*invr*invr; f=m[ j ]*invr3; // Accumulate acceleration ax += f*dx; ay += f*dy; az += f*dx; } // Use ax, ay, az to update particle positions }
45 | ICPE Tutorial | April 2012
46 | ICPE Tutorial | April 2012
47 | ICPE Tutorial | April 2012
__kernel void nbody( __global float4 * initial_pos, __global float4 * final_pos, Int N, __local float4 * result) { int localid = get_local_id(0); int globalid = get_global_id(0); result [localid] = 0; for( int i=0 ; i<N;i++) { //! Calculate interaction between //! particle globalid and particle i GetForce( globalid, i, initial_pos, final_pos, &result [localid]) ; } finalpos[ globalid] = result[ localid]; }
48 | ICPE Tutorial | April 2012
49 | ICPE Tutorial | April 2012
for (int i = 0; i < numTiles; ++i) { // load one tile into local memory int idx = i * localSize + tid; localPos[tid] = pos[idx]; barrier(CLK_LOCAL_MEM_FENCE); // calculate acceleration effect due to each body for( int j = 0; j < localSize; ++j ) { // Calculate acceleration caused by particle j on i float4 r = localPos[j] – myPos; float distSqr = r.x * r.x + r.y * r.y + r.z * r.z; float invDist = 1.0f / sqrt(distSqr + epsSqr); float s = localPos[j].w * invDistCube; // accumulate effect of all particles acc += s * r; } // Synchronize so that next tile can be loaded barrier(CLK_LOCAL_MEM_FENCE); } }
50 | ICPE Tutorial | April 2012
51 | ICPE Tutorial | April 2012
52 | ICPE Tutorial | April 2012
53 | ICPE Tutorial | April 2012
20 40 60 80 100 120 140 160 180 8k 16k 32k Kernel Time (ms) No of Particles
Nvidia - GPU AMD - GPU Nvidia - GPU - U2 AMD - GPU - U2 Nvidia - GPU - U4 AMD - GPU - U4 U# in legend denotes unroll factor
54 | ICPE Tutorial | April 2012
int main(void) { try { cl::Context context (CL_DEVICE_TYPE_GPU, 0, NULL, NULL, &err); cl::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); cl::Program::Sources source(1, std::make_pair(helloStr,strlen(helloStr))); cl::Program program_ = cl::Program(context, source); program_.build(devices); cl::Kernel kernel(program_, "hello", &err); cl::CommandQueue queue(context, devices[0], 0, &err); cl::KernelFunctor func = kernel.bind(queue, cl::NDRange(4, 4), cl::NDRange(2, 2)); func().wait(); } catch (cl::Error err) { std::cerr << "ERROR: " << err.what() << "(" << err.err() << ")“ << std::endl; } return EXIT }
55 | ICPE Tutorial | April 2012
56 | ICPE Tutorial | April 2012
57 | ICPE Tutorial | April 2012
58 | ICPE Tutorial | April 2012
0) Object starts on device 0 1) clEnqueueRead*(cq0, ...) copies object to host 3) clEnqueueWrite*(cq1, ...) copies object to device 1 2) Object now valid on host 4) Object ends up on device 1 .) ) 2) Object now valid
59 | ICPE Tutorial | April 2012
60 | ICPE Tutorial | April 2012
1 2 3
4 5 6 7 1 2 3
1 2
3 4 5 6 7
1
2 3
1 2 3
1
2 3
61 | ICPE Tutorial | April 2012
Note that for this technique to work, any objects that are written to will have to be synchronized manually
This allows reading/writing to offsets within a buffer to avoid manually splitting and recombining data
62 | ICPE Tutorial | April 2012
63 | ICPE Tutorial | April 2012
rial | April 2012
64 | ICPE Tutorial | April 2012
MEMORY SYSTEM ON FUSION APUS - Pierre Boudier & Graham Sellers. AMD Fusion Developer Summit 2011
65 | ICPE Tutorial | April 2012
GPU / CPU access to uncached system memory GPU access to cached system memory
66 | ICPE Tutorial | April 2012
67 | ICPE Tutorial | April 2012
CPUs GPUs Overhead Low High (depending on data) Performance Variable High
68 | ICPE Tutorial | April 2012
– MEMORY SYSTEM ON FUSION APUS - Pierre Boudier & Graham Sellers. AMD Fusion Developer Summit 2011
Perhaad Mistry pmistry@ece.neu.edu