important class of applications one of the motifs
play

Important class of applications (one of the motifs) - PDF document

• Important class of applications (one of the motifs) • Basis for approximating derivatives numerically • Physical simulations (e.g. turbulence flow, seismic wave propagation) • Multimedia applications (e.g. image smoothing)


  1. • Important class of applications (one of the motifs) • Basis for approximating derivatives numerically • Physical simulations (e.g. turbulence flow, seismic wave propagation) • Multimedia applications (e.g. image smoothing) • Nearest neighbor update on a structured grid 3D Heat Eqn using fully explicit finite differencing: Didem Unat, Xing Cai, Scott B. Baden dunat@lbl.gov U’(x,y,z) = c0*U(x,y,z) + c1*(U(x,y,z-1) + U(x,y,z+1) + U(x,y-1,z) + U(x,y+1,z) + U(x-1,y,z) + U(x+1,y,z)) Lawrence Berkeley National Laboratory Simula Research Laboratory • Highly data parallel, memory bandwidth bound University of California, San Diego › GPU speedups over multicore (8 cores) – 5X for Lattice Boltzmann [Lee, ISCA’11], Oslo, Jun 07, 2012 – 4X Reverse Time Migration [Kruger, SC’11] 4 HPC milestones • Heterogeneity in compute resources Power (MW) 1 Exaflop/s • Explicit management of data transfer 20.0 › Separate device memory from the host memory 1 Petaflop/s 2.35 • Reengineering of scientific applications 0.85 1 Teraflop/s › Algorithmic changes to match the hardware capabilities 1 Gigaflop/s 0.20 › Best performance requires non-trivial knowledge of the architecture 1985 1996 2008 2018 host accelerator Device Memory • Power consumption is the main design constraint Main Memory • Drastic changes in node architecture [Shalf, VecPar’10] On-chip On-chip Memory Mem ory L2 L2 • More parallelism on the chip Vecto tor core core core core • Software-managed memory / incoherent caches C Cores ores • Already started seeing concrete instances bus bu 2 5 • Graphics Processing Units (GPUs) • Explicitly managed memory Device Memory › Massively parallel single chip processor › On-chip memory resources › Low power cores: trade off single thread performance › Private and incoherent › Large register file and software-managed memory › e.g. __shared__ float A[N]; • 
Effective in accelerating certain data parallel applications › Case Study: Cardiac Electrophysiology [Unat, PARA’10] • Hierarchical thread management › Not ideal for others: sorting [Lee, ISCA’10] › Thread, thread groups, thread subgroups › Granularity of a thread Shared Memory/L1 cache host accelerator Device Memory Register File Main Memory • Domain-specific optimizations On-chip On-chip • Limits the adoption in scientific computing Memory Mem ory L2 L2 Vecto tor core core core core C Cores ores We need programming models to master the new technology and make it accessible to computational scientists. bu bus 3 6

  2. • Aims programmer’s productivity and high performance • Source-to-source translator for the Nvidia GPUs • Simplifies application development › Parallelizes loop nests • Based on a modest number of compiler directives › Relieves the programmer of a variety of tedious tasks › #pragma mint for › Incremental parallelization • Abstracts away the programmer’s view of the hardware Mint C + directives CUDA Seismic Modeling Cardiac Simulation • Motif-specific auto-optimizer Turbulent Flow › Targets stencil methods › Incorporates semantic knowledge to compiler analysis Device Memory Main Memory › Performs data locality optimizations via on-chip memory Mint › Compiler flags for performance tuning L2 L2 core core core core 7 10 Serial code Accelerated Region Data parallel for Host Region Data parallel for Host Thread 8 11 Serial code Accelerated Region • #pragma mint parallel Accelerated Region kernel › Indicates the accelerated region Data parallel for • #pragma mint for …… › Marks enclosed loop-nest for acceleration Block Block Device Memory › 3 additional clauses for optimizations Host Region • #pragma mint copy Data Transfer › Expresses data transfers between the host and device Data parallel for • #pragma mint single › Handles serial section …… Block Block • #pragma mint barrier Block › Synchronizes host and device threads Host Synchronization Thread 9 12

  3. #pragma mint copy(U, toDevice, (n+2),(m+2),(k+2)) #pragma mint copy(U,toDevice,(n+2),(m+2),(k+2)) • Performance tuning parameters #pragma mint copy(Unew, toDevice, (n+2),(m+2),(k+2)) #pragma mint copy(Unew,toDevice,(n+2),(m+2),(k+2)) Device Memory • High-level interface to low-level #pragma mint parallel Data hardware specific optimizations { Transfers while( t++ < T ){ #pragma mint for nest(all) tile(16,16,64) chunksize(1,1,64) for (int z=1; z<= k; z++) For-loop clauses 1. for (int y=1; y<= m; y++) › handle data decomposition and thread for (int x=1; x<= n; x++) Unew[z][y][x] = c0 * U[z][y][x] + management c1 * (U[z][y][x-1] + U[z][y][x+1] + › nest(), tile(), chunksize() Shared Memory/L1 cache U[z][y-1][x] + U[z][y+1][x] + 2. Compiler flags for data locality U[z-1][y][x] + U[z+1][y][x]); double*** tmp; Register File › Register: -register tmp = U; U = Unew; Unew = tmp; › Software-managed memory: -shared }//end of while › Cache: -preferL1 }//end of parallel region #pragma mint copy(U, fromDevice, (n+2),(m+2),(k+2)) #pragma mint copy(U,fromDevice,(n+2),(m+2),(k+2)) 13 16 #pragma mint copy(U,toDevice,(n+2),(m+2),(k+2)) #pragma mint copy(U,toDevice,(n+2),(m+2),(k+2)) #pragma mint copy(Unew,toDevice,(n+2),(m+2),(k+2)) #pragma mint copy(Unew,toDevice,(n+2),(m+2),(k+2)) Mint Program for #pragma mint parallel #pragma mint parallel the 3D Heat Eqn. { { while( t++ < T ){ while( t++ < T ){ #pragma mint for 
#pragma mint for nest(all) tile(16,16,64) chunksize(1,1,64) #pragma mint for nest(all) tile(16,16,64) chunksize(1,1,64) for (int z=1; z<= k; z++) for (int z=1; z<= k; z++) for (int y=1; y<= m; y++) for (int y=1; y<= m; y++) for (int x=1; x<= n; x++) for (int x=1; x<= n; x++) Unew[z][y][x] = c0 * U[z][y][x] + Unew[z][y][x] = c0 * U[z][y][x] + c1 * (U[z][y][x-1] + U[z][y][x+1] + c1 * (U[z][y][x-1] + U[z][y][x+1] + U[z][y-1][x] + U[z][y+1][x] + U[z][y-1][x] + U[z][y+1][x] + U[z-1][y][x] + U[z+1][y][x]); U[z-1][y][x] + U[z+1][y][x]); double*** tmp; double*** tmp; tmp = U; U = Unew; Unew = tmp; tmp = U; U = Unew; Unew = tmp; Data parallel for loop }//end of while }//end of while }//end of parallel region }//end of parallel region #pragma mint copy(U,fromDevice,(n+2),(m+2),(k+2)) #pragma mint copy(U,fromDevice,(n+2),(m+2),(k+2)) 14 17 Accelerated #pragma mint copy(U,toDevice,(n+2),(m+2),(k+2)) Region #pragma mint copy(U,toDevice,(n+2),(m+2),(k+2)) depth of loop #pragma mint copy(Unew,toDevice,(n+2),(m+2),(k+2)) #pragma mint copy(Unew,toDevice,(n+2),(m+2),(k+2)) parallelism #pragma mint parallel #pragma mint parallel #pragma mint parallel { { while( t++ < T ){ while( t++ < T ){ #pragma mint for nest(all) 
#pragma mint for nest(all) tile(16,16,64) chunksize(1,1,64) #pragma mint for nest(all) tile(16,16,64) chunksize(1,1,64) for (int z=1; z<= k; z++) for (int z=1; z<= k; z++) for (int y=1; y<= m; y++) for (int y=1; y<= m; y++) for (int x=1; x<= n; x++) for (int x=1; x<= n; x++) Unew[z][y][x] = c0 * U[z][y][x] + Unew[z][y][x] = c0 * U[z][y][x] + c1 * (U[z][y][x-1] + U[z][y][x+1] + c1 * (U[z][y][x-1] + U[z][y][x+1] + U[z][y-1][x] + U[z][y+1][x] + U[z][y-1][x] + U[z][y+1][x] + U[z-1][y][x] + U[z+1][y][x]); U[z-1][y][x] + U[z+1][y][x]); double*** tmp; double*** tmp; tmp = U; U = Unew; Unew = tmp; tmp = U; U = Unew; Unew = tmp; }//end of while }//end of while }//end of parallel region }//end of parallel region #pragma mint copy(U,fromDevice,(n+2),(m+2),(k+2)) #pragma mint copy(U,fromDevice,(n+2),(m+2),(k+2)) 15 18

Download Presentation
Download Policy: The content available on the website is offered to you 'AS IS' for your personal information and use only. It cannot be commercialized, licensed, or distributed on other websites without prior consent from the author. To download a presentation, simply click this link. If you encounter any difficulties during the download process, it's possible that the publisher has removed the file from their server.

Recommend


More recommend