Important class of applications (one of the motifs)
|
|
- Bethany Lambert
- 5 years ago
- Views:
Transcription
1 DidemUnat,XingCai,ScottB.Baden LawrenceBerkeleyNationalLaboratory SimulaResearchLaboratory UniversityofCalifornia,SanDiego Oslo,Jun7,12 Importantclassofapplicationsoneofthemotifs) Basisforapproximatingderivativesnumerically Physicalsimulationse.g.turbulenceflow,seismicwavepropagation) Multimediaapplicationse.g.imagesmoothing) Nearestneighborupdateonastructuredgrid 3DHeatEqnusing fullyexplicitfinitedifferencing: U x,y,z) = c*ux,y,z) + c1*ux,y,z-1) + Ux,y,z+1)+ Ux,y-1,z) + Ux,y+1,z) + Ux-1,y,z) + Ux+1,y,z)) Highlydataparallel,memorybandwidthbound GPUspeedupsovermulti8s) " 5XforLatticeBoltzmann[Lee,ISCA 11], " 4XReverseTimeMigration[Kruger,SC 11] 4 PowerMW) Gigaflop/s 1Teraflop/s HPC$milestones$ 1Petaflop/s 1Exaflop/s 1985% 1996% 8% 18% Powerconsumptionisthemaindesignconstraint Drasticchangesinnodearchitecture[Shalf,VecPar ] Moreparallelismonthechip SoftwareSmanagedmemory/incoherentcaches Alreadystartedseeingconcreteinstances Heterogeneityincomputeresources Explicitmanagementofdatatransfer Separatedevicememoryfromthehostmemory Reengineeringofscientificapplications Algorithmicchangestomatchthehardwarecapabilities BestperformancerequiresnonStrivialknowledgeofthearchitecture host Main Memory bus accelerator On-chip Memory Vector Cores 2 5 GraphicsProcessingUnitsGPUs) Massivelyparallelsinglechipprocessor Lowpowers:tradeoffsinglethreadperformance LargeregisterfileandsoftwareSmanagedmemory Effectiveinacceleratingcertaindataparallelapplications CaseStudy:CardiacElectrophysiology[Unat,PARA ] Notidealforothers:sorting[Lee,ISCA ] host accelerator Explicitlymanagedmemory OnSchipmemoryresources Privateandincoherent e.g shared floata[n]; Hierarchicalthreadmanagement Thread,threadgroups,threadsubgroups Granularityofathread Shared Memory/L1 cache Main Memory On-chip Memory DomainSspecificoptimizations Limitstheadoptioninscientificcomputing Register File bus Vector Cores Weneedprogrammingmodelstomasterthenewtechnologyand makeitaccessibletocomputationalscientists. 3 6
2 Aimsprogrammer sproductivityandhighperformance Simplifiesapplicationdevelopment Basedonamodestnumberofcompilerdirectives #pragmamintfor Incrementalparallelization Abstractsawaytheprogrammer sviewofthehardware Seismic Modeling Cardiac Simulation Turbulent Flow Main Memory SourceStoSsourcetranslatorfortheNvidiaGPUs Parallelizesloopnests Relievestheprogrammerofavarietyoftedioustasks C + directives MotifSspecificautoSoptimizer Targetsstencilmethods Incorporatessemanticknowledgetocompileranalysis PerformsdatalocalityoptimizationsviaonSchipmemory Compilerflagsforperformancetuning 7 Serialcode AcceleratedRegion Dataparallelfor HostRegion Dataparallelfor Host Thread 8 11 Serialcode AcceleratedRegion Dataparallelfor HostRegion kernel Block Block #pragmamintparallel Accelerated Region Indicatestheacceleratedregion #pragmamintfor MarksenclosedloopSnestforacceleration 3additionalclausesforoptimizations #pragmamintcopy Data Transfer Expressesdatatransfersbetweenthehostanddevice Dataparallelfor Host Thread Block Block Block #pragmamintsingle Handlesserialsection #pragmamintbarrier Synchronizeshostanddevicethreads Synchronization 9 12
3 Performancetuningparameters HighSlevelinterfacetolowSlevel hardwarespecificoptimizations 1. ForSloopclauses handledatadecompositionandthread management nest),tile),chunksize) 2. Compilerflagsfordatalocality Register:Sregister SoftwareSmanagedmemory:Sshared Cache:SpreferL1 Shared Memory/L1 cache Register File 13 #pragma mint copyu,todevice,n+2),m+2),k+2)) n+2),m+2),k+2)) #pragma mint copyunew,todevice,n+2),m+2),k+2)) n+2),m+2),k+2)) Data while t++ < T ) Transfers #pragma mint for nestall) tile16,16,64) chunksize1,1,64) for int z=1; z<= k; z++) for int y=1; y<= m; y++) for int x=1; x<= n; x++) c1 * U[z][y][x-1] + U[z][y][x+1] + #pragma mint copyu,fromdevice,n+2),m+2),k+2)) n+2),m+2),k+2)) 16 #pragma mint copyu,todevice,n+2),m+2),k+2)) #pragma mint copyunew,todevice,n+2),m+2),k+2)) Program for the 3D Heat Eqn. while t++ < T ) #pragma mint for nestall) tile16,16,64) chunksize1,1,64) for int z=1; z<= k; z++) for int y=1; y<= m; y++) for int x=1; x<= n; x++) c1 * U[z][y][x-1] + U[z][y][x+1] + #pragma mint copyu,fromdevice,n+2),m+2),k+2)) #pragma mint copyu,todevice,n+2),m+2),k+2)) #pragma mint copyunew,todevice,n+2),m+2),k+2)) while t++ < T ) #pragmamintfor #pragma mint for nestall) tile16,16,64) chunksize1,1,64) for int z=1; z<= k; z++) for int y=1; y<= m; y++) for int x=1; x<= n; x++) c1 * U[z][y][x-1] + U[z][y][x+1] + Data parallel for loop #pragma mint copyu,fromdevice,n+2),m+2),k+2)) #pragma mint copyu,todevice,n+2),m+2),k+2)) #pragma mint copyunew,todevice,n+2),m+2),k+2)) #pragma #pragma mint mint parallel parallel while t++ < T ) #pragma mint for nestall) tile16,16,64) chunksize1,1,64) for int z=1; z<= k; z++) for int y=1; y<= m; y++) for int x=1; x<= n; x++) c1 * U[z][y][x-1] + U[z][y][x+1] + #pragma mint copyu,fromdevice,n+2),m+2),k+2)) Accelerated Region #pragma mint copyu,todevice,n+2),m+2),k+2)) #pragma depth mint of copyunew,todevice,n+2),m+2),k+2)) loop parallelism while t++ < T ) #pragmamintfornestall) #pragma mint for nestall) tile16,16,64) 
chunksize1,1,64) for int z=1; z<= k; z++) for int y=1; y<= m; y++) for int x=1; x<= n; x++) c1 * U[z][y][x-1] + U[z][y][x+1] + #pragma mint copyu,fromdevice,n+2),m+2),k+2)) 15 18
4 #pragma mint copyu,todevice,n+2),m+2),k+2)) #pragma depth mint of copyunew,todevice,n+2),m+2),k+2)) loop parallelism partitioning iteration space while t++ < T ) #pragmamintfornestall)tile16,16,64) #pragma mint for nestall) tile16,16,64) chunksize1,1,64) for int z=1; z<= k; z++) for int y=1; y<= m; y++) for int x=1; x<= n; x++) c1 * U[z][y][x-1] + U[z][y][x+1] + #pragma mint copyu,fromdevice,n+2),m+2),k+2)) #pragma mint copyu,todevice,n+2),m+2),k+2)) #pragma depth mint of copyunew,todevice,n+2),m+2),k+2)) loop parallelism partitioning iteration space while t++ < T ) #pragmamintfornestall)tile16,16,64)chunksize1,1,64) #pragma mint for nestall) tile16,16,64) chunksize1,1,64) for int z=1; z<= k; z++) for int y=1; y<= m; y++) for int x=1; x<= n; x++) c1 * U[z][y][x-1] + U[z][y][x+1] + #pragma mint copyu,fromdevice,n+2),m+2),k+2)) workload of a thread Fullyautomatedtranslationand optimizationsystem TransformationperformedontheAbstract SyntaxTree BuiltontopoftheROSEcompiler DevelopedatLLNL ROSEprovidesanAPIforgeneratingand manipulatingabstractsyntaxtrees isapartoftherosedistribution sincenov 11. Input code: C + ROSE Parser ROSE backend Output file Cuda src 23 ty 3D Grid Nx, Ny, Nz) threads tx/cx, ty/cy, tz/cz) Cuda thread Inputcode: C+ PragmaHandler BaselineTranslator MemoryManager ROSEParser LoopTransformer tz tile tx,ty,tz) chunksize cx,cy,cz) tx #pragma mint for nest#,all)tilet x,t y,t z ) chunksizec x,c y,c z ) ROSE backend KernelConfig Outliner ArgumentHandler WorkPractitioner ManagesdatadecompositionandthreadworkSassignment. Outputfile Cudasrc Optimizer 21 24
5 #pragmamintparallel whilet<t) t+=dt; #pragmamintfor fori=1;i<=n;i++) forj=1;j<=n;j++) A[i][j]=c*B[iJ1][j]+B[i+1][j] +B[i][jJ1]+B[i][j+1]); }//endofwhile }//endofparallelregion global void mint_1_1517cudapitchedptr ptr_du...) double* U = double *)ptr_du.ptr); int widthu = ptr_du.pitch / sizeofdouble ); int sliceu = ptr_du.ysize * widthu;... int _idx = threadidx.x + 1; int _gidx = _idx + blockdim.x * blockidx.x;... if _gidz >= 1 && _gidz <= k) if _gidy >= 1 && _gidy <= m) if _gidx >= 1 && _gidx <= n) Unew[indUnew] = c * U[indU] + c1 * U[indU - 1] + U[indU + 1]... ); Unpack pitched ptrs Compute local and global indices using thread and block IDs If-statements are derived from forstatements Each thread updates single data point 25 }//end of kernel 28 #pragmamintparallel whilet<t) t+=dt; #pragmamintfor fori=1;i<=n;i++) forj=1;j<=n;j++) A[i][j]=c*B[iJ1][j]+B[i+1][j] +B[i][jJ1]+B[i][j+1]); }//endofwhile }//endofparallelregion Host /* Outlined Kernel */ global void cuda_func ) }... Device AST #pragmamintparallel whilet<t) t+=dt;... /* Outlined Kernel */ global void cuda_func ) }... Device Thebaselinetranslator performsallthememoryreferences throughglobalmemory canstillparallelizeloops,launchkernel, performdatatransfers Baseline Translator Optimizer cuda_1_func_<<<threads, blocks>>> );... }//endofwhile Host }//endofparallelregion Optimizerfocusesonstencilmethods Reducesglobalmemoryaccesses DatalocalityusingonSchipmemory Stencil Analyzer OnSchipMemory Optimizer OnSchipmemoryoptimizationflags SpreferL1,Sregister,Sshared Bestperformancedependsonthedevice andapplication ModifiedAST 27
6 Analyzesarrayaccesspattern Findsstencilstructureanddependencybetweenthreads z y z y z y x Tradeoff:findthesweetspot Whichvariables?andHowmanyvariables? Maximizethetotalreductioninglobalmemoryreferences Minimizethesharedmemoryusage Planesmaycomefrom1ormorearrays 7-point 13-point 19-point Ghostcellregion SpreferL1 ConfiguresonSchipmemoryonFermi Favors48KBL1and16KBsharedmemory Sregister Takesadvantageoflargeregisterfile Placesfrequentlyaccessedarraysintoregisters Enhancesaccesstothecentralpointofastencil Shared Memory/L1 cache Register File Sshared Detectssharablereferencesamongthreads PlacestheminsharedmemorysoftwareSmanagedmemory) Anumberofplanesresideonsharedmemory tile 2D planes Tradeoff Reducesmemoryreferencestofrequentlyaccessedlocations Increasesresourcesneededbyathread,reducingconcurrency Gflops Heat Poisson Variable Poisson 19pt Tesla C6 Hand- Heat Poisson Variable Poisson 19pt Tesla C D.Unat,X.Cai,andS.Baden.":Realizingperformancein3DStencil MethodswithAnnotatedC",inInternationalConferenceonSupercomputing. ICS'11,Tucson,AZ,
7 45 Theoptimizerimprovestheperformance2.6xoverthebaseline. Hand Gflop/s Gflops Heat Poisson Variable Poisson 19pt Heat Poisson Variable Poisson 19pt Tesla C6 2.6x Tesla C 1 MPI OnTeslaC6,achieves79%ofthehandSoptimized. OnTeslaCFermi),achieves76%ofthehandSoptimized. EarthquakeSinducedseismicwavepropagation Tesla C GPU 7 32 MPI processes 6 Gflop/s YifengCuiandJunZhouatSDSC Refersto31threeSdimarrays asymmetric13spointstencil Timeconsumingloops:185lines Generatedcode:1185lines Hand 8 GordonBellPrizefinalistatSC baseline optimizer TheedcodeonsingleGPUisslightlyfasterthan32s. UsedbyresearchersSouthernCAEarthquakeCenter 32 MPI 4 nodes Petascaleanelasticwavepropagationcode 16 MPI 8 Nehalem s / node 37 8 MPI D.Unat,J.Zhou,Y.Cui,X.Cai,andS.Baden. Acceleratinga3DFinite DifferenceEarthquakeSimulationwithaCMtoMTranslator,inComputing inscienceandengineeringjournal,12. 1 MPI 8 MPI 16 MPI 32 MPI 8 Nehalem s / node baseline optimizer Hand Tesla C GPU 4 nodes Gflop/s Gflop/s Theedcodeachieves83%ofthehandSoptimized. 8 83% 1 MPI 8 MPI 16 MPI 32 MPI 8 Nehalem s / node 4 nodes baseline optimizer Hand Tesla C GPU 1 MPI 8 MPI 16 MPI 32 MPI 8 Nehalem s / node 39 4 nodes baseline optimizer Hand Tesla C GPU 42
8 Computervisionalgorithm Detectscornersandhighintensitypoints CollaboratedwithHanKim&JürgenSchulze Inserted5linesofcodeintothe originalcode~3lines) RealStimeperformancewith S22xperformanceof8Nehalems runningopenmpthreads TeslaCvsIntelXeonE54Nehalem D.Unat,H.S.Kim,J.Schulze,S.B.Baden, AutoMoptimizationofaFeature SelectionAlgorithm,inEmergingApplicationsandManySArchitecture, EAMAWorkshopcoSlocatedwithISCA,SanJose,CA, ProgrammingModel AddressestheprogrammabilityissueofGPUs " Today smassivelyparallelarchitectures " SoftwareSmanagedstorageandmassivelyparallelchip SourceStoSsourcetranslatorandoptimizer IncorporatesmotifSspecificknowledge Achievedaround8%ofthehandSoptimized " BothcommonlySusedkernelsandrealSworldapplications Availablefordownload Ourprojectwebsite " OnlineTranslator: " 46 OpenMPCextendsOpenMPtosupport Parallelizesonlyouterloopandnosharedmemoryoptimization CommercialcompilerfromPortlandGroupPGI) GeneralSpurposecompiler Gflops OpenMPC PGI HandD HeatEqn OpenACC CollaborativeeffortfromPGI,Cray,CAPs,Nvidia ShowsthatdirectiveSbasedmodelisapromisingapproach AllresultsareobtainedonTeslaC6.PGIv11.1)results:ontheLincolnsystem. 44 ScottBadenUCSD), XingCaiSimula), AllanSnavelySDSC), HanSukKimUCSD,Apple), JürgenSchulzeUCSD), YifengCuiSDSC), JunZhouSDSC), WenjieWeiSimula), RossWalkerSDSC), DanQuinlanLLNL), ROSETeamLLNL), PauliusMicikeviciusNvidia), EverettPhillipsNvidia) 47 TargetmultipleGPUs MPIcodegeneration ExtendforIntelManyIntegratedCoreMIC) Sameexecutionmodel Newclausesorcompileroptions " Registerblocking,SSEinstructions,softwareprefetching Soffload=[mic apu cuda] Maintainsinglecodebase? Main Memory Main Memory [Shalf,VecPar ]JohnShalf,SudipDosanjh,andJohnMorrison.Exascalecomputing technologychallenges.inproceedingsofthe9thinternationalconferenceonhigh PerformanceComputingforComputationalScience,VECPAR. 
[Lee,ISCA ]VictorW.Lee,ChangkyuKim,JatinChhugani,MichaelDeisher,Daehyun Kim,AnthonyD.Nguyen,NadathurSatish,MikhailSmelyanskiy,SrinivasChennuSpaty,Per Hammarlund,RonakSinghal,andPradeepDubey.DebunkingtheXGPUvs.CPUmyth: anevaluationofthroughputcomputingoncpuandgpu.sigarchcomput. [Unat,Para ]DidemUnat,XingCai,andScottBaden.OptimizingtheAlievSPanfilov modelofcardiacexcitationonheterogeneoussystems.para:stateoftheartin ScientificandParallelComputing [Lee]SeyongLee,SeungSJaiMin,andRudolfEigenmann.OpenMPtoGPGPU:acompiler frameworkforautomatictranslationandoptimization.inproceedingsofthe14thacm SIGPLANSymposiumonPrinciplesandPracticeofParallelProgramming,PPoPP 9 [William]SamuelWebbWilliams.AutoStuningPerformanceonMultiComputers.PhD thesis,eecsdepartment,universityofcalifornia,berkeley,dec8 [Carrington]LauraCarrington,MustafaM.Tikir,CatherineOlschanowsky,Michael Laurenzano,JoshuaPeraza,AllanSnavely,andStephenPoole.AnidiomSfindingtoolfor increasingproductivityofaccelerators.inproceedingsoftheinternationalconferenceon Supercomputing,ICS 11 [Datta]KaushikDatta,MarkMurphy,VasilyVolkov,SamuelWilliams,JonathanCarter, LeonidOliker,DavidPatterson,JohnShalf,andKatherineYelick.Stencilcomputation optimizationandautostuningonstatesofsthesartmultiarchitectures.insc
Didem Unat, Xing Cai, Scott Baden
Didem Unat, Xing Cai, Scott Baden
GPUs are an effective means of accelerating data-parallel applications. However, porting algorithms to a GPU still remains a challenge. A directive-based programming model
More information: Lecture 10. Stencil Methods — Limits to Performance
Lecture 10 Stencil Methods Limits to Performance Announcements Tuesday s lecture on 2/11 will be moved to room 4140 from 6.30 PM to 7.50 PM Office hours are on for today Scott B. Baden /CSE 260/ Winter
More informationCUDA Memory Types All material not from online sources/textbook copyright Travis Desell, 2012
CUDA Memory Types All material not from online sources/textbook copyright Travis Desell, 2012 Overview 1. Memory Access Efficiency 2. CUDA Memory Types 3. Reducing Global Memory Traffic 4. Example: Matrix-Matrix
More informationModule Memory and Data Locality
GPU Teaching Kit Accelerated Computing Module 4.4 - Memory and Data Locality Tiled Matrix Multiplication Kernel Objective To learn to write a tiled matrix-multiplication kernel Loading and using tiles
More information: Physis: An Implicitly Parallel Framework for Stencil Computations
Physis: An Implicitly Parallel Framework for Stencil Computations Naoya Maruyama RIKEN AICS (Formerly at Tokyo Tech) GTC12, May 2012 1 — Good performance with low programmer productivity; Multi-GPU Application
More informationIntroduction to GPGPU and GPU-architectures
Introduction to GPGPU and GPU-architectures Henk Corporaal Gert-Jan van den Braak http://www.es.ele.tue.nl/ Contents 1. What is a GPU 2. Programming a GPU 3. GPU thread scheduling 4. GPU performance bottlenecks
More informationGPU programming CUDA C. GPU programming,ii. COMP528 Multi-Core Programming. Different ways:
COMP528 Multi-Core Programming GPU programming,ii www.csc.liv.ac.uk/~alexei/comp528 Alexei Lisitsa Dept of computer science University of Liverpool a.lisitsa@.liverpool.ac.uk Different ways: GPU programming
More informationOpenACC. Part I. Ned Nedialkov. McMaster University Canada. October 2016
OpenACC. Part I Ned Nedialkov McMaster University Canada October 2016 Outline Introduction Execution model Memory model Compiling pgaccelinfo Example Speedups Profiling c 2016 Ned Nedialkov 2/23 Why accelerators
More informationDevice Memories and Matrix Multiplication
Device Memories and Matrix Multiplication 1 Device Memories global, constant, and shared memories CUDA variable type qualifiers 2 Matrix Multiplication an application of tiling runningmatrixmul in the
More informationMPI_Send(a,..., MPI_COMM_WORLD); MPI_Recv(a,..., MPI_COMM_WORLD, &status);
$ $ 2 global void kernel(int a[max], int llimit, int ulimit) {... } : int main(int argc, char *argv[]){ MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size);
More informationOpenACC programming for GPGPUs: Rotor wake simulation
DLR.de Chart 1 OpenACC programming for GPGPUs: Rotor wake simulation Melven Röhrig-Zöllner, Achim Basermann Simulations- und Softwaretechnik DLR.de Chart 2 Outline Hardware-Architecture (CPU+GPU) GPU computing
More informationAutomatic translation from CUDA to C++ Luca Atzori, Vincenzo Innocente, Felice Pantaleo, Danilo Piparo
Automatic translation from CUDA to C++ Luca Atzori, Vincenzo Innocente, Felice Pantaleo, Danilo Piparo 31 August, 2015 Goals Running CUDA code on CPUs. Why? Performance portability! A major challenge faced
More informationLecture 15: Introduction to GPU programming. Lecture 15: Introduction to GPU programming p. 1
Lecture 15: Introduction to GPU programming Lecture 15: Introduction to GPU programming p. 1 Overview Hardware features of GPGPU Principles of GPU programming A good reference: David B. Kirk and Wen-mei
More informationIntroduction to Parallel Computing with CUDA. Oswald Haan
Introduction to Parallel Computing with CUDA Oswald Haan ohaan@gwdg.de Schedule Introduction to Parallel Computing with CUDA Using CUDA CUDA Application Examples Using Multiple GPUs CUDA Application Libraries
More informationCRAY XK6 REDEFINING SUPERCOMPUTING. - Sanjana Rakhecha - Nishad Nerurkar
CRAY XK6 REDEFINING SUPERCOMPUTING - Sanjana Rakhecha - Nishad Nerurkar CONTENTS Introduction History Specifications Cray XK6 Architecture Performance Industry acceptance and applications Summary INTRODUCTION
More informationReview. Lecture 10. Today s Outline. Review. 03b.cu. 03?.cu CUDA (II) Matrix addition CUDA-C API
Review Lecture 10 CUDA (II) host device CUDA many core processor threads thread blocks grid # threads >> # of cores to be efficient Threads within blocks can cooperate Threads between thread blocks cannot
More informationTiled Matrix Multiplication
Tiled Matrix Multiplication Basic Matrix Multiplication Kernel global void MatrixMulKernel(int m, m, int n, n, int k, k, float* A, A, float* B, B, float* C) C) { int Row = blockidx.y*blockdim.y+threadidx.y;
More informationIntroduction to GPU programming. Introduction to GPU programming p. 1/17
Introduction to GPU programming Introduction to GPU programming p. 1/17 Introduction to GPU programming p. 2/17 Overview GPUs & computing Principles of CUDA programming One good reference: David B. Kirk
More informationACCELERATING THE PRODUCTION OF SYNTHETIC SEISMOGRAMS BY A MULTICORE PROCESSOR CLUSTER WITH MULTIPLE GPUS
ACCELERATING THE PRODUCTION OF SYNTHETIC SEISMOGRAMS BY A MULTICORE PROCESSOR CLUSTER WITH MULTIPLE GPUS Ferdinando Alessi Annalisa Massini Roberto Basili INGV Introduction The simulation of wave propagation
More informationDense Linear Algebra. HPC - Algorithms and Applications
Dense Linear Algebra HPC - Algorithms and Applications Alexander Pöppl Technical University of Munich Chair of Scientific Computing November 6 th 2017 Last Tutorial CUDA Architecture thread hierarchy:
More informationCUDA. Matthew Joyner, Jeremy Williams
CUDA Matthew Joyner, Jeremy Williams Agenda What is CUDA? CUDA GPU Architecture CPU/GPU Communication Coding in CUDA Use cases of CUDA Comparison to OpenCL What is CUDA? What is CUDA? CUDA is a parallel
More informationParallel Numerical Algorithms
Parallel Numerical Algorithms http://sudalab.is.s.u-tokyo.ac.jp/~reiji/pna14/ [ 10 ] GPU and CUDA Parallel Numerical Algorithms / IST / UTokyo 1 PNA16 Lecture Plan General Topics 1. Architecture and Performance
More informationOutline 2011/10/8. Memory Management. Kernels. Matrix multiplication. CIS 565 Fall 2011 Qing Sun
Outline Memory Management CIS 565 Fall 2011 Qing Sun sunqing@seas.upenn.edu Kernels Matrix multiplication Managing Memory CPU and GPU have separate memory spaces Host (CPU) code manages device (GPU) memory
More informationCS/CoE 1541 Final exam (Fall 2017). This is the cumulative final exam given in the Fall of Question 1 (12 points): was on Chapter 4
CS/CoE 1541 Final exam (Fall 2017). Name: This is the cumulative final exam given in the Fall of 2017. Question 1 (12 points): was on Chapter 4 Question 2 (13 points): was on Chapter 4 For Exam 2, you
More information1/25/12. Administrative
Administrative L3: Memory Hierarchy Optimization I, Locality and Data Placement Next assignment due Friday, 5 PM Use handin program on CADE machines handin CS6235 lab1 TA: Preethi Kotari - Email:
More informationOpenACC Fundamentals. Steve Abbott November 15, 2017
OpenACC Fundamentals Steve Abbott , November 15, 2017 AGENDA Data Regions Deep Copy 2 while ( err > tol && iter < iter_max ) { err=0.0; JACOBI ITERATION #pragma acc parallel loop reduction(max:err)
More informationAuto-Generation and Auto-Tuning of 3D Stencil Codes on GPU Clusters
Auto-Generation and Auto-Tuning of 3D Stencil s on GPU Clusters Yongpeng Zhang, Frank Mueller North Carolina State University CGO 2012 Outline Motivation DSL front-end and Benchmarks Framework Experimental
More informationGPU Memory Memory issue for CUDA programming
Memory issue for CUDA programming Variable declaration Memory Scope Lifetime device local int LocalVar; local thread thread device shared int SharedVar; shared block block device int GlobalVar; global
More informationIntroduction CPS343. Spring Parallel and High Performance Computing. CPS343 (Parallel and HPC) Introduction Spring / 29
Introduction CPS343 Parallel and High Performance Computing Spring 2018 CPS343 (Parallel and HPC) Introduction Spring 2018 1 / 29 Outline 1 Preface Course Details Course Requirements 2 Background Definitions
More informationGPU Performance Nuggets
GPU Performance Nuggets Simon Garcia de Gonzalo & Carl Pearson PhD Students, IMPACT Research Group Advised by Professor Wen-mei Hwu Jun. 15, 2016 grcdgnz2@illinois.edu pearson@illinois.edu GPU Performance
More informationCSE 599 I Accelerated Computing - Programming GPUS. Memory performance
CSE 599 I Accelerated Computing - Programming GPUS Memory performance GPU Teaching Kit Accelerated Computing Module 6.1 Memory Access Performance DRAM Bandwidth Objective To learn that memory bandwidth
More informationLecture 8: GPU Programming. CSE599G1: Spring 2017
Lecture 8: GPU Programming CSE599G1: Spring 2017 Announcements Project proposal due on Thursday (4/28) 5pm. Assignment 2 will be out today, due in two weeks. Implement GPU kernels and use cublas library
More informationUnrolling parallel loops
Unrolling parallel loops Vasily Volkov UC Berkeley November 14, 2011 1 Today Very simple optimization technique Closely resembles loop unrolling Widely used in high performance codes 2 Mapping to GPU:
More informationComputational Fluid Dynamics (CFD) using Graphics Processing Units
Computational Fluid Dynamics (CFD) using Graphics Processing Units Aaron F. Shinn Mechanical Science and Engineering Dept., UIUC Accelerators for Science and Engineering Applications: GPUs and Multicores
More informationOpenACC. Introduction and Evolutions Sebastien Deldon, GPU Compiler engineer
OpenACC Introduction and Evolutions Sebastien Deldon, GPU Compiler engineer 3 WAYS TO ACCELERATE APPLICATIONS Applications Libraries Compiler Directives Programming Languages Easy to use Most Performance
More informationParallel Computing. Lecture 19: CUDA - I
CSCI-UA.0480-003 Parallel Computing Lecture 19: CUDA - I Mohamed Zahran (aka Z) mzahran@cs.nyu.edu http://www.mzahran.com GPU w/ local DRAM (device) Behind CUDA CPU (host) Source: http://hothardware.com/reviews/intel-core-i5-and-i7-processors-and-p55-chipset/?page=4
More informationProgramming in CUDA. Malik M Khan
Programming in CUDA October 21, 2010 Malik M Khan Outline Reminder of CUDA Architecture Execution Model - Brief mention of control flow Heterogeneous Memory Hierarchy - Locality through data placement
More informationGPU Acceleration of the Longwave Rapid Radiative Transfer Model in WRF using CUDA Fortran. G. Ruetsch, M. Fatica, E. Phillips, N.
GPU Acceleration of the Longwave Rapid Radiative Transfer Model in WRF using CUDA Fortran G. Ruetsch, M. Fatica, E. Phillips, N. Juffa Outline WRF and RRTM Previous Work CUDA Fortran Features RRTM in CUDA
More informationGPU Architecture and Programming. Andrei Doncescu inspired by NVIDIA
GPU Architecture and Programming Andrei Doncescu inspired by NVIDIA Traditional Computing Von Neumann architecture: instructions are sent from memory to the CPU Serial execution: Instructions are executed
More informationOpenACC. Part 2. Ned Nedialkov. McMaster University Canada. CS/SE 4F03 March 2016
OpenACC. Part 2 Ned Nedialkov McMaster University Canada CS/SE 4F03 March 2016 Outline parallel construct Gang loop Worker loop Vector loop kernels construct kernels vs. parallel Data directives c 2013
More informationCartoon parallel architectures; CPUs and GPUs
Cartoon parallel architectures; CPUs and GPUs CSE 6230, Fall 2014 Th Sep 11! Thanks to Jee Choi (a senior PhD student) for a big assist 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ~ socket 14 ~ core 14 ~ HWMT+SIMD
More informationNVIDIA GTX200: TeraFLOPS Visual Computing. August 26, 2008 John Tynefield
NVIDIA GTX200: TeraFLOPS Visual Computing August 26, 2008 John Tynefield 2 Outline Execution Model Architecture Demo 3 Execution Model 4 Software Architecture Applications DX10 OpenGL OpenCL CUDA C Host
More informationCSE 591: GPU Programming. Memories. Klaus Mueller. Computer Science Department Stony Brook University
CSE 591: GPU Programming Memories Klaus Mueller Computer Science Department Stony Brook University Importance of Memory Access Efficiency Every loop iteration has two global memory accesses two floating
More informationTesla Architecture, CUDA and Optimization Strategies
Tesla Architecture, CUDA and Optimization Strategies Lan Shi, Li Yi & Liyuan Zhang Hauptseminar: Multicore Architectures and Programming Page 1 Outline Tesla Architecture & CUDA CUDA Programming Optimization
More informationGPU Memory. GPU Memory. Memory issue for CUDA programming. Copyright 2013 Yong Cao, Referencing UIUC ECE498AL Course Notes
Memory issue for CUDA programming CUDA Variable Type Qualifiers Variable declaration Memory Scope Lifetime device local int LocalVar; local thread thread device shared int SharedVar; shared block block
More informationPersistent RNNs. (stashing recurrent weights on-chip) Gregory Diamos. April 7, Baidu SVAIL
(stashing recurrent weights on-chip) Baidu SVAIL April 7, 2016 SVAIL Think hard AI. Goal Develop hard AI technologies that impact 100 million users. Deep Learning at SVAIL 100 GFLOP/s 1 laptop 6 TFLOP/s
More informationE6895 Advanced Big Data Analytics Lecture 8: GPU Examples and GPU on ios devices
E6895 Advanced Big Data Analytics Lecture 8: GPU Examples and GPU on ios devices Ching-Yung Lin, Ph.D. Adjunct Professor, Dept. of Electrical Engineering and Computer Science IBM Chief Scientist, Graph
More informationLecture 7. Overlap Using Shared Memory Performance programming the memory hierarchy
Lecture 7 Overlap Using Shared Memory Performance programming the memory hierarchy Announcements Mac Mini lab (APM 2402) Starts Tuesday Every Tues and Fri for the next 2 weeks Project proposals due on
More informationGPU Programming. Performance Considerations. Miaoqing Huang University of Arkansas Fall / 60
1 / 60 GPU Programming Performance Considerations Miaoqing Huang University of Arkansas Fall 2013 2 / 60 Outline Control Flow Divergence Memory Coalescing Shared Memory Bank Conflicts Occupancy Loop Unrolling
More informationCOSC 6374 Parallel Computations Introduction to CUDA
COSC 6374 Parallel Computations Introduction to CUDA Edgar Gabriel Fall 2014 Disclaimer Material for this lecture has been adopted based on various sources Matt Heavener, CS, State Univ. of NY at Buffalo
More informationGREAT PERFORMANCE FOR TINY PROBLEMS: BATCHED PRODUCTS OF SMALL MATRICES. Nikolay Markovskiy Peter Messmer
GREAT PERFORMANCE FOR TINY PROBLEMS: BATCHED PRODUCTS OF SMALL MATRICES Nikolay Markovskiy Peter Messmer ABOUT CP2K Atomistic and molecular simulations of solid state From ab initio DFT and Hartree-Fock
More informationCS 470 Spring Other Architectures. Mike Lam, Professor. (with an aside on linear algebra)
CS 470 Spring 2016 Mike Lam, Professor Other Architectures (with an aside on linear algebra) Parallel Systems Shared memory (uniform global address space) Primary story: make faster computers Programming
More informationGPGPU Offloading with OpenMP 4.5 In the IBM XL Compiler
GPGPU Offloading with OpenMP 4.5 In the IBM XL Compiler Taylor Lloyd Jose Nelson Amaral Ettore Tiotto University of Alberta University of Alberta IBM Canada 1 Why? 2 Supercomputer Power/Performance GPUs
More informationMassively Parallel Architectures
Massively Parallel Architectures A Take on Cell Processor and GPU programming Joel Falcou - LRI joel.falcou@lri.fr Bat. 490 - Bureau 104 20 janvier 2009 Motivation The CELL processor Harder,Better,Faster,Stronger
More informationA case study of performance portability with OpenMP 4.5
A case study of performance portability with OpenMP 4.5 Rahul Gayatri, Charlene Yang, Thorsten Kurth, Jack Deslippe NERSC pre-print copy 1 Outline General Plasmon Pole (GPP) application from BerkeleyGW
More informationLECTURE ON PASCAL GPU ARCHITECTURE. Jiri Kraus, November 14 th 2016
LECTURE ON PASCAL GPU ARCHITECTURE Jiri Kraus, November 14 th 2016 ACCELERATED COMPUTING CPU Optimized for Serial Tasks GPU Accelerator Optimized for Parallel Tasks 2 ACCELERATED COMPUTING CPU Optimized
More information60x Computational Fluid Dynamics and Visualisation
60x Computational Fluid Dynamics and Visualisation Jamil Appa BAE Systems Advanced Technology Centre 1 Outline BAE Systems - Introduction Aerodynamic Design Challenges Why GPUs? CFD on GPUs Example Kernel
More informationImage Processing Optimization C# on GPU with Hybridizer
Image Processing Optimization C# on GPU with Hybridizer regis.portalez@altimesh.com Median Filter Denoising Noisy image (lena 1960x1960) Denoised image window = 3 2 Median Filter Denoising window Output[i,j]=
More informationHigh Performance Linear Algebra on Data Parallel Co-Processors I
926535897932384626433832795028841971693993754918980183 592653589793238462643383279502884197169399375491898018 415926535897932384626433832795028841971693993754918980 592653589793238462643383279502884197169399375491898018
More informationParalization on GPU using CUDA An Introduction
Paralization on GPU using CUDA An Introduction Ehsan Nedaaee Oskoee 1 1 Department of Physics IASBS IPM Grid and HPC workshop IV, 2011 Outline 1 Introduction to GPU 2 Introduction to CUDA Graphics Processing
More informationGraphics Processing Unit (GPU) Acceleration of Machine Vision Software for Space Flight Applications
Graphics Processing Unit (GPU) Acceleration of Machine Vision Software for Space Flight Applications Workshop on Space Flight Software November 6, 2009 Brent Tweddle Massachusetts Institute of Technology
More informationDeveloping PIC Codes for the Next Generation Supercomputer using GPUs. Viktor K. Decyk UCLA
Developing PIC Codes for the Next Generation Supercomputer using GPUs Viktor K. Decyk UCLA Abstract The current generation of supercomputer (petaflops scale) cannot be scaled up to exaflops (1000 petaflops),
More informationProfiling and Parallelizing with the OpenACC Toolkit OpenACC Course: Lecture 2 October 15, 2015
Profiling and Parallelizing with the OpenACC Toolkit OpenACC Course: Lecture 2 October 15, 2015 Oct 1: Introduction to OpenACC Oct 6: Office Hours Oct 15: Profiling and Parallelizing with the OpenACC Toolkit
More informationGPU Computing: Development and Analysis. Part 1. Anton Wijs Muhammad Osama. Marieke Huisman Sebastiaan Joosten
GPU Computing: Development and Analysis Part 1 Anton Wijs Muhammad Osama Marieke Huisman Sebastiaan Joosten NLeSC GPU Course Rob van Nieuwpoort & Ben van Werkhoven Who are we? Anton Wijs Assistant professor,
More informationDesigning a Domain-specific Language to Simulate Particles. dan bailey
Designing a Domain-specific Language to Simulate Particles dan bailey Double Negative Largest Visual Effects studio in Europe Offices in London and Singapore Large and growing R & D team Squirt Fluid Solver
More informationLecture 3: Introduction to CUDA
CSCI-GA.3033-004 Graphics Processing Units (GPUs): Architecture and Programming Lecture 3: Introduction to CUDA Some slides here are adopted from: NVIDIA teaching kit Mohamed Zahran (aka Z) mzahran@cs.nyu.edu
More informationKernelGen a toolchain for automatic GPU-centric applications porting. Nicolas Lihogrud Dmitry Mikushin Andrew Adinets
P A R A L L E L C O M P U T A T I O N A L T E C H N O L O G I E S ' 2 0 1 2 KernelGen a toolchain for automatic GPU-centric applications porting Nicolas Lihogrud Dmitry Mikushin Andrew Adinets Contents
More informationGPGPU: Multi-GPU Programming
GPGPU: Multi-GPU Programming Fall 2012 HW4 __global__ void cuda_transpose(const float *ad, const int n, float *atd) { } int i = threadIdx.y + blockIdx.y*blockDim.y; int j = threadIdx.x + blockIdx.x*blockDim.x;
More informationTechnische Universität München. GPU Programming. Rüdiger Westermann Chair for Computer Graphics & Visualization. Faculty of Informatics
GPU Programming Rüdiger Westermann Chair for Computer Graphics & Visualization Faculty of Informatics Overview Programming interfaces and support libraries The CUDA programming abstraction An in-depth
More informationRecent Advances in Heterogeneous Computing using Charm++
Recent Advances in Heterogeneous Computing using Charm++ Jaemin Choi, Michael Robson Parallel Programming Laboratory University of Illinois Urbana-Champaign April 12, 2018 1 / 24 Heterogeneous Computing
More informationWarps and Reduction Algorithms
Warps and Reduction Algorithms 1 more on Thread Execution block partitioning into warps single-instruction, multiple-thread, and divergence 2 Parallel Reduction Algorithms computing the sum or the maximum
More informationBy: Tomer Morad Based on: Erik Lindholm, John Nickolls, Stuart Oberman, John Montrym. NVIDIA TESLA: A UNIFIED GRAPHICS AND COMPUTING ARCHITECTURE In IEEE Micro 28(2), 2008 } } Erik Lindholm, John Nickolls,
More informationS4289: Efficient solution of multiple scalar and block-tridiagonal equations
S4289: Efficient solution of multiple scalar and block-tridiagonal equations Endre László endre.laszlo [at] oerc.ox.ac.uk Oxford e-research Centre, University of Oxford, UK Pázmány Péter Catholic University,
More informationIntroduction to Parallel Programming
Introduction to Parallel Programming Pablo Brubeck Department of Physics Tecnologico de Monterrey October 14, 2016 Student Chapter Tecnológico de Monterrey Tecnológico de Monterrey Student Chapter Outline
More informationLLVM supported source-to-source translation. Translation from annotated C/C++ to CUDA C/C++
LLVM supported source-to-source translation Translation from annotated C/C++ to CUDA C/C++ Niklas Jacobsen Master s Thesis Autumn 2016 LLVM supported source-to-source translation Niklas Jacobsen 1st November
More informationINTRODUCTION TO OPENACC. Analyzing and Parallelizing with OpenACC, Feb 22, 2017
INTRODUCTION TO OPENACC Analyzing and Parallelizing with OpenACC, Feb 22, 2017 Objective: Enable you to to accelerate your applications with OpenACC. 2 Today s Objectives Understand what OpenACC is and
More informationCSC266 Introduction to Parallel Computing using GPUs Introduction to CUDA
CSC266 Introduction to Parallel Computing using GPUs Introduction to CUDA Sreepathi Pai October 18, 2017 URCS Outline Background Memory Code Execution Model Outline Background Memory Code Execution Model
More informationOpenACC introduction (part 2)
OpenACC introduction (part 2) Aleksei Ivakhnenko APC Contents Understanding PGI compiler output Compiler flags and environment variables Compiler limitations in dependencies tracking Organizing data persistence
More informationCOSC 462 Parallel Programming
November 22, 2017 1/12 COSC 462 Parallel Programming CUDA Beyond Basics Piotr Luszczek Mixing Blocks and Threads int N = 100, SN = N * sizeof(double); __global__ void sum(double *a, double *b, double *c) {
More informationALYA Multi-Physics System on GPUs: Offloading Large-Scale Computational Mechanics Problems
www.bsc.es ALYA Multi-Physics System on GPUs: Offloading Large-Scale Computational Mechanics Problems Vishal Mehta Engineer, Barcelona Supercomputing Center vishal.mehta@bsc.es Training BSC/UPC GPU Centre
More informationCS 677: Parallel Programming for Many-core Processors Lecture 6
1 CS 677: Parallel Programming for Many-core Processors Lecture 6 Instructor: Philippos Mordohai Webpage: www.cs.stevens.edu/~mordohai E-mail: Philippos.Mordohai@stevens.edu Logistics Midterm: March 11
More informationParticle-in-Cell Simulations on Modern Computing Platforms. Viktor K. Decyk and Tajendra V. Singh UCLA
Particle-in-Cell Simulations on Modern Computing Platforms Viktor K. Decyk and Tajendra V. Singh UCLA Outline of Presentation Abstraction of future computer hardware PIC on GPUs OpenCL and Cuda Fortran
More informationIntroduction to GPU Computing Using CUDA. Spring 2014 WestGrid Seminar Series
Introduction to GPU Computing Using CUDA Spring 2014 WestGrid Seminar Series Scott Northrup SciNet www.scinethpc.ca (Slides http://support.scinet.utoronto.ca/ northrup/westgrid CUDA.pdf) March 12, 2014
More informationIntroduction to GPU Computing Using CUDA. Spring 2014 WestGrid Seminar Series
Introduction to GPU Computing Using CUDA Spring 2014 WestGrid Seminar Series Scott Northrup SciNet www.scinethpc.ca March 13, 2014
More informationHardware/Software Co-Design
1 / 13 Hardware/Software Co-Design Review so far Miaoqing Huang University of Arkansas Fall 2011 2 / 13 Problem I A student mentioned that he was able to multiply two 1,024 1,024 matrices using a tiled
More informationCSE 160 Lecture 24. Graphical Processing Units
CSE 160 Lecture 24 Graphical Processing Units Announcements Next week we meet in 1202 on Monday 3/11 only On Weds 3/13 we have a 2 hour session Usual class time at the Rady school final exam review SDSC
More informationCUDA. GPU Computing. K. Cooper 1. 1 Department of Mathematics. Washington State University
GPU Computing K. Cooper 1 1 Department of Mathematics Washington State University 2014 Review of Parallel Paradigms MIMD Computing Multiple Instruction Multiple Data Several separate program streams, each
More informationPorting COSMO to Hybrid Architectures
Porting COSMO to Hybrid Architectures T. Gysi 1, O. Fuhrer 2, C. Osuna 3, X. Lapillonne 3, T. Diamanti 3, B. Cumming 4, T. Schroeder 5, P. Messmer 5, T. Schulthess 4,6,7 [1] Supercomputing Systems AG,
More informationCUDA C Programming Mark Harris NVIDIA Corporation
CUDA C Programming Mark Harris NVIDIA Corporation Agenda Tesla GPU Computing CUDA Fermi What is GPU Computing? Introduction to Tesla CUDA Architecture Programming & Memory Models Programming Environment
More informationLecture 5. Performance Programming with CUDA
Lecture 5 Performance Programming with CUDA Announcements 2011 Scott B. Baden / CSE 262 / Spring 2011 2 Today s lecture Matrix multiplication 2011 Scott B. Baden / CSE 262 / Spring 2011 3 Memory Hierarchy
More informationdcuda: Distributed GPU Computing with Hardware Overlap
dcuda: Distributed GPU Computing with Hardware Overlap Tobias Gysi, Jeremia Bär, Lukas Kuster, and Torsten Hoefler GPU computing gained a lot of popularity in various application domains weather & climate
More informationCME 213 S PRING Eric Darve
CME 213 S PRING 2017 Eric Darve Review Secret behind GPU performance: simple cores but a large number of them; even more threads can exist live on the hardware (10k 20k threads live). Important performance
More informationIntroduction to GPU Programming
Introduction to GPU Programming Mubashir Adnan Qureshi http://www.ncsa.illinois.edu/people/kindr/projects/hpca/files/singapore_p1.pdf http://developer.download.nvidia.com/cuda/training/nvidia_gpu_computing_webinars_cuda_memory_optimization.pdf
More informationGPU CUDA Programming
GPU CUDA Programming 이정근 (Jeong-Gun Lee) 한림대학교컴퓨터공학과, 임베디드 SoC 연구실 www.onchip.net Email: Jeonggun.Lee@hallym.ac.kr ALTERA JOINT LAB Introduction 차례 Multicore/Manycore and GPU GPU on Medical Applications
More informationAn Extension of XcalableMP PGAS Language for Multi-node GPU Clusters
An Extension of XcalableMP PGAS Language for Multi-node Clusters Jinpil Lee, Minh Tuan Tran, Tetsuya Odajima, Taisuke Boku and Mitsuhisa Sato University of Tsukuba 1 Presentation Overview l Introduction
More informationOutline. L13: Application Case Studies. Reconstructing MR Images. Reconstructing MR Images 3/26/12
Outline L13: Application Case Studies Application Case Studies Advanced MRI Reconstruction Reading: Kirk and Hwu, Chapter 8 or From the sample book http://courses.ece.illinois.edu/ece498/al/textbook/ Chapter7-MRI-Case-Study.pdf
More informationScientific discovery, analysis and prediction made possible through high performance computing.
Scientific discovery, analysis and prediction made possible through high performance computing. An Introduction to GPGPU Programming Bob Torgerson Arctic Region Supercomputing Center November 21 st, 2013
More informationGPU Programming Using NVIDIA CUDA
GPU Programming Using NVIDIA CUDA Siddhante Nangla 1, Professor Chetna Achar 2 1, 2 MET s Institute of Computer Science, Bandra Mumbai University Abstract: GPGPU or General-Purpose Computing on Graphics
More informationExploiting GPU Caches in Sparse Matrix Vector Multiplication. Yusuke Nagasaka Tokyo Institute of Technology
Exploiting GPU Caches in Sparse Matrix Vector Multiplication Yusuke Nagasaka Tokyo Institute of Technology Sparse Matrix Generated by FEM, being as the graph data Often require solving sparse linear equation
More informationCS 470 Spring Other Architectures. Mike Lam, Professor. (with an aside on linear algebra)
CS 470 Spring 2018 Mike Lam, Professor Other Architectures (with an aside on linear algebra) Aside (P3 related): linear algebra Many scientific phenomena can be modeled as matrix operations Differential
More information