Important class of applications — one of the motifs

Size: px
Start display at page:

Download "Important class of applications — one of the motifs"

Transcription

1 DidemUnat,XingCai,ScottB.Baden LawrenceBerkeleyNationalLaboratory SimulaResearchLaboratory UniversityofCalifornia,SanDiego Oslo,Jun7,12 Importantclassofapplicationsoneofthemotifs) Basisforapproximatingderivativesnumerically Physicalsimulationse.g.turbulenceflow,seismicwavepropagation) Multimediaapplicationse.g.imagesmoothing) Nearestneighborupdateonastructuredgrid 3DHeatEqnusing fullyexplicitfinitedifferencing: U x,y,z) = c*ux,y,z) + c1*ux,y,z-1) + Ux,y,z+1)+ Ux,y-1,z) + Ux,y+1,z) + Ux-1,y,z) + Ux+1,y,z)) Highlydataparallel,memorybandwidthbound GPUspeedupsovermulti8s) " 5XforLatticeBoltzmann[Lee,ISCA 11], " 4XReverseTimeMigration[Kruger,SC 11] 4 PowerMW) Gigaflop/s 1Teraflop/s HPC$milestones$ 1Petaflop/s 1Exaflop/s 1985% 1996% 8% 18% Powerconsumptionisthemaindesignconstraint Drasticchangesinnodearchitecture[Shalf,VecPar ] Moreparallelismonthechip SoftwareSmanagedmemory/incoherentcaches Alreadystartedseeingconcreteinstances Heterogeneityincomputeresources Explicitmanagementofdatatransfer Separatedevicememoryfromthehostmemory Reengineeringofscientificapplications Algorithmicchangestomatchthehardwarecapabilities BestperformancerequiresnonStrivialknowledgeofthearchitecture host Main Memory bus accelerator On-chip Memory Vector Cores 2 5 GraphicsProcessingUnitsGPUs) Massivelyparallelsinglechipprocessor Lowpowers:tradeoffsinglethreadperformance LargeregisterfileandsoftwareSmanagedmemory Effectiveinacceleratingcertaindataparallelapplications CaseStudy:CardiacElectrophysiology[Unat,PARA ] Notidealforothers:sorting[Lee,ISCA ] host accelerator Explicitlymanagedmemory OnSchipmemoryresources Privateandincoherent e.g shared floata[n]; Hierarchicalthreadmanagement Thread,threadgroups,threadsubgroups Granularityofathread Shared Memory/L1 cache Main Memory On-chip Memory DomainSspecificoptimizations Limitstheadoptioninscientificcomputing Register File bus Vector Cores Weneedprogrammingmodelstomasterthenewtechnologyand makeitaccessibletocomputationalscientists. 3 6

2 Aimsprogrammer sproductivityandhighperformance Simplifiesapplicationdevelopment Basedonamodestnumberofcompilerdirectives #pragmamintfor Incrementalparallelization Abstractsawaytheprogrammer sviewofthehardware Seismic Modeling Cardiac Simulation Turbulent Flow Main Memory SourceStoSsourcetranslatorfortheNvidiaGPUs Parallelizesloopnests Relievestheprogrammerofavarietyoftedioustasks C + directives MotifSspecificautoSoptimizer Targetsstencilmethods Incorporatessemanticknowledgetocompileranalysis PerformsdatalocalityoptimizationsviaonSchipmemory Compilerflagsforperformancetuning 7 Serialcode AcceleratedRegion Dataparallelfor HostRegion Dataparallelfor Host Thread 8 11 Serialcode AcceleratedRegion Dataparallelfor HostRegion kernel Block Block #pragmamintparallel Accelerated Region Indicatestheacceleratedregion #pragmamintfor MarksenclosedloopSnestforacceleration 3additionalclausesforoptimizations #pragmamintcopy Data Transfer Expressesdatatransfersbetweenthehostanddevice Dataparallelfor Host Thread Block Block Block #pragmamintsingle Handlesserialsection #pragmamintbarrier Synchronizeshostanddevicethreads Synchronization 9 12

3 Performancetuningparameters HighSlevelinterfacetolowSlevel hardwarespecificoptimizations 1. ForSloopclauses handledatadecompositionandthread management nest),tile),chunksize) 2. Compilerflagsfordatalocality Register:Sregister SoftwareSmanagedmemory:Sshared Cache:SpreferL1 Shared Memory/L1 cache Register File 13 #pragma mint copyu,todevice,n+2),m+2),k+2)) n+2),m+2),k+2)) #pragma mint copyunew,todevice,n+2),m+2),k+2)) n+2),m+2),k+2)) Data while t++ < T ) Transfers #pragma mint for nestall) tile16,16,64) chunksize1,1,64) for int z=1; z<= k; z++) for int y=1; y<= m; y++) for int x=1; x<= n; x++) c1 * U[z][y][x-1] + U[z][y][x+1] + #pragma mint copyu,fromdevice,n+2),m+2),k+2)) n+2),m+2),k+2)) 16 #pragma mint copyu,todevice,n+2),m+2),k+2)) #pragma mint copyunew,todevice,n+2),m+2),k+2)) Program for the 3D Heat Eqn. while t++ < T ) #pragma mint for nestall) tile16,16,64) chunksize1,1,64) for int z=1; z<= k; z++) for int y=1; y<= m; y++) for int x=1; x<= n; x++) c1 * U[z][y][x-1] + U[z][y][x+1] + #pragma mint copyu,fromdevice,n+2),m+2),k+2)) #pragma mint copyu,todevice,n+2),m+2),k+2)) #pragma mint copyunew,todevice,n+2),m+2),k+2)) while t++ < T ) #pragmamintfor #pragma mint for nestall) tile16,16,64) chunksize1,1,64) for int z=1; z<= k; z++) for int y=1; y<= m; y++) for int x=1; x<= n; x++) c1 * U[z][y][x-1] + U[z][y][x+1] + Data parallel for loop #pragma mint copyu,fromdevice,n+2),m+2),k+2)) #pragma mint copyu,todevice,n+2),m+2),k+2)) #pragma mint copyunew,todevice,n+2),m+2),k+2)) #pragma #pragma mint mint parallel parallel while t++ < T ) #pragma mint for nestall) tile16,16,64) chunksize1,1,64) for int z=1; z<= k; z++) for int y=1; y<= m; y++) for int x=1; x<= n; x++) c1 * U[z][y][x-1] + U[z][y][x+1] + #pragma mint copyu,fromdevice,n+2),m+2),k+2)) Accelerated Region #pragma mint copyu,todevice,n+2),m+2),k+2)) #pragma depth mint of copyunew,todevice,n+2),m+2),k+2)) loop parallelism while t++ < T ) #pragmamintfornestall) #pragma mint for nestall) tile16,16,64) 
chunksize1,1,64) for int z=1; z<= k; z++) for int y=1; y<= m; y++) for int x=1; x<= n; x++) c1 * U[z][y][x-1] + U[z][y][x+1] + #pragma mint copyu,fromdevice,n+2),m+2),k+2)) 15 18

4 #pragma mint copyu,todevice,n+2),m+2),k+2)) #pragma depth mint of copyunew,todevice,n+2),m+2),k+2)) loop parallelism partitioning iteration space while t++ < T ) #pragmamintfornestall)tile16,16,64) #pragma mint for nestall) tile16,16,64) chunksize1,1,64) for int z=1; z<= k; z++) for int y=1; y<= m; y++) for int x=1; x<= n; x++) c1 * U[z][y][x-1] + U[z][y][x+1] + #pragma mint copyu,fromdevice,n+2),m+2),k+2)) #pragma mint copyu,todevice,n+2),m+2),k+2)) #pragma depth mint of copyunew,todevice,n+2),m+2),k+2)) loop parallelism partitioning iteration space while t++ < T ) #pragmamintfornestall)tile16,16,64)chunksize1,1,64) #pragma mint for nestall) tile16,16,64) chunksize1,1,64) for int z=1; z<= k; z++) for int y=1; y<= m; y++) for int x=1; x<= n; x++) c1 * U[z][y][x-1] + U[z][y][x+1] + #pragma mint copyu,fromdevice,n+2),m+2),k+2)) workload of a thread Fullyautomatedtranslationand optimizationsystem TransformationperformedontheAbstract SyntaxTree BuiltontopoftheROSEcompiler DevelopedatLLNL ROSEprovidesanAPIforgeneratingand manipulatingabstractsyntaxtrees isapartoftherosedistribution sincenov 11. Input code: C + ROSE Parser ROSE backend Output file Cuda src 23 ty 3D Grid Nx, Ny, Nz) threads tx/cx, ty/cy, tz/cz) Cuda thread Inputcode: C+ PragmaHandler BaselineTranslator MemoryManager ROSEParser LoopTransformer tz tile tx,ty,tz) chunksize cx,cy,cz) tx #pragma mint for nest#,all)tilet x,t y,t z ) chunksizec x,c y,c z ) ROSE backend KernelConfig Outliner ArgumentHandler WorkPractitioner ManagesdatadecompositionandthreadworkSassignment. Outputfile Cudasrc Optimizer 21 24

5 #pragmamintparallel whilet<t) t+=dt; #pragmamintfor fori=1;i<=n;i++) forj=1;j<=n;j++) A[i][j]=c*B[iJ1][j]+B[i+1][j] +B[i][jJ1]+B[i][j+1]); }//endofwhile }//endofparallelregion global void mint_1_1517cudapitchedptr ptr_du...) double* U = double *)ptr_du.ptr); int widthu = ptr_du.pitch / sizeofdouble ); int sliceu = ptr_du.ysize * widthu;... int _idx = threadidx.x + 1; int _gidx = _idx + blockdim.x * blockidx.x;... if _gidz >= 1 && _gidz <= k) if _gidy >= 1 && _gidy <= m) if _gidx >= 1 && _gidx <= n) Unew[indUnew] = c * U[indU] + c1 * U[indU - 1] + U[indU + 1]... ); Unpack pitched ptrs Compute local and global indices using thread and block IDs If-statements are derived from forstatements Each thread updates single data point 25 }//end of kernel 28 #pragmamintparallel whilet<t) t+=dt; #pragmamintfor fori=1;i<=n;i++) forj=1;j<=n;j++) A[i][j]=c*B[iJ1][j]+B[i+1][j] +B[i][jJ1]+B[i][j+1]); }//endofwhile }//endofparallelregion Host /* Outlined Kernel */ global void cuda_func ) }... Device AST #pragmamintparallel whilet<t) t+=dt;... /* Outlined Kernel */ global void cuda_func ) }... Device Thebaselinetranslator performsallthememoryreferences throughglobalmemory canstillparallelizeloops,launchkernel, performdatatransfers Baseline Translator Optimizer cuda_1_func_<<<threads, blocks>>> );... }//endofwhile Host }//endofparallelregion Optimizerfocusesonstencilmethods Reducesglobalmemoryaccesses DatalocalityusingonSchipmemory Stencil Analyzer OnSchipMemory Optimizer OnSchipmemoryoptimizationflags SpreferL1,Sregister,Sshared Bestperformancedependsonthedevice andapplication ModifiedAST 27

6 Analyzesarrayaccesspattern Findsstencilstructureanddependencybetweenthreads z y z y z y x Tradeoff:findthesweetspot Whichvariables?andHowmanyvariables? Maximizethetotalreductioninglobalmemoryreferences Minimizethesharedmemoryusage Planesmaycomefrom1ormorearrays 7-point 13-point 19-point Ghostcellregion SpreferL1 ConfiguresonSchipmemoryonFermi Favors48KBL1and16KBsharedmemory Sregister Takesadvantageoflargeregisterfile Placesfrequentlyaccessedarraysintoregisters Enhancesaccesstothecentralpointofastencil Shared Memory/L1 cache Register File Sshared Detectssharablereferencesamongthreads PlacestheminsharedmemorysoftwareSmanagedmemory) Anumberofplanesresideonsharedmemory tile 2D planes Tradeoff Reducesmemoryreferencestofrequentlyaccessedlocations Increasesresourcesneededbyathread,reducingconcurrency Gflops Heat Poisson Variable Poisson 19pt Tesla C6 Hand- Heat Poisson Variable Poisson 19pt Tesla C D.Unat,X.Cai,andS.Baden.":Realizingperformancein3DStencil MethodswithAnnotatedC",inInternationalConferenceonSupercomputing. ICS'11,Tucson,AZ,

7 45 Theoptimizerimprovestheperformance2.6xoverthebaseline. Hand Gflop/s Gflops Heat Poisson Variable Poisson 19pt Heat Poisson Variable Poisson 19pt Tesla C6 2.6x Tesla C 1 MPI OnTeslaC6,achieves79%ofthehandSoptimized. OnTeslaCFermi),achieves76%ofthehandSoptimized. EarthquakeSinducedseismicwavepropagation Tesla C GPU 7 32 MPI processes 6 Gflop/s YifengCuiandJunZhouatSDSC Refersto31threeSdimarrays asymmetric13spointstencil Timeconsumingloops:185lines Generatedcode:1185lines Hand 8 GordonBellPrizefinalistatSC baseline optimizer TheedcodeonsingleGPUisslightlyfasterthan32s. UsedbyresearchersSouthernCAEarthquakeCenter 32 MPI 4 nodes Petascaleanelasticwavepropagationcode 16 MPI 8 Nehalem s / node 37 8 MPI D.Unat,J.Zhou,Y.Cui,X.Cai,andS.Baden. Acceleratinga3DFinite DifferenceEarthquakeSimulationwithaCMtoMTranslator,inComputing inscienceandengineeringjournal,12. 1 MPI 8 MPI 16 MPI 32 MPI 8 Nehalem s / node baseline optimizer Hand Tesla C GPU 4 nodes Gflop/s Gflop/s Theedcodeachieves83%ofthehandSoptimized. 8 83% 1 MPI 8 MPI 16 MPI 32 MPI 8 Nehalem s / node 4 nodes baseline optimizer Hand Tesla C GPU 1 MPI 8 MPI 16 MPI 32 MPI 8 Nehalem s / node 39 4 nodes baseline optimizer Hand Tesla C GPU 42

8 Computervisionalgorithm Detectscornersandhighintensitypoints CollaboratedwithHanKim&JürgenSchulze Inserted5linesofcodeintothe originalcode~3lines) RealStimeperformancewith S22xperformanceof8Nehalems runningopenmpthreads TeslaCvsIntelXeonE54Nehalem D.Unat,H.S.Kim,J.Schulze,S.B.Baden, AutoMoptimizationofaFeature SelectionAlgorithm,inEmergingApplicationsandManySArchitecture, EAMAWorkshopcoSlocatedwithISCA,SanJose,CA, ProgrammingModel AddressestheprogrammabilityissueofGPUs " Today smassivelyparallelarchitectures " SoftwareSmanagedstorageandmassivelyparallelchip SourceStoSsourcetranslatorandoptimizer IncorporatesmotifSspecificknowledge Achievedaround8%ofthehandSoptimized " BothcommonlySusedkernelsandrealSworldapplications Availablefordownload Ourprojectwebsite " OnlineTranslator: " 46 OpenMPCextendsOpenMPtosupport Parallelizesonlyouterloopandnosharedmemoryoptimization CommercialcompilerfromPortlandGroupPGI) GeneralSpurposecompiler Gflops OpenMPC PGI HandD HeatEqn OpenACC CollaborativeeffortfromPGI,Cray,CAPs,Nvidia ShowsthatdirectiveSbasedmodelisapromisingapproach AllresultsareobtainedonTeslaC6.PGIv11.1)results:ontheLincolnsystem. 44 ScottBadenUCSD), XingCaiSimula), AllanSnavelySDSC), HanSukKimUCSD,Apple), JürgenSchulzeUCSD), YifengCuiSDSC), JunZhouSDSC), WenjieWeiSimula), RossWalkerSDSC), DanQuinlanLLNL), ROSETeamLLNL), PauliusMicikeviciusNvidia), EverettPhillipsNvidia) 47 TargetmultipleGPUs MPIcodegeneration ExtendforIntelManyIntegratedCoreMIC) Sameexecutionmodel Newclausesorcompileroptions " Registerblocking,SSEinstructions,softwareprefetching Soffload=[mic apu cuda] Maintainsinglecodebase? Main Memory Main Memory [Shalf,VecPar ]JohnShalf,SudipDosanjh,andJohnMorrison.Exascalecomputing technologychallenges.inproceedingsofthe9thinternationalconferenceonhigh PerformanceComputingforComputationalScience,VECPAR. 
[Lee,ISCA ]VictorW.Lee,ChangkyuKim,JatinChhugani,MichaelDeisher,Daehyun Kim,AnthonyD.Nguyen,NadathurSatish,MikhailSmelyanskiy,SrinivasChennuSpaty,Per Hammarlund,RonakSinghal,andPradeepDubey.DebunkingtheXGPUvs.CPUmyth: anevaluationofthroughputcomputingoncpuandgpu.sigarchcomput. [Unat,Para ]DidemUnat,XingCai,andScottBaden.OptimizingtheAlievSPanfilov modelofcardiacexcitationonheterogeneoussystems.para:stateoftheartin ScientificandParallelComputing [Lee]SeyongLee,SeungSJaiMin,andRudolfEigenmann.OpenMPtoGPGPU:acompiler frameworkforautomatictranslationandoptimization.inproceedingsofthe14thacm SIGPLANSymposiumonPrinciplesandPracticeofParallelProgramming,PPoPP 9 [William]SamuelWebbWilliams.AutoStuningPerformanceonMultiComputers.PhD thesis,eecsdepartment,universityofcalifornia,berkeley,dec8 [Carrington]LauraCarrington,MustafaM.Tikir,CatherineOlschanowsky,Michael Laurenzano,JoshuaPeraza,AllanSnavely,andStephenPoole.AnidiomSfindingtoolfor increasingproductivityofaccelerators.inproceedingsoftheinternationalconferenceon Supercomputing,ICS 11 [Datta]KaushikDatta,MarkMurphy,VasilyVolkov,SamuelWilliams,JonathanCarter, LeonidOliker,DavidPatterson,JohnShalf,andKatherineYelick.Stencilcomputation optimizationandautostuningonstatesofsthesartmultiarchitectures.insc

Didem Unat, Xing Cai, Scott Baden

Didem Unat, Xing Cai, Scott Baden Didem Unat, Xing Cai, Scott Baden GPUs are an effective means of accelerating data-parallel applications. However, porting algorithms to a GPU still remains a challenge. A directive-based programming model

More information

Lecture 10. Stencil Methods Limits to Performance

Lecture 10. Stencil Methods Limits to Performance Lecture 10 Stencil Methods Limits to Performance Announcements Tuesday's lecture on 2/11 will be moved to room 4140 from 6.30 PM to 7.50 PM Office hours are on for today Scott B. Baden /CSE 260/ Winter

More information

CUDA Memory Types All material not from online sources/textbook copyright Travis Desell, 2012

CUDA Memory Types All material not from online sources/textbook copyright Travis Desell, 2012 CUDA Memory Types All material not from online sources/textbook copyright Travis Desell, 2012 Overview 1. Memory Access Efficiency 2. CUDA Memory Types 3. Reducing Global Memory Traffic 4. Example: Matrix-Matrix

More information

Module Memory and Data Locality

Module Memory and Data Locality GPU Teaching Kit Accelerated Computing Module 4.4 - Memory and Data Locality Tiled Matrix Multiplication Kernel Objective To learn to write a tiled matrix-multiplication kernel Loading and using tiles

More information

Physis: An Implicitly Parallel Framework for Stencil Computations

Physis: An Implicitly Parallel Framework for Stencil Computations Physis: An Implicitly Parallel Framework for Stencil Computations Naoya Maruyama RIKEN AICS (Formerly at Tokyo Tech) GTC12, May 2012 1 è Good performance with low programmer productivity Multi-GPU Application

More information

Introduction to GPGPU and GPU-architectures

Introduction to GPGPU and GPU-architectures Introduction to GPGPU and GPU-architectures Henk Corporaal Gert-Jan van den Braak http://www.es.ele.tue.nl/ Contents 1. What is a GPU 2. Programming a GPU 3. GPU thread scheduling 4. GPU performance bottlenecks

More information

GPU programming CUDA C. GPU programming,ii. COMP528 Multi-Core Programming. Different ways:

GPU programming CUDA C. GPU programming,II. COMP528 Multi-Core Programming. Different ways: COMP528 Multi-Core Programming GPU programming,II www.csc.liv.ac.uk/~alexei/comp528 Alexei Lisitsa Dept of computer science University of Liverpool a.lisitsa@liverpool.ac.uk Different ways: GPU programming

More information

OpenACC. Part I. Ned Nedialkov. McMaster University Canada. October 2016

OpenACC. Part I. Ned Nedialkov. McMaster University Canada. October 2016 OpenACC. Part I Ned Nedialkov McMaster University Canada October 2016 Outline Introduction Execution model Memory model Compiling pgaccelinfo Example Speedups Profiling c 2016 Ned Nedialkov 2/23 Why accelerators

More information

Device Memories and Matrix Multiplication

Device Memories and Matrix Multiplication Device Memories and Matrix Multiplication 1 Device Memories global, constant, and shared memories CUDA variable type qualifiers 2 Matrix Multiplication an application of tiling runningmatrixmul in the

More information

MPI_Send(a,..., MPI_COMM_WORLD); MPI_Recv(a,..., MPI_COMM_WORLD, &status);

MPI_Send(a,..., MPI_COMM_WORLD); MPI_Recv(a,..., MPI_COMM_WORLD, &status); $ $ 2 global void kernel(int a[max], int llimit, int ulimit) {... } : int main(int argc, char *argv[]){ MPI_Int(&argc, &argc); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size);

More information

OpenACC programming for GPGPUs: Rotor wake simulation

OpenACC programming for GPGPUs: Rotor wake simulation DLR.de Chart 1 OpenACC programming for GPGPUs: Rotor wake simulation Melven Röhrig-Zöllner, Achim Basermann Simulations- und Softwaretechnik DLR.de Chart 2 Outline Hardware-Architecture (CPU+GPU) GPU computing

More information

Automatic translation from CUDA to C++ Luca Atzori, Vincenzo Innocente, Felice Pantaleo, Danilo Piparo

Automatic translation from CUDA to C++ Luca Atzori, Vincenzo Innocente, Felice Pantaleo, Danilo Piparo Automatic translation from CUDA to C++ Luca Atzori, Vincenzo Innocente, Felice Pantaleo, Danilo Piparo 31 August, 2015 Goals Running CUDA code on CPUs. Why? Performance portability! A major challenge faced

More information

Lecture 15: Introduction to GPU programming. Lecture 15: Introduction to GPU programming p. 1

Lecture 15: Introduction to GPU programming. Lecture 15: Introduction to GPU programming p. 1 Lecture 15: Introduction to GPU programming Lecture 15: Introduction to GPU programming p. 1 Overview Hardware features of GPGPU Principles of GPU programming A good reference: David B. Kirk and Wen-mei

More information

Introduction to Parallel Computing with CUDA. Oswald Haan

Introduction to Parallel Computing with CUDA. Oswald Haan Introduction to Parallel Computing with CUDA Oswald Haan ohaan@gwdg.de Schedule Introduction to Parallel Computing with CUDA Using CUDA CUDA Application Examples Using Multiple GPUs CUDA Application Libraries

More information

CRAY XK6 REDEFINING SUPERCOMPUTING. - Sanjana Rakhecha - Nishad Nerurkar

CRAY XK6 REDEFINING SUPERCOMPUTING. - Sanjana Rakhecha - Nishad Nerurkar CRAY XK6 REDEFINING SUPERCOMPUTING - Sanjana Rakhecha - Nishad Nerurkar CONTENTS Introduction History Specifications Cray XK6 Architecture Performance Industry acceptance and applications Summary INTRODUCTION

More information

Review. Lecture 10. Today s Outline. Review. 03b.cu. 03?.cu CUDA (II) Matrix addition CUDA-C API

Review. Lecture 10. Today s Outline. Review. 03b.cu. 03?.cu CUDA (II) Matrix addition CUDA-C API Review Lecture 10 CUDA (II) host device CUDA many core processor threads thread blocks grid # threads >> # of cores to be efficient Threads within blocks can cooperate Threads between thread blocks cannot

More information

Tiled Matrix Multiplication

Tiled Matrix Multiplication Basic Matrix Multiplication Kernel global void MatrixMulKernel(int m, int n, int k, float* A, float* B, float* C) { int Row = blockIdx.y*blockDim.y+threadIdx.y;

More information

Introduction to GPU programming. Introduction to GPU programming p. 1/17

Introduction to GPU programming. Introduction to GPU programming p. 1/17 Introduction to GPU programming Introduction to GPU programming p. 1/17 Introduction to GPU programming p. 2/17 Overview GPUs & computing Principles of CUDA programming One good reference: David B. Kirk

More information

ACCELERATING THE PRODUCTION OF SYNTHETIC SEISMOGRAMS BY A MULTICORE PROCESSOR CLUSTER WITH MULTIPLE GPUS

ACCELERATING THE PRODUCTION OF SYNTHETIC SEISMOGRAMS BY A MULTICORE PROCESSOR CLUSTER WITH MULTIPLE GPUS ACCELERATING THE PRODUCTION OF SYNTHETIC SEISMOGRAMS BY A MULTICORE PROCESSOR CLUSTER WITH MULTIPLE GPUS Ferdinando Alessi Annalisa Massini Roberto Basili INGV Introduction The simulation of wave propagation

More information

Dense Linear Algebra. HPC - Algorithms and Applications

Dense Linear Algebra. HPC - Algorithms and Applications Dense Linear Algebra HPC - Algorithms and Applications Alexander Pöppl Technical University of Munich Chair of Scientific Computing November 6 th 2017 Last Tutorial CUDA Architecture thread hierarchy:

More information

CUDA. Matthew Joyner, Jeremy Williams

CUDA. Matthew Joyner, Jeremy Williams CUDA Matthew Joyner, Jeremy Williams Agenda What is CUDA? CUDA GPU Architecture CPU/GPU Communication Coding in CUDA Use cases of CUDA Comparison to OpenCL What is CUDA? What is CUDA? CUDA is a parallel

More information

Parallel Numerical Algorithms

Parallel Numerical Algorithms Parallel Numerical Algorithms http://sudalab.is.s.u-tokyo.ac.jp/~reiji/pna14/ [ 10 ] GPU and CUDA Parallel Numerical Algorithms / IST / UTokyo 1 PNA16 Lecture Plan General Topics 1. Architecture and Performance

More information

Outline 2011/10/8. Memory Management. Kernels. Matrix multiplication. CIS 565 Fall 2011 Qing Sun

Outline 2011/10/8. Memory Management. Kernels. Matrix multiplication. CIS 565 Fall 2011 Qing Sun Outline Memory Management CIS 565 Fall 2011 Qing Sun sunqing@seas.upenn.edu Kernels Matrix multiplication Managing Memory CPU and GPU have separate memory spaces Host (CPU) code manages device (GPU) memory

More information

CS/CoE 1541 Final exam (Fall 2017). This is the cumulative final exam given in the Fall of Question 1 (12 points): was on Chapter 4

CS/CoE 1541 Final exam (Fall 2017). This is the cumulative final exam given in the Fall of Question 1 (12 points): was on Chapter 4 CS/CoE 1541 Final exam (Fall 2017). Name: This is the cumulative final exam given in the Fall of 2017. Question 1 (12 points): was on Chapter 4 Question 2 (13 points): was on Chapter 4 For Exam 2, you

More information

1/25/12. Administrative

1/25/12. Administrative Administrative L3: Memory Hierarchy Optimization I, Locality and Data Placement Next assignment due Friday, 5 PM Use handin program on CADE machines handin CS6235 lab1 TA: Preethi Kotari - Email:

More information

OpenACC Fundamentals. Steve Abbott November 15, 2017

OpenACC Fundamentals. Steve Abbott November 15, 2017 OpenACC Fundamentals Steve Abbott , November 15, 2017 AGENDA Data Regions Deep Copy 2 while ( err > tol && iter < iter_max ) { err=0.0; JACOBI ITERATION #pragma acc parallel loop reduction(max:err)

More information

Auto-Generation and Auto-Tuning of 3D Stencil Codes on GPU Clusters

Auto-Generation and Auto-Tuning of 3D Stencil Codes on GPU Clusters Auto-Generation and Auto-Tuning of 3D Stencil s on GPU Clusters Yongpeng Zhang, Frank Mueller North Carolina State University CGO 2012 Outline Motivation DSL front-end and Benchmarks Framework Experimental

More information

GPU Memory Memory issue for CUDA programming

GPU Memory Memory issue for CUDA programming Memory issue for CUDA programming Variable declaration Memory Scope Lifetime device local int LocalVar; local thread thread device shared int SharedVar; shared block block device int GlobalVar; global

More information

Introduction CPS343. Spring Parallel and High Performance Computing. CPS343 (Parallel and HPC) Introduction Spring / 29

Introduction CPS343. Spring Parallel and High Performance Computing. CPS343 (Parallel and HPC) Introduction Spring / 29 Introduction CPS343 Parallel and High Performance Computing Spring 2018 CPS343 (Parallel and HPC) Introduction Spring 2018 1 / 29 Outline 1 Preface Course Details Course Requirements 2 Background Definitions

More information

GPU Performance Nuggets

GPU Performance Nuggets GPU Performance Nuggets Simon Garcia de Gonzalo & Carl Pearson PhD Students, IMPACT Research Group Advised by Professor Wen-mei Hwu Jun. 15, 2016 grcdgnz2@illinois.edu pearson@illinois.edu GPU Performance

More information

CSE 599 I Accelerated Computing - Programming GPUS. Memory performance

CSE 599 I Accelerated Computing - Programming GPUS. Memory performance CSE 599 I Accelerated Computing - Programming GPUS Memory performance GPU Teaching Kit Accelerated Computing Module 6.1 Memory Access Performance DRAM Bandwidth Objective To learn that memory bandwidth

More information

Lecture 8: GPU Programming. CSE599G1: Spring 2017

Lecture 8: GPU Programming. CSE599G1: Spring 2017 Lecture 8: GPU Programming CSE599G1: Spring 2017 Announcements Project proposal due on Thursday (4/28) 5pm. Assignment 2 will be out today, due in two weeks. Implement GPU kernels and use cublas library

More information

Unrolling parallel loops

Unrolling parallel loops Unrolling parallel loops Vasily Volkov UC Berkeley November 14, 2011 1 Today Very simple optimization technique Closely resembles loop unrolling Widely used in high performance codes 2 Mapping to GPU:

More information

Computational Fluid Dynamics (CFD) using Graphics Processing Units

Computational Fluid Dynamics (CFD) using Graphics Processing Units Computational Fluid Dynamics (CFD) using Graphics Processing Units Aaron F. Shinn Mechanical Science and Engineering Dept., UIUC Accelerators for Science and Engineering Applications: GPUs and Multicores

More information

OpenACC. Introduction and Evolutions Sebastien Deldon, GPU Compiler engineer

OpenACC. Introduction and Evolutions Sebastien Deldon, GPU Compiler engineer OpenACC Introduction and Evolutions Sebastien Deldon, GPU Compiler engineer 3 WAYS TO ACCELERATE APPLICATIONS Applications Libraries Compiler Directives Programming Languages Easy to use Most Performance

More information

Parallel Computing. Lecture 19: CUDA - I

Parallel Computing. Lecture 19: CUDA - I CSCI-UA.0480-003 Parallel Computing Lecture 19: CUDA - I Mohamed Zahran (aka Z) mzahran@cs.nyu.edu http://www.mzahran.com GPU w/ local DRAM (device) Behind CUDA CPU (host) Source: http://hothardware.com/reviews/intel-core-i5-and-i7-processors-and-p55-chipset/?page=4

More information

Programming in CUDA. Malik M Khan

Programming in CUDA. Malik M Khan Programming in CUDA October 21, 2010 Malik M Khan Outline Reminder of CUDA Architecture Execution Model - Brief mention of control flow Heterogeneous Memory Hierarchy - Locality through data placement

More information

GPU Acceleration of the Longwave Rapid Radiative Transfer Model in WRF using CUDA Fortran. G. Ruetsch, M. Fatica, E. Phillips, N.

GPU Acceleration of the Longwave Rapid Radiative Transfer Model in WRF using CUDA Fortran. G. Ruetsch, M. Fatica, E. Phillips, N. GPU Acceleration of the Longwave Rapid Radiative Transfer Model in WRF using CUDA Fortran G. Ruetsch, M. Fatica, E. Phillips, N. Juffa Outline WRF and RRTM Previous Work CUDA Fortran Features RRTM in CUDA

More information

GPU Architecture and Programming. Andrei Doncescu inspired by NVIDIA

GPU Architecture and Programming. Andrei Doncescu inspired by NVIDIA GPU Architecture and Programming Andrei Doncescu inspired by NVIDIA Traditional Computing Von Neumann architecture: instructions are sent from memory to the CPU Serial execution: Instructions are executed

More information

OpenACC. Part 2. Ned Nedialkov. McMaster University Canada. CS/SE 4F03 March 2016

OpenACC. Part 2. Ned Nedialkov. McMaster University Canada. CS/SE 4F03 March 2016 OpenACC. Part 2 Ned Nedialkov McMaster University Canada CS/SE 4F03 March 2016 Outline parallel construct Gang loop Worker loop Vector loop kernels construct kernels vs. parallel Data directives c 2013

More information

Cartoon parallel architectures; CPUs and GPUs

Cartoon parallel architectures; CPUs and GPUs Cartoon parallel architectures; CPUs and GPUs CSE 6230, Fall 2014 Th Sep 11! Thanks to Jee Choi (a senior PhD student) for a big assist 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ~ socket 14 ~ core 14 ~ HWMT+SIMD

More information

NVIDIA GTX200: TeraFLOPS Visual Computing. August 26, 2008 John Tynefield

NVIDIA GTX200: TeraFLOPS Visual Computing. August 26, 2008 John Tynefield NVIDIA GTX200: TeraFLOPS Visual Computing August 26, 2008 John Tynefield 2 Outline Execution Model Architecture Demo 3 Execution Model 4 Software Architecture Applications DX10 OpenGL OpenCL CUDA C Host

More information

CSE 591: GPU Programming. Memories. Klaus Mueller. Computer Science Department Stony Brook University

CSE 591: GPU Programming. Memories. Klaus Mueller. Computer Science Department Stony Brook University CSE 591: GPU Programming Memories Klaus Mueller Computer Science Department Stony Brook University Importance of Memory Access Efficiency Every loop iteration has two global memory accesses two floating

More information

Tesla Architecture, CUDA and Optimization Strategies

Tesla Architecture, CUDA and Optimization Strategies Tesla Architecture, CUDA and Optimization Strategies Lan Shi, Li Yi & Liyuan Zhang Hauptseminar: Multicore Architectures and Programming Page 1 Outline Tesla Architecture & CUDA CUDA Programming Optimization

More information

GPU Memory. GPU Memory. Memory issue for CUDA programming. Copyright 2013 Yong Cao, Referencing UIUC ECE498AL Course Notes

GPU Memory. GPU Memory. Memory issue for CUDA programming. Copyright 2013 Yong Cao, Referencing UIUC ECE498AL Course Notes Memory issue for CUDA programming CUDA Variable Type Qualifiers Variable declaration Memory Scope Lifetime device local int LocalVar; local thread thread device shared int SharedVar; shared block block

More information

Persistent RNNs. (stashing recurrent weights on-chip) Gregory Diamos. April 7, Baidu SVAIL

Persistent RNNs. (stashing recurrent weights on-chip) Gregory Diamos. April 7, Baidu SVAIL (stashing recurrent weights on-chip) Baidu SVAIL April 7, 2016 SVAIL Think hard AI. Goal Develop hard AI technologies that impact 100 million users. Deep Learning at SVAIL 100 GFLOP/s 1 laptop 6 TFLOP/s

More information

E6895 Advanced Big Data Analytics Lecture 8: GPU Examples and GPU on ios devices

E6895 Advanced Big Data Analytics Lecture 8: GPU Examples and GPU on ios devices E6895 Advanced Big Data Analytics Lecture 8: GPU Examples and GPU on ios devices Ching-Yung Lin, Ph.D. Adjunct Professor, Dept. of Electrical Engineering and Computer Science IBM Chief Scientist, Graph

More information

Lecture 7. Overlap Using Shared Memory Performance programming the memory hierarchy

Lecture 7. Overlap Using Shared Memory Performance programming the memory hierarchy Lecture 7 Overlap Using Shared Memory Performance programming the memory hierarchy Announcements Mac Mini lab (APM 2402) Starts Tuesday Every Tues and Fri for the next 2 weeks Project proposals due on

More information

GPU Programming. Performance Considerations. Miaoqing Huang University of Arkansas Fall / 60

GPU Programming. Performance Considerations. Miaoqing Huang University of Arkansas Fall / 60 1 / 60 GPU Programming Performance Considerations Miaoqing Huang University of Arkansas Fall 2013 2 / 60 Outline Control Flow Divergence Memory Coalescing Shared Memory Bank Conflicts Occupancy Loop Unrolling

More information

COSC 6374 Parallel Computations Introduction to CUDA

COSC 6374 Parallel Computations Introduction to CUDA COSC 6374 Parallel Computations Introduction to CUDA Edgar Gabriel Fall 2014 Disclaimer Material for this lecture has been adopted based on various sources Matt Heavener, CS, State Univ. of NY at Buffalo

More information

GREAT PERFORMANCE FOR TINY PROBLEMS: BATCHED PRODUCTS OF SMALL MATRICES. Nikolay Markovskiy Peter Messmer

GREAT PERFORMANCE FOR TINY PROBLEMS: BATCHED PRODUCTS OF SMALL MATRICES. Nikolay Markovskiy Peter Messmer GREAT PERFORMANCE FOR TINY PROBLEMS: BATCHED PRODUCTS OF SMALL MATRICES Nikolay Markovskiy Peter Messmer ABOUT CP2K Atomistic and molecular simulations of solid state From ab initio DFT and Hartree-Fock

More information

CS 470 Spring Other Architectures. Mike Lam, Professor. (with an aside on linear algebra)

CS 470 Spring Other Architectures. Mike Lam, Professor. (with an aside on linear algebra) CS 470 Spring 2016 Mike Lam, Professor Other Architectures (with an aside on linear algebra) Parallel Systems Shared memory (uniform global address space) Primary story: make faster computers Programming

More information

GPGPU Offloading with OpenMP 4.5 In the IBM XL Compiler

GPGPU Offloading with OpenMP 4.5 In the IBM XL Compiler GPGPU Offloading with OpenMP 4.5 In the IBM XL Compiler Taylor Lloyd Jose Nelson Amaral Ettore Tiotto University of Alberta University of Alberta IBM Canada 1 Why? 2 Supercomputer Power/Performance GPUs

More information

Massively Parallel Architectures

Massively Parallel Architectures Massively Parallel Architectures A Take on Cell Processor and GPU programming Joel Falcou - LRI joel.falcou@lri.fr Bat. 490 - Bureau 104 20 janvier 2009 Motivation The CELL processor Harder,Better,Faster,Stronger

More information

A case study of performance portability with OpenMP 4.5

A case study of performance portability with OpenMP 4.5 A case study of performance portability with OpenMP 4.5 Rahul Gayatri, Charlene Yang, Thorsten Kurth, Jack Deslippe NERSC pre-print copy 1 Outline General Plasmon Pole (GPP) application from BerkeleyGW

More information

LECTURE ON PASCAL GPU ARCHITECTURE. Jiri Kraus, November 14 th 2016

LECTURE ON PASCAL GPU ARCHITECTURE. Jiri Kraus, November 14 th 2016 LECTURE ON PASCAL GPU ARCHITECTURE Jiri Kraus, November 14 th 2016 ACCELERATED COMPUTING CPU Optimized for Serial Tasks GPU Accelerator Optimized for Parallel Tasks 2 ACCELERATED COMPUTING CPU Optimized

More information

60x Computational Fluid Dynamics and Visualisation

60x Computational Fluid Dynamics and Visualisation 60x Computational Fluid Dynamics and Visualisation Jamil Appa BAE Systems Advanced Technology Centre 1 Outline BAE Systems - Introduction Aerodynamic Design Challenges Why GPUs? CFD on GPUs Example Kernel

More information

Image Processing Optimization C# on GPU with Hybridizer

Image Processing Optimization C# on GPU with Hybridizer Image Processing Optimization C# on GPU with Hybridizer regis.portalez@altimesh.com Median Filter Denoising Noisy image (lena 1960x1960) Denoised image window = 3 2 Median Filter Denoising window Output[i,j]=

More information

High Performance Linear Algebra on Data Parallel Co-Processors I

High Performance Linear Algebra on Data Parallel Co-Processors I 926535897932384626433832795028841971693993754918980183 592653589793238462643383279502884197169399375491898018 415926535897932384626433832795028841971693993754918980 592653589793238462643383279502884197169399375491898018

More information

Parallelization on GPU using CUDA An Introduction

Parallelization on GPU using CUDA An Introduction Parallelization on GPU using CUDA An Introduction Ehsan Nedaaee Oskoee 1 1 Department of Physics IASBS IPM Grid and HPC workshop IV, 2011 Outline 1 Introduction to GPU 2 Introduction to CUDA Graphics Processing

More information

Graphics Processing Unit (GPU) Acceleration of Machine Vision Software for Space Flight Applications

Graphics Processing Unit (GPU) Acceleration of Machine Vision Software for Space Flight Applications Graphics Processing Unit (GPU) Acceleration of Machine Vision Software for Space Flight Applications Workshop on Space Flight Software November 6, 2009 Brent Tweddle Massachusetts Institute of Technology

More information

Developing PIC Codes for the Next Generation Supercomputer using GPUs. Viktor K. Decyk UCLA

Developing PIC Codes for the Next Generation Supercomputer using GPUs. Viktor K. Decyk UCLA Developing PIC Codes for the Next Generation Supercomputer using GPUs Viktor K. Decyk UCLA Abstract The current generation of supercomputer (petaflops scale) cannot be scaled up to exaflops (1000 petaflops),

More information

Profiling and Parallelizing with the OpenACC Toolkit OpenACC Course: Lecture 2 October 15, 2015

Profiling and Parallelizing with the OpenACC Toolkit OpenACC Course: Lecture 2 October 15, 2015 Profiling and Parallelizing with the OpenACC Toolkit OpenACC Course: Lecture 2 October 15, 2015 Oct 1: Introduction to OpenACC Oct 6: Office Hours Oct 15: Profiling and Parallelizing with the OpenACC Toolkit

More information

GPU Computing: Development and Analysis. Part 1. Anton Wijs Muhammad Osama. Marieke Huisman Sebastiaan Joosten

GPU Computing: Development and Analysis. Part 1. Anton Wijs Muhammad Osama. Marieke Huisman Sebastiaan Joosten GPU Computing: Development and Analysis Part 1 Anton Wijs Muhammad Osama Marieke Huisman Sebastiaan Joosten NLeSC GPU Course Rob van Nieuwpoort & Ben van Werkhoven Who are we? Anton Wijs Assistant professor,

More information

Designing a Domain-specific Language to Simulate Particles. dan bailey

Designing a Domain-specific Language to Simulate Particles. dan bailey Designing a Domain-specific Language to Simulate Particles dan bailey Double Negative Largest Visual Effects studio in Europe Offices in London and Singapore Large and growing R & D team Squirt Fluid Solver

More information

Lecture 3: Introduction to CUDA

Lecture 3: Introduction to CUDA CSCI-GA.3033-004 Graphics Processing Units (GPUs): Architecture and Programming Lecture 3: Introduction to CUDA Some slides here are adopted from: NVIDIA teaching kit Mohamed Zahran (aka Z) mzahran@cs.nyu.edu

More information

KernelGen a toolchain for automatic GPU-centric applications porting. Nicolas Lihogrud Dmitry Mikushin Andrew Adinets

KernelGen a toolchain for automatic GPU-centric applications porting. Nicolas Lihogrud Dmitry Mikushin Andrew Adinets P A R A L L E L C O M P U T A T I O N A L T E C H N O L O G I E S ' 2 0 1 2 KernelGen a toolchain for automatic GPU-centric applications porting Nicolas Lihogrud Dmitry Mikushin Andrew Adinets Contents

More information

GPGPU: Multi-GPU Programming

GPGPU: Multi-GPU Programming Fall 2012 HW4 global void cuda_transpose(const float *ad, const int n, float *atd) { } int i = threadidx.y + blockidx.y*blockdim.y; int j = threadidx.x + blockidx.x*blockdim.x;

More information

Technische Universität München. GPU Programming. Rüdiger Westermann Chair for Computer Graphics & Visualization. Faculty of Informatics

Technische Universität München. GPU Programming. Rüdiger Westermann Chair for Computer Graphics & Visualization. Faculty of Informatics GPU Programming Rüdiger Westermann Chair for Computer Graphics & Visualization Faculty of Informatics Overview Programming interfaces and support libraries The CUDA programming abstraction An in-depth

More information

Recent Advances in Heterogeneous Computing using Charm++

Recent Advances in Heterogeneous Computing using Charm++ Recent Advances in Heterogeneous Computing using Charm++ Jaemin Choi, Michael Robson Parallel Programming Laboratory University of Illinois Urbana-Champaign April 12, 2018 1 / 24 Heterogeneous Computing

More information

Warps and Reduction Algorithms

Warps and Reduction Algorithms Warps and Reduction Algorithms 1 more on Thread Execution block partitioning into warps single-instruction, multiple-thread, and divergence 2 Parallel Reduction Algorithms computing the sum or the maximum

More information

By: Tomer Morad Based on: Erik Lindholm, John Nickolls, Stuart Oberman, John Montrym. NVIDIA TESLA: A UNIFIED GRAPHICS AND COMPUTING ARCHITECTURE In IEEE Micro 28(2), 2008 } } Erik Lindholm, John Nickolls,

More information

S4289: Efficient solution of multiple scalar and block-tridiagonal equations

S4289: Efficient solution of multiple scalar and block-tridiagonal equations S4289: Efficient solution of multiple scalar and block-tridiagonal equations Endre László endre.laszlo [at] oerc.ox.ac.uk Oxford e-research Centre, University of Oxford, UK Pázmány Péter Catholic University,

More information

Introduction to Parallel Programming

Introduction to Parallel Programming Introduction to Parallel Programming Pablo Brubeck Department of Physics Tecnologico de Monterrey October 14, 2016 Student Chapter Tecnológico de Monterrey Tecnológico de Monterrey Student Chapter Outline

More information

LLVM supported source-to-source translation. Translation from annotated C/C++ to CUDA C/C++

LLVM supported source-to-source translation. Translation from annotated C/C++ to CUDA C/C++ LLVM supported source-to-source translation Translation from annotated C/C++ to CUDA C/C++ Niklas Jacobsen Master s Thesis Autumn 2016 LLVM supported source-to-source translation Niklas Jacobsen 1st November

More information

INTRODUCTION TO OPENACC. Analyzing and Parallelizing with OpenACC, Feb 22, 2017

INTRODUCTION TO OPENACC. Analyzing and Parallelizing with OpenACC, Feb 22, 2017 INTRODUCTION TO OPENACC Analyzing and Parallelizing with OpenACC, Feb 22, 2017 Objective: Enable you to to accelerate your applications with OpenACC. 2 Today s Objectives Understand what OpenACC is and

More information

CSC266 Introduction to Parallel Computing using GPUs Introduction to CUDA

CSC266 Introduction to Parallel Computing using GPUs Introduction to CUDA CSC266 Introduction to Parallel Computing using GPUs Introduction to CUDA Sreepathi Pai October 18, 2017 URCS Outline Background Memory Code Execution Model Outline Background Memory Code Execution Model

More information

OpenACC introduction (part 2)

OpenACC introduction (part 2) OpenACC introduction (part 2) Aleksei Ivakhnenko APC Contents Understanding PGI compiler output Compiler flags and environment variables Compiler limitations in dependencies tracking Organizing data persistence

More information

COSC 462 Parallel Programming

COSC 462 Parallel Programming November 22, 2017 1/12 COSC 462 Parallel Programming CUDA Beyond Basics Piotr Luszczek Mixing Blocks and Threads int N = 100, SN = N * sizeof(double); global void sum(double *a, double *b, double *c) {

More information

ALYA Multi-Physics System on GPUs: Offloading Large-Scale Computational Mechanics Problems

ALYA Multi-Physics System on GPUs: Offloading Large-Scale Computational Mechanics Problems www.bsc.es ALYA Multi-Physics System on GPUs: Offloading Large-Scale Computational Mechanics Problems Vishal Mehta Engineer, Barcelona Supercomputing Center vishal.mehta@bsc.es Training BSC/UPC GPU Centre

More information

CS 677: Parallel Programming for Many-core Processors Lecture 6

CS 677: Parallel Programming for Many-core Processors Lecture 6 1 CS 677: Parallel Programming for Many-core Processors Lecture 6 Instructor: Philippos Mordohai Webpage: www.cs.stevens.edu/~mordohai E-mail: Philippos.Mordohai@stevens.edu Logistics Midterm: March 11

More information

Particle-in-Cell Simulations on Modern Computing Platforms. Viktor K. Decyk and Tajendra V. Singh UCLA

Particle-in-Cell Simulations on Modern Computing Platforms. Viktor K. Decyk and Tajendra V. Singh UCLA Particle-in-Cell Simulations on Modern Computing Platforms Viktor K. Decyk and Tajendra V. Singh UCLA Outline of Presentation Abstraction of future computer hardware PIC on GPUs OpenCL and Cuda Fortran

More information

Introduction to GPU Computing Using CUDA. Spring 2014 Westgid Seminar Series

Introduction to GPU Computing Using CUDA. Spring 2014 Westgid Seminar Series Introduction to GPU Computing Using CUDA Spring 2014 Westgid Seminar Series Scott Northrup SciNet www.scinethpc.ca (Slides http://support.scinet.utoronto.ca/ northrup/westgrid CUDA.pdf) March 12, 2014

More information

Introduction to GPU Computing Using CUDA. Spring 2014 Westgid Seminar Series

Introduction to GPU Computing Using CUDA. Spring 2014 Westgid Seminar Series Introduction to GPU Computing Using CUDA Spring 2014 Westgid Seminar Series Scott Northrup SciNet www.scinethpc.ca March 13, 2014 Outline 1 Heterogeneous Computing 2 GPGPU - Overview Hardware Software

More information

Hardware/Software Co-Design

Hardware/Software Co-Design 1 / 13 Hardware/Software Co-Design Review so far Miaoqing Huang University of Arkansas Fall 2011 2 / 13 Problem I A student mentioned that he was able to multiply two 1,024 1,024 matrices using a tiled

More information

CSE 160 Lecture 24. Graphical Processing Units

CSE 160 Lecture 24. Graphical Processing Units CSE 160 Lecture 24 Graphical Processing Units Announcements Next week we meet in 1202 on Monday 3/11 only On Weds 3/13 we have a 2 hour session Usual class time at the Rady school final exam review SDSC

More information

CUDA. GPU Computing. K. Cooper 1. 1 Department of Mathematics. Washington State University

CUDA. GPU Computing. K. Cooper 1. 1 Department of Mathematics. Washington State University GPU Computing K. Cooper 1 1 Department of Mathematics Washington State University 2014 Review of Parallel Paradigms MIMD Computing Multiple Instruction Multiple Data Several separate program streams, each

More information

Porting COSMO to Hybrid Architectures

Porting COSMO to Hybrid Architectures Porting COSMO to Hybrid Architectures T. Gysi 1, O. Fuhrer 2, C. Osuna 3, X. Lapillonne 3, T. Diamanti 3, B. Cumming 4, T. Schroeder 5, P. Messmer 5, T. Schulthess 4,6,7 [1] Supercomputing Systems AG,

More information

CUDA C Programming Mark Harris NVIDIA Corporation

CUDA C Programming Mark Harris NVIDIA Corporation CUDA C Programming Mark Harris NVIDIA Corporation Agenda Tesla GPU Computing CUDA Fermi What is GPU Computing? Introduction to Tesla CUDA Architecture Programming & Memory Models Programming Environment

More information

Lecture 5. Performance Programming with CUDA

Lecture 5. Performance Programming with CUDA Lecture 5 Performance Programming with CUDA Announcements 2011 Scott B. Baden / CSE 262 / Spring 2011 2 Today s lecture Matrix multiplication 2011 Scott B. Baden / CSE 262 / Spring 2011 3 Memory Hierarchy

More information

dcuda: Distributed GPU Computing with Hardware Overlap

dcuda: Distributed GPU Computing with Hardware Overlap dcuda: Distributed GPU Computing with Hardware Overlap Tobias Gysi, Jeremia Bär, Lukas Kuster, and Torsten Hoefler GPU computing gained a lot of popularity in various application domains weather & climate

More information

CME 213 S PRING Eric Darve

CME 213 S PRING Eric Darve CME 213 S PRING 2017 Eric Darve Review Secret behind GPU performance: simple cores but a large number of them; even more threads can exist live on the hardware (10k 20k threads live). Important performance

More information

Introduc)on to GPU Programming

Introduc)on to GPU Programming Introduc)on to GPU Programming Mubashir Adnan Qureshi h3p://www.ncsa.illinois.edu/people/kindr/projects/hpca/files/singapore_p1.pdf h3p://developer.download.nvidia.com/cuda/training/nvidia_gpu_compu)ng_webinars_cuda_memory_op)miza)on.pdf

More information

GPU CUDA Programming

GPU CUDA Programming GPU CUDA Programming 이정근 (Jeong-Gun Lee) 한림대학교컴퓨터공학과, 임베디드 SoC 연구실 www.onchip.net Email: Jeonggun.Lee@hallym.ac.kr ALTERA JOINT LAB Introduction 차례 Multicore/Manycore and GPU GPU on Medical Applications

More information

An Extension of XcalableMP PGAS Language for Multi-node GPU Clusters

An Extension of XcalableMP PGAS Language for Multi-node GPU Clusters An Extension of XcalableMP PGAS Language for Multi-node Clusters Jinpil Lee, Minh Tuan Tran, Tetsuya Odajima, Taisuke Boku and Mitsuhisa Sato University of Tsukuba 1 Presentation Overview l Introduction

More information

Outline. L13: Application Case Studies. Reconstructing MR Images. Reconstructing MR Images 3/26/12

Outline. L13: Application Case Studies. Reconstructing MR Images. Reconstructing MR Images 3/26/12 Outline L13: Application Case Studies Application Case Studies Advanced MRI Reconstruction Reading: Kirk and Hwu, Chapter 8 or From the sample book http://courses.ece.illinois.edu/ece498/al/textbook/ Chapter7-MRI-Case-Study.pdf

More information

Scientific discovery, analysis and prediction made possible through high performance computing.

Scientific discovery, analysis and prediction made possible through high performance computing. Scientific discovery, analysis and prediction made possible through high performance computing. An Introduction to GPGPU Programming Bob Torgerson Arctic Region Supercomputing Center November 21 st, 2013

More information

GPU Programming Using NVIDIA CUDA

GPU Programming Using NVIDIA CUDA GPU Programming Using NVIDIA CUDA Siddhante Nangla 1, Professor Chetna Achar 2 1, 2 MET s Institute of Computer Science, Bandra Mumbai University Abstract: GPGPU or General-Purpose Computing on Graphics

More information

Exploiting GPU Caches in Sparse Matrix Vector Multiplication. Yusuke Nagasaka Tokyo Institute of Technology

Exploiting GPU Caches in Sparse Matrix Vector Multiplication. Yusuke Nagasaka Tokyo Institute of Technology Exploiting GPU Caches in Sparse Matrix Vector Multiplication Yusuke Nagasaka Tokyo Institute of Technology Sparse Matrix Generated by FEM, being as the graph data Often require solving sparse linear equation

More information

CS 470 Spring Other Architectures. Mike Lam, Professor. (with an aside on linear algebra)

CS 470 Spring Other Architectures. Mike Lam, Professor. (with an aside on linear algebra) CS 470 Spring 2018 Mike Lam, Professor Other Architectures (with an aside on linear algebra) Aside (P3 related): linear algebra Many scientific phenomena can be modeled as matrix operations Differential

More information