ACCELERATING HPC APPLICATIONS ON NVIDIA GPUS WITH OPENACC

Size: px
Start display at page:

Download "ACCELERATING HPC APPLICATIONS ON NVIDIA GPUS WITH OPENACC"

Transcription

1 ACCELERATING HPC APPLICATIONS ON NVIDIA GPUS WITH OPENACC Doug Miles, PGI Compilers & Tools, NVIDIA High Performance Computing Advisory Council February 21, 2018

2 PGI THE NVIDIA HPC SDK Fortran, C & C++ Compilers Optimizing, SIMD Vectorizing, OpenMP Accelerated Computing Features CUDA Fortran, OpenACC Directives Multi-Platform Solution X86-64 and OpenPOWER Multicore CPUs NVIDIA Tesla GPUs Supported on Linux, macos, Windows MPI/OpenMP/OpenACC Tools Debugger Performance Profiler Interoperable with DDT, TotalView 2

3 Programming GPU-Accelerated Systems Separate CPU System and GPU Memories GPU Developer View PCIe System Memory GPU Memory 3

4 Programming GPU-Accelerated Systems Separate CPU System and GPU Memories GPU Developer View NVLink System Memory GPU Memory 4

5 CUDA FORTRAN real, device, allocatable, dimension(:,:) :: Adev,Bdev,Cdev... allocate (Adev(N,M), Bdev(M,L), Cdev(N,L)) Adev = A(1:N,1:M) Bdev = B(1:M,1:L) call mm_kernel <<<dim3(n/16,m/16),dim3(16,16)>>> ( Adev, Bdev, Cdev, N, M, L ) C(1:N,1:L) = Cdev deallocate ( Adev, Bdev, Cdev )... attributes(global) subroutine mm_kernel ( A, B, C, N, M, L ) real :: A(N,M), B(M,L), C(N,L), Cij integer, value :: N, M, L integer :: i, j, kb, k, tx, ty real, shared :: Asub(16,16),Bsub(16,16) tx = threadidx%x ty = threadidx%y i = blockidx%x * 16 + tx j = blockidx%y * 16 + ty Cij = 0.0 do kb = 1, M, 16 Asub(tx,ty) = A(i,kb+tx-1) Bsub(tx,ty) = B(kb+ty-1,j) call syncthreads() do k = 1,16 Cij = Cij + Asub(tx,k) * Bsub(k,ty) enddo call syncthreads() enddo C(i,j) = Cij end subroutine mmul_kernel CPU Code Tesla Code 5

6 CUDA FORTRAN!$CUF KERNEL Directives module madd_device_module use cudafor contains subroutine madd_dev(a,b,c,sum,n1,n2) real,dimension(:,:),device :: a,b,c real :: sum integer :: n1,n2 type(dim3) :: grid, block!$cuf kernel do (2) <<<(*,*),(32,4)>>> do j = 1,n2 do i = 1,n1 a(i,j) = b(i,j) + c(i,j) sum = sum + a(i,j) enddo enddo end subroutine end module Equivalent hand-written CUDA kernels module madd_device_module use cudafor implicit none contains attributes(global) subroutine madd_kernel(a,b,c,blocksum,n1,n2) real, dimension(:,:) :: a,b,c real, dimension(:) :: blocksum integer, value :: n1,n2 integer :: i,j,tindex,tneighbor,bindex real :: mysum real, shared :: bsum(256)! Do this thread's work mysum = 0.0 do j = threadidx%y + (blockidx%y-1)*blockdim%y, n2, blockdim%y*griddim%y do i = threadidx%x + (blockidx%x-1)*blockdim%x, n1, blockdim%x*griddim%x a(i,j) = b(i,j) + c(i,j) mysum = mysum + a(i,j)! accumulates partial sum per thread enddo enddo! Now add up all partial sums for the whole thread block! Compute this thread's linear index in the thread block! We assume 256 threads in the thread block tindex = threadidx%x + (threadidx%y-1)*blockdim%x! Store this thread's partial sum in the shared memory block bsum(tindex) = mysum call syncthreads()! Accumulate all the partial sums for this thread block to a single value tneighbor = 128 do while( tneighbor >= 1 ) if( tindex <= tneighbor ) & bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor) tneighbor = tneighbor / 2 call syncthreads() enddo! Store the partial sum for the thread block bindex = blockidx%x + (blockidx%y-1)*griddim%x if( tindex == 1 ) blocksum(bindex) = bsum(1) end subroutine! Add up partial sums for all thread blocks to a single cumulative sum attributes(global) subroutine madd_sum_kernel(blocksum,dsum,nb) real, dimension(:) :: blocksum real :: dsum integer, value :: nb real, shared :: bsum(256) integer :: tindex,tneighbor,i! Again, we assume 256 threads in the thread block! accumulate a partial sum for each thread tindex = threadidx%x bsum(tindex) = 0.0 do i = tindex, nb, blockdim%x bsum(tindex) = bsum(tindex) + blocksum(i) enddo call syncthreads()! This code is copied from the previous kernel! Accumulate all the partial sums for this thread block to a single value! Since there is only one thread block, this single value is the final result tneighbor = 128 do while( tneighbor >= 1 ) if( tindex <= tneighbor ) & bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor) tneighbor = tneighbor / 2 call syncthreads() enddo if( tindex == 1 ) dsum = bsum(1) end subroutine subroutine madd_dev(a,b,c,dsum,n1,n2) real, dimension(:,:), device :: a,b,c real, device :: dsum real, dimension(:), allocatable, device :: blocksum integer :: n1,n2,nb type(dim3) :: grid, block integer :: r! Compute grid/block size; block size must be 256 threads grid = dim3((n1+31)/32, (n2+7)/8, 1) block = dim3(32,8,1) nb = grid%x * grid%y allocate(blocksum(1:nb)) call madd_kernel<<< grid, block >>>(a,b,c,blocksum,n1,n2) call madd_sum_kernel<<< 1, 256 >>>(blocksum,dsum,nb) r = cudathreadsynchronize()! don't deallocate too early deallocate(blocksum) end subroutine 6

7 OpenACC Directives Manage Data Movement Initiate Parallel Execution Optimize Loop Mappings #pragma acc data copyin(a,b) copyout(c) {... #pragma acc parallel { #pragma acc loop gang vector for (i = 0; i < n; ++i) { c[i] = a[i] + b[i];... } }... } Incremental Single source Interoperable Performance portable CPU, GPU, Manycore 7

8 OpenACC for GPUs in a Nutshell... #pragma acc data copy(b[0:n][0:m]) \ create(a[0:n][0:m]) { for (iter = 1; iter <= p; ++iter){ #pragma acc parallel loop for (i = 1; i < n-1; ++i){ for (j = 1; j < m-1; ++j){ a[i][j]=w0*b[i][j]+ w1*(b[i-1][j]+b[i+1][j]+ b[i][j-1]+b[i][j+1])+ w2*(b[i-1][j-1]+b[i-1][j+1]+ b[i+1][j-1]+b[i+1][j+1]); } } #pragma acc parallel loop for( i = 1; i < n-1; ++i ) for( j = 1; j < m-1; ++j ) b[i][j] = a[i][j]; } }... A B System Memory S p 12 (B) S 1 p (B) GPU Memory 8

9 OpenACC is for Multicore, Manycore & GPUs 98!$acc parallel 99!$acc loop independent 100 do k=y_min-depth,y_max+depth 101!$acc loop independent 102 do j=1,depth 103 density0(x_min-j,k)=left_density0(left_xmax+1-j,k) 104 enddo 105 enddo 106!$acc end parallel Multicore CPU Tesla GPU % pgfortran -ta=multicore fast Minfo=acc -c \ update_tile_halo_kernel.f , Loop is parallelizable Generating Multicore code 100,!$acc loop gang 102, Loop is parallelizable % pgfortran -ta=tesla fast -Minfo=acc c \ update_tile_halo_kernel.f , Loop is parallelizable 102, Loop is parallelizable Accelerator kernel generated Generating Tesla code 100,!$acc loop gang, vector(4)! blockidx%y threadidx%y 102,!$acc loop gang, vector(32)! blockidx%x threadidx%x 9

10 GEOMEAN Seconds GEOMEAN Seconds SPEC ACCEL 1.2 BENCHMARKS 200 OpenACC 200 OpenMP 4.5 PGI 18.1 Intel 2018 PGI x Speed-up socket Broadwell 1x Volta V socket Skylake 2-socket EPYC 2-socket Broadwell 40 cores / 80 threads 48 cores / 48 threads 40 cores / 80 threads Performance measured February, Skylake: Two 20 core Intel Xeon Gold GHz w/ 376GB memory, hyperthreading enabled. EPYC: Two 24 core AMD EPYC GHz w/ 256GB memory. Broadwell: Two 20 core Intel Xeon E v4 3.6GHz w/ 256GB memory, hyperthreading enabled. Volta: NVIDIA DGX1 system with two 20 core Intel Xeon E v4 2.20GHz, 256GB memory, one NVIDIA Tesla V100-SXM2-16GB 1.53GHz. SPEC is a registered trademark of the Standard Performance Evaluation Corporation (

11 OPENACC APPLICATIONS 11

12 %GPUCPU= 0-7 = 0-7 Use GPUs 0-7 with CPUs 0-7 as their controllers. Detailed information is available on our website. Project Contributors GAUSSIAN 16 Roberto Gomperts NVIDIA Gaussi an, Inc. 340 Quinnipiac St. Bldg. 40 Wallingford, CT USA custser v@gaussi an.com Michael Frisch Gaussian Mike Frisch, Ph.D. President and CEO Gaussian, Inc. Using OpenACC allowed us to continue development of our Brent fundamental Leback NVIDIA/PGI algorithms and software capabilities simultaneously with the GPU-related work. In the end, we could use the same Copyright code 2017, base Gaussian, for Inc. SMP, All rights cluster/ reser ved. network and GPU parallelism. PGI's compilers were essential to the success of our efforts. Gio Gaussian is a registered trademark of Gaussian, Inc. All other trademarks and the proper ties of their respective holders. Specif cations subject to change witho 12

13 ANSYS FLUENT Sunil Sathe Lead Software Developer ANSYS Fluent We ve effectively used OpenACC for heterogeneous computing in ANSYS Fluent with impressive performance. We re now applying this work to more of our models and new platforms. Image courtesy: ANSYS 13

14 VASP Prof. Georg Kresse Computational Materials Physics University of Vienna For VASP, OpenACC is the way forward for GPU acceleration. Performance is similar and in some cases better than CUDA C, and OpenACC dramatically decreases GPU development and maintenance efforts. We re excited to collaborate with NVIDIA and PGI as an early adopter of CUDA Unified Memory. 14

15 NUMECA FINE/Open David Gutzwiller Lead Software Developer NUMECA Porting our unstructured C++ CFD solver FINE/Open to GPUs using OpenACC would have been impossible two or three years ago, but OpenACC has developed enough that we re now getting some really good results. 15

16 MPAS-A Richard Loft Director, Technology Development NCAR Image courtesy: NCAR Our team has been evaluating OpenACC as a pathway to performance portability for the Model for Prediction (MPAS) atmospheric model. Using this approach on the MPAS dynamical core, we have achieved performance on a single P100 GPU equivalent to 2.7 dual socketed Intel Xeon nodes on our new Cheyenne supercomputer. 16

17 COSMO Dr. Oliver Fuhrer Senior Scientist Meteoswiss OpenACC made it practical to develop for GPU-based hardware while retaining a single source for almost all the COSMO physics code. 17

18 GAMERA FOR GPU Takuma Yamaguchi, Kohei Fujita, Tsuyoshi Ichimura, Muneo Hori, Lalith Wijerathne The University of Tokyo With OpenACC and a compute node based on NVIDIA's Tesla P100 GPU, we achieved more than a 14X speed up over a K Computer node running our earthquake disaster simulation code Map courtesy University of Tokyo 18

19 QUANTUM ESPRESSO Filippo Spiga Head of Research Software Engineering University of Cambridge CUDA Fortran gives us the full performance potential of the CUDA programming model and NVIDIA GPUs.!$CUF KERNELS directives give us productivity and source code maintainability. It s the best of both worlds. 19

20 OPENACC AND CUDA UNIFIED MEMORY 20

21 Programming GPU-Accelerated Systems CUDA Unified Memory for Dynamically Allocated Data GPU Developer View GPU Developer View With CUDA Unified Memory PCIe System Memory GPU Memory Unified Memory 21

22 How CUDA Unified Memory Works on TESLA GPUs Servicing CPU and GPU Page Faults for Allocatable Data cudamallocmanaged(&array, size); memset(array, size); setvalue<<<...>>>(array, size/2, 5);... global void setvalue(char *ptr, int index, char val) { ptr[index] = val; } Page Fault CPU Memory Mapping array GPU Memory Mapping array Page Fault PCIe or NVLink 22

23 PGI OpenACC and CUDA Unified Memory Compiling with the ta=tesla:managed option #pragma acc data copyin(a,b) copyout(c) {... #pragma acc parallel { #pragma acc loop gang vector for (i = 0; i < n; ++i) { c[i] = a[i] + b[i];... } }... } GPU Developer View With CUDA Unified Memory Unified Memory C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory 23

24 PGI OpenACC and CUDA Unified Memory Compiling with the ta=tesla:managed option... #pragma acc parallel { #pragma acc loop gang vector for (i = 0; i < n; ++i) { c[i] = a[i] + b[i];... } }... GPU Developer View With CUDA Unified Memory Unified Memory C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory 24

25 Center for Accelerated Application Readiness (CAAR) Oak Ridge Leadership Computing Facility IBM POWER9 CPUs NVIDIA Volta V100 GPUs 25

26 GTC: An OpenACC Production Application Being ported for runs on the ORNL Summit supercomputer The gyrokinetic toroidal code (GTC) is a massively parallel, particle-in-cell production code for turbulence simulation in support of the burning plasma experiment ITER, the crucial next step in the quest for fusion energy. 26

27 GTC Performance using OpenACC OpenPOWER NVLink Unified Memory P100 V100 16x 14x 16.5X 12x 10x 12.1X 12X 8x 6x 4x 6.1X 5.9X 2x P8 UM 20-core P8 P8+2xP100 UM P8+2xP100 : IBM POWER8NVL, 2 sockets, 20 cores, NVLINK : No Data Directives in sources, compiled with ta=tesla:managed P8+4xP100 UM P8+4xP100 x64+4xv100 Data Directives Data Directives Data Directives 27

28 DEEP COPY 28

29 Managing Aggregate Data Structures with OpenACC An Example from the OpenACC port of VASP Derived Type 1 Members: 3 dynamic 1 derived type 2 Real-world applications often have complex, aggregate data structures CUDA Unified Memory can automatically manage Deep Copy, but Derived Type 3 Members: only static Derived Type 2 Members: 21 dynamic 1 derived type 3 1 derived type 4 Derived Type 4 Members: 8 dynamic 4 derived type 5 2 derived type 6 Derived Type 5 Members: 3 dynamic Derived Type 6 Members: 8 dynamic 29

30 Managing Aggregate Data Structures with OpenACC An Example from the OpenACC port of VASP Derived Type 1 Members: 3 dynamic 1 derived type 2 Real-world applications often have complex, aggregate data structures CUDA Unified Memory can automatically manage Deep Copy, but CUDA Unified Memory is only for allocatable data today Derived Type 3 Members: only static Derived Type 2 Members: 21 dynamic 1 derived type 3 1 derived type 4 Derived Type 4 Members: 8 dynamic 4 derived type 5 2 derived type 6 Derived Type 5 Members: 3 dynamic Derived Type 6 Members: 8 dynamic 30

31 FORTRAN AUTOMATIC FULL DEEP COPY Fortran Derived Types 31

32 OPENACC 2.6 MANUAL DEEP COPY Supported Today in PGI Compilers typedef struct points { float* x; float* y; float* z; int n; float coef, direction; } points; void sub ( int n, float* y ) { points p; #pragma acc data create (p) { p.n = n; p.x = ( float*) malloc ( sizeof ( float )*n ); p.y = ( float*) malloc ( sizeof ( float )*n ); p.z = ( float*) malloc ( sizeof ( float )*n ); #pragma acc update device (p.n) #pragma acc data copyin (p.x[0:n], p.y[0: n]) { #pragma acc parallel loop for ( i =0; i<p.n; ++I ) p.x[i] += p.y[i];... 32

33 DRAFT OPENACC 3.0 TRUE DEEP COPY Still in definition by the OpenACC Committee typedef struct points { float* x; float* y; float* z; int n; float coef, direction; #pragma acc policy inout(x[0:n],y[0:n]) } points; void sub ( int n, float* y ) { points p; p.n = n; p.x = ( float*) malloc ( sizeof ( float )*n ); p.y = ( float*) malloc ( sizeof ( float )*n ); p.z = ( float*) malloc ( sizeof ( float )*n ); #pragma acc data copy (p) { #pragma acc parallel loop for ( i =0; i<p.n; ++I ) p.x[i] += p.y[i];... 33

34 WHITHER OPENACC? 34

35 Speedup vs Single Haswell Core CLOVERLEAF AWE Hydrodynamics mini-app, bm32 data set x PGI 18.1 OpenACC Intel 2018 OpenMP 40x 7.6x 7.9x 10x 10x 14.8x 15x 11x Multicore Haswell Multicore Broadwell Multicore Skylake Kepler Pascal 109x 67x 1x 2x 4x Volta V100 Systems: Haswell: 2x16 core Haswell server, four K80s, CentOS 7.2 (perf-hsw10), Broadwell: 2x20 core Broadwell server, eight P100s (dgx1-prd-01), Broadwell server, eight V100s (dgx07), Skylake 2x20 core Xeon Gold server (sky-4). Compilers: Intel , PGI 18.1 Benchmark: CloverLeaf v1.3 downloaded from the week of November ; CloverlLeaf_Serial; CloverLeaf_ref (MPI+OpenMP); CloverLeaf_OpenACC (MPI+OpenACC) Data compiled by PGI February

36 OPENACC DIRECTIVES FOR GPUS % pgfortran fast ta=tesla Minfo -c PdV_kernel.f90 pdv_kernel:... 77, Loop is parallelizable 79, Loop is parallelizable Accelerator kernel generated Generating Tesla code 77,!$acc loop gang, vector(4)! blockidx%y! threadidx%y 79,!$acc loop gang, vector(32)! blockidx%x! threadidx%x... 75!$ACC KERNELS 76!$ACC LOOP INDEPENDENT 77 DO k=y_min,y_max 78!$ACC LOOP INDEPENDENT PRIVATE(right_flux,left_flux,top_flux,bottom_flux,total_flux, min_cell_volume,energy_change,recip_volume) 79 DO j=x_min,x_max left_flux= (xarea(j,k )*(xvel0(j,k )+xvel0(j,k+1) & 82 +xvel0(j,k )+xvel0(j,k+1)))*0.25_8*dt* right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) & 84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt* bottom_flux=(yarea(j,k )*(yvel0(j,k )+yvel0(j+1,k ) & 86 +yvel0(j,k )+yvel0(j+1,k )))*0.25_8*dt* top_flux= (yarea(j,k+1)*(yvel0(j,k+1)+yvel0(j+1,k+1) & 88 +yvel0(j,k+1)+yvel0(j+1,k+1)))*0.25_8*dt* total_flux=right_flux-left_flux+top_flux-bottom_flux volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux) min_cell_volume=min(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux & 94,volume(j,k)+right_flux-left_flux & 95,volume(j,k)+top_flux-bottom_flux) 97 recip_volume=1.0/volume(j,k) 99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))* energy1(j,k)=energy0(j,k)-energy_change 103 density1(j,k)=density0(j,k)*volume_change(j,k) 105 ENDDO 106 ENDDO 107!$ACC END KERNELS 36

37 OPENACC DIRECTIVES FOR MULTICORE CPUS % pgfortran fast ta=multicore... PdV_kernel.f90 pdv_kernel:... 77, Loop is parallelizable Generating Multicore code 77,!$acc loop gang 79, Loop is parallelizable 3 loop-carried redundant expressions removed with 9 operations and 9 arrays Innermost loop distributed: 2 new loops Generated vector SIMD code for the loop Generated 2 prefetch instructions for the loop Generated 12 prefetch instructions for the loop... 75!$ACC KERNELS 76!$ACC LOOP INDEPENDENT 77 DO k=y_min,y_max 78!$ACC LOOP INDEPENDENT PRIVATE(right_flux,left_flux,top_flux,bottom_flux,total_flux, min_cell_volume,energy_change,recip_volume) 79 DO j=x_min,x_max left_flux= (xarea(j,k )*(xvel0(j,k )+xvel0(j,k+1) & 82 +xvel0(j,k )+xvel0(j,k+1)))*0.25_8*dt* right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) & 84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt* bottom_flux=(yarea(j,k )*(yvel0(j,k )+yvel0(j+1,k ) & 86 +yvel0(j,k )+yvel0(j+1,k )))*0.25_8*dt* top_flux= (yarea(j,k+1)*(yvel0(j,k+1)+yvel0(j+1,k+1) & 88 +yvel0(j,k+1)+yvel0(j+1,k+1)))*0.25_8*dt* total_flux=right_flux-left_flux+top_flux-bottom_flux volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux) min_cell_volume=min(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux & 94,volume(j,k)+right_flux-left_flux & 95,volume(j,k)+top_flux-bottom_flux) 97 recip_volume=1.0/volume(j,k) 99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))* energy1(j,k)=energy0(j,k)-energy_change 103 density1(j,k)=density0(j,k)*volume_change(j,k) 105 ENDDO 106 ENDDO 107!$ACC END KERNELS 37

38 FORTRAN 2018 DO CONCURRENT Fortran 2018 DO CONCURRENT + True Parallel Loops + Loop-scope shared/private data No support for reductions No support for atomics No support for data management DO CONCURRENT (k=y_min:y_max, j=x_min:x_max) & 78 LOCAL (right_flux,left_flux,top_flux,bottom_flux,total_flux, & min_cell_volume,energy_change,recip_volume) left_flux= (xarea(j,k )*(xvel0(j,k )+xvel0(j,k+1) & 82 +xvel0(j,k )+xvel0(j,k+1)))*0.25_8*dt* right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) & 84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt* bottom_flux=(yarea(j,k )*(yvel0(j,k )+yvel0(j+1,k ) & 86 +yvel0(j,k )+yvel0(j+1,k )))*0.25_8*dt* top_flux= (yarea(j,k+1)*(yvel0(j,k+1)+yvel0(j+1,k+1) & 88 +yvel0(j,k+1)+yvel0(j+1,k+1)))*0.25_8*dt* total_flux=right_flux-left_flux+top_flux-bottom_flux volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux) min_cell_volume=min(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux & 94,volume(j,k)+right_flux-left_flux & 95,volume(j,k)+top_flux-bottom_flux) 97 recip_volume=1.0/volume(j,k) 99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))* energy1(j,k)=energy0(j,k)-energy_change 103 density1(j,k)=density0(j,k)*volume_change(j,k) ENDDO

39 OPENACC FOR EVERYONE The PGI Community Edition, pgicompilers.com/community FREE PROGRAMMING MODELS OpenACC, CUDA Fortran, OpenMP, C/C++/Fortran Compilers and Tools PLATFORMS X86, OpenPOWER, NVIDIA GPU UPDATES 1-2 times a year 6-9 times a year 6-9 times a year SUPPORT User Forums PGI Support PGI Premier Services LICENSE Annual Perpetual Volume/Site 39

SC13 GPU Technology Theater. Accessing New CUDA Features from CUDA Fortran Brent Leback, Compiler Manager, PGI

SC13 GPU Technology Theater. Accessing New CUDA Features from CUDA Fortran Brent Leback, Compiler Manager, PGI SC13 GPU Technology Theater Accessing New CUDA Features from CUDA Fortran Brent Leback, Compiler Manager, PGI The Case for Fortran Clear, straight-forward syntax Successful legacy in the scientific community

More information

OpenACC. Introduction and Evolutions Sebastien Deldon, GPU Compiler engineer

OpenACC. Introduction and Evolutions Sebastien Deldon, GPU Compiler engineer OpenACC Introduction and Evolutions Sebastien Deldon, GPU Compiler engineer 3 WAYS TO ACCELERATE APPLICATIONS Applications Libraries Compiler Directives Programming Languages Easy to use Most Performance

More information

CUDA Fortran Brent Leback The Portland Group

CUDA Fortran Brent Leback The Portland Group CUDA Fortran 2013 Brent Leback The Portland Group brent.leback@pgroup.com Why Fortran? Rich legacy in the scientific community Semantics easier to vectorize/parallelize Array descriptors Modules Fortran

More information

OPENACC ONLINE COURSE 2018

OPENACC ONLINE COURSE 2018 OPENACC ONLINE COURSE 2018 Week 1 Introduction to OpenACC Jeff Larkin, Senior DevTech Software Engineer, NVIDIA ABOUT THIS COURSE 3 Part Introduction to OpenACC Week 1 Introduction to OpenACC Week 2 Data

More information

INTRODUCTION TO OPENACC. Analyzing and Parallelizing with OpenACC, Feb 22, 2017

INTRODUCTION TO OPENACC. Analyzing and Parallelizing with OpenACC, Feb 22, 2017 INTRODUCTION TO OPENACC Analyzing and Parallelizing with OpenACC, Feb 22, 2017 Objective: Enable you to to accelerate your applications with OpenACC. 2 Today s Objectives Understand what OpenACC is and

More information

Directive-based Programming for Highly-scalable Nodes

Directive-based Programming for Highly-scalable Nodes Directive-based Programming for Highly-scalable Nodes Doug Miles Michael Wolfe PGI Compilers & Tools NVIDIA Cray User Group Meeting May 2016 Talk Outline Increasingly Parallel Nodes Exposing Parallelism

More information

OpenACC Fundamentals. Steve Abbott November 13, 2016

OpenACC Fundamentals. Steve Abbott November 13, 2016 OpenACC Fundamentals Steve Abbott , November 13, 2016 Who Am I? 2005 B.S. Physics Beloit College 2007 M.S. Physics University of Florida 2015 Ph.D. Physics University of New Hampshire

More information

CUDA: NEW AND UPCOMING FEATURES

CUDA: NEW AND UPCOMING FEATURES May 8-11, 2017 Silicon Valley CUDA: NEW AND UPCOMING FEATURES Stephen Jones, GTC 2018 CUDA ECOSYSTEM 2018 CUDA DOWNLOADS IN 2017 3,500,000 CUDA REGISTERED DEVELOPERS 800,000 GTC ATTENDEES 8,000+ 2 CUDA

More information

OpenACC Fundamentals. Steve Abbott November 15, 2017

OpenACC Fundamentals. Steve Abbott November 15, 2017 OpenACC Fundamentals Steve Abbott , November 15, 2017 AGENDA Data Regions Deep Copy 2 while ( err > tol && iter < iter_max ) { err=0.0; JACOBI ITERATION #pragma acc parallel loop reduction(max:err)

More information

Programming NVIDIA GPUs with OpenACC Directives

Programming NVIDIA GPUs with OpenACC Directives Programming NVIDIA GPUs with OpenACC Directives Michael Wolfe michael.wolfe@pgroup.com http://www.pgroup.com/accelerate Programming NVIDIA GPUs with OpenACC Directives Michael Wolfe mwolfe@nvidia.com http://www.pgroup.com/accelerate

More information

STRATEGIES TO ACCELERATE VASP WITH GPUS USING OPENACC. Stefan Maintz, Dr. Markus Wetzstein

STRATEGIES TO ACCELERATE VASP WITH GPUS USING OPENACC. Stefan Maintz, Dr. Markus Wetzstein STRATEGIES TO ACCELERATE VASP WITH GPUS USING OPENACC Stefan Maintz, Dr. Markus Wetzstein smaintz@nvidia.com; mwetzstein@nvidia.com Companies Academia VASP USERS AND USAGE 12-25% of CPU cycles @ supercomputing

More information

PERFORMANCE PORTABILITY WITH OPENACC. Jeff Larkin, NVIDIA, November 2015

PERFORMANCE PORTABILITY WITH OPENACC. Jeff Larkin, NVIDIA, November 2015 PERFORMANCE PORTABILITY WITH OPENACC Jeff Larkin, NVIDIA, November 2015 TWO TYPES OF PORTABILITY FUNCTIONAL PORTABILITY PERFORMANCE PORTABILITY The ability for a single code to run anywhere. The ability

More information

OpenACC Course Lecture 1: Introduction to OpenACC September 2015

OpenACC Course Lecture 1: Introduction to OpenACC September 2015 OpenACC Course Lecture 1: Introduction to OpenACC September 2015 Course Objective: Enable you to accelerate your applications with OpenACC. 2 Oct 1: Introduction to OpenACC Oct 6: Office Hours Oct 15:

More information

GPU Programming. Alan Gray, James Perry EPCC The University of Edinburgh

GPU Programming. Alan Gray, James Perry EPCC The University of Edinburgh GPU Programming EPCC The University of Edinburgh Contents NVIDIA CUDA C Proprietary interface to NVIDIA architecture CUDA Fortran Provided by PGI OpenCL Cross platform API 2 NVIDIA CUDA CUDA allows NVIDIA

More information

NVIDIA Update and Directions on GPU Acceleration for Earth System Models

NVIDIA Update and Directions on GPU Acceleration for Earth System Models NVIDIA Update and Directions on GPU Acceleration for Earth System Models Stan Posey, HPC Program Manager, ESM and CFD, NVIDIA, Santa Clara, CA, USA Carl Ponder, PhD, Applications Software Engineer, NVIDIA,

More information

THE FUTURE OF GPU DATA MANAGEMENT. Michael Wolfe, May 9, 2017

THE FUTURE OF GPU DATA MANAGEMENT. Michael Wolfe, May 9, 2017 THE FUTURE OF GPU DATA MANAGEMENT Michael Wolfe, May 9, 2017 CPU CACHE Hardware managed What data to cache? Where to store the cached data? What data to evict when the cache fills up? When to store data

More information

Profiling and Parallelizing with the OpenACC Toolkit OpenACC Course: Lecture 2 October 15, 2015

Profiling and Parallelizing with the OpenACC Toolkit OpenACC Course: Lecture 2 October 15, 2015 Profiling and Parallelizing with the OpenACC Toolkit OpenACC Course: Lecture 2 October 15, 2015 Oct 1: Introduction to OpenACC Oct 6: Office Hours Oct 15: Profiling and Parallelizing with the OpenACC Toolkit

More information

INTRODUCTION TO ACCELERATED COMPUTING WITH OPENACC. Jeff Larkin, NVIDIA Developer Technologies

INTRODUCTION TO ACCELERATED COMPUTING WITH OPENACC. Jeff Larkin, NVIDIA Developer Technologies INTRODUCTION TO ACCELERATED COMPUTING WITH OPENACC Jeff Larkin, NVIDIA Developer Technologies AGENDA Accelerated Computing Basics What are Compiler Directives? Accelerating Applications with OpenACC Identifying

More information

COMP Parallel Computing. Programming Accelerators using Directives

COMP Parallel Computing. Programming Accelerators using Directives COMP 633 - Parallel Computing Lecture 15 October 30, 2018 Programming Accelerators using Directives Credits: Introduction to OpenACC and toolkit Jeff Larkin, Nvidia COMP 633 - Prins Directives for Accelerator

More information

INTRODUCTION TO COMPILER DIRECTIVES WITH OPENACC

INTRODUCTION TO COMPILER DIRECTIVES WITH OPENACC INTRODUCTION TO COMPILER DIRECTIVES WITH OPENACC DR. CHRISTOPH ANGERER, NVIDIA *) THANKS TO JEFF LARKIN, NVIDIA, FOR THE SLIDES 3 APPROACHES TO GPU PROGRAMMING Applications Libraries Compiler Directives

More information

April 4-7, 2016 Silicon Valley INSIDE PASCAL. Mark Harris, October 27,

April 4-7, 2016 Silicon Valley INSIDE PASCAL. Mark Harris, October 27, April 4-7, 2016 Silicon Valley INSIDE PASCAL Mark Harris, October 27, 2016 @harrism INTRODUCING TESLA P100 New GPU Architecture CPU to CPUEnable the World s Fastest Compute Node PCIe Switch PCIe Switch

More information

Programming paradigms for GPU devices

Programming paradigms for GPU devices Programming paradigms for GPU devices OpenAcc Introduction Sergio Orlandini s.orlandini@cineca.it 1 OpenACC introduction express parallelism optimize data movements practical examples 2 3 Ways to Accelerate

More information

A case study of performance portability with OpenMP 4.5

A case study of performance portability with OpenMP 4.5 A case study of performance portability with OpenMP 4.5 Rahul Gayatri, Charlene Yang, Thorsten Kurth, Jack Deslippe NERSC pre-print copy 1 Outline General Plasmon Pole (GPP) application from BerkeleyGW

More information

High Performance Computing and GPU Programming

High Performance Computing and GPU Programming High Performance Computing and GPU Programming Lecture 1: Introduction Objectives C++/CPU Review GPU Intro Programming Model Objectives Objectives Before we begin a little motivation Intel Xeon 2.67GHz

More information

Optimising the Mantevo benchmark suite for multi- and many-core architectures

Optimising the Mantevo benchmark suite for multi- and many-core architectures Optimising the Mantevo benchmark suite for multi- and many-core architectures Simon McIntosh-Smith Department of Computer Science University of Bristol 1 Bristol's rich heritage in HPC The University of

More information

TESLA V100 PERFORMANCE GUIDE May 2018

TESLA V100 PERFORMANCE GUIDE May 2018 TESLA V100 PERFORMANCE GUIDE May 2018 TESLA V100 The Fastest and Most Productive GPU for AI and HPC Volta Architecture Tensor Core Improved NVLink & HBM2 Volta MPS Improved SIMT Model Most Productive GPU

More information

OpenACC Course. Office Hour #2 Q&A

OpenACC Course. Office Hour #2 Q&A OpenACC Course Office Hour #2 Q&A Q1: How many threads does each GPU core have? A: GPU cores execute arithmetic instructions. Each core can execute one single precision floating point instruction per cycle

More information

HPC with Multicore and GPUs

HPC with Multicore and GPUs HPC with Multicore and GPUs Stan Tomov Electrical Engineering and Computer Science Department University of Tennessee, Knoxville COSC 594 Lecture Notes March 22, 2017 1/20 Outline Introduction - Hardware

More information

VSC Users Day 2018 Start to GPU Ehsan Moravveji

VSC Users Day 2018 Start to GPU Ehsan Moravveji Outline A brief intro Available GPUs at VSC GPU architecture Benchmarking tests General Purpose GPU Programming Models VSC Users Day 2018 Start to GPU Ehsan Moravveji Image courtesy of Nvidia.com Generally

More information

Introduction to Parallel Computing with CUDA. Oswald Haan

Introduction to Parallel Computing with CUDA. Oswald Haan Introduction to Parallel Computing with CUDA Oswald Haan ohaan@gwdg.de Schedule Introduction to Parallel Computing with CUDA Using CUDA CUDA Application Examples Using Multiple GPUs CUDA Application Libraries

More information

GPU programming. Dr. Bernhard Kainz

GPU programming. Dr. Bernhard Kainz GPU programming Dr. Bernhard Kainz Overview About myself Motivation GPU hardware and system architecture GPU programming languages GPU programming paradigms Pitfalls and best practice Reduction and tiling

More information

Introduction to Compiler Directives with OpenACC

Introduction to Compiler Directives with OpenACC Introduction to Compiler Directives with OpenACC Agenda Fundamentals of Heterogeneous & GPU Computing What are Compiler Directives? Accelerating Applications with OpenACC - Identifying Available Parallelism

More information

OpenACC. Part I. Ned Nedialkov. McMaster University Canada. October 2016

OpenACC. Part I. Ned Nedialkov. McMaster University Canada. October 2016 OpenACC. Part I Ned Nedialkov McMaster University Canada October 2016 Outline Introduction Execution model Memory model Compiling pgaccelinfo Example Speedups Profiling c 2016 Ned Nedialkov 2/23 Why accelerators

More information

LECTURE ON PASCAL GPU ARCHITECTURE. Jiri Kraus, November 14 th 2016

LECTURE ON PASCAL GPU ARCHITECTURE. Jiri Kraus, November 14 th 2016 LECTURE ON PASCAL GPU ARCHITECTURE Jiri Kraus, November 14 th 2016 ACCELERATED COMPUTING CPU Optimized for Serial Tasks GPU Accelerator Optimized for Parallel Tasks 2 ACCELERATED COMPUTING CPU Optimized

More information

Tesla Architecture, CUDA and Optimization Strategies

Tesla Architecture, CUDA and Optimization Strategies Tesla Architecture, CUDA and Optimization Strategies Lan Shi, Li Yi & Liyuan Zhang Hauptseminar: Multicore Architectures and Programming Page 1 Outline Tesla Architecture & CUDA CUDA Programming Optimization

More information

Advanced OpenACC. John Urbanic Parallel Computing Scientist Pittsburgh Supercomputing Center. Copyright 2017

Advanced OpenACC. John Urbanic Parallel Computing Scientist Pittsburgh Supercomputing Center. Copyright 2017 Advanced OpenACC John Urbanic Parallel Computing Scientist Pittsburgh Supercomputing Center Copyright 2017 Outline Loop Directives Data Declaration Directives Data Regions Directives Cache directives Wait

More information

GPU Programming Paradigms

GPU Programming Paradigms GPU Programming with PGI CUDA Fortran and the PGI Accelerator Programming Model Boris Bierbaum, Sandra Wienke (26.3.2010) 1 GPUs@RZ Current: linuxc7: CentOS 5.3, Nvidia GeForce GT 220 hpc-denver: Windows

More information

CACHE DIRECTIVE OPTIMIZATION IN THE OPENACC PROGRAMMING MODEL. Xiaonan (Daniel) Tian, Brent Leback, and Michael Wolfe PGI

CACHE DIRECTIVE OPTIMIZATION IN THE OPENACC PROGRAMMING MODEL. Xiaonan (Daniel) Tian, Brent Leback, and Michael Wolfe PGI CACHE DIRECTIVE OPTIMIZATION IN THE OPENACC PROGRAMMING MODEL Xiaonan (Daniel) Tian, Brent Leback, and Michael Wolfe PGI GPU ARCHITECTURE Threads Register Files Shared Memory L1 Cache Read-Only Data Cache

More information

Cartoon parallel architectures; CPUs and GPUs

Cartoon parallel architectures; CPUs and GPUs Cartoon parallel architectures; CPUs and GPUs CSE 6230, Fall 2014 Th Sep 11! Thanks to Jee Choi (a senior PhD student) for a big assist 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ~ socket 14 ~ core 14 ~ HWMT+SIMD

More information

PGPROF OpenACC Tutorial

PGPROF OpenACC Tutorial PGPROF OpenACC Tutorial Version 2017 PGI Compilers and Tools TABLE OF CONTENTS Chapter 1. Tutorial Setup...1 Chapter 2. Profiling the application... 2 Chapter 3. Adding OpenACC directives... 4 Chapter

More information

Introduction to GPGPU and GPU-architectures

Introduction to GPGPU and GPU-architectures Introduction to GPGPU and GPU-architectures Henk Corporaal Gert-Jan van den Braak http://www.es.ele.tue.nl/ Contents 1. What is a GPU 2. Programming a GPU 3. GPU thread scheduling 4. GPU performance bottlenecks

More information

Achieving Portable Performance for GTC-P with OpenACC on GPU, multi-core CPU, and Sunway Many-core Processor

Achieving Portable Performance for GTC-P with OpenACC on GPU, multi-core CPU, and Sunway Many-core Processor Achieving Portable Performance for GTC-P with OpenACC on GPU, multi-core CPU, and Sunway Many-core Processor Stephen Wang 1, James Lin 1,4, William Tang 2, Stephane Ethier 2, Bei Wang 2, Simon See 1,3

More information

Register file. A single large register file (ex. 16K registers) is partitioned among the threads of the dispatched blocks.

Register file. A single large register file (ex. 16K registers) is partitioned among the threads of the dispatched blocks. Sharing the resources of an SM Warp 0 Warp 1 Warp 47 Register file A single large register file (ex. 16K registers) is partitioned among the threads of the dispatched blocks Shared A single SRAM (ex. 16KB)

More information

GPU Developments for the NEMO Model. Stan Posey, HPC Program Manager, ESM Domain, NVIDIA (HQ), Santa Clara, CA, USA

GPU Developments for the NEMO Model. Stan Posey, HPC Program Manager, ESM Domain, NVIDIA (HQ), Santa Clara, CA, USA GPU Developments for the NEMO Model Stan Posey, HPC Program Manager, ESM Domain, NVIDIA (HQ), Santa Clara, CA, USA NVIDIA HPC AND ESM UPDATE TOPICS OF DISCUSSION GPU PROGRESS ON NEMO MODEL 2 NVIDIA GPU

More information

OPENACC ONLINE COURSE 2018

OPENACC ONLINE COURSE 2018 OPENACC ONLINE COURSE 2018 Week 3 Loop Optimizations with OpenACC Jeff Larkin, Senior DevTech Software Engineer, NVIDIA ABOUT THIS COURSE 3 Part Introduction to OpenACC Week 1 Introduction to OpenACC Week

More information

INTRODUCTION TO OPENACC

INTRODUCTION TO OPENACC INTRODUCTION TO OPENACC Hossein Pourreza hossein.pourreza@umanitoba.ca March 31, 2016 Acknowledgement: Most of examples and pictures are from PSC (https://www.psc.edu/images/xsedetraining/openacc_may2015/

More information

CURRENT STATUS OF THE PROJECT TO ENABLE GAUSSIAN 09 ON GPGPUS

CURRENT STATUS OF THE PROJECT TO ENABLE GAUSSIAN 09 ON GPGPUS CURRENT STATUS OF THE PROJECT TO ENABLE GAUSSIAN 09 ON GPGPUS Roberto Gomperts (NVIDIA, Corp.) Michael Frisch (Gaussian, Inc.) Giovanni Scalmani (Gaussian, Inc.) Brent Leback (PGI) TOPICS Gaussian Design

More information

2010 Compilers & Tools for x64+gpu Systems

2010 Compilers & Tools for x64+gpu Systems PGI 2010 Compilers & Tools for x64+gpu Systems Douglas Miles douglas.miles@pgroup.com www.pgroup.com January 28, 2010 Talk Roadmap Introduction to The Portland Group / PGI Introduction to GPU programming

More information

Overview. Lecture 1: an introduction to CUDA. Hardware view. Hardware view. hardware view software view CUDA programming

Overview. Lecture 1: an introduction to CUDA. Hardware view. Hardware view. hardware view software view CUDA programming Overview Lecture 1: an introduction to CUDA Mike Giles mike.giles@maths.ox.ac.uk hardware view software view Oxford University Mathematical Institute Oxford e-research Centre Lecture 1 p. 1 Lecture 1 p.

More information

Implicit Low-Order Unstructured Finite-Element Multiple Simulation Enhanced by Dense Computation using OpenACC

Implicit Low-Order Unstructured Finite-Element Multiple Simulation Enhanced by Dense Computation using OpenACC Fourth Workshop on Accelerator Programming Using Directives (WACCPD), Nov. 13, 2017 Implicit Low-Order Unstructured Finite-Element Multiple Simulation Enhanced by Dense Computation using OpenACC Takuma

More information

PROFILER OPENACC TUTORIAL. Version 2018

PROFILER OPENACC TUTORIAL. Version 2018 PROFILER OPENACC TUTORIAL Version 2018 TABLE OF CONTENTS Chapter Chapter Chapter Chapter Chapter 1. 2. 3. 4. 5. Tutorial Setup... 1 Profiling the application... 2 Adding OpenACC directives...4 Improving

More information

OpenStaPLE, an OpenACC Lattice QCD Application

OpenStaPLE, an OpenACC Lattice QCD Application OpenStaPLE, an OpenACC Lattice QCD Application Enrico Calore Postdoctoral Researcher Università degli Studi di Ferrara INFN Ferrara Italy GTC Europe, October 10 th, 2018 E. Calore (Univ. and INFN Ferrara)

More information

Parallel Numerical Algorithms

Parallel Numerical Algorithms Parallel Numerical Algorithms http://sudalab.is.s.u-tokyo.ac.jp/~reiji/pna14/ [ 10 ] GPU and CUDA Parallel Numerical Algorithms / IST / UTokyo 1 PNA16 Lecture Plan General Topics 1. Architecture and Performance

More information

Introduction to Numerical General Purpose GPU Computing with NVIDIA CUDA. Part 1: Hardware design and programming model

Introduction to Numerical General Purpose GPU Computing with NVIDIA CUDA. Part 1: Hardware design and programming model Introduction to Numerical General Purpose GPU Computing with NVIDIA CUDA Part 1: Hardware design and programming model Dirk Ribbrock Faculty of Mathematics, TU dortmund 2016 Table of Contents Why parallel

More information

Lecture 3: Introduction to CUDA

Lecture 3: Introduction to CUDA CSCI-GA.3033-004 Graphics Processing Units (GPUs): Architecture and Programming Lecture 3: Introduction to CUDA Some slides here are adopted from: NVIDIA teaching kit Mohamed Zahran (aka Z) mzahran@cs.nyu.edu

More information

GTC 2017 S7672. OpenACC Best Practices: Accelerating the C++ NUMECA FINE/Open CFD Solver

GTC 2017 S7672. OpenACC Best Practices: Accelerating the C++ NUMECA FINE/Open CFD Solver David Gutzwiller, NUMECA USA (david.gutzwiller@numeca.com) Dr. Ravi Srinivasan, Dresser-Rand Alain Demeulenaere, NUMECA USA 5/9/2017 GTC 2017 S7672 OpenACC Best Practices: Accelerating the C++ NUMECA FINE/Open

More information

NVIDIA Think about Computing as Heterogeneous One Leo Liao, 1/29/2106, NTU

NVIDIA Think about Computing as Heterogeneous One Leo Liao, 1/29/2106, NTU NVIDIA Think about Computing as Heterogeneous One Leo Liao, 1/29/2106, NTU GPGPU opens the door for co-design HPC, moreover middleware-support embedded system designs to harness the power of GPUaccelerated

More information

OpenACC Standard. Credits 19/07/ OpenACC, Directives for Accelerators, Nvidia Slideware

OpenACC Standard. Credits 19/07/ OpenACC, Directives for Accelerators, Nvidia Slideware OpenACC Standard Directives for Accelerators Credits http://www.openacc.org/ o V1.0: November 2011 Specification OpenACC, Directives for Accelerators, Nvidia Slideware CAPS OpenACC Compiler, HMPP Workbench

More information

G P G P U : H I G H - P E R F O R M A N C E C O M P U T I N G

G P G P U : H I G H - P E R F O R M A N C E C O M P U T I N G Joined Advanced Student School (JASS) 2009 March 29 - April 7, 2009 St. Petersburg, Russia G P G P U : H I G H - P E R F O R M A N C E C O M P U T I N G Dmitry Puzyrev St. Petersburg State University Faculty

More information

Titan - Early Experience with the Titan System at Oak Ridge National Laboratory

Titan - Early Experience with the Titan System at Oak Ridge National Laboratory Office of Science Titan - Early Experience with the Titan System at Oak Ridge National Laboratory Buddy Bland Project Director Oak Ridge Leadership Computing Facility November 13, 2012 ORNL s Titan Hybrid

More information

GPU Programming. Ringberg Theorie Seminar 2010

GPU Programming. Ringberg Theorie Seminar 2010 or How to tremendously accelerate your code? Michael Kraus, Christian Konz Max-Planck-Institut für Plasmaphysik, Garching Ringberg Theorie Seminar 2010 Introduction? GPU? GPUs can do more than just render

More information

S Comparing OpenACC 2.5 and OpenMP 4.5

S Comparing OpenACC 2.5 and OpenMP 4.5 April 4-7, 2016 Silicon Valley S6410 - Comparing OpenACC 2.5 and OpenMP 4.5 James Beyer, NVIDIA Jeff Larkin, NVIDIA GTC16 April 7, 2016 History of OpenMP & OpenACC AGENDA Philosophical Differences Technical

More information

Introduction to GPU Computing. 周国峰 Wuhan University 2017/10/13

Introduction to GPU Computing. 周国峰 Wuhan University 2017/10/13 Introduction to GPU Computing chandlerz@nvidia.com 周国峰 Wuhan University 2017/10/13 GPU and Its Application 3 Ways to Develop Your GPU APP An Example to Show the Developments Add GPUs: Accelerate Science

More information

Introduction to OpenACC. Shaohao Chen Research Computing Services Information Services and Technology Boston University

Introduction to OpenACC. Shaohao Chen Research Computing Services Information Services and Technology Boston University Introduction to OpenACC Shaohao Chen Research Computing Services Information Services and Technology Boston University Outline Introduction to GPU and OpenACC Basic syntax and the first OpenACC program:

More information

Quantum Chemistry (QC) on GPUs. March 2018

Quantum Chemistry (QC) on GPUs. March 2018 Quantum Chemistry (QC) on GPUs March 2018 Overview of Life & Material Accelerated Apps MD: All key codes are GPU-accelerated Great multi-gpu performance Focus on dense (up to 16) GPU nodes &/or large #

More information

WHAT S NEW IN CUDA 8. Siddharth Sharma, Oct 2016

WHAT S NEW IN CUDA 8. Siddharth Sharma, Oct 2016 WHAT S NEW IN CUDA 8 Siddharth Sharma, Oct 2016 WHAT S NEW IN CUDA 8 Why Should You Care >2X Run Computations Faster* Solve Larger Problems** Critical Path Analysis * HOOMD Blue v1.3.3 Lennard-Jones liquid

More information

HETEROGENEOUS HPC, ARCHITECTURAL OPTIMIZATION, AND NVLINK STEVE OBERLIN CTO, TESLA ACCELERATED COMPUTING NVIDIA

HETEROGENEOUS HPC, ARCHITECTURAL OPTIMIZATION, AND NVLINK STEVE OBERLIN CTO, TESLA ACCELERATED COMPUTING NVIDIA HETEROGENEOUS HPC, ARCHITECTURAL OPTIMIZATION, AND NVLINK STEVE OBERLIN CTO, TESLA ACCELERATED COMPUTING NVIDIA STATE OF THE ART 2012 18,688 Tesla K20X GPUs 27 PetaFLOPS FLAGSHIP SCIENTIFIC APPLICATIONS

More information

Introduction to GPU Computing Using CUDA. Spring 2014 Westgid Seminar Series

Introduction to GPU Computing Using CUDA. Spring 2014 Westgid Seminar Series Introduction to GPU Computing Using CUDA Spring 2014 Westgid Seminar Series Scott Northrup SciNet www.scinethpc.ca March 13, 2014 Outline 1 Heterogeneous Computing 2 GPGPU - Overview Hardware Software

More information

Introduction to GPU Computing Using CUDA. Spring 2014 Westgid Seminar Series

Introduction to GPU Computing Using CUDA. Spring 2014 Westgid Seminar Series Introduction to GPU Computing Using CUDA Spring 2014 Westgid Seminar Series Scott Northrup SciNet www.scinethpc.ca (Slides http://support.scinet.utoronto.ca/ northrup/westgrid CUDA.pdf) March 12, 2014

More information

OpenACC 2.6 Proposed Features

OpenACC 2.6 Proposed Features OpenACC 2.6 Proposed Features OpenACC.org June, 2017 1 Introduction This document summarizes features and changes being proposed for the next version of the OpenACC Application Programming Interface, tentatively

More information

Module Memory and Data Locality

Module Memory and Data Locality GPU Teaching Kit Accelerated Computing Module 4.4 - Memory and Data Locality Tiled Matrix Multiplication Kernel Objective To learn to write a tiled matrix-multiplication kernel Loading and using tiles

More information

EXTENDING THE REACH OF PARALLEL COMPUTING WITH CUDA

EXTENDING THE REACH OF PARALLEL COMPUTING WITH CUDA EXTENDING THE REACH OF PARALLEL COMPUTING WITH CUDA Mark Harris, NVIDIA @harrism #NVSC14 EXTENDING THE REACH OF CUDA 1 Machine Learning 2 Higher Performance 3 New Platforms 4 New Languages 2 GPUS: THE

More information

GPGPU. Alan Gray/James Perry EPCC The University of Edinburgh.

GPGPU. Alan Gray/James Perry EPCC The University of Edinburgh. GPGPU Alan Gray/James Perry EPCC The University of Edinburgh a.gray@ed.ac.uk Contents Introduction GPU Technology Programming GPUs GPU Performance Optimisation 2 Introduction 3 Introduction Central Processing

More information

OpenACC programming for GPGPUs: Rotor wake simulation

OpenACC programming for GPGPUs: Rotor wake simulation DLR.de Chart 1 OpenACC programming for GPGPUs: Rotor wake simulation Melven Röhrig-Zöllner, Achim Basermann Simulations- und Softwaretechnik DLR.de Chart 2 Outline Hardware-Architecture (CPU+GPU) GPU computing

More information

Josef Pelikán, Jan Horáček CGG MFF UK Praha

Josef Pelikán, Jan Horáček CGG MFF UK Praha GPGPU and CUDA 2012-2018 Josef Pelikán, Jan Horáček CGG MFF UK Praha pepca@cgg.mff.cuni.cz http://cgg.mff.cuni.cz/~pepca/ 1 / 41 Content advances in hardware multi-core vs. many-core general computing

More information

Early Experiences Writing Performance Portable OpenMP 4 Codes

Early Experiences Writing Performance Portable OpenMP 4 Codes Early Experiences Writing Performance Portable OpenMP 4 Codes Verónica G. Vergara Larrea Wayne Joubert M. Graham Lopez Oscar Hernandez Oak Ridge National Laboratory Problem statement APU FPGA neuromorphic

More information

Stan Posey, NVIDIA, Santa Clara, CA, USA

Stan Posey, NVIDIA, Santa Clara, CA, USA Stan Posey, sposey@nvidia.com NVIDIA, Santa Clara, CA, USA NVIDIA Strategy for CWO Modeling (Since 2010) Initial focus: CUDA applied to climate models and NWP research Opportunities to refactor code with

More information

Quantum ESPRESSO on GPU accelerated systems

Quantum ESPRESSO on GPU accelerated systems Quantum ESPRESSO on GPU accelerated systems Massimiliano Fatica, Everett Phillips, Josh Romero - NVIDIA Filippo Spiga - University of Cambridge/ARM (UK) MaX International Conference, Trieste, Italy, January

More information

Overview: Graphics Processing Units

Overview: Graphics Processing Units advent of GPUs GPU architecture Overview: Graphics Processing Units the NVIDIA Fermi processor the CUDA programming model simple example, threads organization, memory model case study: matrix multiply

More information

TESLA V100 PERFORMANCE GUIDE. Life Sciences Applications

TESLA V100 PERFORMANCE GUIDE. Life Sciences Applications TESLA V100 PERFORMANCE GUIDE Life Sciences Applications NOVEMBER 2017 TESLA V100 PERFORMANCE GUIDE Modern high performance computing (HPC) data centers are key to solving some of the world s most important

More information

Advanced OpenACC. John Urbanic Parallel Computing Scientist Pittsburgh Supercomputing Center. Copyright 2018

Advanced OpenACC. John Urbanic Parallel Computing Scientist Pittsburgh Supercomputing Center. Copyright 2018 Advanced OpenACC John Urbanic Parallel Computing Scientist Pittsburgh Supercomputing Center Copyright 2018 Outline Loop Directives Data Declaration Directives Data Regions Directives Cache directives Wait

More information

ACCELERATED COMPUTING: THE PATH FORWARD. Jen-Hsun Huang, Co-Founder and CEO, NVIDIA SC15 Nov. 16, 2015

ACCELERATED COMPUTING: THE PATH FORWARD. Jen-Hsun Huang, Co-Founder and CEO, NVIDIA SC15 Nov. 16, 2015 ACCELERATED COMPUTING: THE PATH FORWARD Jen-Hsun Huang, Co-Founder and CEO, NVIDIA SC15 Nov. 16, 2015 COMMODITY DISRUPTS CUSTOM SOURCE: Top500 ACCELERATED COMPUTING: THE PATH FORWARD It s time to start

More information

Lecture 1: an introduction to CUDA

Lecture 1: an introduction to CUDA Lecture 1: an introduction to CUDA Mike Giles mike.giles@maths.ox.ac.uk Oxford University Mathematical Institute Oxford e-research Centre Lecture 1 p. 1 Overview hardware view software view CUDA programming

More information

EXPOSING PARTICLE PARALLELISM IN THE XGC PIC CODE BY EXPLOITING GPU MEMORY HIERARCHY. Stephen Abbott, March

EXPOSING PARTICLE PARALLELISM IN THE XGC PIC CODE BY EXPLOITING GPU MEMORY HIERARCHY. Stephen Abbott, March EXPOSING PARTICLE PARALLELISM IN THE XGC PIC CODE BY EXPLOITING GPU MEMORY HIERARCHY Stephen Abbott, March 26 2018 ACKNOWLEDGEMENTS Collaborators: Oak Ridge Nation Laboratory- Ed D Azevedo NVIDIA - Peng

More information

ACCELERATING SANJEEVINI: A DRUG DISCOVERY SOFTWARE SUITE

ACCELERATING SANJEEVINI: A DRUG DISCOVERY SOFTWARE SUITE ACCELERATING SANJEEVINI: A DRUG DISCOVERY SOFTWARE SUITE Abhilash Jayaraj, IIT Delhi Shashank Shekhar, IIT Delhi Bharatkumar Sharma, Nvidia Nagavijayalakshmi, Nvidia AGENDA Quick Introduction to Computer

More information

TESLA P100 PERFORMANCE GUIDE. Deep Learning and HPC Applications

TESLA P100 PERFORMANCE GUIDE. Deep Learning and HPC Applications TESLA P PERFORMANCE GUIDE Deep Learning and HPC Applications SEPTEMBER 217 TESLA P PERFORMANCE GUIDE Modern high performance computing (HPC) data centers are key to solving some of the world s most important

More information

Unrolling parallel loops

Unrolling parallel loops Unrolling parallel loops Vasily Volkov UC Berkeley November 14, 2011 1 Today Very simple optimization technique Closely resembles loop unrolling Widely used in high performance codes 2 Mapping to GPU:

More information

Scientific discovery, analysis and prediction made possible through high performance computing.

Scientific discovery, analysis and prediction made possible through high performance computing. Scientific discovery, analysis and prediction made possible through high performance computing. An Introduction to GPGPU Programming Bob Torgerson Arctic Region Supercomputing Center November 21 st, 2013

More information

CUDA Memory Types All material not from online sources/textbook copyright Travis Desell, 2012

CUDA Memory Types All material not from online sources/textbook copyright Travis Desell, 2012 CUDA Memory Types All material not from online sources/textbook copyright Travis Desell, 2012 Overview 1. Memory Access Efficiency 2. CUDA Memory Types 3. Reducing Global Memory Traffic 4. Example: Matrix-Matrix

More information

Parallel Programming and Debugging with CUDA C. Geoff Gerfin Sr. System Software Engineer

Parallel Programming and Debugging with CUDA C. Geoff Gerfin Sr. System Software Engineer Parallel Programming and Debugging with CUDA C Geoff Gerfin Sr. System Software Engineer CUDA - NVIDIA s Architecture for GPU Computing Broad Adoption Over 250M installed CUDA-enabled GPUs GPU Computing

More information

An Introduction to OpenACC. Zoran Dabic, Rusell Lutrell, Edik Simonian, Ronil Singh, Shrey Tandel

An Introduction to OpenACC. Zoran Dabic, Rusell Lutrell, Edik Simonian, Ronil Singh, Shrey Tandel An Introduction to OpenACC Zoran Dabic, Rusell Lutrell, Edik Simonian, Ronil Singh, Shrey Tandel Chapter 1 Introduction OpenACC is a software accelerator that uses the host and the device. It uses compiler

More information

Parallel Computing. Hwansoo Han (SKKU)

Parallel Computing. Hwansoo Han (SKKU) Parallel Computing Hwansoo Han (SKKU) Unicore Limitations Performance scaling stopped due to Power consumption Wire delay DRAM latency Limitation in ILP 10000 SPEC CINT2000 2 cores/chip Xeon 3.0GHz Core2duo

More information

Comparing OpenACC 2.5 and OpenMP 4.1 James C Beyer PhD, Sept 29 th 2015

Comparing OpenACC 2.5 and OpenMP 4.1 James C Beyer PhD, Sept 29 th 2015 Comparing OpenACC 2.5 and OpenMP 4.1 James C Beyer PhD, Sept 29 th 2015 Abstract As both an OpenMP and OpenACC insider I will present my opinion of the current status of these two directive sets for programming

More information

Preparing GPU-Accelerated Applications for the Summit Supercomputer

Preparing GPU-Accelerated Applications for the Summit Supercomputer Preparing GPU-Accelerated Applications for the Summit Supercomputer Fernanda Foertter HPC User Assistance Group Training Lead foertterfs@ornl.gov This research used resources of the Oak Ridge Leadership

More information

IBM CORAL HPC System Solution

IBM CORAL HPC System Solution IBM CORAL HPC System Solution HPC and HPDA towards Cognitive, AI and Deep Learning Deep Learning AI / Deep Learning Strategy for Power Power AI Platform High Performance Data Analytics Big Data Strategy

More information

Unified Memory. Notes on GPU Data Transfers. Andreas Herten, Forschungszentrum Jülich, 24 April Member of the Helmholtz Association

Unified Memory. Notes on GPU Data Transfers. Andreas Herten, Forschungszentrum Jülich, 24 April Member of the Helmholtz Association Unified Memory Notes on GPU Data Transfers Andreas Herten, Forschungszentrum Jülich, 24 April 2017 Handout Version Overview, Outline Overview Unified Memory enables easy access to GPU development But some

More information

Real-time Graphics 9. GPGPU

Real-time Graphics 9. GPGPU Real-time Graphics 9. GPGPU GPGPU GPU (Graphics Processing Unit) Flexible and powerful processor Programmability, precision, power Parallel processing CPU Increasing number of cores Parallel processing

More information

Parallel Computing. Lecture 19: CUDA - I

Parallel Computing. Lecture 19: CUDA - I CSCI-UA.0480-003 Parallel Computing Lecture 19: CUDA - I Mohamed Zahran (aka Z) mzahran@cs.nyu.edu http://www.mzahran.com GPU w/ local DRAM (device) Behind CUDA CPU (host) Source: http://hothardware.com/reviews/intel-core-i5-and-i7-processors-and-p55-chipset/?page=4

More information

Inter-Block GPU Communication via Fast Barrier Synchronization

Inter-Block GPU Communication via Fast Barrier Synchronization CS 3580 - Advanced Topics in Parallel Computing Inter-Block GPU Communication via Fast Barrier Synchronization Mohammad Hasanzadeh-Mofrad University of Pittsburgh September 12, 2017 1 General Purpose Graphics

More information

Compiling a High-level Directive-Based Programming Model for GPGPUs

Compiling a High-level Directive-Based Programming Model for GPGPUs Compiling a High-level Directive-Based Programming Model for GPGPUs Xiaonan Tian, Rengan Xu, Yonghong Yan, Zhifeng Yun, Sunita Chandrasekaran, and Barbara Chapman Department of Computer Science, University

More information