ACCELERATING HPC APPLICATIONS ON NVIDIA GPUS WITH OPENACC

Size: px

Start display at page:

Download "ACCELERATING HPC APPLICATIONS ON NVIDIA GPUS WITH OPENACC"

Zoe Leonard
5 years ago
Views:

1 ACCELERATING HPC APPLICATIONS ON NVIDIA GPUS WITH OPENACC Doug Miles, PGI Compilers & Tools, NVIDIA High Performance Computing Advisory Council February 21, 2018

PGI THE NVIDIA HPC SDK Fortran, C & C++ Compilers

Features CUDA Fortran, OpenACC Directives Multi-Platform

GPUs Supported on Linux, macOS, Windows MPI/OpenMP/OpenACC

2 PGI THE NVIDIA HPC SDK Fortran, C & C++ Compilers Optimizing, SIMD Vectorizing, OpenMP Accelerated Computing Features CUDA Fortran, OpenACC Directives Multi-Platform Solution X86-64 and OpenPOWER Multicore CPUs NVIDIA Tesla GPUs Supported on Linux, macOS, Windows MPI/OpenMP/OpenACC Tools Debugger Performance Profiler Interoperable with DDT, TotalView 2

3 Programming GPU-Accelerated Systems Separate CPU System and GPU Memories GPU Developer View PCIe System Memory GPU Memory 3

4 Programming GPU-Accelerated Systems Separate CPU System and GPU Memories GPU Developer View NVLink System Memory GPU Memory 4

5 CUDA FORTRAN real, device, allocatable, dimension(:,:) :: Adev,Bdev,Cdev... allocate (Adev(N,M), Bdev(M,L), Cdev(N,L)) Adev = A(1:N,1:M) Bdev = B(1:M,1:L) call mm_kernel <<<dim3(n/16,m/16),dim3(16,16)>>> ( Adev, Bdev, Cdev, N, M, L ) C(1:N,1:L) = Cdev deallocate ( Adev, Bdev, Cdev )... attributes(global) subroutine mm_kernel ( A, B, C, N, M, L ) real :: A(N,M), B(M,L), C(N,L), Cij integer, value :: N, M, L integer :: i, j, kb, k, tx, ty real, shared :: Asub(16,16),Bsub(16,16) tx = threadidx%x ty = threadidx%y i = blockidx%x * 16 + tx j = blockidx%y * 16 + ty Cij = 0.0 do kb = 1, M, 16 Asub(tx,ty) = A(i,kb+tx-1) Bsub(tx,ty) = B(kb+ty-1,j) call syncthreads() do k = 1,16 Cij = Cij + Asub(tx,k) * Bsub(k,ty) enddo call syncthreads() enddo C(i,j) = Cij end subroutine mmul_kernel CPU Code Tesla Code 5

6 CUDA FORTRAN!$CUF KERNEL Directives module madd_device_module use cudafor contains subroutine madd_dev(a,b,c,sum,n1,n2) real,dimension(:,:),device :: a,b,c real :: sum integer :: n1,n2 type(dim3) :: grid, block!$cuf kernel do (2) <<<(*,*),(32,4)>>> do j = 1,n2 do i = 1,n1 a(i,j) = b(i,j) + c(i,j) sum = sum + a(i,j) enddo enddo end subroutine end module Equivalent hand-written CUDA kernels module madd_device_module use cudafor implicit none contains attributes(global) subroutine madd_kernel(a,b,c,blocksum,n1,n2) real, dimension(:,:) :: a,b,c real, dimension(:) :: blocksum integer, value :: n1,n2 integer :: i,j,tindex,tneighbor,bindex real :: mysum real, shared :: bsum(256)! Do this thread's work mysum = 0.0 do j = threadidx%y + (blockidx%y-1)*blockdim%y, n2, blockdim%y*griddim%y do i = threadidx%x + (blockidx%x-1)*blockdim%x, n1, blockdim%x*griddim%x a(i,j) = b(i,j) + c(i,j) mysum = mysum + a(i,j)! accumulates partial sum per thread enddo enddo! Now add up all partial sums for the whole thread block! Compute this thread's linear index in the thread block! We assume 256 threads in the thread block tindex = threadidx%x + (threadidx%y-1)*blockdim%x! Store this thread's partial sum in the shared memory block bsum(tindex) = mysum call syncthreads()! Accumulate all the partial sums for this thread block to a single value tneighbor = 128 do while( tneighbor >= 1 ) if( tindex <= tneighbor ) & bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor) tneighbor = tneighbor / 2 call syncthreads() enddo! Store the partial sum for the thread block bindex = blockidx%x + (blockidx%y-1)*griddim%x if( tindex == 1 ) blocksum(bindex) = bsum(1) end subroutine! Add up partial sums for all thread blocks to a single cumulative sum attributes(global) subroutine madd_sum_kernel(blocksum,dsum,nb) real, dimension(:) :: blocksum real :: dsum integer, value :: nb real, shared :: bsum(256) integer :: tindex,tneighbor,i! Again, we assume 256 threads in the thread block! 
accumulate a partial sum for each thread tindex = threadidx%x bsum(tindex) = 0.0 do i = tindex, nb, blockdim%x bsum(tindex) = bsum(tindex) + blocksum(i) enddo call syncthreads()! This code is copied from the previous kernel! Accumulate all the partial sums for this thread block to a single value! Since there is only one thread block, this single value is the final result tneighbor = 128 do while( tneighbor >= 1 ) if( tindex <= tneighbor ) & bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor) tneighbor = tneighbor / 2 call syncthreads() enddo if( tindex == 1 ) dsum = bsum(1) end subroutine subroutine madd_dev(a,b,c,dsum,n1,n2) real, dimension(:,:), device :: a,b,c real, device :: dsum real, dimension(:), allocatable, device :: blocksum integer :: n1,n2,nb type(dim3) :: grid, block integer :: r! Compute grid/block size; block size must be 256 threads grid = dim3((n1+31)/32, (n2+7)/8, 1) block = dim3(32,8,1) nb = grid%x * grid%y allocate(blocksum(1:nb)) call madd_kernel<<< grid, block >>>(a,b,c,blocksum,n1,n2) call madd_sum_kernel<<< 1, 256 >>>(blocksum,dsum,nb) r = cudathreadsynchronize()! don't deallocate too early deallocate(blocksum) end subroutine 6

.. #pragma acc parallel { #pragma acc loop gang vector for (i = 0; i < n; ++i) {

7 OpenACC Directives Manage Data Movement Initiate Parallel Execution Optimize Loop Mappings #pragma acc data copyin(a,b) copyout(c) {... #pragma acc parallel { #pragma acc loop gang vector for (i = 0; i < n; ++i) { c[i] = a[i] + b[i];... } }... } Incremental Single source Interoperable Performance portable CPU, GPU, Manycore 7

$.. #pragma acc data copy(b[0:n][0:m]) \ create(a[0:n][0:m]) { for (iter = 1; iter <= p; ++iter){ #pragma acc parallel loop for (i = 1; i < n-1; ++i){ for (j = 1; j < m-1; ++j){ a[i][j]=w0*b[i][j]+$

8 OpenACC for GPUs in a Nutshell... #pragma acc data copy(b[0:n][0:m]) \ create(a[0:n][0:m]) { for (iter = 1; iter <= p; ++iter){ #pragma acc parallel loop for (i = 1; i < n-1; ++i){ for (j = 1; j < m-1; ++j){ a[i][j]=w0*b[i][j]+ w1*(b[i-1][j]+b[i+1][j]+ b[i][j-1]+b[i][j+1])+ w2*(b[i-1][j-1]+b[i-1][j+1]+ b[i+1][j-1]+b[i+1][j+1]); } } #pragma acc parallel loop for( i = 1; i < n-1; ++i ) for( j = 1; j < m-1; ++j ) b[i][j] = a[i][j]; } }... A B System Memory S p 12 (B) S 1 p (B) GPU Memory 8

9 OpenACC is for Multicore, Manycore & GPUs 98!$acc parallel 99!$acc loop independent 100 do k=y_min-depth,y_max+depth 101!$acc loop independent 102 do j=1,depth 103 density0(x_min-j,k)=left_density0(left_xmax+1-j,k) 104 enddo 105 enddo 106!$acc end parallel Multicore CPU Tesla GPU % pgfortran -ta=multicore fast Minfo=acc -c \ update_tile_halo_kernel.f , Loop is parallelizable Generating Multicore code 100,!$acc loop gang 102, Loop is parallelizable % pgfortran -ta=tesla fast -Minfo=acc c \ update_tile_halo_kernel.f , Loop is parallelizable 102, Loop is parallelizable Accelerator kernel generated Generating Tesla code 100,!$acc loop gang, vector(4)! blockidx%y threadidx%y 102,!$acc loop gang, vector(32)! blockidx%x threadidx%x 9

10 GEOMEAN Seconds GEOMEAN Seconds SPEC ACCEL 1.2 BENCHMARKS 200 OpenACC 200 OpenMP 4.5 PGI 18.1 Intel 2018 PGI x Speed-up socket Broadwell 1x Volta V socket Skylake 2-socket EPYC 2-socket Broadwell 40 cores / 80 threads 48 cores / 48 threads 40 cores / 80 threads Performance measured February, Skylake: Two 20 core Intel Xeon Gold GHz w/ 376GB memory, hyperthreading enabled. EPYC: Two 24 core AMD EPYC GHz w/ 256GB memory. Broadwell: Two 20 core Intel Xeon E v4 3.6GHz w/ 256GB memory, hyperthreading enabled. Volta: NVIDIA DGX1 system with two 20 core Intel Xeon E v4 2.20GHz, 256GB memory, one NVIDIA Tesla V100-SXM2-16GB 1.53GHz. SPEC is a registered trademark of the Standard Performance Evaluation Corporation (

11 OPENACC APPLICATIONS 11

%GPUCPU=0-7=0-7 Use GPUs 0-7 with CPUs 0-7 as their controllers. Detailed information is available on our website. Project Contributors GAUSSIAN 16 Roberto Gomperts NVIDIA Gaussian, Inc.

Using OpenACC allowed us to continue development of our fundamental algorithms and software capabilities simultaneously with the GPU-related work. Brent Leback NVIDIA/PGI

12 %GPUCPU=0-7=0-7 Use GPUs 0-7 with CPUs 0-7 as their controllers. Detailed information is available on our website. Project Contributors GAUSSIAN 16 Roberto Gomperts NVIDIA Gaussian, Inc. 340 Quinnipiac St. Bldg. 40 Wallingford, CT USA custserv@gaussian.com Michael Frisch Gaussian Brent Leback NVIDIA/PGI Mike Frisch, Ph.D. President and CEO Gaussian, Inc. Using OpenACC allowed us to continue development of our fundamental algorithms and software capabilities simultaneously with the GPU-related work. In the end, we could use the same code base for SMP, cluster/network and GPU parallelism. PGI's compilers were essential to the success of our efforts. Copyright 2017, Gaussian, Inc. All rights reserved. Gio Gaussian is a registered trademark of Gaussian, Inc. All other trademarks are the properties of their respective holders. Specifications subject to change without notice. 12

13 ANSYS FLUENT Sunil Sathe Lead Software Developer ANSYS Fluent We've effectively used OpenACC for heterogeneous computing in ANSYS Fluent with impressive performance. We're now applying this work to more of our models and new platforms. Image courtesy: ANSYS 13

14 VASP Prof. Georg Kresse Computational Materials Physics University of Vienna For VASP, OpenACC is the way forward for GPU acceleration. Performance is similar and in some cases better than CUDA C, and OpenACC dramatically decreases GPU development and maintenance efforts. We're excited to collaborate with NVIDIA and PGI as an early adopter of CUDA Unified Memory. 14

OpenACC would have been impossible two or three years ago, but

15 NUMECA FINE/Open David Gutzwiller Lead Software Developer NUMECA Porting our unstructured C++ CFD solver FINE/Open to GPUs using OpenACC would have been impossible two or three years ago, but OpenACC has developed enough that we're now getting some really good results. 15

16 MPAS-A Richard Loft Director, Technology Development NCAR Image courtesy: NCAR Our team has been evaluating OpenACC as a pathway to performance portability for the Model for Prediction Across Scales (MPAS) atmospheric model. Using this approach on the MPAS dynamical core, we have achieved performance on a single P100 GPU equivalent to 2.7 dual socketed Intel Xeon nodes on our new Cheyenne supercomputer. 16

17 COSMO Dr. Oliver Fuhrer Senior Scientist Meteoswiss OpenACC made it practical to develop for GPU-based hardware while retaining a single source for almost all the COSMO physics code. 17

18 GAMERA FOR GPU Takuma Yamaguchi, Kohei Fujita, Tsuyoshi Ichimura, Muneo Hori, Lalith Wijerathne The University of Tokyo With OpenACC and a compute node based on NVIDIA's Tesla P100 GPU, we achieved more than a 14X speed up over a K Computer node running our earthquake disaster simulation code. Map courtesy University of Tokyo 18

potential of the CUDA programming model and NVIDIA GPUs.

19 QUANTUM ESPRESSO Filippo Spiga Head of Research Software Engineering University of Cambridge CUDA Fortran gives us the full performance potential of the CUDA programming model and NVIDIA GPUs. !$CUF KERNELS directives give us productivity and source code maintainability. It's the best of both worlds. 19

20 OPENACC AND CUDA UNIFIED MEMORY 20

21 Programming GPU-Accelerated Systems CUDA Unified Memory for Dynamically Allocated Data GPU Developer View GPU Developer View With CUDA Unified Memory PCIe System Memory GPU Memory Unified Memory 21

22 How CUDA Unified Memory Works on TESLA GPUs Servicing CPU and GPU Page Faults for Allocatable Data cudamallocmanaged(&array, size); memset(array, size); setvalue<<<...>>>(array, size/2, 5);... global void setvalue(char *ptr, int index, char val) { ptr[index] = val; } Page Fault CPU Memory Mapping array GPU Memory Mapping array Page Fault PCIe or NVLink 22

23 PGI OpenACC and CUDA Unified Memory Compiling with the -ta=tesla:managed option #pragma acc data copyin(a,b) copyout(c) {... #pragma acc parallel { #pragma acc loop gang vector for (i = 0; i < n; ++i) { c[i] = a[i] + b[i];... } }... } GPU Developer View With CUDA Unified Memory Unified Memory C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory 23

24 PGI OpenACC and CUDA Unified Memory Compiling with the -ta=tesla:managed option... #pragma acc parallel { #pragma acc loop gang vector for (i = 0; i < n; ++i) { c[i] = a[i] + b[i];... } }... GPU Developer View With CUDA Unified Memory Unified Memory C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory 24

25 Center for Accelerated Application Readiness (CAAR) Oak Ridge Leadership Computing Facility IBM POWER9 CPUs NVIDIA Volta V100 GPUs 25

GTC: An OpenACC Production Application Being ported for runs on the ORNL Summit supercomputer The gyrokinetic toroidal code (GTC) is a massively parallel, particle-in-cell

26 GTC: An OpenACC Production Application Being ported for runs on the ORNL Summit supercomputer The gyrokinetic toroidal code (GTC) is a massively parallel, particle-in-cell production code for turbulence simulation in support of the burning plasma experiment ITER, the crucial next step in the quest for fusion energy. 26

27 GTC Performance using OpenACC OpenPOWER NVLink Unified Memory P100 V100 16x 14x 16.5X 12x 10x 12.1X 12X 8x 6x 4x 6.1X 5.9X 2x P8 UM 20-core P8 P8+2xP100 UM P8+2xP100 : IBM POWER8NVL, 2 sockets, 20 cores, NVLINK : No Data Directives in sources, compiled with ta=tesla:managed P8+4xP100 UM P8+4xP100 x64+4xv100 Data Directives Data Directives Data Directives 27

28 DEEP COPY 28

29 Managing Aggregate Data Structures with OpenACC An Example from the OpenACC port of VASP Derived Type 1 Members: 3 dynamic 1 derived type 2 Real-world applications often have complex, aggregate data structures CUDA Unified Memory can automatically manage Deep Copy, but Derived Type 3 Members: only static Derived Type 2 Members: 21 dynamic 1 derived type 3 1 derived type 4 Derived Type 4 Members: 8 dynamic 4 derived type 5 2 derived type 6 Derived Type 5 Members: 3 dynamic Derived Type 6 Members: 8 dynamic 29

30 Managing Aggregate Data Structures with OpenACC An Example from the OpenACC port of VASP Derived Type 1 Members: 3 dynamic 1 derived type 2 Real-world applications often have complex, aggregate data structures CUDA Unified Memory can automatically manage Deep Copy, but CUDA Unified Memory is only for allocatable data today Derived Type 3 Members: only static Derived Type 2 Members: 21 dynamic 1 derived type 3 1 derived type 4 Derived Type 4 Members: 8 dynamic 4 derived type 5 2 derived type 6 Derived Type 5 Members: 3 dynamic Derived Type 6 Members: 8 dynamic 30

31 FORTRAN AUTOMATIC FULL DEEP COPY Fortran Derived Types 31

32 OPENACC 2.6 MANUAL DEEP COPY Supported Today in PGI Compilers typedef struct points { float* x; float* y; float* z; int n; float coef, direction; } points; void sub ( int n, float* y ) { points p; #pragma acc data create (p) { p.n = n; p.x = ( float*) malloc ( sizeof ( float )*n ); p.y = ( float*) malloc ( sizeof ( float )*n ); p.z = ( float*) malloc ( sizeof ( float )*n ); #pragma acc update device (p.n) #pragma acc data copyin (p.x[0:n], p.y[0: n]) { #pragma acc parallel loop for ( i =0; i<p.n; ++I ) p.x[i] += p.y[i];... 32

33 DRAFT OPENACC 3.0 TRUE DEEP COPY Still in definition by the OpenACC Committee typedef struct points { float* x; float* y; float* z; int n; float coef, direction; #pragma acc policy inout(x[0:n],y[0:n]) } points; void sub ( int n, float* y ) { points p; p.n = n; p.x = ( float*) malloc ( sizeof ( float )*n ); p.y = ( float*) malloc ( sizeof ( float )*n ); p.z = ( float*) malloc ( sizeof ( float )*n ); #pragma acc data copy (p) { #pragma acc parallel loop for ( i =0; i<p.n; ++I ) p.x[i] += p.y[i];... 33

34 WHITHER OPENACC? 34

35 Speedup vs Single Haswell Core CLOVERLEAF AWE Hydrodynamics mini-app, bm32 data set x PGI 18.1 OpenACC Intel 2018 OpenMP 40x 7.6x 7.9x 10x 10x 14.8x 15x 11x Multicore Haswell Multicore Broadwell Multicore Skylake Kepler Pascal 109x 67x 1x 2x 4x Volta V100 Systems: Haswell: 2x16 core Haswell server, four K80s, CentOS 7.2 (perf-hsw10), Broadwell: 2x20 core Broadwell server, eight P100s (dgx1-prd-01), Broadwell server, eight V100s (dgx07), Skylake 2x20 core Xeon Gold server (sky-4). Compilers: Intel , PGI 18.1 Benchmark: CloverLeaf v1.3 downloaded from the week of November ; CloverlLeaf_Serial; CloverLeaf_ref (MPI+OpenMP); CloverLeaf_OpenACC (MPI+OpenACC) Data compiled by PGI February

36 OPENACC DIRECTIVES FOR GPUS % pgfortran fast ta=tesla Minfo -c PdV_kernel.f90 pdv_kernel:... 77, Loop is parallelizable 79, Loop is parallelizable Accelerator kernel generated Generating Tesla code 77,!$acc loop gang, vector(4)! blockidx%y! threadidx%y 79,!$acc loop gang, vector(32)! blockidx%x! threadidx%x... 75!$ACC KERNELS 76!$ACC LOOP INDEPENDENT 77 DO k=y_min,y_max 78!$ACC LOOP INDEPENDENT PRIVATE(right_flux,left_flux,top_flux,bottom_flux,total_flux, min_cell_volume,energy_change,recip_volume) 79 DO j=x_min,x_max left_flux= (xarea(j,k )*(xvel0(j,k )+xvel0(j,k+1) & 82 +xvel0(j,k )+xvel0(j,k+1)))*0.25_8*dt* right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) & 84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt* bottom_flux=(yarea(j,k )*(yvel0(j,k )+yvel0(j+1,k ) & 86 +yvel0(j,k )+yvel0(j+1,k )))*0.25_8*dt* top_flux= (yarea(j,k+1)*(yvel0(j,k+1)+yvel0(j+1,k+1) & 88 +yvel0(j,k+1)+yvel0(j+1,k+1)))*0.25_8*dt* total_flux=right_flux-left_flux+top_flux-bottom_flux volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux) min_cell_volume=min(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux & 94,volume(j,k)+right_flux-left_flux & 95,volume(j,k)+top_flux-bottom_flux) 97 recip_volume=1.0/volume(j,k) 99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))* energy1(j,k)=energy0(j,k)-energy_change 103 density1(j,k)=density0(j,k)*volume_change(j,k) 105 ENDDO 106 ENDDO 107!$ACC END KERNELS 36

37 OPENACC DIRECTIVES FOR MULTICORE CPUS % pgfortran fast ta=multicore... PdV_kernel.f90 pdv_kernel:... 77, Loop is parallelizable Generating Multicore code 77,!$acc loop gang 79, Loop is parallelizable 3 loop-carried redundant expressions removed with 9 operations and 9 arrays Innermost loop distributed: 2 new loops Generated vector SIMD code for the loop Generated 2 prefetch instructions for the loop Generated 12 prefetch instructions for the loop... 75!$ACC KERNELS 76!$ACC LOOP INDEPENDENT 77 DO k=y_min,y_max 78!$ACC LOOP INDEPENDENT PRIVATE(right_flux,left_flux,top_flux,bottom_flux,total_flux, min_cell_volume,energy_change,recip_volume) 79 DO j=x_min,x_max left_flux= (xarea(j,k )*(xvel0(j,k )+xvel0(j,k+1) & 82 +xvel0(j,k )+xvel0(j,k+1)))*0.25_8*dt* right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) & 84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt* bottom_flux=(yarea(j,k )*(yvel0(j,k )+yvel0(j+1,k ) & 86 +yvel0(j,k )+yvel0(j+1,k )))*0.25_8*dt* top_flux= (yarea(j,k+1)*(yvel0(j,k+1)+yvel0(j+1,k+1) & 88 +yvel0(j,k+1)+yvel0(j+1,k+1)))*0.25_8*dt* total_flux=right_flux-left_flux+top_flux-bottom_flux volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux) min_cell_volume=min(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux & 94,volume(j,k)+right_flux-left_flux & 95,volume(j,k)+top_flux-bottom_flux) 97 recip_volume=1.0/volume(j,k) 99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))* energy1(j,k)=energy0(j,k)-energy_change 103 density1(j,k)=density0(j,k)*volume_change(j,k) 105 ENDDO 106 ENDDO 107!$ACC END KERNELS 37

38 FORTRAN 2018 DO CONCURRENT Fortran 2018 DO CONCURRENT + True Parallel Loops + Loop-scope shared/private data No support for reductions No support for atomics No support for data management DO CONCURRENT (k=y_min:y_max, j=x_min:x_max) & 78 LOCAL (right_flux,left_flux,top_flux,bottom_flux,total_flux, & min_cell_volume,energy_change,recip_volume) left_flux= (xarea(j,k )*(xvel0(j,k )+xvel0(j,k+1) & 82 +xvel0(j,k )+xvel0(j,k+1)))*0.25_8*dt* right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) & 84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt* bottom_flux=(yarea(j,k )*(yvel0(j,k )+yvel0(j+1,k ) & 86 +yvel0(j,k )+yvel0(j+1,k )))*0.25_8*dt* top_flux= (yarea(j,k+1)*(yvel0(j,k+1)+yvel0(j+1,k+1) & 88 +yvel0(j,k+1)+yvel0(j+1,k+1)))*0.25_8*dt* total_flux=right_flux-left_flux+top_flux-bottom_flux volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux) min_cell_volume=min(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux & 94,volume(j,k)+right_flux-left_flux & 95,volume(j,k)+top_flux-bottom_flux) 97 recip_volume=1.0/volume(j,k) 99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))* energy1(j,k)=energy0(j,k)-energy_change 103 density1(j,k)=density0(j,k)*volume_change(j,k) ENDDO

39 OPENACC FOR EVERYONE The PGI Community Edition, pgicompilers.com/community FREE PROGRAMMING MODELS OpenACC, CUDA Fortran, OpenMP, C/C++/Fortran Compilers and Tools PLATFORMS X86, OpenPOWER, NVIDIA GPU UPDATES 1-2 times a year 6-9 times a year 6-9 times a year SUPPORT User Forums PGI Support PGI Premier Services LICENSE Annual Perpetual Volume/Site 39

SC13 GPU Technology Theater. Accessing New CUDA Features from CUDA Fortran Brent Leback, Compiler Manager, PGI

SC13 GPU Technology Theater. Accessing New CUDA Features from CUDA Fortran Brent Leback, Compiler Manager, PGI SC13 GPU Technology Theater Accessing New CUDA Features from CUDA Fortran Brent Leback, Compiler Manager, PGI The Case for Fortran Clear, straightforward syntax Successful legacy in the scientific community