Motivation OpenACC and OpenMP for Accelerators Cray Compilation Environment (CCE) Examples


1 Dr. James C. Beyer

2 Motivation
   OpenACC and OpenMP for Accelerators
   Cray Compilation Environment (CCE)
   Examples

3 Sum elements of an array. Original Fortran code:

   a = 0.0
   do i = 1,n
      a = a + b(i)
   end do

4
__global__ void reduce0(int *g_idata, int *g_odata)
{
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    for (unsigned int s=1; s < blockDim.x; s *= 2) {
        if ((tid % (2*s)) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

extern "C" void reduce0_cuda_(int *n, int *a, int *b)
{
    int *b_d, red;
    const int b_size = *n;
    cudaMalloc((void **) &b_d, sizeof(int)*b_size);
    cudaMemcpy(b_d, b, sizeof(int)*b_size, cudaMemcpyHostToDevice);
    dim3 dimBlock(128, 1, 1);
    dim3 dimGrid(2048, 1, 1);
    dim3 small_dimGrid(16, 1, 1);
    int smemSize = 128 * sizeof(int);
    int *buffer_d, *red_d;
    int *small_buffer_d;
    cudaMalloc((void **) &buffer_d, sizeof(int)*2048);
    cudaMalloc((void **) &small_buffer_d, sizeof(int)*16);
    cudaMalloc((void **) &red_d, sizeof(int));
    reduce0<<< dimGrid, dimBlock, smemSize >>>(b_d, buffer_d);
    reduce0<<< small_dimGrid, dimBlock, smemSize >>>(buffer_d, small_buffer_d);
    reduce0<<< 1, 16, smemSize >>>(small_buffer_d, red_d);
    cudaMemcpy(&red, red_d, sizeof(int), cudaMemcpyDeviceToHost);
    *a = red;
    cudaFree(buffer_d);
    cudaFree(small_buffer_d);
    cudaFree(b_d);
}

5
template<class T> struct SharedMemory
{
    __device__ inline operator T*()
    {
        extern __shared__ int smem[];
        return (T*) smem;
    }
    __device__ inline operator const T*() const
    {
        extern __shared__ int smem[];
        return (T*) smem;
    }
};

template <class T, unsigned int blockSize, bool nIsPow2>
__global__ void reduce6(T *g_idata, T *g_odata, unsigned int n)
{
    T *sdata = SharedMemory<T>();
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockSize*2 + threadIdx.x;
    unsigned int gridSize = blockSize*2*gridDim.x;
    T mySum = 0;
    while (i < n) {
        mySum += g_idata[i];
        if (nIsPow2 || i + blockSize < n) mySum += g_idata[i+blockSize];
        i += gridSize;
    }
    sdata[tid] = mySum;
    __syncthreads();
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] = mySum = mySum + sdata[tid +  64]; } __syncthreads(); }
    if (tid < 32) {
        volatile T* smem = sdata;
        if (blockSize >= 64) { smem[tid] = mySum = mySum + smem[tid + 32]; }
        if (blockSize >= 32) { smem[tid] = mySum = mySum + smem[tid + 16]; }
        if (blockSize >= 16) { smem[tid] = mySum = mySum + smem[tid +  8]; }
        if (blockSize >=  8) { smem[tid] = mySum = mySum + smem[tid +  4]; }
        if (blockSize >=  4) { smem[tid] = mySum = mySum + smem[tid +  2]; }
        if (blockSize >=  2) { smem[tid] = mySum = mySum + smem[tid +  1]; }
    }
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

extern "C" void reduce6_cuda_(int *n, int *a, int *b)
{
    int *b_d;
    const int b_size = *n;
    cudaMalloc((void **) &b_d, sizeof(int)*b_size);
    cudaMemcpy(b_d, b, sizeof(int)*b_size, cudaMemcpyHostToDevice);
    dim3 dimBlock(128, 1, 1);
    dim3 dimGrid(128, 1, 1);
    dim3 small_dimGrid(1, 1, 1);
    int smemSize = 128 * sizeof(int);
    int *buffer_d;
    int small_buffer[4], *small_buffer_d;
    cudaMalloc((void **) &buffer_d, sizeof(int)*128);
    cudaMalloc((void **) &small_buffer_d, sizeof(int));
    reduce6<int,128,false><<< dimGrid, dimBlock, smemSize >>>(b_d, buffer_d, b_size);
    reduce6<int,128,false><<< small_dimGrid, dimBlock, smemSize >>>(buffer_d, small_buffer_d, 128);
    cudaMemcpy(small_buffer, small_buffer_d, sizeof(int), cudaMemcpyDeviceToHost);
    *a = *small_buffer;
    cudaFree(buffer_d);
    cudaFree(small_buffer_d);
    cudaFree(b_d);
}

6 The compiler does the work:
   Identifies parallel loops within the region
   Determines the kernels needed
   Splits the code into accelerator and host portions
   Workshares loops running on the accelerator
   Makes use of MIMD and SIMD style parallelism
   Data movement: allocates/frees GPU memory at the start/end of the region, moves data to/from the GPU

!$acc data present(a,b)
!$acc parallel
   a = 0.0
!$acc loop reduction(+:a)
   do i = 1,n
      a = a + b(i)
   end do
!$acc end parallel
!$acc end data

7
  90.            subroutine sum_of_int_4(n,a,b)
  91.              use global_data
  92.              integer*4 a,b(n)
  93.              integer*8 start_clock, elapsed_clocks, end_clock
  94.            !$acc data present(a,b)
  95.  G----<    !$acc parallel
  96.  G           a = 0
  97.  G         !$acc loop reduction(+:a)
  98.  G g--<      do i = 1,n
  99.  G g           a = a + b(i)
 100.  G g-->      end do
 101.  G---->    !$acc end parallel
 102.            !$acc end data
 103.            end subroutine sum_of_int_4

ftn-6413 ftn: ACCEL File = gpu_reduce_int_cce.f90, Line = 94
  A data region was created at line 94 and ending at line 107.
ftn-6405 ftn: ACCEL File = gpu_reduce_int_cce.f90, Line = 95
  A region starting at line 95 and ending at line 101 was placed on the accelerator.
ftn-6430 ftn: ACCEL File = gpu_reduce_int_cce.f90, Line = 98
  A loop starting at line 98 was partitioned across the threadblocks and the 128 threads within a threadblock.

8 Summary of code complexity and performance

Programming Model | Unit of computation | Lines of code | Performance in Gflops (higher is better) | Performance normalized to x86 core
Fortran           | Single x86 core     |               | Gflops                                   | 1.0
Simple CUDA       | GPU                 |               | Gflops                                   | 0.87
Optimized CUDA    | GPU                 |               | Gflops                                   | 5.25
OpenACC           | GPU                 |               | Gflops                                   |

9 A common directive programming model for today's GPUs
   Announced at the SC11 conference
   Offers portability between compilers
   Drawn up by: NVIDIA, Cray, PGI, CAPS
   Multiple compilers offer portability, debugging, permanence
   Works for Fortran, C, C++
   Standard available at
   Initially implementations targeted at NVIDIA GPUs
   Current version: 1.0 (November 2011)
   Compiler support:
      Cray CCE: partial 2011, complete in 2012
      PGI Accelerator: released product in 2012
      CAPS: released product in

10 A common directive programming model for shared memory systems
   Announced 15 years ago
   Works with Fortran, C, C++
   Current version: 3.1 (July 2011)
   Compiler support

11 Host-directed execution with an attached GPU
   Main program executes on the host (i.e. CPU)
   Compute-intensive regions are offloaded to the accelerator device under control of the host
   The device (i.e. GPU) executes parallel regions, which typically contain kernels (i.e. work-sharing loops), or kernels regions containing one or more loops that are executed as kernels
   The host must orchestrate the execution by:
      allocating memory on the accelerator device,
      initiating data transfer,
      sending the code to the accelerator,
      passing arguments to the parallel region,
      queuing the device code,
      waiting for completion,
      transferring results back to the host, and
      deallocating memory
   The host can usually queue a sequence of operations to be executed on the device, one after the other
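
To make this concrete, here is a minimal Fortran/OpenACC sketch (the arrays x and y, the bound n and the scaling operation are invented for illustration); the directives ask the compiler to generate the host-side orchestration steps listed above:

!$acc data copyin(x(1:n)) copyout(y(1:n))   ! host allocates device memory and copies x to the device
!$acc parallel loop                         ! host sends the kernel to the device and queues it
      do i = 1, n
         y(i) = 2.0d0*x(i)                  ! the loop body executes on the accelerator
      end do
!$acc end parallel loop                     ! host waits for completion
!$acc end data                              ! host copies y back and deallocates device memory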

12 Memory spaces on the host and device are distinct
   Different locations, different address spaces
   Data movement is performed by the host using runtime library calls that explicitly move data between the separate memories
   GPUs have a weak memory model: no synchronisation between different execution units (SMs) unless there is an explicit memory barrier
   You can write OpenACC kernels with race conditions, giving inconsistent execution results; the compiler will catch most errors, but not all (there are no user-managed barriers)
   With OpenACC, data movement between the memories is implicit, managed by the compiler based on directives from the programmer
   Device memory caches are managed by the compiler, with hints from the programmer in the form of directives
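
As an illustrative sketch (the arrays a, c and tmp and the bound n are invented, not from the slides), data clauses are the hints that steer this implicit movement: a is only copied to the device, c is only copied back, and tmp never touches host memory:

!$acc data copyin(a(1:n)) copyout(c(1:n)) create(tmp(1:n))
!$acc parallel loop
      do i = 1, n
         tmp(i) = a(i)*a(i)       ! tmp lives only in device memory
         c(i)   = tmp(i) + 1.0d0
      end do
!$acc end parallel loop
!$acc end data                    ! only c is transferred back to the host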

13 A high-level approach to programming accelerators
   Uses the original source code (Fortran, C, C++): no separate source base is needed, so it is much more portable, and subprogram structure can be preserved
   A familiar programming model for anyone who has used traditional OpenMP
   A small performance gap is acceptable: the target is 10-15%, and we are currently seeing better than that for many cases
   Draft proposal currently being discussed by an OpenMP ARB subcommittee, aiming for OpenMP 4.0 (sometime in 2012)
   Cray is an enthusiastic supporter: it co-chairs the committee (with TI), and CCE already provides a reference implementation of the draft standard
   The subcommittee includes most major vendors: Cray, PGI, Nvidia, AMD, TI, IBM, CAPS, Oracle, Intel...
   An open standard is the most attractive option for developers: portability, multiple compilers for debugging

14 OpenACC: Parallel, Kernels, Data, Loop, Host data, Cache, Update, Wait, Declare
   OpenMP for Accelerators: Accelerate, Dispatch, Data, Do/for, Part of data, Nested data, Update, Wait, Mirror/resident

15 man intro_openacc
   Which module to use: craype-accel-nvidia20
   Forces dynamic linking
   Single object file
   Whole program
   Messages/list file
   Compiles to PTX, not CUDA
   Debugger sees the original program, not the CUDA intermediate
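
A typical build-and-run sequence would therefore look something like the sketch below (the source and executable names are placeholders; the module name and the -hmsgs flag are the ones used elsewhere in these slides):

   module load craype-accel-nvidia20
   ftn -hmsgs -o gpu_code gpu_code.f90
   aprun ./gpu_code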

16 Only two constructs are unimplemented: cache and declare
   One data clause is unimplemented: deviceptr

17
/* takes a host pointer */
void* cray_acc_create( void*, size_t );
void  cray_acc_delete( void* );
void* cray_acc_copyin( void*, size_t );
void  cray_acc_copyout( void*, size_t );
void  cray_acc_updatein( void*, size_t );
void  cray_acc_updateout( void*, size_t );
int   cray_acc_is_present( void* );
int   cray_acc_is_present_2( void*, size_t );
void *cray_acc_deviceptr( void* );

/* takes a device and host pointer */
void cray_acc_memcpy_device_host( void*, void*, size_t );

/* takes a host and device pointer */
void cray_acc_memcpy_host_device( void*, void*, size_t );

18
1. !$acc loop gang   : across thread blocks
2. !$acc loop worker : across warps within a thread block
3. !$acc loop vector : across threads within a warp

1. !$acc loop gang          : across thread blocks
2. !$acc loop worker vector : across threads within a thread block

1. !$acc loop gang   : across thread blocks
2. !$acc loop vector : across threads within a thread block

1. !$acc loop gang worker : across thread blocks and the warps within a thread block
2. !$acc loop vector      : across threads within a warp

1. !$acc loop gang vector : across thread blocks and threads within a thread block

1. !$acc loop gang worker vector : across thread blocks and threads within a thread block

19 You can also force things to be within a single thread block:

1. !$acc loop worker : across warps within a single thread block
2. !$acc loop vector : across threads within a warp

1. !$acc loop worker vector : across threads within a single thread block

1. !$acc loop vector : across threads within a single thread block
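
For example (a sketch with invented arrays and loop bounds), a nested loop pair might map the outer loop across thread blocks and the inner loop across the threads within each block:

!$acc parallel
!$acc loop gang                  ! outer loop partitioned across thread blocks
      do j = 1, m
!$acc loop vector                ! inner loop partitioned across threads within a block
         do i = 1, n
            c(i,j) = a(i,j) + b(i,j)
         end do
      end do
!$acc end parallel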

20 All parallel regions should contain a loop directive: always start with parallel loop (not just parallel)
   Fortran assumed-size arrays (A(*)) and C pointers must be shaped
   Always use : when shaping with an entire dimension (i.e. A(:,1:2))
   First get your application working without data regions, then add data regions
   First get your application working without async, then add async
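
As a small sketch of the shaping rule (the subroutine, array and bound n are invented for illustration), an assumed-size dummy argument has no extent visible to the compiler, so the data clause has to supply one:

      subroutine scale_array(a, n)
      integer n, i
      real*8  a(*)                       ! assumed-size: extent unknown to the compiler
!$acc parallel loop copy(a(1:n))         ! so the clause shapes the array explicitly
      do i = 1, n
         a(i) = 2.0d0*a(i)
      end do
!$acc end parallel loop
      end subroutine scale_array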

21 Reduction Libsci Pattern match Optimization Perftools Data regions Async clauses

22 ftn -rm -o cpu cpu_reduce_int.f90
   aprun ./cpu
   Integer reduction of 2^18 elements run on the CPU
   Kernel 1) flops GFlops int*4 sum_of_int_4
   Application resources: utime ~0s, stime ~0s

23 nvcc -arch=sm_20 -c reduce0.cu reduce6.cu
   ftn -rm -o gpu_cuda gpu_reduce_int_cuda.f90 reduce0.o reduce6.o
   aprun ./gpu_cuda
   ACC: Transfer 3 items (to acc bytes, to host 0 bytes) from gpu_reduce_int_cuda.f90:107
   ACC: Execute kernel wake_up_gpu_$ck_l107_1 from gpu_reduce_int_cuda.f90:107
   ACC: Synchronize from gpu_reduce_int_cuda.f90:113
   ACC: Transfer 3 items (to acc 0 bytes, to host bytes) from gpu_reduce_int_cuda.f90:
   Integer reduction of 2^18 integers using SDK CUDA kernel reduce0
   Based on micro-seconds measured by the CUDA profiler performance is: Gflops
   Correct result of
   Integer reduction of 2^18 integers using SDK CUDA kernel reduce6
   Based on micro-seconds measured by the CUDA profiler performance is: Gflops
   Correct result of

24 ftn -rm -hmsgs -o gpu_cce gpu_reduce_int_cce.f90
   aprun -L$allgpunodes ./gpu_cce
   ACC: Transfer 3 items (to acc bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:80
   ACC: Execute kernel wake_up_gpu_$ck_l80_1 from gpu_reduce_int_cce.f90:80
   ACC: Synchronize from gpu_reduce_int_cce.f90:86
   ACC: Transfer 3 items (to acc 0 bytes, to host bytes) from gpu_reduce_int_cce.f90:86
   ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:43
   ACC: Transfer 2 items (to acc 0 bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:95
   ACC: Transfer 2 items (to acc 4 bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:96
   ACC: Execute kernel sum_of_int_4_$ck_l96_3 from gpu_reduce_int_cce.f90:96
   ACC: Transfer 2 items (to acc 0 bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:106
   ACC: Transfer 2 items (to acc 0 bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:107
   ACC: Synchronize from gpu_reduce_int_cce.f90:48
   ACC: Transfer 2 items (to acc 0 bytes, to host 4 bytes) from gpu_reduce_int_cce.f90:
   Integer reduction of 2^18 integers using CCE OpenACC directives
   Based on 31.5 micro-seconds measured by the CUDA profiler performance is: Gflops
   Correct result of

25
!$acc data copyin(a,b) copy(c,c3)
!$acc host_data use_device(a,b,c,c3)
   t1 = MPI_Wtime()
   call dgemm('n','n',m,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
   t2 = MPI_Wtime()
   t_acc = t2-t1
   t1 = MPI_Wtime()
   call dgemm_acc('n','n',m,n,k,alpha,a,lda,b,ldb,beta,c3,ldc)
   t2 = MPI_Wtime()
!$acc end host_data
!$acc end data

26 ftn -hmsgs -o f_dgemm_openacc f_dgemm_openacc.f90

!$acc kernels copy(c3)
ftn-6413 crayftn: ACCEL EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 15
  A data region was created at line 15 and ending at line 17.
ftn-6415 crayftn: ACCEL EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 15
  Allocate memory and copy whole array "c3" to accelerator, copy back at line 17 (acc_copy).

c3(1,1) = 1
ftn-6405 crayftn: ACCEL EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 16
  A region starting at line 16 and ending at line 16 was placed on the accelerator.

c2(:,:) = c(:,:)
ftn-6066 crayftn: SCALAR EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 40
  A loop nest at line 40 collapsed to a single loop.
ftn-6005 crayftn: SCALAR EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 40
  A loop starting at line 40 was unrolled 4 times.
ftn-6204 crayftn: VECTOR EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 40
  A loop starting at line 40 was vectorized.

27 aprun -L$allgpunodes ./f_dgemm_openacc
   Matrix Size 1024   Error 0.   Error between GPU calls 0.
   DGEMM_ACC_SIMPLE: GFlops in E-3 sec
   DGEMM_ACC:        GFlops in E-3 sec
   DGEMM_CPU:        GFlops in E-2 sec
   Matrix Size 4096   Error E-12   Error between GPU calls E-12
   DGEMM_ACC_SIMPLE: GFlops in sec
   DGEMM_ACC:        GFlops in sec
   DGEMM_CPU:        GFlops in sec
   Matrix Size 8192   Error E-11   Error between GPU calls E-11
   DGEMM_ACC_SIMPLE: GFlops in sec
   DGEMM_ACC:        GFlops in sec
   DGEMM_CPU:        GFlops in sec
   Application resources: utime ~36s, stime ~4s

28
      do j = 1,L
         do i = 1,N
            do k = 1,M
               C(i,j) = C(i,j) + A(i,k)*B(k,j)
            enddo
         enddo
      enddo
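
For comparison (an illustrative sketch only, not part of the original example, and assuming A, B and C are already present on the device via an enclosing data region), the same nest could be offloaded explicitly with OpenACC instead of relying on the pattern matching shown on the next slide:

!$acc parallel loop present(A,B,C)
      do j = 1,L
!$acc loop vector
         do i = 1,N
            do k = 1,M                          ! the k loop stays sequential within each thread
               C(i,j) = C(i,j) + A(i,k)*B(k,j)
            enddo
         enddo
      enddo
!$acc end parallel loop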

29 ftn -hmsgs -o matmul matmul_2.f
   ftn-6202 crayftn: VECTOR MMUL, File = matmul_2.f, Line = 31
     A loop starting at line 31 was replaced by a library call.

   explain ftn-6202
   VECTOR: A loop starting at line %s was replaced by a library call.
   A loop or nest of loops was found to match a library routine. The loop code was replaced by a call to the corresponding routine. Only one pattern was matched and it covered the entire loop.

   grep dgemm matmul_2.f
   nm matmul | grep dgemm
            U _dgemm_

30 aprun -L60 ./matmul
   arrays sized 4092 by 1024 by 1024
   calling mmul
   Gigaflops including data xfer:
   C(1,1) =   C(2,2) =
   calling mmul_nopattern
   No pattern Gigaflops:
   C(1,1) =
   No errors found
   Application resources: utime ~9s, stime ~1s

31
#ifdef USE_DATA
!$acc data create(a,b)
#endif
      t1 = gettime()
      stream_counter = 1
      DO j = 1,Nchunks
#ifdef FULL_ASYNC
         my_stream = j
#else
         my_stream = Streams(stream_counter)
#endif
#ifdef ASYNC
#ifndef FULL_ASYNC
!$acc wait(my_stream)
#endif
#endif
#ifdef USE_DATA
#ifdef ASYNC
!$acc update device(a(:,j)) async(my_stream)
#else
!$acc update device(a(:,j))
#endif
#endif
#ifdef ASYNC
!$acc parallel loop async(my_stream)
#else
!$acc parallel loop
#endif
         DO i = 1,Nvec
            b(i,j) = SQRT(EXP(a(i,j)*2d0))
            b(i,j) = LOG(b(i,j)**2d0)/2d0
         ENDDO
!$acc end parallel loop
#ifdef USE_DATA
#ifdef ASYNC
!$acc update host(b(:,j)) async(my_stream)
#else
!$acc update host(b(:,j))
#endif
#endif
         stream_counter = MOD(stream_counter,3) + 1
      ENDDO
!$acc wait
      t2 = gettime()
#ifdef USE_DATA
!$acc end data
#endif

32 ftn -rad -hnocaf -c -o toaa2.o toaa2.f90
   ftn -rad -hnocaf -o toaa2.x toaa2.o
   pat_build -w toaa2.x
   aprun toaa2.x+pat
   Time =
   Experiment data file written:
   /lus/scratch/beyerj/openacc/toaa/toaa2.x+pat t.xf
   Application resources: utime ~83s, stime ~7s
   pat_report -T toaa2.x+pat t.xf

33 Table 1: Profile by Function Group and Function

   Time%   Time   Imb. Time   Imb. Time%   Calls   Group / Function

   100.0%   Total
        %   USER
        %     toaa_
     0.0%     exit
     0.0%   ETC

34 Table 2: Time and Bytes Transferred for Accelerator Regions

   Host Time%   Host Time   Acc Time   Acc Copy In (MBytes)   Acc Copy Out (MBytes)   Calls   Calltree

   100.0%   Total
        %   toaa_
        %     toaa_.acc_copy@li
        %     toaa_.acc_copy@li
        %     toaa_.acc_kernel@li
        %     toaa_.acc_sync_wait@li
        %     toaa_.acc_region@li.59(exclusive)
     0.0%     toaa_.acc_sync_wait@li.79

   Processing step 3 of 3

35 ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from toaa2.f90:55
   ACC: Execute kernel toaa_$ck_l55_1 async(auto) from toaa2.f90:55
   ACC: Wait async(auto) from toaa2.f90:61
   ACC: Transfer 2 items (to acc 0 bytes, to host bytes) from toaa2.f90:61
   ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from toaa2.f90:55
   ACC: Execute kernel toaa_$ck_l55_1 async(auto) from toaa2.f90:55
   ACC: Wait async(auto) from toaa2.f90:61
   ACC: Transfer 2 items (to acc 0 bytes, to host bytes) from toaa2.f90:61
   ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from toaa2.f90:55
   ACC: Execute kernel toaa_$ck_l55_1 async(auto) from toaa2.f90:55
   ACC: Wait async(auto) from toaa2.f90:61
   ACC: Transfer 2 items (to acc 0 bytes, to host bytes) from toaa2.f90:61
   ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from toaa2.f90:55
   ACC: Execute kernel toaa_$ck_l55_1 async(auto) from toaa2.f90:55
   ACC: Wait async(auto) from toaa2.f90:61
   ACC: Transfer 2 items (to acc 0 bytes, to host bytes) from toaa2.f90:61
   ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from toaa2.f90:55
   ACC: Execute kernel toaa_$ck_l55_1 async(auto) from toaa2.f90:55
   ACC: Wait async(auto) from toaa2.f90:61

36 ftn -rad -hnocaf -DUSE_DATA -c -o toaa2.o toaa2.f90
   ftn -rad -hnocaf -DUSE_DATA -o toaa2.x toaa2.o
   pat_build -w toaa2.x
   aprun toaa2.x+pat
   Time =
   Experiment data file written:
   /lus/scratch/beyerj/openacc/toaa/toaa2.x+pat t.xf
   Application resources: utime ~4s, stime ~2s
   pat_report -T toaa2.x+pat t.xf

   Faster with 10x the work

37 Table 2: Time and Bytes Transferred for Accelerator Regions

   Host Time%   Host Time   Acc Time   Acc Copy In (MBytes)   Acc Copy Out (MBytes)   Calls   Calltree

   100.0%   Total
        %   toaa_
        %     toaa_.acc_update@li
        %     toaa_.acc_copy@li
        %     toaa_.acc_sync_wait@li
        %     toaa_.acc_update@li.71(exclusive)
        %     toaa_.acc_update@li
        %     toaa_.acc_copy@li
        %     toaa_.acc_update@li.52(exclusive)

   [[[...]]]

   Processing step 3 of 3

38
!$acc data create(a,b)
      t1 = gettime()
      stream_counter = 1
      DO j = 1,Nchunks
#ifdef FULL_ASYNC
         my_stream = j
#else
         my_stream = Streams(stream_counter)
#endif
#ifdef ASYNC
#ifndef FULL_ASYNC
!$acc wait(my_stream)
#endif
#endif
#ifdef ASYNC
!$acc update device(a(:,j)) async(my_stream)
#else
!$acc update device(a(:,j))
#endif
#ifdef ASYNC
!$acc parallel loop async(my_stream)
#else
!$acc parallel loop
#endif
         DO i = 1,Nvec
            b(i,j) = SQRT(EXP(a(i,j)*2d0))
            b(i,j) = LOG(b(i,j)**2d0)/2d0
         ENDDO
!$acc end parallel loop
#ifdef ASYNC
!$acc update host(b(:,j)) async(my_stream)
#else
!$acc update host(b(:,j))
#endif
         stream_counter = MOD(stream_counter,3) + 1
      ENDDO
!$acc wait
      t2 = gettime()
!$acc end data

39 make clean
   make
   aprun toaa2.x
   Time =
   Application resources: utime ~6s, stime ~1s

40 make clean
   make ACC=yes
   aprun toaa2.x
   Time =
   Application resources: utime ~3s, stime ~2s

41 make clean
   make ACC=yes ASYNC=yes
   aprun toaa2.x
   Time =
   Application resources: utime ~2s, stime ~2s

42 make clean
   make ACC=yes ASYNC=yes FULL_ASYNC=yes
   aprun toaa2.x
   Time =
   Application resources: utime ~2s, stime ~2s

43 async(handle): like CUDA streams
   Allows overlap of tasks on the GPU: PCIe transfers in both directions, plus multiple kernels (up to 16 with Fermi)
   Streams are identified by handle; tasks with the same handle execute sequentially
   Can wait on one, more or all tasks; the OpenACC API also allows a completeness check
   First attempt, a simple pipeline that processes the array slice by slice:
      copy data to GPU, process, bring back to CPU
      very complicated kernel operation here!
      should be able to overlap 3 streams at once
      use the slice number as the stream handle in this case; the runtime MODs it back into the allowable range
   Can actually overlap more than three streams; no benefit on this test

   INTEGER, PARAMETER :: Nvec = 10000, Nchunks =
   REAL(kind=dp) :: a(Nvec,Nchunks), b(Nvec,Nchunks)
!$acc data create(a,b)
   DO j = 1,Nchunks
!$acc update device(a(:,j)) async(j)
!$acc parallel loop async(j)
      DO i = 1,Nvec
         b(i,j) = SQRT(EXP(a(i,j)*2d0))
         b(i,j) = LOG(b(i,j)**2d0)/2d0
      ENDDO
!$acc update host(b(:,j)) async(j)
   ENDDO
!$acc wait
!$acc end data

44 Execution times (on Cray XK6):
      CPU:                 3.98s
      OpenACC, blocking:   3.6s
      OpenACC, async:      0.82s
      OpenACC, full async: 0.76s

   INTEGER, PARAMETER :: Nvec = 10000, Nchunks =
   REAL(kind=dp) :: a(Nvec,Nchunks), b(Nvec,Nchunks)
!$acc data create(a,b)
   DO j = 1,Nchunks
!$acc update device(a(:,j)) async(j)
!$acc parallel loop async(j)
      DO i = 1,Nvec
         b(i,j) = SQRT(EXP(a(i,j)*2d0))
         b(i,j) = LOG(b(i,j)**2d0)/2d0
      ENDDO
!$acc update host(b(:,j)) async(j)
   ENDDO
!$acc wait
!$acc end data

   NVIDIA Visual Profiler: time flows to the right, streams are stacked vertically
      red: data transfer to GPU
      pink: computational kernel on GPU
      blue: data transfer from GPU
      a vertical slice shows what is overlapping
      only 7 of 16 streams fit in the window; collapsed view at the bottom
      the async handle is modded by the number of streams, so you see multiple coloured bars per stream
