Motivation OpenACC and OpenMP for Accelerators Cray Compilation Environment (CCE) Examples


1 Dr. James C. Beyer

2 Motivation
   OpenACC and OpenMP for Accelerators
   Cray Compilation Environment (CCE)
   Examples

3 Sum elements of an array. Original Fortran code:

   a = 0.0
   do i = 1,n
      a = a + b(i)
   end do

4
__global__ void reduce0(int *g_idata, int *g_odata)
{
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    for (unsigned int s=1; s < blockDim.x; s *= 2) {
        if ((tid % (2*s)) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

extern "C" void reduce0_cuda_(int *n, int *a, int *b)
{
    int *b_d, red;
    const int b_size = *n;
    cudaMalloc((void **) &b_d, sizeof(int)*b_size);
    cudaMemcpy(b_d, b, sizeof(int)*b_size, cudaMemcpyHostToDevice);
    dim3 dimBlock(128, 1, 1);
    dim3 dimGrid(2048, 1, 1);
    dim3 small_dimGrid(16, 1, 1);
    int smemSize = 128 * sizeof(int);
    int *buffer_d, *red_d;
    int *small_buffer_d;
    cudaMalloc((void **) &buffer_d, sizeof(int)*2048);
    cudaMalloc((void **) &small_buffer_d, sizeof(int)*16);
    cudaMalloc((void **) &red_d, sizeof(int));
    reduce0<<< dimGrid, dimBlock, smemSize >>>(b_d, buffer_d);
    reduce0<<< small_dimGrid, dimBlock, smemSize >>>(buffer_d, small_buffer_d);
    reduce0<<< 1, 16, smemSize >>>(small_buffer_d, red_d);
    cudaMemcpy(&red, red_d, sizeof(int), cudaMemcpyDeviceToHost);
    *a = red;
    cudaFree(buffer_d);
    cudaFree(small_buffer_d);
    cudaFree(b_d);
}

5
template<class T> struct SharedMemory
{
    __device__ inline operator T*()
    {
        extern __shared__ int smem[];
        return (T*) smem;
    }
    __device__ inline operator const T*() const
    {
        extern __shared__ int smem[];
        return (T*) smem;
    }
};

template <class T, unsigned int blockSize, bool nIsPow2>
__global__ void reduce6(T *g_idata, T *g_odata, unsigned int n)
{
    T *sdata = SharedMemory<T>();
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockSize*2 + threadIdx.x;
    unsigned int gridSize = blockSize*2*gridDim.x;
    T mySum = 0;
    while (i < n) {
        mySum += g_idata[i];
        if (nIsPow2 || i + blockSize < n) mySum += g_idata[i+blockSize];
        i += gridSize;
    }
    sdata[tid] = mySum;
    __syncthreads();
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] = mySum = mySum + sdata[tid +  64]; } __syncthreads(); }
    if (tid < 32) {
        volatile T* smem = sdata;
        if (blockSize >= 64) { smem[tid] = mySum = mySum + smem[tid + 32]; }
        if (blockSize >= 32) { smem[tid] = mySum = mySum + smem[tid + 16]; }
        if (blockSize >= 16) { smem[tid] = mySum = mySum + smem[tid +  8]; }
        if (blockSize >=  8) { smem[tid] = mySum = mySum + smem[tid +  4]; }
        if (blockSize >=  4) { smem[tid] = mySum = mySum + smem[tid +  2]; }
        if (blockSize >=  2) { smem[tid] = mySum = mySum + smem[tid +  1]; }
    }
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

extern "C" void reduce6_cuda_(int *n, int *a, int *b)
{
    int *b_d;
    const int b_size = *n;
    cudaMalloc((void **) &b_d, sizeof(int)*b_size);
    cudaMemcpy(b_d, b, sizeof(int)*b_size, cudaMemcpyHostToDevice);
    dim3 dimBlock(128, 1, 1);
    dim3 dimGrid(128, 1, 1);
    dim3 small_dimGrid(1, 1, 1);
    int smemSize = 128 * sizeof(int);
    int *buffer_d;
    int small_buffer[4], *small_buffer_d;
    cudaMalloc((void **) &buffer_d, sizeof(int)*128);
    cudaMalloc((void **) &small_buffer_d, sizeof(int));
    reduce6<int,128,false><<< dimGrid, dimBlock, smemSize >>>(b_d, buffer_d, b_size);
    reduce6<int,128,false><<< small_dimGrid, dimBlock, smemSize >>>(buffer_d, small_buffer_d, 128);
    cudaMemcpy(small_buffer, small_buffer_d, sizeof(int), cudaMemcpyDeviceToHost);
    *a = *small_buffer;
    cudaFree(buffer_d);
    cudaFree(small_buffer_d);
    cudaFree(b_d);
}

6 The compiler does the work:
   Identifies parallel loops within the region
   Determines the kernels needed
   Splits the code into accelerator and host portions
   Workshares loops running on the accelerator
   Makes use of MIMD and SIMD style parallelism
   Data movement: allocates/frees GPU memory at the start/end of the region, moves data to/from the GPU

!$acc data present(a,b)
!$acc parallel
   a = 0.0
!$acc loop reduction(+:a)
   do i = 1,n
      a = a + b(i)
   end do
!$acc end parallel
!$acc end data

7
  90.            subroutine sum_of_int_4(n,a,b)
  91.              use global_data
  92.              integer*4 a,b(n)
  93.              integer*8 start_clock, elapsed_clocks, end_clock
  94.            !$acc data present(a,b)
  95.  G----<    !$acc parallel
  96.  G           a = 0
  97.  G         !$acc loop reduction(+:a)
  98.  G g--<      do i = 1,n
  99.  G g           a = a + b(i)
 100.  G g-->      end do
 101.  G---->    !$acc end parallel
 102.            !$acc end data
 103.            end subroutine sum_of_int_4

ftn-6413 ftn: ACCEL File = gpu_reduce_int_cce.f90, Line = 94
  A data region was created at line 94 and ending at line 107.
ftn-6405 ftn: ACCEL File = gpu_reduce_int_cce.f90, Line = 95
  A region starting at line 95 and ending at line 101 was placed on the accelerator.
ftn-6430 ftn: ACCEL File = gpu_reduce_int_cce.f90, Line = 98
  A loop starting at line 98 was partitioned across the threadblocks and the 128 threads within a threadblock.

8 Summary of code complexity and performance

Programming Model | Unit of computation | Lines of code | Performance in Gflops (higher is better) | Performance normalized to x86 core
Fortran           | Single x86 core     |               | Gflops                                   | 1.0
Simple CUDA       | GPU                 |               | Gflops                                   | 0.87
Optimized CUDA    | GPU                 |               | Gflops                                   | 5.25
OpenACC           | GPU                 |               | Gflops                                   |

9 A common directive programming model for today's GPUs
   Announced at the SC11 conference
   Offers portability between compilers
   Drawn up by: NVIDIA, Cray, PGI, CAPS
   Multiple compilers offer portability, debugging, permanence
   Works for Fortran, C, C++
   Standard available at
   Initially implementations targeted at NVIDIA GPUs
   Current version: 1.0 (November 2011)
   Compiler support:
      Cray CCE: partial 2011, complete in 2012
      PGI Accelerator: released product in 2012
      CAPS: released product in

10 A common directive programming model for shared memory systems
   Announced 15 years ago
   Works with Fortran, C, C++
   Current version: 3.1 (July 2011)
   Compiler support

11 Host-directed execution with an attached GPU
   Main program executes on the host (i.e. CPU)
   Compute-intensive regions are offloaded to the accelerator device under control of the host
   The device (i.e. GPU) executes parallel regions, which typically contain kernels (i.e. work-sharing loops), or kernels regions containing one or more loops that are executed as kernels
   The host must orchestrate the execution by:
      allocating memory on the accelerator device,
      initiating data transfer,
      sending the code to the accelerator,
      passing arguments to the parallel region,
      queuing the device code,
      waiting for completion,
      transferring results back to the host, and
      deallocating memory
   The host can usually queue a sequence of operations to be executed on the device, one after the other
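
To make this concrete, here is a minimal Fortran/OpenACC sketch (the arrays x and y, the bound n and the scaling operation are invented for illustration); the directives ask the compiler to generate the host-side orchestration steps listed above:

!$acc data copyin(x(1:n)) copyout(y(1:n))   ! host allocates device memory and copies x to the device
!$acc parallel loop                         ! host sends the kernel to the device and queues it
      do i = 1, n
         y(i) = 2.0d0*x(i)                  ! the loop body executes on the accelerator
      end do
!$acc end parallel loop                     ! host waits for completion
!$acc end data                              ! host copies y back and deallocates device memory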

12 Memory spaces on the host and device are distinct
   Different locations, different address spaces
   Data movement is performed by the host using runtime library calls that explicitly move data between the separate memories
   GPUs have a weak memory model: no synchronisation between different execution units (SMs) unless there is an explicit memory barrier
   You can write OpenACC kernels with race conditions, giving inconsistent execution results; the compiler will catch most errors, but not all (there are no user-managed barriers)
   With OpenACC, data movement between the memories is implicit, managed by the compiler based on directives from the programmer
   Device memory caches are managed by the compiler, with hints from the programmer in the form of directives
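
As an illustrative sketch (the arrays a, c and tmp and the bound n are invented, not from the slides), data clauses are the hints that steer this implicit movement: a is only copied to the device, c is only copied back, and tmp never touches host memory:

!$acc data copyin(a(1:n)) copyout(c(1:n)) create(tmp(1:n))
!$acc parallel loop
      do i = 1, n
         tmp(i) = a(i)*a(i)       ! tmp lives only in device memory
         c(i)   = tmp(i) + 1.0d0
      end do
!$acc end parallel loop
!$acc end data                    ! only c is transferred back to the host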

13 A high-level approach to programming accelerators
   Uses the original source code (Fortran, C, C++): no separate source base is needed, so it is much more portable, and subprogram structure can be preserved
   A familiar programming model for anyone who has used traditional OpenMP
   A small performance gap is acceptable: the target is 10-15%, and we are currently seeing better than that for many cases
   Draft proposal currently being discussed by an OpenMP ARB subcommittee, aiming for OpenMP 4.0 (sometime in 2012)
   Cray is an enthusiastic supporter: it co-chairs the committee (with TI), and CCE already provides a reference implementation of the draft standard
   The subcommittee includes most major vendors: Cray, PGI, Nvidia, AMD, TI, IBM, CAPS, Oracle, Intel...
   An open standard is the most attractive option for developers: portability, multiple compilers for debugging

14 OpenACC: Parallel, Kernels, Data, Loop, Host data, Cache, Update, Wait, Declare
   OpenMP for Accelerators: Accelerate, Dispatch, Data, Do/for, Part of data, Nested data, Update, Wait, Mirror/resident

15 man intro_openacc
   Which module to use: craype-accel-nvidia20
   Forces dynamic linking
   Single object file
   Whole program
   Messages/list file
   Compiles to PTX, not CUDA
   Debugger sees the original program, not the CUDA intermediate
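
A typical build-and-run sequence would therefore look something like the sketch below (the source and executable names are placeholders; the module name and the -hmsgs flag are the ones used elsewhere in these slides):

   module load craype-accel-nvidia20
   ftn -hmsgs -o gpu_code gpu_code.f90
   aprun ./gpu_code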

16 Only two constructs are unimplemented: cache and declare
   One data clause is unimplemented: deviceptr

17
/* takes a host pointer */
void* cray_acc_create( void*, size_t );
void  cray_acc_delete( void* );
void* cray_acc_copyin( void*, size_t );
void  cray_acc_copyout( void*, size_t );
void  cray_acc_updatein( void*, size_t );
void  cray_acc_updateout( void*, size_t );
int   cray_acc_is_present( void* );
int   cray_acc_is_present_2( void*, size_t );
void *cray_acc_deviceptr( void* );

/* takes a device and host pointer */
void cray_acc_memcpy_device_host( void*, void*, size_t );

/* takes a host and device pointer */
void cray_acc_memcpy_host_device( void*, void*, size_t );

18
1. !$acc loop gang   : across thread blocks
2. !$acc loop worker : across warps within a thread block
3. !$acc loop vector : across threads within a warp

1. !$acc loop gang          : across thread blocks
2. !$acc loop worker vector : across threads within a thread block

1. !$acc loop gang   : across thread blocks
2. !$acc loop vector : across threads within a thread block

1. !$acc loop gang worker : across thread blocks and the warps within a thread block
2. !$acc loop vector      : across threads within a warp

1. !$acc loop gang vector : across thread blocks and threads within a thread block

1. !$acc loop gang worker vector : across thread blocks and threads within a thread block

19 You can also force things to be within a single thread block:

1. !$acc loop worker : across warps within a single thread block
2. !$acc loop vector : across threads within a warp

1. !$acc loop worker vector : across threads within a single thread block

1. !$acc loop vector : across threads within a single thread block
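
For example (a sketch with invented arrays and loop bounds), a nested loop pair might map the outer loop across thread blocks and the inner loop across the threads within each block:

!$acc parallel
!$acc loop gang                  ! outer loop partitioned across thread blocks
      do j = 1, m
!$acc loop vector                ! inner loop partitioned across threads within a block
         do i = 1, n
            c(i,j) = a(i,j) + b(i,j)
         end do
      end do
!$acc end parallel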

20 All parallel regions should contain a loop directive: always start with parallel loop (not just parallel)
   Fortran assumed-size arrays (A(*)) and C pointers must be shaped
   Always use : when shaping with an entire dimension (i.e. A(:,1:2))
   First get your application working without data regions, then add data regions
   First get your application working without async, then add async
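
As a small sketch of the shaping rule (the subroutine, array and bound n are invented for illustration), an assumed-size dummy argument has no extent visible to the compiler, so the data clause has to supply one:

      subroutine scale_array(a, n)
      integer n, i
      real*8  a(*)                       ! assumed-size: extent unknown to the compiler
!$acc parallel loop copy(a(1:n))         ! so the clause shapes the array explicitly
      do i = 1, n
         a(i) = 2.0d0*a(i)
      end do
!$acc end parallel loop
      end subroutine scale_array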

21 Reduction Libsci Pattern match Optimization Perftools Data regions Async clauses

22 ftn -rm -o cpu cpu_reduce_int.f90
   aprun ./cpu
   Integer reduction of 2^18 elements run on the CPU
   Kernel 1) flops GFlops int*4 sum_of_int_4
   Application resources: utime ~0s, stime ~0s

23 nvcc -arch=sm_20 -c reduce0.cu reduce6.cu
   ftn -rm -o gpu_cuda gpu_reduce_int_cuda.f90 reduce0.o reduce6.o
   aprun ./gpu_cuda
   ACC: Transfer 3 items (to acc bytes, to host 0 bytes) from gpu_reduce_int_cuda.f90:107
   ACC: Execute kernel wake_up_gpu_$ck_l107_1 from gpu_reduce_int_cuda.f90:107
   ACC: Synchronize from gpu_reduce_int_cuda.f90:113
   ACC: Transfer 3 items (to acc 0 bytes, to host bytes) from gpu_reduce_int_cuda.f90:
   Integer reduction of 2^18 integers using SDK CUDA kernel reduce0
   Based on micro-seconds measured by the CUDA profiler performance is: Gflops
   Correct result of
   Integer reduction of 2^18 integers using SDK CUDA kernel reduce6
   Based on micro-seconds measured by the CUDA profiler performance is: Gflops
   Correct result of

24 ftn -rm -hmsgs -o gpu_cce gpu_reduce_int_cce.f90
   aprun -L$allgpunodes ./gpu_cce
   ACC: Transfer 3 items (to acc bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:80
   ACC: Execute kernel wake_up_gpu_$ck_l80_1 from gpu_reduce_int_cce.f90:80
   ACC: Synchronize from gpu_reduce_int_cce.f90:86
   ACC: Transfer 3 items (to acc 0 bytes, to host bytes) from gpu_reduce_int_cce.f90:86
   ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:43
   ACC: Transfer 2 items (to acc 0 bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:95
   ACC: Transfer 2 items (to acc 4 bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:96
   ACC: Execute kernel sum_of_int_4_$ck_l96_3 from gpu_reduce_int_cce.f90:96
   ACC: Transfer 2 items (to acc 0 bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:106
   ACC: Transfer 2 items (to acc 0 bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:107
   ACC: Synchronize from gpu_reduce_int_cce.f90:48
   ACC: Transfer 2 items (to acc 0 bytes, to host 4 bytes) from gpu_reduce_int_cce.f90:
   Integer reduction of 2^18 integers using CCE OpenACC directives
   Based on 31.5 micro-seconds measured by the CUDA profiler performance is: Gflops
   Correct result of

25
!$acc data copyin(a,b) copy(c,c3)
!$acc host_data use_device(a,b,c,c3)
   t1 = MPI_Wtime()
   call dgemm('n','n',m,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
   t2 = MPI_Wtime()
   t_acc = t2-t1
   t1 = MPI_Wtime()
   call dgemm_acc('n','n',m,n,k,alpha,a,lda,b,ldb,beta,c3,ldc)
   t2 = MPI_Wtime()
!$acc end host_data
!$acc end data

26 ftn -hmsgs -o f_dgemm_openacc f_dgemm_openacc.f90

!$acc kernels copy(c3)
ftn-6413 crayftn: ACCEL EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 15
  A data region was created at line 15 and ending at line 17.
ftn-6415 crayftn: ACCEL EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 15
  Allocate memory and copy whole array "c3" to accelerator, copy back at line 17 (acc_copy).

c3(1,1) = 1
ftn-6405 crayftn: ACCEL EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 16
  A region starting at line 16 and ending at line 16 was placed on the accelerator.

c2(:,:) = c(:,:)
ftn-6066 crayftn: SCALAR EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 40
  A loop nest at line 40 collapsed to a single loop.
ftn-6005 crayftn: SCALAR EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 40
  A loop starting at line 40 was unrolled 4 times.
ftn-6204 crayftn: VECTOR EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 40
  A loop starting at line 40 was vectorized.

27 aprun -L$allgpunodes ./f_dgemm_openacc
   Matrix Size 1024   Error 0.   Error between GPU calls 0.
   DGEMM_ACC_SIMPLE: GFlops in E-3 sec
   DGEMM_ACC:        GFlops in E-3 sec
   DGEMM_CPU:        GFlops in E-2 sec
   Matrix Size 4096   Error E-12   Error between GPU calls E-12
   DGEMM_ACC_SIMPLE: GFlops in sec
   DGEMM_ACC:        GFlops in sec
   DGEMM_CPU:        GFlops in sec
   Matrix Size 8192   Error E-11   Error between GPU calls E-11
   DGEMM_ACC_SIMPLE: GFlops in sec
   DGEMM_ACC:        GFlops in sec
   DGEMM_CPU:        GFlops in sec
   Application resources: utime ~36s, stime ~4s

28
      do j = 1,L
         do i = 1,N
            do k = 1,M
               C(i,j) = C(i,j) + A(i,k)*B(k,j)
            enddo
         enddo
      enddo
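
For comparison (an illustrative sketch only, not part of the original example, and assuming A, B and C are already present on the device via an enclosing data region), the same nest could be offloaded explicitly with OpenACC instead of relying on the pattern matching shown on the next slide:

!$acc parallel loop present(A,B,C)
      do j = 1,L
!$acc loop vector
         do i = 1,N
            do k = 1,M                          ! the k loop stays sequential within each thread
               C(i,j) = C(i,j) + A(i,k)*B(k,j)
            enddo
         enddo
      enddo
!$acc end parallel loop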

29 ftn -hmsgs -o matmul matmul_2.f
   ftn-6202 crayftn: VECTOR MMUL, File = matmul_2.f, Line = 31
     A loop starting at line 31 was replaced by a library call.

   explain ftn-6202
   VECTOR: A loop starting at line %s was replaced by a library call.
   A loop or nest of loops was found to match a library routine. The loop code was replaced by a call to the corresponding routine. Only one pattern was matched and it covered the entire loop.

   grep dgemm matmul_2.f
   nm matmul | grep dgemm
            U _dgemm_

30 aprun -L60 ./matmul
   arrays sized 4092 by 1024 by 1024
   calling mmul
   Gigaflops including data xfer:
   C(1,1) =   C(2,2) =
   calling mmul_nopattern
   No pattern Gigaflops:
   C(1,1) =
   No errors found
   Application resources: utime ~9s, stime ~1s

31
#ifdef USE_DATA
!$acc data create(a,b)
#endif
      t1 = gettime()
      stream_counter = 1
      DO j = 1,Nchunks
#ifdef FULL_ASYNC
         my_stream = j
#else
         my_stream = Streams(stream_counter)
#endif
#ifdef ASYNC
#ifndef FULL_ASYNC
!$acc wait(my_stream)
#endif
#endif
#ifdef USE_DATA
#ifdef ASYNC
!$acc update device(a(:,j)) async(my_stream)
#else
!$acc update device(a(:,j))
#endif
#endif
#ifdef ASYNC
!$acc parallel loop async(my_stream)
#else
!$acc parallel loop
#endif
         DO i = 1,Nvec
            b(i,j) = SQRT(EXP(a(i,j)*2d0))
            b(i,j) = LOG(b(i,j)**2d0)/2d0
         ENDDO
!$acc end parallel loop
#ifdef USE_DATA
#ifdef ASYNC
!$acc update host(b(:,j)) async(my_stream)
#else
!$acc update host(b(:,j))
#endif
#endif
         stream_counter = MOD(stream_counter,3) + 1
      ENDDO
!$acc wait
      t2 = gettime()
#ifdef USE_DATA
!$acc end data
#endif

32 ftn -rad -hnocaf -c -o toaa2.o toaa2.f90
   ftn -rad -hnocaf -o toaa2.x toaa2.o
   pat_build -w toaa2.x
   aprun toaa2.x+pat
   Time =
   Experiment data file written:
   /lus/scratch/beyerj/openacc/toaa/toaa2.x+pat t.xf
   Application resources: utime ~83s, stime ~7s
   pat_report -T toaa2.x+pat t.xf

33 Table 1: Profile by Function Group and Function

   Time%   Time   Imb. Time   Imb. Time%   Calls   Group / Function

   100.0%   Total
        %   USER
        %     toaa_
     0.0%     exit
     0.0%   ETC

34 Table 2: Time and Bytes Transferred for Accelerator Regions

   Host Time%   Host Time   Acc Time   Acc Copy In (MBytes)   Acc Copy Out (MBytes)   Calls   Calltree

   100.0%   Total
        %   toaa_
        %     toaa_.acc_copy@li
        %     toaa_.acc_copy@li
        %     toaa_.acc_kernel@li
        %     toaa_.acc_sync_wait@li
        %     toaa_.acc_region@li.59(exclusive)
     0.0%     toaa_.acc_sync_wait@li.79

   Processing step 3 of 3

35 ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from toaa2.f90:55
   ACC: Execute kernel toaa_$ck_l55_1 async(auto) from toaa2.f90:55
   ACC: Wait async(auto) from toaa2.f90:61
   ACC: Transfer 2 items (to acc 0 bytes, to host bytes) from toaa2.f90:61
   ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from toaa2.f90:55
   ACC: Execute kernel toaa_$ck_l55_1 async(auto) from toaa2.f90:55
   ACC: Wait async(auto) from toaa2.f90:61
   ACC: Transfer 2 items (to acc 0 bytes, to host bytes) from toaa2.f90:61
   ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from toaa2.f90:55
   ACC: Execute kernel toaa_$ck_l55_1 async(auto) from toaa2.f90:55
   ACC: Wait async(auto) from toaa2.f90:61
   ACC: Transfer 2 items (to acc 0 bytes, to host bytes) from toaa2.f90:61
   ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from toaa2.f90:55
   ACC: Execute kernel toaa_$ck_l55_1 async(auto) from toaa2.f90:55
   ACC: Wait async(auto) from toaa2.f90:61
   ACC: Transfer 2 items (to acc 0 bytes, to host bytes) from toaa2.f90:61
   ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from toaa2.f90:55
   ACC: Execute kernel toaa_$ck_l55_1 async(auto) from toaa2.f90:55
   ACC: Wait async(auto) from toaa2.f90:61

36 ftn -rad -hnocaf -DUSE_DATA -c -o toaa2.o toaa2.f90
   ftn -rad -hnocaf -DUSE_DATA -o toaa2.x toaa2.o
   pat_build -w toaa2.x
   aprun toaa2.x+pat
   Time =
   Experiment data file written:
   /lus/scratch/beyerj/openacc/toaa/toaa2.x+pat t.xf
   Application resources: utime ~4s, stime ~2s
   pat_report -T toaa2.x+pat t.xf

   Faster with 10x the work

37 Table 2: Time and Bytes Transferred for Accelerator Regions

   Host Time%   Host Time   Acc Time   Acc Copy In (MBytes)   Acc Copy Out (MBytes)   Calls   Calltree

   100.0%   Total
        %   toaa_
        %     toaa_.acc_update@li
        %     toaa_.acc_copy@li
        %     toaa_.acc_sync_wait@li
        %     toaa_.acc_update@li.71(exclusive)
        %     toaa_.acc_update@li
        %     toaa_.acc_copy@li
        %     toaa_.acc_update@li.52(exclusive)

   [[[...]]]

   Processing step 3 of 3

38
!$acc data create(a,b)
      t1 = gettime()
      stream_counter = 1
      DO j = 1,Nchunks
#ifdef FULL_ASYNC
         my_stream = j
#else
         my_stream = Streams(stream_counter)
#endif
#ifdef ASYNC
#ifndef FULL_ASYNC
!$acc wait(my_stream)
#endif
#endif
#ifdef ASYNC
!$acc update device(a(:,j)) async(my_stream)
#else
!$acc update device(a(:,j))
#endif
#ifdef ASYNC
!$acc parallel loop async(my_stream)
#else
!$acc parallel loop
#endif
         DO i = 1,Nvec
            b(i,j) = SQRT(EXP(a(i,j)*2d0))
            b(i,j) = LOG(b(i,j)**2d0)/2d0
         ENDDO
!$acc end parallel loop
#ifdef ASYNC
!$acc update host(b(:,j)) async(my_stream)
#else
!$acc update host(b(:,j))
#endif
         stream_counter = MOD(stream_counter,3) + 1
      ENDDO
!$acc wait
      t2 = gettime()
!$acc end data

39 make clean
   make
   aprun toaa2.x
   Time =
   Application resources: utime ~6s, stime ~1s

40 make clean
   make ACC=yes
   aprun toaa2.x
   Time =
   Application resources: utime ~3s, stime ~2s

41 make clean
   make ACC=yes ASYNC=yes
   aprun toaa2.x
   Time =
   Application resources: utime ~2s, stime ~2s

42 make clean
   make ACC=yes ASYNC=yes FULL_ASYNC=yes
   aprun toaa2.x
   Time =
   Application resources: utime ~2s, stime ~2s

43 async(handle): like CUDA streams
   Allows overlap of tasks on the GPU: PCIe transfers in both directions, plus multiple kernels (up to 16 with Fermi)
   Streams are identified by handle; tasks with the same handle execute sequentially
   Can wait on one, more or all tasks; the OpenACC API also allows a completeness check
   First attempt, a simple pipeline that processes the array slice by slice:
      copy data to GPU, process, bring back to CPU
      very complicated kernel operation here!
      should be able to overlap 3 streams at once
      use the slice number as the stream handle in this case; the runtime MODs it back into the allowable range
   Can actually overlap more than three streams; no benefit on this test

   INTEGER, PARAMETER :: Nvec = 10000, Nchunks =
   REAL(kind=dp) :: a(Nvec,Nchunks), b(Nvec,Nchunks)
!$acc data create(a,b)
   DO j = 1,Nchunks
!$acc update device(a(:,j)) async(j)
!$acc parallel loop async(j)
      DO i = 1,Nvec
         b(i,j) = SQRT(EXP(a(i,j)*2d0))
         b(i,j) = LOG(b(i,j)**2d0)/2d0
      ENDDO
!$acc update host(b(:,j)) async(j)
   ENDDO
!$acc wait
!$acc end data

44 Execution times (on Cray XK6):
      CPU:                 3.98s
      OpenACC, blocking:   3.6s
      OpenACC, async:      0.82s
      OpenACC, full async: 0.76s

   INTEGER, PARAMETER :: Nvec = 10000, Nchunks =
   REAL(kind=dp) :: a(Nvec,Nchunks), b(Nvec,Nchunks)
!$acc data create(a,b)
   DO j = 1,Nchunks
!$acc update device(a(:,j)) async(j)
!$acc parallel loop async(j)
      DO i = 1,Nvec
         b(i,j) = SQRT(EXP(a(i,j)*2d0))
         b(i,j) = LOG(b(i,j)**2d0)/2d0
      ENDDO
!$acc update host(b(:,j)) async(j)
   ENDDO
!$acc wait
!$acc end data

   NVIDIA Visual Profiler: time flows to the right, streams are stacked vertically
      red: data transfer to GPU
      pink: computational kernel on GPU
      blue: data transfer from GPU
      a vertical slice shows what is overlapping
      only 7 of 16 streams fit in the window; collapsed view at the bottom
      the async handle is modded by the number of streams, so you see multiple coloured bars per stream
