Motivation OpenACC and OpenMP for Accelerators Cray Compilation Environment (CCE) Examples
|
|
- Victoria Carr
- 6 years ago
- Views:
Transcription
1 Dr. James C. Beyer
2 Motivation OpenACC and OpenMP for Accelerators Cray Compilation Environment (CCE) Examples
3 Sum elements of an array Original Fortran code a=0.0 do i = 1,n a = a + b(i) end do 3
4 __global__ void reduce0(int *g_idata, int *g_odata) { extern __shared__ int sdata[]; unsigned int tid = threadIdx.x; unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; sdata[tid] = g_idata[i]; __syncthreads(); for(unsigned int s=1; s < blockDim.x; s *= 2) { if ((tid % (2*s)) == 0) { sdata[tid] += sdata[tid + s]; } __syncthreads(); } if (tid == 0) g_odata[blockIdx.x] = sdata[0]; } extern "C" void reduce0_cuda_(int *n, int *a, int *b) { int *b_d,red; const int b_size = *n; cudaMalloc((void **) &b_d, sizeof(int)*b_size); cudaMemcpy(b_d, b, sizeof(int)*b_size, cudaMemcpyHostToDevice); dim3 dimBlock(128, 1, 1); dim3 dimGrid(2048, 1, 1); dim3 small_dimGrid(16, 1, 1); int smemSize = 128 * sizeof(int); int *buffer_d, *red_d; int *small_buffer_d; cudaMalloc((void **) &buffer_d, sizeof(int)*2048); cudaMalloc((void **) &small_buffer_d, sizeof(int)*16); cudaMalloc((void **) &red_d, sizeof(int)); reduce0<<< dimGrid, dimBlock, smemSize >>>(b_d, buffer_d); reduce0<<< small_dimGrid, dimBlock, smemSize >>>(buffer_d, small_buffer_d); reduce0<<< 1, 16, smemSize >>>(small_buffer_d, red_d); cudaMemcpy(&red, red_d, sizeof(int), cudaMemcpyDeviceToHost); *a = red; cudaFree(buffer_d); cudaFree(small_buffer_d); cudaFree(b_d); } 4
5 template<class T> struct SharedMemory { __device__ inline operator T*() { extern __shared__ int smem[]; return (T*) smem; } __device__ inline operator const T*() const { extern __shared__ int smem[]; return (T*) smem; } }; template <class T, unsigned int blockSize, bool nIsPow2> __global__ void reduce6(T *g_idata, T *g_odata, unsigned int n) { T *sdata = SharedMemory<T>(); unsigned int tid = threadIdx.x; unsigned int i = blockIdx.x*blockSize*2 + threadIdx.x; unsigned int gridSize = blockSize*2*gridDim.x; T mySum = 0; while (i < n) { mySum += g_idata[i]; if (nIsPow2 || i + blockSize < n) mySum += g_idata[i+blockSize]; i += gridSize; } sdata[tid] = mySum; __syncthreads(); if (blockSize >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); } if (blockSize >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); } if (blockSize >= 128) { if (tid < 64) { sdata[tid] = mySum = mySum + sdata[tid + 64]; } __syncthreads(); } if (tid < 32) { volatile T* smem = sdata; if (blockSize >= 64) { smem[tid] = mySum = mySum + smem[tid + 32]; } if (blockSize >= 32) { smem[tid] = mySum = mySum + smem[tid + 16]; } if (blockSize >= 16) { smem[tid] = mySum = mySum + smem[tid + 8]; } if (blockSize >= 8) { smem[tid] = mySum = mySum + smem[tid + 4]; } if (blockSize >= 4) { smem[tid] = mySum = mySum + smem[tid + 2]; } if (blockSize >= 2) { smem[tid] = mySum = mySum + smem[tid + 1]; } } if (tid == 0) g_odata[blockIdx.x] = sdata[0]; } extern "C" void reduce6_cuda_(int *n, int *a, int *b) { int *b_d; const int b_size = *n; cudaMalloc((void **) &b_d, sizeof(int)*b_size); cudaMemcpy(b_d, b, sizeof(int)*b_size, cudaMemcpyHostToDevice); dim3 dimBlock(128, 1, 1); dim3 dimGrid(128, 1, 1); dim3 small_dimGrid(1, 1, 1); int smemSize = 128 * sizeof(int); int *buffer_d; int small_buffer[4],*small_buffer_d; cudaMalloc((void **) &buffer_d, sizeof(int)*128); cudaMalloc((void **) &small_buffer_d, sizeof(int)); reduce6<int,128,false><<< dimGrid, dimBlock, smemSize
>>>(b_d,buffer_d, b_size); reduce6<int,128,false><<< small_dimGrid, dimBlock, smemSize >>>(buffer_d, small_buffer_d,128); cudaMemcpy(small_buffer, small_buffer_d, sizeof(int), cudaMemcpyDeviceToHost); *a = *small_buffer; cudaFree(buffer_d); cudaFree(small_buffer_d); cudaFree(b_d); } 5
6 TM Compiler does the work: Identifies parallel loops within the region Determines the kernels needed Splits the code into accelerator and host portions Workshares loops running on accelerator Make use of MIMD and SIMD style parallelism Data movement allocates/frees GPU memory at start/end of region moves of data to/from GPU!$acc data present(a,b)!$acc parallel a = 0.0!$acc loop reduction(+:a) do i = 1,n a = a + b(i) end do!$acc end parallel!$acc end data 6
7 90. subroutine sum_of_int_4(n,a,b) 91. use global_data 92. integer*4 a,b(n) 93. integer*8 start_clock, elapsed_clocks, end_clock 94.!$acc data present(a,b) 95. G----<!$acc parallel 96. G a = G!$acc loop reduction(+:a) 98. G g--< do i = 1,n 99. G g a = a + b(i) 100. G g--> end do 101. G---->!$acc end parallel 102.!$acc end data 103. end subroutine sum_of_int_4 ftn-6413 ftn: ACCEL File = gpu_reduce_int_cce.f90, Line = 94 A data region was created at line 94 and ending at line 107. ftn-6405 ftn: ACCEL File = gpu_reduce_int_cce.f90, Line = 95 A region starting at line 95 and ending at line 101 was placed on the accelerator. ftn-6430 ftn: ACCEL File = gpu_reduce_int_cce.f90, Line = 98 A loop starting at line 98 was partitioned across the threadblocks and the 128 threads within a threadblock.
8 Summary of code complexity and performance Programming Model Unit of computation Lines of code Performance in Gflops (higher is better) Performance normalized to X86 core Fortran Single x86 core Gflops 1.0 Simple CUDA GPU Gflops 0.87 Optimized CUDA GPU Gflops 5.25 OpenACC GPU Gflops
9 A common directive programming model for today's GPUs Announced at SC11 conference Offers portability between compilers Drawn up by: NVIDIA, Cray, PGI, CAPS Multiple compilers offer portability, debugging, permanence Works for Fortran, C, C++ Standard available at www.openacc.org Initially implementations targeted at NVIDIA GPUs Current version: 1.0 (November 2011) Compiler support: Cray CCE: partial 2011, complete in 2012 PGI Accelerator: released product in 2012 CAPS: released product in
10 A common directive programming model for shared memory systems Announced 15yrs ago Works with Fortran, C, C++ Current version 3.1 (July 2011) Compiler support
11 Host-directed execution with attached GPU Main program executes on host (i.e. CPU) Compute intensive regions offloaded to the accelerator device under control of the host. device (i.e. GPU) executes parallel regions typically contain kernels (i.e. work-sharing loops), or kernels regions, containing one or more loops which are executed as kernels. Host must orchestrate the execution by: allocating memory on the accelerator device, initiating data transfer, sending the code to the accelerator, passing arguments to the parallel region, queuing the device code, waiting for completion, transferring results back to the host, and deallocating memory. Host can usually queue a sequence of operations to be executed on the device, one after the other. 11
12 Memory spaces on the host and device distinct Different locations, different address space Data movement performed by host using runtime library calls that explicitly move data between the separate GPUs have a weak memory model No synchronisation between different execution units (SMs) Unless explicit memory barrier Can write OpenACC kernels with race conditions OpenACC Giving inconsistent execution results Compiler will catch most errors, but not all (no user-managed barriers) data movement between the memories implicit managed by the compiler, based on directives from the programmer. Device memory caches are managed by the compiler with hints from the programmer in the form of directives. 12
13 A high level approach to programming accelerators Use original source code (Fortran, C, C++) Don't need separate source-base, much more portable Can preserve subprogram structure Familiar programming model if used traditional OpenMP A small performance gap acceptable Target is 10-15%, currently seeing better than that for many cases Draft proposal currently being discussed by OpenMP ARB subcommittee, aiming for OpenMP 4.0 (sometime in 2012) Cray an enthusiastic supporter Co-chairs committee (with TI) CCE already provides reference implementation of the draft standard Subcommittee includes most major vendors Cray, PGI, Nvidia, AMD, TI, IBM, CAPS, Oracle, Intel... An open standard is the most attractive for developers Portability, multiple compilers for debugging 13
14 OpenACC Parallel Kernels Data Loop Host data Cache Update Wait Declare OpenMP for Accelerators Accelerate Dispatch Data Do/for Part of data Nested data Update Wait Mirror/resident
15 man intro_openacc Which module to use, craype-accel-nvidia20 Forces dynamic linking Single object file Whole program Messages/list file Compiles to PTX not cuda Debugger sees original program not cuda intermediate 15
16 Only two constructs are un-implemented: Cache, Declare. One unimplemented data clause: deviceptr 16
17 /* takes a host pointer */ void* cray_acc_create( void*, size_t ); void cray_acc_delete( void* ); void* cray_acc_copyin( void*, size_t ); void cray_acc_copyout( void*, size_t ); void cray_acc_updatein( void*, size_t ); void cray_acc_updateout( void*, size_t ); int cray_acc_is_present( void* ); int cray_acc_is_present_2( void*, size_t); void *cray_acc_deviceptr( void* ); /* takes a device and host pointer */ void cray_acc_memcpy_device_host( void*, void*, size_t ); /* takes a host and device pointer */ void cray_acc_memcpy_host_device( void*, void*, size_t ); 17
18 1.!$acc loop gang : across thread blocks 2.!$acc loop worker : across warps within a thread block 3.!$acc loop vector : across threads within a warp 1.!$acc loop gang : across thread blocks 2.!$acc loop worker vector : across threads within a thread block 1.!$acc loop gang : across thread blocks 2.!$acc loop vector : across threads within a thread block 1.!$acc loop gang worker: across thread blocks and the warps within a thread block 2.!$acc loop vector : across threads within a warp 1.!$acc loop gang vector : across thread blocks and threads within a thread block 1.!$acc loop gang worker vector : across thread blocks and threads within a thread block 18
19 You can also force things to be within a single thread block: 1.!$acc loop worker : across warps within a single thread block 2.!$acc loop vector : across threads within a warp 1.!$acc worker vector : across threads within a single thread block 1.!$acc vector : across threads within a single thread block 19
20 All parallel regions should contain a loop directive Always start with parallel loop (not just parallel) Fortran assumed size (A(*)) and C pointers must be shaped Always use : when shaping with an entire dimension (i.e. A(:,1:2) ) First get your application working without data regions, then add data regions First get your application working without async, then add async 20
21 Reduction Libsci Pattern match Optimization Perftools Data regions Async clauses
22 ftn -rm -o cpu cpu_reduce_int.f90 aprun./cpu Integer reduction of 2^18 elements run on the CPU Kernel 1) flops GFlops int*4 sum_of_int_4 Application resources: utime ~0s, stime ~0s
23 nvcc -arch=sm_20 -c reduce0.cu reduce6.cu ftn -rm -o gpu_cuda gpu_reduce_int_cuda.f90 reduce0.o reduce6.o aprun./gpu_cuda ACC: Transfer 3 items (to acc bytes, to host 0 bytes) from gpu_reduce_int_cuda.f90:107 ACC: Execute kernel wake_up_gpu_$ck_l107_1 from gpu_reduce_int_cuda.f90:107 ACC: Synchronize from gpu_reduce_int_cuda.f90:113 ACC: Transfer 3 items (to acc 0 bytes, to host bytes) from gpu_reduce_int_cuda.f90: Integer reduction of 2^18 integers using SDK CUDA kernel reduce Based on micro-seconds measured by the CUDA profiler performance is: Gflops Correct result of Integer reduction of 2^18 integers using SDK CUDA kernel reduce Based on micro-seconds measured by the CUDA profiler performance is: Gflops Correct result of
24 ftn -rm hmsgs -o gpu_cce gpu_reduce_int_cce.f90 aprun -L$allgpunodes./gpu_cce ACC: Transfer 3 items (to acc bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:80 ACC: Execute kernel wake_up_gpu_$ck_l80_1 from gpu_reduce_int_cce.f90:80 ACC: Synchronize from gpu_reduce_int_cce.f90:86 ACC: Transfer 3 items (to acc 0 bytes, to host bytes) from gpu_reduce_int_cce.f90:86 ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:43 ACC: Transfer 2 items (to acc 0 bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:95 ACC: Transfer 2 items (to acc 4 bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:96 ACC: Execute kernel sum_of_int_4_$ck_l96_3 from gpu_reduce_int_cce.f90:96 ACC: Transfer 2 items (to acc 0 bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:106 ACC: Transfer 2 items (to acc 0 bytes, to host 0 bytes) from gpu_reduce_int_cce.f90:107 ACC: Synchronize from gpu_reduce_int_cce.f90:48 ACC: Transfer 2 items (to acc 0 bytes, to host 4 bytes) from gpu_reduce_int_cce.f90: Integer reduction of 2^18 integers using CCE Open acc directives Based on 31.5 micro-seconds measured by the CUDA profiler performance is: Gflops Correct result of
25 !$acc data copyin(a,b) copy(c,c3)!$acc host_data use_device(a,b,c,c3) t1 = MPI_Wtime() call dgemm('n','n',m,n,k,alpha,a,lda,b,ldb,beta,c,ldc) t2 = MPI_Wtime() t_acc = t2-t1 t1 = MPI_Wtime() call dgemm_acc('n','n',m,n,k,alpha,a,lda,b,ldb,beta,c3,ldc) t2 = MPI_Wtime()!$acc end host_data!$acc end data
26 ftn -hmsgs -o f_dgemm_openacc f_dgemm_openacc.f90!$acc kernels copy(c3) ftn-6413 crayftn: ACCEL EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 15 A data region was created at line 15 and ending at line 17. ftn-6415 crayftn: ACCEL EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 15 Allocate memory and copy whole array "c3" to accelerator, copy back at line 17 (acc_copy). c3(1,1) = 1 ftn-6405 crayftn: ACCEL EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 16 A region starting at line 16 and ending at line 16 was placed on the accelerator. c2(:,:) = c(:,:) ftn-6066 crayftn: SCALAR EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 40 A loop nest at line 40 collapsed to a single loop. ftn-6005 crayftn: SCALAR EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 40 A loop starting at line 40 was unrolled 4 times. ftn-6204 crayftn: VECTOR EXAMPLE_OPENACC, File = f_dgemm_openacc.f90, Line = 40 A loop starting at line 40 was vectorized.
27 aprun -L$allgpunodes./f_dgemm_openacc Matrix Size 1024 Error 0. Error between GPU calls 0. DGEMM_ACC_SIMPLE: GFlops in E-3 sec DGEMM_ACC: GFlops in E-3 sec DGEMM_CPU: GFlops in E-2 sec Matrix Size 4096 Error E-12 Error between GPU calls E-12 DGEMM_ACC_SIMPLE: GFlops in sec DGEMM_ACC: GFlops in sec DGEMM_CPU: GFlops in sec Matrix Size 8192 Error E-11 Error between GPU calls E-11 DGEMM_ACC_SIMPLE: GFlops in sec DGEMM_ACC: GFlops in sec DGEMM_CPU: GFlops in sec Application resources: utime ~36s, stime ~4s
28 do j = 1,L do i = 1,N do k = 1,M C(i,j) = C(i,j) + A(i,k)*B(k,j) enddo enddo enddo
29 ftn -hmsgs omatmul matmul_2.f ftn-6202 crayftn: VECTOR MMUL, File = matmul_2.f, Line = 31 A loop starting at line 31 was replaced by a library call. explain ftn-6202 VECTOR: A loop starting at line %s was replaced by a library call. A loop or nest of loops was found to match a library routine. The loop code was replaced by a call to the corresponding routine. Only one pattern was matched and it covered the entire loop. grep dgemm matmul_2.f nm matmul grep dgemm U _dgemm_
30 aprun -L60./matmul arrays sized 4092 by 1024 by 1024 calling mmul Gigaflops including data xfer: C(1,1) = C(2,2) = calling mmul_nopattern No pattern Gigaflops: C(1,1) = No errors found Application resources: utime ~9s, stime ~1s
31 #ifdef USE_DATA!$acc data create(a,b) #endif t1 = gettime() stream_counter = 1 DO j = 1,Nchunks #ifdef FULL_ASYNC my_stream = j #else my_stream = Streams(stream_counter) #endif #ifdef ASYNC #ifndef FULL_ASYNC!$acc wait(my_stream) #endif #endif #ifdef USE_DATA #ifdef ASYNC!$acc update device(a(:,j)) async(my_stream) #else!$acc update device(a(:,j)) #endif #endif #ifdef ASYNC!$acc parallel loop async(my_stream) #else!$acc parallel loop #endif DO i = 1,Nvec b(i,j) = SQRT(EXP(a(i,j)*2d0)) b(i,j) = LOG(b(i,j)**2d0)/2d0 ENDDO!$acc end parallel loop #ifdef USE_DATA #ifdef ASYNC!$acc update host(b(:,j)) async(my_stream) #else!$acc update host(b(:,j)) #endif #endif stream_counter = MOD(stream_counter,3) + 1 ENDDO!$acc wait t2 = gettime()!$acc end data
32 ftn -rad -hnocaf -c -o toaa2.o toaa2.f90 ftn -rad -hnocaf -o toaa2.x toaa2.o pat_build -w toaa2.x aprun toaa2.x+pat Time = Experiment data file written: /lus/scratch/beyerj/openacc/toaa/toaa2.x+pat t.xf Application resources: utime ~83s, stime ~7s pat_report T toaa2.x+pat t.xf
33 Table 1: Profile by Function Group and Function Time% Time Imb. Imb. Calls Group Time Time% Function 100.0% Total % USER % % % toaa_ 0.0% % % % exit 0.0% ====================================================================== 0.0% ETC =======================================================================
34 Table 2: Time and Bytes Transferred for Accelerator Regions Host Host Acc Acc Copy Acc Copy Calls Calltree Time% Time Time In Out (MBytes) (MBytes) 100.0% Total % toaa_ % % toaa_.acc_copy@li % toaa_.acc_copy@li % toaa_.acc_kernel@li % toaa_.acc_sync_wait@li % toaa_.acc_region@li.59(exclusive) =============================================================================================== 0.0% toaa_.acc_sync_wait@li.79 ================================================================================================= Processing step 3 of 3
35 ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from toaa2.f90:55 ACC: Execute kernel toaa_$ck_l55_1 async(auto) from toaa2.f90:55 ACC: Wait async(auto) from toaa2.f90:61 ACC: Transfer 2 items (to acc 0 bytes, to host bytes) from toaa2.f90:61 ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from toaa2.f90:55 ACC: Execute kernel toaa_$ck_l55_1 async(auto) from toaa2.f90:55 ACC: Wait async(auto) from toaa2.f90:61 ACC: Transfer 2 items (to acc 0 bytes, to host bytes) from toaa2.f90:61 ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from toaa2.f90:55 ACC: Execute kernel toaa_$ck_l55_1 async(auto) from toaa2.f90:55 ACC: Wait async(auto) from toaa2.f90:61 ACC: Transfer 2 items (to acc 0 bytes, to host bytes) from toaa2.f90:61 ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from toaa2.f90:55 ACC: Execute kernel toaa_$ck_l55_1 async(auto) from toaa2.f90:55 ACC: Wait async(auto) from toaa2.f90:61 ACC: Transfer 2 items (to acc 0 bytes, to host bytes) from toaa2.f90:61 ACC: Transfer 2 items (to acc bytes, to host 0 bytes) from toaa2.f90:55 ACC: Execute kernel toaa_$ck_l55_1 async(auto) from toaa2.f90:55 ACC: Wait async(auto) from toaa2.f90:61
36 ftn -rad -hnocaf -DUSE_DATA -c -o toaa2.o toaa2.f90 ftn -rad -hnocaf -DUSE_DATA -o toaa2.x toaa2.o pat_build -w toaa2.x aprun toaa2.x+pat Time = Experiment data file written: /lus/scratch/beyerj/openacc/toaa/toaa2.x+pat t.xf Application resources: utime ~4s, stime ~2s pat_report T toaa2.x+pat t.xf Faster with 10x the work
37 Table 2: Time and Bytes Transferred for Accelerator Regions Host Host Acc Acc Copy Acc Copy Calls Calltree Time% Time Time In Out (MBytes) (MBytes) 100.0% Total % toaa_ % toaa_.acc_update@li % toaa_.acc_copy@li % toaa_.acc_sync_wait@li % toaa_.acc_update@li.71(exclusive) ================================================================================================== % toaa_.acc_update@li % toaa_.acc_copy@li % toaa_.acc_update@li.52(exclusive) ================================================================================================== [[[...]]] Processing step 3 of 3
38 !$acc data create(a,b) t1 = gettime() stream_counter = 1 DO j = 1,Nchunks #ifdef FULL_ASYNC my_stream = j #else my_stream = Streams(stream_counter) #endif #ifdef ASYNC #ifndef FULL_ASYNC!$acc wait(my_stream) #endif #endif #ifdef ASYNC!$acc update device(a(:,j)) async(my_stream) #else!$acc update device(a(:,j)) #endif #ifdef ASYNC!$acc parallel loop async(my_stream) #else!$acc parallel loop #endif DO i = 1,Nvec b(i,j) = SQRT(EXP(a(i,j)*2d0)) b(i,j) = LOG(b(i,j)**2d0)/2d0 ENDDO!$acc end parallel loop #ifdef ASYNC!$acc update host(b(:,j)) async(my_stream) #else!$acc update host(b(:,j)) #endif stream_counter = MOD(stream_counter,3) + 1 ENDDO!$acc wait t2 = gettime()!$acc end data
39 make clean make aprun toaa2.x Time = Application resources: utime ~6s, stime ~1s
40 make clean make ACC=yes aprun toaa2.x Time = Application resources: utime ~3s, stime ~2s
41 make clean make ACC=yes ASYNC=yes aprun toaa2.x Time = Application resources: utime ~2s, stime ~2s
42 make clean make ACC=yes ASYNC=yes FULL_ASYNC=yes aprun toaa2.x Time = Application resources: utime ~2s, stime ~2s
43 async(handle): like CUDA streams allows overlap of tasks on GPU PCIe transfers in both directions Plus multiple kernels (up to 16 with Fermi) streams identified by handle tasks with same handle execute sequentially can wait on one, more or all tasks OpenACC API also allows completeness check First attempt, a simple pipeline: processes array, slice by slice copy data to GPU, process, bring back to CPU very complicated kernel operation here! should be able to overlap 3 streams at once use slice number as stream handle in this case runtime MODs it back into allowable range Can actually overlap more than three stream No benefit on this test INTEGER, PARAMETER :: Nvec = 10000, Nchunks = REAL(kind=dp) :: a(nvec,nchunks), b(nvec,nchunks)!$acc data create(a,b) DO j = 1,Nchunks!$acc update device(a(:,j)) async(j)!$acc parallel loop async(j) DO i = 1,Nvec b(i,j) = SQRT(EXP(a(i,j)*2d0)) b(i,j) = LOG(b(i,j)**2d0)/2d0 ENDDO!$acc update host(b(:,j)) async(j) ENDDO!$acc wait!$acc end data 43
44 INTEGER, PARAMETER :: Nvec = 10000, Nchunks = Execution times (on Cray XK6): CPU: 3.98s OpenACC, blocking: 3.6s OpenACC, async: 0.82s OpenACC, full async: 0.76s REAL(kind=dp) :: a(nvec,nchunks), b(nvec,nchunks)!$acc data create(a,b) DO j = 1,Nchunks!$acc update device(a(:,j)) async(j)!$acc parallel loop async(j) DO i = 1,Nvec b(i,j) = SQRT(EXP(a(i,j)*2d0)) b(i,j) = LOG(b(i,j)**2d0)/2d0 ENDDO!$acc update host(b(:,j)) async(j) NVIDIA Visual profiler: time flows to right, streams stacked vertically red: data transfer to GPU pink: computational kernel on GPU blue: data transfer from GPU vertical slice shows what is overlapping only 7 of 16 streams fit in window collapsed view at bottom async handle modded by number of streams so see multiple coloured bars per stream ENDDO!$acc wait!$acc end data 44
Portable and Productive Performance on Hybrid Systems with OpenACC Compilers and Tools
Portable and Productive Performance on Hybrid Systems with OpenACC Compilers and Tools Luiz DeRose Sr. Principal Engineer Programming Environments Director Cray Inc. Major Hybrid Multi Petaflop Systems
More informationOptimizing Parallel Reduction in CUDA. Mark Harris NVIDIA Developer Technology
Optimizing Parallel Reduction in CUDA Mark Harris NVIDIA Developer Technology Parallel Reduction Common and important data parallel primitive Easy to implement in CUDA Harder to get it right Serves as
More informationOptimizing Parallel Reduction in CUDA. Mark Harris NVIDIA Developer Technology
Optimizing Parallel Reduction in CUDA Mark Harris NVIDIA Developer Technology Parallel Reduction Common and important data parallel primitive Easy to implement in CUDA Harder to get it right Serves as
More informationOptimizing Parallel Reduction in CUDA
Optimizing Parallel Reduction in CUDA Mark Harris NVIDIA Developer Technology http://developer.download.nvidia.com/assets/cuda/files/reduction.pdf Parallel Reduction Tree-based approach used within each
More informationOpenACC and the Cray Compilation Environment James Beyer PhD
OpenACC and the Cray Compilation Environment James Beyer PhD Agenda A brief introduction to OpenACC Cray Programming Environment (PE) Cray Compilation Environment, CCE An in depth look at CCE 8.2 and OpenACC
More informationCray Programming Environment Update Luiz DeRose & Heidi Poxon Cray Inc.
Cray Programming Environment Update Luiz DeRose & Heidi Poxon Cray Inc. 1 Legal Disclaimer Information in this document is provided in connection with Cray Inc. products. No license, express or implied,
More informationGPU programming: CUDA basics. Sylvain Collange Inria Rennes Bretagne Atlantique
GPU programming: CUDA basics Sylvain Collange Inria Rennes Bretagne Atlantique sylvain.collange@inria.fr This lecture: CUDA programming We have seen some GPU architecture Now how to program it? 2 Outline
More informationCUDA. Sathish Vadhiyar High Performance Computing
CUDA Sathish Vadhiyar High Performance Computing Hierarchical Parallelism Parallel computations arranged as grids One grid executes after another Grid consists of blocks Blocks assigned to SM. A single
More informationCUDA Programming. Week 1. Basic Programming Concepts Materials are copied from the reference list
CUDA Programming Week 1. Basic Programming Concepts Materials are copied from the reference list G80/G92 Device SP: Streaming Processor (Thread Processors) SM: Streaming Multiprocessor 128 SP grouped into
More informationComparing OpenACC 2.5 and OpenMP 4.1 James C Beyer PhD, Sept 29 th 2015
Comparing OpenACC 2.5 and OpenMP 4.1 James C Beyer PhD, Sept 29 th 2015 Abstract As both an OpenMP and OpenACC insider I will present my opinion of the current status of these two directive sets for programming
More informationGPU programming: CUDA. Sylvain Collange Inria Rennes Bretagne Atlantique
GPU programming: CUDA Sylvain Collange Inria Rennes Bretagne Atlantique sylvain.collange@inria.fr This lecture: CUDA programming We have seen some GPU architecture Now how to program it? 2 Outline GPU
More informationS Comparing OpenACC 2.5 and OpenMP 4.5
April 4-7, 2016 Silicon Valley S6410 - Comparing OpenACC 2.5 and OpenMP 4.5 James Beyer, NVIDIA Jeff Larkin, NVIDIA GTC16 April 7, 2016 History of OpenMP & OpenACC AGENDA Philosophical Differences Technical
More informationNVIDIA CUDA. Fermi Compatibility Guide for CUDA Applications. Version 1.1
NVIDIA CUDA Fermi Compatibility Guide for CUDA Applications Version 1.1 4/19/2010 Table of Contents Software Requirements... 1 What Is This Document?... 1 1.1 Application Compatibility on Fermi... 1 1.2
More informationOpenACC Accelerator Directives. May 3, 2013
OpenACC Accelerator Directives May 3, 2013 OpenACC is... An API Inspired by OpenMP Implemented by Cray, PGI, CAPS Includes functions to query device(s) Evolving Plan to integrate into OpenMP Support of
More informationAn Introduction to OpenAcc
An Introduction to OpenAcc ECS 158 Final Project Robert Gonzales Matthew Martin Nile Mittow Ryan Rasmuss Spring 2016 1 Introduction: What is OpenAcc? OpenAcc stands for Open Accelerators. Developed by
More informationHeidi Poxon Cray Inc.
Heidi Poxon Topics GPU support in the Cray performance tools CUDA proxy MPI support for GPUs (GPU-to-GPU) 2 3 Programming Models Supported for the GPU Goal is to provide whole program analysis for programs
More informationBlue Waters Programming Environment
December 3, 2013 Blue Waters Programming Environment Blue Waters User Workshop December 3, 2013 Science and Engineering Applications Support Documentation on Portal 2 All of this information is Available
More informationPPAR: CUDA basics. Sylvain Collange Inria Rennes Bretagne Atlantique PPAR
PPAR: CUDA basics Sylvain Collange Inria Rennes Bretagne Atlantique sylvain.collange@inria.fr http://www.irisa.fr/alf/collange/ PPAR - 2018 This lecture: CUDA programming We have seen some GPU architecture
More informationOPENMP FOR ACCELERATORS
7th International Workshop on OpenMP Chicago, Illinois, USA James C. Beyer, Eric J. Stotzer, Alistair Hart, and Bronis R. de Supinski OPENMP FOR ACCELERATORS Accelerator programming Why a new model? There
More informationPGI Accelerator Programming Model for Fortran & C
PGI Accelerator Programming Model for Fortran & C The Portland Group Published: v1.3 November 2010 Contents 1. Introduction... 5 1.1 Scope... 5 1.2 Glossary... 5 1.3 Execution Model... 7 1.4 Memory Model...
More informationHybrid multicore supercomputing and GPU acceleration with next-generation Cray systems
Hybrid multicore supercomputing and GPU acceleration with next-generation Cray systems Alistair Hart Cray Exascale Research Initiative Europe T-Systems HPCN workshop DLR Braunschweig 26.May.11 ahart@cray.com
More informationIntroduction to Parallel Computing with CUDA. Oswald Haan
Introduction to Parallel Computing with CUDA Oswald Haan ohaan@gwdg.de Schedule Introduction to Parallel Computing with CUDA Using CUDA CUDA Application Examples Using Multiple GPUs CUDA Application Libraries
More informationOpenACC Standard. Credits 19/07/ OpenACC, Directives for Accelerators, Nvidia Slideware
OpenACC Standard Directives for Accelerators Credits http://www.openacc.org/ o V1.0: November 2011 Specification OpenACC, Directives for Accelerators, Nvidia Slideware CAPS OpenACC Compiler, HMPP Workbench
More informationOpenACC. Introduction and Evolutions Sebastien Deldon, GPU Compiler engineer
OpenACC Introduction and Evolutions Sebastien Deldon, GPU Compiler engineer 3 WAYS TO ACCELERATE APPLICATIONS Applications Libraries Compiler Directives Programming Languages Easy to use Most Performance
More informationOpenACC 2.6 Proposed Features
OpenACC 2.6 Proposed Features OpenACC.org June, 2017 1 Introduction This document summarizes features and changes being proposed for the next version of the OpenACC Application Programming Interface, tentatively
More informationAdrian Tate XK6 / openacc workshop Manno, Mar
Adrian Tate XK6 / openacc workshop Manno, Mar6-7 2012 1 Overview & Philosophy Two modes of usage Contents Present contents Upcoming releases Optimization of libsci_acc Autotuning Adaptation Asynchronous
More informationGPU Programming. Alan Gray, James Perry EPCC The University of Edinburgh
GPU Programming EPCC The University of Edinburgh Contents NVIDIA CUDA C Proprietary interface to NVIDIA architecture CUDA Fortran Provided by PGI OpenCL Cross platform API 2 NVIDIA CUDA CUDA allows NVIDIA
More informationOpenACC compiling and performance tips. May 3, 2013
OpenACC compiling and performance tips May 3, 2013 OpenACC compiler support Cray Module load PrgEnv-cray craype-accel-nvidia35 Fortran -h acc, noomp # openmp is enabled by default, be careful mixing -fpic
More informationAn Introduction to OpenACC
An Introduction to OpenACC James Beyer PhD 6.May.2013 1 Timetable Monday 6 th May 2013 8:30 Lecture 1: Introduction to the Cray XK7 (15) 8:45 Lecture 2: OpenACC organization (Duncan Poole) (15) 9:00 Lecture
More informationarxiv: v1 [hep-lat] 12 Nov 2013
Lattice Simulations using OpenACC compilers arxiv:13112719v1 [hep-lat] 12 Nov 2013 Indian Association for the Cultivation of Science, Kolkata E-mail: tppm@iacsresin OpenACC compilers allow one to use Graphics
More informationPortable and Productive Performance with OpenACC Compilers and Tools. Luiz DeRose Sr. Principal Engineer Programming Environments Director Cray Inc.
Portable and Productive Performance with OpenACC Compilers and Tools Luiz DeRose Sr. Principal Engineer Programming Environments Director Cray Inc. 1 Cray: Leadership in Computational Research Earth Sciences
More informationIntroduction to CUDA CME343 / ME May James Balfour [ NVIDIA Research
Introduction to CUDA CME343 / ME339 18 May 2011 James Balfour [ jbalfour@nvidia.com] NVIDIA Research CUDA Programing system for machines with GPUs Programming Language Compilers Runtime Environments Drivers
More informationOpenACC Course. Office Hour #2 Q&A
OpenACC Course Office Hour #2 Q&A Q1: How many threads does each GPU core have? A: GPU cores execute arithmetic instructions. Each core can execute one single precision floating point instruction per cycle
More informationLecture 11: GPU programming
Lecture 11: GPU programming David Bindel 4 Oct 2011 Logistics Matrix multiply results are ready Summary on assignments page My version (and writeup) on CMS HW 2 due Thursday Still working on project 2!
More informationLecture 2: CUDA Programming
CS 515 Programming Language and Compilers I Lecture 2: CUDA Programming Zheng (Eddy) Zhang Rutgers University Fall 2017, 9/12/2017 Review: Programming in CUDA Let s look at a sequential program in C first:
More informationLattice Simulations using OpenACC compilers. Pushan Majumdar (Indian Association for the Cultivation of Science, Kolkata)
Lattice Simulations using OpenACC compilers Pushan Majumdar (Indian Association for the Cultivation of Science, Kolkata) OpenACC is a programming standard for parallel computing developed by Cray, CAPS,
More informationParallelism III. MPI, Vectorization, OpenACC, OpenCL. John Cavazos,Tristan Vanderbruggen, and Will Killian
Parallelism III MPI, Vectorization, OpenACC, OpenCL John Cavazos,Tristan Vanderbruggen, and Will Killian Dept of Computer & Information Sciences University of Delaware 1 Lecture Overview Introduction MPI
More informationCS671 Parallel Programming in the Many-Core Era
CS671 Parallel Programming in the Many-Core Era Lecture 3: GPU Programming - Reduce, Scan & Sort Zheng Zhang Rutgers University Review: Programming with CUDA An Example in C Add vector A and vector B to
More informationPortable and Productive Performance on Hybrid Systems with libsci_acc Luiz DeRose Sr. Principal Engineer Programming Environments Director Cray Inc.
Portable and Productive Performance on Hybrid Systems with libsci_acc Luiz DeRose Sr. Principal Engineer Programming Environments Director Cray Inc. 1 What is Cray Libsci_acc? Provide basic scientific
More informationAn OpenACC construct is an OpenACC directive and, if applicable, the immediately following statement, loop or structured block.
API 2.6 R EF ER ENC E G U I D E The OpenACC API 2.6 The OpenACC Application Program Interface describes a collection of compiler directives to specify loops and regions of code in standard C, C++ and Fortran
More informationIntroduction to GPU programming. Introduction to GPU programming p. 1/17
Introduction to GPU programming Introduction to GPU programming p. 1/17 Introduction to GPU programming p. 2/17 Overview GPUs & computing Principles of CUDA programming One good reference: David B. Kirk
More informationPractical: a sample code
Practical: a sample code Alistair Hart Cray Exascale Research Initiative Europe 1 Aims The aim of this practical is to examine, compile and run a simple, pre-prepared OpenACC code The aims of this are:
More informationIntroduction to GPU Computing Using CUDA. Spring 2014 Westgrid Seminar Series
Introduction to GPU Computing Using CUDA Spring 2014 Westgrid Seminar Series Scott Northrup SciNet www.scinethpc.ca (Slides http://support.scinet.utoronto.ca/ northrup/westgrid CUDA.pdf) March 12, 2014
More informationIntroduction to GPU Computing Using CUDA. Spring 2014 Westgrid Seminar Series
Introduction to GPU Computing Using CUDA Spring 2014 Westgrid Seminar Series Scott Northrup SciNet www.scinethpc.ca March 13, 2014 Outline 1 Heterogeneous Computing 2 GPGPU - Overview Hardware Software
More informationProfiling and Parallelizing with the OpenACC Toolkit OpenACC Course: Lecture 2 October 15, 2015
Profiling and Parallelizing with the OpenACC Toolkit OpenACC Course: Lecture 2 October 15, 2015 Oct 1: Introduction to OpenACC Oct 6: Office Hours Oct 15: Profiling and Parallelizing with the OpenACC Toolkit
More informationComputer Architecture
Jens Teubner Computer Architecture Summer 2017 1 Computer Architecture Jens Teubner, TU Dortmund jens.teubner@cs.tu-dortmund.de Summer 2017 Jens Teubner Computer Architecture Summer 2017 34 Part II Graphics
More informationModule 2: Introduction to CUDA C. Objective
ECE 8823A GPU Architectures Module 2: Introduction to CUDA C 1 Objective To understand the major elements of a CUDA program Introduce the basic constructs of the programming model Illustrate the preceding
More informationCOMP Parallel Computing. Programming Accelerators using Directives
COMP 633 - Parallel Computing Lecture 15 October 30, 2018 Programming Accelerators using Directives Credits: Introduction to OpenACC and toolkit Jeff Larkin, Nvidia COMP 633 - Prins Directives for Accelerator
More informationAddressing the Increasing Challenges of Debugging on Accelerated HPC Systems. Ed Hinkel Senior Sales Engineer
Addressing the Increasing Challenges of Debugging on Accelerated HPC Systems Ed Hinkel Senior Sales Engineer Agenda Overview - Rogue Wave & TotalView GPU Debugging with TotalView Nvdia CUDA Intel Phi 2
More informationGPGPU. Alan Gray/James Perry EPCC The University of Edinburgh.
GPGPU Alan Gray/James Perry EPCC The University of Edinburgh a.gray@ed.ac.uk Contents Introduction GPU Technology Programming GPUs GPU Performance Optimisation 2 Introduction 3 Introduction Central Processing
More informationAdvanced CUDA Optimizations
Advanced CUDA Optimizations General Audience Assumptions General working knowledge of CUDA Want kernels to perform better Profiling Before optimizing, make sure you are spending effort in correct location
More informationIntroduction to OpenACC. 16 May 2013
Introduction to OpenACC 16 May 2013 GPUs Reaching Broader Set of Developers 1,000,000 s 100,000 s Early Adopters Research Universities Supercomputing Centers Oil & Gas CAE CFD Finance Rendering Data Analytics
More informationπ = 4 N in_circle N total ... // includes # define N 1000000 float uniform_rand(unsigned* seed, float lower, float upper) {... int main () { int num_in_circ = 0; float x, y, dist; #pragma omp parallel
More informationPGI Fortran & C Accelerator Programming Model. The Portland Group
PGI Fortran & C Accelerator Programming Model The Portland Group Published: v0.72 December 2008 Contents 1. Introduction...3 1.1 Scope...3 1.2 Glossary...3 1.3 Execution Model...4 1.4 Memory Model...5
More informationA High Level Programming Environment for Accelerated Computing. Luiz DeRose Sr. Principal Engineer Programming Environments Director Cray Inc.
A High Level Programming Environment for Accelerated Computing Luiz DeRose Sr. Principal Engineer Programming Environments Director Cray Inc. Outline Motivation Cray XK6 Overview Why a new programming
More informationOpenACC programming for GPGPUs: Rotor wake simulation
DLR.de Chart 1 OpenACC programming for GPGPUs: Rotor wake simulation Melven Röhrig-Zöllner, Achim Basermann Simulations- und Softwaretechnik DLR.de Chart 2 Outline Hardware-Architecture (CPU+GPU) GPU computing
More informationTechnische Universität München. GPU Programming. Rüdiger Westermann Chair for Computer Graphics & Visualization. Faculty of Informatics
GPU Programming Rüdiger Westermann Chair for Computer Graphics & Visualization Faculty of Informatics Overview Programming interfaces and support libraries The CUDA programming abstraction An in-depth
More informationTesla Architecture, CUDA and Optimization Strategies
Tesla Architecture, CUDA and Optimization Strategies Lan Shi, Li Yi & Liyuan Zhang Hauptseminar: Multicore Architectures and Programming Page 1 Outline Tesla Architecture & CUDA CUDA Programming Optimization
More informationOpenACC Fundamentals. Steve Abbott November 15, 2017
OpenACC Fundamentals Steve Abbott, November 15, 2017 AGENDA Data Regions Deep Copy 2 while ( err > tol && iter < iter_max ) { err=0.0; JACOBI ITERATION #pragma acc parallel loop reduction(max:err)
More informationParallel Programming and Debugging with CUDA C. Geoff Gerfin Sr. System Software Engineer
Parallel Programming and Debugging with CUDA C Geoff Gerfin Sr. System Software Engineer CUDA - NVIDIA s Architecture for GPU Computing Broad Adoption Over 250M installed CUDA-enabled GPUs GPU Computing
More informationOpenACC 2.5 and Beyond. Michael Wolfe PGI compiler engineer
OpenACC 2.5 and Beyond Michael Wolfe PGI compiler engineer michael.wolfe@pgroup.com OpenACC Timeline 2008 PGI Accelerator Model (targeting NVIDIA GPUs) 2011 OpenACC 1.0 (targeting NVIDIA GPUs, AMD GPUs)
More informationStanford University. NVIDIA Tesla M2090. NVIDIA GeForce GTX 690
Stanford University NVIDIA Tesla M2090 NVIDIA GeForce GTX 690 Moore s Law 2 Clock Speed 10000 Pentium 4 Prescott Core 2 Nehalem Sandy Bridge 1000 Pentium 4 Williamette Clock Speed (MHz) 100 80486 Pentium
More informationReal-time Graphics 9. GPGPU
Real-time Graphics 9. GPGPU GPGPU GPU (Graphics Processing Unit) Flexible and powerful processor Programmability, precision, power Parallel processing CPU Increasing number of cores Parallel processing
More informationCOMP 322: Fundamentals of Parallel Programming. Flynn s Taxonomy for Parallel Computers
COMP 322: Fundamentals of Parallel Programming Lecture 37: General-Purpose GPU (GPGPU) Computing Max Grossman, Vivek Sarkar Department of Computer Science, Rice University max.grossman@rice.edu, vsarkar@rice.edu
More informationReal-time Graphics 9. GPGPU
9. GPGPU GPGPU GPU (Graphics Processing Unit) Flexible and powerful processor Programmability, precision, power Parallel processing CPU Increasing number of cores Parallel processing GPGPU general-purpose
More informationModule 2: Introduction to CUDA C
ECE 8823A GPU Architectures Module 2: Introduction to CUDA C 1 Objective To understand the major elements of a CUDA program Introduce the basic constructs of the programming model Illustrate the preceding
More informationLecture 3: Introduction to CUDA
CSCI-GA.3033-004 Graphics Processing Units (GPUs): Architecture and Programming Lecture 3: Introduction to CUDA Some slides here are adopted from: NVIDIA teaching kit Mohamed Zahran (aka Z) mzahran@cs.nyu.edu
More informationCS 314 Principles of Programming Languages
CS 314 Principles of Programming Languages Zheng Zhang Fall 2016 Dec 14 GPU Programming Rutgers University Programming with CUDA Compute Unified Device Architecture (CUDA) Mapping and managing computations
More informationOptimizing OpenACC Codes. Peter Messmer, NVIDIA
Optimizing OpenACC Codes Peter Messmer, NVIDIA Outline OpenACC in a nutshell Tune an example application Data motion optimization Asynchronous execution Loop scheduling optimizations Interface OpenACC
More informationHPC Middle East. KFUPM HPC Workshop April Mohamed Mekias HPC Solutions Consultant. Introduction to CUDA programming
KFUPM HPC Workshop April 29-30 2015 Mohamed Mekias HPC Solutions Consultant Introduction to CUDA programming 1 Agenda GPU Architecture Overview Tools of the Trade Introduction to CUDA C Patterns of Parallel
More informationOpenACC (Open Accelerators - Introduced in 2012)
OpenACC (Open Accelerators - Introduced in 2012) Open, portable standard for parallel computing (Cray, CAPS, Nvidia and PGI); introduced in 2012; GNU has an incomplete implementation. Uses directives in
More informationECE/ME/EMA/CS 759 High Performance Computing for Engineering Applications
ECE/ME/EMA/CS 759 High Performance Computing for Engineering Applications CUDA Prefix Scan Shared Memory Issues CUDA Reduction October 23, 2015 Dan Negrut, 2015 ECE/ME/EMA/CS 759 UW-Madison Quote of the
More informationLecture 15: Introduction to GPU programming. Lecture 15: Introduction to GPU programming p. 1
Lecture 15: Introduction to GPU programming Lecture 15: Introduction to GPU programming p. 1 Overview Hardware features of GPGPU Principles of GPU programming A good reference: David B. Kirk and Wen-mei
More informationRegister file. A single large register file (ex. 16K registers) is partitioned among the threads of the dispatched blocks.
Sharing the resources of an SM Warp 0 Warp 1 Warp 47 Register file A single large register file (ex. 16K registers) is partitioned among the threads of the dispatched blocks Shared A single SRAM (ex. 16KB)
More informationParallel Numerical Algorithms
Parallel Numerical Algorithms http://sudalab.is.s.u-tokyo.ac.jp/~reiji/pna14/ [ 10 ] GPU and CUDA Parallel Numerical Algorithms / IST / UTokyo 1 PNA16 Lecture Plan General Topics 1. Architecture and Performance
More informationHands-on CUDA Optimization. CUDA Workshop
Hands-on CUDA Optimization CUDA Workshop Exercise Today we have a progressive exercise The exercise is broken into 5 steps If you get lost you can always catch up by grabbing the corresponding directory
More informationAlistair Hart, Roberto Ansaloni, Alan Gray, Kevin Stratford (EPCC) ( Cray Exascale Research Initiative Europe)
Alistair Hart, Roberto Ansaloni, Alan Gray, Kevin Stratford (EPCC) ( Cray Exascale Research Initiative Europe) UKGPUCC3 Goodenough College, London Wed. 14.Dec.11 ahart@cray.com Contents The Now: the new
More informationLearn CUDA in an Afternoon. Alan Gray EPCC The University of Edinburgh
Learn CUDA in an Afternoon Alan Gray EPCC The University of Edinburgh Overview Introduction to CUDA Practical Exercise 1: Getting started with CUDA GPU Optimisation Practical Exercise 2: Optimising a CUDA
More informationOptimization and porting of a numerical code for simulations in GRMHD on CPU/GPU clusters PRACE Winter School Stage
Optimization and porting of a numerical code for simulations in GRMHD on CPU/GPU clusters PRACE Winter School Stage INFN - Università di Parma November 6, 2012 Table of contents 1 Introduction 2 3 4 Let
More informationADVANCED ACCELERATED COMPUTING USING COMPILER DIRECTIVES. Jeff Larkin, NVIDIA
ADVANCED ACCELERATED COMPUTING USING COMPILER DIRECTIVES Jeff Larkin, NVIDIA OUTLINE Compiler Directives Review Asynchronous Execution OpenACC Interoperability OpenACC `routine` Advanced Data Directives
More informationGPGPU Offloading with OpenMP 4.5 In the IBM XL Compiler
GPGPU Offloading with OpenMP 4.5 In the IBM XL Compiler Taylor Lloyd Jose Nelson Amaral Ettore Tiotto University of Alberta University of Alberta IBM Canada 1 Why? 2 Supercomputer Power/Performance GPUs
More informationMassively Parallel Algorithms
Massively Parallel Algorithms Introduction to CUDA & Many Fundamental Concepts of Parallel Programming G. Zachmann University of Bremen, Germany cgvr.cs.uni-bremen.de Hybrid/Heterogeneous Computation/Architecture
More informationLecture 2: Introduction to CUDA C
CS/EE 217 GPU Architecture and Programming Lecture 2: Introduction to CUDA C David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2013 1 CUDA /OpenCL Execution Model Integrated host+device app C program Serial or
More informationScientific discovery, analysis and prediction made possible through high performance computing.
Scientific discovery, analysis and prediction made possible through high performance computing. An Introduction to GPGPU Programming Bob Torgerson Arctic Region Supercomputing Center November 21st, 2013
More informationOverview. Lecture 1: an introduction to CUDA. Hardware view. Hardware view. hardware view software view CUDA programming
Overview Lecture 1: an introduction to CUDA Mike Giles mike.giles@maths.ox.ac.uk hardware view software view Oxford University Mathematical Institute Oxford e-research Centre Lecture 1 p. 1 Lecture 1 p.
More informationParallel Computing. Lecture 19: CUDA - I
CSCI-UA.0480-003 Parallel Computing Lecture 19: CUDA - I Mohamed Zahran (aka Z) mzahran@cs.nyu.edu http://www.mzahran.com GPU w/ local DRAM (device) Behind CUDA CPU (host) Source: http://hothardware.com/reviews/intel-core-i5-and-i7-processors-and-p55-chipset/?page=4
More informationINTRODUCTION TO OPENACC. Analyzing and Parallelizing with OpenACC, Feb 22, 2017
INTRODUCTION TO OPENACC Analyzing and Parallelizing with OpenACC, Feb 22, 2017 Objective: Enable you to to accelerate your applications with OpenACC. 2 Today s Objectives Understand what OpenACC is and
More informationAn Introduction to OpenACC
An Introduction to OpenACC Alistair Hart Cray Exascale Research Initiative Europe 3 Timetable Day 1: Wednesday 29th August 2012 13:00 Welcome and overview 13:15 Session 1: An Introduction to OpenACC 13:15
More informationOPENACC DIRECTIVES FOR ACCELERATORS NVIDIA
OPENACC DIRECTIVES FOR ACCELERATORS NVIDIA Directives for Accelerators ABOUT OPENACC GPUs Reaching Broader Set of Developers 1,000,000 s 100,000 s Early Adopters Research Universities Supercomputing Centers
More informationAdapting Numerical Weather Prediction codes to heterogeneous architectures: porting the COSMO model to GPUs
Adapting Numerical Weather Prediction codes to heterogeneous architectures: porting the COSMO model to GPUs O. Fuhrer, T. Gysi, X. Lapillonne, C. Osuna, T. Dimanti, T. Schultess and the HP2C team Eidgenössisches
More informationLecture 5. Performance Programming with CUDA
Lecture 5 Performance Programming with CUDA Announcements 2011 Scott B. Baden / CSE 262 / Spring 2011 2 Today s lecture Matrix multiplication 2011 Scott B. Baden / CSE 262 / Spring 2011 3 Memory Hierarchy
More informationGraph Partitioning. Standard problem in parallelization, partitioning sparse matrix in nearly independent blocks or discretization grids in FEM.
Graph Partitioning Standard problem in parallelization, partitioning sparse matrix in nearly independent blocks or discretization grids in FEM. Partition given graph G=(V,E) in k subgraphs of nearly equal
More informationCUDA Fortran COMPILERS &TOOLS. Porting Guide
Porting Guide CUDA Fortran CUDA Fortran is the Fortran analog of the NVIDIA CUDA C language for programming GPUs. This guide includes examples of common language features used when porting Fortran applications
More informationLecture 8: GPU Programming. CSE599G1: Spring 2017
Lecture 8: GPU Programming CSE599G1: Spring 2017 Announcements Project proposal due on Thursday (4/28) 5pm. Assignment 2 will be out today, due in two weeks. Implement GPU kernels and use cublas library
More informationAn Introduction to OpenACC Part II
An Introduction to OpenACC Part II Wei Feinstein HPC User Services@LSU LONI Parallel Programming Workshop 2015 Louisiana State University 4th HPC Parallel Programming Workshop An Introduction to OpenACC-
More informationOpenMP 4.0/4.5. Mark Bull, EPCC
OpenMP 4.0/4.5 Mark Bull, EPCC OpenMP 4.0/4.5 Version 4.0 was released in July 2013 Now available in most production version compilers support for device offloading not in all compilers, and not for all
More informationUsing the Cray Programming Environment to Convert an all MPI code to a Hybrid-Multi-core Ready Application
Using the Cray Programming Environment to Convert an all MPI code to a Hybrid-Multi-core Ready Application John Levesque Cray s Supercomputing Center of Excellence The Target ORNL s Titan System Upgrade
More informationFrom Hello World to Exascale
From Hello World to Exascale Rob Farber Chief Scientist, BlackDog Endeavors, LLC Author, CUDA Application Design and Development Research consultant: ICHEC and others Doctor Dobb's Journal CUDA & OpenACC
More informationIntroduction to CUDA C/C++ Mark Ebersole, NVIDIA CUDA Educator
Introduction to CUDA C/C++ Mark Ebersole, NVIDIA CUDA Educator What is CUDA? Programming language? Compiler? Classic car? Beer? Coffee? CUDA Parallel Computing Platform www.nvidia.com/getcuda Programming
More informationIntroduction to CUDA Programming
Introduction to CUDA Programming Steve Lantz Cornell University Center for Advanced Computing October 30, 2013 Based on materials developed by CAC and TACC Outline Motivation for GPUs and CUDA Overview
More information