Size: px
Start display at page:

Download ""

Transcription

1

2

3

4

5

6

7

8 -npool -ndiag

9

10

11 Z/DGEMM

12 MPI_Alltoall

13

14 MPI_Isend MPI_Irecv

15

16

17

18

19 Wilkes-2 (Cambridge) NVIDIA DGX-1 Piz Daint (CSCS) Summit Dev (ORNL) Davide (CINECA) CPU PLX NIC GPU PCIe NVLink

20

21

22

23

24

25

26

27

28

29 QE-GPU CSCS QE CSCS QE Cineca 1 P P BW (360c) 1 KNL (60c) 10 KNL (640c) init_run 15.92s 7.50s 4.45s 21.61s 10.33s electrons s s s s s update_pot 1.37s 1.04s 10.42s 31.95s 7.94s forces 12.06s 3.03s 13.20s 60.91s 11.93s stress 74.28s 15.82s 75.69s s 38.55s cdiaghg 71.38s 6.89s 15.51s s 76.15s PWSCF s s s s s Fermi energy ev ev ev ev ev Total energy Ry Ry Ry npool Ry Ry Total force Total stress Pressure BW/KNL results from

30 QE-GPU CSCS QE-GPU Sirius GPU CSCS 1 P P100 1 V100 1 P P init_run 15.92s 7.50s 11.06s electrons s s s s s update_pot 1.37s 1.04s 0.59s forces 12.06s 3.03s 8.58s 28.86s 3.85s stress 74.28s 15.82s 52.58s 94.95s 12.99s cdiaghg 71.38s 6.89s 84.10s s 76.15s PWSCF s s s s s Fermi energy ev ev ev ev ev Total energy Ry Ry Ry npool Ry Ry Total force Total stress Pressure BW/KNL/SIRIUS results from

31

32

33

34 $ pgf90 nvtx.cuf -L/usr/local/cuda/lib -lnvToolsExt
$ nvprof -o nvprof.output ./a.out
NVPROF is profiling process 10653, command: ./a.out
Generated result file: /Users/nvprof.output

program main
  use nvtx
  character(len=4) :: itcount
  ! First range with standard color
  call nvtxstartrange("first label")
  do n=1,14
    ! Create custom label for each marker
    write(itcount,'(i4)') n
    ! Range with custom color
    call nvtxstartrange("label "//itcount,n)
    ! Add sleep to make markers big
    call sleep(1)
    call nvtxendrange
  end do
  call nvtxendrange
end program main

35

36 gdb cuda-gdb
#!/bin/bash
QE_DIR=/home/cuda/qe-gpu
INFILE=$1
export OMP_NUM_THREADS=6
export MKL_NUM_THREADS=6
export NO_STOP_MESSAGE=yes
#export CUDA_VISIBLE_DEVICES=1
today=`date +'%Y_%m_%d_%H_%M_%S'`
#Normal run
mpirun -np 1 --bind-to none ${QE_DIR}/PW/src/pw.x -input ${INFILE}
# Run with gdb
mpirun -np 1 --bind-to none gdb -ex=r --args ${QE_DIR}/bin/pw.x -input ${INFILE}
# Run with cuda-gdb
mpirun -np 1 --bind-to none cuda-gdb --args ${QE_DIR}/bin/pw.x -input ${INFILE}

37 1. Generate a backtrace (compile with -g if using -O2 or higher)
$ export PGI_TERM='trace'
2. Run program
$ ./a.out
0: copyin Memcpy (dev=0x(nil), host=0x0x7f , size= ) FAILED: 11(invalid argument)
/opt/pgi/linux86-64/17.10/lib/libcudafor.so(pgf90_dev_copyin+0x53) [0x7f74110a0012]
./a.out() [0x403b1a]
./a.out() [0x4036d4]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0) [0x7f74074d7830]
./a.out() [0x403589]
3. Use addr2line to find out the line in which the error occurs.
$ addr2line -e a.out 0x403b1a
/home/gruetsch/./unalloc.cuf:30

38

39 subroutine addusforce_g() DO is = 1, nspin_mag nb = 0 DO na = 1, nat IF (ityp (na) == nt) THEN nb = nb + 1 DO ig = 1, ngm cfac = vg(ig, is) * CONJG(eigts1(mill(1,ig),na) * & eigts2(mill(2,ig),na) * & eigts3(mill(3,ig),na) ) aux1(ig, nb, 1) = g(1, ig) * cfac aux1(ig, nb, 2) = g(2, ig) * cfac aux1(ig, nb, 3) = g(3, ig) * cfac ENDDO ENDIF ENDDO DO ipol = 1, 3 CALL DGEMM( 'C', 'N', nij, nab, 2*ngm, fact, qgm, & 2*ngm, aux1(1,1,ipol), 2*ngm, 0, & ddeeq(1,1,ipol,is), nij ) ENDDO ENDDO

40 subroutine addusforce_g() DO is = 1, nspin_mag nb = 0 DO na = 1, nat IF (ityp (na) == nt) THEN nb = nb + 1 DO ig = 1, ngm cfac = vg(ig, is) * CONJG(eigts1(mill(1,ig),na) * & eigts2(mill(2,ig),na) * & eigts3(mill(3,ig),na) ) aux1(ig, nb, 1) = g(1, ig) * cfac aux1(ig, nb, 2) = g(2, ig) * cfac aux1(ig, nb, 3) = g(3, ig) * cfac ENDDO ENDIF ENDDO DO ipol = 1, 3 CALL DGEMM( 'C', 'N', nij, nab, 2*ngm, fact, qgm, & 2*ngm, aux1(1,1,ipol), 2*ngm, 0, & ddeeq(1,1,ipol,is), nij ) ENDDO ENDDO

41 subroutine addusforce_g() DO is = 1, nspin_mag nb = 0 DO na = 1, nat IF (ityp (na) == nt) THEN nb = nb + 1!$cuf kernel do DO ig = 1, ngm cfac= vg_d(ig, is) * CONJG(eigts1_d(mill_d(1,ig),na) * & eigts2_d(mill_d(2,ig),na) * & eigts3_d(mill_d(3,ig),na) ) aux1_d(ig, nb, 1) = g_d(1, ig) * cfac aux1_d(ig, nb, 2) = g_d(2, ig) * cfac aux1_d(ig, nb, 3) = g_d(3, ig) * cfac ENDDO ENDIF ENDDO DO ipol = 1, 3 CALL DGEMM( 'C', 'N', nij, nab, 2*ngm, fact, qgm, & 2*ngm, aux1(1,1,ipol), 2*ngm, 0, & ddeeq(1,1,ipol,is), nij ) ENDDO ENDDO

42 subroutine addusforce_g()
  USE gvect, ONLY: g_d
  DO is = 1, nspin_mag
    nb = 0
    DO na = 1, nat
      IF (ityp (na) == nt) THEN
        nb = nb + 1
        !$cuf kernel do
        DO ig = 1, ngm
          cfac = vg_d(ig, is) * CONJG(eigts1_d(mill_d(1,ig),na) * &
                                      eigts2_d(mill_d(2,ig),na) * &
                                      eigts3_d(mill_d(3,ig),na) )
          aux1_d(ig, nb, 1) = g_d(1, ig) * cfac
          aux1_d(ig, nb, 2) = g_d(2, ig) * cfac
          aux1_d(ig, nb, 3) = g_d(3, ig) * cfac
        ENDDO
      ENDIF
    ENDDO
  ENDDO

MODULE gvect
  REAL(DP), ALLOCATABLE :: g(:,:)
  REAL(DP), ALLOCATABLE, DEVICE :: g_d(:,:)
END MODULE gvect

Option 1:
  ALLOCATE(g(3, ngm)); ALLOCATE(g_d(3, ngm))
  g = 1.d0
  g_d = g

Option 2 using F2003 source allocation:
  ALLOCATE(g(3, ngm)); g = 1.d0
  ALLOCATE(g_d, source = g)

43 subroutine addusforce_g() DO is = 1, nspin_mag nb = 0 DO na = 1, nat IF (ityp (na) == nt) THEN nb = nb + 1!$cuf kernel do DO ig = 1, ngm cfac= vg_d(ig, is) * CONJG(eigts1_d(mill_d(1,ig),na) * & eigts2_d(mill_d(2,ig),na) * & eigts3_d(mill_d(3,ig),na) ) aux1_d(ig, nb, 1) = g_d(1, ig) * cfac aux1_d(ig, nb, 2) = g_d(2, ig) * cfac aux1_d(ig, nb, 3) = g_d(3, ig) * cfac ENDDO ENDIF ENDDO DO ipol = 1, 3 CALL DGEMM( 'C', 'N', nij, nab, 2*ngm, fact, qgm, & 2*ngm, aux1(1,1,ipol), 2*ngm, 0, & ddeeq(1,1,ipol,is), nij ) ENDDO ENDDO

44 subroutine force_corr (forcescc) USE wavefunctions_module, ONLY : psic USE gvect, ONLY : nl, g, gl, igtongl do na = 1, nat if (nt.eq.ityp (na) ) then tau1 = tau(1,na); tau2 = tau(2,na); tau3 = tau(3,na) fscc1 = 0.d0; fscc2 = 0.d0; fscc3 = 0.d0 do ig = gstart, ngm arg = (g (1, ig) * tau1 + g (2, ig) * tau2 + & g (3, ig) * tau3 ) * tpi tmpf = fact * rhocgnt (igtongl(ig) ) * & tpiba * DBLE(DCMPLX(sin(arg), cos(arg)) * & CONJG(psic(nl(ig)))) fscc1 = fscc1 + tmpf * g(1,ig) fscc2 = fscc2 + tmpf * g(2,ig) fscc3 = fscc3 + tmpf * g(3,ig) enddo forcescc(1,na) = forcescc(1,na) + fscc1 forcescc(2,na) = forcescc(2,na) + fscc2 forcescc(3,na) = forcescc(3,na) + fscc3 endif enddo

45 subroutine force_corr (forcescc) USE wavefunctions_module, ONLY : psic=>psic_d USE gvect, ONLY : nl=>nl_d, g=>g_d, gl=>gl_d, igtongl=>igtongl_d do na = 1, nat if (nt.eq.ityp (na) ) then tau1 = tau(1,na); tau2 = tau(2,na); tau3 = tau(3,na) fscc1 = 0.d0; fscc2 = 0.d0; fscc3 = 0.d0!$cuf kernel do do ig = gstart, ngm arg = (g (1, ig) * tau1 + g (2, ig) * tau2 + & g (3, ig) * tau3 ) * tpi tmpf = fact * rhocgnt (igtongl(ig) ) * & tpiba * DBLE(DCMPLX(sin(arg), cos(arg)) * & CONJG(psic(nl(ig)))) fscc1 = fscc1 + tmpf * g(1,ig) fscc2 = fscc2 + tmpf * g(2,ig) fscc3 = fscc3 + tmpf * g(3,ig) enddo forcescc(1,na) = forcescc(1,na) + fscc1 forcescc(2,na) = forcescc(2,na) + fscc2 forcescc(3,na) = forcescc(3,na) + fscc3 endif enddo

46 subroutine force_corr (forcescc) USE wavefunctions_module, ONLY : psic=>psic_d USE gvect, ONLY : nl=>nl_d, g=>g_d, gl=>gl_d, igtongl=>igtongl_d do na = 1, nat if (nt.eq.ityp (na) ) then tau1 = tau(1,na); tau2 = tau(2,na); tau3 = tau(3,na) fscc1 = 0.d0; fscc2 = 0.d0; fscc3 = 0.d0 USE_CUDA!$cuf kernel do do ig = gstart, ngm arg = (g (1, ig) * tau1 + g (2, ig) * tau2 + & g (3, ig) * tau3 ) * tpi tmpf = fact * rhocgnt (igtongl(ig) ) * & tpiba * DBLE(DCMPLX(sin(arg), cos(arg)) * & CONJG(psic(nl(ig)))) fscc1 = fscc1 + tmpf * g(1,ig) fscc2 = fscc2 + tmpf * g(2,ig) fscc3 = fscc3 + tmpf * g(3,ig) enddo forcescc(1,na) = forcescc(1,na) + fscc1 forcescc(2,na) = forcescc(2,na) + fscc2 forcescc(3,na) = forcescc(3,na) + fscc3 endif enddo

47 SUBROUTINE fft_scatter_gpu_batch_a() npp = dfft%npp(me); nnp = dfft%nnp tscale = 1.0_DP / (dfft%nr1 * dfft%nr2) DO iter = 1, dfft%nproc IF(IAND(nprocp, nprocp-1) == 0) THEN dest = IEOR( me-1, iter-1 ) ELSE dest = MOD(me-1 + (iter-1), nprocp) ENDIF ip = dest + 1 ioff = dfft%iss(ip) nswip = dfft%nsp(ip)!$cuf kernel do(2) <<< *, *, 0, dfft%a2a_comp >>> DO cuf_j = 1, npp DO cuf_i = 1, nswip mc = p_ismap_d(cuf_i + ioff) it = (ip - 1) * sendsiz + (cuf_i - 1)*nppx f_aux2_d(cuf_j + it) = f_aux_d(mc + (cuf_j - 1) * nnp) * & tscale ENDDO ENDDO ENDDO

48

49 use cublas
integer :: m, n, k
real(8) :: alpha, beta
real(8) :: a(m,k), b(k,n), c(m,n)
real(8),device :: a_d(m,k), b_d(k,n), c_d(m,n)
! DGEMM using linked CPU library
call DGEMM( 'N', 'N', m, n, k, alpha, a, m, b, k, &
            beta, c, m)
! DGEMM using CUBLAS
call DGEMM( 'N', 'N', m, n, k, alpha, a_d, m, b_d, k, &
            beta, c_d, m)

50 subroutine addusforce_g() DO is = 1, nspin_mag nb = 0 DO na = 1, nat IF (ityp (na) == nt) THEN nb = nb + 1!$cuf kernel do DO ig = 1, ngm cfac= vg_d(ig, is) * CONJG(eigts1_d(mill_d(1,ig),na) * & eigts2_d(mill_d(2,ig),na) * & eigts3_d(mill_d(3,ig),na) ) aux1_d(ig, nb, 1) = g_d(1, ig) * cfac aux1_d(ig, nb, 2) = g_d(2, ig) * cfac aux1_d(ig, nb, 3) = g_d(3, ig) * cfac ENDDO ENDIF ENDDO DO ipol = 1, 3 CALL DGEMM( 'C', 'N', nij, nab, 2*ngm, fact, qgm, & 2*ngm, aux1(1,1,ipol), 2*ngm, 0, & ddeeq(1,1,ipol,is), nij ) ENDDO ENDDO

51 subroutine addusforce_g() use cublas DO is = 1, nspin_mag nb = 0 DO na = 1, nat IF (ityp (na) == nt) THEN nb = nb + 1!$cuf kernel do DO ig = 1, ngm cfac= vg_d(ig, is) * CONJG(eigts1_d(mill_d(1,ig),na) * & eigts2_d(mill_d(2,ig),na) * & eigts3_d(mill_d(3,ig),na) ) aux1_d(ig, nb, 1) = g_d(1, ig) * cfac aux1_d(ig, nb, 2) = g_d(2, ig) * cfac aux1_d(ig, nb, 3) = g_d(3, ig) * cfac ENDDO ENDIF ENDDO DO ipol = 1, 3 CALL DGEMM( 'C', 'N', nij, nab, 2*ngm, fact, qgm_d, & 2*ngm, aux1_d(1,1,ipol), 2*ngm, 0, & ddeeq_d(1,1,ipol,is), nij ) ENDDO ENDDO

52 SUBROUTINE cft_1z_cpu() IF (isign < 0) THEN CALL FFT_Z_STICK(fw_planz( ip), c(1), ldz, nsl) tscale = 1.0_DP / nz cout( 1 : ldz * nsl ) = c( 1 : ldz * nsl ) * tscale ELSE IF (isign > 0) THEN CALL FFT_Z_STICK(bw_planz( ip), c(1), ldz, nsl) cout( 1 : ldz * nsl ) = c( 1 : ldz * nsl ) END IF

53 SUBROUTINE cft_1z_gpu() USE cufft IF (isign < 0) THEN istat = cufftexecz2z(cufft_planz(ip), c(1), c(1), & CUFFT_FORWARD) tscale = 1.0_DP / nz!$cuf kernel do(1) <<<*,*,0,stream>>> DO i = 1, ldz * nsl cout(i) = c(i) * tscale END DO ELSE IF (isign > 0) THEN istat = cufftexecz2z(cufft_planz(ip), c(1), cout(1), & CUFFT_INVERSE ) END IF

54 interface cublaszgemm3m
#if (GPU_ARCH == 35)
  ! Works for Kepler
  integer(c_int) function cublaszgemm3m(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) &
      bind(c, name='cublasZgemm_v2')
#else
  ! Works for Pascal, Volta, and beyond
  integer(c_int) function cublaszgemm3m(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) &
      bind(c, name='cublasZgemm3m')
#endif
    use iso_c_binding
    use cudafor
    use cublas_v2
    type(cublashandle), value :: handle
    integer(c_int), value :: transa, transb, m, n, k
    integer(c_int), value :: lda, ldb, ldc
    complex(8) :: alpha, beta
    complex(8), device :: A(*), B(*), C(*)
  end function cublaszgemm3m
end interface cublaszgemm3m

55

56 subroutine atomic_rho () do igl = gstart, ngl gx = sqrt(gl(igl)) * tpiba do ir = 1, msh if (rgrid(nt)%r(ir) < 1.0d-8) then aux(ir) = upf(nt)%rho_at(ir) else aux(ir) = upf(nt)%rho_at(ir) * sin(gx*rgrid(nt)%r(ir)) / & (rgrid(nt)%r(ir)*gx) endif enddo call simpson (msh, aux, rgrid(nt)%rab, rhocgnt(igl)) enddo

57 simpson subroutine atomic_rho () do igl = gstart, ngl gx = sqrt(gl(igl)) * tpiba do ir = 1, msh rhocgnt(igl) if (rgrid(nt)%r(ir) < 1.0d-8) then aux(ir) = upf(nt)%rho_at(ir) else aux(ir) = upf(nt)%rho_at(ir) * sin(gx*rgrid(nt)%r(ir)) / & (rgrid(nt)%r(ir)*gx) endif enddo call simpson (msh, aux, rgrid(nt)%rab, rhocgnt(igl)) enddo

58 simpson subroutine atomic_rho () do igl = gstart, ngl gx = sqrt(gl(igl)) * tpiba do ir = 1, msh rhocgnt(igl) if (rgrid(nt)%r(ir) < 1.0d-8) then aux(ir) = upf(nt)%rho_at(ir) else aux(ir) = upf(nt)%rho_at(ir) * sin(gx*rgrid(nt)%r(ir)) / & (rgrid(nt)%r(ir)*gx) endif enddo rsum = 0.d0 do i = 2, msh-1, 2 rsum = rsum + aux(i-1) * rgrid(nt)%rab(i-1) + 4.0d0 * aux(i) * rgrid(nt)%rab(i) + aux(i+1) * rgrid(nt)%rab(i+1) end do rhocgnt(igl) = rsum / 3.d0 enddo

59 subroutine atomic_rho () do igl = gstart, ngl gx = sqrt(gl(igl)) * tpiba do ir = 1, msh if (rgrid(nt)%r(ir) < 1.0d-8) then aux(ir) = upf(nt)%rho_at(ir) else aux(ir) = upf(nt)%rho_at(ir) * sin(gx*rgrid(nt)%r(ir)) / & (rgrid(nt)%r(ir)*gx) endif enddo rsum = 0.d0 do i = 2, msh-1, 2 rsum = rsum + aux(i-1) * rgrid(nt)%rab(i-1) + 4.0d0 * aux(i) * rgrid(nt)%rab(i) + aux(i+1) * rgrid(nt)%rab(i+1) end do rhocgnt(igl) = rsum / 3.d0 enddo

60 subroutine atomic_rho () do igl = gstart, ngl gx = sqrt(gl(igl)) * tpiba!$cuf kernel do do ir = 1, msh gx = sqrt(gl(igl)) * tpiba if (rgrid(nt)%r(ir) < 1.0d-8) then aux(ir) = upf(nt)%rho_at(ir) else aux(ir) = upf(nt)%rho_at(ir) * sin(gx*rgrid(nt)%r(ir)) / & (rgrid(nt)%r(ir)*gx) endif enddo rsum = 0.d0!$cuf kernel do do i = 2, msh-1, 2 rsum = rsum + aux(i-1) * rgrid(nt)%rab(i-1) + 4.0d0 * aux(i) * rgrid(nt)%rab(i) + aux(i+1) * rgrid(nt)%rab(i+1) end do rhocgnt(igl) = rsum / 3.d0 enddo

61 subroutine atomic_rho () do igl = gstart, ngl!$cuf kernel do do ir = 1, msh gx = sqrt(gl(igl)) * tpiba if (rgrid(nt)%r(ir) < 1.0d-8) then aux(ir) = upf(nt)%rho_at(ir) else aux(ir) = upf(nt)%rho_at(ir) * sin(gx*rgrid(nt)%r(ir)) / & (rgrid(nt)%r(ir)*gx) endif enddo rsum = 0.d0!$cuf kernel do do i = 2, msh-1, 2 rsum = rsum + aux(i-1) * rgrid(nt)%rab(i-1) + 4.0d0 * aux(i) * rgrid(nt)%rab(i) + aux(i+1) * rgrid(nt)%rab(i+1) end do rhocgnt(igl) = rsum / 3.d0 enddo

62 subroutine atomic_rho () do igl = gstart, ngl ngl msh!$cuf kernel do do ir = 1, msh gx = sqrt(gl(igl)) * tpiba if (rgrid(nt)%r(ir) < 1.0d-8) then aux(ir) = upf(nt)%rho_at(ir) else aux(ir) = upf(nt)%rho_at(ir) * sin(gx*rgrid(nt)%r(ir)) / & (rgrid(nt)%r(ir)*gx) endif enddo rsum = 0.d0!$cuf kernel do do i = 2, msh-1, 2 rsum = rsum + aux(i-1) * rgrid(nt)%rab(i-1) + 4.0d0 * aux(i) * rgrid(nt)%rab(i) + aux(i+1) * rgrid(nt)%rab(i+1) end do rhocgnt(igl) = rsum / 3.d0 enddo

63 attributes(global) &
subroutine compute_rhocgnt_gpu()
  tx = threadidx%x
  ty = threadidx%y
  igl = (blockidx%x - 1) * blockdim%y + ty
  mysum = 0.d0
  do ir = tx, mesh, blockdim%x
    val = ...
    mysum = mysum + val
  end do
  ! Reduce by warp
  mysum = mysum + shfl_down(mysum,1)
  mysum = mysum + shfl_down(mysum,2)
  mysum = mysum + shfl_down(mysum,4)
  mysum = mysum + shfl_down(mysum,8)
  mysum = mysum + shfl_down(mysum,16)
  if (tx == 1) then
    rhocgnt(igl) = mysum / 3.d0
  endif
end subroutine compute_rhocgnt_gpu

64 DO ih =1,nh DO jh = ih,nh call sub(ngm,ih,jh,..) END DO END DO ngm jh ih In sub: DO ig = 1,ngm END DO DO ih = 1,nh DO jh = ih, nh block = dim3(256,1,1) grid = dim3(ceiling(real(ngm)/block%x),1,1) call sub_g<<<grid, block>>>(ngm,ih,jh,) END DO END DO In sub_g: ig = threadidx%x + blockdim%x * (blockidx%x-1) IF (ig <= ngm) then END IF

65 DO ih =1,nh DO jh = ih,nh call sub(ngm,ih,jh,..) END DO END DO ngm jh In sub: DO ig = 1,ngm END DO block=dim3(256,1,1) grid = dim3(ceiling(real(ngm)/block%x), nh, nh) call sub_g<<<grid,block>>>(ngm,) ih In sub_g: ih = blockidx%y; jh = blockidx%z IF (ih > jh) RETURN ig = threadidx%x + blockdim%x * (blockidx%x-1) IF ( ig <= ngm) THEN END IF

66 DO ih =1,nh DO jh = ih,nh call sub(ngm,ih,jh,..) END DO END DO ngm jh In sub: DO ig = 1,ngm END DO block=dim3(256,1,1) grid = dim3(nblocks, nh, nh) call sub_g<<<grid,block>>>(ngm,) ih In sub_g: ih = blockidx%y; jh = blockidx%z IF (ih > jh) RETURN tx = threadidx%x+ blockdim%x * (blockidx%x-1) DO ig = tx, ngm, blockdim%x*griddim%x... END DO

67 cpu_gpu_interface

cpu_gpu_interface.f90
MODULE cpu_gpu_interface
  INTERFACE add_vuspsi
    SUBROUTINE add_vuspsi_cpu( lda, n, m, hpsi )
      INTEGER :: lda, n, m
      COMPLEX(DP) :: hpsi(:,:)
    END SUBROUTINE add_vuspsi_cpu
#ifdef USE_CUDA
    SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi )
      INTEGER :: lda, n, m
      COMPLEX(DP), DEVICE :: hpsi(:,:)
    END SUBROUTINE add_vuspsi_gpu
#endif
  END INTERFACE
END MODULE cpu_gpu_interface

add_vuspsi.f90
#ifdef USE_GPU
#define MY_ROUTINE(x) x##_gpu
#else
#define MY_ROUTINE(x) x##_cpu
#endif
SUBROUTINE MY_ROUTINE(add_vuspsi)( lda, n, m, hpsi )

DO is = 1, nspin_mag
  nb = 0
  DO na = 1, nat
    IF (ityp (na) == nt) THEN
      nb = nb + 1
      !$cuf kernel do
      DO ig = 1, ngm
        cfac= vg_d(ig, is) * CONJG(eigts1_d(mill_d(1,ig),na) * &
                                   eigts2_d(mill_d(2,ig),na) * &
                                   eigts3_d(mill_d(3,ig),na) )
        aux1_d(ig, nb, 1) = g_d(1, ig) * cfac
        aux1_d(ig, nb, 2) = g_d(2, ig) * cfac
        aux1_d(ig, nb, 3) = g_d(3, ig) * cfac
      ENDDO
    ENDIF
  ENDDO
ENDDO

68

69

Quantum ESPRESSO on GPU accelerated systems

Quantum ESPRESSO on GPU accelerated systems Quantum ESPRESSO on GPU accelerated systems Massimiliano Fatica, Everett Phillips, Josh Romero - NVIDIA Filippo Spiga - University of Cambridge/ARM (UK) MaX International Conference, Trieste, Italy, January

More information

SC13 GPU Technology Theater. Accessing New CUDA Features from CUDA Fortran Brent Leback, Compiler Manager, PGI

SC13 GPU Technology Theater. Accessing New CUDA Features from CUDA Fortran Brent Leback, Compiler Manager, PGI SC13 GPU Technology Theater Accessing New CUDA Features from CUDA Fortran Brent Leback, Compiler Manager, PGI The Case for Fortran Clear, straight-forward syntax Successful legacy in the scientific community

More information

CUDA Fortran COMPILERS &TOOLS. Porting Guide

CUDA Fortran COMPILERS &TOOLS. Porting Guide Porting Guide CUDA Fortran CUDA Fortran is the Fortran analog of the NVIDIA CUDA C language for programming GPUs. This guide includes examples of common language features used when porting Fortran applications

More information

Porting Guide. CUDA Fortran COMPILERS &TOOLS

Porting Guide. CUDA Fortran COMPILERS &TOOLS Porting Guide CUDA Fortran COMPILERS &TOOLS 1 Simple Increment Code Host CPU and its memory The cudafor module includes CUDA Fortran definitions and interfaces to the runtime API The device variable attribute

More information

CUDA Fortran Brent Leback The Portland Group

CUDA Fortran Brent Leback The Portland Group CUDA Fortran 2013 Brent Leback The Portland Group brent.leback@pgroup.com Why Fortran? Rich legacy in the scientific community Semantics easier to vectorize/parallelize Array descriptors Modules Fortran

More information

Porting Scientific Research Codes to GPUs with CUDA Fortran: Incompressible Fluid Dynamics using the Immersed Boundary Method

Porting Scientific Research Codes to GPUs with CUDA Fortran: Incompressible Fluid Dynamics using the Immersed Boundary Method Porting Scientific Research Codes to GPUs with CUDA Fortran: Incompressible Fluid Dynamics using the Immersed Boundary Method Josh Romero, Massimiliano Fatica - NVIDIA Vamsi Spandan, Roberto Verzicco -

More information

CUDA 5 Features in PGI CUDA Fortran 2013

CUDA 5 Features in PGI CUDA Fortran 2013 第 1 頁, 共 7 頁 Technical News from The Portland Group PGI Home Page March 2013 CUDA 5 Features in PGI CUDA Fortran 2013 by Brent Leback PGI Engineering Manager The 2013 release of PGI CUDA Fortran introduces

More information

Introduction to Parallel Computing with CUDA. Oswald Haan

Introduction to Parallel Computing with CUDA. Oswald Haan Introduction to Parallel Computing with CUDA Oswald Haan ohaan@gwdg.de Schedule Introduction to Parallel Computing with CUDA Using CUDA CUDA Application Examples Using Multiple GPUs CUDA Application Libraries

More information

Register file. A single large register file (ex. 16K registers) is partitioned among the threads of the dispatched blocks.

Register file. A single large register file (ex. 16K registers) is partitioned among the threads of the dispatched blocks. Sharing the resources of an SM Warp 0 Warp 1 Warp 47 Register file A single large register file (ex. 16K registers) is partitioned among the threads of the dispatched blocks Shared A single SRAM (ex. 16KB)

More information

GPU Programming. Alan Gray, James Perry EPCC The University of Edinburgh

GPU Programming. Alan Gray, James Perry EPCC The University of Edinburgh GPU Programming EPCC The University of Edinburgh Contents NVIDIA CUDA C Proprietary interface to NVIDIA architecture CUDA Fortran Provided by PGI OpenCL Cross platform API 2 NVIDIA CUDA CUDA allows NVIDIA

More information

CUDA Lecture 2. Manfred Liebmann. Technische Universität München Chair of Optimal Control Center for Mathematical Sciences, M17

CUDA Lecture 2. Manfred Liebmann. Technische Universität München Chair of Optimal Control Center for Mathematical Sciences, M17 CUDA Lecture 2 Manfred Liebmann Technische Universität München Chair of Optimal Control Center for Mathematical Sciences, M17 manfred.liebmann@tum.de December 15, 2015 CUDA Programming Fundamentals CUDA

More information

Practical Introduction to CUDA and GPU

Practical Introduction to CUDA and GPU Practical Introduction to CUDA and GPU Charlie Tang Centre for Theoretical Neuroscience October 9, 2009 Overview CUDA - stands for Compute Unified Device Architecture Introduced Nov. 2006, a parallel computing

More information

Module 3: CUDA Execution Model -I. Objective

Module 3: CUDA Execution Model -I. Objective ECE 8823A GPU Architectures Module 3: CUDA Execution Model -I 1 Objective A more detailed look at kernel execution Data to thread assignment To understand the organization and scheduling of threads Resource

More information

Lecture 3: Introduction to CUDA

Lecture 3: Introduction to CUDA CSCI-GA.3033-004 Graphics Processing Units (GPUs): Architecture and Programming Lecture 3: Introduction to CUDA Some slides here are adopted from: NVIDIA teaching kit Mohamed Zahran (aka Z) mzahran@cs.nyu.edu

More information

GPU Programming. Lecture 2: CUDA C Basics. Miaoqing Huang University of Arkansas 1 / 34

GPU Programming. Lecture 2: CUDA C Basics. Miaoqing Huang University of Arkansas 1 / 34 1 / 34 GPU Programming Lecture 2: CUDA C Basics Miaoqing Huang University of Arkansas 2 / 34 Outline Evolvements of NVIDIA GPU CUDA Basic Detailed Steps Device Memories and Data Transfer Kernel Functions

More information

Overview. Lecture 1: an introduction to CUDA. Hardware view. Hardware view. hardware view software view CUDA programming

Overview. Lecture 1: an introduction to CUDA. Hardware view. Hardware view. hardware view software view CUDA programming Overview Lecture 1: an introduction to CUDA Mike Giles mike.giles@maths.ox.ac.uk hardware view software view Oxford University Mathematical Institute Oxford e-research Centre Lecture 1 p. 1 Lecture 1 p.

More information

CME 213 S PRING Eric Darve

CME 213 S PRING Eric Darve CME 213 S PRING 2017 Eric Darve Review Secret behind GPU performance: simple cores but a large number of them; even more threads can exist live on the hardware (10k 20k threads live). Important performance

More information

Lab 1 Part 1: Introduction to CUDA

Lab 1 Part 1: Introduction to CUDA Lab 1 Part 1: Introduction to CUDA Code tarball: lab1.tgz In this hands-on lab, you will learn to use CUDA to program a GPU. The lab can be conducted on the SSSU Fermi Blade (M2050) or NCSA Forge using

More information

ACCELERATING HPC APPLICATIONS ON NVIDIA GPUS WITH OPENACC

ACCELERATING HPC APPLICATIONS ON NVIDIA GPUS WITH OPENACC ACCELERATING HPC APPLICATIONS ON NVIDIA GPUS WITH OPENACC Doug Miles, PGI Compilers & Tools, NVIDIA High Performance Computing Advisory Council February 21, 2018 PGI THE NVIDIA HPC SDK Fortran, C & C++

More information

CUDA Workshop. High Performance GPU computing EXEBIT Karthikeyan

CUDA Workshop. High Performance GPU computing EXEBIT Karthikeyan CUDA Workshop High Performance GPU computing EXEBIT- 2014 Karthikeyan CPU vs GPU CPU Very fast, serial, Low Latency GPU Slow, massively parallel, High Throughput Play Demonstration Compute Unified Device

More information

Sparse Linear Algebra in CUDA

Sparse Linear Algebra in CUDA Sparse Linear Algebra in CUDA HPC - Algorithms and Applications Alexander Pöppl Technical University of Munich Chair of Scientific Computing November 22 nd 2017 Table of Contents Homework - Worksheet 2

More information

GPU Computing Master Class. Development Tools

GPU Computing Master Class. Development Tools GPU Computing Master Class Development Tools Generic CUDA debugger goals Support all standard debuggers across all OS Linux GDB, TotalView and DDD Windows Visual studio Mac - XCode Support CUDA runtime

More information

Computational Fluid Dynamics (CFD) using Graphics Processing Units

Computational Fluid Dynamics (CFD) using Graphics Processing Units Computational Fluid Dynamics (CFD) using Graphics Processing Units Aaron F. Shinn Mechanical Science and Engineering Dept., UIUC Accelerators for Science and Engineering Applications: GPUs and Multicores

More information

PROFILER OPENACC TUTORIAL. Version 2018

PROFILER OPENACC TUTORIAL. Version 2018 PROFILER OPENACC TUTORIAL Version 2018 TABLE OF CONTENTS Chapter Chapter Chapter Chapter Chapter 1. 2. 3. 4. 5. Tutorial Setup... 1 Profiling the application... 2 Adding OpenACC directives...4 Improving

More information

Scientific Computing with GPUs Autotuning GEMMs Fermi GPUs

Scientific Computing with GPUs Autotuning GEMMs Fermi GPUs Parallel Processing and Applied Mathematics September 11-14, 2011 Toruń, Poland Scientific Computing with GPUs Autotuning GEMMs Fermi GPUs Innovative Computing Laboratory Electrical Engineering and Computer

More information

CUDA programming model. N. Cardoso & P. Bicudo. Física Computacional (FC5)

CUDA programming model. N. Cardoso & P. Bicudo. Física Computacional (FC5) CUDA programming model N. Cardoso & P. Bicudo Física Computacional (FC5) N. Cardoso & P. Bicudo CUDA programming model 1/23 Outline 1 CUDA qualifiers 2 CUDA Kernel Thread hierarchy Kernel, configuration

More information

PGPROF OpenACC Tutorial

PGPROF OpenACC Tutorial PGPROF OpenACC Tutorial Version 2017 PGI Compilers and Tools TABLE OF CONTENTS Chapter 1. Tutorial Setup...1 Chapter 2. Profiling the application... 2 Chapter 3. Adding OpenACC directives... 4 Chapter

More information

n N c CIni.o ewsrg.au

n N c CIni.o ewsrg.au @NCInews NCI and Raijin National Computational Infrastructure 2 Our Partners General purpose, highly parallel processors High FLOPs/watt and FLOPs/$ Unit of execution Kernel Separate memory subsystem GPGPU

More information

Module Memory and Data Locality

Module Memory and Data Locality GPU Teaching Kit Accelerated Computing Module 4.4 - Memory and Data Locality Tiled Matrix Multiplication Kernel Objective To learn to write a tiled matrix-multiplication kernel Loading and using tiles

More information

ECE 408 / CS 483 Final Exam, Fall 2014

ECE 408 / CS 483 Final Exam, Fall 2014 ECE 408 / CS 483 Final Exam, Fall 2014 Thursday 18 December 2014 8:00 to 11:00 Central Standard Time You may use any notes, books, papers, or other reference materials. In the interest of fair access across

More information

Lecture 1: an introduction to CUDA

Lecture 1: an introduction to CUDA Lecture 1: an introduction to CUDA Mike Giles mike.giles@maths.ox.ac.uk Oxford University Mathematical Institute Oxford e-research Centre Lecture 1 p. 1 Overview hardware view software view CUDA programming

More information

GPU & High Performance Computing (by NVIDIA) CUDA. Compute Unified Device Architecture Florian Schornbaum

GPU & High Performance Computing (by NVIDIA) CUDA. Compute Unified Device Architecture Florian Schornbaum GPU & High Performance Computing (by NVIDIA) CUDA Compute Unified Device Architecture 29.02.2008 Florian Schornbaum GPU Computing Performance In the last few years the GPU has evolved into an absolute

More information

Parallel Computing. Lecture 19: CUDA - I

Parallel Computing. Lecture 19: CUDA - I CSCI-UA.0480-003 Parallel Computing Lecture 19: CUDA - I Mohamed Zahran (aka Z) mzahran@cs.nyu.edu http://www.mzahran.com GPU w/ local DRAM (device) Behind CUDA CPU (host) Source: http://hothardware.com/reviews/intel-core-i5-and-i7-processors-and-p55-chipset/?page=4

More information

CUDA Parallelism Model

CUDA Parallelism Model GPU Teaching Kit Accelerated Computing CUDA Parallelism Model Kernel-Based SPMD Parallel Programming Multidimensional Kernel Configuration Color-to-Grayscale Image Processing Example Image Blur Example

More information

CUDA Fortran. Programming Guide and Reference. Release The Portland Group

CUDA Fortran. Programming Guide and Reference. Release The Portland Group CUDA Fortran Programming Guide and Reference Release 2011 The Portland Group While every precaution has been taken in the preparation of this document, The Portland Group (PGI ), a wholly-owned subsidiary

More information

Card Sizes. Tesla K40: 2880 processors; 12 GB memory

Card Sizes. Tesla K40: 2880 processors; 12 GB memory Card Sizes Tesla K40: 2880 processors; 12 GB memory Data bigger than grid Maximum grid sizes Compute capability 1.0, 1D and 2D grids supported Compute capability 2, 3, 3D grids too. Grid sizes: 65,535

More information

Information Coding / Computer Graphics, ISY, LiTH. CUDA memory! ! Coalescing!! Constant memory!! Texture memory!! Pinned memory 26(86)

Information Coding / Computer Graphics, ISY, LiTH. CUDA memory! ! Coalescing!! Constant memory!! Texture memory!! Pinned memory 26(86) 26(86) Information Coding / Computer Graphics, ISY, LiTH CUDA memory Coalescing Constant memory Texture memory Pinned memory 26(86) CUDA memory We already know... Global memory is slow. Shared memory is

More information

Introduction to GPU programming. Introduction to GPU programming p. 1/17

Introduction to GPU programming. Introduction to GPU programming p. 1/17 Introduction to GPU programming Introduction to GPU programming p. 1/17 Introduction to GPU programming p. 2/17 Overview GPUs & computing Principles of CUDA programming One good reference: David B. Kirk

More information

MPI + X programming. UTK resources: Rho Cluster with GPGPU George Bosilca CS462

MPI + X programming. UTK resources: Rho Cluster with GPGPU   George Bosilca CS462 MPI + X programming UTK resources: Rho Cluster with GPGPU https://newton.utk.edu/doc/documentation/systems/rhocluster George Bosilca CS462 MPI Each programming paradigm only covers a particular spectrum

More information

Introduction to Scientific Programming using GPGPU and CUDA

Introduction to Scientific Programming using GPGPU and CUDA Introduction to Scientific Programming using GPGPU and CUDA Day 1 Sergio Orlandini s.orlandini@cineca.it Mario Tacconi m.tacconi@cineca.it 0 Hands on: Compiling a CUDA program Environment and utility:

More information

COSC 6374 Parallel Computations Introduction to CUDA

COSC 6374 Parallel Computations Introduction to CUDA COSC 6374 Parallel Computations Introduction to CUDA Edgar Gabriel Fall 2014 Disclaimer Material for this lecture has been adopted based on various sources Matt Heavener, CS, State Univ. of NY at Buffalo

More information

Tesla Architecture, CUDA and Optimization Strategies

Tesla Architecture, CUDA and Optimization Strategies Tesla Architecture, CUDA and Optimization Strategies Lan Shi, Li Yi & Liyuan Zhang Hauptseminar: Multicore Architectures and Programming Page 1 Outline Tesla Architecture & CUDA CUDA Programming Optimization

More information

Using a GPU in InSAR processing to improve performance

Using a GPU in InSAR processing to improve performance Using a GPU in InSAR processing to improve performance Rob Mellors, ALOS PI 152 San Diego State University David Sandwell University of California, San Diego What is a GPU? (Graphic Processor Unit) A graphics

More information

Advanced hybrid MPI+OpenMP programming. Carlo Cavazzoni SuperComputing Applications and Innovation Department

Advanced hybrid MPI+OpenMP programming. Carlo Cavazzoni SuperComputing Applications and Innovation Department Advanced hybrid MPI+OpenMP programming Carlo Cavazzoni c.cavazzoni@cineca.it SuperComputing Applications and Innovation Department February 11-15, 2013 Architecture features Floating point units (multiply

More information

Massively Parallel Computing with CUDA. Carlos Alberto Martínez Angeles Cinvestav-IPN

Massively Parallel Computing with CUDA. Carlos Alberto Martínez Angeles Cinvestav-IPN Massively Parallel Computing with CUDA Carlos Alberto Martínez Angeles Cinvestav-IPN What is a GPU? A graphics processing unit (GPU) The term GPU was popularized by Nvidia in 1999 marketed the GeForce

More information

GPU programming basics. Prof. Marco Bertini

GPU programming basics. Prof. Marco Bertini GPU programming basics Prof. Marco Bertini CUDA: atomic operations, privatization, algorithms Atomic operations The basic atomic operation in hardware is something like a read-modify-write operation performed

More information

S CUDA on Xavier

S CUDA on Xavier S8868 - CUDA on Xavier Anshuman Bhat CUDA Product Manager Saikat Dasadhikari CUDA Engineering 29 th March 2018 1 CUDA ECOSYSTEM 2018 CUDA DOWNLOADS IN 2017 3,500,000 CUDA REGISTERED DEVELOPERS 800,000

More information

Lecture 15: Introduction to GPU programming. Lecture 15: Introduction to GPU programming p. 1

Lecture 15: Introduction to GPU programming. Lecture 15: Introduction to GPU programming p. 1 Lecture 15: Introduction to GPU programming Lecture 15: Introduction to GPU programming p. 1 Overview Hardware features of GPGPU Principles of GPU programming A good reference: David B. Kirk and Wen-mei

More information

Data Parallel Execution Model

Data Parallel Execution Model CS/EE 217 GPU Architecture and Parallel Programming Lecture 3: Kernel-Based Data Parallel Execution Model David Kirk/NVIDIA and Wen-mei Hwu, 2007-2013 Objective To understand the organization and scheduling

More information

GPU Programming. Ringberg Theorie Seminar 2010

GPU Programming. Ringberg Theorie Seminar 2010 or How to tremendously accelerate your code? Michael Kraus, Christian Konz Max-Planck-Institut für Plasmaphysik, Garching Ringberg Theorie Seminar 2010 Introduction? GPU? GPUs can do more than just render

More information

CUDA Accelerated Linpack on Clusters. E. Phillips, NVIDIA Corporation

CUDA Accelerated Linpack on Clusters. E. Phillips, NVIDIA Corporation CUDA Accelerated Linpack on Clusters E. Phillips, NVIDIA Corporation Outline Linpack benchmark CUDA Acceleration Strategy Fermi DGEMM Optimization / Performance Linpack Results Conclusions LINPACK Benchmark

More information

Real-time Graphics 9. GPGPU

Real-time Graphics 9. GPGPU Real-time Graphics 9. GPGPU GPGPU GPU (Graphics Processing Unit) Flexible and powerful processor Programmability, precision, power Parallel processing CPU Increasing number of cores Parallel processing

More information

Mathematical computations with GPUs

Mathematical computations with GPUs Master Educational Program Information technology in applications Mathematical computations with GPUs Using GPUs for mathematical problems in Fortran, Java and C# Alexey A. Romanenko arom@ccfit.nsu.ru

More information

Lecture 8: GPU Programming. CSE599G1: Spring 2017

Lecture 8: GPU Programming. CSE599G1: Spring 2017 Lecture 8: GPU Programming CSE599G1: Spring 2017 Announcements Project proposal due on Thursday (4/28) 5pm. Assignment 2 will be out today, due in two weeks. Implement GPU kernels and use cublas library

More information

ENDURING DIFFERENTIATION Timothy Lanfear

ENDURING DIFFERENTIATION Timothy Lanfear ENDURING DIFFERENTIATION Timothy Lanfear WHERE ARE WE? 2 LIFE AFTER DENNARD SCALING GPU-ACCELERATED PERFORMANCE 10 7 40 Years of Microprocessor Trend Data 10 6 10 5 10 4 10 3 10 2 Single-threaded perf

More information

ENDURING DIFFERENTIATION. Timothy Lanfear

ENDURING DIFFERENTIATION. Timothy Lanfear ENDURING DIFFERENTIATION Timothy Lanfear WHERE ARE WE? 2 LIFE AFTER DENNARD SCALING 10 7 40 Years of Microprocessor Trend Data 10 6 10 5 10 4 Transistors (thousands) 1.1X per year 10 3 10 2 Single-threaded

More information

Dense Linear Algebra. HPC - Algorithms and Applications

Dense Linear Algebra. HPC - Algorithms and Applications Dense Linear Algebra HPC - Algorithms and Applications Alexander Pöppl Technical University of Munich Chair of Scientific Computing November 6 th 2017 Last Tutorial CUDA Architecture thread hierarchy:

More information

Introduction to GPU Computing. 周国峰 Wuhan University 2017/10/13

Introduction to GPU Computing. 周国峰 Wuhan University 2017/10/13 Introduction to GPU Computing chandlerz@nvidia.com 周国峰 Wuhan University 2017/10/13 GPU and Its Application 3 Ways to Develop Your GPU APP An Example to Show the Developments Add GPUs: Accelerate Science

More information

Information Coding / Computer Graphics, ISY, LiTH. Introduction to CUDA. Ingemar Ragnemalm Information Coding, ISY

Information Coding / Computer Graphics, ISY, LiTH. Introduction to CUDA. Ingemar Ragnemalm Information Coding, ISY Introduction to CUDA Ingemar Ragnemalm Information Coding, ISY This lecture: Programming model and language Memory spaces and memory access Shared memory Examples Lecture questions: 1. Suggest two significant

More information

A few notes on parallel programming with CUDA

A few notes on parallel programming with CUDA A few notes on parallel programming with CUDA Using parallel computing can significantly speed up execution and in many cases can be quite straightforward to implement. These notes focus on those simple

More information

High Performance Computing and GPU Programming

High Performance Computing and GPU Programming High Performance Computing and GPU Programming Lecture 1: Introduction Objectives C++/CPU Review GPU Intro Programming Model Objectives Objectives Before we begin a little motivation Intel Xeon 2.67GHz

More information

Graph Partitioning. Standard problem in parallelization, partitioning sparse matrix in nearly independent blocks or discretization grids in FEM.

Graph Partitioning. Standard problem in parallelization, partitioning sparse matrix in nearly independent blocks or discretization grids in FEM. Graph Partitioning Standard problem in parallelization, partitioning sparse matrix in nearly independent blocks or discretization grids in FEM. Partition given graph G=(V,E) in k subgraphs of nearly equal

More information

Scientific discovery, analysis and prediction made possible through high performance computing.

Scientific discovery, analysis and prediction made possible through high performance computing. Scientific discovery, analysis and prediction made possible through high performance computing. An Introduction to GPGPU Programming Bob Torgerson Arctic Region Supercomputing Center November 21 st, 2013

More information

Supporting Data Parallelism in Matcloud: Final Report

Supporting Data Parallelism in Matcloud: Final Report Supporting Data Parallelism in Matcloud: Final Report Yongpeng Zhang, Xing Wu 1 Overview Matcloud is an on-line service to run Matlab-like script on client's web browser. Internally it is accelerated by

More information

GPU Computing Workshop CSU Getting Started. Garland Durham Quantos Analytics

GPU Computing Workshop CSU Getting Started. Garland Durham Quantos Analytics 1 GPU Computing Workshop CSU 2013 Getting Started Garland Durham Quantos Analytics nvidia-smi 2 At command line, run command nvidia-smi to get/set GPU properties. nvidia-smi Options: -q query -L list attached

More information

Fast Bilateral Filter GPU implementation

Fast Bilateral Filter GPU implementation Fast Bilateral Filter GPU implementation Multi-Core Architectures and Programming Gerhard Mlady, Rafael Bernardelli Hardware/Software Co-Design, University of Erlangen-Nuremberg July 21, 2016 Overview

More information

Overcoming the Barriers to Sustained Petaflop Performance. William D. Gropp Mathematics and Computer Science

Overcoming the Barriers to Sustained Petaflop Performance. William D. Gropp Mathematics and Computer Science Overcoming the Barriers to Sustained Petaflop Performance William D. Gropp Mathematics and Computer Science www.mcs.anl.gov/~gropp But First Are we too CPU-centric? What about I/O? What do applications

More information

Device Memories and Matrix Multiplication

Device Memories and Matrix Multiplication Device Memories and Matrix Multiplication 1 Device Memories global, constant, and shared memories CUDA variable type qualifiers 2 Matrix Multiplication an application of tiling runningmatrixmul in the

More information

Introduction to CUDA C/C++ Mark Ebersole, NVIDIA CUDA Educator

Introduction to CUDA C/C++ Mark Ebersole, NVIDIA CUDA Educator Introduction to CUDA C/C++ Mark Ebersole, NVIDIA CUDA Educator What is CUDA? Programming language? Compiler? Classic car? Beer? Coffee? CUDA Parallel Computing Platform www.nvidia.com/getcuda Programming

More information

CUDA PROGRAMMING MODEL. Carlo Nardone Sr. Solution Architect, NVIDIA EMEA

CUDA PROGRAMMING MODEL. Carlo Nardone Sr. Solution Architect, NVIDIA EMEA CUDA PROGRAMMING MODEL Carlo Nardone Sr. Solution Architect, NVIDIA EMEA CUDA: COMMON UNIFIED DEVICE ARCHITECTURE Parallel computing architecture and programming model GPU Computing Application Includes

More information

Introduction to Numerical General Purpose GPU Computing with NVIDIA CUDA. Part 1: Hardware design and programming model

Introduction to Numerical General Purpose GPU Computing with NVIDIA CUDA. Part 1: Hardware design and programming model Introduction to Numerical General Purpose GPU Computing with NVIDIA CUDA Part 1: Hardware design and programming model Dirk Ribbrock Faculty of Mathematics, TU Dortmund 2016 Table of Contents Why parallel

More information

Information Coding / Computer Graphics, ISY, LiTH. Introduction to CUDA. Ingemar Ragnemalm Information Coding, ISY

Information Coding / Computer Graphics, ISY, LiTH. Introduction to CUDA. Ingemar Ragnemalm Information Coding, ISY Introduction to CUDA Ingemar Ragnemalm Information Coding, ISY This lecture: Programming model and language Introduction to memory spaces and memory access Shared memory Matrix multiplication example Lecture

More information

Outline 2011/10/8. Memory Management. Kernels. Matrix multiplication. CIS 565 Fall 2011 Qing Sun

Outline 2011/10/8. Memory Management. Kernels. Matrix multiplication. CIS 565 Fall 2011 Qing Sun Outline Memory Management CIS 565 Fall 2011 Qing Sun sunqing@seas.upenn.edu Kernels Matrix multiplication Managing Memory CPU and GPU have separate memory spaces Host (CPU) code manages device (GPU) memory

More information

CS 179: GPU Computing. Recitation 2: Synchronization, Shared memory, Matrix Transpose

CS 179: GPU Computing. Recitation 2: Synchronization, Shared memory, Matrix Transpose CS 179: GPU Computing Recitation 2: Synchronization, Shared memory, Matrix Transpose Synchronization Ideal case for parallelism: no resources shared between threads no communication between threads Many

More information

Batch Linear Algebra for GPU-Accelerated High Performance Computing Environments

Batch Linear Algebra for GPU-Accelerated High Performance Computing Environments Batch Linear Algebra for GPU-Accelerated High Performance Computing Environments Ahmad Abdelfattah, Azzam Haidar, Stanimire Tomov, and Jack Dongarra SIAM Conference on Computational Science and Engineering

More information

naïve GPU kernels generation from Fortran source code Dmitry Mikushin

naïve GPU kernels generation from Fortran source code Dmitry Mikushin KernelGen naïve GPU kernels generation from Fortran source code Dmitry Mikushin Contents Motivation and target Assembling our own toolchain: schemes and details Toolchain usecase: sincos example Development

More information

Introduction to GPGPUs and to CUDA programming model

Introduction to GPGPUs and to CUDA programming model Introduction to GPGPUs and to CUDA programming model www.cineca.it Marzia Rivi m.rivi@cineca.it GPGPU architecture CUDA programming model CUDA efficient programming Debugging & profiling tools CUDA libraries

More information

GPU programming. Dr. Bernhard Kainz

GPU programming. Dr. Bernhard Kainz GPU programming Dr. Bernhard Kainz Overview About myself Motivation GPU hardware and system architecture GPU programming languages GPU programming paradigms Pitfalls and best practice Reduction and tiling

More information

Solving the heat equation with CUDA

Solving the heat equation with CUDA Solving the heat equation with CUDA Oliver Meister January 09 th 2013 Last Tutorial CSR kernel - scalar One row per thread No coalesced memory access Non-uniform matrices CSR kernel - vectorized One row

More information

Josef Pelikán, Jan Horáček CGG MFF UK Praha

Josef Pelikán, Jan Horáček CGG MFF UK Praha GPGPU and CUDA 2012-2018 Josef Pelikán, Jan Horáček CGG MFF UK Praha pepca@cgg.mff.cuni.cz http://cgg.mff.cuni.cz/~pepca/ 1 / 41 Content advances in hardware multi-core vs. many-core general computing

More information

Introduction to CUDA

Introduction to CUDA Introduction to CUDA Overview HW computational power Graphics API vs. CUDA CUDA glossary Memory model, HW implementation, execution Performance guidelines CUDA compiler C/C++ Language extensions Limitations

More information

Use of Accelerate Tools PGI CUDA FORTRAN Jacket

Use of Accelerate Tools PGI CUDA FORTRAN Jacket Use of Accelerate Tools PGI CUDA FORTRAN Jacket Supercomputing Institute For Advanced Computational Research e-mail: szhang@msi.umn.edu or help@msi.umn.edu Tel: 612-624-8858 (direct), 612-626-0802(help)

More information

Introduction to GPU Computing Using CUDA. Spring 2014 WestGrid Seminar Series

Introduction to GPU Computing Using CUDA. Spring 2014 WestGrid Seminar Series Introduction to GPU Computing Using CUDA Spring 2014 WestGrid Seminar Series Scott Northrup SciNet www.scinethpc.ca (Slides http://support.scinet.utoronto.ca/ northrup/westgrid CUDA.pdf) March 12, 2014

More information

HPC Middle East. KFUPM HPC Workshop April Mohamed Mekias HPC Solutions Consultant. Introduction to CUDA programming

HPC Middle East. KFUPM HPC Workshop April Mohamed Mekias HPC Solutions Consultant. Introduction to CUDA programming KFUPM HPC Workshop April 29-30 2015 Mohamed Mekias HPC Solutions Consultant Introduction to CUDA programming 1 Agenda GPU Architecture Overview Tools of the Trade Introduction to CUDA C Patterns of Parallel

More information

PERFORMANCE ANALYSIS AND DEBUGGING FOR VOLTA. Felix Schmitt 11 th Parallel Tools Workshop September 11-12, 2017

PERFORMANCE ANALYSIS AND DEBUGGING FOR VOLTA. Felix Schmitt 11 th Parallel Tools Workshop September 11-12, 2017 PERFORMANCE ANALYSIS AND DEBUGGING FOR VOLTA Felix Schmitt 11 th Parallel Tools Workshop September 11-12, 2017 INTRODUCING TESLA V100 Volta Architecture Improved NVLink & HBM2 Volta MPS Improved SIMT Model

More information

NAG Fortran Library Routine Document F01CTF.1

NAG Fortran Library Routine Document F01CTF.1 NAG Fortran Library Routine Document Note: before using this routine, please read the Users Note for your implementation to check the interpretation of bold italicised terms and other implementation-dependent

More information

Introduction to GPGPU and GPU-architectures

Introduction to GPGPU and GPU-architectures Introduction to GPGPU and GPU-architectures Henk Corporaal Gert-Jan van den Braak http://www.es.ele.tue.nl/ Contents 1. What is a GPU 2. Programming a GPU 3. GPU thread scheduling 4. GPU performance bottlenecks

More information

GPGPU. Alan Gray/James Perry EPCC The University of Edinburgh.

GPGPU. Alan Gray/James Perry EPCC The University of Edinburgh. GPGPU Alan Gray/James Perry EPCC The University of Edinburgh a.gray@ed.ac.uk Contents Introduction GPU Technology Programming GPUs GPU Performance Optimisation 2 Introduction 3 Introduction Central Processing

More information

CS 179 Lecture 4. GPU Compute Architecture

CS 179 Lecture 4. GPU Compute Architecture CS 179 Lecture 4 GPU Compute Architecture 1 This is my first lecture ever Tell me if I'm not speaking loud enough, going too fast/slow, etc. Also feel free to give me lecture feedback over email or at

More information

E6895 Advanced Big Data Analytics Lecture 8: GPU Examples and GPU on iOS devices

E6895 Advanced Big Data Analytics Lecture 8: GPU Examples and GPU on iOS devices E6895 Advanced Big Data Analytics Lecture 8: GPU Examples and GPU on iOS devices Ching-Yung Lin, Ph.D. Adjunct Professor, Dept. of Electrical Engineering and Computer Science IBM Chief Scientist, Graph

More information

GPU Performance Nuggets

GPU Performance Nuggets GPU Performance Nuggets Simon Garcia de Gonzalo & Carl Pearson PhD Students, IMPACT Research Group Advised by Professor Wen-mei Hwu Jun. 15, 2016 grcdgnz2@illinois.edu pearson@illinois.edu GPU Performance

More information

Basic Elements of CUDA Algoritmi e Calcolo Parallelo. Daniele Loiacono

Basic Elements of CUDA Algoritmi e Calcolo Parallelo. Daniele Loiacono Basic Elements of CUDA Algoritmi e Calcolo Parallelo References This set of slides is mainly based on: CUDA Technical Training, Dr. Antonino Tumeo, Pacific Northwest National Laboratory Slide of Applied

More information

Tiled Matrix Multiplication

Tiled Matrix Multiplication Tiled Matrix Multiplication Basic Matrix Multiplication Kernel __global__ void MatrixMulKernel(int m, int n, int k, float* A, float* B, float* C) { int Row = blockIdx.y*blockDim.y+threadIdx.y;

More information

Introduction to GPU Computing. Design and Analysis of Parallel Algorithms

Introduction to GPU Computing. Design and Analysis of Parallel Algorithms Introduction to GPU Computing Design and Analysis of Parallel Algorithms Sources CUDA Programming Guide (3.2) CUDA Best Practices Guide (3.2) CUDA Toolkit Reference Manual (3.2) CUDA SDK Examples Part

More information

Learn CUDA in an Afternoon. Alan Gray EPCC The University of Edinburgh

Learn CUDA in an Afternoon. Alan Gray EPCC The University of Edinburgh Learn CUDA in an Afternoon Alan Gray EPCC The University of Edinburgh Overview Introduction to CUDA Practical Exercise 1: Getting started with CUDA GPU Optimisation Practical Exercise 2: Optimising a CUDA

More information

Introduction to GPU Computing Using CUDA. Spring 2014 WestGrid Seminar Series

Introduction to GPU Computing Using CUDA. Spring 2014 WestGrid Seminar Series Introduction to GPU Computing Using CUDA Spring 2014 WestGrid Seminar Series Scott Northrup SciNet www.scinethpc.ca March 13, 2014 Outline 1 Heterogeneous Computing 2 GPGPU - Overview Hardware Software

More information

CUDA Architecture & Programming Model

CUDA Architecture & Programming Model CUDA Architecture & Programming Model Course on Multi-core Architectures & Programming Oliver Taubmann May 9, 2012 Outline Introduction Architecture Generation Fermi A Brief Look Back At Tesla What's New

More information

Kepler Overview Mark Ebersole

Kepler Overview Mark Ebersole Kepler Overview Mark Ebersole TFLOPS TFLOPS 3x Performance in a Single Generation 3.5 3 2.5 2 1.5 1 0.5 0 1.25 1 Single Precision FLOPS (SGEMM) 2.90 TFLOPS.89 TFLOPS.36 TFLOPS Xeon E5-2690 Tesla M2090

More information

Introduction to CUDA CME343 / ME May James Balfour [ NVIDIA Research

Introduction to CUDA CME343 / ME May James Balfour [ NVIDIA Research Introduction to CUDA CME343 / ME339 18 May 2011 James Balfour [ jbalfour@nvidia.com] NVIDIA Research CUDA Programming system for machines with GPUs Programming Language Compilers Runtime Environments Drivers

More information

Adrian Tate XK6 / openacc workshop Manno, Mar

Adrian Tate XK6 / openacc workshop Manno, Mar Adrian Tate XK6 / openacc workshop Manno, Mar6-7 2012 1 Overview & Philosophy Two modes of usage Contents Present contents Upcoming releases Optimization of libsci_acc Autotuning Adaptation Asynchronous

More information