|
|
- Gertrude Simpson
- 5 years ago
- Views:
Transcription
1
2
3
4
5
6
7
8 -npool -ndiag
9
10
11 Z/DGEMM
12 MPI_Alltoall
13
14 MPI_Isend MPI_Irecv
15
16
17
18
19 Wilkes-2 (Cambridge) NVIDIA DGX-1 Piz Daint (CSCS) Summit Dev (ORNL) Davide (CINECA) CPU PLX NIC GPU PCIe NVLink
20
21
22
23
24
25
26
27
28
29 QE-GPU CSCS QE CSCS QE Cineca 1 P P BW (360c) 1 KNL (60c) 10 KNL (640c) init_run 15.92s 7.50s 4.45s 21.61s 10.33s electrons s s s s s update_pot 1.37s 1.04s 10.42s 31.95s 7.94s forces 12.06s 3.03s 13.20s 60.91s 11.93s stress 74.28s 15.82s 75.69s s 38.55s cdiaghg 71.38s 6.89s 15.51s s 76.15s PWSCF s s s s s Fermi energy ev ev ev ev ev Total energy Ry Ry Ry npool Ry Ry Total force Total stress Pressure BW/KNL results from
30 QE-GPU CSCS QE-GPU Sirius GPU CSCS 1 P P100 1 V100 1 P P init_run 15.92s 7.50s 11.06s electrons s s s s s update_pot 1.37s 1.04s 0.59s forces 12.06s 3.03s 8.58s 28.86s 3.85s stress 74.28s 15.82s 52.58s 94.95s 12.99s cdiaghg 71.38s 6.89s 84.10s s 76.15s PWSCF s s s s s Fermi energy ev ev ev ev ev Total energy Ry Ry Ry npool Ry Ry Total force Total stress Pressure BW/KNL/SIRIUS results from
31
32
33
34 $ pgf90 nvtx.cuf -L/usr/local/cuda/lib -lnvToolsExt $ nvprof -o nvprof.output ./a.out NVPROF is profiling process 10653, command: ./a.out Generated result file: /Users/nvprof.output program main use nvtx character(len=4) :: itcount ! First range with standard color call nvtxstartrange("first label") do n=1,14 ! Create custom label for each marker write(itcount,'(i4)') n ! Range with custom color call nvtxstartrange("label "//itcount,n) ! Add sleep to make markers big call sleep(1) call nvtxendrange end do call nvtxendrange end program main
35
36 gdb cuda-gdb #!/bin/bash QE_DIR=/home/cuda/qe-gpu INFILE=$1 export OMP_NUM_THREADS=6 export MKL_NUM_THREADS=6 export NO_STOP_MESSAGE=yes #export CUDA_VISIBLE_DEVICES=1 today=`date +'%Y_%m_%d_%H_%M_%S'` #Normal run mpirun -np 1 --bind-to none ${QE_DIR}/PW/src/pw.x -input ${INFILE} # Run with gdb mpirun -np 1 --bind-to none gdb -ex=r --args ${QE_DIR}/bin/pw.x -input ${INFILE} # Run with cuda-gdb mpirun -np 1 --bind-to none cuda-gdb --args ${QE_DIR}/bin/pw.x -input ${INFILE}
37 1. Generate a backtrace (compile with -g if using -O2 or higher) $ export PGI_TERM='trace' 2. Run program $ ./a.out 0: copyin Memcpy (dev=0x(nil), host=0x0x7f , size= ) FAILED: 11(invalid argument) /opt/pgi/linux86-64/17.10/lib/libcudafor.so(pgf90_dev_copyin+0x53) [0x7f74110a0012] ./a.out() [0x403b1a] ./a.out() [0x4036d4] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0) [0x7f74074d7830] ./a.out() [0x403589] 3. Use addr2line to find out the line in which the error occurs. $ addr2line -e a.out 0x403b1a /home/gruetsch/./unalloc.cuf:30
38
39 subroutine addusforce_g() DO is = 1, nspin_mag nb = 0 DO na = 1, nat IF (ityp (na) == nt) THEN nb = nb + 1 DO ig = 1, ngm cfac = vg(ig, is) * CONJG(eigts1(mill(1,ig),na) * & eigts2(mill(2,ig),na) * & eigts3(mill(3,ig),na) ) aux1(ig, nb, 1) = g(1, ig) * cfac aux1(ig, nb, 2) = g(2, ig) * cfac aux1(ig, nb, 3) = g(3, ig) * cfac ENDDO ENDIF ENDDO DO ipol = 1, 3 CALL DGEMM( 'C', 'N', nij, nab, 2*ngm, fact, qgm, & 2*ngm, aux1(1,1,ipol), 2*ngm, 0, & ddeeq(1,1,ipol,is), nij ) ENDDO ENDDO
40 subroutine addusforce_g() DO is = 1, nspin_mag nb = 0 DO na = 1, nat IF (ityp (na) == nt) THEN nb = nb + 1 DO ig = 1, ngm cfac = vg(ig, is) * CONJG(eigts1(mill(1,ig),na) * & eigts2(mill(2,ig),na) * & eigts3(mill(3,ig),na) ) aux1(ig, nb, 1) = g(1, ig) * cfac aux1(ig, nb, 2) = g(2, ig) * cfac aux1(ig, nb, 3) = g(3, ig) * cfac ENDDO ENDIF ENDDO DO ipol = 1, 3 CALL DGEMM( 'C', 'N', nij, nab, 2*ngm, fact, qgm, & 2*ngm, aux1(1,1,ipol), 2*ngm, 0, & ddeeq(1,1,ipol,is), nij ) ENDDO ENDDO
41 subroutine addusforce_g() DO is = 1, nspin_mag nb = 0 DO na = 1, nat IF (ityp (na) == nt) THEN nb = nb + 1!$cuf kernel do DO ig = 1, ngm cfac= vg_d(ig, is) * CONJG(eigts1_d(mill_d(1,ig),na) * & eigts2_d(mill_d(2,ig),na) * & eigts3_d(mill_d(3,ig),na) ) aux1_d(ig, nb, 1) = g_d(1, ig) * cfac aux1_d(ig, nb, 2) = g_d(2, ig) * cfac aux1_d(ig, nb, 3) = g_d(3, ig) * cfac ENDDO ENDIF ENDDO DO ipol = 1, 3 CALL DGEMM( 'C', 'N', nij, nab, 2*ngm, fact, qgm, & 2*ngm, aux1(1,1,ipol), 2*ngm, 0, & ddeeq(1,1,ipol,is), nij ) ENDDO ENDDO
42 subroutine addusforce_g() DO is = 1, nspin_mag nb = 0 DO na = 1, nat IF (ityp (na) == nt) THEN nb = nb + 1 !$cuf kernel do DO ig = 1, ngm cfac= vg_d(ig, is) * CONJG(eigts1_d(mill_d(1,ig),na) * & eigts2_d(mill_d(2,ig),na) * & eigts3_d(mill_d(3,ig),na) ) aux1_d(ig, nb, 1) = g_d(1, ig) * cfac aux1_d(ig, nb, 2) = g_d(2, ig) * cfac aux1_d(ig, nb, 3) = g_d(3, ig) * cfac ENDDO ENDIF ENDDO ENDDO USE gvect, ONLY: g_d, ... MODULE gvect REAL(DP), ALLOCATABLE :: g REAL(DP), ALLOCATABLE, DEVICE :: g_d END MODULE gvect Option 1: ALLOCATE(g(3, ngm)); ALLOCATE(g_d(3, ngm)) g = 1.d0 g_d = g Option 2 using F2003 source allocation: ALLOCATE(g(3, ngm)); g = 1.d0 ALLOCATE(g_d, source = g)
43 subroutine addusforce_g() DO is = 1, nspin_mag nb = 0 DO na = 1, nat IF (ityp (na) == nt) THEN nb = nb + 1!$cuf kernel do DO ig = 1, ngm cfac= vg_d(ig, is) * CONJG(eigts1_d(mill_d(1,ig),na) * & eigts2_d(mill_d(2,ig),na) * & eigts3_d(mill_d(3,ig),na) ) aux1_d(ig, nb, 1) = g_d(1, ig) * cfac aux1_d(ig, nb, 2) = g_d(2, ig) * cfac aux1_d(ig, nb, 3) = g_d(3, ig) * cfac ENDDO ENDIF ENDDO DO ipol = 1, 3 CALL DGEMM( 'C', 'N', nij, nab, 2*ngm, fact, qgm, & 2*ngm, aux1(1,1,ipol), 2*ngm, 0, & ddeeq(1,1,ipol,is), nij ) ENDDO ENDDO
44 subroutine force_corr (forcescc) USE wavefunctions_module, ONLY : psic USE gvect, ONLY : nl, g, gl, igtongl do na = 1, nat if (nt.eq.ityp (na) ) then tau1 = tau(1,na); tau2 = tau(2,na); tau3 = tau(3,na) fscc1 = 0.d0; fscc2 = 0.d0; fscc3 = 0.d0 do ig = gstart, ngm arg = (g (1, ig) * tau1 + g (2, ig) * tau2 + & g (3, ig) * tau3 ) * tpi tmpf = fact * rhocgnt (igtongl(ig) ) * & tpiba * DBLE(DCMPLX(sin(arg), cos(arg)) * & CONJG(psic(nl(ig)))) fscc1 = fscc1 + tmpf * g(1,ig) fscc2 = fscc2 + tmpf * g(2,ig) fscc3 = fscc3 + tmpf * g(3,ig) enddo forcescc(1,na) = forcescc(1,na) + fscc1 forcescc(2,na) = forcescc(2,na) + fscc2 forcescc(3,na) = forcescc(3,na) + fscc3 endif enddo
45 subroutine force_corr (forcescc) USE wavefunctions_module, ONLY : psic=>psic_d USE gvect, ONLY : nl=>nl_d, g=>g_d, gl=>gl_d, igtongl=>igtongl_d do na = 1, nat if (nt.eq.ityp (na) ) then tau1 = tau(1,na); tau2 = tau(2,na); tau3 = tau(3,na) fscc1 = 0.d0; fscc2 = 0.d0; fscc3 = 0.d0!$cuf kernel do do ig = gstart, ngm arg = (g (1, ig) * tau1 + g (2, ig) * tau2 + & g (3, ig) * tau3 ) * tpi tmpf = fact * rhocgnt (igtongl(ig) ) * & tpiba * DBLE(DCMPLX(sin(arg), cos(arg)) * & CONJG(psic(nl(ig)))) fscc1 = fscc1 + tmpf * g(1,ig) fscc2 = fscc2 + tmpf * g(2,ig) fscc3 = fscc3 + tmpf * g(3,ig) enddo forcescc(1,na) = forcescc(1,na) + fscc1 forcescc(2,na) = forcescc(2,na) + fscc2 forcescc(3,na) = forcescc(3,na) + fscc3 endif enddo
46 subroutine force_corr (forcescc) USE wavefunctions_module, ONLY : psic=>psic_d USE gvect, ONLY : nl=>nl_d, g=>g_d, gl=>gl_d, igtongl=>igtongl_d do na = 1, nat if (nt.eq.ityp (na) ) then tau1 = tau(1,na); tau2 = tau(2,na); tau3 = tau(3,na) fscc1 = 0.d0; fscc2 = 0.d0; fscc3 = 0.d0 USE_CUDA!$cuf kernel do do ig = gstart, ngm arg = (g (1, ig) * tau1 + g (2, ig) * tau2 + & g (3, ig) * tau3 ) * tpi tmpf = fact * rhocgnt (igtongl(ig) ) * & tpiba * DBLE(DCMPLX(sin(arg), cos(arg)) * & CONJG(psic(nl(ig)))) fscc1 = fscc1 + tmpf * g(1,ig) fscc2 = fscc2 + tmpf * g(2,ig) fscc3 = fscc3 + tmpf * g(3,ig) enddo forcescc(1,na) = forcescc(1,na) + fscc1 forcescc(2,na) = forcescc(2,na) + fscc2 forcescc(3,na) = forcescc(3,na) + fscc3 endif enddo
47 SUBROUTINE fft_scatter_gpu_batch_a() npp = dfft%npp(me); nnp = dfft%nnp tscale = 1.0_DP / (dfft%nr1 * dfft%nr2) DO iter = 1, dfft%nproc IF(IAND(nprocp, nprocp-1) == 0) THEN dest = IEOR( me-1, iter-1 ) ELSE dest = MOD(me-1 + (iter-1), nprocp) ENDIF ip = dest + 1 ioff = dfft%iss(ip) nswip = dfft%nsp(ip)!$cuf kernel do(2) <<< *, *, 0, dfft%a2a_comp >>> DO cuf_j = 1, npp DO cuf_i = 1, nswip mc = p_ismap_d(cuf_i + ioff) it = (ip - 1) * sendsiz + (cuf_i - 1)*nppx f_aux2_d(cuf_j + it) = f_aux_d(mc + (cuf_j - 1) * nnp) * & tscale ENDDO ENDDO ENDDO
48
49 use cublas integer :: m, n, k real(8) :: alpha, beta real(8) :: a(m,k), b(k,n), c(m,n) real(8),device :: a_d(m,k), b_d(k,n), c_d(m,n)! DGEMM using linked CPU library call DGEMM( N, N, m, n, k, alpha, a, m, b, k, & beta, c, m)! DGEMM using CUBLAS call DGEMM( N, N, m, n, k, alpha, a_d, m, b_d, k, & beta, c_d, m)
50 subroutine addusforce_g() DO is = 1, nspin_mag nb = 0 DO na = 1, nat IF (ityp (na) == nt) THEN nb = nb + 1!$cuf kernel do DO ig = 1, ngm cfac= vg_d(ig, is) * CONJG(eigts1_d(mill_d(1,ig),na) * & eigts2_d(mill_d(2,ig),na) * & eigts3_d(mill_d(3,ig),na) ) aux1_d(ig, nb, 1) = g_d(1, ig) * cfac aux1_d(ig, nb, 2) = g_d(2, ig) * cfac aux1_d(ig, nb, 3) = g_d(3, ig) * cfac ENDDO ENDIF ENDDO DO ipol = 1, 3 CALL DGEMM( 'C', 'N', nij, nab, 2*ngm, fact, qgm, & 2*ngm, aux1(1,1,ipol), 2*ngm, 0, & ddeeq(1,1,ipol,is), nij ) ENDDO ENDDO
51 subroutine addusforce_g() use cublas DO is = 1, nspin_mag nb = 0 DO na = 1, nat IF (ityp (na) == nt) THEN nb = nb + 1!$cuf kernel do DO ig = 1, ngm cfac= vg_d(ig, is) * CONJG(eigts1_d(mill_d(1,ig),na) * & eigts2_d(mill_d(2,ig),na) * & eigts3_d(mill_d(3,ig),na) ) aux1_d(ig, nb, 1) = g_d(1, ig) * cfac aux1_d(ig, nb, 2) = g_d(2, ig) * cfac aux1_d(ig, nb, 3) = g_d(3, ig) * cfac ENDDO ENDIF ENDDO DO ipol = 1, 3 CALL DGEMM( 'C', 'N', nij, nab, 2*ngm, fact, qgm_d, & 2*ngm, aux1_d(1,1,ipol), 2*ngm, 0, & ddeeq_d(1,1,ipol,is), nij ) ENDDO ENDDO
52 SUBROUTINE cft_1z_cpu() IF (isign < 0) THEN CALL FFT_Z_STICK(fw_planz( ip), c(1), ldz, nsl) tscale = 1.0_DP / nz cout( 1 : ldz * nsl ) = c( 1 : ldz * nsl ) * tscale ELSE IF (isign > 0) THEN CALL FFT_Z_STICK(bw_planz( ip), c(1), ldz, nsl) cout( 1 : ldz * nsl ) = c( 1 : ldz * nsl ) END IF
53 SUBROUTINE cft_1z_gpu() USE cufft IF (isign < 0) THEN istat = cufftexecz2z(cufft_planz(ip), c(1), c(1), & CUFFT_FORWARD) tscale = 1.0_DP / nz!$cuf kernel do(1) <<<*,*,0,stream>>> DO i = 1, ldz * nsl cout(i) = c(i) * tscale END DO ELSE IF (isign > 0) THEN istat = cufftexecz2z(cufft_planz(ip), c(1), cout(1), & CUFFT_INVERSE ) END IF
54 Use iso_c_binding interface #if (GPU_ARCH == 35) ! Works for Kepler integer(c_int) function cublaszgemm3m(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) & bind(c, name='cublasZgemm_v2') #else ! Works for Pascal, Volta, and beyond integer(c_int) function cublaszgemm3m(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) & bind(c, name='cublasZgemm3m') #endif use iso_c_binding use cudafor use cublas_v2 type(cublashandle), value :: handle integer(c_int), value :: transa, transb, m, n, k integer(c_int), value :: lda, ldb, ldc complex(8) :: alpha, beta complex(8), device :: A(*), B(*), C(*) end function cublaszgemm3m end interface cublaszgemm3m
55
56 subroutine atomic_rho () do igl = gstart, ngl gx = sqrt(gl(igl)) * tpiba do ir = 1, msh if (rgrid(nt)%r(ir) < 1.0d-8) then aux(ir) = upf(nt)%rho_at(ir) else aux(ir) = upf(nt)%rho_at(ir) * sin(gx*rgrid(nt)%r(ir)) / & (rgrid(nt)%r(ir)*gx) endif enddo call simpson (msh, aux, rgrid(nt)%rab, rhocgnt(igl)) enddo
57 simpson subroutine atomic_rho () do igl = gstart, ngl gx = sqrt(gl(igl)) * tpiba do ir = 1, msh rhocgnt(igl) if (rgrid(nt)%r(ir) < 1.0d-8) then aux(ir) = upf(nt)%rho_at(ir) else aux(ir) = upf(nt)%rho_at(ir) * sin(gx*rgrid(nt)%r(ir)) / & (rgrid(nt)%r(ir)*gx) endif enddo call simpson (msh, aux, rgrid(nt)%rab, rhocgnt(igl)) enddo
58 simpson subroutine atomic_rho () do igl = gstart, ngl gx = sqrt(gl(igl)) * tpiba do ir = 1, msh rhocgnt(igl) if (rgrid(nt)%r(ir) < 1.0d-8) then aux(ir) = upf(nt)%rho_at(ir) else aux(ir) = upf(nt)%rho_at(ir) * sin(gx*rgrid(nt)%r(ir)) / & (rgrid(nt)%r(ir)*gx) endif enddo rsum = 0.d0 do i = 2, msh-1, 2 rsum = rsum + aux(i-1) * rgrid(nt)%rab(i-1) + 4.0d0 * aux(i) * rgrid(nt)%rab(i) + aux(i+1) * rgrid(nt)%rab(i+1) end do rhocgnt(ngl) = rsum / 3.d0 enddo
59 subroutine atomic_rho () do igl = gstart, ngl gx = sqrt(gl(igl)) * tpiba do ir = 1, msh if (rgrid(nt)%r(ir) < 1.0d-8) then aux(ir) = upf(nt)%rho_at(ir) else aux(ir) = upf(nt)%rho_at(ir) * sin(gx*rgrid(nt)%r(ir)) / & (rgrid(nt)%r(ir)*gx) endif enddo rsum = 0.d0 do i = 2, msh-1, 2 rsum = rsum + aux(i-1) * rgrid(nt)%rab(i-1) + 4.0d0 * aux(i) * rgrid(nt)%rab(i) + aux(i+1) * rgrid(nt)%rab(i+1) end do rhocgnt(ngl) = rsum / 3.d0 enddo
60 subroutine atomic_rho () do igl = gstart, ngl gx = sqrt(gl(igl)) * tpiba!$cuf kernel do do ir = 1, msh gx = sqrt(gl(igl)) * tpiba if (rgrid(nt)%r(ir) < 1.0d-8) then aux(ir) = upf(nt)%rho_at(ir) else aux(ir) = upf(nt)%rho_at(ir) * sin(gx*rgrid(nt)%r(ir)) / & (rgrid(nt)%r(ir)*gx) endif enddo rsum = 0.d0!$cuf kernel do do i = 2, msh-1, 2 rsum = rsum + aux(i-1) * rgrid(nt)%rab(i-1) + 4.0d0 * aux(i) * rgrid(nt)%rab(i) + aux(i+1) * rgrid(nt)%rab(i+1) end do rhocgnt(ngl) = rsum / 3.d0 enddo
61 subroutine atomic_rho () do igl = gstart, ngl!$cuf kernel do do ir = 1, msh gx = sqrt(gl(igl)) * tpiba if (rgrid(nt)%r(ir) < 1.0d-8) then aux(ir) = upf(nt)%rho_at(ir) else aux(ir) = upf(nt)%rho_at(ir) * sin(gx*rgrid(nt)%r(ir)) / & (rgrid(nt)%r(ir)*gx) endif enddo rsum = 0.d0!$cuf kernel do do i = 2, msh-1, 2 rsum = rsum + aux(i-1) * rgrid(nt)%rab(i-1) + 4.0d0 * aux(i) * rgrid(nt)%rab(i) + aux(i+1) * rgrid(nt)%rab(i+1) end do rhocgnt(ngl) = rsum / 3.d0 enddo
62 subroutine atomic_rho () do igl = gstart, ngl ngl msh!$cuf kernel do do ir = 1, msh gx = sqrt(gl(igl)) * tpiba if (rgrid(nt)%r(ir) < 1.0d-8) then aux(ir) = upf(nt)%rho_at(ir) else aux(ir) = upf(nt)%rho_at(ir) * sin(gx*rgrid(nt)%r(ir)) / & (rgrid(nt)%r(ir)*gx) endif enddo rsum = 0.d0!$cuf kernel do do i = 2, msh-1, 2 rsum = rsum + aux(i-1) * rgrid(nt)%rab(i-1) + 4.0d0 * aux(i) * rgrid(nt)%rab(i) + aux(i+1) * rgrid(nt)%rab(i+1) end do rhocgnt(ngl) = rsum / 3.d0 enddo
63 attributes(global) & subroutine compute_rhocgnt_gpu() tx = threadidx%x ty = threadidx%y igl = (blockidx%x - 1) * blockdim%y + ty mysum = 0.d0 do ir = tx, mesh, blockdim%x val = mysum = mysum + val end do! Reduce by warp mysum = mysum + shfl_down(mysum,1) mysum = mysum + shfl_down(mysum,2) mysum = mysum + shfl_down(mysum,4) mysum = mysum + shfl_down(mysum,8) mysum = mysum + shfl_down(mysum,16) if (tx == 1) then rhocgnt(igl) = mysum / 3.d0 endif end subroutine compute_rhocgnt_gpu
64 DO ih =1,nh DO jh = ih,nh call sub(ngm,ih,jh,...) END DO END DO ngm jh ih In sub: DO ig = 1,ngm END DO DO ih = 1,nh DO jh = ih, nh block = dim3(256,1,1) grid = dim3(ceiling(real(ngm)/block%x),1,1) call sub_g<<<grid, block>>>(ngm,ih,jh,...) END DO END DO In sub_g: ig = threadidx%x + blockdim%x * (blockidx%x-1) IF (ig <= ngm) then END IF
65 DO ih =1,nh DO jh = ih,nh call sub(ngm,ih,jh,...) END DO END DO ngm jh In sub: DO ig = 1,ngm END DO block=dim3(256,1,1) grid = dim3(ceiling(real(ngm)/block%x), nh, nh) call sub_g<<<grid,block>>>(ngm,...) ih In sub_g: ih = blockidx%y; jh = blockidx%z IF (ih > jh) RETURN ig = threadidx%x + blockdim%x * (blockidx%x-1) IF ( ig <= ngm) THEN END IF
66 DO ih =1,nh DO jh = ih,nh call sub(ngm,ih,jh,...) END DO END DO ngm jh In sub: DO ig = 1,ngm END DO block=dim3(256,1,1) grid = dim3(nblocks, nh, nh) call sub_g<<<grid,block>>>(ngm,...) ih In sub_g: ih = blockidx%y; jh = blockidx%z IF (ih > jh) RETURN tx = threadidx%x+ blockdim%x * (blockidx%x-1) DO ig = tx, ngm, blockdim%x*griddim%x ... END DO
67 DO is = 1, nspin_mag nb = 0 DO na = 1, nat IF (ityp (na) == nt) THEN nb = nb + 1 !$cuf kernel do DO ig = 1, ngm cfac= vg_d(ig, is) * CONJG(eigts1_d(mill_d(1,ig),na) * & eigts2_d(mill_d(2,ig),na) * & eigts3_d(mill_d(3,ig),na) ) aux1_d(ig, nb, 1) = g_d(1, ig) * cfac aux1_d(ig, nb, 2) = g_d(2, ig) * cfac aux1_d(ig, nb, 3) = g_d(3, ig) * cfac ENDDO ENDIF ENDDO ENDDO cpu_gpu_interface.f90 MODULE cpu_gpu_interface INTERFACE add_vuspsi SUBROUTINE add_vuspsi_cpu( lda, n, m, hpsi ) INTEGER :: lda, n, m COMPLEX(DP) :: hpsi(:,:) END SUBROUTINE add_vuspsi_cpu #ifdef USE_CUDA SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi ) INTEGER :: lda, n, m COMPLEX(DP), DEVICE :: hpsi(:,:) END SUBROUTINE add_vuspsi_gpu #endif END INTERFACE END MODULE cpu_gpu_interface add_vuspsi.f90 #ifdef USE_GPU #define MY_ROUTINE(x) x##_gpu #else #define MY_ROUTINE(x) x##_cpu #endif SUBROUTINE MY_ROUTINE(add_vuspsi)( lda, n, m, hpsi )
68
69
Quantum ESPRESSO on GPU accelerated systems
Quantum ESPRESSO on GPU accelerated systems Massimiliano Fatica, Everett Phillips, Josh Romero - NVIDIA Filippo Spiga - University of Cambridge/ARM (UK) MaX International Conference, Trieste, Italy, January
More informationSC13 GPU Technology Theater. Accessing New CUDA Features from CUDA Fortran Brent Leback, Compiler Manager, PGI
SC13 GPU Technology Theater Accessing New CUDA Features from CUDA Fortran Brent Leback, Compiler Manager, PGI The Case for Fortran Clear, straight-forward syntax Successful legacy in the scientific community
More informationCUDA Fortran COMPILERS &TOOLS. Porting Guide
Porting Guide CUDA Fortran CUDA Fortran is the Fortran analog of the NVIDIA CUDA C language for programming GPUs. This guide includes examples of common language features used when porting Fortran applications
More informationPorting Guide. CUDA Fortran COMPILERS &TOOLS
Porting Guide CUDA Fortran COMPILERS &TOOLS 1 Simple Increment Code Host CPU and its memory The cudafor module incudes CUDA Fortran definitions and interfaces to the runtime API The device variable attribute
More informationCUDA Fortran Brent Leback The Portland Group
CUDA Fortran 2013 Brent Leback The Portland Group brent.leback@pgroup.com Why Fortran? Rich legacy in the scientific community Semantics easier to vectorize/parallelize Array descriptors Modules Fortran
More informationPorting Scientific Research Codes to GPUs with CUDA Fortran: Incompressible Fluid Dynamics using the Immersed Boundary Method
Porting Scientific Research Codes to GPUs with CUDA Fortran: Incompressible Fluid Dynamics using the Immersed Boundary Method Josh Romero, Massimiliano Fatica - NVIDIA Vamsi Spandan, Roberto Verzicco -
More informationCUDA 5 Features in PGI CUDA Fortran 2013
第 1 頁, 共 7 頁 Technical News from The Portland Group PGI Home Page March 2013 CUDA 5 Features in PGI CUDA Fortran 2013 by Brent Leback PGI Engineering Manager The 2013 release of PGI CUDA Fortran introduces
More informationIntroduction to Parallel Computing with CUDA. Oswald Haan
Introduction to Parallel Computing with CUDA Oswald Haan ohaan@gwdg.de Schedule Introduction to Parallel Computing with CUDA Using CUDA CUDA Application Examples Using Multiple GPUs CUDA Application Libraries
More informationRegister file. A single large register file (ex. 16K registers) is partitioned among the threads of the dispatched blocks.
Sharing the resources of an SM Warp 0 Warp 1 Warp 47 Register file A single large register file (ex. 16K registers) is partitioned among the threads of the dispatched blocks Shared A single SRAM (ex. 16KB)
More informationGPU Programming. Alan Gray, James Perry EPCC The University of Edinburgh
GPU Programming EPCC The University of Edinburgh Contents NVIDIA CUDA C Proprietary interface to NVIDIA architecture CUDA Fortran Provided by PGI OpenCL Cross platform API 2 NVIDIA CUDA CUDA allows NVIDIA
More informationCUDA Lecture 2. Manfred Liebmann. Technische Universität München Chair of Optimal Control Center for Mathematical Sciences, M17
CUDA Lecture 2 Manfred Liebmann Technische Universität München Chair of Optimal Control Center for Mathematical Sciences, M17 manfred.liebmann@tum.de December 15, 2015 CUDA Programming Fundamentals CUDA
More informationPractical Introduction to CUDA and GPU
Practical Introduction to CUDA and GPU Charlie Tang Centre for Theoretical Neuroscience October 9, 2009 Overview CUDA - stands for Compute Unified Device Architecture Introduced Nov. 2006, a parallel computing
More informationModule 3: CUDA Execution Model -I. Objective
ECE 8823A GPU Architectures Module 3: CUDA Execution Model -I 1 Objective A more detailed look at kernel execution Data to thread assignment To understand the organization and scheduling of threads Resource
More informationLecture 3: Introduction to CUDA
CSCI-GA.3033-004 Graphics Processing Units (GPUs): Architecture and Programming Lecture 3: Introduction to CUDA Some slides here are adopted from: NVIDIA teaching kit Mohamed Zahran (aka Z) mzahran@cs.nyu.edu
More informationGPU Programming. Lecture 2: CUDA C Basics. Miaoqing Huang University of Arkansas 1 / 34
1 / 34 GPU Programming Lecture 2: CUDA C Basics Miaoqing Huang University of Arkansas 2 / 34 Outline Evolvements of NVIDIA GPU CUDA Basic Detailed Steps Device Memories and Data Transfer Kernel Functions
More informationOverview. Lecture 1: an introduction to CUDA. Hardware view. Hardware view. hardware view software view CUDA programming
Overview Lecture 1: an introduction to CUDA Mike Giles mike.giles@maths.ox.ac.uk hardware view software view Oxford University Mathematical Institute Oxford e-research Centre Lecture 1 p. 1 Lecture 1 p.
More informationCME 213 S PRING Eric Darve
CME 213 S PRING 2017 Eric Darve Review Secret behind GPU performance: simple cores but a large number of them; even more threads can exist live on the hardware (10k 20k threads live). Important performance
More informationLab 1 Part 1: Introduction to CUDA
Lab 1 Part 1: Introduction to CUDA Code tarball: lab1.tgz In this hands-on lab, you will learn to use CUDA to program a GPU. The lab can be conducted on the SSSU Fermi Blade (M2050) or NCSA Forge using
More informationACCELERATING HPC APPLICATIONS ON NVIDIA GPUS WITH OPENACC
ACCELERATING HPC APPLICATIONS ON NVIDIA GPUS WITH OPENACC Doug Miles, PGI Compilers & Tools, NVIDIA High Performance Computing Advisory Council February 21, 2018 PGI THE NVIDIA HPC SDK Fortran, C & C++
More informationCUDA Workshop. High Performance GPU computing EXEBIT Karthikeyan
CUDA Workshop High Performance GPU computing EXEBIT- 2014 Karthikeyan CPU vs GPU CPU Very fast, serial, Low Latency GPU Slow, massively parallel, High Throughput Play Demonstration Compute Unified Device
More informationSparse Linear Algebra in CUDA
Sparse Linear Algebra in CUDA HPC - Algorithms and Applications Alexander Pöppl Technical University of Munich Chair of Scientific Computing November 22 nd 2017 Table of Contents Homework - Worksheet 2
More informationGPU Computing Master Class. Development Tools
GPU Computing Master Class Development Tools Generic CUDA debugger goals Support all standard debuggers across all OS Linux GDB, TotalView and DDD Windows Visual studio Mac - XCode Support CUDA runtime
More informationComputational Fluid Dynamics (CFD) using Graphics Processing Units
Computational Fluid Dynamics (CFD) using Graphics Processing Units Aaron F. Shinn Mechanical Science and Engineering Dept., UIUC Accelerators for Science and Engineering Applications: GPUs and Multicores
More informationPROFILER OPENACC TUTORIAL. Version 2018
PROFILER OPENACC TUTORIAL Version 2018 TABLE OF CONTENTS Chapter Chapter Chapter Chapter Chapter 1. 2. 3. 4. 5. Tutorial Setup... 1 Profiling the application... 2 Adding OpenACC directives...4 Improving
More informationScientific Computing with GPUs Autotuning GEMMs Fermi GPUs
Parallel Processing and Applied Mathematics September 11-14, 2011 Toruń, Poland Scientific Computing with GPUs Autotuning GEMMs Fermi GPUs Innovative Computing Laboratory Electrical Engineering and Computer
More informationCUDA programming model. N. Cardoso & P. Bicudo. Física Computacional (FC5)
CUDA programming model N. Cardoso & P. Bicudo Física Computacional (FC5) N. Cardoso & P. Bicudo CUDA programming model 1/23 Outline 1 CUDA qualifiers 2 CUDA Kernel Thread hierarchy Kernel, configuration
More informationPGPROF OpenACC Tutorial
PGPROF OpenACC Tutorial Version 2017 PGI Compilers and Tools TABLE OF CONTENTS Chapter 1. Tutorial Setup...1 Chapter 2. Profiling the application... 2 Chapter 3. Adding OpenACC directives... 4 Chapter
More information nci.org.au @NCInews
@NCInews NCI and Raijin National Computational Infrastructure 2 Our Partners General purpose, highly parallel processors High FLOPs/watt and FLOPs/$ Unit of execution Kernel Separate memory subsystem GPGPU
More informationModule Memory and Data Locality
GPU Teaching Kit Accelerated Computing Module 4.4 - Memory and Data Locality Tiled Matrix Multiplication Kernel Objective To learn to write a tiled matrix-multiplication kernel Loading and using tiles
More informationECE 408 / CS 483 Final Exam, Fall 2014
ECE 408 / CS 483 Final Exam, Fall 2014 Thursday 18 December 2014 8:00 to 11:00 Central Standard Time You may use any notes, books, papers, or other reference materials. In the interest of fair access across
More informationLecture 1: an introduction to CUDA
Lecture 1: an introduction to CUDA Mike Giles mike.giles@maths.ox.ac.uk Oxford University Mathematical Institute Oxford e-research Centre Lecture 1 p. 1 Overview hardware view software view CUDA programming
More informationGPU & High Performance Computing (by NVIDIA) CUDA. Compute Unified Device Architecture Florian Schornbaum
GPU & High Performance Computing (by NVIDIA) CUDA Compute Unified Device Architecture 29.02.2008 Florian Schornbaum GPU Computing Performance In the last few years the GPU has evolved into an absolute
More informationParallel Computing. Lecture 19: CUDA - I
CSCI-UA.0480-003 Parallel Computing Lecture 19: CUDA - I Mohamed Zahran (aka Z) mzahran@cs.nyu.edu http://www.mzahran.com GPU w/ local DRAM (device) Behind CUDA CPU (host) Source: http://hothardware.com/reviews/intel-core-i5-and-i7-processors-and-p55-chipset/?page=4
More informationCUDA Parallelism Model
GPU Teaching Kit Accelerated Computing CUDA Parallelism Model Kernel-Based SPMD Parallel Programming Multidimensional Kernel Configuration Color-to-Grayscale Image Processing Example Image Blur Example
More informationCUDA Fortran. Programming Guide and Reference. Release The Portland Group
CUDA Fortran Programming Guide and Reference Release 2011 The Portland Group While every precaution has been taken in the preparation of this document, The Portland Group (PGI ), a wholly-owned subsidiary
More informationCard Sizes. Tesla K40: 2880 processors; 12 GB memory
Card Sizes Tesla K40: 2880 processors; 12 GB memory Data bigger than grid Maximum grid sizes Compute capability 1.0, 1D and 2D grids supported Compute capability 2, 3, 3D grids too. Grid sizes: 65,535
More informationInformation Coding / Computer Graphics, ISY, LiTH. CUDA memory! ! Coalescing!! Constant memory!! Texture memory!! Pinned memory 26(86)
26(86) Information Coding / Computer Graphics, ISY, LiTH CUDA memory Coalescing Constant memory Texture memory Pinned memory 26(86) CUDA memory We already know... Global memory is slow. Shared memory is
More informationIntroduction to GPU programming. Introduction to GPU programming p. 1/17
Introduction to GPU programming Introduction to GPU programming p. 1/17 Introduction to GPU programming p. 2/17 Overview GPUs & computing Principles of CUDA programming One good reference: David B. Kirk
More informationMPI + X programming. UTK resources: Rho Cluster with GPGPU George Bosilca CS462
MPI + X programming UTK resources: Rho Cluster with GPGPU https://newton.utk.edu/doc/documentation/systems/rhocluster George Bosilca CS462 MPI Each programming paradigm only covers a particular spectrum
More informationIntroduction to Scientific Programming using GPGPU and CUDA
Introduction to Scientific Programming using GPGPU and CUDA Day 1 Sergio Orlandini s.orlandini@cineca.it Mario Tacconi m.tacconi@cineca.it 0 Hands on: Compiling a CUDA program Environment and utility:
More informationCOSC 6374 Parallel Computations Introduction to CUDA
COSC 6374 Parallel Computations Introduction to CUDA Edgar Gabriel Fall 2014 Disclaimer Material for this lecture has been adopted based on various sources Matt Heavener, CS, State Univ. of NY at Buffalo
More informationTesla Architecture, CUDA and Optimization Strategies
Tesla Architecture, CUDA and Optimization Strategies Lan Shi, Li Yi & Liyuan Zhang Hauptseminar: Multicore Architectures and Programming Page 1 Outline Tesla Architecture & CUDA CUDA Programming Optimization
More informationUsing a GPU in InSAR processing to improve performance
Using a GPU in InSAR processing to improve performance Rob Mellors, ALOS PI 152 San Diego State University David Sandwell University of California, San Diego What is a GPU? (Graphic Processor Unit) A graphics
More informationAdvanced hybrid MPI+OpenMP programming. Carlo Cavazzoni SuperComputing Applications and Innovation Department
Advanced hybrid MPI+OpenMP programming Carlo Cavazzoni c.cavazzoni@cineca.it SuperComputing Applications and Innovation Department February 11-15, 2013 Architecture features Floating point units (multiply
More informationMassively Parallel Computing with CUDA. Carlos Alberto Martínez Angeles Cinvestav-IPN
Massively Parallel Computing with CUDA Carlos Alberto Martínez Angeles Cinvestav-IPN What is a GPU? A graphics processing unit (GPU) The term GPU was popularized by Nvidia in 1999 marketed the GeForce
More informationGPU programming basics. Prof. Marco Bertini
GPU programming basics Prof. Marco Bertini CUDA: atomic operations, privatization, algorithms Atomic operations The basics atomic operation in hardware is something like a read-modify-write operation performed
More informationS CUDA on Xavier
S8868 - CUDA on Xavier Anshuman Bhat CUDA Product Manager Saikat Dasadhikari CUDA Engineering 29 th March 2018 1 CUDA ECOSYSTEM 2018 CUDA DOWNLOADS IN 2017 3,500,000 CUDA REGISTERED DEVELOPERS 800,000
More informationLecture 15: Introduction to GPU programming. Lecture 15: Introduction to GPU programming p. 1
Lecture 15: Introduction to GPU programming Lecture 15: Introduction to GPU programming p. 1 Overview Hardware features of GPGPU Principles of GPU programming A good reference: David B. Kirk and Wen-mei
More informationData Parallel Execution Model
CS/EE 217 GPU Architecture and Parallel Programming Lecture 3: Kernel-Based Data Parallel Execution Model David Kirk/NVIDIA and Wen-mei Hwu, 2007-2013 Objective To understand the organization and scheduling
More informationGPU Programming. Ringberg Theorie Seminar 2010
or How to tremendously accelerate your code? Michael Kraus, Christian Konz Max-Planck-Institut für Plasmaphysik, Garching Ringberg Theorie Seminar 2010 Introduction? GPU? GPUs can do more than just render
More informationCUDA Accelerated Linpack on Clusters. E. Phillips, NVIDIA Corporation
CUDA Accelerated Linpack on Clusters E. Phillips, NVIDIA Corporation Outline Linpack benchmark CUDA Acceleration Strategy Fermi DGEMM Optimization / Performance Linpack Results Conclusions LINPACK Benchmark
More informationReal-time Graphics 9. GPGPU
Real-time Graphics 9. GPGPU GPGPU GPU (Graphics Processing Unit) Flexible and powerful processor Programmability, precision, power Parallel processing CPU Increasing number of cores Parallel processing
More informationMathematical computations with GPUs
Master Educational Program Information technology in applications Mathematical computations with GPUs Using GPUs for mathematical problems in Fortran, Java and C# Alexey A. Romanenko arom@ccfit.nsu.ru
More informationLecture 8: GPU Programming. CSE599G1: Spring 2017
Lecture 8: GPU Programming CSE599G1: Spring 2017 Announcements Project proposal due on Thursday (4/28) 5pm. Assignment 2 will be out today, due in two weeks. Implement GPU kernels and use cublas library
More informationENDURING DIFFERENTIATION Timothy Lanfear
ENDURING DIFFERENTIATION Timothy Lanfear WHERE ARE WE? 2 LIFE AFTER DENNARD SCALING GPU-ACCELERATED PERFORMANCE 10 7 40 Years of Microprocessor Trend Data 10 6 10 5 10 4 10 3 10 2 Single-threaded perf
More informationENDURING DIFFERENTIATION. Timothy Lanfear
ENDURING DIFFERENTIATION Timothy Lanfear WHERE ARE WE? 2 LIFE AFTER DENNARD SCALING 10 7 40 Years of Microprocessor Trend Data 10 6 10 5 10 4 Transistors (thousands) 1.1X per year 10 3 10 2 Single-threaded
More informationDense Linear Algebra. HPC - Algorithms and Applications
Dense Linear Algebra HPC - Algorithms and Applications Alexander Pöppl Technical University of Munich Chair of Scientific Computing November 6 th 2017 Last Tutorial CUDA Architecture thread hierarchy:
More informationIntroduction to GPU Computing. 周国峰 Wuhan University 2017/10/13
Introduction to GPU Computing chandlerz@nvidia.com 周国峰 Wuhan University 2017/10/13 GPU and Its Application 3 Ways to Develop Your GPU APP An Example to Show the Developments Add GPUs: Accelerate Science
More informationInformation Coding / Computer Graphics, ISY, LiTH. Introduction to CUDA. Ingemar Ragnemalm Information Coding, ISY
Introduction to CUDA Ingemar Ragnemalm Information Coding, ISY This lecture: Programming model and language Memory spaces and memory access Shared memory Examples Lecture questions: 1. Suggest two significant
More informationA few notes on parallel programming with CUDA
A few notes on parallel programming with CUDA Using parallel computing can significantly speed up execution and in many cases can be quite straightforward to implement. These notes focus on those simple
More informationHigh Performance Computing and GPU Programming
High Performance Computing and GPU Programming Lecture 1: Introduction Objectives C++/CPU Review GPU Intro Programming Model Objectives Objectives Before we begin a little motivation Intel Xeon 2.67GHz
More informationGraph Partitioning. Standard problem in parallelization, partitioning sparse matrix in nearly independent blocks or discretization grids in FEM.
Graph Partitioning Standard problem in parallelization, partitioning sparse matrix in nearly independent blocks or discretization grids in FEM. Partition given graph G=(V,E) in k subgraphs of nearly equal
More informationScientific discovery, analysis and prediction made possible through high performance computing.
Scientific discovery, analysis and prediction made possible through high performance computing. An Introduction to GPGPU Programming Bob Torgerson Arctic Region Supercomputing Center November 21 st, 2013
More informationSupporting Data Parallelism in Matcloud: Final Report
Supporting Data Parallelism in Matcloud: Final Report Yongpeng Zhang, Xing Wu 1 Overview Matcloud is an on-line service to run Matlab-like script on client's web browser. Internally it is accelerated by
More informationGPU Computing Workshop CSU Getting Started. Garland Durham Quantos Analytics
1 GPU Computing Workshop CSU 2013 Getting Started Garland Durham Quantos Analytics nvidia-smi 2 At command line, run command nvidia-smi to get/set GPU properties. nvidia-smi Options: -q query -L list attached
More informationFast Bilateral Filter GPU implementation
Fast Bilateral Filter GPU implementation Multi-Core Architectures and Programming Gerhard Mlady, Rafael Bernardelli Hardware/Software Co-Design, University of Erlangen-Nuremberg July 21, 2016 Overview
More informationOvercoming the Barriers to Sustained Petaflop Performance. William D. Gropp Mathematics and Computer Science
Overcoming the Barriers to Sustained Petaflop Performance William D. Gropp Mathematics and Computer Science www.mcs.anl.gov/~gropp But First Are we too CPU-centric? What about I/O? What do applications
More informationDevice Memories and Matrix Multiplication
Device Memories and Matrix Multiplication 1 Device Memories global, constant, and shared memories CUDA variable type qualifiers 2 Matrix Multiplication an application of tiling runningmatrixmul in the
More informationIntroduction to CUDA C/C++ Mark Ebersole, NVIDIA CUDA Educator
Introduction to CUDA C/C++ Mark Ebersole, NVIDIA CUDA Educator What is CUDA? Programming language? Compiler? Classic car? Beer? Coffee? CUDA Parallel Computing Platform www.nvidia.com/getcuda Programming
More informationCUDA PROGRAMMING MODEL. Carlo Nardone Sr. Solution Architect, NVIDIA EMEA
CUDA PROGRAMMING MODEL Carlo Nardone Sr. Solution Architect, NVIDIA EMEA CUDA: COMMON UNIFIED DEVICE ARCHITECTURE Parallel computing architecture and programming model GPU Computing Application Includes
More informationIntroduction to Numerical General Purpose GPU Computing with NVIDIA CUDA. Part 1: Hardware design and programming model
Introduction to Numerical General Purpose GPU Computing with NVIDIA CUDA Part 1: Hardware design and programming model Dirk Ribbrock Faculty of Mathematics, TU Dortmund 2016 Table of Contents Why parallel
More informationInformation Coding / Computer Graphics, ISY, LiTH. Introduction to CUDA. Ingemar Ragnemalm Information Coding, ISY
Introduction to CUDA Ingemar Ragnemalm Information Coding, ISY This lecture: Programming model and language Introduction to memory spaces and memory access Shared memory Matrix multiplication example Lecture
More informationOutline 2011/10/8. Memory Management. Kernels. Matrix multiplication. CIS 565 Fall 2011 Qing Sun
Outline Memory Management CIS 565 Fall 2011 Qing Sun sunqing@seas.upenn.edu Kernels Matrix multiplication Managing Memory CPU and GPU have separate memory spaces Host (CPU) code manages device (GPU) memory
More informationCS 179: GPU Computing. Recitation 2: Synchronization, Shared memory, Matrix Transpose
CS 179: GPU Computing Recitation 2: Synchronization, Shared memory, Matrix Transpose Synchronization Ideal case for parallelism: no resources shared between threads no communication between threads Many
More informationBatch Linear Algebra for GPU-Accelerated High Performance Computing Environments
Batch Linear Algebra for GPU-Accelerated High Performance Computing Environments Ahmad Abdelfattah, Azzam Haidar, Stanimire Tomov, and Jack Dongarra SIAM Conference on Computational Science and Engineering
More informationnaïve GPU kernels generation from Fortran source code Dmitry Mikushin
KernelGen naïve GPU kernels generation from Fortran source code Dmitry Mikushin Contents Motivation and target Assembling our own toolchain: schemes and details Toolchain usecase: sincos example Development
More informationIntroduction to GPGPUs and to CUDA programming model
Introduction to GPGPUs and to CUDA programming model www.cineca.it Marzia Rivi m.rivi@cineca.it GPGPU architecture CUDA programming model CUDA efficient programming Debugging & profiling tools CUDA libraries
More informationGPU programming. Dr. Bernhard Kainz
GPU programming Dr. Bernhard Kainz Overview About myself Motivation GPU hardware and system architecture GPU programming languages GPU programming paradigms Pitfalls and best practice Reduction and tiling
More informationSolving the heat equation with CUDA
Solving the heat equation with CUDA Oliver Meister January 09 th 2013 Last Tutorial CSR kernel - scalar One row per thread No coalesced memory access Non-uniform matrices CSR kernel - vectorized One row
More informationJosef Pelikán, Jan Horáček CGG MFF UK Praha
GPGPU and CUDA 2012-2018 Josef Pelikán, Jan Horáček CGG MFF UK Praha pepca@cgg.mff.cuni.cz http://cgg.mff.cuni.cz/~pepca/ 1 / 41 Content advances in hardware multi-core vs. many-core general computing
More informationIntroduction to CUDA
Introduction to CUDA Overview HW computational power Graphics API vs. CUDA CUDA glossary Memory model, HW implementation, execution Performance guidelines CUDA compiler C/C++ Language extensions Limitations
More informationUse of Accelerate Tools PGI CUDA FORTRAN Jacket
Use of Accelerate Tools PGI CUDA FORTRAN Jacket Supercomputing Institute For Advanced Computational Research e-mail: szhang@msi.umn.edu or help@msi.umn.edu Tel: 612-624-8858 (direct), 612-626-0802(help)
More informationIntroduction to GPU Computing Using CUDA. Spring 2014 Westgid Seminar Series
Introduction to GPU Computing Using CUDA Spring 2014 WestGrid Seminar Series Scott Northrup SciNet www.scinethpc.ca (Slides http://support.scinet.utoronto.ca/ northrup/westgrid CUDA.pdf) March 12, 2014
More informationHPC Middle East. KFUPM HPC Workshop April Mohamed Mekias HPC Solutions Consultant. Introduction to CUDA programming
KFUPM HPC Workshop April 29-30 2015 Mohamed Mekias HPC Solutions Consultant Introduction to CUDA programming 1 Agenda GPU Architecture Overview Tools of the Trade Introduction to CUDA C Patterns of Parallel
More informationPERFORMANCE ANALYSIS AND DEBUGGING FOR VOLTA. Felix Schmitt 11 th Parallel Tools Workshop September 11-12, 2017
PERFORMANCE ANALYSIS AND DEBUGGING FOR VOLTA Felix Schmitt 11 th Parallel Tools Workshop September 11-12, 2017 INTRODUCING TESLA V100 Volta Architecture Improved NVLink & HBM2 Volta MPS Improved SIMT Model
More informationNAG Fortran Library Routine Document F01CTF.1
NAG Fortran Library Routine Document Note: before using this routine, please read the Users Note for your implementation to check the interpretation of bold italicised terms and other implementation-dependent
More informationIntroduction to GPGPU and GPU-architectures
Introduction to GPGPU and GPU-architectures Henk Corporaal Gert-Jan van den Braak http://www.es.ele.tue.nl/ Contents 1. What is a GPU 2. Programming a GPU 3. GPU thread scheduling 4. GPU performance bottlenecks
More informationGPGPU. Alan Gray/James Perry EPCC The University of Edinburgh.
GPGPU Alan Gray/James Perry EPCC The University of Edinburgh a.gray@ed.ac.uk Contents Introduction GPU Technology Programming GPUs GPU Performance Optimisation 2 Introduction 3 Introduction Central Processing
More informationCS 179 Lecture 4. GPU Compute Architecture
CS 179 Lecture 4 GPU Compute Architecture 1 This is my first lecture ever Tell me if I'm not speaking loud enough, going too fast/slow, etc. Also feel free to give me lecture feedback over email or at
More informationE6895 Advanced Big Data Analytics Lecture 8: GPU Examples and GPU on ios devices
E6895 Advanced Big Data Analytics Lecture 8: GPU Examples and GPU on iOS devices Ching-Yung Lin, Ph.D. Adjunct Professor, Dept. of Electrical Engineering and Computer Science IBM Chief Scientist, Graph
More informationGPU Performance Nuggets
GPU Performance Nuggets Simon Garcia de Gonzalo & Carl Pearson PhD Students, IMPACT Research Group Advised by Professor Wen-mei Hwu Jun. 15, 2016 grcdgnz2@illinois.edu pearson@illinois.edu GPU Performance
More informationBasic Elements of CUDA Algoritmi e Calcolo Parallelo. Daniele Loiacono
Basic Elements of CUDA Algoritmi e Calcolo Parallelo References This set of slides is mainly based on: CUDA Technical Training, Dr. Antonino Tumeo, Pacific Northwest National Laboratory Slide of Applied
More informationTiled Matrix Multiplication
Tiled Matrix Multiplication Basic Matrix Multiplication Kernel __global__ void MatrixMulKernel(int m, int n, int k, float* A, float* B, float* C) { int Row = blockIdx.y*blockDim.y+threadIdx.y;
More informationIntroduction to GPU Computing. Design and Analysis of Parallel Algorithms
Introduction to GPU Computing Design and Analysis of Parallel Algorithms Sources CUDA Programming Guide (3.2) CUDA Best Practices Guide (3.2) CUDA Toolkit Reference Manual (3.2) CUDA SDK Examples Part
More informationLearn CUDA in an Afternoon. Alan Gray EPCC The University of Edinburgh
Learn CUDA in an Afternoon Alan Gray EPCC The University of Edinburgh Overview Introduction to CUDA Practical Exercise 1: Getting started with CUDA GPU Optimisation Practical Exercise 2: Optimising a CUDA
More informationIntroduction to GPU Computing Using CUDA. Spring 2014 Westgid Seminar Series
Introduction to GPU Computing Using CUDA Spring 2014 WestGrid Seminar Series Scott Northrup SciNet www.scinethpc.ca March 13, 2014 Outline 1 Heterogeneous Computing 2 GPGPU - Overview Hardware Software
More informationCUDA Architecture & Programming Model
CUDA Architecture & Programming Model Course on Multi-core Architectures & Programming Oliver Taubmann May 9, 2012 Outline Introduction Architecture Generation Fermi A Brief Look Back At Tesla What's New
More informationKepler Overview Mark Ebersole
Kepler Overview Mark Ebersole TFLOPS TFLOPS 3x Performance in a Single Generation 3.5 3 2.5 2 1.5 1 0.5 0 1.25 1 Single Precision FLOPS (SGEMM) 2.90 TFLOPS.89 TFLOPS.36 TFLOPS Xeon E5-2690 Tesla M2090
More informationIntroduction to CUDA CME343 / ME May James Balfour [ NVIDIA Research
Introduction to CUDA CME343 / ME339 18 May 2011 James Balfour [ jbalfour@nvidia.com] NVIDIA Research CUDA Programming system for machines with GPUs Programming Language Compilers Runtime Environments Drivers
More informationAdrian Tate XK6 / openacc workshop Manno, Mar
Adrian Tate XK6 / openacc workshop Manno, Mar6-7 2012 1 Overview & Philosophy Two modes of usage Contents Present contents Upcoming releases Optimization of libsci_acc Autotuning Adaptation Asynchronous
More information