HMPP port. G. Colin de Verdière (CEA)

1 HMPP port G. Colin de Verdière (CEA)

2 Overview

  - The Uchu prototype
  - HMPP
  - MOD2AM
  - MOD2AS
  - HMPP in a real code

3 The UCHU prototype

  Bull servers:
  - 1 login node (uchu1): Nehalem-EP, 24 GB
  - 4 compute nodes (uchu2-uchu5): 2 Harpertown each, 8 GB
  - 2 NVIDIA Tesla S1070 (each compute node sees two GPUs, GPU0/GPU1)
  - InfiniBand DDR interconnect, Slurm
  Software: gcc/icc/ifort, CUDA 2.3/OpenCL, RapidMind, HMPP 2.2.1sp2

4 The real UCHU [photo of the machine]

5 HMPP

  HMPP is a high-level abstraction for manycore programming:
  - Takes annotated source code (pragmas)
  - Keeps code portable (C & Fortran)
  - Has a low learning curve, yet can reach significant performance
  - Offers many high-level features, e.g. asynchronous data transfers and
    kernel execution
  - Offers target-oriented optimisations
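  Asynchronous execution is expressed with directives too. A minimal sketch
  of the pattern, assuming the HMPP 2.x / OpenHMPP spellings (the codelet
  label work and its arguments are illustrative, not from the slides):

    #pragma hmpp work callsite, asynchronous
    work(n, a, b);                      /* returns immediately; codelet runs on the GPU */
    other_cpu_work();                   /* overlap host computation with the codelet */
    #pragma hmpp work synchronize       /* block until the codelet has completed */
    #pragma hmpp work delegatedstore, args[b]   /* then fetch the result back */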

6 HMPP: compilation workflow

  HMPP-annotated source code is split by the HMPP preprocessor: the
  application source goes to a standard compiler and becomes the host
  application, while each codelet goes through the HMPP compiler and its
  back-end generators (NVIDIA CUDA, AMD CAL/IL, OpenCL), then through the
  target compiler, into an accelerated codelet library. At run time the
  HMPP runtime dispatches work through the target driver to the CPU or GPU.
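  In practice the whole chain is driven by prefixing the usual build
  command with the hmpp driver. Assuming a standard HMPP Workbench
  installation, building one of the kernels below looks something like
  "hmpp gcc -o h.mod2am mod2am.c mxm.c": hmpp extracts the codelets and
  runs the generators, while gcc builds the host part as usual.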

7 HMPP: a small example

  #pragma hmpp sgemm codelet, target=cuda, args[vin1;vin2;vout].io=inout
  void sgemm( int m, int n, int k, float alpha,
              float vin1[n][n], float vin2[n][n],
              float beta, float vout[n][n] )
  {
    /* ... */
  }

  int main(int argc, char **argv)
  {
    /* ... */
    for( j = 0 ; j < NB_RUNS ; j++ ) {
  #pragma hmpp sgemm callsite
      sgemm( size, size, size, alpha, vin1, vin2, beta, vout );
    }
    /* ... */
  }

  The codelet directive declares sgemm with a CUDA target; the callsite
  directive turns the call into a synchronous codelet call.

8 HMPP: a more elaborate version

  int main(int argc, char **argv)
  {
    /* ... */
  #pragma hmpp sgemm allocate
    /* ... */
  #pragma hmpp sgemm advancedload, args[vin1;m;n;k;alpha;beta]   /* preload data */
    /* ... */
    for( j = 0 ; j < 2 ; j++ ) {
  #pragma hmpp sgemm callsite &
  #pragma hmpp sgemm args[m;n;k;alpha;beta;vin1].advancedload=true   /* avoid reloading data */
      sgemm( size, size, size, alpha, vin1, vin2, beta, vout );
      /* ... */
    }
    /* ... */
  #pragma hmpp sgemm release
  }

9 MOD2AM: original code

  void mxm(int m, int l, int n,
           double a[m][l], double b[l][n], double c[m][n])
  {
    int i, j, k, lf, mf;

    mf = m - m % 4;
    for (i = 0; i < mf; i += 4) {
      for (j = 0; j < n; j++) {
        c[i][j] = 0.0;
        c[i + 1][j] = 0.0;
        c[i + 2][j] = 0.0;
        c[i + 3][j] = 0.0;
      }
    }
    for (i = mf; i < m; i++)        /* remainder rows */
      for (j = 0; j < n; j++)
        c[i][j] = 0.0;

    lf = l - l % 4;
    for (i = 0; i < m; i++) {
      for (j = 0; j < lf; j += 4) {
        for (k = 0; k < n; k++) {
          c[i][k] = c[i][k] + a[i][j + 0] * b[j + 0][k]
                            + a[i][j + 1] * b[j + 1][k]
                            + a[i][j + 2] * b[j + 2][k]
                            + a[i][j + 3] * b[j + 3][k];
        }
      }
    }
    for (i = 0; i < m; i++) {
      for (j = lf; j < l; j++) {
        for (k = 0; k < n; k++) {
          c[i][k] = c[i][k] + a[i][j] * b[j][k];
        }
      }
    }
  } // mxm
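  For reference, mxm computes the plain dense product
  c[i][k] = sum over j of a[i][j] * b[j][k]; the code above is this triple
  loop hand-unrolled by four over the middle index j, with explicit
  remainder loops.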

10 MOD2AM: HMPP version, main()

  #pragma hmpp Hmxm advancedload, args[a;b], args[a].size={m,l}, args[b].size={l,n}
    for (i = 0; i < nrep; i++) {
  #pragma hmpp Hmxm callsite, args[a;b].advancedload=true
      mxm(m, l, n, (double (*)[l]) a, (double (*)[n]) b, (double (*)[n]) c);
    }
  #pragma hmpp Hmxm delegatedstore, args[c]
    time = cclock() - time;
    ok = check(m, l, n, (double (*)[n]) c);

11 MOD2AM: mxm.h

  #pragma hmpp Hmxm codelet, args[a;b].io=in, args[c].io=out, TARGET=CUDA, args[a].size={m,l}, args[b].size={l,n}, args[c].size={m,n}
  void mxm(int m, int l, int n,
           double a[m][l], double b[l][n], double c[m][n]);

12 MOD2AM: mxm.c optimized

  #include "mxm.h"

  void mxm(int m, int l, int n,
           const double a[m][l], const double b[l][n], double c[m][n])
  {
  #define HUNROLL 2
    int i, j, k, lf, mf;
    int rest = n % HUNROLL;

    for (i = 0; i < m; i++) {
      for (j = 0; j < n; j++) {
        c[i][j] = 0.0;
      }
    }

13 mxm.c optimized (continued)

  #pragma hmppcg unroll(6), jam(k)
    for (i = 0; i < m; i++) {
  #pragma hmppcg unroll(2), split, noremainder
      for (k = 0; k < n - rest; k++) {
        double prod = 0.0;
        double va, vb;
        j = 0;
        va = a[i][j];              // trick: prefetching of arrays a and b
        vb = b[j][k];
        for (j = 1; j < l; j++) {  // prefetching: index 0 has been loaded
          prod += va * vb;
          va = a[i][j];
          vb = b[j][k];
        }
        prod += va * vb;           // prefetching: don't forget the last index
        c[i][k] += prod;
      }
    }

14 mxm.c optimized (end)

    if (rest) {                    // leftovers using the same trick
      for (i = 0; i < m; i++) {
        for (k = n - rest; k < n; k++) {
          double prod = 0.0;
          double va, vb;
          j = 0;
          va = a[i][j];
          vb = b[j][k];
          for (j = 1; j < l; j++) {
            prod += va * vb;
            va = a[i][j];
            vb = b[j][k];
          }
          prod += va * vb;
          c[i][k] += prod;
        }
      }
    }
  }
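  The prefetching trick of slides 13-14 is ordinary software pipelining:
  the loads for iteration j are issued while the product of iteration j-1
  is accumulated, so load latency hides behind the multiply-add. Distilled
  onto a bare dot product (a standalone sketch, not taken from the slides):

    /* software-pipelined dot product, assumes l >= 1 */
    double dot(int l, const double a[l], const double b[l])
    {
      double prod = 0.0;
      double va = a[0], vb = b[0];   /* preload index 0 */
      for (int j = 1; j < l; j++) {
        prod += va * vb;             /* consume values loaded last iteration */
        va = a[j];                   /* issue the next loads early */
        vb = b[j];
      }
      prod += va * vb;               /* don't forget the last pair */
      return prod;
    }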

15 MOD2AM: results on uchu (78 GF DP peak)

  env HMPPCG_FLAGS="--cuda-block-size 64x1" ccc_mprun ./h.mod2am

  [results table: problem size, Time(s) and Mflop/s for each run; every
  correctness check reports T; the numeric columns are illegible in the
  source]

16 [figure-only slide]

17 MOD2AS: original code

  void spmxv(int ncols, int nrows, int nelmts,
             int indx[nelmts], int rowp[nrows],
             REAL matvals[nelmts], REAL invec[ncols], REAL outvec[nrows])
  {
    int i, j, nrest;

    for (i = 0; i < nrows; i++)
      outvec[i] = (REAL) 0.0;

    for (i = 0; i < nrows - 1; i++) {
      nrest = (rowp[i + 1] - rowp[i]) % 4;
      for (j = rowp[i]; j < rowp[i + 1] - nrest; j += 4) {
        outvec[i] = outvec[i] + matvals[j]     * invec[indx[j]]
                              + matvals[j + 1] * invec[indx[j + 1]]
                              + matvals[j + 2] * invec[indx[j + 2]]
                              + matvals[j + 3] * invec[indx[j + 3]];
      }
      for (j = rowp[i + 1] - nrest; j < rowp[i + 1]; j++) {
        outvec[i] = outvec[i] + matvals[j] * invec[indx[j]];
      }
    }
    for (j = rowp[nrows - 1]; j < nelmts; j++) {
      outvec[nrows - 1] = outvec[nrows - 1] + matvals[j] * invec[indx[j]];
    }
  }
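  The kernel assumes the usual CRS (compressed row storage) layout:
  matvals holds the nonzeros row by row, indx their column indices, and
  rowp[i] the offset of row i's first nonzero; the last row extends to
  nelmts. A small illustrative example (values invented here):

    /* 3x4 matrix            CRS arrays (nrows = 3, nelmts = 6)
       [ 5 0 0 2 ]           matvals = { 5, 2, 3, 1, 4, 6 }
       [ 0 3 0 0 ]           indx    = { 0, 3, 1, 0, 2, 3 }
       [ 1 0 4 6 ]           rowp    = { 0, 2, 3 }            */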

18 MOD2AS: main()

  #define LOOPS 1
    for (j = 0; j < LOOPS; j++) {
      for (i = 0; i < nrows; i++)
        outvec[i] = (REAL) 0.0;
  #pragma hmpp Hspmxv callsite
      spmxv(ncols, nrows, nelmts, indx, rowp, matvals, invec, outvec);
    }
    time = cclock() - time;
    time /= LOOPS;
    ok = check(ncols, nrows, nelmts, indx, rowp, outvec);

19 MOD2AS: spmxv.c

  void spmxv(int ncols, int nrows, int nelmts,
             int indx[nelmts], int rowp[nrows],
             double matvals[nelmts], double invec[ncols], double outvec[nrows])
  {
    int i, j, k;
    int start, end;
    int nrest = 0;
    int warpoff = 0;
    double somme = 0.;
  #define SIZE (32*2)
    double partial[nrows * SIZE];

    for (i = 0; i < nrows; i++) {
      outvec[i] = (double) 0.0;
    }

20 2D Grid

    // do partial sums in parallel, taking warp sizes into account
    for (i = 0; i < nrows; i++) {
      for (warpoff = 0; warpoff < SIZE; warpoff++) {
        start = rowp[i];
        if (i == (nrows - 1)) {
          end = nelmts;
        } else {
          end = rowp[i + 1];
        }
        somme = 0;
        // partial reduction
        for (j = start + warpoff; j < end; j += SIZE) {
          somme += matvals[j] * invec[indx[j]];
        }
        partial[i * SIZE + warpoff] = somme;
      } // for warpoff
    }   // for i

21 1D Grid

    // finish summing
    for (i = 0; i < nrows; i++) {
      somme = 0.;
      for (k = 0; k < SIZE; k++) {
        somme += partial[i * SIZE + k];
      }
      outvec[i] = somme;
    }
  } // spmxv

22 Memory access for mod2as

  [diagram: within row i, SIZE threads in blocks sweep the nonzeros from
  start to end]
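  The point of the stride-SIZE inner loop in slide 20 is memory coalescing.
  With SIZE = 64, consecutive warpoff values map to consecutive threads, so
  at every step of the partial reduction the two warps assigned to a row
  touch 64 consecutive matvals[]/indx[] entries:

    /* row i, nonzeros start..end-1, SIZE = 64:
       thread  0 reads j = start,    start+64,  start+128, ...
       thread  1 reads j = start+1,  start+65,  start+129, ...
       ...
       thread 63 reads j = start+63, start+127, ...
       -> each step is one coalesced 64-wide load            */

  A row-per-thread mapping would instead have each thread walk its row
  sequentially, with neighbouring threads reading addresses far apart.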

23 MOD2AS: results on Uchu

  Program mod2as: Sparse (CRS) Matrix-vector Multiply

  [results table: #Rows, #Cols, %Fill, Time(s), Mflop/s, OK; every
  correctness check reports T; the numeric columns are illegible in the
  source]

24 [figure-only slide]

25 HMPP in a real code

  Test case: 10000x10000, 10 iterations, double precision.

  Runs: Harpertown 2.800 GHz gcc (1626 s), Harpertown 2.800 GHz icc,
  Nehalem-EP 2.933 GHz icc, Tesla T10 1.3 GHz HMPP, Tesla T10 1.3 GHz CUDA
  (run times other than the gcc baseline are illegible in the source).

  [bar charts of speedup ("Accélération"): against the Harpertown/gcc
  baseline, GCC 1, ICC 1.05, HMPP 8.6, CUDA 10.3; against the
  Nehalem-EP/icc baseline, ICC 1, HMPP 5.15, CUDA 6.15]

26 CUDA performance after optimization [plot]

27 HMPP port: some conclusions

  - Kernels are important to understand a language, yet only full
    applications can truly exercise it
  - Only a small part of HMPP has been tested here
  - Porting to HMPP is easy; the difficulty lies in the data movements
  - HMPP allows decent performance in real cases
  - Asynchronous transfers and calls are key for performance
