Parallel Numerical Algorithms, Lecture 10: GPU and CUDA
http://sudalab.is.s.u-tokyo.ac.jp/~reiji/pna14/
PNA16 Lecture Plan
General Topics: 1. Architecture and Performance, 2. Dependency, 3. Locality, 4. Scheduling
MIMD / Distributed Memory: 5. MPI: Message Passing Interface, 6. Collective Communication, 7. Distributed Data Structure
MIMD / Shared Memory: 8. OpenMP, 9. Cache Performance
SIMD / Shared Memory: 10. GPU and CUDA, 11. SIMD Performance
Special Lectures: 5/30 How to use FX10 (Prof. Ohshima), 6/6 Dynamic Parallelism (Prof. Peri)
World top 10 supercomputers (http://top500.org/)
Many of the top systems use many-core accelerators such as GPUs and Xeon Phi; the Sunway system has over 10 million cores.
Accelerators
An accelerator is an additional processor attached to the CPU.
Example: a Xeon E7-8890 v4 CPU (3.5 GHz, 24 cores, 845 Gflops) with host memory (102 GB/s), connected over PCI Express (16 GB/s) to a Tesla P100 GPU (1.48 GHz, 3584 cores, 10,600 Gflops) with device memory (720 GB/s).
The GPU offers about 12.5x the peak flops of the CPU and about 7x the memory bandwidth.
The PCI Express bandwidth (16 GB/s) is a relatively small number, but still much faster than a network.
Source of high performance
- More transistors and power devoted to arithmetic
- Slower clock (higher flops/watt)
- Higher parallelism with simpler hardware (not fast on sequential computations)
- Fixed form factor, fixed memory size
GPU Programming Models
- Old days: Direct3D, OpenGL, Ct (specialized for graphics)
- CUDA: extended C language (general purpose, NVIDIA proprietary)
- OpenCL: C++ library (open standard, low level)
- OpenACC, OpenMP 4.0: directive APIs
Simplest CUDA code

    #include <stdio.h>              /* include header files */
    #include <cuda_runtime.h>

    __device__ float c_dev;         /* variable on device memory */

    /* Kernel: a function that runs on the GPU, called from the CPU.
       It can use only its arguments and variables on device memory. */
    __global__ void sum(float a, float b)
    {
        c_dev = a + b;
    }

    int main(void)
    {
        float c_hos;                /* variable on host (CPU) memory */

        sum<<<1,1>>>(1.0f, 2.0f);   /* launch the kernel */

        /* copy data from GPU to CPU */
        cudaMemcpyFromSymbol(&c_hos, c_dev, sizeof(float), 0,
                             cudaMemcpyDeviceToHost);

        /* print result: 3.000000; host code can refer only to variables on host memory */
        printf("%f\n", c_hos);
        return 0;
    }
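This program can be tried as-is on a machine with the CUDA toolkit installed; assuming the file is saved as sum.cu (the file name is just an example), it is compiled with nvcc and run like any other executable:

    nvcc -o sum sum.cu
    ./sum        # prints 3.000000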
A little more detail of the GPU
A graphics card carries the GPU (Graphics Processing Unit) and several banks of device memory (GDDR5), and connects to the host CPU through 16 lanes of PCI Express 3.0. The GPU consists of multiple SMs (Streaming Multiprocessors); each SM contains CUDA cores, registers, and shared memory.
CUDA Cores
- SMs work independently (like CPU cores)
- 32/64 CUDA cores work in SIMD fashion
- A CUDA core is an ALU (arithmetic logic unit) rather than a processor core
- Each SM also contains a scheduler, registers, and shared memory
- 2 sets of 64 CUDA cores are installed in an SM: 2 x 64 = 128 CUDA cores per SM
- Maximum 56 SMs per GPU: up to 56 x 128 = 7168 CUDA cores per GPU
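These per-device numbers can be queried at run time through cudaGetDeviceProperties; a minimal sketch (error checking omitted, device 0 assumed):

    #include <stdio.h>
    #include <cuda_runtime.h>

    int main(void)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, 0);                 /* properties of device 0 */
        printf("name: %s\n", prop.name);
        printf("SMs: %d\n", prop.multiProcessorCount);
        printf("clock: %d kHz\n", prop.clockRate);
        printf("device memory: %zu bytes\n", prop.totalGlobalMem);
        return 0;
    }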
SIMD programming
How to program SIMD parallelism?
1. Vectorization: the programmer writes ordinary code and the compiler converts vectorizable loops into SIMD instructions, e.g.
       for (i = 0; i < N; i++) c[i] = a[i] * b[i];
   becomes (in pseudo-notation)
       for (i = 0; i < N; i += 4) c[i:i+3] = a[i:i+3] * b[i:i+3];
2. SIMT (Single Instruction, Multiple Threads): the CUDA model, shown next.
Simple CUDA code

    #define N 1024

    __device__ float a_dev[N], b_dev[N];

    __global__ void shift(float s)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        b_dev[i] = s + a_dev[i];
    }

    int main(void)
    {
        ...
        cudaMemcpyToSymbol(a_dev, a_hos, sizeof(float) * N, 0,
                           cudaMemcpyHostToDevice);
        shift<<<N/256,256>>>(2.0f);      /* launch 4 blocks of 256 threads each */
        cudaMemcpyFromSymbol(b_hos, b_dev, sizeof(float) * N, 0,
                             cudaMemcpyDeviceToHost);
        ...
    }
Thread Hierarchy
The host program issues memcpys and kernel launches. A kernel launch creates a grid; the grid consists of blocks, and each block consists of threads.
shift<<<4,256>>>(2.0f); launches a grid with 4 blocks of 256 threads each, 1024 threads in total.
Note: 1024 threads is too small for 2880 CUDA cores.
Simple CUDA code (continued)
The line int i = blockIdx.x * blockDim.x + threadIdx.x; computes "who am I":
- gridDim: number of blocks in the grid
- blockIdx: index of my block within the grid
- blockDim: number of threads in the block
- threadIdx: index of my thread within the block
For example, with 4 blocks of 256 threads, the thread with blockIdx.x = 2 and threadIdx.x = 1 computes i = 2 * 256 + 1 = 513.
The statement b_dev[i] = s + a_dev[i]; is then executed by all 1024 threads, one element per thread: a SIMD addition.
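As noted on the Thread Hierarchy slide, 1024 threads are too few to fill a modern GPU. One common way to give every CUDA core enough work for arbitrary N is a grid-stride loop; the sketch below is an illustration rather than part of the lecture code, and the kernel name shift_stride is made up:

    __global__ void shift_stride(float s, const float *a, float *b, int n)
    {
        /* each thread starts at its global index and strides by the total
           number of threads in the grid */
        for (int i = blockIdx.x * blockDim.x + threadIdx.x;
             i < n;
             i += gridDim.x * blockDim.x)
            b[i] = s + a[i];
    }

    /* launch with enough 256-thread blocks to cover n, e.g.             */
    /* shift_stride<<<(n + 255) / 256, 256>>>(2.0f, a_dev, b_dev, n);    */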
Conditional Branch in SIMD
Example 1: if (x < 0) x = -x;
Example 2: if (x < 0) x *= a; else x *= b;
In SIMD execution, all threads step through both paths of the branch; the operations of threads for which the condition does not hold are masked. This is called divergence.
Warp
32 consecutive threads form a warp; a block is divided into warps.
Threads in a warp run in SIMD fashion; different warps can do different operations.
Warp Divergence
Example 1:
    if (i < n) computingA();
    else       computingB();
If n is a multiple of 32, no warp diverges; otherwise, one warp diverges.
Example 2:
    if (p) { if (q) computingA(); else computingB(); }
    else   { if (q) computingC(); else computingD(); }
This causes 4-way divergence: only 1/4 of the CUDA cores are working at any time. Up to 32-way divergence can happen, giving very low utilization of the hardware.
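A practical consequence is that data-dependent branches should, where possible, be arranged so that all threads of a warp take the same path. The two kernels below are illustrative (not from the slides): the first diverges inside every warp because even and odd lanes branch differently, the second branches per warp so each warp stays uniform (assuming the block size is a multiple of 32).

    /* diverges: even and odd lanes of the same warp take different branches */
    __global__ void per_thread_branch(float *x, float a, float b, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            if (i % 2 == 0) x[i] *= a;
            else            x[i] *= b;
        }
    }

    /* no divergence inside a warp: the branch depends only on the warp index */
    __global__ void per_warp_branch(float *x, float a, float b, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        int warp = i / 32;
        if (i < n) {
            if (warp % 2 == 0) x[i] *= a;
            else               x[i] *= b;
        }
    }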
Summary

Hardware        Role            Number
GPU             chip
SM              MIMD core       up to 56 per GPU
Scheduler                       4 per SM
CUDA core       SIMD element    128 per SM

Execution unit  Number                Runs on        Model
Grid            1 per kernel launch   a GPU
Block           specified number      an SM          MIMD
Warp            32 threads            a scheduler    MIMD
Thread          specified number      a CUDA core    SIMD
Memory Hierarchy
- Register (per thread): each thread has a set of registers, visible only from the owner thread
- Local memory (per thread): visible only from the owner thread
- Shared memory (per block): assigned to each block, visible from threads within the block
- Global memory (per program): visible from all threads
Memory hardware
On the graphics card, device memory sits behind an L2 cache; global and local memory live there. Each SM contains the registers, the shared memory, and an L1 cache; registers, shared memory, and (sometimes) local data are therefore held on the SM.
Memory Lifetime
- Register, local, and shared memory: allocated at the start of a block, deallocated at the end of the block
- Device memory: persistent through the application program; allocated and deallocated by the host program
Device memory allocation (in host code)
cudaMalloc(void **p_dev, size_t size)
- The allocated array is not cleared
- Returns an error if physical memory is insufficient
- The starting address is aligned (suitable for textures)
cudaFree(void *p_dev)
- Deallocates device memory allocated by cudaMalloc (or cudaMallocPitch)
Static allocation is also OK: __device__ float array[N];
Data transfer (in host code)
cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind)
- Copies count bytes of data from src to dst
- kind is one of: cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, cudaMemcpyHostToHost, cudaMemcpyDeviceToDevice
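Putting the two previous slides together, a minimal host-side sketch of the allocate/copy/free cycle (error handling reduced to a single check for brevity; the size N is just an example):

    #include <stdio.h>
    #include <cuda_runtime.h>
    #define N 1024

    int main(void)
    {
        float a_hos[N], b_hos[N], *a_dev;
        for (int i = 0; i < N; i++) a_hos[i] = (float)i;

        if (cudaMalloc((void **)&a_dev, sizeof(float) * N) != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed\n");
            return 1;
        }
        cudaMemcpy(a_dev, a_hos, sizeof(float) * N, cudaMemcpyHostToDevice);
        /* ... launch kernels that read and write a_dev ... */
        cudaMemcpy(b_hos, a_dev, sizeof(float) * N, cudaMemcpyDeviceToHost);
        cudaFree(a_dev);
        return 0;
    }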
Register and Local memory (in device code)
- Automatically allocated for auto variables in device code
- Scalar variables are usually placed in registers
- Arrays are usually placed in local memory
Shared Memory (in device code)
- __shared__ qualifier; static allocation of an array: __shared__ float a[NX][NY];
- Dynamic allocation is also possible
- Shared by all threads in a block
- Memory barrier: __syncthreads()
- Slower (~40 cycles) and smaller (64 KB) than the register file (64K x 4 bytes = 256 KB)
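For reference, the dynamic form uses an unsized extern __shared__ array whose size in bytes is passed as the third launch parameter; a small sketch (the kernel name scale_shared is made up for illustration):

    __global__ void scale_shared(float *x, float s)
    {
        extern __shared__ float buf[];       /* size is set at launch time */
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        buf[threadIdx.x] = x[i];
        __syncthreads();
        x[i] = s * buf[threadIdx.x];
    }

    /* third launch parameter = bytes of dynamic shared memory per block   */
    /* scale_shared<<<nblocks, 256, 256 * sizeof(float)>>>(x_dev, 2.0f);   */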
Device Memory (in device code)
- Allocated by the host program, or statically with the __device__ qualifier
- Accessible from all threads
- Memory barrier: a new kernel launch
- Very high throughput on high-end GPUs, on condition that accesses are coalesced
- Latency is large (200 or more cycles)
- L1 cache: ? KB, ~40 cycles; L2 cache: ? KB, ~200 cycles
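Coalesced access means the 32 threads of a warp touch consecutive addresses, so their loads combine into a few wide memory transactions. A sketch contrasting the two patterns (illustrative kernels, not from the slides):

    /* coalesced: thread i reads src[i]; a warp reads 32 consecutive floats */
    __global__ void copy_coalesced(float *dst, const float *src, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) dst[i] = src[i];
    }

    /* strided: thread i reads src[i * stride]; a warp's accesses are scattered */
    __global__ void copy_strided(float *dst, const float *src, int n, int stride)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) dst[i] = src[i * stride];
    }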
Memory fence
- __syncthreads(): synchronizes all threads in a block and makes writes to shared and global memory visible
- A new kernel launch is the way to synchronize all threads in a grid (or careful use of the __threadfence() functions)
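A typical use of __syncthreads() is a per-block reduction in shared memory, where each halving step must see the previous step's writes; a minimal sketch, assuming the kernel is launched with 256 threads per block (a power of two):

    __global__ void block_sum(const float *x, float *block_results)
    {
        __shared__ float buf[256];
        int tid = threadIdx.x;
        buf[tid] = x[blockIdx.x * blockDim.x + tid];
        __syncthreads();                    /* all loads visible before reducing */

        for (int s = blockDim.x / 2; s > 0; s /= 2) {
            if (tid < s) buf[tid] += buf[tid + s];
            __syncthreads();                /* finish each step before the next */
        }
        if (tid == 0) block_results[blockIdx.x] = buf[0];
    }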
Shared memory example

    /* B: tile size, N: matrix dimension; both assumed to be defined elsewhere */
    __global__ void matmul(float *a_dev, float *b_dev, float *c_dev)
    {
        __shared__ float a_sh[B][N], b_sh[N][B];
        /* ilb, jlb: first row and first column of C handled by this block */
        int ilb = blockIdx.x * B, jlb = blockIdx.y * B;
        int i, k, thidx = threadIdx.x;
        float c;

        /* load a B-row strip of A and a B-column strip of B into a_sh, b_sh */
        for (k = 0; k < N; k++) {
            a_sh[thidx][k] = a_dev[(ilb + thidx) * N + k];
            b_sh[k][thidx] = b_dev[k * N + (jlb + thidx)];
        }
        __syncthreads();

        /* each thread computes one column of the B x B output tile */
        for (i = 0; i < B; i++) {
            c = 0.0f;
            for (k = 0; k < N; k++)
                c += a_sh[i][k] * b_sh[k][thidx];
            c_dev[(ilb + i) * N + (jlb + thidx)] = c;
        }
    }
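To launch this kernel, the grid needs one block per B x B tile of C and B threads per block; a hedged example of the call site, assuming N is a multiple of B:

    dim3 grid(N / B, N / B);
    matmul<<<grid, B>>>(a_dev, b_dev, c_dev);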