CUDA Memory Hierarchy

Size: px

Start display at page:

Download "CUDA Memory Hierarchy"

Randell Maxwell
6 years ago
Views:

1 CUDA Memory Hierarchy Piotr Danilewski October 2012 Saarland University

2 Memory GTX 690

3 GTX 690 Memory host memory main GPU memory (global memory) shared memory caches registers

4 Memory host memory GPU global memory SM shared memory shared memory shared memory registers registers registers ALU

5 device code Memory host memory host code GPU global memory SM shared memory shared memory shared memory registers registers registers ALU

6 Memory host memory GPU global memory constant memory texture memory local memory SM texture cache texture cache texture cache constant cache constant cache constant cache shared memory shared memory shared memory registers registers registers ALU

7 device code Memory host memory host code GPU global memory constant memory texture memory local memory SM texture cache texture cache texture cache constant cache constant cache constant cache shared memory shared memory shared memory registers registers registers ALU

8 Memory host memory GPU global memory constant memory texture memory local memory L2 cache SM texture cache texture cache texture cache constant cache constant cache constant cache shared memory L1 cache shared memory L1 cache shared memory L1 cache registers registers registers ALU

9 Memory host memory GPU global memory constant memory texture memory local memory L2 cache SM texture cache texture cache texture cache constant cache constant cache constant cache shared memory L1 cache shared memory L1 cache shared memory L1 cache registers registers registers ALU?

10 global memory Dynamic int* devptr; cudaMalloc(&devptr, sizeof(int)); Static __device__ int devvar; cudaMemcpy( devptr, hostptr, sizeof(int), cudaMemcpyHostToDevice ) cudaMemcpy( hostptr, devptr, sizeof(int), cudaMemcpyDeviceToHost ) cudaMemcpyToSymbol( devvar, &hostvar, sizeof(int), 0, cudaMemcpyHostToDevice ) cudaMemcpyFromSymbol( &hostvar, devvar, sizeof(int), 0, cudaMemcpyDeviceToHost ) cudaFree(devptr) automatically at the end of application

11 global memory. Dynamic: __global__ void add4f(float* u, float* v) { int i = threadIdx.x; u[i] += v[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; float *devu, *devv; cudaMalloc(&devu, size); cudaMalloc(&devv, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); cudaMemcpy(devv, hostv, size, cudaMemcpyHostToDevice); add4f<<<1,4>>>(devu, devv); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devv); cudaFree(devu); Static: __device__ float devu[4]; __device__ float devv[4]; __global__ void adduv() { int i = threadIdx.x; devu[i] += devv[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; cudaMemcpyToSymbol(devu, hostu, size, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(devv, hostv, size, 0, cudaMemcpyHostToDevice); adduv<<<1,4>>>(); cudaMemcpyFromSymbol(hostu, devu, size, 0, cudaMemcpyDeviceToHost);

12 global memory. Dynamic: __global__ void add4f(float* u, float* v) { int i = threadIdx.x; u[i] += v[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; float *devu, *devv; cudaMalloc(&devu, size); cudaMalloc(&devv, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); cudaMemcpy(devv, hostv, size, cudaMemcpyHostToDevice); add4f<<<1,4>>>(devu, devv); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devv); cudaFree(devu); Static: __device__ float devu[4]; __device__ float devv[4]; __global__ void adduv() { int i = threadIdx.x; devu[i] += devv[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; cudaMemcpyToSymbol(devu, hostu, size, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(devv, hostv, size, 0, cudaMemcpyHostToDevice); adduv<<<1,4>>>(); cudaMemcpyFromSymbol(hostu, devu, size, 0, cudaMemcpyDeviceToHost);

13 global memory. Dynamic: __global__ void add4f(float* u, float* v) { int i = threadIdx.x; u[i] += v[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; float *devu, *devv; cudaMalloc(&devu, size); cudaMalloc(&devv, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); cudaMemcpy(devv, hostv, size, cudaMemcpyHostToDevice); add4f<<<1,4>>>(devu, devv); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devv); cudaFree(devu); Static: __device__ float devu[4]; __device__ float devv[4]; __global__ void adduv() { int i = threadIdx.x; devu[i] += devv[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; cudaMemcpyToSymbol(devu, hostu, size, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(devv, hostv, size, 0, cudaMemcpyHostToDevice); adduv<<<1,4>>>(); cudaMemcpyFromSymbol(hostu, devu, size, 0, cudaMemcpyDeviceToHost);

14 global memory. Dynamic: __global__ void add4f(float* u, float* v) { int i = threadIdx.x; u[i] += v[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; float *devu, *devv; cudaMalloc(&devu, size); cudaMalloc(&devv, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); cudaMemcpy(devv, hostv, size, cudaMemcpyHostToDevice); add4f<<<1,4>>>(devu, devv); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devv); cudaFree(devu); Static: __device__ float devu[4]; __device__ float devv[4]; __global__ void adduv() { int i = threadIdx.x; devu[i] += devv[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; cudaMemcpyToSymbol(devu, hostu, size, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(devv, hostv, size, 0, cudaMemcpyHostToDevice); adduv<<<1,4>>>(); cudaMemcpyFromSymbol(hostu, devu, size, 0, cudaMemcpyDeviceToHost);

15 global memory. Dynamic: __global__ void add4f(float* u, float* v) { int i = threadIdx.x; u[i] += v[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; float *devu, *devv; cudaMalloc(&devu, size); cudaMalloc(&devv, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); cudaMemcpy(devv, hostv, size, cudaMemcpyHostToDevice); add4f<<<1,4>>>(devu, devv); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devv); cudaFree(devu); Static: __device__ float devu[4]; __device__ float devv[4]; __global__ void adduv() { int i = threadIdx.x; devu[i] += devv[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; cudaMemcpyToSymbol(devu, hostu, size, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(devv, hostv, size, 0, cudaMemcpyHostToDevice); adduv<<<1,4>>>(); cudaMemcpyFromSymbol(hostu, devu, size, 0, cudaMemcpyDeviceToHost);

16 global memory. Dynamic: __global__ void add4f(float* u, float* v) { int i = threadIdx.x; u[i] += v[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; float *devu, *devv; cudaMalloc(&devu, size); cudaMalloc(&devv, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); cudaMemcpy(devv, hostv, size, cudaMemcpyHostToDevice); add4f<<<1,4>>>(devu, devv); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devv); cudaFree(devu); Static: __device__ float devu[4]; __device__ float devv[4]; __global__ void adduv() { int i = threadIdx.x; devu[i] += devv[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; cudaMemcpyToSymbol(devu, hostu, size, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(devv, hostv, size, 0, cudaMemcpyHostToDevice); adduv<<<1,4>>>(); cudaMemcpyFromSymbol(hostu, devu, size, 0, cudaMemcpyDeviceToHost);

17 global memory. Dynamic: __global__ void add4f(float* u, float* v) { int i = threadIdx.x; u[i] += v[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; float *devu, *devv; cudaMalloc(&devu, size); cudaMalloc(&devv, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); cudaMemcpy(devv, hostv, size, cudaMemcpyHostToDevice); add4f<<<1,4>>>(devu, devv); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devv); cudaFree(devu); Static: __device__ float devu[4]; __device__ float devv[4]; __global__ void adduv() { int i = threadIdx.x; devu[i] += devv[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; cudaMemcpyToSymbol(devu, hostu, size, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(devv, hostv, size, 0, cudaMemcpyHostToDevice); adduv<<<1,4>>>(); cudaMemcpyFromSymbol(hostu, devu, size, 0, cudaMemcpyDeviceToHost);

18 global memory. Dynamic: __global__ void add4f(float* u, float* v) { int i = threadIdx.x; u[i] += v[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; float *devu, *devv; cudaMalloc(&devu, size); cudaMalloc(&devv, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); cudaMemcpy(devv, hostv, size, cudaMemcpyHostToDevice); add4f<<<1,4>>>(devu, devv); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devv); cudaFree(devu); Static: __device__ float devu[4]; __device__ float devv[4]; __global__ void adduv() { int i = threadIdx.x; devu[i] += devv[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; cudaMemcpyToSymbol(devu, hostu, size, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(devv, hostv, size, 0, cudaMemcpyHostToDevice); adduv<<<1,4>>>(); cudaMemcpyFromSymbol(hostu, devu, size, 0, cudaMemcpyDeviceToHost);

19 global memory. Dynamic: __global__ void add4f(float* u, float* v) { int i = threadIdx.x; u[i] += v[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; float *devu, *devv; cudaMalloc(&devu, size); cudaMalloc(&devv, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); cudaMemcpy(devv, hostv, size, cudaMemcpyHostToDevice); add4f<<<1,4>>>(devu, devv); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devv); cudaFree(devu); Static: __device__ float devu[4]; __device__ float devv[4]; __global__ void adduv() { int i = threadIdx.x; devu[i] += devv[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; cudaMemcpyToSymbol(devu, hostu, size, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(devv, hostv, size, 0, cudaMemcpyHostToDevice); adduv<<<1,4>>>(); cudaMemcpyFromSymbol(hostu, devu, size, 0, cudaMemcpyDeviceToHost);

20 global memory. Dynamic: __global__ void add4f(float* u, float* v) { int i = threadIdx.x; u[i] += v[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; float *devu, *devv; cudaMalloc(&devu, size); cudaMalloc(&devv, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); cudaMemcpy(devv, hostv, size, cudaMemcpyHostToDevice); add4f<<<1,4>>>(devu, devv); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devv); cudaFree(devu); Static: __device__ float devu[4]; __device__ float devv[4]; __global__ void adduv() { int i = threadIdx.x; devu[i] += devv[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; cudaMemcpyToSymbol(devu, hostu, size, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(devv, hostv, size, 0, cudaMemcpyHostToDevice); adduv<<<1,4>>>(); cudaMemcpyFromSymbol(hostu, devu, size, 0, cudaMemcpyDeviceToHost);

21 global memory. Dynamic: __global__ void add4f(float* u, float* v) { int i = threadIdx.x; u[i] += v[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; float *devu, *devv; cudaMalloc(&devu, size); cudaMalloc(&devv, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); cudaMemcpy(devv, hostv, size, cudaMemcpyHostToDevice); add4f<<<1,4>>>(devu, devv); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devv); cudaFree(devu); Static: __device__ float devu[4]; __device__ float devv[4]; __global__ void adduv() { int i = threadIdx.x; devu[i] += devv[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; cudaMemcpyToSymbol(devu, hostu, size, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(devv, hostv, size, 0, cudaMemcpyHostToDevice); adduv<<<1,4>>>(); cudaMemcpyFromSymbol(hostu, devu, size, 0, cudaMemcpyDeviceToHost);

22 global memory. Dynamic: __global__ void add4f(float* u, float* v) { int i = threadIdx.x; u[i] += v[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; float *devu, *devv; cudaMalloc(&devu, size); cudaMalloc(&devv, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); cudaMemcpy(devv, hostv, size, cudaMemcpyHostToDevice); add4f<<<1,4>>>(devu, devv); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devv); cudaFree(devu); Static: __device__ float devu[4]; __device__ float devv[4]; __global__ void adduv() { int i = threadIdx.x; devu[i] += devv[i]; } float hostu[4] = {1, 2, 3, 4}; float hostv[4] = {1, 2, 3, 4}; cudaMemcpyToSymbol(devu, hostu, size, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(devv, hostv, size, 0, cudaMemcpyHostToDevice); adduv<<<1,4>>>(); cudaMemcpyFromSymbol(hostu, devu, size, 0, cudaMemcpyDeviceToHost);?

23 host memory GPU global memory constant memory texture memory local memory L2 cache SM texture cache texture cache texture cache constant cache constant cache constant cache shared memory L1 cache shared memory L1 cache shared memory L1 cache registers registers registers ALU

24 constant memory constant cache constant cache constant cache __constant__ int devvar; cudaMemcpyToSymbol( devvar, &hostvar, sizeof(int), 0, cudaMemcpyHostToDevice ) cudaMemcpyFromSymbol( &hostvar, devvar, sizeof(int), 0, cudaMemcpyDeviceToHost ) automatically at the end of application

25 constant memory constant cache constant cache constant cache parameters - up to 4KB [Fermi] [Kepler] Load Uniform [Fermi] [Kepler?] - variable resides in global memory - read-only in the kernel - not dependent on threadIdx

26 host memory GPU global memory constant memory texture memory local memory SM texture cache texture cache texture cache constant cache constant cache constant cache shared memory shared memory shared memory registers registers registers ALU

27 shared memory shared host memory shared memory Static Dynamic - file scope - file scope - device function - device function __shared__ int sharr[4]; extern __shared__ int sharr[]; kernel<<<grid,block,sizeof(int)*4>>>(); size is fixed at compile time size is a run-time parameter all extern __shared__ variables occupy same memory not accessible from host automatically at the end of kernel

28 shared memory shared host memory shared memory Static: __global__ void example(float* u) { int i = threadIdx.x; __shared__ int tmp[4]; tmp[i]=u[i]; __syncthreads(); u[i]=tmp[i]*tmp[i]+tmp[3-i]; } float hostu[4] = {1, 2, 3, 4}; float* devu; cudaMalloc(&devu, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); example<<<1,4>>>(devu); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devu); $u_i = u_i^2 + u_{n-i-1}$ Dynamic: extern __shared__ int tmp[]; __global__ void example(float* u) { int i = threadIdx.x; tmp[i]=u[i]; __syncthreads(); u[i]=tmp[i]*tmp[i]+tmp[3-i]; } float hostu[4] = {1, 2, 3, 4}; float* devu; cudaMalloc(&devu, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); example<<<1,4,size>>>(devu); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devu);

29 shared memory shared host memory shared memory Static: __global__ void example(float* u) { int i = threadIdx.x; __shared__ int tmp[4]; tmp[i]=u[i]; __syncthreads(); u[i]=tmp[i]*tmp[i]+tmp[3-i]; } float hostu[4] = {1, 2, 3, 4}; float* devu; cudaMalloc(&devu, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); example<<<1,4>>>(devu); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devu); $u_i = u_i^2 + u_{n-i-1}$ Dynamic: extern __shared__ int tmp[]; __global__ void example(float* u) { int i = threadIdx.x; tmp[i]=u[i]; __syncthreads(); u[i]=tmp[i]*tmp[i]+tmp[3-i]; } float hostu[4] = {1, 2, 3, 4}; float* devu; cudaMalloc(&devu, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); example<<<1,4,size>>>(devu); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devu);

30 shared memory shared host memory shared memory Static: __global__ void example(float* u) { int i = threadIdx.x; __shared__ int tmp[4]; tmp[i]=u[i]; __syncthreads(); u[i]=tmp[i]*tmp[i]+tmp[3-i]; } float hostu[4] = {1, 2, 3, 4}; float* devu; cudaMalloc(&devu, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); example<<<1,4>>>(devu); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devu); $u_i = u_i^2 + u_{n-i-1}$ Dynamic: extern __shared__ int tmp[]; __global__ void example(float* u) { int i = threadIdx.x; tmp[i]=u[i]; __syncthreads(); u[i]=tmp[i]*tmp[i]+tmp[3-i]; } float hostu[4] = {1, 2, 3, 4}; float* devu; cudaMalloc(&devu, size); cudaMemcpy(devu, hostu, size, cudaMemcpyHostToDevice); example<<<1,4,size>>>(devu); cudaMemcpy(hostu, devu, size, cudaMemcpyDeviceToHost); cudaFree(devu);

31 shared L1 shared L1 shared L1 shared memory shared host memory shared memory memory cache memory cache memory cache few bytes for control parameters - up to 256B [pre-fermi] L1 cache - 16KB, 32KB or 48KB [Fermi] [Kepler] [Kepler] only

32 host memory GPU global memory constant memory texture memory local memory L2 cache SM texture cache texture cache texture cache constant cache constant cache constant cache shared memory L1 cache shared memory L1 cache shared memory L1 cache registers registers registers ALU

33 local memory per-thread, but slow (unless L1 or L2) no explicit keyword used for register spills used when address is used used with recursive functions

34 local memory per-thread, but slow (unless L1 or L2) no explicit keyword used for register spills used when address is used used with recursive functions max regs thread [pre-fermi] [Fermi] [Kepler] /255

35 local memory per-thread, but slow (unless L1 or L2) no explicit keyword used for register spills used when address is used used with recursive functions [pre-fermi] [Fermi] [Kepler] max regs thread /255 max regs SM 8K/16K 32K 64K

36 local memory per-thread, but slow (unless L1 or L2) no explicit keyword used for register spills used when address is used used with recursive functions max regs thread max regs SM max threads SM max regs thread if half [pre-fermi] 128 8K/16K 768/ /32 [Fermi] 63 32K [Kepler] 63/255 64K

37 local memory per-thread, but slow (unless L1 or L2) no explicit keyword used for register spills used when address is used used with recursive functions float x =...; float *px = &x; float x[4]; x[i] =...; float x[1000];

38 local memory per-thread, but slow (unless L1 or L2) no explicit keyword used for register spills used when address is used used with recursive functions

39 host memory GPU global memory constant memory texture memory local memory L2 cache SM texture cache texture cache texture cache constant cache constant cache constant cache shared memory L1 cache shared memory L1 cache shared memory L1 cache registers registers registers ALU

40 texture memory texture cache texture cache texture cache separate cache in TPC [pre-Fermi] or SM [Fermi] [Kepler] 1D, 2D and 3D memory addressing normalized addressing value normalization layered textures and cubemaps [Fermi] [Kepler] handling border (clamp, wrap, mirror) interpolation (nearest, linear)

41 texture memory texture cache texture cache texture cache texture<float, Type, ReadMode> texref; texref.normalized =... texref.addressMode[0] =... texref.filterMode =... float* devptr; cudaMalloc(&devptr,...); cudaChannelFormatDesc channeldesc = cudaCreateChannelDesc<float>(); size_t offset; cudaBindTexture(&offset, texref, devptr, channeldesc, size); Host: use devptr cudaUnbindTexture(texref); cudaFree(devptr);

42 Memory Usage texture memory texture cache texture cache texture cache texture<float, Type, ReadMode> texref; texref.normalized =... texref.addressMode[0] =... texref.filterMode =... Data type: one of char, short, int, long, long long, float, double or vector structures of size 2 or 4 (e.g. short2, float4) Value normalization: cudaReadModeElementType cudaReadModeNormalizedFloat Dimensionality: cudaTextureType1D cudaTextureType2D cudaTextureType3D cudaTextureType1DLayered cudaTextureType2DLayered cudaTextureTypeCubemap default float* devptr; cudaMalloc(&devptr,...); cudaTextureTypeCubemapLayered cudaChannelFormatDesc channeldesc = cudaCreateChannelDesc<float>(); size_t offset; cudaBindTexture(&offset, texref, devptr, channeldesc, size); default Host: use devptr cudaUnbindTexture(texref); cudaFree(devptr);

43 texture memory texture cache texture cache texture cache texture<float, Type, ReadMode> texref; texref.normalized =... texref.addressMode[0] =... texref.filterMode =... more float* devptr; cudaMalloc(&devptr,...); Normalized addressing - false - [0..N-1] - true - [0..1-1/N] normalized addressing Interpolation cudaFilterModePoint default Border handling (per each dimension) cudaAddressModeBorder cudaAddressModeClamp cudaAddressModeWrap cudaAddressModeMirror cudaFilterModeLinear cudaChannelFormatDesc channeldesc = cudaCreateChannelDesc<float>(); size_t offset; cudaBindTexture(&offset, texref, devptr, channeldesc, size); default Host: use devptr cudaUnbindTexture(texref); cudaFree(devptr);

44 texture memory texture cache texture cache texture cache [pre-Fermi] [Fermi] [Kepler] texture<float, Type, ReadMode> texref; [Kepler] cudaTextureDesc texdesc; cudaResourceDesc resdesc; texref.normalized =... texref.addressMode[0] =... texref.filterMode =... cudaCreateTextureObject(...); float* devptr; cudaMalloc(&devptr,...); cudaChannelFormatDesc channeldesc = cudaCreateChannelDesc<float>(); size_t offset; cudaBindTexture(&offset, texref, devptr, channeldesc, size); Host: use devptr cudaUnbindTexture(texref); cudaFree(devptr);

45 Memory Usage texture memory texture cache texture cache texture cache texture<float, Type, ReadMode> texref; cudaBindTexture(&offset, texref, devptr, channeldesc, size); Device code float v = tex1Dfetch(texref, x); float v = tex1D(texref, x); float v = tex2D(texref, x, y); float v = tex3D(texref, x, y, z);... Dimensionality: cudaTextureType1D default cudaTextureType2D cudaTextureType3D cudaTextureType1DLayered cudaTextureType2DLayered cudaTextureTypeCubemap cudaTextureTypeCubemapLayered (address not normalized, no interpolation)

46 Recap host memory GPU global memory constant memory texture memory local memory __device__ cudaMalloc cudaMemcpyToSymbol cudaMemcpy cudaFree __constant__ cudaMemcpyToSymbol texture<...> cudaBindTexture L2 cache SM texture cache texture cache texture cache constant cache constant cache constant cache shared memory L1 cache shared memory L1 cache shared memory __shared__ registers registers registers ALU

47 Not covered accessing host memory from kernel asynchronous memory transfers L1 cache configuration efficient access patterns texture objects [Kepler] cudaarray cudasurface

48 Thank you Questions?

Class. Windows and CUDA : Shu Guo. Program from last time: Constant memory

Class. Windows and CUDA : Shu Guo. Program from last time: Constant memory Class Windows and CUDA : Shu Guo Program from last time: Constant memory Windows on CUDA Reference: NVIDIA CUDA Getting Started Guide for Microsoft Windows Whiting School has Visual Studio Cuda 5.5 Installer