Zero Copy Memory and Multiple GPUs

Size: px

Start display at page:

Download "Zero Copy Memory and Multiple GPUs"

Edward Andrews
5 years ago
Views:

1 Zero Copy Memory and Multiple GPUs. Goals. Zero copy memory: pinned and mapped memory on the host can be read and written directly from the GPU program (if the device permits this). This may result in performance gains, particularly when the memory is read and written only once. Results may vary between discrete GPUs and integrated GPUs. Portable pinned memory: to see performance gains, all threads must view the memory as pinned, not just the thread that allocated it. There are flags that make pinned memory portable.

2 Timezero - 1 The kernel We use our dot product application once again Our kernel code is unchanged from previous visits Reduction is used to generate partial results The CPU completes the calculation #include "../common/book.h" #define imin(a,b) (a<b?a:b) const int N = 33 * 1024 * 1024; const int threadsperblock = 256; const int blockspergrid = imin( 32, (N+threadsPerBlock-1) / threadsperblock ); global void dot( int size, float *a, float *b, float *c ) { shared float cache[threadsperblock]; int tid = threadidx.x + blockidx.x * blockdim.x; int cacheindex = threadidx.x; float temp = 0; while (tid < size) { temp += a[tid] * b[tid]; tid += blockdim.x * griddim.x; // set the cache values cache[cacheindex] = temp; // synchronize threads in this block syncthreads(); // for reductions, threadsperblock must be a power of 2 // because of the following code int i = blockdim.x/2; while (i!= 0) { if (cacheindex < i) cache[cacheindex] += cache[cacheindex + i]; syncthreads(); i /= 2; if (cacheindex == 0) c[blockidx.x] = cache[0];

3 Timezero - 2 Here is our original version a, b, and partial_c are allocated on the host The device copies are allocated using cudamalloc The data is initialized on the host The timer is started Vectors a and b are copied to the GPU The kernel is called float malloc_test( int size ) { cudaevent_t start, stop; float *a, *b, c, *partial_c; float *dev_a, *dev_b, *dev_partial_c; float elapsedtime; HANDLE_ERROR( cudaeventcreate( &start ) ); HANDLE_ERROR( cudaeventcreate( &stop ) ); // allocate memory on the CPU side a = (float*)malloc( size*sizeof(float) ); b = (float*)malloc( size*sizeof(float) ); partial_c = (float*)malloc( blockspergrid*sizeof(float) ); // allocate the memory on the GPU HANDLE_ERROR( cudamalloc( (void**)&dev_a, size*sizeof(float) ) ); HANDLE_ERROR( cudamalloc( (void**)&dev_b, size*sizeof(float) ) ); HANDLE_ERROR( cudamalloc( (void**)&dev_partial_c, blockspergrid*sizeof(float) ) ); // fill in the host memory with data for (int i=0; i<size; i++) { a[i] = i; b[i] = i*2; HANDLE_ERROR( cudaeventrecord( start, 0 ) ); // copy the arrays 'a' and 'b' to the GPU HANDLE_ERROR( cudamemcpy( dev_a, a, size*sizeof(float), cudamemcpyhosttodevice ) ); HANDLE_ERROR( cudamemcpy( dev_b, b, size*sizeof(float), cudamemcpyhosttodevice ) ); dot<<<blockspergrid,threadsperblock>>>( size, dev_a, dev_b, dev_partial_c );

4 Timezero - 3 Original version The partial results are copied back The elapsed time is found The final result is calculated on the CPU The device memory is freed The host memory is freed The timer events are destroyed The result is printed The elapsed time is returned // copy the array 'c' back from the GPU to the CPU HANDLE_ERROR( cudamemcpy( partial_c, dev_partial_c, blockspergrid*sizeof(float), cudamemcpydevicetohost ) ); HANDLE_ERROR( cudaeventrecord( stop, 0 ) ); HANDLE_ERROR( cudaeventsynchronize( stop ) ); HANDLE_ERROR( cudaeventelapsedtime( &elapsedtime, start, stop ) ); // finish up on the CPU side c = 0; for (int i=0; i<blockspergrid; i++) { c += partial_c[i]; HANDLE_ERROR( cudafree( dev_a ) ); HANDLE_ERROR( cudafree( dev_b ) ); HANDLE_ERROR( cudafree( dev_partial_c ) ); // free memory on the CPU side free( a ); free( b ); free( partial_c ); // free events HANDLE_ERROR( cudaeventdestroy( start ) ); HANDLE_ERROR( cudaeventdestroy( stop ) ); printf( "Value calculated: %f\n", c ); return elapsedtime;

5 Timezero - 4 The zero copy version Create the timers Call cudahostalloc with the added flag cudahostallocmapped This means we will read and write host memory from the GPU Since the memory spaces are different we need to call cudahostgetdevicepointer for each vector There are no data copies We initialize the data the same as before float cuda_host_alloc_test( int size ) { cudaevent_t start, stop; float *a, *b, c, *partial_c; float *dev_a, *dev_b, *dev_partial_c; float elapsedtime; HANDLE_ERROR( cudaeventcreate( &start ) ); HANDLE_ERROR( cudaeventcreate( &stop ) ); // allocate the memory on the CPU HANDLE_ERROR( cudahostalloc( (void**)&a, size*sizeof(float), cudahostallocwritecombined cudahostallocmapped ) ); HANDLE_ERROR( cudahostalloc( (void**)&b, size*sizeof(float), cudahostallocwritecombined cudahostallocmapped ) ); HANDLE_ERROR( cudahostalloc( (void**)&partial_c, blockspergrid*sizeof(float), cudahostallocmapped ) ); // find out the GPU pointers HANDLE_ERROR( cudahostgetdevicepointer( &dev_a, a, 0 ) ); HANDLE_ERROR( cudahostgetdevicepointer( &dev_b, b, 0 ) ); HANDLE_ERROR( cudahostgetdevicepointer( &dev_partial_c, partial_c, 0 ) ); // fill in the host memory with data for (int i=0; i<size; i++) { a[i] = i; b[i] = i*2;

6 Timezero - 5 Zero copy (continued) Start the timer and call the kernel No copy back of results! Get the elapsed time The final result is calculated on the host Memory is freed The timer events are destroyed The result is printed The elapsed time is returned HANDLE_ERROR( cudaeventrecord( start, 0 ) ); dot<<<blockspergrid,threadsperblock>>>( size, dev_a, dev_b, dev_partial_c ); HANDLE_ERROR( cudathreadsynchronize() ); HANDLE_ERROR( cudaeventrecord( stop, 0 ) ); HANDLE_ERROR( cudaeventsynchronize( stop ) ); HANDLE_ERROR( cudaeventelapsedtime( &elapsedtime, start, stop ) ); // finish up on the CPU side c = 0; for (int i=0; i<blockspergrid; i++) { c += partial_c[i]; HANDLE_ERROR( cudafreehost( a ) ); HANDLE_ERROR( cudafreehost( b ) ); HANDLE_ERROR( cudafreehost( partial_c ) ); // free events HANDLE_ERROR( cudaeventdestroy( start ) ); HANDLE_ERROR( cudaeventdestroy( stop ) ); printf( "Value calculated: %f\n", c ); return elapsedtime;

7 Timezero - 6 The main program Make sure mapped memory is supported Set the flag to indicate you are using mapped memory Now run each version of the program For the GPUs used by the authors there was about a 45% speedup int main( void ) { cudadeviceprop prop; int whichdevice; HANDLE_ERROR( cudagetdevice( &whichdevice ) ); HANDLE_ERROR( cudagetdeviceproperties( &prop, whichdevice ) ); if (prop.canmaphostmemory!= 1) { printf( "Device can not map memory.\n" ); return 0; float elapsedtime; HANDLE_ERROR( cudasetdeviceflags( cudadevicemaphost ) ); // try it with malloc elapsedtime = malloc_test( N ); printf( "Time using cudamalloc: %3.1f ms\n", elapsedtime ); // now try it with cudahostalloc elapsedtime = cuda_host_alloc_test( N ); printf( "Time using cudahostalloc: %3.1f ms\n", elapsedtime );

8 When to Use Zero Copy Memory. Discrete GPU: dedicated DRAM, usually on a separate circuit board. Zero copy usually improves performance if the data is read and written only once. However, if the data is read multiple times there will be a significant performance penalty, because the data is NOT cached. Integrated GPU: built into the system's chipset and shares regular CPU memory. Zero copy memory is always a win since it is the same memory, but beware of using too much of it. The CUDA device properties structure (cudaDeviceProp) has a boolean field, integrated, that tells you whether the device is integrated or not. For the dot product program that follows, performance gains were in the 30-40% range by using zero copy.

9 Multidevice - 1 Why multiple devices? You may have a built-in GPU and another GPU on a separate card NVIDIA supports multiple GPUs using SLI (Scalable Link Interface) We use our dot product program yet again There are no changes for the code shown at the right #include "../common/book.h" #define imin(a,b) (a<b?a:b) #define N (33*1024*1024) const int threadsperblock = 256; const int blockspergrid = imin( 32, (N/2+threadsPerBlock-1) / threadsperblock ); global void dot( int size, float *a, float *b, float *c ) { shared float cache[threadsperblock]; int tid = threadidx.x + blockidx.x * blockdim.x; int cacheindex = threadidx.x; float temp = 0; while (tid < size) { temp += a[tid] * b[tid]; tid += blockdim.x * griddim.x; // set the cache values cache[cacheindex] = temp; // synchronize threads in this block syncthreads(); // for reductions, threadsperblock must be a power of 2 // because of the following code int i = blockdim.x/2; while (i!= 0) { if (cacheindex < i) cache[cacheindex] += cache[cacheindex + i]; syncthreads(); i /= 2; if (cacheindex == 0) c[blockidx.x] = cache[0];

10 Multidevice - 2 DataStruct stores the device ID and the size of the vector it is working on Routine method Declare data objects Allocate local and cuda memory Copy vectors a and b to the GPU Call the kernel to do the calculation struct DataStruct { int deviceid; int size; float *a; float *b; float returnvalue; ; void* routine( void *pvoiddata ) { DataStruct *data = (DataStruct*)pvoidData; HANDLE_ERROR( cudasetdevice( data->deviceid ) ); int size = data->size; float *a, *b, c, *partial_c; float *dev_a, *dev_b, *dev_partial_c; // allocate memory on the CPU side a = data->a; b = data->b; partial_c = (float*)malloc( blockspergrid*sizeof(float) ); // allocate the memory on the GPU HANDLE_ERROR( cudamalloc( (void**)&dev_a, size*sizeof(float) ) ); HANDLE_ERROR( cudamalloc( (void**)&dev_b, size*sizeof(float) ) ); HANDLE_ERROR( cudamalloc( (void**)&dev_partial_c, blockspergrid*sizeof(float) ) ); // copy the arrays 'a' and 'b' to the GPU HANDLE_ERROR( cudamemcpy( dev_a, a, size*sizeof(float), cudamemcpyhosttodevice ) ); HANDLE_ERROR( cudamemcpy( dev_b, b, size*sizeof(float), cudamemcpyhosttodevice ) ); dot<<<blockspergrid,threadsperblock>>>( size, dev_a, dev_b, dev_partial_c );

11 Multidevice - 3 Routine (continued) Copy the results back and combine them Then do the usual cleanup // copy the array 'c' back from the GPU to the CPU HANDLE_ERROR( cudamemcpy( partial_c, dev_partial_c, blockspergrid*sizeof(float), cudamemcpydevicetohost ) ); // finish up on the CPU side c = 0; for (int i=0; i<blockspergrid; i++) { c += partial_c[i]; HANDLE_ERROR( cudafree( dev_a ) ); HANDLE_ERROR( cudafree( dev_b ) ); HANDLE_ERROR( cudafree( dev_partial_c ) ); // free memory on the CPU side free( partial_c ); data->returnvalue = c; return 0; Main program Insure at least 2 devices are present Allocate memory on the host for a and b int main( void ) { int devicecount; HANDLE_ERROR( cudagetdevicecount( &devicecount ) ); if (devicecount < 2) { printf( "We need at least two compute 1.0 or greater " "devices, but only found %d\n", devicecount ); return 0; float *a = (float*)malloc( sizeof(float) * N ); HANDLE_NULL( a ); float *b = (float*)malloc( sizeof(float) * N ); HANDLE_NULL( b );

12 Multidevice - 4 Main program (continued) We initialize data as before Assuming two devices, we divide the data in half We call routine twice, once for each data set The thread for data[0] is in a new thread; main is handling data[1] in its own thread When finished, we end the thread handling data[0], clean up, and print the combined results // fill in the host memory with data for (int i=0; i<n; i++) { a[i] = i; b[i] = i*2; // prepare for multithread DataStruct data[2]; data[0].deviceid = 0; data[0].size = N/2; data[0].a = a; data[0].b = b; data[1].deviceid = 1; data[1].size = N/2; data[1].a = a + N/2; data[1].b = b + N/2; CUTThread thread = start_thread( routine, &(data[0]) ); routine( &(data[1]) ); end_thread( thread ); // free memory on the CPU side free( a ); free( b ); printf( "Value calculated: %f\n", data[0].returnvalue + data[1].returnvalue ); return 0;

13 Potential Problems with Pinned Memory. We learned in Ch. 10 how to pin memory on the host. Using the techniques discussed previously, the memory will appear page-locked only to the thread that allocated it. If the pointer to this memory is shared between threads, the other threads will see it as pageable memory. Remedy: allocate the pinned memory as portable. When we call cudaHostAlloc we need to specify the flag cudaHostAllocPortable. As we will see in the program examples, it is possible to combine multiple flags so that host memory is portable, zero copy (mapped), and write-combined.

14 Portable PM - 1 We use our dot product program one last time The code shown to the right is unchanged from previous examples #include "../common/book.h" #define imin(a,b) (a<b?a:b) #define N (33*1024*1024) const int threadsperblock = 256; const int blockspergrid = imin( 32, (N/2+threadsPerBlock-1) / threadsperblock ); global void dot( int size, float *a, float *b, float *c ) { shared float cache[threadsperblock]; int tid = threadidx.x + blockidx.x * blockdim.x; int cacheindex = threadidx.x; float temp = 0; while (tid < size) { temp += a[tid] * b[tid]; tid += blockdim.x * griddim.x; // set the cache values cache[cacheindex] = temp; // synchronize threads in this block syncthreads(); // for reductions, threadsperblock must be a power of 2 // because of the following code int i = blockdim.x/2; while (i!= 0) { if (cacheindex < i) cache[cacheindex] += cache[cacheindex + i]; syncthreads(); i /= 2; if (cacheindex == 0) c[blockidx.x] = cache[0];

15 Portable PM - 2 DataStruct is the same We need to avoid possibly calling cudasetdevice twice (an error) so we use an if statement Since we will use zero copy memory there is no cudamemcpy Rather there is cudahostgetdevicepointer Call the kernel as before struct DataStruct { int deviceid; int size; int offset; float *a; float *b; float returnvalue; ; void* routine( void *pvoiddata ) { DataStruct *data = (DataStruct*)pvoidData; if (data->deviceid!= 0) { HANDLE_ERROR( cudasetdevice( data->deviceid ) ); HANDLE_ERROR( cudasetdeviceflags( cudadevicemaphost ) ); int size = data->size; float *a, *b, c, *partial_c; float *dev_a, *dev_b, *dev_partial_c; // allocate memory on the CPU side a = data->a; b = data->b; partial_c = (float*)malloc( blockspergrid*sizeof(float) ); // allocate the memory on the GPU HANDLE_ERROR( cudahostgetdevicepointer( &dev_a, a, 0 ) ); HANDLE_ERROR( cudahostgetdevicepointer( &dev_b, b, 0 ) ); HANDLE_ERROR( cudamalloc( (void**)&dev_partial_c, blockspergrid*sizeof(float) ) ); // offset 'a' and 'b' to where this GPU is gets it data dev_a += data->offset; dev_b += data->offset; dot<<<blockspergrid,threadsperblock>>>( size, dev_a, dev_b, dev_partial_c );

16 Portable PM - 3 Copy the results back Combine the results Clean up // copy the array 'c' back from the GPU to the CPU HANDLE_ERROR( cudamemcpy( partial_c, dev_partial_c, blockspergrid*sizeof(float), cudamemcpydevicetohost ) ); // finish up on the CPU side c = 0; for (int i=0; i<blockspergrid; i++) { c += partial_c[i]; HANDLE_ERROR( cudafree( dev_partial_c ) ); // free memory on the CPU side free( partial_c ); data->returnvalue = c; return 0; Main program Make sure there are multiple GPU devices int main( void ) { int devicecount; HANDLE_ERROR( cudagetdevicecount( &devicecount ) ); if (devicecount < 2) { printf( "We need at least two compute 1.0 or greater " "devices, but only found %d\n", devicecount ); return 0;

17 Portable PM - 4 Make sure the host memory can be mapped Prepare to set flags Combine three flags in cudahostalloc Initialize the data as before cudadeviceprop prop; for (int i=0; i<2; i++) { HANDLE_ERROR( cudagetdeviceproperties( &prop, i ) ); if (prop.canmaphostmemory!= 1) { printf( "Device %d can not map memory.\n", i ); return 0; float *a, *b; HANDLE_ERROR( cudasetdevice( 0 ) ); HANDLE_ERROR( cudasetdeviceflags( cudadevicemaphost ) ); HANDLE_ERROR( cudahostalloc( (void**)&a, N*sizeof(float), cudahostallocwritecombined cudahostallocportable cudahostallocmapped ) ); HANDLE_ERROR( cudahostalloc( (void**)&b, N*sizeof(float), cudahostallocwritecombined cudahostallocportable cudahostallocmapped ) ); // fill in the host memory with data for (int i=0; i<n; i++) { a[i] = i; b[i] = i*2;

18 Portable PM - 5 Assuming two devices, we divide the data in half We call routine twice, once for each data set The thread for data[0] is in a new thread; main is handling data[1] in its own thread When finished, we end the thread handling data[0], clean up, and print the combined results // prepare for multithread DataStruct data[2]; data[0].deviceid = 0; data[0].offset = 0; data[0].size = N/2; data[0].a = a; data[0].b = b; data[1].deviceid = 1; data[1].offset = N/2; data[1].size = N/2; data[1].a = a; data[1].b = b; CUTThread thread = start_thread( routine, &(data[1]) ); routine( &(data[0]) ); end_thread( thread ); // free memory on the CPU side HANDLE_ERROR( cudafreehost( a ) ); HANDLE_ERROR( cudafreehost( b ) ); printf( "Value calculated: %f\n", data[0].returnvalue + data[1].returnvalue ); return 0;

Zero-copy. Table of Contents. Multi-GPU Learning CUDA to Solve Scientific Problems. Objectives. Technical Issues Zero-copy. Multigpu.

Zero-copy. Table of Contents. Multi-GPU Learning CUDA to Solve Scientific Problems. Objectives. Technical Issues Zero-copy. Multigpu. Table of Contents Multi-GPU Learning CUDA to Solve Scientific Problems. 1 Objectives Miguel Cárdenas Montes 2 Zero-copy Centro de Investigaciones Energéticas Medioambientales y Tecnológicas, Madrid, Spain