CS 677: Parallel Programming for Many-core Processors Lecture 12


1 CS 677: Parallel Programming for Many-core Processors Lecture 12
Instructor: Philippos Mordohai
Webpage: www.cs.stevens.edu/~mordohai
E-mail: Philippos.Mordohai@stevens.edu

2 CS Department Project Poster Day
May 5, 12-2pm (99% confirmed)
Lieb third floor conference room and corridors
5% of total grade as bonus
Suggestion: 9-12 printed pages
Demos would be cool

3 CS Department Project Poster Day
Your name, course number, project title
Project objective: what are you trying to accomplish? What makes this computation worthy of a project?
Method: general description of the method (not of the implementation); suitability for GPU acceleration
Design choices for GPU implementation: workload allocation, use of resources, bottlenecks (avoided and not avoided)
Experimental results, including timings

4 Final Project Presentations
April 29
Send me PPT/PDF file by 5pm
12 min presentation + 2 min Q&A
Counts for 15% of total grade

5 Final Project Presentations
Target audience: fellow classmates
Content:
Problem description: what is the computation and why is it important?
Suitability for GPU acceleration
Amdahl's Law: describe the inherent parallelism. Argue that it is close to 100% of the computation.
Compare with CPU version

6 Final Project Presentations
Content (cont.):
GPU Implementation
Which steps of the algorithm were ported to the GPU?
Workload allocation to threads
Use of resources (registers, shared memory, constant memory, etc.)
Occupancy achieved
Results
Experiments performed
Timings and comparisons against CPU version

7 Final Report
Due May 7 (11:59pm)
6-10 pages including figures, tables and references
Content: see presentation instructions; do not repeat course material
Counts for 20% of total grade
NO LATE SUBMISSIONS

8 Outline
OpenCL Convolution Example
Parallel Min() Example

9 Image Convolution Using OpenCL
Udeepta Bordoloi, ATI Stream Application Engineer, 10/13/2009
Note: ATI Stream Technology is now called AMD Accelerated Parallel Processing (APP) Technology.

10 Step 1 - The Algorithm
Ignore boundaries
Output size: (input_image_width - filter_width + 1) by (input_image_height - filter_width + 1)

11 C Version

void Convolve(float *pInput, float *pFilter, float *pOutput,
              const int nInWidth, const int nWidth, const int nHeight,
              const int nFilterWidth, const int nNumThreads)
{
    for (int yOut = 0; yOut < nHeight; yOut++)
    {
        const int yInTopLeft = yOut;
        for (int xOut = 0; xOut < nWidth; xOut++)
        {
            const int xInTopLeft = xOut;
            float sum = 0;

12 C Version (2)

            for (int r = 0; r < nFilterWidth; r++)
            {
                const int idxFtmp = r * nFilterWidth;
                const int yIn = yInTopLeft + r;
                const int idxIntmp = yIn * nInWidth + xInTopLeft;
                for (int c = 0; c < nFilterWidth; c++)
                {
                    const int idxF = idxFtmp + c;
                    const int idxIn = idxIntmp + c;
                    sum += pFilter[idxF] * pInput[idxIn];
                }
            } //for (int r = 0...

13 C Version (3)

            const int idxOut = yOut * nWidth + xOut;
            pOutput[idxOut] = sum;
        } //for (int xOut = 0...
    } //for (int yOut = 0...
}

14 Parameters

struct paramStruct {
    int nWidth;            // Output image width
    int nHeight;           // Output image height
    int nInWidth;          // Input image width
    int nInHeight;         // Input image height
    int nFilterWidth;      // Filter size is nFilterWidth x nFilterWidth
    int nIterations;       // Run timing loop for nIterations

    // Test CPU performance with 1, 4, 8 etc. OpenMP threads
    std::vector<int> ompThreads;
    int nOmpRuns;          // ompThreads.size()
    bool bCPUTiming;       // Time CPU performance
} params;

15 OpenMP for Comparison

// This #pragma splits the work between multiple threads
#pragma omp parallel for num_threads(nNumThreads)
for (int yOut = 0; yOut < nHeight; yOut++)
...

void InitParams(int argc, char *argv[])
{
    // time the OpenMP convolution performance with
    // different numbers of threads
    params.ompThreads.push_back(4);
    params.ompThreads.push_back(1);
    params.ompThreads.push_back(8);
    params.nOmpRuns = params.ompThreads.size();
}

16 First Kernel

__kernel void Convolve(const __global float *pInput,
                       __constant float *pFilter,
                       __global float *pOutput,
                       const int nInWidth,
                       const int nFilterWidth)
{
    const int nWidth = get_global_size(0);
    const int xOut = get_global_id(0);
    const int yOut = get_global_id(1);
    const int xInTopLeft = xOut;
    const int yInTopLeft = yOut;
    float sum = 0;

17 First Kernel (2)

    for (int r = 0; r < nFilterWidth; r++)
    {
        const int idxFtmp = r * nFilterWidth;
        const int yIn = yInTopLeft + r;
        const int idxIntmp = yIn * nInWidth + xInTopLeft;
        for (int c = 0; c < nFilterWidth; c++)
        {
            const int idxF = idxFtmp + c;
            const int idxIn = idxIntmp + c;
            sum += pFilter[idxF] * pInput[idxIn];
        }
    } //for (int r = 0...
    const int idxOut = yOut * nWidth + xOut;
    pOutput[idxOut] = sum;
}

18 Initialize OpenCL

cl_context context = clCreateContextFromType(…, CL_DEVICE_TYPE_CPU, …);

// get list of devices - quad core counts as one device
size_t listSize;

/* First, get the size of device list */
clGetContextInfo(context, CL_CONTEXT_DEVICES, …, &listSize);

/* Now, allocate the device list */
cl_device_id *devices = (cl_device_id *)malloc(listSize);

/* Next, get the device list data */
clGetContextInfo(context, CL_CONTEXT_DEVICES, listSize, devices, …);

19 Initialize OpenCL (2)

cl_command_queue queue = clCreateCommandQueue(context, devices[0], …);

cl_program program = clCreateProgramWithSource(context, 1, &source, …);
clBuildProgram(program, 1, devices, …);

cl_kernel kernel = clCreateKernel(program, "Convolve", …);

// get error messages
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, …);

20 Initialize Buffers

cl_mem inputCL = clCreateBuffer(context,
                                CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                host_buffer_size, host_buffer_ptr, …);

// If the device is a GPU (CL_DEVICE_TYPE_GPU), we can
// explicitly copy data to the input image buffer on the device:
clEnqueueWriteBuffer(queue, inputCL, …, host_buffer_ptr, …);

// And copy back from the output image buffer after the
// convolution kernel execution.
clEnqueueReadBuffer(queue, outputCL, …, host_buffer_ptr, …);

21 Execute Kernel

/* input buffer, arg 0 */
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputCL);
/* filter buffer, arg 1 */
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&filterCL);
/* output buffer, arg 2 */
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&outputCL);
/* input image width, arg 3 */
clSetKernelArg(kernel, 3, sizeof(int), (void *)&nInWidth);
/* filter width, arg 4 */
clSetKernelArg(kernel, 4, sizeof(int), (void *)&nFilterWidth);

22 Execute Kernel

clEnqueueNDRangeKernel(queue, kernel, data_dimensionality, …,
                       total_work_size, work_group_size, …);

// release all buffers (buffers are cl_mem objects)
clReleaseMemObject(inputCL);
...

// release all resources
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);

23 Timing

clFinish(queue);
//Timer Started here();
for (int i = 0; i < nIterations; i++)
    clEnqueueNDRangeKernel(…);
clFinish(queue);
//Timer Stopped here();
//Average Time = ElapsedTime()/nIterations;

The clFinish() call before both starting and stopping the timer ensures that we time the kernel execution activity to its completion, and nothing else.
Run on a 4-core AMD Phenom, treated as a single device by OpenCL.

24 C++ Bindings

// C API:
cl_context context = clCreateContextFromType(…, CL_DEVICE_TYPE_CPU, …);
// C++ bindings:
cl::Context context = cl::Context(CL_DEVICE_TYPE_CPU);

// C API: get list of devices - quad core counts as one device
size_t listSize;
/* First, get the size of device list */
clGetContextInfo(context, CL_CONTEXT_DEVICES, …, &listSize);
/* Now, allocate the device list */
cl_device_id *devices = (cl_device_id *)malloc(listSize);
/* Next, get the device list data */
clGetContextInfo(context, CL_CONTEXT_DEVICES, listSize, devices, …);
// C++ bindings:
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();

See …

25 C++ Bindings (2)

cl::CommandQueue queue = cl::CommandQueue(context, devices[0]);

cl::Program program = cl::Program(context, …);
program.build(devices);

cl::Kernel kernel = cl::Kernel(program, "Convolve");

string str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);

// Buffer init is similar to C version
// using methods of queue

26 Execute Kernel

/* input buffer, arg 0 */
// C: clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputCL);
kernel.setArg(0, inputCL);

/* filter buffer, arg 1 */
// C: clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&filterCL);
kernel.setArg(1, filterCL);

// etc.

queue.enqueueNDRangeKernel(kernel, …, total_work_size, work_group_size, …);

27 Loop Unrolling

__kernel void Convolve_Unroll(const __global float *pInput,
                              __constant float *pFilter,
                              __global float *pOutput,
                              const int nInWidth,
                              const int nFilterWidth)
{
    const int nWidth = get_global_size(0);
    const int xOut = get_global_id(0);
    const int yOut = get_global_id(1);
    const int xInTopLeft = xOut;
    const int yInTopLeft = yOut;
    float sum = 0;
    for (int r = 0; r < nFilterWidth; r++)
    {
        const int idxFtmp = r * nFilterWidth;
        const int yIn = yInTopLeft + r;
        const int idxIntmp = yIn * nInWidth + xInTopLeft;

28 Loop Unrolling (2)

        int c = 0;
        while (c <= nFilterWidth - 4)
        {
            int idxF = idxFtmp + c;
            int idxIn = idxIntmp + c;
            sum += pFilter[idxF] * pInput[idxIn];
            idxF++; idxIn++;
            sum += pFilter[idxF] * pInput[idxIn];
            idxF++; idxIn++;
            sum += pFilter[idxF] * pInput[idxIn];
            idxF++; idxIn++;
            sum += pFilter[idxF] * pInput[idxIn];
            c += 4;
        }

29 Loop Unrolling (3)

        for (int c1 = c; c1 < nFilterWidth; c1++)  // what does this do?
        {
            const int idxF = idxFtmp + c1;
            const int idxIn = idxIntmp + c1;
            sum += pFilter[idxF] * pInput[idxIn];
        }
    } //for (int r = 0...
    const int idxOut = yOut * nWidth + xOut;
    pOutput[idxOut] = sum;
}

30 Performance

31 Unrolled Kernel 2 (if Kernel)

        // last loop
        int cMod = nFilterWidth - c;
        if (cMod == 1)
        {
            int idxF = idxFtmp + c;
            int idxIn = idxIntmp + c;
            sum += pFilter[idxF] * pInput[idxIn];
        }
        else if (cMod == 2)
        {
            int idxF = idxFtmp + c;
            int idxIn = idxIntmp + c;
            sum += pFilter[idxF] * pInput[idxIn];
            sum += pFilter[idxF + 1] * pInput[idxIn + 1];
        }

32 Unrolled Kernel 2 (2)

        else if (cMod == 3)
        {
            int idxF = idxFtmp + c;
            int idxIn = idxIntmp + c;
            sum += pFilter[idxF] * pInput[idxIn];
            sum += pFilter[idxF + 1] * pInput[idxIn + 1];
            sum += pFilter[idxF + 2] * pInput[idxIn + 2];
        }
    } //for (int r = 0...
    const int idxOut = yOut * nWidth + xOut;
    pOutput[idxOut] = sum;
}

33 Performance
Yet another way to achieve similar results is to write four different versions of the Convolve_Unroll kernel, corresponding to (filterWidth % 4) equalling 0, 1, 2, or 3. The particular version to call can then be decided at run time, depending on the value of filterWidth.

34 Kernel with Invariants
Loop unrolling did not help when the filter width is small.
So far, the kernels have been written generically, so that they work for all filter sizes.
What if we can focus on a particular filter size, e.g. a 5x5 filter?
We can then unroll the inner loop five times and get rid of the loop condition.
If we use the invariant in the loop condition, a good compiler will unroll the loop itself.

35 Kernel with Invariants

__kernel void Convolve_Def(const __global float *pInput,
                           __constant float *pFilter,
                           __global float *pOutput,
                           const int nInWidth,
                           const int nFilterWidth)
{
    const int nWidth = get_global_size(0);
    const int xOut = get_global_id(0);
    const int yOut = get_global_id(1);
    const int xInTopLeft = xOut;
    const int yInTopLeft = yOut;
    float sum = 0;
    for (int r = 0; r < FILTER_WIDTH; r++)
    {
        const int idxFtmp = r * FILTER_WIDTH;
        const int yIn = yInTopLeft + r;
        const int idxIntmp = yIn * nInWidth + xInTopLeft;

36 Kernel with Invariants (2)

        for (int c = 0; c < FILTER_WIDTH; c++)
        {
            const int idxF = idxFtmp + c;
            const int idxIn = idxIntmp + c;
            sum += pFilter[idxF] * pInput[idxIn];
        }
    } //for (int r = 0...
    const int idxOut = yOut * nWidth + xOut;
    pOutput[idxOut] = sum;
}

37 Setting Filter Width

// this can be done online and offline

/* create a cl source string */
std::string sourceStr = Convert-File-To-String(File-Name);
cl::Program::Sources sources(1, std::make_pair(sourceStr.c_str(),
                                               sourceStr.length()));

/* create a cl program object */
program = cl::Program(context, sources);

/* build a cl program executable with some #defines */
char options[128];
sprintf(options, "-DFILTER_WIDTH=%d", filter_width);
program.build(devices, options);

/* create a kernel object for a kernel with the given name */
cl::Kernel kernel = cl::Kernel(program, "Convolve_Def");

38 Performance

39 Performance

40 Performance - Unroll + if on remainder

41 Vectorization

__kernel void Convolve_Unroll(const __global float *pInput,
                              __constant float *pFilter,
                              __global float *pOutput,
                              const int nInWidth,
                              const int nFilterWidth)
{
    const int nWidth = get_global_size(0);
    const int xOut = get_global_id(0);
    const int yOut = get_global_id(1);
    const int xInTopLeft = xOut;
    const int yInTopLeft = yOut;
    float sum0 = 0;
    float sum1 = 0;
    float sum2 = 0;
    float sum3 = 0;
    for (int r = 0; r < nFilterWidth; r++)
    {
        const int idxFtmp = r * nFilterWidth;

42 Vectorization (2)

        const int yIn = yInTopLeft + r;
        const int idxIntmp = yIn * nInWidth + xInTopLeft;
        int c = 0;
        while (c <= nFilterWidth - 4)
        {
            float mul0, mul1, mul2, mul3;
            int idxF = idxFtmp + c;
            int idxIn = idxIntmp + c;
            mul0 = pFilter[idxF] * pInput[idxIn];
            idxF++; idxIn++;
            mul1 = pFilter[idxF] * pInput[idxIn];
            idxF++; idxIn++;
            mul2 = pFilter[idxF] * pInput[idxIn];
            idxF++; idxIn++;
            mul3 = pFilter[idxF] * pInput[idxIn];

43 Vectorization (3)

            sum0 += mul0;
            sum1 += mul1;
            sum2 += mul2;
            sum3 += mul3;
            c += 4;
        }
        for (int c1 = c; c1 < nFilterWidth; c1++)
        {
            const int idxF = idxFtmp + c1;
            const int idxIn = idxIntmp + c1;
            sum0 += pFilter[idxF] * pInput[idxIn];
        }
    } //for (int r = 0...
    const int idxOut = yOut * nWidth + xOut;
    pOutput[idxOut] = sum0 + sum1 + sum2 + sum3;
}

44 Vectorized Kernel

__kernel void Convolve_Float4(const __global float *pInput,
                              __constant float *pFilter,
                              __global float *pOutput,
                              const int nInWidth,
                              const int nFilterWidth)
{
    const int nWidth = get_global_size(0);
    const int xOut = get_global_id(0);
    const int yOut = get_global_id(1);
    const int xInTopLeft = xOut;
    const int yInTopLeft = yOut;
    float4 sum4 = 0;
    for (int r = 0; r < nFilterWidth; r++)
    {
        const int idxFtmp = r * nFilterWidth;
        const int yIn = yInTopLeft + r;
        const int idxIntmp = yIn * nInWidth + xInTopLeft;

45 Vectorized Kernel (2)

        int c = 0;
        int c4 = 0;
        while (c <= nFilterWidth - 4)
        {
            float4 filter4 = vload4(c4, pFilter + idxFtmp);
            float4 in4 = vload4(c4, pInput + idxIntmp);
            sum4 += in4 * filter4;
            c += 4;
            c4++;
        }
        for (int c1 = c; c1 < nFilterWidth; c1++)
        {
            const int idxF = idxFtmp + c1;
            const int idxIn = idxIntmp + c1;
            sum4.x += pFilter[idxF] * pInput[idxIn];
        }
    } //for (int r = 0...
    const int idxOut = yOut * nWidth + xOut;
    pOutput[idxOut] = sum4.x + sum4.y + sum4.z + sum4.w;
}

46 Performance

47 Performance - if Kernel

48 Performance - Kernel with Invariants

49 OpenMP Comparison

50 OpenMP Comparison

51 Parallel Min()
Programming Guide: AMD Accelerated Parallel Processing - OpenCL (November 2013)

52

//
// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
//
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "Timer.h"

#define NDEVS 2

// A parallel min() kernel that works well on CPU and GPU
const char *kernel_source =
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable   \n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable  \n"
"                                                                        \n"
"// 9. The source buffer is accessed as 4-vectors.                       \n"
"                                                                        \n"

53

"__kernel void minp( __global uint4 *src,                  \n"
"                    __global uint  *gmin,                 \n"
"                    __local  uint  *lmin,                 \n"
"                    __global uint  *dbg,                  \n"
"                    int  nitems,                          \n"
"                    uint dev )                            \n"
"{                                                         \n"
"    // 10. Set up global memory access pattern.           \n"
"                                                          \n"
"    uint count  = ( nitems / 4 ) / get_global_size(0);    \n"
"    uint idx    = (dev == 0) ? get_global_id(0) * count   \n"
"                             : get_global_id(0);          \n"
"    uint stride = (dev == 0) ? 1 : get_global_size(0);    \n"
"    uint pmin   = (uint) -1;                              \n"
"                                                          \n"

54

"    // 11. First, compute private min, for this work-item.  \n"
"                                                            \n"
"    for( int n = 0; n < count; n++, idx += stride )         \n"
"    {                                                       \n"
"        pmin = min( pmin, src[idx].x );                     \n"
"        pmin = min( pmin, src[idx].y );                     \n"
"        pmin = min( pmin, src[idx].z );                     \n"
"        pmin = min( pmin, src[idx].w );                     \n"
"    }                                                       \n"
"                                                            \n"
"    // 12. Reduce min values inside work-group.             \n"
"                                                            \n"
"    if( get_local_id(0) == 0 )                              \n"
"        lmin[0] = (uint) -1;                                \n"
"                                                            \n"
"    barrier( CLK_LOCAL_MEM_FENCE );                         \n"
"                                                            \n"
"    (void) atom_min( lmin, pmin );                          \n"
"                                                            \n"
"    barrier( CLK_LOCAL_MEM_FENCE );                         \n"
"                                                            \n"

55

"    // Write out to global.                   \n"
"                                              \n"
"    if( get_local_id(0) == 0 )                \n"
"        gmin[ get_group_id(0) ] = lmin[0];    \n"
"                                              \n"
"    // Dump some debug information.           \n"
"                                              \n"
"    if( get_global_id(0) == 0 )               \n"
"    {                                         \n"
"        dbg[0] = get_num_groups(0);           \n"
"        dbg[1] = get_global_size(0);          \n"
"        dbg[2] = count;                       \n"
"        dbg[3] = stride;                      \n"
"    }                                         \n"
"}                                             \n"
"                                              \n"

56

"// 13. Reduce work-group min values from __global to __global.  \n"
"                                                                \n"
"__kernel void reduce( __global uint4 *src,                      \n"
"                      __global uint  *gmin )                    \n"
"{                                                               \n"
"    (void) atom_min( gmin, gmin[get_global_id(0)] );            \n"
"}                                                               \n";

57

int main(int argc, char **argv)
{
    cl_platform_id platform;
    int dev, nw;
    cl_device_type devs[NDEVS] = { CL_DEVICE_TYPE_CPU,
                                   CL_DEVICE_TYPE_GPU };

    cl_uint *src_ptr;
    unsigned int num_src_items = 4096 * 4096;

    // 1. quick & dirty MWC random init of source buffer.
    // Random seed (portable).
    time_t ltime;
    time(&ltime);

    src_ptr = (cl_uint *)malloc(num_src_items * sizeof(cl_uint));

    cl_uint a = (cl_uint)ltime,
            b = (cl_uint)ltime;
    cl_uint min = (cl_uint)-1;

58

    // Do serial computation of min() for result verification.
    for (int i = 0; i < num_src_items; i++)
    {
        src_ptr[i] = (cl_uint)(b = (a * (b & …)) + (b >> 16));
        min = src_ptr[i] < min ? src_ptr[i] : min;
    }

    // Get a platform.
    clGetPlatformIDs(1, &platform, NULL);

    // 3. Iterate over devices.
    for (dev = 0; dev < NDEVS; dev++)
    {
        cl_device_id device;
        cl_context context;
        cl_command_queue queue;

59

        cl_program program;
        cl_kernel minp;
        cl_kernel reduce;
        cl_mem src_buf;
        cl_mem dst_buf;
        cl_mem dbg_buf;
        cl_uint *dst_ptr, *dbg_ptr;

        printf("\n%s: ", dev == 0 ? "CPU" : "GPU");

        // Find the device.
        clGetDeviceIDs(platform, devs[dev], 1, &device, NULL);

60

        // 4. Compute work sizes.
        cl_uint compute_units;
        size_t global_work_size;
        size_t local_work_size;
        size_t num_groups;

        clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
                        sizeof(cl_uint), &compute_units, NULL);

        if (devs[dev] == CL_DEVICE_TYPE_CPU)
        {
            global_work_size = compute_units * 1;  // 1 thread per core
            local_work_size = 1;
        }

61

        // Wavefront = CUDA warp; currently has 64 work-items
        else
        {
            cl_uint ws = 64;
            global_work_size = compute_units * 7 * ws;  // 7 wavefronts per SIMD
            while ((num_src_items / 4) % global_work_size != 0)
                global_work_size += ws;
            local_work_size = ws;
        }

        num_groups = global_work_size / local_work_size;

        // Create a context and command queue on that device.
        context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
        queue = clCreateCommandQueue(context, device, 0, NULL);

62

        // Minimal error check.
        if (queue == NULL)
        {
            printf("compute device setup failed\n");
            return (-1);
        }

        // Perform runtime source compilation, and obtain kernel entry point.
        program = clCreateProgramWithSource(context, 1, &kernel_source,
                                            NULL, NULL);

        // 5. Print compiler error messages (SKIPPED)

63

        minp   = clCreateKernel(program, "minp", NULL);
        reduce = clCreateKernel(program, "reduce", NULL);

        // Create input, output and debug buffers.
        src_buf = clCreateBuffer(context,
                                 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 num_src_items * sizeof(cl_uint),
                                 src_ptr, NULL);

        dst_buf = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                 num_groups * sizeof(cl_uint),
                                 NULL, NULL);

        dbg_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                 global_work_size * sizeof(cl_uint),
                                 NULL, NULL);

64

        clSetKernelArg(minp, 0, sizeof(void *),        (void *)&src_buf);
        clSetKernelArg(minp, 1, sizeof(void *),        (void *)&dst_buf);
        clSetKernelArg(minp, 2, 1 * sizeof(cl_uint),   (void *)NULL);
        clSetKernelArg(minp, 3, sizeof(void *),        (void *)&dbg_buf);
        clSetKernelArg(minp, 4, sizeof(num_src_items), (void *)&num_src_items);
        clSetKernelArg(minp, 5, sizeof(dev),           (void *)&dev);

        clSetKernelArg(reduce, 0, sizeof(void *), (void *)&src_buf);
        clSetKernelArg(reduce, 1, sizeof(void *), (void *)&dst_buf);

        CPerfCounter t;
        t.Reset();
        t.Start();

65

        // 6. Main timing loop.
        #define NLOOPS 500

        cl_event ev;
        int nloops = NLOOPS;

        while (nloops--)
        {
            clEnqueueNDRangeKernel(queue, minp, 1, NULL,
                                   &global_work_size, &local_work_size,
                                   0, NULL, &ev);
            clEnqueueNDRangeKernel(queue, reduce, 1, NULL,
                                   &num_groups, NULL,
                                   1, &ev, NULL);
        }

66

        clFinish(queue);
        t.Stop();

        printf("B/W %.2f GB/sec, ",
               ((float)num_src_items * sizeof(cl_uint) * NLOOPS) /
               t.GetElapsedTime() / 1e9);

        // 7. Look at the results via synchronous buffer map.
        dst_ptr = (cl_uint *)clEnqueueMapBuffer(queue, dst_buf,
                                                CL_TRUE, CL_MAP_READ, 0,
                                                num_groups * sizeof(cl_uint),
                                                0, NULL, NULL, NULL);

        dbg_ptr = (cl_uint *)clEnqueueMapBuffer(queue, dbg_buf,
                                                CL_TRUE, CL_MAP_READ, 0,
                                                global_work_size * sizeof(cl_uint),
                                                0, NULL, NULL, NULL);

67

        // 8. Print some debug info.
        printf("%d groups, %d threads, count %d, stride %d\n",
               dbg_ptr[0], dbg_ptr[1], dbg_ptr[2], dbg_ptr[3]);

        if (dst_ptr[0] == min)
            printf("result correct\n");
        else
            printf("result INcorrect\n");

    } // iterate over devices

    printf("\n");
    return 0;
}

68 Binary Search
Design a GPU-friendly binary search algorithm.
Assume the input array is sorted and enormous.


GPU acceleration on IB clusters. Sadaf Alam Jeffrey Poznanovic Kristopher Howard Hussein Nasser El-Harake GPU acceleration on IB clusters Sadaf Alam Jeffrey Poznanovic Kristopher Howard Hussein Nasser El-Harake HPC Advisory Council European Workshop 2011 Why it matters? (Single node GPU acceleration) Control

More information

Martin Kruliš, v

Martin Kruliš, v Martin Kruliš 1 GPGPU History Current GPU Architecture OpenCL Framework Example Optimizing Previous Example Alternative Architectures 2 1996: 3Dfx Voodoo 1 First graphical (3D) accelerator for desktop

More information

Introduction to OpenCL (utilisation des transparents de Cliff Woolley, NVIDIA) intégré depuis à l initiative Kite

Introduction to OpenCL (utilisation des transparents de Cliff Woolley, NVIDIA) intégré depuis à l initiative Kite Introduction to OpenCL (utilisation des transparents de Cliff Woolley, NVIDIA) intégré depuis à l initiative Kite riveill@unice.fr http://www.i3s.unice.fr/~riveill What is OpenCL good for Anything that

More information

OPENCL C++ Lee Howes AMD Senior Member of Technical Staff, Stream Computing

OPENCL C++ Lee Howes AMD Senior Member of Technical Staff, Stream Computing OPENCL C++ Lee Howes AMD Senior Member of Technical Staff, Stream Computing Benedict Gaster AMD Principle Member of Technical Staff, AMD Research (now at Qualcomm) OPENCL TODAY WHAT WORKS, WHAT DOESN T

More information

Martin Kruliš, v

Martin Kruliš, v Martin Kruliš 1 GPGPU History Current GPU Architecture OpenCL Framework Example (and its Optimization) Alternative Frameworks Most Recent Innovations 2 1996: 3Dfx Voodoo 1 First graphical (3D) accelerator

More information

OpenCL API. OpenCL Tutorial, PPAM Dominik Behr September 13 th, 2009

OpenCL API. OpenCL Tutorial, PPAM Dominik Behr September 13 th, 2009 OpenCL API OpenCL Tutorial, PPAM 2009 Dominik Behr September 13 th, 2009 Host and Compute Device The OpenCL specification describes the API and the language. The OpenCL API, is the programming API available

More information

SYCL: An Abstraction Layer for Leveraging C++ and OpenCL

SYCL: An Abstraction Layer for Leveraging C++ and OpenCL SYCL: An Abstraction Layer for Leveraging C++ and OpenCL Alastair Murray Compiler Research Engineer, Codeplay Visit us at www.codeplay.com 45 York Place Edinburgh EH1 3HP United Kingdom Overview SYCL for

More information

OpenCL Training Course

OpenCL Training Course OpenCL Training Course Intermediate Level Class http://www.ksc.re.kr http://webedu.ksc.re.kr INDEX 1. Class introduction 2. Multi-platform and multi-device 3. OpenCL APIs in detail 4. OpenCL C language

More information

Scientific Computing WS 2017/2018. Lecture 27. Jürgen Fuhrmann Lecture 27 Slide 1

Scientific Computing WS 2017/2018. Lecture 27. Jürgen Fuhrmann Lecture 27 Slide 1 Scientific Computing WS 2017/2018 Lecture 27 Jürgen Fuhrmann juergen.fuhrmann@wias-berlin.de Lecture 27 Slide 1 Lecture 27 Slide 2 Why parallelization? Computers became faster and faster without that...

More information

Programming paradigms for hybrid architecture. Piero Lanucara, SCAI

Programming paradigms for hybrid architecture. Piero Lanucara, SCAI Programming paradigms for hybrid architecture Piero Lanucara, SCAI p.lanucara@cineca.it From CUDA to OpenCL Let s start from a simple CUDA code (matrixmul from NVIDIA CUDA samples). Now, you perfectly

More information

OpenCL Overview Benedict R. Gaster, AMD

OpenCL Overview Benedict R. Gaster, AMD Copyright Khronos Group, 2011 - Page 1 OpenCL Overview Benedict R. Gaster, AMD March 2010 The BIG Idea behind OpenCL OpenCL execution model - Define N-dimensional computation domain - Execute a kernel

More information

NVIDIA OpenCL JumpStart Guide. Technical Brief

NVIDIA OpenCL JumpStart Guide. Technical Brief NVIDIA OpenCL JumpStart Guide Technical Brief Version 1.0 February 19, 2010 Introduction The purposes of this guide are to assist developers who are familiar with CUDA C/C++ development and want to port

More information

Advanced OpenCL Event Model Usage

Advanced OpenCL Event Model Usage Advanced OpenCL Event Model Usage Derek Gerstmann University of Western Australia http://local.wasp.uwa.edu.au/~derek OpenCL Event Model Usage Outline Execution Model Usage Patterns Synchronisation Event

More information

APARAPI Java platform s Write Once Run Anywhere now includes the GPU. Gary Frost AMD PMTS Java Runtime Team

APARAPI Java platform s Write Once Run Anywhere now includes the GPU. Gary Frost AMD PMTS Java Runtime Team APARAPI Java platform s Write Once Run Anywhere now includes the GPU Gary Frost AMD PMTS Java Runtime Team AGENDA The age of heterogeneous computing is here The supercomputer in your desktop/laptop Why

More information

Scientific Computing WS 2018/2019. Lecture 25. Jürgen Fuhrmann Lecture 25 Slide 1

Scientific Computing WS 2018/2019. Lecture 25. Jürgen Fuhrmann Lecture 25 Slide 1 Scientific Computing WS 2018/2019 Lecture 25 Jürgen Fuhrmann juergen.fuhrmann@wias-berlin.de Lecture 25 Slide 1 Lecture 25 Slide 2 SIMD Hardware: Graphics Processing Units ( GPU) [Source: computing.llnl.gov/tutorials]

More information

OpenCL Introduction. Acknowledgements. Frédéric Desprez. Simon Mc Intosh-Smith (Univ. of Bristol) Tom Deakin (Univ. of Bristol)

OpenCL Introduction. Acknowledgements. Frédéric Desprez. Simon Mc Intosh-Smith (Univ. of Bristol) Tom Deakin (Univ. of Bristol) OpenCL Introduction Frédéric Desprez INRIA Grenoble Rhône-Alpes/LIG Corse team Simulation par ordinateur des ondes gravitationnelles produites lors de la fusion de deux trous noirs. Werner Benger, CC BY-SA

More information

Sistemi Operativi e Reti

Sistemi Operativi e Reti Sistemi Operativi e Reti GPGPU Computing: the multi/many core computing era Dipartimento di Matematica e Informatica Corso di Laurea Magistrale in Informatica Osvaldo Gervasi ogervasi@computer.org 1 2

More information

A Case for Better Integration of Host and Target Compilation When Using OpenCL for FPGAs

A Case for Better Integration of Host and Target Compilation When Using OpenCL for FPGAs A Case for Better Integration of Host and Target Compilation When Using OpenCL for FPGAs Taylor Lloyd, Artem Chikin, Erick Ochoa, Karim Ali, José Nelson Amaral University of Alberta Sept 7 FSP 2017 1 University

More information

OpenCL Events. Mike Bailey. Oregon State University. OpenCL Events

OpenCL Events. Mike Bailey. Oregon State University. OpenCL Events 1 OpenCL Events Mike Bailey mjb@cs.oregonstate.edu opencl.events.pptx OpenCL Events 2 An event is an object that communicates the status of OpenCL commands Event Read Buffer dc Execute Kernel Write Buffer

More information

OpenCL Events. Mike Bailey. Computer Graphics opencl.events.pptx

OpenCL Events. Mike Bailey. Computer Graphics opencl.events.pptx 1 OpenCL Events This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License Mike Bailey mjb@cs.oregonstate.edu opencl.events.pptx OpenCL Events 2 An

More information

OpenCL / OpenGL Texture Interoperability: An Image Blurring Case Study

OpenCL / OpenGL Texture Interoperability: An Image Blurring Case Study 1 OpenCL / OpenGL Texture Interoperability: An Image Blurring Case Study Mike Bailey mjb@cs.oregonstate.edu opencl.opengl.rendertexture.pptx OpenCL / OpenGL Texture Interoperability: The Basic Idea 2 Application

More information

Copyright 2013 by Yong Cao, Referencing UIUC ECE408/498AL Course Notes. OpenCL. OpenCL

Copyright 2013 by Yong Cao, Referencing UIUC ECE408/498AL Course Notes. OpenCL. OpenCL OpenCL OpenCL What is OpenCL? Ø Cross-platform parallel computing API and C-like language for heterogeneous computing devices Ø Code is portable across various target devices: Ø Correctness is guaranteed

More information

The resurgence of parallel programming languages

The resurgence of parallel programming languages The resurgence of parallel programming languages Jamie Hanlon & Simon McIntosh-Smith University of Bristol Microelectronics Research Group hanlon@cs.bris.ac.uk 1 The Microelectronics Research Group at

More information

Making OpenCL Simple with Haskell. Benedict R. Gaster January, 2011

Making OpenCL Simple with Haskell. Benedict R. Gaster January, 2011 Making OpenCL Simple with Haskell Benedict R. Gaster January, 2011 Attribution and WARNING The ideas and work presented here are in collaboration with: Garrett Morris (AMD intern 2010 & PhD student Portland

More information

OPENCL GPU BEST PRACTICES BENJAMIN COQUELLE MAY 2015

OPENCL GPU BEST PRACTICES BENJAMIN COQUELLE MAY 2015 OPENCL GPU BEST PRACTICES BENJAMIN COQUELLE MAY 2015 TOPICS Data transfer Parallelism Coalesced memory access Best work group size Occupancy branching All the performance numbers come from a W8100 running

More information

Many-core Processors Lecture 11. Instructor: Philippos Mordohai Webpage:

Many-core Processors Lecture 11. Instructor: Philippos Mordohai Webpage: 1 CS 677: Parallel Programming for Many-core Processors Lecture 11 Instructor: Philippos Mordohai Webpage: www.cs.stevens.edu/~mordohai E-mail: Philippos.Mordohai@stevens.edu Outline More CUDA Libraries

More information

Scientific Computing WS 2017/2018. Lecture 28. Jürgen Fuhrmann Lecture 28 Slide 1

Scientific Computing WS 2017/2018. Lecture 28. Jürgen Fuhrmann Lecture 28 Slide 1 Scientific Computing WS 2017/2018 Lecture 28 Jürgen Fuhrmann juergen.fuhrmann@wias-berlin.de Lecture 28 Slide 1 SIMD Hardware: Graphics Processing Units ( GPU) [Source: computing.llnl.gov/tutorials] Principle

More information

Computer Architecture

Computer Architecture Jens Teubner Computer Architecture Summer 2017 1 Computer Architecture Jens Teubner, TU Dortmund jens.teubner@cs.tu-dortmund.de Summer 2017 Jens Teubner Computer Architecture Summer 2017 34 Part II Graphics

More information

To Co-Run, or Not To Co-Run: A Performance Study on Integrated Architectures

To Co-Run, or Not To Co-Run: A Performance Study on Integrated Architectures To Co-Run, or Not To Co-Run: A Performance Study on Integrated Architectures Feng Zhang, Jidong Zhai, Wenguang Chen Tsinghua University, Beijing, 100084, China Bingsheng He and Shuhao Zhang Nanyang Technological

More information

GPU COMPUTING RESEARCH WITH OPENCL

GPU COMPUTING RESEARCH WITH OPENCL GPU COMPUTING RESEARCH WITH OPENCL Studying Future Workloads and Devices Perhaad Mistry, Dana Schaa, Enqiang Sun, Rafael Ubal, Yash Ukidave, David Kaeli Dept of Electrical and Computer Engineering Northeastern

More information

OpenCL Base Course Ing. Marco Stefano Scroppo, PhD Student at University of Catania

OpenCL Base Course Ing. Marco Stefano Scroppo, PhD Student at University of Catania OpenCL Base Course Ing. Marco Stefano Scroppo, PhD Student at University of Catania Course Overview This OpenCL base course is structured as follows: Introduction to GPGPU programming, parallel programming

More information

The Open Computing Language (OpenCL)

The Open Computing Language (OpenCL) 1 OpenCL The Open Computing Language (OpenCL) OpenCL consists of two parts: a C/C++-callable API and a C-ish programming language. Also go look at the files first.cpp and first.cl! Mike Bailey mjb@cs.oregonstate.edu

More information

Lecture Topic: An Overview of OpenCL on Xeon Phi

Lecture Topic: An Overview of OpenCL on Xeon Phi C-DAC Four Days Technology Workshop ON Hybrid Computing Coprocessors/Accelerators Power-Aware Computing Performance of Applications Kernels hypack-2013 (Mode-4 : GPUs) Lecture Topic: on Xeon Phi Venue

More information

The Open Computing Language (OpenCL)

The Open Computing Language (OpenCL) 1 The Open Computing Language (OpenCL) Also go look at the files first.cpp and first.cl! Mike Bailey mjb@cs.oregonstate.edu opencl.pptx OpenCL 2 OpenCL consists of two parts: a C/C++-callable API and a

More information

The Open Computing Language (OpenCL)

The Open Computing Language (OpenCL) 1 The Open Computing Language (OpenCL) Also go look at the files first.cpp and first.cl! Mike Bailey mjb@cs.oregonstate.edu opencl.pptx OpenCL 2 OpenCL consists of two parts: a C/C++-callable API and a

More information

Mali -T600 Series GPU OpenCL ARM. Developer Guide. Version 2.0. Copyright ARM. All rights reserved. DUI0538F (ID012914)

Mali -T600 Series GPU OpenCL ARM. Developer Guide. Version 2.0. Copyright ARM. All rights reserved. DUI0538F (ID012914) ARM Mali -T600 Series GPU OpenCL Version 2.0 Developer Guide Copyright 2012-2013 ARM. All rights reserved. DUI0538F () ARM Mali-T600 Series GPU OpenCL Developer Guide Copyright 2012-2013 ARM. All rights

More information

Debugging and Analyzing Programs using the Intercept Layer for OpenCL Applications

Debugging and Analyzing Programs using the Intercept Layer for OpenCL Applications Debugging and Analyzing Programs using the Intercept Layer for OpenCL Applications Ben Ashbaugh IWOCL 2018 https://github.com/intel/opencl-intercept-layer Why am I here? Intercept Layer for OpenCL Applications

More information

Accelerate with GPUs Harnessing GPGPUs with trending technologies

Accelerate with GPUs Harnessing GPGPUs with trending technologies Accelerate with GPUs Harnessing GPGPUs with trending technologies Anubhav Jain and Amit Kalele Parallelization and Optimization CoE Tata Consultancy Services Ltd. Copyright 2016 Tata Consultancy Services

More information

Introduction à OpenCL

Introduction à OpenCL 1 1 UDS/IRMA Journée GPU Strasbourg, février 2010 Sommaire 1 OpenCL 2 3 GPU architecture A modern Graphics Processing Unit (GPU) is made of: Global memory (typically 1 Gb) Compute units (typically 27)

More information

What does Fusion mean for HPC?

What does Fusion mean for HPC? What does Fusion mean for HPC? Casey Battaglino Aparna Chandramowlishwaran Jee Choi Kent Czechowski Cong Hou Chris McClanahan Dave S. Noble, Jr. Richard (Rich) Vuduc AMD Fusion Developers Summit Bellevue,

More information

/INFOMOV/ Optimization & Vectorization. J. Bikker - Sep-Nov Lecture 10: GPGPU (3) Welcome!

/INFOMOV/ Optimization & Vectorization. J. Bikker - Sep-Nov Lecture 10: GPGPU (3) Welcome! /INFOMOV/ Optimization & Vectorization J. Bikker - Sep-Nov 2018 - Lecture 10: GPGPU (3) Welcome! Today s Agenda: Don t Trust the Template The Prefix Sum Parallel Sorting Stream Filtering Optimizing GPU

More information

Linux Clusters Institute: Getting the Most from Your Linux Cluster

Linux Clusters Institute: Getting the Most from Your Linux Cluster Linux Clusters Institute: Getting the Most from Your Linux Cluster Advanced Topics: GPU Clusters Mike Showerman mshow@ncsa.illinois.edu Our background Cluster models for HPC in 1996 I believe first compute

More information

INTRODUCTION TO OPENCL TM A Beginner s Tutorial. Udeepta Bordoloi AMD

INTRODUCTION TO OPENCL TM A Beginner s Tutorial. Udeepta Bordoloi AMD INTRODUCTION TO OPENCL TM A Beginner s Tutorial Udeepta Bordoloi AMD IT S A HETEROGENEOUS WORLD Heterogeneous computing The new normal CPU Many CPU s 2, 4, 8, Very many GPU processing elements 100 s Different

More information

Pragma-based GPU Programming and HMPP Workbench. Scott Grauer-Gray

Pragma-based GPU Programming and HMPP Workbench. Scott Grauer-Gray Pragma-based GPU Programming and HMPP Workbench Scott Grauer-Gray Pragma-based GPU programming Write programs for GPU processing without (directly) using CUDA/OpenCL Place pragmas to drive processing on

More information

ECE 574 Cluster Computing Lecture 10

ECE 574 Cluster Computing Lecture 10 ECE 574 Cluster Computing Lecture 10 Vince Weaver http://www.eece.maine.edu/~vweaver vincent.weaver@maine.edu 1 October 2015 Announcements Homework #4 will be posted eventually 1 HW#4 Notes How granular

More information

GPU Architecture and Programming with OpenCL

GPU Architecture and Programming with OpenCL GPU Architecture and Programming with OpenCL David Black-Schaffer david.black-schaffer@it black-schaffer@it.uu.se Room 1221 Today s s Topic GPU architecture What and why The good The bad Compute Models

More information

Design and implementation of a highperformance. platform on multigenerational GPUs.

Design and implementation of a highperformance. platform on multigenerational GPUs. Design and implementation of a highperformance stream-based computing platform on multigenerational GPUs. By Pablo Lamilla Álvarez September 27, 2010 Supervised by: Professor Shinichi Yamagiwa Kochi University

More information

GPU Architecture and Programming with OpenCL. OpenCL. GPU Architecture: Why? Today s s Topic. GPUs: : Architectures for Drawing Triangles Fast

GPU Architecture and Programming with OpenCL. OpenCL. GPU Architecture: Why? Today s s Topic. GPUs: : Architectures for Drawing Triangles Fast Today s s Topic GPU Architecture and Programming with OpenCL David Black-Schaffer david.black-schaffer@it black-schaffer@it.uu.se Room 1221 GPU architecture What and why The good The bad Compute Models

More information

OpenCL Device Fission Benedict R. Gaster, AMD

OpenCL Device Fission Benedict R. Gaster, AMD Copyright Khronos Group, 2011 - Page 1 Fission Benedict R. Gaster, AMD March 2011 Fission (cl_ext_device_fission) Provides an interface for sub-dividing an device into multiple sub-devices Typically used

More information

Using OpenMP to Program. Systems

Using OpenMP to Program. Systems Using OpenMP to Program Embedded Heterogeneous Systems Eric Stotzer, PhD Senior Member Technical Staff Software Development Organization, Compiler Team Texas Instruments February 16, 2012 Presented at

More information

A hands-on Introduction to OpenCL

A hands-on Introduction to OpenCL A hands-on Introduction to OpenCL Tim Mattson Acknowledgements: Alice Koniges of Berkeley Lab/NERSC and Simon McIntosh-Smith, James Price, and Tom Deakin of the University of Bristol OpenCL Learning progression

More information

Motion Estimation Extension for OpenCL

Motion Estimation Extension for OpenCL Motion Estimation Extension for OpenCL Authors: Nico Galoppo, Craig Hansen-Sturm Reviewers: Ben Ashbaugh, David Blythe, Hong Jiang, Stephen Junkins, Raun Krisch, Matt McClellan, Teresa Morrison, Dillon

More information

CS/CoE 1541 Final exam (Fall 2017). This is the cumulative final exam given in the Fall of Question 1 (12 points): was on Chapter 4

CS/CoE 1541 Final exam (Fall 2017). This is the cumulative final exam given in the Fall of Question 1 (12 points): was on Chapter 4 CS/CoE 1541 Final exam (Fall 2017). Name: This is the cumulative final exam given in the Fall of 2017. Question 1 (12 points): was on Chapter 4 Question 2 (13 points): was on Chapter 4 For Exam 2, you

More information

SimpleOpenCL: desenvolupament i documentació d'una llibreria que facilita la programació paral lela en OpenCL

SimpleOpenCL: desenvolupament i documentació d'una llibreria que facilita la programació paral lela en OpenCL Treball de fi de Carrera ENGINYERIA TÈCNICA EN INFORMÀTICA DE SISTEMES Facultat de Matemàtiques Universitat de Barcelona SimpleOpenCL: desenvolupament i documentació d'una llibreria que facilita la programació

More information

OpenCL C. Matt Sellitto Dana Schaa Northeastern University NUCAR

OpenCL C. Matt Sellitto Dana Schaa Northeastern University NUCAR OpenCL C Matt Sellitto Dana Schaa Northeastern University NUCAR OpenCL C Is used to write kernels when working with OpenCL Used to code the part that runs on the device Based on C99 with some extensions

More information

OPENCL WITH AMD FIREPRO W9100 GERMAN ANDRYEYEV MAY 20, 2015

OPENCL WITH AMD FIREPRO W9100 GERMAN ANDRYEYEV MAY 20, 2015 OPENCL WITH AMD FIREPRO W9100 GERMAN ANDRYEYEV MAY 20, 2015 Introducing AMD FirePro W9100 HW COMPARISON W9100(HAWAII) VS W9000(TAHITI) FirePro W9100 FirePro W9000 Improvement Notes Compute Units 44 32

More information

OpenACC (Open Accelerators - Introduced in 2012)

OpenACC (Open Accelerators - Introduced in 2012) OpenACC (Open Accelerators - Introduced in 2012) Open, portable standard for parallel computing (Cray, CAPS, Nvidia and PGI); introduced in 2012; GNU has an incomplete implementation. Uses directives in

More information

Introduction to OpenCL!

Introduction to OpenCL! Lecture 6! Introduction to OpenCL! John Cavazos! Dept of Computer & Information Sciences! University of Delaware! www.cis.udel.edu/~cavazos/cisc879! OpenCL Architecture Defined in four parts Platform Model

More information

CS 61C: Great Ideas in Computer Architecture (Machine Structures) Lecture 30: GP-GPU Programming. Lecturer: Alan Christopher

CS 61C: Great Ideas in Computer Architecture (Machine Structures) Lecture 30: GP-GPU Programming. Lecturer: Alan Christopher CS 61C: Great Ideas in Computer Architecture (Machine Structures) Lecture 30: GP-GPU Programming Lecturer: Alan Christopher Overview GP-GPU: What and why OpenCL, CUDA, and programming GPUs GPU Performance

More information

From CUDA to OpenCL. Piero Lanucara, SCAI

From CUDA to OpenCL. Piero Lanucara, SCAI From CUDA to OpenCL Piero Lanucara, SCAI p.lanucara@cineca.it Let s start from a simple CUDA code (matrixmul from NVIDIA CUDA samples). Now, you perfectly know how to compile and run on NVIDIA hardware

More information

Modern C++ Parallelism from CPU to GPU

Modern C++ Parallelism from CPU to GPU Modern C++ Parallelism from CPU to GPU Simon Brand @TartanLlama Senior Software Engineer, GPGPU Toolchains, Codeplay C++ Russia 2018 2018-04-21 Agenda About me and Codeplay C++17 CPU Parallelism Third-party

More information

Parallel Programming Recipes

Parallel Programming Recipes San Jose State University SJSU ScholarWorks Master's Projects Master's Theses and Graduate Research 2010 Parallel Programming Recipes Thuy C. Nguyenphuc San Jose State University Follow this and additional

More information

INTRODUCTION TO OPENCL. HAIBO XIE, PH.D.

INTRODUCTION TO OPENCL. HAIBO XIE, PH.D. INTRODUCTION TO OPENCL HAIBO XIE, PH.D. haibo.xie@amd.com AGENDA What s OpenCL Fundamentals for OpenCL programming OpenCL programming basics OpenCL programming tools Examples & demos 2 WHAT IS OPENCL Open

More information