APPENDIX. Source code. Part 1. Part 2. Part 3.

Size: px

Start display at page:

Download "APPENDIX. Source code. Part 1. Part 2. Part 3."

Allen Adams
5 years ago
Views:

1 APPENDIX Source code Part 1. Part 2. Part 3. 1

2 Source Code Part 1. arrayfun pagefun bsxfun 2

% Source Code Part 1. arrayfun()

% ===== foo.m (keep in its own file; the demo script below calls it) =====
function y = foo(x)
% FOO  Degree-9 truncated Taylor series of exp(x), written in Horner form.
%   Only element-wise operators are used, so the same code accepts a CPU
%   array, a gpuArray, or a scalar when invoked through arrayfun.
y = 1 + x.*(1 + x.*(1 + x.*(1 + x.*(1 + x.*(1 + x.*(1 + x.*(1 + x.*(1 + x./9)./8)./7)./6)./5)./4)./3)./2);

% ===== arrayfun demo script (separate file) =====
%% arrayfun
% Times foo() on the CPU, on a gpuArray, and through arrayfun on a gpuArray,
% then reports the timings, the maximum absolute error and the speed-up.
clear ; clc;

% display(' GPU Performance [ CPU vs GPU vs GPU with arrayfun ] ');
display(' GPU Performance [ CPU vs GPU with arrayfun ] ');
type foo.m;

cpux = rand(1e4, 1e3);
gpux = gpuArray(cpux);
gpu  = gpuDevice;           % fetched before timing so the lookup is not measured

% CPU COMPUTING
tic;
cpuy = foo(cpux);
% cpuy = arrayfun(@foo, cpux);
tcpu = toc;

% GPU ONLY COMPUTING
tic;
gpuy1 = foo(gpux);
wait(gpu);                  % GPU calls can return before the work is done
tgpu1 = toc;

% GPU WITH ARRAYFUN COMPUTING
tic;
gpuy2 = arrayfun(@foo, gpux);
wait(gpu);
tgpu2 = toc;

% MAXIMUM ABSOLUTE ERROR (gathered so the values are plain doubles for num2str)
err1 = gather(max(abs(cpuy(:) - gpuy1(:))));
err2 = gather(max(abs(cpuy(:) - gpuy2(:))));

% DISPLAY
display(['execution time on CPU : ' num2str(tcpu, '%2.6f') ' sec']);
% display(['execution time on GPU only : ' num2str(tgpu1, '%2.6f') ' sec']);
display(['execution time on GPU with arrayfun : ' num2str(tgpu2, '%2.6f') ' sec']);
% display(['maximum absolute error for CPU / GPU only : ' num2str(err1, '%2.4e')]);
display(['maximum absolute error for CPU / GPU with arrayfun : ' num2str(err2, '%2.4e')]);
% display(['acceleration ratio for CPU / GPU only : x ' num2str(tcpu/tgpu1, '%2.4f')]);
display(['acceleration ratio for CPU / GPU with arrayfun : x ' num2str(tcpu/tgpu2, '%2.4f')]);

% Source Code Part 1. pagefun()

%% pagefun
% Transposes every 2-D page of a 4-D array three ways (CPU loop, GPU loop,
% pagefun on the GPU) and reports timing, error and speed-up.
clear ;
% clc;

% display(' GPU Performance [ CPU vs GPU vs GPU with pagefun ] ');
display(' GPU Performance [ CPU vs GPU with pagefun ] ');

cpux = rand(1e2, 1e2, 1e1, 1e1);
gpux = gpuArray(cpux);
gpu  = gpuDevice;           % fetched before timing so the lookup is not measured

% CPU COMPUTING
tic;
cpuy = zeros(size(cpux));
for i = 1:size(cpux, 3)
    for j = 1:size(cpux, 4)
        cpuy(:,:,i,j) = transpose(cpux(:,:,i,j));
    end
end
tcpu = toc;

% GPU ONLY COMPUTING
tic;
gpuy1 = zeros(size(gpux), 'gpuArray');
for i = 1:size(cpux, 3)
    for j = 1:size(cpux, 4)
        gpuy1(:,:,i,j) = transpose(gpux(:,:,i,j));
    end
end
wait(gpu);                  % GPU calls can return before the work is done
tgpu1 = toc;

% GPU WITH PAGEFUN COMPUTING
tic;
gpuy2 = pagefun(@transpose, gpux);
wait(gpu);
tgpu2 = toc;

% MAXIMUM ABSOLUTE ERROR (gathered so the values are plain doubles for num2str)
err1 = gather(max(abs(cpuy(:) - gpuy1(:))));
err2 = gather(max(abs(cpuy(:) - gpuy2(:))));

% DISPLAY (labels name pagefun; the source text said "arrayfun" here)
display(['execution time on CPU : ' num2str(tcpu, '%2.6f') ' sec']);
% display(['execution time on GPU only : ' num2str(tgpu1, '%2.6f') ' sec']);
display(['execution time on GPU with pagefun : ' num2str(tgpu2, '%2.6f') ' sec']);
% display(['maximum absolute error for CPU / GPU only : ' num2str(err1, '%2.4e')]);
display(['maximum absolute error for CPU / GPU with pagefun : ' num2str(err2, '%2.4e')]);
% display(['acceleration ratio for CPU / GPU only : x ' num2str(tcpu/tgpu1, '%2.4f')]);
display(['acceleration ratio for CPU / GPU with pagefun : x ' num2str(tcpu/tgpu2, '%2.4f')]);

% Source Code Part 1. bsxfun()

%% bsxfun
% Subtracts the per-column mean from a matrix three ways (CPU column loop,
% GPU with repmat, GPU with bsxfun) and reports timing, error and speed-up.
clear ;
% clc;

% display(' GPU Performance [ CPU vs GPU vs GPU with bsxfun ] ');
display(' GPU Performance [ CPU vs GPU with bsxfun ] ');

cpux = rand(1e4, 1e3);
cpuy = mean(cpux);
gpux = gpuArray(cpux);
gpuy = gpuArray(cpuy);
gpu  = gpuDevice;           % fetched before timing so the lookup is not measured

% CPU
tic;
cpuz = zeros(size(cpux));
for j = 1:size(cpux, 2)
    cpuz(:, j) = minus(cpux(:, j), cpuy(j));
end
% cpuz = cpux - repmat(cpuy, [size(cpux, 1), 1]);
tcpu = toc;

% GPU ONLY
tic;
% gpuz1 = zeros(size(gpux), 'gpuArray');
% for j = 1:size(cpux, 2)
%     gpuz1(:, j) = minus(gpux(:, j), gpuy(j));
% end
gpuz1 = gpux - repmat(gpuy, [size(gpux, 1), 1]);   % must stay live: err1 below reads gpuz1
wait(gpu);                  % GPU calls can return before the work is done
tgpu1 = toc;

% GPU WITH BSXFUN COMPUTING
tic;
gpuz2 = bsxfun(@minus, gpux, gpuy);
wait(gpu);
tgpu2 = toc;

% MAXIMUM ABSOLUTE ERROR (gathered so the values are plain doubles for num2str)
err1 = gather(max(abs(cpuz(:) - gpuz1(:))));
err2 = gather(max(abs(cpuz(:) - gpuz2(:))));

% DISPLAY (labels name bsxfun; the source text said "arrayfun" here)
display(['execution time on CPU : ' num2str(tcpu, '%2.6f') ' sec']);
% display(['execution time on GPU only : ' num2str(tgpu1, '%2.6f') ' sec']);
display(['execution time on GPU with bsxfun : ' num2str(tgpu2, '%2.6f') ' sec']);
% display(['maximum absolute error for CPU / GPU only : ' num2str(err1, '%2.4e')]);
display(['maximum absolute error for CPU / GPU with bsxfun : ' num2str(err2, '%2.4e')]);
% display(['acceleration ratio for CPU / GPU only : x ' num2str(tcpu/tgpu1, '%2.4f')]);
display(['acceleration ratio for CPU / GPU with bsxfun : x ' num2str(tcpu/tgpu2, '%2.4f')]);

6 Source Code Part 2. mrics_gpu.m test_mrics.m 6

% Source Code Part 2. mrics_gpu()

function u = mrics_gpu(r,f, mu, lambda, gamma, ninner, nbreg)
% MRICS_GPU  Iterative image recovery from masked Fourier data, run on the GPU.
%
%   u = mrics_gpu(r, f, mu, lambda, gamma, ninner, nbreg)
%
%   r       multiplier applied to the Fourier data (same size as f); the
%           test script passes a 0/1 sampling mask
%   f       measured Fourier data, rows-by-cols
%   mu      weight on the data term (scales conj(r).*f and conj(r).*r)
%   lambda  weight on the finite-difference terms; 1/lambda is the
%           shrinkage threshold passed to shrink2
%   gamma   constant added to the inversion kernel and multiplying the
%           previous u in the right-hand side
%   ninner  number of inner iterations per outer iteration
%   nbreg   number of outer iterations
%
%   Returns u as a regular (gathered) array.
%   Relies on the helpers Dx, Dxt, Dy, Dyt and shrink2.

[rows,cols] = size(f);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% GPUARRAY
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
f = gpuArray(f);
r = gpuArray(r);

% Reserve memory for the auxiliary variables
f0 = f;
u  = zeros(rows,cols, 'gpuArray');
x  = zeros(rows,cols, 'gpuArray');
y  = zeros(rows,cols, 'gpuArray');
bx = zeros(rows,cols, 'gpuArray');
by = zeros(rows,cols, 'gpuArray');

% Build kernels
scale = sqrt(rows*cols);
murf  = ifft2(mu*(conj(r).*f))*scale;

% 5-point stencil (4 at the origin, -1 at the four periodic neighbours);
% its FFT is combined with the data and gamma terms to form the divisor
% used in the u-update.
uker = zeros(rows,cols, 'gpuArray');
uker(1,1) = 4; uker(1,2) = -1; uker(2,1) = -1; uker(rows,1) = -1; uker(1,cols) = -1;
uker = mu*(conj(r).*r)+lambda*fft2(uker)+gamma;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Do the reconstruction
for outer = 1:nbreg
    for inner = 1:ninner
        % update u
        rhs = murf+lambda*Dxt(x-bx)+lambda*Dyt(y-by)+gamma*u;
        u = ifft2(fft2(rhs)./uker);

        % update x and y
        dx = Dx(u);
        dy = Dy(u);
        [x,y] = shrink2(dx+bx, dy+by, 1/lambda);

        % update bregman parameters
        bx = bx+dx-x;
        by = by+dy-y;
    end

    % add the data residual back into f and rebuild the data term
    f = f+f0-r.*fft2(u)/scale;
    murf = ifft2(mu*r.*f)*scale;
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% GATHER
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
u = gather(u);

return;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function d = Dx(u)
% DX  Backward difference along columns with periodic wrap-around:
%     d(:,j) = u(:,j) - u(:,j-1), and column 1 is paired with the last column.
[rows,cols] = size(u);
d = zeros(rows,cols, 'gpuArray');
d(:,2:cols) = u(:,2:cols)-u(:,1:cols-1);
d(:,1) = u(:,1)-u(:,cols);
return

function d = Dxt(u)
% DXT  Forward-neighbour counterpart of Dx with periodic wrap-around:
%      d(:,j) = u(:,j) - u(:,j+1), and the last column is paired with column 1.
[rows,cols] = size(u);
d = zeros(rows,cols, 'gpuArray');
d(:,1:cols-1) = u(:,1:cols-1)-u(:,2:cols);
d(:,cols) = u(:,cols)-u(:,1);
return

function d = Dy(u)
% DY  Backward difference along rows with periodic wrap-around:
%     d(i,:) = u(i,:) - u(i-1,:), and row 1 is paired with the last row.
[rows,cols] = size(u);
d = zeros(rows,cols, 'gpuArray');
d(2:rows,:) = u(2:rows,:)-u(1:rows-1,:);
d(1,:) = u(1,:)-u(rows,:);
return

function d = Dyt(u)
% DYT  Forward-neighbour counterpart of Dy with periodic wrap-around:
%      d(i,:) = u(i,:) - u(i+1,:), and the last row is paired with row 1.
[rows,cols] = size(u);
d = zeros(rows,cols, 'gpuArray');
d(1:rows-1,:) = u(1:rows-1,:)-u(2:rows,:);
d(rows,:) = u(rows,:)-u(1,:);
return

function [xs,ys] = shrink2(x,y,lambda)
% SHRINK2  Joint shrinkage of the pair (x,y) by threshold lambda.
%   The element-wise magnitude s = sqrt(|x|^2 + |y|^2) is reduced by lambda
%   and clipped at zero; x and y are rescaled by the same factor.
s = sqrt(x.*conj(x)+y.*conj(y));
ss = s-lambda;
ss = ss.*(ss>0);        % clip negative magnitudes to zero
s = s+(s<lambda);       % keeps the divisor non-zero where s == 0 (ss is 0 there)
ss = ss./s;
xs = ss.*x;
ys = ss.*y;
return;

% Source Code Part 2. test_mrics ()

% Builds a phantom image, keeps a random subset of its Fourier samples,
% recovers the image with mrics (CPU) and mrics_gpu (GPU), and plots both
% results together with their difference.

N = 512;            % The image will be NxN
sparsity = .25;     % use only 25% of the K-Space data for CS
mu = .1;
lambda = .1;
gamma = mu/1000;

% build an image of a square
% image = zeros(N,N);
% image(N/4:3*N/4,N/4:3*N/4)=255;
image = phantom(N)*255;

% build the sampling matrix, R
R = rand(N,N);
R = double(R<sparsity);
R(1, 1) = 1;        % DC POINT

% Form the CS data
F = R.*fft2(image)/N;

% Recover the image (each call timed on its own)
tic;
recovered = mrics(R,F, mu, lambda, gamma,10, 5);
toc;

tic;
recovered2 = mrics_gpu(R,F, mu, lambda, gamma,10, 5);
toc;

wnd = [0, 255];

% build a figure to display results
figure;
subplot(2,2,1);
imagesc(abs(image), wnd); colormap('gray');
title('original');

subplot(2,2,2);
imagesc(abs(R)); colormap('gray');
title('r');

subplot(2,2,3);
% imagesc(abs(ifft2(F))); colormap('gray');
imagesc(abs(recovered), wnd); colormap('gray');
title('split Bregman Recovery (CPU)');      % this panel shows the mrics result

subplot(2,2,4);
imagesc(abs(recovered2), wnd); colormap('gray');
title('split Bregman Recovery (GPU)');      % this panel shows the mrics_gpu result

figure;
imagesc(abs(recovered - recovered2)); colormap('gray'); colorbar;
title('CPU_{recovery} - GPU_{recovery}');

10 Source Code Part 3. iradon_gpu.m iradonmexcu.cu demo_iradon.m 10

% Source Code Part 3. iradon_gpu()

function [img,h] = iradon_gpu(varargin)
% IRADON_GPU  Filtered back-projection whose accumulation step runs in the
%             iradonmexcu CUDA MEX function.
%
%   [img,h] = iradon_gpu(p, theta, ...) accepts 2 to 6 arguments, which are
%   forwarded unchanged to parse_inputs. h is the second output of
%   filterProjections.
%
%   NOTE(review): parse_inputs and filterProjections are not part of this
%   listing; they are presumably the helpers of MATLAB's own iradon.m and
%   must be on the path or appended to this file - confirm. The kernel
%   applies cosf/sinf to theta directly, so parse_inputs is assumed to
%   return theta in radians - TODO confirm.

narginchk(2,6);

% the interpolation choice returned by parse_inputs is unused: the CUDA
% kernel always interpolates linearly
[p,theta,filter,d,~,N] = parse_inputs(varargin{:});

[p,h] = filterProjections(p, filter, d);

% Define the x & y axes for the reconstructed image so that the origin
% (center) is in the spot which RADON would choose.
center = floor((N + 1)/2);
xleft = -center + 1;
x = (1:N) - 1 + xleft;
x = repmat(x, N, 1);

ytop = center - 1;
y = (N:-1:1).' - N + ytop;
y = repmat(y, 1, N);

% Zero pad the projections to size 1+2*ceil(N/sqrt(2)) if this
% quantity is greater than the length of the projections
imgdiag = 2*ceil(N/sqrt(2))+1;   % largest distance through image.
if size(p,1) < imgdiag
    rz = imgdiag - size(p,1);    % how many rows of zeros
    p = [zeros(ceil(rz/2),size(p,2)); p; zeros(floor(rz/2),size(p,2))];
end
% (the centre index of the projections is recomputed inside the kernel from
%  the padded length, so it is not tracked here)

% The MEX function reads its array inputs as float data, hence single().
img = iradonmexcu(N, single(theta), single(x), single(y), single(p));
img = img*pi/(2*length(theta));

return ;

/* Source Code Part 3. iradonmexcu.cu */
#include <string.h>
#include "mex.h"

/*
 * Prototype of the back-projection kernel (defined later in this file).
 */
__global__ void iradon(float *img, int N, int len, int view,
                       float *theta, float *x, float *y, float *p);

/*
 * MEX gateway:  img = iradonmexcu(N, theta, x, y, p)
 *
 *   N      scalar image size; the output is an N-by-N single matrix
 *   theta  single array with one angle per column of p
 *   x, y   single arrays holding N*N pixel coordinates each
 *   p      single len-by-view projection matrix
 *
 * Copies the inputs to the device, launches the kernel over a 3-D grid
 * (image x, image y, view), copies the accumulated image back and frees all
 * device buffers. Reports an error to MATLAB on bad arguments or when a
 * CUDA call fails.
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    /*
     * Validate the arguments: the data pointers below are reinterpreted as
     * float*, so anything other than single would be read as garbage.
     */
    if (nrhs != 5)
        mexErrMsgIdAndTxt("iradonmexcu:nrhs",
                          "Five inputs required: N, theta, x, y, p.");
    for (int i = 1; i < 5; ++i) {
        if (!mxIsSingle(prhs[i]) || mxIsComplex(prhs[i]))
            mexErrMsgIdAndTxt("iradonmexcu:type",
                              "theta, x, y and p must be real single arrays.");
    }

    /*
     * Connect from the MATLAB ARRAY POINTER
     * to the MEX ARRAY POINTER.
     */
    int    N      = (int) mxGetScalar(prhs[0]);
    float *theta  = (float *) mxGetData(prhs[1]);
    float *x      = (float *) mxGetData(prhs[2]);
    float *y      = (float *) mxGetData(prhs[3]);
    float *p      = (float *) mxGetData(prhs[4]);

    int    len    = (int) mxGetM(prhs[4]);
    int    view   = (int) mxGetN(prhs[4]);

    if (N <= 0 || len <= 0 || view <= 0)
        mexErrMsgIdAndTxt("iradonmexcu:size", "N and p must be non-empty.");
    if (mxGetNumberOfElements(prhs[1]) != (size_t) view ||
        mxGetNumberOfElements(prhs[2]) != (size_t) N * (size_t) N ||
        mxGetNumberOfElements(prhs[3]) != (size_t) N * (size_t) N)
        mexErrMsgIdAndTxt("iradonmexcu:size",
                          "theta needs one entry per column of p; x and y need N*N entries.");

    const size_t thetaBytes = sizeof(float) * (size_t) view;
    const size_t imgBytes   = sizeof(float) * (size_t) N * (size_t) N;
    const size_t projBytes  = sizeof(float) * (size_t) len * (size_t) view;

    /*
     * Create a OUT MATRIX.
     */
    mwSize DIM     = 2;
    mwSize DIMS[2] = { (mwSize) N, (mwSize) N };

    plhs[0] = mxCreateNumericArray(DIM, (const mwSize *) DIMS, mxSINGLE_CLASS, mxREAL);
    float *img = (float *) mxGetData(plhs[0]);

    /*
     * Create a GPU ARRAY.
     * Copy a MEMORY from MEX ARRAY
     * to GPU ARRAY.
     */
    float *gtheta = 0;
    float *gx     = 0;
    float *gy     = 0;
    float *gp     = 0;
    float *gimg   = 0;

    cudaMalloc(&gtheta, thetaBytes);
    cudaMemset(gtheta, 0, thetaBytes);
    cudaMemcpy(gtheta, theta, thetaBytes, cudaMemcpyHostToDevice);

    /* each memset targets the buffer that was just allocated; clearing
       gtheta with N*N floats here would overrun it and wipe the angles */
    cudaMalloc(&gx, imgBytes);
    cudaMemset(gx, 0, imgBytes);
    cudaMemcpy(gx, x, imgBytes, cudaMemcpyHostToDevice);

    cudaMalloc(&gy, imgBytes);
    cudaMemset(gy, 0, imgBytes);
    cudaMemcpy(gy, y, imgBytes, cudaMemcpyHostToDevice);

    cudaMalloc(&gp, projBytes);
    cudaMemset(gp, 0, projBytes);
    cudaMemcpy(gp, p, projBytes, cudaMemcpyHostToDevice);

    /* the kernel accumulates into gimg, so it has to start from zero */
    cudaMalloc(&gimg, imgBytes);
    cudaMemset(gimg, 0, imgBytes);

    /*
     * Create a 3-d GRID.
     * 1st GRID : X axis of OBJECT
     * 2nd GRID : Y axis of OBJECT
     * 3rd GRID : view axis of PROJECTION
     */
    int threadnum = 8;

    dim3 blk;
    dim3 grd;

    blk.x = threadnum;
    blk.y = threadnum;
    blk.z = threadnum;

    /* integer ceiling division: enough blocks to cover every index */
    grd.x = (N    + threadnum - 1) / threadnum;
    grd.y = (N    + threadnum - 1) / threadnum;
    grd.z = (view + threadnum - 1) / threadnum;

    /*
     * Call the kernel using a CUDA runtime API.
     */
    iradon<<<grd, blk>>>(gimg, N, len, view, gtheta, gx, gy, gp);

    /* picks up a failed launch as well as an earlier failed runtime call */
    cudaError_t status = cudaGetLastError();

    /*
     * Copy a MEMORY from GPU ARRAY
     * to MEX ARRAY.
     */
    if (status == cudaSuccess)
        status = cudaMemcpy(img, gimg, imgBytes, cudaMemcpyDeviceToHost);

    /*
     * MUST BE Destroy the GPU ARRAY.
     */
    cudaFree(gtheta);   gtheta = 0;
    cudaFree(gx);       gx     = 0;
    cudaFree(gy);       gy     = 0;
    cudaFree(gp);       gp     = 0;
    cudaFree(gimg);     gimg   = 0;

    /* report only after the buffers are released, so an error does not leak them */
    if (status != cudaSuccess)
        mexErrMsgIdAndTxt("iradonmexcu:cuda", "CUDA failure: %s",
                          cudaGetErrorString(status));

    return ;
}

/*
 * Back-projection kernel.
 *
 * Each thread handles one (image x, image y, view) triple: it projects the
 * pixel position onto the detector axis for that view, linearly interpolates
 * the projection value there, and atomically adds it to the pixel.
 *
 *   img    N*N output image, accumulated into (the caller zeroes it first)
 *   N      image size
 *   len    number of detector samples per view
 *   view   number of views
 *   theta  one angle per view, passed straight to cosf/sinf
 *   x, y   N*N pixel coordinates, indexed with the same linear index as img
 *   p      projections, len samples per view, stored view after view
 */
__global__ void iradon(float *img, int N, int len, int view,
                       float *theta, float *x, float *y, float *p)
{
    /*
     * Calculate a global linear index, assuming a 3-d GRID.
     * 1st GRID : X axis of OBJECT
     * 2nd GRID : Y axis of OBJECT
     * 3rd GRID : view axis of PROJECTION
     *
     * Skip the thread if its index exceeds the boundary.
     */
    int xidx    = blockDim.x*blockIdx.x + threadIdx.x;
    int yidx    = blockDim.y*blockIdx.y + threadIdx.y;
    int viewidx = blockDim.z*blockIdx.z + threadIdx.z;

    if (xidx    >= N)    return ;
    if (yidx    >= N)    return ;
    if (viewidx >= view) return ;

    int xyidx = N*xidx + yidx;

    /*
     * Calculate a detector position (t) matched to the xy position of the object.
     */
    int   ctridx = int(ceil((len - 1.0f)/2.0f) + 1) - 1;
    float t      = x[xyidx]*cosf(theta[viewidx]) + y[xyidx]*sinf(theta[viewidx]) + ctridx;

    /*
     * Fetch a projection data using 1d interpolation
     */
    int t_b = (int) floorf(t);
    int t_u = (int) ceilf(t);

    /* a position off the detector contributes nothing; without this guard
       the reads of p below could fall outside this view's samples */
    if (t_b < 0 || t_u >= len) return ;

    int pidx_b = len*viewidx + t_b;
    int pidx_u = len*viewidx + t_u;

    float wgt_b = t_u - t;
    float wgt_u = 1 - wgt_b;

    float projcontrib = wgt_b*p[pidx_b] + wgt_u*p[pidx_u];

    /*
     * Accumulate a projection data on the object matrix
     * (atomic because threads of different views add to the same pixel)
     */
    atomicAdd(&img[xyidx], projcontrib);

    return ;
}

% Source Code Part 3. demo_iradon()

% Reconstructs a phantom from its Radon projections with MATLAB's iradon and
% with iradon_gpu, then compares run time and the resulting images.
clear; clc;

% mex iradonmexcu.cu;

%%
N = 512;
VIEW = 720;
THETA = linspace(0, 360, VIEW + 1);
THETA(end) = [];            % drop the last sample (360), which repeats 0

% OBJECT
OBJ = phantom(N);

% RADON
PROJ = radon(OBJ, THETA);

% IRADON ON MATLAB
tic;
RECON_MATLAB = iradon(PROJ, THETA, N);
tmat = toc;

% IRADON ON GPU
tic;
RECON_GPU = iradon_gpu(PROJ, THETA, N);
tgpu = toc;

% MAX ABSOLUTE ERROR
ERR = max(abs(RECON_MATLAB(:) - RECON_GPU(:)));

% MEAN SQUARED ERROR
% ERR1 = mse(OBJ(:), RECON_MATLAB(:));
% ERR2 = mse(OBJ(:), RECON_GPU(:));

%% FIGURE
figure(1); colormap gray;
subplot(231); imagesc(OBJ, [0, 1]); title('object'); axis off image;
subplot(2,3,[2,3]); imagesc(PROJ); title('projection'); axis off;
subplot(234); imagesc(RECON_MATLAB, [0, 1]); title('recon_{matlab}'); axis off image;
subplot(235); imagesc(RECON_GPU, [0, 1]); title('recon_{gpu}'); axis off image;
subplot(236); imagesc(RECON_MATLAB - RECON_GPU); title('difference_{matlab - GPU}'); axis off image;

% DISPLAY
display(['execution time on MATLAB : ' num2str(tmat, '%2.6f') ' sec']);
display(['execution time on GPU : ' num2str(tgpu, '%2.6f') ' sec']);
display(['acceleration ratio for MATLAB / GPU : x ' num2str(tmat/tgpu, '%2.4f')]);

17 Thank you Bio Imaging & Signal Processing Lab. (BISPL) Dept. of Bio & Brain Engineering Korea Advanced Institute of Science & Technology (KAIST) 17

MATRIX INVERSION SPEED UP WITH CUDA JORGE SORIANO PINEDO ELECTRICAL AND COMPUTER ENGINEERING

MATRIX INVERSION SPEED UP WITH CUDA BY JORGE SORIANO PINEDO ELECTRICAL AND COMPUTER ENGINEERING Submitted in partial fulfillment of the requirements for the degree of Electrical engineering in ECE in the