Data parallelism. [ any app performing the same operation across a data stream ]

Size: px

Start display at page:

Download "Data parallelism. [ any app performing the *same* operation across a data stream ]"

Frederick Griffith
5 years ago
Views:

3 Data parallelism [ any app performing the *same* operation across a data stream ]

4 Contrast stretching: Version Cores Time (secs) Speedup

5 while (step < NumSteps && !converged) { step++; diffs = 0; foreach (non-boundary row r of M) /* for each pixel, stretch */ foreach (non-boundary column c of M) { M'[r][c] += stretch(M[r-1][c-1], M[r-1][c], M[r-1][c+1], M[r][c-1], M[r][c], M[r][c+1], M[r+1][c-1], M[r+1][c], M[r+1][c+1]); if (M'[r][c] != M[r][c]) diffs++; } converged = (diffs == 0); } foreach (non-boundary row r of M) /* update original matrix with new values */ foreach (non-boundary column c of M) M[r][c] = M'[r][c];

7 A very common pattern in MPI applications

8 ... M W W... W Read image Distribute Stretch... Collect Write image

9 so you can send/recv *multiple* rows in *one* MPI_Send / MPI_Recv matrix[1][3] = 4; 4 4

uchar **image; if (myrank == 0) { // MASTER: image =

matrix parameters: params[0] = rows; params[1] = cols; for

sizeof(params)/sizeof(params[0]), MPI_INT, w, 0 /*tag*/,

10 uchar **image; if (myrank == 0) { // MASTER: image = ReadBitmapFile(filename, &rows, &cols); // first, broadcast matrix parameters: params[0] = rows; params[1] = cols; for (w=1; w < numprocs; w++) MPI_Send(params, sizeof(params)/sizeof(params[0]), MPI_INT, w, 0 /*tag*/, MPI_COMM_WORLD); // now distribute data (assume it divides evenly!): rowsPer = rows / numprocs; for (w=1; w < numprocs; w++) MPI_Send(image[w*rowsPer], rowsPer*cols, MPI_UNSIGNED_CHAR, w, 0 /*tag*/, MPI_COMM_WORLD); }

$2 extra rows allocated for ghost data 0 1 2 3 else { // WORKERS: MPI_Recv(params,$ rows = params[0]; // grab image parameters: cols = params[1]; // allocate memory for our

New2dMatrix<uchar>(rowsPer+2, cols); } // now receive our chunk, storing into data rows (skip

11 2 extra rows allocated for ghost data else { // WORKERS: MPI_Recv(params, sizeof(params)/sizeof(params[0]), MPI_INT, 0 /*master*/, 0 /*tag*/, MPI_COMM_WORLD, &status); rows = params[0]; // grab image parameters: cols = params[1]; // allocate memory for our chunk of matrix (include room for 2 ghost rows): rowsPer = rows / numprocs; image = New2dMatrix<uchar>(rowsPer+2, cols); // now receive our chunk, storing into data rows (skip over ghost row): MPI_Recv(image[1], rowsPer*cols, MPI_UNSIGNED_CHAR, 0 /*master*/, 0 /*tag*/, MPI_COMM_WORLD, &status); }

$converged) { // 1 of 2: everyone send *last* data row down, receive as *first* ghost row: if (myrank < numprocs-1) MPI_Send(image[rowsPer], cols, MPI_UNSIGNED_CHAR, myrank+1, ); if (myrank > 0)$

12 while (step < NumSteps && !converged) { // 1 of 2: everyone send *last* data row down, receive as *first* ghost row: if (myrank < numprocs-1) MPI_Send(image[rowsPer], cols, MPI_UNSIGNED_CHAR, myrank+1, ...); if (myrank > 0) MPI_Recv(image[0], cols, MPI_UNSIGNED_CHAR, myrank-1, ...); // 2 of 2: everyone send *first* data row up, receive as *last* ghost row: if (myrank > 0) MPI_Send(image[1], cols, MPI_UNSIGNED_CHAR, myrank-1, ...); if (myrank < numprocs-1) MPI_Recv(image[rowsPer+1], cols, MPI_UNSIGNED_CHAR, myrank+1, ...); ... // code to stretch my part of the image. }

$.. W if (myrank > 0) // Workers: send local diffs, receive global diffs: { MPI_Send(&diffs, 1, MPI_LONG_LONG, 0 /*master*/, 0 /*tag*/, ); MPI_Recv(&diffs, 1, MPI_LONG_LONG, 0 /*master*/, 0 /*tag*/,$

13 diffs while (step < NumSteps && !converged) { ... M W W ... W if (myrank > 0) // Workers: send local diffs, receive global diffs: { MPI_Send(&diffs, 1, MPI_LONG_LONG, 0 /*master*/, 0 /*tag*/, ...); MPI_Recv(&diffs, 1, MPI_LONG_LONG, 0 /*master*/, 0 /*tag*/, ...); } else // Master: collect all diffs, sum, distribute final value: { for (w=1; w < numprocs; w++) { MPI_Recv(&temp, 1, MPI_LONG_LONG, w, 0 /*tag*/, ...); diffs += temp; } for (w=1; w < numprocs; w++) MPI_Send(&diffs, 1, MPI_LONG_LONG, w, 0 /*tag*/, ...); } ...

14 point-to-point collective MPI_Bcast MPI_Scatter MPI_Gather MPI_Reduce MPI_Barrier MPI_Sendrecv Allgather, Allreduce, Alltoall, Reduce_scatter, Scan, Scatterv, Gatherv,

15 if (myrank == 0) // MASTER: for (each worker) MPI_Send(...); else // WORKER: MPI_Recv(...); MPI_Bcast( ); more efficient fewer conditionals avoid deadlock

16 MPI_Bcast MPI_Scatter P0 P1 P2 P3 A B C D A B C D A B C D A B C D A B C D A B C D

$MPI_Bcast(buffer, count, datatype, root, communicator); root buffer buffer int params[2]; if (myrank == 0) { // MASTER sets up for broadcast: image = ReadBitmapFile(filename, &rows, &cols); }$

17 MPI_Bcast(buffer, count, datatype, root, communicator); root buffer buffer int params[2]; if (myrank == 0) { // MASTER sets up for broadcast: image = ReadBitmapFile(filename, &rows, &cols); } params[0] = rows; // image parameters: params[1] = cols; /* Broadcast image size to ALL processes */ int root = 0; // master broadcasts: MPI_Bcast(params, sizeof(params)/sizeof(params[0]), MPI_INT, root, MPI_COMM_WORLD); rows = params[0]; // *everyone* now has image parameters: cols = params[1];

MPI_Scatter(sendbuf, sendcount, sendtype, recvbuf, recvcount,

recvbuf /* all processes */ // allocate memory for CHUNK: rowsper =

ghost rows: // master must point to data (workers can point

image[0] : NULL; root = 0; // MASTER scatters, everyone receives:

18 MPI_Scatter(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, communicator); sendbuf sendcount sendbuf root recvbuf /* all processes */ // allocate memory for CHUNK: rowsPer = rows / numprocs; chunk = New2dMatrix<uchar>(rowsPer+2, cols); // +2 ghost rows: // master must point to data (workers can point anywhere): uchar *sendbuf = (myrank == 0) ? image[0] : NULL; root = 0; // MASTER scatters, everyone receives: MPI_Scatter(sendbuf, rowsPer*cols, MPI_UNSIGNED_CHAR, chunk[1], rowsPer*cols, MPI_UNSIGNED_CHAR, root, MPI_COMM_WORLD);

19 disjoint MPI_Scatter(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, communicator); communicator if (myrank <...) MPI_Scatter(, MPI_COMM_WORLD); MPI_Scatter(image[0],, root, );

20 MPI_Gather MPI_Reduce P0 A B C D E F G H P0 P1 P2 P3 A C E G B D F H P0 AopCopEopG BopDopFopH

MPI_Gather(sendbuf, sendcount, sendtype, recvbuf, recvcount,

all processes */ root = 0; // MASTER gathers: // master receives

0)? image[0] : NULL; MPI_Gather(chunk[1], rowsper*cols,

21 MPI_Gather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, communicator); sendcount sendbuf root recvbuf /* all processes */ root = 0; // MASTER gathers: // master receives back into image, workers not involved: uchar *recvbuf = (myrank == 0)? image[0] : NULL; MPI_Gather(chunk[1], rowsper*cols, MPI_UNSIGNED_CHAR, recvbuf, rowsper*cols, MPI_UNSIGNED_CHAR, root, MPI_COMM_WORLD);

sendbuf recvbuf root Min, Max, Sum, Product,

MPI_Reduce(&diffs, &sum, 1, MPI_LONG_LONG,

22 MPI_Reduce(sendbuf, recvbuf, count, datatype, op, root, communicator); op count sendbuf recvbuf root Min, Max, Sum, Product, And, Or, Xor can also define your own operations P0 P1 P2 P3 A B C D for (...) // stretch my part of the image: for (...) {...; if ( ) diffs++; } /* Compute total differences across all processes for convergence testing */ long long sum = -1; MPI_Reduce(&diffs, &sum, 1, MPI_LONG_LONG, MPI_SUM, 0 /*master*/, MPI_COMM_WORLD); MPI_Bcast(&sum, 1, MPI_LONG_LONG, 0 /*master*/, MPI_COMM_WORLD); converged = (sum == 0);

communicator); long long sum = -1; MPI_Allreduce(&diffs,

23 Reduce data in sendbuf, then broadcast answers out to everyone MPI_Allreduce(sendbuf, recvbuf, count, datatype, op, communicator); long long sum = -1; MPI_Allreduce(&diffs, &sum, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD); converged = (sum == 0);

$void MPIAPI myintsum(void *in, void *inout, int *len, MPI_Datatype *datatype) { int *i_in = (int *) in; int *i_inout = (int *) inout; } for (int i=0; i < *len; i++)$

24 void MPIAPI myintsum(void *in, void *inout, int *len, MPI_Datatype *datatype) { int *i_in = (int *) in; int *i_inout = (int *) inout; for (int i=0; i < *len; i++) i_inout[i] = i_in[i] + i_inout[i]; } void main(...) { MPI_Op opmysum; MPI_Init(...); MPI_Op_create(myintsum, 1 /*true, op is commutative*/, &opmysum); ... MPI_Reduce(..., opmysum, ...);

Collective Communications

Collective Communications Reusing this material This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. http://creativecommons.org/licenses/by-nc-sa/4.0/deed.en_us