HMPP port. G. Colin de Verdière (CEA)

Size: px

Start display at page:

Download "HMPP port. G. Colin de Verdière (CEA)"

Jocelin Quinn
5 years ago
Views:

1 HMPP port G. Colin de Verdière (CEA)

2 Overview.Uchu prototype HMPP MOD2AS MOD2AM HMPP in a real code 2

3 The UCHU prototype Bull servers 1 login node 4 nodes 2 Harpertown, 8GB 2 NVIDIA Tesla S1070 IB DDR Slurm Software gcc/icc/ifort Cuda 2.3/OpenCL Rapidmind HMPP 2.2.1sp2 Uchu1 Nehalem-EP 24GB uchu2 uchu3 uchu4 uchu5 Tesla S1070 GPU0 GPU1 GPU0 GPU1 Tesla S1070 3

4 The real UCHU 4

5 HMPP HMPP is a high-level abstraction for manycore programming Takes annotated source code (pragma) Keeps code portability C & Fortran Has a low learning curve yet can reach significant performances Offers many high level features Asynchronous data transfers and kernel execution Offers target oriented optimisations 5

6 HMPP: Compilation workflow HMPP annotated source code HMPP Preprocessor HMPP Compiler Back-end Generators Application source code Standard Compiler Host application Target source code Target compiler Accelerated codelet library NVIDIA CUDA AMD CAL/IL OpenCL HMPP Runtime Target driver CPU GPU 6

7 HMPP: a small example #pragma hmpp sgemm codelet, target=cuda, args[vin1,vin2,vout].io=inout void sgemm( int m, int n, int k, float alpha, float vin1[n][n], float vin2[n][n], float beta, float vout[n][n] ) { /*... */ int main(int argc, char **argv) { /*... */ for( j = 0 ; j < NB_RUNS ; j++ ) { #pragma hmpp sgemm callsite sgemm( size, size, size, alpha, vin1, vin2, beta, vout ); /*... */ Declare codelet with CUDA target Synchronous codelet call 7

8 HMPP: a more elaborate version int main(int argc, char **argv) { /*... */ #pragma hmpp sgemm allocate /*... */ #pragma hmpp sgemm advancedload, args[vin1;m;n;k;alpha;beta] /*... */ Preload data for( j = 0 ; j < 2 ; j++ ) { #pragma hmpp sgemm callsite & #pragma hmpp sgemm args[m;n;k;alpha;beta;vin1].advancedload=true sgemm( size, size, size, alpha, vin1, vin2, beta, vout ); /*... */ /*... */ #pragma hmpp sgemm release Avoid reloading data 8

9 MOD2AM: original code void mxm(int m, int l, int n, double a[m][l], double b[l][n], double c[m][n]) { int i, j, k, lf, mf; mf = m - m % 4; for (i = 0; i < mf; i += 4) { for (j = 0; j < n; j++) { c[i][j] = 0.0; c[i + 1][j] = 0.0; c[i + 2][j] = 0.0; c[i + 3][j] = 0.0; } } lf = l - l % 4; for (i = 0; i < m; i++) { for (j = 0; j < lf; j += 4) { for (k = 0; k < n; k++) { c[i][k] = c[i][k] + a[i][j + 0] * b[j + 0][k] + a[i][j + 1] * b[j + 1][k] + a[i][j + 2] * b[j + 2][k] + a[i][j + 3] * b[j + 3][k]; } } } for (i = 0; i < m; i++) { for (j = lf; j < l; j++) { for (k = 0; k < n; k++) { c[i][k] = c[i][k] + a[i][j] * b[j][k]; } } } } // mxm 9

10 MOD2AM: HMPP version main() #pragma hmpp Hmxm advancedload, args[a;b], args[a].size={m,l}, args[b].size={l,n} for (i = 0; i < nrep; i++) { #pragma hmpp Hmxm callsite, args[a;b].advancedload=true mxm(m, l, n, (double (*)[m]) a, (double (*)[n]) b, (double (*)[n]) c); #pragma hmpp Hmxm delegatedstore, args[c] time = cclock() - time; ok = check(m, l, n, (double (*)[n]) c); 10

11 MOD2AM: mxm.h #pragma hmpp Hmxm codelet, args[a;b].io=in, args[c].io=out, TARGET=CUDA, args[a].size={m,l}, args[b].size={l,n}, args[c].size={m,n} void mxm(int m, int l, int n, double a[m][l], double b[l][n], double c[m][n]); 11

12 MOD2AM: mxm.c optimized #include "mxm.h" void mxm (int m, int l, int n, const double a[m][l], const double b[l][n], double c[m][n]) { #define HUNROLL 2 int i, j, k, lf, mf; int rest = n % HUNROLL; for (i = 0; i < m; i++) { for (j = 0; j < n; j++) { c[i][j] = 0.0; } } 12

13 #pragma hmppcg unroll(6), jam(k) for (i = 0; i < m; i++) { #pragma hmppcg unroll(2), split, noremainder for (k = 0; k < n - rest; k++) { double prod = 0.0; double va, vb; j = 0; va = a[i][j]; // trick: prefetching of arrays a and b vb = b[j][k]; for (j = 1; j < l; j++) { // prefetching: index 0 has been loaded prod += va * vb; va = a[i][j]; vb = b[j][k]; } prod += va * vb; // prefetching: don't forget the last index c[i][k] += prod; } } 13

14 if (rest) { // leftovers using the same trick for (i = 0; i < m; i++) { for (k = n - rest; k < n; k++) { double prod = 0.0; double va, vb; j = 0; va = a[i][j]; vb = b[j][k]; for (j = 1; j < l; j++) { prod += va * vb; va = a[i][j]; vb = b[j][k]; } prod += va * vb; c[i][k] += prod; } } } } 14

15 MOD2AM: results on uchu (78GF DP peak) env HMPPCG_FLAGS="--cuda-block-size 64x1" ccc_mprun ./h.mod2am T T T T T T T T T T T T T T T T T 15

16 16

17 MOD2AS: original code #endif void SPMXV(int ncols, int nrows, int nelmts, int indx[nelmts], int rowp[nrows], REAL matvals[nelmts], REAL invec[ncols], REAL outvec[nrows]) { int i, j, nrest; for (i = 0; i < nrows; i++) outvec[i] = (REAL) 0.0; for (i = 0; i < nrows - 1; i++) { nrest = (rowp[i + 1] - rowp[i]) % 4; for (j = rowp[i]; j < rowp[i + 1] - nrest; j += 4) { outvec[i] = outvec[i] + (matvals[j] * invec[indx[j]]) + matvals[j + 1] * invec[indx[j + 1]] + matvals[j + 2] * invec[indx[j + 2]] + matvals[j + 3] * invec[indx[j + 3]]; } for (j = rowp[i + 1] - nrest; j < rowp[i + 1]; j++) { outvec[i] = outvec[i] + matvals[j] * invec[indx[j]]; } } for (j = rowp[nrows - 1]; j < nelmts; j++) { outvec[nrows - 1] = outvec[nrows - 1] + matvals[j] * invec[indx[j]]; } } 17

18 MOD2AS: main() #define LOOPS 1 for (j = 0; j < LOOPS; j++) { for (i = 0; i < nrows; i++) outvec[i] = (REAL) 0.0; #pragma hmpp Hspmxv callsite spmxv(ncols, nrows, nelmts, indx, rowp, matvals, invec, outvec); } time = cclock() - time; time /= LOOPS; ok = check(ncols, nrows, nelmts, indx, rowp, outvec); 18

19 MOD2AS: spmxv.c void spmxv(int ncols, int nrows, int nelmts, int indx[nelmts], int rowp[nrows], double matvals[nelmts], double invec[ncols], double outvec[nrows]) { int i, j, k; int start, end; int nrest = 0; int warpoff = 0; double somme = 0.; #define SIZE 32*2 double partial[nrows * SIZE]; for (i = 0; i < nrows; i++) { outvec[i] = (double) 0.0; } 19

20 2D Grid // do partial sums in // taking warp sizes into account for (i = 0; i < nrows; i++) { for (warpoff = 0; warpoff < SIZE; warpoff++) { start = rowp[i]; if (i == (nrows - 1)) { end = nelmts; } else { end = rowp[i + 1]; } somme = 0; // partial reduction for (j = start + warpoff; j < end; j += SIZE) { somme += (matvals[j] * invec[indx[j]]); } partial[i * SIZE + warpoff] = somme; } // for warpoff } // for i 20

21 1D Grid // finish summing for (i = 0; i < nrows; i++) { somme = 0.; for (k = 0; k < SIZE; k++) { somme += partial[i * SIZE + k]; } outvec[i] = somme; } } // spmxv 21

22 Memory access for mod2as // Row i start end SIZE threads in blocks // // Row i SIZE 22

23 MOD2AS: results on Uchu Program mod2as: Sparse (CRS) Matrix-vector Multiply #Rows #Cols %Fill Time(s) Mflop/s OK T T T T T T T T T T T T T T T T 23

24 24

25 Harpertown HMPP in a real code Test case =10000x10000, 10 iterations DP Accélération ,05 8,6 10,3 Série1 0 Harpertown 2.800GHz 1626s gcc Harpertown 2.800GHz s icc Nehalem EP 2.933GHz s icc Tesla T10 1.3GHz s HMPP Accélération GCC ICC HMPP CUDA Nehalem-EP 6,15 5,15 1 Série1 Tesla T10 1.3GHz s CUDA 0 ICC HMPP CUDA 25

26 CUDA Performances after optimization 26

27 HMPP port : some conclusions Kernels are important to understand a language Yet only full applications can exercise a language Only a small part of HMPP has been tested here Port to HMPP is easy The difficulty lies in the data movements HMPP allows for decent performances in real cases Asynchronous transfers and calls are key for performance 27

CAPS Technology. ProHMPT, 2009 March 12th

CAPS Technology. ProHMPT, 2009 March 12th CAPS Technology ProHMPT, 2009 March 12th Overview of the Talk 1. HMPP in a nutshell Directives for Hardware Accelerators (HWA) 2. HMPP Code Generation Capabilities Efficient code generation for CUDA 3.