Lecture 8. Introduction to Intel Xeon Phi Coprocessor Programming


1 Lecture 8. Introduction to Intel Xeon Phi Coprocessor Programming
Mikhail Kurnosov, mkurnosov@gmail.com, WWW: www.mkurnosov.net
Course "Parallel Programming", Siberian State University of Telecommunications and Information Science (Novosibirsk)
Fall semester, 2016. CC-BY

2 Intel Xeon Phi

Intel Teraflops Research Chip (Polaris, 2006-2007): 80 cores, 80 routers, self-correction system
Single-Chip Cloud Computer (SCC, 2009): 48 P54C Pentium cores, 4x6 2D-mesh of tiles (2 cores per tile)
Intel Larrabee (2008-2010): GPGPU: x86, cache coherency, 1024-bit ring bus, 4-way SMT; development cancelled in 2010; Larrabee GPU (SIGGRAPH, 2008)
Intel MIC (Intel Many Integrated Core Architecture): cache-coherent shared-memory multiprocessor, hardware multithreading (4-way SMT), wide vector registers (512 bits)
  Knights Ferry (2010): 32 in-order cores, 4-way SMT, ring bus, 750 GFLOPS
  Knights Corner / Xeon Phi (2012): 22 nm, >= 50 cores
  Knights Landing (2nd gen., 14 nm, 2016): Silvermont (Atom) cores, 4 threads per core
  Knights Hill (3rd gen., 10 nm)

3 Intel Knights Corner micro-architecture

More than 50 cores (based on the Pentium P54C): x86 ISA, in-order, 4-way SMT, 512-bit SIMD units
Cache: 32 KB L1 data/instruction cache per core, coherent L2 cache (512 KB per core)
Ring bus: bidirectional
Connected to the host over the PCI Express bus

cnmic (oak): Intel Xeon Phi 3120A
  Cores: 57, 4-way SMT (core #57 is reserved for the OS!): 224 threads available for offload
  RAM: 6 GiB GDDR5

Intel Compiler: $ source /opt/intel_cluster/bin/iccvars.sh intel64

4 Counting prime numbers (serial version)

    int is_prime_number(int n)
    {
        int limit = sqrt(n) + 1;
        for (int i = 2; i <= limit; i++) {
            if (n % i == 0)
                return 0;
        }
        return (n > 1) ? 1 : 0;
    }

Number of operations: O(sqrt(n))

    int count_prime_numbers(int a, int b)
    {
        int nprimes = 0;
        if (a <= 2) {
            /* Count '2' as a prime number */
            nprimes = 1;
            a = 3;
        }
        /* Shift 'a' to odd number */
        if (a % 2 == 0)
            a++;
        /* Loop over odd numbers: a, a + 2, a + 4, ..., b */
        for (int i = a; i <= b; i += 2) {
            if (is_prime_number(i))
                nprimes++;
        }
        return nprimes;
    }

All odd numbers in the interval [a, b] are tested.

5 Counting prime numbers (OpenMP)

    int count_prime_numbers_omp(int a, int b)
    {
        int nprimes = 0;
        /* Count '2' as a prime number */
        if (a <= 2) {
            nprimes = 1;
            a = 3;
        }
        /* Shift 'a' to odd number */
        if (a % 2 == 0)
            a++;
        #pragma omp parallel
        {
            /* Loop over odd numbers: a, a + 2, a + 4, ..., b */
            #pragma omp for schedule(dynamic) reduction(+:nprimes)
            for (int i = a; i <= b; i += 2) {
                if (is_prime_number(i))
                    nprimes++;
            }
        }
        return nprimes;
    }

6 Counting prime numbers (OpenMP)

(the count_prime_numbers_omp() code is the same as on slide 5)

cnmic (oak.cpct.sibsutis.ru)
  System board: ASUS Z10PE-D8 WS (dual CPU)
  CPU: 2 x Intel Xeon E5-2620 v3 (2.4 GHz, Haswell, 6 cores)
  RAM: 64 GiB DDR4 (4x8 GiB + 4x8 GiB)
  Coprocessor: Intel Xeon Phi 3120A (Cores: 56, 4-way SMT; RAM: 6 GiB GDDR5)

$ icc -std=c99 -Wall -O -fopenmp -c primes.c -o primes.o
$ icc -o primes primes.o -lm -fopenmp

$ export OMP_NUM_THREADS=
$ ./primes
Count prime numbers in [, ]
Result (host serial): 686
Result (host parallel): 686
Execution time (host serial): .76
Execution time (host parallel): .7
Speedup host_serial/host_omp: .7

7 Counting prime numbers: Xeon Phi offload (serial)

    #include <offload.h>

    double run_phi_serial()
    {
    #ifdef __INTEL_OFFLOAD
        printf("Intel Xeon Phi devices: %d\n", _Offload_number_of_devices());
    #endif
        int n;
        double t = wtime();
        #pragma offload target(mic) out(n)
        n = count_prime_numbers_phi(a, b);
        t = wtime() - t;
        printf("Result (phi serial): %d\n", n);
        return t;
    }

The structured block is offloaded for execution to the coprocessor. One has to specify:
  - the objects that must be copied into the coprocessor memory before the block runs (in)
  - the objects that must be copied out of the coprocessor memory after the block finishes (out)
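As a minimal illustration of the two clauses (hypothetical variables lo, hi, result; not code from the lecture), a region that copies scalars to the coprocessor and a result back might look like this:

    /* Sketch: an offload region with both in() and out() clauses.
       'lo' and 'hi' are copied to the coprocessor before the block runs,
       'result' is copied back to the host after it finishes. */
    int lo = 100, hi = 200;   /* arbitrary interval, for illustration only */
    int result;
    #pragma offload target(mic) in(lo, hi) out(result)
    {
        result = count_prime_numbers_phi(lo, hi);
    }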

8 Counting prime numbers: Xeon Phi offload (serial)

    __attribute__((target(mic)))
    int count_prime_numbers_phi(int a, int b)
    {
        int nprimes = 0;
        /* Count '2' as a prime number */
        if (a <= 2) {
            nprimes = 1;
            a = 3;
        }
        /* Shift 'a' to odd number */
        if (a % 2 == 0)
            a++;
        /* Loop over odd numbers: a, a + 2, a + 4, ..., b */
        for (int i = a; i <= b; i += 2) {
            if (is_prime_number(i))
                nprimes++;
        }
        return nprimes;
    }

Functions and variables that are accessed on the coprocessor are marked with the attribute __attribute__((target(mic))) (heterogeneous compilation).

9 Counting prime numbers: Xeon Phi offload (serial)

    __attribute__((target(mic)))
    int is_prime_number(int n)
    {
        int limit = sqrt(n) + 1;
        for (int i = 2; i <= limit; i++) {
            if (n % i == 0)
                return 0;
        }
        return (n > 1) ? 1 : 0;
    }

Functions and variables that are accessed on the coprocessor are marked with the attribute __attribute__((target(mic))) (heterogeneous compilation).
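When many functions have to be compiled for the coprocessor, the Intel compiler also allows bracketing a whole group of declarations instead of annotating each one separately; a small sketch (the declarations shown are the functions from this lecture):

    /* Everything between push and pop is compiled for both the host and MIC. */
    #pragma offload_attribute(push, target(mic))
    int is_prime_number(int n);
    int count_prime_numbers_phi(int a, int b);
    #pragma offload_attribute(pop)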

10 Counting prime numbers: Xeon Phi offload (serial)

    int main(int argc, char **argv)
    {
        printf("Count prime numbers in [%d, %d]\n", a, b);
        double thost_serial = run_host_serial();
        double thost_par = run_host_parallel();
        double tphi_serial = run_phi_serial();
        printf("Execution time (host serial): %.6f\n", thost_serial);
        printf("Execution time (host parallel): %.6f\n", thost_par);
        printf("Execution time (phi serial): %.6f\n", tphi_serial);
        printf("Ratio phi_serial/host_serial: %.2f\n", tphi_serial / thost_serial);
        printf("Speedup host_serial/host_omp: %.2f\n", thost_serial / thost_par);
        return 0;
    }

On this test one Intel Xeon Phi 3120A core is about 5 times slower than one Intel Xeon E5-2620 v3 core.

$ export OMP_NUM_THREADS=
$ ./primes
Count prime numbers in [, ]
Result (host serial): 686
Result (host parallel): 686
Intel Xeon Phi devices: 1
Result (phi serial): 686
Execution time (host serial): .8
Execution time (host parallel): .8
Execution time (phi serial): 8.96
Ratio phi_serial/host_serial: 4.86
Speedup host_serial/host_omp: .74

11 Counting prime numbers: Xeon Phi offload (OpenMP)

    double run_phi_parallel()
    {
    #ifdef __INTEL_OFFLOAD
        printf("Intel Xeon Phi devices: %d\n", _Offload_number_of_devices());
    #endif
        int n;
        double t = wtime();
        #pragma offload target(mic) out(n)
        n = count_prime_numbers_phi_omp(a, b);
        t = wtime() - t;
        printf("Result (phi parallel): %d\n", n);
        return t;
    }

12 Counting prime numbers: Xeon Phi offload (OpenMP)

    __attribute__((target(mic)))
    int count_prime_numbers_phi_omp(int a, int b)
    {
        int nprimes = 0;
        /* Count '2' as a prime number */
        if (a <= 2) {
            nprimes = 1;
            a = 3;
        }
        /* Shift 'a' to odd number */
        if (a % 2 == 0)
            a++;
        /* Loop over odd numbers: a, a + 2, a + 4, ..., b */
        #pragma omp parallel
        {
            #pragma omp for schedule(dynamic) reduction(+:nprimes)
            for (int i = a; i <= b; i += 2) {
                if (is_prime_number(i))
                    nprimes++;
            }
        }
        return nprimes;
    }

13 Counting prime numbers: Xeon Phi offload (OpenMP)

    int main(int argc, char **argv)
    {
        printf("Count prime numbers in [%d, %d]\n", a, b);
        double thost_serial = run_host_serial();
        double thost_par = run_host_parallel();
        double tphi_serial = run_phi_serial();
        double tphi_par = run_phi_parallel();
        printf("Execution time (host serial): %.6f\n", thost_serial);
        printf("Execution time (host parallel): %.6f\n", thost_par);
        printf("Execution time (phi serial): %.6f\n", tphi_serial);
        printf("Execution time (phi parallel): %.6f\n", tphi_par);
        printf("Ratio phi_serial/host_serial: %.2f\n", tphi_serial / thost_serial);
        printf("Speedup host_serial/host_omp: %.2f\n", thost_serial / thost_par);
        printf("Speedup host_omp/phi_omp: %.2f\n", thost_par / tphi_par);
        printf("Speedup host_serial/phi_omp: %.2f\n", thost_serial / tphi_par);
        printf("Speedup phi_serial/phi_omp: %.2f\n", tphi_serial / tphi_par);
        return 0;
    }

14 Counting prime numbers: Xeon Phi offload (OpenMP)

(the main() code is the same as on slide 13)

$ cat ./launch.sh
#!/bin/sh
# Host OpenMP
export OMP_NUM_THREADS=
# Xeon Phi OpenMP
export MIC_ENV_PREFIX=MIC
export MIC_OMP_NUM_THREADS=4
./primes

$ ./launch.sh
Count prime numbers in [, ]
Execution time (host serial): .78
Execution time (host parallel): .466
Execution time (phi serial): 8.6
Execution time (phi parallel): .7784
Ratio phi_serial/host_serial: 4.85
Speedup host_serial/host_omp: .6
Speedup host_omp/phi_omp: .4
Speedup host_serial/phi_omp: .58
Speedup phi_serial/phi_omp: .4

With MIC_OMP_NUM_THREADS set as above, the Phi OpenMP version is faster than the serial Phi version, but still about 7 times slower than the host OpenMP version.

15 Xeon Phi: binding threads to cores (affinity)

$ cat ./launch.sh
#!/bin/sh
# Host OpenMP
export OMP_NUM_THREADS=
# Xeon Phi OpenMP
export MIC_ENV_PREFIX=MIC
export MIC_OMP_NUM_THREADS=56
export MIC_KMP_AFFINITY=verbose
./primes

Execution time (host serial): .9
Execution time (host parallel): .86
Execution time (phi serial): 8.78
Execution time (phi parallel): .546
Ratio phi_serial/host_serial: 4.86
Speedup host_serial/host_omp: .
Speedup host_omp/phi_omp: .
Speedup host_serial/phi_omp: .4
Speedup phi_serial/phi_omp: .

OMP: Info: KMP_AFFINITY: decoding x2apic ids.
OMP: Info: KMP_AFFINITY: cpuid leaf 11 not supported - decoding legacy APIC ids.
OMP: Info: KMP_AFFINITY: Affinity capable, using global cpuid info
OMP: Info: KMP_AFFINITY: Initial OS proc set respected: 1,2,3,4,...,224
OMP: Info: KMP_AFFINITY: 224 available OS procs
OMP: Info: KMP_AFFINITY: Uniform topology
OMP: Info: KMP_AFFINITY: 1 packages x 56 cores/pkg x 4 threads/core (56 total cores)

16 Xeon Phi: binding threads to cores (affinity)

OMP: Info: KMP_AFFINITY: OS proc to physical thread map:
OMP: Info: KMP_AFFINITY: OS proc 1 maps to package 0 core 0 thread 0
OMP: Info: KMP_AFFINITY: OS proc 2 maps to package 0 core 0 thread 1
OMP: Info: KMP_AFFINITY: OS proc 3 maps to package 0 core 0 thread 2
OMP: Info: KMP_AFFINITY: OS proc 4 maps to package 0 core 0 thread 3
OMP: Info: KMP_AFFINITY: OS proc 5 maps to package 0 core 1 thread 0
OMP: Info: KMP_AFFINITY: OS proc 6 maps to package 0 core 1 thread 1
...
OMP: Info: KMP_AFFINITY: OS proc 224 maps to package 0 core 55 thread 3

[Diagram: OS procs laid out over Core 0 ... Core 55, four hardware threads per core]

17 Xeon Phi: binding threads to cores (affinity)

OMP: Info: KMP_AFFINITY: pid 4 thread 0 bound to OS proc set 1
OMP: Info: KMP_AFFINITY: pid 4 thread 1 bound to OS proc set 5
OMP: Info: KMP_AFFINITY: pid 4 thread 2 bound to OS proc set 9
OMP: Info: KMP_AFFINITY: pid 4 thread 3 bound to OS proc set 13
...
OMP: Info: KMP_AFFINITY: pid 4 thread 54 bound to OS proc set 217
OMP: Info: KMP_AFFINITY: pid 4 thread 55 bound to OS proc set 221

Default affinity
[Diagram: the 56 OMP threads are placed one per core over Core 0 ... Core 55]

18 Xeon Phi: binding threads to cores (affinity)

$ cat ./launch.sh
#!/bin/sh
# Host OpenMP
export OMP_NUM_THREADS=
# Xeon Phi OpenMP
export MIC_ENV_PREFIX=MIC
export MIC_OMP_NUM_THREADS=56
export MIC_KMP_AFFINITY=granularity=fine,balanced
./primes

Execution time (host serial): .5
Execution time (host parallel): .67
Execution time (phi serial): 
Execution time (phi parallel): .4887
Ratio phi_serial/host_serial: 4.86
Speedup host_serial/host_omp: .7
Speedup host_omp/phi_omp: .
Speedup host_serial/phi_omp: .5
Speedup phi_serial/phi_omp: 7.4

fine, balanced
[Diagram: the 56 OMP threads are spread evenly, one per core, over Core 0 ... Core 55]

19 Xeon Phi: binding threads to cores (affinity)

$ cat ./launch.sh
#!/bin/sh
# Host OpenMP
export OMP_NUM_THREADS=
# Xeon Phi OpenMP
export MIC_ENV_PREFIX=MIC
export MIC_OMP_NUM_THREADS=56
export MIC_KMP_AFFINITY=granularity=fine,compact
./primes

Execution time (host serial): .5
Execution time (host parallel): .78
Execution time (phi serial): 
Execution time (phi parallel): .4787
Ratio phi_serial/host_serial: 4.86
Speedup host_serial/host_omp: .7
Speedup host_omp/phi_omp: .7
Speedup host_serial/phi_omp: .8
Speedup phi_serial/phi_omp: .9

fine, compact
[Diagram: the 56 OMP threads are packed 4 per core onto the first 14 cores; the remaining cores stay idle]

20 Xeon Phi: binding threads to cores (affinity)

$ cat ./launch.sh
#!/bin/sh
# Host OpenMP
export OMP_NUM_THREADS=
# Xeon Phi OpenMP
export MIC_ENV_PREFIX=MIC
export MIC_OMP_NUM_THREADS=56
export MIC_KMP_AFFINITY=granularity=fine,scatter
./primes

Execution time (host serial): .4
Execution time (host parallel): .878
Execution time (phi serial): 
Execution time (phi parallel): .55
Ratio phi_serial/host_serial: 4.86
Speedup host_serial/host_omp: .5
Speedup host_omp/phi_omp: .
Speedup host_serial/phi_omp: .
Speedup phi_serial/phi_omp: .7

fine, scatter
[Diagram: the 56 OMP threads are distributed round-robin, one per core, over Core 0 ... Core 55]

21 Xeon Phi: binding threads to cores (affinity)

Summary for NUM_THREADS OpenMP threads on 56 cores (4 hardware threads per core):

fine, scatter: core_id = thread_id % ncores
  [Diagram: threads are assigned to cores round-robin, so consecutive threads land on different cores]

fine, balanced:
  [Diagram: threads are split into contiguous blocks and spread evenly over the cores, so consecutive threads stay on the same or neighbouring cores]

fine, compact:
  [Diagram: threads are packed four per core, filling Core 0, Core 1, ... and leaving the remaining cores idle]
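To make the placement rules concrete, here is a small sketch (a hypothetical helper, not part of the lecture code) that prints which core each OpenMP thread lands on under the scatter and compact policies, assuming ncores cores with 4 hardware threads per core:

    #include <stdio.h>

    /* Illustration of the scatter and compact placement rules. */
    void print_placement(int nthreads, int ncores)
    {
        for (int t = 0; t < nthreads; t++) {
            int core_scatter = t % ncores;   /* round-robin over the cores */
            int core_compact = t / 4;        /* fill a core (4 threads) before moving on */
            printf("thread %3d: scatter -> core %2d, compact -> core %2d\n",
                   t, core_scatter, core_compact);
        }
    }

    int main(void)
    {
        print_placement(56, 56);   /* e.g. 56 threads on 56 cores */
        return 0;
    }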

22 Matrix multiplication: SGEMM

23 Matrix multiplication SGEMM (serial)

    enum { N = ..., M = ..., Q = ..., NREPS = ... };

    /* Naive matrix multiplication C[n, q] = A[n, m] * B[m, q] */
    void sgemm_host(float *a, float *b, float *c, int n, int m, int q)
    {
        /* FP ops: 2 * n * q * m */
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < q; j++) {
                float s = 0.0;
                for (int k = 0; k < m; k++)
                    s += a[i * m + k] * b[k * q + j];
                c[i * q + j] = s;
            }
        }
    }

GFLOP = 2 * N * M * Q / 10^9
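For example, with N = M = Q = 2048 (an illustrative size only; the actual constants are defined in the enum above), the kernel performs 2 * 2048 * 2048 * 2048 ≈ 1.7 * 10^10 floating-point operations, i.e. about 17.2 GFLOP.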

24 Matrix multiplication SGEMM (serial)

    double run_host(const char *msg, void (*sgemm_fun)(float *, float *, float *, int, int, int))
    {
        double gflop = 2.0 * N * Q * M * 1E-9;
        float *a, *b, *c;
        a = malloc(sizeof(*a) * N * M);
        b = malloc(sizeof(*b) * M * Q);
        c = malloc(sizeof(*c) * N * Q);
        srand(0);
        for (int i = 0; i < N; i++)
            for (int j = 0; j < M; j++)
                a[i * M + j] = rand() % 100;
        for (int i = 0; i < M; i++)
            for (int j = 0; j < Q; j++)
                b[i * Q + j] = rand() % 100;

        /* Warmup */
        double twarmup = wtime();
        sgemm_fun(a, b, c, N, M, Q);
        twarmup = wtime() - twarmup;

25 Matrix multiplication SGEMM (serial)

        /* Measures */
        double tavg = 0.0;
        double tmin = 1E6;
        double tmax = 0.0;
        for (int i = 0; i < NREPS; i++) {
            double t = wtime();
            sgemm_fun(a, b, c, N, M, Q);
            t = wtime() - t;
            tavg += t;
            tmin = (tmin > t) ? t : tmin;
            tmax = (tmax < t) ? t : tmax;
        }
        tavg /= NREPS;
        printf("%s (%d runs): perf %.2f GFLOPS; time: tavg %.6f, tmin %.6f, tmax %.6f, twarmup %.6f\n",
               msg, NREPS, gflop / tavg, tavg, tmin, tmax, twarmup);
        free(c);
        free(b);
        free(a);
        return tavg;
    }

# cnmic, Intel Xeon CPU E5-2620 v3
SGEMM N = , M = , Q = 
Host serial ( runs): perf .8 GFLOPS; time: tavg .974, tmin .888, tmax .844, twarmup .684
SGEMM N = , M = , Q = 
Host serial ( runs): perf .54 GFLOPS; time: tavg .58897, tmin .89, tmax .5474, twarmup .5786
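The wtime() helper used throughout these examples is not shown on the slides; one possible implementation (an assumption, not the author's code) simply wraps the OpenMP wall-clock timer:

    #include <omp.h>

    /* Assumed wtime(): wall-clock time in seconds. */
    double wtime(void)
    {
        return omp_get_wtime();
    }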

26 Matrix multiplication SGEMM (serial): optimized version

    /* Matrix multiplication C[n, q] = A[n, m] * B[m, q] */
    void sgemm_host_opt(float *a, float *b, float *c, int n, int m, int q)
    {
        for (int i = 0; i < n * q; i++)
            c[i] = 0.0;
        /* Permute loops k and j for improving cache utilization */
        /* FP ops: 2 * n * m * q */
        for (int i = 0; i < n; i++)
            for (int k = 0; k < m; k++)
                for (int j = 0; j < q; j++)
                    c[i * q + j] += a[i * m + k] * b[k * q + j];
    }
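The loop permutation changes only the order in which the partial products are accumulated, so the result should match the naive kernel up to floating-point rounding; a hypothetical check (not part of the lecture code) could compare the two output matrices with a small tolerance:

    #include <math.h>

    /* Returns 1 if the two n x q result matrices agree within eps, 0 otherwise. */
    int compare_results(const float *c1, const float *c2, int n, int q, float eps)
    {
        for (int i = 0; i < n * q; i++) {
            if (fabsf(c1[i] - c2[i]) > eps)
                return 0;
        }
        return 1;
    }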

27 Matrix multiplication SGEMM (OpenMP)

    /* Matrix multiplication C[n, q] = A[n, m] * B[m, q] */
    void sgemm_host_omp(float *a, float *b, float *c, int n, int m, int q)
    {
        #pragma omp parallel
        {
            #pragma omp for
            for (int i = 0; i < n; i++) {
                int k = i * q;
                for (int j = 0; j < q; j++)
                    c[k++] = 0.0;
            }
            #pragma omp for
            for (int i = 0; i < n; i++)
                for (int k = 0; k < m; k++)
                    for (int j = 0; j < q; j++)
                        c[i * q + j] += a[i * m + k] * b[k * q + j];
        }
    }

28 Matrix multiplication SGEMM (OpenMP)

    int main(int argc, char **argv)
    {
        int omp_only = (argc > 1) ? 1 : 0;
        printf("SGEMM N = %d, M = %d, Q = %d\n", N, M, Q);
        if (!omp_only) {
            double t_host = run_host("Host serial", &sgemm_host);
            double t_host_opt = run_host("Host opt", &sgemm_host_opt);
            double t_host_omp = run_host("Host OMP", &sgemm_host_omp);
            printf("Speedup (host/host_opt): %.2f\n", t_host / t_host_opt);
            printf("Speedup (host_opt/host_omp): %.2f\n", t_host_opt / t_host_omp);
        } else {
            char buf[256];
            sprintf(buf, "Host OMP %d", omp_get_max_threads());
            run_host(buf, &sgemm_host_omp);
        }
        return 0;
    }

29 Matrix multiplication SGEMM (OpenMP)

(the main() code is the same as on slide 28)

# cnmic, Intel Xeon CPU E5-2620 v3 ( threads)
SGEMM N = , M = , Q = 
Host serial ( runs): perf .8 GFLOPS; time: tavg .85, tmin .976, tmax .9, twarmup .6
Host opt ( runs): perf 8.78 GFLOPS; time: tavg .788, tmin .78, tmax .85, twarmup .8679
Host OMP ( runs): perf 4.6 GFLOPS; time: tavg .964, tmin .94, tmax .98, twarmup .67
Speedup (host/host_opt): 4.87
Speedup (host_opt/host_omp): .89

SGEMM N = , M = , Q = 
Host serial ( runs): perf .6 GFLOPS; time: tavg 9.989, tmin , tmax , twarmup 
Host opt ( runs): perf 6.58 GFLOPS; time: tavg .4979, tmin .487, tmax .467, twarmup .474
Host OMP ( runs): perf 4. GFLOPS; time: tavg .785, tmin .6996, tmax .794, twarmup .8474
Speedup (host/host_opt): 4.
Speedup (host_opt/host_omp): 6.5

30 Matrix multiplication SGEMM (Xeon Phi)

    /* Matrix multiplication C[n, q] = A[n, m] * B[m, q] */
    void sgemm_phi(float *a, float *b, float *c, int n, int m, int q)
    {
        #pragma offload target(mic) in(a:length(n * m)) in(b:length(m * q)) out(c:length(n * q))
        {
            #pragma omp parallel
            {
                #pragma omp for
                for (int i = 0; i < n; i++) {
                    int k = i * q;
                    for (int j = 0; j < q; j++)
                        c[k++] = 0.0;
                }
                #pragma omp for
                for (int i = 0; i < n; i++)
                    for (int k = 0; k < m; k++)
                        for (int j = 0; j < q; j++)
                            c[i * q + j] += a[i * m + k] * b[k * q + j];
            }
        }
    }

31 Matrix multiplication SGEMM (Xeon Phi)

(the sgemm_phi() code is the same as on slide 30)

# Intel Xeon Phi 3120A, N = M = Q = 
SGEMM N = , M = , Q = 
Phi OMP 56 (5 runs): perf .49 GFLOPS; time: tavg .657, tmin .6, tmax .66, twarmup .855
SGEMM N = , M = , Q = 
Phi OMP  (5 runs): perf 4.44 GFLOPS; time: tavg .49456, tmin .4549, tmax .666, twarmup 
SGEMM N = , M = , Q = 
Phi OMP 4 (5 runs): perf 9.4 GFLOPS; time: tavg .585, tmin .475, tmax .5659, twarmup 

32 Matrix multiplication SGEMM (Xeon Phi)

(the sgemm_phi() code is the same as on slide 30)

# Intel Xeon Phi 3120A, N = M = Q = 5
SGEMM N = 5, M = 5, Q = 5
Phi OMP  (5 runs): perf  GFLOPS; time: tavg .9989, tmin .7967, tmax .7986, twarmup 
SGEMM N = 5, M = 5, Q = 5
Phi OMP 4 (5 runs): perf 8.86 GFLOPS; time: tavg .769, tmin .9698, tmax .445, twarmup 

33 Matrix multiplication SGEMM (Intel MKL): TeraFLOPS

34 Matrix multiplication SGEMM (Intel MKL)

    #include <mkl.h>   // for sgemm

    /* Matrix multiplication C[n, q] = A[n, m] * B[m, q] */
    void sgemm_phi_mkl(float *a, float *b, float *c, int n, int m, int q)
    {
        /*
         * cblas_sgemm: C[] = alpha * A[] x B[] + beta * C[]
         */
        float alpha = 1.0;
        float beta = 0.0;
        #pragma offload target(mic) in(a:length(n * m)) in(b:length(m * q)) out(c:length(n * q))
        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, q, m,
                    alpha, a, m, b, q, beta, c, q);
    }

    double run_phi(const char *msg, void (*sgemm_fun)(float *, float *, float *, int, int, int))
    {
        a = mkl_malloc(sizeof(*a) * N * M, 64);
        // ...
        mkl_free(a);
        return tavg;
    }

# Makefile
CFLAGS := -Wall -g -std=c99 -fopenmp -mkl -O
LDFLAGS := -mkl -lm -fopenmp

35 Matrix multiplication SGEMM (Intel MKL)

(the sgemm_phi_mkl() code is the same as on slide 34)

# Intel Xeon Phi 3120A
SGEMM N = , M = , Q = 
Phi MKL 4 ( runs): perf 7.6 GFLOPS; time: tavg .777, tmin .4476, tmax .479, twarmup .59
SGEMM N = 5, M = 5, Q = 5
Phi MKL 4 ( runs): perf 75.8 GFLOPS; time: tavg , tmin .6547, tmax .7788, twarmup .764
SGEMM N = , M = , Q = 
Phi MKL 4 ( runs): perf  GFLOPS; time: tavg .848, tmin .76794, tmax .94655, twarmup 8.59
SGEMM N = 5, M = 5, Q = 5
Phi MKL 4 ( runs): perf  GFLOPS; time: tavg .6985, tmin .5898, tmax 4.55, twarmup 

36 Matrix multiplication SGEMM (Intel MKL)

# launch.sh
export MIC_ENV_PREFIX=MIC
export MIC_OMP_NUM_THREADS=67
export MIC_USE_2MB_BUFFERS=2M
export MIC_KMP_AFFINITY=explicit,granularity=fine,proclist=[-4:]
./sgemm

The Intel compiler offload runtime allocates memory with 2 MB pages when the size of an allocation exceeds the value of the MIC_USE_2MB_BUFFERS environment variable.

# Intel Xeon Phi 3120A
SGEMM N = , M = , Q = 
Phi MKL 67 (5 runs): perf 99. GFLOPS; time: tavg .5566, tmin .49, tmax .6796, twarmup 4.87
SGEMM N = 5, M = 5, Q = 5
Phi MKL 67 (5 runs): perf  GFLOPS; time: tavg 7.66, tmin 7.58, tmax 7.78, twarmup 
SGEMM N = , M = , Q = 
Phi MKL 67 (5 runs): perf 7.86 GFLOPS; time: tavg .95547, tmin .9675, tmax , twarmup 
TeraFLOPS

Reference: "How to Use Huge Pages to Improve Application Performance on Intel Xeon Phi Coprocessor"

37 Matrix multiplication SGEMM (Xeon Phi)

(the sgemm_phi() code is the same as on slide 30, now launched with huge pages and explicit thread affinity)

# Intel Xeon Phi 3120A + MIC_USE_2MB_BUFFERS=2M + thread affinity
SGEMM N = , M = , Q = 
Phi OMP 67 ( runs): perf 75. GFLOPS; time: tavg , tmin 5.998, tmax , twarmup 

38 PCI Express bandwidth test

39 PCI Express bandwidth test

    #define NELEMS(x) (sizeof(x) / sizeof((x)[0]))

    int main(int argc, char **argv)
    {
        printf("Xeon Phi bwtest: nreps = %d\n", NREPS);
        printf("size alignment talloc tsend trecv\n");
        int s[] = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024};
        int a[] = {32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384};
        for (int j = 0; j < NELEMS(a); j++)
            for (int i = 0; i < NELEMS(s); i++)
                testbw(s[i] * 1024 * 1024, a[j]);
        return 0;
    }

40 PCI Express bandwidth test

    void testbw(int size, int alignment)
    {
        uint8_t *buf = _mm_malloc(size, alignment);
        if (!buf) {
            fprintf(stderr, "Can't allocate memory\n");
            exit(EXIT_FAILURE);
        }
        // Init buffer (allocate pages)
        memset(buf, 0, size);

        double t, talloc;
        double tsend = 0.0;
        double trecv = 0.0;

        // Allocate buffer on Phi (and keep it allocated: free_if(0))
        talloc = wtime();
        #pragma offload target(mic) in(buf:length(size) free_if(0))
        { }
        talloc = wtime() - talloc;

[Diagram: buf[] is transferred from host memory to the Intel Xeon Phi]

41 PCI Express bandwidth test

        // Measures
        for (int i = 0; i < NREPS; i++) {
            // Copy to Phi
            t = wtime();
            #pragma offload target(mic) in(buf:length(size) alloc_if(0) free_if(0))
            { }
            tsend += wtime() - t;

            // Copy from Phi
            t = wtime();
            #pragma offload target(mic) out(buf:length(size) alloc_if(0) free_if(0))
            { }
            trecv += wtime() - t;
        }

        // Free on Phi
        #pragma offload target(mic) in(buf:length(size) alloc_if(0) free_if(1))
        { }

        tsend /= NREPS;
        trecv /= NREPS;
        printf("%-d %-8d %-.6f %-.6f %-.6f\n", size, alignment, talloc, tsend, trecv);
        _mm_free(buf);
    }
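The test reports only the raw times; if the effective PCI Express bandwidth is also of interest, it can be derived from them with a helper like the following (an added sketch, not part of the original test):

    /* Effective bandwidth in GiB/s for a transfer of size_bytes that took seconds. */
    static double bandwidth_gib(double size_bytes, double seconds)
    {
        return size_bytes / seconds / (1024.0 * 1024.0 * 1024.0);
    }

    /* Possible usage inside testbw(), after tsend and trecv have been averaged:
       printf("bandwidth: host->phi %.2f GiB/s, phi->host %.2f GiB/s\n",
              bandwidth_gib(size, tsend), bandwidth_gib(size, trecv)); */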

42 PCI Express bandwidth test

43 Running a program in Native Mode

Native Mode:
1. The program is compiled on the host machine with the cross-compiler (icc): sequential code, OpenMP, Intel Cilk Plus, Intel TBB, Intel MPI.
2. The executable and all required libraries are copied to the Intel Xeon Phi (scp, NFS).
3. The program is launched on the Intel Xeon Phi (ssh).

Limitations:
- The amount of Intel Xeon Phi memory (Phi 3120A: 6 GiB GDDR5)
- No non-volatile HDD/SSD storage (the /home/micshare directory is mounted over NFS through the virtual network over PCI Express)

44 Matrix multiplication SGEMM (OpenMP)

    /* Matrix multiplication C[n, q] = A[n, m] * B[m, q] */
    void sgemm_omp(float *a, float *b, float *c, int n, int m, int q)
    {
        #pragma omp parallel
        {
            #pragma omp for
            for (int i = 0; i < n; i++) {
                int k = i * q;
                for (int j = 0; j < q; j++)
                    c[k++] = 0.0;
            }
            #pragma omp for
            for (int i = 0; i < n; i++)
                for (int k = 0; k < m; k++)
                    for (int j = 0; j < q; j++)
                        c[i * q + j] += a[i * m + k] * b[k * q + j];
        }
    }

# Building with the -mmic flag
$ icc -mmic -Wall -g -std=c99 -O -fopenmp -c sgemm.c -o sgemm.o
$ icc -o sgemm sgemm.o -mmic -lm -fopenmp

45 Copying the executable to the Intel Xeon Phi (PCI Express)

cnmic (host):
  # Exported over NFS
  /home/micshare

Intel Xeon Phi (mic0):
  # Available over NFS (PCI Express)
  /home/micshare
  # The home directory resides in RAM
  /home/user

# Copy to the home directory
$ scp ./sgemm mic0:
Password:
sgemm 100% 5KB 5.KB/s

# Run on the Xeon Phi
$ ssh mic0 ./sgemm
Password:
./sgemm: error while loading shared libraries: libiomp5.so: cannot open shared object file: No such file or directory

46 Copying the executable to the Intel Xeon Phi (PCI Express)

# Copying and launching with the micnativeloadex utility
$ cat ./launch.sh
#!/bin/sh
export SINK_LD_LIBRARY_PATH=/home/micshare/libmic
micnativeloadex ./sgemm -d 0 -e "OMP_NUM_THREADS=67 KMP_AFFINITY=explicit,granularity=fine,proclist=[-4:]"

$ ./launch.sh
SGEMM N = , M = , Q = , Xeon Phi memory usage  MiB (.9 %)
Phi OMP Native 67 ( runs): perf 64. GFLOPS; time: tavg .98, tmin .99, tmax .4, twarmup .67

47 Intel Xeon Phi: AVX-512

    void distance(float *x, float *y, float *z, float *d, int n)
    {
        for (int i = 0; i < n; i++)
            d[i] = sqrtf(x[i] * x[i] + y[i] * y[i] + z[i] * z[i]);
    }

    void distance_vec_mic(float *x, float *y, float *z, float *d, int n)
    {
        __m512 *xx = (__m512 *)x;
        __m512 *yy = (__m512 *)y;
        __m512 *zz = (__m512 *)z;
        __m512 *dd = (__m512 *)d;
        int k = n / 16;
        for (int i = 0; i < k; i++) {
            __m512 t1 = _mm512_mul_ps(xx[i], xx[i]);
            __m512 t2 = _mm512_mul_ps(yy[i], yy[i]);
            __m512 t3 = _mm512_mul_ps(zz[i], zz[i]);
            t1 = _mm512_add_ps(t1, t2);
            t1 = _mm512_add_ps(t1, t3);
            dd[i] = _mm512_sqrt_ps(t1);
        }
        for (int i = k * 16; i < n; i++)
            d[i] = sqrtf(x[i] * x[i] + y[i] * y[i] + z[i] * z[i]);
    }
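The vectorized version casts the float pointers to __m512, so the buffers must be 64-byte aligned; a possible calling sketch (a hypothetical driver with an arbitrary problem size, not code from the slides):

    #include <immintrin.h>   /* _mm_malloc, _mm_free, __m512 intrinsics */

    void distance_example(void)
    {
        int n = 1 << 20;   /* illustrative problem size */
        float *x = _mm_malloc(sizeof(*x) * n, 64);
        float *y = _mm_malloc(sizeof(*y) * n, 64);
        float *z = _mm_malloc(sizeof(*z) * n, 64);
        float *d = _mm_malloc(sizeof(*d) * n, 64);
        for (int i = 0; i < n; i++) {
            x[i] = i; y[i] = 2.0f * i; z[i] = 0.5f * i;   /* arbitrary test data */
        }
        distance_vec_mic(x, y, z, d, n);
        _mm_free(d); _mm_free(z); _mm_free(y); _mm_free(x);
    }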

48 OpenMP + SIMD

    void distance(float *x, float *y, float *z, float *d, int n)
    {
        // SIMD only
        #pragma omp simd
        for (int i = 0; i < n; i++)
            d[i] = sqrtf(x[i] * x[i] + y[i] * y[i] + z[i] * z[i]);
    }

    void distance(float *x, float *y, float *z, float *d, int n)
    {
        // Threading + SIMD
        #pragma omp parallel for simd
        for (int i = 0; i < n; i++)
            d[i] = sqrtf(x[i] * x[i] + y[i] * y[i] + z[i] * z[i]);
    }
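If the buffers are allocated with 64-byte alignment (as in the intrinsics example above), this can be communicated to the compiler through the aligned clause; a short sketch (the value 64 is an assumption matching _mm_malloc(..., 64)):

    #include <math.h>

    void distance_simd_aligned(float *x, float *y, float *z, float *d, int n)
    {
        /* Threading + SIMD, with the 64-byte alignment of the buffers declared. */
        #pragma omp parallel for simd aligned(x, y, z, d: 64)
        for (int i = 0; i < n; i++)
            d[i] = sqrtf(x[i] * x[i] + y[i] * y[i] + z[i] * z[i]);
    }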

55 Using MPI on Intel Xeon Phi

Native model: all processes of the MPI program run on the coprocessor. The program is copied to the coprocessor and launched there.

[Diagram: host node cnmic (two CPUs with local memory, NUMA) connected over PCI Express to the Xeon Phi with its own memory]

56 Using MPI on Intel Xeon Phi

Native model: all processes of the MPI program run on the coprocessor. The program is copied to the coprocessor and launched there.

[Diagram: the same node; the MPI processes are placed on the Xeon Phi]

57 Using MPI on Intel Xeon Phi

Symmetric model: the processes of the MPI program run both on the host machine and on the coprocessor.

[Diagram: MPI processes on the host CPUs and on the Xeon Phi]

58 Using MPI on Intel Xeon Phi

Offload model: the MPI processes run on the host machine and offload parts of the code to the coprocessor (#pragma offload, Intel MKL for MIC).

[Diagram: MPI processes on the host CPUs; the offloaded regions (#pragma offload, Intel MKL) execute on the Xeon Phi]

59 MPI Native mode

    #include <unistd.h>
    #include <stdio.h>
    #include <mpi.h>

    int main(int argc, char **argv)
    {
        int rank, len;
        char procname[MPI_MAX_PROCESSOR_NAME];
        MPI_Init(&argc, &argv);
        MPI_Get_processor_name(procname, &len);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        printf("Rank %d on %s\n", rank, procname);
        MPI_Finalize();
        return 0;
    }

# Building the program
$ cat ./build.sh
#!/bin/sh
mpiicc -mmic -Wall -std=c99 -O -o \
    hello ./hello.c

# Running the program
$ cat ./launch.sh
#!/bin/sh
TMP=`mktemp -p /home/micshare/tmp -u`
cp -f ./hello $TMP
export I_MPI_MIC=enable
mpirun -n -f hosts $TMP
rm -f $TMP

$ ./launch.sh
Rank  on cnmic-mic0
Rank  on cnmic-mic0
Rank  on cnmic-mic0

# hosts
mic0

60 MPI Symmetric mode

# Building the program
$ cat ./build.sh
#!/bin/sh
# Binary for Xeon Phi
mpiicc -mmic -Wall -std=c99 -O -o hello.mic ./hello.c
# Binary for Host
mpiicc -Wall -std=c99 -O -o hello ./hello.c

# Running the program on Host + Phi, variant 1
$ cat ./launch.sh
#!/bin/sh
# Copy program to the NFS folder
TMP=`mktemp -p /home/micshare/tmp -u`
cp -f ./hello.mic $TMP
# Launch the program (see hosts file)
export I_MPI_MIC=enable
export I_MPI_FABRICS=shm:tcp
mpirun -n -host cnmic ./hello : -n -host mic0 $TMP
rm -f $TMP

$ ./launch.sh
Rank  on cnmic
Rank  on cnmic
Rank  on cnmic-mic0
Rank 4 on cnmic-mic0
Rank  on cnmic-mic0

61 MPI Symmetric mode

# Building the program
$ cat ./build.sh
#!/bin/sh
# Binary for Xeon Phi
mpiicc -mmic -Wall -std=c99 -O -o hello.mic ./hello.c
# Binary for Host
mpiicc -Wall -std=c99 -O -o hello ./hello.c

# Running the program on Host + Phi, variant 2
$ cat ./launch.sh
#!/bin/sh
# Copy program to the NFS folder
TMP=`mktemp -p /home/micshare/tmp -u`
cp -f ./hello.mic "$TMP.mic"
cp -f ./hello $TMP
# Launch the program (see hosts file)
export I_MPI_MIC=enable
export I_MPI_MIC_POSTFIX=.mic
export I_MPI_FABRICS=shm:tcp
mpirun -machinefile hosts $TMP
rm -f $TMP "$TMP.mic"

# hosts
cnmic:
mic0:

$ ./launch.sh
Rank  on cnmic
Rank  on cnmic
Rank  on cnmic-mic0
Rank 4 on cnmic-mic0
Rank  on cnmic-mic0

62 MPI Offload mode

    int main(int argc, char **argv)
    {
        int rank, len;
        char procname[MPI_MAX_PROCESSOR_NAME];
        MPI_Init(&argc, &argv);
        MPI_Get_processor_name(procname, &len);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        printf("Rank %d on %s\n", rank, procname);

        #pragma offload target(mic) in(rank)
        printf("Hello, from Xeon Phi (offloaded from proc %d)\n", rank);

        MPI_Finalize();
        return 0;
    }

# Building the program
$ cat ./build.sh
#!/bin/sh
mpiicc -Wall -std=c99 -O -o hello ./hello.c

# Running the program
$ cat ./launch.sh
#!/bin/sh
mpirun -n 4 -host cnmic ./hello

$ ./launch.sh
Rank  on cnmic
Rank  on cnmic
Rank  on cnmic
Rank  on cnmic
Hello, from Xeon Phi (offloaded from proc )
Hello, from Xeon Phi (offloaded from proc )
Hello, from Xeon Phi (offloaded from proc )
Hello, from Xeon Phi (offloaded from proc )

63 Configuration of the hybrid NUMA node cnmic

$ hwloc-ls
Machine (64GB)
  NUMANode L#0 (P#0 32GB)
    Socket L#0 + L3 L#0 (15MB)
      L2 L#0 (256KB) + L1d L#0 (32KB) + L1i L#0 (32KB) + Core L#0
    HostBridge L#0
      PCIBridge
        PCI de:f
          GPU "card", GPU "renderd8", GPU "controld64"
      PCI 886:8d6
      PCIBridge
        PCI 886:5  Net "enp6s"
      PCIBridge
        PCI 886:5  Net L#4 "enp7s"
      PCIBridge
        PCI b:6
      PCI 886:8d
        Block L#5 "sda"
  NUMANode L#1 (P#1 32GB)
    Socket L#1 + L3 L#1 (15MB)
      L2 L#6 (256KB) + L1d L#6 (32KB) + L1i L#6 (32KB) + Core L#6
    HostBridge L#5
      PCIBridge
        PCI 886:5d
          CoProc L#6 "mic0"

[Diagram: two NUMA nodes (CPU + local memory each) connected over PCI Express to the Xeon Phi and its GDDR5 memory]


More information

Parallel Processing/Programming

Parallel Processing/Programming Parallel Processing/Programming with the applications to image processing Lectures: 1. Parallel Processing & Programming from high performance mono cores to multi- and many-cores 2. Programming Interfaces

More information

OpenMP and MPI. Parallel and Distributed Computing. Department of Computer Science and Engineering (DEI) Instituto Superior Técnico.

OpenMP and MPI. Parallel and Distributed Computing. Department of Computer Science and Engineering (DEI) Instituto Superior Técnico. OpenMP and MPI Parallel and Distributed Computing Department of Computer Science and Engineering (DEI) Instituto Superior Técnico November 15, 2010 José Monteiro (DEI / IST) Parallel and Distributed Computing

More information

IFS RAPS14 benchmark on 2 nd generation Intel Xeon Phi processor

IFS RAPS14 benchmark on 2 nd generation Intel Xeon Phi processor IFS RAPS14 benchmark on 2 nd generation Intel Xeon Phi processor D.Sc. Mikko Byckling 17th Workshop on High Performance Computing in Meteorology October 24 th 2016, Reading, UK Legal Disclaimer & Optimization

More information

OpenMP and MPI. Parallel and Distributed Computing. Department of Computer Science and Engineering (DEI) Instituto Superior Técnico.

OpenMP and MPI. Parallel and Distributed Computing. Department of Computer Science and Engineering (DEI) Instituto Superior Técnico. OpenMP and MPI Parallel and Distributed Computing Department of Computer Science and Engineering (DEI) Instituto Superior Técnico November 16, 2011 CPD (DEI / IST) Parallel and Distributed Computing 18

More information

Introduc)on to Xeon Phi

Introduc)on to Xeon Phi Introduc)on to Xeon Phi ACES Aus)n, TX Dec. 04 2013 Kent Milfeld, Luke Wilson, John McCalpin, Lars Koesterke TACC What is it? Co- processor PCI Express card Stripped down Linux opera)ng system Dense, simplified

More information

Introduc)on to Xeon Phi

Introduc)on to Xeon Phi Introduc)on to Xeon Phi MIC Training Event at TACC Lars Koesterke Xeon Phi MIC Xeon Phi = first product of Intel s Many Integrated Core (MIC) architecture Co- processor PCI Express card Stripped down Linux

More information

JCudaMP: OpenMP/Java on CUDA

JCudaMP: OpenMP/Java on CUDA JCudaMP: OpenMP/Java on CUDA Georg Dotzler, Ronald Veldema, Michael Klemm Programming Systems Group Martensstraße 3 91058 Erlangen Motivation Write once, run anywhere - Java Slogan created by Sun Microsystems

More information

Intel Xeon Phi архитектура, модели программирования, оптимизация.

Intel Xeon Phi архитектура, модели программирования, оптимизация. Нижний Новгород, 2016 Intel Xeon Phi архитектура, модели программирования, оптимизация. Дмитрий Прохоров, Intel Agenda What and Why Intel Xeon Phi Top 500 insights, roadmap, architecture How Programming

More information

An Introduction to the Intel Xeon Phi. Si Liu Feb 6, 2015

An Introduction to the Intel Xeon Phi. Si Liu Feb 6, 2015 Training Agenda Session 1: Introduction 8:00 9:45 Session 2: Native: MIC stand-alone 10:00-11:45 Lunch break Session 3: Offload: MIC as coprocessor 1:00 2:45 Session 4: Symmetric: MPI 3:00 4:45 1 Last

More information

Introduction to Parallel Programming

Introduction to Parallel Programming Introduction to Parallel Programming Linda Woodard CAC 19 May 2010 Introduction to Parallel Computing on Ranger 5/18/2010 www.cac.cornell.edu 1 y What is Parallel Programming? Using more than one processor

More information

Multicore Performance and Tools. Part 1: Topology, affinity, clock speed

Multicore Performance and Tools. Part 1: Topology, affinity, clock speed Multicore Performance and Tools Part 1: Topology, affinity, clock speed Tools for Node-level Performance Engineering Gather Node Information hwloc, likwid-topology, likwid-powermeter Affinity control and

More information

Outline. Overview Theoretical background Parallel computing systems Parallel programming models MPI/OpenMP examples

Outline. Overview Theoretical background Parallel computing systems Parallel programming models MPI/OpenMP examples Outline Overview Theoretical background Parallel computing systems Parallel programming models MPI/OpenMP examples OVERVIEW y What is Parallel Computing? Parallel computing: use of multiple processors

More information

Klaus-Dieter Oertel, May 28 th 2013 Software and Services Group Intel Corporation

Klaus-Dieter Oertel, May 28 th 2013 Software and Services Group Intel Corporation S c i c o m P 2 0 1 3 T u t o r i a l Intel Xeon Phi Product Family Programming Tools Klaus-Dieter Oertel, May 28 th 2013 Software and Services Group Intel Corporation Agenda Intel Parallel Studio XE 2013

More information

Hands-on. MPI basic exercises

Hands-on. MPI basic exercises WIFI XSF-UPC: Username: xsf.convidat Password: 1nt3r3st3l4r WIFI EDUROAM: Username: roam06@bsc.es Password: Bsccns.4 MareNostrum III User Guide http://www.bsc.es/support/marenostrum3-ug.pdf Remember to

More information

Performance Analysis of Memory Transfers and GEMM Subroutines on NVIDIA TESLA GPU Cluster

Performance Analysis of Memory Transfers and GEMM Subroutines on NVIDIA TESLA GPU Cluster Performance Analysis of Memory Transfers and GEMM Subroutines on NVIDIA TESLA GPU Cluster Veerendra Allada, Troy Benjegerdes Electrical and Computer Engineering, Ames Laboratory Iowa State University &

More information

Native Computing and Optimization on the Intel Xeon Phi Coprocessor. Lars Koesterke John D. McCalpin

Native Computing and Optimization on the Intel Xeon Phi Coprocessor. Lars Koesterke John D. McCalpin Native Computing and Optimization on the Intel Xeon Phi Coprocessor Lars Koesterke John D. McCalpin lars@tacc.utexas.edu mccalpin@tacc.utexas.edu Intro (very brief) Outline Compiling & Running Native Apps

More information

Intel Xeon Phi Coprocessor Offloading Computation

Intel Xeon Phi Coprocessor Offloading Computation Intel Xeon Phi Coprocessor Offloading Computation Legal Disclaimer INFORMATION IN THIS DOCUMENT IS PROVIDED IN CONNECTION WITH INTEL PRODUCTS. NO LICENSE, EXPRESS OR IMPLIED, BY ESTOPPEL OR OTHERWISE,

More information

Parallel Numerical Algorithms

Parallel Numerical Algorithms Parallel Numerical Algorithms http://sudalab.is.s.u-tokyo.ac.jp/~reiji/pna16/ [ 9 ] Shared Memory Performance Parallel Numerical Algorithms / IST / UTokyo 1 PNA16 Lecture Plan General Topics 1. Architecture

More information

Benchmark results on Knight Landing (KNL) architecture

Benchmark results on Knight Landing (KNL) architecture Benchmark results on Knight Landing (KNL) architecture Domenico Guida, CINECA SCAI (Bologna) Giorgio Amati, CINECA SCAI (Roma) Roma 23/10/2017 KNL, BDW, SKL A1 BDW A2 KNL A3 SKL cores per node 2 x 18 @2.3

More information

Growth in Cores - A well rehearsed story

Growth in Cores - A well rehearsed story Intel CPUs Growth in Cores - A well rehearsed story 2 1. Multicore is just a fad! Copyright 2012, Intel Corporation. All rights reserved. *Other brands and names are the property of their respective owners.

More information

A Unified Approach to Heterogeneous Architectures Using the Uintah Framework

A Unified Approach to Heterogeneous Architectures Using the Uintah Framework DOE for funding the CSAFE project (97-10), DOE NETL, DOE NNSA NSF for funding via SDCI and PetaApps A Unified Approach to Heterogeneous Architectures Using the Uintah Framework Qingyu Meng, Alan Humphrey

More information

Vincent C. Betro, Ph.D. NICS March 6, 2014

Vincent C. Betro, Ph.D. NICS March 6, 2014 Vincent C. Betro, Ph.D. NICS March 6, 2014 NSF Acknowledgement This material is based upon work supported by the National Science Foundation under Grant Number 1137097 Any opinions, findings, and conclusions

More information

OpenACC. Part I. Ned Nedialkov. McMaster University Canada. October 2016

OpenACC. Part I. Ned Nedialkov. McMaster University Canada. October 2016 OpenACC. Part I Ned Nedialkov McMaster University Canada October 2016 Outline Introduction Execution model Memory model Compiling pgaccelinfo Example Speedups Profiling c 2016 Ned Nedialkov 2/23 Why accelerators

More information

Bring your application to a new era:

Bring your application to a new era: Bring your application to a new era: learning by example how to parallelize and optimize for Intel Xeon processor and Intel Xeon Phi TM coprocessor Manel Fernández, Roger Philp, Richard Paul Bayncore Ltd.

More information

Introduction to OpenMP

Introduction to OpenMP Introduction to OpenMP Ricardo Fonseca https://sites.google.com/view/rafonseca2017/ Outline Shared Memory Programming OpenMP Fork-Join Model Compiler Directives / Run time library routines Compiling and

More information

CS 470 Spring Mike Lam, Professor. OpenMP

CS 470 Spring Mike Lam, Professor. OpenMP CS 470 Spring 2018 Mike Lam, Professor OpenMP OpenMP Programming language extension Compiler support required "Open Multi-Processing" (open standard; latest version is 4.5) Automatic thread-level parallelism

More information

PROGRAMOVÁNÍ V C++ CVIČENÍ. Michal Brabec

PROGRAMOVÁNÍ V C++ CVIČENÍ. Michal Brabec PROGRAMOVÁNÍ V C++ CVIČENÍ Michal Brabec PARALLELISM CATEGORIES CPU? SSE Multiprocessor SIMT - GPU 2 / 17 PARALLELISM V C++ Weak support in the language itself, powerful libraries Many different parallelization

More information

Parallel Programming. Exploring local computational resources OpenMP Parallel programming for multiprocessors for loops

Parallel Programming. Exploring local computational resources OpenMP Parallel programming for multiprocessors for loops Parallel Programming Exploring local computational resources OpenMP Parallel programming for multiprocessors for loops Single computers nowadays Several CPUs (cores) 4 to 8 cores on a single chip Hyper-threading

More information

Parallel Computing. Lecture 17: OpenMP Last Touch

Parallel Computing. Lecture 17: OpenMP Last Touch CSCI-UA.0480-003 Parallel Computing Lecture 17: OpenMP Last Touch Mohamed Zahran (aka Z) mzahran@cs.nyu.edu http://www.mzahran.com Some slides from here are adopted from: Yun (Helen) He and Chris Ding

More information

Day 5: Introduction to Parallel Intel Architectures

Day 5: Introduction to Parallel Intel Architectures Day 5: Introduction to Parallel Intel Architectures Lecture day 5 Ryo Asai Colfax International colfaxresearch.com April 2017 colfaxresearch.com Welcome Colfax International, 2013 2017 Disclaimer 2 While

More information

OpenMP. António Abreu. Instituto Politécnico de Setúbal. 1 de Março de 2013

OpenMP. António Abreu. Instituto Politécnico de Setúbal. 1 de Março de 2013 OpenMP António Abreu Instituto Politécnico de Setúbal 1 de Março de 2013 António Abreu (Instituto Politécnico de Setúbal) OpenMP 1 de Março de 2013 1 / 37 openmp what? It s an Application Program Interface

More information