Jeff Larkin, April 20, 2016
Performance Portability Through Descriptive Parallelism
2
My definition of performance portability
The same source code will run productively on a variety of different architectures.
3
Two Types of Portability
FUNCTIONAL PORTABILITY
The ability for a single code to run anywhere.
PERFORMANCE PORTABILITY
The ability for a single code to run well anywhere.
4
Performance Portability is Not
Best == Best
Optimal Everywhere
Optimal Anywhere?
5
I Assert
Descriptive Parallelism enables performance portable code.
6
Descriptive or Prescriptive Parallelism
Prescriptive
Programmer explicitly parallelizes the code, compiler obeys
Requires little/no analysis by the compiler
Substantially different architectures require different directives
Fairly consistent behavior between implementations

Descriptive
Compiler parallelizes the code with guidance from the programmer
Compiler must make decisions from available information
Compiler uses information from the programmer and heuristics about the architecture to make decisions
Quality of implementation greatly affects results.
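As a minimal sketch of the distinction before the Jacobi example on the following slides (illustrative only, not code from the deck; the saxpy kernel and function names are assumptions):

#include <stddef.h>

/* Prescriptive (OpenMP): the programmer states exactly how to parallelize,
   e.g. spread iterations across threads and vectorize them. */
void saxpy_prescriptive(size_t n, float a, const float *x, float *y)
{
    #pragma omp parallel for simd
    for (size_t i = 0; i < n; i++)
        y[i] = a * x[i] + y[i];
}

/* Descriptive (OpenACC): the programmer states only that the loop is safe
   to parallelize and lets the compiler choose the schedule for the target. */
void saxpy_descriptive(size_t n, float a, const float *x, float *y)
{
    #pragma acc parallel loop
    for (size_t i = 0; i < n; i++)
        y[i] = a * x[i] + y[i];
}

With the descriptive form, the same directive can map to threads on a CPU or to gangs and vectors on a GPU, which is the property the rest of the deck measures.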
7
Prescribing vs. Describing
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma acc parallel loop reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma acc loop reduction(max:error)
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma acc parallel loop
for( int j = 1; j < n-1; j++) {
#pragma acc loop
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
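For reference, a sketch of the underlying serial Jacobi iteration that every directive variant in this deck annotates (the "Original CPU" / "Serial" baseline in the later charts); the function wrapper and parameter list are assumed for completeness and are not taken from the deck:

#include <math.h>
#include <stdio.h>

void jacobi_serial(int n, int m, double tol, int iter_max,
                   double A[n][m], double Anew[n][m])
{
    int iter = 0;
    double error = tol + 1.0;
    while ( error > tol && iter < iter_max )
    {
        error = 0.0;
        /* Compute the new value of each interior point and track the change. */
        for( int j = 1; j < n-1; j++)
            for( int i = 1; i < m-1; i++ )
            {
                Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                                    + A[j-1][i] + A[j+1][i]);
                error = fmax( error, fabs(Anew[j][i] - A[j][i]));
            }
        /* Copy the updated grid back for the next sweep. */
        for( int j = 1; j < n-1; j++)
            for( int i = 1; i < m-1; i++ )
                A[j][i] = Anew[j][i];
        if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
    }
}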
8
Prescribing vs. Describing
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target
{
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
9
Prescribing vs. Describing
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for reduction(max:error)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
10
Prescribing vs. Describing
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for reduction(max:error) schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
11
Prescribing vs. Describing
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for reduction(max:error) collapse(2)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for collapse(2)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
13
Prescribing vs. Describing
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for reduction(max:error) collapse(2) schedule(static,1)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for collapse(2) schedule(static,1)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
14
Execution Time (Smaller is Better)
Bar chart of execution time in seconds (y-axis 0–140); each bar is labeled with its speedup relative to the Original CPU version:
Original CPU: 1.00X
CPU Threaded: 5.12X
GPU Threaded: 0.12X (893 seconds, off the chart's scale)
GPU Teams: 1.01X
GPU Split: 2.47X
GPU Collapse: 0.96X
GPU Split Sched: 4.00X
GPU Collapse Sched: 17.42X
OpenACC (CPU): 4.96X
OpenACC (GPU): 23.03X
Chart annotation: "Same directives (unoptimized)"
NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @ 3.00GHz – CLANG pre-3.8, PGI 16.3
15
Importance of Loop Scheduling
Bar chart of execution time in seconds (y-axis 0–120); each bar is labeled with its speedup relative to the serial version:
Serial: 1.00X
OpenMP (CPU): 5.11X
OpenMP (CPU) interleaved: 1.07X
OpenMP (GPU): 0.96X
OpenMP (GPU) interleaved: 17.42X
NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @ 3.00GHz – CLANG pre-3.8, PGI 16.3
16
Importance of Loop Scheduling
Bar chart of execution time in seconds (y-axis 0–200); each bar is labeled with its speedup relative to the serial version (chart legend: ICC 15.0, CLANG):
Serial: 1.00X
OpenMP (CPU): 16.55X
OpenMP (CPU) interleaved: 2.36X
OpenMP (GPU): 1.64X
OpenMP (GPU) interleaved: 29.84X
NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @ 3.00GHz – Intel C Compiler 15.0, CLANG pre-3.8, PGI 16.3
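The "interleaved" bars above correspond to adding schedule(static,1). As a standalone toy sketch (not from the deck), the following shows how a blocked static schedule and an interleaved schedule assign iterations to threads, which is why the same clause helps memory coalescing on the GPU but splits cache lines across threads on the CPU:

#include <omp.h>
#include <stdio.h>

int main(void)
{
    enum { N = 16 };
    int owner_block[N], owner_cyclic[N];

    /* Blocked static schedule: each thread gets a contiguous chunk. */
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < N; i++)
        owner_block[i] = omp_get_thread_num();

    /* Interleaved schedule: iterations are dealt out round-robin, so
       consecutive iterations (and the memory they touch) go to
       consecutive threads. */
    #pragma omp parallel for schedule(static,1)
    for (int i = 0; i < N; i++)
        owner_cyclic[i] = omp_get_thread_num();

    for (int i = 0; i < N; i++)
        printf("i=%2d  blocked->thread %d  interleaved->thread %d\n",
               i, owner_block[i], owner_cyclic[i]);
    return 0;
}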
17
Why Can’t OpenMP Do This? (1)
#pragma omp target teams distribute parallel for reduction(max:error) collapse(2) schedule(static,1)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for collapse(2) schedule(static,1)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
The compiler can freely specify the default schedule, but not add the collapse
Wrong loop for coalescing on the GPU
Default schedule is implementation defined
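As a rough illustration of the coalescing point (a toy sketch with assumed values, not from the deck): parallelizing only the j loop leaves neighboring threads on neighboring rows, a full row apart in memory, while collapse(2) with schedule(static,1) gives neighboring threads adjacent elements of the same row.

#include <stdio.h>

int main(void)
{
    const int m = 4096;   /* assumed row length */
    const int T = 8;      /* first few threads of a team */

    for (int t = 0; t < T; t++) {
        /* j loop parallelized only (assuming thread t takes row 1+t):
           simultaneous accesses are roughly m elements apart. */
        long row_apart = (long)(1 + t) * m + 1;
        /* collapse(2) + schedule(static,1): thread t starts at flattened
           iteration t, i.e. j=1, i=1+t, adjacent to thread t-1's access. */
        long adjacent  = (long)1 * m + (1 + t);
        printf("thread %d: j-only offset %ld, collapsed offset %ld\n",
               t, row_apart, adjacent);
    }
    return 0;
}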
18
Why Can’t OpenMP Do This? (2)
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for reduction(max:error) schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
Right loop for coalescing on the GPU
Makes no sense to distribute to teams on the CPU
Default schedule is implementation defined
19
Why Can’t OpenMP Do This? (3)
#pragma omp target teams distribute parallel for simd reduction(max:error) collapse(2)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for simd collapse(2)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
The compiler can always choose 1 team, making this behave as a parallel for
Maybe the compiler can treat SIMD as a coalescing hint?
Is this the new best practice?
Default schedule is implementation defined
20
Performance Portability Results
Bar chart: speedup vs. a single CPU core for miniGhost (Mantevo), NEMO (Climate & Ocean), and CLOVERLEAF (Physics), showing CPU: MPI + OpenMP * results (PGI 15.10 compiler; y-axis 0–12; hardware details and the * footnote appear with the full chart two slides below).
21
Performance Portability Results
Bar chart: the same applications, now comparing CPU: MPI + OpenMP * and CPU: MPI + OpenACC (PGI 15.10 compiler).
22
Performance Portability Results
Bar chart: speedup vs. a single CPU core for miniGhost (Mantevo), NEMO (Climate & Ocean), and CLOVERLEAF (Physics), comparing CPU: MPI + OpenMP *, CPU: MPI + OpenACC, and CPU + GPU: MPI + OpenACC (PGI 15.10 compiler). One CPU + GPU bar runs past the chart's 12X axis and is annotated 30X.
miniGhost: CPU: Intel Xeon E5-2698 v3, 2 sockets, 32 cores total; GPU: Tesla K80 (single GPU)
NEMO: CPU: Intel Xeon E5-2698 v3, 16 cores per socket; GPU: NVIDIA K80 (both GPUs)
CLOVERLEAF: CPU: dual-socket Intel Xeon E5-2690 v2, 20 cores total; GPU: Tesla K80 (both GPUs)
* The NEMO run used all-MPI
23
Challenge to the Community
OpenMP should additionally adopt a descriptive programming style
It is possible to be performance portable by being descriptive first and prescriptive as necessary
The community must adopt new best practices (parallel for isn't enough any more)
