Performance Portability Through Descriptive Parallelism
Jeff Larkin, April 20, 2016
2
My definition of performance portability
The same source code will run
productively on a variety of
different architectures.
3
Two Types of Portability
FUNCTIONAL PORTABILITY
The ability for a single code
to run anywhere.
PERFORMANCE PORTABILITY
The ability for a single code
to run well anywhere.
4
Performance Portability is Not
Best == Best
Optimal Everywhere
Optimal Anywhere?
5
I Assert
Descriptive Parallelism enables
performance portable code.
6
Descriptive or Prescriptive Parallelism

Prescriptive
- Programmer explicitly parallelizes the code; the compiler obeys
- Requires little/no analysis by the compiler
- Substantially different architectures require different directives
- Fairly consistent behavior between implementations

Descriptive
- Compiler parallelizes the code with guidance from the programmer
- Compiler must make decisions from available information
- Compiler uses information from the programmer and heuristics about the architecture to make decisions
- Quality of implementation greatly affects results
7
Prescribing vs. Describing

OpenMP (prescriptive):
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
OpenACC (descriptive):
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma acc parallel loop reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma acc loop reduction(max:error)
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma acc parallel loop
for( int j = 1; j < n-1; j++) {
#pragma acc loop
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
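Neither listing above shows data management; in practice the OpenACC version is wrapped in a data region so A and Anew stay resident on the device between sweeps (the OpenMP equivalent, target data, appears on later slides). Below is a self-contained sketch of that pattern; the problem size, boundary initialization, and the copy/create clause choices are illustrative assumptions, not taken from the slides.

/* jacobi_acc.c: OpenACC Jacobi sweep with an explicit data region.
   Compile with an OpenACC compiler, e.g. pgcc -acc (flags are an assumption). */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

int main(void)
{
    const int n = 1024, m = 1024, iter_max = 1000;  /* illustrative sizes */
    const double tol = 1.0e-6;
    double error = 1.0;
    int iter = 0;

    double (*A)[m]    = malloc(sizeof(double[n][m]));
    double (*Anew)[m] = malloc(sizeof(double[n][m]));
    for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++)
            A[j][i] = (j == 0) ? 1.0 : 0.0;   /* hot top boundary */

    /* Copy A to the device once, allocate Anew there, copy A back at the end. */
    #pragma acc data copy(A[0:n][0:m]) create(Anew[0:n][0:m])
    while ( error > tol && iter < iter_max )
    {
        error = 0.0;
        #pragma acc parallel loop reduction(max:error)
        for( int j = 1; j < n-1; j++) {
            #pragma acc loop reduction(max:error)
            for( int i = 1; i < m-1; i++ ) {
                Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                                    + A[j-1][i] + A[j+1][i]);
                error = fmax( error, fabs(Anew[j][i] - A[j][i]));
            }
        }
        #pragma acc parallel loop
        for( int j = 1; j < n-1; j++) {
            #pragma acc loop
            for( int i = 1; i < m-1; i++ ) {
                A[j][i] = Anew[j][i];
            }
        }
        if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
    }

    free(A); free(Anew);
    return 0;
}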
8
Prescribing vs. Describing
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma acc parallel loop reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma acc loop reduction(max:error)
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma acc parallel loop
for( int j = 1; j < n-1; j++) {
#pragma acc loop
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target
{
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
9
Prescribing vs. Describing
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma acc parallel loop reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma acc loop reduction(max:error)
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma acc parallel loop
for( int j = 1; j < n-1; j++) {
#pragma acc loop
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target
{
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for \
reduction(max:error)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
10
Prescribing vs. Describing
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma acc parallel loop reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma acc loop reduction(max:error)
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma acc parallel loop
for( int j = 1; j < n-1; j++) {
#pragma acc loop
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target
{
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for \
reduction(max:error)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for \
reduction(max:error)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for reduction(max:error) schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
11
Prescribing vs. Describing
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma acc parallel loop reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma acc loop reduction(max:error)
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma acc parallel loop
for( int j = 1; j < n-1; j++) {
#pragma acc loop
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target
{
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for \
reduction(max:error)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for \
reduction(max:error)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for reduction(max:error) schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for \
reduction(max:error) collapse(2)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for \
collapse(2)
for( int j = 1; j < n-1; j++)
{
12
Prescribing vs. Describing
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma acc parallel loop reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma acc loop reduction(max:error)
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma acc parallel loop
for( int j = 1; j < n-1; j++) {
#pragma acc loop
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target
{
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for \
reduction(max:error)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for \
reduction(max:error)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for reduction(max:error) schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for reduction(max:error) schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for schedule(static,1)
13
Prescribing vs. Describing
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma acc parallel loop reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma acc loop reduction(max:error)
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma acc parallel loop
for( int j = 1; j < n-1; j++) {
#pragma acc loop
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target
{
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for \
reduction(max:error)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for \
reduction(max:error)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for reduction(max:error) schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for reduction(max:error) schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for schedule(static,1)
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for \
reduction(max:error) collapse(2) schedule(static,1)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for \
collapse(2) schedule(static,1)
14
Execution Time (Smaller is Better)
[Bar chart: execution time in seconds for each code variant; labels give speedup relative to the original CPU version. One bar is annotated 893 (seconds), exceeding the chart's range.]

Original CPU: 1.00X
Threaded: 5.12X
GPU Threaded: 0.12X
GPU Teams: 1.01X
GPU Split: 2.47X
GPU Collapse: 0.96X
GPU Split Sched: 4.00X
GPU Collapse Sched: 17.42X
OpenACC (CPU): 4.96X
OpenACC (GPU): 23.03X

Chart annotation: the OpenACC CPU and GPU runs use the same directives (unoptimized).
NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @3.00GHz; CLANG pre-3.8, PGI 16.3
15
Importance of Loop Scheduling
[Bar chart: execution time in seconds; labels give speedup relative to the serial version]

Serial: 1.00X
OpenMP (CPU): 5.11X
OpenMP (CPU) interleaved: 1.07X
OpenMP (GPU): 0.96X
OpenMP (GPU) interleaved: 17.42X

NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @3.00GHz; CLANG pre-3.8, PGI 16.3
16
Importance of Loop Scheduling
[Bar chart: execution time in seconds; labels give speedup relative to the serial version. Compiler labels on the chart: ICC 15.0, CLANG.]

Serial: 1.00X
OpenMP (CPU): 16.55X
OpenMP (CPU) interleaved: 2.36X
OpenMP (GPU): 1.64X
OpenMP (GPU) interleaved: 29.84X

NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @3.00GHz; Intel C Compiler 15.0, CLANG pre-3.8, PGI 16.3
17
Why Can’t OpenMP Do This? (1)
#pragma omp target teams distribute parallel for \
reduction(max:error) collapse(2) schedule(static,1)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for \
collapse(2) schedule(static,1)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
The compiler can freely specify
the default schedule, but not
add the collapse
Wrong loop for coalescing on the
GPU
Default schedule is implementation defined
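To make the coalescing point concrete, the host-only sketch below (not from the slides) records which OpenMP thread executes each iteration under schedule(static,1) versus the default schedule. On a GPU, adjacent iterations landing on adjacent threads is exactly what produces coalesced memory accesses; a blocked schedule spreads a warp's loads far apart.

#include <stdio.h>
#include <omp.h>

int main(void)
{
    int owner_static1[16], owner_default[16];

    /* Interleaved: iteration i goes to thread i % num_threads. */
    #pragma omp parallel for schedule(static,1)
    for (int i = 0; i < 16; i++)
        owner_static1[i] = omp_get_thread_num();

    /* Default schedule (implementation defined, typically blocked static):
       each thread owns a contiguous chunk of iterations. */
    #pragma omp parallel for
    for (int i = 0; i < 16; i++)
        owner_default[i] = omp_get_thread_num();

    for (int i = 0; i < 16; i++)
        printf("i=%2d  schedule(static,1): thread %d   default: thread %d\n",
               i, owner_static1[i], owner_default[i]);
    return 0;
}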
18
Why Can’t OpenMP Do This? (2)
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for reduction(max:error) schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
Right loop for coalescing on the
GPU
Makes no sense to distribute to
teams on the CPU
Default schedule is implementation defined
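A common workaround at the time was to choose the directive set per build target with the preprocessor, which is exactly the kind of divergence this slide is complaining about. A minimal sketch follows; BUILD_FOR_GPU is a made-up macro, and a plain array copy stands in for the Jacobi sweep.

#include <stddef.h>

/* Select a directive set per target at compile time (hypothetical macro). */
#ifdef BUILD_FOR_GPU
# define COPY_LOOP _Pragma("omp target teams distribute parallel for schedule(static,1) map(to: src[0:count]) map(from: dst[0:count])")
#else
# define COPY_LOOP _Pragma("omp parallel for simd")
#endif

void copy_array(double *dst, const double *src, size_t count)
{
    COPY_LOOP
    for (size_t k = 0; k < count; k++)
        dst[k] = src[k];
}

With OpenACC, the single parallel loop directive in the earlier listings plays both roles, which is the point of the talk.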
19
Why Can’t OpenMP Do This? (3)
#pragma omp target teams distribute parallel for simd \
reduction(max:error) collapse(2)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for simd collapse(2)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
The compiler can always choose 1
team, making this behave as a
parallel for
Maybe the compiler can treat
SIMD as a coalescing hint?
Is this the new best practice?
Default schedule is implementation defined
20
Performance Portability Results
[Bar chart: speedup vs. a single CPU core, PGI 15.10 compiler. Benchmarks: miniGhost (Mantevo), NEMO (Climate & Ocean), CLOVERLEAF (Physics). Series: CPU: MPI + OpenMP*]

miniGhost: CPU: Intel Xeon E5-2698 v3, 2 sockets, 32 cores total; GPU: Tesla K80 (single GPU)
NEMO: CPU: Intel Xeon E5-2698 v3, 16 cores per socket; GPU: NVIDIA K80 (both GPUs)
CLOVERLEAF: CPU: dual-socket Intel Xeon E5-2690 v2, 20 cores total; GPU: Tesla K80 (both GPUs)
* NEMO run used all-MPI
21
Performance Portability Results
[Bar chart: speedup vs. a single CPU core, PGI 15.10 compiler. Benchmarks: miniGhost (Mantevo), NEMO (Climate & Ocean), CLOVERLEAF (Physics). Series: CPU: MPI + OpenMP*; CPU: MPI + OpenACC]

miniGhost: CPU: Intel Xeon E5-2698 v3, 2 sockets, 32 cores total; GPU: Tesla K80 (single GPU)
NEMO: CPU: Intel Xeon E5-2698 v3, 16 cores per socket; GPU: NVIDIA K80 (both GPUs)
CLOVERLEAF: CPU: dual-socket Intel Xeon E5-2690 v2, 20 cores total; GPU: Tesla K80 (both GPUs)
* NEMO run used all-MPI
22
Performance Portability Results
[Bar chart: speedup vs. a single CPU core, PGI 15.10 compiler. Benchmarks: miniGhost (Mantevo), NEMO (Climate & Ocean), CLOVERLEAF (Physics). Series: CPU: MPI + OpenMP*; CPU: MPI + OpenACC; CPU + GPU: MPI + OpenACC. A 30X label annotates one of the CPU + GPU results.]

miniGhost: CPU: Intel Xeon E5-2698 v3, 2 sockets, 32 cores total; GPU: Tesla K80 (single GPU)
NEMO: CPU: Intel Xeon E5-2698 v3, 16 cores per socket; GPU: NVIDIA K80 (both GPUs)
CLOVERLEAF: CPU: dual-socket Intel Xeon E5-2690 v2, 20 cores total; GPU: Tesla K80 (both GPUs)
* NEMO run used all-MPI
23
Challenge to the Community
OpenMP should additionally adopt a descriptive programming style
It is possible to be performance portable by being descriptive first and
prescriptive as necessary
The community must adopt new best practices (parallel for isn’t
enough any more)
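Postscript: OpenMP later moved in this direction with the descriptive loop construct added in OpenMP 5.0 (2018), two years after this talk. Below is a sketch of the Jacobi sweep in that style, as a drop-in replacement for the two parallel regions inside the iteration loop shown earlier (not from the slides; clause spellings follow OpenMP 5.0).

#pragma omp target teams loop reduction(max:error) collapse(2)
for( int j = 1; j < n-1; j++) {
    for( int i = 1; i < m-1; i++ ) {
        Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                            + A[j-1][i] + A[j+1][i]);
        error = fmax( error, fabs(Anew[j][i] - A[j][i]));
    }
}

#pragma omp target teams loop collapse(2)
for( int j = 1; j < n-1; j++) {
    for( int i = 1; i < m-1; i++ ) {
        A[j][i] = Anew[j][i];
    }
}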
More Related Content

What's hot

Post-processing SAR images on Xeon Phi - a porting exercise
Post-processing SAR images on Xeon Phi - a porting exercisePost-processing SAR images on Xeon Phi - a porting exercise
Post-processing SAR images on Xeon Phi - a porting exerciseIntel IT Center
 
Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019
Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019 Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019
Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019 Unity Technologies
 
An introduction to ROP
An introduction to ROPAn introduction to ROP
An introduction to ROPSaumil Shah
 
Machine-Learning-based Performance Heuristics for Runtime CPU/GPU Selection
Machine-Learning-based Performance Heuristics for Runtime CPU/GPU SelectionMachine-Learning-based Performance Heuristics for Runtime CPU/GPU Selection
Machine-Learning-based Performance Heuristics for Runtime CPU/GPU SelectionAkihiro Hayashi
 
SPU Optimizations - Part 2
SPU Optimizations - Part 2SPU Optimizations - Part 2
SPU Optimizations - Part 2Naughty Dog
 
Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...
Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...
Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...Akihiro Hayashi
 
Introduction to Chainer
Introduction to ChainerIntroduction to Chainer
Introduction to ChainerSeiya Tokui
 
Lab 3 nust control
Lab 3 nust controlLab 3 nust control
Lab 3 nust controltalhawaqar
 
Algorithm Selection for Preferred Extensions Enumeration
Algorithm Selection for Preferred Extensions EnumerationAlgorithm Selection for Preferred Extensions Enumeration
Algorithm Selection for Preferred Extensions EnumerationFederico Cerutti
 
Optimize your game with the Profile Analyzer - Unite Copenhagen 2019
Optimize your game with the Profile Analyzer - Unite Copenhagen 2019Optimize your game with the Profile Analyzer - Unite Copenhagen 2019
Optimize your game with the Profile Analyzer - Unite Copenhagen 2019Unity Technologies
 
Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...
Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...
Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...Federico Cerutti
 
TMPA-2017: The Quest for Average Response Time
TMPA-2017: The Quest for Average Response TimeTMPA-2017: The Quest for Average Response Time
TMPA-2017: The Quest for Average Response TimeIosif Itkin
 
Yampa AFRP Introduction
Yampa AFRP IntroductionYampa AFRP Introduction
Yampa AFRP IntroductionChengHui Weng
 
A Simple Communication System Design Lab #4 with MATLAB Simulink
A Simple Communication System Design Lab #4 with MATLAB SimulinkA Simple Communication System Design Lab #4 with MATLAB Simulink
A Simple Communication System Design Lab #4 with MATLAB SimulinkJaewook. Kang
 
A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...
A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...
A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...Federico Cerutti
 
Practical Spherical Harmonics Based PRT Methods
Practical Spherical Harmonics Based PRT MethodsPractical Spherical Harmonics Based PRT Methods
Practical Spherical Harmonics Based PRT MethodsNaughty Dog
 

What's hot (18)

Post-processing SAR images on Xeon Phi - a porting exercise
Post-processing SAR images on Xeon Phi - a porting exercisePost-processing SAR images on Xeon Phi - a porting exercise
Post-processing SAR images on Xeon Phi - a porting exercise
 
Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019
Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019 Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019
Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019
 
An introduction to ROP
An introduction to ROPAn introduction to ROP
An introduction to ROP
 
Machine-Learning-based Performance Heuristics for Runtime CPU/GPU Selection
Machine-Learning-based Performance Heuristics for Runtime CPU/GPU SelectionMachine-Learning-based Performance Heuristics for Runtime CPU/GPU Selection
Machine-Learning-based Performance Heuristics for Runtime CPU/GPU Selection
 
SPU Optimizations - Part 2
SPU Optimizations - Part 2SPU Optimizations - Part 2
SPU Optimizations - Part 2
 
Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...
Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...
Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...
 
Introduction to Chainer
Introduction to ChainerIntroduction to Chainer
Introduction to Chainer
 
Lab 3 nust control
Lab 3 nust controlLab 3 nust control
Lab 3 nust control
 
Algorithm Selection for Preferred Extensions Enumeration
Algorithm Selection for Preferred Extensions EnumerationAlgorithm Selection for Preferred Extensions Enumeration
Algorithm Selection for Preferred Extensions Enumeration
 
Optimize your game with the Profile Analyzer - Unite Copenhagen 2019
Optimize your game with the Profile Analyzer - Unite Copenhagen 2019Optimize your game with the Profile Analyzer - Unite Copenhagen 2019
Optimize your game with the Profile Analyzer - Unite Copenhagen 2019
 
Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...
Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...
Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...
 
Cerutti -- TAFA2013
Cerutti -- TAFA2013Cerutti -- TAFA2013
Cerutti -- TAFA2013
 
TMPA-2017: The Quest for Average Response Time
TMPA-2017: The Quest for Average Response TimeTMPA-2017: The Quest for Average Response Time
TMPA-2017: The Quest for Average Response Time
 
Yampa AFRP Introduction
Yampa AFRP IntroductionYampa AFRP Introduction
Yampa AFRP Introduction
 
A Simple Communication System Design Lab #4 with MATLAB Simulink
A Simple Communication System Design Lab #4 with MATLAB SimulinkA Simple Communication System Design Lab #4 with MATLAB Simulink
A Simple Communication System Design Lab #4 with MATLAB Simulink
 
A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...
A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...
A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...
 
6. Implementation
6. Implementation6. Implementation
6. Implementation
 
Practical Spherical Harmonics Based PRT Methods
Practical Spherical Harmonics Based PRT MethodsPractical Spherical Harmonics Based PRT Methods
Practical Spherical Harmonics Based PRT Methods
 

Viewers also liked

Ebusiness Marketing Software
Ebusiness Marketing SoftwareEbusiness Marketing Software
Ebusiness Marketing SoftwareNextBee Media
 
Policy Development Process Infographic Portuguese
Policy Development Process Infographic PortuguesePolicy Development Process Infographic Portuguese
Policy Development Process Infographic PortugueseICANN
 
Realidad aumentada gc94
Realidad aumentada gc94Realidad aumentada gc94
Realidad aumentada gc94Gabriel Romero
 
Pazarlama Konferansları Mayıs Haziran-Temmuz 2016
Pazarlama Konferansları Mayıs Haziran-Temmuz 2016Pazarlama Konferansları Mayıs Haziran-Temmuz 2016
Pazarlama Konferansları Mayıs Haziran-Temmuz 2016Roy+Teddy
 
OpenMP Tutorial for Beginners
OpenMP Tutorial for BeginnersOpenMP Tutorial for Beginners
OpenMP Tutorial for BeginnersDhanashree Prasad
 
Motricidad gruesa
Motricidad gruesaMotricidad gruesa
Motricidad gruesalissethorbe
 
Actividad1 unidadiii mtsm
Actividad1 unidadiii mtsmActividad1 unidadiii mtsm
Actividad1 unidadiii mtsmTere Segura
 

Viewers also liked (14)

Ebusiness Marketing Software
Ebusiness Marketing SoftwareEbusiness Marketing Software
Ebusiness Marketing Software
 
Policy Development Process Infographic Portuguese
Policy Development Process Infographic PortuguesePolicy Development Process Infographic Portuguese
Policy Development Process Infographic Portuguese
 
Pump3.bas
Pump3.basPump3.bas
Pump3.bas
 
Presentation1
Presentation1Presentation1
Presentation1
 
Archivio62
Archivio62Archivio62
Archivio62
 
07.08.2006, LAW, Anti-Corruption Law
07.08.2006, LAW, Anti-Corruption Law07.08.2006, LAW, Anti-Corruption Law
07.08.2006, LAW, Anti-Corruption Law
 
Realidad aumentada gc94
Realidad aumentada gc94Realidad aumentada gc94
Realidad aumentada gc94
 
sas
sassas
sas
 
Pazarlama Konferansları Mayıs Haziran-Temmuz 2016
Pazarlama Konferansları Mayıs Haziran-Temmuz 2016Pazarlama Konferansları Mayıs Haziran-Temmuz 2016
Pazarlama Konferansları Mayıs Haziran-Temmuz 2016
 
Epoch powerpoint
Epoch powerpointEpoch powerpoint
Epoch powerpoint
 
OpenMP Tutorial for Beginners
OpenMP Tutorial for BeginnersOpenMP Tutorial for Beginners
OpenMP Tutorial for Beginners
 
Motricidad gruesa
Motricidad gruesaMotricidad gruesa
Motricidad gruesa
 
caffiene and theine
caffiene and theinecaffiene and theine
caffiene and theine
 
Actividad1 unidadiii mtsm
Actividad1 unidadiii mtsmActividad1 unidadiii mtsm
Actividad1 unidadiii mtsm
 

Similar to Performance Portability Through Descriptive Parallelism

Network lap pgms 7th semester
Network lap pgms 7th semesterNetwork lap pgms 7th semester
Network lap pgms 7th semesterDOSONKA Group
 
Bti1022 lab sheet 8
Bti1022 lab sheet 8Bti1022 lab sheet 8
Bti1022 lab sheet 8alish sha
 
Bti1022 lab sheet 8
Bti1022 lab sheet 8Bti1022 lab sheet 8
Bti1022 lab sheet 8alish sha
 
programs on arrays.pdf
programs on arrays.pdfprograms on arrays.pdf
programs on arrays.pdfsowmya koneru
 
check the modifed code now you will get all operations done.termin.pdf
check the modifed code now you will get all operations done.termin.pdfcheck the modifed code now you will get all operations done.termin.pdf
check the modifed code now you will get all operations done.termin.pdfangelfragranc
 
Scaladays 2011 - The Ease of Scalaz
Scaladays 2011 - The Ease of ScalazScaladays 2011 - The Ease of Scalaz
Scaladays 2011 - The Ease of ScalazHeiko Seeberger
 
MH prediction modeling and validation in r (2) classification 190709
MH prediction modeling and validation in r (2) classification 190709MH prediction modeling and validation in r (2) classification 190709
MH prediction modeling and validation in r (2) classification 190709Min-hyung Kim
 
From Javascript To Haskell
From Javascript To HaskellFrom Javascript To Haskell
From Javascript To Haskellujihisa
 
En nybegynners introduksjon til scalaz
En nybegynners introduksjon til scalazEn nybegynners introduksjon til scalaz
En nybegynners introduksjon til scalazTrond Marius Øvstetun
 
convert the following C code to Mips assembly with steps and comment.pdf
convert the following C code to Mips assembly with steps and comment.pdfconvert the following C code to Mips assembly with steps and comment.pdf
convert the following C code to Mips assembly with steps and comment.pdfmurtuzadahadwala3
 
Csci101 lect04 advanced_selection
Csci101 lect04 advanced_selectionCsci101 lect04 advanced_selection
Csci101 lect04 advanced_selectionElsayed Hemayed
 
Adding Statistical Functionality to the DATA Step with PROC FCMP
Adding Statistical Functionality to the DATA Step with PROC FCMPAdding Statistical Functionality to the DATA Step with PROC FCMP
Adding Statistical Functionality to the DATA Step with PROC FCMPJacques Rioux
 

Similar to Performance Portability Through Descriptive Parallelism (20)

Sorting Technique
Sorting TechniqueSorting Technique
Sorting Technique
 
Network lap pgms 7th semester
Network lap pgms 7th semesterNetwork lap pgms 7th semester
Network lap pgms 7th semester
 
Bti1022 lab sheet 8
Bti1022 lab sheet 8Bti1022 lab sheet 8
Bti1022 lab sheet 8
 
Bti1022 lab sheet 8
Bti1022 lab sheet 8Bti1022 lab sheet 8
Bti1022 lab sheet 8
 
Python 1 liners
Python 1 linersPython 1 liners
Python 1 liners
 
programs on arrays.pdf
programs on arrays.pdfprograms on arrays.pdf
programs on arrays.pdf
 
Martha
MarthaMartha
Martha
 
C programs
C programsC programs
C programs
 
PRACTICAL COMPUTING
PRACTICAL COMPUTINGPRACTICAL COMPUTING
PRACTICAL COMPUTING
 
VTU Data Structures Lab Manual
VTU Data Structures Lab ManualVTU Data Structures Lab Manual
VTU Data Structures Lab Manual
 
check the modifed code now you will get all operations done.termin.pdf
check the modifed code now you will get all operations done.termin.pdfcheck the modifed code now you will get all operations done.termin.pdf
check the modifed code now you will get all operations done.termin.pdf
 
Scaladays 2011 - The Ease of Scalaz
Scaladays 2011 - The Ease of ScalazScaladays 2011 - The Ease of Scalaz
Scaladays 2011 - The Ease of Scalaz
 
R Code for EM Algorithm
R Code for EM AlgorithmR Code for EM Algorithm
R Code for EM Algorithm
 
MH prediction modeling and validation in r (2) classification 190709
MH prediction modeling and validation in r (2) classification 190709MH prediction modeling and validation in r (2) classification 190709
MH prediction modeling and validation in r (2) classification 190709
 
From Javascript To Haskell
From Javascript To HaskellFrom Javascript To Haskell
From Javascript To Haskell
 
En nybegynners introduksjon til scalaz
En nybegynners introduksjon til scalazEn nybegynners introduksjon til scalaz
En nybegynners introduksjon til scalaz
 
convert the following C code to Mips assembly with steps and comment.pdf
convert the following C code to Mips assembly with steps and comment.pdfconvert the following C code to Mips assembly with steps and comment.pdf
convert the following C code to Mips assembly with steps and comment.pdf
 
Csci101 lect04 advanced_selection
Csci101 lect04 advanced_selectionCsci101 lect04 advanced_selection
Csci101 lect04 advanced_selection
 
Adding Statistical Functionality to the DATA Step with PROC FCMP
Adding Statistical Functionality to the DATA Step with PROC FCMPAdding Statistical Functionality to the DATA Step with PROC FCMP
Adding Statistical Functionality to the DATA Step with PROC FCMP
 
Cpds lab
Cpds labCpds lab
Cpds lab
 

More from Jeff Larkin

FortranCon2020: Highly Parallel Fortran and OpenACC Directives
FortranCon2020: Highly Parallel Fortran and OpenACC DirectivesFortranCon2020: Highly Parallel Fortran and OpenACC Directives
FortranCon2020: Highly Parallel Fortran and OpenACC DirectivesJeff Larkin
 
SC13: OpenMP and NVIDIA
SC13: OpenMP and NVIDIASC13: OpenMP and NVIDIA
SC13: OpenMP and NVIDIAJeff Larkin
 
Refactoring Applications for the XK7 and Future Hybrid Architectures
Refactoring Applications for the XK7 and Future Hybrid ArchitecturesRefactoring Applications for the XK7 and Future Hybrid Architectures
Refactoring Applications for the XK7 and Future Hybrid ArchitecturesJeff Larkin
 
Optimizing GPU to GPU Communication on Cray XK7
Optimizing GPU to GPU Communication on Cray XK7Optimizing GPU to GPU Communication on Cray XK7
Optimizing GPU to GPU Communication on Cray XK7Jeff Larkin
 
Progress Toward Accelerating CAM-SE
Progress Toward Accelerating CAM-SEProgress Toward Accelerating CAM-SE
Progress Toward Accelerating CAM-SEJeff Larkin
 
HPCMPUG2011 cray tutorial
HPCMPUG2011 cray tutorialHPCMPUG2011 cray tutorial
HPCMPUG2011 cray tutorialJeff Larkin
 
CUG2011 Introduction to GPU Computing
CUG2011 Introduction to GPU ComputingCUG2011 Introduction to GPU Computing
CUG2011 Introduction to GPU ComputingJeff Larkin
 
Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...
Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...
Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...Jeff Larkin
 
May2010 hex-core-opt
May2010 hex-core-optMay2010 hex-core-opt
May2010 hex-core-optJeff Larkin
 
A Comparison of Accelerator Programming Models
A Comparison of Accelerator Programming ModelsA Comparison of Accelerator Programming Models
A Comparison of Accelerator Programming ModelsJeff Larkin
 
Cray XT Porting, Scaling, and Optimization Best Practices
Cray XT Porting, Scaling, and Optimization Best PracticesCray XT Porting, Scaling, and Optimization Best Practices
Cray XT Porting, Scaling, and Optimization Best PracticesJeff Larkin
 
XT Best Practices
XT Best PracticesXT Best Practices
XT Best PracticesJeff Larkin
 
Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)
Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)
Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)Jeff Larkin
 

More from Jeff Larkin (13)

FortranCon2020: Highly Parallel Fortran and OpenACC Directives
FortranCon2020: Highly Parallel Fortran and OpenACC DirectivesFortranCon2020: Highly Parallel Fortran and OpenACC Directives
FortranCon2020: Highly Parallel Fortran and OpenACC Directives
 
SC13: OpenMP and NVIDIA
SC13: OpenMP and NVIDIASC13: OpenMP and NVIDIA
SC13: OpenMP and NVIDIA
 
Refactoring Applications for the XK7 and Future Hybrid Architectures
Refactoring Applications for the XK7 and Future Hybrid ArchitecturesRefactoring Applications for the XK7 and Future Hybrid Architectures
Refactoring Applications for the XK7 and Future Hybrid Architectures
 
Optimizing GPU to GPU Communication on Cray XK7
Optimizing GPU to GPU Communication on Cray XK7Optimizing GPU to GPU Communication on Cray XK7
Optimizing GPU to GPU Communication on Cray XK7
 
Progress Toward Accelerating CAM-SE
Progress Toward Accelerating CAM-SEProgress Toward Accelerating CAM-SE
Progress Toward Accelerating CAM-SE
 
HPCMPUG2011 cray tutorial
HPCMPUG2011 cray tutorialHPCMPUG2011 cray tutorial
HPCMPUG2011 cray tutorial
 
CUG2011 Introduction to GPU Computing
CUG2011 Introduction to GPU ComputingCUG2011 Introduction to GPU Computing
CUG2011 Introduction to GPU Computing
 
Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...
Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...
Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...
 
May2010 hex-core-opt
May2010 hex-core-optMay2010 hex-core-opt
May2010 hex-core-opt
 
A Comparison of Accelerator Programming Models
A Comparison of Accelerator Programming ModelsA Comparison of Accelerator Programming Models
A Comparison of Accelerator Programming Models
 
Cray XT Porting, Scaling, and Optimization Best Practices
Cray XT Porting, Scaling, and Optimization Best PracticesCray XT Porting, Scaling, and Optimization Best Practices
Cray XT Porting, Scaling, and Optimization Best Practices
 
XT Best Practices
XT Best PracticesXT Best Practices
XT Best Practices
 
Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)
Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)
Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)
 

Recently uploaded

Pigging Solutions Piggable Sweeping Elbows
Pigging Solutions Piggable Sweeping ElbowsPigging Solutions Piggable Sweeping Elbows
Pigging Solutions Piggable Sweeping ElbowsPigging Solutions
 
Scaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationScaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationRadu Cotescu
 
Key Features Of Token Development (1).pptx
Key  Features Of Token  Development (1).pptxKey  Features Of Token  Development (1).pptx
Key Features Of Token Development (1).pptxLBM Solutions
 
Swan(sea) Song – personal research during my six years at Swansea ... and bey...
Swan(sea) Song – personal research during my six years at Swansea ... and bey...Swan(sea) Song – personal research during my six years at Swansea ... and bey...
Swan(sea) Song – personal research during my six years at Swansea ... and bey...Alan Dix
 
Salesforce Community Group Quito, Salesforce 101
Salesforce Community Group Quito, Salesforce 101Salesforce Community Group Quito, Salesforce 101
Salesforce Community Group Quito, Salesforce 101Paola De la Torre
 
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...Patryk Bandurski
 
How to convert PDF to text with Nanonets
How to convert PDF to text with NanonetsHow to convert PDF to text with Nanonets
How to convert PDF to text with Nanonetsnaman860154
 
[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdfhans926745
 
Azure Monitor & Application Insight to monitor Infrastructure & Application
Azure Monitor & Application Insight to monitor Infrastructure & ApplicationAzure Monitor & Application Insight to monitor Infrastructure & Application
Azure Monitor & Application Insight to monitor Infrastructure & ApplicationAndikSusilo4
 
How to Remove Document Management Hurdles with X-Docs?
How to Remove Document Management Hurdles with X-Docs?How to Remove Document Management Hurdles with X-Docs?
How to Remove Document Management Hurdles with X-Docs?XfilesPro
 
IAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsIAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsEnterprise Knowledge
 
Human Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsHuman Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsMark Billinghurst
 
GenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationGenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationMichael W. Hawkins
 
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...Neo4j
 
A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)Gabriella Davis
 
08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking Men08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking MenDelhi Call girls
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerThousandEyes
 
Enhancing Worker Digital Experience: A Hands-on Workshop for Partners
Enhancing Worker Digital Experience: A Hands-on Workshop for PartnersEnhancing Worker Digital Experience: A Hands-on Workshop for Partners
Enhancing Worker Digital Experience: A Hands-on Workshop for PartnersThousandEyes
 
SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024Scott Keck-Warren
 
Benefits Of Flutter Compared To Other Frameworks
Benefits Of Flutter Compared To Other FrameworksBenefits Of Flutter Compared To Other Frameworks
Benefits Of Flutter Compared To Other FrameworksSoftradix Technologies
 

Recently uploaded (20)

Pigging Solutions Piggable Sweeping Elbows
Pigging Solutions Piggable Sweeping ElbowsPigging Solutions Piggable Sweeping Elbows
Pigging Solutions Piggable Sweeping Elbows
 
Scaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationScaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organization
 
Key Features Of Token Development (1).pptx
Key  Features Of Token  Development (1).pptxKey  Features Of Token  Development (1).pptx
Key Features Of Token Development (1).pptx
 
Swan(sea) Song – personal research during my six years at Swansea ... and bey...
Swan(sea) Song – personal research during my six years at Swansea ... and bey...Swan(sea) Song – personal research during my six years at Swansea ... and bey...
Swan(sea) Song – personal research during my six years at Swansea ... and bey...
 
Salesforce Community Group Quito, Salesforce 101
Salesforce Community Group Quito, Salesforce 101Salesforce Community Group Quito, Salesforce 101
Salesforce Community Group Quito, Salesforce 101
 
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
 
How to convert PDF to text with Nanonets
How to convert PDF to text with NanonetsHow to convert PDF to text with Nanonets
How to convert PDF to text with Nanonets
 
[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf
 
Azure Monitor & Application Insight to monitor Infrastructure & Application
Azure Monitor & Application Insight to monitor Infrastructure & ApplicationAzure Monitor & Application Insight to monitor Infrastructure & Application
Azure Monitor & Application Insight to monitor Infrastructure & Application
 
How to Remove Document Management Hurdles with X-Docs?
How to Remove Document Management Hurdles with X-Docs?How to Remove Document Management Hurdles with X-Docs?
How to Remove Document Management Hurdles with X-Docs?
 
IAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsIAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI Solutions
 
Human Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsHuman Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR Systems
 
GenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationGenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day Presentation
 
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...
Neo4j - How KGs are shaping the future of Generative AI at AWS Summit London ...
 
A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)
 
08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking Men08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking Men
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected Worker
 
Enhancing Worker Digital Experience: A Hands-on Workshop for Partners
Enhancing Worker Digital Experience: A Hands-on Workshop for PartnersEnhancing Worker Digital Experience: A Hands-on Workshop for Partners
Enhancing Worker Digital Experience: A Hands-on Workshop for Partners
 
SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024
 
Benefits Of Flutter Compared To Other Frameworks
Benefits Of Flutter Compared To Other FrameworksBenefits Of Flutter Compared To Other Frameworks
Benefits Of Flutter Compared To Other Frameworks
 

Performance Portability Through Descriptive Parallelism

  • 1. Jeff Larkin, April 20, 2016 Performance Portability Through Descriptive Parallelism
  • 2. 2 My definition of performance portability The same source code will run productively on a variety of different architectures.
  • 3. 3 Two Types of Portability FUNCTIONAL PORTABILITY The ability for a single code to run anywhere. PERFORMANCE PORTABILITY The ability for a single code to run well anywhere.
  • 4. 4 Performance Portability is Not Best == Best Optimal Everywhere Optimal Anywhere?
  • 5. 5 I Assert Descriptive Parallelism enables performance portable code.
  • 6. 6 Descriptive or Prescriptive Parallelism Programmer explicitly parallelizes the code, compiler obeys Requires little/no analysis by the compiler Substantially different architectures require different directives Fairly consistent behavior between implementations Prescriptive Compiler parallelizes the code with guidance from the programmer Compiler must make decisions from available information Compiler uses information from the programmer and heuristics about the architecture to make decisions Quality of implementation greatly affects results. Descriptive
  • 7. 7 Prescribing vs. Describing while ( error > tol && iter < iter_max ) { error = 0.0; #pragma omp parallel for reduction(max:error) for( int j = 1; j < n-1; j++) { #pragma omp simd for( int i = 1; i < m-1; i++ ) { Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i]); error = fmax( error, fabs(Anew[j][i] - A[j][i])); } } #pragma omp parallel for for( int j = 1; j < n-1; j++) { #pragma omp simd for( int i = 1; i < m-1; i++ ) { A[j][i] = Anew[j][i]; } } if(iter++ % 100 == 0) printf("%5d, %0.6fn", iter, error); } while ( error > tol && iter < iter_max ) { error = 0.0; #pragma acc parallel loop reduction(max:error) for( int j = 1; j < n-1; j++) { #pragma acc loop reduction(max:error) for( int i = 1; i < m-1; i++ ) { Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i]); error = fmax( error, fabs(Anew[j][i] - A[j][i])); } } #pragma acc parallel loop for( int j = 1; j < n-1; j++) { #pragma acc loop for( int i = 1; i < m-1; i++ ) { A[j][i] = Anew[j][i]; } } if(iter++ % 100 == 0) printf("%5d, %0.6fn", iter, error); }
  • 8. 8 Prescribing vs. Describing while ( error > tol && iter < iter_max ) { error = 0.0; #pragma omp parallel for reduction(max:error) for( int j = 1; j < n-1; j++) { #pragma omp simd for( int i = 1; i < m-1; i++ ) { Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i]); error = fmax( error, fabs(Anew[j][i] - A[j][i])); } } #pragma omp parallel for for( int j = 1; j < n-1; j++) { #pragma omp simd for( int i = 1; i < m-1; i++ ) { A[j][i] = Anew[j][i]; } } if(iter++ % 100 == 0) printf("%5d, %0.6fn", iter, error); } while ( error > tol && iter < iter_max ) { error = 0.0; #pragma acc parallel loop reduction(max:error) for( int j = 1; j < n-1; j++) { #pragma acc loop reduction(max:error) for( int i = 1; i < m-1; i++ ) { Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i]); error = fmax( error, fabs(Anew[j][i] - A[j][i])); } } #pragma acc parallel loop for( int j = 1; j < n-1; j++) { #pragma acc loop for( int i = 1; i < m-1; i++ ) { A[j][i] = Anew[j][i]; } } if(iter++ % 100 == 0) printf("%5d, %0.6fn", iter, error); } while ( error > tol && iter < iter_max ) { error = 0.0; #pragma omp target { #pragma omp parallel for reduction(max:error) for( int j = 1; j < n-1; j++) { for( int i = 1; i < m-1; i++ ) { Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i]); error = fmax( error, fabs(Anew[j][i] - A[j][i])); } } #pragma omp parallel for for( int j = 1; j < n-1; j++) { for( int i = 1; i < m-1; i++ ) { A[j][i] = Anew[j][i]; } } } if(iter++ % 100 == 0) printf("%5d, %0.6fn", iter, error); }
• 9. Prescribing vs. Describing (continued). This slide repeats the previous versions and adds an OpenMP version using a target data region with target teams distribute parallel for:

  #pragma omp target data map(alloc:Anew) map(A)
  while ( error > tol && iter < iter_max )
  {
    error = 0.0;
    #pragma omp target teams distribute parallel for reduction(max:error)
    for( int j = 1; j < n-1; j++) {
      for( int i = 1; i < m-1; i++ ) {
        Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                            + A[j-1][i] + A[j+1][i]);
        error = fmax( error, fabs(Anew[j][i] - A[j][i]));
      }
    }
    #pragma omp target teams distribute parallel for
    for( int j = 1; j < n-1; j++) {
      for( int i = 1; i < m-1; i++ ) {
        A[j][i] = Anew[j][i];
      }
    }
  (the rest of the while loop is cut off on the slide)
• 10. Prescribing vs. Describing (continued). This slide repeats the previous versions and adds a split variant: teams distribute over the outer loop with an inner parallel for using schedule(static,1):

  #pragma omp target data map(alloc:Anew) map(A)
  while ( error > tol && iter < iter_max )
  {
    error = 0.0;
    #pragma omp target teams distribute
    for( int j = 1; j < n-1; j++) {
      #pragma omp parallel for reduction(max:error) schedule(static,1)
      for( int i = 1; i < m-1; i++ ) {
        Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                            + A[j-1][i] + A[j+1][i]);
        error = fmax( error, fabs(Anew[j][i] - A[j][i]));
      }
    }
    #pragma omp target teams distribute
    for( int j = 1; j < n-1; j++) {
      #pragma omp parallel for schedule(static,1)
      for( int i = 1; i < m-1; i++ ) {
  (the rest of the loop is cut off on the slide)
• 11. Prescribing vs. Describing (continued). This slide repeats the previous versions and adds a collapsed variant: target teams distribute parallel for with collapse(2):

  #pragma omp target data map(alloc:Anew) map(A)
  while ( error > tol && iter < iter_max )
  {
    error = 0.0;
    #pragma omp target teams distribute parallel for reduction(max:error) collapse(2)
    for( int j = 1; j < n-1; j++) {
      for( int i = 1; i < m-1; i++ ) {
        Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                            + A[j-1][i] + A[j+1][i]);
        error = fmax( error, fabs(Anew[j][i] - A[j][i]));
      }
    }
    #pragma omp target teams distribute parallel for collapse(2)
    for( int j = 1; j < n-1; j++) {
  (the rest of the loop is cut off on the slide)
• 12. Prescribing vs. Describing (continued). This slide repeats the previous versions and again shows the split variant: target teams distribute over the outer loop with an inner parallel for reduction(max:error) schedule(static,1) over the inner loop (the code is cut off on the slide).
• 13. Prescribing vs. Describing (continued). This slide repeats the previous versions and adds the final prescriptive variant, combining collapse(2) with schedule(static,1):

  #pragma omp target data map(alloc:Anew) map(A)
  while ( error > tol && iter < iter_max )
  {
    error = 0.0;
    #pragma omp target teams distribute parallel for reduction(max:error) collapse(2) schedule(static,1)
    for( int j = 1; j < n-1; j++) {
      for( int i = 1; i < m-1; i++ ) {
        Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                            + A[j-1][i] + A[j+1][i]);
        error = fmax( error, fabs(Anew[j][i] - A[j][i]));
      }
    }
    #pragma omp target teams distribute parallel for collapse(2) schedule(static,1)
  (the rest of the loop is cut off on the slide)
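Taken together, slides 7 through 13 show how many prescriptive variants one kernel accumulates. As a hedged sketch (not from the slides), keeping a single prescriptive source performant on both targets tends to end up guarded per build; the USE_GPU macro, array sizes, and function name below are hypothetical.

#include <math.h>

#define N 4096            /* hypothetical problem size */
#define M 4096
double A[N][M], Anew[N][M];

/* Hedged sketch: one sweep of the Jacobi kernel with the directive chosen
   per build via a hypothetical USE_GPU flag. Data movement relies on the
   implicit tofrom mapping of the global arrays (or on an enclosing
   target data region, as on slide 9). */
double jacobi_sweep(void)
{
    double error = 0.0;
#ifdef USE_GPU
    /* GPU build: collapse both loops and interleave iterations so that
       consecutive threads touch consecutive elements (coalesced access). */
    #pragma omp target teams distribute parallel for \
            reduction(max:error) collapse(2) schedule(static,1)
#else
    /* CPU build: threads over rows; each thread keeps contiguous inner
       iterations for cache reuse and vectorization. */
    #pragma omp parallel for reduction(max:error)
#endif
    for (int j = 1; j < N-1; j++)
        for (int i = 1; i < M-1; i++) {
            Anew[j][i] = 0.25 * (A[j][i+1] + A[j][i-1]
                               + A[j-1][i] + A[j+1][i]);
            error = fmax(error, fabs(Anew[j][i] - A[j][i]));
        }
    return error;
}

The descriptive OpenACC version on slide 7 avoids this branching by leaving the schedule to the compiler for whichever target is selected at compile time.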
• 14. Execution Time (Smaller is Better). Speedup vs. the original serial code, same directives (unoptimized): Original 1.00X; CPU Threaded 5.12X; GPU Threaded 0.12X; GPU Teams 1.01X; GPU Split 2.47X; GPU Collapse 0.96X; GPU Split Sched 4.00X; GPU Collapse Sched 17.42X; OpenACC (CPU) 4.96X; OpenACC (GPU) 23.03X. [Bar chart: execution time in seconds per variant.] NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @ 3.00GHz; CLANG pre-3.8, PGI 16.3.
• 15. Importance of Loop Scheduling. Speedup vs. serial: Serial 1.00X; OpenMP (CPU) 5.11X; OpenMP (CPU) interleaved 1.07X; OpenMP (GPU) 0.96X; OpenMP (GPU) interleaved 17.42X. [Bar chart: time in seconds per variant.] NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @ 3.00GHz; CLANG pre-3.8, PGI 16.3.
• 16. Importance of Loop Scheduling. Speedup vs. serial: Serial 1.00X; OpenMP (CPU) 16.55X; OpenMP (CPU) interleaved 2.36X; OpenMP (GPU) 1.64X; OpenMP (GPU) interleaved 29.84X. [Bar chart: time in seconds per variant.] NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @ 3.00GHz; Intel C Compiler 15.0, CLANG pre-3.8, PGI 16.3.
• 17. Why Can't OpenMP Do This? (1)

  #pragma omp target teams distribute parallel for reduction(max:error) collapse(2) schedule(static,1)
  for( int j = 1; j < n-1; j++) {
    for( int i = 1; i < m-1; i++ ) {
      Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                          + A[j-1][i] + A[j+1][i]);
      error = fmax( error, fabs(Anew[j][i] - A[j][i]));
    }
  }
  #pragma omp target teams distribute parallel for collapse(2) schedule(static,1)
  for( int j = 1; j < n-1; j++) {
    for( int i = 1; i < m-1; i++ ) {
      A[j][i] = Anew[j][i];
    }
  }

  Notes: the compiler can freely specify the default schedule, but it cannot add the collapse; without the collapse, the parallelized loop is the wrong one for coalescing on the GPU; the default schedule is implementation defined.
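A small illustration (not from the slides) of why the inner i loop is the one that matters for coalescing:

#include <stdio.h>

/* C arrays are row-major, so elements that differ only in the inner index i
   are adjacent in memory. Coalescing on the GPU wants consecutive threads to
   touch consecutive addresses, so consecutive threads should take
   consecutive values of i, not j. */
int main(void)
{
    static double A[4][8];
    long step_i = (long)((char *)&A[0][1] - (char *)&A[0][0]); /* sizeof(double), typically 8 bytes */
    long step_j = (long)((char *)&A[1][0] - (char *)&A[0][0]); /* 8 * sizeof(double), typically 64 bytes */
    printf("step along i: %ld bytes, step along j: %ld bytes\n", step_i, step_j);
    return 0;
}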
• 18. Why Can't OpenMP Do This? (2)

  #pragma omp target teams distribute
  for( int j = 1; j < n-1; j++) {
    #pragma omp parallel for reduction(max:error) schedule(static,1)
    for( int i = 1; i < m-1; i++ ) {
      Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                          + A[j-1][i] + A[j+1][i]);
      error = fmax( error, fabs(Anew[j][i] - A[j][i]));
    }
  }
  #pragma omp target teams distribute
  for( int j = 1; j < n-1; j++) {
    #pragma omp parallel for schedule(static,1)
    for( int i = 1; i < m-1; i++ ) {
      A[j][i] = Anew[j][i];
    }
  }

  Notes: this is the right loop for coalescing on the GPU, but it makes no sense to distribute to teams on the CPU; the default schedule is implementation defined.
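As a hedged sanity check (not from the slides), one can ask the runtime how many teams it actually creates for a given target; on a typical CPU host a teams region often yields a single team, so the distribute contributes no parallelism there and only the inner parallel for does the work.

#include <stdio.h>
#include <omp.h>

/* Probe how many teams this implementation creates for a teams region. */
int main(void)
{
    int teams = 0;
    #pragma omp target teams map(tofrom:teams)
    {
        if (omp_get_team_num() == 0)      /* only the first team reports */
            teams = omp_get_num_teams();
    }
    printf("teams created by this implementation: %d\n", teams);
    return 0;
}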
• 19. Why Can't OpenMP Do This? (3)

  #pragma omp target teams distribute parallel for reduction(max:error) collapse(2) simd
  for( int j = 1; j < n-1; j++) {
    for( int i = 1; i < m-1; i++ ) {
      Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                          + A[j-1][i] + A[j+1][i]);
      error = fmax( error, fabs(Anew[j][i] - A[j][i]));
    }
  }
  #pragma omp target teams distribute parallel for collapse(2) simd
  for( int j = 1; j < n-1; j++) {
    for( int i = 1; i < m-1; i++ ) {
      A[j][i] = Anew[j][i];
    }
  }

  Notes: the compiler can always choose one team, making this behave as a plain parallel for; maybe the compiler can treat simd as a coalescing hint? Is this the new best practice? The default schedule is implementation defined.
• 20. Performance Portability Results. [Bar chart: speedup vs. a single CPU core for miniGhost (Mantevo), NEMO (Climate & Ocean), and CLOVERLEAF (Physics); series: CPU: MPI + OpenMP*.] PGI 15.10 compiler. miniGhost: CPU: Intel Xeon E5-2698 v3, 2 sockets, 32 cores total; GPU: Tesla K80 (single GPU). NEMO: each socket CPU: Intel Xeon E5-2698 v3, 16 cores; GPU: NVIDIA K80, both GPUs. CLOVERLEAF: CPU: dual-socket Intel Xeon E5-2690 v2, 20 cores total; GPU: Tesla K80, both GPUs. * NEMO run used all-MPI.
• 21. Performance Portability Results. [Same chart, adding the series CPU: MPI + OpenACC.] PGI 15.10 compiler; system details as on slide 20. * NEMO run used all-MPI.
• 22. Performance Portability Results. [Same chart, adding the series CPU + GPU: MPI + OpenACC; one off-scale bar is annotated 30X.] PGI 15.10 compiler; system details as on slide 20. * NEMO run used all-MPI.
• 23. Challenge to the Community. OpenMP should additionally adopt a descriptive programming style. It is possible to be performance portable by being descriptive first and prescriptive as necessary. The community must adopt new best practices (parallel for isn't enough any more).
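For context beyond the slides: OpenMP later moved in this direction with the loop construct in OpenMP 5.0. A minimal sketch of the update sweep written that way, assuming an OpenMP 5.0 compiler (array sizes and function name are illustrative):

#include <math.h>

#define N 4096
#define M 4096
double A[N][M], Anew[N][M];

/* Sketch assuming OpenMP 5.0: `loop` only asserts that the iterations are
   independent and leaves the schedule to the implementation, much like
   OpenACC's `loop` in the descriptive version on slide 7. */
double sweep_descriptive_omp(void)
{
    double error = 0.0;
    #pragma omp target teams loop reduction(max:error) collapse(2)
    for (int j = 1; j < N-1; j++)
        for (int i = 1; i < M-1; i++) {
            Anew[j][i] = 0.25 * (A[j][i+1] + A[j][i-1]
                               + A[j-1][i] + A[j+1][i]);
            error = fmax(error, fabs(Anew[j][i] - A[j][i]));
        }
    return error;
}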