
Accelerating HPC Applications on NVIDIA GPUs with OpenACC

In this deck from the Stanford HPC Conference, Doug Miles of NVIDIA presents: "Accelerating HPC Applications on NVIDIA GPUs with OpenACC."

"OpenACC is a directive-based parallel programming model for GPU accelerated and heterogeneous parallel HPC systems. It offers higher programmer productivity compared to use of explicit models like CUDA and OpenCL.

Application source code instrumented with OpenACC directives remains portable to any system with a standard Fortran/C/C++ compiler, and can be efficiently parallelized for various types of HPC systems – multicore CPUs, heterogeneous CPU+GPU, and manycore processors.

This talk includes an introduction to the OpenACC programming model, examples of its use in a number of production applications, an explanation of how OpenACC and CUDA Unified Memory work together to dramatically simplify GPU programming, and a few closing thoughts on future directions for OpenACC."
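
As a concrete illustration of the directive-based style described in the abstract, here is a minimal OpenACC vector-addition sketch in C. This is an editorial example, not code from the deck, and the function and variable names are hypothetical; a single pragma asks the compiler to parallelize the loop and manage the data movement, while the source still builds as ordinary C wherever the directive is ignored.

    /* Minimal OpenACC sketch (illustrative only). */
    void vecadd(int n, const float *restrict a, const float *restrict b,
                float *restrict c)
    {
        #pragma acc parallel loop copyin(a[0:n], b[0:n]) copyout(c[0:n])
        for (int i = 0; i < n; ++i)
            c[i] = a[i] + b[i];
    }

With the PGI compilers featured in this deck, such a file might be built with something like pgcc -fast -ta=tesla for GPUs or -ta=multicore for CPUs; the exact command line here is an assumption, but the -ta flags match those shown on later slides.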

Watch the video: https://youtu.be/CaE3n89QM8o

Learn more: https://www.openacc.org/
and
http://hpcadvisorycouncil.com

Sign up for our insideHPC Newsletter: http://insidehpc.com/newsletter


Accelerating HPC Applications on NVIDIA GPUs with OpenACC

  1. 1. Doug Miles, PGI Compilers & Tools, NVIDIA. High Performance Computing Advisory Council, February 21, 2018. ACCELERATING HPC APPLICATIONS ON NVIDIA GPUS WITH OPENACC
  2. 2. 2 PGI — THE NVIDIA HPC SDK Fortran, C & C++ Compilers Optimizing, SIMD Vectorizing, OpenMP Accelerated Computing Features CUDA Fortran, OpenACC Directives Multi-Platform Solution X86-64 and OpenPOWER Multicore CPUs NVIDIA Tesla GPUs Supported on Linux, macOS, Windows MPI/OpenMP/OpenACC Tools Debugger Performance Profiler Interoperable with DDT, TotalView
  3. 3. 3 Programming GPU-Accelerated Systems: Separate CPU System and GPU Memories (diagram: GPU developer view, with system memory and GPU memory connected by PCIe)
  4. 4. 4 Programming GPU-Accelerated Systems: Separate CPU System and GPU Memories (diagram: the same GPU developer view, with system memory and GPU memory connected by NVLink)
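     As a rough sketch of what this separate-memory developer view means in code (an editorial example using the standard CUDA runtime C API; not code from the slides, and process_on_gpu is a hypothetical name), the programmer allocates device memory and moves data across PCIe or NVLink explicitly:

         /* Illustrative only: explicit allocation and transfers between
            separate CPU and GPU memories via the CUDA runtime API. */
         #include <cuda_runtime.h>

         void process_on_gpu(float *host_a, size_t n)
         {
             float *dev_a;
             cudaMalloc((void **)&dev_a, n * sizeof(float));                        /* allocate in GPU memory    */
             cudaMemcpy(dev_a, host_a, n * sizeof(float), cudaMemcpyHostToDevice);  /* copy host to device       */
             /* ... launch kernels or OpenACC compute regions that read/write dev_a ... */
             cudaMemcpy(host_a, dev_a, n * sizeof(float), cudaMemcpyDeviceToHost);  /* copy device back to host  */
             cudaFree(dev_a);
         }

     The OpenACC data directives, and later CUDA Unified Memory, shown in the rest of the deck exist largely to reduce or eliminate this bookkeeping.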
  5. 5. 5 attributes(global) subroutine mm_kernel ( A, B, C, N, M, L ) real :: A(N,M), B(M,L), C(N,L), Cij integer, value :: N, M, L integer :: i, j, kb, k, tx, ty real, shared :: Asub(16,16),Bsub(16,16) tx = threadidx%x ty = threadidx%y i = blockidx%x * 16 + tx j = blockidx%y * 16 + ty Cij = 0.0 do kb = 1, M, 16 Asub(tx,ty) = A(i,kb+tx-1) Bsub(tx,ty) = B(kb+ty-1,j) call syncthreads() do k = 1,16 Cij = Cij + Asub(tx,k) * Bsub(k,ty) enddo call syncthreads() enddo C(i,j) = Cij end subroutine mmul_kernel real, device, allocatable, dimension(:,:) :: Adev,Bdev,Cdev . . . allocate (Adev(N,M), Bdev(M,L), Cdev(N,L)) Adev = A(1:N,1:M) Bdev = B(1:M,1:L) call mm_kernel <<<dim3(N/16,M/16),dim3(16,16)>>> ( Adev, Bdev, Cdev, N, M, L ) C(1:N,1:L) = Cdev deallocate ( Adev, Bdev, Cdev ) . . . CPU Code Tesla Code CUDA FORTRAN
  6. 6. 6 CUDA FORTRAN module madd_device_module use cudafor contains subroutine madd_dev(a,b,c,sum,n1,n2) real,dimension(:,:),device :: a,b,c real :: sum integer :: n1,n2 type(dim3) :: grid, block !$cuf kernel do (2) <<<(*,*),(32,4)>>> do j = 1,n2 do i = 1,n1 a(i,j) = b(i,j) + c(i,j) sum = sum + a(i,j) enddo enddo end subroutine end module Equivalent hand-written CUDA kernels module madd_device_module use cudafor implicit none contains attributes(global) subroutine madd_kernel(a,b,c,blocksum,n1,n2) real, dimension(:,:) :: a,b,c real, dimension(:) :: blocksum integer, value :: n1,n2 integer :: i,j,tindex,tneighbor,bindex real :: mysum real, shared :: bsum(256) ! Do this thread's work mysum = 0.0 do j = threadidx%y + (blockidx%y-1)*blockdim%y, n2, blockdim%y*griddim%y do i = threadidx%x + (blockidx%x-1)*blockdim%x, n1, blockdim%x*griddim%x a(i,j) = b(i,j) + c(i,j) mysum = mysum + a(i,j) ! accumulates partial sum per thread enddo enddo ! Now add up all partial sums for the whole thread block ! Compute this thread's linear index in the thread block ! We assume 256 threads in the thread block tindex = threadidx%x + (threadidx%y-1)*blockdim%x ! Store this thread's partial sum in the shared memory block bsum(tindex) = mysum call syncthreads() ! Accumulate all the partial sums for this thread block to a single value tneighbor = 128 do while( tneighbor >= 1 ) if( tindex <= tneighbor ) & bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor) tneighbor = tneighbor / 2 call syncthreads() enddo ! Store the partial sum for the thread block bindex = blockidx%x + (blockidx%y-1)*griddim%x if( tindex == 1 ) blocksum(bindex) = bsum(1) end subroutine ! Add up partial sums for all thread blocks to a single cumulative sum attributes(global) subroutine madd_sum_kernel(blocksum,dsum,nb) real, dimension(:) :: blocksum real :: dsum integer, value :: nb real, shared :: bsum(256) integer :: tindex,tneighbor,i ! Again, we assume 256 threads in the thread block ! accumulate a partial sum for each thread tindex = threadidx%x bsum(tindex) = 0.0 do i = tindex, nb, blockdim%x bsum(tindex) = bsum(tindex) + blocksum(i) enddo call syncthreads() ! This code is copied from the previous kernel ! Accumulate all the partial sums for this thread block to a single value ! Since there is only one thread block, this single value is the final result tneighbor = 128 do while( tneighbor >= 1 ) if( tindex <= tneighbor ) & bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor) tneighbor = tneighbor / 2 call syncthreads() enddo if( tindex == 1 ) dsum = bsum(1) end subroutine subroutine madd_dev(a,b,c,dsum,n1,n2) real, dimension(:,:), device :: a,b,c real, device :: dsum real, dimension(:), allocatable, device :: blocksum integer :: n1,n2,nb type(dim3) :: grid, block integer :: r ! Compute grid/block size; block size must be 256 threads grid = dim3((n1+31)/32, (n2+7)/8, 1) block = dim3(32,8,1) nb = grid%x * grid%y allocate(blocksum(1:nb)) call madd_kernel<<< grid, block >>>(a,b,c,blocksum,n1,n2) call madd_sum_kernel<<< 1, 256 >>>(blocksum,dsum,nb) r = cudaThreadSynchronize() ! don't deallocate too early deallocate(blocksum) end subroutine !$CUF KERNEL Directives
  7. 7. 7 OpenACC Directives Manage Data Movement Initiate Parallel Execution Optimize Loop Mappings #pragma acc data copyin(a,b) copyout(c) { ... #pragma acc parallel { #pragma acc loop gang vector for (i = 0; i < n; ++i) { c[i] = a[i] + b[i]; ... } } ... } CPU, GPU, Manycore Performance portable Interoperable Single source Incremental
  8. 8. 8 OpenACC for GPUs in a Nutshell (diagram: a and b allocated in GPU memory for the duration of the data region, with b copied between system and GPU memory at the region boundaries) ... #pragma acc data copy(b[0:n][0:m]) create(a[0:n][0:m]) { for (iter = 1; iter <= p; ++iter){ #pragma acc parallel loop for (i = 1; i < n-1; ++i){ for (j = 1; j < m-1; ++j){ a[i][j]=w0*b[i][j]+ w1*(b[i-1][j]+b[i+1][j]+ b[i][j-1]+b[i][j+1])+ w2*(b[i-1][j-1]+b[i-1][j+1]+ b[i+1][j-1]+b[i+1][j+1]); } } #pragma acc parallel loop for( i = 1; i < n-1; ++i ) for( j = 1; j < m-1; ++j ) b[i][j] = a[i][j]; } } ...
  9. 9. 9 Multicore CPU OpenACC is for Multicore, Manycore & GPUs % pgfortran -ta=multicore -fast -Minfo=acc -c update_tile_halo_kernel.f90 . . . 100, Loop is parallelizable Generating Multicore code 100, !$acc loop gang 102, Loop is parallelizable Tesla GPU % pgfortran -ta=tesla -fast -Minfo=acc -c update_tile_halo_kernel.f90 . . . 100, Loop is parallelizable 102, Loop is parallelizable Accelerator kernel generated Generating Tesla code 100, !$acc loop gang, vector(4) ! blockidx%y threadidx%y 102, !$acc loop gang, vector(32) ! blockidx%x threadidx%x 98 !$acc parallel 99 !$acc loop independent 100 do k=y_min-depth,y_max+depth 101 !$acc loop independent 102 do j=1,depth 103 density0(x_min-j,k)=left_density0(left_xmax+1-j,k) 104 enddo 105 enddo 106 !$acc end parallel
  10. 10. SPEC ACCEL 1.2 BENCHMARKS (charts: geometric-mean seconds for the OpenMP 4.5 suite built with Intel 2018 and PGI 18.1 on 2-socket Skylake (40 cores / 80 threads), 2-socket EPYC (48 cores / 48 threads), and 2-socket Broadwell (40 cores / 80 threads); and for the PGI 18.1 OpenACC suite on 2-socket Broadwell vs. one Volta V100, a 4.4x speed-up) Performance measured February, 2018. Skylake: Two 20 core Intel Xeon Gold 6148 CPUs @ 2.4GHz w/ 376GB memory, hyperthreading enabled. EPYC: Two 24 core AMD EPYC 7451 CPUs @ 2.3GHz w/ 256GB memory. Broadwell: Two 20 core Intel Xeon E5-2698 v4 CPUs @ 3.6GHz w/ 256GB memory, hyperthreading enabled. Volta: NVIDIA DGX1 system with two 20 core Intel Xeon E5-2698 v4 CPUs @ 2.20GHz, 256GB memory, one NVIDIA Tesla V100-SXM2-16GB GPU @ 1.53GHz. SPEC® is a registered trademark of the Standard Performance Evaluation Corporation (www.spec.org).
  11. 11. 11 OPENACC APPLICATIONS
  12. 12. 12 GAUSSIAN 16 "Using OpenACC allowed us to continue development of our fundamental algorithms and software capabilities simultaneously with the GPU-related work. In the end, we could use the same code base for SMP, cluster/network and GPU parallelism. PGI's compilers were essential to the success of our efforts." Mike Frisch, Ph.D., President and CEO, Gaussian, Inc. Project contributors: Roberto Gomperts (NVIDIA), Michael Frisch (Gaussian), Brent Leback (NVIDIA/PGI), Gio... Example input: %GPUCPU=0-7=0-7 (use GPUs 0-7 with CPUs 0-7 as their controllers). Detailed information is available on our website.
  13. 13. 13 ANSYS FLUENT We’ve effectively used OpenACC for heterogeneous computing in ANSYS Fluent with impressive performance. We’re now applying this work to more of our models and new platforms. Sunil Sathe Lead Software Developer ANSYS Fluent Image courtesy: ANSYS
  14. 14. 14 VASP For VASP, OpenACC is the way forward for GPU acceleration. Performance is similar and in some cases better than CUDA C, and OpenACC dramatically decreases GPU development and maintenance efforts. We’re excited to collaborate with NVIDIA and PGI as an early adopter of CUDA Unified Memory. Prof. Georg Kresse Computational Materials Physics University of Vienna
  15. 15. 15 David Gutzwiller Lead Software Developer NUMECA NUMECA FINE/Open Porting our unstructured C++ CFD solver FINE/Open to GPUs using OpenACC would have been impossible two or three years ago, but OpenACC has developed enough that we’re now getting some really good results.
  16. 16. 16 MPAS-A Our team has been evaluating OpenACC as a pathway to performance portability for the Model for Prediction Across Scales (MPAS) atmospheric model. Using this approach on the MPAS dynamical core, we have achieved performance on a single P100 GPU equivalent to 2.7 dual-socket Intel Xeon nodes on our new Cheyenne supercomputer. Richard Loft, Director, Technology Development, NCAR. Image courtesy: NCAR
  17. 17. 17 OpenACC made it practical to develop for GPU-based hardware while retaining a single source for almost all the COSMO physics code. Dr. Oliver Fuhrer Senior Scientist Meteoswiss COSMO
  18. 18. 18 GAMERA FOR GPU With OpenACC and a compute node based on NVIDIA's Tesla P100 GPU, we achieved more than a 14X speed-up over a K Computer node running our earthquake disaster simulation code. Takuma Yamaguchi, Kohei Fujita, Tsuyoshi Ichimura, Muneo Hori, Lalith Wijerathne, The University of Tokyo. Map courtesy: University of Tokyo
  19. 19. 19 QUANTUM ESPRESSO CUDA Fortran gives us the full performance potential of the CUDA programming model and NVIDIA GPUs. !$CUF KERNELS directives give us productivity and source code maintainability. It’s the best of both worlds. Filippo Spiga Head of Research Software Engineering University of Cambridge
  20. 20. 20 OPENACC AND CUDA UNIFIED MEMORY
  21. 21. 21 Programming GPU-Accelerated Systems: CUDA Unified Memory for Dynamically Allocated Data (diagrams: the GPU developer view with separate system and GPU memories over PCIe, and the simplified developer view with a single CUDA Unified Memory)
  22. 22. 22 How CUDA Unified Memory Works on TESLA GPUs: Servicing CPU and GPU Page Faults for Allocatable Data (diagram: CPU and GPU memory mappings of array, connected by PCIe or NVLink, with page faults serviced on either side) __global__ void setValue(char *ptr, int index, char val) { ptr[index] = val; } cudaMallocManaged(&array, size); memset(array, 0, size); setValue<<<...>>>(array, size/2, 5); ...
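     To show the page-fault mechanism end to end, here is a complete version of the snippet above as an editorial sketch (the trivial <<<1, 1>>> launch configuration and the buffer size are assumptions, not from the slide). cudaMallocManaged returns one pointer that both the CPU (memset) and the GPU (the setValue kernel) touch, and pages migrate on demand when either side faults:

         /* Illustrative sketch of CUDA Unified Memory (CUDA C). */
         #include <cuda_runtime.h>
         #include <string.h>

         __global__ void setValue(char *ptr, int index, char val) { ptr[index] = val; }

         int main(void)
         {
             const int size = 1 << 20;
             char *array;
             cudaMallocManaged(&array, size);        /* one pointer, visible to CPU and GPU        */
             memset(array, 0, size);                 /* CPU touch: pages fault into system memory  */
             setValue<<<1, 1>>>(array, size / 2, 5); /* GPU touch: faulted pages migrate to GPU    */
             cudaDeviceSynchronize();                /* wait before the CPU reads or frees         */
             cudaFree(array);
             return 0;
         }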
  23. 23. 23 PGI OpenACC and CUDA Unified Memory: compiling with the -ta=tesla:managed option, C malloc, C++ new, and Fortran allocate are all mapped to CUDA Unified Memory. #pragma acc data copyin(a,b) copyout(c) { ... #pragma acc parallel { #pragma acc loop gang vector for (i = 0; i < n; ++i) { c[i] = a[i] + b[i]; ... } } ... } (diagram: GPU developer view with CUDA Unified Memory)
  24. 24. 24 PGI OpenACC and CUDA Unified Memory: compiling with the -ta=tesla:managed option, C malloc, C++ new, and Fortran allocate are all mapped to CUDA Unified Memory, so the explicit data region can be dropped. ... #pragma acc parallel { #pragma acc loop gang vector for (i = 0; i < n; ++i) { c[i] = a[i] + b[i]; ... } } ... (diagram: GPU developer view with CUDA Unified Memory)
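     A hedged sketch of what this enables (editorial example; the file name vecadd_managed.c and the exact command line are assumptions, though the -ta=tesla:managed flag is the one named on the slide): arrays come from plain malloc, there are no data directives or copy clauses, and Unified Memory handles the movement.

         /* Build (assumed command line):  pgcc -fast -ta=tesla:managed vecadd_managed.c */
         #include <stdlib.h>

         int main(void)
         {
             int n = 1 << 20;
             float *a = malloc(n * sizeof(float));
             float *b = malloc(n * sizeof(float));
             float *c = malloc(n * sizeof(float));

             for (int i = 0; i < n; ++i) { a[i] = 1.0f; b[i] = 2.0f; }

             #pragma acc parallel loop   /* no data clauses: managed memory migrates the pages */
             for (int i = 0; i < n; ++i)
                 c[i] = a[i] + b[i];

             free(a); free(b); free(c);
             return 0;
         }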
  25. 25. 25 Center for Accelerated Application Readiness (CAAR) Oak Ridge Leadership Computing Facility IBM POWER9 CPUs NVIDIA Volta V100 GPUs
  26. 26. 26 GTC: An OpenACC Production Application The gyrokinetic toroidal code (GTC) is a massively parallel, particle-in-cell production code for turbulence simulation in support of the burning plasma experiment ITER, the crucial next step in the quest for fusion energy. Being ported for runs on the ORNL Summit supercomputer http://phoenix.ps.uci.edu/gtc_group
  27. 27. 27 GTC Performance using OpenACC (chart: speedup vs. a 20-core P8 baseline for P8+2xP100 and P8+4xP100, each measured with Unified Memory and with explicit data directives, and for x64+4xV100; labeled speedups of 5.9X, 6.1X, 12X, 12.1X, and 16.5X) OpenPOWER | NVLink | Unified Memory | P100 | V100. P8: IBM POWER8NVL, 2 sockets, 20 cores, NVLINK. UM: no data directives in sources, compiled with -ta=tesla:managed.
  28. 28. 28 DEEP COPY
  29. 29. 29 Managing Aggregate Data Structures with OpenACC An Example from the OpenACC port of VASP Derived Type 1 Members: 3 dynamic 1 derived type 2 Derived Type 2 Members: 21 dynamic 1 derived type 3 1 derived type 4 Derived Type 3 Members: only static Derived Type 4 Members: 8 dynamic 4 derived type 5 2 derived type 6 Derived Type 5 Members: 3 dynamic Derived Type 6 Members: 8 dynamic • Real-world applications often have complex, aggregate data structures • CUDA Unified Memory can automatically manage Deep Copy, but …
  30. 30. 30 Managing Aggregate Data Structures with OpenACC An Example from the OpenACC port of VASP Derived Type 1 Members: 3 dynamic 1 derived type 2 Derived Type 2 Members: 21 dynamic 1 derived type 3 1 derived type 4 Derived Type 3 Members: only static Derived Type 4 Members: 8 dynamic 4 derived type 5 2 derived type 6 Derived Type 5 Members: 3 dynamic Derived Type 6 Members: 8 dynamic • Real-world applications often have complex, aggregate data structures • CUDA Unified Memory can automatically manage Deep Copy, but … • CUDA Unified Memory is only for allocatable data today
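     To make the deep-copy problem concrete, here is a hypothetical C sketch (editorial, and much simpler than the VASP derived types above): an aggregate with a dynamic member and a nested struct that has its own dynamic member. Copying only the outer struct to the device would copy the pointer values, not the arrays they point to, so every level of dynamic data has to be created on the device and attached, or handled by Unified Memory.

         /* Hypothetical aggregate with two levels of dynamic members. */
         typedef struct { double *coords;  int n;           } nodes;
         typedef struct { double *weights; int nw; nodes nd; } mesh;

         void scale_weights(mesh m, double s)
         {
             /* Manual deep copy in the OpenACC 2.6 style: copy the parent,
                then each dynamic member, so the device copies get attached. */
             #pragma acc data copyin(m, m.nd.coords[0:m.nd.n]) copy(m.weights[0:m.nw])
             {
                 #pragma acc parallel loop
                 for (int i = 0; i < m.nw; ++i)
                     m.weights[i] *= s;
             }
         }

     Slide 32 below shows the same manual pattern from the deck itself; for allocatable data, CUDA Unified Memory avoids it entirely.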
  31. 31. 31 FORTRAN AUTOMATIC FULL DEEP COPY Fortran Derived Types
  32. 32. 32 OPENACC 2.6 MANUAL DEEP COPY typedef struct points { float* x; float* y; float* z; int n; float coef, direction; } points; void sub ( int n, float* y ) { points p; #pragma acc data create (p) { p.n = n; p.x = ( float*) malloc ( sizeof ( float )*n ); p.y = ( float*) malloc ( sizeof ( float )*n ); p.z = ( float*) malloc ( sizeof ( float )*n ); #pragma acc update device (p.n) #pragma acc data copyin (p.x[0:n], p.y[0:n]) { #pragma acc parallel loop for ( i = 0; i < p.n; ++i ) p.x[i] += p.y[i]; . . . Supported Today in PGI Compilers
  33. 33. 33 DRAFT OPENACC 3.0 TRUE DEEP COPY typedef struct points { float* x; float* y; float* z; int n; float coef, direction; #pragma acc policy inout(x[0:n],y[0:n]) } points; void sub ( int n, float* y ) { points p; p.n = n; p.x = ( float*) malloc ( sizeof ( float )*n ); p.y = ( float*) malloc ( sizeof ( float )*n ); p.z = ( float*) malloc ( sizeof ( float )*n ); #pragma acc data copy (p) { #pragma acc parallel loop for ( i = 0; i < p.n; ++i ) p.x[i] += p.y[i]; . . . Still in definition by the OpenACC Committee
  34. 34. 34 WHITHER OPENACC?
  35. 35. 35 CLOVERLEAF AWE Hydrodynamics mini-App, bm32 data set, http://uk-mac.github.io/CloverLeaf (chart: speedup vs. a single Haswell core for PGI 18.1 OpenACC and Intel 2018 OpenMP on multicore Haswell, Broadwell, and Skylake, and for OpenACC on Kepler, Pascal, and Volta V100 GPUs; labeled speedups include 7.6x, 7.9x, 10x, 11x, 14.8x, 15x, 40x, 67x, 109x, and 142x) Systems: Haswell: 2x16 core Haswell server, four K80s, CentOS 7.2 (perf-hsw10); Broadwell: 2x20 core Broadwell server, eight P100s (dgx1-prd-01); Broadwell server, eight V100s (dgx07); Skylake: 2x20 core Xeon Gold server (sky-4). Compilers: Intel 2018.0.128, PGI 18.1. Benchmark: CloverLeaf v1.3 downloaded from http://uk-mac.github.io/CloverLeaf the week of November 7, 2016; CloverLeaf_Serial; CloverLeaf_ref (MPI+OpenMP); CloverLeaf_OpenACC (MPI+OpenACC). Data compiled by PGI February 2018.
  36. 36. 36 OPENACC DIRECTIVES FOR GPUS 75 !$ACC KERNELS 76 !$ACC LOOP INDEPENDENT 77 DO k=y_min,y_max 78 !$ACC LOOP INDEPENDENT PRIVATE(right_flux,left_flux,top_flux,bottom_flux,total_flux, min_cell_volume,energy_change,recip_volume) 79 DO j=x_min,x_max 80 81 left_flux= (xarea(j ,k )*(xvel0(j ,k )+xvel0(j ,k+1) & 82 +xvel0(j ,k )+xvel0(j ,k+1)))*0.25_8*dt*0.5 83 right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) & 84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt*0.5 85 bottom_flux=(yarea(j ,k )*(yvel0(j ,k )+yvel0(j+1,k ) & 86 +yvel0(j ,k )+yvel0(j+1,k )))*0.25_8*dt*0.5 87 top_flux= (yarea(j ,k+1)*(yvel0(j ,k+1)+yvel0(j+1,k+1) & 88 +yvel0(j ,k+1)+yvel0(j+1,k+1)))*0.25_8*dt*0.5 89 total_flux=right_flux-left_flux+top_flux-bottom_flux 90 91 volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux) 92 93 min_cell_volume=MIN(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux & 94 ,volume(j,k)+right_flux-left_flux & 95 ,volume(j,k)+top_flux-bottom_flux) 97 recip_volume=1.0/volume(j,k) 99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))*... 101 energy1(j,k)=energy0(j,k)-energy_change 103 density1(j,k)=density0(j,k)*volume_change(j,k) 105 ENDDO 106 ENDDO 107 !$ACC END KERNELS % pgfortran –fast –ta=tesla –Minfo -c PdV_kernel.f90 pdv_kernel: ... 77, Loop is parallelizable 79, Loop is parallelizable Accelerator kernel generated Generating Tesla code 77, !$acc loop gang, vector(4) ! blockidx%y ! threadidx%y 79, !$acc loop gang, vector(32)! blockidx%x ! threadidx%x ... http://uk-mac.github.io/CloverLeaf
  37. 37. 37 OPENACC DIRECTIVES FOR MULTICORE CPUS 75 !$ACC KERNELS 76 !$ACC LOOP INDEPENDENT 77 DO k=y_min,y_max 78 !$ACC LOOP INDEPENDENT PRIVATE(right_flux,left_flux,top_flux,bottom_flux,total_flux, min_cell_volume,energy_change,recip_volume) 79 DO j=x_min,x_max 80 81 left_flux= (xarea(j ,k )*(xvel0(j ,k )+xvel0(j ,k+1) & 82 +xvel0(j ,k )+xvel0(j ,k+1)))*0.25_8*dt*0.5 83 right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) & 84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt*0.5 85 bottom_flux=(yarea(j ,k )*(yvel0(j ,k )+yvel0(j+1,k ) & 86 +yvel0(j ,k )+yvel0(j+1,k )))*0.25_8*dt*0.5 87 top_flux= (yarea(j ,k+1)*(yvel0(j ,k+1)+yvel0(j+1,k+1) & 88 +yvel0(j ,k+1)+yvel0(j+1,k+1)))*0.25_8*dt*0.5 89 total_flux=right_flux-left_flux+top_flux-bottom_flux 90 91 volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux) 92 93 min_cell_volume=MIN(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux & 94 ,volume(j,k)+right_flux-left_flux & 95 ,volume(j,k)+top_flux-bottom_flux) 97 recip_volume=1.0/volume(j,k) 99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))*... 101 energy1(j,k)=energy0(j,k)-energy_change 103 density1(j,k)=density0(j,k)*volume_change(j,k) 105 ENDDO 106 ENDDO 107 !$ACC END KERNELS % pgfortran –fast –ta=multicore ... PdV_kernel.f90 pdv_kernel: ... 77, Loop is parallelizable Generating Multicore code 77, !$acc loop gang 79, Loop is parallelizable 3 loop-carried redundant expressions removed with 9 operations and 9 arrays Innermost loop distributed: 2 new loops Generated vector SIMD code for the loop Generated 2 prefetch instructions for the loop Generated 12 prefetch instructions for the loop ... http://uk-mac.github.io/CloverLeaf
  38. 38. 38 FORTRAN 2018 DO CONCURRENT 75 76 77 DO CONCURRENT (k=y_min:y_max, j=x_min:x_max) & 78 LOCAL (right_flux,left_flux,top_flux,bottom_flux,total_flux, & min_cell_volume,energy_change,recip_volume) 79 80 81 left_flux= (xarea(j ,k )*(xvel0(j ,k )+xvel0(j ,k+1) & 82 +xvel0(j ,k )+xvel0(j ,k+1)))*0.25_8*dt*0.5 83 right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) & 84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt*0.5 85 bottom_flux=(yarea(j ,k )*(yvel0(j ,k )+yvel0(j+1,k ) & 86 +yvel0(j ,k )+yvel0(j+1,k )))*0.25_8*dt*0.5 87 top_flux= (yarea(j ,k+1)*(yvel0(j ,k+1)+yvel0(j+1,k+1) & 88 +yvel0(j ,k+1)+yvel0(j+1,k+1)))*0.25_8*dt*0.5 89 total_flux=right_flux-left_flux+top_flux-bottom_flux 90 91 volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux) 92 93 min_cell_volume=MIN(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux & 94 ,volume(j,k)+right_flux-left_flux & 95 ,volume(j,k)+top_flux-bottom_flux) 97 recip_volume=1.0/volume(j,k) 99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))*... 101 energy1(j,k)=energy0(j,k)-energy_change 103 density1(j,k)=density0(j,k)*volume_change(j,k) 105 106 ENDDO 107 Fortran 2018 DO CONCURRENT + True Parallel Loops + Loop-scope shared/private data − No support for reductions − No support for atomics − No support for data management
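     For contrast with the "no support for reductions" point on this slide, here is a hedged OpenACC sketch in C (editorial example; the deck's own loops are Fortran). OpenACC states the reduction explicitly in the directive, which Fortran 2018 DO CONCURRENT, as described above, has no way to express:

         /* Illustrative OpenACC reduction (C). */
         float dot(int n, const float *restrict x, const float *restrict y)
         {
             float sum = 0.0f;
             #pragma acc parallel loop reduction(+:sum) copyin(x[0:n], y[0:n])
             for (int i = 0; i < n; ++i)
                 sum += x[i] * y[i];
             return sum;
         }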
  39. 39. 39 OPENACC FOR EVERYONE The PGI Community Edition, pgicompilers.com/community PROGRAMMING MODELS: OpenACC, CUDA Fortran, OpenMP, C/C++/Fortran compilers and tools. PLATFORMS: X86, OpenPOWER, NVIDIA GPU. UPDATES: 1-2 times a year / 6-9 times a year / 6-9 times a year. SUPPORT: User Forums / PGI Support / PGI Premier Services. LICENSE: Annual / Perpetual / Volume/Site. FREE
