USING	
  OPENGL	
  AND	
  DIRECTX	
  FOR	
  
HETEROGENEOUS	
  COMPUTE	
  
KARL	
  HILLESLAND	
  
AGENDA	
  

THE	
  GRAPHICS	
  PIPELINE	
  

PROGRAMMING	
  THE	
  GPU	
  

FEEDING	
  THE	
  GPU	
  

2	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  
The	
  Graphics	
  
Pipeline	
  
GRAPHICS	
  PIPELINE	
  
SHADER	
  CENTRIC	
  

OpenGL	
  

DirectX	
  

	
  
Vertex	
  Shader	
  

	
  

Vertex	
  Shader

	
  

Tessellation	
  Control	
  Shader

	
  
Tessellation	
  Evaluation	
  Shader	
  
Geometry	
  Shader	
  
Rasterizer	
  
Fragment	
  Shader	
  
Per-Fragment	
  Operations	
  

Tessellation	
  Primitive	
  Generator

4	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

	
  

Input	
  Assembler

Vertex	
  Puller

	
  
Tessellator	
  

Hull	
  Shader

	
  

Domain	
  Shader

	
  

Geometry	
  Shader

	
  

Rasterizer

	
  

Pixel	
  Shader

	
  

Output	
  Merger
GRAPHICS	
  PIPELINE	
  
MORE	
  DETAILS	
  

indices,	
  
vertices	
  

	
  

Input	
  Assembler

Thread	
  per	
  DS	
  vertex	
  (n3)	
  
Barycentric	
  
Domain	
  Shader

	
  

DS	
  vertex	
  
Collects	
  prims

	
  

vertex	
  

	
  

	
  

Tessellator

Patch	
  verts	
  n2	
  

	
  

	
  

Prim	
  verts	
   Geometry	
  Shader

5	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

Prims	
  

Collects	
  Patches
Patch	
  verts	
  n1	
  

Tess	
  
factors	
  

Collects	
  patches

	
  

vertex	
  

Vertex	
  Shader
Thread	
  
per	
  vertex	
  

	
  

Patch	
  Constant

	
  

Hull	
  Shader

Control	
  point	
  

	
  

Primitive	
  Assembler

Thread	
  per	
  output	
  
control	
  point	
  n2	
  

	
  

Next	
  Slide
prim	
  

Hi-­‐Z/Stencil	
  info	
  

	
  

	
  

Rasterizer	
  2

	
  

Unroller

Rasterizer	
  1

Hi-­‐Z/Stencil

Unrolling,	
  
Masking

	
  

Pixel	
  Shader

	
  

	
  

Reordering

Depth/Stencil

	
  

	
  

	
  

	
  

Blending

Not	
  shown:	
  Any	
  shader	
  stage	
  can	
  read/write	
  to	
  memory,	
  
including	
  atomics,	
  filtering*,	
  decompression,	
  and	
  sRGB	
  
conversion	
  

	
  

Collects	
  Quads

Conversion

6	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

Early-­‐Z/Stencil

	
  
WHAT’S	
  THE	
  POINT?	
  
!  The	
  Graphics	
  pipeline	
  has	
  a	
  lot	
  more	
  parts	
  
‒  Reorganizes	
  threads	
  
‒  Tracks	
  dependencies	
  
‒  Reorders	
  
‒  Extra	
  fixed-function	
  units	
  

!  Are	
  they	
  usable?	
  

7	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  
GRAPHICS	
  IN	
  THE	
  NINETIES	
  	
  

	
  

Input	
  Assembler

	
  

Transform	
  and	
  Lighting

	
  

Rasterizer

	
  

Texturing	
  and	
  Fog

	
  

Output	
  Merger

8	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  
VORONOI	
  DIAGRAMS	
  
GPGPU	
  WITHOUT	
  SHADERS	
  

!  Color	
  according	
  to	
  closest	
  
‒  Point	
  
‒  Line	
  

!  Could	
  be	
  weighted	
  
!  Useful	
  for	
  	
  
‒  Collision	
  Detection	
  
‒  Surface	
  Reconstruction	
  
‒  Robot	
  Motion	
  Planning	
  
‒  Non-Photorealistic	
  Rendering	
  
‒  Surface	
  Simplification	
  
‒  Mesh	
  Generation	
  

9	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  
VORONOI	
  DIAGRAMS	
  IN	
  THE	
  NINETIES	
  

Simply	
  rasterize	
  the	
  
cones	
  using	
  graphics	
  
hardware	
  

Haeberli90,	
  Woo97	
  
10	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

2-part	
  discrete	
  Voronoi	
  
diagram	
  representation	
  
Color	
  Buffer	
  

Site	
  IDs	
  

Depth	
  Buffer	
  

Distance	
  
OPENGL	
  1	
  SIMD	
  MACHINE	
  
PEERCY	
  ET	
  AL.,	
  SIGGRAPH	
  2000	
  

SIMD	
  Concept	
  

OpenGL	
  1	
  SIMD	
  

Instruction	
  

OpenGL	
  call	
  (CPU)	
  

SIMD	
  Lane	
  

Pixel	
  

SIMD	
  Lane	
  Input	
  Data	
  

Texel	
  

SIMD	
  Lane	
  Output	
  Data	
  

Fragment	
  

ALU	
  

Blend	
  Operation	
  

Conditionals	
  

Alpha	
  and	
  Stencil	
  Tests	
  

11	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

float y;
float4 contrived_example()
{
float x = f(u,v)
if( x*y > 0)
{
x = x + g(u,v)
}
return x*h(u,v);
}
USING	
  EARLY-­‐Z	
  OR	
  STENCIL	
  

Texture-­‐space	
  blur	
  

With	
  back-­‐face	
  culling	
  

Applications	
  of	
  Explicit	
  Early-Z	
  Culling,	
  Real-Time	
  Shading	
  Course,	
  Siggraph	
  2004.	
  

12	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

Pressure	
  buffer	
  
used	
  for	
  sim	
  culling	
  
What’s	
  the	
  Point?	
  

The	
  graphics	
  pipeline	
  	
  
gives	
  you	
  access	
  to	
  more	
  

13	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  
Programming	
  
the	
  GPU	
  
SHADER	
  TYPES	
  
!  Compute	
  (4.3)	
  
!  Vertex	
  (2,	
  ES	
  2)	
  
!  Tessellation	
  Control	
  (4)	
  
!  Tessellation	
  Evaluation	
  (4)	
  	
  
!  Geometry	
  (3)	
  
!  Fragment	
  (2,	
  ES	
  2)	
  

OpenGL	
  
15	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

! 
! 
! 
! 
! 
! 

Compute	
  (11)	
  
Vertex	
  (8)	
  
Hull	
  (11)	
  
Domain	
  (11)	
  
Geometry	
  (10)	
  
Pixel	
  (9)	
  

D3D	
  
15	
  
BASIC	
  GLSL	
  VERTEX	
  SHADER	
  
#version 430
in vec3 Position;
in vec2 UV;
out PosUV //Not available in GLES
{
vec3 vPositionWS;
vec2 vUV;
} vs_output;
uniform mat4x4 mMVP;
uniform mat4x4 mM;
void main(void)
{
gl_Position = mMVP * vec4(Position, 1.0);
vs_output.vPositionWS = mM * vec4(Position, 1.0);
vs_output.vUV = UV;
}
16	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

16	
  
BASIC	
  GLSL	
  PIXEL	
  SHADER	
  
in fsInput //Not available in GLES
{
vec3 vPositionWS;
vec2 vUV;
} fs_input;
uniform sampler2D sDiffuse;
out vec4 color_out;
void main(void)
{
color_out = texture( sDiffuse, fs_input.vUV );
}
17	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

17	
  
BASIC	
  HLSL	
  VERTEX	
  SHADER	
  
struct PosUV //Not available in GLES
{
float4 vPositionSS : SV_POSITION;
float3 vPositionWS : POSITION;
float2 vUV : TEXCOORD0;
};
float4x4 mMVP;
float4x4 mM;
PosUV main(
float3 Position : POSITION,
float2 UV: TEXCOORD0)
{
PosUV vs_output;
vs_output.vPositionSS = mMVP * float4(Position, 1.0);
vs_output.vPositionWS = mM * float4(Position, 1.0);
vs_output.vUV = UV;
return vs_output;
}

18	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

18	
  
BASIC	
  HLSL	
  PIXEL	
  SHADER	
  
struct fsInput
{
float3 vPositionWS : POSITION;
float2 vUV : TEXCOORD0;
};
sampler sWrapTriLin;
texture2D <float4> tDiffuse;
float4 main(fsInput i) : SV_TARGET
{
return tDiffuse.Sample(sWrapTriLin, i.vUV);
}
19	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

19	
  
BASIC	
  GEOMETRY	
  SHADER	
  
layout (triangles) in;
layout (triangle_strip, max_vertices = 3) out;
void main(void)
{
for(int i=0; i < gl_in.length(); i++)
{
gl_Position = gl_in[i].gl_Position;
EmitVertex();
}
EndPrimitive();
}
20	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

20	
  
TESSELLATION	
  
Tessellation	
  Control	
  

Hull	
  Shader	
  

Patch	
  Constant	
  Func	
  

Tess	
  factors	
  
Tess	
  factors	
  
Tessellator	
  

Tessellator	
  

Topology	
  

Topology	
  
Tessellation	
  
Evaluation	
  

OpenGL	
  4.0	
  
21	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

Domain	
  
Shader	
  

D3D11	
  
21	
  
TESSELLATION	
  
// Tessellation Control
layout (vertices = 4) out;
void TCS(void)
{
if (gl_InvocationID == 0)
{
gl_TessLevelInner[0] = 2.0;
…

// Hull Shader
[outputcontrolpoints(4)]
[patchconstantfunc("ConstantsHS")]
[domain("quad")]
[partitioning("integer")]
[outputtopology("triangle_cw")]

// Tessellation Evaluation
layout (quads, cw, equal_spacing) in
void TES(void)
{
…

HS_OUTPUT HullShader(…)
// Domain Shader
DS_OUTPUT DomainShader(…)

OpenGL	
  4.0	
  
22	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

D3D11	
  
22	
  
TESSELLATION	
  CONTROL	
  
out patch float tessFactor;
void main(void)
{
if (gl_InvocationID == 0)

Tessellation	
  rate	
  can	
  be	
  set	
  by	
  any	
  
instance	
  

{
gl_TessLevelInner[0] = 2.0;
…
tessFactor = 2.0;
}

Values	
  can	
  be	
  
communicated	
  across	
  
threads	
  

barrier();
DoSomeWork(tessFactor, gl_InvocationID);

23	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

23	
  
COMPUTE	
  SHADERS	
  
Thread Group

Thread

Thread

group size y

Thread

global size y

global size x

Thread

group size x

!  Groups	
  can	
  share	
  local	
  memory	
  
!  Threads	
  can	
  be	
  synced	
  at	
  a	
  group	
  level	
  

24	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

24	
  
OPENGL	
  COMPUTE	
  
buffer BlockName { int linearOutput[] };
shared int var;
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1)
void ContrivedSample()
{
const uvec3 localIdx = gl_LocalInvocationID;
const uvec3 globalIdx = gl_GlobalInvocationID;
const uvec3 groupIdx = gl_WorkGroupID;
if(localIdx.x == 0)
var =

groupIdx.x;

barrier();
linearOutput[globalIdx.x] = var;
}
25	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

25	
  
DIRECT	
  COMPUTE	
  
RWStructuredBuffer<int> linearOutput;
groupshared int var;
[numthreads(64, 1, 1)]
void ContrivedSample(
uint3 globalIdx : SV_DispatchThreadID,
uint3 localIdx : SV_GroupThreadID,
uint3 groupIdx : SV_GroupID )
{
if(localIdx.x == 0)
var = groupIdx.x;
GroupMemoryBarrierWithGroupSync();
linearOutput[globalIdx.x] = var;
}

26	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

26	
  
PROGRAMMING	
  THE	
  GPU	
  
SYNCHRONIZATION	
  
MEMORY	
  COHERENCE-­‐	
  GL	
  /	
  DX	
  
Dispatch	
  

CS	
  

28	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

Mem	
  

CS	
  

28	
  
MEMORY	
  COHERENCE-­‐	
  GL/DX	
  11.1	
  
Draw	
  

VS	
  

Mem	
  

GS	
  

VS	
  
GS	
  

FS	
  
FS	
  

RT	
  

29	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

29	
  
MEMORY	
  COHERENCE-­‐	
  GL	
  /	
  DX	
  11.1	
  

Draw	
  
VS	
  

Mem	
  

GS	
  

FS	
  
RT	
  

30	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

30	
  
Feeding	
  the	
  
GPU	
  
DRIVER	
  STACKS	
  (WINDOWS)	
  

	
  OpenGL	
  App	
  

DirectX	
  App	
  

OpenGL32.dll	
  

D3D11.dll	
  

D3D	
  UMD	
  

OpenGL	
  ICD	
  

DXGI	
  
KMD	
  

32	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

32	
  
DRIVER	
  STACKS	
  (LINUX)	
  

App	
  
libGL	
  

Gallium3D	
  
State	
  tracker	
  
DRI	
  

Or	
  

Hardware	
  layer	
  
Gallium3D	
  
WinSys	
  

libDRM-­‐radeon	
  

drm	
  

33	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

33	
  
FEEDING	
  THE	
  GPU	
  
GPU-­‐CPU	
  SYNCHRONIZATION	
  
DRIVER	
  COMMAND	
  QUEUE	
  
Application	
  
Dr	
  
5	
  

Da	
  
5	
  

Da	
  1	
  

Dr	
  1	
  

Da	
  	
  
6	
  

Da	
  2	
  

Dr	
  
6	
  

Dr	
  2	
  

Da	
  3	
  

Dr	
  3	
  

Da	
  4	
  

Dr	
  4	
  

Da	
  5	
  

Dr	
  5	
  

Da	
  6	
  

Dr	
  6	
  

Driver/GPU	
  
Time	
  
Reorder	
  possible?	
  

35	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

35	
  
CPU/GPU	
  MEMORY	
  SYNCHRONIZATION	
  
BY	
  DRIVER	
  

App	
  
Memory	
  

Driver	
  
Copy	
  

App	
  
Memory	
  

Driver	
  
Copy	
  

Hints	
  

36	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

GPU	
  
Read	
  
Driver	
  
Copy	
  

GPU	
  
Read	
  

Stream,	
  Static,	
  Dynamic	
  
Draw,	
  Read,	
  Copy	
  
CPU/GPU	
  MEMORY	
  SYNCHRONIZATION	
  
MANUAL	
  

App	
  
Memory	
  

Da	
  1	
  

Dr	
  1	
  

Driver	
  
Copy	
  

App	
  Copy	
  

Da	
  2	
  

Dr	
  2	
  

Fence	
  
37	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

Da	
  3	
  

Dr	
  3	
  

Da	
  4	
  

Dr	
  4	
  

Da	
  5	
  

GPU	
  
Read	
  

Dr	
  5	
  

Da	
  6	
  

Dr	
  6	
  
FEEDING	
  THE	
  GPU	
  
DATA	
  
LEGACY	
  OPENGL	
  OBJECT	
  MODEL	
  
!  glGenBuffers,	
  glGenTextures,	
  glGenSamplers,	
  …	
  
‒  Creates	
  name	
  /	
  handle	
  

!  glBindBuffer,	
  glBindTexture,	
  	
  
‒  Sets	
  as	
  current	
  

!  glBufferData,	
  glTexSubImage,	
  glMapBuffer	
  
‒  Supplies	
  data	
  

39	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

39	
  
BUFFER	
  BINDING	
  AND	
  CREATION	
  

glBindBuffer(target,name)	
  

Target	
  

binding	
  

BufferObject	
  

State,	
  Usage	
  

BufferData	
  

desc.BindFlags	
  =	
  <Target>	
  
pDevice-­‐>CreateBuffer(desc,…)	
  

40	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

40	
  
SETTING	
  DATA	
  (SIMPLEST	
  OPTION)	
  

glBufferData	
  
(target,	
  size,	
  pData,	
  usage)	
  

data	
  

Target	
  

binding	
  

BufferObject	
  

desc.Usage	
  =	
  <Usage>	
  
desc.CPUAccessFlags	
  =	
  <RWUsage>	
  
pDevice-­‐>CreateBuffer(desc,pData,)	
  
41	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

41	
  
BUFFER	
  TARGETS	
  
GL	
  Name	
  

Typical	
  Purpose	
  	
  

DX	
  Equivalent	
  

ARRAY	
  

Vertices	
  

VERTEX	
  

ELEMENT_ARRAY	
  

Indices	
  

INDEX	
  

UNIFORM	
  

Read-­‐only	
  vars	
  

CONSTANT	
  

TEXTURE_BUFFER	
  

Buffer-­‐as-­‐texture	
  

CONSTANT	
  (tbuffer)	
  

SHADER_STORAGE	
  

Read/write	
  

SHADER_RESOURCE	
  

TRANSFORM_FEEDBACK	
  

Stream	
  out	
  

Stream	
  out	
  

DRAW_INDIRECT	
  

indirect	
  draw	
  

DRAWINDIRECT	
  

ATOMIC_COUNTER	
  

Global	
  counter	
  var	
  

UAV_FLAG_COUNTER	
  

COPY_READ,	
  _WRITE	
  

Copying	
  (optional)	
  

Staging?	
  

PIXEL_PACK,	
  _UNPACK	
  

GPU	
  <-­‐>	
  CPU	
  

Staging?	
  

42	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

42	
  
DIRECTX	
  OBJECTS	
  AND	
  VIEWS	
  
!  Resource	
  (base	
  class)	
  
‒  Usage:	
  default,	
  immutable,	
  dynamic,	
  staging	
  
‒  Bind	
  flags:	
  vertex,	
  index,	
  shader	
  resource,	
  …	
  

!  Buffer	
  
!  Texture2D,	
  …	
  
!  DepthStencilView	
  
!  RenderTargetView	
  
!  ShaderResourceView	
  
!  UnorderedAccessView	
  

43	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

43	
  
OBJECT	
  AND	
  VIEW	
  EXAMPLE	
  
D3D11_BUFFER_DESC desc;
desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
…
pDevice->CreateBuffer(&desc, data, &pBuffer);
D3D11_SHADER_RESOURCE_VIEW_DESC srvDesc;
srvDesc.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
…
pDevice->CreateShaderResourceView(pBuffer, &srvDesc, &pView);
//at draw time
pContext->VSSetShaderResources(0, 1, pView);
44	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

44	
  
DATA	
  TYPES	
  
Image	
  

45	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

Linear	
  
IMMUTABLE	
  TEXTURES	
  (4.2,	
  GLES	
  3)	
  
glGenTextures(1, &texObjName);
glBindTexture(GL_TEXTURE_2D_ARRAY,
texObjName);
glTexStorage3D(GL_TEXTURE_2D_ARRAY, level, internalformat,
width, height, depth);
glTexSubImage3D(GL_TEXTURE_2D_ARRAY,
0,0,0, width, height, depth,
format, type, pData);
	
  

CreateTexture2D( desc, srcDataLayout, pData);

46	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

46	
  
FEEDING	
  THE	
  GPU	
  
PROGRAMS	
  
SHADER	
  MANAGEMENT	
  -­‐	
  OPENGL	
  
Program	
  Object	
  
GLuint shader = glCreateShader(GL_VERTEX_SHADER);

Vertex	
  Shader	
  

glShaderSource(…);
glCompileShader();

Pixel	
  Shader	
  

GLuint program = glCreateProgram();
glAttachShader(program, shader);
glLinkProgram(program);
glUseProgram(program);

	
  

	
  
	
  

48	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

48	
  
BASIC	
  GLSL	
  PIXEL	
  SHADER	
  
in fsInput //Not available in GLES
{
vec3 vPositionWS;
vec2 vUV;
} fs_input;
uniform sampler2D sDiffuse;
out vec4 color_out;
void main(void)
{
color_out = texture( sDiffuse, fs_input.vUV );
}
49	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

49	
  
BASIC	
  GLSL	
  VERTEX	
  SHADER	
  
#version 430
in vec3 Position;
in vec2 UV;
out PosUV //Not available in GLES
{
vec3 vPositionWS;
vec2 vUV;
} vs_output;
uniform mat4x4 mMVP;
uniform mat4x4 mM;
void main(void)
{
gl_Position = mMVP * vec4(Position, 1.0);
vs_output.vPositionWS = mM * vec4(Position, 1.0);
vs_output.vUV = UV;
}
50	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

50	
  
SHADER	
  MANAGEMENT	
  -­‐	
  DX	
  
D3DCompile(source,..,vs_5_0,..,&pByteCode)
pShader = CreateVertexShader(pByteCode);
VSSetShader(pShader,0,0);

!  No	
  program	
  /	
  link	
  concept	
  in	
  API	
  

51	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

51	
  
PROGRAM	
  BINARIES	
  
OpenGL	
  
glGetProgramBinary(program,…,format,pBinaryOut);

DirectX	
  
D3DCompile(source,..,vs_5_0,..,&pByteCode)

	
  
!  Program	
  level	
  

!  Shader	
  level	
  

!  In	
  theory:	
  format	
  choices	
  

!  Portable	
  byte	
  code	
  

!  In	
  practice:	
  somewhat	
  final,	
  non-portable	
  

52	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

52	
  
DRAW	
  CALLS	
  
OpenGL	
  

D3D	
  

glDrawArrays	
  

Draw	
  

glDrawArraysInstanced	
  

DrawInstanced(…,0)	
  

glDrawArraysInstancedBaseInstance	
  

DrawInstanced	
  

glDrawArraysIndirect	
  

DrawInstancedIndirect	
  

glMultiDrawArrays	
  

for(int	
  i=0;	
  i<n;	
  ++i)	
  
	
  	
  	
  Draw(count[i],	
  start[i]);	
  

glMultiDrawArraysIndirect	
  

for(int	
  i=0;	
  i<n;	
  ++i)	
  
	
  	
  	
  DrawInstancedIndirect(…)	
  

glDrawElements	
  

DrawIndexed	
  

…And	
  so	
  forth	
  

53	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

53	
  
COMPUTE	
  SHADERS	
  
glDispatchCompute(nGroupsX,nGroupsY,nGroupsZ)
	
  

Dispatch(nGroupsX,nGroupsY,nGroupsZ
)

	
  

glDispatchComputeIndirect(offset)

	
  

DispatchIndirect(pResource,offset)

	
  
OpenGL	
  4.3	
  

54	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

D3D11	
  

54	
  
Wrap	
  up	
  
IMAGE-­‐BASED	
  MODELING	
  

56	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  
GENERATING	
  THE	
  MODEL	
  
Render:	
  projection,	
  
rasterization,	
  
texturing,	
  depth	
  
buffering,	
  …	
  

57	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  
TressFX	
  
HAIR	
  

!  AMD	
  technology	
  for	
  high-­‐quality	
  hair	
  rendering	
  
!  Thousands	
  of	
  hair	
  strands	
  individually	
  simulated	
  and	
  
rendered	
  on	
  the	
  GPU	
  
!  DirectCompute	
  physics	
  simulation	
  
!  Shader	
  Model	
  5.0	
  pixel	
  shader	
  using	
  compute	
  capabilities	
  for	
  
rendering	
  

58	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  
NOT	
  EXPOSED	
  IN	
  GRAPHICS	
  APIS	
  (YET)	
  
!  Local	
  shared	
  memory	
  restricted	
  to	
  
‒  Compute	
  	
  
‒  Tessellation	
  Control,	
  in	
  a	
  limited	
  sense	
  

!  Some	
  OpenCL	
  extensions	
  (e.g.,	
  64	
  bit	
  atomics)	
  
!  Numerical	
  compliance	
  
!  Some	
  OpenCL	
  1.2	
  additions	
  
!  OpenCL	
  2.0	
  additions	
  

59	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  
SUMMARY	
  

The	
  graphics	
  pipeline	
  	
  
gives	
  you	
  access	
  to	
  different	
  hardware	
  

There	
  are	
  additional	
  synchronization	
  
issues	
  and	
  opportunities	
  

Mix	
  and	
  match	
  for	
  the	
  best	
  of	
  both	
  
compute	
  and	
  graphics	
  
60	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  
DISCLAIMER	
  &	
  ATTRIBUTION	
  
The	
  information	
  presented	
  in	
  this	
  document	
  is	
  for	
  informational	
  purposes	
  only	
  and	
  may	
  contain	
  technical	
  inaccuracies,	
  omissions	
  and	
  typographical	
  errors.	
  
	
  
The	
  information	
  contained	
  herein	
  is	
  subject	
  to	
  change	
  and	
  may	
  be	
  rendered	
  inaccurate	
  for	
  many	
  reasons,	
  including	
  but	
  not	
  limited	
  to	
  product	
  and	
  roadmap	
  
changes,	
  component	
  and	
  motherboard	
  version	
  changes,	
  new	
  model	
  and/or	
  product	
  releases,	
  product	
  differences	
  between	
  differing	
  manufacturers,	
  software	
  
changes,	
  BIOS	
  flashes,	
  firmware	
  upgrades,	
  or	
  the	
  like.	
  AMD	
  assumes	
  no	
  obligation	
  to	
  update	
  or	
  otherwise	
  correct	
  or	
  revise	
  this	
  information.	
  However,	
  AMD	
  
reserves	
  the	
  right	
  to	
  revise	
  this	
  information	
  and	
  to	
  make	
  changes	
  from	
  time	
  to	
  time	
  to	
  the	
  content	
  hereof	
  without	
  obligation	
  of	
  AMD	
  to	
  notify	
  any	
  person	
  of	
  
such	
  revisions	
  or	
  changes.	
  
	
  
AMD	
  MAKES	
  NO	
  REPRESENTATIONS	
  OR	
  WARRANTIES	
  WITH	
  RESPECT	
  TO	
  THE	
  CONTENTS	
  HEREOF	
  AND	
  ASSUMES	
  NO	
  RESPONSIBILITY	
  FOR	
  ANY	
  
INACCURACIES,	
  ERRORS	
  OR	
  OMISSIONS	
  THAT	
  MAY	
  APPEAR	
  IN	
  THIS	
  INFORMATION.	
  
	
  
AMD	
  SPECIFICALLY	
  DISCLAIMS	
  ANY	
  IMPLIED	
  WARRANTIES	
  OF	
  MERCHANTABILITY	
  OR	
  FITNESS	
  FOR	
  ANY	
  PARTICULAR	
  PURPOSE.	
  IN	
  NO	
  EVENT	
  WILL	
  AMD	
  BE	
  
LIABLE	
  TO	
  ANY	
  PERSON	
  FOR	
  ANY	
  DIRECT,	
  INDIRECT,	
  SPECIAL	
  OR	
  OTHER	
  CONSEQUENTIAL	
  DAMAGES	
  ARISING	
  FROM	
  THE	
  USE	
  OF	
  ANY	
  INFORMATION	
  
CONTAINED	
  HEREIN,	
  EVEN	
  IF	
  AMD	
  IS	
  EXPRESSLY	
  ADVISED	
  OF	
  THE	
  POSSIBILITY	
  OF	
  SUCH	
  DAMAGES.	
  
	
  
ATTRIBUTION	
  
©	
  2013	
  Advanced	
  Micro	
  Devices,	
  Inc.	
  All	
  rights	
  reserved.	
  AMD,	
  the	
  AMD	
  Arrow	
  logo	
  and	
  combinations	
  thereof	
  are	
  trademarks	
  of	
  Advanced	
  Micro	
  Devices,	
  
Inc.	
  in	
  the	
  United	
  States	
  and/or	
  other	
  jurisdictions.	
  	
  SPEC	
  	
  is	
  a	
  registered	
  trademark	
  of	
  the	
  Standard	
  Performance	
  Evaluation	
  Corporation	
  (SPEC).	
  Other	
  
names	
  are	
  for	
  informational	
  purposes	
  only	
  and	
  may	
  be	
  trademarks	
  of	
  their	
  respective	
  owners.	
  

61	
   |	
  	
  	
  PRESENTATION	
  TITLE	
  	
  	
  |	
  	
  	
  DECEMBER	
  4,	
  2013	
  	
  	
  |	
  	
  	
  CONFIDENTIAL	
  

PG-4034, Using OpenGL and DirectX for Heterogeneous Compute, by Karl Hillesland

  • 1.
    USING  OPENGL  AND  DIRECTX  FOR   HETEROGENEOUS  COMPUTE   KARL  HILLESLAND  
  • 2.
    AGENDA   THE  GRAPHICS  PIPELINE   PROGRAMMING  THE  GPU   FEEDING  THE  GPU   2   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL  
  • 3.
  • 4.
    GRAPHICS  PIPELINE   SHADER  CENTRIC   OpenGL   DirectX     Vertex  Shader     Vertex  Shader   TessellaQon  Control  Shader   TessellaQon  EvaluaQon  Shader   Geometry  Shader   Rasterizer   Fragment  Shader   Per-­‐Fragment  OperaQons   TessellaQon  PrimiQve  Generator 4   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL     Input  Assembler Vertex  Puller   Tessellator   Hull  Shader   Domain  Shader   Geometry  Shader   Rasterizer   Pixel  Shader   Output  Merger
  • 5.
    GRAPHICS  PIPELINE   MORE  DETAILS   indices,   verQces     Input  Assembler Thread  per  DS  vertex  (n3)   Barycentric   Domain  Shader   DS  vertex   Collects  prims   vertex       Tessellator Patch  verts  n2       Prim  verts   Geometry  Shader 5   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   Prims   Collects  Patches Patch  verts  n1   Tess   factors   Collects  patches   vertex   Vertex  Shader Thread   per  vertex     Patch  Constant   Hull  Shader Control  point     PrimiQve  Assembler Thread  per  output   control  point  n2     Next  Slide
  • 6.
    prim   Hi-­‐Z/Stencil  info       Rasterizer  2   Unroller Rasterizer  1 Hi-­‐Z/Stencil Unrolling,   Masking   Pixel  Shader     Reordering Depth/Stencil         Blending Not  shown:  Any  shader  stage  can  read/write  to  memory,   including  atomics,  filtering*,  decompression,  and  sRGB   conversion     Collects  Quads Conversion 6   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   Early-­‐Z/Stencil  
  • 7.
    WHAT’S  THE  POINT?   !  The  Graphics  pipeline  has  a  lot  more  parts   ‒  Reorganizes  threads   ‒  Tracks  dependencies   ‒  Reorders   ‒  Extra  fixed-function  units   !  Are  they  usable?   7   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL  
  • 8.
    GRAPHICS  IN  THE  NINETIES       Input  Assembler   Transform  and  LighQng   Rasterizer   Texturing  and  Fog   Output  Merger 8   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL  
  • 9.
    VORONOI  DIAGRAMS   GPGPU  WITHOUT  SHADERS   !  Color  according  to  closest   ‒  Point   ‒  Line   !  Could  be  weighted   !  Useful  for     ‒  Collision  Detection   ‒  Surface  Reconstruction   ‒  Robot  Motion  Planning   ‒  Non-Photorealistic  Rendering   ‒  Surface  Simplification   ‒  Mesh  Generation   9   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL  
  • 10.
    VORONOI  DIAGRAMS  IN  THE  NINETIES   Simply  rasterize  the   cones  using  graphics   hardware   Haeberli90,  Woo97   10   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   2-­‐part  discrete  Voronoi   diagram  representaQon   Color  Buffer   Site  IDs   Depth  Buffer   Distance  
  • 11.
    OPENGL  1  SIMD  MACHINE   PEERCY,  ET.  AL.  SIGGRAPH  2000   SIMD  Concept   OpenGL  1  SIMD   InstrucQon   OpenGL  call  (CPU)   SIMD  Lane   Pixel   SIMD  Lane  Input  Data   Texel   SIMD  Lane  Output  Data   Fragment   ALU   Blend  OperaQon   CondiQonals   Alpha  and  Stencil  Tests   11   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   float y; float4 contrived_example() { float x = f(u,v) if( x*y > 0) { x = x + g(u,v) } return x*h(u,v); }
  • 12.
    USING  EARLY-­‐Z  OR  STENCIL   Texture-­‐space  blur   With  back-­‐face  culling   ApplicaQons  of  Explicit  Early-­‐Z  Culling,  Real-­‐Time  Shading  Course,  Siggraph  2004.   12   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   Pressure  buffer   used  for  sim  culling  
  • 13.
    What’s  the  Point?   The  graphics  pipeline     gives  you  access  to  more   13   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL  
  • 14.
  • 15.
    SHADER  TYPES   ! Compute  (4.3)   !  Vertex  (2,  ES  2)   !  TessellaQon  Control  (4)   !  TessellaQon  EvaluaQon  (4)     !  Geometry  (3)   !  Fragment  (2,  ES  2)   OpenGL   15   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   !  !  !  !  !  !  Compute  (11)   Vertex  (8)   Hull  (11)   Domain  (11)   Geometry  (10)   Pixel  (9)   D3D   15  
  • 16.
    BASIC  GLSL  VERTEX  SHADER   #version 430 in vec3 Position; in vec2 UV; out PosUV //Not available in GLES { vec3 vPositionWS; vec2 vUV; } vs_output; uniform mat4x4 mMVP; uniform mat4x4 mM; void main(void) { gl_Position = mMVP * vec4(Position, 1.0); vs_output.vPositionWS = mM * vec4(Position, 1.0); vs_output.vUV = UV; } 16   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   16  
  • 17.
    BASIC  GLSL  PIXEL  SHADER   in fsInput //Not available in GLES { vec3 vPositionWS; vec2 vUV; } fs_input; uniform sampler2D sDiffuse; out vec4 color_out; void main(void) { color_out = texture( sDiffuse, fs_input.vUV ); } 17   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   17  
  • 18.
    BASIC  HLSL  VERTEX  SHADER   struct PosUV //Not available in GLES { float4 vPositionSS : SV_POSITION; float3 vPositionWS : POSITION; float2 vUV : TEXCOORD0; }; float4x4 mMVP; float4x4 mM; PosUV main( float3 Position : POSITION, float2 UV: TEXCOORD0) { PosUV vs_output; output.vPositionSS = mMVP * float4(Position, 1.0); vs_output.vPositionWS = mMP * float4(Position, 1.0); vs_output.vUV = UV; return vs_output; } 18   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   18  
  • 19.
    BASIC  HLSL  PIXEL  SHADER   struct fsInput { float3 vPositionWS : POSITION; float2 vUV : TEXCOORD0; }; sampler sWrapTriLin; texture2D <float4> tDiffuse; float4 main(fsInput i) : SV_TARGET { return tDiffuse.Sample(sWrapTriLin, i.vUV); } 19   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   19  
  • 20.
    BASIC  GEOMETRY  SHADER   layout (triangles) in; layout (triangle_strip, max_vertices = 3) out; void main(void) { for(int i=0; i < gl_in.length(); i++) { gl_Position = gl_in[i].gl_Position; EmitVertex(); } EndPrimitive(); } 20   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   20  
  • 21.
    TESSELLATION   TessellaQon  Control   Hull  Shader   Patch  Constant  Func   Tess  factors   Tess  factors   Tessellator   Tessellator   Topology   Topology   TessellaQon   EvaluaQon   OpenGL  4.0   21   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   Domain   Shader   D3D11   21  
  • 22.
    TESSELLATION   // TessellationControl layout (vertices = 4) out; void TCS(void) { if (gl_InvocationID == 0) { gl_TessLevelInner[0] = 2.0; … // Hull Shader [outputcontrolpoints(4)] [patchconstantfunc("ConstantsHS")] [domain("quad")] [partitioning(“integer")] [outputtopology("triangle_cw")] // Tessellation Evaluation layout (quads, cw, equal_spacing) in void TES(void) { … HS_OUTPUT HullShader(…) // Domain Shader DS_OUTPUT DomainShader(…) OpenGL  4.0   22   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   D3D11   22  
  • 23.
    TESSELLATION  CONTROL   patch out float tessFactor; void main(void) { if (gl_InvocationID == 0) Tessellation  rate  can  be  set  by  any   instance   { gl_TessLevelInner[0] = 2.0; … tessFactor = 2.0; } Values  can  be   communicated  across   threads   barrier(); DoSomeWork(tessFactor, gl_InvocationID); 23   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   23  
  • 24.
    COMPUTE  SHADERS   ThreadGroup Thread Thread group size y Thread global size y global size x Thread group size x !  Groups  can  share  local  memory   !  Threads  can  be  synced  at  a  group  level   24   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   24  
  • 25.
    OPENGL  COMPUTE   buffer BlockName { int linearOutput[]; }; shared int var; layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; void ContrivedSample() { const uvec3 localIdx = gl_LocalInvocationID; const uvec3 globalIdx = gl_GlobalInvocationID; const uvec3 groupIdx = gl_WorkGroupID; if(localIdx.x == 0) var = groupIdx.x; barrier(); linearOutput[globalIdx.x] = var; } 25   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   25  
  • 26.
    DIRECT  COMPUTE   RWStructuredBuffer<int>linearOutput; groupshared int var; [numthreads(64, 1, 1)] void ContrivedSample( uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID ) { if(localIdx.x == 0) var = groupIdx.x; GroupMemoryBarrierWithGroupSync(); linearOutput[globalIdx.x] = var; } 26   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   26  
  • 27.
    PROGRAMMING  THE  GPU   SYNCHRONIZATION  
  • 28.
    MEMORY  COHERENCE -  GL  /  DX   Dispatch   CS   28   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   Mem   CS   28  
  • 29.
    MEMORY  COHERENCE -  GL  /  DX  11.1   Draw   VS   Mem   GS   VS   GS   FS   FS   RT   29   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   29  
  • 30.
    MEMORY  COHERENCE -  GL  /  DX  11.1   Draw   VS   Mem   GS   FS   RT   30   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   30  
  • 31.
  • 32.
    DRIVER  STACKS  (WINDOWS)    OpenGL  App   DirectX  App   OpenGL32.dll   D3D11.dll   D3D  UMD   OpenGL  ICD   DXGI   KMD   32   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   32  
  • 33.
    DRIVER  STACKS  (LINUX)   App   libGL   Gallium3D   State  tracker   DRI   Or   Hardware  layer   Gallium3D   WinSys   libDRM-radeon   drm   33   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   33  
  • 34.
    FEEDING  THE  GPU   GPU-CPU  SYNCHRONIZATION  
  • 35.
    DRIVER  COMMAND  QUEUE   Application   Dr   5   Da   5   Da  1   Dr  1   Da     6   Da  2   Dr   6   Dr  2   Da  3   Dr  3   Da  4   Dr  4   Da  5   Dr  5   Da  6   Dr  6   Driver/GPU   Time   Reorder  possible?   35   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   35  
  • 36.
    CPU/GPU  MEMORY  SYNCHRONIZATION   BY  DRIVER   App   Memory   Driver   Copy   App   Memory   Driver   Copy   Hints   36   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   GPU   Read   Driver   Copy   GPU   Read   Stream,  Static,  Dynamic   Draw,  Read,  Copy  
  • 37.
    CPU/GPU  MEMORY  SYNCHRONIZATION   MANUAL   App   Memory   Da  1   Dr  1   Driver   Copy   App  Copy   Da  2   Dr  2   Fence   37   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   Da  3   Dr  3   Da  4   Dr  4   Da  5   GPU   Read   Dr  5   Da  6   Dr  6  
  • 38.
  • 39.
    LEGACY  OPENGL  OBJECT  MODEL   !  glGenBuffers,  glGenTextures,  glGenSamplers,  …   ‒  Creates  name  /  handle   !  glBindBuffer,  glBindTexture,     ‒  Sets  as  current   !  glBufferData,  glTexSubImage,  glMapBuffer   ‒  Supplies  data   39   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   39  
  • 40.
    BUFFER  BINDING  AND  CREATION   glBindBuffer(target,name)   Target   binding   BufferObject   State,  Usage   BufferData   desc.BindFlags  =  <Target>   pDevice->CreateBuffer(desc,…)   40   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   40  
  • 41.
    SETTING  DATA  (SIMPLEST  OPTION)   glBufferData   (target,  size,  pData,  usage)   data   Target   binding   BufferObject   desc.Usage  =  <Usage>   desc.CPUAccessFlags  =  <RWUsage>   pDevice->CreateBuffer(desc,pData,)   41   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   41  
  • 42.
    BUFFER  TARGETS   GL  Name   Typical  Purpose     DX  Equivalent   ARRAY   Vertices   VERTEX   ELEMENT_ARRAY   Indices   INDEX   UNIFORM   Read-only  vars   CONSTANT   TEXTURE_BUFFER   Buffer-as-texture   CONSTANT  (tbuffer)   SHADER_STORAGE   Read/write   SHADER_RESOURCE   TRANSFORM_FEEDBACK   Stream  out   Stream  out   DRAW_INDIRECT   indirect  draw   DRAWINDIRECT   ATOMIC_COUNTER   Global  counter  var   UAV_FLAG_COUNTER   COPY_READ,  _WRITE   Copying  (optional)   Staging?   PIXEL_PACK,  _UNPACK   GPU  <->  CPU   Staging?   42   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   42  
  • 43.
    DIRECTX  OBJECTS  AND  VIEWS   !  Resource  (base  class)   ‒  Usage:  default,  immutable,  dynamic,  staging   ‒  Bind  flags:  vertex,  index,  shader  resource,  …   !  Buffer   !  Texture2D,  …   !  DepthStencilView   !  RenderTargetView   !  ShaderResourceView   !  UnorderedAccessView   43   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   43  
  • 44.
    OBJECT  AND  VIEW  EXAMPLE   D3D11_BUFFER_DESC desc; desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; … pDevice->CreateBuffer(&desc, data, &pBuffer); D3D11_SHADER_RESOURCE_VIEW_DESC srvDesc; srvDesc.ViewDimension = D3D11_SRV_DIMENSION_BUFFER; … pDevice->CreateShaderResourceView(pBuffer, &srvDesc, &pView); //at draw time pContext->VSSetShaderResources(0, 1, &pView); 44   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   44  
  • 45.
    DATA  TYPES   Image   45   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   Linear  
  • 46.
    IMMUTABLE  TEXTURES  (4.2,  GLES  3)   glGenTextures(1, &texObjName); glBindTexture(GL_TEXTURE_2D_ARRAY, texObjName); glTexStorage3D(GL_TEXTURE_2D_ARRAY, level, internalformat, width, height, depth); glTexSubImage3D(GL_TEXTURE_2D_ARRAY, 0,0,0, width, height, depth, format, type, pData);   CreateTexture2D( desc, srcDataLayout, pData); 46   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   46  
  • 47.
    FEEDING  THE  GPU   PROGRAMS  
  • 48.
    SHADER  MANAGEMENT  -­‐  OPENGL   Program  Object   GLuint shader = glCreateShader(GL_VERTEX_SHADER); Vertex  Shader   glShaderSource(…); glCompileShader(); Pixel  Shader   GLuint program = glCreateProgram(); glAttachShader(program, shader); glLinkProgram(program); glUseProgram(program);       48   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   48  
  • 49.
    BASIC  GLSL  PIXEL  SHADER   in fsInput //Not available in GLES { vec3 vPositionWS; vec2 vUV; } fs_input; uniform sampler2D sDiffuse; out vec4 color_out; void main(void) { color_out = texture( sDiffuse, fs_input.vUV ); } 49   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   49  
  • 50.
    BASIC  GLSL  VERTEX  SHADER   #version 430 in vec3 Position; in vec2 UV; out PosUV //Not available in GLES { vec3 vPositionWS; vec2 vUV; } vs_output; uniform mat4x4 mMVP; uniform mat4x4 mM; void main(void) { gl_Position = mMVP * vec4(Position, 1.0); vs_output.vPositionWS = mM * vec4(Position, 1.0); vs_output.vUV = UV; } 50   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   50  
  • 51.
    SHADER  MANAGEMENT  -­‐  DX   D3DCompile(source,..,vs_5_0,..,&pByteCode) pShader = CreateVertexShader(pByteCode); VSSetShader(pShader,0,0); !  No  program  /  link  concept  in  API   51   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   51  
  • 52.
    PROGRAM  BINARIES   OpenGL   glGetProgramBinary(program,…,format,pBinaryOut); DirectX   D3DCompile(source,..,vs_5_0,..,&pByteCode)   !  Program  level   !  Shader  level   !  In  theory:  format  choices   !  Portable  byte  code   !  In  practice:  somewhat  final,  non-portable   52   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   52  
  • 53.
    DRAW  CALLS   OpenGL   D3D   glDrawArrays   Draw   glDrawArraysInstanced   DrawInstanced(…,0)   glDrawArraysInstancedBaseInstance   DrawInstanced   glDrawArraysIndirect   DrawInstancedIndirect   glMultiDrawArrays   for(int  i=0;  i<n;  ++i)        Draw(count[i],  start[i]);   glMultiDrawArraysIndirect   for(int  i=0;  i<n;  ++i)        DrawInstancedIndirect(…)   glDrawElements   DrawIndexed   …And  so  forth   53   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   53  
  • 54.
    COMPUTE  SHADERS   glDispatchCompute(nGroupsX,nGroupsY,nGroupsZ)   Dispatch(nGroupsX,nGroupsY,nGroupsZ )   glDispatchComputeIndirect(offset)   DispatchIndirect(pResource,offset)   OpenGL  4.3   54   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL   D3D11   54  
  • 55.
  • 56.
    IMAGE-BASED  MODELING   56   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL  
  • 57.
    GENERATING  THE  MODEL   Render:  projection,   rasterization,   texturing,  depth   buffering,  …   57   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL  
  • 58.
    TressFX   HAIR   ! AMD  technology  for  high-quality  hair  rendering   !  Thousands  of  hair  strands  individually  simulated  and   rendered  on  the  GPU   !  DirectCompute  physics  simulation   !  Shader  Model  5.0  pixel  shader  using  compute  capabilities  for   rendering   58   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL  
  • 59.
    NOT  EXPOSED  IN  GRAPHICS  APIS  (YET)   !  Local  shared  memory  restricted  to   ‒  Compute     ‒  Tessellation  Control,  in  a  limited  sense   !  Some  OpenCL  extensions  (e.g.,  64  bit  atomics)   !  Numerical  compliance   !  Some  OpenCL  1.2  additions   !  OpenCL  2.0  additions   59   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL  
  • 60.
    SUMMARY   The  graphics  pipeline     gives  you  access  to  different  hardware   There  are  additional  synchronization   issues  and  opportunities   Mix  and  match  for  the  best  of  both   compute  and  graphics   60   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL  
  • 61.
    DISCLAIMER  &  ATTRIBUTION   The  information  presented  in  this  document  is  for  informational  purposes  only  and  may  contain  technical  inaccuracies,  omissions  and  typographical  errors.     The  information  contained  herein  is  subject  to  change  and  may  be  rendered  inaccurate  for  many  reasons,  including  but  not  limited  to  product  and  roadmap   changes,  component  and  motherboard  version  changes,  new  model  and/or  product  releases,  product  differences  between  differing  manufacturers,  software   changes,  BIOS  flashes,  firmware  upgrades,  or  the  like.  AMD  assumes  no  obligation  to  update  or  otherwise  correct  or  revise  this  information.  However,  AMD   reserves  the  right  to  revise  this  information  and  to  make  changes  from  time  to  time  to  the  content  hereof  without  obligation  of  AMD  to  notify  any  person  of   such  revisions  or  changes.     AMD  MAKES  NO  REPRESENTATIONS  OR  WARRANTIES  WITH  RESPECT  TO  THE  CONTENTS  HEREOF  AND  ASSUMES  NO  RESPONSIBILITY  FOR  ANY   INACCURACIES,  ERRORS  OR  OMISSIONS  THAT  MAY  APPEAR  IN  THIS  INFORMATION.     AMD  SPECIFICALLY  DISCLAIMS  ANY  IMPLIED  WARRANTIES  OF  MERCHANTABILITY  OR  FITNESS  FOR  ANY  PARTICULAR  PURPOSE.  IN  NO  EVENT  WILL  AMD  BE   LIABLE  TO  ANY  PERSON  FOR  ANY  DIRECT,  INDIRECT,  SPECIAL  OR  OTHER  CONSEQUENTIAL  DAMAGES  ARISING  FROM  THE  USE  OF  ANY  INFORMATION   CONTAINED  HEREIN,  EVEN  IF  AMD  IS  EXPRESSLY  ADVISED  OF  THE  POSSIBILITY  OF  SUCH  DAMAGES.     ATTRIBUTION   ©  2013  Advanced  Micro  Devices,  Inc.  All  rights  reserved.  AMD,  the  AMD  Arrow  logo  and  combinations  thereof  are  trademarks  of  Advanced  Micro  Devices,   Inc.  in  the  United  States  and/or  other  jurisdictions.    SPEC    is  a  registered  trademark  of  the  Standard  Performance  Evaluation  Corporation  (SPEC).  
Other   names  are  for  informational  purposes  only  and  may  be  trademarks  of  their  respective  owners.   61   |      PRESENTATION  TITLE      |      DECEMBER  4,  2013      |      CONFIDENTIAL