CODE VECTORIZATION
 for mobile devices

             by Dmitriy Vovk
Hardware
• Typical hardware found in modern mobile
  devices:
  –   ARMv7 instruction set
  –   Cortex A8 / Cortex A9 / custom cores (Krait, Swift)
  –   800 – 1500 MHz
  –   1-4 cores
  –   Thumb-2 instruction set
  –   VFPv3
  –   NEON, optional for Cortex A9 (e.g. Nvidia Tegra 2 has
      no NEON support)
NEON
• NEON is a general purpose SIMD engine
  designed by ARM for the ARM processor
  architecture
• 16 registers, 128 bits wide each. Supports
  operations on 8-, 16-, 32- and 64-bit integers
  and 32-bit float values
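• For illustration, a minimal intrinsics sketch (added here, not part of
  the original slides) that adds two 4-float vectors in a single Q-register
  operation:

  #include <arm_neon.h>

  // Adds four pairs of floats with one NEON instruction.
  void AddVec4(const float* a, const float* b, float* out)
  {
      float32x4_t va = vld1q_f32(a);      // load 4 floats into a Q register
      float32x4_t vb = vld1q_f32(b);
      float32x4_t vr = vaddq_f32(va, vb); // 4 additions at once
      vst1q_f32(out, vr);                 // store the 4 results
  }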
NEON
• NEON can be used for:
  – Software geometry instancing;
  – Skinning on ES 1.1;
  – As a general vertex processor;
  – Other typical applications for SIMD.
NEON
• Some unified shader architectures, like the
  popular Imagination Technologies USSE1
  (PowerVR SGX 530-545), are scalar, while NEON is
  vector by nature. Move your vertex processing
  from the GPU to the CPU to speed up calculations*
• ???????
• PROFIT!!!111

•   *NOTE: that doesn’t apply to USSE2 hardware
NEON
• The weakest side of mobile GPUs is fill rate.
  Fill rate is quickly killed by blending, and 2D games
  are heavy on it. The PowerVR USSE engine
  doesn’t care what it does – vertex or fragment
  processing. Moving your vertex processing to the
  CPU (NEON) will leave some room for
  fragment processing.
NEON
• There are 3 ways to use NEON vectorization in
  your code:
  1. Intrinsics
  2. Handwritten NEON assembly
  3. Autovectorization by the compiler:
     -mllvm -vectorize -mllvm -bb-vectorize-aligned-only
     compiler flags for LLVM; -ftree-vectorizer-verbose=4
     -mfpu=neon -funsafe-math-optimizations
     -ftree-vectorize for GCC (see the sketch below)
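• A minimal autovectorization sketch (an assumed example, not from the
  original slides): a contiguous loop with no cross-iteration dependencies
  that the GCC flags above can turn into NEON code. -funsafe-math-optimizations
  is needed for GCC to vectorize float math, since NEON is not fully
  IEEE 754 compliant:

  // gcc -O3 -mfpu=neon -ftree-vectorize -funsafe-math-optimizations
  //     -ftree-vectorizer-verbose=4 -c scale.c
  void ScaleArray(float* __restrict__ dst, const float* __restrict__ src,
                  float s, int count)
  {
      for (int i = 0; i < count; ++i)
          dst[i] = src[i] * s;   // independent iterations, unit stride
  }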
DEMO
Measurements
• Intrinsics:
Measurements
• Assembly:
Measurements
• Summary:
     Variant              Running time, ms   CPU usage, %
     Intrinsics           2764               19
     Assembly             3664               20
     FPU                  6209               25-28
     FPU autovectorized   5028               22-24


• Intrinsics got me a 25% speedup over assembly.
• Note that the speed of intrinsics code varies from
  compiler to compiler.
NEON
• Intrinsics advantages over assembly:
  – Higher level code;
  – No need to manage registers;
  – You can vectorize basic blocks and build a solution
    to every new problem from these blocks (see the
    sketch below). In contrast, with assembly you have
    to solve each new problem from scratch;
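• A small sketch of such a reusable building block (an added example with
  assumed names, not from the original slides): a 4-wide lerp built from the
  same multiply / multiply-accumulate intrinsics the matrix routines below use:

  #include <arm_neon.h>

  // Computes a + (b - a) * t on four lanes at once.
  static inline float32x4_t Lerp4(float32x4_t a, float32x4_t b, float t)
  {
      float32x4_t d = vsubq_f32(b, a);
      return vmlaq_n_f32(a, d, t);   // multiply-accumulate by a scalar
  }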
NEON
• Assembly advantages over intrinsics:
  – Code generated from intrinsics varies from compiler
    to compiler and can show a really big difference
    in speed. Assembly code will always be the same.
Code
void Update() {
  GLKMatrix4 modelviewMat = {
        1, 0, 0, 0,
        0, 1, 0, 0,
        0, 0, 1, 0,
        0, 0, 0, 1 };

  const float Y_DELTA = 420.0f / QUADS_COUNT;

  for (int i = 0; i < QUADS_COUNT * VERTS_PER_QUAD; i += VERTS_PER_QUAD) {
     // write the per-quad translation straight into the modelview matrix
     modelviewMat.m[12] = random() % 260;
     modelviewMat.m[13] = Y_DELTA;
#ifdef ASM
     // handwritten NEON routine: transforms all 4 vertices of the quad at once
     CalculateSpriteVertsWorldPos((float32x4x4_t*)proj.m, (float32x4x4_t*)modelviewMat.m,
                                  (float32x4_t*)&data[i + 0].pos, (float32x4_t*)&data[i + 1].pos,
                                  (float32x4_t*)&data[i + 2].pos, (float32x4_t*)&data[i + 3].pos);
#else
     // intrinsics path: build the modelview-projection matrix once,
     // then transform the 4 quad vertices with it
     float32x4x4_t modelviewProj;
     Matrix4ByMatrix4((float32x4x4_t*)proj.m, (float32x4x4_t*)modelviewMat.m, &modelviewProj);

     for (int j = 0; j < 4; ++j) {
        Matrix4ByVec4(&modelviewProj, (float32x4_t*)&squareVertices[j], (float32x4_t*)&data[i + j].pos);
     }
#endif
  }
  glBindBuffer(GL_ARRAY_BUFFER, vertexBuffer);
  glBufferData(GL_ARRAY_BUFFER, sizeof(data), data, GL_STREAM_DRAW);
}
Code
__attribute__((always_inline)) void Matrix4ByVec4(const float32x4x4_t* __restrict__ mat,
                                                  const float32x4_t* __restrict__ vec,
                                                  float32x4_t* __restrict__ result)
{
   // result = mat.col0 * vec.x + mat.col1 * vec.y + mat.col2 * vec.z + mat.col3 * vec.w
   (*result) = vmulq_n_f32((*mat).val[0], (*vec)[0]);
   (*result) = vmlaq_n_f32((*result), (*mat).val[1], (*vec)[1]);
   (*result) = vmlaq_n_f32((*result), (*mat).val[2], (*vec)[2]);
   (*result) = vmlaq_n_f32((*result), (*mat).val[3], (*vec)[3]);
}
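For reference, a scalar equivalent (added for clarity, not part of the
original slides) of what the intrinsics above compute – a column-major
4x4 matrix times a vec4, accumulating one matrix column per step:

  void Matrix4ByVec4_Scalar(const float mat[16], const float vec[4], float result[4])
  {
      for (int row = 0; row < 4; ++row)
          result[row] = mat[0 + row]  * vec[0]
                      + mat[4 + row]  * vec[1]
                      + mat[8 + row]  * vec[2]
                      + mat[12 + row] * vec[3];
  }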
Code
__attribute__((always_inline)) void Matrix4ByMatrix4(const float32x4x4_t* __restrict__ m1,
                                                     const float32x4x4_t* __restrict__ m2,
                                                     float32x4x4_t* __restrict__ r)
{
#ifdef INTRINSICS
   // column j of r = m1 * (column j of m2), built up one lane of m2 at a time
   (*r).val[0] = vmulq_n_f32((*m1).val[0], vgetq_lane_f32((*m2).val[0], 0));
   (*r).val[1] = vmulq_n_f32((*m1).val[0], vgetq_lane_f32((*m2).val[1], 0));
   (*r).val[2] = vmulq_n_f32((*m1).val[0], vgetq_lane_f32((*m2).val[2], 0));
   (*r).val[3] = vmulq_n_f32((*m1).val[0], vgetq_lane_f32((*m2).val[3], 0));

   (*r).val[0] = vmlaq_n_f32((*r).val[0], (*m1).val[1], vgetq_lane_f32((*m2).val[0], 1));
   (*r).val[1] = vmlaq_n_f32((*r).val[1], (*m1).val[1], vgetq_lane_f32((*m2).val[1], 1));
   (*r).val[2] = vmlaq_n_f32((*r).val[2], (*m1).val[1], vgetq_lane_f32((*m2).val[2], 1));
   (*r).val[3] = vmlaq_n_f32((*r).val[3], (*m1).val[1], vgetq_lane_f32((*m2).val[3], 1));

   (*r).val[0] = vmlaq_n_f32((*r).val[0], (*m1).val[2], vgetq_lane_f32((*m2).val[0], 2));
   (*r).val[1] = vmlaq_n_f32((*r).val[1], (*m1).val[2], vgetq_lane_f32((*m2).val[1], 2));
   (*r).val[2] = vmlaq_n_f32((*r).val[2], (*m1).val[2], vgetq_lane_f32((*m2).val[2], 2));
   (*r).val[3] = vmlaq_n_f32((*r).val[3], (*m1).val[2], vgetq_lane_f32((*m2).val[3], 2));

   (*r).val[0] = vmlaq_n_f32((*r).val[0], (*m1).val[3], vgetq_lane_f32((*m2).val[0], 3));
   (*r).val[1] = vmlaq_n_f32((*r).val[1], (*m1).val[3], vgetq_lane_f32((*m2).val[1], 3));
   (*r).val[2] = vmlaq_n_f32((*r).val[2], (*m1).val[3], vgetq_lane_f32((*m2).val[2], 3));
   (*r).val[3] = vmlaq_n_f32((*r).val[3], (*m1).val[3], vgetq_lane_f32((*m2).val[3], 3));
#endif // INTRINSICS (the non-intrinsics path is not shown on the slide)
}
Code
__asm__ volatile
(
  // load modelView (%6) into q0-q3 and proj (%0) into q8-q11
  "vldmia %6, { q0-q3 }\n\t"
  "vldmia %0, { q8-q11 }\n\t"

  // q12-q15 = proj * modelView (one output column per register)
  "vmul.f32 q12, q8, d0[0]\n\t"
  "vmul.f32 q13, q8, d2[0]\n\t"
  "vmul.f32 q14, q8, d4[0]\n\t"
  "vmul.f32 q15, q8, d6[0]\n\t"

  "vmla.f32 q12, q9, d0[1]\n\t"
  "vmla.f32 q13, q9, d2[1]\n\t"
  "vmla.f32 q14, q9, d4[1]\n\t"
  "vmla.f32 q15, q9, d6[1]\n\t"

  "vmla.f32 q12, q10, d1[0]\n\t"
  "vmla.f32 q13, q10, d3[0]\n\t"
  "vmla.f32 q14, q10, d5[0]\n\t"
  "vmla.f32 q15, q10, d7[0]\n\t"

  "vmla.f32 q12, q11, d1[1]\n\t"
  "vmla.f32 q13, q11, d3[1]\n\t"
  "vmla.f32 q14, q11, d5[1]\n\t"
  "vmla.f32 q15, q11, d7[1]\n\t"

  // load the 4 quad vertices (%1) and transform them by q12-q15
  "vldmia %1, { q0-q3 }\n\t"

  "vmul.f32 q8, q12, d0[0]\n\t"
  "vmul.f32 q9, q12, d2[0]\n\t"
  "vmul.f32 q10, q12, d4[0]\n\t"
  "vmul.f32 q11, q12, d6[0]\n\t"

  "vmla.f32 q8, q13, d0[1]\n\t"
  "vmla.f32 q8, q14, d1[0]\n\t"
  "vmla.f32 q8, q15, d1[1]\n\t"

  "vmla.f32 q9, q13, d2[1]\n\t"
  "vmla.f32 q9, q14, d3[0]\n\t"
  "vmla.f32 q9, q15, d3[1]\n\t"

  "vmla.f32 q10, q13, d4[1]\n\t"
  "vmla.f32 q10, q14, d5[0]\n\t"
  "vmla.f32 q10, q15, d5[1]\n\t"

  "vmla.f32 q11, q13, d6[1]\n\t"
  "vmla.f32 q11, q14, d7[0]\n\t"
  "vmla.f32 q11, q15, d7[1]\n\t"

  // store the 4 transformed vertices
  "vstmia %2, { q8 }\n\t"
  "vstmia %3, { q9 }\n\t"
  "vstmia %4, { q10 }\n\t"
  "vstmia %5, { q11 }"

  :
  : "r" (proj), "r" (squareVertices), "r" (v1),
    "r" (v2), "r" (v3), "r" (v4), "r" (modelView)
  : "memory", "q0", "q1", "q2", "q3",
    "q8", "q9", "q10", "q11", "q12", "q13",
    "q14", "q15"
);
Docs
• For a detailed explanation of
  intrinsics/assembly see:
  http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491e/CIHJBEFE.html
Contact me

http://www.linkedin.com/in/dvovk/
http://nukecode.blogspot.com/