Successfully reported this slideshow.
We use your LinkedIn profile and activity data to personalize ads and to show you more relevant ads. You can change your ad preferences anytime.

Better performance through Superscalarity

78 views

Published on

How to employ "hidden" parallelism in today's CPUs to improve the performance of CPU-intensive algorithms

Published in: Technology
  • Be the first to comment

  • Be the first to like this

Better performance through Superscalarity

  1. 1. Better performance through Superscalarity Mårten Rånge
  2. 2. How many GigaFlops? i5 6600K 3.5 GHz (4x cores)
  3. 3. ~224 GigaFlops
  4. 4. 64 Flops/cycle
  5. 5. $Z_{n+1} = Z_n^2 + C$ (1), $Z_0 = C$ (2)
  6. 6. (x,y)
  7. 7. (x,y) + (c,d)
  8. 8. (x+c,y+d)
  9. 9. $(x,y)^2$
  10. 10. $(x^2 - y^2, 2xy)$
  11. 11. Diagram labels: $r$, $a$, $Z_k$, $Z_0^2$, $2a$, $r^2$, $Z_1 = Z_0^2 + C$, $C$, $|R| = 2$, $Z_l$, $Z_m$, $Z_0$, $Z_{n+1} = Z_n^2 + C$
  12. 12. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; } $r^2 = x^2 + y^2$ y x r $(x,y)^2 = (x^2 - y^2, 2xy)$ $Z_{n+1} = Z_n^2 + C$
  13. 13. SIMD
  14. 14. a = b+c
  15. 15. (a0,a1)=(b0,b1)+(c0,c1)
  16. 16. (0, 1, 2, 3) + (4, 5, 6, 7) = (4, 6, 8, 10)
  17. 17. AVX 8 flops/instruction
  18. 18. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; }
  19. 19. auto mandelbrot (__m256 cx, __m256 cy) { auto x = cx; auto y = cy; int cmp_mask = 0 ; for (auto iter = max_iter; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; auto r2 = x2 + y2; auto _4 = float8 (4.0F); cmp_mask = r2 <= _4; if (!cmp_mask) return 0; auto xy = x*y; y = xy + xy + cy; x = x2 - y2 + cx; } return cmp_mask; }
  20. 20. Minimize CPU stalls
  21. 21. opcode | Latency | Throughput; vmulps | 5 | 1; vaddps | 3 | 1; vsubps | 3 | 1; vcmpps | 3 | 1; vmovmskps | 1 | 1
  22. 22. Task<float>
  23. 23. auto mandelbrot (__m256 cx, __m256 cy) { auto x = cx; auto y = cy; int cmp_mask = 0 ; for (auto iter = max_iter; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; auto r2 = x2 + y2; auto _4 = float8 (4.0F); cmp_mask = r2 <= _4; if (!cmp_mask) return 0; auto xy = x*y; y = xy + xy + cy; x = x2 - y2 + cx; } return cmp_mask; }
  24. 24. x2[0] = x[0]*x[0]; y2[0] = y[0]*y[0]; r2[0] = x2[0] + y2[0]; x2[1] = x[1]*x[1]; y2[1] = y[1]*y[1]; r2[1] = x2[1] + y2[1]; auto _4 = float8 (4.0); cmp_mask = r2[0] <= _4 | ((r2[1] <= _4) << 8);
  25. 25. x2[0] = x[0]*x[0]; y2[0] = y[0]*y[0]; r2[0] = x2[0] + y2[0]; x2[1] = x[1]*x[1]; y2[1] = y[1]*y[1]; r2[1] = x2[1] + y2[1]; r2[0] = x2[0] + y2[0]; auto _4 = float8 (4.0); cmp_mask = r2[0] <= _4 | ((r2[1] <= _4) << 8);
  26. 26. x2[0] = x[0]*x[0] y2[0] = y[0]*y[0] r2[0] = x2[0]+y2[0] x2[1] = x[1]*x[1] y2[1] = y[1]*y[1] r2[1] = x2[1]+y2[1] Instruction queue FU x2[0] y2[0] r2[0] x2[1] y2[1] r2[1] Result queue
  27. 27. Shouldn’t compilers do this for us?
  28. 28. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; }
  29. 29. auto mandelbrot (__m256 cx, __m256 cy) { auto x = cx; auto y = cy; int cmp_mask = 0 ; for (auto iter = max_iter; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; auto r2 = x2 + y2; auto _4 = float8 (4.0F); cmp_mask = r2 <= _4; if (!cmp_mask) return 0; auto xy = x*y; y = xy + xy + cy; x = x2 - y2 + cx; } return cmp_mask; } Uses the mathematical properties of the Mandelbrot set. Uses the knowledge that inf <= 4 and NaN <= 4 are both false.
  30. 30. AVX512 & Hyper-threading
  31. 31. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; }
  32. 32. Questions?

×