SlideShare a Scribd company logo
Better performance
through Superscalarity
Mårten Rånge
How many GigaFlops?
i5 6600K 3.5 GHz
(4x cores)
~224 GigaFlops
64 Flops/cycle
$Z_{n+1} = Z_n^2 + C$ (1)
$Z_0 = C$ (2)
(x,y)
(x,y) + (c,d)
(x+c,y+d)
$(x,y)^2$
$(x^2 - y^2,\ 2xy)$
r
aZk
Z0
2
2a
r2
$Z_1 = Z_0^2 + C$
C
|R| = 2
Zl
Zm
Z0
$Z_{n+1} = Z_n^2 + C$
// Upper bound on the number of iterations tried for each point.
constexpr auto max_iter = 50U;

// Iterates z -> z^2 + c for c = (cx, cy), starting from z = c.
//
// Returns the number of iterations that were still left when |z|^2 first
// exceeded 4, or 0 when the point stayed within that radius for all
// max_iter iterations.
auto mandelbrot (double cx, double cy) {
  auto re = cx;
  auto im = cy;
  auto remaining = max_iter;
  while (remaining > 0) {
    const auto re_sq = re*re;
    const auto im_sq = im*im;
    // Escape test on the squared radius, so no square root is needed.
    if (re_sq + im_sq > 4) {
      return remaining;
    }
    // (re, im)^2 + c = (re^2 - im^2 + cx, 2*re*im + cy); the new imaginary
    // part is computed from the old real part before that is overwritten.
    const auto next_im = 2*re*im + cy;
    re = re_sq - im_sq + cx;
    im = next_im;
    --remaining;
  }
  return remaining;
}
$r^2 = x^2 + y^2$
y
x
r
$(x,y)^2 = (x^2 - y^2,\ 2xy)$
$Z_{n+1} = Z_n^2 + C$
SIMD
a = b+c
(a0,a1)=(b0,b1)+(c0,c1)
0 1 2 3
4 5 6 7
4 6 8 10
+
AVX
8 flops/instruction
// Upper bound on the number of iterations tried for each point.
constexpr auto max_iter = 50U;

// Iterates z -> z^2 + c for c = (cx, cy), starting from z = c.
//
// Returns the number of iterations that were still left when |z|^2 first
// exceeded 4, or 0 when the point stayed within that radius for all
// max_iter iterations.
auto mandelbrot (double cx, double cy) {
  auto re = cx;
  auto im = cy;
  auto remaining = max_iter;
  while (remaining > 0) {
    const auto re_sq = re*re;
    const auto im_sq = im*im;
    // Escape test on the squared radius, so no square root is needed.
    if (re_sq + im_sq > 4) {
      return remaining;
    }
    // (re, im)^2 + c = (re^2 - im^2 + cx, 2*re*im + cy); the new imaginary
    // part is computed from the old real part before that is overwritten.
    const auto next_im = 2*re*im + cy;
    re = re_sq - im_sq + cx;
    im = next_im;
    --remaining;
  }
  return remaining;
}
// Vector form of the Mandelbrot recurrence z -> z^2 + c: every lane of
// (cx, cy) is an independent point, all advanced in lock-step.
//
// Returns 0 as soon as no lane satisfies |z|^2 <= 4; otherwise returns the
// result of the last `<=` comparison after max_iter iterations.
// NOTE(review): the comparison result is stored in an int, so `<=` is
// presumably overloaded to yield a per-lane bit mask - confirm against the
// operator definitions, which are not part of this snippet.
auto mandelbrot (__m256 cx, __m256 cy) {
  auto re = cx;
  auto im = cy;
  int inside_mask = 0 ;
  auto remaining = max_iter;
  while (remaining > 0) {
    auto re_sq = re*re;
    auto im_sq = im*im;
    auto radius_sq = re_sq + im_sq;
    auto four = float8 (4.0F);
    // Lanes that already diverged keep iterating; the loop only stops early
    // once the comparison fails for every lane.
    inside_mask = radius_sq <= four;
    if (inside_mask == 0) return 0;
    // 2*re*im is formed as re*im + re*im.
    auto re_im = re*im;
    im = re_im + re_im + cy;
    re = re_sq - im_sq + cx;
    --remaining;
  }
  return inside_mask;
}
Minimize CPU stalls
opcode Latency Throughput
vmulps 5 1
vaddps 3 1
vsubps 3 1
vcmpps 3 1
vmovmskps 1 1
Task<float>
// Vector form of the Mandelbrot recurrence z -> z^2 + c: every lane of
// (cx, cy) is an independent point, all advanced in lock-step.
//
// Returns 0 as soon as no lane satisfies |z|^2 <= 4; otherwise returns the
// result of the last `<=` comparison after max_iter iterations.
// NOTE(review): the comparison result is stored in an int, so `<=` is
// presumably overloaded to yield a per-lane bit mask - confirm against the
// operator definitions, which are not part of this snippet.
auto mandelbrot (__m256 cx, __m256 cy) {
  auto re = cx;
  auto im = cy;
  int inside_mask = 0 ;
  auto remaining = max_iter;
  while (remaining > 0) {
    auto re_sq = re*re;
    auto im_sq = im*im;
    auto radius_sq = re_sq + im_sq;
    auto four = float8 (4.0F);
    // Lanes that already diverged keep iterating; the loop only stops early
    // once the comparison fails for every lane.
    inside_mask = radius_sq <= four;
    if (inside_mask == 0) return 0;
    // 2*re*im is formed as re*im + re*im.
    auto re_im = re*im;
    im = re_im + re_im + cy;
    re = re_sq - im_sq + cx;
    --remaining;
  }
  return inside_mask;
}
x2[0] = x[0]*x[0];
y2[0] = y[0]*y[0];
r2[0] = x2[0] + y2[0];
x2[1] = x[1]*x[1];
y2[1] = y[1]*y[1];
r2[1] = x2[1] + y2[1];
auto _4 = float8 (4.0);
cmp_mask = r2[0] <= _4 | ((r2[1] <= _4) << 8);
x2[0] = x[0]*x[0];
y2[0] = y[0]*y[0];
r2[0] = x2[0] + y2[0];
x2[1] = x[1]*x[1];
y2[1] = y[1]*y[1];
r2[1] = x2[1] + y2[1];
r2[0] = x2[0] + y2[0];
auto _4 = float8 (4.0);
cmp_mask = r2[0] <= _4 | ((r2[1] <= _4) << 8);
x2[0] = x[0]*x[0]
y2[0] = y[0]*y[0]
r2[0] = x2[0]+y2[0]
x2[1] = x[1]*x[1]
y2[1] = y[1]*y[1]
r2[1] = x2[1]+y2[1]
Instruction queue
FU
x2[0]
y2[0]
r2[0]
x2[1]
y2[1]
r2[1]
Result queue
Shouldn’t compilers
do this for us?
// Upper bound on the number of iterations tried for each point.
constexpr auto max_iter = 50U;

// Iterates z -> z^2 + c for c = (cx, cy), starting from z = c.
//
// Returns the number of iterations that were still left when |z|^2 first
// exceeded 4, or 0 when the point stayed within that radius for all
// max_iter iterations.
auto mandelbrot (double cx, double cy) {
  auto re = cx;
  auto im = cy;
  auto remaining = max_iter;
  while (remaining > 0) {
    const auto re_sq = re*re;
    const auto im_sq = im*im;
    // Escape test on the squared radius, so no square root is needed.
    if (re_sq + im_sq > 4) {
      return remaining;
    }
    // (re, im)^2 + c = (re^2 - im^2 + cx, 2*re*im + cy); the new imaginary
    // part is computed from the old real part before that is overwritten.
    const auto next_im = 2*re*im + cy;
    re = re_sq - im_sq + cx;
    im = next_im;
    --remaining;
  }
  return remaining;
}
// Vector form of the Mandelbrot recurrence z -> z^2 + c: every lane of
// (cx, cy) is an independent point, all advanced in lock-step.
//
// Returns 0 as soon as no lane satisfies |z|^2 <= 4; otherwise returns the
// result of the last `<=` comparison after max_iter iterations.
// NOTE(review): the comparison result is stored in an int, so `<=` is
// presumably overloaded to yield a per-lane bit mask - confirm against the
// operator definitions, which are not part of this snippet.
auto mandelbrot (__m256 cx, __m256 cy) {
  auto re = cx;
  auto im = cy;
  int inside_mask = 0 ;
  auto remaining = max_iter;
  while (remaining > 0) {
    auto re_sq = re*re;
    auto im_sq = im*im;
    auto radius_sq = re_sq + im_sq;
    auto four = float8 (4.0F);
    // Lanes that already diverged keep iterating; the loop only stops early
    // once the comparison fails for every lane.
    inside_mask = radius_sq <= four;
    if (inside_mask == 0) return 0;
    // 2*re*im is formed as re*im + re*im.
    auto re_im = re*im;
    im = re_im + re_im + cy;
    re = re_sq - im_sq + cx;
    --remaining;
  }
  return inside_mask;
}
Uses the mathematical properties of mandelbrot
Uses knowledge that inf and NaN <= 4 is false
AVX512
&
Hyper-threading
// Upper bound on the number of iterations tried for each point.
constexpr auto max_iter = 50U;

// Iterates z -> z^2 + c for c = (cx, cy), starting from z = c.
//
// Returns the number of iterations that were still left when |z|^2 first
// exceeded 4, or 0 when the point stayed within that radius for all
// max_iter iterations.
auto mandelbrot (double cx, double cy) {
  auto re = cx;
  auto im = cy;
  auto remaining = max_iter;
  while (remaining > 0) {
    const auto re_sq = re*re;
    const auto im_sq = im*im;
    // Escape test on the squared radius, so no square root is needed.
    if (re_sq + im_sq > 4) {
      return remaining;
    }
    // (re, im)^2 + c = (re^2 - im^2 + cx, 2*re*im + cy); the new imaginary
    // part is computed from the old real part before that is overwritten.
    const auto next_im = 2*re*im + cy;
    re = re_sq - im_sq + cx;
    im = next_im;
    --remaining;
  }
  return remaining;
}
Questions?

More Related Content

What's hot

JavaScript - Agora nervoso
JavaScript - Agora nervosoJavaScript - Agora nervoso
JavaScript - Agora nervoso
Luis Vendrame
 
Vcs23
Vcs23Vcs23
ECMAScript 6 major changes
ECMAScript 6 major changesECMAScript 6 major changes
ECMAScript 6 major changes
hayato
 
ARM 7 LPC 2148 lecture
ARM 7 LPC 2148 lectureARM 7 LPC 2148 lecture
ARM 7 LPC 2148 lecture
anishgoel
 
Wap in c to draw a line using DDA algorithm
Wap in c to draw a line using DDA algorithmWap in c to draw a line using DDA algorithm
Wap in c to draw a line using DDA algorithm
Kapil Pandit
 
El
ElEl
Gaztea Tech Robotica 2016
Gaztea Tech Robotica 2016Gaztea Tech Robotica 2016
Gaztea Tech Robotica 2016
Svet Ivantchev
 
Computer graphics programs in c++
Computer graphics programs in c++Computer graphics programs in c++
Computer graphics programs in c++
Ankit Kumar
 
10CSL67 CG LAB PROGRAM 10
10CSL67 CG LAB PROGRAM 1010CSL67 CG LAB PROGRAM 10
10CSL67 CG LAB PROGRAM 10
Vanishree Arun
 
Senior design project code for PPG
Senior design project code for PPGSenior design project code for PPG
Senior design project code for PPG
FrankDin1
 
Ssaw08 0624
Ssaw08 0624Ssaw08 0624
Ssaw08 0624
Atsushi Tadokoro
 
Numerical Method Assignment
Numerical Method AssignmentNumerical Method Assignment
Numerical Method Assignment
ashikul akash
 
OOXX
OOXXOOXX
Vcs9
Vcs9Vcs9
Snake.c
Snake.cSnake.c
Snake.c
Vijay Singh
 
When RV Meets CEP (RV 2016 Tutorial)
When RV Meets CEP (RV 2016 Tutorial)When RV Meets CEP (RV 2016 Tutorial)
When RV Meets CEP (RV 2016 Tutorial)
Sylvain Hallé
 
Oprerator overloading
Oprerator overloadingOprerator overloading
Oprerator overloading
Parthipan Parthi
 

What's hot (17)

JavaScript - Agora nervoso
JavaScript - Agora nervosoJavaScript - Agora nervoso
JavaScript - Agora nervoso
 
Vcs23
Vcs23Vcs23
Vcs23
 
ECMAScript 6 major changes
ECMAScript 6 major changesECMAScript 6 major changes
ECMAScript 6 major changes
 
ARM 7 LPC 2148 lecture
ARM 7 LPC 2148 lectureARM 7 LPC 2148 lecture
ARM 7 LPC 2148 lecture
 
Wap in c to draw a line using DDA algorithm
Wap in c to draw a line using DDA algorithmWap in c to draw a line using DDA algorithm
Wap in c to draw a line using DDA algorithm
 
El
ElEl
El
 
Gaztea Tech Robotica 2016
Gaztea Tech Robotica 2016Gaztea Tech Robotica 2016
Gaztea Tech Robotica 2016
 
Computer graphics programs in c++
Computer graphics programs in c++Computer graphics programs in c++
Computer graphics programs in c++
 
10CSL67 CG LAB PROGRAM 10
10CSL67 CG LAB PROGRAM 1010CSL67 CG LAB PROGRAM 10
10CSL67 CG LAB PROGRAM 10
 
Senior design project code for PPG
Senior design project code for PPGSenior design project code for PPG
Senior design project code for PPG
 
Ssaw08 0624
Ssaw08 0624Ssaw08 0624
Ssaw08 0624
 
Numerical Method Assignment
Numerical Method AssignmentNumerical Method Assignment
Numerical Method Assignment
 
OOXX
OOXXOOXX
OOXX
 
Vcs9
Vcs9Vcs9
Vcs9
 
Snake.c
Snake.cSnake.c
Snake.c
 
When RV Meets CEP (RV 2016 Tutorial)
When RV Meets CEP (RV 2016 Tutorial)When RV Meets CEP (RV 2016 Tutorial)
When RV Meets CEP (RV 2016 Tutorial)
 
Oprerator overloading
Oprerator overloadingOprerator overloading
Oprerator overloading
 

Similar to Better performance through Superscalarity

Write Python for Speed
Write Python for SpeedWrite Python for Speed
Write Python for Speed
Yung-Yu Chen
 
Boosting Developer Productivity with Clang
Boosting Developer Productivity with ClangBoosting Developer Productivity with Clang
Boosting Developer Productivity with Clang
Samsung Open Source Group
 
Vcs16
Vcs16Vcs16
Cocos2d Performance Tips
Cocos2d Performance TipsCocos2d Performance Tips
Cocos2d Performance Tips
Keisuke Hata
 
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docxCOMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
TashiBhutia12
 
SCIPY-SYMPY.pdf
SCIPY-SYMPY.pdfSCIPY-SYMPY.pdf
SCIPY-SYMPY.pdf
FreddyGuzman19
 
PBL1-v1-002j.pptx
PBL1-v1-002j.pptxPBL1-v1-002j.pptx
PBL1-v1-002j.pptx
NAIST
 
Coscup2021 - useful abstractions at rust and it's practical usage
Coscup2021 - useful abstractions at rust and it's practical usageCoscup2021 - useful abstractions at rust and it's practical usage
Coscup2021 - useful abstractions at rust and it's practical usage
Wayne Tsai
 
include ltiostreamgt include ltstringgt include .pdf
include ltiostreamgt include ltstringgt include .pdfinclude ltiostreamgt include ltstringgt include .pdf
include ltiostreamgt include ltstringgt include .pdf
contact32
 
PRACTICAL COMPUTING
PRACTICAL COMPUTINGPRACTICAL COMPUTING
PRACTICAL COMPUTING
Ramachendran Logarajah
 
Ocr code
Ocr codeOcr code
Ocr code
wi7sonjoseph
 
C# Assignmet Help
C# Assignmet HelpC# Assignmet Help
C# Assignmet Help
Programming Homework Help
 
06 Recursion in C.pptx
06 Recursion in C.pptx06 Recursion in C.pptx
06 Recursion in C.pptx
MouDhara1
 
Ejerciciosderivadasresueltos
EjerciciosderivadasresueltosEjerciciosderivadasresueltos
Ejerciciosderivadasresueltos
bellidomates
 
C c++-meetup-1nov2017-autofdo
C c++-meetup-1nov2017-autofdoC c++-meetup-1nov2017-autofdo
C c++-meetup-1nov2017-autofdo
Kim Phillips
 
Computer graphics lab manual
Computer graphics lab manualComputer graphics lab manual
Computer graphics lab manual
Uma mohan
 
All VLSI programs
All VLSI programsAll VLSI programs
All VLSI programs
Gouthaman V
 
Guia edo todas
Guia edo todasGuia edo todas
Guia edo todas
Gonzalo Jiménez
 
Integral table
Integral tableIntegral table
Integral table
Sasidhar Jannu
 
Writing MySQL User-defined Functions in JavaScript
Writing MySQL User-defined Functions in JavaScriptWriting MySQL User-defined Functions in JavaScript
Writing MySQL User-defined Functions in JavaScript
Roland Bouman
 

Similar to Better performance through Superscalarity (20)

Write Python for Speed
Write Python for SpeedWrite Python for Speed
Write Python for Speed
 
Boosting Developer Productivity with Clang
Boosting Developer Productivity with ClangBoosting Developer Productivity with Clang
Boosting Developer Productivity with Clang
 
Vcs16
Vcs16Vcs16
Vcs16
 
Cocos2d Performance Tips
Cocos2d Performance TipsCocos2d Performance Tips
Cocos2d Performance Tips
 
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docxCOMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
 
SCIPY-SYMPY.pdf
SCIPY-SYMPY.pdfSCIPY-SYMPY.pdf
SCIPY-SYMPY.pdf
 
PBL1-v1-002j.pptx
PBL1-v1-002j.pptxPBL1-v1-002j.pptx
PBL1-v1-002j.pptx
 
Coscup2021 - useful abstractions at rust and it's practical usage
Coscup2021 - useful abstractions at rust and it's practical usageCoscup2021 - useful abstractions at rust and it's practical usage
Coscup2021 - useful abstractions at rust and it's practical usage
 
include ltiostreamgt include ltstringgt include .pdf
include ltiostreamgt include ltstringgt include .pdfinclude ltiostreamgt include ltstringgt include .pdf
include ltiostreamgt include ltstringgt include .pdf
 
PRACTICAL COMPUTING
PRACTICAL COMPUTINGPRACTICAL COMPUTING
PRACTICAL COMPUTING
 
Ocr code
Ocr codeOcr code
Ocr code
 
C# Assignmet Help
C# Assignmet HelpC# Assignmet Help
C# Assignmet Help
 
06 Recursion in C.pptx
06 Recursion in C.pptx06 Recursion in C.pptx
06 Recursion in C.pptx
 
Ejerciciosderivadasresueltos
EjerciciosderivadasresueltosEjerciciosderivadasresueltos
Ejerciciosderivadasresueltos
 
C c++-meetup-1nov2017-autofdo
C c++-meetup-1nov2017-autofdoC c++-meetup-1nov2017-autofdo
C c++-meetup-1nov2017-autofdo
 
Computer graphics lab manual
Computer graphics lab manualComputer graphics lab manual
Computer graphics lab manual
 
All VLSI programs
All VLSI programsAll VLSI programs
All VLSI programs
 
Guia edo todas
Guia edo todasGuia edo todas
Guia edo todas
 
Integral table
Integral tableIntegral table
Integral table
 
Writing MySQL User-defined Functions in JavaScript
Writing MySQL User-defined Functions in JavaScriptWriting MySQL User-defined Functions in JavaScript
Writing MySQL User-defined Functions in JavaScript
 

More from Mårten Rånge

Know your FOSS obligations
Know your FOSS obligationsKnow your FOSS obligations
Know your FOSS obligations
Mårten Rånge
 
Ray Marching Explained
Ray Marching ExplainedRay Marching Explained
Ray Marching Explained
Mårten Rånge
 
Property Based Tesing
Property Based TesingProperty Based Tesing
Property Based Tesing
Mårten Rånge
 
Monad - a functional design pattern
Monad - a functional design patternMonad - a functional design pattern
Monad - a functional design pattern
Mårten Rånge
 
Formlets
FormletsFormlets
Formlets
Mårten Rånge
 
Pragmatic metaprogramming
Pragmatic metaprogrammingPragmatic metaprogramming
Pragmatic metaprogramming
Mårten Rånge
 
Concurrency - responsiveness in .NET
Concurrency - responsiveness in .NETConcurrency - responsiveness in .NET
Concurrency - responsiveness in .NET
Mårten Rånge
 
Meta Programming
Meta ProgrammingMeta Programming
Meta Programming
Mårten Rånge
 
Concurrency scalability
Concurrency scalabilityConcurrency scalability
Concurrency scalability
Mårten Rånge
 
Concurrency
ConcurrencyConcurrency
Concurrency
Mårten Rånge
 

More from Mårten Rånge (10)

Know your FOSS obligations
Know your FOSS obligationsKnow your FOSS obligations
Know your FOSS obligations
 
Ray Marching Explained
Ray Marching ExplainedRay Marching Explained
Ray Marching Explained
 
Property Based Tesing
Property Based TesingProperty Based Tesing
Property Based Tesing
 
Monad - a functional design pattern
Monad - a functional design patternMonad - a functional design pattern
Monad - a functional design pattern
 
Formlets
FormletsFormlets
Formlets
 
Pragmatic metaprogramming
Pragmatic metaprogrammingPragmatic metaprogramming
Pragmatic metaprogramming
 
Concurrency - responsiveness in .NET
Concurrency - responsiveness in .NETConcurrency - responsiveness in .NET
Concurrency - responsiveness in .NET
 
Meta Programming
Meta ProgrammingMeta Programming
Meta Programming
 
Concurrency scalability
Concurrency scalabilityConcurrency scalability
Concurrency scalability
 
Concurrency
ConcurrencyConcurrency
Concurrency
 

Recently uploaded

Goodbye Windows 11: Make Way for Nitrux Linux 3.5.0!
Goodbye Windows 11: Make Way for Nitrux Linux 3.5.0!Goodbye Windows 11: Make Way for Nitrux Linux 3.5.0!
Goodbye Windows 11: Make Way for Nitrux Linux 3.5.0!
SOFTTECHHUB
 
RESUME BUILDER APPLICATION Project for students
RESUME BUILDER APPLICATION Project for studentsRESUME BUILDER APPLICATION Project for students
RESUME BUILDER APPLICATION Project for students
KAMESHS29
 
20240609 QFM020 Irresponsible AI Reading List May 2024
20240609 QFM020 Irresponsible AI Reading List May 202420240609 QFM020 Irresponsible AI Reading List May 2024
20240609 QFM020 Irresponsible AI Reading List May 2024
Matthew Sinclair
 
How to use Firebase Data Connect For Flutter
How to use Firebase Data Connect For FlutterHow to use Firebase Data Connect For Flutter
How to use Firebase Data Connect For Flutter
Daiki Mogmet Ito
 
Monitoring Java Application Security with JDK Tools and JFR Events
Monitoring Java Application Security with JDK Tools and JFR EventsMonitoring Java Application Security with JDK Tools and JFR Events
Monitoring Java Application Security with JDK Tools and JFR Events
Ana-Maria Mihalceanu
 
“I’m still / I’m still / Chaining from the Block”
“I’m still / I’m still / Chaining from the Block”“I’m still / I’m still / Chaining from the Block”
“I’m still / I’m still / Chaining from the Block”
Claudio Di Ciccio
 
Data structures and Algorithms in Python.pdf
Data structures and Algorithms in Python.pdfData structures and Algorithms in Python.pdf
Data structures and Algorithms in Python.pdf
TIPNGVN2
 
Mind map of terminologies used in context of Generative AI
Mind map of terminologies used in context of Generative AIMind map of terminologies used in context of Generative AI
Mind map of terminologies used in context of Generative AI
Kumud Singh
 
Large Language Model (LLM) and it’s Geospatial Applications
Large Language Model (LLM) and it’s Geospatial ApplicationsLarge Language Model (LLM) and it’s Geospatial Applications
Large Language Model (LLM) and it’s Geospatial Applications
Rohit Gautam
 
Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...
Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...
Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...
SOFTTECHHUB
 
Introduction to CHERI technology - Cybersecurity
Introduction to CHERI technology - CybersecurityIntroduction to CHERI technology - Cybersecurity
Introduction to CHERI technology - Cybersecurity
mikeeftimakis1
 
Pushing the limits of ePRTC: 100ns holdover for 100 days
Pushing the limits of ePRTC: 100ns holdover for 100 daysPushing the limits of ePRTC: 100ns holdover for 100 days
Pushing the limits of ePRTC: 100ns holdover for 100 days
Adtran
 
Video Streaming: Then, Now, and in the Future
Video Streaming: Then, Now, and in the FutureVideo Streaming: Then, Now, and in the Future
Video Streaming: Then, Now, and in the Future
Alpen-Adria-Universität
 
みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...
みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...
みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...
名前 です男
 
Cosa hanno in comune un mattoncino Lego e la backdoor XZ?
Cosa hanno in comune un mattoncino Lego e la backdoor XZ?Cosa hanno in comune un mattoncino Lego e la backdoor XZ?
Cosa hanno in comune un mattoncino Lego e la backdoor XZ?
Speck&Tech
 
Presentation of the OECD Artificial Intelligence Review of Germany
Presentation of the OECD Artificial Intelligence Review of GermanyPresentation of the OECD Artificial Intelligence Review of Germany
Presentation of the OECD Artificial Intelligence Review of Germany
innovationoecd
 
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
James Anderson
 
Let's Integrate MuleSoft RPA, COMPOSER, APM with AWS IDP along with Slack
Let's Integrate MuleSoft RPA, COMPOSER, APM with AWS IDP along with SlackLet's Integrate MuleSoft RPA, COMPOSER, APM with AWS IDP along with Slack
Let's Integrate MuleSoft RPA, COMPOSER, APM with AWS IDP along with Slack
shyamraj55
 
Artificial Intelligence for XMLDevelopment
Artificial Intelligence for XMLDevelopmentArtificial Intelligence for XMLDevelopment
Artificial Intelligence for XMLDevelopment
Octavian Nadolu
 
Microsoft - Power Platform_G.Aspiotis.pdf
Microsoft - Power Platform_G.Aspiotis.pdfMicrosoft - Power Platform_G.Aspiotis.pdf
Microsoft - Power Platform_G.Aspiotis.pdf
Uni Systems S.M.S.A.
 

Recently uploaded (20)

Goodbye Windows 11: Make Way for Nitrux Linux 3.5.0!
Goodbye Windows 11: Make Way for Nitrux Linux 3.5.0!Goodbye Windows 11: Make Way for Nitrux Linux 3.5.0!
Goodbye Windows 11: Make Way for Nitrux Linux 3.5.0!
 
RESUME BUILDER APPLICATION Project for students
RESUME BUILDER APPLICATION Project for studentsRESUME BUILDER APPLICATION Project for students
RESUME BUILDER APPLICATION Project for students
 
20240609 QFM020 Irresponsible AI Reading List May 2024
20240609 QFM020 Irresponsible AI Reading List May 202420240609 QFM020 Irresponsible AI Reading List May 2024
20240609 QFM020 Irresponsible AI Reading List May 2024
 
How to use Firebase Data Connect For Flutter
How to use Firebase Data Connect For FlutterHow to use Firebase Data Connect For Flutter
How to use Firebase Data Connect For Flutter
 
Monitoring Java Application Security with JDK Tools and JFR Events
Monitoring Java Application Security with JDK Tools and JFR EventsMonitoring Java Application Security with JDK Tools and JFR Events
Monitoring Java Application Security with JDK Tools and JFR Events
 
“I’m still / I’m still / Chaining from the Block”
“I’m still / I’m still / Chaining from the Block”“I’m still / I’m still / Chaining from the Block”
“I’m still / I’m still / Chaining from the Block”
 
Data structures and Algorithms in Python.pdf
Data structures and Algorithms in Python.pdfData structures and Algorithms in Python.pdf
Data structures and Algorithms in Python.pdf
 
Mind map of terminologies used in context of Generative AI
Mind map of terminologies used in context of Generative AIMind map of terminologies used in context of Generative AI
Mind map of terminologies used in context of Generative AI
 
Large Language Model (LLM) and it’s Geospatial Applications
Large Language Model (LLM) and it’s Geospatial ApplicationsLarge Language Model (LLM) and it’s Geospatial Applications
Large Language Model (LLM) and it’s Geospatial Applications
 
Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...
Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...
Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...
 
Introduction to CHERI technology - Cybersecurity
Introduction to CHERI technology - CybersecurityIntroduction to CHERI technology - Cybersecurity
Introduction to CHERI technology - Cybersecurity
 
Pushing the limits of ePRTC: 100ns holdover for 100 days
Pushing the limits of ePRTC: 100ns holdover for 100 daysPushing the limits of ePRTC: 100ns holdover for 100 days
Pushing the limits of ePRTC: 100ns holdover for 100 days
 
Video Streaming: Then, Now, and in the Future
Video Streaming: Then, Now, and in the FutureVideo Streaming: Then, Now, and in the Future
Video Streaming: Then, Now, and in the Future
 
みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...
みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...
みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...
 
Cosa hanno in comune un mattoncino Lego e la backdoor XZ?
Cosa hanno in comune un mattoncino Lego e la backdoor XZ?Cosa hanno in comune un mattoncino Lego e la backdoor XZ?
Cosa hanno in comune un mattoncino Lego e la backdoor XZ?
 
Presentation of the OECD Artificial Intelligence Review of Germany
Presentation of the OECD Artificial Intelligence Review of GermanyPresentation of the OECD Artificial Intelligence Review of Germany
Presentation of the OECD Artificial Intelligence Review of Germany
 
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
 
Let's Integrate MuleSoft RPA, COMPOSER, APM with AWS IDP along with Slack
Let's Integrate MuleSoft RPA, COMPOSER, APM with AWS IDP along with SlackLet's Integrate MuleSoft RPA, COMPOSER, APM with AWS IDP along with Slack
Let's Integrate MuleSoft RPA, COMPOSER, APM with AWS IDP along with Slack
 
Artificial Intelligence for XMLDevelopment
Artificial Intelligence for XMLDevelopmentArtificial Intelligence for XMLDevelopment
Artificial Intelligence for XMLDevelopment
 
Microsoft - Power Platform_G.Aspiotis.pdf
Microsoft - Power Platform_G.Aspiotis.pdfMicrosoft - Power Platform_G.Aspiotis.pdf
Microsoft - Power Platform_G.Aspiotis.pdf
 

Better performance through Superscalarity

  • 2. How many GigaFlops? i5 6600K 3.5 GHz (4x cores)
  • 5.
  • 6. Zn+1 = Zn 2 + C (1) Z0 = C (2)
  • 12. r aZk Z0 2 2a r2 Z1 = Z0 2 + C C |R| = 2 Zl Zm Z0 Zn+1 = Zn 2 + C
  • 13.
  • 14. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; } r2 = x2 + y2 y x r (x,y)2 = (x2 - y2,2xy) Zn+1 = Zn 2 + C
  • 15. SIMD
  • 18. 0 1 2 3 4 5 6 7 4 6 8 10 +
  • 20. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; }
  • 21. auto mandelbrot (__m256 cx, __m256 cy) { auto x = cx; auto y = cy; int cmp_mask = 0 ; for (auto iter = max_iter; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; auto r2 = x2 + y2; auto _4 = float8 (4.0F); cmp_mask = r2 <= _4; if (!cmp_mask) return 0; auto xy = x*y; y = xy + xy + cy; x = x2 - y2 + cx; } return cmp_mask; }
  • 23. opcode Latency Throughput vmulps 5 1 vaddps 3 1 vsubps 3 1 vcmpps 3 1 vmovmskps 1 1
  • 25. auto mandelbrot (__m256 cx, __m256 cy) { auto x = cx; auto y = cy; int cmp_mask = 0 ; for (auto iter = max_iter; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; auto r2 = x2 + y2; auto _4 = float8 (4.0F); cmp_mask = r2 <= _4; if (!cmp_mask) return 0; auto xy = x*y; y = xy + xy + cy; x = x2 - y2 + cx; } return cmp_mask; }
  • 26. x2[0] = x[0]*x[0]; y2[0] = y[0]*y[0]; r2[0] = x2[0] + y2[0]; x2[1] = x[1]*x[1]; y2[1] = y[1]*y[1]; r2[1] = x2[1] + y2[1]; auto _4 = float8 (4.0); cmp_mask = r2[0] <= _4 | ((r2[1] <= _4) << 8);
  • 27. x2[0] = x[0]*x[0]; y2[0] = y[0]*y[0]; r2[0] = x2[0] + y2[0]; x2[1] = x[1]*x[1]; y2[1] = y[1]*y[1]; r2[1] = x2[1] + y2[1]; r2[0] = x2[0] + y2[0]; auto _4 = float8 (4.0); cmp_mask = r2[0] <= _4 | ((r2[1] <= _4) << 8);
  • 28. x2[0] = x[0]*x[0] y2[0] = y[0]*y[0] r2[0] = x2[0]+y2[0] x2[1] = x[1]*x[1] y2[1] = y[1]*y[1] r2[1] = x2[1]+y2[1] Instructionqueue FU x2[0] y2[0] r2[0] x2[1] y2[1] r2[1] Resultqueue
  • 30. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; }
  • 31. auto mandelbrot (__m256 cx, __m256 cy) { auto x = cx; auto y = cy; int cmp_mask = 0 ; for (auto iter = max_iter; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; auto r2 = x2 + y2; auto _4 = float8 (4.0F); cmp_mask = r2 <= _4; if (!cmp_mask) return 0; auto xy = x*y; y = xy + xy + cy; x = x2 - y2 + cx; } return cmp_mask; } Uses the mathematical properties of mandelbrot Uses knowledge that inf and NaN <= 4 is false
  • 33. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; }