SlideShare a Scribd company logo
1 of 34
C++11
A subset of the features explored
What is happening?
• We want
• the performance of carefully optimized code
• the convenience of a high level language
• to use all our cores
Example: Laplacian Smoothing
Vertex moves
to center of
neighbors
Before
After 1k iterations
/// Baseline Laplacian smoothing written with explicit vertex iterators.
/// Every pass first stores laplacian(m, v) for each non-boundary vertex and
/// then displaces those vertices by weight times the stored vector.
/// @param m        mesh whose vertex positions are updated in place
/// @param weight   factor applied to each stored Laplacian vector
/// @param max_iter number of smoothing passes
void laplacian_smooth0(Manifold& m,float weight, int max_iter)
{
    int pass = 0;
    while(pass < max_iter) {
        // Fresh per-vertex storage for this pass.
        VertexAttributeVector<Vec3d> displacement(m.no_vertices());

        // Phase 1: gather the Laplacian of every non-boundary vertex.
        for(VertexIDIterator it = m.vertices_begin(); it != m.vertices_end(); ++it) {
            if(boundary(m, *it))
                continue;
            displacement[*it] = laplacian(m, *it);
        }

        // Phase 2: apply the gathered displacements.
        for(VertexIDIterator it = m.vertices_begin(); it != m.vertices_end(); ++it) {
            if(boundary(m, *it))
                continue;
            m.pos(*it) += weight*displacement[*it];
        }
        ++pass;
    }
}
14.6 s: Original
It is so C++98
/// Laplacian smoothing using range-based for over m.vertices().
/// Each pass stores laplacian(m, v) for every non-boundary vertex, then
/// moves those vertices by weight times the stored vector.
/// @param m        mesh whose vertex positions are updated in place
/// @param weight   factor applied to each stored Laplacian vector
/// @param max_iter number of smoothing passes
void laplacian_smooth1(Manifold& m,float weight, int max_iter)
{
    int pass = 0;
    while(pass < max_iter) {
        VertexAttributeVector<Vec3d> displacement(m.no_vertices());

        // Gather phase.
        for(VertexID vid : m.vertices()) {
            if(boundary(m, vid))
                continue;
            displacement[vid] = laplacian(m, vid);
        }

        // Apply phase.
        for(VertexID vid : m.vertices()) {
            if(boundary(m, vid))
                continue;
            m.pos(vid) += weight*displacement[vid];
        }
        ++pass;
    }
}
14.2 s: Range for
Much better to read. Not only is the
for loop clear, we also did away with `*`.
vertices() returns a class which just
contains begin and end functions.
/// Single-loop Laplacian smoothing with copy-back.
/// A copy of the position attribute vector is taken once; each pass writes
/// the smoothed position of every non-boundary vertex into that copy and
/// then assigns the copy back to the mesh.
/// @param m        mesh whose vertex positions are updated
/// @param weight   factor applied to laplacian(m, v)
/// @param max_iter number of smoothing passes
void laplacian_smooth2(Manifold& m,float weight, int max_iter)
{
    auto scratch = m.positions_attribute_vector();
    int pass = 0;
    while(pass < max_iter) {
        for(auto vid : m.vertices()) {
            if(boundary(m, vid))
                continue;
            scratch[vid] = weight*laplacian(m, vid)+m.pos(vid);
        }
        // Copy the freshly computed positions back into the mesh.
        m.positions_attribute_vector() = scratch;
        ++pass;
    }
}
12.4 s: Optimized
And we only need one loop. We can
memory move the vertex positions.
/// Laplacian smoothing that moves the result back into the mesh.
/// Each pass copies the position attribute vector, overwrites the entries of
/// non-boundary vertices with their smoothed positions, and move-assigns the
/// copy to the mesh.
/// @param m        mesh whose vertex positions are updated
/// @param weight   factor applied to laplacian(m, v)
/// @param max_iter number of smoothing passes
void laplacian_smooth3(Manifold& m, float weight, int max_iter)
{
    int pass = 0;
    while(pass < max_iter) {
        // A fresh copy is needed every pass because the previous one was moved from.
        auto scratch = m.positions_attribute_vector();
        for(auto vid : m.vertices()) {
            if(boundary(m, vid))
                continue;
            scratch[vid] = weight*laplacian(m, vid)+m.pos(vid);
        }
        m.positions_attribute_vector() = move(scratch);
        ++pass;
    }
}
12.6 s: move
Actually, we should move, but ... oh ...
now I copy somewhere else.
/// Double-buffered Laplacian smoothing.
/// One copy of the position attribute vector is made up front; each pass
/// writes smoothed positions of non-boundary vertices into it and then swaps
/// it with the mesh's own position vector.
/// @param m        mesh whose vertex positions are updated
/// @param weight   factor applied to laplacian(m, v)
/// @param max_iter number of smoothing passes
void laplacian_smooth4(Manifold& m,float weight, int max_iter)
{
    auto back_buffer = m.positions_attribute_vector();
    int pass = 0;
    while(pass < max_iter) {
        for(auto vid : m.vertices()) {
            if(boundary(m, vid))
                continue;
            back_buffer[vid] = weight*laplacian(m, vid)+m.pos(vid);
        }
        // Exchange the read and write buffers instead of copying.
        swap(m.positions_attribute_vector(),back_buffer);
        ++pass;
    }
}
12.1 s: swap
Now we only have two buffers for
vertex positions and always read from
one and write to the other. Then swap!
I think this version is the sweet spot
for single-threaded code.
/// Lambda-based variant of the double-buffered Laplacian smoothing.
/// Each pass visits every vertex through for_each_vertex, writes the smoothed
/// position of non-boundary vertices into a second buffer, and then swaps
/// that buffer with the mesh's position vector.
/// @param m        mesh whose vertex positions are updated
/// @param weight   factor applied to laplacian(m, v)
/// @param max_iter number of smoothing passes
void laplacian_smooth4_5(Manifold& m,float weight, int max_iter)
{
    auto new_pos = m.positions_attribute_vector();
    for(int iter=0;iter<max_iter; ++iter) {
        for_each_vertex(m, [&](VertexID v) {
            // Skip boundary vertices, as the other laplacian_smooth variants
            // do; without this guard the boundary would be moved too.
            if(!boundary(m, v))
                new_pos[v] = weight*laplacian(m, v)+m.pos(v);
        });
        swap(m.positions_attribute_vector(),new_pos);
    }
}
Lambda variation
Not much more clear.
Should be about the same
performance...
/// Laplacian smoothing that launches one thread per non-boundary vertex.
/// Each pass copies the position vector, starts a thread for every
/// non-boundary vertex that writes that vertex's smoothed position into the
/// copy, joins all threads, and move-assigns the copy to the mesh.
/// @param m        mesh whose vertex positions are updated
/// @param weight   factor applied to laplacian(m, v)
/// @param max_iter number of smoothing passes
void laplacian_smooth5(Manifold& m, float weight, int max_iter)
{
    int pass = 0;
    while(pass < max_iter) {
        auto scratch = m.positions_attribute_vector();
        vector<thread> workers;

        // Per-vertex job; the boundary test is repeated inside the thread.
        auto smooth_one = [&](VertexID vid){
            if(!boundary(m, vid))
                scratch[vid] = weight*laplacian(m, vid)+ m.pos(vid);
        };

        for(auto v : m.vertices()) {
            if(boundary(m, v))
                continue;
            workers.push_back(thread(smooth_one, v));
        }

        // Wait for every launched thread before publishing the result.
        for(auto& worker : workers)
            worker.join();

        m.positions_attribute_vector() = move(scratch);
        ++pass;
    }
}
∞ s: Threads done wrong
For a brief moment I must
have thought I was coding
for a GPU. The first time I timed
it, I got a 666 times longer run
time.
/// Writes the smoothed position of every vertex listed in vids into new_pos.
/// @param m       mesh supplying current positions and connectivity
/// @param vids    vertices to process
/// @param new_pos destination buffer indexed by vertex ID
/// @param weight  factor applied to laplacian(m, v)
inline void laplacian_smooth_vertex(Manifold& m,vector<VertexID>& vids,
                                    VertexAttributeVector<Vec3d>& new_pos,
                                    float weight){
    for(vector<VertexID>::size_type i = 0; i < vids.size(); ++i) {
        const VertexID vid = vids[i];
        new_pos[vid] = m.pos(vid)+weight*laplacian(m, vid);
    }
}
/// Multi-threaded Laplacian smoothing with CORES worker threads.
/// Non-boundary vertices are distributed once into CORES batches. Each pass
/// starts one thread per batch running laplacian_smooth_vertex, joins them
/// all, and swaps the written buffer with the mesh's position vector.
/// @param m        mesh whose vertex positions are updated
/// @param weight   factor applied to laplacian(m, v)
/// @param max_iter number of smoothing passes
void laplacian_smooth6(Manifold& m, float weight, int max_iter)
{
    vector<vector<VertexID>> vertex_ids(CORES);
    auto batch_size = m.no_vertices()/CORES;
    // A mesh with fewer vertices than CORES gives a quotient of 0; clamp it
    // so the batch index computation below never divides by zero.
    if(batch_size < 1)
        batch_size = 1;
    int cnt = 0;
    for_each_vertex(m, [&](VertexID v) {
        if (!boundary(m, v))
            // The modulo wraps any overflow batch back into range.
            vertex_ids[(cnt++/batch_size)%CORES].push_back(v);
    });
    vector<thread> t_vec(CORES);
    VertexAttributeVector<Vec3d> new_pos = m.positions_attribute_vector();
    for(int iter=0;iter<max_iter; ++iter) {
        for(int thread_no=0;thread_no<CORES;++thread_no)
            t_vec[thread_no] = thread(laplacian_smooth_vertex,
                                      ref(m), ref(vertex_ids[thread_no]),
                                      ref(new_pos), weight);
        // All workers must finish before the buffers are exchanged.
        for(int thread_no=0;thread_no<CORES;++thread_no)
            t_vec[thread_no].join();
        swap(m.positions_attribute_vector(), new_pos);
    }
}
2.5
Almost five times
performance
improvement with
four physical cores.
hyperthreading
works!!
CORES = 8
Statistics
Median
Baseline 14,6 14,6 14,5 14,6 14,6 14,6
Range for 14,4 14,2 14,2 14,2 14,2 14,2
Copy back 12,4 12,4 12,4 12,4 12,4 12,4
Move back 12,5 12,5 12,9 12,9 12,6 12,6
Swap 12,1 12,1 12,1 12,1 12,2 12,1
2 threads 6,8 6,7 6,8 6,7 6,7 6,7
4 threads 4,1 4,1 4,1 4,1 4,1 4,1
8 threads 2,5 2,5 2,5 2,5 2,5 2,5
s s s s s s
Now make it generic!
/// One list of vertex IDs per worker thread.
typedef vector<vector<VertexID>> VertexIDBatches;

/// Distributes the non-boundary vertices of m into CORES batches.
/// @param m mesh whose vertices are batched
/// @return CORES vectors of vertex IDs (some may be empty)
VertexIDBatches batch_vertices(Manifold& m) {
    VertexIDBatches vertex_ids(CORES);
    auto batch_size = m.no_vertices()/CORES;
    // A mesh with fewer vertices than CORES gives a quotient of 0; clamp it
    // so the batch index computation below never divides by zero.
    if(batch_size < 1)
        batch_size = 1;
    int cnt = 0;
    for_each_vertex(m, [&](VertexID v) {
        if (!boundary(m, v))
            // The modulo wraps any overflow batch back into range.
            vertex_ids[(cnt++/batch_size)%CORES].push_back(v);
    });
    return vertex_ids;
}
template<typename T>
void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) {
vector<thread> t_vec(no_threads);
for(auto t : range(0, no_threads))
t_vec[t] = thread(f, ref(batches[t]));
for(auto t : range(0, no_threads))
t_vec[t].join();
}
#1 Produces a vector of
vectors of vertex IDs
#2 Actually spawns off worker
threads
/// Multi-threaded Laplacian smoothing built on batch_vertices and
/// for_each_vertex_parallel. Each pass lets CORES threads write smoothed
/// positions into a second buffer, then swaps it with the mesh's positions.
/// @param m        mesh whose vertex positions are updated
/// @param weight   factor applied to laplacian(m, v)
/// @param max_iter number of smoothing passes
void laplacian_smooth7(Manifold& m, float weight, int max_iter)
{
    auto batches = batch_vertices(m);
    auto back_buffer = m.positions_attribute_vector();

    // Per-batch job, handed to every worker thread.
    auto smooth_batch = [&](const vector<VertexID>& vids) {
        for(VertexID vid: vids) {
            back_buffer[vid] = m.pos(vid)+weight*laplacian(m, vid);
        }
    };

    for(auto pass : range(0, max_iter)) {
        for_each_vertex_parallel(CORES, batches, smooth_batch);
        swap(m.positions_attribute_vector(), back_buffer);
    }
}
Slightly faster, much simpler. Note I threw in a range
class to get rid of old school for loops.
2.4
template<typename T>
void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) {
vector<future<void>> f_vec(no_threads);
for(auto t : range(0, no_threads))
f_vec[t] = async(launch::async, f, ref(batches[t]));
}
template<typename T>
void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) {
vector<thread> t_vec(no_threads);
for(auto t : range(0, no_threads))
t_vec[t] = thread(f, ref(batches[t]));
for(auto t : range(0, no_threads))
t_vec[t].join();
}
See, the code above is simpler, and the destructor joins!
What happens if we ignore the future?!
But the async code takes 50% more time than the old code
where I join threads explicitly. Not sure why?!
More C++11 examples
Polymorphism with std::function
/// Holds a constant c and a unary function `fun` obtained by fixing the first
/// argument of a binary function to c.
class MyClass {
    int c;
public:
    /// Stores _c as the value later bound as the first argument.
    MyClass(int _c): c(_c) {}

    /// Unary function installed by set_fun; empty until set_fun is called.
    std::function<int(int)> fun;

    /// Installs fun(x) = f(c, x), using the value of c at the time of the call.
    /// A lambda replaces std::bind1st, which was deprecated in C++11 and
    /// removed in C++17.
    void set_fun(std::function<int(int,int)> f) {
        const int bound = c;  // copy now, as bind1st did
        fun = [f, bound](int x) { return f(bound, x); };
    }
};
/// Returns the product of c and x.
int fun1(int c, int x)
{
    const int product = c * x;
    return product;
}
/// Returns x divided by c using integer division; c must not be zero.
int fun2(int c, int x)
{
    const int quotient = x / c;
    return quotient;
}
int main(int argc, const char * argv[]) {
MyClass m1{1},m2{2};
m1.set_fun(fun1);
m2.set_fun(fun2);
cout << m1.fun(42) << " " << m2.fun(42) << endl;
}
Maybe more exotic than
actually useful, but instructive
that polymorphism can be
achieved so differently from
when using virtual functions
Kinder, gentler member init
class VisObj
! {
! ! std::string file;
! ! GLGraphics::GLViewController view_ctrl;
! ! bool create_display_list;
! ! HMesh::Manifold mani;
! ! HMesh::Manifold old_mani;
! !
! ! Harmonics* harmonics;
GLGraphics::ManifoldRenderer* renderer;
! ! CGLA::Vec3d bsphere_center;
! ! float bsphere_radius;
! public:
! ! VisObj(): file(""), view_ctrl(WINX,WINY, CGLA::Vec3f(0), 1.0),
create_display_list(true), harmonics(0) {}
// ... and so on
We never really liked these
long initialization lists and
always wondered why we
could not just initialize when
we declare
Kinder, gentler member init
class VisObj
! {
! ! std::string file = "";
! ! GLGraphics::GLViewController view_ctrl =
GLGraphics::GLViewController(WINX,WINY, CGLA::Vec3f(0), 1.0);
! ! bool create_display_list = true;
! ! HMesh::Manifold mani;
! ! HMesh::Manifold old_mani;
! !
! ! Harmonics* harmonics = nullptr;
GLGraphics::ManifoldRenderer* renderer = nullptr;
! ! CGLA::Vec3d bsphere_center;
! ! float bsphere_radius;
! public:
! ! VisObj() {}
// and so on
Now, we can! What
is up with nullptr?!
ArithVec changes
template <class T, class V, unsigned int N>
class ArithVec
{
protected:
/// The actual contents of the vector.
std::array<T,N> data;
// ......... Look, I did away with C style
arrays
/// Four-component constructor: asserts N == 4, then stores the arguments in
/// data[0] through data[3] in order.
ArithVec::ArithVec(T _a, T _b, T _c, T _d)
{
    assert(N==4);
    // Table of the arguments so they can be assigned in a loop.
    T* const components[4] = { &_a, &_b, &_c, &_d };
    for(unsigned int i = 0; i < 4; ++i)
        data[i] = *components[i];
}
ArithVec::ArithVec(T _a, T _b, T _c, T _d):
data({_a,_b,_c,_d}) {assert(N==4);}
Look! An initializer list ... hmmm, MSVC does not like it.
/// Assignment multiplication with scalar.
/// Multiplies every element in [data, &data[N]) by k in place and returns
/// *this cast to const V&.
/// NOTE(review): passing `data` directly as an iterator presumes a C-style
/// array member -- confirm against the class definition in use.
/// NOTE(review): std::bind2nd was deprecated in C++11 and removed in C++17.
const V& ArithVec::operator *=(T k)
{
// Source and destination are the same range, so the update is in place.
std::transform(data, &data[N], data,
std::bind2nd(std::multiplies<T>(), k));
return static_cast<const V&>(*this);
}
/// Scales every element in [begin(), end()) by k in place.
/// @return *this cast to const V&
const V& ArithVec::operator *=(T k)
{
    // Captures the scalar by value and multiplies one element in place.
    auto scale = [k](T& component){ component *= k; };
    std::for_each(begin(), end(), scale);
    return static_cast<const V&>(*this);
}
Note: begin() and end()
make the code nicer
than before
/// Assignment multiplication with scalar.
/// Multiplies every element in [data, &data[N]) by k in place and returns
/// *this cast to const V&.
/// NOTE(review): passing `data` directly as an iterator presumes a C-style
/// array member -- confirm against the class definition in use.
/// NOTE(review): std::bind2nd was deprecated in C++11 and removed in C++17.
const V& ArithVec::operator *=(T k)
{
// Source and destination are the same range, so the update is in place.
std::transform(data, &data[N], data,
std::bind2nd(std::multiplies<T>(), k));
return static_cast<const V&>(*this);
}
/// Scales each of the N elements of data by k in place.
/// @return *this cast to const V&
const V& ArithVec::operator *=(T k)
{
    for(unsigned int i = 0; i < N; ++i)
        data[i] *= k;
    return static_cast<const V&>(*this);
}
Morten: this is
actually simpler!
/// Element-wise equality: compares [begin(), end()) with the range starting
/// at v.begin() using std::equal.
bool ArithVec::operator==(const V& v) const
{
    const bool all_equal = std::equal(begin(), end(), v.begin());
    return all_equal;
}
/// Element-wise equality expressed with std::inner_product: pairs of
/// elements are compared with std::equal_to<T> and the results are combined
/// with std::logical_and<bool>, starting from true.
/// NOTE(review): passing `data` directly as an iterator presumes a C-style
/// array member -- confirm against the class definition in use.
bool ArithVec::operator==(const V& v) const
{
    // Stray "! ! !" tab artifacts removed from the argument list.
    return std::inner_product(data, &data[N], v.get(), true,
                              std::logical_and<bool>(), std::equal_to<T>());
}
Just use the
obvious. This was
possible before
C++11.
circulate with functors
/// Calls f on a Walker obtained from m.walker(v), advancing it with
/// circulate_vertex_ccw() until full_circle() reports true.
/// @return the walker's no_steps() after the loop ends
inline int circulate_vertex_ccw(const Manifold& m, VertexID v,
                                std::function<void(Walker&)> f)
{
    Walker w = m.walker(v);
    while(!w.full_circle()) {
        f(w);
        w = w.circulate_vertex_ccw();
    }
    return w.no_steps();
}
inline int circulate_vertex_ccw(const Manifold& m, VertexID v,
std::function<void(VertexID)> f)
{
return circulate_vertex_ccw(m, v, [&](Walker& w){f(w.vertex());});
}
Five slides that show
what we can do by having
circulator functions
accepting functors
/// Walks once around v with circulate_vertex_cw() and returns the walker's
/// step count.
int valency(const Manifold& m, VertexID v)
{
    Walker vj = m.walker(v);
    // Nothing to do per step; only the final step count matters.
    for(; !vj.full_circle(); vj = vj.circulate_vertex_cw()) {
    }
    return vj.no_steps();
}
/// Returns the step count of circulate_vertex_ccw around v, using a callback
/// that does nothing.
int valency(const Manifold& m, VertexID v)
{
    auto ignore_step = [](Walker){};
    const int steps = circulate_vertex_ccw(m,v, ignore_step);
    return steps;
}
/// Returns true as soon as the walker around v0 reports vertex() == v1;
/// returns false if the full circle completes without a match.
bool connected(const Manifold& m, VertexID v0, VertexID v1)
{
    Walker vj = m.walker(v0);
    while(!vj.full_circle()) {
        if(vj.vertex() == v1)
            return true;
        vj = vj.circulate_vertex_cw();
    }
    return false;
}
/// Returns true if any vertex visited by circulate_vertex_ccw around v0
/// equals v1. The circulation always runs to completion.
bool connected(const Manifold& m, VertexID v0, VertexID v1)
{
    bool found = false;
    circulate_vertex_ccw(m, v0, [&](VertexID visited){
        if(visited == v1)
            found = true;
    });
    return found;
}
/// Sums m.pos() over the vertices visited by circulate_vertex_ccw around v,
/// divides by the returned step count, and subtracts m.pos(v).
inline Vec3d laplacian(const Manifold& m, VertexID v)
{
    Vec3d sum(0);
    // The callback parameter has its own name so it no longer shadows v.
    const int n = circulate_vertex_ccw(m, v, [&](VertexID nbr){ sum += m.pos(nbr); });
    return sum / n - m.pos(v);
}
/// Explicit-walker version: accumulates m.pos(w.vertex()) while circulating
/// around v with circulate_vertex_cw(), divides by the number of visited
/// vertices, and subtracts m.pos(v).
Vec3d laplacian(const Manifold& m, VertexID v)
{
    Vec3d sum(0);
    int visited = 0;
    Walker w = m.walker(v);
    while(!w.full_circle()) {
        sum += m.pos(w.vertex());
        ++visited;
        w = w.circulate_vertex_cw();
    }
    return sum / visited - m.pos(v);
}
/// Returns the value of circulate_face_ccw around f, called with a callback
/// that does nothing.
int no_edges(const Manifold& m, FaceID f)
{
    auto ignore_step = [](Walker w){};
    const int steps = circulate_face_ccw(m, f, ignore_step);
    return steps;
}
/// Walks once around face f with circulate_face_cw() and returns the
/// walker's step count.
int no_edges(const Manifold& m, FaceID f)
{
    Walker w = m.walker(f);
    while(!w.full_circle()) {
        w = w.circulate_face_cw();
    }
    return w.no_steps();
}
Conclusions
• Multicore is very important and the C++11 thread library makes
concurrency easy. We will rely on the compiler for SIMD
optimization!
• range for is great. Makes code far more clear and we get rid of
iterators in many cases
• move semantics & RVO make clear code faster
• lambda functions improve on locality ... awesome with the STL
algorithms and std::function
• auto helps us avoid obfuscation with ugly type names
• uniform initialization and initializer lists also make code concise
Discussion
• A C++11 developer version of GEL has branched off: should
we go for built-in parallelism?
• Hmm - just so you know - there is much more in the C++11
standard. This is just the part I understand so far...
• Herb Sutter: “We broke all the books!”
• Yet the learning curve is less daunting than when we first had
to do templates.

More Related Content

What's hot

Projectile Motion
Projectile MotionProjectile Motion
Projectile Motion
Arlo Alegre
 
Kinematic equations of motion
Kinematic equations of motionKinematic equations of motion
Kinematic equations of motion
mantlfin
 
Lec 02 (constant acc 051)
Lec 02 (constant acc 051)Lec 02 (constant acc 051)
Lec 02 (constant acc 051)
nur amalina
 
M3 - Banfi Cavallo - Fila di sedie
M3 - Banfi Cavallo - Fila di sedieM3 - Banfi Cavallo - Fila di sedie
M3 - Banfi Cavallo - Fila di sedie
GIOVANNI LARICCIA
 

What's hot (15)

Monolith to Reactive Microservices
Monolith to Reactive MicroservicesMonolith to Reactive Microservices
Monolith to Reactive Microservices
 
Projectile Motion
Projectile MotionProjectile Motion
Projectile Motion
 
Chapter 2
Chapter 2Chapter 2
Chapter 2
 
Materi 3 Finite State Automata
Materi 3   Finite State AutomataMateri 3   Finite State Automata
Materi 3 Finite State Automata
 
Concurrency in Programming Languages
Concurrency in Programming LanguagesConcurrency in Programming Languages
Concurrency in Programming Languages
 
Kinematic equations of motion
Kinematic equations of motionKinematic equations of motion
Kinematic equations of motion
 
Dynamics Kinematics Curvilinear Motion
Dynamics Kinematics Curvilinear MotionDynamics Kinematics Curvilinear Motion
Dynamics Kinematics Curvilinear Motion
 
Projectile motion
Projectile motionProjectile motion
Projectile motion
 
Solving Accelerated Motion Problems
Solving Accelerated Motion ProblemsSolving Accelerated Motion Problems
Solving Accelerated Motion Problems
 
Clojure concurrency
Clojure concurrencyClojure concurrency
Clojure concurrency
 
Qt Widget In-Depth
Qt Widget In-DepthQt Widget In-Depth
Qt Widget In-Depth
 
Introduction to idris
Introduction to idrisIntroduction to idris
Introduction to idris
 
Lec 02 (constant acc 051)
Lec 02 (constant acc 051)Lec 02 (constant acc 051)
Lec 02 (constant acc 051)
 
M3 - Banfi Cavallo - Fila di sedie
M3 - Banfi Cavallo - Fila di sedieM3 - Banfi Cavallo - Fila di sedie
M3 - Banfi Cavallo - Fila di sedie
 
Linear motion of a particle
Linear motion of a particleLinear motion of a particle
Linear motion of a particle
 

Viewers also liked (9)

Dave's_CurriculumVitae_2015
Dave's_CurriculumVitae_2015Dave's_CurriculumVitae_2015
Dave's_CurriculumVitae_2015
 
Comunicado de la juventud radical de mendoza
Comunicado de la juventud radical de mendozaComunicado de la juventud radical de mendoza
Comunicado de la juventud radical de mendoza
 
Chile
ChileChile
Chile
 
Sobre la identidad de los pueblos
Sobre la identidad de los pueblosSobre la identidad de los pueblos
Sobre la identidad de los pueblos
 
175059616 rpp-perakitan-komputer
175059616 rpp-perakitan-komputer175059616 rpp-perakitan-komputer
175059616 rpp-perakitan-komputer
 
Comunicado por decreto 1625
Comunicado por decreto 1625Comunicado por decreto 1625
Comunicado por decreto 1625
 
zk resume
zk resumezk resume
zk resume
 
Día de los enamorados 2017
Día de los enamorados 2017Día de los enamorados 2017
Día de los enamorados 2017
 
Reactive Frustrations
Reactive FrustrationsReactive Frustrations
Reactive Frustrations
 

Similar to Experiments with C++11

Ti1220 Lecture 2: Names, Bindings, and Scopes
Ti1220 Lecture 2: Names, Bindings, and ScopesTi1220 Lecture 2: Names, Bindings, and Scopes
Ti1220 Lecture 2: Names, Bindings, and Scopes
Eelco Visser
 
Im looking for coding help I dont really need this to be explained.pdf
Im looking for coding help I dont really need this to be explained.pdfIm looking for coding help I dont really need this to be explained.pdf
Im looking for coding help I dont really need this to be explained.pdf
contact41
 

Similar to Experiments with C++11 (20)

C++11 - A Change in Style - v2.0
C++11 - A Change in Style - v2.0C++11 - A Change in Style - v2.0
C++11 - A Change in Style - v2.0
 
Cocoa Design Patterns in Swift
Cocoa Design Patterns in SwiftCocoa Design Patterns in Swift
Cocoa Design Patterns in Swift
 
openFrameworks 007 - 3D
openFrameworks 007 - 3DopenFrameworks 007 - 3D
openFrameworks 007 - 3D
 
Swift - One step forward from Obj-C
Swift -  One step forward from Obj-CSwift -  One step forward from Obj-C
Swift - One step forward from Obj-C
 
Writing DSL with Applicative Functors
Writing DSL with Applicative FunctorsWriting DSL with Applicative Functors
Writing DSL with Applicative Functors
 
Ti1220 Lecture 2: Names, Bindings, and Scopes
Ti1220 Lecture 2: Names, Bindings, and ScopesTi1220 Lecture 2: Names, Bindings, and Scopes
Ti1220 Lecture 2: Names, Bindings, and Scopes
 
C++ references
C++ referencesC++ references
C++ references
 
Hidden Gems in Swift
Hidden Gems in SwiftHidden Gems in Swift
Hidden Gems in Swift
 
Matlab robotics toolbox
Matlab robotics toolboxMatlab robotics toolbox
Matlab robotics toolbox
 
The Java Fx Platform – A Java Developer’S Guide
The Java Fx Platform – A Java Developer’S GuideThe Java Fx Platform – A Java Developer’S Guide
The Java Fx Platform – A Java Developer’S Guide
 
A scrupulous code review - 15 bugs in C++ code
A scrupulous code review - 15 bugs in C++ codeA scrupulous code review - 15 bugs in C++ code
A scrupulous code review - 15 bugs in C++ code
 
Operator overloading
Operator overloadingOperator overloading
Operator overloading
 
Tutorial 2
Tutorial     2Tutorial     2
Tutorial 2
 
SDC - Einführung in Scala
SDC - Einführung in ScalaSDC - Einführung in Scala
SDC - Einführung in Scala
 
F# Presentation for SmartDevs, Hereford
F# Presentation for SmartDevs, HerefordF# Presentation for SmartDevs, Hereford
F# Presentation for SmartDevs, Hereford
 
C++ Code as Seen by a Hypercritical Reviewer
C++ Code as Seen by a Hypercritical ReviewerC++ Code as Seen by a Hypercritical Reviewer
C++ Code as Seen by a Hypercritical Reviewer
 
Scala - brief intro
Scala - brief introScala - brief intro
Scala - brief intro
 
Coding in Style
Coding in StyleCoding in Style
Coding in Style
 
MATLAB for Technical Computing
MATLAB for Technical ComputingMATLAB for Technical Computing
MATLAB for Technical Computing
 
Im looking for coding help I dont really need this to be explained.pdf
Im looking for coding help I dont really need this to be explained.pdfIm looking for coding help I dont really need this to be explained.pdf
Im looking for coding help I dont really need this to be explained.pdf
 

Recently uploaded

Recently uploaded (20)

Application orientated numerical on hev.ppt
Application orientated numerical on hev.pptApplication orientated numerical on hev.ppt
Application orientated numerical on hev.ppt
 
Sociology 101 Demonstration of Learning Exhibit
Sociology 101 Demonstration of Learning ExhibitSociology 101 Demonstration of Learning Exhibit
Sociology 101 Demonstration of Learning Exhibit
 
On_Translating_a_Tamil_Poem_by_A_K_Ramanujan.pptx
On_Translating_a_Tamil_Poem_by_A_K_Ramanujan.pptxOn_Translating_a_Tamil_Poem_by_A_K_Ramanujan.pptx
On_Translating_a_Tamil_Poem_by_A_K_Ramanujan.pptx
 
Sensory_Experience_and_Emotional_Resonance_in_Gabriel_Okaras_The_Piano_and_Th...
Sensory_Experience_and_Emotional_Resonance_in_Gabriel_Okaras_The_Piano_and_Th...Sensory_Experience_and_Emotional_Resonance_in_Gabriel_Okaras_The_Piano_and_Th...
Sensory_Experience_and_Emotional_Resonance_in_Gabriel_Okaras_The_Piano_and_Th...
 
TỔNG ÔN TẬP THI VÀO LỚP 10 MÔN TIẾNG ANH NĂM HỌC 2023 - 2024 CÓ ĐÁP ÁN (NGỮ Â...
TỔNG ÔN TẬP THI VÀO LỚP 10 MÔN TIẾNG ANH NĂM HỌC 2023 - 2024 CÓ ĐÁP ÁN (NGỮ Â...TỔNG ÔN TẬP THI VÀO LỚP 10 MÔN TIẾNG ANH NĂM HỌC 2023 - 2024 CÓ ĐÁP ÁN (NGỮ Â...
TỔNG ÔN TẬP THI VÀO LỚP 10 MÔN TIẾNG ANH NĂM HỌC 2023 - 2024 CÓ ĐÁP ÁN (NGỮ Â...
 
Single or Multiple melodic lines structure
Single or Multiple melodic lines structureSingle or Multiple melodic lines structure
Single or Multiple melodic lines structure
 
Micro-Scholarship, What it is, How can it help me.pdf
Micro-Scholarship, What it is, How can it help me.pdfMicro-Scholarship, What it is, How can it help me.pdf
Micro-Scholarship, What it is, How can it help me.pdf
 
How to Add New Custom Addons Path in Odoo 17
How to Add New Custom Addons Path in Odoo 17How to Add New Custom Addons Path in Odoo 17
How to Add New Custom Addons Path in Odoo 17
 
FSB Advising Checklist - Orientation 2024
FSB Advising Checklist - Orientation 2024FSB Advising Checklist - Orientation 2024
FSB Advising Checklist - Orientation 2024
 
How to Create and Manage Wizard in Odoo 17
How to Create and Manage Wizard in Odoo 17How to Create and Manage Wizard in Odoo 17
How to Create and Manage Wizard in Odoo 17
 
Accessible Digital Futures project (20/03/2024)
Accessible Digital Futures project (20/03/2024)Accessible Digital Futures project (20/03/2024)
Accessible Digital Futures project (20/03/2024)
 
Mehran University Newsletter Vol-X, Issue-I, 2024
Mehran University Newsletter Vol-X, Issue-I, 2024Mehran University Newsletter Vol-X, Issue-I, 2024
Mehran University Newsletter Vol-X, Issue-I, 2024
 
How to setup Pycharm environment for Odoo 17.pptx
How to setup Pycharm environment for Odoo 17.pptxHow to setup Pycharm environment for Odoo 17.pptx
How to setup Pycharm environment for Odoo 17.pptx
 
How to Give a Domain for a Field in Odoo 17
How to Give a Domain for a Field in Odoo 17How to Give a Domain for a Field in Odoo 17
How to Give a Domain for a Field in Odoo 17
 
Python Notes for mca i year students osmania university.docx
Python Notes for mca i year students osmania university.docxPython Notes for mca i year students osmania university.docx
Python Notes for mca i year students osmania university.docx
 
Google Gemini An AI Revolution in Education.pptx
Google Gemini An AI Revolution in Education.pptxGoogle Gemini An AI Revolution in Education.pptx
Google Gemini An AI Revolution in Education.pptx
 
HMCS Max Bernays Pre-Deployment Brief (May 2024).pptx
HMCS Max Bernays Pre-Deployment Brief (May 2024).pptxHMCS Max Bernays Pre-Deployment Brief (May 2024).pptx
HMCS Max Bernays Pre-Deployment Brief (May 2024).pptx
 
NO1 Top Black Magic Specialist In Lahore Black magic In Pakistan Kala Ilam Ex...
NO1 Top Black Magic Specialist In Lahore Black magic In Pakistan Kala Ilam Ex...NO1 Top Black Magic Specialist In Lahore Black magic In Pakistan Kala Ilam Ex...
NO1 Top Black Magic Specialist In Lahore Black magic In Pakistan Kala Ilam Ex...
 
Key note speaker Neum_Admir Softic_ENG.pdf
Key note speaker Neum_Admir Softic_ENG.pdfKey note speaker Neum_Admir Softic_ENG.pdf
Key note speaker Neum_Admir Softic_ENG.pdf
 
Basic Civil Engineering first year Notes- Chapter 4 Building.pptx
Basic Civil Engineering first year Notes- Chapter 4 Building.pptxBasic Civil Engineering first year Notes- Chapter 4 Building.pptx
Basic Civil Engineering first year Notes- Chapter 4 Building.pptx
 

Experiments with C++11

  • 1. C++11 A subset of the features explored
  • 2. What is happening? • We want • the performance of carefully optimized code • the convenience of a high level language • to use all our cores
  • 3. Example: Laplacian Smoothing Vertex moves to center of neighbors
  • 6. void laplacian_smooth0(Manifold& m,float weight, int max_iter) { for(int iter=0;iter<max_iter; ++iter) { VertexAttributeVector<Vec3d> L_attr(m.no_vertices()); for(VertexIDIterator v = m.vertices_begin(); v != m.vertices_end(); ++v) if(!boundary(m, *v)) L_attr[*v] =laplacian(m, *v); for(VertexIDIterator v = m.vertices_begin(); v != m.vertices_end(); ++v) if(!boundary(m, *v)) m.pos(*v) += weight*L_attr[*v]; } } 14.6Original It is so C++98
  • 7. void laplacian_smooth1(Manifold& m,float weight, int max_iter) { for(int iter=0;iter<max_iter; ++iter) { VertexAttributeVector<Vec3d> L_attr(m.no_vertices()); for(VertexID v : m.vertices()) if(!boundary(m, v)) L_attr[v] =laplacian(m, v); for(VertexID v : m.vertices()){ if(!boundary(m, v)) m.pos(v) += weight*L_attr[v]; } } } 14.2Range for Much better to read. Not only is the for loop clear, we did away with `*´. vertices() returns a class which just contains begin and end functions
  • 8. void laplacian_smooth2(Manifold& m,float weight, int max_iter) { auto new_pos = m.positions_attribute_vector(); for(int iter=0;iter<max_iter; ++iter) { for(auto v : m.vertices()) if(!boundary(m, v)) new_pos[v] = weight*laplacian(m, v)+m.pos(v); m.positions_attribute_vector() = new_pos; } } 12.4Optimized And we only need one loop.We can memory move the vertex positions
  • 9. void laplacian_smooth3(Manifold& m, float weight, int max_iter) { for(int iter=0;iter<max_iter; ++iter) { auto new_pos = m.positions_attribute_vector(); for(auto v : m.vertices()) if(!boundary(m, v)) new_pos[v] = weight*laplacian(m, v)+m.pos(v); m.positions_attribute_vector() = move(new_pos); } } 12.6move Actually, we should move, but ... oh ... now I copy somewhere else
  • 10. void laplacian_smooth4(Manifold& m,float weight, int max_iter) { auto new_pos = m.positions_attribute_vector(); for(int iter=0;iter<max_iter; ++iter) { for(auto v : m.vertices()) if(!boundary(m, v)) new_pos[v] = weight*laplacian(m, v)+m.pos(v); swap(m.positions_attribute_vector(),new_pos); } } 12.1swap Now we only have two buffers for vertex positions and always read from one and write to the other.Then swap! I think this version is the sweet spot for single threaded code.
  • 11. void laplacian_smooth4_5(Manifold& m,float weight, int max_iter) { auto new_pos = m.positions_attribute_vector(); for(int iter=0;iter<max_iter; ++iter) { for_each_vertex(m, [&](VertexID v) {new_pos[v] = weight*laplacian(m, v)+m.pos(v);}); swap(m.positions_attribute_vector(),new_pos); } } Lambda variation Not much more clear. Should be about the same performance...
  • 12. void laplacian_smooth5(Manifold& m, float weight, int max_iter) { for(int iter=0;iter<max_iter; ++iter) { auto new_pos = m.positions_attribute_vector(); vector<thread> t_vec; for(auto v : m.vertices()) if(!boundary(m, v)) t_vec.push_back(thread([&](VertexID vid){ if(!boundary(m, vid)) new_pos[vid] = weight*laplacian(m, vid)+ m.pos(vid);},v)); for(int i=0;i<t_vec.size();++i) t_vec[i].join(); m.positions_attribute_vector() = move(new_pos); } } ∞Threads done wrong For a brief moment I must have thought I was coding to a GPU. First time I timed it, I got 666 times longer run time
  • 13. inline void laplacian_smooth_vertex(Manifold& m,vector<VertexID>& vids, VertexAttributeVector<Vec3d>& new_pos, float weight){ for(auto v: vids) new_pos[v] = m.pos(v)+weight*laplacian(m, v); } void laplacian_smooth6(Manifold& m, float weight, int max_iter) { vector<vector<VertexID>> vertex_ids(CORES); auto batch_size = m.no_vertices()/CORES; int cnt = 0; for_each_vertex(m, [&](VertexID v) { if (!boundary(m, v)) vertex_ids[(cnt++/batch_size)%CORES].push_back(v); }); vector<thread> t_vec(CORES); VertexAttributeVector<Vec3d> new_pos = m.positions_attribute_vector(); for(int iter=0;iter<max_iter; ++iter) { for(int thread_no=0;thread_no<CORES;++thread_no) t_vec[thread_no] = thread(laplacian_smooth_vertex, ref(m), ref(vertex_ids[thread_no]), ref(new_pos), weight); for(int thread_no=0;thread_no<CORES;++thread_no) t_vec[thread_no].join(); swap(m.positions_attribute_vector(), new_pos); } } 2.5 Almost five times performance improvement with four physical cores. hyperthreading works!! CORES = 8
  • 14. Statistics Median Baseline 14,6 14,6 14,5 14,6 14,6 14,6 Range for 14,4 14,2 14,2 14,2 14,2 14,2 Copy back 12,4 12,4 12,4 12,4 12,4 12,4 Move back 12,5 12,5 12,9 12,9 12,6 12,6 Swap 12,1 12,1 12,1 12,1 12,2 12,1 2 threads 6,8 6,7 6,8 6,7 6,7 6,7 4 threads 4,1 4,1 4,1 4,1 4,1 4,1 8 threads 2,5 2,5 2,5 2,5 2,5 2,5 s s s s s s
  • 15. Now make it generic!
  • 16. typedef vector<vector<VertexID>> VertexIDBatches; VertexIDBatches batch_vertices(Manifold& m) { VertexIDBatches vertex_ids(CORES); auto batch_size = m.no_vertices()/CORES; int cnt = 0; for_each_vertex(m, [&](VertexID v) { if (!boundary(m, v)) vertex_ids[(cnt++/batch_size)%CORES].push_back(v); }); return vertex_ids; } template<typename T> void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) { vector<thread> t_vec(no_threads); for(auto t : range(0, no_threads)) t_vec[t] = thread(f, ref(batches[t])); for(auto t : range(0, no_threads)) t_vec[t].join(); } #1 Produces a vector of vectors of vertex IDs #2 Actually spawns off worker threads
  • 17. void laplacian_smooth7(Manifold& m, float weight, int max_iter) { auto vertex_ids = batch_vertices(m); auto new_pos = m.positions_attribute_vector(); auto f = [&](const vector<VertexID>& vids) { for(VertexID v: vids) new_pos[v] = m.pos(v)+weight*laplacian(m, v); }; for(auto _ : range(0, max_iter)) { for_each_vertex_parallel(CORES, vertex_ids, f); swap(m.positions_attribute_vector(), new_pos); } } Slightly faster, much simpler. Note I threw in a range class to get rid of old school for loops. 2.4
  • 18. template<typename T> void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) { vector<future<void>> f_vec(no_threads); for(auto t : range(0, no_threads)) f_vec[t] = async(launch::async, f, ref(batches[t])); } template<typename T> void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) { vector<thread> t_vec(no_threads); for(auto t : range(0, no_threads)) t_vec[t] = thread(f, ref(batches[t])); for(auto t : range(0, no_threads)) t_vec[t].join(); } See the code above is simpler and the destructor joins! what happens if we ignore the future?! But the async code takes 50% more time than the old code where I join threads explicitly. Not sure why?!
  • 20. Polymorphism with std::function class MyClass { int c; public: MyClass(int _c): c(_c) {} function<int(int)> fun; void set_fun(function<int(int,int)> f) { fun = bind1st(f, c); } }; int fun1(int c, int x) { return c*x;} int fun2(int c, int x) { return x/c;} int main(int argc, const char * argv[]) { MyClass m1{1},m2{2}; m1.set_fun(fun1); m2.set_fun(fun2); cout << m1.fun(42) << " " << m2.fun(42) << endl; } Maybe more exotic than actually useful, but instructive that polymorphism can be achieved so differently from when using virtual functions
  • 21. Kinder, gentler member init class VisObj ! { ! ! std::string file; ! ! GLGraphics::GLViewController view_ctrl; ! ! bool create_display_list; ! ! HMesh::Manifold mani; ! ! HMesh::Manifold old_mani; ! ! ! ! Harmonics* harmonics; GLGraphics::ManifoldRenderer* renderer; ! ! CGLA::Vec3d bsphere_center; ! ! float bsphere_radius; ! public: ! ! VisObj(): file(""), view_ctrl(WINX,WINY, CGLA::Vec3f(0), 1.0), create_display_list(true), harmonics(0) {} // ... and so on We never really liked these long initialization lists and always wondered why we could not just initialize when we declare
  • 22. Kinder, gentler member init class VisObj ! { ! ! std::string file = ""; ! ! GLGraphics::GLViewController view_ctrl = GLGraphics::GLViewController(WINX,WINY, CGLA::Vec3f(0), 1.0); ! ! bool create_display_list = true; ! ! HMesh::Manifold mani; ! ! HMesh::Manifold old_mani; ! ! ! ! Harmonics* harmonics = nullptr; GLGraphics::ManifoldRenderer* renderer = nullptr; ! ! CGLA::Vec3d bsphere_center; ! ! float bsphere_radius; ! public: ! ! VisObj() {} // and so on Now, we can! What is up with nullptr?!
  • 23. ArithVec changes template <class T, class V, unsigned int N> class ArithVec { protected: /// The actual contents of the vector. std::array<T,N> data; // ......... Look, I did away with C style arrays
  • 24. ArithVec::ArithVec(T _a, T _b, T _c, T _d) { assert(N==4); data[0] = _a; data[1] = _b; data[2] = _c; data[3] = _d; } ArithVec::ArithVec(T _a, T _b, T _c, T _d): data({_a,_b,_c,_d}) {assert(N==4);} Look! an initializer list ... hmmm MSVC does not like it
  • 25. /// Assignment multiplication with scalar. const V& ArithVec::operator *=(T k) { std::transform(data, &data[N], data, std::bind2nd(std::multiplies<T>(), k)); return static_cast<const V&>(*this); } /// Assignment multiplication with scalar. const V& ArithVec::operator *=(T k) { std::for_each(begin(), end(), [k](T& x){x*=k;}); return static_cast<const V&>(*this); } Note: begin() and end() make the code nicer than before
  • 26. /// Assignment multiplication with scalar. const V& ArithVec::operator *=(T k) { std::transform(data, &data[N], data, std::bind2nd(std::multiplies<T>(), k)); return static_cast<const V&>(*this); } /// Assignment multiplication with scalar. const V& ArithVec::operator *=(T k) { for(auto& x : data) {x*=k;} return static_cast<const V&>(*this); } Morten: this is actually simpler!
  • 27. bool ArithVec:: operator==(const V& v) const { return std::equal(begin(),end(), v.begin()); } bool ArithVec::operator==(const V& v) const { return std::inner_product(data, &data[N], v.get(), true, ! ! ! std::logical_and<bool>(), std::equal_to<T>()); } Just to use the obvious.This was possible before C++11
  • 28. circulate with functors inline int circulate_vertex_ccw(const Manifold& m, VertexID v, std::function<void(Walker&)> f) { Walker w = m.walker(v); for(; !w.full_circle(); w = w.circulate_vertex_ccw()) f(w); return w.no_steps(); } inline int circulate_vertex_ccw(const Manifold& m, VertexID v, std::function<void(VertexID)> f) { return circulate_vertex_ccw(m, v, [&](Walker& w){f(w.vertex());}); } Five slides that show what we can do by having circulator functions accepting functors
  • 29. int valency(const Manifold& m, VertexID v) { // perform full circulation to get valency Walker vj = m.walker(v); while(!vj.full_circle()) vj = vj.circulate_vertex_cw(); return vj.no_steps(); } int valency(const Manifold& m, VertexID v) { return circulate_vertex_ccw(m,v, [](Walker){}); }
  • 30. bool connected(const Manifold& m, VertexID v0, VertexID v1) { for(Walker vj = m.walker(v0); !vj.full_circle(); vj = vj.circulate_vertex_cw()){ if(vj.vertex() == v1) return true; } return false; } bool connected(const Manifold& m, VertexID v0, VertexID v1) { bool c=false; circulate_vertex_ccw(m, v0, [&](VertexID v){ c |= (v==v1);}); return c; }
  • 31. inline Vec3d laplacian(const Manifold& m, VertexID v) { Vec3d p(0); int n = circulate_vertex_ccw(m, v, [&](VertexID v){ p += m.pos(v); }); return p / n - m.pos(v); } Vec3d laplacian(const Manifold& m, VertexID v) { Vec3d avg_pos(0); int n = 0; for(Walker w = m.walker(v); !w.full_circle(); w = w.circulate_vertex_cw()){ avg_pos += m.pos(w.vertex()); ++n; } return avg_pos / n - m.pos(v); }
  • 32. int no_edges(const Manifold& m, FaceID f) { return circulate_face_ccw(m, f, [](Walker w){}); } int no_edges(const Manifold& m, FaceID f) { // perform full circulation to get valency Walker w = m.walker(f); for(; !w.full_circle(); w = w.circulate_face_cw()); return w.no_steps(); }
  • 33. Conclusions • Multicore is very important and the C++11 thread library makes concurrency easy.We will rely on the compiler for SIMD optimization! • range for is great. Makes code far more clear and we get rid of iterators in many cases • move semantics & RVO make clear code faster • lambda functions improve on locality ... awesome with the STL algorithms and std::function • auto helps us avoid obfuscation with ugly type names • uniform initialization and initializer lists also make code concise
  • 34. Discussion • A C++11 developer version of GEL has branched off: should we go for built-in parallellism? • Hmm - just so you know - there is much more in the C++11 standard.This is just the part I understand so far... • Herb Sutter:“We broke all the books!” • Yet the learning curve is less daunting than when we first had to do templates.