This document summarizes the evolution of a C++ laplacian smoothing algorithm across multiple iterations as it is optimized to take advantage of C++11 features. The initial algorithm is rewritten using range-based for loops, move semantics, lambda functions, and multithreading to improve performance and readability. Key changes include using uniform initialization instead of initializer lists, std::array instead of C-style arrays, algorithms like std::transform and std::for_each, and parallelizing the work across threads. The discussion notes there is more to C++11 and developing a multithreaded version of the GEL library could take advantage of built-in parallelism features in C++11.
// Baseline (C++98 style): Jacobi-style Laplacian smoothing.
// Each iteration computes the Laplacian of every interior vertex into a
// temporary attribute vector, then displaces each vertex by weight * Laplacian.
// Two passes per iteration so every Laplacian is computed from the *old*
// positions before any vertex moves.
6. void laplacian_smooth0(Manifold& m,float weight, int max_iter)
{
for(int iter=0;iter<max_iter; ++iter) {
// Scratch storage for one Laplacian vector per vertex.
VertexAttributeVector<Vec3d> L_attr(m.no_vertices());
// Pass 1: compute Laplacians, skipping boundary vertices.
for(VertexIDIterator v = m.vertices_begin(); v != m.vertices_end(); ++v)
if(!boundary(m, *v))
L_attr[*v] =laplacian(m, *v);
// Pass 2: apply the weighted displacement to interior vertices.
for(VertexIDIterator v = m.vertices_begin(); v != m.vertices_end(); ++v)
if(!boundary(m, *v))
m.pos(*v) += weight*L_attr[*v];
}
}
14.6 s (median) — Baseline.
It is so C++98.
// Same algorithm as laplacian_smooth0, rewritten with C++11 range-based for.
// m.vertices() returns a lightweight range object (just begin()/end()), so
// the explicit iterator type and the dereference operator disappear.
7. void laplacian_smooth1(Manifold& m,float weight, int max_iter)
{
for(int iter=0;iter<max_iter; ++iter) {
VertexAttributeVector<Vec3d> L_attr(m.no_vertices());
// Pass 1: compute Laplacians for interior vertices.
for(VertexID v : m.vertices())
if(!boundary(m, v))
L_attr[v] =laplacian(m, v);
// Pass 2: displace interior vertices by weight * Laplacian.
for(VertexID v : m.vertices()){
if(!boundary(m, v))
m.pos(v) += weight*L_attr[v];
}
}
}
14.2 s (median) — Range-based for.
Much better to read. Not only is the
for loop clear, we did away with `*`.
vertices() returns a class which just
contains begin and end functions.
// Single-loop variant: instead of storing Laplacians and applying them in a
// second pass, write the complete new position into a second buffer, then
// assign it back. Boundary vertices keep the values copied into new_pos up
// front. Note the copy-assignment on every iteration -- the cost the next
// versions (move/swap) remove.
8. void laplacian_smooth2(Manifold& m,float weight, int max_iter)
{
// One full copy of the position attribute vector, made once.
auto new_pos = m.positions_attribute_vector();
for(int iter=0;iter<max_iter; ++iter) {
for(auto v : m.vertices())
if(!boundary(m, v))
new_pos[v] = weight*laplacian(m, v)+m.pos(v);
// Copies the whole buffer back each iteration.
m.positions_attribute_vector() = new_pos;
}
}
12.4 s (median) — Optimized (copy back).
And we only need one loop. We can
memory-move the vertex positions.
// Move-assignment variant: the write-back is now a cheap move instead of a
// copy -- but new_pos is re-created *inside* the iteration loop, so a full
// copy of the positions happens at the top of every iteration instead.
// The copy was moved, not eliminated (see the swap version).
9. void laplacian_smooth3(Manifold& m, float weight, int max_iter)
{
for(int iter=0;iter<max_iter; ++iter) {
// Fresh copy each iteration (required: the move below empties it).
auto new_pos = m.positions_attribute_vector();
for(auto v : m.vertices())
if(!boundary(m, v))
new_pos[v] = weight*laplacian(m, v)+m.pos(v);
m.positions_attribute_vector() = move(new_pos);
}
}
12.6 s (median) — Move back.
Actually, we should move, but ... oh ...
now I copy somewhere else.
// Swap variant: two position buffers ping-pong via swap, so after the first
// copy no buffer is ever copied again. Each iteration reads old positions
// from m and writes new ones into new_pos, then swaps. Boundary entries in
// new_pos stay valid across iterations because boundary positions are never
// modified. The slide notes call this the single-threaded sweet spot.
10. void laplacian_smooth4(Manifold& m,float weight, int max_iter)
{
auto new_pos = m.positions_attribute_vector();
for(int iter=0;iter<max_iter; ++iter) {
for(auto v : m.vertices())
if(!boundary(m, v))
new_pos[v] = weight*laplacian(m, v)+m.pos(v);
// O(1) exchange of the two buffers.
swap(m.positions_attribute_vector(),new_pos);
}
}
12.1 s (median) — Swap.
Now we only have two buffers for
vertex positions and always read from
one and write to the other. Then swap!
I think this version is the sweet spot
for single-threaded code.
// Lambda variation of laplacian_smooth4: the per-vertex update is expressed
// as a reference-capturing lambda passed to for_each_vertex.
// NOTE(review): unlike laplacian_smooth4 there is no boundary(m, v) test
// here -- presumably for_each_vertex only visits interior vertices, or the
// check was dropped on the slide; confirm against the GEL sources.
11. void laplacian_smooth4_5(Manifold& m,float weight, int max_iter)
{
auto new_pos = m.positions_attribute_vector();
for(int iter=0;iter<max_iter; ++iter) {
for_each_vertex(m, [&](VertexID v) {new_pos[v] = weight*laplacian(m, v)+m.pos(v);});
swap(m.positions_attribute_vector(),new_pos);
}
}
Lambda variation
Not much more clear.
Should be about the same
performance...
// ANTI-EXAMPLE ("threads done wrong"): spawns one std::thread per interior
// vertex, per iteration. Thread creation/teardown dwarfs the per-vertex
// work, so this is orders of magnitude slower than the serial code (the
// slide reports ~666x). Also note the boundary test is performed twice --
// once when deciding to spawn, once inside the lambda -- and the loop index
// mixes signed int with the unsigned t_vec.size().
12. void laplacian_smooth5(Manifold& m, float weight, int max_iter)
{
for(int iter=0;iter<max_iter; ++iter) {
auto new_pos = m.positions_attribute_vector();
vector<thread> t_vec;
for(auto v : m.vertices())
if(!boundary(m, v))
// One thread per vertex: massive oversubscription.
t_vec.push_back(thread([&](VertexID vid){
if(!boundary(m, vid))
new_pos[vid] = weight*laplacian(m, vid)+ m.pos(vid);},v));
for(int i=0;i<t_vec.size();++i)
t_vec[i].join();
m.positions_attribute_vector() = move(new_pos);
}
}
∞ (effectively never finished) — Threads done wrong.
For a brief moment I must
have thought I was coding
to a GPU. First time I timed
it, I got 666 times longer run
time.
// Worker function for laplacian_smooth6: smooths one batch of vertex IDs,
// writing results into the shared new_pos buffer. Safe to run concurrently
// with other batches because the batches are disjoint, each vertex writes
// only its own new_pos slot, and m's positions are only read during a pass.
13. inline void laplacian_smooth_vertex(Manifold& m,vector<VertexID>& vids,
VertexAttributeVector<Vec3d>& new_pos,
float weight){
for(auto v: vids)
new_pos[v] = m.pos(v)+weight*laplacian(m, v);
}
// Batched multithreading done right: partition the interior vertices into
// CORES disjoint batches once, then per iteration spawn CORES threads that
// each process one batch, join them, and swap buffers (as in the serial
// swap version). Slide reports ~5x speedup on 4 physical cores with
// CORES = 8 (hyperthreading).
void laplacian_smooth6(Manifold& m, float weight, int max_iter)
{
vector<vector<VertexID>> vertex_ids(CORES);
auto batch_size = m.no_vertices()/CORES;
int cnt = 0;
// Distribute interior vertices into CORES batches, batch_size at a time,
// wrapping round-robin so leftovers also land in a batch.
for_each_vertex(m, [&](VertexID v) {
if (!boundary(m, v))
vertex_ids[(cnt++/batch_size)%CORES].push_back(v);
});
vector<thread> t_vec(CORES);
VertexAttributeVector<Vec3d> new_pos = m.positions_attribute_vector();
for(int iter=0;iter<max_iter; ++iter) {
// std::thread copies its arguments; ref() forces pass-by-reference.
for(int thread_no=0;thread_no<CORES;++thread_no)
t_vec[thread_no] = thread(laplacian_smooth_vertex,
ref(m), ref(vertex_ids[thread_no]),
ref(new_pos), weight);
// Barrier: all batches must finish before the buffers are swapped.
for(int thread_no=0;thread_no<CORES;++thread_no)
t_vec[thread_no].join();
swap(m.positions_attribute_vector(), new_pos);
}
}
2.5 s (median) — Batched threads.
Almost five times
performance
improvement with
four physical cores.
Hyperthreading
works!!
CORES = 8
14. Statistics (five timed runs in seconds; last column is the median)
Baseline   14.6  14.6  14.5  14.6  14.6 | median 14.6
Range for  14.4  14.2  14.2  14.2  14.2 | median 14.2
Copy back  12.4  12.4  12.4  12.4  12.4 | median 12.4
Move back  12.5  12.5  12.9  12.9  12.6 | median 12.6
Swap       12.1  12.1  12.1  12.1  12.2 | median 12.1
2 threads   6.8   6.7   6.8   6.7   6.7 | median  6.7
4 threads   4.1   4.1   4.1   4.1   4.1 | median  4.1
8 threads   2.5   2.5   2.5   2.5   2.5 | median  2.5
// Produces CORES disjoint batches of interior (non-boundary) vertex IDs,
// filled batch_size at a time and wrapped round-robin so every interior
// vertex lands in exactly one batch. Factored out of laplacian_smooth6.
16. typedef vector<vector<VertexID>> VertexIDBatches;
VertexIDBatches batch_vertices(Manifold& m) {
VertexIDBatches vertex_ids(CORES);
auto batch_size = m.no_vertices()/CORES;
int cnt = 0;
for_each_vertex(m, [&](VertexID v) {
if (!boundary(m, v))
vertex_ids[(cnt++/batch_size)%CORES].push_back(v);
});
// Returned by value: eligible for move/RVO, no copy in practice.
return vertex_ids;
}
// Runs functor f over no_threads batches in parallel, one std::thread per
// batch, and joins them all before returning. f is templated (not
// std::function) so the callable is known at compile time; ref() is needed
// because std::thread copies its arguments otherwise.
template<typename T>
void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) {
vector<thread> t_vec(no_threads);
for(auto t : range(0, no_threads))
t_vec[t] = thread(f, ref(batches[t]));
// Join acts as a barrier: all batches complete before we return.
for(auto t : range(0, no_threads))
t_vec[t].join();
}
#1 Produces a vector of
vectors of vertex IDs
#2 Actually spawns off worker
threads
// Final version: laplacian_smooth6 refactored onto the batch_vertices /
// for_each_vertex_parallel helpers. The per-batch work is a lambda that
// captures m, new_pos and weight by reference; batches contain only
// interior vertices, so no boundary test is needed here.
17. void laplacian_smooth7(Manifold& m, float weight, int max_iter)
{
auto vertex_ids = batch_vertices(m);
auto new_pos = m.positions_attribute_vector();
auto f = [&](const vector<VertexID>& vids) {
for(VertexID v: vids)
new_pos[v] = m.pos(v)+weight*laplacian(m, v);
};
// `_` is an unused loop variable; range() replaces the classic for loop.
for(auto _ : range(0, max_iter)) {
for_each_vertex_parallel(CORES, vertex_ids, f);
swap(m.positions_attribute_vector(), new_pos);
}
}
2.4 s (median) — Slightly faster, much simpler. Note I threw in a range
class to get rid of old-school for loops.
// std::async variant: no explicit join. A future returned by std::async
// blocks in its destructor until the task completes, so destroying f_vec at
// the end of this function implicitly waits for all workers. The slide
// notes this measured ~50% slower than the explicit-thread version.
18. template<typename T>
void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) {
vector<future<void>> f_vec(no_threads);
for(auto t : range(0, no_threads))
f_vec[t] = async(launch::async, f, ref(batches[t]));
}
// Explicit-thread version repeated here for side-by-side comparison with
// the async variant above: spawn one thread per batch, then join all.
template<typename T>
void for_each_vertex_parallel(int no_threads, const VertexIDBatches& batches, T& f) {
vector<thread> t_vec(no_threads);
for(auto t : range(0, no_threads))
t_vec[t] = thread(f, ref(batches[t]));
for(auto t : range(0, no_threads))
t_vec[t].join();
}
See, the code above is simpler — and the destructor joins!
(What happens if we ignore the returned future?!)
But the async code takes 50% more time than the old code
where I join threads explicitly. Not sure why?!
20. Polymorphism with std::function
// Demonstrates runtime polymorphism with std::function instead of virtual
// functions: each instance stores its own unary behavior in `fun`, built by
// binding the instance constant c as the first argument of a binary callable.
class MyClass {
    int c;  // per-instance constant bound into fun
public:
    MyClass(int _c): c(_c) {}
    // The instance's behavior; set via set_fun, invoked as fun(x).
    function<int(int)> fun;
    // Bind c as the first argument of f, leaving x as the remaining one.
    // NOTE: the original slide used std::bind1st, which requires an
    // "adaptable" functor, was deprecated in C++11 and removed in C++17.
    // std::bind with a placeholder is the C++11 replacement and works
    // directly with std::function.
    void set_fun(function<int(int,int)> f) {
        fun = bind(f, c, placeholders::_1);
    }
};
// Example binary behaviors whose first argument is bound by MyClass:
// fun1 scales x by the bound constant c.
int fun1(int c, int x)
{
    return x * c;
}
// fun2 divides x by the bound constant c.
int fun2(int c, int x)
{
    return x / c;
}
int main(int argc, const char * argv[]) {
// Two instances of the same class with different behaviors -- polymorphism
// without any virtual function.
MyClass m1{1},m2{2};
m1.set_fun(fun1); // m1.fun(x) == 1*x
m2.set_fun(fun2); // m2.fun(x) == x/2
// Prints "42 21".
cout << m1.fun(42) << " " << m2.fun(42) << endl;
}
Maybe more exotic than
actually useful, but instructive
that polymorphism can be
achieved so differently from
when using virtual functions
21. Kinder, gentler member init
class VisObj
{
    std::string file;
    GLGraphics::GLViewController view_ctrl;
    bool create_display_list;
    HMesh::Manifold mani;
    HMesh::Manifold old_mani;

    Harmonics* harmonics;
    GLGraphics::ManifoldRenderer* renderer;
    CGLA::Vec3d bsphere_center;
    float bsphere_radius;
public:
    VisObj(): file(""), view_ctrl(WINX,WINY, CGLA::Vec3f(0), 1.0),
        create_display_list(true), harmonics(0) {}
// ... and so on
We never really liked these
long initialization lists and
always wondered why we
could not just initialize when
we declare
22. Kinder, gentler member init
class VisObj
{
    std::string file = "";
    GLGraphics::GLViewController view_ctrl =
        GLGraphics::GLViewController(WINX,WINY, CGLA::Vec3f(0), 1.0);
    bool create_display_list = true;
    HMesh::Manifold mani;
    HMesh::Manifold old_mani;

    Harmonics* harmonics = nullptr;
    GLGraphics::ManifoldRenderer* renderer = nullptr;
    CGLA::Vec3d bsphere_center;
    float bsphere_radius;
public:
    VisObj() {}
// and so on
Now, we can! What
is up with nullptr?!
23. ArithVec changes
template <class T, class V, unsigned int N>
class ArithVec
{
protected:
/// The actual contents of the vector.
std::array<T,N> data;
// ......... (rest of class elided)
Look, I did away with C-style arrays.
// Before (C++98): assign the four components one by one in the body.
// (Slide fragment: the surrounding template header is elided.)
24. ArithVec::ArithVec(T _a, T _b, T _c, T _d)
{
assert(N==4);
data[0] = _a;
data[1] = _b;
data[2] = _c;
data[3] = _d;
}
// After (C++11): brace-initialize the std::array member directly in the
// mem-initializer list. The slide notes some MSVC versions rejected this.
ArithVec::ArithVec(T _a, T _b, T _c, T _d):
data({_a,_b,_c,_d}) {assert(N==4);}
Look! An initializer list ... hmm, MSVC does not like it.
// Before (C++98): std::transform over the raw C array with the bind2nd
// adaptor (deprecated in C++11, removed in C++17).
25. /// Assignment multiplication with scalar.
const V& ArithVec::operator *=(T k)
{
std::transform(data, &data[N], data,
std::bind2nd(std::multiplies<T>(), k));
return static_cast<const V&>(*this);
}
// After (C++11): std::for_each over begin()/end() with a lambda that
// captures the scalar k by value and scales each component in place.
/// Assignment multiplication with scalar.
const V& ArithVec::operator *=(T k)
{
std::for_each(begin(), end(), [k](T& x){x*=k;});
return static_cast<const V&>(*this);
}
Note: begin() and end()
make the code nicer
than before
// Same before/after pair, but the C++11 version now uses a range-based for
// over the data member -- simpler still than the for_each + lambda form.
26. /// Assignment multiplication with scalar.
const V& ArithVec::operator *=(T k)
{
std::transform(data, &data[N], data,
std::bind2nd(std::multiplies<T>(), k));
return static_cast<const V&>(*this);
}
/// Assignment multiplication with scalar.
const V& ArithVec::operator *=(T k)
{
// Scale every component in place.
for(auto& x : data) {x*=k;}
return static_cast<const V&>(*this);
}
Morten: this is
actually simpler!
// After: component-wise comparison with std::equal over begin()/end().
27. bool ArithVec:: operator==(const V& v) const
{
return std::equal(begin(),end(), v.begin());
}
// Before: the same comparison folded with logical AND via
// std::inner_product -- already possible pre-C++11, just less obvious.
bool ArithVec::operator==(const V& v) const
{
return std::inner_product(data, &data[N], v.get(), true,
! ! ! std::logical_and<bool>(), std::equal_to<T>());
}
Just to use the
obvious. This was
possible before
C++11.
28. circulate with functors
// Circulates counterclockwise around vertex v, invoking f once per step
// with the current Walker; returns the number of steps (the valency).
inline int circulate_vertex_ccw(const Manifold& m, VertexID v,
std::function<void(Walker&)> f)
{
Walker w = m.walker(v);
for(; !w.full_circle(); w = w.circulate_vertex_ccw()) f(w);
return w.no_steps();
}
// Convenience overload: adapts a VertexID functor by forwarding the
// neighboring vertex (w.vertex()) of each step to f.
inline int circulate_vertex_ccw(const Manifold& m, VertexID v,
std::function<void(VertexID)> f)
{
return circulate_vertex_ccw(m, v, [&](Walker& w){f(w.vertex());});
}
Five slides that show
what we can do by having
circulator functions
accepting functors
// Before: explicit Walker loop whose only purpose is counting steps.
29. int valency(const Manifold& m, VertexID v)
{
// perform full circulation to get valency
Walker vj = m.walker(v);
while(!vj.full_circle())
vj = vj.circulate_vertex_cw();
return vj.no_steps();
}
// After: circulate with an empty lambda and use the returned step count.
// (Direction differs -- ccw here vs cw above -- but only the count is used.)
int valency(const Manifold& m, VertexID v)
{
return circulate_vertex_ccw(m,v, [](Walker){});
}
// After: accumulate neighbor positions in a lambda while circulating; the
// circulator returns the neighbor count n, so the Laplacian is the average
// neighbor position minus the center position.
31. inline Vec3d laplacian(const Manifold& m, VertexID v)
{
Vec3d p(0);
int n = circulate_vertex_ccw(m, v, [&](VertexID v){ p += m.pos(v); });
return p / n - m.pos(v);
}
// Before: explicit Walker loop with a manual neighbor counter.
Vec3d laplacian(const Manifold& m, VertexID v)
{
Vec3d avg_pos(0);
int n = 0;
for(Walker w = m.walker(v); !w.full_circle(); w = w.circulate_vertex_cw()){
avg_pos += m.pos(w.vertex());
++n;
}
return avg_pos / n - m.pos(v);
}
// After: count a face's edges by circulating with an empty lambda.
32. int no_edges(const Manifold& m, FaceID f)
{
return circulate_face_ccw(m, f, [](Walker w){});
}
// Before: explicit Walker loop with an empty body, counting via no_steps().
int no_edges(const Manifold& m, FaceID f)
{
// perform full circulation to get valency
Walker w = m.walker(f);
for(; !w.full_circle(); w = w.circulate_face_cw());
return w.no_steps();
}
33. Conclusions
• Multicore is very important and the C++11 thread library makes
concurrency easy. We will rely on the compiler for SIMD
optimization!
• range for is great. Makes code far more clear and we get rid of
iterators in many cases
• move semantics & RVO make clear code faster
• lambda functions improve on locality ... awesome with the STL
algorithms and std::function
• auto helps us avoid obfuscation with ugly type names
• uniform initialization and initializer lists also make code concise
34. Discussion
• A C++11 developer version of GEL has branched off: should
we go for built-in parallelism?
• Hmm - just so you know - there is much more in the C++11
standard. This is just the part I understand so far...
• Herb Sutter:“We broke all the books!”
• Yet the learning curve is less daunting than when we first had
to do templates.