GPUが100倍速いという神話をぶち殺せたらいいな ver.2013

100倍をぶち殺せたらいいな
関東GPGPU勉強会 #2
山田てるみ

自己紹介
• 山田てるみ
• @telmin_orca
• なんちゃってGPUプログラマ

2012∼2013
• Fermi -> Kepler
http://en.wikipedia.org/wiki/File:Nvidia_logo.svg
http://www.nvidia.co.jp/page/home.html

2012∼2013
• GCN Architecture

Debunking the
100X GPU vs. CPU
Myth
GPUが100倍速いという神話をぶち殺す

3 years later...
あれから3年が過ぎた…

NVIDIA
• Kepler Architecture
• GK110
• SM -> SMX
• 32(48) -> 192!!
• 1.03TFLOPS -> 3.5TFLOPS!!

の造りし
　　　も
の

Xeon Phi
• MIC Architecture
• 60 Core / 4 Threads
• 32KB / L1 cache
• 512KB / L2 cache
• 512-bit vector unit!

Debunking the
100X GPU vs. CPU
Myth? ver.2013

大事なこと
• 以下の測定結果にはHost <->device間の
データ転送時間は含まれていません
• 元論文に準拠しました
• Haswellのデータも追加しました

SAXPY
• y = Ax+y
• かけてたす
• 演算量少なすぎてメモリ律速

SAXPY
• 実験条件
• 要素数:10000000

SAXPY
/// Scalar reference implementation of the AXPY update y[i] = A * x[i] + y[i].
///
/// @param x   Input vector; the first `num` elements are read.
/// @param y   In/out vector; the first `num` elements are updated in place.
/// @param A   Scalar multiplier applied to every element of `x`.
/// @param num Number of elements to process; zero leaves `y` untouched.
void simpleSaxpy(const double* x, double* y, const double A,
                 const size_t num)
{
    for (size_t i = 0; i < num; ++i) {
        // The scalar parameter is spelled `A` (upper case) in this signature.
        y[i] = A * x[i] + y[i];
    }
}

SAXPY OpenMP +
AVX
/// AXPY update (y[i] = a * x[i] + y[i]) on doubles using 256-bit AVX
/// intrinsics inside an OpenMP parallel loop.
///
/// Four doubles are processed per vector iteration with unaligned loads and
/// stores. Elements left over when `num` is not a multiple of four are
/// handled by a scalar loop so that every element is updated.
///
/// @param x   Input vector; the first `num` elements are read.
/// @param y   In/out vector; the first `num` elements are updated in place.
/// @param a   Scalar multiplier.
/// @param num Number of elements to process.
void avxSaxpy(const double* x, double* y, const double a, const size_t num)
{
    const __m256d v_a = _mm256_set1_pd(a);

    // A signed 64-bit counter avoids comparing a signed `int` against a
    // size_t expression and does not truncate for large `num`.
    const long long num_blocks = static_cast<long long>(num / 4);

#pragma omp parallel for
    for (long long i = 0; i < num_blocks; ++i) {
        const __m256d v_x0 = _mm256_loadu_pd(&x[i * 4]);
        const __m256d v_y0 = _mm256_loadu_pd(&y[i * 4]);
        const __m256d v01 = _mm256_mul_pd(v_a, v_x0);
        const __m256d v02 = _mm256_add_pd(v01, v_y0);
        _mm256_storeu_pd(&y[i * 4], v02);
    }

    // Scalar tail: the last (num % 4) elements that do not fill a vector.
    for (size_t i = static_cast<size_t>(num_blocks) * 4; i < num; ++i) {
        y[i] = a * x[i] + y[i];
    }
}

SAXPY CUDA
/// CUDA kernel for the AXPY update y[i] = a * x[i] + y[i].
///
/// Each thread derives one global index from its block and thread indices
/// and updates at most one element; threads whose index is not below
/// `num_elements` return without touching memory.
///
/// @param x            Input vector.
/// @param y            In/out vector, updated in place.
/// @param a            Scalar multiplier.
/// @param num_elements Number of valid elements in `x` and `y`.
__global__ void cudaSaxpyKernel(const double* x, double* y, const double a,
                                const int num_elements)
{
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;

    // Guard: the launch may contain more threads than there are elements.
    if (gid >= num_elements) {
        return;
    }

    y[gid] = a * x[gid] + y[gid];
}

SAXPY MIC
void
micSaxpy(const
double*
x,
double*
y,
const
double
a,
const
size_t
num)
{

__m512d
v_a
=
_mm512_set1_pd(a);

#pragma
omp
parallel
for

for(int
i
=
0;
i
<
num
/
8;
++i)
{

__m512d
v_x
=
_mm512_load_pd(&x[i
*
8]);

__m512d
v_y
=
_mm512_load_pd(&y[i
*
8]);

__m512d
res
=
_mm512_fmadd_pd(v_x,
v_a,
v_y);

_mm512_storenr_pd(&y[i
*
8],
res);

}
}

SAXPY
msec GFlops GB/s
Corei7
2600K
14.077 1.42071 17.0486
Corei7
4770K
12.448 1.606 19.279
Titan 0.134 141.461 848.763
XeonPhi 1.98 10.095 121.15

Histogram
• ヒストグラム

Histogram
• 実験条件
• 1920x1080画像
• bin: 256

Histogram
/// Serial reference histogram of an 8-bit single-channel image.
///
/// Every pixel value is used directly as a bin index, and the matching
/// counter in `dst` is incremented. Counts are added to whatever `dst`
/// already holds; nothing is cleared or resized here, so `dst` must already
/// provide a slot for every pixel value that occurs.
///
/// @param src    Row-major pixel data, `width * height` bytes.
/// @param dst    Histogram counters, indexed by pixel value.
/// @param width  Number of pixels per row.
/// @param height Number of rows.
void simpleHistogram(const unsigned char* src, std::vector<int>& dst,
                     const size_t width, const size_t height)
{
    // grayscale
    for (size_t row = 0; row < height; ++row) {
        const unsigned char* line = src + row * width;
        for (size_t col = 0; col < width; ++col) {
            ++dst[line[col]];
        }
    }
}

Histogram OpenMP
/// OpenMP histogram of an 8-bit single-channel image.
///
/// Inside the parallel region each thread counts its share of the rows into
/// a private 256-bin vector, then adds that vector into `dst` inside a
/// critical section. Counts are added to whatever `dst` already holds, so
/// `dst` must provide at least 256 entries and be cleared by the caller if
/// a fresh histogram is wanted.
///
/// @param src    Row-major pixel data, `width * height` bytes.
/// @param dst    Histogram counters, indexed by pixel value (>= 256 entries).
/// @param width  Number of pixels per row.
/// @param height Number of rows.
void openMPHistogram(const unsigned char* src, std::vector<int>& dst,
                     const size_t width, const size_t height)
{
#pragma omp parallel
    {
        // Per-thread bins so the hot loop needs no synchronisation.
        std::vector<int> local_dst(256);

#pragma omp for
        for (size_t y = 0; y < height; ++y) {
            for (size_t x = 0; x < width; ++x) {
                const unsigned char val = src[y * width + x];
                local_dst[val]++;
            }
        }

        // Merge the private bins into the shared result, one thread at a
        // time. The names here must be `dst` / `local_dst`.
#pragma omp critical
        {
            for (size_t i = 0; i < 256; ++i) {
                dst[i] += local_dst[i];
            }
        }
    }
}

Histogram CUDA
/// CUDA histogram kernel for an 8-bit single-channel image.
///
/// Each thread derives one global index from its block and thread indices,
/// splits it into a column and a row using `width`, and - if the index is
/// below `num_elements` - atomically increments the bin in `dst` selected
/// by that pixel's value.
///
/// @param src          Row-major pixel data.
/// @param dst          Histogram counters, indexed by pixel value.
/// @param width        Number of pixels per row.
/// @param height       Not read by this kernel.
/// @param num_elements Number of valid pixels in `src`.
__global__ void histogram_cuda_kernel(const unsigned char* src, int* dst,
                                      const unsigned int width,
                                      const unsigned int height,
                                      const unsigned int num_elements)
{
    int gid = threadIdx.x + blockIdx.x * blockDim.x;
    int col = gid % width;
    int row = gid / width;

    // Guard: the launch may contain more threads than there are pixels.
    if (!(gid < num_elements)) {
        return;
    }

    const unsigned char bin = src[row * width + col];
    // Many threads may hit the same bin, hence the atomic increment.
    atomicAdd(&dst[bin], 1);
}

Histogram
msec MPixel / s
Corei7 2600K 0.734 2823.8
Corei7 4770K 0.273 3370.66
Titan 0.0816 25381.3
XeonPhi 117.5 17.6463

Histogram MIC
/// Histogram of an 8-bit single-channel image, parallelised with an OpenMP
/// team of (up to) 240 threads.
///
/// The rows are split into contiguous, near-equal ranges, one per thread of
/// the team that is actually running. The split is computed from the real
/// team size and `height`, so every row is counted exactly once for any
/// image height and any number of threads (including a build without
/// OpenMP, where the region runs on a single thread).
///
/// Each thread copies its rows into a private buffer, counts them into a
/// private 256-bin vector and finally adds those bins into `dst` inside a
/// critical section. Counts are added to whatever `dst` already holds.
/// NOTE(review): the private copy of the source rows is presumably a
/// locality optimisation for Xeon Phi - confirm before removing it.
///
/// @param src    Row-major pixel data, `width * height` bytes.
/// @param dst    Histogram counters, at least 256 entries, indexed by value.
/// @param width  Number of pixels per row.
/// @param height Number of rows.
void micHistogram_240(const unsigned char* src, int* dst, const size_t width,
                      const size_t height)
{
#pragma omp parallel num_threads(240)
    {
#ifdef _OPENMP
        const size_t thread_id = omp_get_thread_num();
        const size_t num_threads = omp_get_num_threads();
#else
        // Without OpenMP the pragma is ignored and this block runs once.
        const size_t thread_id = 0;
        const size_t num_threads = 1;
#endif

        // Balanced partition: thread t owns rows [t*H/T, (t+1)*H/T).
        // Adjacent ranges share their boundary, so nothing is skipped or
        // counted twice, and a range may be empty when T > H.
        const size_t first_row = thread_id * height / num_threads;
        const size_t last_row = (thread_id + 1) * height / num_threads;
        const size_t local_height = last_row - first_row;
        const size_t local_pixels = local_height * width;

        std::vector<int> local_dst(256);
        std::vector<unsigned char> local_src(local_pixels);

        // Skip the copy for an empty range so no pointer into an empty
        // buffer is formed.
        if (local_pixels != 0) {
            memcpy(local_src.data(), &src[first_row * width],
                   sizeof(unsigned char) * local_pixels);
        }

        for (size_t i = 0; i < local_pixels; ++i) {
            const size_t val = local_src[i];
            local_dst[val]++;
        }

        // Merge the private bins into the shared result, one thread at a
        // time.
#pragma omp critical
        {
            for (size_t i = 0; i < 256; ++i) {
                dst[i] += local_dst[i];
            }
        }
    }
}

Histogram
msec MPixel / s
Corei7 2600K 0.734 2823.8
Corei7 4770K 0.273 3370.66
Titan 0.0816 25381.3
XeonPhi 2.074 999.806

NL-means
• Non-local Algorithm
• A non-local algorithm for image denoising
• http://bengal.missouri.edu/~kes25c/nl2.pdf
• バイラテラルフィルタの親戚

NL-means
• エッジキープ型のフィルタ
• ノイズを除去しつつもボケにくい！
• Aviutilとかにプラグインがある

NL-means
http://opencv.jp/opencv2-x-samples/non-local-means-filter
by fukushima1981.

NL-means
• 実験条件
• 1920x1080画像
• Window size : 7x7
• Template size : 3x3

NL-means
sec FPS
Corei7 2600K 2.086 0.479
Corei7 4770K 2.29 0.436
Titan 0.05826 17.16
XeonPhi 1.217 0.822

Aobench
• Ambient Occlusion
• 前回もやった
• Intelもサンプルに
使用
http://software.intel.com/en-us/articles/data-and-thread-parallelism/

Aobench
• 実験条件
• 512x512画像
• NSUBSAMPLE: 2
• NTHETA: 16
• NPHI: 16

Aobench
sec FPS
Corei7 2600K 1.556 0.642
Corei7 4770K 1.448 0.6905
Titan 0.0162 61.71
XeonPhi 0.9 1.11

0.00%
2000.00%
4000.00%
6000.00%
8000.00%
10000.00%
12000.00%
SAXPY Histogram NL-means Aobench
Core i7 2600K
Core i7 4770K
Titan
XeonPhi

結論
• CPUがGPUを倒す未来は
もう少し先の物語…

GPUが100倍速いという神話をぶち殺せたらいいな ver.2013

GPUが100倍速いという神話をぶち殺せたらいいな ver.2013

More Related Content

What's hot

Viewers also liked

Similar to GPUが100倍速いという神話をぶち殺せたらいいな ver.2013

GPUが100倍速いという神話をぶち殺せたらいいな ver.2013