GPUが100倍速いという神話をぶち殺せたらいいな ver.2013

100倍をぶち殺せたらいいな
関東GPGPU勉強会 #2
山田てるみ

自己紹介
• 山田てるみ
• @telmin_orca
• なんちゃってGPUプログラマ

2012∼2013
• Fermi -> Kepler
http://en.wikipedia.org/wiki/File:Nvidia_logo.svg
http://www.nvidia.co.jp/page/home.html

2012∼2013
• GCN Architecture

Debunking the
100X GPU vs. CPU
Myth
GPUが100倍速いという神話をぶち殺す

3 years later...
あれから3年が過ぎた…

NVIDIA
• Kepler Architecture
• GK110
• SM -> SMX
• 32(48) -> 192!!
• 1.03TFLOPS -> 3.5TFLOPS!!

の造りし
　　　も
の

Xeon Phi
• MIC Architecture
• 60 Core / 4 Threads
• 32KB / L1 cache
• 512KB / L2 cache
• 512-bit vector unit!

Debunking the
100X GPU vs. CPU
Myth? ver.2013

大事なこと
• 以下の測定結果にはHost <-> Device間のデータ転送時間は含まれていません
• 元論文に準拠しました
• Haswellのデータも追加しました

SAXPY
• y = Ax+y
• かけてたす
• 演算量少なすぎてメモリ律速

SAXPY
• 実験条件
• 要素数:10000000

SAXPY
// Scalar reference implementation of y = A * x + y.
//
// x   : input vector, read only (at least `num` elements)
// y   : input/output vector, updated in place (at least `num` elements)
// A   : scalar multiplier
// num : number of elements to process; 0 leaves y untouched
void simpleSaxpy(double* x, double* y, const double A, const size_t num)
{
    for (size_t i = 0; i < num; ++i) {
        // The scalar parameter is spelled `A`; there is no `a` in this scope.
        y[i] = A * x[i] + y[i];
    }
}

SAXPY OpenMP + AVX
// y = a * x + y, four doubles per iteration with AVX, iterations shared
// between OpenMP threads.
//
// x   : input vector, read only (at least `num` elements)
// y   : input/output vector, updated in place (at least `num` elements)
// a   : scalar multiplier
// num : number of elements; does not have to be a multiple of 4
//
// Unaligned loads/stores are used, so x and y need no particular alignment.
void avxSaxpy(const double* x, double* y, const double a, const size_t num)
{
    const __m256d v_a = _mm256_set1_pd(a);

    // Signed 64-bit trip count: avoids comparing a signed counter against
    // size_t and keeps the counter from overflowing for very large `num`.
    const long long num_vectors = static_cast<long long>(num / 4);

#pragma omp parallel for
    for (long long i = 0; i < num_vectors; ++i) {
        const __m256d v_x = _mm256_loadu_pd(&x[i * 4]);
        const __m256d v_y = _mm256_loadu_pd(&y[i * 4]);
        const __m256d v_ax = _mm256_mul_pd(v_a, v_x);
        _mm256_storeu_pd(&y[i * 4], _mm256_add_pd(v_ax, v_y));
    }

    // Scalar tail: the 0..3 elements that do not fill a whole 256-bit vector.
    for (size_t i = static_cast<size_t>(num_vectors) * 4; i < num; ++i) {
        y[i] = a * x[i] + y[i];
    }
}

SAXPY CUDA
// CUDA kernel for y = a * x + y. Each thread handles the single element whose
// index is blockDim.x * blockIdx.x + threadIdx.x.
//
// x            : input vector, read only
// y            : input/output vector, updated in place
// a            : scalar multiplier
// num_elements : number of valid elements; threads whose index is at or past
//                this value do nothing
__global__ void cudaSaxpyKernel(const double* x, double* y, const double a,
                                const int num_elements)
{
    const int gid = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads past the end of the vectors have no work.
    if (!(gid < num_elements)) {
        return;
    }

    y[gid] = a * x[gid] + y[gid];
}

SAXPY MIC
void
micSaxpy(const
double*
x,
double*
y,
const
double
a,
const
size_t
num)
{

__m512d
v_a
=
_mm512_set1_pd(a);

#pragma
omp
parallel
for

for(int
i
=
0;
i
<
num
/
8;
++i)
{

__m512d
v_x
=
_mm512_load_pd(&x[i
*
8]);

__m512d
v_y
=
_mm512_load_pd(&y[i
*
8]);

__m512d
res
=
_mm512_fmadd_pd(v_x,
v_a,
v_y);

_mm512_storenr_pd(&y[i
*
8],
res);

}
}

SAXPY
msec GFlops GB/s
Corei7 2600K 14.077 1.42071 17.0486
Corei7 4770K 12.448 1.606 19.279
Titan 0.134 141.461 848.763
XeonPhi 1.98 10.095 121.15

Histogram
• ヒストグラム

Histogram
• 実験条件
• 1920x1080画像
• bin: 256

Histogram
// Single-threaded histogram of an 8-bit grayscale image.
//
// src    : pixel data, row-major, width * height bytes
// dst    : bin counters indexed by pixel value; counts are added to whatever
//          is already stored, so the caller is responsible for zeroing it.
//          Must have an entry for every pixel value that occurs.
// width  : pixels per row
// height : number of rows
void simpleHistogram(const unsigned char* src, std::vector<int>& dst,
                     const size_t width, const size_t height)
{
    for (size_t row = 0; row != height; ++row) {
        // Start of the current scanline.
        const unsigned char* const line = src + row * width;

        for (size_t col = 0; col != width; ++col) {
            ++dst[line[col]];
        }
    }
}

Histogram OpenMP
// OpenMP histogram of an 8-bit grayscale image.
//
// Every thread counts its share of the rows into a private 256-bin histogram
// and then adds that private histogram into `dst` inside a critical section,
// so the shared bins are only touched once per thread.
//
// src    : pixel data, row-major, width * height bytes
// dst    : shared bin counters; needs at least 256 entries. Counts are added
//          to whatever is already stored.
// width  : pixels per row
// height : number of rows
void openMPHistogram(const unsigned char* src, std::vector<int>& dst,
                     const size_t width, const size_t height)
{
#pragma omp parallel
    {
        // Thread-private bins, zero-initialised.
        std::vector<int> local_dst(256);

#pragma omp for
        for (size_t y = 0; y < height; ++y) {
            for (size_t x = 0; x < width; ++x) {
                const unsigned char val = src[y * width + x];
                local_dst[val]++;
            }
        }

        // Merge the private bins into the shared result, one thread at a time.
#pragma omp critical
        {
            for (size_t i = 0; i < 256; ++i) {
                dst[i] += local_dst[i];
            }
        }
    }
}

Histogram CUDA
// CUDA histogram kernel for an 8-bit image: one pixel per thread, with the
// matching bin in `dst` bumped through atomicAdd.
//
// src          : pixel data, row-major
// dst          : bin counters indexed by pixel value
// width        : pixels per row, used to split the flat thread index into a
//                row and a column
// height       : number of rows (not read by the kernel body)
// num_elements : number of pixels; threads whose index is at or past this
//                value do nothing
__global__ void histogram_cuda_kernel(const unsigned char* src,
                                      int* dst,
                                      const unsigned int width,
                                      const unsigned int height,
                                      const unsigned int num_elements)
{
    int pixel = blockDim.x * blockIdx.x + threadIdx.x;

    // Split the flat index into image coordinates.
    int col = pixel % width;
    int row = pixel / width;

    // Threads past the last pixel have no work.
    if (!(pixel < num_elements)) {
        return;
    }

    unsigned char bin = src[row * width + col];
    atomicAdd(&dst[bin], 1);
}

Histogram
msec MPixel / s
Corei7 2600K 0.734 2823.8
Corei7 4770K 0.273 3370.66
Titan 0.0816 25381.3
XeonPhi 117.5 17.6463

Histogram MIC
// Histogram of an 8-bit grayscale image for Xeon Phi, requesting a team of
// 240 OpenMP threads.
//
// Each thread takes one contiguous slice of rows, copies that slice into a
// private buffer, counts it into a private 256-bin histogram and finally adds
// the private bins into `dst` inside a critical section.
// NOTE(review): the private copy of the pixels is kept from the original
// code; presumably it is there for memory locality - confirm by measurement.
//
// src    : pixel data, row-major, width * height bytes
// dst    : shared bin counters, at least 256 entries. Counts are added to
//          whatever is already stored.
// width  : pixels per row
// height : number of rows; any value is accepted
void micHistogram_240(const unsigned char* src, int* dst,
                      const size_t width, const size_t height)
{
#pragma omp parallel num_threads(240)
    {
#if defined(_OPENMP)
        const size_t thread_id = omp_get_thread_num();
        // Ask the runtime for the real team size: num_threads(240) is only a
        // request and the team may be smaller.
        const size_t num_threads = omp_get_num_threads();
#else
        // Built without OpenMP the pragmas are ignored and this block runs
        // once, so treat the whole image as a single slice.
        const size_t thread_id = 0;
        const size_t num_threads = 1;
#endif

        // Rows [first_row, last_row) belong to this thread. The slices tile
        // [0, height) exactly for every height and team size; for 1080 rows
        // on 240 threads they alternate between 4 and 5 rows.
        const size_t first_row = thread_id * height / num_threads;
        const size_t last_row = (thread_id + 1) * height / num_threads;

        // Private copy of this thread's rows (may be empty).
        const std::vector<unsigned char> local_src(src + first_row * width,
                                                   src + last_row * width);

        // Thread-private bins, zero-initialised.
        std::vector<int> local_dst(256);

        for (size_t i = 0; i < local_src.size(); ++i) {
            const size_t val = local_src[i];
            local_dst[val]++;
        }

        // Merge the private bins into the shared result, one thread at a time.
#pragma omp critical
        {
            for (size_t i = 0; i < 256; ++i) {
                dst[i] += local_dst[i];
            }
        }
    }
}

Histogram
msec MPixel / s
Corei7 2600K 0.734 2823.8
Corei7 4770K 0.273 3370.66
Titan 0.0816 25381.3
XeonPhi 2.074 999.806

NL-means
• Non-local Algorithm
• A non-local algorithm for image denoising
• http://bengal.missouri.edu/~kes25c/nl2.pdf
• バイラテラルフィルタの親戚

NL-means
• エッジキープ型のフィルタ
• ノイズを除去しつつもボケにくい！
• Aviutilとかにプラグインがある

NL-means
http://opencv.jp/opencv2-x-samples/non-local-means-filter
by fukushima1981.

NL-means
• 実験条件
• 1920x1080画像
• Window size : 7x7
• Template size : 3x3

NL-means
sec FPS
Corei7 2600K 2.086 0.479
Corei7 4770K 2.29 0.436
Titan 0.05826 17.16
XeonPhi 1.217 0.822

Aobench
• Ambient Occlusion
• 前回もやった
• Intelもサンプルに使用
http://software.intel.com/en-us/articles/data-and-thread-parallelism/

Aobench
• 実験条件
• 512x512画像
• NSUBSAMPLE: 2
• NTHETA: 16
• NPHI: 16

Aobench
sec FPS
Corei7 2600K 1.556 0.642
Corei7 4770K 1.448 0.6905
Titan 0.0162 61.71
XeonPhi 0.9 1.11

0.00%
2000.00%
4000.00%
6000.00%
8000.00%
10000.00%
12000.00%
SAXPY Histogram NL-means Aobench
Core i7 2600K
Core i7 4770K
Titan
XeonPhi

結論
• CPUがGPUを倒す未来は
もう少し先の物語…

GPUが100倍速いという神話をぶち殺せたらいいな ver.2013

GPUが100倍速いという神話をぶち殺せたらいいな ver.2013

Recommended

Recommended

More Related Content

What's hot

What's hot (20)

Viewers also liked

Viewers also liked (11)

Similar to GPUが100倍速いという神話をぶち殺せたらいいな ver.2013

Similar to GPUが100倍速いという神話をぶち殺せたらいいな ver.2013 (20)

Recently uploaded

Recently uploaded (11)

GPUが100倍速いという神話をぶち殺せたらいいな ver.2013