More Related Content
Similar to GPUが100倍速いという神話をぶち殺せたらいいな ver.2013 (20)
GPUが100倍速いという神話をぶち殺せたらいいな ver.2013
- 4. 2012–2013
• Fermi -> Kepler
http://en.wikipedia.org/wiki/File:Nvidia_logo.svg
http://www.nvidia.co.jp/page/home.html
- 15. Xeon Phi
• MIC Architecture
• 60 cores / 4 threads per core
• 32KB / L1 cache
• 512KB / L2 cache
• 512-bit vector unit!
- 21. SAXPY OpenMP +
AVX
/*
 * Computes y[i] = a * x[i] + y[i] for every i in [0, num).
 *
 * Despite the "Saxpy" name, the operands are double precision.
 *
 * x    input vector, at least num elements (unaligned loads are used)
 * y    input/output vector, at least num elements, updated in place
 * a    scalar multiplier
 * num  number of elements to process; does not have to be a multiple of 4
 *
 * The bulk of the range is processed four doubles at a time with 256-bit AVX
 * operations, with iterations distributed by OpenMP. The leftover num % 4
 * elements are handled by a scalar loop so that no element is skipped.
 */
void avxSaxpy(const double* x, double* y, const double a, const size_t num)
{
    const __m256d v_a = _mm256_set1_pd(a);

    /* Pointer-sized signed index: an int would overflow for large num, and
     * older OpenMP implementations only accept signed loop variables. */
    const ptrdiff_t num_vec = (ptrdiff_t)(num / 4);

#pragma omp parallel for
    for (ptrdiff_t i = 0; i < num_vec; ++i)
    {
        const __m256d v_x = _mm256_loadu_pd(&x[i * 4]);
        const __m256d v_y = _mm256_loadu_pd(&y[i * 4]);
        const __m256d v_ax = _mm256_mul_pd(v_a, v_x);
        _mm256_storeu_pd(&y[i * 4], _mm256_add_pd(v_ax, v_y));
    }

    /* Scalar tail: the elements that do not fill a whole 4-wide vector. */
    for (size_t i = (size_t)num_vec * 4; i < num; ++i)
    {
        y[i] = a * x[i] + y[i];
    }
}
- 23. SAXPY MIC
/*
 * Computes y[i] = a * x[i] + y[i] for every i in [0, num) with 512-bit
 * vector intrinsics.
 *
 * Despite the "Saxpy" name, the operands are double precision.
 *
 * x    input vector, at least num elements
 * y    input/output vector, at least num elements, updated in place
 * a    scalar multiplier
 * num  number of elements to process; does not have to be a multiple of 8
 *
 * The vector loop uses the aligned load intrinsic _mm512_load_pd, so x and y
 * are expected to be 64-byte aligned.
 * NOTE(review): _mm512_storenr_pd is the Knights Corner "no-read hint" store;
 * confirm its alignment requirements against the Intel documentation.
 *
 * The leftover num % 8 elements are handled by a scalar loop so that no
 * element is skipped.
 */
void micSaxpy(const double* x, double* y, const double a, const size_t num)
{
    const __m512d v_a = _mm512_set1_pd(a);

    /* Pointer-sized signed index: an int would overflow for large num, and
     * older OpenMP implementations only accept signed loop variables. */
    const ptrdiff_t num_vec = (ptrdiff_t)(num / 8);

#pragma omp parallel for
    for (ptrdiff_t i = 0; i < num_vec; ++i)
    {
        const __m512d v_x = _mm512_load_pd(&x[i * 8]);
        const __m512d v_y = _mm512_load_pd(&y[i * 8]);
        /* Fused multiply-add: v_x * v_a + v_y. */
        const __m512d res = _mm512_fmadd_pd(v_x, v_a, v_y);
        _mm512_storenr_pd(&y[i * 8], res);
    }

    /* Scalar tail: the elements that do not fill a whole 8-wide vector. */
    for (size_t i = (size_t)num_vec * 8; i < num; ++i)
    {
        y[i] = a * x[i] + y[i];
    }
}
- 28. Histogram
/**
 * Accumulates a 256-bin histogram of an 8-bit, single-channel image.
 *
 * Counts are added to whatever dst already contains; bins are not cleared.
 *
 * @param src    pixel data, read as width * height bytes in row-major order
 * @param dst    histogram bins; grown to 256 zero-initialised entries if it
 *               is shorter, otherwise left at its current size
 * @param width  number of pixels per row
 * @param height number of rows
 */
void simpleHistogram(const unsigned char* src, std::vector<int>& dst,
                     const size_t width, const size_t height)
{
    // Every 8-bit value must map to a valid bin; without this an empty or
    // short dst would be written out of bounds.
    if (dst.size() < 256)
    {
        dst.resize(256, 0);
    }

    // grayscale
    for (size_t y = 0; y < height; ++y)
    {
        for (size_t x = 0; x < width; ++x)
        {
            const unsigned char val = src[y * width + x];
            dst[val]++;
        }
    }
}
- 29. Histogram OpenMP
/**
 * Accumulates a 256-bin histogram of an 8-bit, single-channel image using
 * OpenMP.
 *
 * Each thread counts its share of the rows into a private 256-bin histogram
 * and then adds that histogram into dst inside a critical section, so dst is
 * only touched once per thread. Counts are added to whatever dst already
 * contains; bins are not cleared.
 *
 * @param src    pixel data, read as width * height bytes in row-major order
 * @param dst    histogram bins; grown to 256 zero-initialised entries if it
 *               is shorter, otherwise left at its current size
 * @param width  number of pixels per row
 * @param height number of rows
 */
void openMPHistogram(const unsigned char* src, std::vector<int>& dst,
                     const size_t width, const size_t height)
{
    // Resize before entering the parallel region: the merge below writes
    // dst[0..255], and resizing a shared vector inside the region would race.
    if (dst.size() < 256)
    {
        dst.resize(256, 0);
    }

#pragma omp parallel
    {
        // Per-thread bins, zero-initialised by the vector constructor.
        std::vector<int> local_dst(256);

#pragma omp for
        for (size_t y = 0; y < height; ++y)
        {
            for (size_t x = 0; x < width; ++x)
            {
                const unsigned char val = src[y * width + x];
                local_dst[val]++;
            }
        }

        // Merge this thread's counts into the shared result, one thread at a time.
#pragma omp critical
        {
            for (size_t i = 0; i < 256; ++i)
            {
                dst[i] += local_dst[i];
            }
        }
    }
}
- 30. Histogram CUDA
/**
 * CUDA kernel that adds one pixel per thread to a histogram.
 *
 * Each thread derives a flat index from its block and thread indices, reads
 * the pixel at that index, and atomically increments the matching bin.
 * Threads whose index is not below num_elements do nothing.
 *
 * @param src          pixel data, indexed as row * width + column
 * @param dst          histogram bins, indexed by the 8-bit pixel value;
 *                     presumably at least 256 entries (verify against caller)
 * @param width        number of pixels per row
 * @param height       not used by the kernel body
 * @param num_elements number of pixels to process
 */
__global__ void histogram_cuda_kernel(const unsigned char* src, int* dst,
                                      const unsigned int width,
                                      const unsigned int height,
                                      const unsigned int num_elements)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;

    // Surplus threads past the end of the data have nothing to do.
    if (!(idx < num_elements))
    {
        return;
    }

    const int col = idx % width;
    const int row = idx / width;

    const unsigned char pixel = src[row * width + col];
    atomicAdd(&dst[pixel], 1);
}
- 31. Histogram
msec MPixel / s
Corei7 2600K 0.734 2823.8
Corei7 4770K 0.273 3370.66
Titan 0.0816 25381.3
XeonPhi 117.5 17.6463
- 32. Histogram MIC
void
micHistogram_240(const
unsigned
char*
src,
int*
dst,
const
size_t
width,
const
size_t
height)
{
#pragma
omp
parallel
num_threads(240)
{
const
size_t
thread_id
=
omp_get_thread_num();
const
size_t
num_threads
=
omp_get_num_threads();
size_t
local_height
=
height
/
num_threads;
local_height
+=
(thread_id
%
2)?
0
:
1;
const
size_t
offset
=
5
*
thread_id
-‐
(thread_id
/
2);
std::vector<int>
local_dst(256);
std::vector<unsigned
char>
local_src(local_height
*
width);
memcpy(&local_src[0],
&src[offset
*
width],
sizeof(unsigned
char)
*
local_height
*
width);
for(size_t
y
=
0;
y
<
local_height;
++y)
{
for(size_t
x
=
0;
x
<
width;
++x)
{
size_t
val
=
local_src[y
*
width
+
x];
local_dst[val]++;
}
}
#pragma
omp
critical
{
for(size_t
i
=
0;
i
<
256;
++i)
{
dst[i]
+=
local_dst[i];
}
}
}
}
- 33. Histogram
msec MPixel / s
Corei7 2600K 0.734 2823.8
Corei7 4770K 0.273 3370.66
Titan 0.0816 25381.3
XeonPhi 2.074 999.806