86. アルゴリズムとスケジュール 86
// Applies a separable 3x3 box (mean) filter to `src` and returns the result
// as a new 8-bit buffer with the same width, height and channel count.
// The pipeline is written in Halide: the algorithm part defines *what* is
// computed, the schedule part defines *how* the loops are executed.
Buffer<uint8_t> boxFilter3x3(Buffer<uint8_t>& src)
{
    // ---- Declarations ----
    // Wrap the input in a repeat_edge boundary condition so the x-1 / x+1 /
    // y-1 / y+1 taps below are defined at the image border.
    Func clamped = BoundaryConditions::repeat_edge(src);
    Func widened, horiz, vert;
    Var x, y, c, xi, yi;

    // ---- Algorithm ----
    // Widen to 16 bits first so the sum of three 8-bit samples cannot wrap.
    widened(x, y, c) = cast<uint16_t>(clamped(x, y, c));
    // Horizontal pass: mean of the left, centre and right samples.
    horiz(x, y, c) = (widened(x - 1, y, c) + widened(x, y, c) + widened(x + 1, y, c)) / 3;
    // Vertical pass: mean of the upper, centre and lower rows, narrowed back to 8 bits.
    vert(x, y, c) = saturating_cast<uint8_t>(
        (horiz(x, y - 1, c) + horiz(x, y, c) + horiz(x, y + 1, c)) / 3);

    // ---- Schedule ----
    // Output: 256x32 tiles, inner x vectorized by 8, outer y run in parallel.
    vert.tile(x, y, xi, yi, 256, 32);
    vert.vectorize(xi, 8);
    vert.parallel(y);
    // Intermediate: computed and stored at the x loop level of the output,
    // with its own x dimension vectorized by 8.
    horiz.compute_at(vert, x);
    horiz.store_at(vert, x);
    horiz.vectorize(x, 8);

    // Compile and run the pipeline over the full extent of the input.
    Buffer<uint8_t> result = vert.realize(src.width(), src.height(), src.channels());
    return result;
}
90. Jonathan Ragan-Kelley, Andrew Adams, Sylvain Paris, Marc Levoy, Saman Amarasinghe, Fredo Durand, “Decoupling Algorithms from
Schedules for Easy Optimization of Image Processing Pipelines,” ACM Trans. on Graphics (SIGGRAPH), 2012.
「Halideの提案1」
Jonathan Ragan-Kelley, Connelly Barnes, Andrew Adams, Sylvain Paris, Fredo Durand, Saman Amarasinghe, “Halide: A Language and
Compiler for Optimizing Parallelism, Locality, and Recomputation in Image Processing Pipelines,” Proc. PLDI 2013.
「Halideの提案2」
Jonathan Ragan-Kelley, “Decoupling Algorithms from the Organization of Computation for High Performance Image Processing:
The design and implementation of the Halide language and compiler,” Ph.D. dissertation, MIT, May 2014.
「上をまとめた博士論文」
Gaurav Chaurasia, Jonathan Ragan-Kelley, Sylvain Paris, George Drettakis, Fredo Durand, “Compiling High Performance Recursive
Filters,” Proc. High-Performance Graphics (HPG), Pages 85-94, 2015.
「Halideのリカーシブフィルタ機能」
Ravi Teja Mullapudi, Andrew Adams, Dillon Sharlet, Jonathan Ragan-Kelley, Kayvon Fatahalian, “Automatically Scheduling Halide
Image Processing Pipelines,” ACM Trans. on Graphics (SIGGRAPH), 2016.
「スケジューラの自動最適化」
Liming Lou, Paul Nguyen, Jason Lawrence, Connelly Barnes, “Image Perforation: Automatically Accelerating Image Pipelines by
Intelligently Skipping Samples,” ACM Trans. on Graphics (SIGGRAPH), 2016.
「サブサンプリングベースの画像高速化のHalide実装」
Shih-Wei Liao, Sheng-Jun Tsai, Chieh-Hsun Yang, Chen-Kang Lo, “Locality-aware scheduling for stencil code in Halide,” International
Conference on Parallel Processing Workshops, 2016.
「ステンシル計算向けスケジューラ」
Felix Heide, Steven Diamond, Jonathan Ragan-Kelley, Matthias Nießner, Wolfgang Heidrich, Gordon Wetzstein, “ProxImaL: Efficient
Image Optimization using Proximal Algorithms,” ACM Trans. on Graphics (SIGGRAPH), 2016.
「最適化アルゴリズムのためのDSL。Halideがベース」
Patricia Suriana, Andrew Adams, Shoaib Kamil, “Parallel Associative Reductions in Halide,” Proc. International Symposium on Code
Generation and Optimization, 2017.
「リダクションの並列化」
Halide関連文献 90
124. キャッシュ階層とレイテンシ 124
CPU L1 L2
1-3
cycles
<10
cycles
>20
cycles
10-20
cycles
過去
現在
CPU Shared
Large L2
CPU
L1
L1
CPU
CPU
CPU
CPU
L1
L1
L1
L1
L2
L2
L2
L2
Shared
Large L3
Main
Memory
Main
Memory
Main
Memory
>300
cycles
容量小 大
アクセス速度速 遅
140. 行列積(MATMUL) 140
// Naive matrix product C = A x B with loop order i -> j -> k (k innermost).
// With k innermost, B[k][i] is read with its row index changing fastest,
// i.e. B is walked down a column on every inner iteration.
// NOTE(review): assumes C starts zeroed and that i, j, k, N, A, B, C are
// declared by the surrounding code — not visible here, confirm.
for (i=0; i<N; i++)
for (j=0; j<N; j++)
for (k=0; k<N; k++)
C[j][i] += A[j][k] * B[k][i]; // C = A x B
$C = A \times B$
i
j
k
k
縦方向のアクセスはキャッシュが
うまく使えていない
$N^2$ のデータに対して $O(N^3)$ の計算量
141. 行列積(MATMUL) 141
// Matrix product C = A x B with the loops reordered to j -> k -> i (i innermost).
// With i innermost, both C[j][i] and B[k][i] vary only in their last subscript
// inside the inner loop, and A[j][k] does not depend on i at all.
// NOTE(review): assumes C starts zeroed and that i, j, k, N, A, B, C are
// declared by the surrounding code — not visible here, confirm.
for (j=0; j<N; j++)
for (k=0; k<N; k++)
for (i=0; i<N; i++)
C[j][i] += A[j][k] * B[k][i]; // C = A x B
$C = A \times B$
i
j
k
ループ順序の入れ替え
i
142. 行列積(MATMUL) 142
// Matrix product C = A x B with the loops reordered to j -> k -> i (i innermost).
// With i innermost, both C[j][i] and B[k][i] vary only in their last subscript
// inside the inner loop, and A[j][k] does not depend on i at all.
// NOTE(review): assumes C starts zeroed and that i, j, k, N, A, B, C are
// declared by the surrounding code — not visible here, confirm.
for (j=0; j<N; j++)
for (k=0; k<N; k++)
for (i=0; i<N; i++)
C[j][i] += A[j][k] * B[k][i]; // C = A x B
$C = A \times B$
i
j
k+1
ループ順序の入れ替え
i
143. A
行列積(MATMUL) 143
// Matrix product C = A x B with the loops reordered to j -> k -> i (i innermost).
// With i innermost, both C[j][i] and B[k][i] vary only in their last subscript
// inside the inner loop, and A[j][k] does not depend on i at all.
// NOTE(review): assumes C starts zeroed and that i, j, k, N, A, B, C are
// declared by the surrounding code — not visible here, confirm.
for (j=0; j<N; j++)
for (k=0; k<N; k++)
for (i=0; i<N; i++)
C[j][i] += A[j][k] * B[k][i]; // C = A x B
$C = A \times B$
i
j
k+n
ループ順序の入れ替え
i
144. 行列積(MATMUL) 144
$C = A \times B$
i
j
k
タイリング
i
// Tiled matrix product C = A x B.
// The k and i ranges are each split at N/2, so for every row j the matrix B is
// visited as four tiles. Every (k, i) pair must be visited exactly once per j,
// otherwise some products are added twice and others are never added.
// The braces on the j loop are required: without them only the first tile's
// loop nest belongs to the j loop and the other three run once with j == N.
// NOTE(review): assumes C starts zeroed and that i, j, k, N, A, B, C are
// declared by the surrounding code — not visible here, confirm.
for (j=0; j<N; j++) {
    // Tile 1: k in [0, N/2), i in [0, N/2)
    for (k=0; k<N/2; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 2: k in [0, N/2), i in [N/2, N)
    for (k=0; k<N/2; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 3: k in [N/2, N), i in [0, N/2)
    for (k=N/2; k<N; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 4: k in [N/2, N), i in [N/2, N)
    for (k=N/2; k<N; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
}
145. 行列積(MATMUL) 145
$C = A \times B$
i
j
k
タイリング
i
// Tiled matrix product C = A x B.
// The k and i ranges are each split at N/2, so for every row j the matrix B is
// visited as four tiles. Every (k, i) pair must be visited exactly once per j,
// otherwise some products are added twice and others are never added.
// The braces on the j loop are required: without them only the first tile's
// loop nest belongs to the j loop and the other three run once with j == N.
// NOTE(review): assumes C starts zeroed and that i, j, k, N, A, B, C are
// declared by the surrounding code — not visible here, confirm.
for (j=0; j<N; j++) {
    // Tile 1: k in [0, N/2), i in [0, N/2)
    for (k=0; k<N/2; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 2: k in [0, N/2), i in [N/2, N)
    for (k=0; k<N/2; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 3: k in [N/2, N), i in [0, N/2)
    for (k=N/2; k<N; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 4: k in [N/2, N), i in [N/2, N)
    for (k=N/2; k<N; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
}
146. 行列積(MATMUL) 146
$C = A \times B$
i
j
k
タイリング
i
// Tiled matrix product C = A x B.
// The k and i ranges are each split at N/2, so for every row j the matrix B is
// visited as four tiles. Every (k, i) pair must be visited exactly once per j,
// otherwise some products are added twice and others are never added.
// The braces on the j loop are required: without them only the first tile's
// loop nest belongs to the j loop and the other three run once with j == N.
// NOTE(review): assumes C starts zeroed and that i, j, k, N, A, B, C are
// declared by the surrounding code — not visible here, confirm.
for (j=0; j<N; j++) {
    // Tile 1: k in [0, N/2), i in [0, N/2)
    for (k=0; k<N/2; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 2: k in [0, N/2), i in [N/2, N)
    for (k=0; k<N/2; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 3: k in [N/2, N), i in [0, N/2)
    for (k=N/2; k<N; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 4: k in [N/2, N), i in [N/2, N)
    for (k=N/2; k<N; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
}
147. 行列積(MATMUL) 147
$C = A \times B$
i
j
k
タイリング
i
// Tiled matrix product C = A x B.
// The k and i ranges are each split at N/2, so for every row j the matrix B is
// visited as four tiles. Every (k, i) pair must be visited exactly once per j,
// otherwise some products are added twice and others are never added.
// The braces on the j loop are required: without them only the first tile's
// loop nest belongs to the j loop and the other three run once with j == N.
// NOTE(review): assumes C starts zeroed and that i, j, k, N, A, B, C are
// declared by the surrounding code — not visible here, confirm.
for (j=0; j<N; j++) {
    // Tile 1: k in [0, N/2), i in [0, N/2)
    for (k=0; k<N/2; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 2: k in [0, N/2), i in [N/2, N)
    for (k=0; k<N/2; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 3: k in [N/2, N), i in [0, N/2)
    for (k=N/2; k<N; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 4: k in [N/2, N), i in [N/2, N)
    for (k=N/2; k<N; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
}
148. 行列積(MATMUL) 148
$C = A \times B$
i
j
k
タイリング
i
// Tiled matrix product C = A x B.
// The k and i ranges are each split at N/2, so for every row j the matrix B is
// visited as four tiles. Every (k, i) pair must be visited exactly once per j,
// otherwise some products are added twice and others are never added.
// The braces on the j loop are required: without them only the first tile's
// loop nest belongs to the j loop and the other three run once with j == N.
// NOTE(review): assumes C starts zeroed and that i, j, k, N, A, B, C are
// declared by the surrounding code — not visible here, confirm.
for (j=0; j<N; j++) {
    // Tile 1: k in [0, N/2), i in [0, N/2)
    for (k=0; k<N/2; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 2: k in [0, N/2), i in [N/2, N)
    for (k=0; k<N/2; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 3: k in [N/2, N), i in [0, N/2)
    for (k=N/2; k<N; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 4: k in [N/2, N), i in [N/2, N)
    for (k=N/2; k<N; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
}
149. 行列積(MATMUL) 149
$C = A \times B$
i
j
k
タイリング
i
// Tiled matrix product C = A x B.
// The k and i ranges are each split at N/2, so for every row j the matrix B is
// visited as four tiles. Every (k, i) pair must be visited exactly once per j,
// otherwise some products are added twice and others are never added.
// The braces on the j loop are required: without them only the first tile's
// loop nest belongs to the j loop and the other three run once with j == N.
// NOTE(review): assumes C starts zeroed and that i, j, k, N, A, B, C are
// declared by the surrounding code — not visible here, confirm.
for (j=0; j<N; j++) {
    // Tile 1: k in [0, N/2), i in [0, N/2)
    for (k=0; k<N/2; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 2: k in [0, N/2), i in [N/2, N)
    for (k=0; k<N/2; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 3: k in [N/2, N), i in [0, N/2)
    for (k=N/2; k<N; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 4: k in [N/2, N), i in [N/2, N)
    for (k=N/2; k<N; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
}
150. 行列積(MATMUL) 150
$C = A \times B$
i
j
k
タイリング
i
// Tiled matrix product C = A x B.
// The k and i ranges are each split at N/2, so for every row j the matrix B is
// visited as four tiles. Every (k, i) pair must be visited exactly once per j,
// otherwise some products are added twice and others are never added.
// The braces on the j loop are required: without them only the first tile's
// loop nest belongs to the j loop and the other three run once with j == N.
// NOTE(review): assumes C starts zeroed and that i, j, k, N, A, B, C are
// declared by the surrounding code — not visible here, confirm.
for (j=0; j<N; j++) {
    // Tile 1: k in [0, N/2), i in [0, N/2)
    for (k=0; k<N/2; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 2: k in [0, N/2), i in [N/2, N)
    for (k=0; k<N/2; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 3: k in [N/2, N), i in [0, N/2)
    for (k=N/2; k<N; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 4: k in [N/2, N), i in [N/2, N)
    for (k=N/2; k<N; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
}
この領域が繰り返し処理される
→最も変動する場所が局所化
→近いキャッシュに乗ったまま処理可能
151. 行列積(MATMUL) 151
$C = A \times B$
i
j
k
タイリング
i
// Tiled matrix product C = A x B.
// The k and i ranges are each split at N/2, so for every row j the matrix B is
// visited as four tiles. Every (k, i) pair must be visited exactly once per j,
// otherwise some products are added twice and others are never added.
// The braces on the j loop are required: without them only the first tile's
// loop nest belongs to the j loop and the other three run once with j == N.
// NOTE(review): assumes C starts zeroed and that i, j, k, N, A, B, C are
// declared by the surrounding code — not visible here, confirm.
for (j=0; j<N; j++) {
    // Tile 1: k in [0, N/2), i in [0, N/2)
    for (k=0; k<N/2; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 2: k in [0, N/2), i in [N/2, N)
    for (k=0; k<N/2; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 3: k in [N/2, N), i in [0, N/2)
    for (k=N/2; k<N; k++)
        for (i=0; i<N/2; i++)
            C[j][i] += A[j][k] * B[k][i];
    // Tile 4: k in [N/2, N), i in [N/2, N)
    for (k=N/2; k<N; k++)
        for (i=N/2; i<N; i++)
            C[j][i] += A[j][k] * B[k][i];
}
残った領域もブロックで処理