More Related Content Similar to Cvim saisentan 半精度浮動小数点数 half
Similar to Cvim saisentan 半精度浮動小数点数 half (20) More from tomoaki0705 (20) Cvim saisentan 半精度浮動小数点数 half8. 用意するもの
• Linux が走るARM
• Raspberry Pi zero/1/2/3
• ODROID XU4/C2
• Jetson TK1/TX1
• PINE64
• 赤字は64bit対応
• 実機買った方が開発には最適!
9. 試してみよう
/*
 * Demonstrates the ARM half-precision storage type __fp16.
 *
 * Prints a greeting, then the value, the size and the raw 16-bit pattern of
 * a single __fp16 holding 1.5, and finally the raw bit patterns of the
 * integers 1..16 after conversion to __fp16.
 *
 * argc/argv are unused. Always returns 0.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    /* Reinterpret the bits through a union: dereferencing a (short *) cast
     * of an __fp16 object violates strict aliasing, and a signed short would
     * be sign-extended when printed with %x. */
    union {
        __fp16 half;
        unsigned short bits;
    } pun;

    printf("Hello World !!\n");

    __fp16 halfPrecision = 1.5f;
    pun.half = halfPrecision;
    /* The cast makes the promotion expected by %f explicit. */
    printf("half precision:%f\n", (double)halfPrecision);
    /* sizeof yields size_t, which needs %zu rather than %d. */
    printf("half precision:sizeof %zu\n", sizeof(halfPrecision));
    printf("half precision:0x%04x\n", (unsigned int)pun.bits);

    float original[] = { 1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,
                         9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, };
    /* Derive the element count from the array instead of repeating 16. */
    const unsigned int count = (unsigned int)(sizeof(original) / sizeof(original[0]));

    for (unsigned int i = 0; i < count; i++)
    {
        pun.half = original[i]; /* float -> half conversion happens here */
        printf("%2d 0x%04x\n", (int)original[i], (unsigned int)pun.bits);
    }
    return 0;
}
https://github.com/tomoaki0705/sampleFp16
11. 実行結果
0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0
1/2
1/1024
1/4
1/8
1/16
1/32
1/64
1/128
1/256
1/512
$2^{(17-15)} \times \left(1 + \frac{1}{2} + \frac{1}{4}\right) = 2^{2} \times \frac{7}{4} = 7$
符号ビット(+)
指数部(17)
仮数部
指数部が全部0だとsubnormal、
指数部が全部1だとInfもしくはNaNと定義されている
13. アセンブラで見てみよう
• まさかのソフト実装!
• がっかり・・・・
• 一日一回感謝のアセンブラ
$ gcc -S -std=c99 -mfp16-format=ieee -o main.c.s main.c
movw r3, #15872 ←0x3e00
strh r3, [r7, #8] @ __fp16 ←stackに保存
ldrh r3, [r7, #8] @ __fp16 ←stackから読込
mov r0, r3 @ __fp16 ←レジスタに移動
bl __gnu_h2f_ieee ←関数コール
(half2float)
14. ARMのhalf変換命令
•half ←→floatの変換命令
• VCVTB.F16.F32 (float→half)
• VCVTB.F32.F16 (half→float)
• VCVTT.F16.F32 (float→half)
• VCVTT.F32.F16 (half→float)
http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204ij/CJAGIFIJ.html
16. アセンブラで見てみよう2
movw r3, #15872
strh r3, [r7, #8] @ __fp16
add r2, r7, #8
vld1.16 {d7[2]}, [r2]
vcvtb.f32.f16 s15, s15
movw r3, #15872
strh r3, [r7, #8] @ __fp16
ldrh r3, [r7, #8] @ __fp16
mov r0, r3 @ __fp16
bl __gnu_h2f_ieee
FPUオプション無し FPU=vfpv4
19. ARMでのfp16命令(SIMD)
• vcvtはベクトルのV
• SIMDで演算してみよう!
• 浮動小数点間の変換
• float16x4_t vcvt_f16_f32(float32x4_t a);
• VCVT.F16.F32 d0, q0
• float32x4_t vcvt_f32_f16(float16x4_t a);
• VCVT.F32.F16 q0, d0
http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0348bj/BABGABJH.html
20. ベクトル演算してみよう
const unsigned int cParallel = 8;
for (unsigned int x = 0;x <= cSize - cParallel;x += cParallel)
{
uint8x8_t srcInteger = vld1_u8(src+x); // load 64bits
float16x4_t gainHalfLow = *(float16x4_t*)(gain + x ); // load 32bits
float16x4_t gainHalfHigh = *(float16x4_t*)(gain + x + 4 ); // load 32bits
uint16x8_t srcIntegerShort = vmovl_u8(srcInteger); // uchar -> ushort
uint32x4_t srcIntegerLow = vmovl_u16(vget_low_s16 (srcIntegerShort)); // ushort -> uint
uint32x4_t srcIntegerHigh = vmovl_u16(vget_high_s16(srcIntegerShort)); // ushort -> uint
float32x4_t srcFloatLow = vcvtq_f32_u32(srcIntegerLow ); // uint -> float
float32x4_t srcFloatHigh = vcvtq_f32_u32(srcIntegerHigh); // uint -> float
float32x4_t gainFloatLow = vcvt_f32_f16(gainHalfLow ); // half -> float
float32x4_t gainFloatHigh = vcvt_f32_f16(gainHalfHigh); // half -> float
float32x4_t dstFloatLow = vmulq_f32(srcFloatLow, gainFloatLow ); // float * float
float32x4_t dstFloatHigh = vmulq_f32(srcFloatHigh, gainFloatHigh); // float * float
uint32x4_t dstIntegerLow = vcvtq_u32_f32(dstFloatLow ); // float -> uint
uint32x4_t dstIntegerHigh = vcvtq_u32_f32(dstFloatHigh); // float -> uint
uint16x8_t dstIntegerShort =
vcombine_u16(vmovn_u16(dstIntegerLow), vmovn_u16(dstIntegerHigh)); // uint -> ushort
uint8x8_t dstInteger = vmovn_u16(dstIntegerShort); // ushort -> uchar
vst1_u8(dst+x, dstInteger); // store
}
https://github.com/tomoaki0705/sampleFp16Vector
21. さすがに読みにくいので
const unsigned int cParallel = 8;
for (unsigned int x = 0;x <= cSize - cParallel;x += cParallel)
{
uchar8 srcInteger = load_uchar8(src+x); // load 64bits
half4 gainHalfLow = load_half4(gain + x ); // load 32bits
half4 gainHalfHigh = load_half4(gain + x + 4 ); // load 32bits
ushort8 srcIntegerShort = convert_uchar8_ushort8(srcInteger); // uchar -> ushort
uint4 srcIntegerLow = convert_ushort8_lo_uint4(srcIntegerShort); // ushort -> uint
uint4 srcIntegerHigh = convert_ushort8_hi_uint4(srcIntegerShort); // ushort -> uint
float4 srcFloatLow = convert_uint4_float4(srcIntegerLow ); // uint -> float
float4 srcFloatHigh = convert_uint4_float4(srcIntegerHigh); // uint -> float
float4 gainFloatLow = convert_half4_float4(gainHalfLow ); // half -> float
float4 gainFloatHigh = convert_half4_float4(gainHalfHigh); // half -> float
float4 dstFloatLow = multiply_float4(srcFloatLow , gainFloatLow ); // float * float
float4 dstFloatHigh = multiply_float4(srcFloatHigh, gainFloatHigh); // float * float
uint4 dstIntegerLow = convert_float4_uint4(dstFloatLow ); // float -> uint
uint4 dstIntegerHigh = convert_float4_uint4(dstFloatHigh); // float -> uint
ushort8 dstIntegerShort =
convert_uint4_ushort8(dstIntegerLow, dstIntegerHigh); // uint -> ushort
uchar8 dstInteger = convert_ushort8_uchar8(dstIntegerShort); // ushort -> uchar
store_uchar8(dst + x, dstInteger); // store
}
26. ベクトル演算してみよう
const unsigned int cParallel = 8;
for (unsigned int x = 0;x <= cSize - cParallel;x += cParallel)
{
__m128i srcInteger = _mm_loadl_epi64((__m128i const *)(src + x)); // load 64bits
__m128i gainHalfLow = _mm_loadl_epi64((__m128i const *)(gain + x )); // load 32bits
__m128i gainHalfHigh = _mm_loadl_epi64((__m128i const *)(gain + x + 4)); // load 32bits
__m128i srcIntegerShort = _mm_unpacklo_epi8(srcInteger, v_zero); // uchar -> ushort
__m128i srcIntegerLow = _mm_unpacklo_epi16(srcIntegerShort, v_zero); // ushort -> uint
__m128i srcIntegerHigh = _mm_unpackhi_epi16(srcIntegerShort, v_zero); // ushort -> uint
__m128i srcFloatLow = _mm_cvtepi32_ps(srcIntegerLow ); // uint -> float
__m128i srcFloatHigh = _mm_cvtepi32_ps(srcIntegerHigh); // uint -> float
__m128 gainFloatLow = _mm_cvtph_ps(gainHalfLow ); // half -> float
__m128 gainFloatHigh = _mm_cvtph_ps(gainHalfHigh); // half -> float
__m128 dstFloatLow = _mm_mul_ps(srcFloatLow , gainFloatLow ); // float * float
__m128 dstFloatHigh = _mm_mul_ps(srcFloatHigh, gainFloatHigh); // float * float
__m128i dstIntegerLow = _mm_cvtps_epi32(dstFloatLow ); // float -> uint
__m128i dstIntegerHigh = _mm_cvtps_epi32(dstFloatHigh); // float -> uint
__m128i dstIntegerShort = _mm_packs_epi32(dstIntegerLow, dstIntegerHigh); // uint -> ushort
__m128i dstInteger = _mm_packus_epi16(dstIntegerShort, v_zero); // ushort -> uchar
_mm_storel_epi64((__m128i *)(dst + x), dstInteger); // store
}
https://github.com/tomoaki0705/sampleFp16Vector
27. さすがに読みにくいので
const unsigned int cParallel = 8;
for (unsigned int x = 0;x <= cSize - cParallel;x += cParallel)
{
uchar8 srcInteger = load_uchar8(src+x); // load 64bits
half4 gainHalfLow = load_half4(gain + x ); // load 32bits
half4 gainHalfHigh = load_half4(gain + x + 4 ); // load 32bits
ushort8 srcIntegerShort = convert_uchar8_ushort8(srcInteger); // uchar -> ushort
uint4 srcIntegerLow = convert_ushort8_lo_uint4(srcIntegerShort); // ushort -> uint
uint4 srcIntegerHigh = convert_ushort8_hi_uint4(srcIntegerShort); // ushort -> uint
float4 srcFloatLow = convert_uint4_float4(srcIntegerLow ); // uint -> float
float4 srcFloatHigh = convert_uint4_float4(srcIntegerHigh); // uint -> float
float4 gainFloatLow = convert_half4_float4(gainHalfLow ); // half -> float
float4 gainFloatHigh = convert_half4_float4(gainHalfHigh); // half -> float
float4 dstFloatLow = multiply_float4(srcFloatLow , gainFloatLow ); // float * float
float4 dstFloatHigh = multiply_float4(srcFloatHigh, gainFloatHigh); // float * float
uint4 dstIntegerLow = convert_float4_uint4(dstFloatLow ); // float -> uint
uint4 dstIntegerHigh = convert_float4_uint4(dstFloatHigh); // float -> uint
ushort8 dstIntegerShort = convert_uint4_ushort8(dstIntegerLow, dstIntegerHigh);// uint -> ushort
uchar8 dstInteger = convert_ushort8_uchar8(dstIntegerShort); // ushort -> uchar
store_uchar8(dst + x, dstInteger); // store
}
$ gcc -mf16c main.cpp
32. まとめ
• ARM
• めでたしめでたし
• ARM(SIMD)
• めでたしめでたし
• Intel,AMD (x86)
• x86でもSSEの中に変換命令がある
• Ivy Bridge以降のIntelとPiledriver以降のAMDで利用可
• めでたしめでたし
• CUDA
https://blogs.msdn.microsoft.com/chuckw/2012/09/11/directxmath-f16c-and-fma/
33. CUDA
// Fragment of a per-pixel computation: scales the three channel values of
// pixel (x, y) by a gain read from g_indata and writes them to g_odata.
// x, y, imgw, g_indata, imageData and g_odata are defined outside this
// fragment.
// The gain is read as a raw 16-bit value and converted with __half2float.
// NOTE(review): presumably g_indata holds half-precision bit patterns --
// confirm against the host code that fills it.
unsigned short a = g_indata[y*imgw+x];
float gain;
gain = __half2float(a);
// Three consecutive values per pixel are read at offsets +0, +1, +2; the
// variable names suggest B, G, R channel order.
float b = imageData[(y*imgw+x)*3 ];
float g = imageData[(y*imgw+x)*3+1];
float r = imageData[(y*imgw+x)*3+2];
// Each product is passed through clamp(..., 0.0f, 255.0f) before the store.
// NOTE(review): clamp is not defined in this fragment -- presumably a helper
// that limits the value to [0, 255]; confirm its origin.
g_odata[(y*imgw+x)*3 ] = clamp(b * gain, 0.0f, 255.0f);
g_odata[(y*imgw+x)*3+1] = clamp(g * gain, 0.0f, 255.0f);
g_odata[(y*imgw+x)*3+2] = clamp(r * gain, 0.0f, 255.0f);
35. まとめ
• ARM
• めでたしめでたし
• ARM(SIMD)
• めでたしめでたし
• Intel,AMD (x86)
• めでたしめでたし
• CUDA
• CUDA 7.5から正式サポート
• Half での演算命令はPascalから搭載予定→搭載発表←New!!
• 一部の演算命令は既にJetson TX1で利用可能
• 変換命令自体は割と古いGPUでもHW的に存在する
http://www.slideshare.net/NVIDIAJapan/1071-gpu-cuda-75maxwell
http://pc.watch.impress.co.jp/docs/news/event/20160406_751833.html
37. 半精度浮動小数点数の限界 –
オーバーフロー
• float の最大
• 指数部8bit、仮数部23bit→ 約3.4×10^38 まで扱える
• signed intの最大 + 2,147,483,647 より大きい
• half の最大
• 指数部5bit、仮数部10bit→65504まで扱える
• unsigned shortの最大 +65535より小さい!
38. 半精度浮動小数点数の限界 –
丸め誤差
• floatの丸め誤差
• 16777216(=2^24) までは整数を正確に表記できる
• halfの丸め誤差
• 2048 (=2^11)までは整数を正確に表記できる
• 1024-2048 のレンジだと小数点以下の情報は失われる
• 512-1024 のレンジだと0.5刻みでしか表現できない
• Ex. 180.5 + 178.2 + 185.2 + 150.3 + 160.3 = 854.5
• 正しい平均値 170.9
• Halfで計算 171.0 ←丸め誤差0.1
40. 指数で計算する場合
n Fn pnの結果 桁数(10進) 桁数(2進)
73 806515533049393 806515533049393 15 50
74 1304969544928657 1304969544928657 16 51
75 2111485077978050 2111485077978050 16 51
76 3416454622906707 3416454622906706 16 52
77 5527939700884757 5527939700884756 16 53
78 8944394323791464 8944394323791464 16 53
79 14472334024676221 14472334024676218 17 54
41. 参考文献
1. 半精度浮動小数点数 - Wikipedia
https://ja.wikipedia.org/wiki/半精度浮動小数点数
2. tomoaki0705/sampleFp16: sample code to treat FP16 on ARM
https://github.com/tomoaki0705/sampleFp16
3. ARM Information Center
http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204ij/CJAGIFIJ.html
4. ARM Information Center
http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0348bj/BABGABJH.html
5. tomoaki0705/sampleFp16Vector: float16bit sample code on x86 and ARM
https://github.com/tomoaki0705/sampleFp16Vector
6. opengl:fpu_vfp [HYPERでんち]
http://dench.flatlib.jp/opengl/fpu_vfp
7. ARMのFPU - AkiWiki
http://tessy.org/wiki/index.php?ARM%A4%CEFPU
8. F16C - Wikipedia, the free encyclopedia
https://en.wikipedia.org/wiki/F16C
9. DirectXMath: F16C and FMA | Games for Windows and the DirectX SDK
https://blogs.msdn.microsoft.com/chuckw/2012/09/11/directxmath-f16c-and-fma/
10. 1071: GPUコンピューティング最新情報 ~ CUDA 7.5とMaxwellアーキテクチャ ~
http://www.slideshare.net/NVIDIAJapan/1071-gpu-cuda-75maxwell
11. 【イベントレポート】次世代GPUアーキテクチャ「Pascal」が明らかに ~HBM2による720GB/secの超広帯域など - PC Watch
http://pc.watch.impress.co.jp/docs/news/event/20160406_751833.html
12. 結城浩の「マヨイドーロ問題」 | CodeIQ
https://codeiq.jp/q/2549
13. 結城浩の「マヨイドーロ問題」解説|CodeIQ MAGAZINE
https://codeiq.jp/magazine/2015/12/35521/