ビットを数える(x86/x64最適化勉強会1)サイボウズ・ラボ株式会社竹迫 良範<takesako@x86.cx>
ビットを数える
例題
ここにいるバイナリアンの数は何人か?
机 1 0 0 1 = 0x09 (ビット2個)
机 1 1 1 1 = 0x0F (ビット4個)
机 1 1 1 0 = 0x0E (ビット3個)
机 1 1 1 1 = 0x0F (ビット4個)
机 1 1 1 1 = 0x0F (ビット4個)
合計 17人
【閑話】Binary Hacker 中村実さんの日記http://www.nminoru.jp/~nminoru/programming/bitcount.html
(1) popCount 8bit If1 byte あたり 8回条件分岐して bit を数える	for (int i = 0; i < n; i++) {		if (*x & 0x01) c++;		if (*x & 0x02) c++;		if (*x & 0x04) c++;		if (*x & 0x08) c++;		if (*x & 0x10) c++;		if (*x & 0x20) c++;		if (*x & 0x40) c++;		if (*x & 0x80) c++;		x++;}
(2) popCount 8bit Table256 byte のテーブルを作成して表引きstatic const char popTable8bit[] = {  0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,  1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,  1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,  2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,  1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,  2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,  2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,  3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8};for (int i = 0; i < n; i++) {	c += popTable8bit[(uint8)*x++];}
(3) popCount 16bit Table
64KB のテーブルを作成して表引き
static char popTable16bit[256 * 256];
void _popCount16bitTableInit(void) {
    for (int i = 0; i < 256; i++) {
        for (int j = 0; j < 256; j++) {
            popTable16bit[i*256 + j] = popTable8bit[i] + popTable8bit[j];
        }
    }
}
for (int i = 0; i < n; i++) {
    c += popTable16bit[(uint16)*w++];
}
http://www.amazon.co.jp/dp/4434046683http://www.hackersdelight.org/
(4) popCount 32bit AND(SHR + ADD)ビットのふるいにかけながら足す   x = ((x & 0xaaaaaaaaUL) >> 1)       +  (x & 0x55555555UL);     x = ((x & 0xccccccccUL) >> 2)       +  (x & 0x33333333UL);     x = ((x & 0xf0f0f0f0UL) >> 4)       +  (x & 0x0f0f0f0fUL);     x = ((x & 0xff00ff00UL) >> 8)       +  (x & 0x00ff00ffUL);     x = ((x & 0xffff0000UL) >> 16)       +  (x & 0x0000ffffUL);
ビットのふるいにかけながら足す(16bit)0xAAAA   1010  1010  1010  1010 >> 10x5555 + 0101  0101  0101  0101 ---------------------0xCCCC 1100  1100  1100  1100 >> 20x3333 + 0011  0011  0011  0011 --------------------0xF0F0   1111  0000  1111  0000 >> 40x0F0F + 0000  1111  0000  1111 ----------------0xFF00   1111  1111  0000  0000 >> 80x00FF + 0000  0000  1111  1111----------
ビットのふるいにかけながら足す(16bit)0x5555   >1010  1010  1010  101 (>> 1)0x5555 + 0101  0101  0101  0101 ---------------------0x3333   >>1100  1100  1100  11 (>> 2)0x3333 + 0011  0011  0011  0011 --------------------0x0F0F   >>>>  1111  0000  1111 (>> 4)0x0F0F + 0000  1111  0000  1111 ----------------0x00FF   >>>>  >>>>  1111  1111 (>> 8)0x00FF + 0000  0000  1111  1111----------
(5) popCount 64bit AND (SHR + ADD)
ビットのふるいにかけながら足す64bit対応
x = ((x & 0xaaaaaaaaaaaaaaaaULL) >> 1)
  +  (x & 0x5555555555555555ULL);
x = ((x & 0xccccccccccccccccULL) >> 2)
  +  (x & 0x3333333333333333ULL);
x = ((x & 0xf0f0f0f0f0f0f0f0ULL) >> 4)
  +  (x & 0x0f0f0f0f0f0f0f0fULL);
x = ((x & 0xff00ff00ff00ff00ULL) >> 8)
  +  (x & 0x00ff00ff00ff00ffULL);
x = ((x & 0xffff0000ffff0000ULL) >> 16)
  +  (x & 0x0000ffff0000ffffULL);
x = ((x & 0xffffffff00000000ULL) >> 32) //
  +  (x & 0x00000000ffffffffULL);       //
(6) popCount 64bit MMX + SSE (psadbw)
__asm {
   MOVD      MM0, [v+0]  ;v_low
   PUNPCKLDQ MM0, [v+4]  ;v
   MOVQ   MM1, MM0       ;v
   PSRLD  MM0, 1         ;v >> 1
   PAND   MM0, [C55]     ;(v >> 1) & 0x55555555
   PSUBD  MM1, MM0       ;w = v - ((v >> 1) & 0x55555555)
   MOVQ   MM0, MM1       ;w
   PSRLD  MM1, 2         ;w >> 2
   PAND   MM0, [C33]     ;w & 0x33333333
   PAND   MM1, [C33]     ;(w >> 2) & 0x33333333
   PADDD  MM0, MM1       ;x = (w & 0x33333333) + ((w >> 2) & 0x33333333)
   MOVQ   MM1, MM0       ;x
   PSRLD  MM0, 4         ;x >> 4
   PADDD  MM0, MM1       ;x + (x >> 4)
   PAND   MM0, [C0F]     ;y = (x + (x >> 4)) & 0x0F0F0F0F
   PXOR   MM1, MM1       ;0
   PSADBW MM0, MM1       ;sum all 8 bytes (Sum of Absolute Differences)
   MOVD   EAX, MM0       ;result in EAX per calling convention
   EMMS                  ;clear MMX state
   MOV retVal, EAX       ;store result
}
http://support.amd.com/us/Processor_TechDocs/25112.PDF  (pp.179-180)
(7) popCount 32bit MUL (no MMX, no SSE)
__asm {
   MOV EAX, [v]         ;v
   MOV EDX, EAX         ;v
   SHR EAX, 1           ;v >> 1
   AND EAX, 055555555h  ;(v >> 1) & 0x55555555
   SUB EDX, EAX         ;w = v - ((v >> 1) & 0x55555555)
   MOV EAX, EDX         ;w
   SHR EDX, 2           ;w >> 2
   AND EAX, 033333333h  ;w & 0x33333333
   AND EDX, 033333333h  ;(w >> 2) & 0x33333333
   ADD EAX, EDX         ;x = (w & 0x33333333) + ((w >> 2) & 0x33333333)
   MOV EDX, EAX         ;x
   SHR EAX, 4           ;x >> 4
   ADD EAX, EDX         ;x + (x >> 4)
   AND EAX, 00F0F0F0Fh  ;y = (x + (x >> 4)) & 0x0F0F0F0F
   IMUL EAX, 001010101h ;y * 0x01010101
   SHR EAX, 24          ;population count = (y * 0x01010101) >> 24
   MOV retVal, EAX      ;store result
}
Intel SSE4.2 では専用命令 POPCNT が追加http://intel.wingateweb.com/US08/published/sessions/SVRS005/SF08_SVRS005_100r.pdf
Intel SSE4.2 INSTRUCTION SET- POPCNT#include "intrin.h"POPCNT int     _mm_popcnt_u32(unsigned   int   a);POPCNT int64_t _mm_popcnt_u64(unsigned __int64 a);http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
(8) popCount 32bit SSE4.2 (POPCNT)
_mm_popcnt_u32()
#include "intrin.h"
size_t popCount32bitSSE42(char *x, int n)
{
	uint32 *y = (uint32 *)x;
	size_t c = 0;
	for (int i = 0; i < n; i += 4) {
		c += _mm_popcnt_u32 (*y++);
	}
	return c;
}
(9) popCount 64bit SSE4.2 (POPCNT)
_mm_popcnt_u64()   // ※ 32bitモードでは実行不可
#include "intrin.h"
size_t popCount64bitSSE42(char *x, int n)
{
	uint64 *z = (uint64 *)x;
	size_t c = 0;
	for (int i = 0; i < n; i += 8) {
		c += _mm_popcnt_u64 (*z++);
	}
	return c;
}
【実験環境】SSE4.2 (32bit/64bit)DELL Vostro DT 430 (2009年に購入)Core i7 860 @ 2.80 GHz (45nm Lynnfield)MMX, SSE(1, 2, 3, 3S, 4.1, 4.2), EM64T, VT-xWindows 7 Professional (64bit)Visual Studio 2008 (x64) 64bit C/C++ for amd64Visual Studio 2008 (x86) 32bit C/C++ for 80x86注意点最近の Core i5 / Core i7 の省エネ機能Turbo Boost の機能で負荷に応じてクロックが変わるベンチマーク結果が不安定に
Core Speed ↓×9倍1197.0 MHz(低負荷時)
Core Speed ↑×21倍 2792.9 MHz(高負荷時)
BIOS で Intel® SpeedStep™ tech. を Disable に
実験結果1(32bit 最適化/Ox/arch:SSE2)100KB 中の1bitを数える時間(単位:K clk)
実験結果1(32bit 最適化/Ox/arch:SSE2)
実験結果1(32bit 最適化/Ox/arch:SSE2)
実験結果1(32bit 最適化/Ox/arch:SSE2)
実験結果2(64bit 最適化/Ox/favor:INTEL64)100KB 中の1bitを数える時間(単位:K clk)
実験結果2(64bit 最適化/Ox/favor:INTEL64)
実験結果2(64bit 最適化/Ox/favor:INTEL64)
まとめ
SSE4.2 POPCNT命令が最速
32bit 約0.8 (Clk/Byte) 0.1クロックで1bit
64bit 約0.4 (Clk/Byte) 0.1クロックで2bit
x64 環境では表引きが遅くなる現象が…
x86:8bit 約2.1(c/B):16bit約1.1(c/B)
x64:8bit 約3.2(c/B):16bit約1.6(c/B)
原因のわかる人がいたら教えてください m(__)m
応用例
画像処理、機械学習、パターン認識、疎行列
SSE4.2文字列命令との併用(ワードカウント)

x86x64 SSE4.2 POPCNT

  • 1.
  • 2.
    ビットを数える例題ここにいるバイナリアンの数は何人か?机 1 0 0 1 = 0x09 (ビット2個)机 1 1 1 1 = 0x0F (ビット4個)机 1 1 1 0 = 0x0E (ビット3個)机 1 1 1 1 = 0x0F (ビット4個)机 1 1 1 1 = 0x0F (ビット4個)合計 17人
  • 3.
  • 4.
    (1) popCount 8bitIf1 byte あたり 8回条件分岐して bit を数える for (int i = 0; i < n; i++) { if (*x & 0x01) c++; if (*x & 0x02) c++; if (*x & 0x04) c++; if (*x & 0x08) c++; if (*x & 0x10) c++; if (*x & 0x20) c++; if (*x & 0x40) c++; if (*x & 0x80) c++; x++;}
  • 5.
    (2) popCount 8bitTable256 byte のテーブルを作成して表引きstatic const char popTable8bit[] = { 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8};for (int i = 0; i < n; i++) { c += popTable8bit[(uint8)*x++];}
  • 6.
    (3) popCount 16bitTable64KB のテーブルを作成して表引きstatic char popTable16bit[256 * 256];void _popCount16bitTableInit(void) { for (int i = 0; i < 256; i++) { for (int j = 0; j < 256; j++) { popTable16bit[i*256 + j]= popTable8bit[i] + popTable8bit[j]; } }}for (int i = 0; i < n; i++) { c += popTable16bit[(uint16)*w++];}
  • 7.
  • 8.
    (4) popCount 32bitAND(SHR + ADD)ビットのふるいにかけながら足す x = ((x & 0xaaaaaaaaUL) >> 1) + (x & 0x55555555UL); x = ((x & 0xccccccccUL) >> 2) + (x & 0x33333333UL); x = ((x & 0xf0f0f0f0UL) >> 4) + (x & 0x0f0f0f0fUL); x = ((x & 0xff00ff00UL) >> 8) + (x & 0x00ff00ffUL); x = ((x & 0xffff0000UL) >> 16) + (x & 0x0000ffffUL);
  • 9.
    ビットのふるいにかけながら足す(16bit)0xAAAA 1010 1010 1010 1010 >> 10x5555 + 0101 0101 0101 0101 ---------------------0xCCCC 1100 1100 1100 1100 >> 20x3333 + 0011 0011 0011 0011 --------------------0xF0F0 1111 0000 1111 0000 >> 40x0F0F + 0000 1111 0000 1111 ----------------0xFF00 1111 1111 0000 0000 >> 80x00FF + 0000 0000 1111 1111----------
  • 10.
    ビットのふるいにかけながら足す(16bit)0x5555 >1010 1010 1010 101 (>> 1)0x5555 + 0101 0101 0101 0101 ---------------------0x3333 >>1100 1100 1100 11 (>> 2)0x3333 + 0011 0011 0011 0011 --------------------0x0F0F >>>> 1111 0000 1111 (>> 4)0x0F0F + 0000 1111 0000 1111 ----------------0x00FF >>>> >>>> 1111 1111 (>> 8)0x00FF + 0000 0000 1111 1111----------
  • 11.
    (5) popCount 64bitAND (SHR + ADD)ビットのふるいにかけながら足す64bit対応 x = ((x & 0xaaaaaaaaaaaaaaaaULL) >> 1) + (x & 0x5555555555555555ULL); x = ((x & 0xccccccccccccccccULL) >> 2) + (x & 0x3333333333333333ULL); x = ((x & 0xf0f0f0f0f0f0f0f0ULL) >> 4) + (x & 0x0f0f0f0f0f0f0f0fULL); x = ((x & 0xff00ff00ff00ff00ULL) >> 8) + (x & 0x00ff00ff00ff00ffULL); x = ((x & 0xffff0000ffff0000ULL) >> 16) + (x & 0x0000ffff0000ffffULL); x = ((x & 0xffffffff00000000ULL) >> 32) // + (x & 0x00000000ffffffffULL); //
  • 12.
    (6) popCount 64bitMMX + SSE (psadbw)__asm { MOVD MM0, [v+0] ;v_lowPUNPCKLDQ MM0, [v+4] ;v MOVQ MM1, MM0 ;v PSRLD MM0, 1 ;v >> 1 PAND MM0, [C55] ;(v >> 1) & 0x55555555 PSUBD MM1, MM0 ;w = v - ((v >> 1) & 0x55555555) MOVQ MM0, MM1 ;w PSRLD MM1, 2 ;w >> 2 PAND MM0, [C33] ;w & 0x33333333 PAND MM1, [C33] ;(w >> 2) & 0x33333333 PADDD MM0, MM1 ;x = (w & 0x33333333) + ((w >> 2) & 0x33333333) MOVQ MM1, MM0 ;x PSRLD MM0, 4 ;x >> 4 PADDD MM0, MM1 ;x + (x >> 4) PAND MM0, [C0F] ;y = (x + (x >> 4) & 0x0F0F0F0F) PXOR MM1, MM1 ;0 PSADBW MM0, MM1 ;sum all 8 bytes (Sum of Absolute Differences) MOVD EAX, MM0 ;result in EAX per calling convention EMMS ;clear MMX state MOV retVal, EAX ;store result }
  • 13.
  • 14.
    (7) popCount 32bitMUL (no MMX, no SSE) __asm { MOV EAX, [v] ;v MOV EDX, EAX ;v SHR EAX, 1 ;v >> 1 AND EAX, 055555555h ;(v >> 1) & 0x55555555 SUB EDX, EAX ;w = v - ((v >> 1) & 0x55555555) MOV EAX, EDX ;w SHR EDX, 2 ;w >> 2 AND EAX, 033333333h ;w & 0x33333333 AND EDX, 033333333h ;(w >> 2) & 0x33333333 ADD EAX, EDX ;x = (w & 0x33333333) + ((w >> 2) & 0x33333333) MOV EDX, EAX ;x SHR EAX, 4 ;x >> 4 ADD EAX, EDX ;x + (x >> 4) AND EAX, 00F0F0F0Fh ;y = (x + (x >> 4) & 0x0F0F0F0F) IMUL EAX, 001010101h ;y * 0x01010101 SHR EAX, 24 ;population count = (y * 0x01010101) >> 24 MOV retVal, EAX ;store result }
  • 15.
    Intel SSE4.2 では専用命令POPCNT が追加http://intel.wingateweb.com/US08/published/sessions/SVRS005/SF08_SVRS005_100r.pdf
  • 16.
    Intel SSE4.2 INSTRUCTIONSET- POPCNT#include "intrin.h"POPCNT int _mm_popcnt_u32(unsigned int a);POPCNT int64_t _mm_popcnt_u64(unsigned __int64 a);http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
  • 17.
    (8) popCount 32bitSSE4.2 (POPCNT)_mm_popcnt_u32()#include "intrin.h"size_t popCount32bitSSE42(char *x, int n){ uint32 *y = (uint32 *)x;size_t c = 0; for (int i = 0; i < n; i += 4) { c += _mm_popcnt_u32 (*y++); } return c;}
  • 18.
    (9) popCount 64bitSSE4.2 (POPCNT)_mm_popcnt_u64() // ※ 32bitモードでは実行不可#include "intrin.h"size_t popCount64bitSSE42(char *x, int n){uint64 *z = (uint64 *)x;size_t c = 0; for (int i = 0; i < n; i += 8) { c += _mm_popcnt_u64 (*z++); } return c;}
  • 19.
    【実験環境】SSE4.2 (32bit/64bit)DELL VostroDT 430 (2009年に購入)Core i7 860 @ 2.80 GHz (45nm Lynnfield)MMX, SSE(1, 2, 3, 3S, 4.1, 4.2), EM64T, VT-xWindows 7 Professional (64bit)Visual Studio 2008 (x64) 64bit C/C++ for amd64Visual Studio 2008 (x86) 32bit C/C++ for 80x86注意点最近の Core i5 / Core i7 の省エネ機能Turbo Boost の機能で負荷に応じてクロックが変わるベンチマーク結果が不安定に
  • 20.
    Core Speed ↓×9倍1197.0MHz(低負荷時)
  • 21.
    Core Speed ↑×21倍2792.9 MHz(高負荷時)
  • 22.
    BIOS で Intel®SpeedStep™ tech. を Disable に
  • 23.
  • 24.
  • 25.
  • 26.
  • 27.
  • 28.
  • 29.
  • 30.
    まとめSSE4.2 POPCNT命令が最速32bit 約0.8(Clk/Byte)  0.1クロックで1bit64bit 約0.4 (Clk/Byte) 0.1クロックで2bitx64 環境では表引きが遅くなる現象が…x86:8bit 約2.1(c/B):16bit約1.1(c/B)x64:8bit 約3.2(c/B):16bit約1.6(c/B)原因のわかる人がいたら教えてください m(__)m応用例画像処理、機械学習、パターン認識、疎行列SSE4.2文字列命令との併用(ワードカウント)