(1) popCount 8bitIf1 byte あたり 8回条件分岐して bit を数える for (int i = 0; i < n; i++) { if (*x & 0x01) c++; if (*x & 0x02) c++; if (*x & 0x04) c++; if (*x & 0x08) c++; if (*x & 0x10) c++; if (*x & 0x20) c++; if (*x & 0x40) c++; if (*x & 0x80) c++; x++;}
5.
/*
 * (2) popCount 8bitTable: build a 256-byte table and count bits by table
 * lookup, one byte at a time.
 *
 * popTable8bit[v] holds the number of set bits in the 8-bit value v.
 * Each row below covers 16 consecutive values.
 */
static const char popTable8bit[] = {
    /* 0x00 */ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    /* 0x10 */ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    /* 0x20 */ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    /* 0x30 */ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    /* 0x40 */ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    /* 0x50 */ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    /* 0x60 */ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    /* 0x70 */ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    /* 0x80 */ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    /* 0x90 */ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    /* 0xA0 */ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    /* 0xB0 */ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    /* 0xC0 */ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    /* 0xD0 */ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    /* 0xE0 */ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    /* 0xF0 */ 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

/*
 * Fragment: x (cursor over the input), n (iteration count) and c (running
 * bit total) are declared by surrounding code that is not shown here.
 * The cast to uint8 keeps the table index in 0..255 even if *x is a
 * negative (signed) char.
 */
for (int i = 0; i < n; i++) {
    const uint8 index = (uint8)*x;
    x++;
    c += popTable8bit[index];
}
6.
(3) popCount 16bitTable64KB のテーブルを作成して表引きstatic char popTable16bit[256 * 256];void _popCount16bitTableInit(void) { for (int i = 0; i < 256; i++) { for (int j = 0; j < 256; j++) { popTable16bit[i*256 + j]= popTable8bit[i] + popTable8bit[j]; } }}for (int i = 0; i < n; i++) { c += popTable18bit[(uint16)*w++];}
Intel SSE4.2 Instruction Set - POPCNT
#include "intrin.h"
POPCNT: int _mm_popcnt_u32(unsigned int a);
POPCNT: int64_t _mm_popcnt_u64(unsigned __int64 a);
Reference: http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
17.
(8) popCount 32bitSSE4.2 (POPCNT)_mm_popcnt_u32()#include "intrin.h"size_tpopCount32bitSSE42(char *x, int n){ uint32 *y = (uint32 *)x;size_t c = 0; for (int i = 0; i < n; i += 4) { c += _mm_popcnt_u32 (*y++); } return c;}
18.
(9) popCount 64bitSSE4.2 (POPCNT)_mm_popcnt_u64() // ※ 32bitモードでは実行不可#include "intrin.h"size_t popCount32bitSSE42(char *x, int n){uint64 *z = (uint64 *)x;size_t c = 0; for (int i = 0; i < n; i += 8) { c += _mm_popcnt_u64 (*z++); } return c;}
19.
【実験環境】SSE4.2 (32bit/64bit)
- DELL VostroDT 430 (2009年に購入)
- Core i7 860 @ 2.80 GHz (45nm Lynnfield)
- MMX, SSE(1, 2, 3, 3S, 4.1, 4.2), EM64T, VT-x
- Windows 7 Professional (64bit)
- Visual Studio 2008 (x64) 64bit C/C++ for amd64
- Visual Studio 2008 (x86) 32bit C/C++ for 80x86
【注意点】
- 最近の Core i5 / Core i7 の省エネ機能
- Turbo Boost の機能で負荷に応じてクロックが変わる
- ベンチマーク結果が不安定になる