Php7 hashtable

PHP7 HASHTABLE
wangtong@panda.tv
2017-07

 PHP7 Array Feature
 PHP7 HashTable Struct
 HashTable Collisions
 Translation Table
 PHP7 HashTable Operation
 Example, Run php-src By GDB
 Packed HashTables
 Empty HashTable
 Application Scenarios
 PHP5 Hashtable
 Redis Hashtable
Agenda

We already know about php array
• Array 为 HashTable实现
• map，可用字符或数字做索引键
• Foreach 快于 for
• Foreach 顺序为插入顺序
• Count($arr) 快
• In_array 慢
• 理想情况Find 时间复杂度为 O(1)
• PHP有current,prev,next,each,end,reset等指针函数

PHP7 HashTable Struct
https://github.com/php/php-src/blob/PHP-7.0.11/Zend/zend_types.h#L176

/* ht struct */
/*
 * One element slot of a HashTable's arData array.
 *
 * Declared as a typedef so that both `struct Bucket` and the bare type name
 * `Bucket` (used by HashTable.arData and the HT_* macros) are valid.
 */
typedef struct Bucket {
    zval val;         /* the stored value; val.u2.next links the collision chain */
    zend_ulong h;     /* hash of the key (the slides show integer keys stored here too) */
    zend_string *key; /* string key; NULL in packed hashtables per the slides */
} Bucket;
/*
 * PHP 7 hash table header. arData points at the Bucket array; the other
 * fields track the table's size, usage and iteration state.
 */
struct HashTable {
zend_refcounted_h gc;
union {
struct { ZEND_ENDIAN_LOHI_4( /*... ellipsis ...*/) } v;
uint32_t flags; // flag bits such as HASH_FLAG_PERSISTENT
} u;
uint32_t nTableMask; // negated nTableSize stored unsigned, e.g. size 16: (uint32_t)-16 == 4294967280
Bucket *arData; // array items
uint32_t nNumUsed; // next slot available in arData
uint32_t nNumOfElements; // total number of busy elements in arData
uint32_t nTableSize; // table size, always a power of two, min: 8
uint32_t nInternalPointer; // used for iteration
zend_long nNextFreeElement; // next integer-based key available
dtor_func_t pDestructor; // data destructor
};

/*
 * A PHP value: the payload (value), its type information (u1) and one
 * auxiliary 32-bit slot (u2) whose meaning depends on where the zval is used.
 */
struct zval {
zend_value value; /* value */
union {
struct {
ZEND_ENDIAN_LOHI_4(
zend_uchar type, /* active type */
zend_uchar type_flags,
zend_uchar const_flags,
zend_uchar reserved) /* call info for EX(This) */
} v;
uint32_t type_info; /* the four bytes of v viewed as one 32-bit word */
} u1;
union {
uint32_t var_flags;
uint32_t next; /* hash collision chain */
uint32_t cache_slot; /* literal cache slot */
uint32_t lineno; /* line number (for ast nodes) */
uint32_t num_args; /* arguments number for EX(This) */
uint32_t fe_pos; /* foreach position */
uint32_t fe_iter_idx; /* foreach iterator index */
} u2;
};
/*
 * Payload of a zval (its `value` member). Exactly one member is meaningful
 * at a time; NOTE(review): presumably selected by the zval's u1.v.type —
 * confirm against zend_types.h.
 */
union zend_value {
zend_long lval;
double dval;
zend_refcounted *counted;
zend_string *str;
zend_array *arr;
zend_object *obj;
zend_resource *res;
zend_reference *ref;
zend_ast_ref *ast;
zval *zv;
void *ptr;
zend_class_entry *ce;
zend_function *func;
struct {
uint32_t w1;
uint32_t w2;
} ww; /* the payload viewed as two 32-bit words */
};

Hashtable collisions
链接法：桶是一个可容纳多个数据的数据结构（例如链表或红黑树）
开放寻址法：所有元素都存放在槽中（装载因子<0.5可考虑）
// Hashtable collision attack demo.
// Inserts $size (2^16) integer keys, each a multiple of $size:
// 0, 65536, 131072, ... The slide presents these as keys that collide in
// the hash table, and times how long the insertions take.
$beginTime = microtime(true);
$size = pow(2,16);
$k = 0;
for($i=0;$i<$size;$i++){
$arr[$k] = 0;
$k += $size; // next key is the next multiple of $size
}
echo microtime(true) - $beginTime;
echo PHP_EOL;
// Timings reported on the slide: PHP 5.6 about 28s, PHP 7 about 7s

Hash layout
/*
 * Size helpers for a hash table's storage.
 *
 *   HT_HASH_SIZE - bytes used by the uint32_t hash (translation) slots; the
 *                  slot count is the negated nTableMask.
 *   HT_DATA_SIZE - bytes used by nTableSize Buckets.
 *   HT_SIZE_EX   - sum of both parts.
 *   HT_SIZE      - HT_SIZE_EX for an existing table.
 *
 * Each definition is kept on one logical line (with explicit continuations)
 * so that no part of a macro body is left as a stray statement.
 */
#define HT_HASH_SIZE(nTableMask) \
    (((size_t)(uint32_t)-(int32_t)(nTableMask)) * sizeof(uint32_t))
#define HT_DATA_SIZE(nTableSize) \
    ((size_t)(nTableSize) * sizeof(Bucket))
#define HT_SIZE_EX(nTableSize, nTableMask) \
    (HT_DATA_SIZE((nTableSize)) + HT_HASH_SIZE((nTableMask)))
#define HT_SIZE(ht) \
    HT_SIZE_EX((ht)->nTableSize, (ht)->nTableMask)
Bucket *arData;
arData = emalloc(HT_SIZE(ht)); /* now alloc this */

panda.tv tone shop … Bucket
…
val value …
… … … u1
-1 -1 1 u2.next
92236014990
29192316
0 65536 … h h …
0x… 0x… 0x… … *key *key nTableSize-1
0 1 2 3 4i idx=5 idx=6 7
gc …
u …
nTableMask -8
*arData 0x…
nNumUsed 3
nNumOfElements 3
nTableSize 8
nInternalPointer 0
nNextFreeElement 65537
pDestructor 0x…
…
922360…
7
company
gc
h
len
val[1]
2 0
-8 -7 -6 -5 -4 -3 -2 -1
$arr = [
'company'=>'panda.tv',
0=>'tone',
65536=>'shop'
];
0x0
Translation table bucket
PHP7 HashTable :
nIndex idx

PHP7 HashTable operation
https://github.com/php/php-src/blob/PHP-7.0.11/Zend/zend_hash.c#L552

HashTable Init
/*
 * Initialise an empty hash table.
 *
 * ht          - table to initialise
 * nSize       - requested size; normalised by zend_hash_check_size()
 * pDestructor - data destructor stored in the table
 * persistent  - non-zero sets HASH_FLAG_PERSISTENT
 *
 * No bucket storage is allocated here: arData is derived from the address of
 * the shared uninitialized_bucket, and the counters are reset to zero.
 */
ZEND_API void ZEND_FASTCALL _zend_hash_init(HashTable *ht, uint32_t nSize, dtor_func_t pDestructor, zend_bool persistent ZEND_FILE_LINE_DC)
{
    GC_REFCOUNT(ht) = 1;
    GC_TYPE_INFO(ht) = IS_ARRAY;
    ht->u.flags = (persistent ? HASH_FLAG_PERSISTENT : 0) | HASH_FLAG_APPLY_PROTECTION | HASH_FLAG_STATIC_KEYS;
    ht->nTableSize = zend_hash_check_size(nSize);
    ht->nTableMask = HT_MIN_MASK; /* ((uint32_t) -2) */
    /*
     * HT_SET_DATA_ADDR(ht, ptr) expands to:
     *   do { (ht)->arData = (Bucket*)(((char*)(ptr)) + HT_HASH_SIZE((ht)->nTableMask)); } while (0)
     */
    HT_SET_DATA_ADDR(ht, &uninitialized_bucket);
    ht->nNumUsed = 0;
    ht->nNumOfElements = 0;
    ht->nInternalPointer = HT_INVALID_IDX;
    ht->nNextFreeElement = 0;
    ht->pDestructor = pDestructor;
}

Translation Table Demo
/* @auth xuruiliang@panda.tv, 在此感谢许老板帮写的demo*/
#include <iostream>
#include <cstdlib>
#include <cassert>
using namespace std;
struct P {
int x, y;
};
const int P_SIZE = 10;
int main()
{
struct P p1 = (struct P){.x = 100, .y = 101};
P *p = (P *)malloc(P_SIZE* (sizeof(int) + sizeof(P)));
assert(p != NULL);
((int *)p)[0] = 10;
p = (P *)((int *)p + P_SIZE);
p[3] = p1;
}

/*
 * Excerpt of the string-key insert path: append the new element to arData and
 * link it at the head of its collision chain in the translation table.
 * Declarations and the lookup of an existing key are elided on the slide.
 */
static zend_always_inline zval *_zend_hash_add_or_update_i(HashTable *ht, zend_string *key, zval *pData, uint32_t flag ZEND_FILE_LINE_DC)
{
    /* Expands to: if ((ht)->nNumUsed >= (ht)->nTableSize) { zend_hash_do_resize(ht); } */
    ZEND_HASH_IF_FULL_DO_RESIZE(ht);
    idx = ht->nNumUsed++; /* take the next available slot number */
    ht->nNumOfElements++; /* increment number of elements */
    /* ... */
    p = ht->arData + idx; /* get the bucket in that slot from arData */
    p->key = key; /* store the key we are inserting */
    /* ... */
    p->h = h = ZSTR_H(key); /* save the hash of the current key into the bucket */
    ZVAL_COPY_VALUE(&p->val, pData); /* copy the value into the bucket's value: add */
    nIndex = h | ht->nTableMask; /* get the translation table index */
    /* Z_NEXT(p->val) is p->val.u2.next: the old chain head becomes our successor */
    Z_NEXT(p->val) = HT_HASH(ht, nIndex);
    /* i.e. ((uint32_t*)((ht)->arData))[(int32_t)(nIndex)] = ((idx) * sizeof(Bucket)) */
    HT_HASH(ht, nIndex) = HT_IDX_TO_HASH(idx); /* put us into the translation slot */
    /* ... remainder elided on the slide; NOTE(review): upstream returns the stored value */
    return &p->val;
}
HashTable Add

Hashtable添加更新元素
• nIndex为hash后的索引，idx为顺序索引
• nNumUsed+1, 用于idx，>nTableSize将触发扩容
• nNumOfElements+1, 记录实际元素个数
• nTableSize, 用于申请内存空间的大小
• nNextFreeElement+1, 用于下一个自增数字索引值
• p->val.u2.next 实现了hash冲突的解决
• p = ht->arData + idx， Bucket内存地址

Hashtable del
/*
 * Delete the element stored under a string key.
 *
 * Walks the collision chain that starts at the key's translation-table slot
 * and removes the first bucket whose key matches.
 * Returns SUCCESS if an element was deleted, FAILURE if the key was not found.
 * Local declarations are elided on the slide.
 */
ZEND_API int ZEND_FASTCALL zend_hash_del(HashTable *ht, zend_string *key)
{
/* ... */
h = zend_string_hash_val(key); /* get the hash from the key (assuming string key here) */
nIndex = h | ht->nTableMask; /* get the translation table index */
idx = HT_HASH(ht, nIndex); /* get the slot corresponding to that translation index */
while (idx != HT_INVALID_IDX) { /* while there is a corresponding slot */
p = HT_HASH_TO_BUCKET(ht, idx); /* get the bucket from that slot */
if ((p->key == key) || /* is it the right bucket? same key pointer? */
(p->h == h && /* ... or same hash */
p->key && /* and a key (string key based) */
ZSTR_LEN(p->key) == ZSTR_LEN(key) && /* and same key length */
memcmp(ZSTR_VAL(p->key), ZSTR_VAL(key), ZSTR_LEN(key)) == 0)) { /* and same key content? */
_zend_hash_del_el_ex(ht, idx, p, prev); /* that's us: delete this bucket */
return SUCCESS;
}
prev = p; /* remember the predecessor in the chain */
idx = Z_NEXT(p->val); /* get the next slot in the chain from the current one */
}
return FAILURE;
}

Hash fragmentation, resizing and compacting

HashTable Resize
/*
 * Make room in a full hash table.
 *
 * If enough slots in arData are unused (nNumUsed exceeds nNumOfElements by
 * more than 1/32), the table is only rehashed. Otherwise, while the table is
 * below HT_MAX_SIZE, its size is doubled: a new block is allocated, the used
 * buckets are copied across, the old block is freed and the index is rebuilt.
 * At HT_MAX_SIZE a fatal error is raised.
 */
static void ZEND_FASTCALL zend_hash_do_resize(HashTable *ht)
{
    IS_CONSISTENT(ht);
    HT_ASSERT(GC_REFCOUNT(ht) == 1);

    if (ht->nNumUsed > ht->nNumOfElements + (ht->nNumOfElements >> 5)) {
        /* Only rehash once the number of unused slots passes this threshold. */
        HANDLE_BLOCK_INTERRUPTIONS();
        zend_hash_rehash(ht); /* rebuild the index array */
        HANDLE_UNBLOCK_INTERRUPTIONS();
    } else if (ht->nTableSize < HT_MAX_SIZE) {
        /* Double the table size. */
        void *new_data, *old_data = HT_GET_DATA_ADDR(ht);
        uint32_t nSize = ht->nTableSize + ht->nTableSize;
        Bucket *old_buckets = ht->arData;

        HANDLE_BLOCK_INTERRUPTIONS();
        /* Allocate the new block: (sizeof(Bucket) + sizeof(uint32_t)) * nSize bytes. */
        new_data = pemalloc(HT_SIZE_EX(nSize, -nSize), ht->u.flags & HASH_FLAG_PERSISTENT);
        ht->nTableSize = nSize;
        ht->nTableMask = -ht->nTableSize; /* negated nTableSize */
        HT_SET_DATA_ADDR(ht, new_data); /* point arData at the start of the Bucket array */
        memcpy(ht->arData, old_buckets, sizeof(Bucket) * ht->nNumUsed); /* copy the old buckets over */
        pefree(old_data, ht->u.flags & HASH_FLAG_PERSISTENT); /* release the old block */
        zend_hash_rehash(ht); /* rebuild the index array */
        HANDLE_UNBLOCK_INTERRUPTIONS();
    } else {
        zend_error_noreturn(E_ERROR, "Possible integer overflow in memory allocation (%zu * %zu + %zu)", ht->nTableSize * 2, sizeof(Bucket) + sizeof(uint32_t), sizeof(Bucket));
    }
}

git clone -b PHP-7.0.11 git@github.com:php/php-src.git
cd php-src
~/php-src> ./buildconf
~/php-src> ./configure --disable-all --enable-debug --prefix=$HOME/php-debug
~/php-src> make
~/php-src> make install
gdb --args bin/php -f hashtable-debug.php
break /home/1/php-src/Zend/zend_hash.c:839 if h==589
break /home/1/php-src/Zend/zend_hash.c:628 if strcmp((char *)&key->val,"key14")==0
break /home/1/php-src/Zend/zend_hash.c:628 if strcmp((char *)&key->val,"key2")==0
break /home/1/php-src/Zend/zend_hash.c:839 if h==4153
break /home/1/php-src/Zend/zend_hash.c:561 if strncmp((char *)&key->val,"class_exists",key->len)==0
GDB调试方案

<?PHP
// Script run under gdb to watch the HashTable fields change.
// The @1..@4 markers label the points referred to in the results table.
$tmp_user = array(
'name'=>'wangtong',
'worker_id'=>'P589',
'589'=>'see-nNextFreeElement', //here @1
'company'=>'panda.tv',
'email'=>'wangtong@panda.tv',
'location'=>'bj-soho-18',
'department01' => 'g-biz',
'department02' => 'g-tech',
1006440989 => 'see-nTableSize',
'key10' => 'pandatv.com',
'key11' => 'shop.gate.panda.tv',
'key12' => 'mall.gate.panda.tv',
'key13' => 'bag.gate.panda.tv',
'key14' => 'see-nTableSize', // here, @2
);
// Copy element by element so that $user_info grows one insert at a time.
foreach($tmp_user as $k=>$v){
$user_info[$k]=$v; //here @1 @2
}
// Remove eight elements, leaving unused slots behind.
unset($user_info['worker_id']);
unset($user_info['589']);
unset($user_info['company']);
unset($user_info['email']);
unset($user_info['location']);
unset($user_info['department01']);
unset($user_info['department02']);
unset($user_info['1006440989']);
$user_info['key2']='see_nNumUsed'; // here @3
unset($user_info['key10']);
$user_info['key3']='val3';
$user_info['key4']='val4';
$user_info['4153'] = 'see-nTableSize';//here @4
运行示例

示例运行结果
|  | nTableSize | nNumUsed | nNumOfEle | nNextFreeEle | Func | Mark |
|---|---|---|---|---|---|---|
| 3个item@1 | 16 | 3 | 3 | 590 | _zend_hash_index_add_or_update_i | Init,nTableSize=16; nNextFreeEle=589+1 |
| 14个item@2 | 16 | 14 | 14 | 1006440990 | _zend_hash_add_or_update_i | nNumUsed = 14; nNumOfElement = 14; |
| 3个item@1 | 8 | 3 | 3 | 590 | _zend_hash_index_add_or_update_i | Init, nTableSize=8; |
| 14个item@2 | 16 | 14 | 14 | 1006440990 | _zend_hash_add_or_update_i | nTableSize *= 2; |
| Unset后@3 | 16 | 15 | 7 | 1006440990 | _zend_hash_add_or_update_i | nNumUsed!=nNumOfEle; Hash fragmentation |
| 5个item@4 | 16 | 5 | 5 | 1006440990 | _zend_hash_index_add_or_update_i | nNumUsed = 5; resizing and compacting; |

packed hashtables
• 理解为传统意义上的‘数组’，而不是map
• 在packed hashtables中，arHash数组为NULL，查找只会直接在
arData中进行。
• packed hashtable只会作用于键递增的数组，这些数组的key之间
可以有间隔，但必须总是递增的。
• bucket->h是冗余的; bucket->key的值永远都是NULL
• 最简单的理解：用idx做索引，没有转换表，没有key.

空hash表
• arData/arHash 数组只会在插入第一个元素时分配内存
• nTableSize（8）& ht->nTableMask (0) == 0
• arHash 数组只有一个带有 INVALID_IDX 值、下标为 0 的元素
(uninitialized_bucket,并且被静态分配了内存)
• 查找时，我们会一直找到 INVALID_IDX 值，意味着 key（实际上你
只想静态分配创建一个空表）没有被找到

应用场景
• 自动扩容会导致多次分配内存及复制操作
• 数字索引比字母索引效率更高
• 不会自动缩容，nNumUsed 达到 nTableSize会压缩
• In_array效率会低
• Hash冲突还是要注意的，Dos攻击。
• Foreach的顺序为插入顺序
• 尽量使用 Packed hashtable
• Time33 hash算法适合英文词汇的hash;Time65适合大小写混写hash
• 理想情况下O(1)的时间复杂度，平均查找复杂度为O(L)（L 为冲突链的平均长度）
一旦 nNumUsed 达到 nTableSize，PHP会通过丢弃任何 UNDEF 的记录，自动压缩 arData 数组

PHP5 Hashtable
https://github.com/php/php-src/blob/PHP-5.6.9/Zend/zend_hash.h#L67

/*
 * PHP 5 hash table header (from PHP-5.6.9 Zend/zend_hash.h), shown for
 * comparison with the PHP 7 layout.
 */
typedef struct _hashtable {
uint nTableSize;
uint nTableMask;
uint nNumOfElements;
ulong nNextFreeElement;
Bucket *pInternalPointer; /* Used for element traversal */
Bucket *pListHead; /* NOTE(review): presumably first bucket of the global list - confirm */
Bucket *pListTail; /* NOTE(review): presumably last bucket of the global list - confirm */
Bucket **arBuckets; /* array of Bucket pointers */
dtor_func_t pDestructor;
zend_bool persistent;
unsigned char nApplyCount;
zend_bool bApplyProtection;
#if ZEND_DEBUG
int inconsistent;
#endif
} HashTable;
/*
 * PHP 5 bucket. Each bucket carries four link pointers (pListNext, pListLast,
 * pNext, pLast); the slides count these as 32 bytes of list linkage per
 * element.
 */
typedef struct bucket {
ulong h;
uint nKeyLength;
void *pData;
void *pDataPtr;
struct bucket *pListNext;
struct bucket *pListLast;
struct bucket *pNext;
struct bucket *pLast;
const char *arKey;
} Bucket;

PHP5 vs PHP7
• PHP 5.x 每个元素需要 144 bytes。在 PHP 7 中，降低到了 36 bytes，
或者打包情况下 32 bytes
• Buckets 需要单独分配16bytes内存，冗余且降低缓存效率
• Zvals 需要分开分配会产生额外头开销冗余, 16bytes
• 双向链表中的每个bucket需要4个指针用于链表的连接，32字节
• php7更少的内存占用，更好的CPU缓存利用率，更好的性能
• Php7 在线性的内存地址上进行遍历，而不是在一段内存地址随机
的链表上遍历

<?PHP
// Measure how much memory an array of 100000 integer-keyed elements uses,
// for comparing PHP 5 with PHP 7.
$startMemory = memory_get_usage();
// Alternative way to build the same array:
//$array = range(1, 100000);
for($i=0;$i<100000; $i++){
$array[$i] = $i;
}
// Print the growth in memory usage since $startMemory was taken.
echo memory_get_usage() - $startMemory, " bytes\n";
// Adds one string key after the measurement has been printed.
$array['k'.$i] = $i;
PHP5 vs PHP7 memory

Redis HashTable
https://github.com/antirez/redis/blob/2.8/src/dict.h#L69

Redis hashtable
/*
 * One key/value entry of a Redis dict. The value is either a pointer or an
 * inline 64-bit integer / double, held in the union v.
 */
typedef struct dictEntry {
void *key;
union {
void *val;
uint64_t u64;
int64_t s64;
double d;
} v;
struct dictEntry *next; /* NOTE(review): presumably chains entries of the same slot - confirm */
} dictEntry;
/* This is our hash table structure. Every dictionary has two of these, as we
 * implement incremental rehashing from the old table to the new one. */
typedef struct dictht {
dictEntry **table; /* array of dictEntry pointers */
unsigned long size;
unsigned long sizemask;
unsigned long used;
} dictht;
/*
 * A Redis dictionary: two hash tables (ht[0] and ht[1]) plus the state of
 * the incremental rehash between them.
 */
typedef struct dict {
dictType *type;
void *privdata;
dictht ht[2];
long rehashidx; /* rehashing not in progress if rehashidx == -1 */
int iterators; /* number of iterators currently running */
} dict;

Redis vs PHP7
• Redis业务场景在存储，所以需要实现扩容的异步化
• Redis hgetall无序，少有顺序遍历业务场景，无需保证顺序
• Redis 使用的是 MurmurHash2，更适用于规律性强的key

感谢
• 感谢极客好人许老板教我C语言
• 感谢cap与大家给我进步的机会，同我一起学习
• 感谢以下开源贡献者
• http://jpauli.github.io/2016/04/08/hashtables.html
• http://www.laruence.com/2009/08/23/1065.html
• http://www.laruence.com/2009/07/23/994.html
• https://juejin.im/entry/58f87f1c44d9040069ca999c
• https://crispgm.com/page/php7-new-hashtable-implementation.html

Php7 hashtable

Recommended

Recommended

More Related Content

What's hot

What's hot (19)

Similar to Php7 hashtable

Similar to Php7 hashtable (20)

Recently uploaded

Recently uploaded (20)

Php7 hashtable