最近两天测试了下tcmalloc,性能的确牛B.
所以修改了下固定对象分配器,模仿tcmalloc利用tls做thread cache.
下面是在我机器上对自己写的各个内存分配器与tcmalloc的对比测试,输出结果如下:
fix_obj_pool finish:326
fix_obj_pool finish:165
fix_obj_pool finish:168
fix_obj_pool finish:164
fix_obj_pool finish:174
fix_obj_pool finish:164
fix_obj_pool finish:174
fix_obj_pool finish:185
fix_obj_pool finish:173
fix_obj_pool finish:168
gen_allocator finish:567
gen_allocator finish:264
gen_allocator finish:261
gen_allocator finish:260
gen_allocator finish:260
gen_allocator finish:261
gen_allocator finish:260
gen_allocator finish:261
gen_allocator finish:260
gen_allocator finish:263
block_obj_allocator finish:342
block_obj_allocator finish:257
block_obj_allocator finish:258
block_obj_allocator finish:257
block_obj_allocator finish:258
block_obj_allocator finish:257
block_obj_allocator finish:258
block_obj_allocator finish:259
block_obj_allocator finish:263
block_obj_allocator finish:262
tcmalloc finish:279
tcmalloc finish:266
tcmalloc finish:265
tcmalloc finish:267
tcmalloc finish:266
tcmalloc finish:266
tcmalloc finish:265
tcmalloc finish:264
tcmalloc finish:266
tcmalloc finish:267
test1 finish————
fix_obj_pool finish:606
fix_obj_pool finish:471
fix_obj_pool finish:469
fix_obj_pool finish:473
fix_obj_pool finish:468
fix_obj_pool finish:468
fix_obj_pool finish:470
fix_obj_pool finish:474
fix_obj_pool finish:475
fix_obj_pool finish:467
gen_allocator finish:928
gen_allocator finish:647
gen_allocator finish:677
gen_allocator finish:643
gen_allocator finish:645
gen_allocator finish:644
gen_allocator finish:643
gen_allocator finish:644
gen_allocator finish:643
gen_allocator finish:644
block_obj_allocator finish:586
block_obj_allocator finish:500
block_obj_allocator finish:502
block_obj_allocator finish:500
block_obj_allocator finish:502
block_obj_allocator finish:501
block_obj_allocator finish:501
block_obj_allocator finish:501
block_obj_allocator finish:501
block_obj_allocator finish:501
tcmalloc finish:551
tcmalloc finish:549
tcmalloc finish:549
tcmalloc finish:549
tcmalloc finish:551
tcmalloc finish:549
tcmalloc finish:548
tcmalloc finish:551
tcmalloc finish:549
tcmalloc finish:550
test2 finish————
fix_obj_pool finish:464
fix_obj_pool finish:466
fix_obj_pool finish:464
fix_obj_pool finish:465
fix_obj_pool finish:465
fix_obj_pool finish:466
fix_obj_pool finish:465
fix_obj_pool finish:464
fix_obj_pool finish:467
fix_obj_pool finish:465
gen_allocator finish:674
gen_allocator finish:661
gen_allocator finish:667
gen_allocator finish:656
gen_allocator finish:657
gen_allocator finish:658
gen_allocator finish:658
gen_allocator finish:660
gen_allocator finish:657
gen_allocator finish:660
block_obj_allocator finish:479
block_obj_allocator finish:479
block_obj_allocator finish:477
block_obj_allocator finish:477
block_obj_allocator finish:478
block_obj_allocator finish:480
block_obj_allocator finish:478
block_obj_allocator finish:481
block_obj_allocator finish:477
block_obj_allocator finish:478
tcmalloc finish:562
tcmalloc finish:565
tcmalloc finish:563
tcmalloc finish:562
tcmalloc finish:562
tcmalloc finish:563
tcmalloc finish:566
tcmalloc finish:565
tcmalloc finish:562
tcmalloc finish:562
test3 finish————
三个测试分别是
1)分配 1000万个16字节的对象
2)分配1000万,再释放1000万
3)分配10万,释放10万,执行1000万/10万次
从输出可以看出fix_obj_pool 的第一个测试是最快的,因为它的分配处理最简单,但如果把释放也计入统计,优势就几乎没有了。
还有一个手段可以优化fix_obj_pool,就是释放时不将对象放回到可用列表中,只是增加一个计数,当整个内存块中的对象都被释放时
才将内存放回到可用列表中去。而gen_allocator效果是最差的,可以直接丢弃了。
从测试结果可以看出,tcmalloc已经可以满足大多数的需求,基本无必要自己写通用内存分配器。当然对象池还是可以考虑的。
代码如下:
block_obj_allocator.h
/*
 * block_obj_allocator.h - public interface of the thread-caching block
 * object allocator.
 *
 * The allocator is handed out as an opaque handle; its layout is private to
 * block_obj_allocator.c.
 */
#ifndef _BLOCK_OBJ_ALLOCATOR
#define _BLOCK_OBJ_ALLOCATOR

/* Opaque handle to an allocator instance. */
typedef struct block_obj_allocator *block_obj_allocator_t;

/*
 * Creates a new allocator instance.
 * Returns the handle of the newly created allocator.
 */
block_obj_allocator_t create_block_obj_allocator(void);

/*
 * Prints free-list statistics of the calling thread's cache for the size
 * class that serves requests of `size` bytes.
 */
void print_info(block_obj_allocator_t ba, int size);

#endif
block_obj_allocator.c
#include #include <pthread.h>#include #include <stdint.h>#include <assert.h>#include #include <stdlib.h>#include free_list{list_node next;uint32_t size;uint32_t init_size;list_node *head;list_node *tail;void *mem;};struct thread_allocator{list_node next;block_obj_allocator_t central_allocator;struct link_list *_free_list;struct link_list *_recover;uint32_t free_size;uint16_t array_idx;uint32_t collect_factor;};struct thread_cache{list_node next;struct thread_allocator _allocator[17];};struct block_obj_allocator{IMPLEMEMT(allocator);pthread_key_t t_key;spinlock_t _free_list_mtx[17];struct link_list *_free_list[17];spinlock_t mtx;struct link_list *thread_caches;};static void *free_list_get(struct free_list *f){void *ptr = (void*)f->head;f->head = f->head->next;if(!f->head)f->tail = NULL;–f->size;return ptr;}static void free_list_put(struct free_list *f,void *ptr){list_node *l = (list_node*)ptr;l->next = NULL;if(f->tail){f->tail->next = l;f->tail = l;}elsef->head = f->tail = l;++f->size;}#define DEFAULT_BLOCK_SIZE 1024*1024static struct free_list *creat_new_freelist(uint32_t size){uint32_t init_size = DEFAULT_BLOCK_SIZE/size;struct free_list *f = (struct free_list*)calloc(1,sizeof(*f));assert(f);f->mem = calloc(1,DEFAULT_BLOCK_SIZE);assert(f->mem);f->init_size = f->size = init_size;int32_t i = 0;for( ; i < init_size; ++i){list_node *l = (list_node*)(((uint8_t*)f->mem)+(i*size));free_list_put(f,l);}f->size = init_size;return f;}static struct free_list *central_get_freelist(block_obj_allocator_t central,uint16_t array_idx){free_list *f;spin_lock(central->_free_list_mtx[array_idx],4000);f = (struct free_list*)link_list_pop(central->_free_list[array_idx]);spin_unlock(central->_free_list_mtx[array_idx]);if(!f){//printf(“creat_new_freelist\n”);f = creat_new_freelist(1<<array_idx);}return f;}static void give_back_to_central(block_obj_allocator_t central,uint16_t array_idx,struct free_list 
*f){//printf(“give_back_to_central\n”);spin_lock(central->_free_list_mtx[array_idx],4000);LINK_LIST_PUSH_BACK(central->_free_list[array_idx],f);spin_unlock(central->_free_list_mtx[array_idx]);}void *thread_allocator_alloc(struct thread_allocator *a){void *ptr;struct free_list *f;if(!a->free_size){//thread cache不够内存了,从central获取f = central_get_freelist(a->central_allocator,a->array_idx);assert(f);LINK_LIST_PUSH_BACK(a->_free_list,f);a->free_size += f->size;}else{f = (struct free_list*)link_list_head(a->_free_list);if(!f){f = (struct free_list*)link_list_pop(a->_recover);LINK_LIST_PUSH_BACK(a->_free_list,f);}}ptr = free_list_get(f);assert(ptr);–a->free_size;if(!f->size){link_list_pop(a->_free_list);link_list_push_back(a->_recover,(list_node*)f);}return ptr;}void thread_allocator_dealloc(struct thread_allocator *a,void *ptr){struct free_list *f = (struct free_list*)link_list_head(a->_recover);if(f){free_list_put(f,ptr);++a->free_size;if(f->size == f->init_size){link_list_pop(a->_recover);//printf(“==init_size\n”);(a->free_size >= a->collect_factor){//将f归还给central_allocator; give_back_to_central(a->central_allocator,a->array_idx,f);a->free_size -= f->size;}elselink_list_push_back(a->_free_list,(list_node*)f);}}else{f = (struct free_list*)link_list_head(a->_free_list);assert(f);free_list_put(f,ptr);++a->free_size;}}void thread_allocator_info(struct thread_allocator *a){printf(,a->free_size);{struct free_list *f = (struct free_list*)link_list_head(a->_free_list);while(f){printf(,f->size);f = (struct free_list*)((list_node*)f)->next;}}{struct free_list *f = (struct free_list*)link_list_head(a->_recover);while(f){printf(,f->size);f = (struct free_list*)((list_node*)f)->next;}}}extern uint8_t GetK(uint32_t size);static struct thread_cache* thread_cache_create(block_obj_allocator_t ba){struct thread_cache *tc = calloc(1,sizeof(*tc));int32_t i = 0;for( ; i < 17; ++i){tc->_allocator[i].central_allocator = ba;tc->_allocator[i]._free_list = 
LINK_LIST_CREATE();tc->_allocator[i]._recover = LINK_LIST_CREATE();tc->_allocator[i].array_idx = i;tc->_allocator[i].collect_factor = ((DEFAULT_BLOCK_SIZE)*2)/(1<<i);}spin_lock(ba->mtx,4000);LINK_LIST_PUSH_BACK(ba->thread_caches,tc);spin_unlock(ba->mtx);return tc; }static void release_freelist(struct link_list *flist){list_node *l = link_list_head(flist);while(l){struct free_list *f = (struct free_list*)l;l = l->next;free(f->mem);free(f);//printf(“destroy_freelist\n”); }}static void destroy_thread_cache(struct thread_cache *tc){int32_t i = 0;for(; i < 17; ++i){release_freelist(tc->_allocator[i]._free_list);release_freelist(tc->_allocator[i]._recover);LINK_LIST_DESTROY(&(tc->_allocator[i]._free_list));LINK_LIST_DESTROY(&(tc->_allocator[i]._recover));}free(tc);}static void* thread_cache_alloc(struct thread_cache *tc,uint32_t size){size += sizeof(int32_t);uint8_t k = GetK(size);size = 1 << k;int32_t *ptr = (int32_t*)thread_allocator_alloc(&(tc->_allocator[k]));*ptr = k;ptr++;return (void*)ptr;}static void thread_cache_dealloc(struct thread_cache *tc,void *ptr){int32_t *_ptr = ((int32_t*)ptr)-1;uint8_t k = *_ptr;thread_allocator_dealloc(&(tc->_allocator[k]),_ptr);}static void thread_cache_info(struct thread_cache *tc,uint32_t size){size += sizeof(int32_t);uint8_t k = GetK(size);thread_allocator_info(&(tc->_allocator[k]));}static void* block_obj_al_alloc(struct allocator *a, int32_t size){block_obj_allocator_t ba = (block_obj_allocator_t)a;struct thread_cache *tc = (struct thread_cache*)pthread_getspecific(ba->t_key);if(!tc){tc = thread_cache_create(ba);pthread_setspecific(ba->t_key,(void*)tc);}return thread_cache_alloc(tc,size);}static void block_obj_al_dealloc(struct allocator*a, void *ptr){block_obj_allocator_t ba = (block_obj_allocator_t)a;struct thread_cache *tc = (struct thread_cache*)pthread_getspecific(ba->t_key);assert(tc);thread_cache_dealloc(tc,ptr);}static void destroy_block_obj_al(struct allocator **a){block_obj_allocator_t ba = 
(block_obj_allocator_t)*a;//销毁所有的thread_cache {list_node *l = link_list_head(ba->thread_caches);while(l){struct thread_cache *tc = (struct thread_cache *)l;l = l->next;destroy_thread_cache(tc);}LINK_LIST_DESTROY(&ba->thread_caches);}//销毁所有free_list {int32_t i = 0;for( ; i < 17; ++i){release_freelist(ba->_free_list[i]);LINK_LIST_DESTROY(&ba->_free_list[i]);}}{int32_t i = 0;for( ; i < 17; ++i){spin_destroy(&(ba->_free_list_mtx[i]));}}spin_destroy(&(ba->mtx));pthread_key_delete(ba->t_key);free(ba);*a = NULL;}block_obj_allocator_t create_block_obj_allocator(){block_obj_allocator_t ba = (block_obj_allocator_t)calloc(1,sizeof(*ba));ba->mtx = spin_create();ba->thread_caches = LINK_LIST_CREATE();int32_t i = 0;for( ; i < 17; ++i){ba->_free_list[i] = LINK_LIST_CREATE();ba->_free_list_mtx[i] = spin_create();}pthread_key_create(&ba->t_key,0);ba->super_class.Alloc = block_obj_al_alloc;ba->super_class.DeAlloc = block_obj_al_dealloc;ba->super_class.Destroy = destroy_block_obj_al;return ba;}void print_info(block_obj_allocator_t ba,int size){struct thread_cache *tc = (struct thread_cache*)pthread_getspecific(ba->t_key);thread_cache_info(tc,size);}
test.c
用最少的浪费面对现在