faster version of AllocSetFreeIndex for x86 architecture - Mailing list pgsql-hackers
From | Atsushi Ogawa |
---|---|
Subject | faster version of AllocSetFreeIndex for x86 architecture |
Date | |
Msg-id | 4A253CF0.1000702@hi-ho.ne.jp |
Responses | Re: faster version of AllocSetFreeIndex for x86 architecture |
 | Re: faster version of AllocSetFreeIndex for x86 architecture |
List | pgsql-hackers |
Hi,

I made a faster version of AllocSetFreeIndex for the x86 architecture. The attached files are the benchmark programs and the patch:

alloc_test.pl: benchmark script
alloc_test.c: benchmark program
aset_free_index.patch: patch for util/mmgr/aset.c

The benchmark compares the original function with the faster version. To try it, just run alloc_test.pl; the script compiles alloc_test.c and executes the benchmark.

Results of the benchmark script (child user CPU time in seconds; "mix" runs a mixed list of sizes):

Xeon (Core architecture), RedHat EL4, gcc 3.4.6

  bytes   :     4     8    16    32    64   128   256   512  1024   mix
  original: 0.780 0.780 0.820 0.870 0.930 0.970 1.030 1.080 1.130 0.950
  patched : 0.380 0.170 0.170 0.170 0.170 0.180 0.170 0.180 0.180 0.280

Core2, Windows XP, gcc 3.4.4 (cygwin)

  bytes   :     4     8    16    32    64   128   256   512  1024   mix
  original: 0.249 0.249 0.515 0.452 0.577 0.671 0.796 0.890 0.999 1.577
  patched : 0.358 0.218 0.202 0.218 0.218 0.218 0.202 0.218 0.218 0.218

Xeon (Pentium4 architecture), RedHat EL4, gcc 3.4.6

  bytes   :     4     8    16    32    64   128   256   512  1024   mix
  original: 0.510 0.520 0.620 0.860 0.970 1.260 1.150 1.220 1.290 0.860
  patched : 0.620 0.530 0.530 0.540 0.540 0.530 0.540 0.530 0.530 0.490

The effect of the patch, as measured with oprofile:

- test program: pgbench -c 1 -t 50000 (fsync=off)

original:

CPU: P4 / Xeon with 2 hyper-threads, speed 2793.55 MHz (estimated)
Counted GLOBAL_POWER_EVENTS events with a unit mask of 0x01 (mandatory) count 100000
samples  %        symbol name
66854     6.6725  AllocSetAlloc
47679     4.7587  base_yyparse
29058     2.9002  hash_search_with_hash_value
22053     2.2011  SearchCatCache
19264     1.9227  MemoryContextAllocZeroAligned
16223     1.6192  base_yylex
13819     1.3792  ScanKeywordLookup
13305     1.3279  expression_tree_walker
12144     1.2121  LWLockAcquire
11850     1.1827  XLogInsert
11817     1.1794  AllocSetFree

patched:

CPU: P4 / Xeon with 2 hyper-threads, speed 2793.55 MHz (estimated)
Counted GLOBAL_POWER_EVENTS events with a unit mask of 0x01 (mandatory) count 100000
samples  %        symbol name
47610     4.9333  AllocSetAlloc
47441     4.9158  base_yyparse
28243     2.9265  hash_search_with_hash_value
22197     2.3000  SearchCatCache
18984     1.9671  MemoryContextAllocZeroAligned
15747     1.6317  base_yylex
13368     1.3852  ScanKeywordLookup
12889     1.3356  expression_tree_walker
12092     1.2530  LWLockAcquire
12078     1.2515  XLogInsert
(skip)
 6248     0.6474  AllocSetFree

I think this patch improves AllocSetAlloc/AllocSetFree performance.

Best regards,

---
Atsushi Ogawa
a_ogawa@hi-ho.ne.jp

alloc_test.pl:

#!/usr/bin/perl

system "gcc -O2 -o alloc_test alloc_test.c";

my @test_bytes = (4, 8, 16, 32, 64, 128, 256, 512, 1024,
                  '8 16 28 36 12 4 8 64 1024 8 24 12 8 64 16');
my $cnt = 10000000;
my @old_result;
my @new_result;
my ($t0, $t1, $e);

# Run both versions for each size (or size list) and record the
# child user CPU time consumed by each run.
foreach $e (@test_bytes)
{
    $t0 = (times)[2];
    system "./alloc_test old $cnt $e";
    push @old_result, (times)[2] - $t0;

    $t0 = (times)[2];
    system "./alloc_test new $cnt $e";
    push @new_result, (times)[2] - $t0;
}

print " bytes : ";
foreach $e (@test_bytes)
{
    $e = 'mix' if ($e =~ /\d+ \d+/);
    printf("%5s ", $e);
}
print "\n";

print " original: ";
foreach $e (@old_result)
{
    printf("%.3f ", $e);
}
print "\n";

print " patched : ";
foreach $e (@new_result)
{
    printf("%.3f ", $e);
}
print "\n";
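For reference, the script above does everything itself: it builds the benchmark with "gcc -O2 -o alloc_test alloc_test.c" and then, for each entry of @test_bytes, runs "./alloc_test old <loop_cnt> <size...>" and "./alloc_test new <loop_cnt> <size...>" with a loop count of 10000000, taking the elapsed child user CPU time from Perl's times(). The same two commands can be run by hand to measure a single size in isolation.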
alloc_test.c:

#include <stdio.h>
#include <stdlib.h>     /* atoi */
#include <string.h>     /* strcmp */

#define Assert(condition)

#define ALLOC_MINBITS 3
#define ALLOCSET_NUM_FREELISTS 11

typedef size_t Size;

/*
 * Faster version of AllocSetFreeIndex for the x86 architecture.
 * This function runs in O(1).
 */
static inline int
AllocSetFreeIndex_new(Size size)
{
    int     idx;

    if (__builtin_expect(size < (1 << ALLOC_MINBITS), 0))
        size = (1 << ALLOC_MINBITS);

    /* bsr (Bit Scan Reverse): find the most significant set bit */
    __asm__ ("bsr %1, %0" : "=r"(idx) : "g"(size - 1));

    return idx - (ALLOC_MINBITS - 1);
}

/* The current loop-based implementation, as in aset.c */
static inline int
AllocSetFreeIndex(Size size)
{
    int     idx = 0;

    if (size > 0)
    {
        size = (size - 1) >> ALLOC_MINBITS;
        while (size != 0)
        {
            idx++;
            size >>= 1;
        }
        Assert(idx < ALLOCSET_NUM_FREELISTS);
    }

    return idx;
}

int
main(int argc, char *argv[])
{
    int     loop_cnt;
    int     size[16];
    int     i, j;
    int     result = 0;

    if (argc < 4)
    {
        fprintf(stderr, "usage: alloc_test (new|old) loop_cnt size...\n");
        return 1;
    }

    loop_cnt = atoi(argv[2]);

    /* Up to 16 sizes; unspecified slots repeat the first size. */
    for (i = 0; i < 16; i++)
    {
        if (argc <= i + 3)
        {
            size[i] = size[0];
        }
        else
        {
            size[i] = atoi(argv[i + 3]);
        }
    }

    if (strcmp(argv[1], "new") == 0)
    {
        for (i = 0; i < loop_cnt; i++)
        {
            for (j = 0; j < 16; j++)
            {
                result += AllocSetFreeIndex_new(size[j]);
            }
        }
    }
    else if (strcmp(argv[1], "old") == 0)
    {
        for (i = 0; i < loop_cnt; i++)
        {
            for (j = 0; j < 16; j++)
            {
                result += AllocSetFreeIndex(size[j]);
            }
        }
    }
    else
    {
        fprintf(stderr, "usage: alloc_test (new|old) loop_cnt size...\n");
        return 1;
    }

    fprintf(stderr, "%s, size:%d, loop:%d, checksum:%d\n",
            argv[1], size[0], loop_cnt, result);

    return 0;
}

aset_free_index.patch:

*** ./src/backend/utils/mmgr/aset.c.orig	2009-06-01 23:12:10.000000000 +0900
--- ./src/backend/utils/mmgr/aset.c	2009-06-02 08:47:30.000000000 +0900
***************
*** 263,268 ****
--- 263,287 ----
   * that size <= ALLOC_CHUNK_LIMIT.
   * ----------
   */
+ #if defined(__i386__) && defined(__GNUC__)
+ /*
+  * Faster version of AllocSetFreeIndex for the x86 architecture.
+  * This function runs in O(1).
+  */
+ static inline int
+ AllocSetFreeIndex(Size size)
+ {
+     int     idx;
+ 
+     if (__builtin_expect(size < (1 << ALLOC_MINBITS), 0))
+         size = (1 << ALLOC_MINBITS);
+ 
+     /* bsr (Bit Scan Reverse): find the most significant set bit */
+     __asm__ ("bsr %1, %0" : "=r"(idx) : "g"(size - 1));
+ 
+     return idx - (ALLOC_MINBITS - 1);
+ }
+ #else
  static inline int
  AllocSetFreeIndex(Size size)
  {
***************
*** 281,286 ****
--- 300,306 ----
  
      return idx;
  }
+ #endif   /* defined(__i386__) && defined(__GNUC__) */
  
  #ifdef RANDOMIZE_ALLOCATED_MEMORY
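The patch only replaces the loop when both __i386__ and __GNUC__ are defined; every other build keeps the existing loop. For comparison, the same O(1) index could be computed on any GCC target without inline assembly by using the compiler's count-leading-zeros builtin. The function below is only a sketch and is not part of the attached patch: it reuses the Size and ALLOC_MINBITS definitions from alloc_test.c above and assumes that Size values fit in an unsigned long.

/*
 * Sketch only, not part of the attached patch: a portable O(1) variant
 * that uses GCC's __builtin_clzl instead of the x86 bsr instruction.
 * __builtin_clzl is never called with a zero argument, because sizes of
 * (1 << ALLOC_MINBITS) bytes or less return early.
 */
static inline int
AllocSetFreeIndex_clz(Size size)
{
    unsigned long   v;

    if (size <= (1 << ALLOC_MINBITS))
        return 0;               /* requests of up to 8 bytes use freelist 0 */

    v = (unsigned long) (size - 1);

    /*
     * Index of the most significant set bit of (size - 1), minus the
     * (ALLOC_MINBITS - 1) offset, exactly as in the bsr version above.
     */
    return (int) (8 * sizeof(unsigned long) - 1 - __builtin_clzl(v))
        - (ALLOC_MINBITS - 1);
}

As with the existing loop and the bsr version, sizes 1-8 map to index 0, 9-16 to index 1, 17-32 to index 2, and so on.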