faster version of AllocSetFreeIndex for x86 architecture - Mailing list pgsql-hackers
| From | Atsushi Ogawa |
|---|---|
| Subject | faster version of AllocSetFreeIndex for x86 architecture |
| Date | |
| Msg-id | 4A253CF0.1000702@hi-ho.ne.jp Whole thread Raw |
| Responses |
Re: faster version of AllocSetFreeIndex for x86 architecture
Re: faster version of AllocSetFreeIndex for x86 architecture |
| List | pgsql-hackers |
Hi,
I made a faster version of AllocSetFreeIndex for x86 architecture.
Attached files are benchmark programs and patch file.
alloc_test.pl: benchmark script
alloc_test.c: benchmark program
aset_free_index.patch: patch for util/mmgr/aset.c
This benchmark compares the original function with a faster version.
To try the benchmark, only execute alloc_test.pl. This script compiles
alloc_test.c and execute the benchmark.
Results of benchmark script:
Xeon(Core architecture), RedHat EL4, gcc 3.4.6
bytes : 4 8 16 32 64 128 256 512 1024 mix
original: 0.780 0.780 0.820 0.870 0.930 0.970 1.030 1.080 1.130 0.950
patched : 0.380 0.170 0.170 0.170 0.170 0.180 0.170 0.180 0.180 0.280
Core2, Windows XP, gcc 3.4.4 (cygwin)
bytes : 4 8 16 32 64 128 256 512 1024 mix
original: 0.249 0.249 0.515 0.452 0.577 0.671 0.796 0.890 0.999 1.577
patched : 0.358 0.218 0.202 0.218 0.218 0.218 0.202 0.218 0.218 0.218
Xeon(Pentium4 architecture), RedHal EL4, gcc 3.4.6
bytes : 4 8 16 32 64 128 256 512 1024 mix
original: 0.510 0.520 0.620 0.860 0.970 1.260 1.150 1.220 1.290 0.860
patched : 0.620 0.530 0.530 0.540 0.540 0.530 0.540 0.530 0.530 0.490
The effect of the patch that I measured by oprofile is:
- test program: pgbench -c 1 -t 50000 (fsync=off)
original:
CPU: P4 / Xeon with 2 hyper-threads, speed 2793.55 MHz (estimated)
Counted GLOBAL_POWER_EVENTS events
with a unit mask of 0x01 (mandatory) count 100000
samples % symbol name
66854 6.6725 AllocSetAlloc
47679 4.7587 base_yyparse
29058 2.9002 hash_search_with_hash_value
22053 2.2011 SearchCatCache
19264 1.9227 MemoryContextAllocZeroAligned
16223 1.6192 base_yylex
13819 1.3792 ScanKeywordLookup
13305 1.3279 expression_tree_walker
12144 1.2121 LWLockAcquire
11850 1.1827 XLogInsert
11817 1.1794 AllocSetFree
patched:
CPU: P4 / Xeon with 2 hyper-threads, speed 2793.55 MHz (estimated)
Counted GLOBAL_POWER_EVENTS events
with a unit mask of 0x01 (mandatory) count 100000
samples % symbol name
47610 4.9333 AllocSetAlloc
47441 4.9158 base_yyparse
28243 2.9265 hash_search_with_hash_value
22197 2.3000 SearchCatCache
18984 1.9671 MemoryContextAllocZeroAligned
15747 1.6317 base_yylex
13368 1.3852 ScanKeywordLookup
12889 1.3356 expression_tree_walker
12092 1.2530 LWLockAcquire
12078 1.2515 XLogInsert
(skip)
6248 0.6474 AllocSetFree
I think this patch improves AllocSetAlloc/AllocSetFree performance.
Best regards,
---
Atsushi Ogawa
a_ogawa@hi-ho.ne.jp
#!/usr/bin/perl
system "gcc -O2 -o alloc_test alloc_test.c";
my @test_bytes = (4,8,16,32,64,128,256,512,1024,
'8 16 28 36 12 4 8 64 1024 8 24 12 8 64 16');
my $cnt = 10000000;
my @old_result;
my @new_result;
my $t0, $t1, $e;
foreach $e (@test_bytes) {
$t0 = (times)[2];
system "./alloc_test old $cnt $e";
push @old_result, (times)[2] - $t0;
$t0 = (times)[2];
system "./alloc_test new $cnt $e";
push @new_result, (times)[2] - $t0;
}
print " bytes : ";
foreach $e (@test_bytes) {
$e = 'mix' if($e =~ /\d+ \d+/);
printf("%5s ", $e);
}
print "\n";
print " original: ";
foreach $e (@old_result) { printf("%.3f ", $e); }
print "\n";
print " patched : ";
foreach $e (@new_result) { printf("%.3f ", $e); }
print "\n";
#include <stdio.h>
#define Assert(condition)
#define ALLOC_MINBITS 3
#define ALLOCSET_NUM_FREELISTS 11
typedef size_t Size;
/*
* faster version of AllocSetFreeIndex for x86 architecure.
* this function runs in O(1).
*/
static inline int
AllocSetFreeIndex_new(Size size)
{
int idx;
if (__builtin_expect(size < (1 << ALLOC_MINBITS), 0))
size = (1 << ALLOC_MINBITS);
/* bsr(Bit Scan Reverse): Search the most significant set bit */
__asm__ ("bsr %1, %0" :"=r"(idx) :"g"(size - 1));
return idx - (ALLOC_MINBITS - 1);
}
static inline int
AllocSetFreeIndex(Size size)
{
int idx = 0;
if (size > 0)
{
size = (size - 1) >> ALLOC_MINBITS;
while (size != 0)
{
idx++;
size >>= 1;
}
Assert(idx < ALLOCSET_NUM_FREELISTS);
}
return idx;
}
int main(int argc, char *argv[])
{
int loop_cnt;
int size[16];
int i, j;
int result = 0;
if(argc < 4) {
fprintf(stderr, "usage: asettest (new|old) loop_cnt size...\n");
return 1;
}
loop_cnt = atoi(argv[2]);
for(i = 0; i < 16; i++) {
if(argc <= i + 3) {
size[i] = size[0];
} else {
size[i] = atoi(argv[i + 3]);
}
}
if(strcmp(argv[1], "new") == 0) {
for(i = 0; i < loop_cnt; i++) {
for(j = 0; j < 16; j++) {
result += AllocSetFreeIndex_new(size[j]);
}
}
} else if(strcmp(argv[1], "old") == 0) {
for(i = 0; i < loop_cnt; i++) {
for(j = 0; j < 16; j++) {
result += AllocSetFreeIndex(size[j]);
}
}
} else {
fprintf(stderr, "usage: asettest (new|old) size loop_cnt\n");
return 1;
}
fprintf(stderr, "%s, size:%d, loop:%d, checksum:%d\n",
argv[1], size[0], loop_cnt, result);
return 0;
}
*** ./src/backend/utils/mmgr/aset.c.orig 2009-06-01 23:12:10.000000000 +0900
--- ./src/backend/utils/mmgr/aset.c 2009-06-02 08:47:30.000000000 +0900
***************
*** 263,268 ****
--- 263,287 ----
* that size <= ALLOC_CHUNK_LIMIT.
* ----------
*/
+ #if defined(__i386__) && defined(__GNUC__)
+ /*
+ * faster version of AllocSetFreeIndex for x86 architecure.
+ * this function runs in O(1).
+ */
+ static inline int
+ AllocSetFreeIndex(Size size)
+ {
+ int idx;
+
+ if (__builtin_expect(size < (1 << ALLOC_MINBITS), 0))
+ size = (1 << ALLOC_MINBITS);
+
+ /* bsr(Bit Scan Reverse): Search the most significant set bit */
+ __asm__ ("bsr %1, %0" :"=r"(idx) :"g"(size - 1));
+
+ return idx - (ALLOC_MINBITS - 1);
+ }
+ #else
static inline int
AllocSetFreeIndex(Size size)
{
***************
*** 281,286 ****
--- 300,306 ----
return idx;
}
+ #endif /* defined(__i386__) && defined(__GNUC__) */
#ifdef RANDOMIZE_ALLOCATED_MEMORY
pgsql-hackers by date: