ice-broker scan thread - Mailing list pgsql-hackers
From | Qingqing Zhou |
---|---|
Subject | ice-broker scan thread |
Date | |
Msg-id | Pine.LNX.4.58.0511282217470.13586@josh.db Whole thread Raw |
Responses |
Re: ice-broker scan thread
Re: ice-broker scan thread |
List | pgsql-hackers |
I am considering adding an "ice-broker scan thread" to accelerate PostgreSQL sequential scan IO speed. The basic idea of this thread is just like the "read-ahead" method, but the difference is that this one does not read the data into the shared buffer pool directly; instead, it reads the data into the file system cache, which makes the integration easy, and this is unique to PostgreSQL.

What happens in the original sequential scan:

for (;;)
{
    /*
     * a physical read may happen, due to current content of
     * file system cache and if the kernel is smart enough to
     * understand you want to do sequential scan
     */
    physical or logical read a page;
    process the page;
}

What happens in the sequential scan with ice-broker:

for (;;)
{
    /* since the ice-broker has read the page in already */
    logical read a page with big chance;
    process the page;
}

I wrote a program to simulate the sequential scan in PostgreSQL with/without ice-broker. The results indicate this technique has the following characteristics: (1) The important factor for the speedup is how much CPU time PostgreSQL uses on each data page. If PG is fast enough, then no speedup occurs; otherwise a 10% to 20% speedup is expected according to my test. (2) It uses more CPU - this is easy to understand, since it does more work; (3) The benefits also depend on other factors, like how smart your file system ...
Here are the test results on my machine:
---
$#uname -a
Linux josh.db 2.4.29-1 #2 Tue Jan 25 17:03:33 EST 2005 i686 unknown
$#cat /proc/meminfo | grep MemTotal
MemTotal: 1030988 kB
$#cat /proc/cpuinfo | grep CPU
model name : Intel(R) Pentium(R) 4 CPU 2.40GHz
$#./seqscan 10 $HOME/pginstall/bin/data/base/10794/18986 50
PostgreSQL sequential scan simulator configuration:
Memory size: 943718400
CPU cost per page: 50
Scan thread read unit size: 4
With scan threads off - duration: 56862.738 ms
With scan threads on - duration: 40611.101 ms
With scan threads off - duration: 46859.207 ms
With scan threads on - duration: 38598.234 ms
With scan threads off - duration: 56919.572 ms
With scan threads on - duration: 47023.606 ms
With scan threads off - duration: 52976.825 ms
With scan threads on - duration: 43056.506 ms
With scan threads off - duration: 54292.979 ms
With scan threads on - duration: 42946.526 ms
With scan threads off - duration: 51893.590 ms
With scan threads on - duration: 42137.684 ms
With scan threads off - duration: 46552.571 ms
With scan threads on - duration: 41892.628 ms
With scan threads off - duration: 45107.800 ms
With scan threads on - duration: 38329.785 ms
With scan threads off - duration: 47527.787 ms
With scan threads on - duration: 38293.581 ms
With scan threads off - duration: 48810.656 ms
With scan threads on - duration: 39018.500 ms
---
Notice that the cpu_cost=50 above might look too big (if you look into the code) - but in a concurrent situation, it is not that huge. Also, on my Windows box (PIII, 800), a cpu_cost=5 is enough to show a benefit of 10%. So in general, it does help in some situations, but it is not rocket science, since we can't predict the performance of the file system. It is fairly easy to integrate, and we should add a GUC parameter to control it.
We need more tests, any comments and tests are welcome, Regards, Qingqing --- /** seqscan.c* PostgreSQL sequential scan simulator with helper scan thread** Note* I wrote this simulatorto see if there is any benefits for sequential scan to* do read-ahead by another thread. The only thing youmay want to change in the* source file is MEMSZ, make it big enough to thrash your file system cache.** Use the following command to compile:* $gcc -O2 -Wall -pthread -lm seqscan.c -o seqscan* To use it:* $./seqscan <rounds> <datafile> <cpu_cost>* In which rounds is how many times you want to run the test (noticeeach round include* two disk-burn test), datafile is the path to any file (suggest size > 100M), and cpu_cost* is the cost that processing each page of the file. Try different cpu_cost.*/ #include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <memory.h> #include <errno.h> #include <math.h> #ifdef WIN32 #include <io.h> #include <windows.h> #define PG_BINARY O_BINARY #else #include <unistd.h> #include <pthread.h> #include <sys/stat.h> #include <sys/time.h> #include <sys/file.h> #define PG_BINARY 0 #endif typedef char bool; #define true ((bool) 1) #define false ((bool) 0) #define BLCKSZ 8192 #define UNITSZ 4 #define MEMSZ (950*1024*1024) char *data_file; int cpu_cost; volatile bool stop_scan; char thread_buffer[BLCKSZ*UNITSZ]; static void cleanup_cache(void) {char *p; if (NULL == (p = (char *)malloc(MEMSZ))){ fprintf(stderr, "insufficient memory\n"); exit(-1);} memset(p, 'a', MEMSZ);free(p); } #ifdef WIN32 bool enable_aio = false; static const unsigned __int64 epoch = 116444736000000000L; static int gettimeofday(struct timeval * tp, struct timezone * tzp) {FILETIME file_time;SYSTEMTIME system_time;ULARGE_INTEGER ularge; GetSystemTime(&system_time);SystemTimeToFileTime(&system_time, &file_time);ularge.LowPart = file_time.dwLowDateTime;ularge.HighPart= file_time.dwHighDateTime; tp->tv_sec = (long) ((ularge.QuadPart - epoch) / 10000000L);tp->tv_usec = (long) 
(system_time.wMilliseconds * 1000); return 0; } static void sleep(int secs) {SleepEx(secs*1000, true); } static int thread_open() {HANDLE fd;SECURITY_ATTRIBUTES sa; sa.nLength = sizeof(sa);sa.bInheritHandle = TRUE;sa.lpSecurityDescriptor = NULL; fd = CreateFile(data_file, GENERIC_READ, FILE_SHARE_READ|FILE_SHARE_WRITE|FILE_SHARE_DELETE, &sa, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN | (enable_aio?FILE_FLAG_OVERLAPPED:0), NULL); if (fd == INVALID_HANDLE_VALUE){ int errCode; switch (errCode = GetLastError()) { /* EMFILE, ENFILE should not occur from CreateFile. */ case ERROR_PATH_NOT_FOUND: case ERROR_FILE_NOT_FOUND: errno = ENOENT; break; case ERROR_FILE_EXISTS: errno= EEXIST; break; case ERROR_ACCESS_DENIED: errno = EACCES; break; default: fprintf(stderr,"thread_open failed: %d\n", errCode); errno = EINVAL; } return -1;} return (int)fd; } static int thread_read(int fd, int blkno, size_t nblk, char *buf) {long offset = BLCKSZ*blkno;long nbytes;OVERLAPPED ol; memset(&ol, 0, sizeof(OVERLAPPED));ol.Offset = offset;ol.OffsetHigh = 0; if (ReadFile((HANDLE)fd, buf, BLCKSZ*nblk, &nbytes, &ol)){ /* successfully done without delay */ NULL;}else{ interrCode; switch (errCode = GetLastError()) { case ERROR_IO_PENDING: break; case ERROR_HANDLE_EOF: break; default: /* unknown error occured */ fprintf(stderr, "asyncread failed: %d\n", errCode); exit(-1); }} return nbytes; } static void thread_close(int fd) {CloseHandle((HANDLE)fd); } #else /* non-windows platforms */ static int thread_open() {int fd; fd = open(data_file, O_RDWR | PG_BINARY, 0600);if (fd < 0){ fprintf(stderr, "thread_open failed: %d\n", errno); exit(-1);} return (int)fd; } static int thread_read(int fd, int blkno, size_t nblk, char *buf) {long offset = BLCKSZ*blkno;long nbytes; nbytes = lseek(fd, offset, SEEK_SET);nbytes = read(fd, buf, BLCKSZ*nblk);if (nbytes <= 0){ fprintf(stderr, "thread_readfailed: %d\n", errno); exit(-1);} return nbytes; } static void thread_close(int fd) {close(fd); } 
#endif #ifdef WIN32 static DWORD WINAPI scan_thread(LPVOID args) #else static void * scan_thread(void *args) #endif {int i, fd;int start, end; start = 0;end = (size_t)args; fd = thread_open();for (i = start; i < end; i+=UNITSZ){ thread_read(fd, i, UNITSZ, (char *)thread_buffer); /* check if I was asked to stop */ if (stop_scan == true) break;}thread_close(fd); return 0; } static int init_scan(bool with_threads, size_t *nblocks) {int fd; /* open file for do_scan */fd = open(data_file, O_RDWR | PG_BINARY, 0600);if (fd < 0){ fprintf(stderr, "failed to openfile %s\n", data_file); exit(-1);} *nblocks = lseek(fd, 0, SEEK_END) / BLCKSZ;if (*nblocks < 0){ fprintf(stderr, "failed to get file length %s\n", data_file); exit(-1);} if (with_threads){ #ifndef WIN32 pthread_t thread; #endif /* create scan threads */ stop_scan = false; #ifdef WIN32 if (NULL == CreateThread(NULL, 0, scan_thread, (void *)(*nblocks), 0, NULL)) #else if (pthread_create(&thread, NULL, scan_thread, (void *)(*nblocks))) #endif { fprintf(stderr, "failed to start scan thread"); exit(-1); }} return fd; } static void do_scan(int fd, size_t nblocks) {int i, j, k, nbytes;char buffer[BLCKSZ]; for (i = 0; i < nblocks; i++){ nbytes = lseek(fd, i*BLCKSZ, SEEK_SET); nbytes = read(fd, buffer, BLCKSZ); if (nbytes!= BLCKSZ) { fprintf(stderr, "do_scan read failed\n"); exit(-1); } /* pretend to do some CPU intensive analysis */ for (k = 0; k < cpu_cost; k++) { for (j = (k*sizeof(int))%BLCKSZ; j < BLCKSZ / (5 * sizeof(int)); j += sizeof(int)) { int x, y; x = ((int *)buffer)[j]; x = (int)pow((double)x, (double)(x+1)); y = (int)sin((double)x*x); ((int *)buffer)[j] = x*y; } }} } static void close_scan(fd) {stop_scan = true;close(fd); } int main(int argc, char *argv[]) {int i, rounds, fd;size_t nblocks; if (argc != 4){ fprintf(stderr, "usage: cache <rounds> <datafile> <cpu_cost>\n"); exit(-1);} rounds = atoi(argv[1]);data_file = argv[2];cpu_cost = atoi(argv[3]);fd = init_scan(false, 
&nblocks);close_scan(fd);fprintf(stdout,"PostgreSQL sequential scan simulator configuration:\n" "\tMemorysize: %u\n" "\tCPU cost per page: %d\n" "\tScan thread read unit size:%d\n\n", MEMSZ, cpu_cost, UNITSZ); for (i = 0; i < 2*rounds; i++){ struct timeval start_t, stop_t; long usecs; bool enable = i%2?true:false; /* eliminate system cached data */ cleanup_cache(); sleep(2); /* do the scan task */ gettimeofday(&start_t, NULL); fd = init_scan(enable, &nblocks); do_scan(fd, nblocks); close_scan(fd); gettimeofday(&stop_t, NULL); /* measure the time */ if (stop_t.tv_usec < start_t.tv_usec) { stop_t.tv_sec--; stop_t.tv_usec +=1000000; } usecs = (long) (stop_t.tv_sec - start_t.tv_sec) * 1000000 + (long) (stop_t.tv_usec - start_t.tv_usec); fprintf (stdout, "With scan threads %s - duration: %ld.%03ld ms\n", enable?"on":"off", (long) ((stop_t.tv_sec - start_t.tv_sec) * 1000 + (stop_t.tv_usec - start_t.tv_usec) / 1000), (long) (stop_t.tv_usec - start_t.tv_usec) % 1000); sleep(2);} exit(0); }
pgsql-hackers by date: