diff --git a/configure b/configure index 24655dc..e239641 100755 --- a/configure +++ b/configure @@ -10193,7 +10193,7 @@ fi ## Header files ## -for ac_header in atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/epoll.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h +for ac_header in atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/epoll.h sys/event.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h do : as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" @@ -12425,7 +12425,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range towlower utime utimes wcstombs wcstombs_l +for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range towlower utime utimes wcstombs wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git a/configure.in b/configure.in index c564a76..ef9d450 100644 --- a/configure.in +++ b/configure.in @@ -1183,7 +1183,7 @@ AC_SUBST(UUID_LIBS) ## dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES -AC_CHECK_HEADERS([atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/epoll.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h]) +AC_CHECK_HEADERS([atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/epoll.h sys/event.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h]) # On BSD, test for net/if.h will fail unless sys/socket.h # is included first. @@ -1432,7 +1432,7 @@ PGAC_FUNC_WCSTOMBS_L LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range towlower utime utimes wcstombs wcstombs_l]) +AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range towlower utime utimes wcstombs wcstombs_l]) AC_REPLACE_FUNCS(fseeko) case $host_os in diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c index 42c2f52..98b8b7f 100644 --- a/src/backend/storage/ipc/latch.c +++ b/src/backend/storage/ipc/latch.c @@ -44,6 +44,9 @@ #ifdef HAVE_SYS_EPOLL_H #include #endif +#ifdef HAVE_SYS_EVENT_H +#include +#endif #ifdef HAVE_POLL_H #include #endif @@ -68,11 +71,14 @@ * useful to manually specify the used primitive. If desired, just add a * define somewhere before this block. */ -#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \ - defined(WAIT_USE_SELECT) || defined(WAIT_USE_WIN32) +#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_KQUEUE) || \ + defined(WAIT_USE_POLL) || defined(WAIT_USE_SELECT) || \ + defined(WAIT_USE_WIN32) /* don't overwrite manual choice */ #elif defined(HAVE_SYS_EPOLL_H) #define WAIT_USE_EPOLL +#elif defined(HAVE_KQUEUE) +#define WAIT_USE_KQUEUE #elif defined(HAVE_POLL) #define WAIT_USE_POLL #elif HAVE_SYS_SELECT_H @@ -108,6 +114,10 @@ struct WaitEventSet int epoll_fd; /* epoll_wait returns events in a user provided arrays, allocate once */ struct epoll_event *epoll_ret_events; +#elif defined(WAIT_USE_KQUEUE) + int kqueue_fd; + /* kevent returns events in a user provided arrays, allocate once */ + struct kevent *kqueue_ret_events; #elif defined(WAIT_USE_POLL) /* poll expects events to be waited on every poll() call, prepare once */ struct pollfd *pollfds; @@ -137,6 +147,8 @@ static void drainSelfPipe(void); #if defined(WAIT_USE_EPOLL) static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action); +#elif defined(WAIT_USE_KQUEUE) +static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int action); #elif defined(WAIT_USE_POLL) static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event); #elif defined(WAIT_USE_WIN32) @@ -490,6 +502,8 @@ CreateWaitEventSet(MemoryContext context, int nevents) #if defined(WAIT_USE_EPOLL) sz += sizeof(struct epoll_event) * nevents; +#elif defined(WAIT_USE_KQUEUE) + sz += sizeof(struct kevent) * nevents; #elif defined(WAIT_USE_POLL) sz += sizeof(struct pollfd) * nevents; #elif defined(WAIT_USE_WIN32) @@ -508,6 +522,9 @@ CreateWaitEventSet(MemoryContext context, int nevents) #if defined(WAIT_USE_EPOLL) set->epoll_ret_events = (struct epoll_event *) data; data += sizeof(struct epoll_event) * nevents; +#elif defined(WAIT_USE_KQUEUE) + set->kqueue_ret_events = (struct kevent *) data; + data += sizeof(struct kevent) * nevents; #elif defined(WAIT_USE_POLL) set->pollfds = (struct pollfd *) data; data += sizeof(struct pollfd) * nevents; @@ -523,6 +540,10 @@ CreateWaitEventSet(MemoryContext context, int nevents) set->epoll_fd = epoll_create(nevents); if (set->epoll_fd < 0) elog(ERROR, "epoll_create failed: %m"); +#elif defined(WAIT_USE_KQUEUE) + set->kqueue_fd = kqueue(); + if (set->kqueue_fd < 0) + elog(ERROR, "kqueue failed: %m"); #elif defined(WAIT_USE_WIN32) /* @@ -549,6 +570,8 @@ FreeWaitEventSet(WaitEventSet *set) { #if defined(WAIT_USE_EPOLL) close(set->epoll_fd); +#elif defined(WAIT_USE_KQUEUE) + close(set->kqueue_fd); #elif defined(WAIT_USE_WIN32) WaitEvent *cur_event; @@ -653,6 +676,8 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch, /* perform wait primitive specific initialization, if needed */ #if defined(WAIT_USE_EPOLL) WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD); +#elif defined(WAIT_USE_KQUEUE) + WaitEventAdjustKqueue(set, event, EV_ADD); #elif defined(WAIT_USE_POLL) WaitEventAdjustPoll(set, event); #elif defined(WAIT_USE_SELECT) @@ -711,6 +736,8 @@ ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch) #if defined(WAIT_USE_EPOLL) WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD); +#elif defined(WAIT_USE_KQUEUE) + WaitEventAdjustKqueue(set, event, EV_ADD); #elif defined(WAIT_USE_POLL) WaitEventAdjustPoll(set, event); #elif defined(WAIT_USE_SELECT) @@ -803,6 +830,71 @@ WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event) } #endif +#if defined(WAIT_USE_KQUEUE) + +/* + * action can be EV_ADD or EV_DELETE. EV_ADD is used for both adding and + * modifying, and EV_DELETE is not used yet. + */ +static void +WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int action) +{ + int rc; + struct kevent k_ev[2]; + int count = 1; + + k_ev[0].ident = event->fd; + k_ev[0].filter = 0; + k_ev[0].flags = action | EV_CLEAR; + k_ev[0].fflags = 0; + k_ev[0].data = 0; + k_ev[0].udata = event; + + Assert(event->fd >= 0); + if (event->events == WL_LATCH_SET) + { + Assert(set->latch != NULL); + k_ev[0].filter = EVFILT_READ; + } + else if (event->events == WL_POSTMASTER_DEATH) + { + k_ev[0].filter = EVFILT_READ; + } + else + { + Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)); + + /* + * If only one of read and write is request, we need only one kevent + * object. + */ + if (event->events & WL_SOCKET_READABLE) + k_ev[0].filter = EVFILT_READ; + else + k_ev[0].filter = EVFILT_WRITE; + + /* + * We need to create a second kevent object if we need both. The read + * and write notifications will arrive separately. + */ + if ((event->events & WL_SOCKET_READABLE) && + (event->events & WL_SOCKET_WRITEABLE)) + { + ++count; + k_ev[1] = k_ev[0]; + k_ev[1].filter = EVFILT_WRITE; + } + } + + rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL); + if (rc < 0) + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("kevent() failed: %m"))); +} + +#endif + #if defined(WAIT_USE_WIN32) static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event) @@ -1081,6 +1173,141 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, return returned_events; } +#elif defined(WAIT_USE_KQUEUE) + +/* + * Wait using FreeBSD kqueue(2)/kevent(2). Also available on other BSD-family + * systems including MacOSX. + * + * This is the preferrable wait method for systems that have it, as several + * readiness notifications are delivered, without having to iterate through + * all of set->events. + */ +static int +WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents) +{ + int returned_events = 0; + int rc; + WaitEvent *cur_event; + struct kevent *cur_kqueue_event; + struct timespec timeout; + struct timespec *timeout_p; + + if (cur_timeout < 0) + timeout_p = NULL; + else + { + timeout.tv_sec = cur_timeout / 1000; + timeout.tv_nsec = (cur_timeout % 1000) * 1000000; + timeout_p = &timeout; + } + + /* Sleep */ + rc = kevent(set->kqueue_fd, NULL, 0, + set->kqueue_ret_events, nevents, + timeout_p); + + /* Check return code */ + if (rc < 0) + { + /* EINTR is okay, otherwise complain */ + if (errno != EINTR) + { + waiting = false; + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("kevent() failed while trying to wait: %m"))); + } + return 0; + } + else if (rc == 0) + { + /* timeout exceeded */ + return -1; + } + + /* + * At least one event occurred, iterate over the returned kqueue events + * until they're either all processed, or we've returned all the events + * the caller desired. + */ + for (cur_kqueue_event = set->kqueue_ret_events; + cur_kqueue_event < (set->kqueue_ret_events + rc) && + returned_events < nevents; + cur_kqueue_event++) + { + /* epoll's data pointer is set to the associated WaitEvent */ + cur_event = (WaitEvent *) cur_kqueue_event->udata; + + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = 0; + + if (cur_event->events == WL_LATCH_SET && + cur_kqueue_event->flags & (EV_EOF | EVFILT_READ)) + { + /* There's data in the self-pipe, clear it. */ + drainSelfPipe(); + + if (set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events == WL_POSTMASTER_DEATH && + cur_kqueue_event->flags & (EVFILT_READ | EV_EOF)) + { + /* + * We expect an EV_EOF when the remote end is closed, but + * because we don't expect the pipe to become readable or to have + * any errors either, treat those cases as postmaster death, too. + * + * As explained in the WAIT_USE_SELECT implementation, select(2) + * may spuriously return. Be paranoid about that here too, a + * spurious WL_POSTMASTER_DEATH would be painful. + */ + if (!PostmasterIsAlive()) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) + { + Assert(cur_event->fd >= 0); + + if ((cur_event->events & WL_SOCKET_READABLE) && + (cur_kqueue_event->flags & (EV_EOF | EVFILT_READ))) + { + /* readable, or EOF */ + occurred_events->events |= WL_SOCKET_READABLE; + } + + if ((cur_event->events & WL_SOCKET_WRITEABLE) && + (cur_kqueue_event->flags & (EV_EOF | EVFILT_WRITE))) + { + /* writable, or EOF */ + occurred_events->events |= WL_SOCKET_WRITEABLE; + } + + if (occurred_events->events != 0) + { + occurred_events->fd = cur_event->fd; + occurred_events++; + returned_events++; + } + } + } + + return returned_events; +} + #elif defined(WAIT_USE_POLL) /* diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index c72635c..e319f1d 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -279,6 +279,9 @@ /* Define to 1 if you have isinf(). */ #undef HAVE_ISINF +/* Define to 1 if you have the `kqueue' function. */ +#undef HAVE_KQUEUE + /* Define to 1 if you have the header file. */ #undef HAVE_LANGINFO_H @@ -533,6 +536,9 @@ /* Define to 1 if you have the header file. */ #undef HAVE_SYS_EPOLL_H +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_EVENT_H + /* Define to 1 if you have the header file. */ #undef HAVE_SYS_IOCTL_H