From 9a8af3fe03d9d7673f163e6087dd724acc40161d Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Mon, 1 Mar 2021 18:08:23 +1300 Subject: [PATCH v5] Detect dropped connections while running queries. Provide a new optional GUC that can be used to check whether the client connection has gone away periodically while running very long queries. Author: Sergey Cherkashin Author: Thomas Munro Reviewed-by: Thomas Munro Reviewed-by: Tatsuo Ishii Reviewed-by: Konstantin Knizhnik Reviewed-by: Zhihong Yu Discussion: https://postgr.es/m/77def86b27e41f0efcba411460e929ae%40postgrespro.ru --- doc/src/sgml/config.sgml | 36 +++++++ src/backend/libpq/pqcomm.c | 91 ++++++++++++++++++ src/backend/tcop/postgres.c | 8 ++ src/backend/utils/init/globals.c | 1 + src/backend/utils/init/postinit.c | 11 +++ src/backend/utils/misc/guc.c | 10 ++ src/backend/utils/misc/postgresql.conf.sample | 3 + src/include/libpq/libpq.h | 2 + src/include/miscadmin.h | 3 +- src/include/utils/timeout.h | 1 + src/test/modules/Makefile | 1 + src/test/modules/connection/Makefile | 16 ++++ .../connection/t/001_close_connection.pl | 96 +++++++++++++++++++ src/tools/msvc/Mkvcbuild.pm | 4 +- 14 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 src/test/modules/connection/Makefile create mode 100644 src/test/modules/connection/t/001_close_connection.pl diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index ee4925d6d9..fc463f0a65 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -998,6 +998,42 @@ include_dir 'conf.d' + + client_connection_check_interval (integer) + + client_connection_check_interval configuration parameter + + + + + Sets the time interval between checks that the client is still + connected, while running queries. The check is performed by testing + whether one byte could be read from the socket with + MSG_PEEK. If the kernel reports that the connection + has been closed or lost, a long running query can abort immediately, + rather than discovering the problem when it eventually tries to send + the response. + + + If this value is specified without units, it is taken as milliseconds. + The default value is 0, which disables connection + checks. Without connection checks, the server will detect the loss of + the connection only when it is waiting for a new request, receiving a + request or sending a response. + + + For the kernel itself to detect lost TCP connections reliably and + within a known timeframe in all scenarios including network failure, it + may also be necessary to adjust the default TCP keepalive settings of + the operating system, or the + , + and + settings of + PostgreSQL. + + + + diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index 4c7b1e7bfd..e5f999b898 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -79,6 +79,7 @@ #include "storage/ipc.h" #include "utils/guc.h" #include "utils/memutils.h" +#include "utils/timeout.h" /* * Cope with the various platform-specific ways to spell TCP keepalive socket @@ -104,6 +105,7 @@ */ int Unix_socket_permissions; char *Unix_socket_group; +int client_connection_check_interval; /* Where the Unix socket files are (list of palloc'd strings) */ static List *sock_paths = NIL; @@ -1921,3 +1923,92 @@ pq_settcpusertimeout(int timeout, Port *port) return STATUS_OK; } + +/* -------------------------------- + * pq_check_client_connection - check if client is still connected + * -------------------------------- + */ +void +pq_check_client_connection(void) +{ + CheckClientConnectionPending = false; + + /* + * We were called from CHECK_FOR_INTERRUPTS(), because + * client_connection_check_interval is set and the timer recently fired, so + * it's time to check if the kernel thinks the client is still there. This + * is a useful thing to do while the executor is doing busy work for a long + * time without any other kind of interaction with the socket. + * + * We'll only perform the check and re-arm the timer if we're possibly + * still running a query. We don't need to do any checks when we're + * sitting idle between queries, because in that case the FeBeWaitSet will + * wake up when the socket becomes ready to read, including lost + * connections. If a later query begins, the timer will be enabled afresh. + */ + if (IsUnderPostmaster && + MyProcPort != NULL && + !PqCommReadingMsg && + !PqCommBusy) + { + bool connection_lost = false; + char nextbyte; + int r; + +retry: +#ifdef WIN32 + pgwin32_noblock = 1; +#endif + r = recv(MyProcPort->sock, &nextbyte, 1, MSG_PEEK); +#ifdef WIN32 + pgwin32_noblock = 0; +#endif + + if (r == 0) + { + /* EOF detected. */ + connection_lost = true; + } + else if (r > 0) + { + /* Data available to read. Connection looks good. */ + } + else if (errno == EINTR) + { + /* Interrupted by a signal, so retry. */ + goto retry; + } + else if (errno == EAGAIN || errno == EWOULDBLOCK) + { + /* No data available to read. Connection looks good. */ + } + else + { + /* Got some other error. We'd better log the reason. */ + ereport(COMMERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("could not check client connection: %m"))); + connection_lost = true; + } + + if (connection_lost) + { + /* + * We're already in ProcessInterrupts(), and its check for + * ClientConnectionLost comes after the check for + * CheckClientConnectionPending. It seems a little fragile to + * rely on that here, so we'll also set InterruptPending to make + * sure that the next CHECK_FOR_INTERRUPTS() could handle it too + * if the code moves around. + */ + ClientConnectionLost = true; + InterruptPending = true; + } + else if (client_connection_check_interval > 0) + { + /* Schedule the next check, because the GUC is still enabled. */ + enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT, + client_connection_check_interval); + } + } +} diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 2b1b68109f..6d6f942b3f 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -2671,6 +2671,12 @@ start_xact_command(void) * not desired, the timeout has to be disabled explicitly. */ enable_statement_timeout(); + + /* Start timeout for checking if the client has gone away if necessary. */ + if (client_connection_check_interval > 0 && + !get_timeout_active(CLIENT_CONNECTION_CHECK_TIMEOUT)) + enable_timeout_after(CLIENT_CONNECTION_CHECK_TIMEOUT, + client_connection_check_interval); } static void @@ -3149,6 +3155,8 @@ ProcessInterrupts(void) (errcode(ERRCODE_ADMIN_SHUTDOWN), errmsg("terminating connection due to administrator command"))); } + if (CheckClientConnectionPending) + pq_check_client_connection(); if (ClientConnectionLost) { QueryCancelPending = false; /* lost connection trumps QueryCancel */ diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 73e0a672ae..a9f0fc3017 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -30,6 +30,7 @@ ProtocolVersion FrontendProtocol; volatile sig_atomic_t InterruptPending = false; volatile sig_atomic_t QueryCancelPending = false; volatile sig_atomic_t ProcDiePending = false; +volatile sig_atomic_t CheckClientConnectionPending = false; volatile sig_atomic_t ClientConnectionLost = false; volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false; volatile sig_atomic_t IdleSessionTimeoutPending = false; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 7abeccb536..0bac23e75d 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -34,6 +34,7 @@ #include "catalog/pg_db_role_setting.h" #include "catalog/pg_tablespace.h" #include "libpq/auth.h" +#include "libpq/libpq.h" #include "libpq/libpq-be.h" #include "mb/pg_wchar.h" #include "miscadmin.h" @@ -73,6 +74,7 @@ static void StatementTimeoutHandler(void); static void LockTimeoutHandler(void); static void IdleInTransactionSessionTimeoutHandler(void); static void IdleSessionTimeoutHandler(void); +static void ClientCheckTimeoutHandler(void); static bool ThereIsAtLeastOneRole(void); static void process_startup_options(Port *port, bool am_superuser); static void process_settings(Oid databaseid, Oid roleid); @@ -620,6 +622,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, RegisterTimeout(IDLE_IN_TRANSACTION_SESSION_TIMEOUT, IdleInTransactionSessionTimeoutHandler); RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler); + RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler); } /* @@ -1242,6 +1245,14 @@ IdleSessionTimeoutHandler(void) SetLatch(MyLatch); } +static void +ClientCheckTimeoutHandler(void) +{ + CheckClientConnectionPending = true; + InterruptPending = true; + SetLatch(MyLatch); +} + /* * Returns true if at least one role is defined in this database cluster. */ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 3b36a31a47..391d9983e0 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -3483,6 +3483,16 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"client_connection_check_interval", PGC_USERSET, CLIENT_CONN_OTHER, + gettext_noop("Sets the time interval for checking connection with the client."), + gettext_noop("A value of 0 disables this feature."), + GUC_UNIT_MS + }, + &client_connection_check_interval, + 0, 0, INT_MAX, + NULL, NULL, NULL + }, /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 86425965d0..dd850bf272 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -718,6 +718,9 @@ #dynamic_library_path = '$libdir' +#client_connection_check_interval = 0 # set time interval between + # checks for client disconnection while + # running long queries; 0 for never #------------------------------------------------------------------------------ # LOCK MANAGEMENT diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h index b20deeb555..55e605a82e 100644 --- a/src/include/libpq/libpq.h +++ b/src/include/libpq/libpq.h @@ -71,6 +71,7 @@ extern int pq_getbyte(void); extern int pq_peekbyte(void); extern int pq_getbyte_if_available(unsigned char *c); extern int pq_putmessage_v2(char msgtype, const char *s, size_t len); +extern void pq_check_client_connection(void); /* * prototypes for functions in be-secure.c @@ -109,6 +110,7 @@ extern ssize_t secure_open_gssapi(Port *port); extern char *SSLCipherSuites; extern char *SSLECDHCurve; extern bool SSLPreferServerCiphers; +extern int client_connection_check_interval; extern int ssl_min_protocol_version; extern int ssl_max_protocol_version; diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 013850ac28..40fcaff25f 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -84,7 +84,8 @@ extern PGDLLIMPORT volatile sig_atomic_t ProcDiePending; extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending; - +extern PGDLLIMPORT volatile sig_atomic_t ConfigReloadPending; +extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending; extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost; /* these are marked volatile because they are examined by signal handlers: */ diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h index ecb2a366a5..93e6a691b3 100644 --- a/src/include/utils/timeout.h +++ b/src/include/utils/timeout.h @@ -32,6 +32,7 @@ typedef enum TimeoutId STANDBY_LOCK_TIMEOUT, IDLE_IN_TRANSACTION_SESSION_TIMEOUT, IDLE_SESSION_TIMEOUT, + CLIENT_CONNECTION_CHECK_TIMEOUT, /* First user-definable timeout reason */ USER_TIMEOUT, /* Maximum number of timeout reasons */ diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 93e7829c67..db51a1d700 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -7,6 +7,7 @@ include $(top_builddir)/src/Makefile.global SUBDIRS = \ brin \ commit_ts \ + connection \ delay_execution \ dummy_index_am \ dummy_seclabel \ diff --git a/src/test/modules/connection/Makefile b/src/test/modules/connection/Makefile new file mode 100644 index 0000000000..5c44d7ad40 --- /dev/null +++ b/src/test/modules/connection/Makefile @@ -0,0 +1,16 @@ +# src/test/modules/connection/Makefile + +subdir = src/test/modules/connection +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +export with_openssl + +check: + $(prove_check) + +installcheck: + $(prove_installcheck) + +clean distclean maintainer-clean: + rm -rf tmp_check diff --git a/src/test/modules/connection/t/001_close_connection.pl b/src/test/modules/connection/t/001_close_connection.pl new file mode 100644 index 0000000000..f688fdb666 --- /dev/null +++ b/src/test/modules/connection/t/001_close_connection.pl @@ -0,0 +1,96 @@ +# Check if backend stopped after client disconnection +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More; +use File::Copy; + +if ($ENV{with_openssl} eq 'yes') +{ + plan tests => 3; +} +else +{ + plan tests => 2; +} + +my $long_query = q{ +SELECT pg_sleep(60); +}; +my $set_guc_on = q{ + SET client_connection_check_interval = 1000; +}; +my $set_guc_off = q{ + SET client_connection_check_interval = 0; +}; +my ($pid, $timed_out); + +my $node = get_new_node('node'); +$node->init; +$node->start; + +######################################################### +# TEST 1: GUC client_connection_check_interval: enabled # +######################################################### + +# Set GUC options, get backend pid and run a long time query +$node->psql('postgres', "$set_guc_on SELECT pg_backend_pid(); $long_query", + stdout => \$pid, timeout => 2, timed_out => \$timed_out); + +# Give time to the backend to detect client disconnected +sleep 3; +# Check if backend is still alive +my $is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;"); +is($is_alive, '0', 'Test: client_connection_check_interval enable'); +$node->stop; + +########################################################## +# TEST 2: GUC client_connection_check_interval: disabled # +########################################################## + +$node->start; +$node->psql('postgres', "$set_guc_off SELECT pg_backend_pid(); $long_query", + stdout => \$pid, timeout => 2, timed_out => \$timed_out); +# Give time to the client to disconnect +sleep 3; +# Check if backend is still alive +$is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;"); +is($is_alive, '1', 'Test: client_connection_check_interval disable'); +$node->stop; + +########################################################## +# TEST 3: Using client_connection_check_interval when # +# client connected using SSL # +########################################################## + +if ($ENV{with_openssl} eq 'yes') +{ + # The client's private key must not be world-readable, so take a copy + # of the key stored in the code tree and update its permissions. + copy("../../ssl/ssl/client.key", "../../ssl/ssl/client_tmp.key"); + chmod 0600, "../../ssl/ssl/client_tmp.key"; + copy("../../ssl/ssl/client-revoked.key", "../../ssl/ssl/client-revoked_tmp.key"); + chmod 0600, "../../ssl/ssl/client-revoked_tmp.key"; + $ENV{PGHOST} = $node->host; + $ENV{PGPORT} = $node->port; + + open my $sslconf, '>', $node->data_dir . "/sslconfig.conf"; + print $sslconf "ssl=on\n"; + print $sslconf "ssl_cert_file='server-cn-only.crt'\n"; + print $sslconf "ssl_key_file='server-password.key'\n"; + print $sslconf "ssl_passphrase_command='echo secret1'\n"; + close $sslconf; + + $node->start; + $node->psql('postgres', "$set_guc_on SELECT pg_backend_pid(); $long_query", + stdout => \$pid, timeout => 2, timed_out => \$timed_out, + sslmode => 'require'); + + # Give time to the backend to detect client disconnected + sleep 3; + # Check if backend is still alive + my $is_alive = $node->safe_psql('postgres', "SELECT count(*) FROM pg_stat_activity where pid = $pid;"); + is($is_alive, '0', 'Test: client_connection_check_interval enabled, SSL'); + $node->stop; +} diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm index a184404e21..833918e373 100644 --- a/src/tools/msvc/Mkvcbuild.pm +++ b/src/tools/msvc/Mkvcbuild.pm @@ -51,7 +51,9 @@ my @contrib_excludes = ( 'pgcrypto', 'sepgsql', 'brin', 'test_extensions', 'test_misc', 'test_pg_dump', - 'snapshot_too_old', 'unsafe_tests'); + 'snapshot_too_old', 'unsafe_tests', + 'connection' +); # Set of variables for frontend modules my $frontend_defines = { 'initdb' => 'FRONTEND' }; -- 2.30.1