From d4eb4b8bdb36ec928bb14b081f9067f223afa791 Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Sun, 10 Jun 2018 18:44:42 +0900 Subject: [PATCH v22 2/4] Support atomic commit among multiple foreign servers. --- doc/src/sgml/catalogs.sgml | 97 + doc/src/sgml/config.sgml | 143 +- doc/src/sgml/distributed-transaction.sgml | 157 ++ doc/src/sgml/fdwhandler.sgml | 203 ++ doc/src/sgml/filelist.sgml | 1 + doc/src/sgml/func.sgml | 51 + doc/src/sgml/monitoring.sgml | 60 + doc/src/sgml/postgres.sgml | 1 + src/backend/access/Makefile | 2 +- src/backend/access/fdwxact/Makefile | 17 + src/backend/access/fdwxact/fdwxact.c | 2678 +++++++++++++++++++++++++ src/backend/access/fdwxact/fdwxact_launcher.c | 641 ++++++ src/backend/access/fdwxact/fdwxact_resolver.c | 331 +++ src/backend/access/heap/heapam.c | 12 - src/backend/access/rmgrdesc/Makefile | 8 +- src/backend/access/rmgrdesc/fdwxactdesc.c | 65 + src/backend/access/rmgrdesc/xlogdesc.c | 6 +- src/backend/access/transam/Makefile | 6 +- src/backend/access/transam/rmgr.c | 1 + src/backend/access/transam/twophase.c | 42 + src/backend/access/transam/xact.c | 26 +- src/backend/access/transam/xlog.c | 32 +- src/backend/catalog/system_views.sql | 11 + src/backend/commands/copy.c | 7 + src/backend/commands/foreigncmds.c | 30 + src/backend/executor/execPartition.c | 8 + src/backend/executor/nodeForeignscan.c | 25 + src/backend/executor/nodeModifyTable.c | 24 + src/backend/foreign/foreign.c | 43 + src/backend/postmaster/bgworker.c | 8 + src/backend/postmaster/pgstat.c | 18 + src/backend/postmaster/postmaster.c | 15 +- src/backend/replication/logical/decode.c | 1 + src/backend/storage/ipc/ipci.c | 6 + src/backend/storage/ipc/procarray.c | 46 + src/backend/storage/lmgr/lwlocknames.txt | 2 + src/backend/storage/lmgr/proc.c | 8 + src/backend/tcop/postgres.c | 14 + src/backend/utils/misc/guc.c | 80 + src/backend/utils/misc/postgresql.conf.sample | 16 + src/backend/utils/probes.d | 2 + src/bin/initdb/initdb.c | 1 + 
src/bin/pg_controldata/pg_controldata.c | 2 + src/bin/pg_resetwal/pg_resetwal.c | 2 + src/bin/pg_waldump/rmgrdesc.c | 1 + src/include/access/fdwxact.h | 149 ++ src/include/access/fdwxact_launcher.h | 32 + src/include/access/fdwxact_resolver.h | 23 + src/include/access/fdwxact_xlog.h | 52 + src/include/access/resolver_internal.h | 67 + src/include/access/rmgrlist.h | 1 + src/include/access/twophase.h | 1 + src/include/access/xact.h | 7 + src/include/access/xlog_internal.h | 1 + src/include/catalog/pg_control.h | 1 + src/include/catalog/pg_proc.dat | 23 + src/include/foreign/fdwapi.h | 18 +- src/include/foreign/foreign.h | 2 +- src/include/pgstat.h | 8 +- src/include/storage/proc.h | 10 + src/include/storage/procarray.h | 5 + src/include/utils/guc_tables.h | 2 + src/test/regress/expected/rules.out | 12 + 63 files changed, 5324 insertions(+), 40 deletions(-) create mode 100644 doc/src/sgml/distributed-transaction.sgml create mode 100644 src/backend/access/fdwxact/Makefile create mode 100755 src/backend/access/fdwxact/fdwxact.c create mode 100644 src/backend/access/fdwxact/fdwxact_launcher.c create mode 100644 src/backend/access/fdwxact/fdwxact_resolver.c create mode 100644 src/backend/access/rmgrdesc/fdwxactdesc.c create mode 100644 src/include/access/fdwxact.h create mode 100644 src/include/access/fdwxact_launcher.h create mode 100644 src/include/access/fdwxact_resolver.h create mode 100644 src/include/access/fdwxact_xlog.h create mode 100644 src/include/access/resolver_internal.h diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 8b7f169..f2f0571 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -9597,6 +9597,103 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx + + <structname>pg_prepared_fdw_xacts</structname> + + + pg_prepared_fdw_xacts + + + + The view pg_prepared_fdw_xacts displays + information about foreign transactions that are currently prepared on + foreign servers for atomic distributed 
transaction commit (see + for details). + + + + pg_prepared_fdw_xacts contains one row per prepared + foreign transaction. An entry is removed when the foreign transaction is + committed or rolled back. + + + + <structname>pg_prepared_fdw_xacts</structname> Columns + + + + + Name + Type + References + Description + + + + + dbid + oid + pg_database.oid + + OID of the database which the foreign transaction resides in + + + + transaction + xid + + + Transaction ID that this foreign transaction is associated with + + + + serverid + oid + pg_foreign_server.oid + + The OID of the foreign server on which this foreign transaction is prepared + + + + userid + oid + pg_user.oid + + The OID of the user that prepared this foreign transaction. + + + + status + text + + + Status of foreign transaction: prepared, committing, aborting or unknown + + + + identifier + text + + + The identifier of the prepared foreign transaction. + + + + +
+ + + When the pg_prepared_fdw_xacts view is accessed, the + internal transaction manager data structures are momentarily locked, and + a copy is made for the view to display. This ensures that the + view produces a consistent set of results, while not blocking + normal operations longer than necessary. Nonetheless + there could be some impact on database performance if this view is + frequently accessed. + + +
+ <structname>pg_publication_tables</structname> diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 0f8f2ef..4fffb76 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -3611,7 +3611,6 @@ ANY num_sync ( @@ -7827,6 +7826,148 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir' + + Distributed Transaction Management + + + Setting + + + + distributed_atomic_commit (enum) + + distributed_atomic_commit configuration parameter + + + + + Specifies whether transaction commit will wait for all involving foreign transaction + to be resolved before the command returns a "success" indication to the client. + Valid values are required, prefer and + disabled. The default setting is disabled. + When disabled, there can be risk of database consistency among + distributed transaction if some foreign server crashes during committing the + distributed transaction. When set to required the distributed + transaction requires that all written servers can use two-phase commit protocol. + That is, the transaction fails if any of servers returns false + from IsTwoPhaseCommitEnabled or does not support transaction + management callback routines(described in + ). + When set to prefer the distributed transaction requires + two-phase commit protocol where available but without failing when it is not + available. + + + + Both max_prepared_foreign_transactions and + max_foreign_transaction_resolvers must be non-zero value to + set this parameter either required or prefer. + + + + This parameter can be changed at any time; the behavior for any one transaction + is determined by the setting in effect when it commits. + + + + + + max_prepared_foreign_transactions (integer) + + max_prepared_foreign_transactions configuration parameter + + + + + Sets the maximum number of foreign transactions that can be prepared + simultaneously. A single local transaction can give rise to multiple + foreign transaction. 
If N local transactions each + span K foreign servers, this value needs to be set to + N * K, not just N. + This parameter can only be set at server start. + + + When running a standby server, you must set this parameter to the + same or higher value than on the master server. Otherwise, queries + will not be allowed in the standby server. + + + + + + + + + Foreign Transaction Resolvers + + + These settings control the behavior of a foreign transaction resolver. + + + + + max_foreign_transaction_resolvers (integer) + + max_foreign_transaction_resolvers configuration parameter + + + + + Specifies the maximum number of foreign transaction resolution workers. A foreign transaction + resolver is responsible for foreign transaction resolution on one database. + + + Foreign transaction resolution workers are taken from the pool defined by + max_worker_processes. + + + The default value is 0. + + + + + + foreign_transaction_resolution_retry_interval (integer) + + foreign_transaction_resolution_retry_interval configuration parameter + + + + + Specifies how long the foreign transaction resolver should wait after the last resolution + fails before retrying to resolve foreign transactions. This parameter can only be set in the + postgresql.conf file or on the server command line. + + + The default value is 10 seconds. + + + + + + foreign_transaction_resolver_timeout (integer) + + foreign_transaction_resolver_timeout configuration parameter + + + + + Terminate foreign transaction resolver processes that don't have any foreign + transactions to resolve for longer than the specified number of milliseconds. + A value of zero disables the timeout mechanism. You should set this value to + zero only if you set max_foreign_transaction_resolvers to + at least the number of databases you have. This parameter can only be set in the + postgresql.conf file or on the server command line. + + + The default value is 60 seconds.
+ + + + + + + Version and Platform Compatibility diff --git a/doc/src/sgml/distributed-transaction.sgml b/doc/src/sgml/distributed-transaction.sgml new file mode 100644 index 0000000..deb8a60 --- /dev/null +++ b/doc/src/sgml/distributed-transaction.sgml @@ -0,0 +1,157 @@ + + + + Distributed Transaction Management + + + This chapter explains what distributed transaction management is, and how it can be configured + in PostgreSQL. + + + + Atomic Commit + + + Atomic commit is an operation that applies a set of changes as a single operation + globally. PostgreSQL provides a way to perform a transaction + with foreign resources using Foreign Data Wrapper. Using + PostgreSQL's atomic commit ensures that all changes + on foreign servers end in either commit or rollback using the transaction callback + routines (see ). + + + + Atomic Commit Using Two-phase Commit Protocol + + + To achieve commit among all foreign servers atomically, + PostgreSQL employs the two-phase commit protocol, which is a + type of atomic commitment protocol (ACP). Using the two-phase commit protocol, the commit + sequence of a distributed transaction is performed in the following steps. + + + + Prepare all transactions on foreign servers. + + + + + Commit locally. + + + + + Resolve all prepared transactions on foreign servers. + + + + + + + + At the first step, PostgreSQL distributed transaction manager + prepares all transactions on the foreign servers if two-phase commit is required. + Two-phase commit is required only if the transaction modifies data on two or more + servers including the local server itself and the user requests it by + . If all preparations on foreign servers + succeed, it goes to the next step. If any failure happens in this step, + PostgreSQL switches over to rollback, then rolls back all transactions + on both local and foreign servers. + + + + At the local commit step, PostgreSQL commits the transaction + locally.
If any failure happens in this step, PostgreSQL switches + over to rollback, then rolls back all transactions on both local and foreign servers. + + + + At the final step, prepared transactions are resolved by a foreign transaction + resolver process. + + + + + Foreign Transaction Resolution + + + Foreign transaction resolutions are performed by foreign transaction resolver processes. + They commit all prepared transactions on foreign servers if the coordinator received + an agreement message from all foreign servers during the first step. On the other hand, + if any foreign server failed to prepare the transaction, they roll back all prepared + transactions. + + + + One foreign transaction resolver is responsible for transaction resolutions on one + database on the coordinator side. On failure during resolution, it retries + the resolution after foreign_transaction_resolution_retry_interval. + + + + + In-doubt Transactions + + + The atomic commit mechanism ensures that all foreign servers either commit or rollback + using two-phase commit protocol. However, if the second phase fails for whatever reason, + the transaction becomes in-doubt. A transaction becomes in-doubt in the following + situations: + + + + + A local PostgreSQL server crashes during atomic commit + operation. + + + + + + A local PostgreSQL server receives a cancellation from the user during + atomic commit. + + + + + In-doubt transactions are automatically handled by a foreign transaction resolver process + when there is no online transaction requesting resolutions. + pg_resolve_fdw_xact provides a way to manually resolve transactions on the foreign + servers that participated in the distributed transaction. + + + + The atomic commit operation is crash-safe. Foreign transactions that were being processed at the time of a + crash are processed by a foreign transaction resolver as in-doubt transactions. + + + + + Monitoring + + The monitoring information about foreign transaction resolvers is visible in + pg_stat_fdw_xact_resolver + view.
This view contains one row for every foreign transaction resolver worker. + + + + + Configuration Settings + + + Atomic commit requires several configuration options to be set. + + + + On the coordinator side, and + must be set to non-zero values. + Additionally the max_worker_processes may need to be adjusted to + accommodate foreign transaction resolver workers, at least + (max_foreign_transaction_resolvers + 1). + Note that some extensions and parallel queries also take worker slots from + max_worker_processes. + + + + + diff --git a/doc/src/sgml/fdwhandler.sgml b/doc/src/sgml/fdwhandler.sgml index 4ce88dd..90cc415 100644 --- a/doc/src/sgml/fdwhandler.sgml +++ b/doc/src/sgml/fdwhandler.sgml @@ -1390,6 +1390,118 @@ ReparameterizeForeignPathByChild(PlannerInfo *root, List *fdw_private, + + FDW Routines For Transaction Management + + + If an FDW wishes to support atomic commit + (as described in ), it must call the + registration function FdwXactRegisterForeignTransaction + and provide the following callback functions: + + + + +bool +PrepareForeignTransaction(FdwXactResolveState *state); + + Prepare the transaction on the foreign server. This function is called at the + pre-commit phase of the local transaction if atomic commit is required. + Returning true means that preparing the foreign + transaction succeeded. + + + +bool +CommitForeignTransaction(FdwXactResolveState *state); + + Commit the not-prepared transaction on the foreign server. + This function is called at the pre-commit phase of the local + transaction if atomic commit is not required. Atomic + commit is not required either when we modified data on + only one server including the local server or when the user doesn't + request atomic commit by . + Returning true means that committing the + foreign transaction succeeded. + + + +bool +RollbackForeignTransaction(FdwXactResolveState *state); + + Rollback a not-prepared transaction on the foreign server.
+ This function is called at the end of the local transaction, after + rolling back locally, either when the user requested rollback or when + any error occurs during the transaction. This function could + be called recursively if any error occurs during rollback of the + foreign transaction for whatever reason. You need to track + recursion and prevent this function from being called infinitely. + Returning true means that rolling back the + foreign transaction succeeded. + + + +bool +ResolvePreparedForeignTransaction(FdwXactResolveState *state, + bool is_commit); + + Commit or rollback the prepared transaction on the foreign server. + When is_commit is true, it indicates that the foreign + transaction should be committed. Otherwise the foreign transaction should + be aborted. + This function normally is called by the foreign transaction resolver + process but can also be called by the pg_resolve_fdw_xact + function. In the resolver process, this function is called either + when a backend requests the resolver process to resolve a distributed + transaction after it has been prepared, or when a database has dangling + transactions. Returning true means that resolving + the foreign transaction succeeded. + In the abort case, please note that the prepared transaction identified + by state->fdwxact_id might not exist on the foreign + server. If you failed to resolve the foreign transaction due to an undefined + object error (ERRCODE_UNDEFINED_OBJECT), you should + regard it as success and return true. + + + +bool +IsTwoPhaseCommitEnabled(Oid serverid); + + Return true if the foreign server identified by + serverid is capable of two-phase commit protocol. + This function is called once at commit time. + Returning false indicates that the current transaction + cannot use atomic commit even if atomic commit is requested by the user.
+ + + + +char * +GetPrepareId(TransactionId xid, Oid serverid, Oid userid, int *prep_id_len); + + Return a null-terminated string that represents the prepared transaction identifier, + with its length in *prep_id_len. + This optional function is called during executor startup, once per + foreign server. Note that the transaction identifier must be a string literal, + less than NAMEDATALEN bytes long and should not be the same + as any other concurrent prepared transaction id. If this callback routine + is not supported, PostgreSQL's distributed + transaction manager generates a unique identifier in the form of + fx_<random value up to 2^31>_<server oid>_<user oid>. + + + + + Functions PrepareForeignTransaction, + CommitForeignTransaction and + RollbackForeignTransaction are called + outside of a valid transaction state. So please note that + you cannot use functions that use the system catalog cache + such as Foreign Data Wrapper helper functions described in + . + + + @@ -1835,4 +1947,95 @@ GetForeignServerByName(const char *name, bool missing_ok); + + Transaction Management for Foreign Data Wrappers + + + PostgreSQL foreign transaction manager + allows FDWs to read and write data on foreign servers within a transaction while + maintaining atomicity of the foreign data (aka atomic commit). Using + atomic commit, it guarantees that a distributed transaction is committed + or rolled back on all participating foreign + servers. To achieve atomic commit, PostgreSQL + employs the two-phase commit protocol, which is a type of atomic commitment + protocol. Every FDW that wishes to support atomic commit + is required to support the transaction management callback routines: + PrepareForeignTransaction, + CommitForeignTransaction, + RollbackForeignTransaction, + ResolveForeignTransaction, + IsTwoPhaseCommitEnabled + (see for details). + Transactions on foreign servers that support these callback routines are + managed by PostgreSQL's distributed transaction + manager.
Each transaction management callback is called at the appropriate time. + + + + The information in FdwXactState can be used to identify + foreign servers. state->fdw_state is a void + pointer that is available for FDW transaction functions to store information + relevant to the particular foreign server. It is useful for passing + information forward from PrepareForeignTransaction and/or + CommitForeignTransaction to + RollbackForeignTransaction, thereby avoiding recalculation. + Note that since ResolveForeignTransaction is called + independently from these callback routines, the information is not passed to + ResolveForeignTransaction. + + + + An example of a distributed transaction is as follows: + +BEGIN; +UPDATE ft1 SET col = 'a'; +UPDATE ft2 SET col = 'b'; +COMMIT; + + ft1 and ft2 are foreign tables on different foreign servers, possibly using different + Foreign Data Wrappers. + + + + When the core executor accesses the foreign servers, foreign servers whose FDW + supports transaction management callback routines are registered as participants. + During registration, GetPrepareId is called if provided to + generate a unique transaction identifier. + + + + During the pre-commit phase of the local transaction, the foreign transaction manager + persists the foreign transaction information to the disk and WAL, and then + prepares all foreign transactions by calling PrepareForeignTransaction + if two-phase commit protocol is required. Two-phase commit is required only if + the transaction modified data on more than one server including the local + server itself and the user requests atomic commit. PostgreSQL + can commit locally and go to the next step if and only if all foreign + transactions were prepared successfully. If two-phase commit is not required, the foreign + transaction manager commits each transaction by calling + CommitForeignTransaction and then commits locally.
+ If any failure happens or the user requests cancellation during the pre-commit phase, + the distributed transaction manager switches over to rollback and calls + RollbackForeignTransaction for not-prepared foreign + servers, and then rolls back locally. The prepared foreign transactions are rolled back + by a foreign transaction resolver process. + + + + Once committed locally, the distributed transaction must be committed. The + prepared foreign transactions will be committed by a foreign transaction resolver + process. + + + + When two-phase commit is required, after committing locally, the transaction + commit will wait for all prepared foreign transactions to be committed before + completion. One foreign transaction resolver process is responsible for + foreign transaction resolution on a database. + ResolveForeignTransaction is called by the foreign + transaction resolver process during resolution. + ResolveForeignTransaction is also called + when the user executes the pg_resolve_fdw_xact function. + + diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml index 48ac14a..38d6fcb 100644 --- a/doc/src/sgml/filelist.sgml +++ b/doc/src/sgml/filelist.sgml @@ -49,6 +49,7 @@ + diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index edeb3fd..d609324 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -20905,6 +20905,57 @@ SELECT (pg_stat_file('filename')).modification; + + Foreign Transaction Management Functions + + + pg_resolve_fdw_xact + + + pg_remove_fdw_xact + + + + shows the functions + available for foreign transaction management. + These functions cannot be executed during recovery. Use of these functions + is restricted to superusers. + + + + Foreign Transaction Management Functions + + + Name Return Type Description + + + + + + pg_resolve_fdw_xact(transaction xid, serverid oid, userid oid) + + bool + + Resolve a foreign transaction. This function searches for foreign transactions + matching the arguments and resolves them.
This function won't resolve + a foreign transaction which is in progress, or one that is locked by some + other backend. + + + + + pg_remove_fdw_xact(transaction xid, serverid oid, userid oid) + + void + + This function works the same as pg_resolve_fdw_xact + except that it removes the foreign transaction entry without resolving it. + + + + +
+
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 7aada14..53f9e72 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -332,6 +332,14 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser + + pg_stat_fdw_xact_resolverpg_stat_fdw_xact_resolver + One row per foreign transaction resolver process, showing statistics about + foreign transaction resolution. See for + details. + + + @@ -1198,6 +1206,18 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser Waiting in main loop of checkpointer process. + FdwXactLauncherMain + Waiting in main loop of foreign transaction resolution launcher process. + + + FdwXactResolverMain + Waiting in main loop of foreign transaction resolution worker process. + + + LogicalLauncherMain + Waiting in main loop of logical launcher process. + + LogicalApplyMain Waiting in main loop of logical apply process. @@ -1413,6 +1433,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser Waiting for confirmation from remote server during synchronous replication. + FdwXactResolution + Waiting for all foreign transaction participants to be resolved during atomic commit among foreign servers. + + Timeout BaseBackupThrottle Waiting during base backup when throttling activity. @@ -2222,6 +2246,42 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i connection. + + <structname>pg_stat_fdw_xact_resolver</structname> View + + + + Column + Type + Description + + + + + + pid + integer + Process ID of a foreign transaction resolver process + + + dbid + oid + OID of the database to which the foreign transaction resolver is connected + + + last_resolved_time + timestamp with time zone + Time at which the process last resolved a foreign transaction + + + +
+ + + The pg_stat_fdw_xact_resolver view will contain one + row per foreign transaction resolver process, showing state of resolution + of foreign transactions. + <structname>pg_stat_archiver</structname> View diff --git a/doc/src/sgml/postgres.sgml b/doc/src/sgml/postgres.sgml index 0070603..c10e21f 100644 --- a/doc/src/sgml/postgres.sgml +++ b/doc/src/sgml/postgres.sgml @@ -164,6 +164,7 @@ &wal; &logical-replication; &jit; + &distributed-transaction; ®ress; diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index bd93a6a..4a1ebdc 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -9,6 +9,6 @@ top_builddir = ../../.. include $(top_builddir)/src/Makefile.global SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist \ - tablesample transam + tablesample transam fdwxact include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/fdwxact/Makefile b/src/backend/access/fdwxact/Makefile new file mode 100644 index 0000000..9ddbb14 --- /dev/null +++ b/src/backend/access/fdwxact/Makefile @@ -0,0 +1,17 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/fdwxact +# +# IDENTIFICATION +# src/backend/access/fdwxact/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/fdwxact +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = fdwxact.o fdwxact_resolver.o fdwxact_launcher.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/fdwxact/fdwxact.c b/src/backend/access/fdwxact/fdwxact.c new file mode 100755 index 0000000..109d6a7 --- /dev/null +++ b/src/backend/access/fdwxact/fdwxact.c @@ -0,0 +1,2678 @@ +/*------------------------------------------------------------------------- + * + * fdwxact.c + * PostgreSQL distributed transaction manager for foreign servers. 
+ * + * To achieve commit among all foreign servers atomically, we employ the + * two-phase commit protocol, which is a type of atomic commitment + * protocol (ACP). The basic strategy is that we prepare all of the remote + * transactions before committing locally and commit them after committing + * locally. + * + * When a foreign data wrapper starts a transaction on a foreign server that + * is capable of two-phase commit protocol, the foreign data wrapper registers + * the foreign transaction using function FdwXactRegisterForeignTransaction() + * in order to participate in a group for atomic commit. Participants are + * identified by oid of foreign server and user. When the foreign transaction + * begins to modify data the executor marks it as modified using + * FdwXactMarkForeignTransactionModified(). + * + * During pre-commit of the local transaction, we prepare the transaction on + * every foreign server. After committing or rolling back locally, we + * notify the resolver process and tell it to commit or roll back those + * transactions. If we ask it to commit, we also tell it to notify us when + * it's done, so that we can wait interruptibly for it to finish, and so + * that we're not trying to locally do work that might fail with an ERROR + * after having already committed. + * + * The best performing way to manage the waiting backends is to have a + * queue of waiting backends, so that we can avoid searching through all + * waiters each time we receive a request. We have two queues: the active + * queue and the retry queue. The backend is inserted into the active queue at + * first, and then it is moved to the retry queue by the resolver process if + * the resolution fails. The backends in the retry queue are processed at + * intervals of foreign_transaction_resolution_retry_interval. + * + * Two-phase commit protocol is required if the transaction modified two or more + * servers including itself.
In other cases, all foreign transactions are + * committed during pre-commit. + * + * If a network failure or server crash occurs, or the user stops waiting, + * prepared foreign transactions are left in in-doubt state (aka dangling + * transactions). Dangling transactions are processed by the resolver process. + * + * During WAL replay and replication, FdwXactCtl also holds information about + * active prepared foreign transactions that haven't been moved to disk yet. + * + * Replay of fdwxact records happens by the following rules: + * + * * On PREPARE redo we add the foreign transaction to FdwXactCtl->fdw_xacts. + * We set fdw_xact->inredo to true for such entries. + * * On Checkpoint redo, we iterate through FdwXactCtl->fdw_xacts entries that + * have set fdw_xact->inredo true and are behind the redo_horizon. We save + * them to disk and then set fdw_xact->ondisk to true. + * * On COMMIT and ABORT we delete the entry from FdwXactCtl->fdw_xacts. + * If fdw_xact->ondisk is true, we delete the corresponding file from + * the disk as well. + * * RecoverFdwXacts loads all foreign transaction entries from disk into + * memory at server startup.
+ * + * Portions Copyright (c) 2018, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/fdwxact/fdwxact.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include +#include + +#include "access/fdwxact.h" +#include "access/fdwxact_resolver.h" +#include "access/fdwxact_launcher.h" +#include "access/fdwxact_xlog.h" +#include "access/resolver_internal.h" +#include "access/htup_details.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "catalog/pg_type.h" +#include "foreign/fdwapi.h" +#include "foreign/foreign.h" +#include "funcapi.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "parser/parsetree.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/lock.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/pmsignal.h" +#include "storage/shmem.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" + +/* Is atomic commit requested by user? */ +#define IsAtomicCommitEnabled() \ + (max_prepared_foreign_xacts > 0 && \ + max_foreign_xact_resolvers > 0) + +#define IsAtomicCommitRequested() \ + (IsAtomicCommitEnabled() && \ + (distributed_atomic_commit > DISTRIBUTED_ATOMIC_COMMIT_DISABLED)) + +/* Structure to bundle the foreign transaction participant */ +typedef struct FdwXactParticipant +{ + /* + * Pointer to a FdwXact entry in global entry. NULL if + * this foreign transaction is registered but not inserted + * yet. 
+ */ + FdwXact fdw_xact; + char *fdw_xact_id; + + ForeignServer *server; + UserMapping *usermapping; + bool modified; /* true if modified the data on server */ + bool twophase_commit_enabled; /* true if the server can execute + * two-phase commit protocol */ + void *fdw_state; /* fdw-private state */ + + /* Callbacks for foreign transaction */ + PrepareForeignTransaction_function prepare_foreign_xact; + CommitForeignTransaction_function commit_foreign_xact; + RollbackForeignTransaction_function rollback_foreign_xact; + GetPrepareId_function get_prepareid; + IsTwoPhaseCommitEnabled_function is_twophase_commit_enabled; +} FdwXactParticipant; + +/* + * List of foreign transaction participants for atomic commit. + * This list has only foreign servers that support atomic commit FDW + * API regardless of their configuration. + */ +static List *FdwXactAtomicCommitParticipants = NIL; +static bool FdwXactAtomicCommitReady = false; + +/* Directory where the foreign prepared transaction files will reside */ +#define FDW_XACTS_DIR "pg_fdw_xact" + +/* + * Name of foreign prepared transaction file is 8 bytes database oid, + * xid, foreign server oid and user oid separated by '_'. + * + * Since FdwXact stat file is created per foreign transaction in a + * distributed transaction and the xid of unresolved distributed + * transaction never reused, the name is fairly enough to ensure + * uniqueness. 
+ */ +#define FDW_XACT_FILE_NAME_LEN (8 + 1 + 8 + 1 + 8 + 1 + 8) +#define FdwXactFilePath(path, dbid, xid, serverid, userid) \ + snprintf(path, MAXPGPATH, FDW_XACTS_DIR "/%08X_%08X_%08X_%08X", \ + dbid, xid, serverid, userid) + +static FdwXact FdwXactInsertFdwXactEntry(TransactionId xid, FdwXactParticipant *fdw_part); +static void FdwXactPrepareForeignTransactions(void); +static void FdwXactCommitForeignTransaction(FdwXactParticipant *fdw_part); +static bool FdwXactResolveForeignTransaction(FdwXactState *state, FdwXact fdwxact, + int elevel); +static void FdwXactComputeRequiredXmin(void); +static bool FdwXactAtomicCommitRequired(void); +static void FdwXactQueueInsert(void); +static void FdwXactCancelWait(void); +static void FdwXactRedoAdd(char *buf, XLogRecPtr start_lsn, XLogRecPtr end_lsn); +static void FdwXactRedoRemove(Oid dbid, TransactionId xid, Oid serverid, + Oid userid, bool give_warnings); +static void AtProcExit_FdwXact(int code, Datum arg); +static void ForgetAllFdwXactParticipants(void); +static char *ReadFdwXactFile(Oid dbid, TransactionId xid, Oid serverid, + Oid userid, bool give_warnings); +static void RemoveFdwXactFile(Oid dbid, TransactionId xid, Oid serverid, Oid userid, + bool giveWarning); +static void RecreateFdwXactFile(Oid dbid, TransactionId xid, Oid serverid, Oid userid, + void *content, int len); +static void XlogReadFdwXactData(XLogRecPtr lsn, char **buf, int *len); +static char *ProcessFdwXactBuffer(Oid dbid, TransactionId local_xid, Oid serverid, + Oid userid, XLogRecPtr insert_start_lsn, + bool give_warnings); +static void register_fdw_xact(Oid serverid, Oid userid, bool modified); +static List *get_fdw_xacts(Oid dbid, TransactionId xid, Oid serverid, Oid userid, + bool need_lock); +static FdwXact get_one_fdw_xact(Oid dbid, TransactionId xid, Oid serverid, Oid userid, + bool need_lock); +static FdwXact get_all_fdw_xacts(int *length); +static FdwXact insert_fdw_xact(Oid dbid, TransactionId xid, Oid serverid, Oid userid, + Oid umid, 
char *fdw_xact_id); +static char *generate_fdw_xact_identifier(TransactionId xid, Oid serverid, Oid userid); +static void remove_fdw_xact(FdwXact fdw_xact); +static FdwXactState *create_fdw_xact_state(void); + +/* Guc parameters */ +int max_prepared_foreign_xacts = 0; +int max_foreign_xact_resolvers = 0; +int distributed_atomic_commit = DISTRIBUTED_ATOMIC_COMMIT_DISABLED; + +/* Keep track of registering process exit call back. */ +static bool fdwXactExitRegistered = false; + +/* + * Remember accessed foreign server. Both RegisterFdwXactByRelId and + * RegisterFdwXactByServerId are called by executor during initialization. + */ +void +RegisterFdwXactByRelId(Oid relid, bool modified) +{ + Relation rel; + Oid serverid; + Oid userid; + + rel = relation_open(relid, NoLock); + serverid = GetForeignServerIdByRelId(relid); + userid = rel->rd_rel->relowner ? rel->rd_rel->relowner : GetUserId(); + relation_close(rel, NoLock); + + register_fdw_xact(serverid, userid, modified); +} + +void +RegisterFdwXactByServerId(Oid serverid, bool modified) +{ + register_fdw_xact(serverid, GetUserId(), modified); +} + +/* + * Register given foreign transaction identified by given arguments as + * a participant of the transaction. + * + * The foreign server identified by given server id must support atomic + * commit APIs. Registered foreign transaction are managed by foreign + * transaction manager until the end of the transaction. + */ +static void +register_fdw_xact(Oid serverid, Oid userid, bool modified) +{ + FdwXactParticipant *fdw_part; + ForeignServer *foreign_server; + ForeignDataWrapper *fdw; + UserMapping *user_mapping; + MemoryContext old_ctx; + FdwRoutine *routine; + ListCell *lc; + + /* + * Participants information is needed at the end of a transaction, where + * system cache are not available. Save it in TopTransactionContext + * beforehand so that these can live until the end of transaction. 
+	 */
+	old_ctx = MemoryContextSwitchTo(TopTransactionContext);
+
+	/* Note: the routine struct is allocated in TopTransactionContext here */
+	routine = GetFdwRoutineByServerId(serverid);
+
+	/*
+	 * If the being modified foreign server doesn't have the atomic commit API
+	 * we don't manage the foreign transaction in the distributed transaction
+	 * manager.  Only a server that is actually modified taints the
+	 * transaction; FdwXactAtomicCommitRequired() applies the same rule to
+	 * servers that have the API but have two-phase commit disabled.
+	 */
+	if (routine->IsTwoPhaseCommitEnabled == NULL)
+	{
+		if (modified)
+			MyXactFlags |= XACT_FLAGS_FDWNOPREPARE;
+		pfree(routine);
+
+		/* Don't leak the context switch to the caller */
+		MemoryContextSwitchTo(old_ctx);
+		return;
+	}
+
+	/* A (server, user) pair is registered at most once per transaction */
+	foreach(lc, FdwXactAtomicCommitParticipants)
+	{
+		FdwXactParticipant *fp = (FdwXactParticipant *) lfirst(lc);
+
+		if (fp->server->serverid == serverid &&
+			fp->usermapping->userid == userid)
+		{
+			/* The foreign server is already registered, return */
+			fp->modified |= modified;
+			pfree(routine);
+
+			/* Don't leak the context switch to the caller */
+			MemoryContextSwitchTo(old_ctx);
+			return;
+		}
+	}
+
+	foreign_server = GetForeignServer(serverid);
+	fdw = GetForeignDataWrapper(foreign_server->fdwid);
+	user_mapping = GetUserMapping(userid, serverid);
+
+	/* Make sure that the FDW has transaction handlers */
+	if (!routine->PrepareForeignTransaction)
+		ereport(ERROR,
+				(errmsg("no function provided for preparing foreign transaction for FDW %s",
+						fdw->fdwname)));
+	if (!routine->CommitForeignTransaction)
+		ereport(ERROR,
+				(errmsg("no function to commit a foreign transaction provided for FDW %s",
+						fdw->fdwname)));
+	if (!routine->RollbackForeignTransaction)
+		ereport(ERROR,
+				(errmsg("no function to rollback a foreign transaction provided for FDW %s",
+						fdw->fdwname)));
+
+	fdw_part = (FdwXactParticipant *) palloc(sizeof(FdwXactParticipant));
+
+	fdw_part->fdw_xact_id = NULL;
+	fdw_part->server = foreign_server;
+	fdw_part->usermapping = user_mapping;
+	fdw_part->fdw_xact = NULL;
+	fdw_part->modified = modified;
+	fdw_part->twophase_commit_enabled = true; /* by default, will be changed at pre-commit phase */
+	fdw_part->fdw_state = NULL;
+	fdw_part->prepare_foreign_xact = routine->PrepareForeignTransaction;
+	fdw_part->commit_foreign_xact = routine->CommitForeignTransaction;
+	fdw_part->rollback_foreign_xact =
routine->RollbackForeignTransaction; + fdw_part->is_twophase_commit_enabled = routine->IsTwoPhaseCommitEnabled; + fdw_part->get_prepareid = routine->GetPrepareId; + + /* Add this foreign transaction to the participants list */ + FdwXactAtomicCommitParticipants = lappend(FdwXactAtomicCommitParticipants, fdw_part); + + /* Revert back the context */ + MemoryContextSwitchTo(old_ctx); +} + +/* + * FdwXactShmemSize + * Calculates the size of shared memory allocated for maintaining foreign + * prepared transaction entries. + */ +Size +FdwXactShmemSize(void) +{ + Size size; + + /* Size for foreign transaction information array */ + size = offsetof(FdwXactCtlData, fdw_xacts); + size = add_size(size, mul_size(max_prepared_foreign_xacts, + sizeof(FdwXact))); + size = MAXALIGN(size); + size = add_size(size, mul_size(max_prepared_foreign_xacts, + sizeof(FdwXactData))); + + return size; +} + +/* + * FdwXactShmemInit + * Initialization of shared memory for maintaining foreign prepared transaction + * entries. The shared memory layout is defined in definition of FdwXactCtlData + * structure. 
+ */ +void +FdwXactShmemInit(void) +{ + bool found; + + if (!fdwXactExitRegistered) + { + before_shmem_exit(AtProcExit_FdwXact, 0); + fdwXactExitRegistered = true; + } + + FdwXactCtl = ShmemInitStruct("Foreign transactions table", + FdwXactShmemSize(), + &found); + if (!IsUnderPostmaster) + { + FdwXact fdw_xacts; + int cnt; + + Assert(!found); + FdwXactCtl->freeFdwXacts = NULL; + FdwXactCtl->numFdwXacts = 0; + + /* Initialize the linked list of free FDW transactions */ + fdw_xacts = (FdwXact) + ((char *) FdwXactCtl + + MAXALIGN(offsetof(FdwXactCtlData, fdw_xacts) + + sizeof(FdwXact) * max_prepared_foreign_xacts)); + for (cnt = 0; cnt < max_prepared_foreign_xacts; cnt++) + { + fdw_xacts[cnt].status = FDW_XACT_INITIAL; + fdw_xacts[cnt].fxact_free_next = FdwXactCtl->freeFdwXacts; + FdwXactCtl->freeFdwXacts = &fdw_xacts[cnt]; + } + } + else + { + Assert(FdwXactCtl); + Assert(found); + } +} + +/* + * PreCommit_FdwXacts + * + */ +void +PreCommit_FdwXacts(void) +{ + bool need_atomic_commit; + ListCell *lc; + ListCell *next; + ListCell *prev = NULL; + + /* If there are no foreign servers involved, we have no business here */ + if (FdwXactAtomicCommitParticipants == NIL) + return; + + need_atomic_commit = FdwXactAtomicCommitRequired(); + + /* + * If 'require' case, we require all modified server have to be capable of + * two-phase commit protocol. + */ + if (need_atomic_commit && + distributed_atomic_commit == DISTRIBUTED_ATOMIC_COMMIT_REQUIRED && + (MyXactFlags & XACT_FLAGS_FDWNOPREPARE) != 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot COMMIT a distributed transaction that has operated on foreign server that doesn't support atomic commit"))); + + /* + * Commit transactions on foreign servers. + * + * Committed transactions are removed from FdwXactAtomicCommitParticipants + * so that the later preparation can process only servers that requires to be commit + * using two-phase commit protocol. 
+ */ + for (lc = list_head(FdwXactAtomicCommitParticipants); lc != NULL; lc = next) + { + FdwXactParticipant *fdw_part = (FdwXactParticipant *) lfirst(lc); + bool can_commit = false; + + next = lnext(lc); + + if (!need_atomic_commit || !fdw_part->modified) + { + /* + * We can commit not-modified servers and when the atomic commit is not + * required. + */ + can_commit = true; + } + else if (distributed_atomic_commit == DISTRIBUTED_ATOMIC_COMMIT_PREFER && + !fdw_part->twophase_commit_enabled) + { + /* Also in 'prefer' case, non-2pc-capable servers can be committed */ + can_commit = true; + } + + if (can_commit) + { + /* Commit the foreign transaction */ + FdwXactCommitForeignTransaction(fdw_part); + + /* Delete it from the participant list */ + FdwXactAtomicCommitParticipants = + list_delete_cell(FdwXactAtomicCommitParticipants, lc, prev); + + continue; + } + + prev = lc; + } + + /* + * If only one participant of all participants is modified, we can commit it. + * This can avoid to use two-phase commit for only one server in the 'prefer' case + * where the transaction has one 2pc-capable modified server and some modified + * servers. + */ + if (list_length(FdwXactAtomicCommitParticipants) == 1 && + (MyXactFlags & XACT_FLAGS_WROTENONTEMPREL) == 0) + { + Assert(distributed_atomic_commit == DISTRIBUTED_ATOMIC_COMMIT_PREFER); + FdwXactCommitForeignTransaction(linitial(FdwXactAtomicCommitParticipants)); + list_free(FdwXactAtomicCommitParticipants); + return; + } + + FdwXactPrepareForeignTransactions(); + /* keep FdwXactparticipantsForAC until the end of transaction */ +} + +/* + * FdwXactPrepareForeignTransactions + * + * Prepare all foreign transaction participants. This function creates a prepared + * participants chain each time when we prepared a foreign transaction. The prepared + * participants chain is used to access all participants of distributed transaction + * quickly. If any one of them fails to prepare, we change over aborts. 
+ */
+static void
+FdwXactPrepareForeignTransactions(void)
+{
+	FdwXactState *state;
+	ListCell   *lcell;
+	FdwXact		prev_fdwxact = NULL;
+	TransactionId txid;
+
+	if (FdwXactAtomicCommitParticipants == NIL)
+		return;
+
+	/* Parameter check */
+	if (max_prepared_foreign_xacts == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("prepared foreign transactions are disabled"),
+				 errhint("Set max_prepared_foreign_transactions to a nonzero value.")));
+
+	if (max_foreign_xact_resolvers == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("prepared foreign transactions are disabled"),
+				 errhint("Set max_foreign_transaction_resolvers to a nonzero value.")));
+
+	/* One state struct is reused for every participant in the loop below */
+	state = create_fdw_xact_state();
+
+	/* Loop over the foreign connections */
+	txid = GetTopTransactionId();
+	foreach(lcell, FdwXactAtomicCommitParticipants)
+	{
+		FdwXactParticipant *fdw_part = (FdwXactParticipant *) lfirst(lcell);
+		FdwXact		fdwxact;
+
+		/* Generate an unique identifier */
+		if (fdw_part->get_prepareid)
+		{
+			char	   *id;
+			int			fdwxact_id_len = 0;
+
+			id = fdw_part->get_prepareid(txid, fdw_part->server->serverid,
+										 fdw_part->usermapping->userid,
+										 &fdwxact_id_len);
+
+			if (!id)
+				ereport(ERROR,
+						(errcode(ERRCODE_UNDEFINED_OBJECT),
+						 (errmsg("foreign transaction identifier is not provided"))));
+
+			/*
+			 * Check length of foreign transaction identifier before touching
+			 * the buffer, so that an over-long length reported by the FDW
+			 * never makes us write the terminator out of bounds.  The
+			 * identifier must be strictly shorter than NAMEDATALEN, as the
+			 * error detail says.  The identifier may not be NUL-terminated
+			 * yet, hence the precision-limited format.
+			 */
+			if (fdwxact_id_len >= NAMEDATALEN)
+				ereport(ERROR,
+						(errcode(ERRCODE_NAME_TOO_LONG),
+						 errmsg("foreign transaction identifier \"%.*s\" is too long",
+								NAMEDATALEN, id),
+						 errdetail("foreign transaction identifier must be less than %d characters.",
+								   NAMEDATALEN)));
+
+			id[fdwxact_id_len] = '\0';
+			fdw_part->fdw_xact_id = pstrdup(id);
+		}
+		else
+			fdw_part->fdw_xact_id = generate_fdw_xact_identifier(txid,
+																 fdw_part->server->serverid,
+																 fdw_part->usermapping->userid);
+
+		/*
+		 * Insert the foreign transaction entry.
Registration persists this + * information to the disk and logs (that way relaying it on standby). + * Thus in case we loose connectivity to the foreign server or crash + * ourselves, we will remember that we might have prepared transaction + * on the foreign server and try to resolve it when connectivity is + * restored or after crash recovery. + * + * If we prepare the transaction on the foreign server before persisting + * the information to the disk and crash in-between these two steps, + * we will forget that we prepared the transaction on the foreign server + * and will not be able to resolve it after the crash. Hence persist + * first then prepare. + */ + fdwxact = FdwXactInsertFdwXactEntry(txid, fdw_part); + + state->serverid = fdw_part->server->serverid; + state->userid = fdw_part->usermapping->userid; + state->umid = fdw_part->usermapping->umid; + state->fdwxact_id = pstrdup(fdwxact->fdw_xact_id); + + /* + * Between FdwXactInsertFdwXactEntry call till this backend hears + * acknowledge from foreign server, the backend may abort the local + * transaction (say, because of a signal). During abort processing, + * we might try to resolve a never-prepared transaction, and get an error. + * This is fine as long as the FDW provides us unique prepared transaction + * identifiers. + */ + if (!fdw_part->prepare_foreign_xact(state)) + { + /* Failed to prepare, change over aborts */ + ereport(ERROR, + (errmsg("could not prepare transaction on foreign server %s", + fdw_part->server->servername))); + } + + /* Keep fdw_state until end of transaction */ + fdw_part->fdw_state = state->fdw_state; + + /* Preparation is success, update its status */ + LWLockAcquire(FdwXactLock, LW_EXCLUSIVE); + fdw_part->fdw_xact->status = FDW_XACT_PREPARED; + fdw_part->fdw_xact = fdwxact; + LWLockRelease(FdwXactLock); + + /* + * Create a prepared participants chain, which is link-ed FdwXact entries + * involving with this transaction. 
+ */ + if (prev_fdwxact) + { + /* Append others to the tail */ + Assert(fdwxact->fxact_next == NULL); + prev_fdwxact->fxact_next = fdwxact; + } + } +} + +/* + * Commit the given foreign transaction. + */ +void +FdwXactCommitForeignTransaction(FdwXactParticipant *fdw_part) +{ + FdwXactState *state; + + state = create_fdw_xact_state(); + state->serverid = fdw_part->server->serverid; + state->userid = fdw_part->usermapping->userid; + state->umid = fdw_part->usermapping->umid; + fdw_part->fdw_state = (void *) state; + + if (!fdw_part->commit_foreign_xact(state)) + ereport(ERROR, + (errmsg("could not commit foreign transaction on server %s", + fdw_part->server->servername))); +} + +/* + * FdwXactInsertFdwXactEntry + * + * This function is used to create new foreign transaction entry before an FDW + * prepares and commit/rollback. The function adds the entry to WAL and it will + * be persisted to the disk under pg_fdw_xact directory when checkpoint. + */ +static FdwXact +FdwXactInsertFdwXactEntry(TransactionId xid, FdwXactParticipant *fdw_part) +{ + FdwXact fxact; + FdwXactOnDiskData *fxact_file_data; + MemoryContext old_context; + int data_len; + + old_context = MemoryContextSwitchTo(TopTransactionContext); + + /* + * Enter the foreign transaction in the shared memory structure. + */ + LWLockAcquire(FdwXactLock, LW_EXCLUSIVE); + fxact = insert_fdw_xact(MyDatabaseId, xid, fdw_part->server->serverid, + fdw_part->usermapping->userid, + fdw_part->usermapping->umid, fdw_part->fdw_xact_id); + fxact->status = FDW_XACT_PREPARING; + fxact->held_by = MyBackendId; + fdw_part->fdw_xact = fxact; + LWLockRelease(FdwXactLock); + + MemoryContextSwitchTo(old_context); + + /* + * Prepare to write the entry to a file. Also add xlog entry. The contents + * of the xlog record are same as what is written to the file. 
+ */ + data_len = offsetof(FdwXactOnDiskData, fdw_xact_id); + data_len = data_len + strlen(fdw_part->fdw_xact_id) + 1; + data_len = MAXALIGN(data_len); + fxact_file_data = (FdwXactOnDiskData *) palloc0(data_len); + fxact_file_data->dbid = MyDatabaseId; + fxact_file_data->local_xid = xid; + fxact_file_data->serverid = fdw_part->server->serverid; + fxact_file_data->userid = fdw_part->usermapping->userid; + fxact_file_data->umid = fdw_part->usermapping->umid; + memcpy(fxact_file_data->fdw_xact_id, fdw_part->fdw_xact_id, + strlen(fdw_part->fdw_xact_id) + 1); + + /* See note in RecordTransactionCommit */ + MyPgXact->delayChkpt = true; + + START_CRIT_SECTION(); + + /* Add the entry in the xlog and save LSN for checkpointer */ + XLogBeginInsert(); + XLogRegisterData((char *) fxact_file_data, data_len); + fxact->insert_end_lsn = XLogInsert(RM_FDW_XACT_ID, XLOG_FDW_XACT_INSERT); + XLogFlush(fxact->insert_end_lsn); + + /* If we crash now, we have prepared: WAL replay will fix things */ + + /* Store record's start location to read that later on CheckPoint */ + fxact->insert_start_lsn = ProcLastRecPtr; + + /* File is written completely, checkpoint can proceed with syncing */ + LWLockAcquire(FdwXactLock, LW_EXCLUSIVE); + fxact->valid = true; + LWLockRelease(FdwXactLock); + + /* Checkpoint can process now */ + MyPgXact->delayChkpt = false; + + END_CRIT_SECTION(); + + pfree(fxact_file_data); + return fxact; +} + +/* + * insert_fdw_xact + * + * Insert a new entry for a given foreign transaction identified by transaction + * id, foreign server and user mapping, into the shared memory array. Caller + * must hold FdwXactLock in exclusive mode. + * + * If the entry already exists, the function raises an error. 
+ */ +static FdwXact +insert_fdw_xact(Oid dbid, TransactionId xid, Oid serverid, Oid userid, + Oid umid, char *fdw_xact_id) +{ + int i; + FdwXact fxact; + + Assert(LWLockHeldByMeInMode(FdwXactLock, LW_EXCLUSIVE)); + + /* Check for duplicated foreign transaction entry */ + for (i = 0; i < FdwXactCtl->numFdwXacts; i++) + { + fxact = FdwXactCtl->fdw_xacts[i]; + if (fxact->dbid == dbid && + fxact->local_xid == xid && + fxact->serverid == serverid && + fxact->userid == userid) + ereport(ERROR, (errmsg("could not insert a foreign transaction entry"), + errdetail("duplicate entry with transaction id %u, serverid %u, userid %u", + xid, serverid, userid))); + } + + /* + * Get a next free foreign transaction entry. Raise error if there are + * none left. + */ + if (!FdwXactCtl->freeFdwXacts) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("maximum number of foreign transactions reached"), + errhint("Increase max_prepared_foreign_transactions: \"%d\".", + max_prepared_foreign_xacts))); + } + fxact = FdwXactCtl->freeFdwXacts; + FdwXactCtl->freeFdwXacts = fxact->fxact_free_next; + + /* Insert the entry to shared memory array */ + Assert(FdwXactCtl->numFdwXacts < max_prepared_foreign_xacts); + FdwXactCtl->fdw_xacts[FdwXactCtl->numFdwXacts++] = fxact; + + fxact->held_by = InvalidBackendId; + fxact->dbid = dbid; + fxact->local_xid = xid; + fxact->serverid = serverid; + fxact->userid = userid; + fxact->umid = umid; + fxact->insert_start_lsn = InvalidXLogRecPtr; + fxact->insert_end_lsn = InvalidXLogRecPtr; + fxact->valid = false; + fxact->ondisk = false; + fxact->inredo = false; + memcpy(fxact->fdw_xact_id, fdw_xact_id, strlen(fdw_xact_id) + 1); + + return fxact; +} + +/* + * remove_fdw_xact + * + * Remove the foreign prepared transaction entry from shared memory. + * Caller must hold FdwXactLock in exclusive mode. 
+ */ +static void +remove_fdw_xact(FdwXact fdw_xact) +{ + int cnt; + + Assert(fdw_xact != NULL); + Assert(LWLockHeldByMeInMode(FdwXactLock, LW_EXCLUSIVE)); + + /* Search the slot where this entry resided */ + for (cnt = 0; cnt < FdwXactCtl->numFdwXacts; cnt++) + { + if (FdwXactCtl->fdw_xacts[cnt] == fdw_xact) + break; + } + + /* We did not find the given entry in the array */ + if (cnt >= FdwXactCtl->numFdwXacts) + ereport(ERROR, + (errmsg("could not remove a foreign transaction entry"), + errdetail("failed to find entry for xid %u, foreign server %u, and user %u", + fdw_xact->local_xid, fdw_xact->serverid, fdw_xact->userid))); + + /* Remove the entry from active array */ + FdwXactCtl->numFdwXacts--; + FdwXactCtl->fdw_xacts[cnt] = FdwXactCtl->fdw_xacts[FdwXactCtl->numFdwXacts]; + + /* Put it back into free list */ + fdw_xact->fxact_free_next = FdwXactCtl->freeFdwXacts; + FdwXactCtl->freeFdwXacts = fdw_xact; + + /* Reset informations */ + fdw_xact->status = FDW_XACT_INITIAL; + fdw_xact->held_by = InvalidBackendId; + fdw_xact->fxact_next = NULL; + + if (!RecoveryInProgress()) + { + xl_fdw_xact_remove record; + XLogRecPtr recptr; + + /* Fill up the log record before releasing the entry */ + record.serverid = fdw_xact->serverid; + record.dbid = fdw_xact->dbid; + record.xid = fdw_xact->local_xid; + record.userid = fdw_xact->userid; + + /* + * Now writing FdwXact state data to WAL. We have to set delayChkpt + * here, otherwise a checkpoint starting immediately after the + * WAL record is inserted could complete without fsync'ing our + * state file. (This is essentially the same kind of race condition + * as the COMMIT-to-clog-write case that RecordTransactionCommit + * uses delayChkpt for; see notes there.) + */ + START_CRIT_SECTION(); + + MyPgXact->delayChkpt = true; + + /* + * Log that we are removing the foreign transaction entry and + * remove the file from the disk as well. 
+ */ + XLogBeginInsert(); + XLogRegisterData((char *) &record, sizeof(xl_fdw_xact_remove)); + recptr = XLogInsert(RM_FDW_XACT_ID, XLOG_FDW_XACT_REMOVE); + XLogFlush(recptr); + + /* + * Now we can mark ourselves as out of the commit critical section: a + * checkpoint starting after this will certainly see the gxact as a + * candidate for fsyncing. + */ + MyPgXact->delayChkpt = false; + + END_CRIT_SECTION(); + } +} + +/* + * Return true and set FdwXactAtomicCommitReady to true if we require atomic commit. + * It is required if the transaction modified data on two or more servers including + * local node itself. This function also checks for each server if two-phase commit + * is enabled or not. + */ +static bool +FdwXactAtomicCommitRequired(void) +{ + ListCell* lc; + int nserverswritten = 0; + + if (!IsAtomicCommitRequested()) + return false; + + foreach(lc, FdwXactAtomicCommitParticipants) + { + FdwXactParticipant *fdw_part = (FdwXactParticipant *) lfirst(lc); + + /* Check if the foreign server is capable of two-phase commit protocol */ + if (fdw_part->is_twophase_commit_enabled(fdw_part->server->serverid)) + fdw_part->twophase_commit_enabled = true; + else if (fdw_part->modified) + MyXactFlags |= XACT_FLAGS_FDWNOPREPARE; + + if (fdw_part->modified) + nserverswritten++; + } + + if ((MyXactFlags & XACT_FLAGS_WROTENONTEMPREL) != 0) + ++nserverswritten; + + /* Atomic commit is required if we modified data on two or more participants */ + if (nserverswritten <= 1) + return false; + + FdwXactAtomicCommitReady = true; + return true; +} + +bool +FdwXactIsAtomicCommitReady(void) +{ + return FdwXactAtomicCommitReady; +} + +/* + * Compute the oldest xmin across all unresolved foreign transactions + * and store it in the ProcArray. 
+ */ +static void +FdwXactComputeRequiredXmin(void) +{ + int i; + TransactionId agg_xmin = InvalidTransactionId; + + Assert(FdwXactCtl != NULL); + + LWLockAcquire(FdwXactLock, LW_SHARED); + + for (i = 0; i < FdwXactCtl->numFdwXacts; i++) + { + FdwXact fdwxact = FdwXactCtl->fdw_xacts[i]; + + if (!fdwxact->valid) + continue; + + Assert(TransactionIdIsValid(fdwxact->local_xid)); + + if (!TransactionIdIsValid(agg_xmin) || + TransactionIdPrecedes(fdwxact->local_xid, agg_xmin)) + agg_xmin = fdwxact->local_xid; + } + + LWLockRelease(FdwXactLock); + + ProcArraySetFdwXactUnresolvedXmin(agg_xmin); +} + +/* + * ForgetAllFdwXactParticipants + * + * Reset all the foreign transaction entries that this backend registered. + * If the foreign transaction has the corresponding FdwXact entry, resetting + * the held_by field means to leave that entry in unresolved state. If we + * leaves any entries, we update the oldest xmin of unresolved transaction + * so that transaction status of dangling transaction are not truncated. + */ +static void +ForgetAllFdwXactParticipants(void) +{ + ListCell *cell; + int n_lefts = 0; + + if (FdwXactAtomicCommitParticipants == NIL) + return; + + LWLockAcquire(FdwXactLock, LW_EXCLUSIVE); + + foreach(cell, FdwXactAtomicCommitParticipants) + { + FdwXactParticipant *fdw_part = (FdwXactParticipant *) lfirst(cell); + + /* Skip if didn't register FdwXact entry yet */ + if (fdw_part->fdw_xact == NULL) + continue; + + /* + * There is a race condition; the FdwXact entries in + * FdwXactAtomicCommitParticipants could be used by other backend before we + * forget in case where the resolver process removes the FdwXact entry + * and other backend reuses it before we forget. So we need to check + * if the entries are still associated with the transaction. 
+ */ + if (fdw_part->fdw_xact->held_by == MyBackendId) + { + fdw_part->fdw_xact->held_by = InvalidBackendId; + n_lefts++; + } + } + + LWLockRelease(FdwXactLock); + + /* + * Update the oldest local transaction of unresolved distributed + * transaction if we leaved any FdwXact entries. + */ + if (n_lefts > 0) + FdwXactComputeRequiredXmin(); + + FdwXactAtomicCommitParticipants = NIL; +} + +/* + * AtProcExit_FdwXact + * + * When the process exits, forget all the entries. + */ +static void +AtProcExit_FdwXact(int code, Datum arg) +{ + ForgetAllFdwXactParticipants(); +} + +/* + * Wait for foreign transaction to be resolved. + * + * Initially backends start in state FDW_XACT_NOT_WAITING and then change + * that state to FDW_XACT_WAITING before adding ourselves to the wait queue. + * During FdwXactResolveForeignTransaction a fdwxact resolver changes the + * state to FDW_XACT_WAIT_COMPLETE once foreign transactions are resolved. + * This backend then resets its state to FDW_XACT_NOT_WAITING. + * If a resolver fails to resolve the waiting transaction it moves us to + * the retry queue and changes the state to FDW_XACT_WAITING_RETRY. + * + * This function is inspired by SyncRepWaitForLSN. + */ +void +FdwXactWaitToBeResolved(TransactionId wait_xid, bool is_commit) +{ + char *new_status = NULL; + const char *old_status; + ListCell *lc; + List *fdwxact_participants = NIL; + + /* Quick exit if atomic commit is not requested */ + if (!IsAtomicCommitRequested()) + return; + + Assert(FdwXactCtl != NULL); + Assert(TransactionIdIsValid(wait_xid)); + Assert(SHMQueueIsDetached(&(MyProc->fdwXactLinks))); + Assert(MyProc->fdwXactState == FDW_XACT_NOT_WAITING); + + if (FdwXactAtomicCommitParticipants != NIL) + { + /* + * If we're waiting for foreign transactions to be resolved that + * we've prepared just before, use the participants list. 
+ */ + Assert(MyPgXact->xid == wait_xid); + fdwxact_participants = FdwXactAtomicCommitParticipants; + } + else + { + /* + * Get participants list from the global array. This is required (1) + * when we're waiting for foreign transactions to be resolved that + * is part of a local prepared transaction that is marked as prepared + * during running, or (2) when we resolve the PREPARE'd distributed + * transaction after restart. + */ + fdwxact_participants = get_fdw_xacts(MyDatabaseId, wait_xid, + InvalidOid, InvalidOid, true); + } + + /* Exit if we found no foreign transaction to resolve */ + if (fdwxact_participants == NIL) + return; + + LWLockAcquire(FdwXactLock, LW_EXCLUSIVE); + + foreach(lc, fdwxact_participants) + { + FdwXact fdw_xact = (FdwXact) lfirst(lc); + + /* Don't overwrite status if fate has been determined */ + if (fdw_xact->status == FDW_XACT_PREPARED) + fdw_xact->status = (is_commit ? + FDW_XACT_COMMITTING_PREPARED : + FDW_XACT_ABORTING_PREPARED); + } + + /* Set backend status and enqueue itself to the active queue*/ + MyProc->fdwXactState = FDW_XACT_WAITING; + MyProc->fdwXactWaitXid = wait_xid; + FdwXactQueueInsert(); + LWLockRelease(FdwXactLock); + + /* Launch a resolver process if not yet, or wake it up */ + fdwxact_maybe_launch_resolver(false); + + /* + * Alter ps display to show waiting for foreign transaction + * resolution. + */ + if (update_process_title) + { + int len; + + old_status = get_ps_display(&len); + new_status = (char *) palloc(len + 31 + 1); + memcpy(new_status, old_status, len); + sprintf(new_status + len, " waiting for resolution %d", wait_xid); + set_ps_display(new_status, false); + new_status[len] = '\0'; /* truncate off "waiting ..." */ + } + + /* Wait for all foreign transactions to be resolved */ + for (;;) + { + /* Must reset the latch before testing state */ + ResetLatch(MyLatch); + + /* + * Acquiring the lock is not needed, the latch ensures proper + * barriers. 
If it looks like we're done, we must really be done,
+		 * because once the resolver changes the state to FDW_XACT_WAIT_COMPLETE,
+		 * it will never update it again, so we can't be seeing a stale value
+		 * in that case.
+		 */
+		if (MyProc->fdwXactState == FDW_XACT_WAIT_COMPLETE)
+			break;
+
+		/*
+		 * If a wait for foreign transaction resolution is pending, we can
+		 * neither acknowledge the commit nor raise ERROR or FATAL. The latter
+		 * would lead the client to believe that the distributed transaction
+		 * aborted, which is not true: it's already committed locally. The
+		 * former is no good either: the client has requested committing a
+		 * distributed transaction, and is entitled to assume that an
+		 * acknowledged commit is also committed on all foreign servers, which
+		 * might not be true. So in this case we issue a WARNING (which some
+		 * clients may be able to interpret) and shut off further output. We
+		 * do NOT reset ProcDiePending, so that the process will die after the
+		 * commit is cleaned up.
+		 */
+		if (ProcDiePending)
+		{
+			ereport(WARNING,
+					(errcode(ERRCODE_ADMIN_SHUTDOWN),
+					 errmsg("canceling the wait for resolving foreign transaction and terminating connection due to administrator command"),
+					 errdetail("The transaction has already committed locally, but might not have been committed on the foreign server.")));
+			whereToSendOutput = DestNone;
+			FdwXactCancelWait();
+			break;
+		}
+
+		/*
+		 * If a query cancel interrupt arrives we just terminate the wait with
+		 * a suitable warning. The foreign transactions can be orphaned, but
+		 * the foreign xact resolver can pick them up and try to resolve them
+		 * later.
+ */ + if (QueryCancelPending) + { + QueryCancelPending = false; + ereport(WARNING, + (errmsg("canceling wait for resolving foreign transaction due to user request"), + errdetail("The transaction has already committed locally, but might not have been committed on the foreign server."))); + FdwXactCancelWait(); + break; + } + + /* + * If the postmaster dies, we'll probably never get an + * acknowledgement, because all the wal sender processes will exit. So + * just bail out. + */ + if (!PostmasterIsAlive()) + { + ProcDiePending = true; + whereToSendOutput = DestNone; + FdwXactCancelWait(); + break; + } + + /* + * Wait on latch. Any condition that should wake us up will set the + * latch, so no need for timeout. + */ + WaitLatch(MyLatch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1, + WAIT_EVENT_FDW_XACT_RESOLUTION); + } + + pg_read_barrier(); + + Assert(SHMQueueIsDetached(&(MyProc->fdwXactLinks))); + MyProc->fdwXactState = FDW_XACT_NOT_WAITING; + + /* + * Forget the list of locked entries, also means that the entries + * that could not resolved are remained as dangling transactions. + */ + ForgetAllFdwXactParticipants(); + + if (new_status) + { + set_ps_display(new_status, false); + pfree(new_status); + } +} + +/* + * Acquire FdwXactLock and cancel any wait currently in progress. + */ +static void +FdwXactCancelWait(void) +{ + LWLockAcquire(FdwXactLock, LW_EXCLUSIVE); + if (!SHMQueueIsDetached(&(MyProc->fdwXactLinks))) + SHMQueueDelete(&(MyProc->fdwXactLinks)); + MyProc->fdwXactState = FDW_XACT_NOT_WAITING; + LWLockRelease(FdwXactLock); +} + +/* + * Insert MyProc into the tail of FdwXactActiveQueue. 
+ */
+static void
+FdwXactQueueInsert(void)
+{
+	SHMQueueInsertBefore(&(FdwXactRslvCtl->FdwXactActiveQueue),
+						 &(MyProc->fdwXactLinks));
+}
+
+/*
+ * Detach this backend from whichever wait queue it is still linked into.
+ */
+void
+FdwXactCleanupAtProcExit(void)
+{
+	if (!SHMQueueIsDetached(&(MyProc->fdwXactLinks)))
+	{
+		LWLockAcquire(FdwXactLock, LW_EXCLUSIVE);
+		SHMQueueDelete(&(MyProc->fdwXactLinks));
+		LWLockRelease(FdwXactLock);
+	}
+}
+
+/*
+ * Resolve one distributed transaction. The target distributed transaction
+ * is fetched from either the active queue or the retry queue and its participants
+ * are fetched from the global array.
+ *
+ * Release the waiter and return true if we resolved all of the foreign
+ * transaction participants. Return false if no backend of the given database
+ * is waiting in the selected queue. On failure, we move the FdwXactLinks
+ * entry to the retry queue from the active queue, and raise an error and exit.
+ */
+bool
+FdwXactResolveDistributedTransaction(Oid dbid, bool is_active)
+{
+	FdwXactState *state;
+	ListCell   *lc;
+	ListCell   *next;
+	PGPROC	   *waiter = NULL;
+	List	   *participants;
+	SHM_QUEUE  *target_queue;
+
+	if (is_active)
+		target_queue = &(FdwXactRslvCtl->FdwXactActiveQueue);
+	else
+		target_queue = &(FdwXactRslvCtl->FdwXactRetryQueue);
+
+	LWLockAcquire(FdwXactLock, LW_SHARED);
+
+	/*
+	 * Fetch the first waiter of the given database.  Advance from the
+	 * current element each time; restarting from the queue head would spin
+	 * forever whenever the first waiter belongs to another database.
+	 */
+	waiter = (PGPROC *) SHMQueueNext(target_queue, target_queue,
+									 offsetof(PGPROC, fdwXactLinks));
+	while (waiter != NULL && waiter->databaseId != dbid)
+		waiter = (PGPROC *) SHMQueueNext(target_queue,
+										 &(waiter->fdwXactLinks),
+										 offsetof(PGPROC, fdwXactLinks));
+
+	/* If no waiter, there is no job */
+	if (!waiter)
+	{
+		LWLockRelease(FdwXactLock);
+		return false;
+	}
+
+	Assert(TransactionIdIsValid(waiter->fdwXactWaitXid));
+
+	state = create_fdw_xact_state();
+	participants = get_fdw_xacts(dbid, waiter->fdwXactWaitXid, InvalidOid,
+								 InvalidOid, false);
+	LWLockRelease(FdwXactLock);
+
+	/* Resolve all foreign transactions one by one */
+	for (lc = list_head(participants); lc != NULL; lc = next)
+	{
+		FdwXact		fdwxact = (FdwXact) lfirst(lc);
+
+		CHECK_FOR_INTERRUPTS();
+
+		next = lnext(lc);
+
+		state->serverid =
fdwxact->serverid; + state->userid = fdwxact->userid; + state->umid = fdwxact->umid; + state->fdwxact_id = pstrdup(fdwxact->fdw_xact_id); + + PG_TRY(); + { + FdwXactResolveForeignTransaction(state, fdwxact, ERROR); + } + PG_CATCH(); + { + /* Re-insert the waiter to the retry queue */ + LWLockAcquire(FdwXactLock, LW_EXCLUSIVE); + if (waiter->fdwXactState == FDW_XACT_WAITING) + { + SHMQueueDelete(&(waiter->fdwXactLinks)); + pg_write_barrier(); + SHMQueueInsertBefore(&(FdwXactRslvCtl->FdwXactRetryQueue), + &(waiter->fdwXactLinks)); + waiter->fdwXactState = FDW_XACT_WAITING_RETRY; + } + LWLockRelease(FdwXactLock); + + PG_RE_THROW(); + } + PG_END_TRY(); + + elog(DEBUG2, "resolved a foreign transaction xid %u, serverid %d, userid %d", + fdwxact->local_xid, fdwxact->serverid, fdwxact->userid); + } + + LWLockAcquire(FdwXactLock, LW_EXCLUSIVE); + + /* + * Remove waiter from shmem queue, if not detached yet. The waiter + * could already be detached if user cancelled to wait before + * resolution. + */ + if (!SHMQueueIsDetached(&(waiter->fdwXactLinks))) + { + TransactionId wait_xid = waiter->fdwXactWaitXid; + + SHMQueueDelete(&(waiter->fdwXactLinks)); + pg_write_barrier(); + + /* Set state to complete */ + waiter->fdwXactState = FDW_XACT_WAIT_COMPLETE; + + /* Wake up the waiter only when we have set state and removed from queue */ + SetLatch(&(waiter->procLatch)); + + elog(DEBUG2, "released the proc xid %u", wait_xid); + } + + LWLockRelease(FdwXactLock); + + return true; +} + +/* + * Resolve all dangling foreign transactions on the given database. Get + * all dangling foreign transactions from shmem global array and resolve + * them one by one. 
+ *
+ * An entry counts as dangling when no backend holds it and its local
+ * transaction is not a prepared transaction. Resolution failures are raised
+ * as ERROR, so entries not yet processed are left for a later call.
+ */
+void
+FdwXactResolveAllDanglingTransactions(Oid dbid)
+{
+	List	   *dangling_fdwxacts = NIL;
+	ListCell   *cell;
+	int			n_resolved = 0;	/* a counter, so it must not be a bool */
+	int			i;
+
+	Assert(OidIsValid(dbid));
+
+	LWLockAcquire(FdwXactLock, LW_SHARED);
+
+	/*
+	 * Walk over the global array to make the list of dangling transactions
+	 * of which corresponding local transaction is on the given database.
+	 */
+	for (i = 0; i < FdwXactCtl->numFdwXacts; i++)
+	{
+		FdwXact		fxact = FdwXactCtl->fdw_xacts[i];
+
+		/*
+		 * Append the fdwxact entry on the given database to the list if
+		 * it's handled by nobody and the corresponding local transaction
+		 * is not part of the prepared transaction.
+		 */
+		if (fxact->dbid == dbid &&
+			fxact->held_by == InvalidBackendId &&
+			!TwoPhaseExists(fxact->local_xid))
+			dangling_fdwxacts = lappend(dangling_fdwxacts, fxact);
+	}
+
+	LWLockRelease(FdwXactLock);
+
+	/* Return if there is no foreign transaction we need to resolve */
+	if (dangling_fdwxacts == NIL)
+		return;
+
+	foreach(cell, dangling_fdwxacts)
+	{
+		FdwXact		fdwxact = (FdwXact) lfirst(cell);
+		FdwXactState *state;
+
+		state = create_fdw_xact_state();
+		state->serverid = fdwxact->serverid;
+		state->userid = fdwxact->userid;
+		state->umid = fdwxact->umid;
+		state->fdwxact_id = pstrdup(fdwxact->fdw_xact_id);
+
+		FdwXactResolveForeignTransaction(state, fdwxact, ERROR);
+
+		n_resolved++;
+	}
+
+	list_free(dangling_fdwxacts);
+
+	elog(DEBUG2, "resolved %d dangling foreign xacts", n_resolved);
+}
+
+/*
+ * AtEOXact_FdwXacts
+ *
+ * In commit case, we have already prepared transactions on the foreign
+ * servers during pre-commit. And that prepared transactions will be
+ * resolved by the resolver process. So we don't do anything about the
+ * foreign transaction.
+ *
+ * In abort case, user requested rollback or we changed over rollback
+ * due to error during commit. To close current foreign transaction anyway
+ * we call rollback API to every foreign transaction.
If we raised an error + * during preparing and came to here, it's possible that some entries of + * FdwXactParticipants already registered its FdwXact entry. If there is + * we leave them as dangling transaction and ask the resolver process to + * process them. + */ +extern void +AtEOXact_FdwXacts(bool is_commit) +{ + ListCell *lcell; + + if (!is_commit) + { + int left_fdwxacts = 0; + FdwXactState *state = create_fdw_xact_state(); + + foreach (lcell, FdwXactAtomicCommitParticipants) + { + FdwXactParticipant *fdw_part = lfirst(lcell); + + /* + * Count FdwXact entries that we registered to shared memory array + * in this transaction. + */ + if (fdw_part->fdw_xact) + { + /* + * The status of foreign transaction must be either preparing + * or prepared. In any case, since we have registered FdwXact + * entry we leave them to the resolver process. For the preparing + * state, since the foreign transaction might not close yet we + * fall through and call rollback API. For the prepared state, + * since the foreign transaction has closed we don't need to do + * anything. + */ + Assert(fdw_part->fdw_xact->status == FDW_XACT_PREPARING || + fdw_part->fdw_xact->status == FDW_XACT_PREPARED); + + left_fdwxacts++; + if (fdw_part->fdw_xact->status == FDW_XACT_PREPARED) + continue; + } + + state->serverid = fdw_part->server->serverid; + state->userid = fdw_part->usermapping->userid; + state->umid = fdw_part->usermapping->umid; + state->fdw_state = fdw_part->fdw_state; + + /* + * Rollback all current foreign transaction. Since we're rollbacking + * the transaction it's too late even if we raise an error here. + * So we log it as warning. 
+ */ + if (!fdw_part->rollback_foreign_xact(state)) + ereport(WARNING, + (errmsg("could not abort transaction on server \"%s\"", + fdw_part->server->servername))); + } + + /* If we left some FdwXact entries, ask the resolver process */ + if (left_fdwxacts > 0) + { + ereport(WARNING, + (errmsg("might have left %u foreign transactions in in-doubt status", + left_fdwxacts))); + fdwxact_maybe_launch_resolver(true); + } + } + + ForgetAllFdwXactParticipants(); + FdwXactAtomicCommitReady = false; +} + +/* + * AtPrepare_FdwXacts + * + * If there are foreign servers involved in the transaction, this function + * prepares transactions on those servers. + * + * Note that it can happen that the transaction aborts after we prepared part + * of participants. In this case since we can change to abort we cannot forget + * FdwXactAtomicCommitParticipants here. These are processed by the resolver process + * during aborting, or at EOXact_FdwXacts. + */ +void +AtPrepare_FdwXacts(void) +{ + if (FdwXactAtomicCommitParticipants == NIL) + return; + + /* Check for an invalid condition */ + if (!IsAtomicCommitRequested()) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot PREPARE a distributed transaction when distributed_atomic_commit is \'disabled\'"))); + + + /* + * We cannot prepare if any foreign server of participants isn't capable + * of two-phase commit. + */ + if (FdwXactAtomicCommitRequired() && + (MyXactFlags & XACT_FLAGS_FDWNOPREPARE) != 0) + ereport(ERROR, + (errcode(ERRCODE_T_R_INTEGRITY_CONSTRAINT_VIOLATION), + errmsg("can not prepare the transaction because some foreign servers involved in transaction can not prepare the transaction"))); + + /* Prepare transactions on participating foreign servers. */ + FdwXactPrepareForeignTransactions(); +} + +/* + * FdwXactResolveForeignTransaction + * + * Resolve the foreign transaction using the foreign data wrapper's transaction + * handler routine. 
The foreign transaction can be a dangling transaction + * that is not interested by nobody. If the fate of foreign transaction is + * not determined yet, it'sdetermined according to the status of corresponding + * local transaction. + * + * If the resolution is successful, remove the foreign transaction entry from + * the shared memory and also remove the corresponding on-disk file. + */ +static bool +FdwXactResolveForeignTransaction(FdwXactState *state, FdwXact fdwxact, + int elevel) +{ + ForeignServer *server; + ForeignDataWrapper *fdw; + FdwRoutine *fdw_routine; + bool is_commit; + bool ret; + + Assert(fdwxact); + + /* + * Determine whether we commit or abort this foreign transaction. + */ + if (fdwxact->status == FDW_XACT_COMMITTING_PREPARED) + is_commit = true; + else if (fdwxact->status == FDW_XACT_ABORTING_PREPARED) + is_commit = false; + + /* + * If the local transaction is already committed, commit prepared + * foreign transaction. + */ + else if (TransactionIdDidCommit(fdwxact->local_xid)) + { + fdwxact->status = FDW_XACT_COMMITTING_PREPARED; + is_commit = true; + } + + /* + * If the local transaction is already aborted, abort prepared + * foreign transactions. + */ + else if (TransactionIdDidAbort(fdwxact->local_xid)) + { + fdwxact->status = FDW_XACT_ABORTING_PREPARED; + is_commit = false; + } + + /* + * The local transaction is not in progress but the foreign + * transaction is not prepared on the foreign server. This + * can happen when transaction failed after registered this + * entry but before actual preparing on the foreign server. + * So let's assume it aborted. + */ + else if (!TransactionIdIsInProgress(fdwxact->local_xid)) + is_commit = false; + + /* + * The Local transaction is in progress and foreign transaction + * state is neither committing or aborting. This should not + * happen because we cannot determine to do commit or abort for + * foreign transaction associated with the in-progress local + * transaction. 
+ */ + else + ereport(ERROR, + (errmsg("cannot resolve the foreign transaction associated with in-progress transaction %u on server %u", + fdwxact->local_xid, fdwxact->serverid))); + + server = GetForeignServer(fdwxact->serverid); + fdw = GetForeignDataWrapper(server->fdwid); + fdw_routine = GetFdwRoutine(fdw->fdwhandler); + + /* Resolve the foreign transaction */ + Assert(fdw_routine->ResolveForeignTransaction); + + ret = fdw_routine->ResolveForeignTransaction(state, is_commit); + + if (!ret) + { + ereport(elevel, + (errmsg("could not %s a prepared foreign transaction on server \"%s\"", + is_commit ? "commit" : "rollback", server->servername), + errdetail("local transaction id is %u, connected by user id %u", + fdwxact->local_xid, fdwxact->userid))); + } + + /* Resolution was a success, remove the entry */ + LWLockAcquire(FdwXactLock, LW_EXCLUSIVE); + if (fdwxact->ondisk) + RemoveFdwXactFile(fdwxact->dbid, fdwxact->local_xid, + fdwxact->serverid, fdwxact->userid, + true); + remove_fdw_xact(fdwxact); + LWLockRelease(FdwXactLock); + + return ret; +} + +static FdwXactState * +create_fdw_xact_state(void) +{ + FdwXactState *state; + + state = palloc(sizeof(FdwXactState)); + state->serverid = InvalidOid; + state->userid = InvalidOid; + state->umid = InvalidOid; + state->fdwxact_id = NULL; + state->fdw_state = NULL; + + return state; +} + +/* + * Return one FdwXact entry that matches to given arguments, otherwise + * return NULL. Since this function search FdwXact entry by unique key + * all arguments should be valid. 
+ *
+ * If need_lock is true FdwXactLock is taken in shared mode for the search;
+ * otherwise the caller must already hold it.
+ */
+static FdwXact
+get_one_fdw_xact(Oid dbid, TransactionId xid, Oid serverid, Oid userid,
+				 bool need_lock)
+{
+	List	   *fdw_xact_list;
+	FdwXact		fdw_xact;
+
+	/* All search conditions must be valid values */
+	Assert(TransactionIdIsValid(xid));
+	Assert(OidIsValid(serverid));
+	Assert(OidIsValid(userid));
+	Assert(OidIsValid(dbid));
+
+	fdw_xact_list = get_fdw_xacts(dbid, xid, serverid, userid, need_lock);
+
+	/* Could not find entry */
+	if (fdw_xact_list == NIL)
+		return NULL;
+
+	/* Must be one entry since we search it by the unique key */
+	Assert(list_length(fdw_xact_list) == 1);
+
+	fdw_xact = (FdwXact) linitial(fdw_xact_list);
+
+	/* Only the list cells are ours; free them so repeated calls don't leak */
+	list_free(fdw_xact_list);
+
+	return fdw_xact;
+}
+
+/*
+ * Return true if there is at least one prepared foreign transaction
+ * which matches given arguments.
+ */
+bool
+fdw_xact_exists(Oid dbid, TransactionId xid, Oid serverid, Oid userid)
+{
+	List	   *fdw_xact_list;
+	bool		found;
+
+	fdw_xact_list = get_fdw_xacts(dbid, xid, serverid, userid, true);
+	found = (fdw_xact_list != NIL);
+	list_free(fdw_xact_list);
+
+	return found;
+}
+
+/*
+ * Returns an array of all foreign prepared transactions for the user-level
+ * function pg_prepared_fdw_xacts.
+ *
+ * WARNING -- we return even those transactions whose information is not
+ * completely filled yet. The caller should filter them out if he doesn't want them.
+ *
+ * The returned array is palloc'd.
+ */ +static FdwXact +get_all_fdw_xacts(int *length) +{ + List *all_fdw_xacts; + ListCell *lc; + FdwXact fdw_xacts; + int num_fdw_xacts = 0; + + Assert(length != NULL); + + /* Get all entries */ + all_fdw_xacts = get_fdw_xacts(InvalidOid, InvalidTransactionId, + InvalidOid, InvalidOid, true); + + if (all_fdw_xacts == NIL) + { + *length = 0; + return NULL; + } + + fdw_xacts = (FdwXact) + palloc(sizeof(FdwXactData) * list_length(all_fdw_xacts)); + + /* Convert list to array of FdwXact */ + foreach(lc, all_fdw_xacts) + { + FdwXact fx = (FdwXact) lfirst(lc); + + memcpy(fdw_xacts + num_fdw_xacts, fx, + sizeof(FdwXactData)); + num_fdw_xacts++; + } + + *length = num_fdw_xacts; + list_free(all_fdw_xacts); + + return fdw_xacts; +} + +/* + * Return a list of FdwXact matched to given arguments. Otherwise return + * NIL. + */ +static List* +get_fdw_xacts(Oid dbid, TransactionId xid, Oid serverid, Oid userid, + bool need_lock) +{ + int i; + List *fdw_xact_list = NIL; + + if (need_lock) + LWLockAcquire(FdwXactLock, LW_SHARED); + + for (i = 0; i < FdwXactCtl->numFdwXacts; i++) + { + FdwXact fdw_xact = FdwXactCtl->fdw_xacts[i]; + bool matches = true; + + /* xid */ + if (xid != InvalidTransactionId && xid != fdw_xact->local_xid) + matches = false; + + /* dbid */ + if (OidIsValid(dbid) && fdw_xact->dbid != dbid) + matches = false; + + /* serverid */ + if (OidIsValid(serverid) && serverid != fdw_xact->serverid) + matches = false; + + /* userid */ + if (OidIsValid(userid) && fdw_xact->userid != userid) + matches = false; + + /* Append it if matched */ + if (matches) + fdw_xact_list = lappend(fdw_xact_list, fdw_xact); + } + + if (need_lock) + LWLockRelease(FdwXactLock); + + return fdw_xact_list; +} + +/* + * fdw_xact_redo + * Apply the redo log for a foreign transaction. 
+ */ +void +fdw_xact_redo(XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG_FDW_XACT_INSERT) + { + /* + * Add fdwxact entry and set start/end lsn of the WAL record + * in FdwXact entry. + */ + LWLockAcquire(FdwXactLock, LW_EXCLUSIVE); + FdwXactRedoAdd(XLogRecGetData(record), + record->ReadRecPtr, + record->EndRecPtr); + LWLockRelease(FdwXactLock); + } + else if (info == XLOG_FDW_XACT_REMOVE) + { + xl_fdw_xact_remove *record = (xl_fdw_xact_remove *) rec; + + /* Delete FdwXact entry and file if exists */ + LWLockAcquire(FdwXactLock, LW_EXCLUSIVE); + FdwXactRedoRemove(record->dbid, record->xid, record->serverid, + record->userid, false); + LWLockRelease(FdwXactLock); + } + else + elog(ERROR, "invalid log type %d in foreign transction log record", info); + + return; +} + +/* + * Return a null-terminated foreign transaction identifier with in the form + * of "fx____ whose length is always + * less than NAMEDATALEN. + * + * Returned string value is used to identify foreign transaction. The + * identifier should not be same as any other concurrent prepared transaction + * identifier. + * + * To make the foreign transactionid unique, we should ideally use something + * like UUID, which gives unique ids with high probability, but that may be + * expensive here and UUID extension which provides the function to generate + * UUID is not part of the core code. 
+ */ +static char * +generate_fdw_xact_identifier(TransactionId xid, Oid serverid, Oid userid) +{ + char* fdw_xact_id; + + fdw_xact_id = (char *)palloc0(FDW_XACT_ID_MAX_LEN * sizeof(char)); + + snprintf(fdw_xact_id, FDW_XACT_ID_MAX_LEN, "%s_%ld_%u_%d_%d", + "fx", Abs(random()), xid, serverid, userid); + fdw_xact_id[strlen(fdw_xact_id)] = '\0'; + + return fdw_xact_id; +} + +/* + * CheckPointFdwXact + * + * We must fsync the foreign transaction state file that is valid or generated + * during redo and has a inserted LSN <= the checkpoint'S redo horizon. + * The foreign transaction entries and hence the corresponding files are expected + * to be very short-lived. By executing this function at the end, we might have + * lesser files to fsync, thus reducing some I/O. This is similar to + * CheckPointTwoPhase(). + * + * In order to avoid disk I/O while holding a light weight lock, the function + * first collects the files which need to be synced under FdwXactLock and then + * syncs them after releasing the lock. This approach creates a race condition: + * after releasing the lock, and before syncing a file, the corresponding + * foreign transaction entry and hence the file might get removed. The function + * checks whether that's true and ignores the error if so. + */ +void +CheckPointFdwXacts(XLogRecPtr redo_horizon) +{ + int cnt; + int serialized_fdw_xacts = 0; + + /* Quick get-away, before taking lock */ + if (max_prepared_foreign_xacts <= 0) + return; + + TRACE_POSTGRESQL_FDWXACT_CHECKPOINT_START(); + + LWLockAcquire(FdwXactLock, LW_SHARED); + + /* Another quick, before we allocate memory */ + if (FdwXactCtl->numFdwXacts <= 0) + { + LWLockRelease(FdwXactLock); + return; + } + + /* + * We are expecting there to be zero FdwXact that need to be copied to + * disk, so we perform all I/O while holding FdwXactLock for simplicity. 
+ * This presents any new foreign xacts from preparing while this occurs, + * which shouldn't be a problem since the presence fo long-lived prepared + * foreign xacts indicated the transaction manager isn't active. + * + * It's also possible to move I/O out of the lock, but on every error we + * should check whether somebody committed our transaction in different + * backend. Let's leave this optimisation for future, if somebody will + * spot that this place cause bottleneck. + * + * Note that it isn't possible for there to be a FdwXact with a + * insert_end_lsn set prior to the last checkpoint yet is marked + * invalid, because of the efforts with delayChkpt. + */ + for (cnt = 0; cnt < FdwXactCtl->numFdwXacts; cnt++) + { + FdwXact fxact = FdwXactCtl->fdw_xacts[cnt]; + + if ((fxact->valid || fxact->inredo) && + !fxact->ondisk && + fxact->insert_end_lsn <= redo_horizon) + { + char *buf; + int len; + + XlogReadFdwXactData(fxact->insert_start_lsn, &buf, &len); + RecreateFdwXactFile(fxact->dbid, fxact->local_xid, + fxact->serverid, fxact->userid, + buf, len); + fxact->ondisk = true; + fxact->insert_start_lsn = InvalidXLogRecPtr; + fxact->insert_end_lsn = InvalidXLogRecPtr; + pfree(buf); + serialized_fdw_xacts++; + } + } + + LWLockRelease(FdwXactLock); + + /* + * Flush unconditionally the parent directory to make any information + * durable on disk. FdwXact files could have been removed and those + * removals need to be made persistent as well as any files newly created. + */ + fsync_fname(FDW_XACTS_DIR, true); + + TRACE_POSTGRESQL_FDWXACT_CHECKPOINT_DONE(); + + if (log_checkpoints && serialized_fdw_xacts > 0) + ereport(LOG, + (errmsg_plural("%u foreign transaction state file was written " + "for long-running prepared transactions", + "%u foreign transaction state files were written " + "for long-running prepared transactions", + serialized_fdw_xacts, + serialized_fdw_xacts))); +} + +/* + * Reads foreign transaction data from xlog. 
+ * During checkpoint this data will be moved to fdwxact files and
+ * ReadFdwXactFile should be used instead.
+ *
+ * On return *buf points to a palloc'd copy of the record's main data and,
+ * if len is not NULL, *len holds its length.
+ *
+ * Note clearly that this function accesses WAL during normal operation, similarly
+ * to the way WALSender or Logical Decoding would do. It does not run during
+ * crash recovery or standby processing.
+ */
+static void
+XlogReadFdwXactData(XLogRecPtr lsn, char **buf, int *len)
+{
+	XLogReaderState *reader;
+	XLogRecord *rec;
+	char	   *errormsg;
+	uint32		datalen;
+
+	reader = XLogReaderAllocate(wal_segment_size, &read_local_xlog_page, NULL);
+	if (reader == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory"),
+				 errdetail("Failed while allocating an XLog reading processor.")));
+
+	rec = XLogReadRecord(reader, lsn, &errormsg);
+	if (rec == NULL)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read foreign transaction state from xlog at %X/%X",
+						(uint32) (lsn >> 32),
+						(uint32) lsn)));
+
+	/* The record must be a foreign transaction insertion record */
+	if (XLogRecGetRmid(reader) != RM_FDW_XACT_ID ||
+		(XLogRecGetInfo(reader) & ~XLR_INFO_MASK) != XLOG_FDW_XACT_INSERT)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("expected foreign transaction state data is not present in xlog at %X/%X",
+						(uint32) (lsn >> 32),
+						(uint32) lsn)));
+
+	/* Hand back a private copy; the reader's buffer goes away below */
+	datalen = XLogRecGetDataLen(reader);
+	if (len != NULL)
+		*len = datalen;
+
+	*buf = palloc(datalen);
+	memcpy(*buf, XLogRecGetData(reader), datalen);
+
+	XLogReaderFree(reader);
+}
+
+/*
+ * Recreates a foreign transaction state file. This is used in WAL replay
+ * and during checkpoint creation.
+ *
+ * Note: content and len don't include CRC.
+ */ +void +RecreateFdwXactFile(Oid dbid, TransactionId xid, Oid serverid, + Oid userid, void *content, int len) +{ + char path[MAXPGPATH]; + pg_crc32c statefile_crc; + int fd; + + /* Recompute CRC */ + INIT_CRC32C(statefile_crc); + COMP_CRC32C(statefile_crc, content, len); + FIN_CRC32C(statefile_crc); + + FdwXactFilePath(path, dbid, xid, serverid, userid); + + fd = OpenTransientFile(path, O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY); + + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not recreate foreign transaction state file \"%s\": %m", + path))); + + /* Write content and CRC */ + pgstat_report_wait_start(WAIT_EVENT_FDW_XACT_FILE_WRITE); + if (write(fd, content, len) != len) + { + pgstat_report_wait_end(); + CloseTransientFile(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write foreign transcation state file: %m"))); + } + if (write(fd, &statefile_crc, sizeof(pg_crc32c)) != sizeof(pg_crc32c)) + { + pgstat_report_wait_end(); + CloseTransientFile(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write foreign transcation state file: %m"))); + } + pgstat_report_wait_end(); + + /* + * We must fsync the file because the end-of-replay checkpoint will not do + * so, there being no FDWXACT in shared memory yet to tell it to. + */ + pgstat_report_wait_start(WAIT_EVENT_FDW_XACT_FILE_SYNC); + if (pg_fsync(fd) != 0) + { + pgstat_report_wait_end(); + CloseTransientFile(fd); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync foreign transaction state file: %m"))); + } + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close foreign transaction file: %m"))); +} + +/* + * ProcessFdwXactBuffer + * + * Given a transaction id, userid and serverid read it either from disk + * or read it directly via shmem xlog record pointer using the provided + * "insert_start_lsn". 
+ */ +static char * +ProcessFdwXactBuffer(Oid dbid, TransactionId xid, Oid serverid, + Oid userid, XLogRecPtr insert_start_lsn, bool fromdisk) +{ + TransactionId origNextXid = ShmemVariableCache->nextXid; + char *buf; + + Assert(LWLockHeldByMeInMode(FdwXactLock, LW_EXCLUSIVE)); + + if (!fromdisk) + Assert(insert_start_lsn != InvalidXLogRecPtr); + + if (TransactionIdFollowsOrEquals(xid, origNextXid)) + { + if (fromdisk) + { + ereport(WARNING, + (errmsg("removing future fdwxact state file for xid %u, server %u and user %u", + xid, serverid, userid))); + RemoveFdwXactFile(dbid, xid, serverid, userid, true); + } + else + { + ereport(WARNING, + (errmsg("removing future fdwxact state from memory for xid %u, server %u and user %u", + xid, serverid, userid))); + FdwXactRedoRemove(dbid, xid, serverid, userid, true); + } + return NULL; + } + + if (fromdisk) + { + buf = ReadFdwXactFile(dbid, xid, serverid, userid, true); + if (buf == NULL) + { + ereport(WARNING, + (errmsg("removing corrupt fdwxact state file for xid %u, server %u and user %u", + xid, serverid, userid))); + RemoveFdwXactFile(dbid, xid, serverid, userid, true); + return NULL; + } + } + else + { + /* Read xlog data */ + XlogReadFdwXactData(insert_start_lsn, &buf, NULL); + } + + return buf; +} + +/* + * Read and validate the foreign transaction state file. + * + * If it looks OK (has a valid magic number and CRC), return thecontents in + * a structure allocated in-memory. Otherwise return NULL. The structure can + * be later freed by the caller. 
+ */ +static char * +ReadFdwXactFile(Oid dbid, TransactionId xid, Oid serverid, Oid userid, + bool give_warnings) +{ + char path[MAXPGPATH]; + int fd; + FdwXactOnDiskData *fxact_file_data; + struct stat stat; + uint32 crc_offset; + pg_crc32c calc_crc; + pg_crc32c file_crc; + char *buf; + + FdwXactFilePath(path, dbid, xid, serverid, userid); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open FDW transaction state file \"%s\": %m", + path))); + + /* + * Check file length. We can determine a lower bound pretty easily. We + * set an upper bound to avoid palloc() failure on a corrupt file, though + * we can't guarantee that we won't get an out of memory error anyway, + * even on a valid file. + */ + if (fstat(fd, &stat)) + { + CloseTransientFile(fd); + if (give_warnings) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not stat FDW transaction state file \"%s\": %m", + path))); + return NULL; + } + + if (stat.st_size < (offsetof(FdwXactOnDiskData, fdw_xact_id) + + sizeof(pg_crc32c)) || + stat.st_size > MaxAllocSize) + { + CloseTransientFile(fd); + ereport(WARNING, + (errcode_for_file_access(), + errmsg("too large FDW transaction state file \"%s\": %m", + path))); + return NULL; + } + + crc_offset = stat.st_size - sizeof(pg_crc32c); + if (crc_offset != MAXALIGN(crc_offset)) + { + CloseTransientFile(fd); + return NULL; + } + + /* + * Ok, slurp in the file. 
+ */ + buf = (char *) palloc(stat.st_size); + fxact_file_data = (FdwXactOnDiskData *) buf; + + /* Slurp the file */ + pgstat_report_wait_start(WAIT_EVENT_FDW_XACT_FILE_READ); + if (read(fd, buf, stat.st_size) != stat.st_size) + { + pgstat_report_wait_end(); + CloseTransientFile(fd); + if (give_warnings) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not read FDW transaction state file \"%s\": %m", + path))); + return NULL; + } + + pgstat_report_wait_end(); + CloseTransientFile(fd); + + /* + * Check the CRC. + */ + INIT_CRC32C(calc_crc); + COMP_CRC32C(calc_crc, buf, crc_offset); + FIN_CRC32C(calc_crc); + + file_crc = *((pg_crc32c *) (buf + crc_offset)); + + if (!EQ_CRC32C(calc_crc, file_crc)) + { + pfree(buf); + return NULL; + } + + /* Check if the contents is an expected data */ + fxact_file_data = (FdwXactOnDiskData *) buf; + if (fxact_file_data->dbid != dbid || + fxact_file_data->serverid != serverid || + fxact_file_data->userid != userid || + fxact_file_data->local_xid != xid) + { + ereport(WARNING, + (errmsg("invalid foreign transaction state file \"%s\"", + path))); + CloseTransientFile(fd); + pfree(buf); + return NULL; + } + + return buf; +} + +/* + * PrescanFdwXacts + * + * Scan the all foreign transactions directory for oldest active transaction. + * This is run during database startup, after we completed reading WAL. + * ShmemVariableCache->nextXid has been set to one more than the highest XID + * for which evidence exists in WAL. 
+ *
+ * Returns the given oldestActiveXid, lowered to the oldest local xid found
+ * among the state files. Files belonging to an xid at or beyond nextXid are
+ * removed.
+ */
+TransactionId
+PrescanFdwXacts(TransactionId oldestActiveXid)
+{
+	TransactionId next_xid = ShmemVariableCache->nextXid;
+	DIR		   *dir;
+	struct dirent *entry;
+
+	dir = AllocateDir(FDW_XACTS_DIR);
+	while ((entry = ReadDir(dir, FDW_XACTS_DIR)) != NULL)
+	{
+		Oid			dbid;
+		Oid			serverid;
+		Oid			userid;
+		TransactionId local_xid;
+
+		/* Ignore anything that is not named like a state file */
+		if (strlen(entry->d_name) != FDW_XACT_FILE_NAME_LEN ||
+			strspn(entry->d_name, "0123456789ABCDEF_") != FDW_XACT_FILE_NAME_LEN)
+			continue;
+
+		sscanf(entry->d_name, "%08x_%08x_%08x_%08x",
+			   &dbid, &local_xid, &serverid, &userid);
+
+		/*
+		 * Remove a foreign prepared transaction file corresponding to an
+		 * XID, which is too new.
+		 */
+		if (TransactionIdFollowsOrEquals(local_xid, next_xid))
+		{
+			ereport(WARNING,
+					(errmsg("removing future foreign prepared transaction file \"%s\"",
+							entry->d_name)));
+			RemoveFdwXactFile(dbid, local_xid, serverid, userid, true);
+			continue;
+		}
+
+		/* Track the oldest xid seen so far */
+		if (TransactionIdPrecedesOrEquals(local_xid, oldestActiveXid))
+			oldestActiveXid = local_xid;
+	}
+
+	FreeDir(dir);
+	return oldestActiveXid;
+}
+
+/*
+ * restoreFdwXactData
+ *
+ * Scan pg_fdw_xact and fill FdwXact depending on the on-disk data.
+ * This is called once at the beginning of recovery, saving any extra
+ * lookups in the future. FdwXact files that are newer than the
+ * minimum XID horizon are discarded on the way.
+ */ +void +restoreFdwXactData(void) +{ + DIR *cldir; + struct dirent *clde; + + /* FdwXactRedoAdd and ProcessFdwXactBuffer assert that this lock is held exclusively */ + LWLockAcquire(FdwXactLock, LW_EXCLUSIVE); + cldir = AllocateDir(FDW_XACTS_DIR); + while ((clde = ReadDir(cldir, FDW_XACTS_DIR)) != NULL) + { + /* Only consider names made of FDW_XACT_FILE_NAME_LEN hex digits and underscores */ + if (strlen(clde->d_name) == FDW_XACT_FILE_NAME_LEN && + strspn(clde->d_name, "0123456789ABCDEF_") == FDW_XACT_FILE_NAME_LEN) + { + TransactionId local_xid; + Oid dbid; + Oid serverid; + Oid userid; + char *buf; + + sscanf(clde->d_name, "%08x_%08x_%08x_%08x", + &dbid, &local_xid, &serverid, &userid); + + /* Read fdwxact data from disk */ + buf = ProcessFdwXactBuffer(dbid, local_xid, serverid, userid, + InvalidXLogRecPtr, true); + + if (buf == NULL) + continue; + + /* Add this entry into the table of foreign transactions */ + FdwXactRedoAdd(buf, InvalidXLogRecPtr, InvalidXLogRecPtr); + } + } + + LWLockRelease(FdwXactLock); + FreeDir(cldir); +} + +/* + * Remove the foreign transaction file for given entry. + * + * If giveWarning is false, do not complain about file-not-present; + * this is an expected case during WAL replay. + */ +static void +RemoveFdwXactFile(Oid dbid, TransactionId xid, Oid serverid, Oid userid, bool giveWarning) +{ + char path[MAXPGPATH]; + + FdwXactFilePath(path, dbid, xid, serverid, userid); + if (unlink(path) < 0 && (errno != ENOENT || giveWarning)) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove foreign transaction state file \"%s\": %m", + path))); +} + +/* + * FdwXactRedoAdd + * + * Store pointer to the start/end of the WAL record along with the xid in + * a fdwxact entry in shared memory FdwXactData structure. + * + * buf points to the FdwXactOnDiskData image of the entry. The caller must + * hold FdwXactLock exclusively and recovery must be in progress. + */ +static void +FdwXactRedoAdd(char *buf, XLogRecPtr start_lsn, XLogRecPtr end_lsn) +{ + FdwXactOnDiskData *fxact_data = (FdwXactOnDiskData *) buf; + FdwXact fxact; + + Assert(LWLockHeldByMeInMode(FdwXactLock, LW_EXCLUSIVE)); + Assert(RecoveryInProgress()); + + /* + * Add this entry into the table of foreign transactions. 
The + * status of the transaction is set as preparing, since we do not + * know the exact status right now. Resolver will set it later + * based on the status of local transaction which prepared this + * foreign transaction. + */ + fxact = insert_fdw_xact(fxact_data->dbid, fxact_data->local_xid, + fxact_data->serverid, fxact_data->userid, + fxact_data->umid, fxact_data->fdw_xact_id); + + /* + * Mark the entry as added during redo and not yet valid. It is + * flagged as on disk when no WAL position was given, which is how + * restoreFdwXactData passes entries it loaded from state files. + */ + fxact->status = FDW_XACT_PREPARING; + fxact->insert_start_lsn = start_lsn; + fxact->insert_end_lsn = end_lsn; + fxact->inredo = true; /* added in redo */ + fxact->valid = false; + fxact->ondisk = XLogRecPtrIsInvalid(start_lsn); +} + +/* + * FdwXactRedoRemove + * + * Remove the corresponding fdw_xact entry from FdwXactCtl. + * Also remove fdw_xact file if a foreign transaction was saved + * via an earlier checkpoint. + * + * Does nothing if no matching entry exists. The caller must hold + * FdwXactLock exclusively. + */ +void +FdwXactRedoRemove(Oid dbid, TransactionId xid, Oid serverid, + Oid userid, bool givewarning) +{ + FdwXact fdwxact; + + Assert(LWLockHeldByMeInMode(FdwXactLock, LW_EXCLUSIVE)); + Assert(RecoveryInProgress()); + + fdwxact = get_one_fdw_xact(dbid, xid, serverid, userid, + false); + + if (fdwxact == NULL) + return; + + /* Clean up entry and any files we may have left */ + if (fdwxact->ondisk) + RemoveFdwXactFile(fdwxact->dbid, fdwxact->local_xid, + fdwxact->serverid, fdwxact->userid, + givewarning); + remove_fdw_xact(fdwxact); +} + +/* + * Scan the shared memory entries of FdwXact and valid them. + * + * This is run at the end of recovery, but before we allow backends to write + * WAL. 
+ */ +void +RecoverFdwXacts(void) +{ + int i; + + LWLockAcquire(FdwXactLock, LW_EXCLUSIVE); + for (i = 0; i < FdwXactCtl->numFdwXacts; i++) + { + FdwXact fdwxact = FdwXactCtl->fdw_xacts[i]; + char *buf; + + buf = ProcessFdwXactBuffer(fdwxact->dbid, fdwxact->local_xid, + fdwxact->serverid, fdwxact->userid, + fdwxact->insert_start_lsn, fdwxact->ondisk); + + if (buf == NULL) + continue; + + ereport(LOG, + (errmsg("recovering foreign transaction %u for server %u and user %u from shared memory", + fdwxact->local_xid, fdwxact->serverid, fdwxact->userid))); + + fdwxact->inredo = false; + fdwxact->valid = true; + } + LWLockRelease(FdwXactLock); +} + +bool +check_distributed_atomic_commit(int *newval, void **extra, GucSource source) +{ + DistributedAtomicCommitLevel newDistributedAtomicCommitLevel = *newval; + + /* Parameter check */ + if (newDistributedAtomicCommitLevel > DISTRIBUTED_ATOMIC_COMMIT_DISABLED && + (max_prepared_foreign_xacts == 0 || max_foreign_xact_resolvers == 0)) + { + GUC_check_errdetail("Cannot enable \"distributed_atomic_commit\" when " + "\"max_prepared_foreign_transactions\" or \"max_foreign_transaction_resolvers\"" + "is zero value"); + return false; + } + + return true; +} + +/* Built in functions */ +/* + * Structure to hold and iterate over the foreign transactions to be displayed + * by the built-in functions. 
+ */ +typedef struct +{ + FdwXact fdw_xacts; + int num_xacts; + int cur_xact; +} WorkingStatus; + +Datum +pg_prepared_fdw_xacts(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + WorkingStatus *status; + char *xact_status; + + if (SRF_IS_FIRSTCALL()) + { + TupleDesc tupdesc; + MemoryContext oldcontext; + int num_fdw_xacts = 0; + + /* create a function context for cross-call persistence */ + funcctx = SRF_FIRSTCALL_INIT(); + + /* + * Switch to memory context appropriate for multiple function calls + */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* build tupdesc for result tuples */ + /* this had better match pg_fdw_xacts view in system_views.sql */ + tupdesc = CreateTemplateTupleDesc(6, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "dbid", + OIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "transaction", + XIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "serverid", + OIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "userid", + OIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "status", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 6, "identifier", + TEXTOID, -1, 0); + + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + /* + * Collect status information that we will format and send out as a + * result set. + */ + status = (WorkingStatus *) palloc(sizeof(WorkingStatus)); + funcctx->user_fctx = (void *) status; + + status->fdw_xacts = get_all_fdw_xacts(&num_fdw_xacts); + status->num_xacts = num_fdw_xacts; + status->cur_xact = 0; + + MemoryContextSwitchTo(oldcontext); + } + + funcctx = SRF_PERCALL_SETUP(); + status = funcctx->user_fctx; + + while (status->cur_xact < status->num_xacts) + { + FdwXact fdw_xact = &status->fdw_xacts[status->cur_xact++]; + Datum values[6]; + bool nulls[6]; + HeapTuple tuple; + Datum result; + + if (!fdw_xact->valid) + continue; + + /* + * Form tuple with appropriate data. 
+ */ + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = ObjectIdGetDatum(fdw_xact->dbid); + values[1] = TransactionIdGetDatum(fdw_xact->local_xid); + values[2] = ObjectIdGetDatum(fdw_xact->serverid); + values[3] = ObjectIdGetDatum(fdw_xact->userid); + switch (fdw_xact->status) + { + case FDW_XACT_PREPARING: + xact_status = "prepared"; + break; + case FDW_XACT_COMMITTING_PREPARED: + xact_status = "committing"; + break; + case FDW_XACT_ABORTING_PREPARED: + xact_status = "aborting"; + break; + default: + xact_status = "unknown"; + break; + } + values[4] = CStringGetTextDatum(xact_status); + /* should this be really interpreted by FDW */ + values[5] = PointerGetDatum(cstring_to_text_with_len(fdw_xact->fdw_xact_id, + strlen(fdw_xact->fdw_xact_id))); + + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(tuple); + SRF_RETURN_NEXT(funcctx, result); + } + + SRF_RETURN_DONE(funcctx); +} + +/* + * Built-in function to resolve a prepared foreign transaction manually. + */ +Datum +pg_resolve_fdw_xact(PG_FUNCTION_ARGS) +{ + TransactionId xid = DatumGetTransactionId(PG_GETARG_DATUM(0)); + Oid serverid = PG_GETARG_OID(1); + Oid userid = PG_GETARG_OID(2); + FdwXactState *state; + UserMapping *usermapping; + FdwXact fdwxact; + bool ret; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to resolve foreign transactions")))); + + fdwxact = get_one_fdw_xact(MyDatabaseId, xid, serverid, userid, true); + + if (fdwxact == NULL) + PG_RETURN_BOOL(false); + + usermapping = GetUserMapping(userid, serverid); + + state = create_fdw_xact_state(); + state->serverid = serverid; + state->userid = userid; + state->umid = usermapping->umid; + + ret = FdwXactResolveForeignTransaction(state, fdwxact, LOG); + + PG_RETURN_BOOL(ret); +} + +/* + * Built-in function to remove a prepared foreign transaction entry without + * resolution. 
The function gives a way to forget about such prepared + * transaction in case: the foreign server where it is prepared is no longer + * available, the user which prepared this transaction needs to be dropped. + */ +Datum +pg_remove_fdw_xact(PG_FUNCTION_ARGS) +{ + TransactionId xid = DatumGetTransactionId(PG_GETARG_DATUM(0)); + Oid serverid = PG_GETARG_OID(1); + Oid userid = PG_GETARG_OID(2); + FdwXact fdwxact; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to remove foreign transactions")))); + + LWLockAcquire(FdwXactLock, LW_EXCLUSIVE); + + fdwxact = get_one_fdw_xact(MyDatabaseId, xid, serverid, userid, false); + if (fdwxact == NULL) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + (errmsg("could not find foreign transaction entry")))); + + remove_fdw_xact(fdwxact); + + LWLockRelease(FdwXactLock); + + PG_RETURN_VOID(); +} diff --git a/src/backend/access/fdwxact/fdwxact_launcher.c b/src/backend/access/fdwxact/fdwxact_launcher.c new file mode 100644 index 0000000..39f351b --- /dev/null +++ b/src/backend/access/fdwxact/fdwxact_launcher.c @@ -0,0 +1,641 @@ +/*------------------------------------------------------------------------- + * + * fdwxact_launcher.c + * + * The foreign transaction resolver launcher process starts foreign + * transaction resolver processes. The launcher schedules resolver + * process to be started when arrived a requested by backend process. + * + * There is a shared memory area where the information of resolver process + * is stored. Requesting of starting new resolver process by backend process + * is done via that shared memory area. Note that the launcher is assuming + * that there is no more than one starting request for a database. 
+ * + * Portions Copyright (c) 2018, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/fdwxact/fdwxact_launcher.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "funcapi.h" +#include "pgstat.h" +#include "funcapi.h" + +#include "access/fdwxact.h" +#include "access/fdwxact_launcher.h" +#include "access/fdwxact_resolver.h" +#include "access/resolver_internal.h" +#include "postmaster/bgworker.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "tcop/tcopprot.h" + +/* max sleep time between cycles (3min) */ +#define DEFAULT_NAPTIME_PER_CYCLE 180000L + +static void fdwxact_launcher_onexit(int code, Datum arg); +static void fdwxact_launcher_sighup(SIGNAL_ARGS); +static void fdwxact_launcher_sigusr2(SIGNAL_ARGS); +static void fdwxact_launch_resolver(Oid dbid, int slot); +static bool fdwxact_relaunch_resolvers(void); + +static volatile sig_atomic_t got_SIGHUP = false; +static volatile sig_atomic_t got_SIGUSR2 = false; +FdwXactResolver *MyFdwXactResolver = NULL; + +Datum pg_stat_get_fdwxact_resolver(PG_FUNCTION_ARGS); + +/* + * Wake up the launcher process to retry launch. This is used by + * the resolver process is being stopped. + */ +void +FdwXactLauncherWakeupToRetry(void) +{ + if (FdwXactRslvCtl->launcher_pid != InvalidPid) + SetLatch(FdwXactRslvCtl->launcher_latch); +} + +/* + * Wake up the launcher process to request resolution. This is + * used by the backend process. + */ +void +FdwXactLauncherWakeupToRequest(void) +{ + if (FdwXactRslvCtl->launcher_pid != InvalidPid) + kill(FdwXactRslvCtl->launcher_pid, SIGUSR2); +} + +/* Report shared memory space needed by FdwXactRsoverShmemInit */ +Size +FdwXactRslvShmemSize(void) +{ + Size size = 0; + + size = add_size(size, mul_size(max_foreign_xact_resolvers, + sizeof(FdwXactResolver))); + + return size; +} + +/* + * Allocate and initialize foreign transaction resolver shared + * memory. 
+ */ +void +FdwXactRslvShmemInit(void) +{ + bool found; + + FdwXactRslvCtl = ShmemInitStruct("Foreign transactions resolvers", + FdwXactRslvShmemSize(), + &found); + + if (!IsUnderPostmaster) + { + int slot; + + /* First time through, so initialize */ + MemSet(FdwXactRslvCtl, 0, FdwXactRslvShmemSize()); + + SHMQueueInit(&(FdwXactRslvCtl->FdwXactActiveQueue)); + SHMQueueInit(&(FdwXactRslvCtl->FdwXactRetryQueue)); + + for (slot = 0; slot < max_foreign_xact_resolvers; slot++) + { + FdwXactResolver *resolver = &FdwXactRslvCtl->resolvers[slot]; + + resolver->pid = InvalidPid; + resolver->dbid = InvalidOid; + resolver->in_use = false; + SpinLockInit(&(resolver->mutex)); + } + } +} + +/* + * Cleanup function for fdwxact launcher + * + * Called on fdwxact launcher exit. + */ +static void +fdwxact_launcher_onexit(int code, Datum arg) +{ + FdwXactRslvCtl->launcher_pid = InvalidPid; +} + +/* SIGHUP: set flag to reload configuration at next convenient time */ +static void +fdwxact_launcher_sighup(SIGNAL_ARGS) +{ + int save_errno = errno; + + got_SIGHUP = true; + + SetLatch(MyLatch); + + errno = save_errno; +} + +/* SIGUSR2: set flag to launch new resolver process immediately */ +static void +fdwxact_launcher_sigusr2(SIGNAL_ARGS) +{ + int save_errno = errno; + + got_SIGUSR2 = true; + SetLatch(MyLatch); + + errno = save_errno; +} + +/* + * Main loop for the fdwxact launcher process. 
+ */ +void +FdwXactLauncherMain(Datum main_arg) +{ + TimestampTz last_start_time = 0; + + ereport(DEBUG1, + (errmsg("fdwxact resolver launcher started"))); + + before_shmem_exit(fdwxact_launcher_onexit, (Datum) 0); + + Assert(FdwXactRslvCtl->launcher_pid == 0); + FdwXactRslvCtl->launcher_pid = MyProcPid; + FdwXactRslvCtl->launcher_latch = &MyProc->procLatch; + + pqsignal(SIGHUP, fdwxact_launcher_sighup); + pqsignal(SIGUSR2, fdwxact_launcher_sigusr2); + pqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + BackgroundWorkerInitializeConnection(NULL, NULL, 0); + + /* Enter main loop */ + for (;;) + { + TimestampTz now; + long wait_time = DEFAULT_NAPTIME_PER_CYCLE; + int rc; + + CHECK_FOR_INTERRUPTS(); + ResetLatch(MyLatch); + + now = GetCurrentTimestamp(); + + /* + * Limit the start retry to once a foreign_xact_resolution_retry_interval + * but always try to start by the backend request. + */ + if (got_SIGUSR2 || + TimestampDifferenceExceeds(last_start_time, now, + foreign_xact_resolution_retry_interval)) + { + MemoryContext oldctx; + MemoryContext subctx; + bool launched; + + ResetLatch(MyLatch); + if (got_SIGUSR2) + got_SIGUSR2 = false; + + subctx = AllocSetContextCreate(TopMemoryContext, + "Foreign Transaction Launcher launch", + ALLOCSET_DEFAULT_SIZES); + oldctx = MemoryContextSwitchTo(subctx); + + /* + * Launch foreign transaction resolvers that are requested + * but not running. + */ + launched = fdwxact_relaunch_resolvers(); + if (launched) + { + last_start_time = now; + wait_time = foreign_xact_resolution_retry_interval; + } + + /* Switch back to original memory context. */ + MemoryContextSwitchTo(oldctx); + /* Clean the temporary memory. */ + MemoryContextDelete(subctx); + } + else + { + /* + * The wait in previous cycle was interrupted in less than + * foreign_xact_resolution_retry_interval since last resolver + * started, this usually means crash of the resolver, so we + * should retry in foreign_xact_resolution_retry_interval again. 
+ */ + wait_time = foreign_xact_resolution_retry_interval; + } + + /* Wait for more work */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + wait_time, + WAIT_EVENT_FDW_XACT_LAUNCHER_MAIN); + + if (rc & WL_POSTMASTER_DEATH) + proc_exit(1); + + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } + + if (got_SIGHUP) + { + got_SIGHUP = false; + ProcessConfigFile(PGC_SIGHUP); + } + } + + /* Not reachable */ +} + +/* + * Request launcher to launch a new foreign transaction resolver worker + * if not running yet. A foreign transaction resolver worker is responsible + * for resolution of foreign transaction that are registered on a database. + * So if a resolver worker already is launched, we don't need to launch new + * one. + */ +void +fdwxact_maybe_launch_resolver(bool ignore_error) +{ + FdwXactResolver *resolver; + bool found = false; + int i; + + /* + * Looking for a resolver process that is running and working on the + * same database. + */ + LWLockAcquire(FdwXactResolverLock, LW_SHARED); + for (i = 0; i < max_foreign_xact_resolvers; i++) + { + resolver = &FdwXactRslvCtl->resolvers[i]; + + if (resolver->in_use && + resolver->pid != InvalidPid && + resolver->dbid == MyDatabaseId) + { + found = true; + break; + } + } + LWLockRelease(FdwXactResolverLock); + + /* + * If we found the resolver for my database, we don't need to launch new + * one but wake running worker up. + */ + if (found) + { + SetLatch(resolver->latch); + + elog(DEBUG1, "found a running foreign transaction resolver process for database %u", + MyDatabaseId); + + return; + } + + /* Looking for unused resolver slot */ + LWLockAcquire(FdwXactResolverLock, LW_EXCLUSIVE); + for (i = 0; i < max_foreign_xact_resolvers; i++) + { + resolver = &FdwXactRslvCtl->resolvers[i]; + + if (!resolver->in_use) + { + found = true; + break; + } + } + + /* + * However if there are no more free worker slots, inform user about it before + * exiting. 
+ */ + if (!found) + { + LWLockRelease(FdwXactResolverLock); + + ereport(ERROR, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("out of foreign trasanction resolver slots"), + errhint("You might need to increase max_foreign_transaction_resolvers."))); + return; + } + + Assert(resolver->pid == InvalidPid); + + /* Found a new resolver process */ + resolver->dbid = MyDatabaseId; + resolver->in_use = true; + + LWLockRelease(FdwXactResolverLock); + + /* Wake up launcher */ + FdwXactLauncherWakeupToRequest(); +} + +/* + * Launch a foreign transaction resolver process that will connect to given + * 'dbid' at 'slot' if given. If slot is negative value we find an unused slot. + * Note that caller must hold FdwXactResolverLock in exclusive mode. + */ +static void +fdwxact_launch_resolver(Oid dbid, int slot) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + FdwXactResolver *resolver; + int launch_slot = slot; + + /* If slot number is invalid, we find an unused slot */ + if (launch_slot < 0) + { + int i; + + for (i = 0; i < max_foreign_xact_resolvers; i++) + { + FdwXactResolver *resolver = &FdwXactRslvCtl->resolvers[i]; + + if (resolver->in_use && resolver->dbid == dbid) + return; + + if (!resolver->in_use) + { + launch_slot = i; + break; + } + } + } + + /* No unused found */ + if (launch_slot < 0) + ereport(ERROR, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("out of foreign trasanction resolver slots"), + errhint("You might need to increase max_foreign_transaction_resolvers."))); + + resolver = &FdwXactRslvCtl->resolvers[launch_slot]; + resolver->in_use = true; + resolver->dbid = dbid; + + /* Register the new dynamic worker */ + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | + BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FdwXactResolverMain"); + 
snprintf(bgw.bgw_name, BGW_MAXLEN, + "foreign transaction resolver for database %u", resolver->dbid); + snprintf(bgw.bgw_type, BGW_MAXLEN, "foreign transaction resolver"); + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_main_arg = Int32GetDatum(launch_slot); + bgw.bgw_notify_pid = (Datum) 0; + + if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + { + /* Failed to launch, cleanup the worker slot */ + SpinLockAcquire(&(MyFdwXactResolver->mutex)); + resolver->in_use = false; + SpinLockRelease(&(MyFdwXactResolver->mutex)); + + ereport(WARNING, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("out of background worker slots"), + errhint("You might need to increase max_worker_processes."))); + } + + /* + * We don't need to wait until it attaches here because we're going to wait + * until all foreign transactions are resolved. + */ +} + +/* + * Launch all foreign transaction resolvers that are required by backend process + * but not running. Return true if we launch any resolver. + */ +static bool +fdwxact_relaunch_resolvers(void) +{ + int i, j; + int num_launches = 0; + int num_unused_slots = 0; + int num_dbs = 0; + bool launched = false; + Oid *dbs_to_launch; + Oid *dbs_having_worker = palloc0(sizeof(Oid) * max_foreign_xact_resolvers); + + /* + * Launch resolver workers on the databases that are requested + * by backend processes while looking unused slots. 
+ */ + LWLockAcquire(FdwXactResolverLock, LW_EXCLUSIVE); + for (i = 0; i < max_foreign_xact_resolvers; i++) + { + FdwXactResolver *resolver = &FdwXactRslvCtl->resolvers[i]; + + /* Remember unused worker slots */ + if (!resolver->in_use) + { + num_unused_slots++; + continue; + } + + /* Remember databases that are having a resolve worker, fall through */ + if (OidIsValid(resolver->dbid)) + dbs_having_worker[num_dbs++] = resolver->dbid; + + /* Launch the backend-requested worker */ + if (resolver->in_use && + OidIsValid(resolver->dbid) && + resolver->pid == InvalidPid) + { + fdwxact_launch_resolver(resolver->dbid, i); + launched = true; + } + } + LWLockRelease(FdwXactResolverLock); + + /* quick exit if no unused slot */ + if (num_unused_slots == 0) + return launched; + + /* + * Launch the stopped resolver on the database that has unresolved + * foreign transaction but doesn't have any resolver. Scanning + * all FdwXact entries could take time but it's harmless for the + * relaunch case. + */ + dbs_to_launch = (Oid *) palloc(sizeof(Oid) * num_unused_slots); + LWLockAcquire(FdwXactResolverLock, LW_EXCLUSIVE); + for (i = 0; i < FdwXactCtl->numFdwXacts; i++) + { + FdwXact fdw_xact = FdwXactCtl->fdw_xacts[i]; + bool found = false; + + /* unused slot is full */ + if (num_launches > num_unused_slots) + break; + + for (j = 0; j < num_dbs; j++) + { + if (dbs_having_worker[j] == fdw_xact->dbid) + { + found = true; + break; + } + } + + /* Register the database if any resolvers aren't working on that */ + if (!found) + dbs_to_launch[num_launches++] = fdw_xact->dbid; + } + + /* Launch resolver process for a database at any worker slot */ + for (i = 0; i < num_launches; i++) + { + fdwxact_launch_resolver(dbs_to_launch[i], -1); + launched = true; + } + LWLockRelease(FdwXactResolverLock); + + return launched; +} + +/* + * FdwXactLauncherRegister + * Register a background worker running the foreign transaction + * launcher. 
+ */ +void +FdwXactLauncherRegister(void) +{ + BackgroundWorker bgw; + + if (max_foreign_xact_resolvers == 0) + return; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | + BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FdwXactLauncherMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, + "foreign transaction launcher"); + snprintf(bgw.bgw_type, BGW_MAXLEN, + "foreign transaction launcher"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + +bool +IsFdwXactLauncher(void) +{ + return FdwXactRslvCtl->launcher_pid == MyProcPid; +} + + +/* + * Returns activity of foreign transaction resolvers, including pids, the number + * of tasks and the last resolution time. + */ +Datum +pg_stat_get_fdwxact_resolver(PG_FUNCTION_ARGS) +{ +#define PG_STAT_GET_FDWXACT_RESOLVERS_COLS 3 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + int i; + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not " \ + "allowed in this context"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + tupstore = tuplestore_begin_heap(true, 
false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + for (i = 0; i < max_foreign_xact_resolvers; i++) + { + FdwXactResolver *resolver = &FdwXactRslvCtl->resolvers[i]; + pid_t pid; + Oid dbid; + TimestampTz last_resolved_time; + Datum values[PG_STAT_GET_FDWXACT_RESOLVERS_COLS]; + bool nulls[PG_STAT_GET_FDWXACT_RESOLVERS_COLS]; + + + SpinLockAcquire(&(resolver->mutex)); + if (resolver->pid == InvalidPid) + { + SpinLockRelease(&(resolver->mutex)); + continue; + } + + pid = resolver->pid; + dbid = resolver->dbid; + last_resolved_time = resolver->last_resolved_time; + SpinLockRelease(&(resolver->mutex)); + + memset(nulls, 0, sizeof(nulls)); + /* pid */ + values[0] = Int32GetDatum(pid); + + /* dbid */ + values[1] = ObjectIdGetDatum(dbid); + + /* last_resolved_time */ + if (last_resolved_time == 0) + nulls[2] = true; + else + values[2] = TimestampTzGetDatum(last_resolved_time); + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + + /* clean up and return the tuplestore */ + tuplestore_donestoring(tupstore); + + return (Datum) 0; +} diff --git a/src/backend/access/fdwxact/fdwxact_resolver.c b/src/backend/access/fdwxact/fdwxact_resolver.c new file mode 100644 index 0000000..0b754da --- /dev/null +++ b/src/backend/access/fdwxact/fdwxact_resolver.c @@ -0,0 +1,331 @@ +/*------------------------------------------------------------------------- + * + * fdwxact_resolver.c + * + * The foreign transaction resolver background worker resolves foreign + * transactions that participate to a distributed transaction. A resolver + * process is started by foreign transaction launcher for every databases. + * + * A resolver process continues to resolve foreign transactions on a database + * It resolves two types of foreign transactions: on-line foreign transaction + * and dangling foreign transaction. 
The on-line foreign transaction is a + * foreign transaction that a concurrent backend process is waiting for + * resolution. The dangling transaction is a foreign transaction that corresponding + * distributed transaction ended up in in-doubt state. A resolver process + * doesn' exit as long as there is at least one unresolved foreign transaction + * on the database even if the timeout has come. + * + * Normal termination is by SIGTERM, which instructs the resolver process + * to exit(0) at the next convenient moment. Emergency termination is by + * SIGQUIT; like any backend. The resolver process also terminate by timeouts + * only if there is no pending foreign transactions on the database waiting + * to be resolved. + * + * Portions Copyright (c) 2018, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/fdwxact/fdwxact_resolver.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include + +#include "access/fdwxact.h" +#include "access/fdwxact_resolver.h" +#include "access/fdwxact_launcher.h" +#include "access/resolver_internal.h" +#include "access/transam.h" +#include "access/xact.h" +#include "commands/dbcommands.h" +#include "funcapi.h" +#include "libpq/libpq.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "storage/ipc.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + +/* GUC parameters */ +int foreign_xact_resolution_retry_interval; +int foreign_xact_resolver_timeout = 60 * 1000; + +//static MemoryContext ResolveContext = NULL; +FdwXactRslvCtlData *FdwXactRslvCtl; + +static void FdwXactRslvLoop(void); +static long FdwXactRslvComputeSleepTime(TimestampTz now); +static void FdwXactRslvCheckTimeout(TimestampTz now); + +static void fdwxact_resolver_sighup(SIGNAL_ARGS); +static void fdwxact_resolver_onexit(int code, Datum arg); +static void 
fdwxact_resolver_detach(void); +static void fdwxact_resolver_attach(int slot); + +/* Flags set by signal handlers */ +static volatile sig_atomic_t got_SIGHUP = false; + +/* Set flag to reload configuration at next convenient time */ +static void +fdwxact_resolver_sighup(SIGNAL_ARGS) +{ + int save_errno = errno; + + got_SIGHUP = true; + + SetLatch(MyLatch); + + errno = save_errno; +} + +/* + * Detach the resolver and cleanup the resolver info. + */ +static void +fdwxact_resolver_detach(void) +{ + /* Block concurrent access */ + LWLockAcquire(FdwXactResolverLock, LW_EXCLUSIVE); + + MyFdwXactResolver->pid = InvalidPid; + MyFdwXactResolver->in_use = false; + MyFdwXactResolver->dbid = InvalidOid; + + LWLockRelease(FdwXactResolverLock); +} + +/* + * Cleanup up foreign transaction resolver info. + */ +static void +fdwxact_resolver_onexit(int code, Datum arg) +{ + fdwxact_resolver_detach(); + FdwXactLauncherWakeupToRetry(); +} + +/* + * Attach to a slot. + */ +static void +fdwxact_resolver_attach(int slot) +{ + /* Block concurrent access */ + LWLockAcquire(FdwXactResolverLock, LW_EXCLUSIVE); + + Assert(slot >= 0 && slot < max_foreign_xact_resolvers); + MyFdwXactResolver = &FdwXactRslvCtl->resolvers[slot]; + + if (!MyFdwXactResolver->in_use) + { + LWLockRelease(FdwXactResolverLock); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("foreign transaction resolver slot %d is empty, cannot attach", + slot))); + } + + MyFdwXactResolver->pid = MyProcPid; + MyFdwXactResolver->latch = &MyProc->procLatch; + TIMESTAMP_NOBEGIN(MyFdwXactResolver->last_resolved_time); + + before_shmem_exit(fdwxact_resolver_onexit, (Datum) 0); + + LWLockRelease(FdwXactResolverLock); +} + +/* Foreign transaction resolver entry point */ +void +FdwXactResolverMain(Datum main_arg) +{ + int slot = DatumGetInt32(main_arg); + + /* Attach to a slot */ + fdwxact_resolver_attach(slot); + + /* Establish signal handlers */ + pqsignal(SIGHUP, fdwxact_resolver_sighup); + pqsignal(SIGTERM, 
die); + BackgroundWorkerUnblockSignals(); + + /* Connect to our database */ + BackgroundWorkerInitializeConnectionByOid(MyFdwXactResolver->dbid, InvalidOid, 0); + + StartTransactionCommand(); + + ereport(LOG, + (errmsg("foreign transaction resolver for database \"%s\" has started", + get_database_name(MyFdwXactResolver->dbid)))); + + CommitTransactionCommand(); + + /* Initialize stats to a sanish value */ + MyFdwXactResolver->last_resolved_time = GetCurrentTimestamp(); + + /* Run the main loop */ + FdwXactRslvLoop(); + + proc_exit(0); +} + +/* + * Fdwxact resolver main loop + */ +static void +FdwXactRslvLoop(void) +{ + TimestampTz last_retry_time = 0; + MemoryContext resolver_ctx; + + resolver_ctx = AllocSetContextCreate(TopMemoryContext, + "Foreign Transaction Resolver", + ALLOCSET_DEFAULT_SIZES); + + /* Enter main loop */ + for (;;) + { + int rc; + TimestampTz now; + long sleep_time; + bool resolved; + + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + MemoryContextSwitchTo(resolver_ctx); + + if (got_SIGHUP) + { + got_SIGHUP = false; + ProcessConfigFile(PGC_SIGHUP); + } + + /* Resolve one distributed transaction */ + StartTransactionCommand(); + resolved = FdwXactResolveDistributedTransaction(MyDatabaseId, true); + CommitTransactionCommand(); + + now = GetCurrentTimestamp(); + + /* Update my state */ + if (resolved) + MyFdwXactResolver->last_resolved_time = now; + + if (TimestampDifferenceExceeds(last_retry_time, now, + foreign_xact_resolution_retry_interval)) + { + StartTransactionCommand(); + resolved = FdwXactResolveDistributedTransaction(MyDatabaseId, false); + CommitTransactionCommand(); + + last_retry_time = GetCurrentTimestamp(); + + /* Update my state */ + if (resolved) + MyFdwXactResolver->last_resolved_time = last_retry_time; + } + + /* Check for fdwxact resolver timeout */ + FdwXactRslvCheckTimeout(now); + + /* + * If we have resolved any distributed transaction we go the next + * without both resolving dangling transaction and sleeping because + * 
there might be other on-line transactions waiting to be resolved. + */ + if (!resolved) + { + /* Resolve dangling transactions as much as possible */ + StartTransactionCommand(); + FdwXactResolveAllDanglingTransactions(MyDatabaseId); + CommitTransactionCommand(); + + sleep_time = FdwXactRslvComputeSleepTime(now); + + MemoryContextResetAndDeleteChildren(resolver_ctx); + MemoryContextSwitchTo(TopMemoryContext); + + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + sleep_time, + WAIT_EVENT_FDW_XACT_RESOLVER_MAIN); + + if (rc & WL_POSTMASTER_DEATH) + proc_exit(1); + } + } +} + +/* + * Check whether there have been foreign transactions by the backend within + * foreign_xact_resolver_timeout and shutdown if not. + */ +static void +FdwXactRslvCheckTimeout(TimestampTz now) +{ + TimestampTz timeout; + + if (foreign_xact_resolver_timeout == 0) + return; + + timeout = TimestampTzPlusMilliseconds(MyFdwXactResolver->last_resolved_time, + foreign_xact_resolver_timeout); + + if (now < timeout) + return; + + /* + * Reached to the timeout. We exit if there is no more both pending on-line + * transactions and dangling transactions. + */ + if (!fdw_xact_exists(InvalidTransactionId, MyDatabaseId, InvalidOid, + InvalidOid)) + { + StartTransactionCommand(); + ereport(LOG, + (errmsg("foreign transaction resolver for database \"%s\" will stop because the timeout", + get_database_name(MyFdwXactResolver->dbid)))); + CommitTransactionCommand(); + + fdwxact_resolver_detach(); + proc_exit(0); + } +} + +/* + * Compute how long we should sleep by the next cycle. 
Return the sleep time + * in milliseconds, -1 means that we reached to the timeout and should exit + */ +static long +FdwXactRslvComputeSleepTime(TimestampTz now) +{ + static TimestampTz wakeuptime = 0; + long sleeptime; + long sec_to_timeout; + int microsec_to_timeout; + + if (now >= wakeuptime) + wakeuptime = TimestampTzPlusMilliseconds(now, + foreign_xact_resolution_retry_interval); + + /* Compute relative time until wakeup. */ + TimestampDifference(now, wakeuptime, + &sec_to_timeout, &microsec_to_timeout); + + sleeptime = sec_to_timeout * 1000 + microsec_to_timeout / 1000; + + return sleeptime; +} + +bool +IsFdwXactResolver(void) +{ + return MyFdwXactResolver != NULL; +} diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index c2db19b..fb63471 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2629,10 +2629,6 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, heap_freetuple(heaptup); } - /* Make note that we've wrote on non-temprary relation */ - if (RelationNeedsWAL(relation)) - MyXactFlags |= XACT_FLAGS_WROTENONTEMPREL; - return HeapTupleGetOid(tup); } @@ -3457,10 +3453,6 @@ l1: if (old_key_tuple != NULL && old_key_copied) heap_freetuple(old_key_tuple); - /* Make note that we've wrote on non-temprary relation */ - if (RelationNeedsWAL(relation)) - MyXactFlags |= XACT_FLAGS_WROTENONTEMPREL; - return HeapTupleMayBeUpdated; } @@ -4411,10 +4403,6 @@ l2: if (old_key_tuple != NULL && old_key_copied) heap_freetuple(old_key_tuple); - /* Make note that we've wrote on non-temprary relation */ - if (RelationNeedsWAL(relation)) - MyXactFlags |= XACT_FLAGS_WROTENONTEMPREL; - bms_free(hot_attrs); bms_free(proj_idx_attrs); bms_free(key_attrs); diff --git a/src/backend/access/rmgrdesc/Makefile b/src/backend/access/rmgrdesc/Makefile index 5514db1..742e825 100644 --- a/src/backend/access/rmgrdesc/Makefile +++ b/src/backend/access/rmgrdesc/Makefile @@ -8,9 +8,9 @@ subdir = src/backend/access/rmgrdesc 
top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = brindesc.o clogdesc.o committsdesc.o dbasedesc.o genericdesc.o \ - gindesc.o gistdesc.o hashdesc.o heapdesc.o logicalmsgdesc.o \ - mxactdesc.o nbtdesc.o relmapdesc.o replorigindesc.o seqdesc.o \ - smgrdesc.o spgdesc.o standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o +OBJS = brindesc.o clogdesc.o committsdesc.o dbasedesc.o fdwxactdesc.o \ + genericdesc.o gindesc.o gistdesc.o hashdesc.o heapdesc.o \ + logicalmsgdesc.o mxactdesc.o nbtdesc.o relmapdesc.o replorigindesc.o \ + seqdesc.o smgrdesc.o spgdesc.o standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/rmgrdesc/fdwxactdesc.c b/src/backend/access/rmgrdesc/fdwxactdesc.c new file mode 100644 index 0000000..7061bba --- /dev/null +++ b/src/backend/access/rmgrdesc/fdwxactdesc.c @@ -0,0 +1,65 @@ +/*------------------------------------------------------------------------- + * + * fdw_xactdesc.c + * PostgreSQL distributed transaction manager for foreign server. + * + * This module describes the WAL records for foreign transaction manager. 
+ * + * Portions Copyright (c) 2018, PostgreSQL Global Development Group + * + * src/backend/access/rmgrdesc/fdw_xactdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/fdwxact_xlog.h" + +void +fdw_xact_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG_FDW_XACT_INSERT) + { + FdwXactOnDiskData *fdw_insert_xlog = (FdwXactOnDiskData *) rec; + + appendStringInfo(buf, "Foreign server oid: %u", fdw_insert_xlog->serverid); + appendStringInfo(buf, " user oid: %u", fdw_insert_xlog->userid); + appendStringInfo(buf, " database id: %u", fdw_insert_xlog->dbid); + appendStringInfo(buf, " local xid: %u", fdw_insert_xlog->local_xid); + /* TODO: This should be really interpreted by each FDW */ + + /* + * TODO: we also need to assess whether we want to add this + * information + */ + appendStringInfo(buf, " foreign transaction info: %s", + fdw_insert_xlog->fdw_xact_id); + } + else + { + xl_fdw_xact_remove *fdw_remove_xlog = (xl_fdw_xact_remove *) rec; + + appendStringInfo(buf, "Foreign server oid: %u", fdw_remove_xlog->serverid); + appendStringInfo(buf, " user oid: %u", fdw_remove_xlog->userid); + appendStringInfo(buf, " database id: %u", fdw_remove_xlog->dbid); + appendStringInfo(buf, " local xid: %u", fdw_remove_xlog->xid); + } + +} + +const char * +fdw_xact_identify(uint8 info) +{ + switch (info & ~XLR_INFO_MASK) + { + case XLOG_FDW_XACT_INSERT: + return "NEW FOREIGN TRANSACTION"; + case XLOG_FDW_XACT_REMOVE: + return "REMOVE FOREIGN TRANSACTION"; + } + /* Keep compiler happy */ + return NULL; +} diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 00741c7..4a9ab3d 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -112,14 +112,16 @@ xlog_desc(StringInfo buf, XLogReaderState *record) 
appendStringInfo(buf, "max_connections=%d max_worker_processes=%d " "max_prepared_xacts=%d max_locks_per_xact=%d " "wal_level=%s wal_log_hints=%s " - "track_commit_timestamp=%s", + "track_commit_timestamp=%s " + "max_prepared_foreign_transactions=%d", xlrec.MaxConnections, xlrec.max_worker_processes, xlrec.max_prepared_xacts, xlrec.max_locks_per_xact, wal_level_str, xlrec.wal_log_hints ? "on" : "off", - xlrec.track_commit_timestamp ? "on" : "off"); + xlrec.track_commit_timestamp ? "on" : "off", + xlrec.max_prepared_foreign_xacts); } else if (info == XLOG_FPW_CHANGE) { diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index 16fbe47..f15c83a 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -12,9 +12,9 @@ subdir = src/backend/access/transam top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = clog.o commit_ts.o generic_xlog.o multixact.o parallel.o rmgr.o slru.o \ - subtrans.o timeline.o transam.o twophase.o twophase_rmgr.o varsup.o \ - xact.o xlog.o xlogarchive.o xlogfuncs.o \ +OBJS = clog.o commit_ts.o generic_xlog.o multixact.o \ + parallel.o rmgr.o slru.o subtrans.o timeline.o transam.o twophase.o \ + twophase_rmgr.o varsup.o xact.o xlog.o xlogarchive.o xlogfuncs.o \ xloginsert.o xlogreader.o xlogutils.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 9368b56..8b360b1 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -9,6 +9,7 @@ #include "access/clog.h" #include "access/commit_ts.h" +#include "access/fdwxact.h" #include "access/ginxlog.h" #include "access/gistxlog.h" #include "access/generic_xlog.h" diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 3942734..bc4e109 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -77,6 +77,7 @@ #include 
#include "access/commit_ts.h" +#include "access/fdwxact.h" #include "access/htup_details.h" #include "access/subtrans.h" #include "access/transam.h" @@ -844,6 +845,35 @@ TwoPhaseGetGXact(TransactionId xid) } /* + * TwoPhaseExists + * Return true if there is a prepared transaction specified by XID + */ +bool +TwoPhaseExists(TransactionId xid) +{ + int i; + bool found = false; + + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; + + if (pgxact->xid == xid) + { + found = true; + break; + } + } + + LWLockRelease(TwoPhaseStateLock); + + return found; +} + +/* * TwoPhaseGetDummyProc * Get the dummy backend ID for prepared transaction specified by XID * @@ -2316,6 +2346,12 @@ RecordTransactionCommitPrepared(TransactionId xid, * in the procarray and continue to hold locks. */ SyncRepWaitForLSN(recptr, true); + + /* + * Wait for foreign transaction prepared as part of this prepared + * transaction to be committed. + */ + FdwXactWaitToBeResolved(xid, true); } /* @@ -2375,6 +2411,12 @@ RecordTransactionAbortPrepared(TransactionId xid, * in the procarray and continue to hold locks. */ SyncRepWaitForLSN(recptr, false); + + /* + * Wait for foreign transaction prepared as part of this prepared + * transaction to be rolled back.
+ */ + FdwXactWaitToBeResolved(xid, false); } /* diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index d967400..1d06e0a 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -21,6 +21,7 @@ #include #include "access/commit_ts.h" +#include "access/fdwxact.h" #include "access/multixact.h" #include "access/parallel.h" #include "access/subtrans.h" @@ -1131,6 +1132,7 @@ RecordTransactionCommit(void) SharedInvalidationMessage *invalMessages = NULL; bool RelcacheInitFileInval = false; bool wrote_xlog; + bool need_commit_globally; /* Get data needed for commit record */ nrels = smgrGetPendingDeletes(true, &rels); @@ -1139,6 +1141,7 @@ RecordTransactionCommit(void) nmsgs = xactGetCommittedInvalidationMessages(&invalMessages, &RelcacheInitFileInval); wrote_xlog = (XactLastRecEnd != 0); + need_commit_globally = FdwXactIsAtomicCommitReady(); /* * If we haven't been assigned an XID yet, we neither can, nor do we want @@ -1177,12 +1180,13 @@ RecordTransactionCommit(void) } /* - * If we didn't create XLOG entries, we're done here; otherwise we - * should trigger flushing those entries the same as a commit record + * If we didn't create XLOG entries and the transaction does not need + * to be committed using two-phase commit. we're done here; otherwise + * we should trigger flushing those entries the same as a commit record * would. This will primarily happen for HOT pruning and the like; we * want these to be flushed to disk in due time. */ - if (!wrote_xlog) + if (!wrote_xlog && !need_commit_globally) goto cleanup; } else @@ -1340,6 +1344,14 @@ RecordTransactionCommit(void) if (wrote_xlog && markXidCommitted) SyncRepWaitForLSN(XactLastRecEnd, true); + /* + * Wait for prepared foreign transaction to be resolved, if required. + * We only want to wait if we prepared foreign transaction in this + * transaction. 
+ */ + if (need_commit_globally && markXidCommitted) + FdwXactWaitToBeResolved(xid, true); + /* remember end of last commit record */ XactLastCommitEnd = XactLastRecEnd; @@ -1994,6 +2006,9 @@ CommitTransaction(void) break; } + /* Pre-commit step for foreign transactions */ + PreCommit_FdwXacts(); + CallXactCallbacks(is_parallel_worker ? XACT_EVENT_PARALLEL_PRE_COMMIT : XACT_EVENT_PRE_COMMIT); @@ -2150,6 +2165,7 @@ CommitTransaction(void) AtEOXact_PgStat(true); AtEOXact_Snapshot(true, false); AtEOXact_ApplyLauncher(true); + AtEOXact_FdwXacts(true); pgstat_report_xact_timestamp(0); CurrentResourceOwner = NULL; @@ -2237,6 +2253,8 @@ PrepareTransaction(void) * the transaction-abort path. */ + AtPrepare_FdwXacts(); + /* Shut down the deferred-trigger manager */ AfterTriggerEndXact(true); @@ -2426,6 +2444,7 @@ PrepareTransaction(void) AtEOXact_Files(true); AtEOXact_ComboCid(); AtEOXact_HashTables(true); + AtEOXact_FdwXacts(true); /* don't call AtEOXact_PgStat here; we fixed pgstat state above */ AtEOXact_Snapshot(true, true); pgstat_report_xact_timestamp(0); @@ -2631,6 +2650,7 @@ AbortTransaction(void) AtEOXact_HashTables(false); AtEOXact_PgStat(false); AtEOXact_ApplyLauncher(false); + AtEOXact_FdwXacts(false); pgstat_report_xact_timestamp(0); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 7eed586..cce4fd4 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -24,6 +24,7 @@ #include "access/clog.h" #include "access/commit_ts.h" +#include "access/fdwxact.h" #include "access/multixact.h" #include "access/rewriteheap.h" #include "access/subtrans.h" @@ -5250,6 +5251,7 @@ BootStrapXLOG(void) ControlFile->MaxConnections = MaxConnections; ControlFile->max_worker_processes = max_worker_processes; ControlFile->max_prepared_xacts = max_prepared_xacts; + ControlFile->max_prepared_foreign_xacts = max_prepared_foreign_xacts; ControlFile->max_locks_per_xact = max_locks_per_xact; ControlFile->wal_level = 
wal_level; ControlFile->wal_log_hints = wal_log_hints; @@ -6337,6 +6339,9 @@ CheckRequiredParameterValues(void) RecoveryRequiresIntParameter("max_prepared_transactions", max_prepared_xacts, ControlFile->max_prepared_xacts); + RecoveryRequiresIntParameter("max_prepared_foreign_transactions", + max_prepared_foreign_xacts, + ControlFile->max_prepared_foreign_xacts); RecoveryRequiresIntParameter("max_locks_per_transaction", max_locks_per_xact, ControlFile->max_locks_per_xact); @@ -6861,14 +6866,15 @@ StartupXLOG(void) restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI); /* - * Before running in recovery, scan pg_twophase and fill in its status to - * be able to work on entries generated by redo. Doing a scan before - * taking any recovery action has the merit to discard any 2PC files that - * are newer than the first record to replay, saving from any conflicts at - * replay. This avoids as well any subsequent scans when doing recovery - * of the on-disk two-phase data. + * Before running in recovery, scan pg_twophase and pg_fdw_xact, and then + * fill in its status to be able to work on entries generated by redo. + * Doing a scan before taking any recovery action has the merit to discard + * any state files that are newer than the first record to replay, saving + * from any conflicts at replay. This avoids as well any subsequent scans + * when doing recovery of the on-disk two-phase or fdwxact data. */ restoreTwoPhaseData(); + restoreFdwXactData(); lastFullPageWrites = checkPoint.fullPageWrites; @@ -7060,7 +7066,10 @@ StartupXLOG(void) InitRecoveryTransactionEnvironment(); if (wasShutdown) + { oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids); + oldestActiveXID = PrescanFdwXacts(oldestActiveXID); + } else oldestActiveXID = checkPoint.oldestActiveXid; Assert(TransactionIdIsValid(oldestActiveXID)); @@ -7566,6 +7575,7 @@ StartupXLOG(void) * as potential problems are detected before any on-disk change is done. 
*/ oldestActiveXID = PrescanPreparedTransactions(NULL, NULL); + oldestActiveXID = PrescanFdwXacts(oldestActiveXID); /* * Consider whether we need to assign a new timeline ID. @@ -7884,6 +7894,9 @@ StartupXLOG(void) /* Reload shared-memory state for prepared transactions */ RecoverPreparedTransactions(); + /* Load all foreign transaction entries from disk to memory */ + RecoverFdwXacts(); + /* * Shutdown the recovery environment. This must occur after * RecoverPreparedTransactions(), see notes for lock_twophase_recover() @@ -9200,6 +9213,7 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointReplicationOrigin(); /* We deliberately delay 2PC checkpointing as long as possible */ CheckPointTwoPhase(checkPointRedo); + CheckPointFdwXacts(checkPointRedo); } /* @@ -9633,7 +9647,8 @@ XLogReportParameters(void) max_worker_processes != ControlFile->max_worker_processes || max_prepared_xacts != ControlFile->max_prepared_xacts || max_locks_per_xact != ControlFile->max_locks_per_xact || - track_commit_timestamp != ControlFile->track_commit_timestamp) + track_commit_timestamp != ControlFile->track_commit_timestamp || + max_prepared_foreign_xacts != ControlFile->max_prepared_foreign_xacts) { /* * The change in number of backend slots doesn't need to be WAL-logged @@ -9665,6 +9680,7 @@ XLogReportParameters(void) ControlFile->MaxConnections = MaxConnections; ControlFile->max_worker_processes = max_worker_processes; ControlFile->max_prepared_xacts = max_prepared_xacts; + ControlFile->max_prepared_foreign_xacts = max_prepared_foreign_xacts; ControlFile->max_locks_per_xact = max_locks_per_xact; ControlFile->wal_level = wal_level; ControlFile->wal_log_hints = wal_log_hints; @@ -9870,6 +9886,7 @@ xlog_redo(XLogReaderState *record) RunningTransactionsData running; oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids); + oldestActiveXID = PrescanFdwXacts(oldestActiveXID); /* * Construct a RunningTransactions snapshot representing a shut @@ -10068,6 +10085,7 @@ 
xlog_redo(XLogReaderState *record) ControlFile->MaxConnections = xlrec.MaxConnections; ControlFile->max_worker_processes = xlrec.max_worker_processes; ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts; + ControlFile->max_prepared_foreign_xacts = xlrec.max_prepared_foreign_xacts; ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact; ControlFile->wal_level = xlrec.wal_level; ControlFile->wal_log_hints = xlrec.wal_log_hints; diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 715995d..1b9cdbb 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -291,6 +291,9 @@ CREATE VIEW pg_prepared_xacts AS CREATE VIEW pg_prepared_statements AS SELECT * FROM pg_prepared_statement() AS P; +CREATE VIEW pg_prepared_fdw_xacts AS + SELECT * FROM pg_prepared_fdw_xacts() AS F; + CREATE VIEW pg_seclabels AS SELECT l.objoid, l.classoid, l.objsubid, @@ -773,6 +776,14 @@ CREATE VIEW pg_stat_subscription AS LEFT JOIN pg_stat_get_subscription(NULL) st ON (st.subid = su.oid); +CREATE VIEW pg_stat_fdwxact_resolvers AS + SELECT + r.pid, + r.dbid, + r.last_resolved_time + FROM pg_stat_get_fdwxact_resolver() r + WHERE r.pid IS NOT NULL; + CREATE VIEW pg_stat_ssl AS SELECT S.pid, diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index b58a74f..4d4c339 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2504,9 +2504,16 @@ CopyFrom(CopyState cstate) if (resultRelInfo->ri_FdwRoutine != NULL && resultRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL) + { + /* Remember the transaction modifies data on a foreign server*/ + RegisterFdwXactByRelId(RelationGetRelid(resultRelInfo->ri_RelationDesc), + true); + resultRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, resultRelInfo); + } + /* Prepare to catch AFTER triggers. 
*/ AfterTriggerBeginQuery(); diff --git a/src/backend/commands/foreigncmds.c b/src/backend/commands/foreigncmds.c index e5dd995..6056feb 100644 --- a/src/backend/commands/foreigncmds.c +++ b/src/backend/commands/foreigncmds.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/fdwxact.h" #include "access/heapam.h" #include "access/htup_details.h" #include "access/reloptions.h" @@ -1093,6 +1094,18 @@ RemoveForeignServerById(Oid srvId) if (!HeapTupleIsValid(tp)) elog(ERROR, "cache lookup failed for foreign server %u", srvId); + /* + * If there is a foreign prepared transaction with this foreign server, + * dropping it might result in dangling prepared transaction. + */ + if (fdw_xact_exists(InvalidTransactionId, MyDatabaseId, srvId, InvalidOid)) + { + Form_pg_foreign_server srvForm = (Form_pg_foreign_server) GETSTRUCT(tp); + ereport(WARNING, + (errmsg("server \"%s\" has unresolved prepared transactions on it", + NameStr(srvForm->srvname)))); + } + CatalogTupleDelete(rel, &tp->t_self); ReleaseSysCache(tp); @@ -1407,6 +1420,16 @@ RemoveUserMapping(DropUserMappingStmt *stmt) user_mapping_ddl_aclcheck(useId, srv->serverid, srv->servername); /* + * If there is a foreign prepared transaction with this user mapping, + * dropping it might result in dangling prepared transaction. + */ + if (fdw_xact_exists(InvalidTransactionId, MyDatabaseId, srv->serverid, + useId)) + ereport(WARNING, + (errmsg("server \"%s\" has unresolved prepared transaction for user \"%s\"", + srv->servername, MappingUserName(useId)))); + + /* * Do the deletion */ object.classId = UserMappingRelationId; @@ -1559,6 +1582,13 @@ ImportForeignSchema(ImportForeignSchemaStmt *stmt) errmsg("foreign-data wrapper \"%s\" does not support IMPORT FOREIGN SCHEMA", fdw->fdwname))); + /* + * Remember the transaction accesses to a foreign server. Normally during + * ImportForeignSchema we don't modify data on foreign servers, so remember it + * as not-modified server. 
+ */ + RegisterFdwXactByServerId(server->serverid, false); + /* Call FDW to get a list of commands */ cmd_list = fdw_routine->ImportForeignSchema(stmt, server->serverid); diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 1e72e9f..2fee05d 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/fdwxact.h" #include "catalog/partition.h" #include "catalog/pg_inherits.h" #include "catalog/pg_type.h" @@ -749,7 +750,14 @@ ExecInitRoutingInfo(ModifyTableState *mtstate, */ if (partRelInfo->ri_FdwRoutine != NULL && partRelInfo->ri_FdwRoutine->BeginForeignInsert != NULL) + { + Relation child = partRelInfo->ri_RelationDesc; + + /* Remember the transaction modifies data on a foreign server*/ + RegisterFdwXactByRelId(RelationGetRelid(child), true); + partRelInfo->ri_FdwRoutine->BeginForeignInsert(mtstate, partRelInfo); + } MemoryContextSwitchTo(oldContext); diff --git a/src/backend/executor/nodeForeignscan.c b/src/backend/executor/nodeForeignscan.c index 5d2cd0e..71c9916 100644 --- a/src/backend/executor/nodeForeignscan.c +++ b/src/backend/executor/nodeForeignscan.c @@ -22,6 +22,8 @@ */ #include "postgres.h" +#include "access/fdwxact.h" +#include "access/xact.h" #include "executor/executor.h" #include "executor/nodeForeignscan.h" #include "foreign/fdwapi.h" @@ -224,10 +226,33 @@ ExecInitForeignScan(ForeignScan *node, EState *estate, int eflags) * Tell the FDW to initialize the scan. */ if (node->operation != CMD_SELECT) + { + RangeTblEntry *rte; + + rte = exec_rt_fetch(estate->es_result_relation_info->ri_RangeTableIndex, + estate); + + /* Remember the transaction modifies data on a foreign server*/ + RegisterFdwXactByRelId(rte->relid, true); + fdwroutine->BeginDirectModify(scanstate, eflags); + } else + { + RangeTblEntry *rte; + int rtindex = (scanrelid > 0) ? 
+ scanrelid : + bms_next_member(node->fs_relids, -1); + + rte = exec_rt_fetch(rtindex, estate); + + /* Remember the transaction accesses to a foreign server */ + RegisterFdwXactByRelId(rte->relid, false); + fdwroutine->BeginForeignScan(scanstate, eflags); + } + return scanstate; } diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index e2836b7..2557568 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -37,6 +37,7 @@ #include "postgres.h" +#include "access/fdwxact.h" #include "access/htup_details.h" #include "access/xact.h" #include "commands/trigger.h" @@ -44,6 +45,7 @@ #include "executor/executor.h" #include "executor/nodeModifyTable.h" #include "foreign/fdwapi.h" +#include "foreign/foreign.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" #include "storage/bufmgr.h" @@ -485,6 +487,10 @@ ExecInsert(ModifyTableState *mtstate, HEAP_INSERT_SPECULATIVE, NULL); + /* Make note that we've wrote on non-temprary relation */ + if (RelationNeedsWAL(resultRelationDesc)) + MyXactFlags |= XACT_FLAGS_WROTENONTEMPREL; + /* insert index entries for tuple */ recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), estate, true, &specConflict, @@ -530,6 +536,10 @@ ExecInsert(ModifyTableState *mtstate, estate->es_output_cid, 0, NULL); + /* Make note that we've wrote on non-temprary relation */ + if (RelationNeedsWAL(resultRelationDesc)) + MyXactFlags |= XACT_FLAGS_WROTENONTEMPREL; + /* insert index entries for tuple */ if (resultRelInfo->ri_NumIndices > 0) recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), @@ -722,6 +732,11 @@ ldelete:; true /* wait for commit */ , &hufd, changingPart); + + /* Make note that we've wrote on non-temprary relation */ + if (RelationNeedsWAL(resultRelationDesc)) + MyXactFlags |= XACT_FLAGS_WROTENONTEMPREL; + switch (result) { case HeapTupleSelfUpdated: @@ -1210,6 +1225,11 @@ lreplace:; estate->es_crosscheck_snapshot, true /* wait for commit 
*/ , &hufd, &lockmode); + + /* Make note that we've wrote on non-temprary relation */ + if (RelationNeedsWAL(resultRelationDesc)) + MyXactFlags |= XACT_FLAGS_WROTENONTEMPREL; + switch (result) { case HeapTupleSelfUpdated: @@ -2315,6 +2335,10 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo->ri_FdwRoutine->BeginForeignModify != NULL) { List *fdw_private = (List *) list_nth(node->fdwPrivLists, i); + Oid relid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + + /* Remember the transaction modifies data on a foreign server*/ + RegisterFdwXactByRelId(relid, true); resultRelInfo->ri_FdwRoutine->BeginForeignModify(mtstate, resultRelInfo, diff --git a/src/backend/foreign/foreign.c b/src/backend/foreign/foreign.c index a0bcc04..b2097ad 100644 --- a/src/backend/foreign/foreign.c +++ b/src/backend/foreign/foreign.c @@ -155,6 +155,49 @@ GetForeignServerByName(const char *srvname, bool missing_ok) return GetForeignServer(serverid); } +/* + * GetUserMapping - look up the user mapping by user mapping oid. + * + * If userid of the mapping is invalid, we set it to current userid. + */ +UserMapping * +GetUserMappingByOid(Oid umid) +{ + Datum datum; + HeapTuple tp; + UserMapping *um; + bool isnull; + Form_pg_user_mapping tableform; + + tp = SearchSysCache1(USERMAPPINGOID, + ObjectIdGetDatum(umid)); + + if (!HeapTupleIsValid(tp)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("user mapping not found for %d", umid))); + + tableform = (Form_pg_user_mapping) GETSTRUCT(tp); + um = (UserMapping *) palloc(sizeof(UserMapping)); + um->umid = umid; + um->userid = OidIsValid(tableform->umuser) ? 
+ tableform->umuser : GetUserId(); + um->serverid = tableform->umserver; + + /* Extract the umoptions */ + datum = SysCacheGetAttr(USERMAPPINGUSERSERVER, + tp, + Anum_pg_user_mapping_umoptions, + &isnull); + if (isnull) + um->options = NIL; + else + um->options = untransformRelOptions(datum); + + ReleaseSysCache(tp); + + return um; +} /* * GetUserMapping - look up the user mapping. diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index d2b695e..b722b9a 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -15,6 +15,8 @@ #include #include "libpq/pqsignal.h" +#include "access/fdwxact_launcher.h" +#include "access/fdwxact_resolver.h" #include "access/parallel.h" #include "miscadmin.h" #include "pgstat.h" @@ -129,6 +131,12 @@ static const struct }, { "ApplyWorkerMain", ApplyWorkerMain + }, + { + "FdwXactResolverMain", FdwXactResolverMain + }, + { + "FdwXactLauncherMain", FdwXactLauncherMain } }; diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 42bccce..5116369 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3484,6 +3484,12 @@ pgstat_get_wait_activity(WaitEventActivity w) case WAIT_EVENT_CHECKPOINTER_MAIN: event_name = "CheckpointerMain"; break; + case WAIT_EVENT_FDW_XACT_RESOLVER_MAIN: + event_name = "FdwXactResolverMain"; + break; + case WAIT_EVENT_FDW_XACT_LAUNCHER_MAIN: + event_name = "FdwXactLauncherMain"; + break; case WAIT_EVENT_LOGICAL_APPLY_MAIN: event_name = "LogicalApplyMain"; break; @@ -3678,6 +3684,9 @@ pgstat_get_wait_ipc(WaitEventIPC w) case WAIT_EVENT_SYNC_REP: event_name = "SyncRep"; break; + case WAIT_EVENT_FDW_XACT_RESOLUTION: + event_name = "FdwXactResolution"; + break; /* no default case, so that compiler will warn */ } @@ -3893,6 +3902,15 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_TWOPHASE_FILE_WRITE: event_name = "TwophaseFileWrite"; break; + case WAIT_EVENT_FDW_XACT_FILE_WRITE: + event_name = 
"FdwXactFileWrite"; + break; + case WAIT_EVENT_FDW_XACT_FILE_READ: + event_name = "FdwXactFileRead"; + break; + case WAIT_EVENT_FDW_XACT_FILE_SYNC: + event_name = "FdwXactFileSync"; + break; case WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ: event_name = "WALSenderTimelineHistoryRead"; break; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index cb49f32..d9faef0 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -93,6 +93,8 @@ #include #endif +#include "access/fdwxact_resolver.h" +#include "access/fdwxact_launcher.h" #include "access/transam.h" #include "access/xlog.h" #include "bootstrap/bootstrap.h" @@ -896,6 +898,10 @@ PostmasterMain(int argc, char *argv[]) ereport(ERROR, (errmsg("WAL streaming (max_wal_senders > 0) requires wal_level \"replica\" or \"logical\""))); + if (max_prepared_foreign_xacts > 0 && max_foreign_xact_resolvers == 0) + ereport(ERROR, + (errmsg("preparing foreign transactions (max_prepared_foreign_transactions > 0) requires max_foreign_transaction_resolvers > 0"))); + /* * Other one-time internal sanity checks can go here, if they are fast. * (Put any slow processing further down, after postmaster.pid creation.) @@ -971,12 +977,13 @@ PostmasterMain(int argc, char *argv[]) #endif /* - * Register the apply launcher. Since it registers a background worker, - * it needs to be called before InitializeMaxBackends(), and it's probably - * a good idea to call it before any modules had chance to take the - * background worker slots. + * Register the apply launcher and foreign transaction launcher. Since + * it registers a background worker, it needs to be called before + * InitializeMaxBackends(), and it's probably a good idea to call it + * before any modules had chance to take the background worker slots. 
*/ ApplyLauncherRegister(); + FdwXactLauncherRegister(); /* * process any libraries that should be preloaded at postmaster start diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index afb4972..960fd6a 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -154,6 +154,7 @@ LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *recor case RM_COMMIT_TS_ID: case RM_REPLORIGIN_ID: case RM_GENERIC_ID: + case RM_FDW_XACT_ID: /* just deal with xid, and done */ ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(record), buf.origptr); diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 0c86a58..c5610ee 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -16,6 +16,8 @@ #include "access/clog.h" #include "access/commit_ts.h" +#include "access/fdwxact.h" +#include "access/fdwxact_launcher.h" #include "access/heapam.h" #include "access/multixact.h" #include "access/nbtree.h" @@ -150,6 +152,8 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); size = add_size(size, BackendRandomShmemSize()); + size = add_size(size, FdwXactShmemSize()); + size = add_size(size, FdwXactRslvShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif @@ -270,6 +274,8 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) SyncScanShmemInit(); AsyncShmemInit(); BackendRandomShmemInit(); + FdwXactShmemInit(); + FdwXactRslvShmemInit(); #ifdef EXEC_BACKEND diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index dc7e875..48bb87a 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -91,6 +91,8 @@ typedef struct ProcArrayStruct TransactionId replication_slot_xmin; /* oldest catalog xmin of any replication slot */ 
TransactionId replication_slot_catalog_xmin; + /* local transaction id of oldest unresolved distributed transaction */ + TransactionId fdwxact_unresolved_xmin; /* indexes into allPgXact[], has PROCARRAY_MAXPROCS entries */ int pgprocnos[FLEXIBLE_ARRAY_MEMBER]; @@ -246,6 +248,7 @@ CreateSharedProcArray(void) procArray->lastOverflowedXid = InvalidTransactionId; procArray->replication_slot_xmin = InvalidTransactionId; procArray->replication_slot_catalog_xmin = InvalidTransactionId; + procArray->fdwxact_unresolved_xmin = InvalidTransactionId; } allProcs = ProcGlobal->allProcs; @@ -1327,6 +1330,7 @@ GetOldestXmin(Relation rel, int flags) TransactionId replication_slot_xmin = InvalidTransactionId; TransactionId replication_slot_catalog_xmin = InvalidTransactionId; + TransactionId fdwxact_unresolved_xmin = InvalidTransactionId; /* * If we're not computing a relation specific limit, or if a shared @@ -1392,6 +1396,7 @@ GetOldestXmin(Relation rel, int flags) */ replication_slot_xmin = procArray->replication_slot_xmin; replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; + fdwxact_unresolved_xmin = procArray->fdwxact_unresolved_xmin; if (RecoveryInProgress()) { @@ -1442,6 +1447,15 @@ GetOldestXmin(Relation rel, int flags) result = replication_slot_xmin; /* + * Check whether there are unresolved distributed transaction + * requiring an older xmin. + */ + if (!(flags & PROCARRAY_FDW_XACT_XMIN) && + TransactionIdIsValid(fdwxact_unresolved_xmin) && + NormalTransactionIdPrecedes(fdwxact_unresolved_xmin, result)) + result = fdwxact_unresolved_xmin; + + /* * After locks have been released and defer_cleanup_age has been applied, * check whether we need to back up further to make logical decoding * possible. 
We need to do so if we're computing the global limit (rel = @@ -3030,6 +3044,38 @@ ProcArrayGetReplicationSlotXmin(TransactionId *xmin, LWLockRelease(ProcArrayLock); } +/* + * ProcArraySetFdwXactUnresolvedXmin + * + * Install limits to future computations of the xmin horizon to prevent + * vacuum clog from affected transactions still needed by resolving + * distributed transaction. + */ +void +ProcArraySetFdwXactUnresolvedXmin(TransactionId xmin) +{ + + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + procArray->fdwxact_unresolved_xmin = xmin; + LWLockRelease(ProcArrayLock); +} + +/* + * ProcArrayGetFdwXactUnresolvedXmin + * + * Return the current unresolved xmin limits. + */ +TransactionId +ProcArrayGetFdwXactUnresolvedXmin(void) +{ + TransactionId xmin; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + xmin = procArray->fdwxact_unresolved_xmin; + LWLockRelease(ProcArrayLock); + + return xmin; +} #define XidCacheRemove(i) \ do { \ diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index e6025ec..a42d06e 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -50,3 +50,5 @@ OldSnapshotTimeMapLock 42 BackendRandomLock 43 LogicalRepWorkerLock 44 CLogTruncationLock 45 +FdwXactLock 46 +FdwXactResolverLock 47 diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 6ad5044..6e7b3b8 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -35,6 +35,7 @@ #include #include +#include "access/fdwxact.h" #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" @@ -405,6 +406,10 @@ InitProcess(void) MyProc->syncRepState = SYNC_REP_NOT_WAITING; SHMQueueElemInit(&(MyProc->syncRepLinks)); + /* initialize fields for fdw xact */ + MyProc->fdwXactState = FDW_XACT_NOT_WAITING; + SHMQueueElemInit(&(MyProc->fdwXactLinks)); + /* Initialize fields for group XID clearing.
*/ MyProc->procArrayGroupMember = false; MyProc->procArrayGroupMemberXid = InvalidTransactionId; @@ -806,6 +811,9 @@ ProcKill(int code, Datum arg) /* Make sure we're out of the sync rep lists */ SyncRepCleanupAtProcExit(); + /* Make sure we're out of the fdwxact lists */ + FdwXactCleanupAtProcExit(); + #ifdef USE_ASSERT_CHECKING { int i; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index a3b9757..48f3c59 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -36,6 +36,8 @@ #include "rusagestub.h" #endif +#include "access/fdwxact_resolver.h" +#include "access/fdwxact_launcher.h" #include "access/parallel.h" #include "access/printtup.h" #include "access/xact.h" @@ -2994,6 +2996,18 @@ ProcessInterrupts(void) */ proc_exit(1); } + else if (IsFdwXactResolver()) + ereport(FATAL, + (errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("terminating foreign transaction resolver due to administrator command"))); + else if (IsFdwXactLauncher()) + { + /* + * The foreign transaction launcher can be stopped at any time. + * Use exit status 1 so the background worker is restarted. + */ + proc_exit(1); + } else if (RecoveryConflictPending && RecoveryConflictRetryable) { pgstat_report_recovery_conflict(RecoveryConflictReason); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 0327b29..ffdc166 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -27,6 +27,7 @@ #endif #include "access/commit_ts.h" +#include "access/fdwxact.h" #include "access/gin.h" #include "access/rmgr.h" #include "access/transam.h" @@ -377,6 +378,25 @@ static const struct config_enum_entry synchronous_commit_options[] = { }; /* + * Although only "required", "prefer", and "disabled" are documented, + * we accept all the likely variants of "on" and "off". 
+ */ +static const struct config_enum_entry distributed_atomic_commit_options[] = { + {"required", DISTRIBUTED_ATOMIC_COMMIT_REQUIRED, false}, + {"prefer", DISTRIBUTED_ATOMIC_COMMIT_PREFER, false}, + {"disabled", DISTRIBUTED_ATOMIC_COMMIT_DISABLED, false}, + {"on", DISTRIBUTED_ATOMIC_COMMIT_REQUIRED, false}, + {"off", DISTRIBUTED_ATOMIC_COMMIT_DISABLED, false}, + {"true", DISTRIBUTED_ATOMIC_COMMIT_REQUIRED, true}, + {"false", DISTRIBUTED_ATOMIC_COMMIT_DISABLED, true}, + {"yes", DISTRIBUTED_ATOMIC_COMMIT_REQUIRED, true}, + {"no", DISTRIBUTED_ATOMIC_COMMIT_DISABLED, true}, + {"1", DISTRIBUTED_ATOMIC_COMMIT_REQUIRED, true}, + {"0", DISTRIBUTED_ATOMIC_COMMIT_DISABLED, true}, + {NULL, 0, false} +}; + +/* * Although only "on", "off", "try" are documented, we accept all the likely * variants of "on" and "off". */ @@ -658,6 +678,10 @@ const char *const config_group_names[] = gettext_noop("Client Connection Defaults / Other Defaults"), /* LOCK_MANAGEMENT */ gettext_noop("Lock Management"), + /* FDWXACT */ + gettext_noop("Foreign Transaction Management"), + /* FDWXACT_SETTINGS */ + gettext_noop("Foreign Transaction Management / Settings"), /* COMPAT_OPTIONS */ gettext_noop("Version and Platform Compatibility"), /* COMPAT_OPTIONS_PREVIOUS */ @@ -2234,6 +2258,52 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + /* + * See also CheckRequiredParameterValues() if this parameter changes + */ + { + {"max_prepared_foreign_transactions", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("Sets the maximum number of simultaneously prepared transactions on foreign servers."), + NULL + }, + &max_prepared_foreign_xacts, + 0, 0, INT_MAX, + NULL, NULL, NULL + }, + + { + {"foreign_transaction_resolver_timeout", PGC_SIGHUP, RESOURCES_ASYNCHRONOUS, + gettext_noop("Sets the maximum time to wait for foreign transaction resolution."), + NULL, + GUC_UNIT_MS + }, + &foreign_xact_resolver_timeout, + 60 * 1000, 0, INT_MAX, + NULL, NULL, NULL + }, + + { + 
{"max_foreign_transaction_resolvers", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("Maximum number of foreign transaction resolution processes."), + NULL + }, + &max_foreign_xact_resolvers, + 0, 0, INT_MAX, + NULL, NULL, NULL + }, + + { + {"foreign_transaction_resolution_retry_interval", PGC_SIGHUP, RESOURCES_ASYNCHRONOUS, + gettext_noop("Sets the time to wait before retrying to resolve foreign transaction " + "after a failed attempt."), + NULL, + GUC_UNIT_MS + }, + &foreign_xact_resolution_retry_interval, + 5000, 1, INT_MAX, + NULL, NULL, NULL + }, + #ifdef LOCK_DEBUG { {"trace_lock_oidmin", PGC_SUSET, DEVELOPER_OPTIONS, @@ -4054,6 +4124,16 @@ static struct config_enum ConfigureNamesEnum[] = }, { + {"distributed_atomic_commit", PGC_USERSET, FDWXACT_SETTINGS, + gettext_noop("Use of distributed atomic commit for the current transaction."), + NULL + }, + &distributed_atomic_commit, + DISTRIBUTED_ATOMIC_COMMIT_DISABLED, distributed_atomic_commit_options, + check_distributed_atomic_commit, NULL, NULL + }, + + { {"archive_mode", PGC_POSTMASTER, WAL_ARCHIVING, gettext_noop("Allows archiving of WAL files using archive_command."), NULL diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 3fe257c..88387ed 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -121,6 +121,8 @@ #temp_buffers = 8MB # min 800kB #max_prepared_transactions = 0 # zero disables the feature # (change requires restart) +#max_prepared_foreign_transactions = 0 # zero disables the feature + # (change requires restart) # Caution: it is not advisable to set max_prepared_transactions nonzero unless # you actively intend to use prepared transactions. 
#work_mem = 4MB # min 64kB @@ -287,6 +289,20 @@ #------------------------------------------------------------------------------ +# FOREIGN TRANSACTION +#------------------------------------------------------------------------------ + +#distributed_atomic_commit = disabled # required, prefer, or disabled + +#max_foreign_transaction_resolvers = 0 # max number of resolver processes + # (change requires restart) +#foreign_transaction_resolver_timeout = 60s # in milliseconds; 0 disables +#foreign_transaction_resolution_retry_interval = 5s # time to wait before + # retrying to resolve + # foreign transactions + # after a failed attempt + +#------------------------------------------------------------------------------ # QUERY TUNING #------------------------------------------------------------------------------ diff --git a/src/backend/utils/probes.d b/src/backend/utils/probes.d index ad06e8e..ca3eb62 100644 --- a/src/backend/utils/probes.d +++ b/src/backend/utils/probes.d @@ -81,6 +81,8 @@ provider postgresql { probe multixact__checkpoint__done(bool); probe twophase__checkpoint__start(); probe twophase__checkpoint__done(); + probe fdwxact__checkpoint__start(); + probe fdwxact__checkpoint__done(); probe smgr__md__read__start(ForkNumber, BlockNumber, Oid, Oid, Oid, int); probe smgr__md__read__done(ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int); diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index ab5cb7f..609578c 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -209,6 +209,7 @@ static const char *const subdirs[] = { "pg_snapshots", "pg_subtrans", "pg_twophase", + "pg_fdw_xact", "pg_multixact", "pg_multixact/members", "pg_multixact/offsets", diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 895a51f..7df88e0 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -306,6 +306,8 @@ main(int argc, char *argv[]) 
ControlFile->max_worker_processes); printf(_("max_prepared_xacts 
setting: %d\n"), ControlFile->max_prepared_xacts); + printf(_("max_prepared_foreign_transactions setting: %d\n"), + ControlFile->max_prepared_foreign_xacts); printf(_("max_locks_per_xact setting: %d\n"), ControlFile->max_locks_per_xact); printf(_("track_commit_timestamp setting: %s\n"), diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index 6fb403a..6d867c8 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -730,6 +730,7 @@ GuessControlValues(void) ControlFile.MaxConnections = 100; ControlFile.max_worker_processes = 8; ControlFile.max_prepared_xacts = 0; + ControlFile.max_prepared_foreign_xacts = 0; ControlFile.max_locks_per_xact = 64; ControlFile.maxAlign = MAXIMUM_ALIGNOF; @@ -957,6 +958,7 @@ RewriteControlFile(void) ControlFile.MaxConnections = 100; ControlFile.max_worker_processes = 8; ControlFile.max_prepared_xacts = 0; + ControlFile.max_prepared_foreign_xacts = 0; ControlFile.max_locks_per_xact = 64; /* Contents are protected with a CRC */ diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c index 852d8ca..b616cea 100644 --- a/src/bin/pg_waldump/rmgrdesc.c +++ b/src/bin/pg_waldump/rmgrdesc.c @@ -11,6 +11,7 @@ #include "access/brin_xlog.h" #include "access/clog.h" #include "access/commit_ts.h" +#include "access/fdwxact_xlog.h" #include "access/generic_xlog.h" #include "access/ginxlog.h" #include "access/gistxlog.h" diff --git a/src/include/access/fdwxact.h b/src/include/access/fdwxact.h new file mode 100644 index 0000000..b4de88b --- /dev/null +++ b/src/include/access/fdwxact.h @@ -0,0 +1,149 @@ +/* + * fdwxact.h + * + * PostgreSQL distributed transaction manager + * + * Portions Copyright (c) 2018, PostgreSQL Global Development Group + * + * src/include/access/fdwxact.h + */ +#ifndef FDW_XACT_H +#define FDW_XACT_H + +#include "access/fdwxact_xlog.h" +#include "access/xlogreader.h" +#include "foreign/foreign.h" +#include "lib/stringinfo.h" +#include "miscadmin.h" 
+#include "nodes/pg_list.h" +#include "nodes/execnodes.h" +#include "storage/backendid.h" +#include "storage/proc.h" +#include "storage/shmem.h" +#include "utils/guc.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + +#define FDW_XACT_NOT_WAITING 0 +#define FDW_XACT_WAITING 1 +#define FDW_XACT_WAITING_RETRY 2 +#define FDW_XACT_WAIT_COMPLETE 3 + +#define FdwXactEnabled() (max_prepared_foreign_xacts > 0) + +/* Maximum length of the prepared transaction id, borrowed from twophase.c */ +#define FDW_XACT_ID_MAX_LEN 200 + +/* Enum to track the status of prepared foreign transaction */ +typedef enum +{ + FDW_XACT_INITIAL, + FDW_XACT_PREPARING, /* foreign transaction is being prepared */ + FDW_XACT_PREPARED, /* foreign transaction is prepared */ + FDW_XACT_COMMITTING_PREPARED, /* foreign prepared transaction is to + * be committed */ + FDW_XACT_ABORTING_PREPARED, /* foreign prepared transaction is to be + * aborted */ +} FdwXactStatus; + + +/* Enum for distributed_atomic_commit parameter */ +typedef enum +{ + DISTRIBUTED_ATOMIC_COMMIT_DISABLED, /* disable distributed atomic commit */ + DISTRIBUTED_ATOMIC_COMMIT_PREFER, /* use twophase commit where available */ + DISTRIBUTED_ATOMIC_COMMIT_REQUIRED /* all foreign servers have to support twophase + * commit */ +} DistributedAtomicCommitLevel; + +/* Shared memory entry for a prepared or being prepared foreign transaction */ +typedef struct FdwXactData *FdwXact; + +typedef struct FdwXactData +{ + FdwXact fxact_free_next; /* Next free FdwXact entry */ + FdwXact fxact_next; /* Pointer to the next FdwXact entry associated + * with the same transaction */ + Oid dbid; /* database oid where to find foreign server + * and user mapping */ + TransactionId local_xid; /* XID of local transaction */ + Oid serverid; /* foreign server where transaction takes place */ + Oid userid; /* user who initiated the foreign transaction */ + Oid umid; + FdwXactStatus status; /* The state of the foreign transaction. 
This + * doubles as the action to be taken on this entry. */ + + /* + * Note that we need to keep track of two LSNs for each FdwXact. We keep + * track of the start LSN because this is the address we must use to read + * state data back from WAL when committing a FdwXact. We keep track of + * the end LSN because that is the LSN we need to wait for prior to + * commit. + */ + XLogRecPtr insert_start_lsn; /* XLOG offset of inserting this entry start */ + XLogRecPtr insert_end_lsn; /* XLOG offset of inserting this entry end */ + + bool valid; /* has the entry been complete and written to file? */ + BackendId held_by; /* backend who are holding */ + bool ondisk; /* true if prepare state file is on disk */ + bool inredo; /* true if entry was added via xlog_redo */ + char fdw_xact_id[FDW_XACT_MAX_ID_LEN]; /* prepared transaction identifier */ +} FdwXactData; + +/* Shared memory layout for maintaining foreign prepared transaction entries. */ +typedef struct +{ + /* Head of linked list of free FdwXactData structs */ + FdwXact freeFdwXacts; + + /* Number of valid foreign transaction entries */ + int numFdwXacts; + + /* Upto max_prepared_foreign_xacts entries in the array */ + FdwXact fdw_xacts[FLEXIBLE_ARRAY_MEMBER]; /* Variable length array */ +} FdwXactCtlData; + +/* Pointer to the shared memory holding the foreign transactions data */ +FdwXactCtlData *FdwXactCtl; + +typedef struct FdwXactState +{ + Oid serverid; + Oid userid; + Oid umid; + char *fdwxact_id; + void *fdw_state; /* foreign-data wrapper can keep state here */ +} FdwXactState; + +/* GUC parameters */ +extern int max_prepared_foreign_xacts; +extern int max_foreign_xact_resolvers; +extern int foreign_xact_resolution_retry_interval; +extern int foreign_xact_resolver_timeout; +extern int distributed_atomic_commit; + +extern Size FdwXactShmemSize(void); +extern void FdwXactShmemInit(void); +extern void restoreFdwXactData(void); +extern TransactionId PrescanFdwXacts(TransactionId oldestActiveXid); +extern void 
RecoverFdwXacts(void); +extern void AtEOXact_FdwXacts(bool is_commit); +extern void AtPrepare_FdwXacts(void); +extern bool fdw_xact_exists(TransactionId xid, Oid dboid, Oid serverid, + Oid userid); +extern void CheckPointFdwXacts(XLogRecPtr redo_horizon); +extern bool FdwTwoPhaseNeeded(void); +extern void PreCommit_FdwXacts(void); +extern void KnownFdwXactRecreateFiles(XLogRecPtr redo_horizon); +extern void FdwXactWaitToBeResolved(TransactionId wait_xid, bool commit); +extern bool FdwXactResolveDistributedTransaction(Oid dbid, bool is_active); +extern void FdwXactResolveAllDanglingTransactions(Oid dbid); +extern bool FdwXactIsAtomicCommitReady(void); +extern void FdwXactCleanupAtProcExit(void); +extern void RegisterFdwXactByRelId(Oid relid, bool modified); +extern void RegisterFdwXactByServerId(Oid serverid, bool modified); +extern void FdwXactMarkForeignServerAccessed(Oid relid, bool modified); +extern bool check_distributed_atomic_commit(int *newval, void **extra, + GucSource source); + +#endif /* FDW_XACT_H */ diff --git a/src/include/access/fdwxact_launcher.h b/src/include/access/fdwxact_launcher.h new file mode 100644 index 0000000..4ea65b2 --- /dev/null +++ b/src/include/access/fdwxact_launcher.h @@ -0,0 +1,32 @@ +/*------------------------------------------------------------------------- + * + * fdwxact_launcher.h + * PostgreSQL foreign transaction launcher definitions + * + * + * Portions Copyright (c) 2018, PostgreSQL Global Development Group + * + * src/include/access/fdwxact_launcher.h + * + *------------------------------------------------------------------------- + */ + +#ifndef _FDWXACT_LAUNCHER_H +#define _FDWXACT_LAUNCHER_H + +#include "access/fdwxact.h" + +extern void FdwXactLauncherRegister(void); +extern void FdwXactLauncherMain(Datum main_arg); +extern void FdwXactLauncherWakeupToRequest(void); +extern void FdwXactLauncherWakeupToRetry(void); + +extern Size FdwXactRslvShmemSize(void); +extern void FdwXactRslvShmemInit(void); + +extern bool 
IsFdwXactLauncher(void); + +extern void fdwxact_maybe_launch_resolver(bool ignore_error); + + +#endif /* _FDWXACT_LAUNCHER_H */ diff --git a/src/include/access/fdwxact_resolver.h b/src/include/access/fdwxact_resolver.h new file mode 100644 index 0000000..6b2a24f --- /dev/null +++ b/src/include/access/fdwxact_resolver.h @@ -0,0 +1,23 @@ +/*------------------------------------------------------------------------- + * + * fdwxact_resolver.h + * PostgreSQL foreign transaction resolver definitions + * + * + * Portions Copyright (c) 2018, PostgreSQL Global Development Group + * + * src/include/access/fdwxact_resolver.h + * + *------------------------------------------------------------------------- + */ +#ifndef FDWXACT_RESOLVER_H +#define FDWXACT_RESOLVER_H + +#include "access/fdwxact.h" + +extern void FdwXactResolverMain(Datum main_arg); +extern bool IsFdwXactResolver(void); + +extern int foreign_xact_resolver_timeout; + +#endif /* FDWXACT_RESOLVER_H */ diff --git a/src/include/access/fdwxact_xlog.h b/src/include/access/fdwxact_xlog.h new file mode 100644 index 0000000..e92b5a1 --- /dev/null +++ b/src/include/access/fdwxact_xlog.h @@ -0,0 +1,52 @@ +/*------------------------------------------------------------------------- + * + * fdwxact_xlog.h + * Foreign transaction XLOG definitions. 
+ * + * + * Portions Copyright (c) 2018, PostgreSQL Global Development Group + * + * src/include/access/fdwxact_xlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef FDWXACT_XLOG_H +#define FDWXACT_XLOG_H + +#include "access/xlogreader.h" +#include "lib/stringinfo.h" + +/* Info types for logs related to FDW transactions */ +#define XLOG_FDW_XACT_INSERT 0x00 +#define XLOG_FDW_XACT_REMOVE 0x10 + +/* Same as GIDSIZE */ +#define FDW_XACT_MAX_ID_LEN 200 +/* + * On disk file structure, also used to WAL + */ +typedef struct +{ + TransactionId local_xid; + Oid dbid; /* database oid where to find foreign server + * and user mapping */ + Oid serverid; /* foreign server where transaction takes + * place */ + Oid userid; /* user who initiated the foreign transaction */ + Oid umid; + char fdw_xact_id[FDW_XACT_MAX_ID_LEN]; /* foreign txn prepare id */ +} FdwXactOnDiskData; + +typedef struct xl_fdw_xact_remove +{ + TransactionId xid; + Oid serverid; + Oid userid; + Oid dbid; +} xl_fdw_xact_remove; + +extern void fdw_xact_redo(XLogReaderState *record); +extern void fdw_xact_desc(StringInfo buf, XLogReaderState *record); +extern const char *fdw_xact_identify(uint8 info); + +#endif /* FDWXACT_XLOG_H */ diff --git a/src/include/access/resolver_internal.h b/src/include/access/resolver_internal.h new file mode 100644 index 0000000..36391d4 --- /dev/null +++ b/src/include/access/resolver_internal.h @@ -0,0 +1,67 @@ +/*------------------------------------------------------------------------- + * + * resolver_internal.h + * Internal headers shared by fdwxact resolvers. 
+ * + * Portions Copyright (c) 2018, PostgreSQL Global Development Group + * + * src/include/access/resolver_internal.h + * + *------------------------------------------------------------------------- + */ + +#ifndef _RESOLVER_INTERNAL_H +#define _RESOLVER_INTERNAL_H + +#include "storage/latch.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "utils/timestamp.h" + +/* + * Each foreign transaction resolver has a FdwXactResolver struct in + * shared memory. This struct is protected by FdwXactResolverLock. + */ +typedef struct FdwXactResolver +{ + pid_t pid; /* this resolver's PID, or 0 if not active */ + Oid dbid; /* database oid */ + + /* Indicates if this slot is used or free */ + bool in_use; + + /* Stats */ + TimestampTz last_resolved_time; + + /* Protect shared variables shown above */ + slock_t mutex; + + /* + * Pointer to the resolver's latch. Used by backends to wake up this + * resolver when it has work to do. NULL if the resolver isn't active. + */ + Latch *latch; +} FdwXactResolver; + +/* There is one FdwXactRslvCtlData struct for the whole database cluster */ +typedef struct FdwXactRslvCtlData +{ + /* + * Foreign transaction resolution queues. Protected by FdwXactLock. 
+ */ + SHM_QUEUE FdwXactActiveQueue; + SHM_QUEUE FdwXactRetryQueue; + + /* Supervisor process and latch */ + pid_t launcher_pid; + Latch *launcher_latch; + + FdwXactResolver resolvers[FLEXIBLE_ARRAY_MEMBER]; +} FdwXactRslvCtlData; + +extern FdwXactRslvCtlData *FdwXactRslvCtl; + +extern FdwXactResolver *MyFdwXactResolver; +extern FdwXactRslvCtlData *FdwXactRslvCtl; + +#endif /* _RESOLVER_INTERNAL_H */ diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 0bbe9879..c15dff7 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -47,3 +47,4 @@ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_i PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL) PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask) PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL) +PG_RMGR(RM_FDW_XACT_ID, "Foreign Transactions", fdw_xact_redo, fdw_xact_desc, fdw_xact_identify, NULL, NULL, NULL) diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 0e932da..b199c88 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -36,6 +36,7 @@ extern void PostPrepare_Twophase(void); extern PGPROC *TwoPhaseGetDummyProc(TransactionId xid); extern BackendId TwoPhaseGetDummyBackendId(TransactionId xid); +extern bool TwoPhaseExists(TransactionId xid); extern GlobalTransaction MarkAsPreparing(TransactionId xid, const char *gid, TimestampTz prepared_at, diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 2c1b2d8..63c833d 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -105,6 +105,13 @@ extern int MyXactFlags; #define XACT_FLAGS_WROTENONTEMPREL (1U << 2) /* + * XACT_FLAGS_FDWNOPREPARE - set when we wrote data on a foreign table whose + * server isn't 
capable of two-phase commit + * relation. + */ +#define XACT_FLAGS_FDWNOPREPARE (1U << 3) + +/* * start- and end-of-transaction callbacks for dynamically loaded modules */ typedef enum diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 30610b3..795e85a 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -227,6 +227,7 @@ typedef struct xl_parameter_change int MaxConnections; int max_worker_processes; int max_prepared_xacts; + int max_prepared_foreign_xacts; int max_locks_per_xact; int wal_level; bool wal_log_hints; diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 773d9e6..3d5333a 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -178,6 +178,7 @@ typedef struct ControlFileData int MaxConnections; int max_worker_processes; int max_prepared_xacts; + int max_prepared_foreign_xacts; int max_locks_per_xact; bool track_commit_timestamp; diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 9264a2e..ee68caa 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5036,6 +5036,13 @@ proargmodes => '{i,o,o,o,o,o,o,o,o}', proargnames => '{subid,subid,relid,pid,received_lsn,last_msg_send_time,last_msg_receipt_time,latest_end_lsn,latest_end_time}', prosrc => 'pg_stat_get_subscription' }, +{ oid => '6053', descr => 'statistics: information about foreign transaction resolver', + proname => 'pg_stat_get_fdwxact_resolver', proisstrict => 'f', provolatile => 's', + proparallel => 'r', prorettype => 'record', proargtypes => '', + proallargtypes => '{oid,oid,timestamptz}', + proargmodes => '{o,o,o}', + proargnames => '{pid,dbid,last_resolved_time}', + prosrc => 'pg_stat_get_fdwxact_resolver' }, { oid => '2026', descr => 'statistics: current backend PID', proname => 'pg_backend_pid', provolatile => 's', proparallel => 'r', prorettype => 'int4', proargtypes => '', prosrc => 
'pg_backend_pid' }, @@ -5741,6 +5748,22 @@ proargnames => '{type,object_names,object_args,classid,objid,objsubid}', prosrc => 'pg_get_object_address' }, +{ oid => '6050', descr => 'view foreign transactions', + proname => 'pg_prepared_fdw_xacts', prorows => '1000', proretset => 't', + provolatile => 'v', prorettype => 'record', proargtypes => '', + proallargtypes => '{oid,xid,oid,oid,text,text}', + proargmodes => '{o,o,o,o,o,o}', + proargnames => '{dbid,transaction,serverid,userid,status,identifier}', + prosrc => 'pg_prepared_fdw_xacts' }, +{ oid => '6051', descr => 'remove foreign transaction', + proname => 'pg_remove_fdw_xact', provolatile => 'v', prorettype => 'bool', + proargtypes => 'xid oid oid', + prosrc => 'pg_remove_fdw_xact' }, +{ oid => '6052', descr => 'resolve foreign transaction', + proname => 'pg_resolve_fdw_xact', provolatile => 'v', prorettype => 'bool', + proargtypes => 'xid oid oid', + prosrc => 'pg_resolve_fdw_xact' }, + { oid => '2079', descr => 'is table visible in search path?', proname => 'pg_table_is_visible', procost => '10', provolatile => 's', prorettype => 'bool', proargtypes => 'oid', prosrc => 'pg_table_is_visible' }, diff --git a/src/include/foreign/fdwapi.h b/src/include/foreign/fdwapi.h index c14eb54..92d47bb 100644 --- a/src/include/foreign/fdwapi.h +++ b/src/include/foreign/fdwapi.h @@ -12,6 +12,7 @@ #ifndef FDWAPI_H #define FDWAPI_H +#include "access/fdwxact.h" #include "access/parallel.h" #include "nodes/execnodes.h" #include "nodes/relation.h" @@ -168,6 +169,14 @@ typedef bool (*IsForeignScanParallelSafe_function) (PlannerInfo *root, typedef List *(*ReparameterizeForeignPathByChild_function) (PlannerInfo *root, List *fdw_private, RelOptInfo *child_rel); +typedef bool (*PrepareForeignTransaction_function) (FdwXactState *state); +typedef bool (*CommitForeignTransaction_function) (FdwXactState *state); +typedef bool (*RollbackForeignTransaction_function) (FdwXactState *state); +typedef bool (*ResolveForeignTransaction_function) 
(FdwXactState *state, + bool is_commit); +typedef bool (*IsTwoPhaseCommitEnabled_function) (Oid serverid); +typedef char *(*GetPrepareId_function) (TransactionId xid, Oid serverid, + Oid userid, int *prep_id_len); /* * FdwRoutine is the struct returned by a foreign-data wrapper's handler @@ -235,6 +244,14 @@ typedef struct FdwRoutine /* Support functions for IMPORT FOREIGN SCHEMA */ ImportForeignSchema_function ImportForeignSchema; + /* Support functions for distributed transactions */ + PrepareForeignTransaction_function PrepareForeignTransaction; + CommitForeignTransaction_function CommitForeignTransaction; + RollbackForeignTransaction_function RollbackForeignTransaction; + ResolveForeignTransaction_function ResolveForeignTransaction; + IsTwoPhaseCommitEnabled_function IsTwoPhaseCommitEnabled; + GetPrepareId_function GetPrepareId; + /* Support functions for parallelism under Gather node */ IsForeignScanParallelSafe_function IsForeignScanParallelSafe; EstimateDSMForeignScan_function EstimateDSMForeignScan; @@ -247,7 +264,6 @@ typedef struct FdwRoutine ReparameterizeForeignPathByChild_function ReparameterizeForeignPathByChild; } FdwRoutine; - /* Functions in foreign/foreign.c */ extern FdwRoutine *GetFdwRoutine(Oid fdwhandler); extern Oid GetForeignServerIdByRelId(Oid relid); diff --git a/src/include/foreign/foreign.h b/src/include/foreign/foreign.h index 3ca12e6..d030368 100644 --- a/src/include/foreign/foreign.h +++ b/src/include/foreign/foreign.h @@ -68,10 +68,10 @@ typedef struct ForeignTable List *options; /* ftoptions as DefElem list */ } ForeignTable; - extern ForeignServer *GetForeignServer(Oid serverid); extern ForeignServer *GetForeignServerByName(const char *name, bool missing_ok); extern UserMapping *GetUserMapping(Oid userid, Oid serverid); +extern UserMapping *GetUserMappingByOid(Oid umid); extern ForeignDataWrapper *GetForeignDataWrapper(Oid fdwid); extern ForeignDataWrapper *GetForeignDataWrapperByName(const char *name, bool missing_ok); diff --git 
a/src/include/pgstat.h b/src/include/pgstat.h index f1c10d1..05feb0a 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -759,6 +759,8 @@ typedef enum WAIT_EVENT_BGWRITER_HIBERNATE, WAIT_EVENT_BGWRITER_MAIN, WAIT_EVENT_CHECKPOINTER_MAIN, + WAIT_EVENT_FDW_XACT_RESOLVER_MAIN, + WAIT_EVENT_FDW_XACT_LAUNCHER_MAIN, WAIT_EVENT_LOGICAL_APPLY_MAIN, WAIT_EVENT_LOGICAL_LAUNCHER_MAIN, WAIT_EVENT_PGSTAT_MAIN, @@ -833,7 +835,8 @@ typedef enum WAIT_EVENT_REPLICATION_ORIGIN_DROP, WAIT_EVENT_REPLICATION_SLOT_DROP, WAIT_EVENT_SAFE_SNAPSHOT, - WAIT_EVENT_SYNC_REP + WAIT_EVENT_SYNC_REP, + WAIT_EVENT_FDW_XACT_RESOLUTION } WaitEventIPC; /* ---------- @@ -913,6 +916,9 @@ typedef enum WAIT_EVENT_TWOPHASE_FILE_READ, WAIT_EVENT_TWOPHASE_FILE_SYNC, WAIT_EVENT_TWOPHASE_FILE_WRITE, + WAIT_EVENT_FDW_XACT_FILE_READ, + WAIT_EVENT_FDW_XACT_FILE_WRITE, + WAIT_EVENT_FDW_XACT_FILE_SYNC, WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ, WAIT_EVENT_WAL_BOOTSTRAP_SYNC, WAIT_EVENT_WAL_BOOTSTRAP_WRITE, diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index cb613c8..45880b2 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -153,6 +153,16 @@ struct PGPROC SHM_QUEUE syncRepLinks; /* list link if process is in syncrep queue */ /* + * Info to allow us to wait for foreign transaction to be resolved, if + * needed. + */ + TransactionId fdwXactWaitXid; /* waiting for foreign transaction involved with + * this transaction id to be resolved */ + int fdwXactState; /* wait state for foreign transaction + * resolution */ + SHM_QUEUE fdwXactLinks; /* list link if process is in queue */ + + /* * All PROCLOCK objects for locks held or awaited by this backend are * linked into one of these lists, according to the partition number of * their lock. 
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 75bab29..25d6a2f 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -36,6 +36,8 @@ #define PROCARRAY_SLOTS_XMIN 0x20 /* replication slot xmin, * catalog_xmin */ +#define PROCARRAY_FDW_XACT_XMIN 0x40 /* unresolved distributed + transaction xmin */ /* * Only flags in PROCARRAY_PROC_FLAGS_MASK are considered when matching * PGXACT->vacuumFlags. Other flags are used for different purposes and @@ -124,4 +126,7 @@ extern void ProcArraySetReplicationSlotXmin(TransactionId xmin, extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin, TransactionId *catalog_xmin); + +extern void ProcArraySetFdwXactUnresolvedXmin(TransactionId xmin); +extern TransactionId ProcArrayGetFdwXactUnresolvedXmin(void); #endif /* PROCARRAY_H */ diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 668d9ef..81560bd 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -94,6 +94,8 @@ enum config_group CLIENT_CONN_PRELOAD, CLIENT_CONN_OTHER, LOCK_MANAGEMENT, + FDWXACT, + FDWXACT_SETTINGS, COMPAT_OPTIONS, COMPAT_OPTIONS_PREVIOUS, COMPAT_OPTIONS_CLIENT, diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 735dd37..fdd6ded 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1413,6 +1413,13 @@ pg_policies| SELECT n.nspname AS schemaname, FROM ((pg_policy pol JOIN pg_class c ON ((c.oid = pol.polrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))); +pg_prepared_fdw_xacts| SELECT f.dbid, + f.transaction, + f.serverid, + f.userid, + f.status, + f.identifier + FROM pg_prepared_fdw_xacts() f(dbid, transaction, serverid, userid, status, identifier); pg_prepared_statements| SELECT p.name, p.statement, p.prepare_time, @@ -1821,6 +1828,11 @@ pg_stat_database_conflicts| SELECT d.oid AS datid, pg_stat_get_db_conflict_bufferpin(d.oid) AS 
confl_bufferpin, pg_stat_get_db_conflict_startup_deadlock(d.oid) AS confl_deadlock FROM pg_database d; +pg_stat_fdwxact_resolvers| SELECT r.pid, + r.dbid, + r.last_resolved_time + FROM pg_stat_get_fdwxact_resolver() r(pid, dbid, last_resolved_time) + WHERE (r.pid IS NOT NULL); pg_stat_progress_vacuum| SELECT s.pid, s.datid, d.datname, -- 2.10.5