diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 57519fe..3969366 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -2726,11 +2726,30 @@ ExecASDeleteTriggers(EState *estate, ResultRelInfo *relinfo, false, NULL, NULL, NIL, NULL, transition_capture); } +/* ---------- + * ExecBRDeleteTriggers() + * + * Called to execute BEFORE ROW DELETE triggers. + * + * Returns false in following scenarios : + * 1. Trigger function returned NULL. + * 2. The tuple was concurrently deleted OR it was concurrently updated and the + * new tuple didn't pass EvalPlanQual() test. + * 3. The tuple was concurrently updated and the new tuple passed the + * EvalPlanQual() test, BUT epqslot parameter is not NULL. In such case, this + * function skips the trigger function execution, because the caller has + * indicated that it wants to further process the updated tuple. The updated + * tuple slot is passed back through epqslot. + * + * In all other cases, returns true. + * ---------- + */ bool ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, ItemPointer tupleid, - HeapTuple fdw_trigtuple) + HeapTuple fdw_trigtuple, + TupleTableSlot **epqslot) { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; bool result = true; @@ -2745,8 +2764,24 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, { trigtuple = GetTupleForTrigger(estate, epqstate, relinfo, tupleid, LockTupleExclusive, &newSlot); + + /* If requested, pass back the concurrently updated tuple, if any. */ + if (epqslot != NULL) + *epqslot = newSlot; + if (trigtuple == NULL) return false; + + /* + * If the tuple was concurrently updated and the caller of this + * function requested for the updated tuple, skip the trigger + * execution. + */ + if (newSlot != NULL && epqslot != NULL) + { + heap_freetuple(trigtuple); + return false; + } } else trigtuple = fdw_trigtuple; diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 41e857e..5006baa 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -531,7 +531,7 @@ ExecSimpleRelationDelete(EState *estate, EPQState *epqstate, { skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, &searchslot->tts_tuple->t_self, - NULL); + NULL, NULL); } if (!skip_tuple) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 7e0b867..1a1c6ee 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -609,7 +609,9 @@ ExecInsert(ModifyTableState *mtstate, * foreign table, tupleid is invalid; the FDW has to figure out * which row to delete using data from the planSlot. oldtuple is * passed to foreign table triggers; it is NULL when the foreign - * table has no relevant triggers. + * table has no relevant triggers. When this DELETE is a part of + * an UPDATE of partition-key, then the slot returned by + * EvalPlanQual() is passed back using output parameter epqslot. * * Returns RETURNING result if any, otherwise NULL. * ---------------------------------------------------------------- @@ -622,6 +624,7 @@ ExecDelete(ModifyTableState *mtstate, EPQState *epqstate, EState *estate, bool *tupleDeleted, + TupleTableSlot **epqslot, bool processReturning, bool canSetTag, bool changingPart) @@ -649,7 +652,7 @@ ExecDelete(ModifyTableState *mtstate, bool dodelete; dodelete = ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, - tupleid, oldtuple); + tupleid, oldtuple, epqslot); if (!dodelete) /* "do nothing" */ return NULL; @@ -769,19 +772,30 @@ ldelete:; if (!ItemPointerEquals(tupleid, &hufd.ctid)) { - TupleTableSlot *epqslot; + TupleTableSlot *my_epqslot; - epqslot = EvalPlanQual(estate, + my_epqslot = EvalPlanQual(estate, epqstate, resultRelationDesc, resultRelInfo->ri_RangeTableIndex, LockTupleExclusive, &hufd.ctid, hufd.xmax); - if (!TupIsNull(epqslot)) + if (!TupIsNull(my_epqslot)) { *tupleid = hufd.ctid; - goto ldelete; + + /* If requested, pass back the updated row. */ + if (epqslot) + { + *epqslot = my_epqslot; + return NULL; + } + else + { + /* Normal DELETE: So delete the updated row. */ + goto ldelete; + } } } /* tuple already deleted; nothing to do */ @@ -1052,6 +1066,7 @@ lreplace:; { bool tuple_deleted; TupleTableSlot *ret_slot; + TupleTableSlot *epqslot = NULL; PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; int map_index; TupleConversionMap *tupconv_map; @@ -1081,7 +1096,7 @@ lreplace:; * processing. We want to return rows from INSERT. */ ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, - estate, &tuple_deleted, false, + estate, &tuple_deleted, &epqslot, false, false /* canSetTag */ , true /* changingPart */ ); /* @@ -1105,7 +1120,23 @@ lreplace:; * resurrect it. */ if (!tuple_deleted) - return NULL; + { + /* + * epqslot will be typically NULL. But when ExecDelete() finds + * that another transaction has concurrently updated the same + * row, it re-fetches the row, skips the delete, and epqslot is + * set to the re-fetched tuple slot. In that case, we need to + * do all the checks again. + */ + if (TupIsNull(epqslot)) + return NULL; + else + { + slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot); + tuple = ExecMaterializeSlot(slot); + goto lreplace; + } + } /* * Updates set the transition capture map only when a new subplan @@ -2136,7 +2167,7 @@ ExecModifyTable(PlanState *pstate) case CMD_DELETE: slot = ExecDelete(node, tupleid, oldtuple, planSlot, &node->mt_epqstate, estate, - NULL, true, node->canSetTag, + NULL, NULL, true, node->canSetTag, false /* changingPart */ ); break; default: diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index a5b8610..1031448 100644 --- a/src/include/commands/trigger.h +++ b/src/include/commands/trigger.h @@ -206,7 +206,8 @@ extern bool ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, ItemPointer tupleid, - HeapTuple fdw_trigtuple); + HeapTuple fdw_trigtuple, + TupleTableSlot **epqslot); extern void ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, ItemPointer tupleid, diff --git a/src/test/isolation/expected/partition-key-update-4.out b/src/test/isolation/expected/partition-key-update-4.out new file mode 100644 index 0000000..774a7fa --- /dev/null +++ b/src/test/isolation/expected/partition-key-update-4.out @@ -0,0 +1,60 @@ +Parsed test spec with 2 sessions + +starting permutation: s1b s2b s2u1 s1u s2c s1c s1s +step s1b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2u1: UPDATE foo SET b = b || ' update2' WHERE a = 1; +step s1u: UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; +step s2c: COMMIT; +step s1u: <... completed> +step s1c: COMMIT; +step s1s: SELECT tableoid::regclass, * FROM foo ORDER BY a; +tableoid a b + +foo2 2 ABC update2 update1 + +starting permutation: s1b s2b s2ut1 s1ut s2c s1c s1st s1stl +step s1b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2ut1: UPDATE footrg SET b = b || ' update2' WHERE a = 1; +step s1ut: UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; +step s2c: COMMIT; +step s1ut: <... completed> +step s1c: COMMIT; +step s1st: SELECT tableoid::regclass, * FROM footrg ORDER BY a; +tableoid a b + +footrg2 2 ABC update2 update1 +step s1stl: SELECT * FROM triglog ORDER BY a; +a b + +1 ABC update2 trigger + +starting permutation: s1b s2b s2u2 s1u s2c s1c s1s +step s1b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2u2: UPDATE foo SET b = 'EFG' WHERE a = 1; +step s1u: UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; +step s2c: COMMIT; +step s1u: <... completed> +step s1c: COMMIT; +step s1s: SELECT tableoid::regclass, * FROM foo ORDER BY a; +tableoid a b + +foo1 1 EFG + +starting permutation: s1b s2b s2ut2 s1ut s2c s1c s1st s1stl +step s1b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2ut2: UPDATE footrg SET b = 'EFG' WHERE a = 1; +step s1ut: UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; +step s2c: COMMIT; +step s1ut: <... completed> +step s1c: COMMIT; +step s1st: SELECT tableoid::regclass, * FROM footrg ORDER BY a; +tableoid a b + +footrg1 1 EFG +step s1stl: SELECT * FROM triglog ORDER BY a; +a b + diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 0e99721..d5594e8 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -74,4 +74,5 @@ test: predicate-gin-nomatch test: partition-key-update-1 test: partition-key-update-2 test: partition-key-update-3 +test: partition-key-update-4 test: plpgsql-toast diff --git a/src/test/isolation/specs/partition-key-update-4.spec b/src/test/isolation/specs/partition-key-update-4.spec new file mode 100644 index 0000000..1d53a7d --- /dev/null +++ b/src/test/isolation/specs/partition-key-update-4.spec @@ -0,0 +1,76 @@ +# Test that a row that ends up in a new partition contains changes made by +# a concurrent transaction. + +setup +{ + -- + -- Setup to test concurrent handling of ExecDelete(). + -- + CREATE TABLE foo (a int, b text) PARTITION BY LIST(a); + CREATE TABLE foo1 PARTITION OF foo FOR VALUES IN (1); + CREATE TABLE foo2 PARTITION OF foo FOR VALUES IN (2); + INSERT INTO foo VALUES (1, 'ABC'); + + -- + -- Setup to test concurrent handling of GetTupleForTrigger(). + -- + CREATE TABLE footrg (a int, b text) PARTITION BY LIST(a); + CREATE TABLE triglog as select * from footrg; + CREATE TABLE footrg1 PARTITION OF footrg FOR VALUES IN (1); + CREATE TABLE footrg2 PARTITION OF footrg FOR VALUES IN (2); + INSERT INTO footrg VALUES (1, 'ABC'); + CREATE FUNCTION func_footrg() RETURNS TRIGGER AS $$ + BEGIN + OLD.b = OLD.b || ' trigger'; + + -- This will verify that the trigger is not run *before* the row is + -- refetched by EvalPlanQual. The OLD row should contain the changes made + -- by the concurrent session. + INSERT INTO triglog select OLD.*; + + RETURN OLD; + END $$ LANGUAGE PLPGSQL; + CREATE TRIGGER footrg_ondel BEFORE DELETE ON footrg1 + FOR EACH ROW EXECUTE PROCEDURE func_footrg(); + +} + +teardown +{ + DROP TABLE foo; + DROP TRIGGER footrg_ondel ON footrg1; + DROP FUNCTION func_footrg(); + DROP TABLE footrg; + DROP TABLE triglog; +} + +session "s1" +step "s1b" { BEGIN ISOLATION LEVEL READ COMMITTED; } +step "s1u" { UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; } +step "s1ut" { UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; } +step "s1s" { SELECT tableoid::regclass, * FROM foo ORDER BY a; } +step "s1st" { SELECT tableoid::regclass, * FROM footrg ORDER BY a; } +step "s1stl" { SELECT * FROM triglog ORDER BY a; } +step "s1c" { COMMIT; } + +session "s2" +step "s2b" { BEGIN ISOLATION LEVEL READ COMMITTED; } +step "s2u1" { UPDATE foo SET b = b || ' update2' WHERE a = 1; } +step "s2u2" { UPDATE foo SET b = 'EFG' WHERE a = 1; } +step "s2ut1" { UPDATE footrg SET b = b || ' update2' WHERE a = 1; } +step "s2ut2" { UPDATE footrg SET b = 'EFG' WHERE a = 1; } +step "s2c" { COMMIT; } + + +# Session s1 is moving a row into another partition, but is waiting for +# another session s2 that is updating the original row. The row that ends up +# in the new partition should contain the changes made by session s2. +permutation "s1b" "s2b" "s2u1" "s1u" "s2c" "s1c" "s1s" + +# Same as above, except, session s1 is waiting in GetTupleTrigger(). +permutation "s1b" "s2b" "s2ut1" "s1ut" "s2c" "s1c" "s1st" "s1stl" + +# Below two cases are similar to the above two; except that the session s1 +# fails EvalPlanQual() test, so partition key update does not happen. +permutation "s1b" "s2b" "s2u2" "s1u" "s2c" "s1c" "s1s" +permutation "s1b" "s2b" "s2ut2" "s1ut" "s2c" "s1c" "s1st" "s1stl"