bdr: Greatly improve conflict handling for insert
authorAndres Freund <[email protected]>
Wed, 23 Apr 2014 23:47:33 +0000 (01:47 +0200)
committerAndres Freund <[email protected]>
Thu, 3 Jul 2014 15:55:26 +0000 (17:55 +0200)
Needs some more refactoring and logging improvements.

contrib/bdr/bdr_apply.c

index 90a8b757ea8575f3726436cbb6583b1c4b365afa..eeecf114acc3fed375669de172ecfb7b7ef77130 100644 (file)
@@ -26,6 +26,7 @@
 #include "access/xact.h"
 
 #include "catalog/dependency.h"
+#include "catalog/index.h"
 #include "catalog/namespace.h"
 #include "catalog/pg_type.h"
 
@@ -36,6 +37,8 @@
 
 #include "libpq/pqformat.h"
 
+#include "miscadmin.h"
+
 #include "parser/parse_relation.h"
 #include "parser/parse_type.h"
 
@@ -71,9 +74,12 @@ typedef struct BDRTupleData
    bool        changed[MaxTupleAttributeNumber];
 } BDRTupleData;
 
-static void build_scan_key(ScanKey skey, Relation rel, Relation idx_rel, HeapTuple key);
-static bool find_pkey_tuple(ScanKey skey, Relation rel, Relation idx_rel, TupleTableSlot *slot, bool lock);
+static void build_index_scan_keys(EState *estate, ScanKey *scan_keys, TupleTableSlot *slot);
+static void build_index_scan_key(ScanKey skey, Relation rel, Relation idx_rel, TupleTableSlot *slot);
+static bool find_pkey_tuple(ScanKey skey, Relation rel, Relation idx_rel, TupleTableSlot *slot,
+                           bool lock, LockTupleMode mode);
 static void UserTableUpdateIndexes(EState *estate, TupleTableSlot *slot);
+static void UserTableUpdateOpenIndexes(EState *estate, TupleTableSlot *slot);
 static Relation read_rel(StringInfo s, LOCKMODE mode);
 extern void read_tuple_parts(StringInfo s, Relation rel, BDRTupleData *tup);
 static void read_tuple(StringInfo s, Relation rel, TupleTableSlot *slot);
@@ -545,11 +551,20 @@ process_remote_insert(StringInfo s)
    char        action;
    EState     *estate;
    TupleTableSlot *slot;
+   TupleTableSlot *oldslot;
    Relation    rel;
    bool        started_tx;
 #ifdef VERBOSE_INSERT
    StringInfoData o;
 #endif
+   ResultRelInfo *relinfo;
+   ItemPointer conflicts;
+   bool        conflict = false;
+   ScanKey    *index_keys;
+   int         i;
+   ItemPointerData conflicting_tid;
+
+   ItemPointerSetInvalid(&conflicting_tid);
 
    started_tx = bdr_performing_work();
 
@@ -569,7 +584,9 @@ process_remote_insert(StringInfo s)
 
    estate = bdr_create_rel_estate(rel);
    slot = ExecInitExtraTupleSlot(estate);
+   oldslot = ExecInitExtraTupleSlot(estate);
    ExecSetSlotDescriptor(slot, RelationGetDescr(rel));
+   ExecSetSlotDescriptor(oldslot, RelationGetDescr(rel));
 
    read_tuple(s, rel, slot);
 
@@ -577,8 +594,6 @@ process_remote_insert(StringInfo s)
        elog(ERROR, "unexpected relkind '%c' rel \"%s\"",
             rel->rd_rel->relkind, RelationGetRelationName(rel));
 
-   PushActiveSnapshot(GetTransactionSnapshot());
-
    /* debug output */
 #ifdef VERBOSE_INSERT
    initStringInfo(&o);
@@ -587,11 +602,102 @@ process_remote_insert(StringInfo s)
    resetStringInfo(&o);
 #endif
 
-   simple_heap_insert(rel, slot->tts_tuple);
-   UserTableUpdateIndexes(estate, slot);
-   bdr_count_insert();
+   /*
+    * Search for conflicting tuples.
+    */
+   ExecOpenIndices(estate->es_result_relation_info);
+   relinfo = estate->es_result_relation_info;
+   index_keys = palloc0(relinfo->ri_NumIndices * sizeof(ScanKeyData*));
+   conflicts = palloc0(relinfo->ri_NumIndices * sizeof(ItemPointerData));
 
-   PopActiveSnapshot();
+   build_index_scan_keys(estate, index_keys, slot);
+
+   /* do a SnapshotDirty search for conflicting tuples */
+   for (i = 0; i < relinfo->ri_NumIndices; i++)
+   {
+       IndexInfo  *ii = relinfo->ri_IndexRelationInfo[i];
+       bool found = false;
+
+       Assert(ii->ii_Expressions == NIL);
+
+       if (!ii->ii_Unique)
+           continue;
+
+       /* if conflict: wait */
+       found = find_pkey_tuple(index_keys[i],
+                               rel, relinfo->ri_IndexRelationDescs[i],
+                               oldslot, true, LockTupleExclusive);
+
+       /* alert if there's more than one conflicting unique key */
+       if (found &&
+           ItemPointerIsValid(&conflicting_tid) &&
+           !ItemPointerEquals(&oldslot->tts_tuple->t_self,
+                              &conflicting_tid))
+       {
+           /* FIXME: improve logging here */
+           elog(ERROR, "diverging uniqueness conflict");
+       }
+       else if (found)
+       {
+           ItemPointerCopy(&oldslot->tts_tuple->t_self, &conflicting_tid);
+           conflict = true;
+           break;
+       }
+       else
+           ItemPointerSetInvalid(&conflicts[i]);
+
+       CHECK_FOR_INTERRUPTS();
+   }
+
+   /*
+    * If there's a conflict use the version created later, otherwise do a
+    * plain insert.
+    */
+   if (conflict)
+   {
+       TransactionId xmin;
+       TimestampTz local_ts;
+       RepNodeId   local_node_id;
+       bool        apply_update;
+       bool        log_update;
+       CommitExtraData local_node_id_raw;
+
+       /* refetch tuple, check for old commit ts & origin */
+       xmin = HeapTupleHeaderGetXmin(oldslot->tts_tuple->t_data);
+
+       /*
+        * We now need to determine whether to keep the original version of
+        * the row, or apply the insert (as an update) we received.  We use
+        * the last-update-wins strategy for this, except when the new update
+        * comes from the same node that originated the previous version of
+        * the tuple.
+        */
+       TransactionIdGetCommitTsData(xmin, &local_ts, &local_node_id_raw);
+       local_node_id = local_node_id_raw;
+
+       check_apply_update(local_node_id, local_ts,
+                          &apply_update, &log_update);
+
+       elog(LOG, "insert vs insert conflict: %s",
+            apply_update ? "update" : "ignore");
+
+       if (apply_update)
+       {
+           simple_heap_update(rel,
+                              &oldslot->tts_tuple->t_self,
+                              slot->tts_tuple);
+           /* races will be resolved by abort/retry */
+           UserTableUpdateOpenIndexes(estate, slot);
+       }
+   }
+   else
+   {
+       simple_heap_insert(rel, slot->tts_tuple);
+       /* races will be resolved by abort/retry */
+       UserTableUpdateOpenIndexes(estate, slot);
+   }
+
+   ExecCloseIndices(estate->es_result_relation_info);
 
    check_sequencer_wakeup(rel);
 
@@ -603,6 +709,9 @@ process_remote_insert(StringInfo s)
        LockRelId   lockid = rel->rd_lockInfo.lockRelId;
        TransactionId oldxid = GetTopTransactionId();
 
+       /* there never should be conflicts on these */
+       Assert(!conflict);
+
        /*
         * Release transaction bound resources for CONCURRENTLY support.
         */
@@ -770,12 +879,13 @@ process_remote_update(StringInfo s)
 
    Assert(idxrel->rd_index->indisunique);
 
-   build_scan_key(skey, rel, idxrel, oldslot->tts_tuple);
+   build_index_scan_key(skey, rel, idxrel, oldslot);
 
    PushActiveSnapshot(GetTransactionSnapshot());
 
    /* look for tuple identified by the (old) primary key */
-   found_tuple = find_pkey_tuple(skey, rel, idxrel, oldslot, true);
+   found_tuple = find_pkey_tuple(skey, rel, idxrel, oldslot, true,
+                       pkey_sent ? LockTupleExclusive : LockTupleNoKeyExclusive);
 
    if (found_tuple)
    {
@@ -898,7 +1008,7 @@ check_apply_update(RepNodeId local_node_id, TimestampTz local_ts,
            else if (local_tli < remote_tli)
                *perform_update = true;
            else if (local_tli > remote_tli)
-               *perform_update =  false;
+               *perform_update = false;
            else
                /* shouldn't happen */
                elog(ERROR, "unsuccessful node comparison");
@@ -1039,10 +1149,10 @@ process_remote_delete(StringInfo s)
 
    PushActiveSnapshot(GetTransactionSnapshot());
 
-   build_scan_key(skey, rel, idxrel, slot->tts_tuple);
+   build_index_scan_key(skey, rel, idxrel, slot);
 
    /* try to find tuple via a (candidate|primary) key */
-   found_old = find_pkey_tuple(skey, rel, idxrel, slot, true);
+   found_old = find_pkey_tuple(skey, rel, idxrel, slot, true, LockTupleExclusive);
 
    if (found_old)
    {
@@ -1338,13 +1448,23 @@ bdr_create_rel_estate(Relation rel)
 static void
 UserTableUpdateIndexes(EState *estate, TupleTableSlot *slot)
 {
-   List       *recheckIndexes = NIL;
-
    /* HOT update does not require index inserts */
    if (HeapTupleIsHeapOnly(slot->tts_tuple))
        return;
 
    ExecOpenIndices(estate->es_result_relation_info);
+   UserTableUpdateOpenIndexes(estate, slot);
+   ExecCloseIndices(estate->es_result_relation_info);
+}
+
+static void
+UserTableUpdateOpenIndexes(EState *estate, TupleTableSlot *slot)
+{
+   List       *recheckIndexes = NIL;
+
+   /* HOT update does not require index inserts */
+   if (HeapTupleIsHeapOnly(slot->tts_tuple))
+       return;
 
    if (estate->es_result_relation_info->ri_NumIndices > 0)
    {
@@ -1358,8 +1478,30 @@ UserTableUpdateIndexes(EState *estate, TupleTableSlot *slot)
 
    /* FIXME: recheck the indexes */
    list_free(recheckIndexes);
+}
 
-   ExecCloseIndices(estate->es_result_relation_info);
+static void
+build_index_scan_keys(EState *estate, ScanKey *scan_keys, TupleTableSlot *slot)
+{
+   ResultRelInfo *relinfo;
+   int i;
+
+   relinfo = estate->es_result_relation_info;
+
+   /* build scankeys for each index */
+   for (i = 0; i < relinfo->ri_NumIndices; i++)
+   {
+       IndexInfo  *ii = relinfo->ri_IndexRelationInfo[i];
+
+       if (!ii->ii_Unique)
+           continue;
+
+       scan_keys[i] = palloc(ii->ii_NumIndexAttrs * sizeof(ScanKeyData));
+       build_index_scan_key(scan_keys[i],
+                      relinfo->ri_RelationDesc,
+                      relinfo->ri_IndexRelationDescs[i],
+                      slot);
+   }
 }
 
 /*
@@ -1367,7 +1509,7 @@ UserTableUpdateIndexes(EState *estate, TupleTableSlot *slot)
  * is setup to match 'rel' (*NOT* idxrel!).
  */
 static void
-build_scan_key(ScanKey skey, Relation rel, Relation idxrel, HeapTuple key)
+build_index_scan_key(ScanKey skey, Relation rel, Relation idxrel, TupleTableSlot *slot)
 {
    int         attoff;
    Datum       indclassDatum;
@@ -1375,6 +1517,7 @@ build_scan_key(ScanKey skey, Relation rel, Relation idxrel, HeapTuple key)
    bool        isnull;
    oidvector  *opclass;
    int2vector  *indkey;
+   HeapTuple   key = slot->tts_tuple;
 
    indclassDatum = SysCacheGetAttr(INDEXRELID, idxrel->rd_indextuple,
                                    Anum_pg_index_indclass, &isnull);
@@ -1430,30 +1573,29 @@ build_scan_key(ScanKey skey, Relation rel, Relation idxrel, HeapTuple key)
  */
 static bool
 find_pkey_tuple(ScanKey skey, Relation rel, Relation idxrel,
-               TupleTableSlot *slot, bool lock)
+               TupleTableSlot *slot, bool lock, LockTupleMode mode)
 {
    HeapTuple   scantuple;
-   bool        found = false;
+   bool        found;
    IndexScanDesc scan;
-   Snapshot snap = GetActiveSnapshot();
-
-   /*
-    * XXX: should we use a different snapshot here to be able to get more
-    * information about concurrent activity? For now we use a snapshot
-    * isolation snapshot...
-    */
+   SnapshotData snap;
+   TransactionId xwait;
 
+   InitDirtySnapshot(snap);
    scan = index_beginscan(rel, idxrel,
-                          snap,
+                          &snap,
                           RelationGetNumberOfAttributes(idxrel),
                           0);
+
+retry:
+   found = false;
+
    index_rescan(scan, skey, RelationGetNumberOfAttributes(idxrel), NULL, 0);
 
-   while ((scantuple = index_getnext(scan, ForwardScanDirection)) != NULL)
+   if ((scantuple = index_getnext(scan, ForwardScanDirection)) != NULL)
    {
        HeapTuple ht;
-       if (found)
-           elog(ERROR, "WTF, more than one tuple found via pk???");
+
        found = true;
        /*
         * Ugly trick to track the HeapTupleData pointing into a buffer in the
@@ -1463,9 +1605,16 @@ find_pkey_tuple(ScanKey skey, Relation rel, Relation idxrel,
        memcpy(ht, scantuple, sizeof(HeapTupleData));
        ExecStoreTuple(ht, slot, scan->xs_cbuf, false);
        slot->tts_shouldFree = true;
-   }
 
-   index_endscan(scan);
+       xwait = TransactionIdIsValid(snap.xmin) ?
+           snap.xmin : snap.xmax;
+
+       if (TransactionIdIsValid(xwait))
+       {
+           XactLockTableWait(xwait);
+           goto retry;
+       }
+   }
 
    if (lock && found)
    {
@@ -1476,25 +1625,34 @@ find_pkey_tuple(ScanKey skey, Relation rel, Relation idxrel,
 
        ItemPointerCopy(&slot->tts_tuple->t_self, &locktup.t_self);
 
-       res = heap_lock_tuple(rel, &locktup, snap->curcid, LockTupleExclusive,
+       PushActiveSnapshot(GetLatestSnapshot());
+
+       res = heap_lock_tuple(rel, &locktup, GetCurrentCommandId(false), mode,
                              false /* wait */,
                              false /* don't follow updates */,
                              &buf, &hufd);
+       /* the tuple slot already has the buffer pinned */
+       ReleaseBuffer(buf);
+
+       PopActiveSnapshot();
+
        switch (res)
        {
            case HeapTupleMayBeUpdated:
                break;
            case HeapTupleUpdated:
                /* XXX: Improve handling here */
-               ereport(ERROR,
+               ereport(LOG,
                        (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
                         errmsg("concurrent update, retrying")));
+               goto retry;
            default:
                elog(ERROR, "unexpected HTSU_Result after locking: %u", res);
                break;
        }
-       ReleaseBuffer(buf);
    }
 
+   index_endscan(scan);
+
    return found;
 }