Full_Name: Raymond S Brand
Version: mdb.master
OS: linux
URL: ftp://ftp.openldap.org/incoming/
Submission from: (NULL) (107.145.137.13)
[I would have uploaded this to ftp.openldap.org but there was no space.]
As implemented, the mdb readers/writer exclusion protocol has a race condition that could result in a writer reclaiming and overwriting pages still in use by a reader.
Below is pseudocode of the snapshot locking protocol for reader and writer transactions. For the purposes of this analysis, the labeled sections can be assumed to execute atomically.
READER
======

*R1*    t0 = meta[0].txid; t1 = meta[1].txid
        if (t1 > t0) t0 = t1
*R2* readers[me].txid = t0
*R3* snapshot_root = meta[t0&1].snapshot
*R4* /* lookup data */
*R5* readers[me].txid = -1 // Release snapshot
WRITER
======

*W1*    lock(writer, EXCLUSIVE)
        curr_txid = meta[0].txid
        if (meta[1].txid > curr_txid) curr_txid = meta[1].txid

*W2*    oldest = curr_txid
        for (i=0; i<reader_slots; i++)
            t = readers[i].txid
            if (t != -1 && t < oldest) oldest = t
*W3* reclaim_old_pages(oldest)
/*
** Commit new transaction
*/
*W4* unlock(writer)
---------------------------------------------------------------------------
Adversarial scheduling analysis:
The following timeline demonstrates that a writer can reclaim pages in use by a reader.
T0   T1   T2   T3   T4   T5   T6   T7   T8   T9   T10
R1                                 R2   R3        R4
     W1   W2   W3   W4 * W1   W2             W3
T0 Reader has found the latest txid, Xn.
T1->T4 Reader is not scheduled to run and a write transaction commits, Xn+1.
T5 Reader is still not scheduled and another write transaction starts.
T6 Writer finds that the oldest referenced transaction is the last committed transaction, Xn+1.
T7 Reader records Xn in the reader table.
T8 Reader gets the snapshot root page for transaction Xn.
T9 Writer, believing that Xn+1 is the oldest referenced transaction, reclaims the pages of transaction Xn.
T10 Reader attempts to navigate pages of a transaction, Xn, that has been reclaimed and "Bad Things Happen".
In fact, bad things will happen whenever an even number (including zero) of additional write transactions commit between W4 and W1, at the point represented by the '*' in the timeline above. Commits alternate between the two meta slots, so in that case the slot the reader consults at R3, meta[t0&1], holds a snapshot older than the newest committed transaction, and a writer that scanned the reader table before the reader reached R2 is free to reclaim its pages.
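To make the schedule above concrete, here is a minimal, self-contained C sketch. This is toy code, not LMDB source; meta[], readers[] and the snapshot field are simplified stand-ins for the structures in mdb.c. It replays the T0..T10 interleaving sequentially and shows the writer computing an "oldest" txid newer than the snapshot the reader ends up navigating:

    #include <stdio.h>

    typedef long txnid_t;

    static struct { txnid_t txid; txnid_t snapshot; } meta[2];
    static txnid_t readers[1] = { -1 };       /* one reader slot; -1 = unused */

    static txnid_t latest(void)               /* R1 / W1: newest committed txid */
    {
        return meta[0].txid > meta[1].txid ? meta[0].txid : meta[1].txid;
    }

    static void commit(txnid_t t)             /* writer commit of transaction t */
    {
        meta[t & 1].txid = t;
        meta[t & 1].snapshot = t;             /* "snapshot root" is just t here */
    }

    int main(void)
    {
        txnid_t Xn = 10;
        commit(Xn - 1); commit(Xn);           /* initial state: latest txid is Xn */

        txnid_t t0 = latest();                /* T0:    reader runs R1, t0 == Xn */
        commit(Xn + 1);                       /* T1-T4: a writer commits Xn+1 */
        txnid_t oldest = latest();            /* T5-T6: next writer runs W1+W2;
                                                 reader slot is still -1, so
                                                 oldest == Xn+1 */
        readers[0] = t0;                      /* T7:    reader finally runs R2 */
        txnid_t root = meta[t0 & 1].snapshot; /* T8:    R3 returns the root of Xn */

        /* T9: the writer reclaims pages not referenced by any txid >= oldest,
           including pages of Xn that the reader navigates at T10. */
        printf("reader navigates snapshot of txn %ld, writer's oldest is txn %ld\n",
               root, oldest);
        return 0;
    }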
The fix is to repeat the highest-txid search of R1 between R2 and R3: after publishing its txid in the reader table, the reader re-reads the meta pages and, if a newer txid has appeared, updates the published value and checks again. Optionally, the reader can then record the txid of the snapshot root actually found in R3 in its reader table slot, which may increase the number of transactions the writer is able to reclaim.
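One way the repaired reader registration could look, reusing the toy meta[]/readers[]/latest() helpers from the sketch above (again illustrative only, not a patch against mdb.c): the reader publishes a candidate txid, then re-reads the meta pages and retries until the published value still matches the newest committed txid.

    static txnid_t reader_begin(int me)       /* me = this reader's slot index */
    {
        txnid_t t0 = latest();                /* R1: candidate txid */
        for (;;) {
            readers[me] = t0;                 /* R2: publish before using the snapshot */
            /* a compiler/memory barrier belongs here in a real implementation */
            txnid_t t1 = latest();            /* repeat the R1 search after publishing */
            if (t1 == t0)
                break;                        /* snapshot t0 is now protected from reclaim */
            t0 = t1;                          /* a writer slipped in between R1 and R2; retry */
        }
        /* optionally re-record the txid of the snapshot root actually read at R3 */
        return t0;                            /* R3 may now safely read meta[t0 & 1] */
    }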
The lack of compiler and memory barriers in the implementation of the locking protocol is also of concern.
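For illustration of the barrier point flagged in the sketch above, and assuming a GCC/Clang-style toolchain, a full barrier between the reader-slot store and the meta re-read could look like the following; whether volatile plus such a fence suffices on every platform LMDB supports is exactly the concern being raised.

    static txnid_t publish_and_recheck(int me, txnid_t t0)
    {
        readers[me] = t0;        /* R2: make the claim visible to the writer's W2 scan */
        __sync_synchronize();    /* GCC builtin: full compiler + CPU barrier */
        return latest();         /* only after the barrier re-read the meta txids */
    }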
Beyond the above, the code in mdb_txn_renew0() after the "/* Copy the DB info and flags */" comment appears to have a number of data races.
---
 libraries/liblmdb/mdb.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)
diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c
index e0f551e..908417c 100644
--- a/libraries/liblmdb/mdb.c
+++ b/libraries/liblmdb/mdb.c
@@ -533,11 +533,11 @@ typedef struct MDB_rxbody {
 	 * started from so we can avoid overwriting any data used in that
 	 * particular version.
 	 */
-	txnid_t		mrb_txnid;
+	volatile txnid_t	mrb_txnid;
 	/** The process ID of the process owning this reader txn. */
-	MDB_PID_T	mrb_pid;
+	volatile MDB_PID_T	mrb_pid;
 	/** The thread ID of the thread owning this txn. */
-	pthread_t	mrb_tid;
+	volatile pthread_t	mrb_tid;
 } MDB_rxbody;
 
 /** The actual reader record, with cacheline padding. */
@@ -585,12 +585,12 @@ typedef struct MDB_txbody {
 	 * This is recorded here only for convenience; the value can always
 	 * be determined by reading the main database meta pages.
 	 */
-	txnid_t		mtb_txnid;
+	volatile txnid_t	mtb_txnid;
 	/** The number of slots that have been used in the reader table.
 	 * This always records the maximum count, it is not decremented
 	 * when readers release their slots.
 	 */
-	unsigned	mtb_numreaders;
+	volatile unsigned	mtb_numreaders;
 } MDB_txbody;
 
 /** The actual reader table definition. */
@@ -854,7 +854,7 @@ typedef struct MDB_meta {
 	/** Any persistent environment flags. @ref mdb_env */
 #define	mm_flags	mm_dbs[0].md_flags
 	pgno_t		mm_last_pg;	/**< last used page in file */
-	txnid_t		mm_txnid;	/**< txnid that committed this page */
+	volatile txnid_t	mm_txnid;	/**< txnid that committed this page */
 } MDB_meta;
 
 /** Buffer for a stack-allocated meta page.
@@ -2303,10 +2303,12 @@ mdb_txn_renew0(MDB_txn *txn)
 
 	if (txn->mt_flags & MDB_TXN_RDONLY) {
 		if (!ti) {
+			/* No readers table; app responsible for locking */
 			meta = env->me_metas[ mdb_env_pick_meta(env) ];
 			txn->mt_txnid = meta->mm_txnid;
 			txn->mt_u.reader = NULL;
 		} else {
+			/* Has readers table */
 			MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
 				pthread_getspecific(env->me_txkey);
 			if (r) {
@@ -2347,8 +2349,12 @@ mdb_txn_renew0(MDB_txn *txn)
 					return rc;
 				}
 			}
-			txn->mt_txnid = r->mr_txnid = ti->mti_txnid;
 			txn->mt_u.reader = r;
+			r->mr_txnid = ti->mti_txnid;
+
+			/* Really need a memory barrier here */
+
+			txn->mt_txnid = r->mr_txnid = ti->mti_txnid;
 			meta = env->me_metas[txn->mt_txnid & 1];
 		}
 	} else {
@@ -2376,10 +2382,10 @@ mdb_txn_renew0(MDB_txn *txn)
 	}
 
 	/* Copy the DB info and flags */
-	memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));
+	memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));	/* Racy */
 
 	/* Moved to here to avoid a data race in read TXNs */
-	txn->mt_next_pgno = meta->mm_last_pg+1;
+	txn->mt_next_pgno = meta->mm_last_pg+1;	/* Racy */
 
 	for (i=2; i<txn->mt_numdbs; i++) {
 		x = env->me_dbflags[i];