The attached patch makes O_DIRECT work on Linux in BerkeleyDB 4.5.20. (You will need to manually define LINUX_NEEDS_PAGE_ALIGNMENT if you're using a kernel older than 2.6.)
The main reason to use this patch is to conserve memory - ordinarily, all the I/O that BDB does to its files gets cached in the Linux filesystem buffer cache. This caching is redundant since BDB always does its own caching, and it effectively makes the BDB environment consume twice as much memory as it needs. Using O_DIRECT on I/Os disables the filesystem buffer cache for those I/Os, thus freeing up a sizable chunk of memory.
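For anyone who hasn't used it, O_DIRECT is just a flag passed to open(2). A minimal sketch of what it looks like on Linux - this is not the actual patch code, and the function name is made up for illustration:

    /* Sketch only: open a file for direct I/O.  With O_DIRECT the kernel
     * moves data straight between the application's buffer and the device,
     * bypassing the page cache entirely. */
    #define _GNU_SOURCE             /* O_DIRECT needs this on Linux/glibc */
    #include <fcntl.h>

    int open_direct(const char *path)
    {
        return open(path, O_RDWR | O_DIRECT);
    }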
The caching problem is particularly aggravated on Linux because the memory manager doesn't give program pages higher priority than cache pages. So when your system is tight on memory, the kernel will start swapping program data pages before it starts reclaiming buffer cache pages, and application performance plummets. (Possibly that indicates a kernel bug, or at least a misfeature.)
Note that you must configure BerkeleyDB with --enable-o_direct to enable the support, and you must add "set_flags DB_DIRECT_DB" to your DB_CONFIG to enable it in a particular environment.
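For reference, the DB_CONFIG line should be equivalent to calling DB_ENV->set_flags() on the environment handle before opening it. A rough sketch, assuming a transactional environment - the wrapper name, open flags, and error handling are only illustrative:

    #include <db.h>

    /* Sketch: create an environment with direct I/O enabled on database files. */
    int open_env_direct(DB_ENV **out, const char *home)
    {
        DB_ENV *dbenv;
        int ret;

        if ((ret = db_env_create(&dbenv, 0)) != 0)
            return ret;
        /* same effect as "set_flags DB_DIRECT_DB" in DB_CONFIG */
        dbenv->set_flags(dbenv, DB_DIRECT_DB, 1);
        ret = dbenv->open(dbenv, home,
            DB_CREATE | DB_INIT_MPOOL | DB_INIT_LOCK |
            DB_INIT_LOG | DB_INIT_TXN, 0);
        *out = dbenv;
        return ret;
    }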
With this patch, a slapd that occupies 6.8GB on a system with 8GB of RAM can run continuously without swapping, delivering a sustained 11,500 authentications per second. Without the patch, swapping starts when the process hits the 4.5GB mark (because over 3GB of buffer cache is in use), and performance drops to only *hundreds* of authentications per second.
Howard Chu wrote:
> The caching problem is particularly aggravated on Linux because the memory manager doesn't give program pages higher priority than cache pages. So when your system is tight on memory, the kernel will start swapping program data pages before it starts reclaiming buffer cache pages, and application performance plummets. (Possibly that indicates a kernel bug, or at least a misfeature.)
Thanks to Rik van Riel for enlightening me here about /proc/sys/vm/swappiness. The default setting on the system was 60 (range 0-100), but setting it down to 10 reduced the problem considerably. With a setting of 10, only 300MB of the slapd process got swapped out, and for the most part the swap daemons were idle after that. Total throughput is around 11,200 authentications per second. Not quite as fast as the Direct I/O case, but much, much better than before. Some time apparently is still lost to swapping - the swap in use decreases slowly, indicating that the swapped-out data pages are still needed. I suppose running with swappiness=0 would eliminate that; I'll try it after the current swappiness=10 test completes.
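For anyone who wants to repeat the experiment, changing the setting is a one-liner as root (it doesn't persist across reboots unless vm.swappiness is also set in /etc/sysctl.conf):

    echo 10 > /proc/sys/vm/swappiness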
(There's a downside to the O_DIRECT patch - it requires every buffer allocation that BDB makes to be overallocated by 512 or 4096 bytes, so that the buffer can be properly aligned. But it certainly yields the best performance overall.)
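Concretely, the alignment trick amounts to something like the sketch below - not the actual BDB code, and the function name is made up; posix_memalign(3) would accomplish the same thing where it's usable:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sketch: over-allocate by 'align' bytes (512 or 4096, a power of two)
     * and round the pointer up to the next boundary so O_DIRECT accepts it.
     * The caller must free(*raw_out), not the returned pointer. */
    void *alloc_aligned(size_t len, size_t align, void **raw_out)
    {
        void *raw = malloc(len + align);

        if (raw == NULL)
            return NULL;
        *raw_out = raw;
        return (void *)(((uintptr_t)raw + align - 1) & ~(uintptr_t)(align - 1));
    }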
Howard Chu wrote:
> With this patch, a slapd that occupies 6.8GB on a system with 8GB of RAM can run continuously without swapping, delivering a sustained 11,500 authentications per second. Without the patch, swapping starts when the process hits the 4.5GB mark (because over 3GB of buffer cache is in use), and performance drops to only *hundreds* of authentications per second.
This is interesting. Did you test performance under other workloads? The reason I ask is that every time I've tried O_DIRECT in the past, performance suffered (significantly) in the case where I/O is being done (I suspect due to reduced concurrency, because the application must block in cases where it wouldn't have when using OS buffering). Other database products that I keep track of (e.g. PostgreSQL) report similar findings.
David Boreham wrote:
>> With this patch, a slapd that occupies 6.8GB on a system with 8GB of RAM can run continuously without swapping, delivering a sustained 11,500 authentications per second. Without the patch, swapping starts when the process hits the 4.5GB mark (because over 3GB of buffer cache is in use), and performance drops to only *hundreds* of authentications per second.
> This is interesting. Did you test performance under other workloads? The reason I ask is that every time I've tried O_DIRECT in the past, performance suffered (significantly) in the case where I/O is being done (I suspect due to reduced concurrency, because the application must block in cases where it wouldn't have when using OS buffering). Other database products that I keep track of (e.g. PostgreSQL) report similar findings.
Testing with swappiness=0 actually did turn out faster, by a tiny margin. Peak throughput was 11,609 auths/second @ 160 client threads with swappiness=0, vs. 11,567/sec @ 140 client threads with O_DIRECT. Peak process size was also slightly smaller without O_DIRECT.
I think the difference is so small because the caches were already at a 99% hit rate; very few requests would actually need to do I/O. But in those cases where the data wasn't in the slapd or the BDB cache, it had a chance of being in the fs buffer cache, thus the higher throughput without O_DIRECT.
At this point I'm going to forget about the O_DIRECT patch.
Howard Chu wrote:
> I think the difference is so small because the caches were already at a 99% hit rate; very few requests would actually need to do I/O.
Right, that was my concern. When I tried this (on both Linux and NT), I saw performance in the non-100% hit rate case fall significantly.
David Boreham wrote:
> Howard Chu wrote:
>> I think the difference is so small because the caches were already at a 99% hit rate; very few requests would actually need to do I/O.
> Right, that was my concern. When I tried this (on both Linux and NT), I saw performance in the non-100% hit rate case fall significantly.
Hm... Something doesn't sound right here. If the buffer cache is actually mitigating that effect, that implies that you could allocate more space to the BDB cache and get the same benefit. (Indeed, looking at the system status at the end of the test, with 4.2GB in the buffer cache, that tells me that I should have raised our BDB cache to 4.2GB.) The other side of this is that using O_DIRECT eliminates the double-buffering that's occurring, so each I/O that actually needs to be done does one less memcpy.
But again, using O_DIRECT means every I/O is synchronous, so I guess that might wreak havoc on any queuing optimizations in the fs.
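For what it's worth, handing that memory to BDB instead is a one-line change: either "set_cachesize 4 209715200 1" in DB_CONFIG or a call like the sketch below before DB_ENV->open. The 4GB + 200MB figure and the wrapper name are just examples:

    #include <db.h>

    /* Sketch: request a ~4.2GB cache in one contiguous region (raise the last
     * argument to split it into multiple regions if contiguous allocation is
     * a problem).  Must be called before the environment is opened. */
    int set_big_cache(DB_ENV *dbenv)
    {
        return dbenv->set_cachesize(dbenv, 4, 200 * 1024 * 1024, 1);
    }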
Howard Chu wrote:
> But again, using O_DIRECT means every I/O is synchronous, so I guess that might wreak havoc on any queuing optimizations in the fs.
And readahead, if it's working for the application.
In my tests I ran a battery of workloads, not just operations that benefited from a warm, large cache. Some sped up, but others got much worse.