This is a multi-part message in MIME format.
--------------040901020606020603090609
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit
quanah@OpenLDAP.org wrote:
Full_Name: Quanah Gibson-Mount
Version: 2.4.33
OS: Linux 2.6
URL: ftp://ftp.openldap.org/incoming/
Submission from: (NULL) (74.196.25.250)
I have a very small DB (about 25MB from a fresh slapadd). However, the data.mdb
file grows by about 50MB a day, i.e., every day it gains roughly twice its
freshly loaded size on disk. It is now up to 571MB in size.
Here is the DB after a fresh slapadd:
zimbra@zre-ldap002:~/data/ldap/mdb/db$ du -c -h data.mdb
25M data.mdb
Here is the DB on the production server:
[zimbra@ldap01-zcs db]$ du -c -h data.mdb
571M data.mdb
Based on the mdb_stat output you pasted, this is simply a case of overflow-page
allocations not reusing freelist pages: multi-page allocations always take
fresh pages. The significant data here are the freelist statistics and the
number of overflow pages used in the id2e database.
Here's the patch we're currently testing for this issue.
It appears to work, but it may not be aggressive enough in reclaiming space.
We may want to increase the number of retries (the patch starts with 2).
--
-- Howard Chu
CTO, Symas Corp.
http://www.symas.com
Director, Highland Sun
http://highlandsun.com/hyc/
Chief Architect, OpenLDAP
http://www.openldap.org/project/
--------------040901020606020603090609
Content-Type: text/plain; charset=UTF-8;
name="diff.txt"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
filename="diff.txt"
diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c
index 251ab6a..117b402 100644
--- a/libraries/libmdb/mdb.c
+++ b/libraries/libmdb/mdb.c
@@ -1242,6 +1242,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
MDB_page *np;
pgno_t pgno = P_INVALID;
MDB_ID2 mid;
+ txnid_t oldest = 0, last;
int rc;
*mp = NULL;
@@ -1254,12 +1255,11 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
if (!txn->mt_env->me_pghead &&
txn->mt_dbs[FREE_DBI].md_root != P_INVALID) {
/* See if there's anything in the free DB */
- int j;
MDB_reader *r;
MDB_cursor m2;
MDB_node *leaf;
MDB_val data;
- txnid_t *kptr, last;
+ txnid_t *kptr;
mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
if (!txn->mt_env->me_pgfirst) {
@@ -1282,15 +1282,21 @@ again:
last = *(txnid_t *)key.mv_data;
}
- /* Unusable if referred by a meta page or reader... */
- j = 1;
- if (last < txn->mt_txnid-1) {
- j = txn->mt_env->me_txns->mti_numreaders;
- r = txn->mt_env->me_txns->mti_readers + j;
- for (j = -j; j && (last<r[j].mr_txnid || !r[j].mr_pid); j++) ;
+ {
+ unsigned int i, nr;
+ txnid_t mr;
+ oldest = txn->mt_txnid - 1;
+ nr = txn->mt_env->me_txns->mti_numreaders;
+ r = txn->mt_env->me_txns->mti_readers;
+ for (i=0; i<nr; i++) {
+ if (!r[i].mr_pid) continue;
+ mr = r[i].mr_txnid;
+ if (mr < oldest)
+ oldest = mr;
+ }
}
- if (!j) {
+ if (oldest > last) {
/* It's usable, grab it.
*/
MDB_oldpages *mop;
@@ -1331,29 +1337,108 @@ none:
if (txn->mt_env->me_pghead) {
MDB_oldpages *mop = txn->mt_env->me_pghead;
if (num > 1) {
- /* FIXME: For now, always use fresh pages. We
- * really ought to search the free list for a
- * contiguous range.
- */
- ;
+ MDB_cursor m2;
+ int retry = 2, readit = 0, n2 = num-1;
+ unsigned int i, j, k;
+
+ /* If current list is too short, must fetch more and coalesce */
+ if (mop->mo_pages[0] < (unsigned)num)
+ readit = 1;
+
+ mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
+ do {
+ if (readit) {
+ MDB_val key, data;
+ MDB_oldpages *mop2;
+ pgno_t *idl;
+ int exact;
+
+ last = mop->mo_txnid + 1;
+
+ /* We haven't hit the readers list yet? */
+ if (!oldest) {
+ MDB_reader *r;
+ unsigned int nr;
+ txnid_t mr;
+
+ oldest = txn->mt_txnid - 1;
+ nr = txn->mt_env->me_txns->mti_numreaders;
+ r = txn->mt_env->me_txns->mti_readers;
+ for (i=0; i<nr; i++) {
+ if (!r[i].mr_pid) continue;
+ mr = r[i].mr_txnid;
+ if (mr < oldest)
+ oldest = mr;
+ }
+ }
+
+ /* There's nothing we can use on the freelist */
+ if (oldest - last < 1)
+ break;
+
+ exact = 0;
+ key.mv_data = &last;
+ key.mv_size = sizeof(last);
+ rc = mdb_cursor_set(&m2, &key, &data, MDB_SET, &exact);
+ if (rc)
+ return rc;
+ idl = (MDB_ID *) data.mv_data;
+ mop2 = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - 2*sizeof(pgno_t) + MDB_IDL_SIZEOF(mop->mo_pages));
+ if (!mop2)
+ return ENOMEM;
+ /* merge in sorted order */
+ i = idl[0]; j = mop->mo_pages[0]; mop2->mo_pages[0] = k = i+j;
+ mop->mo_pages[0] = P_INVALID;
+ while (i>0 || j>0) {
+ if (i && idl[i] < mop->mo_pages[j])
+ mop2->mo_pages[k--] = idl[i--];
+ else
+ mop2->mo_pages[k--] = mop->mo_pages[j--];
+ }
+ txn->mt_env->me_pglast = last;
+ mop2->mo_txnid = last;
+ mop2->mo_next = mop->mo_next;
+ txn->mt_env->me_pghead = mop2;
+ free(mop);
+ mop = mop2;
+ /* Keep trying to read until we have enough */
+ if (mop->mo_pages[0] < (unsigned)num) {
+ continue;
+ }
+ }
+
+ /* current list has enough pages, but are they contiguous? */
+ for (i=mop->mo_pages[0]; i>=(unsigned)num; i--) {
+ if (mop->mo_pages[i-n2] == mop->mo_pages[i] + n2) {
+ pgno = mop->mo_pages[i];
+ i -= n2;
+ /* move any stragglers down */
+ for (j=i+num; j<=mop->mo_pages[0]; j++)
+ mop->mo_pages[i++] = mop->mo_pages[j];
+ mop->mo_pages[0] -= num;
+ break;
+ }
+ }
+
+ /* Stop if we succeeded, or no more retries */
+ if (!retry || pgno != P_INVALID)
+ break;
+ readit = 1;
+ retry--;
+
+ } while (1);
} else {
/* peel pages off tail, so we only have to truncate the list */
pgno = MDB_IDL_LAST(mop->mo_pages);
- if (MDB_IDL_IS_RANGE(mop->mo_pages)) {
- mop->mo_pages[2]++;
- if (mop->mo_pages[2] > mop->mo_pages[1])
- mop->mo_pages[0] = 0;
+ mop->mo_pages[0]--;
+ }
+ if (MDB_IDL_IS_ZERO(mop->mo_pages)) {
+ txn->mt_env->me_pghead = mop->mo_next;
+ if (mc->mc_dbi == FREE_DBI) {
+ mop->mo_next = txn->mt_env->me_pgfree;
+ txn->mt_env->me_pgfree = mop;
} else {
- mop->mo_pages[0]--;
- }
- if (MDB_IDL_IS_ZERO(mop->mo_pages)) {
- txn->mt_env->me_pghead = mop->mo_next;
- if (mc->mc_dbi == FREE_DBI) {
- mop->mo_next = txn->mt_env->me_pgfree;
- txn->mt_env->me_pgfree = mop;
- } else {
- free(mop);
- }
+ free(mop);
}
}
}
--------------040901020606020603090609--