Full_Name: Jonathan Graehl Version: commit 8d346721a60684aeaa7b1e3b2111c972393bfad3 OS: linux URL: Submission from: (NULL) (75.85.99.117)
mdb_from_db Berkeley DB->LMDB import utility
See also https://github.com/openldap/openldap/pull/1
From 32f6c10570bf7ede64cdb734775e97ea2afe1011 Mon Sep 17 00:00:00 2001
From: graehl graehl@gmail.com Date: Sun, 24 Aug 2014 16:02:41 -0700 Subject: [PATCH] mdb_from_db Berkeley DB->LMDB import
--- libraries/liblmdb/Makefile | 3 +- libraries/liblmdb/mdb_from_db.1 | 104 ++++++++ libraries/liblmdb/mdb_from_db.c | 548 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 654 insertions(+), 1 deletion(-) create mode 100644 libraries/liblmdb/mdb_from_db.1 create mode 100644 libraries/liblmdb/mdb_from_db.c
diff --git a/libraries/liblmdb/Makefile b/libraries/liblmdb/Makefile index 25c1095..196ed08 100644 --- a/libraries/liblmdb/Makefile +++ b/libraries/liblmdb/Makefile @@ -29,7 +29,7 @@ prefix = /usr/local
IHDRS = lmdb.h ILIBS = liblmdb.a liblmdb.so -IPROGS = mdb_stat mdb_copy mdb_dump mdb_load +IPROGS = mdb_stat mdb_copy mdb_dump mdb_load mdb_from_db IDOCS = mdb_stat.1 mdb_copy.1 mdb_dump.1 mdb_load.1 PROGS = $(IPROGS) mtest mtest2 mtest3 mtest4 mtest5 all: $(ILIBS) $(PROGS) @@ -58,6 +58,7 @@ mdb_stat: mdb_stat.o liblmdb.a mdb_copy: mdb_copy.o liblmdb.a mdb_dump: mdb_dump.o liblmdb.a mdb_load: mdb_load.o liblmdb.a +mdb_from_db: mdb_from_db.o liblmdb.a mtest: mtest.o liblmdb.a mtest2: mtest2.o liblmdb.a mtest3: mtest3.o liblmdb.a diff --git a/libraries/liblmdb/mdb_from_db.1 b/libraries/liblmdb/mdb_from_db.1 new file mode 100644 index 0000000..dbd6797 --- /dev/null +++ b/libraries/liblmdb/mdb_from_db.1 @@ -0,0 +1,104 @@ +.TH MDB_FROM_DB 1 "2014/06/20" "LMDB 0.9.14" +.\" Copyright 2014 Howard Chu, Symas Corp. All Rights Reserved. +.\" Copying restrictions apply. See COPYRIGHT/LICENSE. +.SH NAME +mdb_from_db - LMDB environment translate from Berkeley DB environment tool +.SH SYNOPSIS +.B mdb_from_db +.BR \ berkeley.db +.BR \ envpath +[\c +.BR -V ] +[\c +.BR -n ] +[\c +.BI -s \ subdb\fR] +[\c +.BI -b \ batchsize\fR] +[\c +.BI -h \ berkeley-db-homedir\fR] +[\c +.BR -N ] +[\c +.BR -T ] +.SH DESCRIPTION +The +.B mdb_from_db +utility reads from a Berkeley DB environment +.BR berkeley.db +and imports all its subdatabases, or just the specified +.BR subdb +, into the +LMDB environment +.BR envpath . + +Additionally, +.B mdb_from_db +may write in the +.B -T +plain text format understood by +.BR mdb_load (1) +which can only understand a single subdatabase at a time. + +.SH OPTIONS +.TP +.BR -V +Write the library version number to the standard output, and exit. +.TP +.BR -n +Import into an LMDB database which does not use subdirectories. +.TP +.BR -s \ subdb +Import a specific subdatabase. If no database is specified, data is imported into the main database. +.TP +.BR -N +Don't overwrite existing records when importing into an already existing database; just skip them.
+.TP +.BR -b \ sz +Commit LMDB records +.B sz +at a time. +.TP +.BR -h \ db_homedir +Treat input db path as relative to this homedir (see the Berkeley DB docs). Default is '.' +.TP +.BR -B +Perform a nonblocking Berkeley DB open. +.TP +.BR -T +Write the key/data into a single simple text file (stderr messages +would allow segmenting the output into separate files for each +subdatabase). The output will be paired lines of text, where the first +line of the pair is the key item, and the second line of the pair is +its corresponding data item. If more than one database is read then +refer to the counts reported on stderr. + +A simple escape mechanism, where newline and backslash (\\) characters +are special, is applied to the text output. Newline characters are +interpreted as record separators. Backslash characters in the text +will be interpreted in one of two ways: If the backslash character +precedes another backslash character, the pair will be interpreted as +a literal backslash. If the backslash character precedes any other +character, the two characters following the backslash will be +interpreted as a hexadecimal specification of a single character; for +example, \\0a is a newline character in the ASCII character set. + +For this reason, any backslash or newline characters that naturally +occur in the text input must be escaped to avoid misinterpretation by +.BR mdb_load. + +.SH DIAGNOSTICS +Exit status is zero if no errors occur. +Errors result in a non-zero exit status and +a diagnostic message being written to standard error. + +Information about each subdatabase processed, and the total number of +records is also written to standard error.
+ +.SH "SEE ALSO" +.BR mdb_load (1) +.BR mdb_dump (1) +.BR db_dump (1) + +.SH AUTHOR +Jonathan Graehl graehl@gmail.com diff --git a/libraries/liblmdb/mdb_from_db.c b/libraries/liblmdb/mdb_from_db.c new file mode 100644 index 0000000..ee81db0 --- /dev/null +++ b/libraries/liblmdb/mdb_from_db.c @@ -0,0 +1,548 @@ +/* mdb_from_db.c - translate Berkeley DB to memory-mapped database(s) */ +/* + * Copyright 2014 Jonathan Graehl, 2011-2014 Howard Chu, Symas Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * http://www.OpenLDAP.org/license.html. + */ +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <string.h> +#include <ctype.h> +#include <unistd.h> +#include "lmdb.h" +#include <db.h> +#include <stdbool.h> +#include <sys/stat.h> + +static int datasize = 64*1024; +static int batchsize = 100; + +static char *subname = NULL; + +static char *prog; + +static MDB_val kbuf, dbuf; + +#ifdef _WIN32 +#define Z "I" +#else +#define Z "z" +#endif + + +char *usagestr = + "path.input.berkeley.db [path.output.mdb|T-txt] [-P dbpasswd] [-V] [-l] [-n] [-s subdbname] [-N] [-B] [-T] [-v] [-h homedirpath] [-b write-batchsize] [-f redirect_stdout.txt]\n" + " (-T prints to stdout key/val in mdb_load format)\n" + " (-l: only list (to stdout) database names, -N: don't overwrite existing keys; -B nonblocking db open; -n: create single mdb file instead of dir)\n" + ; + +/** + fail() and shutdown(): everything that might need to be cleaned up on error->exit + (conceptually some of these are locals, but having them global lets us call + fail() from anywhere) +*/ +static MDB_env *env; +static MDB_txn *txn; +static MDB_dbi dbi; + +static DB *dbp; +static DBC *dbcp; +static DB *parent_dbp; +static DBC
*bdb_subdbcursor; +static char *subdbname; +static DB_TXN *dbtxn; + +void bdb_close() { + if (dbcp) + dbcp->close(dbcp); + dbcp = 0; + if (dbp) + dbp->close(dbp, 0); + dbp = 0; +} +void shutdown() { + if (bdb_subdbcursor) + bdb_subdbcursor->close(bdb_subdbcursor); + bdb_subdbcursor = 0; + bdb_close(); + if (parent_dbp) + parent_dbp->close(parent_dbp, 0); + parent_dbp = 0; + if (txn) + mdb_txn_abort(txn); + txn = 0; + if (dbi) + mdb_dbi_close(env, dbi); + if (env) + mdb_env_close(env); + env = 0; + if (subdbname) + free(subdbname); + subdbname = 0; +} + +void fail() { + shutdown(); + exit(EXIT_FAILURE); +} + +static void usage(void) +{ + fprintf(stderr, "usage: %s %s", prog, usagestr); + fail(); +} + +/** + BDB env +*/ +static char *dbhome; +static DB_ENV *dbenv; +static u_int32_t dbcache = 1024*1024; +static char *dbpasswd; +static bool dbnonblocking; +void strfill(char *str, char fill) { + while(*str) + *str++ = fill; +} +void bdb_err(char *fn, int rc) { + fprintf(stderr, "%s: ", prog); + if (dbenv) + dbenv->err(dbenv, rc, fn); + else + fprintf(stderr, "%s\n", db_strerror(rc)); + fail(); +} + +DBT dbkey, dbdata; +void bdb_init_dbenv() { + int rc; + if ((rc = db_env_create(&dbenv, 0)) != 0) + bdb_err("db_env_create", rc); + dbenv->set_errfile(dbenv, stderr); + dbenv->set_errpfx(dbenv, prog); + if (dbpasswd != NULL) { + rc = dbenv->set_encrypt(dbenv, dbpasswd, DB_ENCRYPT_AES); + strfill(dbpasswd, '\0'); + if (rc) + bdb_err("dbenv::set_encrypt", rc); + } + if (dbnonblocking) { + if ((rc = dbenv->set_flags(dbenv, DB_NOLOCKING, 1))) + bdb_err("DB_NOLOCKING", rc); + if ((rc = dbenv->set_flags(dbenv, DB_NOPANIC, 1))) + bdb_err("DB_NOPANIC", rc); + } + if ((rc = dbenv->set_cachesize(dbenv, 0, dbcache, 1))) + bdb_err("dbenv::set_cachesize", rc); + if ((rc = dbenv->open(dbenv, dbhome, + DB_CREATE | DB_INIT_MPOOL | DB_PRIVATE | DB_USE_ENVIRON, 0))) + bdb_err("dbenv::open", rc); + dbdata.flags = DB_DBT_USERMEM; + if (!(dbdata.data = malloc(dbdata.ulen = dbcache))) + fail();
+} + +/** + BDB db. +*/ +static char *dbfilename; +static DBT keyret, dataret; +static bool bdb_is_recno; +static db_recno_t bdb_recno; +static DB_HEAP_RID bdb_heaprid; +static int bdb_get_flags; +static void *pointer_get; + +void bdb_open(char *dbname) { + int rc; + bdb_close(); + if ((rc = db_create(&dbp, dbenv, 0))) + bdb_err("db_create", rc); + if ((rc = dbp->open(dbp, dbtxn, dbfilename, dbname, + DB_UNKNOWN, (parent_dbp ? 0 : DB_RDWRMASTER)|DB_RDONLY, 0))) { + fprintf(stderr, "db open %s : %s\n", dbfilename, dbname); + bdb_err(dbfilename, rc); + } +} + +void bdb_start_chunks() { + int rc; + bdb_get_flags = DB_NEXT | DB_MULTIPLE_KEY; + if ((bdb_is_recno = (dbp->type == DB_RECNO || dbp->type == DB_QUEUE))) + keyret.size = sizeof(*(keyret.data = &bdb_recno)); + else if (dbp->type == DB_HEAP) { + bdb_get_flags = DB_NEXT; + dbkey.flags = DB_DBT_USERMEM; + dbkey.size = dbkey.ulen = sizeof(*(dbkey.data = &bdb_heaprid)); + } + if ((rc = dbp->cursor(dbp, NULL, &dbcp, 0))) + bdb_err("cursor", rc); +} + +unsigned align(unsigned req, unsigned granule) { + return ((req + granule - 1) / granule) * granule; +} + +/** + \return true if there's another chunk of records. +*/ +bool bdb_read_chunk() { + int rc; + if ((rc = dbcp->get(dbcp, &dbkey, &dbdata, bdb_get_flags))) { + if (rc == DB_NOTFOUND) + return false; + if (rc == DB_BUFFER_SMALL) { + dbdata.ulen = dbdata.size = align(dbdata.size, 4096); + if (!(dbdata.data = realloc(dbdata.data, dbdata.size))) + fail(); + rc = dbcp->get(dbcp, &dbkey, &dbdata, bdb_get_flags); + } + if (rc) + bdb_err("get chunk", rc); + } + DB_MULTIPLE_INIT(pointer_get, &dbdata); + return true; +} + +/** + \return true if there was another record; sets keyret and dataret.
+*/ +bool bdb_next_record_in_chunk() { + if (bdb_is_recno) + DB_MULTIPLE_RECNO_NEXT(pointer_get, &dbdata, + bdb_recno, dataret.data, dataret.size); + else + DB_MULTIPLE_KEY_NEXT(pointer_get, &dbdata, + keyret.data, keyret.size, + dataret.data, dataret.size); + return dataret.data; +} + +static char hexc_[] = "01234567890ABCDEF"; + +char hexc(unsigned char i) { + return hexc_[i]; +} + +void putchar_T(unsigned char c) { + if (c >= 32 && c < 127 && c != '\\') { + putchar(c); + } else { + putchar('\\'); + putchar(hexc(c >> 4)); + putchar(hexc(c & 0xf)); + } +} + +/** + TODO: could fwrite chunks of no-escape-needed bytes, or probably faster, + encode in memory then write once +*/ +void print_T(char *data, unsigned len) { + unsigned i = 0; + for (; i < len; ++i) + putchar_T(data[i]); +} + +/** + Paired lines of text, where the first line of the pair is the key item, and the + second line of the pair is its corresponding data item. + + A simple escape mechanism, where newline and backslash (\) characters are special, is + applied to the text input. Newline characters are interpreted as record separators. + Backslash characters in the text will be interpreted in one of two ways: If the backslash + character precedes another backslash character, the pair will be interpreted as a literal + backslash. If the backslash character precedes any other character, the two characters + following the backslash will be interpreted as a hexadecimal specification of a single + character; for example, \0a is a newline character in the ASCII character set.
+ + For this reason, any backslash or newline characters that naturally occur in the text + input must be escaped to avoid misinterpretation by +*/ +void print_record_T() { + print_T(keyret.data, keyret.size); + putchar('\n'); + print_T(dataret.data, dataret.size); + putchar('\n'); +} + + +char *bdb_open_subdb(DBT key) { + if (!(subdbname = malloc(key.size + 1))) + fail(); + memcpy(subdbname, key.data, key.size); + subdbname[key.size] = '\0'; + bdb_open(subdbname); + return subdbname; +} + +bool isdir(char *path) { + struct stat s; + if (stat(path, &s)) { + perror("path"); + fail(); + } + return S_ISDIR(s.st_mode); +} + +void mkdir_if_needed(char *path) { + if (mkdir(path, 0755)) + if (errno != EEXIST) { + perror(path); + fail(); + } + if (!isdir(path)) { + fprintf(stderr, "%s is not a directory and can't mkdir it. try with -n for no-subdir (to store as a file)", path); + fail(); + } +} + +int main(int argc, char *argv[]) +{ + int i, rc; + MDB_cursor *mc; + int envflags = 0, putflags = 0; + int textflag = false; + bool havemultiple; + + prog = argv[0]; + + if (argc < 2) { + usage(); + } + + /* -n: use NOSUBDIR flag on env_open + * -S do not use NOSUBDIR + * -s subDB: translate just named subDB (default: all) + * -N: use NOOVERWRITE on puts + * -V: print version and exit + * -T: print -s database in format suitable for mdb_load -T (then output not required) + * -b N: batch size=N (default 100) + * -f stdout_file: write stdout here instead + + * db_dump-like options: + * '-h dir: ('home' dir for relative db filenames default .) 
+ * -B: nonblocking db open + */ + bool subdir = true; + bool nodup = true; + bool listdbs = false; + + while ((i = getopt(argc, argv, "P:h:s:b:lnvVTNS")) != EOF) { + switch(i) { + case 'b': + i = sscanf(optarg, "%d", &batchsize); + if (i != 1) { + fprintf(stderr, "ERROR: -b '%s' was not int\n", optarg); + usage(); + } + break; + case 'f': + if (freopen(optarg, "w", stdout) == NULL) { + fprintf(stderr, "%s: %s: reopen: %s\n", + prog, optarg, strerror(errno)); + exit(EXIT_FAILURE); + } + break; + case 'V': + printf("%s\n", MDB_VERSION_STRING); + printf("%s\n", db_version(NULL, NULL, NULL)); + exit(0); + break; + case 'S': + subdir = true; + break; + case 'n': + subdir = false; + break; + case 's': + subname = strdup(optarg); + break; + case 'N': + nodup = true; + putflags = MDB_NOOVERWRITE|MDB_NODUPDATA; + break; + case 'B': + dbnonblocking = true; + break; + case 'T': + textflag = true; + break; + case 'h': + dbhome = optarg; + break; + case 'P': + /** + we XXX password immediately on init, to hide from top etc. but would + be better to get from stdin (XXX earlier would still be insecure) + */ + dbpasswd = optarg; + break; + case 'l': + listdbs = true; + break; + case '?': + default: + usage(); + } + } + + if (!subdir) + envflags |= MDB_NOSUBDIR; + bool haveout = optind == argc - 2; + if (optind >= argc) + usage(); + dbfilename = argv[optind++]; + char *mdboutpath = haveout ? argv[optind++] : NULL; + if (mdboutpath) { + if (subdir) + mkdir_if_needed(mdboutpath); + } + if (listdbs) { + if (textflag) + fprintf(stderr, "disabling -T (print key/val lines) because -l (list dbs) was specified\n"); + textflag = false; + } + + /** + args parsed.
+ + init BDB: + */ + bdb_init_dbenv(); + bdb_open(subname); + + /** + init MDB: + */ +#undef MDB_OK +#define MDB_OK(call) \ + if (rc) { \ + fprintf(stderr, #call " failed - error %d %s\n", rc, mdb_strerror(rc)); \ + goto shutdown; \ + } else {} + + if (mdboutpath) { + rc = mdb_env_create(&env); + MDB_OK(mdb_env_create); + + rc = mdb_env_set_maxdbs(env, 2); + MDB_OK(mdb_env_set_maxdbs); + + rc = mdb_env_open(env, mdboutpath, envflags, 0664); + MDB_OK(mdb_env_open); + + kbuf.mv_size = mdb_env_get_maxkeysize(env) * 2 + 2; + kbuf.mv_data = malloc(kbuf.mv_size); + + dbuf.mv_size = datasize; + dbuf.mv_data = malloc(dbuf.mv_size); + } + + havemultiple = !subname && dbp->get_multiple(dbp); + if (havemultiple) { + parent_dbp = dbp; + dbp = 0; + if ((rc = parent_dbp->cursor(parent_dbp, NULL, &bdb_subdbcursor, 0))) + bdb_err("cursor(sub-dbs)", rc); + } + + unsigned long long wnrecords, wnrecordsall = 0; + unsigned long long nrecords, nrecordsall = 0; + unsigned ndbs = 0; + bool const reading = textflag || mdboutpath; + for (;;) { + MDB_val key, data; + int batch = 0; + if (havemultiple) { + if ((rc = bdb_subdbcursor->get(bdb_subdbcursor, &dbkey, &dbdata, DB_NEXT | DB_IGNORE_LEASE))) { + if (rc != DB_NOTFOUND) + bdb_err("get-next-subdb", rc); + else + rc = 0; + break; + } + subname = bdb_open_subdb(dbkey); + } + + ++ndbs; + nrecords = 0; + wnrecords = 0; + if (subname) { + if (listdbs) + printf("%s\n", subname); + if (reading) + fprintf(stderr, "reading DB %s ... ", subname); + } else { + listdbs = false; + fprintf(stderr, "reading unnamed DB ... 
"); + } + + if (mdboutpath) { + rc = mdb_txn_begin(env, NULL, 0, &txn); + MDB_OK(mdb_txn_begin); + rc = mdb_open(txn, subname, MDB_CREATE, &dbi); + MDB_OK(mdb_open); + rc = mdb_cursor_open(txn, dbi, &mc); + MDB_OK(mdb_cursor_open); + } + + if (reading) { + bdb_start_chunks(); + while (bdb_read_chunk()) { + while (bdb_next_record_in_chunk()) { + ++nrecords; + if (textflag) + print_record_T(); + if (mdboutpath) { + key.mv_data = keyret.data; + key.mv_size = keyret.size; + data.mv_data = dataret.data; + data.mv_size = dataret.size; + rc = mdb_cursor_put(mc, &key, &data, putflags); + if (rc == MDB_KEYEXIST && nodup) + continue; + ++wnrecords; + MDB_OK(mdb_cursor_put); + if (++batch == batchsize) { + batch = 0; + rc = mdb_txn_commit(txn); + MDB_OK(mdb_txn_commit); + rc = mdb_txn_begin(env, NULL, 0, &txn); + MDB_OK(mdb_txn_begin); + rc = mdb_cursor_open(txn, dbi, &mc); + MDB_OK(mdb_cursor_open); + } + } + } + } + if (mdboutpath) { + rc = mdb_txn_commit(txn); + txn = 0; + MDB_OK(mdb_txn_commit); + mdb_dbi_close(env, dbi); + dbi = 0; + } + nrecordsall += nrecords; + wnrecordsall += wnrecords; + fprintf(stderr, "%llu records (stored %llu).\n", nrecords, wnrecords); + } + if (!havemultiple) + break; + } + fprintf(stderr, "found %u Berkeley DB(s) in input file %s - read %llu records", ndbs, dbfilename, nrecordsall); + if (mdboutpath) + fprintf(stderr, " (stored %llu to MDB %s).\n", wnrecordsall, mdboutpath); + fprintf(stderr, "\n"); +shutdown: + shutdown(); + return rc ? EXIT_FAILURE : EXIT_SUCCESS; +}