I've been testing a 4-way multi-master setup using OpenLDAP 2.4.25, and I'm having some sporadic problems with it that I'm having difficulty diagnosing.
I have four identical RHEL 4.9 machines on the same switch (NTP-synchronized to the same stratum 2 servers):
dual-core Xeon 5110 1.60GHz
8GB RAM
100Mb full-duplex NIC
OpenLDAP 2.4.25, BDB 4.8.30, OpenSSL 1.0.0d, Cyrus SASL 2.1.23 (using no tls/ssl at this time)
I start the slapds with '-d conns,sync' and then commence the test. I ldapadd 1000 DNs to one of the servers. After all the syncing has stopped, I compare the slapd contents against each other, looking for differences. Occasionally there are as many as a couple hundred DNs missing from one or more of the instances. When that happens, I've noticed that the mmaster with fewer DNs has lost its consumer connection to a mmaster provider (confirmed using lsof and netstat) and will never attempt a reconnect, but the provider still shows the connection (using lsof and netstat). When the consumer gets into this state, I can connect to its cn=config and cn=monitor backends (and browse them), but when I try to connect to its multi-mastered backend the connection attempt just hangs. It's almost as if the connect succeeds but the client is waiting for a response from the server (and never gets it). Also, the consumer slapd will not respond to a 'kill -TERM' at this time and must be 'kill -KILL'ed. The same thing sometimes occurs when I delete the entire tree.
I've been trying to catch logging information that might help, but so far nothing's jumping out at me. While I continue to try to reproduce the problem and parse through the logfiles, maybe someone can look at my slapd.confs below and see if I might have configured something wrong (I'm listing the original slapd.conf files, but I've used slaptest to convert them to slapd.d/cn=config.ldif format):
HOST1 slapd.conf:
include /tmp/openldap/multi-master/etc/schema/core.schema
include /tmp/openldap/multi-master/etc/schema/cosine.schema
include /tmp/openldap/multi-master/etc/schema/nis.schema
argsfile /tmp/openldap/multi-master/var/run/slapd.args
pidfile /tmp/openldap/multi-master/var/run/slapd.pid
threads 16
idletimeout 0
writetimeout 5
reverse-lookup off
timelimit time.soft=30 time.hard=300
sizelimit size.soft=500 size.hard=1000
password-hash {SSHA}
loglevel stats sync
serverid 001
modulepath /tmp/openldap/multi-master/libexec
moduleload back_monitor.la
moduleload back_hdb.la
moduleload syncprov.la
database config
rootdn cn=manager,cn=config
rootpw {SSHA}yMFj3Y7KPh223NkkKLQsFeLUVm08Ckpm
database monitor
rootdn cn=manager,cn=monitor
rootpw {SSHA}vPVSN8o8eRnLdC/bGS7yDwQGeH4BHc0R
database hdb
suffix dc=example,dc=com
rootdn cn=manager,dc=example,dc=com
rootpw {SSHA}0obbsJw5Yq2XAkdd/kS7vokaB9rrSOtI
directory /tmp/openldap/multi-master/var/data/dc=example,dc=com
cachesize 30000
cachefree 5
checkpoint 128 15
dncachesize 25000
idlcachesize 100000
index objectClass eq
index entryCSN eq
index entryUUID eq
syncrepl rid=001
provider=ldap://host2:1389
type=refreshAndPersist
interval=00:00:05:00
retry="15 +"
searchbase="dc=example,dc=com"
binddn="cn=manager,dc=example,dc=com"
credentials="example_pass"
starttls=no
schemachecking=off
syncrepl rid=002
provider=ldap://host3:1389
type=refreshAndPersist
interval=00:00:05:00
retry="15 +"
searchbase="dc=example,dc=com"
binddn="cn=manager,dc=example,dc=com"
credentials="example_pass"
starttls=no
schemachecking=off
syncrepl rid=003
provider=ldap://host4:1389
type=refreshAndPersist
interval=00:00:05:00
retry="15 +"
searchbase="dc=example,dc=com"
binddn="cn=manager,dc=example,dc=com"
credentials="example_pass"
starttls=no
schemachecking=off
HOST2 slapd.conf:
include /tmp/openldap/multi-master/etc/schema/core.schema
include /tmp/openldap/multi-master/etc/schema/cosine.schema
include /tmp/openldap/multi-master/etc/schema/nis.schema
argsfile /tmp/openldap/multi-master/var/run/slapd.args
pidfile /tmp/openldap/multi-master/var/run/slapd.pid
threads 16
idletimeout 0
writetimeout 5
reverse-lookup off
timelimit time.soft=30 time.hard=300
sizelimit size.soft=500 size.hard=1000
password-hash {SSHA}
loglevel stats sync
serverid 002
modulepath /tmp/openldap/multi-master/libexec
moduleload back_monitor.la
moduleload back_hdb.la
moduleload syncprov.la
database config
rootdn cn=manager,cn=config
rootpw {SSHA}yMFj3Y7KPh223NkkKLQsFeLUVm08Ckpm
database monitor
rootdn cn=manager,cn=monitor
rootpw {SSHA}vPVSN8o8eRnLdC/bGS7yDwQGeH4BHc0R
database hdb
suffix dc=example,dc=com
rootdn cn=manager,dc=example,dc=com
rootpw {SSHA}0obbsJw5Yq2XAkdd/kS7vokaB9rrSOtI
directory /tmp/openldap/multi-master/var/data/dc=example,dc=com
cachesize 30000
cachefree 5
checkpoint 128 15
dncachesize 25000
idlcachesize 100000
index objectClass eq
index entryCSN eq
index entryUUID eq
syncrepl rid=001
provider=ldap://host1:1389
type=refreshAndPersist
interval=00:00:05:00
retry="15 +"
searchbase="dc=example,dc=com"
binddn="cn=manager,dc=example,dc=com"
credentials="example_pass"
starttls=no
schemachecking=off
syncrepl rid=002
provider=ldap://host3:1389
type=refreshAndPersist
interval=00:00:05:00
retry="15 +"
searchbase="dc=example,dc=com"
binddn="cn=manager,dc=example,dc=com"
credentials="example_pass"
starttls=no
schemachecking=off
syncrepl rid=003
provider=ldap://host4:1389
type=refreshAndPersist
interval=00:00:05:00
retry="15 +"
searchbase="dc=example,dc=com"
binddn="cn=manager,dc=example,dc=com"
credentials="example_pass"
starttls=no
schemachecking=off
HOST3 slapd.conf:
include /tmp/openldap/multi-master/etc/schema/core.schema
include /tmp/openldap/multi-master/etc/schema/cosine.schema
include /tmp/openldap/multi-master/etc/schema/nis.schema
argsfile /tmp/openldap/multi-master/var/run/slapd.args
pidfile /tmp/openldap/multi-master/var/run/slapd.pid
threads 16
idletimeout 0
writetimeout 5
reverse-lookup off
timelimit time.soft=30 time.hard=300
sizelimit size.soft=500 size.hard=1000
password-hash {SSHA}
loglevel stats sync
serverid 003
modulepath /tmp/openldap/multi-master/libexec
moduleload back_monitor.la
moduleload back_hdb.la
moduleload syncprov.la
database config
rootdn cn=manager,cn=config
rootpw {SSHA}yMFj3Y7KPh223NkkKLQsFeLUVm08Ckpm
database monitor
rootdn cn=manager,cn=monitor
rootpw {SSHA}vPVSN8o8eRnLdC/bGS7yDwQGeH4BHc0R
database hdb
suffix dc=example,dc=com
rootdn cn=manager,dc=example,dc=com
rootpw {SSHA}0obbsJw5Yq2XAkdd/kS7vokaB9rrSOtI
directory /tmp/openldap/multi-master/var/data/dc=example,dc=com
cachesize 30000
cachefree 5
checkpoint 128 15
dncachesize 25000
idlcachesize 100000
index objectClass eq
index entryCSN eq
index entryUUID eq
syncrepl rid=001
provider=ldap://host1:1389
type=refreshAndPersist
interval=00:00:05:00
retry="15 +"
searchbase="dc=example,dc=com"
binddn="cn=manager,dc=example,dc=com"
credentials="example_pass"
starttls=no
schemachecking=off
syncrepl rid=002
provider=ldap://host2:1389
type=refreshAndPersist
interval=00:00:05:00
retry="15 +"
searchbase="dc=example,dc=com"
binddn="cn=manager,dc=example,dc=com"
credentials="example_pass"
starttls=no
schemachecking=off
syncrepl rid=003
provider=ldap://host4:1389
type=refreshAndPersist
interval=00:00:05:00
retry="15 +"
searchbase="dc=example,dc=com"
binddn="cn=manager,dc=example,dc=com"
credentials="example_pass"
starttls=no
schemachecking=off
HOST4 slapd.conf:
include /tmp/openldap/multi-master/etc/schema/core.schema
include /tmp/openldap/multi-master/etc/schema/cosine.schema
include /tmp/openldap/multi-master/etc/schema/nis.schema
argsfile /tmp/openldap/multi-master/var/run/slapd.args
pidfile /tmp/openldap/multi-master/var/run/slapd.pid
threads 16
idletimeout 0
writetimeout 5
reverse-lookup off
timelimit time.soft=30 time.hard=300
sizelimit size.soft=500 size.hard=1000
password-hash {SSHA}
loglevel stats sync
serverid 004
modulepath /tmp/openldap/multi-master/libexec
moduleload back_monitor.la
moduleload back_hdb.la
moduleload syncprov.la
database config
rootdn cn=manager,cn=config
rootpw {SSHA}yMFj3Y7KPh223NkkKLQsFeLUVm08Ckpm
database monitor
rootdn cn=manager,cn=monitor
rootpw {SSHA}vPVSN8o8eRnLdC/bGS7yDwQGeH4BHc0R
database hdb
suffix dc=example,dc=com
rootdn cn=manager,dc=example,dc=com
rootpw {SSHA}0obbsJw5Yq2XAkdd/kS7vokaB9rrSOtI
directory /tmp/openldap/multi-master/var/data/dc=example,dc=com
cachesize 30000
cachefree 5
checkpoint 128 15
dncachesize 25000
idlcachesize 100000
index objectClass eq
index entryCSN eq
index entryUUID eq
syncrepl rid=001
provider=ldap://host1:1389
type=refreshAndPersist
interval=00:00:05:00
retry="15 +"
searchbase="dc=example,dc=com"
binddn="cn=manager,dc=example,dc=com"
credentials="example_pass"
starttls=no
schemachecking=off
syncrepl rid=002
provider=ldap://host2:1389
type=refreshAndPersist
interval=00:00:05:00
retry="15 +"
searchbase="dc=example,dc=com"
binddn="cn=manager,dc=example,dc=com"
credentials="example_pass"
starttls=no
schemachecking=off
syncrepl rid=003
provider=ldap://host3:1389
type=refreshAndPersist
interval=00:00:05:00
retry="15 +"
searchbase="dc=example,dc=com"
binddn="cn=manager,dc=example,dc=com"
credentials="example_pass"
starttls=no
schemachecking=off
Thank you.