Full_Name: Ryan Tandy Version: 2.4.45 OS: Debian URL: Submission from: (NULL) (24.68.41.160) Submitted by: ryan
This is rather similar to ITS#8429 (the deadlock is at the same location), but not similar enough for me to be sure it is the same bug.
cat > slapd.conf << EOF
include /path/to/core.schema include /path/to/cosine.schema
serverid 1 ldap://:9001 serverid 2 ldap://:9002 serverid 3 ldap://:9003
database mdb directory db maxsize 104857600 envflags writemap index objectClass,cn,entryCSN,entryUUID,uid eq
suffix dc=example,dc=com rootdn cn=root,dc=example,dc=com rootpw secret access to * by * read sizelimit unlimited
syncrepl rid=1 provider="ldap://:9001" searchbase="dc=example,dc=com" type=refreshAndPersist retry="10 +" bindmethod=simple binddn="cn=root,dc=example,dc=com" credentials="secret" syncdata=accesslog logbase="cn=accesslog" logfilter="(&(objectClass=auditWriteObject)(reqResult=0))"
syncrepl rid=2 provider="ldap://:9002" searchbase="dc=example,dc=com" type=refreshAndPersist retry="10 +" bindmethod=simple binddn="cn=root,dc=example,dc=com" credentials="secret" syncdata=accesslog logbase="cn=accesslog" logfilter="(&(objectClass=auditWriteObject)(reqResult=0))"
syncrepl rid=3 provider="ldap://:9003" searchbase="dc=example,dc=com" type=refreshAndPersist retry="10 +" bindmethod=simple binddn="cn=root,dc=example,dc=com" credentials="secret" syncdata=accesslog logbase="cn=accesslog" logfilter="(&(objectClass=auditWriteObject)(reqResult=0))"
mirrormode on
overlay syncprov syncprov-checkpoint 10 1 syncprov-reloadhint TRUE
overlay accesslog logdb cn=accesslog logops writes logsuccess true logpurge 07+00:00 01+00:00
database mdb directory accesslog maxsize 104857600 envflags writemap index entryCSN,objectClass,reqEnd,reqResult,reqStart eq
suffix cn=accesslog access to * by * read sizelimit unlimited
overlay syncprov syncprov-nopresent TRUE syncprov-reloadhint TRUE
EOF
cat > data.ldif << EOF
dn: dc=example,dc=com objectClass: domain
dn: uid=u0,dc=example,dc=com objectclass: account
dn: cn=g0,dc=example,dc=com objectClass: groupOfNames member:
EOF
Start up all three slapds and get them synced and settled. I also executed no-op modifications on each node to ensure every server had CSNs from all the others.
cat > groupmod.ldif << EOF
dn: cn=g0,dc=example,dc=com add: member member: uid=u0,dc=example,dc=com
dn: cn=g0,dc=example,dc=com delete: member member: uid=u0,dc=example,dc=com
EOF
Execute the above modifications on one node and watch the other two. After a few repetitions, one or both of the other nodes reliably hang.
If I disable syncprov-checkpoint, I cannot reproduce the hang.
Backtrace from a hung node:
Thread 6 (Thread 0x7f77093d0700 (LWP 28817)): #0 pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 #1 0x0000560c5a9af3b7 in ldap_pvt_thread_cond_wait (cond=0x560c5b9696c0, mutex=0x560c5b969698) at thr_posix.c:277 #2 0x0000560c5a9add6e in ldap_int_thread_pool_wrapper (xpool=0x560c5b969690) at tpool.c:683 #3 0x00007f7718a4a494 in start_thread (arg=0x7f77093d0700) at pthread_create.c:333 #4 0x00007f771878ca8f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
Thread 5 (Thread 0x7f7709bd1700 (LWP 28816)): #0 pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 #1 0x0000560c5a9af3b7 in ldap_pvt_thread_cond_wait (cond=0x560c5b9696c0, mutex=0x560c5b969698) at thr_posix.c:277 #2 0x0000560c5a9add6e in ldap_int_thread_pool_wrapper (xpool=0x560c5b969690) at tpool.c:683 #3 0x00007f7718a4a494 in start_thread (arg=0x7f7709bd1700) at pthread_create.c:333 #4 0x00007f771878ca8f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
Thread 4 (Thread 0x7f770a3d2700 (LWP 28815)): #0 pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 #1 0x0000560c5a9af3b7 in ldap_pvt_thread_cond_wait (cond=0x560c5b993fc8, mutex=0x560c5b993fa0) at thr_posix.c:277 #2 0x0000560c5a9ac942 in ldap_pvt_thread_rmutex_lock (rmutex=0x560c5b993f68, owner=140149249615616) at rmutex.c:129 #3 0x0000560c5a98bd4c in accesslog_op_mod (op=0x7f770a3d14e0, rs=0x7f770a3d1120) at accesslog.c:1994 #4 0x0000560c5a941763 in overlay_op_walk (op=0x7f770a3d14e0, rs=0x7f770a3d1120, which=op_modify, oi=0x560c5b992a00, on=0x560c5b993d20) at backover.c:661 #5 0x0000560c5a941a50 in over_op_func (op=0x7f770a3d14e0, rs=0x7f770a3d1120, which=op_modify) at backover.c:730 #6 0x0000560c5a941b84 in over_op_modify (op=0x7f770a3d14e0, rs=0x7f770a3d1120) at backover.c:769 #7 0x0000560c5a92ef07 in syncrepl_message_to_op (si=0x560c5b992580, op=0x7f770a3d14e0, msg=0x7f76f4103bd0) at syncrepl.c:2417 #8 0x0000560c5a929f7e in do_syncrep2 (op=0x7f770a3d14e0, si=0x560c5b992580) at syncrepl.c:1014 #9 0x0000560c5a92c160 in do_syncrepl (ctx=0x7f770a3d1c10, arg=0x560c5b992980) at syncrepl.c:1565 #10 0x0000560c5a8b11cd in connection_read_thread (ctx=0x7f770a3d1c10, argv=0xc) at connection.c:1296 #11 0x0000560c5a9ade15 in ldap_int_thread_pool_wrapper (xpool=0x560c5b969690) at tpool.c:696 #12 0x00007f7718a4a494 in start_thread (arg=0x7f770a3d2700) at pthread_create.c:333 #13 0x00007f771878ca8f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
Thread 3 (Thread 0x7f770abd3700 (LWP 28814)): #0 pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 #1 0x0000560c5a9af3b7 in ldap_pvt_thread_cond_wait (cond=0x560c5b9696c0, mutex=0x560c5b969698) at thr_posix.c:277 #2 0x0000560c5a9add6e in ldap_int_thread_pool_wrapper (xpool=0x560c5b969690) at tpool.c:683 #3 0x00007f7718a4a494 in start_thread (arg=0x7f770abd3700) at pthread_create.c:333 #4 0x00007f771878ca8f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
Thread 2 (Thread 0x7f770b3d4700 (LWP 28813)): #0 0x00007f771878d083 in epoll_wait () at ../sysdeps/unix/syscall-template.S:84 #1 0x0000560c5a8ac6b3 in slapd_daemon_task (ptr=0x560c5bd176e0) at daemon.c:2539 #2 0x00007f7718a4a494 in start_thread (arg=0x7f770b3d4700) at pthread_create.c:333 #3 0x00007f771878ca8f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
Thread 1 (Thread 0x7f7719293400 (LWP 28812)): #0 0x00007f7718a4b6cd in pthread_join (threadid=140149266401024, thread_return=0x0) at pthread_join.c:90 #1 0x0000560c5a9af2f8 in ldap_pvt_thread_join (thread=140149266401024, thread_return=0x0) at thr_posix.c:197 #2 0x0000560c5a8ad99c in slapd_daemon () at daemon.c:2932 #3 0x0000560c5a88c105 in main (argc=8, argv=0x7ffd119da7b8) at main.c:1017