Howard Chu wrote:
And unfortunately I had no time to do any more debugging until now; with St. Patrick's Day this Tuesday I had gigs all weekend. I also see that the test050 run I left overnight eventually crashed, and the symptoms are the same as in Quanah's. So, there's still more to track down.
Look as if I might have hit the same, see stack trace at the end.
For reference:
violino:~/OD/hobj/tests/testrun> grep rid=003 !$ grep rid=003 slapd.1.log =>do_syncrepl rid=003 do_syncrepl: rid=003 retrying (9 retries left) =>do_syncrepl rid=003 =>do_syncrep2 rid=003 =>do_syncrepl rid=003 =>do_syncrep2 rid=003 olcSyncrepl: {2}rid=003 provider=ldap://localhost:9013/ binddn="cn=config" bin =>do_syncrepl rid=003 =>do_syncrep2 rid=003 olcSyncrepl: {2}rid=003 provider=ldap://localhost:9013/ binddn="cn=config" bin =>do_syncrepl rid=003 =>do_syncrepl rid=003 =>do_syncrep2 rid=003 do_syncrepl: rid=003 quitting
The odd thing here of course is that it should never jump from '9 retries left' to 'quitting', there should be at least 9 failures / retry messages. Seems like we have a wild memory overwrite occurring.
I assume it is quitting due to config update. Looks to me as if syncinfo structures are released while still active.
Rein
(gdb) where #0 0x0000002a968d2540 in strlen () from /lib64/tls/libc.so.6 #1 0x0000002a968a4a0b in vfprintf () from /lib64/tls/libc.so.6 #2 0x0000002a968c4434 in vsnprintf () from /lib64/tls/libc.so.6 #3 0x0000002a958c3181 in lutil_debug (debug=<value optimized out>, level=<value optimized out>, fmt=0x448076c8 "$") at debug.c:66 #4 0x00000000004957d1 in do_syncrepl (ctx=0x44807e90, arg=0x858150) at syncrepl.c:1261 #5 0x0000002a9567e415 in ldap_int_thread_pool_wrapper ( xpool=<value optimized out>) at tpool.c:663 #6 0x0000002a9675310a in start_thread () from /lib64/tls/libpthread.so.0 #7 0x0000002a969288b3 in clone () from /lib64/tls/libc.so.6 #8 0x0000000000000000 in ?? () (gdb) print si $1 = (syncinfo_t *) 0x0 (gdb) print *rtask $2 = {next_sched = {tv_sec = 7598733802573148208, tv_usec = 14422794207978861}, interval = {tv_sec = 384, tv_usec = 64}, tnext = {stqe_next = 0x84bc30}, rnext = {stqe_next = 0x858870}, routine = 0, arg = 0x0, tname = 0x505cc0 "do_syncrepl", tspec = 0x857d94 "rid=004"}
(gdb) thr 8 [Switching to thread 8 (process 23265)]#0 0x0000002a968d2540 in strlen () from /lib64/tls/libc.so.6 (gdb) frame 4 #4 0x00000000004957d1 in do_syncrepl (ctx=0x41801e90, arg=0x858a30) at syncrepl.c:1261 1261 Debug( LDAP_DEBUG_TRACE, "=>do_syncrepl %s\n", si->si_ridtxt, 0, 0 ); (gdb) print si $3 = (syncinfo_t *) 0x20 (gdb) print *rtask $4 = {next_sched = {tv_sec = 7526470944284832317, tv_usec = 7598542775770181185}, interval = {tv_sec = 8751185004989543539, tv_usec = 3683997482740818493}, tnext = {stqe_next = 0x6974202235203030}, rnext = {stqe_next = 0x333d74756f656d}, routine = 0xc0, arg = 0x20, tname = 0x84bc10 "\220\004", tspec = 0x69666e6f43657361 <Address 0x69666e6f43657361 out of bounds>}
(gdb) thr app all where Thread 11 (process 23234): #0 0x0000002a96753ffb in pthread_join () from /lib64/tls/libpthread.so.0 #1 0x0000002a9567ec85 in ldap_pvt_thread_join (thread=1082132976, thread_return=0x0) at thr_posix.c:197 #2 0x000000000042b35b in slapd_daemon () at daemon.c:2697 #3 0x0000000000418917 in main (argc=8, argv=0x7fbffff598) at main.c:948
Thread 10 (process 23249): #0 0x0000002a96928c8c in epoll_wait () from /lib64/tls/libc.so.6 #1 0x000000000042e35a in slapd_daemon_task (ptr=<value optimized out>) at daemon.c:2323 #2 0x0000002a9675310a in start_thread () from /lib64/tls/libpthread.so.0 #3 0x0000002a969288b3 in clone () from /lib64/tls/libc.so.6 #4 0x0000000000000000 in ?? ()
Thread 9 (process 23264): #0 0x0000002a967559ba in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/tls/libpthread.so.0 #1 0x0000002a9567ec1e in ldap_pvt_thread_cond_wait (cond=0x7ead04, mutex=0x0) at thr_posix.c:277 #2 0x0000002a9567e3e4 in ldap_int_thread_pool_wrapper ( xpool=<value optimized out>) at tpool.c:654 #3 0x0000002a9675310a in start_thread () from /lib64/tls/libpthread.so.0 #4 0x0000002a969288b3 in clone () from /lib64/tls/libc.so.6 #5 0x0000000000000000 in ?? ()
Thread 8 (process 23265): #0 0x0000002a968d2540 in strlen () from /lib64/tls/libc.so.6 #1 0x0000002a968a4a0b in vfprintf () from /lib64/tls/libc.so.6 #2 0x0000002a968c4434 in vsnprintf () from /lib64/tls/libc.so.6 #3 0x0000002a958c3181 in lutil_debug (debug=<value optimized out>, level=<value optimized out>, fmt=0x418016c8 "D") at debug.c:66 #4 0x00000000004957d1 in do_syncrepl (ctx=0x41801e90, arg=0x858a30) at syncrepl.c:1261 #5 0x0000002a9567e415 in ldap_int_thread_pool_wrapper ( xpool=<value optimized out>) at tpool.c:663 #6 0x0000002a9675310a in start_thread () from /lib64/tls/libpthread.so.0 #7 0x0000002a969288b3 in clone () from /lib64/tls/libc.so.6 #8 0x0000000000000000 in ?? ()
Thread 7 (process 23290): #0 0x0000002a96757e0b in __lll_mutex_lock_wait () from /lib64/tls/libpthread.so.0 #1 0x0000000000000000 in ?? ()
Thread 6 (process 23291): #0 0x0000002a9691b33f in write () from /lib64/tls/libc.so.6 #1 0x0000002a968c7328 in _IO_new_file_write () from /lib64/tls/libc.so.6 #2 0x0000002a968c7517 in _IO_new_file_xsputn () from /lib64/tls/libc.so.6 #3 0x0000002a968bd013 in fputs () from /lib64/tls/libc.so.6 #4 0x0000002a958c309a in ber_error_print ( data=0x2 <Address 0x2 out of bounds>) at bprint.c:79 #5 0x0000002a9569af4e in ldap_log_printf (ld=<value optimized out>, loglvl=<value optimized out>, fmt=<value optimized out>) at print.c:60 #6 0x0000002a956931bd in ldap_free_connection (ld=0x868ed0, lc=0x2a9ae09b00, force=1, unbind=1) at request.c:777 #7 0x0000002a9568a47d in ldap_ld_free (ld=0x868ed0, close=1, sctrls=<value optimized out>, cctrls=<value optimized out>) at unbind.c:93 #8 0x0000002a9568a768 in ldap_unbind_ext (ld=0x868ed0, sctrls=0x0, cctrls=0x0) at unbind.c:52 #9 0x000000000048dd90 in syncinfo_free (sie=0x858a90, free_all=1) ---Type <return> to continue, or q <return> to quit--- at syncrepl.c:3613 #10 0x00000000004963f2 in do_syncrepl (ctx=<value optimized out>, arg=0x858ea0) at syncrepl.c:1468 #11 0x0000002a9567e415 in ldap_int_thread_pool_wrapper ( xpool=<value optimized out>) at tpool.c:663 #12 0x0000002a9675310a in start_thread () from /lib64/tls/libpthread.so.0 #13 0x0000002a969288b3 in clone () from /lib64/tls/libc.so.6 #14 0x0000000000000000 in ?? ()
Thread 5 (process 23441): #0 0x0000002a96757f1d in __lll_mutex_unlock_wake () from /lib64/tls/libpthread.so.0 #1 0x00000000007eacd8 in ?? () #2 0x0000000000000002 in ?? () #3 0x0000002a96754f38 in ?? () from /lib64/tls/libpthread.so.0 #4 0x0000000000000000 in ?? ()
Thread 4 (process 23442): #0 0x0000002a96757e0b in __lll_mutex_lock_wait () from /lib64/tls/libpthread.so.0 #1 0x0000000000000000 in ?? ()
Thread 3 (process 23456): #0 0x0000002a96757e0b in __lll_mutex_lock_wait () from /lib64/tls/libpthread.so.0 #1 0x0000000000000000 in ?? ()
Thread 2 (process 23458): #0 ldap_int_thread_pool_wrapper (xpool=0x7eacd0) at tpool.c:603 #1 0x0000002a9675310a in start_thread () from /lib64/tls/libpthread.so.0 #2 0x0000002a969288b3 in clone () from /lib64/tls/libc.so.6 #3 0x0000000000000000 in ?? ()
Thread 1 (process 23457): #0 0x0000002a968d2540 in strlen () from /lib64/tls/libc.so.6 #1 0x0000002a968a4a0b in vfprintf () from /lib64/tls/libc.so.6 #2 0x0000002a968c4434 in vsnprintf () from /lib64/tls/libc.so.6 #3 0x0000002a958c3181 in lutil_debug (debug=<value optimized out>, level=<value optimized out>, fmt=0x448076c8 "$") at debug.c:66 #4 0x00000000004957d1 in do_syncrepl (ctx=0x44807e90, arg=0x858150) at syncrepl.c:1261 #5 0x0000002a9567e415 in ldap_int_thread_pool_wrapper ( xpool=<value optimized out>) at tpool.c:663 #6 0x0000002a9675310a in start_thread () from /lib64/tls/libpthread.so.0 #7 0x0000002a969288b3 in clone () from /lib64/tls/libc.so.6 #8 0x0000000000000000 in ?? () (gdb)