Full_Name: Yoshinori Nishino Version: 2.4.45 OS: CentOS 7.3.1611 URL: ftp://ftp.openldap.org/incoming/ Submission from: (NULL) (210.143.35.20)
Dear sir,
I saw the deadlock between accesslog_purge() and accesslog_responce(), both of which are implemented in servers/slapd/overlays/accesslog.c. The deadlock was caused by __db_hybrid_mutex_suspend() in libdb-5.3.so(hdb).
==[the output of pstack]== Thread 22 (Thread 0x7f1c845ec700 (LWP 23001)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 21 (Thread 0x7f1c82dea700 (LWP 23002)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 20 (Thread 0x7f1c825e9700 (LWP 23003)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 19 (Thread 0x7f1c81de8700 (LWP 23004)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 18 (Thread 0x7f1c815e7700 (LWP 23005)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 17 (Thread 0x7f1c80de6700 (LWP 23006)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 16 (Thread 0x7f1c477f0700 (LWP 23007)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 15 (Thread 0x7f1c46fef700 (LWP 23009)): #0 0x00007f2e17e856d5 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00007f2e1860b2f3 in __db_hybrid_mutex_suspend () from /lib64/libdb-5.3.so #2 0x00007f2e1860a640 in __db_tas_mutex_lock () from /lib64/libdb-5.3.so #3 0x00007f2e186b4d3a in __lock_get_internal () from /lib64/libdb-5.3.so #4 0x00007f2e186b5820 in __lock_get () from /lib64/libdb-5.3.so #5 0x00007f2e186e1142 in __db_lget () from /lib64/libdb-5.3.so #6 0x00007f2e18612602 in __bamc_writelock () from /lib64/libdb-5.3.so #7 0x00007f2e186cd079 in __dbc_idel () from /lib64/libdb-5.3.so #8 0x00007f2e186d1113 in __dbc_del () from /lib64/libdb-5.3.so #9 0x00007f2e186dc3db in __dbc_del_pp () from /lib64/libdb-5.3.so #10 0x00007f2e1913b382 in hdb_dn2id_delete () #11 0x00007f2e191394ca in hdb_delete () #12 0x00007f2e190d6996 in overlay_op_walk () #13 0x00007f2e190d6b04 in over_op_func () #14 0x00007f2e110f3399 in accesslog_purge () from /usr/lib64/openldap/accesslog-2.4.so.2 #15 0x00007f2e18bb9fba in ldap_int_thread_pool_wrapper () from /lib64/libldap_r-2.4.so.2 #16 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #17 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 14 (Thread 0x7f1c467ee700 (LWP 23008)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 13 (Thread 0x7f1c45fed700 (LWP 23010)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 12 (Thread 0x7f1c457ec700 (LWP 23011)): #0 0x00007f2e17e856d5 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00007f2e18bba00b in ldap_int_thread_pool_wrapper () from /lib64/libldap_r-2.4.so.2 #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 11 (Thread 0x7f1c44feb700 (LWP 23012)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 10 (Thread 0x7f1c447ea700 (LWP 23013)): #0 0x00007f2e17e856d5 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00007f2e18bba00b in ldap_int_thread_pool_wrapper () from /lib64/libldap_r-2.4.so.2 #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 9 (Thread 0x7f1c43fe9700 (LWP 23014)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 8 (Thread 0x7f1c437e8700 (LWP 23015)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 7 (Thread 0x7f1c42fe7700 (LWP 23016)): #0 0x00007f2e17e856d5 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00007f2e1860b2f3 in __db_hybrid_mutex_suspend () from /lib64/libdb-5.3.so #2 0x00007f2e1860a640 in __db_tas_mutex_lock () from /lib64/libdb-5.3.so #3 0x00007f2e186b4d3a in __lock_get_internal () from /lib64/libdb-5.3.so #4 0x00007f2e186b5820 in __lock_get () from /lib64/libdb-5.3.so #5 0x00007f2e186e1142 in __db_lget () from /lib64/libdb-5.3.so #6 0x00007f2e18628605 in __bam_search () from /lib64/libdb-5.3.so #7 0x00007f2e18613256 in __bamc_search () from /lib64/libdb-5.3.so #8 0x00007f2e1861790f in __bamc_put () from /lib64/libdb-5.3.so #9 0x00007f2e186ce514 in __dbc_iput () from /lib64/libdb-5.3.so #10 0x00007f2e186c9a0e in __db_put () from /lib64/libdb-5.3.so #11 0x00007f2e186defa4 in __db_put_pp () from /lib64/libdb-5.3.so #12 0x00007f2e1913acec in hdb_dn2id_add () #13 0x00007f2e1913261b in hdb_add () #14 0x00007f2e190d6996 in overlay_op_walk () #15 0x00007f2e190d6b04 in over_op_func () #16 0x00007f2e110f44e4 in accesslog_response () from /usr/lib64/openldap/accesslog-2.4.so.2 #17 0x00007f2e190d5d48 in over_back_response () #18 0x00007f2e19078243 in slap_response_play () #19 0x00007f2e19078800 in send_ldap_response () #20 0x00007f2e19079392 in slap_send_ldap_result () #21 0x00007f2e190f438d in hdb_modify () #22 0x00007f2e190d6996 in overlay_op_walk () #23 0x00007f2e190d6b04 in over_op_func () #24 0x00007f2e190c7bae in syncrepl_message_to_op () #25 0x00007f2e190cab57 in do_syncrep2 () #26 0x00007f2e190cf95b in do_syncrepl () #27 0x00007f2e190689f1 in connection_read_thread () #28 0x00007f2e18bb9fba in ldap_int_thread_pool_wrapper () from /lib64/libldap_r-2.4.so.2 #29 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #30 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 6 (Thread 0x7f1c427e6700 (LWP 23018)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 5 (Thread 0x7f1c41fe5700 (LWP 23017)): #0 0x00007f2e17e881bd in __lll_lock_wait () from /lib64/libpthread.so.0 #1 0x00007f2e17e83d02 in _L_lock_791 () from /lib64/libpthread.so.0 #2 0x00007f2e17e83c08 in pthread_mutex_lock () from /lib64/libpthread.so.0 #3 0x00007f2e110f3f54 in accesslog_response () from /usr/lib64/openldap/accesslog-2.4.so.2 #4 0x00007f2e190d5d48 in over_back_response () #5 0x00007f2e19078243 in slap_response_play () #6 0x00007f2e19078800 in send_ldap_response () #7 0x00007f2e19079392 in slap_send_ldap_result () #8 0x00007f2e190f438d in hdb_modify () #9 0x00007f2e190d6996 in overlay_op_walk () #10 0x00007f2e190d6b04 in over_op_func () #11 0x00007f2e1909d7d3 in passwd_extop () #12 0x00007f2e1909bccc in fe_extended () #13 0x00007f2e1909ba7d in do_extended () #14 0x00007f2e19067d0c in connection_operation () #15 0x00007f2e1906807b in connection_read_thread () #16 0x00007f2e18bb9fba in ldap_int_thread_pool_wrapper () from /lib64/libldap_r-2.4.so.2 #17 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #18 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 4 (Thread 0x7f1c417e4700 (LWP 23019)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 3 (Thread 0x7f1c40fe3700 (LWP 23020)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 2 (Thread 0x7f1c3bfff700 (LWP 23021)): #0 0x00007f2e17344d13 in epoll_wait () from /lib64/libc.so.6 #1 0x00007f2e19062ee8 in slapd_daemon_task () #2 0x00007f2e17e81dc5 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f2e1734473d in clone () from /lib64/libc.so.6 Thread 1 (Thread 0x7f2e1900d740 (LWP 22992)): #0 0x00007f2e17e82ef7 in pthread_join () from /lib64/libpthread.so.0 #1 0x00007f2e19064de1 in slapd_daemon () #2 0x00007f2e1904bb62 in main () ====
==[the output of "db_stat -C A"]== Default locking region information: 9 Last allocated locker ID 0x7fffffff Current maximum unused locker ID 9 Number of lock modes 3600 Initial number of locks allocated 0 Initial number of lockers allocated 3600 Initial number of lock objects allocated 26000 Maximum number of locks possible 26000 Maximum number of lockers possible 26000 Maximum number of lock objects possible 3600 Current number of locks allocated 18 Current number of lockers allocated 3600 Current number of lock objects allocated 720 Number of lock object partitions 32771 Size of object hash table 15 Number of current locks 2044 Maximum number of locks at any one time 4 Maximum number of locks in any one bucket 1 Maximum number of locks stolen by for an empty partition 1 Maximum number of locks stolen for any one partition 17 Number of current lockers 17 Maximum number of lockers at any one time 11 Number of current lock objects 1456 Maximum number of lock objects at any one time 3 Maximum number of lock objects in any one bucket 0 Maximum number of objects stolen by for an empty partition 0 Maximum number of objects stolen for any one partition 301M Total number of locks requested (301292352) 301M Total number of locks released (301292334) 0 Total number of locks upgraded 7 Total number of locks downgraded 3 Lock requests not available due to conflicts, for which we waited 2 Lock requests not available due to conflicts, for which we did not wait 0 Number of deadlocks 0 Lock timeout value 0 Number of locks that have timed out 0 Transaction timeout value 0 Number of transactions that have timed out 2GB 524MB 688KB Region size 0 The number of partition locks that required waiting (0%) 0 The maximum number of times any partition lock was waited for (0%) 0 The number of object queue operations that required waiting (0%) 0 The number of locker allocations that required waiting (0%) 0 The number of region locks that required waiting (0%) 3 Maximum hash bucket length =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= Lock REGINFO information: Environment Region type 1 Region ID /var/lib/ldap/accesslog_db/__db.001 Region name 0x7f0ffe75b000 Region address 0x7f0ffe75b0a0 Region allocation head 0x7f0ffe95b5b0 Region primary address 0 Region maximum allocation 0 Region allocated Region allocations: 1032145 allocations, 0 failures, 1030659 frees, 1 longest Allocations by power-of-two sizes: 1KB 1032140 2KB 0 4KB 0 8KB 0 16KB 0 32KB 0 64KB 1 128KB 0 256KB 0 512KB 0 1024KB 4 REGION_SHARED Region flags =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= Lock region parameters: 2 Lock region region mutex [0/11M 0% 6376/139709380257600] 32771 locker table size 32771 object table size 2099280 obj_off 5570968 locker_off 0 need_dd =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= Lock conflict matrix: =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= Locks grouped by lockers: Locker Mode Count Status ----------------- Object --------------- 1 dd= 0 locks held 1 write locks 0 pid/thread 22992/139835964708672 flags 10 priority 100 1 READ 1 HELD id2entry.bdb handle 0 2 dd= 0 locks held 1 write locks 0 pid/thread 22992/139835964708672 flags 10 priority 100 2 READ 1 HELD dn2id.bdb handle 0 3 dd= 0 locks held 0 write locks 0 pid/thread 22992/139835964708672 flags 0 priority 100 4 dd= 0 locks held 1 write locks 0 pid/thread 22992/139760456615680 flags 10 priority 100 4 READ 1 HELD entryCSN.bdb handle 0 5 dd= 0 locks held 0 write locks 0 pid/thread 22992/139759426926336 flags 0 priority 100 6 dd= 0 locks held 1 write locks 0 pid/thread 22992/139759426926336 flags 10 priority 100 6 READ 1 HELD objectClass.bdb handle 0 7 dd= 0 locks held 1 write locks 0 pid/thread 22992/139759426926336 flags 10 priority 100 7 READ 1 HELD reqStart.bdb handle 0 8 dd= 0 locks held 1 write locks 0 pid/thread 22992/139759384962816 flags 10 priority 100 8 READ 1 HELD reqEnd.bdb handle 0 9 dd= 0 locks held 1 write locks 0 pid/thread 22992/139759384962816 flags 10 priority 100 9 READ 1 HELD reqResult.bdb handle 0 80000003 dd= 0 locks held 0 write locks 0 pid/thread 22992/139835964708672 flags 0 priority 100 80000004 dd= 0 locks held 0 write locks 0 pid/thread 22992/139835964708672 flags 0 priority 100 80000006 dd= 0 locks held 0 write locks 0 pid/thread 22992/139759384962816 flags 0 priority 100 80000007 dd= 0 locks held 0 write locks 0 pid/thread 22992/139759426926336 flags 0 priority 100 8007dd08 dd= 0 locks held 2 write locks 0 pid/thread 22992/139759359784704 flags 0 priority 100 8007dd08 READ 1 HELD 0x652a90 len: 9 data: 010000000000000000 8007dd08 READ 1 HELD dn2id.bdb page 3 8007dd09 dd= 0 locks held 2 write locks 0 pid/thread 22992/139759426926336 flags 0 priority 100 8007dd09 READ 1 HELD 0x652a90 len: 9 data: 010000000000000000 8007dd09 READ 1 HELD 0x6716e8 len: 9 data: cfea8d000000000000 8007dd0a dd= 0 locks held 2 write locks 0 pid/thread 22992/139759426926336 flags 0 priority 100 8007dd0a WRITE 1 WAIT dn2id.bdb page 3 8007dd0a READ 1 HELD dn2id.bdb page 3 8007dd0a READ 1 HELD dn2id.bdb page 444591 8007dd0b dd= 0 locks held 0 write locks 0 pid/thread 22992/139759359784704 flags 0 priority 100 8007dd0b WRITE 1 WAIT dn2id.bdb page 3 =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= Locks grouped by object: Locker Mode Count Status ----------------- Object --------------- 8007dd08 READ 1 HELD dn2id.bdb page 3 8007dd0a READ 1 HELD dn2id.bdb page 3 8007dd0b WRITE 1 WAIT dn2id.bdb page 3 8007dd0a WRITE 1 WAIT dn2id.bdb page 3
2 READ 1 HELD dn2id.bdb handle 0
1 READ 1 HELD id2entry.bdb handle 0
4 READ 1 HELD entryCSN.bdb handle 0
8007dd09 READ 1 HELD 0x652a90 len: 9 data: 010000000000000000 8007dd08 READ 1 HELD 0x652a90 len: 9 data: 010000000000000000
9 READ 1 HELD reqResult.bdb handle 0
8 READ 1 HELD reqEnd.bdb handle 0
8007dd0a READ 1 HELD dn2id.bdb page 444591
7 READ 1 HELD reqStart.bdb handle 0
8007dd09 READ 1 HELD 0x6716e8 len: 9 data: cfea8d000000000000
6 READ 1 HELD objectClass.bdb handle 0 ====
#pid/thread 22992/139759426926336:Thread 7f1c46fef700 - accesslog_purge() #pid/thread 22992/139759359784704:Thread 7f1c42fe7700 - accesslog_responce()
I am going to consider the tentative fix for openldap because I anticipate it would take so long time to pursue the fix for bdb(hdb), The tentative fix is like the following. Would you let me know whethere or not there are any concern about the fix?
==[accesslog_purge() in accesslog.c]== for (i=0; i<pd.used; i++) { op->o_req_dn = pd.dn[i]; op->o_req_ndn = pd.ndn[i]; if ( !slapd_shutdown ) { ldap_pvt_thread_mutex_lock( &li->li_log_mutex ); rs_reinit( &rs, REP_RESULT ); op->o_bd->be_delete( op, &rs ); ldap_pvt_thread_mutex_unlock( &li->li_log_mutex ); } ch_free( pd.ndn[i].bv_val ); ch_free( pd.dn[i].bv_val ); ldap_pvt_thread_pool_pausecheck( &connection_pool ); } ====
I know openldap itself currently recommend NOT using hdb/bdb as backend database. However, I should seek any workaround in the hdb backend case.
Best Regards,