Richard Silverman wrote:
So the question is why are these threads still waiting on their sockets.
Looking at your logs, it's clear that the error occurs and connection_closing() is called, which will call connection_abandon() to set the o_abandon flag on all of the connection's ops.
I agree; I had followed that much.
Can you please provide the output for "print *op" and "print *op->o_hdr" in thread 13 and thread 16, frame 3?
(gdb) thread 13 [Switching to thread 13 (Thread 0x7f21f27fc700 (LWP 8470))]#0 pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:162 162 62: movl (%rsp), %edi (gdb) frame 3 #3 0x000000000043e716 in send_ldap_ber (op=0x7f21cc191420, ber=0x7f21f267a460) at result.c:379 379 ldap_pvt_thread_cond_wait( &conn->c_write2_cv, &conn->c_write2_mutex ); (gdb) print *op $1 = {o_hdr = 0x7f21cc191590, o_tag = 99, o_time = 1356070283, o_tincr = 46, o_bd = 0x18217e0, o_req_dn = { bv_len = 13, bv_val = 0x7f21e0009e88 "dc=alu,dc=com"}, o_req_ndn = {bv_len = 13, bv_val = 0x7f21e0009ec0 "dc=alu,dc=com"}, o_request = {oq_add = {rs_modlist = 0x2, rs_e = 0xffffffffffffffff}, oq_bind = {rb_method = 2, rb_cred = {bv_len = 18446744073709551615, bv_val = 0x0}, rb_edn = {bv_len = 0, bv_val = 0x0}, rb_ssf = 3758137048, rb_mech = {bv_len = 15, bv_val = 0x7f21e0009ef8 "(objectClass=*)"}}, oq_compare = {rs_ava = 0x2}, oq_modify = {rs_mods = {rs_modlist = 0x2, rs_no_opattrs = -1 '\377'}, rs_increment = 0}, oq_modrdn = {rs_mods = {rs_modlist = 0x2, rs_no_opattrs = -1 '\377'}, rs_deleteoldrdn = 0, rs_newrdn = {bv_len = 0, bv_val = 0x0}, rs_nnewrdn = {bv_len = 139783468785368, bv_val = 0xf <Address 0xf out of bounds>}, rs_newSup = 0x7f21e0009ef8, rs_nnewSup = 0x0}, oq_search = { rs_scope = 2, rs_deref = 0, rs_slimit = -1, rs_tlimit = -1, rs_limit = 0x0, rs_attrsonly = 0, rs_attrs = 0x0, rs_filter = 0x7f21e0009ed8, rs_filterstr = {bv_len = 15, bv_val = 0x7f21e0009ef8 "(objectClass=*)"}}, oq_abandon = {rs_msgid = 2}, oq_cancel = {rs_msgid = 2}, oq_extended = {rs_reqoid = {bv_len = 2, bv_val = 0xffffffffffffffff <Address 0xffffffffffffffff out of bounds>}, rs_flags = 0, rs_reqdata = 0x0}, oq_pwdexop = {rs_extended = {rs_reqoid = {bv_len = 2, bv_val = 0xffffffffffffffff <Address 0xffffffffffffffff out of bounds>}, rs_flags = 0, rs_reqdata = 0x0}, rs_old = {bv_len = 0, bv_val = 0x7f21e0009ed8 "\207"}, rs_new = {bv_len = 15, bv_val = 0x7f21e0009ef8 "(objectClass=*)"}, rs_mods = 0x0, rs_modtail = 0x0}}, o_abandon = 0, o_cancel = 0, o_groups = 0x0, o_do_not_cache = 0 '\000', o_is_auth_check = 0 '\000', o_dont_replicate = 0 '\000', o_acl_priv = ACL_NONE, o_nocaching = 0 '\000', o_delete_glue_parent = 0 '\000', o_no_schema_check = 0 '\000', o_no_subordinate_glue = 0 '\000', o_ctrlflag = '\000' <repeats 31 times>, o_controls = 0x7f21cc1916d8, o_authz = {sai_method = 128, sai_mech = {bv_len = 0, bv_val = 0x0}, sai_dn = { bv_len = 24, bv_val = 0x7f21e4110a60 "cn=manager,dc=alu,dc=com"}, sai_ndn = {bv_len = 24, bv_val = 0x7f21e4128e50 "cn=manager,dc=alu,dc=com"}, sai_ssf = 0, sai_transport_ssf = 0, sai_tls_ssf = 0, sai_sasl_ssf = 0}, o_ber = 0x7f21e41f9700, o_res_ber = 0x0, o_callback = 0x0, o_ctrls = 0x0, o_csn = { bv_len = 0, bv_val = 0x0}, o_private = 0x0, o_extra = {slh_first = 0x0}, o_next = {stqe_next = 0x7f21d8009fc0}} (gdb) print *op->o_hdr $2 = {oh_opid = 4, oh_connid = 1370, oh_conn = 0x7f221069b758, oh_msgid = 5, oh_protocol = 3, oh_tid = 139783779108608, oh_threadctx = 0x7f21f27fba00, oh_tmpmemctx = 0x7f21e0009cb0, oh_tmpmfuncs = 0x839840, oh_counters = 0x7f21e0009b80, oh_log_prefix = "conn=1370 op=4", '\000' <repeats 241 times>}
OK, so thread 13 is waiting on conn 1370, sd 39. There are actually no error messages associated with this connection or socket anywhere in your logs.
Likewise for thread 16: conn 1351, sd 20. No errors anywhere.
This looks more like a problem with epoll() than anything else. Are the connections associated with socket 20 and 39 actually dead?
(gdb) thread 16 [Switching to thread 16 (Thread 0x7f21f3fff700 (LWP 8467))]#0 pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:162 162 62: movl (%rsp), %edi (gdb) frame 3 #3 0x000000000043e716 in send_ldap_ber (op=0x7f21c8111140, ber=0x7f21f3e7d460) at result.c:379 379 ldap_pvt_thread_cond_wait( &conn->c_write2_cv, &conn->c_write2_mutex ); (gdb) print *op $3 = {o_hdr = 0x7f21c81112b0, o_tag = 99, o_time = 1356070283, o_tincr = 37, o_bd = 0x18217e0, o_req_dn = { bv_len = 13, bv_val = 0x7f21d0001088 "dc=alu,dc=com"}, o_req_ndn = {bv_len = 13, bv_val = 0x7f21d00010c0 "dc=alu,dc=com"}, o_request = {oq_add = {rs_modlist = 0x2, rs_e = 0xffffffffffffffff}, oq_bind = {rb_method = 2, rb_cred = {bv_len = 18446744073709551615, bv_val = 0x0}, rb_edn = {bv_len = 0, bv_val = 0x0}, rb_ssf = 3489665240, rb_mech = {bv_len = 15, bv_val = 0x7f21d00010f8 "(objectClass=*)"}}, oq_compare = {rs_ava = 0x2}, oq_modify = {rs_mods = {rs_modlist = 0x2, rs_no_opattrs = -1 '\377'}, rs_increment = 0}, oq_modrdn = {rs_mods = {rs_modlist = 0x2, rs_no_opattrs = -1 '\377'}, rs_deleteoldrdn = 0, rs_newrdn = {bv_len = 0, bv_val = 0x0}, rs_nnewrdn = {bv_len = 139783200313560, bv_val = 0xf <Address 0xf out of bounds>}, rs_newSup = 0x7f21d00010f8, rs_nnewSup = 0x0}, oq_search = { rs_scope = 2, rs_deref = 0, rs_slimit = -1, rs_tlimit = -1, rs_limit = 0x0, rs_attrsonly = 0, rs_attrs = 0x0, rs_filter = 0x7f21d00010d8, rs_filterstr = {bv_len = 15, bv_val = 0x7f21d00010f8 "(objectClass=*)"}}, oq_abandon = {rs_msgid = 2}, oq_cancel = {rs_msgid = 2}, oq_extended = {rs_reqoid = {bv_len = 2, bv_val = 0xffffffffffffffff <Address 0xffffffffffffffff out of bounds>}, rs_flags = 0, rs_reqdata = 0x0}, oq_pwdexop = {rs_extended = {rs_reqoid = {bv_len = 2, bv_val = 0xffffffffffffffff <Address 0xffffffffffffffff out of bounds>}, rs_flags = 0, rs_reqdata = 0x0}, rs_old = {bv_len = 0, bv_val = 0x7f21d00010d8 "\207"}, rs_new = {bv_len = 15, bv_val = 0x7f21d00010f8 "(objectClass=*)"}, rs_mods = 0x0, rs_modtail = 0x0}}, o_abandon = 0, o_cancel = 0, o_groups = 0x0, o_do_not_cache = 0 '\000', o_is_auth_check = 0 '\000', o_dont_replicate = 0 '\000', o_acl_priv = ACL_NONE, o_nocaching = 0 '\000', o_delete_glue_parent = 0 '\000', o_no_schema_check = 0 '\000', o_no_subordinate_glue = 0 '\000', o_ctrlflag = '\000' <repeats 31 times>, o_controls = 0x7f21c81113f8, o_authz = {sai_method = 128, sai_mech = {bv_len = 0, bv_val = 0x0}, sai_dn = { bv_len = 24, bv_val = 0x7f21a813ed50 "cn=manager,dc=alu,dc=com"}, sai_ndn = {bv_len = 24, bv_val = 0x7f21a813ce90 "cn=manager,dc=alu,dc=com"}, sai_ssf = 0, sai_transport_ssf = 0, sai_tls_ssf = 0, sai_sasl_ssf = 0}, o_ber = 0x7f21a813ecb0, o_res_ber = 0x0, o_callback = 0x0, o_ctrls = 0x0, o_csn = { bv_len = 0, bv_val = 0x0}, o_private = 0x0, o_extra = {slh_first = 0x0}, o_next = {stqe_next = 0x7f21c4167f80}} (gdb) print *op->o_hdr $4 = {oh_opid = 4, oh_connid = 1351, oh_conn = 0x7f2210697570, oh_msgid = 5, oh_protocol = 3, oh_tid = 139783804286720, oh_threadctx = 0x7f21f3ffea00, oh_tmpmemctx = 0x7f21d0000eb0, oh_tmpmfuncs = 0x839840, oh_counters = 0x7f21d0000d80, oh_log_prefix = "conn=1351 op=4", '\000' <repeats 241 times>}