We very occasionally see a problem that goes like this:

 

1)     Send an LDAP search request to a Windows DC

2)     Call ldap_int_select from result.c – wait4msg()

3)     After ldap_int_select returns okay – (eventually) call down into the sockbuf.c function sb_stream_read() which calls read(…)

4)     The read() hangs forever if it has to do multiple calls to get the entire response (very very occasionally)

 

I was not able to get a network trace to see why read() hangs forever (this is on AIX 5.3, by the way), but it has to do with a large response that comes back in several TCP segments.

 

 

I solved the issue by adding an additional select() call in sb_stream_read()

 

Here is what I did in code base 2.2.6

 

static ber_slen_t

sb_stream_read( Sockbuf_IO_Desc *sbiod, void *buf, ber_len_t len )

 

 

#elif defined( HAVE_NCSA )

/*

 * NCSA Telnet TCP/IP stack (under DOS)

 */

            return nread( sbiod->sbiod_sb->sb_fd, buf, len );

 

#else

    {

        int rc = ber_int_sb_read_wait(sbiod->sbiod_sb);  <- The new check

        if (rc > 0)

        {

            rc = read( sbiod->sbiod_sb->sb_fd, buf, len );

        }

        return rc;

    }

   

#endif

}

 

New Function

 

/**
 * If there is a timeout - wait for data, to avoid
 * permanent blocking on a read.
 *
 * When both fields of the Sockbuf timeout are zero no select() is issued
 * and the caller is told to proceed straight away.
 *
 * On failure errno holds the reason; it is preserved across the debug
 * logging so the caller sees the value set by select().
 *
 * @param sb    Socket buf with socket id and timeout
 * @return -1=error : 0=timeout : 1=proceed
 */
static int
ber_int_sb_read_wait(Sockbuf *sb)
{
    /* Work on a copy: select() is allowed to modify the timeval it is given. */
    struct timeval tm = sb->sb_timeout;
    int rc = 1;

    /* If no timeout was specified skip the select */
    if (tm.tv_sec || tm.tv_usec)
    {
        int sock = sb->sb_fd;
        int saved_errno;
        fd_set rfds;

        /*
         * FD_SET() on a descriptor outside [0, FD_SETSIZE) is undefined
         * behaviour, so refuse it instead of corrupting the stack.
         */
        if (sock < 0 || sock >= FD_SETSIZE)
        {
            errno = (sock < 0) ? EBADF : EINVAL;
            return -1;
        }

        FD_ZERO(&rfds);
        FD_SET(sock, &rfds);

        ber_log_printf(LDAP_DEBUG_PACKETS, sb->sb_debug,
                       "ber_int_sb_read_wait start: timeout is %d %d\n",
                       (int)tm.tv_sec, (int)tm.tv_usec);

        rc = select(sock+1, &rfds, NULL, NULL, &tm);

        /* Capture errno before logging, which may overwrite it. */
        saved_errno = errno;

        ber_log_printf(LDAP_DEBUG_PACKETS, sb->sb_debug,
                       "ber_int_sb_read_wait end: sock=%d rc=%d %s\n",
                       sock, rc, (rc >= 0) ? "" : STRERROR(saved_errno));

        errno = saved_errno;
    }

    return rc;
}

 

In open.c, in ldap_int_open_connection(), I added some code to propagate the timeout down to ber_int_sb_read_wait():

 

            {

                /*

                 * Propagate the network timeout to sockbuf layer for

                 * select calls.

                 */

                struct timeval *tm = 0;

                int ret = ldap_get_option(ld, LDAP_OPT_NETWORK_TIMEOUT,

                                          (void *)&tm);

               

                if (ret == LDAP_OPT_SUCCESS && tm) {

                    ber_sockbuf_ctrl(conn->lconn_sb, LBER_SB_OPT_SET_TIMEOUT, tm);

                    LDAP_FREE(tm);

                }

            }

 

Regards

 

Dave Daugherty

Centrify Corp.