getdnsapi / getdns

A modern asynchronous DNS API https://getdnsapi.net/
Other
461 stars 127 forks source link

select_eventloop rewrite to handle FD_SETSIZE file descriptors #534

Closed Andersama closed 1 year ago

Andersama commented 1 year ago

Currently the eventloop system appears to break down because it seems to use file descriptors as indexs. This is a slight problem because operating systems hand out file descriptors, they're effectively a pointer. Although file descriptors tend to be monotonically increasing compared to pointers, much like pointers, there's no guarantee for their starting or following values.

Storing data like this and indexing in this way means that the eventloop's structure

/* Eventloop based on select */
typedef struct _getdns_select_eventloop {
    getdns_eventloop        loop;
    getdns_eventloop_event *fd_events[FD_SETSIZE];
    uint64_t                fd_timeout_times[FD_SETSIZE];
    getdns_eventloop_event *timeout_events[MAX_TIMEOUTS];
    uint64_t                timeout_times[MAX_TIMEOUTS];
} _getdns_select_eventloop;

Can only handle when file descriptors are < FD_SETSIZE, which depending on the state of the operating system means the program may randomly crash while starting or spuriously fail when other programs open files or otherwise increase the file descriptor index.

The underlying system (if this is meant to only handle a fixed number of events), should treat this structure like a fixed size array, adding new events as they're scheduled, and removing them as needed.

The structure then should look more like

typedef struct _getdns_select_eventloop {
    getdns_eventloop        loop;
    getdns_eventloop_event *fd_events[FD_SETSIZE];
    uint64_t                fd_timeout_times[FD_SETSIZE];
    getdns_eventloop_event *timeout_events[MAX_TIMEOUTS];
    uint64_t                timeout_times[MAX_TIMEOUTS];
    int                     fd[FD_SETSIZE]; // file descriptor for index n
    uint64_t                fd_index; // current size of the event loop structure
} _getdns_select_eventloop;

The implementation then would likely need to scan linearly for the file descriptor.

static uint64_t find_getdns_eventloop_event_index(_getdns_select_eventloop* loop, int fd) {
    uint64_t event_idx = FD_SETSIZE;
    // loop through all events
    for (uint64_t i = 0; i < loop->fd_index && i < FD_SETSIZE; i++) {
        if (loop->fd[i] == fd) {
            event_idx = i;
            return event_idx;
        }
    }

    return event_idx;
}
static getdns_return_t
select_eventloop_schedule(getdns_eventloop *loop,
    int fd, uint64_t timeout, getdns_eventloop_event *event)
{
    _getdns_select_eventloop *select_loop  = (_getdns_select_eventloop *)loop;
    size_t i;
    uint64_t event_idx = FD_SETSIZE;

    DEBUG_SCHED( "%s(loop: %p, fd: %d, timeout: %"PRIu64", event: %p, FD_SETSIZE: %d)\n"
            , __FUNC__, (void *)loop, fd, timeout, (void *)event, FD_SETSIZE);

    if (!loop || !event)
        return GETDNS_RETURN_INVALID_PARAMETER;

    if (select_loop->fd_index >= (int)FD_SETSIZE) {
        DEBUG_SCHED("ERROR: can't schedule anymore file descriptors! index %d >= FD_SETSIZE: %d!\n"
            , select_loop->fd_index, FD_SETSIZE);
        return GETDNS_RETURN_GENERIC_ERROR;
    }

    if (fd >= 0 && !(event->read_cb || event->write_cb)) {
        DEBUG_SCHED("WARNING: fd event without "
                    "read or write cb!\n");
        fd = -1;
    }

    event_idx = find_getdns_eventloop_event_index(select_loop, fd);
    if (fd >= 0) {
        if (select_loop->fd_events[event_idx]) {
            if (select_loop->fd_events[event_idx] == event) {
                DEBUG_SCHED("WARNING: Event %p not cleared "
                    "before being rescheduled!\n"
                    , (void*)select_loop->fd_events[event_idx]);
            } else {
                DEBUG_SCHED("ERROR: A different event is "
                    "already present at fd slot: %p!\n"
                    , (void*)select_loop->fd_events[event_idx]);
            }
        }

        select_loop->fd_events[event_idx] = event;
        select_loop->fd_timeout_times[event_idx] = get_now_plus(timeout);
        event->ev = (void*)(intptr_t)(fd + 1);

        DEBUG_SCHED("scheduled read/write at %d\n", fd);
        return GETDNS_RETURN_GOOD;
    }
    if (!event->timeout_cb) {
        DEBUG_SCHED("ERROR: fd < 0 without timeout_cb!\n");
        return GETDNS_RETURN_GENERIC_ERROR;
    }
    if (event->read_cb) {
        DEBUG_SCHED("ERROR: timeout event with read_cb! Clearing.\n");
        event->read_cb = NULL;
    }
    if (event->write_cb) {
        DEBUG_SCHED("ERROR: timeout event with write_cb! Clearing.\n");
        event->write_cb = NULL;
    }
    for (i = 0; i < MAX_TIMEOUTS; i++) {
        if (select_loop->timeout_events[i] == NULL) {
            select_loop->timeout_events[i] = event;
            select_loop->timeout_times[i] = get_now_plus(timeout);
            event->ev = (void*)(intptr_t)(select_loop->fd[i] + 1);
            DEBUG_SCHED("scheduled timeout at %d\n", (int)i);
            return GETDNS_RETURN_GOOD;
        }
    }
}

Since there's no documentation I'm not sure how select_eventloop_schedule is supposed to function. I get the impression that a file descriptor of -1 does something special.

It seems like

event->ev

could be used to just store the file descriptor rather than transforming it with +1 and -1 respectively when read. I'm also not sure what that's about.

select_eventloop_run_once might look like this:

static void
select_eventloop_run_once(getdns_eventloop *loop, int blocking)
{
    _getdns_select_eventloop *select_loop  = (_getdns_select_eventloop *)loop;

    fd_set   readfds, writefds;
    int      fd, max_fd = -1;
    uint64_t now, timeout = TIMEOUT_FOREVER;
    size_t   i;
    struct timeval tv;

    if (!loop)
        return;

    FD_ZERO(&readfds);
    FD_ZERO(&writefds);
    now = get_now_plus(0);

    for (i = 0; i < MAX_TIMEOUTS; i++) {
        if (!select_loop->timeout_events[i])
            continue;
        if (now > select_loop->timeout_times[i])
            select_timeout_cb(-1, select_loop->timeout_events[i]);
        else if (select_loop->timeout_times[i] < timeout)
            timeout = select_loop->timeout_times[i];
    }

    for (i = 0; i < select_loop->fd_index && i < FD_SETSIZE; i++) {
        if (!select_loop->fd_events[i])
            continue;
        if (select_loop->fd_events[i]->read_cb)
            FD_SET(select_loop->fd[i], &readfds);
        if (select_loop->fd_events[i]->write_cb)
            FD_SET(select_loop->fd[i], &writefds);
        if (select_loop->fd[i] > max_fd)
            max_fd = select_loop->fd[i];
        if (select_loop->fd_timeout_times[i] < timeout)
            timeout = select_loop->fd_timeout_times[i];
    }

    if (max_fd == -1 && timeout == TIMEOUT_FOREVER)
        return;

    if (! blocking || now > timeout) {
        tv.tv_sec = 0;
        tv.tv_usec = 0;
    } else {
        tv.tv_sec  = (long)((timeout - now) / 1000000);
        tv.tv_usec = (long)((timeout - now) % 1000000);
    }
#ifdef USE_WINSOCK
    if (max_fd == -1) {
        if (timeout != TIMEOUT_FOREVER) {
            uint32_t timeout_ms = (tv.tv_usec / 1000) + (tv.tv_sec * 1000);
            Sleep(timeout_ms);
            }
    } else {
#endif
    if (select(max_fd + 1, &readfds, &writefds, NULL,
           ((blocking && timeout == TIMEOUT_FOREVER) ? NULL : &tv)) < 0) {
        if (_getdns_socketerror_wants_retry())
            return;

        DEBUG_SCHED("I/O error with select(): %s\n", _getdns_errnostr());
        return;
    }
#ifdef USE_WINSOCK
    }
#endif
    now = get_now_plus(0);
    for (i = 0; i < select_loop->fd_index && i < FD_SETSIZE; i++) {
        int current_fd = select_loop->fd[i];
        if (select_loop->fd_events[i] &&
            select_loop->fd_events[i]->read_cb &&
            FD_ISSET(current_fd, &readfds))
            select_read_cb(current_fd, select_loop->fd_events[i]);

        if (select_loop->fd_events[i] &&
            select_loop->fd_events[i]->write_cb &&
            FD_ISSET(current_fd, &writefds))
            select_write_cb(current_fd, select_loop->fd_events[i]);

        if (select_loop->fd_events[i] &&
            select_loop->fd_events[i]->timeout_cb &&
            now > select_loop->fd_timeout_times[i])
            select_timeout_cb(current_fd, select_loop->fd_events[i]);

        if (select_loop->fd_events[i] &&
            select_loop->fd_events[i]->timeout_cb &&
            now > select_loop->timeout_times[i])
            select_timeout_cb(-1, select_loop->timeout_events[i]);
    }
}

and select_eventloop_run might include a check against the current number of fd's being handled.

static void
select_eventloop_run(getdns_eventloop *loop)
{
    _getdns_select_eventloop *select_loop  = (_getdns_select_eventloop *)loop;
    size_t        i;

    if (!loop)
        return;

    i = 0;
    while (i < select_loop->fd_index && i < MAX_TIMEOUTS) {
        if (select_loop->fd_events[i] || select_loop->timeout_events[i]) {
            select_eventloop_run_once(loop, 1);
            i = 0;
        } else {
            i++;
        }
    }
}

Something like this should resolve #527

Andersama commented 1 year ago

I'm closing, took a while to try to test the pooling event loop, but that one appears to work just fine. Didn't have luck configuring cmake to get it to work, I'm not sure why it seemed like there were only two settings to change to use the polling code instead. Since I was just testing, instead I just overwrote the select source code with the polling one.

Although I'm closing this, I figured I should leave a note. There is an infinite loop bug in the select eventloop source code*. Since select eventloop works by indexing with the filedescriptor a solution might be to have a maximum number of file descriptors it can handle, with an additional fallback slot that handles or produces some error or otherwise keeps the server running smoothly. EG:

  int filedescriptor = ...;
  if (filedescriptor > FD_SETSIZE) {
    filedescriptor = FD_SETSIZE + 1
  }
  fd_events[filedescriptor]; //where the additional slot is specifically reserved for handling the FD_SETSIZE error

The bug appears to be that timeout callbacks aren't set when it fails early due to the filedescriptor being too large. The result appears to be that the processing for the timeouts gets stuck in a way where all the events appear to be treated as if they have an infinite timeout. When the event loop runs it turns into an infinite cycle.