int uv_run(uv_loop_t* loop, uv_run_mode mode) {
int timeout;
int r;
int ran_pending;
// Check whether there are any active handles or requests
r = uv__loop_alive(loop);
if (!r)
uv__update_time(loop);
while (r != 0 && loop->stop_flag == 0) {
uv__update_time(loop);
uv__run_timers(loop);
// run pending queue
ran_pending = uv__run_pending(loop);
// Both generated by UV_LOOP_WATCHER_DEFINE; they drain their respective queues
uv__run_idle(loop);
uv__run_prepare(loop);
timeout = 0;
if ((mode == UV_RUN_ONCE && !ran_pending) || mode == UV_RUN_DEFAULT)
// Check whether there are still active handles; returns the time remaining until the next timer fires
timeout = uv_backend_timeout(loop);
uv__io_poll(loop, timeout);
uv__run_check(loop);
uv__run_closing_handles(loop);
if (mode == UV_RUN_ONCE) {
/* UV_RUN_ONCE implies forward progress: at least one callback must have
* been invoked when it returns. uv__io_poll() can return without doing
* I/O (meaning: no callbacks) when its timeout expires - which means we
* have pending timers that satisfy the forward progress constraint.
*
* UV_RUN_NOWAIT makes no guarantees about progress so it's omitted from
* the check.
*/
uv__update_time(loop);
uv__run_timers(loop);
}
r = uv__loop_alive(loop);
if (mode == UV_RUN_ONCE || mode == UV_RUN_NOWAIT)
break;
}
/* The if statement lets gcc compile it to a conditional store. Avoids
* dirtying a cache line.
*/
if (loop->stop_flag != 0)
loop->stop_flag = 0;
return r;
}
void uv__io_poll(uv_loop_t* loop, int timeout) {
/* A bug in kernels < 2.6.37 makes timeouts larger than ~30 minutes
* effectively infinite on 32 bits architectures. To avoid blocking
* indefinitely, we cap the timeout and poll again if necessary.
*
* Note that "30 minutes" is a simplification because it depends on
* the value of CONFIG_HZ. The magic constant assumes CONFIG_HZ=1200,
* that being the largest value I have seen in the wild (and only once.)
*/
static const int max_safe_timeout = 1789569;
static int no_epoll_pwait;
static int no_epoll_wait;
struct uv__epoll_event events[1024];
struct uv__epoll_event* pe;
struct uv__epoll_event e;
int real_timeout;
QUEUE* q;
uv__io_t* w;
sigset_t sigset;
uint64_t sigmask;
uint64_t base;
int have_signals;
int nevents;
int count;
int nfds;
int fd;
int op;
int i;
// loop->watchers[w->fd] = w in uv__io_start func
if (loop->nfds == 0) {
assert(QUEUE_EMPTY(&loop->watcher_queue));
return;
}
// Take each fd off the watcher queue and register it with epoll via uv__epoll_ctl
while (!QUEUE_EMPTY(&loop->watcher_queue)) {
q = QUEUE_HEAD(&loop->watcher_queue);
QUEUE_REMOVE(q);
QUEUE_INIT(q);
// QUEUE_DATA works like container_of
w = QUEUE_DATA(q, uv__io_t, watcher_queue);
assert(w->pevents != 0);
assert(w->fd >= 0);
assert(w->fd < (int) loop->nwatchers);
e.events = w->pevents;
e.data = w->fd;
if (w->events == 0)
op = UV__EPOLL_CTL_ADD;
else
op = UV__EPOLL_CTL_MOD;
/* XXX Future optimization: do EPOLL_CTL_MOD lazily if we stop watching
* events, skip the syscall and squelch the events after epoll_wait().
*/
// fd = uv__epoll_create1(UV__EPOLL_CLOEXEC); loop->backend_fd = fd;
if (uv__epoll_ctl(loop->backend_fd, op, w->fd, &e)) {
if (errno != EEXIST)
abort();
assert(op == UV__EPOLL_CTL_ADD);
/* We've reactivated a file descriptor that's been watched before. */
if (uv__epoll_ctl(loop->backend_fd, UV__EPOLL_CTL_MOD, w->fd, &e))
abort();
}
w->events = w->pevents;
}
sigmask = 0;
if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
sigemptyset(&sigset);
sigaddset(&sigset, SIGPROF);
sigmask |= 1 << (SIGPROF - 1);
}
assert(timeout >= -1);
base = loop->time;
count = 48; /* Benchmarks suggest this gives the best throughput. */
real_timeout = timeout;
for (;;) {
/* See the comment for max_safe_timeout for an explanation of why
* this is necessary. Executive summary: kernel bug workaround.
*/
if (sizeof(int32_t) == sizeof(long) && timeout >= max_safe_timeout)
timeout = max_safe_timeout;
if (sigmask != 0 && no_epoll_pwait != 0)
if (pthread_sigmask(SIG_BLOCK, &sigset, NULL))
abort();
if (no_epoll_wait != 0 || (sigmask != 0 && no_epoll_pwait == 0)) {
// Returns the number of events that are ready to be handled
nfds = uv__epoll_pwait(loop->backend_fd,
events,
ARRAY_SIZE(events),
timeout,
sigmask);
if (nfds == -1 && errno == ENOSYS)
no_epoll_pwait = 1;
} else {
nfds = uv__epoll_wait(loop->backend_fd,
events,
ARRAY_SIZE(events),
timeout);
if (nfds == -1 && errno == ENOSYS)
no_epoll_wait = 1;
}
if (sigmask != 0 && no_epoll_pwait != 0)
if (pthread_sigmask(SIG_UNBLOCK, &sigset, NULL))
abort();
/* Update loop->time unconditionally. It's tempting to skip the update when
* timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
* operating system didn't reschedule our process while in the syscall.
*/
SAVE_ERRNO(uv__update_time(loop));
if (nfds == 0) {
assert(timeout != -1);
if (timeout == 0)
return;
/* We may have been inside the system call for longer than |timeout|
* milliseconds so we need to update the timestamp to avoid drift.
*/
// No events ready to handle
goto update_timeout;
}
if (nfds == -1) {
if (errno == ENOSYS) {
/* epoll_wait() or epoll_pwait() failed, try the other system call. */
assert(no_epoll_wait == 0 || no_epoll_pwait == 0);
continue;
}
if (errno != EINTR)
abort();
if (timeout == -1)
continue;
if (timeout == 0)
return;
/* Interrupted by a signal. Update timeout and poll again. */
goto update_timeout;
}
have_signals = 0;
nevents = 0;
assert(loop->watchers != NULL);
loop->watchers[loop->nwatchers] = (void*) events;
loop->watchers[loop->nwatchers + 1] = (void*) (uintptr_t) nfds;
for (i = 0; i < nfds; i++) {
pe = events + i;
// (*pe).data
fd = pe->data;
/* Skip invalidated events, see uv__platform_invalidate_fd */
if (fd == -1)
continue;
assert(fd >= 0);
assert((unsigned) fd < loop->nwatchers);
w = loop->watchers[fd];
if (w == NULL) {
/* File descriptor that we've stopped watching, disarm it.
*
* Ignore all errors because we may be racing with another thread
* when the file descriptor is closed.
*/
// Remove the fd from the epoll instance (the kernel's red-black tree of watched fds)
uv__epoll_ctl(loop->backend_fd, UV__EPOLL_CTL_DEL, fd, pe);
continue;
}
/* Give users only events they're interested in. Prevents spurious
* callbacks when previous callback invocation in this loop has stopped
* the current watcher. Also, filters out events that the user has not
* requested us to watch.
*/
pe->events &= w->pevents | POLLERR | POLLHUP;
/* Work around an epoll quirk where it sometimes reports just the
* EPOLLERR or EPOLLHUP event. In order to force the event loop to
* move forward, we merge in the read/write events that the watcher
* is interested in; uv__read() and uv__write() will then deal with
* the error or hangup in the usual fashion.
*
* Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
* reads the available data, calls uv_read_stop(), then sometime later
* calls uv_read_start() again. By then, libuv has forgotten about the
* hangup and the kernel won't report EPOLLIN again because there's
* nothing left to read. If anything, libuv is to blame here. The
* current hack is just a quick bandaid; to properly fix it, libuv
* needs to remember the error/hangup event. We should get that for
* free when we switch over to edge-triggered I/O.
*/
if (pe->events == POLLERR || pe->events == POLLHUP)
pe->events |= w->pevents & (POLLIN | POLLOUT | UV__POLLPRI);
if (pe->events != 0) {
/* Run signal watchers last. This also affects child process watchers
* because those are implemented in terms of signal watchers.
*/
if (w == &loop->signal_io_watcher)
have_signals = 1;
else
// e.g. uv__async_io, registered via uv__io_init in uv__async_start
w->cb(loop, w, pe->events);
nevents++;
}
}
if (have_signals != 0)
loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
loop->watchers[loop->nwatchers] = NULL;
loop->watchers[loop->nwatchers + 1] = NULL;
if (have_signals != 0)
return; /* Event loop should cycle now so don't poll again. */
if (nevents != 0) {
if (nfds == ARRAY_SIZE(events) && --count != 0) {
/* Poll for more events but don't block this time. */
timeout = 0;
continue;
}
return;
}
if (timeout == 0)
return;
if (timeout == -1)
continue;
update_timeout:
assert(timeout > 0);
real_timeout -= (loop->time - base);
if (real_timeout <= 0)
return;
timeout = real_timeout;
}
}
void uv__io_start(uv_loop_t* loop, uv__io_t* w, unsigned int events) {
assert(0 == (events & ~(POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI)));
assert(0 != events);
assert(w->fd >= 0);
assert(w->fd < INT_MAX);
w->pevents |= events;
maybe_resize(loop, w->fd + 1);
#if !defined(__sun)
/* The event ports backend needs to rearm all file descriptors on each and
* every tick of the event loop but the other backends allow us to
* short-circuit here if the event mask is unchanged.
*/
if (w->events == w->pevents)
return;
#endif
if (QUEUE_EMPTY(&w->watcher_queue))
QUEUE_INSERT_TAIL(&loop->watcher_queue, &w->watcher_queue);
if (loop->watchers[w->fd] == NULL) {
loop->watchers[w->fd] = w;
loop->nfds++;
}
}
This article mainly introduces libuv's event loop, including the flow of the loop. As we know, libuv implements network I/O with the platform's polling mechanism and file I/O with a thread pool, and the threads in turn communicate with the loop through that same polling mechanism. The second half of the article describes how the thread pool is tied into the event loop.
Event loop flow
The flow of the event loop is roughly as follows; the corresponding code is the uv_run function quoted at the top of this article.
The event loop can be broken down into the following steps:
1. uv__loop_alive checks whether the loop is still alive; if not, uv_run returns immediately;
2. uv__update_time refreshes loop->time;
3. uv__run_timers runs every timer whose timeout has already expired;
4. uv__run_pending drains the callbacks queued on loop->pending_queue;
5. uv__run_idle and uv__run_prepare run the idle and prepare handles;
6. uv_backend_timeout computes how long the poll phase may block, and uv__io_poll polls for I/O with that timeout;
7. uv__run_check runs the check handles, and uv__run_closing_handles runs the close callbacks of handles that are being closed;
8. in UV_RUN_ONCE mode timers are run once more, so that the call always makes forward progress.
The event loop terminates when one of the following holds:
- the loop is no longer alive, i.e. there are no active handles, no active requests and no closing handles;
- loop->stop_flag has been set by uv_stop();
- the mode is UV_RUN_ONCE or UV_RUN_NOWAIT, in which case uv_run breaks out after a single iteration.
Below we pick out the important points and go through them one by one.
Determining whether the loop is alive
Whether the loop is alive depends on whether there are any active handles or requests, or handles still closing, and on whether the loop has been stopped explicitly. The code is as follows:
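A sketch of uv__loop_alive following the libuv Unix source (src/unix/core.c); details may differ slightly between versions:
static int uv__loop_alive(const uv_loop_t* loop) {
  /* Alive as long as there is at least one active handle, one active
   * request, or a handle that is still in the process of closing. */
  return uv__has_active_handles(loop) ||
         uv__has_active_reqs(loop) ||
         loop->closing_handles != NULL;
}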
uv__run_timers
The code of uv__run_timers is as follows:
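A lightly commented sketch based on the libuv source (src/timer.c); the heap helpers and casts may differ slightly between versions:
void uv__run_timers(uv_loop_t* loop) {
  struct heap_node* heap_node;
  uv_timer_t* handle;

  for (;;) {
    /* Peek at the timer with the smallest timeout. */
    heap_node = heap_min((struct heap*) &loop->timer_heap);
    if (heap_node == NULL)
      break;

    handle = container_of(heap_node, uv_timer_t, heap_node);
    if (handle->timeout > loop->time)
      break;  /* the earliest timer is still in the future, stop */

    uv_timer_stop(handle);
    uv_timer_again(handle);   /* re-arm repeating timers */
    handle->timer_cb(handle);
  }
}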
Notice that the data structure storing timer nodes is a min-heap ordered by handle->timeout. Each iteration of the loop does the following: it peeks at the minimum node of the heap; if that timer's timeout still lies in the future it breaks out of the loop; otherwise it stops the timer, re-arms it via uv_timer_again when it is a repeating timer, and finally invokes its callback.
uv__run_pending
uv__run_pending simply takes the callbacks queued on loop->pending_queue and executes them. The code is as follows:
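A sketch following the libuv Unix source (src/unix/core.c); some versions iterate loop->pending_queue directly instead of moving it to a local queue first:
static int uv__run_pending(uv_loop_t* loop) {
  QUEUE* q;
  QUEUE pq;
  uv__io_t* w;

  if (QUEUE_EMPTY(&loop->pending_queue))
    return 0;

  /* Move the whole queue aside so callbacks can safely re-queue work. */
  QUEUE_MOVE(&loop->pending_queue, &pq);

  while (!QUEUE_EMPTY(&pq)) {
    q = QUEUE_HEAD(&pq);
    QUEUE_REMOVE(q);
    QUEUE_INIT(q);
    w = QUEUE_DATA(q, uv__io_t, pending_queue);
    w->cb(loop, w, POLLOUT);
  }

  return 1;
}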
The uv__run_idle and uv__run_prepare that follow work in a similar way.
poll I/O
Poll I/O is the heart of the event loop. It is built on I/O multiplexing: all network operations use non-blocking sockets together with the best polling mechanism each platform offers, such as epoll on Linux and kqueue on OSX. All file I/O, on the other hand, is implemented on top of a thread pool, but the communication between threads again relies on the same polling mechanism.
The uv__io_poll shown here is the Linux implementation based on epoll; the implementations for other platforms are similar. The full code is quoted at the top of this article.
The function mainly does the following: it takes every watcher off loop->watcher_queue and registers (or updates) its fd with epoll via uv__epoll_ctl; it then calls uv__epoll_pwait / uv__epoll_wait with the computed timeout; finally it walks the returned events, looks up the corresponding watcher in loop->watchers, and invokes its callback, re-polling or adjusting the timeout as necessary. The points worth noting are covered below.
loop->backend_fd
In uv__epoll_ctl(loop->backend_fd, op, w->fd, &e), readers familiar with epoll will recognize loop->backend_fd as the file descriptor of the epoll instance itself: the kernel keeps the set of watched file descriptors for that instance in a red-black tree, and backend_fd is the handle through which libuv refers to it.
It is assigned in uv__platform_loop_init, whose code is as follows:
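A sketch following the older libuv Linux source (src/unix/linux-core.c) that this article appears to be based on; newer versions call epoll_create1 directly:
int uv__platform_loop_init(uv_loop_t* loop) {
  int fd;

  fd = uv__epoll_create1(UV__EPOLL_CLOEXEC);

  /* epoll_create1() can fail either because it's not implemented (old kernel)
   * or because it doesn't understand the EPOLL_CLOEXEC flag.
   */
  if (fd == -1 && (errno == ENOSYS || errno == EINVAL)) {
    fd = uv__epoll_create(256);
    if (fd != -1)
      uv__cloexec(fd, 1);
  }

  loop->backend_fd = fd;     /* the epoll instance used by uv__io_poll */
  loop->inotify_fd = -1;
  loop->inotify_watchers = NULL;

  if (fd == -1)
    return -errno;

  return 0;
}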
loop->watchers
epoll obtains the events that are ready through uv__epoll_pwait; the events array passed in is filled by the kernel with the set of ready events on return, which is one of epoll's advantages: only the ready events are returned instead of the whole watched set. We take the fd out of each event, use it to look up the corresponding handle in loop->watchers, and run its callback. So how is loop->watchers initialized?
It is populated in uv__io_start. loop->watchers is an array indexed by fd, so given an fd we can look up its uv__io_t handle directly; the array is grown on demand, as the maybe_resize sketch below shows.
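A sketch of maybe_resize following the libuv source (src/unix/core.c); note the two extra slots past loop->nwatchers, which uv__io_poll uses to stash the current events array and event count:
static void maybe_resize(uv_loop_t* loop, unsigned int len) {
  uv__io_t** watchers;
  void* fake_watcher_list;
  void* fake_watcher_count;
  unsigned int nwatchers;
  unsigned int i;

  if (len <= loop->nwatchers)
    return;

  /* Preserve the fake watcher list and count at the end of the array. */
  if (loop->watchers != NULL) {
    fake_watcher_list = loop->watchers[loop->nwatchers];
    fake_watcher_count = loop->watchers[loop->nwatchers + 1];
  } else {
    fake_watcher_list = NULL;
    fake_watcher_count = NULL;
  }

  nwatchers = next_power_of_two(len + 2) - 2;
  watchers = uv__realloc(loop->watchers,
                         (nwatchers + 2) * sizeof(loop->watchers[0]));
  if (watchers == NULL)
    abort();

  for (i = loop->nwatchers; i < nwatchers; i++)
    watchers[i] = NULL;
  watchers[nwatchers] = fake_watcher_list;
  watchers[nwatchers + 1] = fake_watcher_count;

  loop->watchers = watchers;
  loop->nwatchers = nwatchers;
}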
uv__io_start is used in many places: uv__async_start calls it to watch the fd used for inter-thread communication, and the tcp and udp modules also use it to watch their fds.
As we can see, all I/O events go through uv__io_start, which saves the watcher for the events to be monitored onto the event loop's watcher_queue.
Timeouts
Note that uv__io_poll itself blocks. To bound the blocking, a timeout argument is passed in, which is the time remaining until the next timer is due. When no events arrive, the time already spent inside uv__io_poll is measured against that budget to decide whether to return or poll again; the update_timeout logic at the end of uv__io_poll (quoted above) subtracts the elapsed time from real_timeout and returns once the budget is exhausted.
Asynchronous file I/O on the thread pool
libuv's file I/O is implemented on top of a thread pool. The rough idea is: the main thread submits a task to the task queue and signals the pool; a worker thread picks the task up from the queue and executes it; when the work is done, the worker marks the loop's uv_async_t handle as pending and notifies the main thread by writing to an fd that is also managed by epoll. When the main thread's poll reports that fd, it runs the callbacks of the async handles whose pending flag is set (resetting the flag back to 0), and through a chain of callbacks the user-registered callback is finally invoked.
Speaking of thread pools, almost every implementation follows the same model, a task queue plus a pool of worker threads, and libuv's implementation is no exception.
The task queue in libuv is a doubly linked list, and each task is declared with the following struct:
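As declared in libuv's src/uv-common.h (field order may vary slightly across versions):
struct uv__work {
  void (*work)(struct uv__work* w);              /* runs on a worker thread */
  void (*done)(struct uv__work* w, int status);  /* runs back on the loop thread */
  struct uv_loop_s* loop;
  void* wq[2];                                   /* links into the task queue */
};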
Here work is the function the worker thread actually runs, done is the callback invoked after the task has finished, and wq is a pair of pointers linking the task to the previous and next nodes in the task queue.
Let's first look at how the main thread submits a task to the task queue.
In fs.c every file operation ends up invoking the POST macro, whose code is as follows:
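A sketch of the POST macro from src/unix/fs.c in the libuv version this article seems to describe; newer releases pass an extra work-kind argument to uv__work_submit:
#define POST                                                                  \
  do {                                                                        \
    if (cb != NULL) {                                                         \
      /* Asynchronous: hand the request over to the thread pool. */           \
      uv__work_submit(loop, &req->work_req, uv__fs_work, uv__fs_done);        \
      return 0;                                                               \
    }                                                                         \
    else {                                                                    \
      /* Synchronous: run the work inline and return its result. */           \
      uv__fs_work(&req->work_req);                                            \
      return req->result;                                                     \
    }                                                                         \
  }                                                                           \
  while (0)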
The POST macro calls uv__work_submit to push the task onto the queue. Its code is as follows:
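A sketch following src/threadpool.c of the same era; newer releases add a work-kind parameter and per-kind queues:
void uv__work_submit(uv_loop_t* loop,
                     struct uv__work* w,
                     void (*work)(struct uv__work* w),
                     void (*done)(struct uv__work* w, int status)) {
  uv_once(&once, init_once);   /* lazily create the worker threads */
  w->loop = loop;
  w->work = work;
  w->done = done;
  post(&w->wq);
}

static void post(QUEUE* q) {
  uv_mutex_lock(&mutex);
  QUEUE_INSERT_TAIL(&wq, q);   /* append the task to the global queue */
  if (idle_threads > 0)
    uv_cond_signal(&cond);     /* wake an idle worker, if any */
  uv_mutex_unlock(&mutex);
}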
It mainly does two things: it lazily initializes the thread pool through uv_once(&once, init_once), which creates the worker threads on first use, and it appends the task to the global queue wq under the mutex, signalling the condition variable when an idle worker is waiting.
On the worker side, the logic around uv__work_submit is as follows: each worker created by init_once sits in a loop, waits on the condition variable until the queue is non-empty, dequeues a task, runs its work function, then appends the finished task to loop->wq and calls uv_async_send to wake the loop thread. The points to pay attention to are the locking around the two queues and how the result is handed back to the loop thread, as the worker() sketch below shows.
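A sketch of the worker thread body, following src/threadpool.c of older libuv; the exit_message handling (used when the pool is torn down) is kept for completeness:
static void worker(void* arg) {
  struct uv__work* w;
  QUEUE* q;

  (void) arg;

  for (;;) {
    uv_mutex_lock(&mutex);

    /* Sleep until a task (or the exit message) is queued. */
    while (QUEUE_EMPTY(&wq)) {
      idle_threads += 1;
      uv_cond_wait(&cond, &mutex);
      idle_threads -= 1;
    }

    q = QUEUE_HEAD(&wq);

    if (q == &exit_message)
      uv_cond_signal(&cond);
    else {
      QUEUE_REMOVE(q);
      QUEUE_INIT(q);  /* signals uv_cancel() that the work req is executing */
    }

    uv_mutex_unlock(&mutex);

    if (q == &exit_message)
      break;

    /* Run the actual work outside the lock. */
    w = QUEUE_DATA(q, struct uv__work, wq);
    w->work(w);

    /* Hand the finished task back to its loop and wake the loop thread. */
    uv_mutex_lock(&w->loop->wq_mutex);
    w->work = NULL;  /* signals uv_cancel() that the work req is done */
    QUEUE_INSERT_TAIL(&w->loop->wq, &w->wq);
    uv_async_send(&w->loop->wq_async);
    uv_mutex_unlock(&w->loop->wq_mutex);
  }
}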
Now let's look at how a worker thread notifies the main thread after it has finished a task, i.e. the uv_async_send call mentioned above:
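A sketch following src/unix/async.c; ACCESS_ONCE and cmpxchgi are libuv's internal atomics helpers:
int uv_async_send(uv_async_t* handle) {
  /* Do a cheap read first. */
  if (ACCESS_ONCE(int, handle->pending) != 0)
    return 0;

  /* Atomically flip pending from 0 to 1; only the caller that wins the
   * race actually wakes up the loop. */
  if (cmpxchgi(&handle->pending, 0, 1) == 0)
    uv__async_send(handle->loop);   /* write to the eventfd/pipe */

  return 0;
}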
It mainly does the following: it checks the handle's pending flag with a cheap read, atomically sets it from 0 to 1, and only if it is the caller that flipped the flag does it call uv__async_send, which writes to the fd watched by the loop's async I/O watcher, waking up epoll in the main thread.
When the main thread observes activity on the async watcher's fd (loop->async_io_watcher.fd), it goes through a chain of callbacks and eventually calls the uv__work's done function, i.e. the user-registered callback. Let's first walk through how those callbacks are registered, from front to back.
In execution order, the registration does the following: uv_loop_init calls uv_async_init(loop, &loop->wq_async, uv__work_done), registering uv__work_done as the async callback that will drain the finished-work queue; uv_async_init in turn calls uv__async_start, which creates the eventfd (or pipe), initializes loop->async_io_watcher with uv__async_io as its I/O callback via uv__io_init, and starts watching it with uv__io_start, as the sketch below shows.
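A condensed sketch of that registration chain, following src/unix/loop.c and src/unix/async.c; error handling is trimmed and the exact field layout differs between libuv versions:
/* uv_loop_init (src/unix/loop.c): register uv__work_done as the callback
 * for the thread pool's "work finished" notifications. */
err = uv_async_init(loop, &loop->wq_async, uv__work_done);

/* uv_async_init (src/unix/async.c) */
int uv_async_init(uv_loop_t* loop, uv_async_t* handle, uv_async_cb async_cb) {
  int err;

  err = uv__async_start(loop);             /* set up the wakeup fd once */
  if (err)
    return err;

  uv__handle_init(loop, (uv_handle_t*) handle, UV_ASYNC);
  handle->async_cb = async_cb;             /* here: uv__work_done */
  handle->pending = 0;

  QUEUE_INSERT_TAIL(&loop->async_handles, &handle->queue);
  uv__handle_start(handle);

  return 0;
}

/* uv__async_start: create the eventfd (or pipe) and watch it with
 * uv__async_io via the generic uv__io_init/uv__io_start machinery. */
static int uv__async_start(uv_loop_t* loop) {
  int pipefd[2];
  int err;

  if (loop->async_io_watcher.fd != -1)
    return 0;                              /* already started */

  err = uv__async_eventfd();               /* prefer eventfd */
  if (err >= 0) {
    pipefd[0] = err;
    pipefd[1] = -1;
  } else {
    err = uv__make_pipe(pipefd, UV__F_NONBLOCK);  /* fall back to a pipe */
    if (err < 0)
      return err;
  }

  uv__io_init(&loop->async_io_watcher, uv__async_io, pipefd[0]);
  uv__io_start(loop, &loop->async_io_watcher, POLLIN);
  loop->async_wfd = pipefd[1];

  return 0;
}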
Now let's trace how, once the main thread receives the event, the chain of callbacks unwinds until the uv__work's done, i.e. the callback submitted by the user, is finally executed.
In uv__io_poll, once uv__epoll_pwait reports the event, the loop uses the fd to look up in loop->watchers the uv__io_t registered by uv__io_start (i.e. the loop->async_io_watcher registered above) and then invokes its registered callback, uv__async_io.
The code of uv__async_io is as follows:
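A sketch following src/unix/async.c; newer versions differ slightly in how they handle spurious wakeups:
static void uv__async_io(uv_loop_t* loop, uv__io_t* w, unsigned int events) {
  char buf[1024];
  ssize_t r;
  QUEUE queue;
  QUEUE* q;
  uv_async_t* h;

  assert(w == &loop->async_io_watcher);

  /* Drain the eventfd/pipe so epoll will not keep reporting it. */
  for (;;) {
    r = read(w->fd, buf, sizeof(buf));

    if (r == sizeof(buf))
      continue;
    if (r != -1)
      break;
    if (errno == EAGAIN || errno == EWOULDBLOCK)
      break;
    if (errno == EINTR)
      continue;
    abort();
  }

  /* Fire the callback of every async handle whose pending flag we manage
   * to flip from 1 back to 0. */
  QUEUE_MOVE(&loop->async_handles, &queue);
  while (!QUEUE_EMPTY(&queue)) {
    q = QUEUE_HEAD(&queue);
    h = QUEUE_DATA(q, uv_async_t, queue);

    QUEUE_REMOVE(q);
    QUEUE_INSERT_TAIL(&loop->async_handles, q);

    if (cmpxchgi(&h->pending, 1, 0) == 0)
      continue;                 /* was not pending, skip */

    if (h->async_cb == NULL)
      continue;

    h->async_cb(h);             /* here: uv__work_done for loop->wq_async */
  }
}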
It mainly does two things: it drains the wakeup fd so the event is not reported again, and it walks loop->async_handles, invoking async_cb (here uv__work_done, which in turn calls each finished uv__work's done callback) for every handle whose pending flag it atomically resets from 1 to 0.
Summary
Since Node.js's asynchronous I/O relies on libuv, and the core of libuv is the event loop, this article has walked through the flow of the event loop and the implementation of the thread pool.