poll(2) 源碼分析
- 2019 年 10 月 12 日
- 筆記
poll(2)
poll(2) 系統調用的功能和 select(2) 類似:等待一個文件集合中的文件描述符就緒進行I/O操作。
select(2) 的局限性:
- 關注的文件描述符集合大小最大只有 1024
- 文件描述符集合為順序的,不能任意指定 fd,浪費佔用的fd
poll(2) 對 select(2) 的改進:關注的文件描述符集合為動態大小,文件描述符可以任意指定。
struct pollfd {
    int   fd;       /* file descriptor */
    short events;   /* requested events */
    short revents;  /* returned events */
};
- fd 為關注的文件描述符
- events 為關注的事件(輸入),使用位掩碼來表示事件
- revents 為就緒的事件(輸出),同樣使用位掩碼表示
#include <poll.h>
int poll(struct pollfd *fds, nfds_t nfds, int timeout);
- fds 為文件描述符集合的地址
- nfds 為文件描述符集合的長度
- timeout 為超時的時間,單位為毫秒
返回值為 revents 不為 0 的個數,出錯返回 -1
一個簡單的例子:等待標準輸入就緒,超時時間為3s。
#include <poll.h>
#include <unistd.h>
#include <stdio.h>

/*
 * Minimal poll(2) demo: wait up to 3000 ms for standard input to become
 * readable, then print which of the three outcomes occurred
 * (error, data ready, or timeout).
 */
int main(void)
{
    int timeout = 3000; /* milliseconds */

    /* Watch standard input (fd 0) for readable data. */
    struct pollfd fds = { .fd = STDIN_FILENO, .events = POLLIN };

    int ret = poll(&fds, 1, timeout);
    if (ret == -1)
        printf("error poll\n");              /* poll() itself failed */
    else if (ret)
        printf("data is available now.\n");  /* revents is non-zero */
    else
        printf("no data within 3000 ms.\n"); /* timed out, nothing ready */

    return 0;
}
實現
程式碼位於 fs/select.c 中,文末參考中的鏈接有一些關於文件回調和 poll 結構的說明
poll()
/*
 * poll(2) system call entry point.
 *
 * ufds          - user-space array of struct pollfd
 * nfds          - number of entries in ufds
 * timeout_msecs - timeout in milliseconds; when negative, no end time is
 *                 computed and a NULL timeout is handed to do_sys_poll()
 *
 * Returns whatever do_sys_poll() returns, except that -EINTR is turned
 * into -ERESTART_RESTARTBLOCK after the restart block has been filled in.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	struct timespec64 end_time, *to = NULL;
	int ret;

	/* Split the millisecond timeout into seconds + nanoseconds. */
	if (timeout_msecs >= 0) {
		to = &end_time;
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret == -EINTR) {
		struct restart_block *restart_block;

		/*
		 * Save the arguments (and the end time, if any) in the
		 * current task's restart block, with do_restart_poll as the
		 * function to call on restart.
		 */
		restart_block = &current->restart_block;
		restart_block->fn = do_restart_poll;
		restart_block->poll.ufds = ufds;
		restart_block->poll.nfds = nfds;

		if (timeout_msecs >= 0) {
			restart_block->poll.tv_sec = end_time.tv_sec;
			restart_block->poll.tv_nsec = end_time.tv_nsec;
			restart_block->poll.has_timeout = 1;
		} else
			restart_block->poll.has_timeout = 0;

		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}
poll() 程式碼很簡單:
- 處理超時時間
- 實現 poll(2)
- 處理後事:若 do_sys_poll() 被信號中斷(返回 -EINTR),則把參數和截止時間保存到 restart_block 中,並返回 -ERESTART_RESTARTBLOCK 以便重新調用。
do_sys_poll()
/*
 * Copy the user's pollfd array into kernel memory, run do_poll() on it,
 * then copy each entry's revents back to user space.
 *
 * Returns the value of do_poll() on success, -EINVAL if nfds exceeds
 * RLIMIT_NOFILE, -ENOMEM if a chunk cannot be allocated, or -EFAULT if a
 * copy to/from user space fails.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec64 *end_time)
{
	struct poll_wqueues table;
	int err = -EFAULT, fdcount, len, size;
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; /* 256 bytes, per the article */
	struct poll_list *const head = (struct poll_list *)stack_pps;
	struct poll_list *walk = head;
	unsigned long todo = nfds;

	/* Reject requests larger than the open-file limit. */
	if (nfds > rlimit(RLIMIT_NOFILE))
		return -EINVAL;

	/*
	 * Article's arithmetic: N_STACK_PPS = (256 - 16) / 8 = 30, i.e. the
	 * on-stack buffer holds 30 pollfd entries.
	 * The first chunk of the user's pollfd array is copied into it.
	 */
	len = min_t(unsigned int, nfds, N_STACK_PPS);
	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;

		/* nfds - todo is the number of entries already copied. */
		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;
		if (!todo)
			break;

		/*
		 * Article's arithmetic: POLLFD_PER_PAGE = (4096 - 16) / 8 = 510,
		 * i.e. each further chunk holds up to 510 pollfd entries.
		 */
		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
			goto out_fds;
		}
	}

	/* All pollfd entries now live in the kernel-side list starting at head. */
	poll_initwait(&table);	/* set up table */
	fdcount = do_poll(head, &table, end_time);
	poll_freewait(&table);	/* release table */

	/* Copy every entry's revents back to the user's array. */
	for (walk = head; walk; walk = walk->next) {
		struct pollfd *fds = walk->entries;
		int j;

		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
				goto out_fds;
	}

	err = fdcount;
out_fds:
	/* head itself is on the stack; only the chained chunks are freed. */
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
	}

	return err;
}
do_sys_poll() 函數也是分為三步實現
- 將用戶空間的數據複製到內核空間
- 調用核心實現 do_poll()
- 將就緒的事件數據從內核空間複製到用戶空間
do_poll()
/*
 * Core poll loop: scan every pollfd in the list with do_pollfd(), and if
 * nothing is ready, sleep and scan again until something is ready, an
 * error or signal is seen, or the timeout expires.
 *
 * Returns the number of pollfds for which do_pollfd() returned a non-zero
 * mask; when that is zero, returns wait->error, or -EINTR if a signal is
 * pending.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
		   struct timespec64 *end_time)
{
	poll_table* pt = &wait->pt;
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0;
	u64 slack = 0;
	__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_start = 0;

	/* Optimise the no-wait case */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		pt->_qproc = NULL;
		timed_out = 1;
	}

	/*
	 * Per the article: estimates how long the task will wait; the
	 * value is in nanoseconds.
	 */
	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	for (;;) {
		struct poll_list *walk;
		bool can_busy_loop = false;

		for (walk = list; walk != NULL; walk = walk->next) {
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;
			pfd_end = pfd + walk->len;
			/* Visit every pollfd; do_pollfd() handles a single fd. */
			for (; pfd != pfd_end; pfd++) {
				/*
				 * Fish for events. If we found one, record it
				 * and kill poll_table->_qproc, so we don't
				 * needlessly register any other waiters after
				 * this. They'll get immediately deregistered
				 * when we break out and return.
				 */
				if (do_pollfd(pfd, pt, &can_busy_loop,
					      busy_flag)) {
					count++;
					pt->_qproc = NULL;
					/* found something, stop busy polling */
					busy_flag = 0;
					can_busy_loop = false;
				}
			}
		}
		/*
		 * All waiters have already been registered, so don't provide
		 * a poll_table->_qproc to them on the next loop iteration.
		 */
		pt->_qproc = NULL;
		if (!count) {
			/* Nothing ready: report a queued error or a pending signal. */
			count = wait->error;
			if (signal_pending(current))
				count = -EINTR;
		}
		if (count || timed_out)
			break;

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (can_busy_loop && !need_resched()) {
			if (!busy_start) {
				busy_start = busy_loop_current_time();
				continue;
			}
			if (!busy_loop_timeout(busy_start))
				continue;
		}
		busy_flag = 0;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec64_to_ktime(*end_time);
			to = &expire;
		}

		/* Sleep; a zero return is treated as the timeout having expired. */
		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
			timed_out = 1;
	}
	return count;
}
這個函數寫的很清楚了,也有很多注釋
- can_busy_loop 是和 CONFIG_NET_RX_BUSY_POLL 配置相關的,不算通用處理情況,先忽略不考慮
- count 為函數的返回值,在 do_pollfd 有返回匹配的掩碼時遞增,為就緒的文件描述符數量,無就緒文件的時候為等待隊列中的錯誤碼
pt->_qproc 為文件 poll 操作調用的函數,將其置為 NULL 的操作在注釋中已經說明:函數已經註冊到隊列中,不必再次註冊。這個函數相關的內容可以在另外一篇 select(2) 的分析中找到具體的說明
/* * Fish for events. If we found one, record it and kill poll_table->_qproc, so we don't * needlessly register any other waiters after this. They'll get immediately deregistered * when we break out and return. */ /* * All waiters have already been registered, so don't provide a poll_table->_qproc to them on the next loop iteration. */
do_pollfd()
/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 *
 * A negative fd yields a mask of 0; an fd with no open file behind it
 * yields EPOLLNVAL.
 */
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
				     bool *can_busy_poll,
				     __poll_t busy_flag)
{
	__poll_t mask;
	int fd;

	mask = 0;
	fd = pollfd->fd;
	if (fd >= 0) {
		struct fd f = fdget(fd);
		mask = EPOLLNVAL;	/* 0x20, per the article */
		if (f.file) {
			/* userland u16 ->events contains POLL... bitmap */
			/* Requested events; EPOLLERR and EPOLLHUP are always included. */
			__poll_t filter = demangle_poll(pollfd->events) |
						EPOLLERR | EPOLLHUP;
			/*
			 * Used when the file has no ->poll handler. Per the
			 * article: (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM).
			 */
			mask = DEFAULT_POLLMASK;
			if (f.file->f_op->poll) {
				pwait->_key = filter;
				/* Per the article, the key is used by the wake-up function. */
				pwait->_key |= busy_flag;
				/* Ask the file which events are ready. */
				mask = f.file->f_op->poll(f.file, pwait);
				if (mask & busy_flag)
					*can_busy_poll = true;
			}
			/* Mask out unneeded events. */
			/* Ready events AND requested events = ready events the caller asked for. */
			mask &= filter;
			fdput(f);
		}
	}
	/* ... and so does ->revents */
	pollfd->revents = mangle_poll(mask);	/* store the result for user space */

	return mask;
}
討論在不考慮錯誤的情況下,
poll(2) 返回的是revents 非 0 的個數,在 do_pollfd() 中返回一個非 0 的 mask,poll(2) 返回的 count 就 +1。
mask = 0 有兩種可能:
- 和 filter 做與運算,但是這樣做有一個前提就是可以取到 fd
- fd < 0,這種屬於無意義的fd了,屬於用戶的問題
在已了解的fd中: eventfd 和普通的文件poll函數返回情況
- EPOLLIN 或者 EPOLLOUT 或兩個都存在
- (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)
當關注的事件不在以上事件中,是可能返回 0,而count不增加的
struct pollfd fds[n]; rn = poll(fds, n, 0); for (int i = 0; i < rn; ++i) if (fds[i].revents ...)
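像上面這種操作是有風險的:poll(2) 返回的 rn 只是 revents 非 0 的個數,而就緒的 fd 可能位於數組中的任意位置,只遍歷前 rn 個元素會訪問不到下標 rn 之後的就緒 fd,應當遍歷全部 n 個元素並檢查各自的 revents。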
mangle_poll() 設置就緒掩碼
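展開一下就緒掩碼的設置函數。__MAP 是一個宏,有點繞:它先取出 v 中 from 對應的位,再按 to 與 from 的比值做乘法或除法(對單個位的標誌來說相當於左移或右移),把該位搬到 to 所在的位置。這樣做應該是為了兼容用戶態 POLL* 與內核 EPOLL* 取值不一致的架構;在 4.17 內核中 POLLIN 和 EPOLLIN 這類宏定義大小是一樣的,此時映射結果不變。下面列出的是方向相反的 demangle_poll()(POLL* 轉為 EPOLL*)。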
#define __MAP(v, from, to) (from < to ? (v & from) * (to/from) : (v & from) / (from/to)) static inline __poll_t demangle_poll(u16 val) { return (__force __poll_t)__MAP(val, POLLIN, (__force __u16)EPOLLIN) | (__force __poll_t)__MAP(val, POLLOUT, (__force __u16)EPOLLOUT) | (__force __poll_t)__MAP(val, POLLPRI, (__force __u16)EPOLLPRI) | (__force __poll_t)__MAP(val, POLLERR, (__force __u16)EPOLLERR) | (__force __poll_t)__MAP(val, POLLNVAL, (__force __u16)EPOLLNVAL) | (__force __poll_t)__MAP(val, POLLRDNORM, (__force __u16)EPOLLRDNORM) | (__force __poll_t)__MAP(val, POLLRDBAND, (__force __u16)EPOLLRDBAND) | (__force __poll_t)__MAP(val, POLLWRNORM, (__force __u16)EPOLLWRNORM) | (__force __poll_t)__MAP(val, POLLWRBAND, (__force __u16)EPOLLWRBAND) | (__force __poll_t)__MAP(val, POLLHUP, (__force __u16)EPOLLHUP) | (__force __poll_t)__MAP(val, POLLRDHUP, (__force __u16)EPOLLRDHUP) | (__force __poll_t)__MAP(val, POLLMSG, (__force __u16)EPOLLMSG); }
參考
select 源碼分析,上一篇寫的關於 select 的分析,有一些關於 poll 結構和文件回調的分析。