poll(2) 源碼分析

2019 年 10 月 12 日
筆記

poll(2)

poll(2) 系統調用的功能和 select(2) 類似：等待一個文件集合中的文件描述符就緒進行I/O操作。

使用
實現

select(2) 的局限性：

關注的文件描述符集合大小最大只有 1024
文件描述符集合為順序的，不能任意指定 fd，浪費佔用的fd

poll(2) 對 select(2) 的改進：關注的文件描述符集合為動態大小，文件描述符可以任意指定。

struct pollfd {
        int   fd;         /* file descriptor */
        short events;     /* requested events */
        short revents;    /* returned events */
};

- fd 為關注的文件描述符
- events 為關注的事件（輸入），使用位掩碼來表示事件
- revents 為就緒的事件（輸出），同樣使用位掩碼表示

#include <poll.h>

int poll(struct pollfd *fds, nfds_t nfds, int timeout);

- fds 為文件描述符集合的地址
- nfds 為文件描述符集合的長度
- timeout 為超時的時間，單位為毫秒

返回值為 revents 不為 0 的個數，超時返回 0，出錯返回 -1

一個簡單的例子：等待標準輸入就緒，超時時間為3s。

#include <poll.h>
#include <unistd.h>
#include <stdio.h>

/*
 * Wait up to three seconds for standard input to become readable and
 * print which of the three poll(2) outcomes occurred: error, data ready,
 * or timeout.
 */
int main(void)
{
        int timeout = 3000;  /* milliseconds */

        /* Zero-initialising leaves fd == 0, i.e. standard input. */
        struct pollfd fds = {0};
        fds.events |= POLLIN;

        int ret = poll(&fds, 1, timeout);
        if (ret == -1)
                printf("error poll\n");
        else if (ret)
                printf("data is available now.\n");
        else
                printf("no data within 3000 ms.\n");

        return 0;
}

實現

程式碼位於 fs/select.c 中，參考中的鏈接有一些關於文件回調和 poll 結構的說明

poll()

/*
 * poll(2) system call entry point.
 *
 * ufds          - user-space array of struct pollfd
 * nfds          - number of entries in ufds
 * timeout_msecs - timeout in milliseconds; when negative, no end time is
 *                 computed and do_sys_poll() receives a NULL timeout
 *
 * Returns whatever do_sys_poll() returns, except that -EINTR is turned
 * into -ERESTART_RESTARTBLOCK after the restart block has been filled in.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                int, timeout_msecs)
{
        struct timespec64 end_time, *to = NULL;
        int ret;

        if (timeout_msecs >= 0) {
                to = &end_time;
                /* Split the millisecond value into whole seconds + nanoseconds. */
                poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
                        NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
        }

        ret = do_sys_poll(ufds, nfds, to);

        if (ret == -EINTR) {
                struct restart_block *restart_block;

                /*
                 * Record the handler and the original arguments in the
                 * current task's restart block so the call can be re-issued.
                 */
                restart_block = &current->restart_block;
                restart_block->fn = do_restart_poll;
                restart_block->poll.ufds = ufds;
                restart_block->poll.nfds = nfds;

                if (timeout_msecs >= 0) {
                        /* Save the end time held in end_time, not the original millisecond count. */
                        restart_block->poll.tv_sec = end_time.tv_sec;
                        restart_block->poll.tv_nsec = end_time.tv_nsec;
                        restart_block->poll.has_timeout = 1;
                } else
                        restart_block->poll.has_timeout = 0;

                ret = -ERESTART_RESTARTBLOCK;
        }
        return ret;
}

poll() 程式碼很簡單:

處理超時時間
實現 poll(2)
處理後事：若等待被信號中斷（do_sys_poll() 返回 -EINTR），則填寫 restart_block 並返回 -ERESTART_RESTARTBLOCK，以便之後重新調用。

do_sys_poll()

  /*
   * Copy the user's pollfd array into kernel memory, run do_poll() on it,
   * and copy every revents field back to user space.
   *
   * The entries are held in a chain of struct poll_list nodes: the first
   * node lives in an on-stack buffer, further nodes are kmalloc()ed.
   *
   * Returns do_poll()'s result on success, -EINVAL when nfds exceeds
   * RLIMIT_NOFILE, -EFAULT when a user copy fails, or -ENOMEM when a node
   * cannot be allocated.
   */
  static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                  struct timespec64 *end_time)
  {
          struct poll_wqueues table;
          int err = -EFAULT, fdcount, len, size;
          /* Allocate small arguments on the stack to save memory and be
             faster - use long to make sure the buffer is aligned properly
             on 64 bit archs to avoid unaligned access */
          long stack_pps[POLL_STACK_ALLOC/sizeof(long)];  /* 256 bytes (author's note) */
          struct poll_list *const head = (struct poll_list *)stack_pps;
          struct poll_list *walk = head;
          unsigned long todo = nfds;

          if (nfds > rlimit(RLIMIT_NOFILE))  /* limit on the number of open files */
                  return -EINVAL;

          /*
           * Author's note: N_STACK_PPS = (256 - 16) / 8 = 30, so the stack
           * buffer holds 30 pollfd entries.  The first part of the user's
           * struct pollfd array is copied into that on-stack array.
           */
          len = min_t(unsigned int, nfds, N_STACK_PPS);
          for (;;) {
                  walk->next = NULL;
                  walk->len = len;
                  if (!len)
                          break;

                  /* nfds - todo is the index of the first entry not yet copied. */
                  if (copy_from_user(walk->entries, ufds + nfds-todo,
                                          sizeof(struct pollfd) * walk->len))
                          goto out_fds;

                  todo -= walk->len;
                  if (!todo)
                          break;

                  /*
                   * Author's note: POLLFD_PER_PAGE = (4096 - 16) / 8 = 510;
                   * allocate a page-sized node, each holding up to 510
                   * pollfd entries.
                   */
                  len = min(todo, POLLFD_PER_PAGE);
                  size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
                  walk = walk->next = kmalloc(size, GFP_KERNEL);
                  if (!walk) {
                          err = -ENOMEM;
                          goto out_fds;
                  }
          }
          /* All pollfd entries now sit in the kernel-side list starting at head. */

          poll_initwait(&table);  /* initialise table (covered in the author's select analysis) */
          fdcount = do_poll(head, &table, end_time);
          poll_freewait(&table);  /* release table */

          /* Copy each revents back to user space. */
          for (walk = head; walk; walk = walk->next) {
                  struct pollfd *fds = walk->entries;
                  int j;

                  for (j = 0; j < walk->len; j++, ufds++)
                          if (__put_user(fds[j].revents, &ufds->revents))
                                  goto out_fds;
          }

          err = fdcount;
  out_fds:
          /* Start at head->next: head itself is the stack buffer and is not kfree()d. */
          walk = head->next;
          while (walk) {
                  struct poll_list *pos = walk;
                  walk = walk->next;
                  kfree(pos);
          }

          return err;
  }

do_sys_poll() 函數也是分為三步實現

將用戶空間的數據複製到內核空間
調用核心實現 do_poll()
將就緒的事件數據從內核空間複製到用戶空間

do_poll()

/*
 * Core poll loop: call do_pollfd() on every entry of the poll_list chain
 * until at least one reports events, an error or signal is pending, or the
 * timeout expires.
 *
 * list     - chain of poll_list nodes holding the pollfd entries
 * wait     - poll_wqueues whose embedded poll_table is passed to do_pollfd()
 * end_time - end time, or NULL when no timeout was given; an all-zero value
 *            takes the no-wait path
 *
 * Returns the number of entries for which do_pollfd() returned a non-zero
 * mask; when there are none, returns wait->error, -EINTR if a signal is
 * pending, or 0 on timeout.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
                   struct timespec64 *end_time)
{
        poll_table* pt = &wait->pt;
        ktime_t expire, *to = NULL;
        int timed_out = 0, count = 0;
        u64 slack = 0;
        __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
        unsigned long busy_start = 0;

        /* Optimise the no-wait case */
        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                pt->_qproc = NULL;
                timed_out = 1;
        }

        /* Author's note: estimates how long the process will wait; the helper returns nanoseconds. */
        if (end_time && !timed_out)
                slack = select_estimate_accuracy(end_time);

        for (;;) {
                struct poll_list *walk;
                bool can_busy_loop = false;

                for (walk = list; walk != NULL; walk = walk->next) {
                        struct pollfd * pfd, * pfd_end;

                        pfd = walk->entries;
                        pfd_end = pfd + walk->len;
                        /* Visit every struct pollfd; do_pollfd() handles a single fd. */
                        for (; pfd != pfd_end; pfd++) {
                                /*
                                 * Fish for events. If we found one, record it
                                 * and kill poll_table->_qproc, so we don't
                                 * needlessly register any other waiters after
                                 * this. They'll get immediately deregistered
                                 * when we break out and return.
                                 */
                                if (do_pollfd(pfd, pt, &can_busy_loop,
                                              busy_flag)) {
                                        count++;
                                        pt->_qproc = NULL;
                                        /* found something, stop busy polling */
                                        busy_flag = 0;
                                        can_busy_loop = false;
                                }
                        }
                }
                /*
                 * All waiters have already been registered, so don't provide
                 * a poll_table->_qproc to them on the next loop iteration.
                 */
                pt->_qproc = NULL;
                if (!count) {
                        /* Nothing ready: report the wait-queue error, or -EINTR if a signal is pending. */
                        count = wait->error;
                        if (signal_pending(current))
                                count = -EINTR;
                }
                if (count || timed_out)
                        break;

                /* only if found POLL_BUSY_LOOP sockets && not out of time */
                if (can_busy_loop && !need_resched()) {
                        if (!busy_start) {
                                busy_start = busy_loop_current_time();
                                continue;
                        }
                        if (!busy_loop_timeout(busy_start))
                                continue;
                }
                busy_flag = 0;

                /*
                 * If this is the first loop and we have a timeout
                 * given, then we convert to ktime_t and set the to
                 * pointer to the expiry value.
                 */
                if (end_time && !to) {
                        expire = timespec64_to_ktime(*end_time);
                        to = &expire;
                }

                /*
                 * Author's note: schedule until the timeout.  A zero return
                 * is recorded as a timeout; the loop then makes one more
                 * pass over the fds before breaking out.
                 */
                if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
                        timed_out = 1;
        }
        return count;
}

這個函數寫得很清楚了，也有很多注釋

can_busy_loop 是和 CONFIG_NET_RX_BUSY_POLL 配置相關的，不算通用處理情況，先忽略不考慮
count 為函數的返回值，在 do_pollfd 有返回匹配的掩碼時遞增，為就緒的文件描述符數量，無就緒文件的時候為等待隊列中的錯誤碼
pt->_qproc 為文件poll操作調用的函數，= NULL 的操作在注釋中已經說明，函數已經註冊到隊列中，不必再次註冊. 這個函數相關的內容可以在另外一篇 select(2) 找到具體的說明

/*   * Fish for events. If we found one, record it and kill poll_table->_qproc, so we don't   * needlessly register any other waiters after this. They'll get immediately deregistered   * when we break out and return.   */    /*   * All waiters have already been registered, so don't provide a poll_table->_qproc to them on the next loop iteration.   */

do_pollfd()

/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 *
 * Result by case, as visible in the body:
 *   - fd < 0:                       mask stays 0
 *   - fdget() yields no file:       mask is EPOLLNVAL
 *   - file without a ->poll method: DEFAULT_POLLMASK & filter
 *   - otherwise:                    ->poll() result & filter
 * *can_busy_poll is set to true when the ->poll() result contains busy_flag.
 */
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
                                     bool *can_busy_poll,
                                     __poll_t busy_flag)
{
        __poll_t mask;
        int fd;

        mask = 0;
        fd = pollfd->fd;
        if (fd >= 0) {
                struct fd f = fdget(fd);
                mask = EPOLLNVAL;  /* 0x20 (author's note) */
                if (f.file) {
                        /* userland u16 ->events contains POLL... bitmap */
                        /* Build the set of events of interest; EPOLLERR and EPOLLHUP are always included. */
                        __poll_t filter = demangle_poll(pollfd->events) |
                                                EPOLLERR | EPOLLHUP;
                        /* Author's note: (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM) */
                        mask = DEFAULT_POLLMASK;
                        if (f.file->f_op->poll) {
                                pwait->_key = filter;
                                /* Author's note: the key is used later by the wake-up function. */
                                pwait->_key |= busy_flag;
                                /* Ask the file for its currently ready event mask. */
                                mask = f.file->f_op->poll(f.file, pwait);
                                if (mask & busy_flag)
                                        *can_busy_poll = true;
                        }
                        /* Mask out unneeded events. */
                        /* AND-ing with filter leaves only the ready events the caller asked for. */
                        mask &= filter;
                        fdput(f);
                }
        }
        /* ... and so does ->revents */
        pollfd->revents = mangle_poll(mask);  /* store the ready mask for the caller */

        return mask;
}

討論在不考慮錯誤的情況下，
poll(2) 返回的是revents 非 0 的個數，在 do_pollfd() 中返回一個非 0 的 mask，poll(2) 返回的 count 就 +1。
mask = 0 有兩種可能：

文件的 poll 函數返回的掩碼和 filter 做與運算後結果為 0，前提是 fd 有效、可以取到對應的文件
fd < 0，這種屬於無意義的fd了，屬於用戶的問題

在已了解的fd中： eventfd 和普通的文件poll函數返回情況

EPOLLIN 或者 EPOLLOUT 或兩個都存在
(EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)

當關注的事件不在以上事件中，是可能返回 0，而count不增加的

struct pollfd fds[n];  rn = poll(fds, n, 0);  for (int i = 0; i < rn; ++i)          if (fds[i].revents ...)

像上面這種操作是有風險的：poll(2) 的返回值 rn 只是 revents 非 0 的元素個數，就緒的 fd 不一定位於數組的前 rn 項，這樣遍歷會漏掉下標在 rn 之後的就緒 fd。正確的做法是遍歷全部 n 個元素並逐一檢查 revents。

mangle_poll() 設置就緒掩碼

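展開一下就緒掩碼的設置函數（下面列出的是方向相反的 demangle_poll()，mangle_poll() 與之對稱，只是把每一項的 from 和 to 對調）。__MAP 宏有點繞：from 和 to 都是只有一個比特置位的掩碼，(v & from) 先取出該比特，再乘以或除以 to 與 from 的比值，相當於把這個比特從 from 的位置移到 to 的位置。之所以需要這樣轉換，是因為用戶空間的 POLL* 常量在部分體系結構（如 sparc、mips）上的取值與內核內部統一使用的 EPOLL* 不同。在 4.17 內核的 x86 等多數體系結構上，POLLIN 和 EPOLLIN 這類宏定義的值是一樣的，此時轉換不改變數值。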

#define __MAP(v, from, to)           (from < to ? (v & from) * (to/from) : (v & from) / (from/to))    static inline __poll_t demangle_poll(u16 val) {      return (__force __poll_t)__MAP(val, POLLIN, (__force __u16)EPOLLIN) |             (__force __poll_t)__MAP(val, POLLOUT, (__force __u16)EPOLLOUT) |             (__force __poll_t)__MAP(val, POLLPRI, (__force __u16)EPOLLPRI) |             (__force __poll_t)__MAP(val, POLLERR, (__force __u16)EPOLLERR) |             (__force __poll_t)__MAP(val, POLLNVAL, (__force __u16)EPOLLNVAL) |             (__force __poll_t)__MAP(val, POLLRDNORM,                                     (__force __u16)EPOLLRDNORM) |             (__force __poll_t)__MAP(val, POLLRDBAND,                                     (__force __u16)EPOLLRDBAND) |             (__force __poll_t)__MAP(val, POLLWRNORM,                                     (__force __u16)EPOLLWRNORM) |             (__force __poll_t)__MAP(val, POLLWRBAND,                                     (__force __u16)EPOLLWRBAND) |             (__force __poll_t)__MAP(val, POLLHUP, (__force __u16)EPOLLHUP) |             (__force __poll_t)__MAP(val, POLLRDHUP, (__force __u16)EPOLLRDHUP) |             (__force __poll_t)__MAP(val, POLLMSG, (__force __u16)EPOLLMSG);  }

參考

select 源碼分析，上一篇寫的關於 select 的分析，有一些關於 poll 結構和文件回調的分析。