Linux Process Management (Extra): A Brief Introduction to Kernel Threads [Repost]

  • October 10, 2019
  • Notes

Reposted from: https://www.cnblogs.com/arnoldlu/p/8336998.html

Keywords: kthread, irq, ksoftirqd, kworker, workqueues

When you look at threads with ps, quite a few have names wrapped in [...]. These differ from the rest: they are kernel threads.

For most kernel threads, the name alone tells you their main job.

For example, the irq kernel threads serve threaded interrupt handling, ksoftirqd serves softirqs, and the kworker threads serve work items.

This article first surveys which kernel threads exist on a Linux system, then analyzes the APIs used to create kernel threads.

Next it looks at how kernel threads differ from ordinary threads.

Finally it walks through how the main kernel threads (irq/ksoftirqd/kworker) are created and what they do.

1. A first look at Linux kernel threads via ps

Running ps -a gives output like the following; kernel threads are the ones marked with [...].

The init process with pid 1 is the parent of all user-space processes; the kthreadd kernel thread with pid 2 is the parent of all kernel threads.

Kernel threads fall into a few broad categories: softirq, kworker, irq, and others.

PID   USER     TIME   COMMAND
    1 0          0:01 {linuxrc} init
    2 0          0:00 [kthreadd]
    3 0          0:00 [ksoftirqd/0]
    4 0          0:00 [kworker/0:0]
    5 0          0:00 [kworker/0:0H]
    6 0          0:00 [kworker/u8:0]
    7 0          0:00 [rcu_sched]
    8 0          0:00 [rcu_bh]
    9 0          0:00 [migration/0]
   10 0          0:00 [migration/1]
   11 0          0:00 [ksoftirqd/1]
   12 0          0:00 [kworker/1:0]
   13 0          0:00 [kworker/1:0H]
   14 0          0:00 [migration/2]
   15 0          0:00 [ksoftirqd/2]
   16 0          0:00 [kworker/2:0]
   17 0          0:00 [kworker/2:0H]
   18 0          0:00 [migration/3]
   19 0          0:00 [ksoftirqd/3]
   20 0          0:00 [kworker/3:0]
   21 0          0:00 [kworker/3:0H]
   22 0          0:00 [khelper]
   23 0          0:00 [kdevtmpfs]
   24 0          0:00 [perf]
   25 0          0:00 [kworker/u8:1]
  279 0          0:00 [khungtaskd]
  280 0          0:00 [writeback]
  281 0          0:00 [kintegrityd]
  282 0          0:00 [kworker/0:1]
  284 0          0:00 [bioset]
  286 0          0:00 [kblockd]
  294 0          0:00 [ata_sff]
  408 0          0:00 [rpciod]
  409 0          0:00 [kworker/2:1]
  410 0          0:00 [kworker/1:1]
  412 0          0:00 [kswapd0]
  416 0          0:00 [fsnotify_mark]
  429 0          0:00 [nfsiod]
  449 0          0:00 [kworker/3:1]
  527 0          0:00 [kpsmoused]
  537 0          0:00 [kworker/1:2]
  613 0          0:00 [deferwq]

2. kthreadd and the kernel-thread creation APIs

2.1 kthreadd: creating the kthreadd kernel thread

All other kernel threads are created through kthreadd; the kthreadd thread is their parent.

start_kernel-->rest_init is as follows:

static noinline void __init_refok rest_init(void)
{
    int pid;

    rcu_scheduler_starting();
    /*
     * We need to spawn init first so that it obtains pid 1, however
     * the init task will end up wanting to create kthreads, which, if
     * we schedule it before we create kthreadd, will OOPS.
     */
    kernel_thread(kernel_init, NULL, CLONE_FS);--------------------------------Create the first user-space process, init.
    numa_default_policy();
    pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);---------------Create the first kernel thread, kthreadd.
    rcu_read_lock();
    kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);--------------------kthreadd_task points to kthreadd's task_struct.
    rcu_read_unlock();
    complete(&kthreadd_done);--------------------------------------------------The init process waits on kthreadd_done in kernel_init-->kernel_init_freeable.

    /*
     * The boot idle thread must execute schedule()
     * at least once to get things moving:
     */
    init_idle_bootup_task(current);
    schedule_preempt_disabled();
    /* Call into cpu_idle with preempt disabled */
    cpu_startup_entry(CPUHP_ONLINE);
}

kernel_init is started before kthreadd, yet much of what kernel_init does depends on kthreadd. Therefore kernel_init begins by waiting for the kthreadd_done completion signaled in rest_init.

This is needed because many of the initializations run in kernel_init-->kernel_init_freeable-->do_basic_setup-->do_initcalls rely on kthread_create() support.

kernel_init-->kernel_init_freeable:

static noinline void __init kernel_init_freeable(void)
{
    /*
     * Wait until kthreadd is all set-up.
     */
    wait_for_completion(&kthreadd_done);-------------------Wait for the kthreadd_done completion.
...
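The synchronization used here is the standard wait_for_completion()/complete() pairing. A generic sketch of the pattern, with hypothetical names, to make the handshake between rest_init() and kernel_init_freeable() concrete:

#include <linux/completion.h>

static DECLARE_COMPLETION(setup_done);

/* The dependent side blocks, just as kernel_init_freeable()
 * blocks on kthreadd_done above. */
static void demo_waiter(void)
{
    wait_for_completion(&setup_done);
    /* ... safe to proceed: setup has finished ... */
}

/* The provider side signals when ready, like rest_init() calling
 * complete(&kthreadd_done) once kthreadd exists. */
static void demo_provider(void)
{
    /* ... perform setup ... */
    complete(&setup_done);   /* wake up any waiter */
}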

Inside the kernel, the thread referenced by kthreadd_task is responsible for creating every other kernel thread; its thread function is kthreadd().

int kthreadd(void *unused)
{
    struct task_struct *tsk = current;

    /* Setup a clean context for our children to inherit. */
    set_task_comm(tsk, "kthreadd");
    ignore_signals(tsk);
    set_cpus_allowed_ptr(tsk, cpu_all_mask);
    set_mems_allowed(node_states[N_MEMORY]);

    current->flags |= PF_NOFREEZE;

    for (;;) {
        set_current_state(TASK_INTERRUPTIBLE);
        if (list_empty(&kthread_create_list))
            schedule();----------------------------------------------If kthread_create_list is empty, yield the CPU and sleep. kthread_create_on_node() adds the node for the thread to be created to kthread_create_list and then wakes this thread.
        __set_current_state(TASK_RUNNING);

        spin_lock(&kthread_create_lock);
        while (!list_empty(&kthread_create_list)) {------------------As long as kthread_create_list is not empty, walk the list.
            struct kthread_create_info *create;

            create = list_entry(kthread_create_list.next,
                        struct kthread_create_info, list);
            list_del_init(&create->list);----------------------------Remove the current create node from kthread_create_list.
            spin_unlock(&kthread_create_lock);

            create_kthread(create);----------------------------------Create the thread.

            spin_lock(&kthread_create_lock);
        }
        spin_unlock(&kthread_create_lock);
    }

    return 0;
}

static void create_kthread(struct kthread_create_info *create)
{
    int pid;

#ifdef CONFIG_NUMA
    current->pref_node_fork = create->node;
#endif
    /* We want our own signal handler (we take no signals by default). */
    pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);----Call do_fork() to create the thread.
    if (pid < 0) {
        /* If user was SIGKILLed, I release the structure. */
        struct completion *done = xchg(&create->done, NULL);

        if (!done) {
            kfree(create);
            return;
        }
        create->result = ERR_PTR(pid);
        complete(done);--------------------------------------------------------Signal the completion.
    }
}

pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
    return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
        (unsigned long)arg, NULL, NULL);
}

2.2 Kernel-thread creation interfaces: kthread_create and friends

kthread_create() is the most common interface for creating a kernel thread.

kthread_create_on_cpu() adds a cpu parameter compared with kthread_create(), but both are built on kthread_create_on_node().

kthread_run() is built on kthread_create(), so all of these functions ultimately rest on kthread_create_on_node().

#define kthread_create(threadfn, data, namefmt, arg...) \
    kthread_create_on_node(threadfn, data, -1, namefmt, ##arg)

struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
                      void *data,
                      unsigned int cpu,
                      const char *namefmt);

/**
 * kthread_run - create and wake a thread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @namefmt: printf-style name for the thread.
 *
 * Description: Convenient wrapper for kthread_create() followed by
 * wake_up_process().  Returns the kthread or ERR_PTR(-ENOMEM).
 */
#define kthread_run(threadfn, data, namefmt, ...)                  \
({                                                                 \
    struct task_struct *__k                                        \
        = kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \
    if (!IS_ERR(__k))                          --------------------------If kthread_create() created the thread successfully, wake it with wake_up_process().
        wake_up_process(__k);                                      \
    __k;                                                           \
})

kthread_create_on_node() fills in a kthread_create_info structure and inserts it as a node at the tail of kthread_create_list.

It then wakes kthreadd_task to process the list and actually create the thread.

struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
                       void *data, int node,
                       const char namefmt[],
                       ...)
{
    DECLARE_COMPLETION_ONSTACK(done);
    struct task_struct *task;
    struct kthread_create_info *create = kmalloc(sizeof(*create),
                             GFP_KERNEL);---------------------------------Allocate the node to be inserted into kthread_create_list.

    if (!create)
        return ERR_PTR(-ENOMEM);
    create->threadfn = threadfn;
    create->data = data;
    create->node = node;
    create->done = &done;

    spin_lock(&kthread_create_lock);
    list_add_tail(&create->list, &kthread_create_list);-------------------Insert the filled-in node into kthread_create_list.
    spin_unlock(&kthread_create_lock);

    wake_up_process(kthreadd_task);---------------------------------------Wake kthreadd_task to process kthread_create_list and create the thread.
    /*
     * Wait for completion in killable state, for I might be chosen by
     * the OOM killer while kthreadd is trying to allocate memory for
     * new kernel thread.
     */
    if (unlikely(wait_for_completion_killable(&done))) {------------------Wait for the completion, which is signaled in create_kthread().
        /*
         * If I was SIGKILLed before kthreadd (or new kernel thread)
         * calls complete(), leave the cleanup of this structure to
         * that thread.
         */
        if (xchg(&create->done, NULL))
            return ERR_PTR(-EINTR);
        /*
         * kthreadd (or new kernel thread) will call complete()
         * shortly.
         */
        wait_for_completion(&done);---------------------------------------Wait for the completion.
    }
    task = create->result;------------------------------------------------The result is a task_struct.
    if (!IS_ERR(task)) {
        static const struct sched_param param = { .sched_priority = 0 };
        va_list args;

        va_start(args, namefmt);
        vsnprintf(task->comm, sizeof(task->comm), namefmt, args);---------Set the thread name.
        va_end(args);
        /*
         * root may have changed our (kthreadd's) priority or CPU mask.
         * The kernel thread should not inherit these properties.
         */
        sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);-----------Set scheduling policy to SCHED_NORMAL, priority 0.
        set_cpus_allowed_ptr(task, cpu_all_mask);
    }
    kfree(create);--------------------------------------------------------Free the kthread_create_info.
    return task;
}
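To see these interfaces from the caller's side, here is a minimal module-style sketch; the thread name and loop body are hypothetical, but the create/wake/stop flow is the standard one:

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *demo_task;

static int demo_thread_fn(void *data)
{
    /* A well-behaved kthread polls kthread_should_stop(). */
    while (!kthread_should_stop()) {
        /* ... periodic work ... */
        msleep(1000);
    }
    return 0;
}

static int __init demo_init(void)
{
    /* kthread_run() = kthread_create() + wake_up_process();
     * the thread shows up in ps as [demo_thread]. */
    demo_task = kthread_run(demo_thread_fn, NULL, "demo_thread");
    if (IS_ERR(demo_task))
        return PTR_ERR(demo_task);
    return 0;
}

static void __exit demo_exit(void)
{
    kthread_stop(demo_task);   /* blocks until demo_thread_fn() returns */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");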

3. Differences between kernel threads and ordinary threads

A kernel thread has no address space of its own, so its task_struct->mm pointer is NULL; it carries no user context.

A kernel thread runs only in kernel space and never switches to user space, but it is schedulable and preemptible all the same.

An ordinary thread can run in kernel space as well as in user space.

On a 32-bit kernel with the typical 3G/1G split, a kernel thread can only access addresses above 3GB, whereas an ordinary thread can reach the whole 4GB address space.
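The mm == NULL property is exactly what kernel code uses to tell the two apart. A simplified sketch of the idiom (the helper name is hypothetical; newer kernels can also test the PF_KTHREAD flag):

#include <linux/sched.h>

/* Sketch: distinguishing a kernel thread from an ordinary task. */
static bool is_kernel_thread(struct task_struct *tsk)
{
    /* Kernel threads own no user address space, so mm is NULL;
     * on a context switch they borrow the previous task's page
     * tables through tsk->active_mm instead. */
    return tsk->mm == NULL;
}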

4. The irq, softirq, and worker kernel threads

irq, softirq, and worker handling may each create their own kernel threads, and where there are threads there are priorities.

The table below compares their priorities as a rough measure of importance.

Note that the interrupt kernel threads have a high priority (49) and use a real-time scheduling policy, while the softirq and worker threads are ordinary kernel threads.

thread          prio   policy
irq             49     SCHED_FIFO
softirq         120    SCHED_NORMAL
worker          120    SCHED_NORMAL
init            120    SCHED_NORMAL
kthreadd        120    SCHED_NORMAL
cfinteractive   0      SCHED_FIFO

Among the other special threads listed, init and kthreadd both have priority 120.

cfinteractive has the highest priority of all; it handles CPU frequency load updates.

4.1 irq/xx-xx: threads for threaded interrupt handling

In request_threaded_irq-->__setup_irq, if thread_fn is set and the interrupt does not allow nesting, a thread named like "irq/<irq number>-<interrupt name>" is created.

The thread function is irq_thread.

/*
 * Internal function to register an irqaction - typically used to
 * allocate special interrupts that are part of the architecture.
 */
static int
__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
{
...
    if (new->thread_fn && !nested) {
        struct task_struct *t;
        static const struct sched_param param = {
            .sched_priority = MAX_USER_RT_PRIO/2,
        };

        t = kthread_create(irq_thread, new, "irq/%d-%s", irq,----------------irq_thread calls irq_thread_fn, which in turn calls action->thread_fn, the thread_fn argument of request_threaded_irq.
                   new->name);
...
    }
...
}

request_irq() is a wrapper around request_threaded_irq(); the work of creating the interrupt thread is handed to __setup_irq().

static inline int __must_check
request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
        const char *name, void *dev)
{
    return request_threaded_irq(irq, handler, NULL, flags, name, dev);
}
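For comparison, a driver that wants the threaded path calls request_threaded_irq() directly. A minimal sketch with hypothetical device name and handler bodies; registering it would create a kernel thread named "irq/<nr>-demo":

#include <linux/interrupt.h>

/* Hard-IRQ handler: runs in interrupt context, does the bare minimum
 * (e.g. ack the device) and then wakes the irq thread. */
static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
    return IRQ_WAKE_THREAD;
}

/* Threaded handler: runs in the "irq/<nr>-demo" kthread created by
 * __setup_irq() above, and is allowed to sleep. */
static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
    /* ... heavier processing ... */
    return IRQ_HANDLED;
}

static int demo_request(unsigned int irq, void *dev)
{
    return request_threaded_irq(irq, demo_hardirq, demo_thread_fn,
                                IRQF_ONESHOT, "demo", dev);
}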

For more details, see the discussion of request_irq() in "Linux Interrupt Management (1): The Linux Interrupt Management Mechanism".

4.2 ksoftirqd/xx: threads for softirq handling

The softirq threads are created by registering softirq_threads through smpboot_register_percpu_thread().

static struct smp_hotplug_thread softirq_threads = {
    .store            = &ksoftirqd,
    .thread_should_run    = ksoftirqd_should_run,
    .thread_fn        = run_ksoftirqd,
    .thread_comm        = "ksoftirqd/%u",
};

static __init int spawn_ksoftirqd(void)
{
    register_cpu_notifier(&cpu_nfb);

    BUG_ON(smpboot_register_percpu_thread(&softirq_threads));

    return 0;
}

smpboot_register_percpu_thread-->__smpboot_create_thread in the end also calls kthread_create_on_cpu(), creating kernel threads named "ksoftirqd/xx", where xx is the CPU id.

As the ps -a excerpt below shows, one ksoftirqd kernel thread is created per CPU.

    3 0          0:03 [ksoftirqd/0]
   11 0          0:03 [ksoftirqd/1]
   15 0          0:00 [ksoftirqd/2]
   19 0          0:00 [ksoftirqd/3]
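Drivers never create ksoftirqd themselves; they only raise softirqs, most commonly through tasklets, and ksoftirqd/N absorbs the backlog when interrupt context cannot keep up. A minimal sketch, assuming the classic (pre-5.9) tasklet API that matches the kernel sources quoted here; the names are hypothetical:

#include <linux/interrupt.h>

/* Runs in softirq context (TASKLET_SOFTIRQ); must not sleep. */
static void demo_tasklet_fn(unsigned long data)
{
    /* ... lightweight deferred processing ... */
}

static DECLARE_TASKLET(demo_tasklet, demo_tasklet_fn, 0);

/* Typically called from a hard-IRQ handler. */
static void demo_kick(void)
{
    tasklet_schedule(&demo_tasklet);   /* raises TASKLET_SOFTIRQ; under
                                        * heavy softirq pressure the
                                        * ksoftirqd/N thread runs it */
}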

For more details, see "Linux Interrupt Management (2): Softirqs and Tasklets".

4.3 kworker: worker threads for work items

The kworker threads are the worker threads that process work items; for details see "Linux Interrupt Management (3): workqueue Work Queues".

Each CPU sets up its own workqueue machinery, which centralizes the work processed by the kernel's kworkers.

A workqueue defers tasks (work items) to one kernel thread or a group of them; such a kernel thread is called a worker thread.
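From a driver's point of view, deferring work to these kworkers takes only a few lines. A minimal sketch (the work function is hypothetical) using the default system workqueue:

#include <linux/workqueue.h>

/* Executed later, in process context, by some kworker/x:y thread. */
static void demo_work_fn(struct work_struct *work)
{
    /* ... deferred processing, may sleep ... */
}

static DECLARE_WORK(demo_work, demo_work_fn);

static void demo_defer(void)
{
    schedule_work(&demo_work);   /* queue on system_wq */
}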

First, the result: init_workqueues creates two kworkers bound to CPU0, one with nice=0 and one with nice=-20.

apply_workqueue_attrs creates the unbound worker, i.e. kworker/u8:0.

Each CPU_UP_PREPARE callback then creates two kworkers with different nice values, so four CPUs account for nine kernel threads in total.

PID   USER     TIME   COMMAND
    1 0          0:01 {linuxrc} init
    2 0          0:00 [kthreadd]
    3 0          0:00 [ksoftirqd/0]
    4 0          0:00 [kworker/0:0]
    5 0          0:00 [kworker/0:0H]---------------init_workqueues-->create_worker
    6 0          0:00 [kworker/u8:0]---------------apply_workqueue_attrs-->alloc_unbound_pwq-->create_worker
    7 0          0:00 [rcu_sched]
    8 0          0:00 [rcu_bh]
    9 0          0:00 [migration/0]
   10 0          0:00 [migration/1]
   11 0          0:00 [ksoftirqd/1]
   12 0          0:00 [kworker/1:0]---------------workqueue_cpu_up_callback-->create_worker
   13 0          0:00 [kworker/1:0H]
   14 0          0:00 [migration/2]
   15 0          0:00 [ksoftirqd/2]
   16 0          0:00 [kworker/2:0]
   17 0          0:00 [kworker/2:0H]--------------workqueue_cpu_up_callback-->create_worker
   18 0          0:00 [migration/3]
   19 0          0:00 [ksoftirqd/3]
   20 0          0:00 [kworker/3:0]
   21 0          0:00 [kworker/3:0H]--------------workqueue_cpu_up_callback-->create_worker
   22 0          0:00 [khelper]
   23 0          0:00 [kdevtmpfs]
   24 0          0:00 [perf]
   25 0          0:00 [kworker/u8:1]--------------worker_thread-->create_worker
  279 0          0:00 [khungtaskd]
  280 0          0:00 [writeback]
  281 0          0:00 [kintegrityd]
  282 0          0:00 [kworker/0:1]---------------worker_thread-->create_worker
  284 0          0:00 [bioset]
  286 0          0:00 [kblockd]
  294 0          0:00 [ata_sff]
  408 0          0:00 [rpciod]
  409 0          0:00 [kworker/2:1]---------------worker_thread-->create_worker
  410 0          0:00 [kworker/1:1]---------------worker_thread-->create_worker
  412 0          0:00 [kswapd0]
  416 0          0:00 [fsnotify_mark]
  429 0          0:00 [nfsiod]
  449 0          0:00 [kworker/3:1]---------------worker_thread-->create_worker
  527 0          0:00 [kpsmoused]
  537 0          0:00 [kworker/1:2]---------------worker_thread-->create_worker
  613 0          0:00 [deferwq]

init_workqueues-->create_worker-->kthread_create_on_node creates the "kworker/x:y" (and "kworker/x:yH") kernel threads.

static int __init init_workqueues(void)
{
    int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
    int i, cpu;
...
    /* create the initial worker */
    for_each_online_cpu(cpu) {---------------------------------Iterate over CPUs [0~3].
        struct worker_pool *pool;

        for_each_cpu_worker_pool(pool, cpu) {------------------NR_STD_WORKER_POOLS=2, so each CPU has two pools.
            pool->flags &= ~POOL_DISASSOCIATED;
            BUG_ON(!create_worker(pool));
        }
    }
...
    system_wq = alloc_workqueue("events", 0, 0);
    system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
    system_long_wq = alloc_workqueue("events_long", 0, 0);
    system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
                        WQ_UNBOUND_MAX_ACTIVE);
    system_freezable_wq = alloc_workqueue("events_freezable",
                          WQ_FREEZABLE, 0);
    system_power_efficient_wq = alloc_workqueue("events_power_efficient",
                          WQ_POWER_EFFICIENT, 0);
    system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
                          WQ_FREEZABLE | WQ_POWER_EFFICIENT,
                          0);
    BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
           !system_unbound_wq || !system_freezable_wq ||
           !system_power_efficient_wq ||
           !system_freezable_power_efficient_wq);
    return 0;
}

The create_worker() function creates a worker thread.

static struct worker *create_worker(struct worker_pool *pool)
{
...
    if (pool->cpu >= 0)
        snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,-------------cpuid and id distinguish the CPU and the kworker within that CPU.
             pool->attrs->nice < 0  ? "H" : "");
    else
        snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);--------------"u" means the worker is not bound to a CPU.

    worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
                          "kworker/%s", id_buf);
...
}
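Besides the system workqueues created in init_workqueues(), a driver can allocate its own queue. A minimal sketch (names hypothetical); work queued with WQ_UNBOUND is served by the kworker/u*:* threads seen above:

#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static struct work_struct demo_unbound_work;

static void demo_unbound_fn(struct work_struct *work)
{
    /* ... runs in a kworker/u*:* thread ... */
}

static int demo_wq_setup(void)
{
    demo_wq = alloc_workqueue("demo_wq", WQ_UNBOUND, 0);
    if (!demo_wq)
        return -ENOMEM;
    INIT_WORK(&demo_unbound_work, demo_unbound_fn);
    queue_work(demo_wq, &demo_unbound_work);
    return 0;
}

static void demo_wq_teardown(void)
{
    destroy_workqueue(demo_wq);   /* flushes pending work first */
}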

For more details, see "Linux Interrupt Management (3): workqueue Work Queues", "How Linux workqueues Work", and "Concurrency Managed Workqueue (1): Basic Concepts of workqueue".

5. Other kernel threads

rcu_sched、rcu_bh

migration

khelper

kdevtmpfs

perf

writeback

kintegrityd

bioset

kblockd

ata_sff

rpciod

kswapd

nfsiod

kpsmoused

deferwq

Contact: [email protected]