[linux][perf]list過長導致CPU消耗過高的問題分析

2019 年 12 月 15 日
筆記

前言

某機器上網絡出現時斷時續的問題，網絡的同事發現ovs進程的CPU消耗很高，硬件offload的規則下發卡住的問題。即通過netlink向內核發送消息卡住。

分析

perf分析熱點函數

可見，CPU的主要消耗是在__tcf_chain_get函數和__mutex_lock上。

為什麼鎖消耗那麼高

rtnetlink_rcv_msg函數（去掉參數檢查等等，保留關鍵邏輯），通過這段邏輯我們可以知道，在執行每個netlink命令的時候，都會先執行鎖操作，在執行具體的回調函數（即link->doit），再解鎖的過程。

如果其中某一個回調函數執行時間過長，就會長時間佔用鎖，造成其他的link->doit回調函數block住更長的時間，那麼鎖的消耗也會更高。

再結合其他的代碼邏輯可以發現，__tcf_chain_get函數就剛好在某一個回調函數的路徑上。

Annotate __tcf_chain_get

分析上面的熱點函數__tcf_chain_get

__tcf_chain_get比較長，理論上來說，一個cmp指令只要一個cycle即可，那麼在這裡為什麼一個cmp指令使用了99.13%的CPU時間呢？可以從彙編中看到，需要取rax偏移0x18的地址上數值和esi進行比較。這裡存在一次讀取內存的操作，如果是鏈表的話，前後兩個鏈表元素未必在內存上相鄰，容易造成CPU的cache miss。

計算熱點代碼的路徑

ffffffff8161ab40+1d= ffffffff8161ab5d

所以執行addr2line -e /usr/lib/debug/vmlinux-4.19 -a 0xffffffff8161ab5d

0xffffffff8161ab5d

可以得到/linux/net/sched/cls_api.c:268

可以看到268行是對chain->index和chain_index進行對比。

繼續看tcf_chain結構體

index結構在tcf_chain結構體中偏移0x20，為什麼反彙編的代碼在0x18上？

結合上下文我們可以看到，使用list來遍歷：

chain的地址是在list的地址-0x8，index的地址是在chain+0x20,那麼index的地址相對於list的地址就是+0x18，計算chain的過程都被編譯器優化掉了，只需要使用list的地址+0x18即可完成代碼邏輯中的遍歷過程。

綜上，可以證實，__tcf_chain_get消耗過高的原因是：遍歷list的過程中遇到了比較多的cache miss;遍歷了太多的鏈表元素的導致的。

計算鏈表的長度

基於kprobe實現kmod，來dump出來鏈表的長度。

#include <linux/kernel.h>  #include <linux/module.h>  #include <linux/kprobes.h>  #include <linux/uaccess.h>  #include <net/sch_generic.h>      static long filter_count(struct tcf_chain *chain)  {    struct tcf_proto *tp;    long count = 0;      for (tp = rtnl_dereference(chain->filter_chain);        tp; tp = rtnl_dereference(tp->next)) {      count++;    }    return count;  }      static int kp_do_tcf_handler(struct kprobe *p, struct pt_regs *regs)  {    struct tcf_block *block = (void __user *)regs->di;    struct tcf_chain *chain;    long count = 0;      list_for_each_entry(chain, &block->chain_list, list) {      count++;    }      printk("[always]count = %ldn", count);      return 0;  }    static struct kprobe kp = {    .symbol_name  = "__tcf_chain_get",    .pre_handler = kp_do_tcf_handler,  };    static int __init kprobe_init(void)  {    int ret;      ret = register_kprobe(&kp);    if (ret < 0) {      pr_err("register_kprobe failed, returned %dn", ret);      return ret;    }      pr_info("[probe-tcf]Planted kprobe at %pn", kp.addr);    return 0;  }    static void __exit kprobe_exit(void)  {    unregister_kprobe(&kp);    pr_info("[probe-unregistered]kprobe at %p unregisteredn", kp.addr);  }    module_init(kprobe_init)  module_exit(kprobe_exit)  MODULE_LICENSE("GPL");  MODULE_AUTHOR("pizhenwei [email protected]");

得到dump的結果

可以知道list的元素多少不等，有的不到100，有的到達了接近25W個元素。由此可以論證上面的猜測，鏈表元素太多。

找到哪個dev的鏈表元素過多

再進一步完善kprobe的邏輯，

#include <linux/kernel.h>  #include <linux/module.h>  #include <linux/kprobes.h>  #include <linux/uaccess.h>  #include <net/sch_generic.h>    static long filter_count(struct tcf_chain *chain)  {    struct tcf_proto *tp;    long count = 0;      for (tp = rtnl_dereference(chain->filter_chain);        tp; tp = rtnl_dereference(tp->next)) {      count++;    }      return count;  }    static int kp_do_tcf_handler(struct kprobe *p, struct pt_regs *regs)  {    struct tcf_block *block = (void __user *)regs->di;    struct tcf_chain *chain;    long count = 0, tps;    long tpc[4] = {0};  //繪製filter count分佈圖    long ref[4] = {0};  //繪製refcnt的分佈圖    long action_ref[4] = {0};  //繪製action_refcnt的分佈圖    long actrefonly = 0;  //過濾出來action_refcnt等於refcnt的分佈圖      list_for_each_entry(chain, &block->chain_list, list) {      count++;        tps = filter_count(chain);      if (tps == 0)        tpc[0]++;      else if (tps > 0 && tps <= 1)        tpc[1]++;      else if (tps > 1 && tps <= 2)        tpc[2]++;      else        tpc[3]++;        if (chain->refcnt == 0)        ref[0]++;      else if (chain->refcnt > 0 && chain->refcnt <= 1)        ref[1]++;      else if (chain->refcnt > 1 && chain->refcnt <= 2)        ref[2]++;      else        ref[3]++;        if (chain->action_refcnt == 0)        action_ref[0]++;      else if (chain->action_refcnt > 0 && chain->action_refcnt <= 1)        action_ref[1]++;      else if (chain->action_refcnt > 1 && chain->action_refcnt <= 2)        action_ref[2]++;      else        action_ref[3]++;          if (chain->action_refcnt == chain->refcnt)        actrefonly++;    }      #if 1    if (count < 1000)  //過濾鏈表元素少於1000的情況      return 0;  #endif      printk("[always]DEV %sn", block→q→dev_queue→dev→name);  //打印出來異常的netdev的名字      printk("[always][0]count = %ldt", tpc[0]);    printk("[always][1]count = %ldt", tpc[1]);    printk("[always][2]count = %ldt", tpc[2]);    printk("[always][4]count = %ldn", tpc[3]);      printk("[always][0]action_ref = %ldt", action_ref[0]);    printk("[always][1]action_ref = %ldt", action_ref[1]);    printk("[always][2]action_ref = %ldt", action_ref[2]);    printk("[always][4]action_ref = %ldn", action_ref[3]);      printk("[always][0]ref = %ldt", ref[0]);    printk("[always][1]ref = %ldt", ref[1]);    printk("[always][2]ref = %ldt", ref[2]);    printk("[always][4]ref = %ldn", ref[3]);      printk("[always]actrefonly = %ldn", actrefonly);      return 0;  }    static struct kprobe kp = {    .symbol_name  = "__tcf_chain_get",    .pre_handler = kp_do_tcf_handler,  };    static int __init kprobe_init(void)  {    int ret;      ret = register_kprobe(&kp);    if (ret < 0) {      pr_err("register_kprobe failed, returned %dn", ret);      return ret;    }      pr_info("[probe-tcf]Planted kprobe at %pn", kp.addr);    return 0;  }    static void __exit kprobe_exit(void)  {    unregister_kprobe(&kp);    pr_info("[probe-unregistered]kprobe at %p unregisteredn", kp.addr);  }    module_init(kprobe_init)  module_exit(kprobe_exit)  MODULE_LICENSE("GPL");  MODULE_AUTHOR("pizhenwei [email protected]");

得到dump出來的結果

至此，我們已經找到了vxlan_sys_4789上的元素比較多。

後面，經過網絡同事的驗證（通過tc命令dump出來結果），vxlan_sys_4789上的chain確實過多。腳本刪除空的chain後，ovs的cpu消耗下降到10%以內，網絡恢復正常。

[linux][perf]list過長導致CPU消耗過高的問題分析

VirMach 便宜 VPS

QNews

[linux][perf]list過長導致CPU消耗過高的問題分析

分享此文：

Related Posts

Python使用多進程提高網絡爬蟲的爬取速度

這個設計原則，你認同嗎？

關於這段時間刷算法的總結

SAS-你還在被圖像中的title/footnote困擾嗎…

VirMach 便宜 VPS

QNews

熱門搜尋