使用perf进行性能分析-入门级别

看一个 C 语言的例子

考虑下面这段功能类似 ls 的代码 minils.c

#include <dirent.h> //包含使用的Opendir readdir closedir 函数
#include <error.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h> //包含使用的 stat 结构体和函数 用于获取文件或目录的 状态信息

/*
 * Print one line per entry of the directory `path`, formatted as
 * "<size in bytes> <name>\n". The "." and ".." entries are skipped.
 *
 * path: directory to list (absolute, or relative to the current working
 *       directory).
 *
 * Errors are reported on stderr and are not fatal:
 *   - if the directory cannot be opened, a message is printed and the
 *     function returns without listing anything;
 *   - if an entry cannot be stat'ed (or its joined path does not fit in the
 *     local buffer), a message is printed and that entry is skipped.
 */
void list_dir(const char *path) {
  DIR *dir_p;
  struct dirent *dir_entry;
  struct stat file_stat;
  char full_path[4096]; /* holds "<path>/<entry name>" for stat() */

  dir_p = opendir(path);
  if (dir_p == NULL) {
    perror("open path failed");
    return;
  }
  /*
  Example directory layout.
  ".." is the parent directory and "." is the directory itself;
  both must be skipped when walking a directory.

  ├── .
  ├── ..
  ├── demo-zeros
  ├── learn_perf
  │   ├── log.sh
  │   ├── ls-mini
  │   ├── ls-mini.c
  │   └── test.c
  ├── output
  ├── perf.data
  ├── perf.data.old
  ├── test
  └── test.cc

  */

  /* readdir() hands back the entries one at a time, e.g.
  demo-zeros
  learn_perf
  output
  ...
  Only the direct children are returned; learn_perf's own contents
  (log.sh, ls-mini, ...) are not visited.
  */
  while ((dir_entry = readdir(dir_p)) != NULL) {
    const char *filename = dir_entry->d_name;

    /* Skip the current-directory and parent-directory entries. */
    if (strcmp(filename, ".") == 0 || strcmp(filename, "..") == 0) {
      continue;
    }

    /* d_name is only the entry's name. stat() resolves relative names
       against the current working directory, so the name has to be joined
       with `path` or listing any directory other than "." looks up the
       wrong file. */
    int len = snprintf(full_path, sizeof(full_path), "%s/%s", path, filename);
    if (len < 0 || (size_t)len >= sizeof(full_path)) {
      fprintf(stderr, "path too long: %s/%s\n", path, filename);
      continue;
    }

    /* stat() returns -1 on failure; on success the file's metadata is
       stored in file_stat. */
    if (stat(full_path, &file_stat) == -1) {
      perror("stat failed");
      continue;
    }
    /* Print the size followed by the entry name. */
    printf("%lld ", (long long)file_stat.st_size); /* file size in bytes */
    printf("%s\n", filename);
  }

  /* Release the directory stream; without this every call leaks a handle. */
  closedir(dir_p);
}

/* Program entry point: list the current working directory, then exit with
   status 0. */
int main() {
  const char *target_dir = ".";
  list_dir(target_dir);
  return 0;
}
#output
filament@black:~$ ./learn_perf/mini_ls 
220 .bash_logout
4096 .debug
94860 perf.data.old
40960000 output
2147483648 demo-zeros
1222 .bash_history
807 .profile
4096 .cache
148828 perf.data
183 .wget-hsts
16464 test
4096 learn_perf
4096 .local
124 test.cc
4096 .vscode-server
20 .lesshst
4096 .dotnet
3771 .bashrc

使用perf进行分析

我们自己的 ./mini_ls 的 perf stat 数据

 Performance counter stats for './mini_ls':

              0.94 msec task-clock                #    0.468 CPUs utilized          
                 0      context-switches          #    0.000 /sec                   
                 0      cpu-migrations            #    0.000 /sec                   
                61      page-faults               #   64.980 K/sec                  
            737982      cycles                    #    0.786 GHz                    
            712467      instructions              #    0.97  insn per cycle         
            149577      branches                  #  159.337 M/sec                  
              5528      branch-misses             #    3.70% of all branches        
           3689890      slots                     #    3.931 G/sec                  
            795858      topdown-retiring          #     21.6% retiring              
            477515      topdown-bad-spec          #     12.9% bad speculation       
           1562776      topdown-fe-bound          #     42.4% frontend bound        
            853739      topdown-be-bound          #     23.1% backend bound         

       0.002005034 seconds time elapsed

       0.002259000 seconds user
       0.000000000 seconds sys

系统自带 ls 的 perf stat 数据

 Performance counter stats for 'ls':

              1.87 msec task-clock                #    0.563 CPUs utilized          
                 0      context-switches          #    0.000 /sec                   
                 0      cpu-migrations            #    0.000 /sec                   
               103      page-faults               #   55.160 K/sec                  
           1478451      cycles                    #    0.792 GHz                    
           1652087      instructions              #    1.12  insn per cycle         
            346600      branches                  #  185.615 M/sec                  
             11215      branch-misses             #    3.24% of all branches        
           7392255      slots                     #    3.959 G/sec                  
           1797332      topdown-retiring          #     24.3% retiring              
           1072601      topdown-bad-spec          #     14.5% bad speculation       
           3043869      topdown-fe-bound          #     41.2% frontend bound        
           1478451      topdown-be-bound          #     20.0% backend bound         

       0.003314592 seconds time elapsed

       0.000000000 seconds user
       0.003587000 seconds sys

性能对比:./mini_ls vs. ls (标准)

统计指标 ./mini_ls (自定义) ls (标准) 性能差异(mini_ls vs ls) 结论/意义
实际运行时间 (Elapsed Time) 0.002005034 s 0.003314592 s 🔻 快 39.5% mini_ls 完成任务所需总时间更短。
任务时钟 (task-clock) 0.94 msec 1.87 msec 🔻 少 49.8% 进程在 CPU 上花费的时间更少。
指令数 (instructions) 712,467 1,652,087 🔻 少 56.9% mini_ls 执行的操作步骤(指令)远少于 ls
周期数 (cycles) 737,982 1,478,451 🔻 少 50.1% 占用的 CPU 周期数更少。
IPC (insn per cycle) 0.97 1.12 🔺 低 13.4% mini_ls 每个周期执行的指令数较低,效率略低,可能是前端瓶颈(FE-Bound)所致。
分支预测错误 (branch-misses) 5,528 (3.70%) 11,215 (3.24%) 🔻 数量少一半,但百分比更高 mini_ls 的分支预测失败率略高。
前端瓶颈 (topdown-fe-bound) 42.4% 41.2% 🔺 略高 CPU 等待指令从前端获取或解码的时间占比略高。
后端瓶颈 (topdown-be-bound) 23.1% 20.0% 🔺 略高 CPU 等待后端资源(如内存、执行单元)的时间占比略高。
页错误 (page-faults) 61 103 🔻 少 40.8% 触发缺页异常的次数更少,即需要内核新建映射的内存页更少(不是访问内存的次数)。

另一个使用rust的例子

/// Profiling workload: runs an empty loop body 200,000 times.
fn test2() {
    // 200,000 iterations
    for _ in 0..200_000 {
        ()
    }
}
/// Profiling workload: runs an empty loop body 100,000 times, then calls
/// `test2`.
fn test1() {
    // 100,000 iterations
    for _ in 0..100_000 {
        ()
    }
    test2();
}
/// Entry point: calls `test1` ten times in a row.
fn main() {
    for _ in 0..10 {
        test1();
    }
}

perf命令

sudo perf record --call-graph=dwarf ./target/debug/mytest

--call-graph 作用

  • 启用调用图(call graph)记录
  • 记录函数调用关系,而不仅仅是函数执行时间

dwarf 调试格式

  • DWARF:Debugging With Attributed Record Formats
  • 优势
    • 利用编译器生成的调试信息
    • 能够处理复杂的栈帧布局
sudo perf record --call-graph=dwarf ./mytest

sudo perf report --stdio

读取得到的perf.data数据然后格式化输出

filament@black:~/learn_perf$ sudo perf report --stdio
# To display the perf.data header info, please use --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 146  of event 'cycles'
# Event count (approx.): 34204409
#
# Children      Self  Command  Shared Object         Symbol                                                                                                   
# ........  ........  .......  ....................  .........................................................................................................
#
    42.01%    42.01%  mytest   mytest                [.] <core::ops::range::Range<T> as core::iter::range::RangeIteratorImpl>::spec_next
            |
            ---<core::ops::range::Range<T> as core::iter::range::RangeIteratorImpl>::spec_next

    33.61%    33.61%  mytest   mytest                [.] <i32 as core::iter::range::Step>::forward_unchecked
            |
            ---<i32 as core::iter::range::Step>::forward_unchecked

    10.38%    10.38%  mytest   mytest                [.] core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next
            |
            ---core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next

     6.20%     6.20%  mytest   mytest                [.] mytest::test2
            |
            ---mytest::test2

     3.79%     3.79%  mytest   mytest                [.] mytest::test1
            |
            ---mytest::test1

     2.57%     0.00%  mytest   [kernel.kallsyms]     [k] entry_SYSCALL_64_after_hwframe
            |
            ---entry_SYSCALL_64_after_hwframe
               do_syscall_64
               |          
                --2.24%--x64_sys_call
                          |          
                           --0.82%--__x64_sys_mmap
                                     ksys_mmap_pgoff
                                     vm_mmap_pgoff
                                     do_mmap
                                     mmap_region
                                     __mmap_region

     2.57%     0.00%  mytest   [kernel.kallsyms]     [k] do_syscall_64
            |
            ---do_syscall_64
               |          
                --2.24%--x64_sys_call
                          |          
                           --0.82%--__x64_sys_mmap
                                     ksys_mmap_pgoff
                                     vm_mmap_pgoff
                                     do_mmap
                                     mmap_region
                                     __mmap_region

     2.24%     0.00%  mytest   [kernel.kallsyms]     [k] x64_sys_call
            |
            ---x64_sys_call
               |          
                --0.82%--__x64_sys_mmap
                          ksys_mmap_pgoff
                          vm_mmap_pgoff
                          do_mmap
                          mmap_region
                          __mmap_region

     1.96%     0.00%  mytest   ld-linux-x86-64.so.2  [.] _start
            |
            ---_start
               |          
                --1.63%--_dl_start
                          _dl_start_final (inlined)
                          _dl_sysdep_start
                          dl_main
                          |          
                           --0.82%--_dl_map_object_deps
                                     _dl_catch_exception
                                     openaux
                                     _dl_map_object
                                     _dl_map_object_from_fd
                                     _dl_map_segments (inlined)

     1.63%     0.00%  mytest   ld-linux-x86-64.so.2  [.] _dl_start
            |
            ---_dl_start
               _dl_start_final (inlined)
               _dl_sysdep_start
               dl_main
               |          
                --0.82%--_dl_map_object_deps
                          _dl_catch_exception
                          openaux
                          _dl_map_object
                          _dl_map_object_from_fd
                          _dl_map_segments (inlined)

     1.63%     0.00%  mytest   ld-linux-x86-64.so.2  [.] _dl_sysdep_start
            |
            ---_dl_sysdep_start
               dl_main
               |          
                --0.82%--_dl_map_object_deps
                          _dl_catch_exception
                          openaux
                          _dl_map_object
                          _dl_map_object_from_fd
                          _dl_map_segments (inlined)

     1.63%     0.00%  mytest   ld-linux-x86-64.so.2  [.] dl_main
            |
            ---dl_main
               |          
                --0.82%--_dl_map_object_deps
                          _dl_catch_exception
                          openaux
                          _dl_map_object
                          _dl_map_object_from_fd
                          _dl_map_segments (inlined)

     1.63%     0.00%  mytest   ld-linux-x86-64.so.2  [.] _dl_start_final (inlined)
            |
            ---_dl_start_final (inlined)
               _dl_sysdep_start
               dl_main
               |          
                --0.82%--_dl_map_object_deps
                          _dl_catch_exception
                          openaux
                          _dl_map_object
                          _dl_map_object_from_fd
                          _dl_map_segments (inlined)

     0.96%     0.00%  mytest   mytest                [.] std::rt::lang_start_internal
            |
            ---std::rt::lang_start_internal
               std::panic::catch_unwind (inlined)
               std::panicking::catch_unwind (inlined)
               std::panicking::catch_unwind::do_call (inlined)
               std::rt::lang_start_internal::{{closure}} (inlined)
               std::rt::init (inlined)
               std::sys::pal::unix::init (inlined)
               std::sys::pal::unix::stack_overflow::imp::init (inlined)

     0.96%     0.00%  mytest   mytest                [.] std::panic::catch_unwind (inlined)
            |
            ---std::panic::catch_unwind (inlined)
               std::panicking::catch_unwind (inlined)
               std::panicking::catch_unwind::do_call (inlined)
               std::rt::lang_start_internal::{{closure}} (inlined)
               std::rt::init (inlined)
               std::sys::pal::unix::init (inlined)
               std::sys::pal::unix::stack_overflow::imp::init (inlined)

     0.96%     0.00%  mytest   mytest                [.] std::panicking::catch_unwind (inlined)
            |
            ---std::panicking::catch_unwind (inlined)
               std::panicking::catch_unwind::do_call (inlined)
               std::rt::lang_start_internal::{{closure}} (inlined)
               std::rt::init (inlined)
               std::sys::pal::unix::init (inlined)
               std::sys::pal::unix::stack_overflow::imp::init (inlined)

     0.96%     0.00%  mytest   mytest                [.] std::panicking::catch_unwind::do_call (inlined)
            |
            ---std::panicking::catch_unwind::do_call (inlined)
               std::rt::lang_start_internal::{{closure}} (inlined)
               std::rt::init (inlined)
               std::sys::pal::unix::init (inlined)
               std::sys::pal::unix::stack_overflow::imp::init (inlined)

     0.96%     0.00%  mytest   mytest                [.] std::rt::lang_start_internal::{{closure}} (inlined)
            |
            ---std::rt::lang_start_internal::{{closure}} (inlined)
               std::rt::init (inlined)
               std::sys::pal::unix::init (inlined)
               std::sys::pal::unix::stack_overflow::imp::init (inlined)

     0.96%     0.00%  mytest   mytest                [.] std::rt::init (inlined)
            |
            ---std::rt::init (inlined)
               std::sys::pal::unix::init (inlined)
               std::sys::pal::unix::stack_overflow::imp::init (inlined)

     0.96%     0.00%  mytest   mytest                [.] std::sys::pal::unix::init (inlined)
            |
            ---std::sys::pal::unix::init (inlined)
               std::sys::pal::unix::stack_overflow::imp::init (inlined)

     0.96%     0.00%  mytest   mytest                [.] std::sys::pal::unix::stack_overflow::imp::init (inlined)
            |
            ---std::sys::pal::unix::stack_overflow::imp::init (inlined)

     0.82%     0.00%  mytest   ld-linux-x86-64.so.2  [.] _dl_map_object_deps
            |
            ---_dl_map_object_deps
               _dl_catch_exception
               openaux
               _dl_map_object
               _dl_map_object_from_fd
               _dl_map_segments (inlined)

     0.82%     0.00%  mytest   ld-linux-x86-64.so.2  [.] _dl_catch_exception
            |
            ---_dl_catch_exception
               openaux
               _dl_map_object
               _dl_map_object_from_fd
               _dl_map_segments (inlined)

     0.82%     0.00%  mytest   ld-linux-x86-64.so.2  [.] openaux
            |
            ---openaux
               _dl_map_object
               _dl_map_object_from_fd
               _dl_map_segments (inlined)

     0.82%     0.00%  mytest   ld-linux-x86-64.so.2  [.] _dl_map_object
            |
            ---_dl_map_object
               _dl_map_object_from_fd
               _dl_map_segments (inlined)

     0.82%     0.00%  mytest   ld-linux-x86-64.so.2  [.] _dl_map_object_from_fd
            |
            ---_dl_map_object_from_fd
               _dl_map_segments (inlined)

     0.82%     0.00%  mytest   ld-linux-x86-64.so.2  [.] _dl_map_segments (inlined)
            |
            ---_dl_map_segments (inlined)

     0.82%     0.00%  mytest   ld-linux-x86-64.so.2  [.] __mmap64 (inlined)
            |
            ---__mmap64 (inlined)
               __mmap64 (inlined)
               entry_SYSCALL_64_after_hwframe
               do_syscall_64
               x64_sys_call
               __x64_sys_mmap
               ksys_mmap_pgoff
               vm_mmap_pgoff
               do_mmap
               mmap_region
               __mmap_region

     0.82%     0.00%  mytest   [kernel.kallsyms]     [k] __x64_sys_mmap
            |
            ---__x64_sys_mmap
               ksys_mmap_pgoff
               vm_mmap_pgoff
               do_mmap
               mmap_region
               __mmap_region

     0.82%     0.00%  mytest   [kernel.kallsyms]     [k] ksys_mmap_pgoff
            |
            ---ksys_mmap_pgoff
               vm_mmap_pgoff
               do_mmap
               mmap_region
               __mmap_region

     0.82%     0.00%  mytest   [kernel.kallsyms]     [k] vm_mmap_pgoff
            |
            ---vm_mmap_pgoff
               do_mmap
               mmap_region
               __mmap_region

     0.82%     0.00%  mytest   [kernel.kallsyms]     [k] do_mmap
            |
            ---do_mmap
               mmap_region
               __mmap_region

     0.82%     0.00%  mytest   [kernel.kallsyms]     [k] mmap_region
            |
            ---mmap_region
               __mmap_region

     0.82%     0.00%  mytest   [kernel.kallsyms]     [k] __mmap_region
            |
            ---__mmap_region

     0.73%     0.00%  mytest   [kernel.kallsyms]     [k] asm_exc_page_fault
            |
            ---asm_exc_page_fault

     0.49%     0.49%  mytest   [kernel.kallsyms]     [k] sigaction_compat_abi
     0.49%     0.00%  mytest   libc.so.6             [.] __GI___libc_sigaction (inlined)
     0.49%     0.00%  mytest   [kernel.kallsyms]     [k] __x64_sys_rt_sigaction
     0.49%     0.00%  mytest   [kernel.kallsyms]     [k] do_sigaction
     0.47%     0.47%  mytest   [kernel.kallsyms]     [k] down_write_killable
     0.47%     0.00%  mytest   mytest                [.] std::sys::pal::unix::stack_overflow::imp::install_main_guard (inlined)
     0.47%     0.00%  mytest   mytest                [.] std::sys::pal::unix::stack_overflow::imp::install_main_guard_linux (inlined)
     0.47%     0.00%  mytest   mytest                [.] std::sys::pal::unix::stack_overflow::imp::stack_start_aligned (inlined)
     0.47%     0.00%  mytest   mytest                [.] std::sys::pal::unix::stack_overflow::imp::get_stack_start (inlined)
     0.47%     0.00%  mytest   libc.so.6             [.] __pthread_getattr_np (inlined)
     0.47%     0.00%  mytest   libc.so.6             [.] _IO_new_fopen (inlined)
     0.47%     0.00%  mytest   libc.so.6             [.] __fopen_internal (inlined)
     0.47%     0.00%  mytest   libc.so.6             [.] __GI___libc_malloc (inlined)
     0.47%     0.00%  mytest   libc.so.6             [.] tcache_init (inlined)
     0.47%     0.00%  mytest   libc.so.6             [.] tcache_init (inlined)
     0.47%     0.00%  mytest   libc.so.6             [.] _int_malloc
     0.47%     0.00%  mytest   libc.so.6             [.] sysmalloc
     0.47%     0.00%  mytest   libc.so.6             [.] __glibc_morecore (inlined)
     0.47%     0.00%  mytest   libc.so.6             [.] __GI___sbrk (inlined)
     0.47%     0.00%  mytest   libc.so.6             [.] __brk
     0.47%     0.00%  mytest   [kernel.kallsyms]     [k] __x64_sys_brk
     0.46%     0.00%  mytest   [kernel.kallsyms]     [k] __x64_sys_execve
     0.46%     0.00%  mytest   [kernel.kallsyms]     [k] do_execveat_common.isra.0
     0.46%     0.00%  mytest   [kernel.kallsyms]     [k] bprm_execve
     0.46%     0.00%  mytest   [kernel.kallsyms]     [k] bprm_execve.part.0
     0.46%     0.00%  mytest   [kernel.kallsyms]     [k] exec_binprm
     0.46%     0.00%  mytest   [kernel.kallsyms]     [k] search_binary_handler
     0.46%     0.00%  mytest   [kernel.kallsyms]     [k] load_elf_binary
     0.46%     0.46%  mytest   libc.so.6             [.] __libc_early_init
     0.44%     0.44%  mytest   ld-linux-x86-64.so.2  [.] do_lookup_x
     0.44%     0.00%  mytest   ld-linux-x86-64.so.2  [.] _dl_relocate_object
     0.44%     0.00%  mytest   ld-linux-x86-64.so.2  [.] elf_dynamic_do_Rela (inlined)
     0.44%     0.00%  mytest   ld-linux-x86-64.so.2  [.] elf_machine_rela (inlined)
     0.44%     0.00%  mytest   ld-linux-x86-64.so.2  [.] _dl_lookup_symbol_x
     0.42%     0.42%  mytest   [kernel.kallsyms]     [k] vma_set_page_prot
     0.40%     0.40%  mytest   [kernel.kallsyms]     [k] vma_interval_tree_insert
     0.40%     0.00%  mytest   ld-linux-x86-64.so.2  [.] _dl_map_segment (inlined)
     0.40%     0.00%  mytest   [kernel.kallsyms]     [k] vma_link
     0.40%     0.00%  mytest   [kernel.kallsyms]     [k] __vma_link_file
     0.37%     0.37%  mytest   [kernel.kallsyms]     [k] sync_regs
     0.37%     0.00%  mytest   ld-linux-x86-64.so.2  [.] elf_get_dynamic_info (inlined)
     0.33%     0.33%  mytest   [kernel.kallsyms]     [k] truncate_inode_pages_range
     0.33%     0.00%  mytest   [kernel.kallsyms]     [k] syscall_exit_to_user_mode
     0.33%     0.00%  mytest   [kernel.kallsyms]     [k] exit_to_user_mode_prepare
     0.33%     0.00%  mytest   [kernel.kallsyms]     [k] exit_to_user_mode_loop
     0.33%     0.00%  mytest   [kernel.kallsyms]     [k] task_work_run
     0.33%     0.00%  mytest   [kernel.kallsyms]     [k] ____fput
     0.33%     0.00%  mytest   [kernel.kallsyms]     [k] __fput
     0.33%     0.00%  mytest   [kernel.kallsyms]     [k] dput
     0.33%     0.00%  mytest   [kernel.kallsyms]     [k] dentry_kill
     0.33%     0.00%  mytest   [kernel.kallsyms]     [k] __dentry_kill
     0.33%     0.00%  mytest   [kernel.kallsyms]     [k] dentry_unlink_inode
     0.33%     0.00%  mytest   [kernel.kallsyms]     [k] iput
     0.33%     0.00%  mytest   [kernel.kallsyms]     [k] evict
     0.33%     0.00%  mytest   [kernel.kallsyms]     [k] truncate_inode_pages_final
     0.27%     0.27%  mytest   [kernel.kallsyms]     [k] __clear_user
     0.27%     0.00%  mytest   [kernel.kallsyms]     [k] clear_user
     0.19%     0.19%  mytest   [kernel.kallsyms]     [k] clear_page_erms
     0.19%     0.00%  mytest   [kernel.kallsyms]     [k] setup_arg_pages
     0.19%     0.00%  mytest   [kernel.kallsyms]     [k] shift_arg_pages
     0.19%     0.00%  mytest   [kernel.kallsyms]     [k] move_page_tables
     0.19%     0.00%  mytest   [kernel.kallsyms]     [k] move_page_tables.part.0
     0.19%     0.00%  mytest   [kernel.kallsyms]     [k] __pte_alloc
     0.19%     0.00%  mytest   [kernel.kallsyms]     [k] pte_alloc_one
     0.19%     0.00%  mytest   [kernel.kallsyms]     [k] alloc_pages
     0.19%     0.00%  mytest   [kernel.kallsyms]     [k] __alloc_pages
     0.19%     0.00%  mytest   [kernel.kallsyms]     [k] get_page_from_freelist
     0.18%     0.00%  perf-ex  [kernel.kallsyms]     [k] entry_SYSCALL_64_after_hwframe
     0.18%     0.00%  perf-ex  [kernel.kallsyms]     [k] do_syscall_64
     0.18%     0.00%  perf-ex  [kernel.kallsyms]     [k] x64_sys_call
     0.18%     0.00%  perf-ex  [kernel.kallsyms]     [k] __x64_sys_execve
     0.18%     0.00%  perf-ex  [kernel.kallsyms]     [k] do_execveat_common.isra.0
     0.18%     0.00%  perf-ex  [kernel.kallsyms]     [k] bprm_execve
     0.18%     0.00%  perf-ex  [kernel.kallsyms]     [k] bprm_execve.part.0
     0.18%     0.00%  perf-ex  [kernel.kallsyms]     [k] exec_binprm
     0.18%     0.00%  perf-ex  [kernel.kallsyms]     [k] search_binary_handler
     0.18%     0.00%  perf-ex  [kernel.kallsyms]     [k] load_elf_binary
     0.18%     0.00%  perf-ex  [kernel.kallsyms]     [k] begin_new_exec
     0.18%     0.00%  perf-ex  [kernel.kallsyms]     [k] perf_event_exec
     0.11%     0.11%  perf-ex  [kernel.kallsyms]     [k] send_call_function_single_ipi
     0.11%     0.00%  perf-ex  [kernel.kallsyms]     [k] asm_sysvec_apic_timer_interrupt
     0.11%     0.00%  perf-ex  [kernel.kallsyms]     [k] sysvec_apic_timer_interrupt
     0.11%     0.00%  perf-ex  [kernel.kallsyms]     [k] __sysvec_apic_timer_interrupt
     0.11%     0.00%  perf-ex  [kernel.kallsyms]     [k] hrtimer_interrupt
     0.11%     0.00%  perf-ex  [kernel.kallsyms]     [k] __hrtimer_run_queues
     0.11%     0.00%  perf-ex  [kernel.kallsyms]     [k] tick_sched_timer
     0.11%     0.00%  perf-ex  [kernel.kallsyms]     [k] tick_sched_handle
     0.11%     0.00%  perf-ex  [kernel.kallsyms]     [k] update_process_times
     0.11%     0.00%  perf-ex  [kernel.kallsyms]     [k] scheduler_tick
     0.11%     0.00%  perf-ex  [kernel.kallsyms]     [k] trigger_load_balance
     0.11%     0.00%  perf-ex  [kernel.kallsyms]     [k] nohz_balancer_kick
     0.11%     0.00%  perf-ex  [kernel.kallsyms]     [k] smp_call_function_single_async
     0.11%     0.00%  perf-ex  [kernel.kallsyms]     [k] generic_exec_single
     0.06%     0.00%  perf-ex  [kernel.kallsyms]     [k] native_write_msr
     0.06%     0.00%  perf-ex  [kernel.kallsyms]     [k] ctx_resched
     0.06%     0.00%  perf-ex  [kernel.kallsyms]     [k] x86_pmu_enable
     0.06%     0.00%  perf-ex  [kernel.kallsyms]     [k] intel_pmu_enable_all
     0.04%     0.04%  perf-ex  [kernel.kallsyms]     [k] nmi_restore
     0.01%     0.01%  perf-ex  [kernel.kallsyms]     [k] native_flush_tlb_one_user
     0.00%     0.00%  perf-ex  [kernel.kallsyms]     [k] native_sched_clock
     0.00%     0.00%  perf-ex  [kernel.kallsyms]     [k] its_return_thunk


#
# (Cannot load tips.txt file, please install perf!)
#

当前目录如下

mytest
│   ├── Cargo.lock
│   ├── Cargo.toml
│   ├── out.folded
│   ├── perf.data
│   ├── perf.data.old
│   ├── src
│   │   └── main.rs
│   └── target
          ├── mytest
          
├── FlameGraph
│   ├── README.md
│   ├── aix-perf.pl
│   ├── demos

进入我们的mytest文件夹

perf script | ~/FlameGraph/stackcollapse-perf.pl > out.folded

  • perf script:将 perf.data 中的二进制采样数据转换成可读的文本(每个采样及其调用栈)
  • ~/FlameGraph/stackcollapse-perf.pl: 将 perf script 的多行调用栈 “折叠”成一行一行的汇总格式,并统计每种栈出现的次数
  • 把原始性能数据 → 转成 “栈 + 计数” 的简洁汇总格式

~/FlameGraph/flamegraph.pl out.folded > mytest-flamegraph.svg

  • 读取 out.folded 这种“折叠栈”文件,生成一个交互式 SVG 火焰图

以下是得到的out.folded和火焰图svg

mytest;<core::ops::range::Range<T> as core::iter::range::RangeIteratorImpl>::spec_next 19009718
mytest;<i32 as core::iter::range::Step>::forward_unchecked 6799607
mytest;<i32 as core::iter::range::Step>::forward_unchecked;core::num::<impl i32>::checked_add_unsigned 2168609
mytest;<i32 as core::iter::range::Step>::forward_unchecked;core::num::<impl i32>::checked_add_unsigned;core::num::<impl i32>::overflowing_add_unsigned 6883545
mytest;<i32 as core::iter::range::Step>::forward_unchecked;core::num::<impl i32>::checked_add_unsigned;core::num::<impl i32>::overflowing_add_unsigned;core::num::<impl i32>::overflowing_add 825448
mytest;_start;_dl_start;_dl_start_final;_dl_sysdep_start;dl_main;_dl_map_object_deps;_dl_catch_exception;openaux;_dl_map_object;_dl_map_object_from_fd;_dl_map_segments;__mprotect;entry_SYSCALL_64_after_hwframe;do_syscall_64;x64_sys_call;__x64_sys_mprotect;do_mprotect_pkey;mprotect_fixup;perf_event_mmap;perf_event_mmap_event;perf_iterate_sb;perf_iterate_ctx 143804
mytest;_start;_dl_start;_dl_start_final;_dl_sysdep_start;dl_main;_dl_map_object_deps;_dl_catch_exception;openaux;_dl_map_object;open_verify;__GI___read_nocancel;entry_SYSCALL_64 135150
mytest;_start;_dl_start;_dl_start_final;_dl_sysdep_start;dl_main;_dl_new_object;asm_exc_page_fault;exc_page_fault;do_user_addr_fault;handle_mm_fault;__handle_mm_fault;handle_pte_fault;do_fault;do_read_fault;filemap_map_pages;next_uptodate_page 125265
mytest;_start;_dl_start;_dl_start_final;_dl_sysdep_start;dl_main;_dl_relocate_object;_dl_protect_relro;__mprotect;entry_SYSCALL_64_after_hwframe;do_syscall_64;x64_sys_call;__x64_sys_mprotect;do_mprotect_pkey;mprotect_fixup;perf_event_mmap;perf_event_mmap_event;perf_iterate_sb;perf_iterate_ctx;perf_event_mmap_output;perf_output_begin 156958
mytest;_start;_dl_start;_dl_start_final;_dl_sysdep_start;dl_main;_dl_relocate_object;elf_dynamic_do_Rela;elf_machine_rela_relative;asm_exc_page_fault;exc_page_fault;do_user_addr_fault;find_vma;vmacache_find 150888
mytest;_start;entry_SYSCALL_64_after_hwframe;do_syscall_64;x64_sys_call;__x64_sys_execve;do_execveat_common.isra.0;putname;kmem_cache_free;slab_free_freelist_hook.constprop.0 111752
mytest;core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next 7222319
mytest;entry_SYSCALL_64_after_hwframe;do_syscall_64;x64_sys_call;__x64_sys_execve;do_execveat_common.isra.0;bprm_execve;bprm_execve.part.0;exec_binprm;search_binary_handler;load_elf_binary;elf_map;vm_mmap;vm_mmap_pgoff;do_mmap;mmap_region;__mmap_region;perf_event_mmap;perf_event_mmap_event;perf_iterate_sb;perf_iterate_ctx;perf_event_mmap_output;__perf_event__output_id_sample;perf_output_copy 93497
mytest;entry_SYSCALL_64_after_hwframe;do_syscall_64;x64_sys_call;__x64_sys_execve;do_execveat_common.isra.0;bprm_execve;bprm_execve.part.0;exec_binprm;search_binary_handler;load_elf_binary;setup_arg_pages;shift_arg_pages;move_page_tables;move_page_tables.part.0;alloc_new_pud.constprop.0;__p4d_alloc;get_zeroed_page;alloc_pages;__alloc_pages 69216
mytest;mytest::test1 550608
mytest;mytest::test2 2129064
mytest;std::rt::lang_start_internal;std::panic::catch_unwind;std::panicking::catch_unwind;std::panicking::catch_unwind::do_call;std::rt::lang_start_internal::{{closure}};std::rt::init;std::sys::pal::unix::init;std::sys::pal::unix::init::sanitize_standard_fds;__GI___poll;entry_SYSCALL_64_after_hwframe;do_syscall_64;x64_sys_call;__x64_sys_poll;do_sys_poll;do_poll.constprop.0;tty_poll 162159
mytest;std::rt::lang_start_internal;std::panic::catch_unwind;std::panicking::catch_unwind;std::panicking::catch_unwind::do_call;std::rt::lang_start_internal::{{closure}};std::rt::init;std::sys::pal::unix::init;std::sys::pal::unix::stack_overflow::imp::init;std::sys::pal::unix::stack_overflow::imp::install_main_guard;std::sys::pal::unix::stack_overflow::imp::install_main_guard_linux;std::sys::pal::unix::stack_overflow::imp::stack_start_aligned;std::sys::pal::unix::stack_overflow::imp::get_stack_start;__pthread_getattr_np;__GI___isoc99_sscanf 166637
perf-exec;entry_SYSCALL_64_after_hwframe;do_syscall_64;x64_sys_call;__x64_sys_execve;do_execveat_common.isra.0;bprm_execve;bprm_execve.part.0;exec_binprm;search_binary_handler;load_elf_binary;begin_new_exec;perf_event_exec;asm_sysvec_apic_timer_interrupt;sysvec_apic_timer_interrupt;__sysvec_apic_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;tick_sched_timer;tick_sched_handle;update_process_times;calc_global_load_tick 37506
perf-exec;entry_SYSCALL_64_after_hwframe;do_syscall_64;x64_sys_call;__x64_sys_execve;do_execveat_common.isra.0;bprm_execve;bprm_execve.part.0;exec_binprm;search_binary_handler;load_elf_binary;begin_new_exec;perf_event_exec;ctx_resched;x86_pmu_enable;intel_pmu_enable_all;native_write_msr 19422

mytest-flamegraph

如何下载

scp filament@your_server_ip:/mnt/hdd-ws/users/filament/mytest/mytest-flamegraph.svg D:\Documents

#从服务器下载文件
scp username@servername:file_path destination_path

#上传文件 
scp local_file_path username@server_ip:server_file_path

参考文章:

DatenLord|Rust程序性能分析 - 知乎

我的程序卡在哪里?—— 用火焰图精确定位性能瓶颈 | Mice World

posted @ 2025-10-10 09:55  phrink  阅读(9)  评论(0)    收藏  举报