Linux驱动开发之Oops错误
- 不少新手在驱动开发入门过程中遇到第一个Oops错误时总会不明所以
下面我就结合一个真实案例,来讲解一下遇到Oops错误如何快速定位
关于Oops
- Oops类似于应用层的段错误,区别在于应用层需要借助gdb或其它工具才能查看堆栈信息,而Oops不需要gdb,是内核中的自调试机制给我们进行的反馈
- 对于小的驱动项目,熟练掌握Oops调试配合自定义调试输出足以应对了,大一些的驱动项目才可能会用到kgdb、串口等进行调试
遇到Oops应该怎么做
保存Oops输出
- 在Oops错误影响不大的情况下,日志无需干预就能正常的落地到/var/log/kern.log或/var/log/message日志中
- 在Oops日志影响较为严重的情况下,键鼠可能不再生效、磁盘缓冲区可能也就无法将Oops信息写入,那就掏出手机拍照吧
- 笔者推荐的做法是使用另一台设备远程ssh,使用 tail -f / tailf 工具查看日志,在开发环境死机的情况下,另一台设备还是可以正常看到已输出的Oops信息
Oops日志分析
Aug 30 18:14:31 Kylin kernel: [ 162.780689][ 3] ZDFY001.fake_close: >--------------------------------------------filp->f_flags = 2001
Aug 30 18:14:31 Kylin kernel: [ 162.907781][ 0] ZDFY001.fake_close: >--------------------------------------------filp->f_flags = 2009
Aug 30 18:14:31 Kylin kernel: [ 162.942187][ 3] ZDFY001.fake_close: >--------------------------------------------filp->f_flags = 2009
Aug 30 18:14:31 Kylin kernel: [ 162.942817][ 0] CPU 0 Unable to handle kernel paging request at virtual address 0000303030344173, epc == ffffffffc0475880, ra == ffffffffc0475858
Aug 30 18:14:31 Kylin kernel: [ 162.946277][ 0] Oops[#1]:
Aug 30 18:14:31 Kylin kernel: [ 162.946946][ 0] CPU: 0 PID: 10637 Comm: tipcore Tainted: G O 4.4.131.D001.64.190906 #D001
Aug 30 18:14:31 Kylin kernel: [ 162.949368][ 0] Hardware name: Loongson Kunlun-LS3A4000-7A1000-1w-V1.2-Dev/Kunlun-LS3A4000-7A1000-1w-V1.2-Dev, BIOS Kunlun-V4.0.12 2018/07/01
Aug 30 18:14:31 Kylin kernel: [ 162.952684][ 0] task: 98000000fd83df88 ti: 98000000fdbb8000 task.ti: 98000000fdbb8000
Aug 30 18:14:31 Kylin kernel: [ 162.954720][ 0] $ 0 : 0000000000000000 0000000000000001 fffffffdc0004700 0000000000002000
Aug 30 18:14:31 Kylin kernel: [ 162.957044][ 0] $ 4 : ffffffffc047a868 0000000000000000 ffffffffffffffbb 0000000000000000
Aug 30 18:14:31 Kylin kernel: [ 162.959357][ 0] $ 8 : 0000000000000000 000000004bf39ad8 000000001c1d19ac 00000000630de289
Aug 30 18:14:31 Kylin kernel: [ 162.961653][ 0] $12 : ffffffff80219b40 ffffffff84080018 ffffffffc0475b20 2e74726174732071
Aug 30 18:14:31 Kylin kernel: [ 162.963946][ 0] $16 : ffffffffb80008e0 ffffffffc0480000 0000303030344133 000000ffffb251e0
Aug 30 18:14:31 Kylin kernel: [ 162.966263][ 0] $20 : 00000001238db108 000000ffee4b70f0 000000fff4142580 000000fff4142590
Aug 30 18:14:31 Kylin kernel: [ 162.968595][ 0] $24 : 0000000000000038 ffffffffc0475820
Aug 30 18:14:31 Kylin kernel: [ 162.970917][ 0] $28 : 98000000fdbb8000 98000000fdbbbd70 000000ffee4b5f70 ffffffffc0475858
Aug 30 18:14:31 Kylin kernel: [ 162.973239][ 0] Hi : 0000000000000000
Aug 30 18:14:31 Kylin kernel: [ 162.974260][ 0] Lo : cccccccccccccccd
Aug 30 18:14:31 Kylin kernel: [ 162.975290][ 0] epc : ffffffffc0475880 fake_close.part.0+0x60/0x300 [ZDFY001]
Aug 30 18:14:31 Kylin kernel: [ 162.977199][ 0] ra : ffffffffc0475858 fake_close.part.0+0x38/0x300 [ZDFY001]
Aug 30 18:14:31 Kylin kernel: [ 162.979099][ 0] Status: 5400cce3 KX SX UX KERNEL EXL IE
Aug 30 18:14:31 Kylin kernel: [ 162.980649][ 0] Cause : 10000008 (ExcCode 02)
Aug 30 18:14:31 Kylin kernel: [ 162.981773][ 0] BadVA : 0000303030344173
Aug 30 18:14:31 Kylin kernel: [ 162.982782][ 0] PrId : 00140000 (Loongson-3)
Aug 30 18:14:31 Kylin kernel: [ 162.983911][ 0] Modules linked in: fuse kl_audit(O) policyrules(O) host_integrity_mod(O) softmanager_interface(O) set_soc_time(O) security_interface(O) ZDFY001(O) bmsvfat(O) binfmt_misc bmsfat(O) serio_raw shpchp host_soc_sys(O) host_soc_update(O) host_soc_net(O) host_soc_file(O) pcie_vbus_common(O) tcm_tis amdgpu radeon
Aug 30 18:14:31 Kylin kernel: [ 162.992004][ 0] Process tipcore (pid: 10637, threadinfo=98000000fdbb8000, task=98000000fd83df88, tls=000000ffee4be7f0)
Aug 30 18:14:31 Kylin kernel: [ 162.994808][ 0] Stack : 98000000fc191d40 0000000000000001 ffffffff80c476c0 98000000fdbbbe00
Aug 30 18:14:31 Kylin kernel: [ 162.994808][ 0] 0000000000000000 ffffffff810dfa88 ffffffff81210000 ffffffff8037d890
Aug 30 18:14:31 Kylin kernel: [ 162.994808][ 0] 98000000fd83df88 0000000000000001 98000002549aa000 9800000246049400
Aug 30 18:14:31 Kylin kernel: [ 162.994808][ 0] ffffffff812135f0 0000000100000000 98000000fd83df88 ffffffff802d6e58
Aug 30 18:14:31 Kylin kernel: [ 162.994808][ 0] 98000000fd83df88 ffffffff802d6ce8 000000ffb80008e0 9800000246049400
Aug 30 18:14:31 Kylin kernel: [ 162.994808][ 0] 9800000246049648 9800000246049400 0000000000000001 0000000000000000
Aug 30 18:14:31 Kylin kernel: [ 162.994808][ 0] 000000ffffb251e0 00000001238db108 000000ffee4b70f0 ffffffff802d72c0
Aug 30 18:14:31 Kylin kernel: [ 162.994808][ 0] 00000000630de307 00000000256d5dcc 000000ffee4b70f0 000000ffffb251a0
Aug 30 18:14:31 Kylin kernel: [ 162.994808][ 0] 0000000000000000 000000ffffb251e0 00000001238db108 000000ffee4b70f0
Aug 30 18:14:31 Kylin kernel: [ 162.994808][ 0] 000000fff4142580 000000fff4142590 000000ffee4b5f70 ffffffff80219a50
Aug 30 18:14:31 Kylin kernel: [ 162.994808][ 0] ...
Aug 30 18:14:31 Kylin kernel: [ 163.012886][ 0] Call Trace: //调用跟踪
Aug 30 18:14:31 Kylin kernel: [ 163.013629][ 0] [<ffffffffc0475880>] fake_close.part.0+0x9c/0x*** [ZDFY001] <<---------------------重点
Aug 30 18:14:31 Kylin kernel: [ 163.015464][ 0] [<ffffffff80219a50>] syscall_common+0x34/0x58
Aug 30 18:14:31 Kylin kernel: [ 163.016963][ 0]
Aug 30 18:14:31 Kylin kernel: [ 163.017439][ 0]
Aug 30 18:14:31 Kylin kernel: [ 163.017439][ 0] Code: da521003 12400089 24032000 <8e420040> 10430086 3c030004 24632000 10430083 00000000
Aug 30 18:14:31 Kylin kernel: [ 163.023089][ 0] ---[ end trace 24f6e61d230d4721 ]---
Aug 30 18:14:36 Kylin kernel: [ 167.313654][ 0]
Aug 30 18:14:36 Kylin kernel: [ 167.313654][ 0] do_page_fault(): sending SIGSEGV to mutter for invalid read access from 000000ffe290af40
Aug 30 18:14:36 Kylin kernel: [ 167.316132][ 0] epc = 000000fff363f458 in libpthread-2.23.so[fff3634000+20000]
Aug 30 18:14:36 Kylin kernel: [ 167.318095][ 0] ra = 000000ffef5afa30 in radeonsi_dri.so[ffef16c000+ae4000]
Aug 30 18:14:36 Kylin kernel: [ 167.319973][ 0]
Aug 30 18:14:36 Kylin kernel: [ 167.754457][ 0] skb->len:272
Aug 30 18:14:36 Kylin kernel: [ 167.755192][ 0] kernel receive data: 0
Aug 30 18:14:41 Kylin kernel: [ 172.756329][ 0] skb->len:272
Aug 30 18:14:41 Kylin kernel: [ 172.757073][ 0] kernel receive data: 0
- 类似上面的日志,我们仅关注箭头指示的重点附近几行即可,其中的
fake_close是笔者驱动中自己的函数,其中的0x9c是错误地点相对于fake_close函数起始地址的偏移量
查询地址对应的代码行
生成Debug版本驱动
- 通常我们编译驱动时,在不添加自定义参数的情况下,生成的是Release版本的驱动,当遇到Oops错误时需要再编译一份Debug版本的驱动
- 那么如何生成Debug版本驱动呢,很简单,在我们的MakeFile中加入
ccflags-y += -c -g即可,在源码没有变动的情况下,不用去安装Debug驱动,它只作为反汇编分析使用,相对地址是不会改变的
- 在生成Debug版本驱动后,我们就需要根据函数所在的
.c文件找到对应的.o文件,如果懒得找,使用最后驱动名称后缀为.o的文件,我记得也可以
反汇编.o类型文件
- 使用
objdump工具对找到的.o类型文件作反汇编,举例:objdump -S 源.o > 目标.s
- 下面的内容就是由上面日志匹配的驱动中的
.o文件反汇编得来
0000000000000440 <fake_close>: //fake_close起始位置
return real_open(path, oflags, mode);
}
asmlinkage ssize_t fake_close(ssize_t fd)
{
if(bExit || get_hipflag()) return real_close(fd);
440: 3c020000 lui v0,0x0
*/
return real_open(path, oflags, mode);
}
asmlinkage ssize_t fake_close(ssize_t fd)
{
444: 67bdff90 daddiu sp,sp,-112
if(bExit || get_hipflag()) return real_close(fd);
448: 8c420000 lw v0,0(v0)
*/
return real_open(path, oflags, mode);
}
asmlinkage ssize_t fake_close(ssize_t fd)
{
44c: ebb000b1 gssq s1,s0,32(sp)
450: ebbe01bf gssq ra,s8,96(sp)
454: ebb60177 gssq s7,s6,80(sp)
458: ebb40135 gssq s5,s4,64(sp)
45c: ebb200f3 gssq s3,s2,48(sp)
if(bExit || get_hipflag()) return real_close(fd);
460: 14400007 bnez v0,480 <fake_close+0x40>
464: 00808025 move s0,a0
468: 3c020000 lui v0,0x0
46c: 64420000 daddiu v0,v0,0
470: 0040f809 jalr v0
474: 00000000 nop
478: 1040000b beqz v0,4a8 <fake_close+0x68>
47c: 3c110000 lui s1,0x0
480: 3c020000 lui v0,0x0
484: 02002025 move a0,s0
488: dc590000 ld t9,0(v0)
}
kfree(statfsbuf);
out_unlock:
unhooklock();
return real_close(fd);
}
48c: cbbe01bf gslq ra,s8,96(sp)
490: cbb60177 gslq s7,s6,80(sp)
494: cbb40135 gslq s5,s4,64(sp)
498: cbb200f3 gslq s3,s2,48(sp)
49c: cbb000b1 gslq s1,s0,32(sp)
break;
}
kfree(statfsbuf);
out_unlock:
unhooklock();
return real_close(fd);
4a0: 03200008 jr t9
4a4: 67bd0070 daddiu sp,sp,112
char *pathbuf = NULL, *temp = NULL;
struct file *filp = NULL;
struct fdtable *fdt = NULL;
struct kstat *statbuf = NULL;
struct kstatfs *statfsbuf = NULL;
struct files_struct *files = current->files;
4a8: df820000 ld v0,0(gp)
4ac: dc520520 ld s2,1312(v0)
void hooklock(void)
{
//spin_lock(&lock);
mutex_lock(&mymutex);
4b0: 3c020000 lui v0,0x0
4b4: 64420000 daddiu v0,v0,0
4b8: 0040f809 jalr v0
4bc: 66240000 daddiu a0,s1,0
})
static __always_inline
void __read_once_size(const volatile void *p, void *res, int size)
{
__READ_ONCE_SIZE;
4c0: de430020 ld v1,32(s2)
struct files_struct *files = current->files;
PCOMMON_DATA common_data = NULL;
hooklock();
fdt = files_fdtable(files);
if (fd >= fdt->max_fds) goto out_unlock; //判断描述符合法
4c4: 9c620000 lwu v0,0(v1)
4c8: 0202102a slt v0,s0,v0
4cc: 1040000b beqz v0,4fc <fake_close+0xbc>
4d0: 3c020000 lui v0,0x0
filp = fdt->fd[fd]; //从fd表获取文件指针
4d4: dc620008 ld v0,8(v1)
4d8: 001090f8 dsll s2,s0,0x3
4dc: d8529003 gsldx s2,0(v0,s2) //fake_close起始位置+0x9c = 4dc
if (!filp) goto out_unlock; //判断指针合法
4e0: 12400006 beqz s2,4fc <fake_close+0xbc>
4e4: 3c020000 lui v0,0x0
if(!(O_ACCMODE&filp->f_flags)) goto out_unlock; //只读打开
4e8: 8e420040 lw v0,64(s2)
4ec: 30420003 andi v0,v0,0x3
4f0: 1440000f bnez v0,530 <fake_close+0xf0>
4f4: 3c140000 lui s4,0x0
计算偏移地址所在行代码
- 根据我们前面得到的信息,崩溃地点在基于
fake_close函数起始地址的0x9c行
- 也就是
0x440 + 0x9c = 0x4dc,如此得出引起崩溃的汇编代码是4dc: d8529003 gsldx s2,0(v0,s2),也就是说是C代码filp = fdt->fd[fd];引起了崩溃
- 根据
filp = fdt->fd[fd];这行C代码不难看出,有两种崩溃可能
- 一是可能存在对空指针解引用
- 二是可能存在数组访问越界
至此,分析结束