<gdb调试程序>
gdb主要用于调试用户态程序、内核模块及驱动程序,支持断点设置、单步执行、变量查看、堆栈回溯等功能,尤其适用于定位段错误、内存泄漏及多线程同步问题。
此外还需要考虑coredump的使用:客户的设备是否需要开启coredump,以及生成的coredump文件是否会过大。
1.调试用户态程序(如应用程序崩溃)
段错误demo:
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <pthread.h> 4 #include <string.h> 5 #include <unistd.h> 6 7 // 链表节点结构体 8 typedef struct Node { 9 int data; 10 struct Node* next; 11 } Node; 12 13 // 全局链表头指针(未初始化,导致野指针) 14 Node* head; 15 pthread_mutex_t lock; 16 17 // 线程1:添加数据到链表 18 void* input_thread(void* arg) { 19 int value; 20 while (1) { 21 printf("Enter a number (0 to exit): "); 22 scanf("%d", &value); 23 if (value == 0) break; 24 25 // 创建新节点 26 Node* new_node = (Node*)malloc(sizeof(Node)); 27 new_node->data = value; 28 29 // 故意错误:未检查 head 是否为 NULL,直接操作野指针 30 pthread_mutex_lock(&lock); 31 new_node->next = head->next; // 段错误触发点(head 未初始化) 32 head->next = new_node; // 进一步解引用野指针 33 pthread_mutex_unlock(&lock); 34 } 35 return NULL; 36 } 37 38 // 线程2:遍历链表并计算平方 39 void* process_thread(void* arg) { 40 while (1) { 41 pthread_mutex_lock(&lock); 42 Node* current = head; // 同样使用未初始化的 head 43 printf("Processing list: "); 44 while (current != NULL) { 45 printf("%d ", current->data * current->data); // 可能的段错误 46 current = current->next; 47 } 48 printf("\n"); 49 pthread_mutex_unlock(&lock); 50 sleep(1); 51 } 52 return NULL; 53 } 54 55 int main() { 56 pthread_t tid1, tid2; 57 pthread_mutex_init(&lock, NULL); 58 59 // 创建线程 60 pthread_create(&tid1, NULL, input_thread, NULL); 61 pthread_create(&tid2, NULL, process_thread, NULL); 62 63 // 等待线程结束(实际不会执行到这里) 64 pthread_join(tid1, NULL); 65 pthread_join(tid2, NULL); 66 67 pthread_mutex_destroy(&lock); 68 return 0; 69 }
编译:
gcc -g gdb.c -lpthread
运行后:
Enter a number (0 to exit): 5 Segmentation fault (core dumped)
gdb调试过程:
1.运行程序复现崩溃
(gdb) run
输入 5 后程序崩溃,GDB 自动暂停:
Thread 2 "a.out" received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7ffff77c2700 (LWP 383)]
0x0000555555400a8f in input_thread (arg=0x0) at gdb.c:31
31 new_node->next = head->next; // 段错误触发点(head 未初始化)
此时GDB的输出其实已经指出了段错误发生的行号和对应的代码,接下来可以通过gdb的其他命令进一步定位。
2.查看崩溃位置
(gdb) bt # 查看调用栈
输出:
#0 0x0000555555400a8f in input_thread (arg=0x0) at gdb.c:31
#1 0x00007ffff7bbb6db in start_thread (arg=0x7ffff77c2700) at pthread_create.c:463
#2 0x00007ffff78e471f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
可以看到#0显示,错误发生在input_thread的31行。
3.查看31行附近的代码
(gdb) list 31
输出:
26 Node* new_node = (Node*)malloc(sizeof(Node)); 27 new_node->data = value; 28 29 // 故意错误:未检查 head 是否为 NULL,直接操作野指针 30 pthread_mutex_lock(&lock); 31 new_node->next = head->next; // 段错误触发点(head 未初始化) 32 head->next = new_node; // 进一步解引用野指针 33 pthread_mutex_unlock(&lock); 34 } 35 return NULL;
4.打印变量值
(gdb) print head
输出:
$1 = (Node *) 0x0 # head 是 NULL(空指针):全局变量未显式初始化时默认为 0,对它解引用就会触发段错误!
2.调试多线程程序
场景:多线程程序出现死锁或数据竞争,需分析线程状态和同步问题。
多线程死锁以及数据竞争demo:
1 #include <stdio.h> 2 #include <pthread.h> 3 #include <unistd.h> 4 5 pthread_mutex_t mutex_A = PTHREAD_MUTEX_INITIALIZER; 6 pthread_mutex_t mutex_B = PTHREAD_MUTEX_INITIALIZER; 7 int shared_data = 0; // 未加锁保护的共享变量(存在数据竞争) 8 9 void* thread1_func(void* arg) { 10 pthread_mutex_lock(&mutex_A); 11 printf("Thread 1: Locked mutex_A\n"); 12 13 // 模拟业务逻辑(耗时操作) 14 sleep(1); 15 16 // 故意制造死锁:尝试获取mutex_B,但mutex_B可能被线程2持有 17 printf("Thread 1: Trying to lock mutex_B...\n"); 18 pthread_mutex_lock(&mutex_B); 19 printf("Thread 1: Locked mutex_B\n"); 20 21 // 操作共享数据(未加锁,存在数据竞争) 22 shared_data++; 23 printf("Thread 1: shared_data = %d\n", shared_data); 24 25 pthread_mutex_unlock(&mutex_B); 26 pthread_mutex_unlock(&mutex_A); 27 return NULL; 28 } 29 30 void* thread2_func(void* arg) { 31 pthread_mutex_lock(&mutex_B); 32 printf("Thread 2: Locked mutex_B\n"); 33 34 // 模拟业务逻辑(耗时操作) 35 sleep(1); 36 37 // 故意制造死锁:尝试获取mutex_A,但mutex_A可能被线程1持有 38 printf("Thread 2: Trying to lock mutex_A...\n"); 39 pthread_mutex_lock(&mutex_A); 40 printf("Thread 2: Locked mutex_A\n"); 41 42 // 操作共享数据(未加锁,存在数据竞争) 43 shared_data--; 44 printf("Thread 2: shared_data = %d\n", shared_data); 45 46 pthread_mutex_unlock(&mutex_A); 47 pthread_mutex_unlock(&mutex_B); 48 return NULL; 49 } 50 51 void* thread3_func(void* arg) { 52 // 故意不加锁,直接修改shared_data(数据竞争) 53 for (int i = 0; i < 5; i++) { 54 shared_data += 2; 55 printf("Thread 3: shared_data = %d (no lock!)\n", shared_data); 56 sleep(1); 57 } 58 return NULL; 59 } 60 61 int main() { 62 pthread_t t1, t2, t3; 63 64 // 创建线程 65 pthread_create(&t1, NULL, thread1_func, NULL); 66 pthread_create(&t2, NULL, thread2_func, NULL); 67 pthread_create(&t3, NULL, thread3_func, NULL); 68 69 // 等待线程结束(实际上会卡死在死锁) 70 pthread_join(t1, NULL); 71 pthread_join(t2, NULL); 72 pthread_join(t3, NULL); 73 74 printf("Final shared_data = %d\n", shared_data); 75 return 0; 76 }
运行后输出:
Thread 1: Locked mutex_A Thread 2: Locked mutex_B Thread 3: shared_data = 2 (no lock!) Thread 1: Trying to lock mutex_B... Thread 2: Trying to lock mutex_A... Thread 3: shared_data = 4 (no lock!) Thread 3: shared_data = 6 (no lock!) Thread 3: shared_data = 8 (no lock!) Thread 3: shared_data = 10 (no lock!) 然后卡住
gdb调试过程:
1.查看线程状态:
(gdb) info threads Id Target Id Frame * 1 Thread 0x7ffff7fe7740 (LWP 416) "a.out" 0x00007ffff7bbcd2d in __GI___pthread_timedjoin_ex (threadid=140737345496832, thread_return=0x0, abstime=0x0, block=<optimized out>) at pthread_join_common.c:89 2 Thread 0x7ffff77c2700 (LWP 420) "a.out" __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135 3 Thread 0x7ffff6fc1700 (LWP 421) "a.out" __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
可以看到线程2和3都在lock_wait(在虚拟机调试是显示这个,在板卡运行就不是这个),说明这两个线程都在等待一个锁。
2.切换到死锁线程
(gdb) thread 2 [Switching to thread 2 (Thread 0x7ffff77c2700 (LWP 420))] #0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135 135 ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S: No such file or directory. (gdb) bt #0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135 #1 0x00007ffff7bbe025 in __GI___pthread_mutex_lock (mutex=0x555555602080 <mutex_B>) at ../nptl/pthread_mutex_lock.c:80 #2 0x0000555555400900 in thread1_func (arg=0x0) at gdb.c:18 #3 0x00007ffff7bbb6db in start_thread (arg=0x7ffff77c2700) at pthread_create.c:463 #4 0x00007ffff78e471f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
切换到线程3也是一样的方法,它对应thread2_func,阻塞在39行,等待mutex_A。
可以看到gdb的线程2(thread1_func)阻塞在18行,等待mutex_B。两个线程各自持有对方需要的锁并互相等待,因此形成死锁。
3.除了检测死锁,还可以通过watch检测是不是存在共享资源没加锁导致资源被随意更改
(gdb) watch shared_data Hardware watchpoint 1: shared_data (gdb) r Starting program: /home/zhuangquan/test/a.out [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1". [New Thread 0x7ffff77c2700 (LWP 429)] Thread 1: Locked mutex_A [New Thread 0x7ffff6fc1700 (LWP 430)] Thread 2: Locked mutex_B [New Thread 0x7ffff67c0700 (LWP 431)] [Switching to Thread 0x7ffff67c0700 (LWP 431)] Thread 4 "a.out" hit Hardware watchpoint 1: shared_data Old value = 0 New value = 2 thread3_func (arg=0x0) at gdb.c:55 55 printf("Thread 3: shared_data = %d (no lock!)\n", shared_data); (gdb) list 55 50 51 void* thread3_func(void* arg) { 52 // 故意不加锁,直接修改shared_data(数据竞争) 53 for (int i = 0; i < 5; i++) { 54 shared_data += 2; 55 printf("Thread 3: shared_data = %d (no lock!)\n", shared_data); 56 sleep(1); 57 } 58 return NULL; 59 }
3.调试动态库
场景:动态库函数调用异常,需单独调试库代码。
4.结合valgrind生成泄漏报告,再用GDB定位
demo描述:
- 隐式泄漏:malloc分配的内存未被释放。
- 显式泄漏:链表节点动态分配后未释放。
- 间接泄漏:链表中的字符串数据未释放。
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <string.h> 4 #include <unistd.h> 5 6 typedef struct Node { 7 char *data; 8 struct Node *next; 9 } Node; 10 11 // 隐式内存泄漏:分配的内存未释放 12 void implicit_leak() { 13 char *leak_ptr = malloc(100); 14 strcpy(leak_ptr, "This is an implicit leak!"); 15 printf("Implicit leak: %s\n", leak_ptr); 16 // 忘记 free(leak_ptr); 17 } 18 19 // 显式内存泄漏:链表节点未释放 20 Node* create_node(const char *data) { 21 Node *node = malloc(sizeof(Node)); 22 node->data = strdup(data); // 动态分配字符串 23 node->next = NULL; 24 return node; 25 } 26 27 void add_node(Node **head, const char *data) { 28 Node *new_node = create_node(data); 29 new_node->next = *head; 30 *head = new_node; 31 } 32 33 // 显式内存泄漏:未释放链表 34 void explicit_leak() { 35 Node *head = NULL; 36 add_node(&head, "Node 1"); 37 add_node(&head, "Node 2"); 38 add_node(&head, "Node 3"); 39 40 // 遍历链表(但不释放) 41 Node *current = head; 42 while (current) { 43 printf("Explicit leak: %s\n", current->data); 44 current = current->next; 45 } 46 // 忘记 free_list(head); 47 } 48 49 // 错误的释放方式(仅释放节点,未释放节点内的字符串) 50 void wrong_free(Node *head) { 51 Node *current = head; 52 while (current) { 53 Node *temp = current; 54 current = current->next; 55 free(temp); // 只释放节点,未释放 temp->data 56 } 57 } 58 59 int main() { 60 implicit_leak(); 61 explicit_leak(); 62 63 // 模拟另一个泄漏场景(错误释放) 64 Node *wrong_list = NULL; 65 add_node(&wrong_list, "Wrong Node 1"); 66 add_node(&wrong_list, "Wrong Node 2"); 67 wrong_free(wrong_list); // 导致字符串内存泄漏 68 69 // 程序故意不释放内存,以便 Valgrind 检测 70 printf("Memory leaks generated. Use Valgrind and GDB to debug.\n"); 71 return 0; 72 }
使用valgrind检测内存泄漏的位置:
valgrind --leak-check=full --show-leak-kinds=all ./memleak_demo
输出:
==1234== 100 bytes in 1 blocks are definitely lost in loss record 1 of 3 ==1234== at 0x483AB65: malloc (vg_replace_malloc.c:307) ==1234== by 0x401123: implicit_leak (memleak_demo.c:12) ==1234== by 0x401234: main (memleak_demo.c:45) ==1234== 120 bytes in 3 blocks are definitely lost in loss record 2 of 3 ==1234== at 0x483AB65: malloc (vg_replace_malloc.c:307) ==1234== by 0x401156: create_node (memleak_demo.c:18) ==1234== by 0x401189: add_node (memleak_demo.c:25) ==1234== by 0x4011F2: explicit_leak (memleak_demo.c:33) ==1234== by 0x401248: main (memleak_demo.c:46) ==1234== 40 bytes in 2 blocks are indirectly lost in loss record 3 of 3 ==1234== at 0x483AB65: malloc (vg_replace_malloc.c:307) ==1234== by 0x483D8FF: strdup (vg_replace_strmem.c:524) ==1234== by 0x401166: create_node (memleak_demo.c:19) ==1234== by 0x401189: add_node (memleak_demo.c:25) ==1234== by 0x401210: wrong_free (memleak_demo.c:40) ==1234== by 0x40125C: main (memleak_demo.c:49)
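- 明确泄漏(definitely lost):implicit_leak()中的malloc(100)未释放;explicit_leak()中的链表头节点未释放(head 指针随函数返回而丢失);wrong_free()释放了节点,但未释放strdup()分配的字符串——节点释放后已没有任何指针指向这些字符串,因此它们同样属于明确泄漏。
- 间接泄漏(indirectly lost):只能通过已泄漏内存块才能访问到的内存,例如explicit_leak()中挂在泄漏节点上的后续节点和strdup()字符串。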
使用gdb定位泄漏点:
在上面几个关键地方设置断点:
(gdb) break memleak_demo.c:12 # implicit_leak() 中的 malloc (gdb) break memleak_demo.c:18 # create_node() 中的 malloc (gdb) break memleak_demo.c:19 # create_node() 中的 strdup (gdb) break memleak_demo.c:37 # wrong_free() 中的 free
运行程序:
(gdb) run
调试implicit_leak():
(gdb) break memleak_demo.c:12 (gdb) run (gdb) next # 执行 malloc (gdb) print leak_ptr # 查看分配的地址 $1 = (char *) 0x555555756260 "This is an implicit leak!" (gdb) continue # 程序继续运行,但不会释放该内存
调试explicit_leak():
(gdb) break memleak_demo.c:18 (gdb) run (gdb) next # 执行 malloc(sizeof(Node)) (gdb) print node # 查看节点地址 $2 = (Node *) 0x5555557562a0 (gdb) next # 执行 strdup(data) (gdb) print node->data # 查看字符串地址 $3 = (char *) 0x5555557562c0 "Node 3" (gdb) continue # 程序继续运行,但不会释放链表
调试wrong_free():
(gdb) break memleak_demo.c:37 (gdb) run (gdb) next # 执行 free(temp) (gdb) print temp->data # 发现字符串未被释放 $4 = (char *) 0x555555756300 "Wrong Node 2" (gdb) continue # 程序继续运行,导致字符串泄漏
为什么用valgrind定位到泄漏点之后,不直接看代码解决内存泄漏,还要再通过gdb定位?
Valgrind 是一个 动态分析工具(通过动态二进制插桩监控程序运行时的内存分配与释放),它能告诉你 “哪里泄漏了”,但无法直接回答 “为什么泄漏” 或 “泄漏是如何发生的”。
比如:
==1234== 100 bytes in 1 blocks are definitely lost in loss record 1 of 3 ==1234== at 0x483AB65: malloc (vg_replace_malloc.c:307) ==1234== by 0x401123: implicit_leak (memleak_demo.c:12) ==1234== by 0x401234: main (memleak_demo.c:45)
- 优点:明确指出泄漏的内存是在memleak_demo.c第12行(malloc调用)分配的。
- 缺点:
- 无法观察运行时行为:比如变量是否被意外修改、指针是否被覆盖。
- 无法跟踪间接泄漏:如链表节点中的字符串未释放(Valgrind 只能告诉你“节点泄漏”,但无法直接显示“字符串未释放”的调用链)。
- 无法调试条件泄漏:如某些分支下才发生的泄漏(Valgrind 只能给出统计结果,无法动态观察)。
GDB 是一个 动态调试工具,它能让你 实时观察程序运行时的内存状态,回答以下问题:
- “为什么这块内存没有被释放?”(例如:指针被意外覆盖、提前退出导致free未执行、逻辑错误导致跳过释放代码)
- “泄漏的内存是如何被分配和使用的?”(例如:链表节点的next指针是否正确维护、字符串是否被多次分配但只释放一次)
- “泄漏是否由特定输入或条件触发?”(例如:仅在特定网络数据包到达时发生泄漏)
有时候只看代码并不够:如果代码的设计逻辑和程序实际运行的情况不一致,就需要用gdb在运行时观察,才能定位问题。
valgrind+gdb联合使用的经典场景:
1.隐式泄露(忘记释放)
Valgrind输出:
==1234== 100 bytes in 1 blocks are definitely lost at memleak_demo.c:12
代码中可能有多处 malloc,但 Valgrind 只告诉你行号,无法确认 是哪个变量持有这块内存。
用 GDB 可以
(gdb) break memleak_demo.c:12 (gdb) run (gdb) print leak_ptr # 查看指针值 (gdb) watch leak_ptr # 监控指针是否被意外修改
2.链表/树结构泄露
Valgrind输出:
==1234== 120 bytes in 3 blocks are definitely lost (链表节点泄漏) ==1234== 40 bytes in 2 blocks are indirectly lost (节点中的字符串未释放)
Valgrind 无法告诉你 链表是如何被遍历和释放的(例如:是否在释放节点时漏掉了 data 字段)。
用 GDB 可以:
(gdb) break create_node # 观察节点分配 (gdb) break free_list # 观察释放逻辑 (gdb) next (gdb) print node->data # 检查字符串是否被正确释放
3.条件泄露(仅在特殊情况下发生)
Valgrind输出:
==1234== 50 bytes in 1 blocks are possibly lost (可能泄漏,取决于分支)
Valgrind 无法告诉你 泄漏是否由特定输入或分支触发。
用 GDB 可以:
(gdb) break some_condition_check # 在关键分支处设断点 (gdb) cond 1 $_streq(input, "trigger_leak") # 仅在特定输入时暂停(字符串要比较内容,不能用 == 比较指针) (gdb) run
4.多线程泄露
Valgrind输出:
==1234== 200 bytes in 4 blocks are definitely lost (多线程竞争导致泄漏)
Valgrind 无法告诉你 泄漏是否由线程竞争导致(例如:一个线程分配内存,另一个线程释放了错误的指针)。
用 GDB 可以:
(gdb) set scheduler-locking on # 单步调试特定线程 (gdb) break malloc (gdb) thread apply all bt # 查看所有线程的调用栈
gdb怎么调试多进程?
浙公网安备 33010602011771号