# 커널 크래시 디버깅 및 TroubleShooting
- Race로 mmc_wait_data_done() 함수에서 커널 패닉
- "cat /d/shrinker" 입력 시 커널 패닉
- 함수 포인터 미지정으로 xfrm_local_error() 커널 패닉
- preempt 조건으로 ___might_sleep() 함수 크래시
- 스택 카나리: __stack_chk_fail() 함수 크래시
- 스택 카나리: tcp_v4_rcv -> __stack_chk_fail 크래시
- 뮤텍스 데드락(Mutex Deadlock)으로 락업(lockup)
- 디바이스 드라이버 Signature 문제로 커널 크래시
- 메모리 불량 커널 크래시 @find_vma_links()
- 메모리 불량 커널 크래시 @ttwu_do_activate()
- Race로 ipv6_ifa_notify() Stuck - watchdog reset
- tty_wakeup() 함수 Data Abort
- irq_affinity_notify() 함수 Data Abort
- cpuacct_charge() 함수 Data Abort
- 워크큐(workqueue) 락업(1)
- 워크큐(workqueue) 락업(2)
- 워크큐(workqueue) 락업(3)
Debugging
"ndroid.systemui" 프로세스가 raw_spin_lock_irqsave() API 동작 중에 watchdog reset이 되었다.
-000|aee_wdt_atf_entry()
-001|prefetchw(inline)
-001|arch_spin_lock(inline)
-001|do_raw_spin_lock_flags(inline)
-001|__raw_spin_lock_irqsave(inline)
-001|raw_spin_lock_irqsave(lock = 0xDD82725C)
-002|__wake_up(q = 0xDD82725C, mode = 1, nr_exclusive = 1, key = 0x0)
-003|mmc_wait_data_done
-004|msdc_irq_data_complete(host = 0xDD827280, ?, error = 0)
-005|msdc_irq(?, dev_id = 0xDD13B9C8)
-006|static_key_count(inline)
-006|static_key_false(inline)
-006|trace_irq_handler_exit(inline)
-006|handle_irq_event_percpu(?, action = 0xDD13B9C8)
-007|handle_irq_event(desc = 0xC0F38440)
-008|cond_unmask_eoi_irq(inline)
-008|handle_fasteoi_irq(?, desc = 0xC0F38440)
-009|static_key_count(inline)
-009|static_key_false(inline)
-009|trace_irq_exit(inline)
-009|__handle_domain_irq(domain = 0xDE806000, hwirq = 111, lookup = TRUE, ?)
-010|__raw_readl(inline)
-010|gic_handle_irq(regs = 0xDD13BFB0)
-011|__irq_usr(asm)
spinlock이 release되어 spinlock 변수가 [owner: 0x56AC, next: 0x56AD]로 업데이트되기까지 기다리고 있었다.
다른 프로세스가 spinlock을 잡고 있었는데..
(raw_spinlock_t *)0xDD82725C = 0xDD82725C -> (
raw_lock = (
slock = 0x56AD56AB,
tickets = (
owner = 0x56AB, // <<--
next = 0x56AD))) // <<--
0xDD82725C spinlock 변수 주소를 누가 접근하는 지 검색해보면.
(where)
(raw_spinlock_t *)0xDD82725C
c2d01b78: dd82725c
c2d01b7c: dd82725c
c44f0994: dd82725c
dd13bd60: dd82725c // "ndroid.systemui"
dd13bd9c: dd82725c // "ndroid.systemui"
dd13bdc8: dd82725c // "ndroid.systemui"
ddbefdb8: dd82725c // <<--"mmcqd/0"
ddbefdd0: dd82725c
e0abd994: dd82725c
"mmcqd/0" 프로세스가 접근함을 알 수 있었다.
v.v % (struct thread_info*)(0xDDBEFDB8 & ~0x1fff)
(long unsigned int) swap_in = 0 = 0x0 = '....',
(long unsigned int) swap_out = 0 = 0x0 = '....',
(struct task_cputime) cputime_expires = ((cputime_t) utime = 0 = 0x0 = '....', (cputime_t) stime = 0 = 0x0 = '....', (long lo
(struct list_head [3]) cpu_timers = ([0] = ((struct list_head *) next = 0xDDB49A08 = -> ((struct list_head *) next = 0xDDB49
(struct cred *) real_cred = 0xDDB71200 = -> ((atomic_t) usage = ((int) counter = 2 = 0x2 = '....'), (kuid_t) uid = ((uid_t)
(struct cred *) cred = 0xDDB71200 = -> ((atomic_t) usage = ((int) counter = 2 = 0x2 = '....'), (kuid_t) uid = ((uid_t) val =
(char [16]) comm = "mmcqd/0",
"mmcqd/0" 프로세스 콜스택은 아래와 같다.
-000|schedule()
-001|mmc_wait_for_data_req_done(inline)
-001|mmc_start_req(host = 0xC0F75D0C, areq = 0xC0F644E0, error = 0xC102CB08)
-002|mmc_blk_issue_rw_rq(mq = 0xDDB64408, rqc = 0x0)
-003|mmc_blk_issue_rq(mq = 0xDDB64408, req = 0x0)
-004|mmc_queue_thread(d = 0xDDB64408)
-005|kthread(_create = 0xDDB8BDC0)
-006|ret_from_fork(asm)
-007|ret_fast_syscall(asm)
mmc_wait_for_data_req_done 코드를 가보면, spin_lock_irqsave(&context_info->lock, flags); API를 통해 lock을 걸고 있었다.
static int mmc_wait_for_data_req_done(struct mmc_host *host,
struct mmc_request *mrq,
struct mmc_async_req *next_req)
{
struct mmc_command *cmd;
struct mmc_context_info *context_info = &host->context_info;
int err;
unsigned long flags;
while (1) {
wait_event_interruptible(context_info->wait,
(context_info->is_done_rcv ||
context_info->is_new_req));
spin_lock_irqsave(&context_info->lock, flags);
context_info->is_waiting_last_req = false;
조금 더 코드를 보면 Watchdog Reset이 trigger된 "ndroid.systemui" 프로세스 콜 스택에 아래 함수들이 있었다.
-003|mmc_wait_data_done
-004|msdc_irq_data_complete(host = 0xDD827280, ?, error = 0)
mmc_wait_data_done() 함수를 보면 mrq->host->context_info.is_done_rcv = true로 설정하고,
&mrq->host->context_info.wait wait queue을 깨우는 역할을 수행했다.
static void mmc_wait_data_done(struct mmc_request *mrq){
mrq->host->context_info.is_done_rcv = true;
wake_up_interruptible(&mrq->host->context_info.wait);
}
그런데 mmc_wait_for_data_req_done() 함수의 아래 구문은 context_info->is_done_rcv 또는 context_info->is_new_req 중 하나가 true가
될 때까지 기다리는 코드다.
wait_event_interruptible(context_info->wait,
(context_info->is_done_rcv ||
context_info->is_new_req));
정리해보면, race condition으로 wake_up_interruptible(&mrq->host->context_info.wait); 함수가 wait queue(&mrq->host->context_info.wait)에
등록된 프로세스를 깨우기 전에, 위 wait_event_interruptible() 구문의 조건이 먼저 만족되어 프로세스가 이미 깨어나 버린 경우임을 짐작할 수 있다.
잠을 자고 있는 누군가를 깨우려고 손을 대려고 하는데 이미 깨버린 경우와 비슷하네.
In-depth Analysis
내 추정이 맞다면 아래 코드를 삽입하면 바로 커널 패닉이 나야 한다.
(램덤프 force_mmc_wait_data_done_crash)
static void mmc_wait_data_done(struct mmc_request *mrq){
mrq->host->context_info.is_done_rcv = true;
+ mdelay(1);
wake_up_interruptible(&mrq->host->context_info.wait);
}
예상대로 바로 커널 패닉이 발생했다.
-000|do_DataAbort(addr = 0, fsr = 0, regs = 0x0)
-001|__dabt_svc(asm)
-->|exception
-002|debug_spin_lock_before(inline)
-002|do_raw_spin_lock(lock = 0x02B4)
-003|raw_spin_lock_irqsave(lock = 0x02B4)
-004|__wake_up(q = 0x02B4, mode = 1, nr_exclusive = 1, key = 0x0)
-005|mmc_wait_data_done(mrq = 0xDC9725F8)
-006|msdc_irq_data_complete(host = 0xDC919300, ?, error = 0)
-007|msdc_irq(?, dev_id = 0x1)
-008|handle_irq_event_percpu(?, action = 0xDC91E0C0)
-009|handle_irq_event(desc = 0xC121A440)
-010|cond_unmask_eoi_irq(inline)
-010|handle_fasteoi_irq(?, desc = 0xC121A440)
-011|static_key_count(inline)
-011|static_key_false(inline)
-011|trace_irq_exit(inline)
-011|__handle_domain_irq(domain = 0xDE806000, hwirq = 111, lookup = TRUE, ?)
-012|__raw_readl(inline)
-012|gic_handle_irq(regs = 0xCB773E10)
-013|__irq_svc(asm)
-->|exception
-014|current_thread_info(inline)
-014|check_preemption_disabled(what1 = 0xC0F115AC, what2 = 0xC0EE0B28)
-015|debug_smp_processor_id()
-016|MT_trace_preempt_on()
-017|preempt_count_sub
-018|current_thread_info(inline)
-018|preempt_count(inline)
-018|should_resched(inline)
-018|__percpu_counter_add(fbc = 0xD8AC7178, amount = 0, batch = 32)
-019|__sb_start_write(sb = 0xC0F115AC, level = -1058141400, wait = FALSE)
-020|file_start_write(inline)
-020|vfs_write(file = 0xD366C480, buf = 0xBEA83A4C, ?, pos = 0xCB773F78)
-021|SYSC_write(inline)
-021|sys_write(?, buf = -1096271284, count = 146)
-022|ret_fast_syscall(asm)
mrq->host->context_info.wait 변수를 확인해보면, 이미 mrq->host 변수가 0이다.
(struct mmc_request *)0xDC9725F8 = 0xDC9725F8 -> (
sbc = 0xDC97262C,
cmd = 0xDC972660,
data = 0xDC9726C8,
stop = 0xDC972694,
completion = (done = 0x0, wait = (lock = (rlock = (raw_lock = (slock = 0x0,
done = 0x0,
host = 0x0)
어머 그런데, mmcqd/0 프로세스가 mmc_wait_for_data_req_done() 코드를 이미 수행했네?
-000|context_switch(inline)
-000|__schedule()
-001|schedule()
-002|mmc_wait_for_data_req_done(inline)
-002|mmc_start_req(host = 0xDC919000, areq = 0xDC972710, ?)
-003|mmc_blk_issue_rw_rq(mq = 0x0, rqc = 0xC64B18C0)
-004|mmc_blk_issue_rq(mq = 0xDC972494, req = 0xC64B18C0)
-005|mmc_queue_thread(d = 0xDC972494)
-006|kthread(_create = 0xDCA24E40)
-007|ret_from_fork(asm)
-008|ret_fast_syscall(asm)
@ftrace log를 좀 더 추가해보자.
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index db74413..fcf6692 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -1034,6 +1034,9 @@ out:
}
EXPORT_SYMBOL(mmc_start_bkops);
#endif
/*
mmc_wait_data_done() - done callback for data request
@mrq: done data request
@@ -1042,10 +1045,25 @@ EXPORT_SYMBOL(mmc_start_bkops);
*/
static void mmc_wait_data_done(struct mmc_request *mrq)
{
mrq->host->context_info.is_done_rcv = true;
+ trace_printk("+msdc [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0));
mdelay(1);
+ trace_printk("++msdc [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0));
wake_up_interruptible(&mrq->host->context_info.wait);
+ trace_printk("--msdc [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0));
역시나 GIC mtk-msdc(111) IRQ가 Trigger된 다음에 아래와 같이 커널 패닉이 발생하였네.
(램덤프: neo_more_crash_ftrace)
-000|do_DataAbort(addr = 0, fsr = 0, regs = 0x0)
-001|__dabt_svc(asm)
-->|exception
-002|debug_spin_lock_before(inline)
-002|do_raw_spin_lock(lock = 0x02B4)
-003|raw_spin_lock_irqsave(lock = 0x02B4)
-004|__wake_up(q = 0x02B4, mode = 1, nr_exclusive = 1, key = 0x0)
-005|mmc_wait_data_done(mrq = 0xDC9235F8)
-006|msdc_irq_data_complete(host = 0xDC927300, ?, error = 0)
-007|msdc_irq(?, dev_id = 0x1)
-008|handle_irq_event_percpu(?, action = 0xDC92B180)
-009|handle_irq_event(desc = 0xC121A440)
-010|cond_unmask_eoi_irq(inline)
-010|handle_fasteoi_irq(?, desc = 0xC121A440)
-011|static_key_count(inline)
-011|static_key_false(inline)
-011|trace_irq_exit(inline)
-011|__handle_domain_irq(domain = 0xDE806000, hwirq = 111, lookup = TRUE, ?)
-012|__raw_readl(inline)
-012|gic_handle_irq(regs = 0xC2631E20)
-013|__irq_svc(asm)
-->|exception
-014|__fget(?, mask = 16384)
-015|__fget_light(?, mask = 16384)
-016|__fdget
-017|sockfd_lookup_light(?, err = 0xC2631EC8, fput_needed = 0xC2631ECC)
-018|SYSC_recvfrom(inline)
-018|sys_recvfrom(?, ubuf = -1212542344, ?, flags = 64, addr = 0, addr_len = 0)
-019|ret_fast_syscall(asm)
Ftrace log 분석 좀 해보자.
커널 패닉이 일어나기 직전 상황
[1]: irq=111 name=mtk-msdc 가 Trigger됨
[2]: IRQ context에서 mmc_wait_data_done() 함수가 호출 from(msdc_irq_data_complete)
[2]~[3] 사이: mmcqd/0 프로세스로는 context switching 되지 않고, CPU5에서는 InputDispatcher-1538 등이 수행됨
[3]: mdelay(1); 이후 찍히는 ftrace log
[4]: wake_up_interruptible(&mrq->host->context_info.wait); API 호출로 mmcqd/0 프로세스를 wakeup 시킴
CPU5: mmcqd/0, pid=244
sched_wakeup: comm=mmcqd/0 pid=244 prio=120 success=1 target_cpu=005 state=W
[5]: wake_up_interruptible(&mrq->host->context_info.wait); API 호출이 끝난 후 찍히는 로그(별 의미 없음)
kworker/4:0-25 [004] 149.827488: irq_handler_entry: irq=111 name=mtk-msdc // <<--[1]
kworker/4:0-25 [004] 149.827495: bprint: mmc_wait_data_done: +msdc [F: mmc_wait_data_done, L:1049]: ? ? from(msdc_irq_data_complete) // <<--[2]
InputDispatcher-1538 [005] 149.827496: sched_migrate_task: comm=m.lge.launcher3 pid=2135 prio=120 orig_cpu=6 dest_cpu=4 state=W
InputDispatcher-1538 [005] 149.827519: sched_wakeup: comm=m.lge.launcher3 pid=2135 prio=120 success=1 target_cpu=004 state=W
InputDispatcher-1538 [005] 149.827591: sched_migrate_task: comm=ndroid.systemui pid=1622 prio=120 orig_cpu=5 dest_cpu=6 state=W
InputDispatcher-1538 [005] 149.827608: sched_wakeup: comm=ndroid.systemui pid=1622 prio=120 success=1 target_cpu=006 state=W
InputDispatcher-1538 [005] 149.827675: sched_migrate_task: comm=android.ui pid=1287 prio=118 orig_cpu=4 dest_cpu=6 state=W
InputDispatcher-1538 [005] 149.827694: sched_wakeup: comm=android.ui pid=1287 prio=118 success=1 target_cpu=006 state=W
InputDispatcher-1538 [005] 149.827855: sched_switch: prev_comm=InputDispatcher prev_pid=1538 prev_prio=112 prev_state=S ==> next_comm=swapper/5 next_pid=0 next_prio=120 extra_prev_state=K|m|d
<idle>-0 [005] 149.827912: sched_switch: prev_comm=swapper/5 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=InputDispatcher next_pid=1538 next_prio=112 extra_prev_state=K|m|d
InputDispatcher-1538 [005] 149.828064: sched_switch: prev_comm=InputDispatcher prev_pid=1538 prev_prio=112 prev_state=S ==> next_comm=swapper/5 next_pid=0 next_prio=120 extra_prev_state=K|m|d
<idle>-0 [005] 149.828209: sched_switch: prev_comm=swapper/5 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=InputDispatcher next_pid=1538 next_prio=112 extra_prev_state=K|m|d
InputDispatcher-1538 [005] 149.828352: sched_switch: prev_comm=InputDispatcher prev_pid=1538 prev_prio=112 prev_state=S ==> next_comm=swapper/5 next_pid=0 next_prio=120 extra_prev_state=K|m|d
kworker/4:0-25 [004] 149.828497: bprint: mmc_wait_data_done: ++msdc [F: mmc_wait_data_done, L:1058]: ? ? from(msdc_irq_data_complete) // <<--[3]
kworker/4:0-25 [004] 149.828521: sched_wakeup: comm=mmcqd/0 pid=244 prio=120 success=1 target_cpu=005 state=W // <<--[4]
kworker/4:0-25 [004] 149.828525: bprint: mmc_wait_data_done: -msdc [F: mmc_wait_data_done, L:1060]: ? ? from(msdc_irq_data_complete) // <<--[5]
<idle>-0 [005] 149.828528: sched_switch: prev_comm=swapper/5 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=mmcqd/0 next_pid=244 next_prio=120 extra_prev_state=K|m|d // <<--[6]
kworker/4:0-25 [004] 149.828530: irq_handler_exit: irq=111 ret=handled
1046 static void mmc_wait_data_done(struct mmc_request *mrq)
1047 {
1048 mrq->host->context_info.is_done_rcv = true;
1049 trace_printk("+msdc [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0)); // <<--[2]
//생략
1051 mdelay(1);
//생략
1057
1058 trace_printk("++msdc [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0)); // <<--[3]
1059 wake_up_interruptible(&mrq->host->context_info.wait);
1060 trace_printk("-msdc [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0)); // <<-[5]
커널 패닉 상황
[7]: irq=111 name=mtk-msdc 가 handle 되었음
[8]: [mmcqd/0-244, CPU5]mmc_wait_for_data_req_done() 함수 내[from(mmc_blk_issue_rw_rq)] wait_event_interruptible()에서 빠져나옴.
[9][10]: [mmcqd/0-244, CPU5]mmc_wait_for_data_req_done() 함수를 빠져나옴
[11]: irq=111 name=mtk-msdc 가 Trigger됨 --> CPU4의 m.lge.launcher3-2135
[12]: [mmcqd/0-244, CPU5] 깨어난 mmcqd/0-244 프로세스 활동 시작
&mqrq->brq 변수를 0으로 밀어버림 (where: memset(brq, 0, sizeof(struct mmc_blk_request));)
[13]: [m.lge.launcher3-2135 CPU4] irq=111 name=mtk-msdc IRQ로 Trigger되어 실행되어
mmc_wait_data_done() 함수 진입
[14][15]: [mmcqd/0-244, CPU5] mmc_blk_rw_rq_prep() 함수 퇴장하려함
[16][17][18] [mmcqd/0-244, CPU5] mmc_wait_for_data_req_done() 함수 처리 후 퇴장하려함
[19]: [m.lge.launcher3-2135 CPU4] wake_up_interruptible(&mrq->host->context_info.wait);
[20] 커널 패닉
[12]번 동작의 memset(brq, 0, sizeof(struct mmc_blk_request))으로 mrq->host가 0으로 밀린 상태에서, [19]번의 wake_up_interruptible(&mrq->host->context_info.wait) API가 호출되어 커널 패닉이 발생한다.
kworker/4:0-25 [004] 149.828530: irq_handler_exit: irq=111 ret=handled // <<--[7]
mmcqd/0-244 [005] 149.828536: bprint: mmc_start_req: ++mmcq [F: mmc_wait_for_data_req_done, L:1236]: ? ? from(mmc_blk_issue_rw_rq) // <<--[8]
kworker/4:0-25 [004] 149.828537: irq_handler_entry: irq=30 name=arch_timer
kworker/4:0-25 [004] 149.828546: irq_handler_exit: irq=30 ret=handled
mmcqd/0-244 [005] 149.828559: bprint: mmc_start_req: mmcq [F: mmc_wait_for_data_req_done, L:1268]: ? ? from(mmc_blk_issue_rw_rq) // <<-[9]
mmcqd/0-244 [005] 149.828561: bprint: mmc_start_req: mmcq release [F: mmc_wait_for_data_req_done, L:1270]: ? ? from(mmc_blk_issue_rw_rq) // <<--[10]
kworker/4:0-25 [004] 149.828584: workqueue_execute_end: work struct 0xcb40cae4
kworker/4:0-25 [004] 149.828630: sched_switch: prev_comm=kworker/4:0 prev_pid=25 prev_prio=120 prev_state=S ==> next_comm=m.lge.launcher3 next_pid=2135 next_prio=120 extra_prev_state=K|m|d
m.lge.launcher3-2135 [004] 149.828660: irq_handler_entry: irq=111 name=mtk-msdc // <<--[11]
mmcqd/0-244 [005] 149.828661: bprint: mmc_blk_rw_rq_prep: ++mmc_blk_rw_rq_prep [F: mmc_blk_rw_rq_prep, L:1725]: ? ? from(mmc_blk_issue_rw_rq) // <<--[12]
m.lge.launcher3-2135 [004] 149.828666: bprint: mmc_wait_data_done: +msdc [F: mmc_wait_data_done, L:1049]: ? ? from(msdc_irq_data_complete) // <<--[13]
mmcqd/0-244 [005] 149.828670: bprint: mmc_blk_rw_rq_prep: -mmc_blk_rw_rq_prep [F: mmc_blk_rw_rq_prep, L:1905]: ? ? from(mmc_blk_issue_rw_rq) // <<-[14]
mmcqd/0-244 [005] 149.828672: bprint: mmc_blk_rw_rq_prep: mmc_blk_rw_rq_prep [F: mmc_blk_rw_rq_prep, L:1907]: ? ? from(mmc_blk_issue_rw_rq) // <<-[15]
mmcqd/0-244 [005] 149.828681: bprint: mmc_start_req: +mmcq [F: mmc_wait_for_data_req_done, L:1231]: ? ? from(mmc_blk_issue_rw_rq) // <<--[16]
mmcqd/0-244 [005] 149.828683: bprint: mmc_start_req: ++mmcq [F: mmc_wait_for_data_req_done, L:1236]: ? ? from(mmc_blk_issue_rw_rq) // <<--[17]
mmcqd/0-244 [005] 149.828695: bprint: mmc_start_req: -mmcq [F: mmc_wait_for_dat뗥nQ, L:0]: ? ? from(0x726f776bS) // <<--[18]
m.lge.launcher3-2135 [004] 149.829669: bprint: mmc_wait_data_done: ++msdc [F: mmc_wait_data_done, L:1058]: ? ? from(msdc_irq_data_complete) // <<--[19]
m.lge.launcher3-2135 [004] 149.829748: tracing_on: ftrace is disabled caller=oops_enter // <<--[20]
1222 static int mmc_wait_for_data_req_done(struct mmc_host *host,
1223 struct mmc_request *mrq,
1224 struct mmc_async_req *next_req)
1225 {
1226 struct mmc_command *cmd;
1227 struct mmc_context_info *context_info = &host->context_info;
1228 int err;
1229 unsigned long flags;
1230
1231 trace_printk("+mmcq [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0)); // <<--[16]
1232 while (1) {
1233 wait_event_interruptible(context_info->wait,
1234 (context_info->is_done_rcv ||
1235 context_info->is_new_req));
1236 trace_printk("++mmcq [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0)); // <<--[8], [17]
1237 spin_lock_irqsave(&context_info->lock, flags);
//snip
1268 trace_printk("mmcq [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0)); // <<-[9], [18]
1269 mmc_retune_release(host);
1270 trace_printk("mmcq release [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0));
1271 return err;
1272 }
1701 static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq,
1702 struct mmc_card *card,
1703 int disable_multi,
1704 struct mmc_queue *mq)
1705 {
1706 u32 readcmd, writecmd;
1707 struct mmc_blk_request *brq = &mqrq->brq;
//생략
1724 memset(brq, 0, sizeof(struct mmc_blk_request));
1725 trace_printk("++mmc_blk_rw_rq_prep [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0)); // <<--[12]
1726 brq->mrq.cmd = &brq->cmd;
//생략
1905 trace_printk("-mmc_blk_rw_rq_prep [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0)); //<<-[14]
1906 mmc_queue_bounce_pre(mqrq);
1907 trace_printk("-mmc_blk_rw_rq_prep [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0));
1908 }
1046 static void mmc_wait_data_done(struct mmc_request *mrq)
1047 {
1048 mrq->host->context_info.is_done_rcv = true;
1049 trace_printk("+msdc [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0)); // <<--[13]
//생략
1051 mdelay(1);
//생략
1057
1058 trace_printk("++msdc [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0)); // <<--[19]
1059 wake_up_interruptible(&mrq->host->context_info.wait);
1060 trace_printk("--msdc [F: %s, L:%d]: ? ? from(%pS)\n", __func__, __LINE__, (void *)__builtin_return_address(0));
정리하면 wake_up_interruptible() API에 파라미터로 전달되는 &mrq->host->context_info.wait를 다른 값(mmcqd를 깨울 수 있는 wait_queue)으로 바꾸거나,
wake_up_interruptible() 호출 시 조건을 줘야 한다. 혹은 spinlock irq save로 코드 보호
솔루션
분석된 내용에 따라 아래 패치를 적용하였더니 문제가 사라졌다.
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index c3abe34..01e4bfc 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -403,8 +403,13 @@ EXPORT_SYMBOL(mmc_start_bkops);
*/
static void mmc_wait_data_done(struct mmc_request *mrq)
{
+ unsigned long flags;
+ struct mmc_context_info *context_info = &mrq->host->context_info;
+
+ spin_lock_irqsave(&context_info->lock, flags);
mrq->host->context_info.is_done_rcv = true;
wake_up_interruptible(&mrq->host->context_info.wait);
+ spin_unlock_irqrestore(&context_info->lock, flags);
}
static void mmc_wait_done(struct mmc_request *mrq)
@@ -465,6 +470,7 @@ static int mmc_wait_for_data_req_done(struct mmc_host *host,
struct mmc_command *cmd;
struct mmc_context_info *context_info = &host->context_info;
int err;
+ bool is_done_rcv = false;
unsigned long flags;
while (1) {
@@ -472,9 +478,10 @@ static int mmc_wait_for_data_req_done(struct mmc_host *host,
(context_info->is_done_rcv ||
context_info->is_new_req));
spin_lock_irqsave(&context_info->lock, flags);
+ is_done_rcv = context_info->is_done_rcv;
context_info->is_waiting_last_req = false;
spin_unlock_irqrestore(&context_info->lock, flags);
- if (context_info->is_done_rcv) {
+ if (is_done_rcv) {
context_info->is_done_rcv = false;
context_info->is_new_req = false;
cmd = mrq->cmd;
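그런데, 리눅스 커뮤니티에서 이미 패치를 릴리즈했네. is_done_rcv를 true로 설정하기 전에 &mrq->host->context_info 주소를 지역 변수에 미리 저장해 두므로, 이후 mmcqd/0 프로세스가 mrq를 0으로 밀어버려도 wake_up_interruptible()은 유효한 wait queue 주소를 사용한다.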
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 347108a..db74413 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -1042,8 +1042,10 @@ EXPORT_SYMBOL(mmc_start_bkops);
*/
static void mmc_wait_data_done(struct mmc_request *mrq)
{
- mrq->host->context_info.is_done_rcv = true;
- wake_up_interruptible(&mrq->host->context_info.wait);
+ struct mmc_context_info *context_info = &mrq->host->context_info;
+
+ context_info->is_done_rcv = true;
+ wake_up_interruptible(&context_info->wait);
}
# Reference: For more information on 'Linux Kernel';
디버깅을 통해 배우는 리눅스 커널의 구조와 원리. 1
디버깅을 통해 배우는 리눅스 커널의 구조와 원리. 2
'Core BSP 분석 > 커널 트러블슈팅' 카테고리의 다른 글
[Linux][Kernel]뮤텍스 데드락(Mutex Deadlock) 락업(lockup) - "simpleperf" 디버깅 (0) | 2023.05.07 |
---|---|
[Linux][Kernel] Kernel Panic @__stack_chk_fail - 스택 카나리 (Stack canary Feature) (0) | 2023.05.07 |
[Linux][Kernel] panic@___might_sleep (0) | 2023.05.07 |
[Linux][Kernel][Stability] Kernel panic @0x0 from xfrm_local_error+0x4c (0) | 2023.05.07 |
[Kernel][Debug] "cat /d/shrinker" kernel panic (0) | 2023.05.07 |