Sometime I noticed that system does not crash when the call to BUG() is made in the kernel driver. I just observed the stack trace from the kernel log and then find that target is running rather than entering crash mode.
 
To make the target crash when BUG() is called, the following config should be present;
 
CONFIG_PANIC_ON_OOPS=y
CONFIG_PANIC_ON_OOPS_VALUE=1
 
Let's look at kernel die() function which is called from BUG() function.
 
void die(const char *str, struct pt_regs *regs, int err)
{
...
 
if (in_interrupt())
panic("%s: Fatal exception in interrupt", str);
if (panic_on_oops)
panic("%s: Fatal exception", str);
 
If panic_on_oops is true, the panic() is called with "Fatal exception message". How panic_on_oops is specified? Is it configured during run-time?
 
Let's take a look at below statement;
 
int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
 
panic_on_oops depends on CONFIG_PANIC_ON_OOPS_VALUE.
Now is time to check the definition of CONFIG_PANIC_ON_OOPS_VALUE.
 
config PANIC_ON_OOPS_VALUE
int
range 0 1
default 0 if !PANIC_ON_OOPS
default 1 if PANIC_ON_OOPS
 
CONFIG_PANIC_ON_OOPS_VALUE can be configured by PANIC_ON_OOPS.
 
Conclusion
 
PANIC_ON_OOPS should be enabled to make the target crash when BUG() is called.
가끔 가다가 커널 메모리 누수(memory leak) 이슈가 생길 때가 있어요.
OOM Killer가 메모리가 부족하다고 커널이 메시지를 남기며 스스로 자살을 하거나,
Low Memory Killer가 너무나도 자주 돌아서 락업 현상으로 검출되죠.
 
이런 이슈가 나왔을 때 어떻게 디버깅을 하면 좋을까요?
한번 정리 좀 해볼께요.
 
1. 디버그 정보: contig_page_data.node_zones[0--1].free_area
 
우선 중 High/Low 메모리 Zone 중 어떤 Zone에서 페이지가 부족한 지 점검할 필요가 있어요.
만약에 Low 메모리 존에서 메모리가 부족하면 커널 동작으로 포커스를 맞추어야 하구요,
아래 경우와 같이 High Memory Zone에 Order 별로 free 페이지가 거의 없으면 더 골치 아프죠.
 
이런 때는 mmap으로 유저 공간에서 메모리를 과도하게 할당하려다가 메모리 릭이 일어날 수 있으니까요.
  contig_page_data = (
    node_zones = (
      (watermark = (756, 2005, 2194), lowmem_reserve = (0, 6144, 6144), inactive_ratio = 1, zone_pgdat = 0xC1A866C0
      (
        watermark = (128, 1778, 2028),
        lowmem_reserve = (0, 0, 0),
        inactive_ratio = 1,
        zone_pgdat = 0xC1A866C0,
        pageset = 0xC18D727C,
        dirty_balance_reserve = 2028,
        cma_alloc = FALSE,
        pageblock_flags = 0xEFAFFD80,
        zone_start_pfn = 720896,
        managed_pages = 196608,
        spanned_pages = 262144,
        present_pages = 196608,
        name = 0xC1469F47 -> "HighMem",
        nr_migrate_reserve_block = 1,
        nr_isolate_pageblock = 0,
        wait_table = 0xECF7F000,
        wait_table_hash_nr_entries = 1024,
        wait_table_bits = 10,
        _pad1_ = (x = ".....N..........tr.......k...k..TQ....N..k...k..T.y...y..k...k..y."),
        lock = (rlock = (raw_lock = (slock = 4260691444, tickets = (owner = 65012, next = 65012)), magic = 37358998
        free_area = (
          (
            free_list = ((next = 0xEEA47274, prev = 0xEEA586D4), (next = 0xC1A86B98, prev = 0xC1A86B98), (next = 0x
            nr_free = 0, //<<--
            nr_free_cma = 0),
          (
            free_list = ((next = 0xEED6A014, prev = 0xEED709D4), (next = 0xC1A86BD0, prev = 0xC1A86BD0), (next = 0x
            nr_free = 0, //<<--
            nr_free_cma = 0),
          (
            free_list = ((next = 0xEE98A694, prev = 0xEEF6DD14), (next = 0xC1A86C08, prev = 0xC1A86C08), (next = 0x
            nr_free = 0, //<<--
            nr_free_cma = 0), 
          (
            free_list = ((next = 0xEED6CA14, prev = 0xEF60E514), (next = 0xC1A86C40, prev = 0xC1A86C40), (next = 0x
            nr_free = 0, //<<--
            nr_free_cma = 0),
          (
            free_list = ((next = 0xEE993214, prev = 0xEF5FB614), (next = 0xC1A86C78, prev = 0xC1A86C78), (next = 0x
            nr_free = 0, //<<--
            nr_free_cma = 0),
          (
            free_list = ((next = 0xC1A86CA8, prev = 0xC1A86CA8), (next = 0xC1A86CB0, prev = 0xC1A86CB0), (next = 0x
            nr_free = 0,  //<<--
            nr_free_cma = 0),
          (
            free_list = ((next = 0xEEA59814, prev = 0xEEA59814), (next = 0xC1A86CE8, prev = 0xC1A86CE8), (next = 0x
            nr_free = 3,
            nr_free_cma = 0),
          (free_list = ((next = 0xC1A86D18, prev = 0xC1A86D18), (next = 0xC1A86D20, prev = 0xC1A86D20), (next = 0xE
          (free_list = ((next = 0xC1A86D50, prev = 0xC1A86D50), (next = 0xC1A86D58, prev = 0xC1A86D58), (next = 0xE
          (free_list = ((next = 0xC1A86D88, prev = 0xC1A86D88), (next = 0xC1A86D90, prev = 0xC1A86D90), (next = 0xC
          (free_list = ((next = 0xC1A86DC0, prev = 0xC1A86DC0), (next = 0xC1A86DC8, prev = 0xC1A86DC8), (next = 0xC
        flags = 0,
        _pad2_ = (x = ".^.^.N...........3...7....>..s.......v....L...^...7..q~..G."),
        lru_lock = (rlock = (raw_lock = (slock = 1579114015, tickets = (owner = 24095, next = 24095)), magic = 3735
        lruvec = (lists = ((next = 0xEEFF33F4, prev = 0xEEDE37B4), (next = 0xEF3E8EB4, prev = 0xEEA17394), (next =
        inactive_age = (counter = 193062),
 
2. reclaimable 페이지: vm_stat
direct reclaim 수행 시 회수 가능한 페이지 갯수를 확인할 수가 있는데, 이를 확인할 수 있는 전역 변수가 vm_stat 이에요.
아래 경우 60056+55707 개의 페이지가 reclaimable 가능한 페이지 갯수죠.
  (static atomic_long_t [34]) vm_stat = (
    [0] = ((int) counter = 12193), //<<-- NR_FREE_PAGES
    [1] = ((int) counter = 270),
    [2] = ((int) counter = 60056),  //<<-- NR_INACTIVE_ANON
    [3] = ((int) counter = 55707), // <<- NR_ACTIVE_ANON
    [4] = ((int) counter = 52514),
    [5] = ((int) counter = 52794),
    [6] = ((int) counter = 64),
    [7] = ((int) counter = 64),
    [8] = ((int) counter = 102899),
    [9] = ((int) counter = 63365),
    [10] = ((int) counter = 120639),
    [11] = ((int) counter = 0),
    [12] = ((int) counter = 11),
    [13] = ((int) counter = 14342),
    [14] = ((int) counter = 46446),
    [15] = ((int) counter = 11697),
 
zone_stat_item 선언부는 아래와 같아요.
enum zone_stat_item {
/* First 128 byte cacheline (assuming 64 bit words) */
NR_FREE_PAGES,   // <<- 0
NR_ALLOC_BATCH, // <<- 1
NR_LRU_BASE, // <<- 2
NR_INACTIVE_ANON = NR_LRU_BASE, // <<- 2
NR_ACTIVE_ANON, // <<- 3
NR_INACTIVE_FILE, // <<- 4
NR_ACTIVE_FILE, // <<- 5
NR_UNEVICTABLE, /*  "     "     "   "       "         */
NR_MLOCK, /* mlock()ed pages found and moved off LRU */
NR_ANON_PAGES, /* Mapped anonymous pages */
NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.   
 
그럼 위와 같은 디버깅 정보의 근거는 뭐냐구요? 아래 코드를 보아요.
zone_reclaimable_pages() API는 reclaimable 페이지 갯수를 계산해요.
static unsigned long zone_reclaimable_pages(struct zone *zone)
{
int nr;
 
nr = zone_page_state(zone, NR_ACTIVE_FILE) +
     zone_page_state(zone, NR_INACTIVE_FILE);
 
if (get_nr_swap_pages() > 0)
nr += zone_page_state(zone, NR_ACTIVE_ANON) +
      zone_page_state(zone, NR_INACTIVE_ANON);
 
return nr;
}
 
3. 디버그 피쳐: kmem leak
리눅스 커널에서는 kmem leak이란 디버그 피쳐를 제공합니다.
리눅스 커널 커뮤니티에서는 이 피쳐를 돌려서 Memory Leak을 체크하죠.
 
각각 패치를 안드로이드 디바이스에서 어떻게 설정하는 지 설명할께요
 
1> 커널
CONFIG_DEBUG_KMEMLEAK 컨피그를 키고 CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF를 꺼야 해요
diff --git a/arch/arm/configs/pompeii_defconfig b/arch/arm/configs/pompeii_defconfig
index 2e97f97..aac678a 100644
--- a/arch/arm/configs/pompeii_defconfig
+++ b/arch/arm/configs/pompeii_defconfig
@@ -754,8 +754,8 @@
 CONFIG_SLUB_DEBUG_PANIC_ON=y
 CONFIG_SLUB_DEBUG_ON=y
 CONFIG_DEBUG_KMEMLEAK=y
-CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE=4000
-CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y
+CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE=40000
+# CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF is not set
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_DEBUG_VM=y
 CONFIG_DEBUG_MEMORY_INIT=y
 
2> 커널 로딩 직전(대부분 LK죠), command line(ATAG)로 " kmemleak=on"을 전달해야 해요.
diff --git a/app/aboot/aboot.c b/app/aboot/aboot.c
index a02a85c..04409a3 100644
--- a/app/aboot/aboot.c
+++ b/app/aboot/aboot.c
@@ -304,6 +304,8 @@ struct getvar_partition_info part_info[] =
  { "cache"   , "partition-size:", "partition-type:", "", "ext4" },
 };
 
+static const char *kmem_feature = " kmemleak=on";
+
 char max_download_size[MAX_RSP_SIZE];
 char charger_screen_enabled[MAX_RSP_SIZE];
 #ifdef WITH_POMPEII_EXPORT_MODEL_NAME
@@ -511,6 +513,8 @@ unsigned char *update_cmdline(const char * cmdline)
   }
  }
 
+ cmdline_len += strlen(kmem_feature);
+
  if (target_warm_boot()) {
   warm_boot = true;
   cmdline_len += strlen(warmboot_cmdline);
@@ -688,6 +692,12 @@ unsigned char *update_cmdline(const char * cmdline)
    while ((*dst++ = *src++));
   }
 
+  if (strlen(kmem_feature)) {
+   src = ftrace_event;
+   if (have_cmdline) --dst;
+   while ((*dst++ = *src++));
+  }
+
   if (have_target_boot_params) {
    if (have_cmdline) --dst;
    src = target_boot_params;
 
3> 10초 마다 메모리 정보를 디바이스 파일 시스템(/sdcard)에 저장할 수 있는,
service-kmemleak란 서비스를 하나 만들어요.  
 
핵심 코드는 아래 코드와 같이 /sys/kernel/debug/kmemleak 을 clear 후 scan을 해줘야 해요.
echo clear  > /sys/kernel/debug/kmemleak
echo scan   > /sys/kernel/debug/kmemleak
 
diff --git a/device.mk b/device.mk
index ef6940b..f79383f 100644
--- a/device.mk
+++ b/device.mk
@@ -121,7 +121,8 @@
 
 PRODUCT_COPY_FILES += \
+ device/pompeii/init.kmemleak.sh:root/init.kmemleak.sh
 
diff --git a/init.core.rc b/init.core.rc
index f0dbf37..8472053 100644
--- a/init.core.rc
+++ a/init.core.rc
@@ -141,6 +141,9 @@
     disabled
     oneshot
 
+service service-kmemleak /system/bin/sh /init.kmemleak.sh
+    class main
+
 
diff --git a/init.kmemleak.sh b/init.kmemleak.sh
new file mode 100644
index 0000000..402a6b3
--- /dev/null
+++ b/init.kmemleak.sh
@@ -0,0 +1,17 @@
+#!/system/bin/sh
+test_count=0
+file_name=""
+file_name2=""
+#rm /sdcard/log_kmemleak/kmemleak_log*
+
+while true;do
+    file_name="kmemleak_log_$test_count.txt"
+    cat /sys/kernel/debug/kmemleak > /sdcard/$file_name
+    echo clear  > /sys/kernel/debug/kmemleak
+    echo scan   > /sys/kernel/debug/kmemleak
+    test_count=$(($test_count+1))
+    sleep 600
+done
+echo "end"
 
이 기능의 핵심 함수는 kmemleak_scan()인데요.
object_list란 링크드 리스트를 돌면서 플래그 속성이 OBJECT_ALLOCATED인 kmemleak_object에 대해
콜스택을 찍어주죠.
 
아웃풋 파일은 아래와 같아요. 
이제 덤프한 파일을 해석하는 게 아주 중요한데요.
사실 로그 중에는 false positive라고 정상 동작인데 에러 로그 처럼 보이는게 있는데요.
이런 놈을 잘 필터링하는게 중요해요. 아래는 대표적인 false positive 로그에요.
unreferenced object 0xc413bd00 (size 64):
  comm "swapper/0", pid 1, jiffies 4294938050 (age 652.640s)
  hex dump (first 32 bytes):
    00 00 00 00 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  ....kkkkkkkkkkkk
    6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
  backtrace:
    [<c0472728>] dma_async_device_register+0x198/0x494
    [<c0473818>] qbam_probe+0x170/0x230
    [<c057ddf0>] platform_drv_probe+0x30/0x78
    [<c057c1e4>] driver_probe_device+0x33c/0x6b40
    [<c057c604>] __driver_attach+0x68/0x8c
    [<c057a5d0>] bus_for_each_dev+0x70/0x84
    [<c057b66c>] bus_add_driver+0x100/0x1f8
    [<c057cf2c>] driver_register+0x9c/0xe0
    [<c01009dc>] do_one_initcall+0x19c/0x1dc
    [<c1900dd0>] kernel_init_freeable+0x108/0x1cc
    [<c0f53e28>] kernel_init+0x8/0xe8
    [<c01068e0>] ret_from_fork+0x14/0x34
    [<ffffffff>] 0xffffffff
 
unreferenced object 0xc413bb00 (size 64):
  comm "swapper/0", pid 1, jiffies 4294938050 (age 652.640s)
  hex dump (first 32 bytes):
    00 00 00 00 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  ....kkkkkkkkkkkk
    6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
  backtrace:
    [<c0472728>] dma_async_device_register+0x198/0x494
    [<c0473818>] qbam_probe+0x170/0x230
    [<c057ddf0>] platform_drv_probe+0x30/0x78
    [<c057c1e4>] driver_probe_device+0x33c/0x6b0
    [<c057c604>] __driver_attach+0x68/0x8c
    [<c057a5d0>] bus_for_each_dev+0x70/0x84
    [<c057b66c>] bus_add_driver+0x100/0x1f8
    [<c057cf2c>] driver_register+0x9c/0xe0
    [<c01009dc>] do_one_initcall+0x19c/0x1dc
    [<c1900dd0>] kernel_init_freeable+0x108/0x1cc
    [<c0f53e28>] kernel_init+0x8/0xe8
    [<c01068e0>] ret_from_fork+0x14/0x34
    [<ffffffff>] 0xffffffff
 
 
 
# Reference: For more information on 'Linux Kernel';
 
디버깅을 통해 배우는 리눅스 커널의 구조와 원리. 1
 
디버깅을 통해 배우는 리눅스 커널의 구조와 원리. 2
 
 
 
 
 
 
#커널 크래시 디버깅 및 TroubleShooting
커널 패닉이 났어요.
네트워크 드라이버 리눅스 커널 코드에서 발생한 것 같은데요. 음.
일단 당황하지 마시구요. 차근 차근 커널 로그와 코어 덤프를 살펴보면, 정답이 나와요.
 
일단 커널 로그를 보면, 프로그램 카운터가 0x0을 가르키고 있네요.
음... 그리고
링크 레지스터(R14)가 0xc0adc274(LR is at xfrm_local_error+0x4c/0x58) 을 가르키고 있습니다.
[  262.401303] Unable to handle kernel NULL pointer dereference at virtual address 00000000
[  262.401365] pgd = dbdc4000
[  262.401389] [00000000] *pgd=00000000
[  262.401433] Internal error: Oops: 80000005 [#1] PREEMPT SMP ARM
[  262.401459] Modules linked in:
[  262.401495] CPU: 0 PID: 7107 Comm: Framework Tainted: G        W    3.10.49-g356bd9f-00007-gadca646 #1
[  262.401522] task: da6b0540 ti: d9412000 task.ti: d9412000
[  262.401549] PC is at 0x0
[  262.401590] LR is at xfrm_local_error+0x4c/0x58
[  262.401619] pc : [<00000000>]    lr : [<c0adc274>]    psr: a00f0013
[  262.401619] sp : d9413c68  ip : c0ac6c20  fp : 0000dd86
[  262.401654] r10: 0000010e  r9 : 0000010a  r8 : de0ddc20
[  262.401678] r7 : c13ddf00  r6 : 00000500  r5 : d9094540  r4 : c13e3780
[  262.401703] r3 : 00000000  r2 : 00000001  r1 : 00000500  r0 : d9094540
 
일단 커널 로그에서 R14가 가르키는(커널 패닉이 발생한 함수를 호출한) 주소로 가보면요. 
R3를 branch instruction으로 수행하고 있었습니다. 이런 Instruction은 함수 포인터로 함수를 호출할 때 쓰이죠.
그런데 커널 로그에서 R3이 0x0이라고 하네요.
   NSR:C0ADC264|E1A00005            cpy     r0,r5            ; proto,skb
   NSR:C0ADC268|E594344C            ldr     r3,[r4,#0x44C]
   NSR:C0ADC26C|E1A01006            cpy     r1,r6            ; r1,mtu
   NSR:C0ADC270|E12FFF33            blx     r3  //<<--
   
아 이제, 커널 로그가 가르키는 스택 주소 0xd9413c68로 좀 가볼까요?
d.v %y.l 0xD9413C68
______address|_data________|value_____________|symbol
NSD:D9413C60| 13 00 0F A0  0xA00F0013
NSD:D9413C64| FF FF FF FF  0xFFFFFFFF
NSD:D9413C68| 40 45 09 D9  0xD9094540    // <<-R4 
NSD:D9413C6C| 00 0C 42 DC  0xDC420C00   //<<--R5 
NSD:D9413C70| 40 45 09 D9  0xD9094540     //<<-- R6
NSD:D9413C74| 0C 11 AD C0  0xC0AD110C         \vmlinuxxfrm4_output__xfrm4_output+0xE4  //<<--R14
NSD:D9413C78| 00 00 00 00  0x0
NSD:D9413C7C| 01 00 00 00  0x1
NSD:D9413C80| 00 0C 42 DC  0xDC420C00
NSD:D9413C84| 6C C0 AD C0  0xC0ADC06C         \vmlinuxxfrm_outputxfrm_output_resume+0x41C
 
@D9413C74 스택 주소를 보니, 0xC0AD110C 즉 __xfrm4_output+0xE4 주소를 볼 수 있는데요.
0xC0AD110C 주소를 가보니, xfrm_local_error란 함수를 branch operation으로 수행을 하네요.
______addr/line|code_____|label____|mnemonic________________|comment
    NSR:C0AD1104|E1A00004            cpy     r0,r4            ; r0,skb
   NSR:C0AD1108|EB002C46            bl      0xC0ADC228       ; xfrm_local_error
 
xfrm_local_error() 함수 초반 ARM Instruction을 보면, R14, R6, R5, R4 순으로 스택에 Push를 하네요.   
   NSR:C0ADC228|E92D4070  xfrm_local_error:    push    {r4-r6,r14}
   NSR:C0ADC22C|E1A05000                       cpy     r5,r0            ; skb,pr
   
예를 들면, 이런 거죠. 스택에 이렇게 Push를 하고나면 새로운 스택 주소는 D9413C68(<-D9413C74)가 되는거죠.
stack 주소
D9413C68  R4을 스택에 Push       =>> 0xD9094540
D9413C6C  R5을 스택에 Push       =>> 0xDC420C00
D9413C70  R6을 스택에 Push       =>> 0xD9094540
D9413C74  R14 주소 스택에 Push =>>  0xC0AD110C
 
이렇게 새로운 함수가 호출될 때 스택에 파라미터를 Push하는 규칙은 CPU 아키텍쳐마다 다르거든요.
위의 경우에는 ARM32 아키텍처 Calling Convention에 따른 거에요.
 
ARM 스팩 문서를 보시면 위 내용이 자세히 나와 있어요.
 
자 다시 커널 로그로 돌아가 봅시다.
[  262.401495] CPU: 0 PID: 7107 Comm: BaldTainted: G        W    3.10.49-g356bd9f-00007-gadca646 #1
[  262.401522] task: da6b0540 ti: d9412000 task.ti: d9412000
[  262.401549] PC is at 0x0
[  262.401590] LR is at xfrm_local_error+0x4c/0x58
[  262.401619] pc : [<00000000>]    lr : [<c0adc274>]    psr: a00f0013
[  262.401619] sp : d9413c68  ip : c0ac6c20  fp : 0000dd86
 
T32 프로그램으로, 
커널 로그에서 찍힌 LR을 프로그램 카운터로 설정하고, 스택 주소는 그대로 설정 그리고 위 스택 정보에서 살펴본 0xC0AD110C(__xfrm4_output+0xE4) 주소를
설정하면. 이렇게 이쁘게 콜 스택이 보이네요.
 
-000|xfrm_local_error(skb = 0xDCD376C0, mtu = -611987776)
-001|__xfrm4_output(skb = 0xD9094540)
-002|xfrm_output_resume(skb = 0xD9094540, err = 1)
-003|__xfrm4_output(skb = 0xD9094540)
-004|ip_local_out(skb = 0xD9094540)
-005|ip_send_skb(net = 0xC13DDF00, ?)
-006|udp_send_skb(skb = 0xD9094540, ?)
-007|udp_sendmsg(?, sk = 0xDE0F1680, msg = 0xD9413EE0, len = 1300)
-008|inet_sendmsg(iocb = 0xD9413E58, ?, msg = 0xD9413EE0, size = 1300)
-009|sock_sendmsg(sock = 0xDA9A5500, msg = 0xD9413EE0, size = 1300)
-010|SYSC_sendto(inline)
-010|sys_sendto(?, buff = -1997588480, ?, ?, addr = -1985806992, addr_len = 16)
-011|ret_fast_syscall(asm)
 
T32 명령어는 아래와 같아요.
r.s R13  0xD9413C68
r.s R14 0xC0AD110C
r.s PC   0xC0ADC270
   
0xC0ADC270 주소에 다시 가볼까요? R3가 어떻게 0x0으로 업데이트되는지 봐야겠죠.  
확인해보니, ldr     r3,[r4,#0x44C] 명령어로 업데이트 되네요. 
   NSR:C0ADC264|E1A00005                        cpy     r0,r5            ; proto,skb
   NSR:C0ADC268|E594344C                        ldr     r3,[r4,#0x44C]
   NSR:C0ADC26C|E1A01006                        cpy     r1,r6            ; r1,mtu
   NSR:C0ADC270|E12FFF33                        blx     r3
   
0xC0ADC270 주소로 실제 소스 코드 위치를 알면 좋을 상황인데요. T32에서 이런 상황에서 아주 유용한 커맨드를 제공해요.
y.l.line 0xC0ADC270   
__________address________|module__________________________________|source________|line______|o
     P:C0ADC25C--C0ADC263|\vmlinuxxfrm_output                   |.1/android/kernel/net/xfrm/xfrm_output.c|247--247          |
     P:C0ADC264--C0ADC273|\vmlinuxxfrm_output                   |.1/android/kernel/net/xfrm/xfrm_output.c|248--250          |
    
음, kernel/net/xfrm/xfrm_output.c의 248--250  라인인 것 같네요. 
역시나, 실제 소스 코드를 보면 함수 포인터(콜백 함수 형태)로 함수를 호출하고 있네요.
234void xfrm_local_error(struct sk_buff *skb, int mtu)
235{
236 unsigned int proto;
237 struct xfrm_state_afinfo *afinfo;
238
// ...생략...
245
246 afinfo = xfrm_state_get_afinfo(proto);
247 if (!afinfo)
248 return;
249
250 afinfo->local_error(skb, mtu);  //<<--
251 xfrm_state_put_afinfo(afinfo);   
 
struct xfrm_state_afinfo란 구조체에서 local_error란 멤버의 오프셋 주소가 얼마인지 알아볼까요?
예를 들어서, bald_manager란 구조체가 있으면, issue 멤버는 0x4, issue_list 멤버는 0x8 오프셋이라고 볼 수 있어요.
struct bald_manager {
int phone;
int issue;
struct list_head issue_list;
}; 
 
음, 이럴 때 커널 패닉을 디버깅하는 Crash Tool이란 좋은 툴에서 아주 유용한 기능을 제공하거든요.
아래와 같은 명령어를 입력하면, local_error 멤버의 오프셋이 0x44c임을 알 수 있어요.
crash> struct -o xfrm_state_afinfo.local_error
struct xfrm_state_afinfo {
  [0x44c] void (*local_error)(struct sk_buff *, u32);
}
 
어떤 T32 매니아가 저에게 질문을 던지네요.
음 T32로는 확인할 수 없냐구요? 아래 매크로 기능으로 확인 가능해요.
 
T32로 아래 명령어를 입력하면 되요.
sYmbol.NEW.MACRO offsetof(type,member) ((int)(&((type*)0)->member))
 
이게 제가 뚝딱만든 코드는 아니구요. 이미 리눅스 커널 코드에 있는 매크로를 그대로 가져온 거거든요.
[android/kernel/include/linux/stddef.h]
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
 
요렇게 입력하면 똑같은 0x044C 오프셋 주소를 확인할 수 있어요. T32로 디버깅할 때 자주 쓰면 좋아요.
v.v %h offsetof(struct xfrm_state_afinfo,local_error)
  offsetof(struct xfrm_state_afinfo,local_error) = 0x044C
 
자, 이제 0xC0ADC270 주소에 다시 가보면 [1] instruction이 afinfo->local_error(skb, mtu); 함수 라인에 매핑되는 걸 알 수 있어요.
   NSR:C0ADC264|E1A00005                        cpy     r0,r5            ; proto,skb
   NSR:C0ADC268|E594344C                        ldr     r3,[r4,#0x44C]  //<<--[1]
   250 afinfo->local_error(skb, mtu);  //<<--[1]
   NSR:C0ADC26C|E1A01006                        cpy     r1,r6            ; r1,mtu
   NSR:C0ADC270|E12FFF33                        blx     r3
 
커널 패닉 시 로그를 다시 보면, R4가 0xc13e3780이거든요.
이 값을 T32로 (struct xfrm_state_afinfo *) 타입으로 캐스팅해서 올려보면,  local_error값이 0x0이네요.
어라, 그리고 0xC13E3780 주소의 정체가 xfrm4_state_afinfo란 전역 벼수라는 것도 알 수 있네요.
v.v %all (struct xfrm_state_afinfo *)0xc13e3780
  (struct xfrm_state_afinfo *) (struct xfrm_state_afinfo *)0xc13e3780 = 0xC13E3780 = xfrm4_state_afinfo -> (
    (unsigned int) family = 2 = 0x2 = '....',
    (unsigned int) proto = 4 = 0x4 = '....',
    (__be16) eth_proto = 8 = 0x8 = '..',
    (struct module *) owner = 0x0 =  -> NULL,
    (struct xfrm_type * [256]) type_map = ([0] = 0x0 =  -> NULL, [1] = 0x0 =  -> NULL, [2] = 0x0 =  -> NULL, [3] = 0
    (struct xfrm_mode * [5]) mode_map = ([0] = 0xC13E31D8 = xfrm4_transport_mode -> ((int (*)()) input2 = 0x0 = , (i
    (int (*)()) init_flags = 0xC0AD0AD0 = xfrm4_init_flags -> ,
    (void (*)()) init_tempsel = 0xC0AD0B6C = __xfrm4_init_tempsel -> ,
    (void (*)()) init_temprop = 0xC0AD0AF4 = xfrm4_init_temprop -> ,
    (int (*)()) tmpl_sort = 0x0 =  -> NULL,
    (int (*)()) state_sort = 0x0 =  -> NULL,
    (int (*)()) output = 0xC0AD12E4 = xfrm4_output -> ,
    (int (*)()) output_finish = 0xC0AD129C = xfrm4_output_finish -> ,
    (int (*)()) extract_input = 0xC0AD0D8C = xfrm4_extract_input -> ,
    (int (*)()) extract_output = 0xC0AD11E0 = xfrm4_extract_output -> ,
    (int (*)()) transport_finish = 0xC0AD0D94 = xfrm4_transport_finish -> ,
    (void (*)()) local_error = 0x0 =  -> NULL)  //<<--
 
    
이제 정리 좀 하면.
afinfo->local_error(skb, mtu); 주소가 0x0이라서 커널 패닉으로 타겟이 크래시가 난 거에요.
참, 결론은 허무하죠. 대부분의 커널 패닉 문제는 허무하게 끝납니다.     
 
아 이제, 코드 리뷰를 해볼 시간인데. xfrm4_state_afinfo란 변수 선언문을 가보았더니 에서 local_error 멤버를 초기화하지 않고 있네요.
[kernel/net/ipv4/xfrm4_state.c]
static struct xfrm_state_afinfo xfrm4_state_afinfo = {
        .family                 = AF_INET,
        .proto                  = IPPROTO_IPIP,
        .eth_proto              = htons(ETH_P_IP),
        .owner                  = THIS_MODULE,
        .init_flags             = xfrm4_init_flags,
        .init_tempsel           = __xfrm4_init_tempsel,
        .init_temprop           = xfrm4_init_temprop,
        .output                 = xfrm4_output,
        .output_finish          = xfrm4_output_finish,
        .extract_input          = xfrm4_extract_input,
        .extract_output         = xfrm4_extract_output,
        .transport_finish       = xfrm4_transport_finish,
}; 
 
아래와 같이 local_error 멤버에 대한 콜백 함수를 추가하는 코드를 추가하니 커널 패닉 증상은 사라졌습니다.
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 9258e75..0b2a064 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -83,6 +83,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
        .extract_input          = xfrm4_extract_input,
        .extract_output         = xfrm4_extract_output,
        .transport_finish       = xfrm4_transport_finish,
+       .local_error            = xfrm4_local_error,
 };
 
# Reference: For more information on 'Linux Kernel';
 
디버깅을 통해 배우는 리눅스 커널의 구조와 원리. 1
 
디버깅을 통해 배우는 리눅스 커널의 구조와 원리. 2
 
 
 

Kernel panic log

 2107.232713 / 01-01 11:11:03.809][7] init: cannot find '/system/bin/qrngp' (No such file or directory), disabling 'qrngp'

[ 2107.239317 / 01-01 11:11:03.809][5] Unable to handle kernel NULL pointer dereference at virtual address 00000028

[ 2107.239351 / 01-01 11:11:03.809][5] pgd = e37ec000

[ 2107.239366 / 01-01 11:11:03.809][0] [00000028] *pgd=00000000

[ 2107.239388 / 01-01 11:11:03.809][5] Internal error: Oops: 5 [#1] PREEMPT SMP ARM

[ 2107.239405 / 01-01 11:11:03.809][0] Modules linked in: texfat(PO)

[ 2107.239433 / 01-01 11:11:03.809][5] CPU: 5 PID: 2803 Comm: sensorservice Tainted: P        W  O   3.18.31-perf-gd069b48-00001-g8a6d6e5 #1

[ 2107.239452 / 01-01 11:11:03.809][5] task: e3ffb700 ti: e37f4000 task.ti: e37f4000

[ 2107.239479 / 01-01 11:11:03.809][5] PC is at find_vma_links+0x1c/0x78

[ 2107.239499 / 01-01 11:11:03.809][5] LR is at vma_adjust+0x3a0/0x574


Equivalent callstacks can be restored using the following command.

crash>  bt -I C01002D8 -S  E37F5DD0 0xE3FFB700

PID: 2803   TASK: e3ffb700  CPU: 5   COMMAND: "sensorservice"

bt: WARNING:  stack address:0xe37f5b98, program counter:0xc0ee5b60

 #0 [<c01002d8>] (do_DataAbort) from [<c010ad58>]

    pc : [<c01f980c>]    lr : [<c01fa708>]    psr: 200f0013

    sp : e37f5ec4  ip : 00000034  fp : e8236d9c

    r10: 00000000  r9 : 00000000  r8 : b2c99000

    r7 : c4616c80  r6 : e8236d68  r5 : 7f555034  r4 : 7f555034

    r3 : e37f5f04  r2 : b2c97000  r1 : b2c95000  r0 : 7f555038

    Flags: nzCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM

 #1 [<c010ad58>] (__dabt_svc) from [<c01fa708>]

 #2 [<c01f980c>] (find_vma_links) from [<c01fa708>]

 #3 [<c01fa708>] (vma_adjust) from [<c01fa9e4>]

 #4 [<c01fa9e4>] (__split_vma) from [<c01fb364>]

 #5 [<c01fb364>] (do_munmap) from [<c01fb558>]

 #6 [<c01fb558>] (vm_munmap) from [<c0106820>]


Execution profile at the moment of kernel panic

[1]: R12 is updated as 0x34 where 0x34 = *0x7f555038 = *R0

[2]: Data abort is raised since base address for LDR execution is 0x28(0x34-0xC = R12-0xC)

0xc01f97f0 <find_vma_links>:    push    {r4, r5, lr}

0xc01f97f4 <find_vma_links+0x4>:        add     r0, r0, #4

0xc01f97f8 <find_vma_links+0x8>:        mov     r4, #0

0xc01f97fc <find_vma_links+0xc>:        mov     r5, r4

0xc01f9800 <find_vma_links+0x10>:       ldr     r12, [r0]  //<<--[1]

0xc01f9804 <find_vma_links+0x14>:       cmp     r12, #0

0xc01f9808 <find_vma_links+0x18>:       beq     0xc01f983c <find_vma_links+76>

0xc01f980c <find_vma_links+0x1c>:       ldr     r0, [r12, #-12] //<<--[2]


When tracing back to the context where the call to find_vma_links() is made inside vma_adjust(), R0 is copied from R7(0xC4616C80) whose data structure type is (struct mm_struct*) .

0xc01fa6e8 <vma_adjust+896>:    mov     r0, r7

0xc01fa6ec <vma_adjust+900>:    str     r3, [sp]

0xc01fa6f0 <vma_adjust+904>:    add     r3, sp, #60     ; 0x3c

0xc01fa6f4 <vma_adjust+908>:    str     r3, [sp, #4]

0xc01fa6f8 <vma_adjust+912>:    add     r3, sp, #52     ; 0x34

0xc01fa6fc <vma_adjust+916>:    ldr     r1, [r2]

0xc01fa700 <vma_adjust+920>:    ldr     r2, [r2, #4]

0xc01fa704 <vma_adjust+924>:    bl      0xc01f97f0 <find_vma_links>  //<<--


Identical debug information

crash> struct task_struct.mm e3ffb700

  mm = 0xc4616d80


At logical level, when find_vma_links() is called, R0 should have been 0xC4616C80 instead of 0x7f555038. This signature cannot be understandable from SW point of view.


This issue ended up with replacing defective PMIC with normal one in the problematic target device.


When I enter the command adb shell "cat /d/shrinker", the system crashes with 100% after dumping the following kernel message as below.

[  761.636711] Unable to handle kernel paging request at virtual address f38a9a84

[  761.645048] pgd = e8074000

[  761.649800] [f38a9a84] *pgd=a0721811, *pte=00000000, *ppte=00000000

[  761.658106] Internal error: Oops: 7 [#1] PREEMPT SMP ARM

[  761.665481] Modules linked in:

[  761.670605] CPU: 6 PID: 6978 Comm: sh Tainted: G        W      3.18.31-g5b0528e-dirty #12

[  761.680852] task: eb5b8000 ti: d228e000 task.ti: d228e000

[  761.688315] PC is at do_raw_spin_lock+0x8/0x17c

[  761.694912] LR is at list_lru_count_node+0x14/0x60

//--- skip ---

[  761.885719] [<c016a650>] (do_raw_spin_lock) from [<c02057f4>] (list_lru_count_node+0x14/0x60)

[  761.896307] [<c02057f4>] (list_lru_count_node) from [<c022dffc>] (super_cache_count+0x40/0x94)

[  761.906982] [<c022dffc>] (super_cache_count) from [<c01f2708>] (debug_shrinker_show+0x44/0x7c)

[  761.917658] [<c01f2708>] (debug_shrinker_show) from [<c024a56c>] (seq_read+0x1a4/0x3f0)

[  761.927726] [<c024a56c>] (seq_read) from [<c022c87c>] (vfs_read+0x88/0x138)

[  761.936755] [<c022c87c>] (vfs_read) from [<c022cda8>] (SyS_read+0x3c/0x7c)

[  761.945695] [<c022cda8>] (SyS_read) from [<c0106840>] (ret_fast_syscall+0x0/0x38)

[  761.955242] Code: e12fff1e deaf1eed e92d47f0 e1a08000 (e5902004) 


After loading coredump using T32 program, we can find out that the data abort is triggered inside debug_spin_lock_before() function during sanity-check procedure of spinlock where list_lru_count_node() is attempting to acquire.


-000|do_DataAbort(addr = 170, fsr = 3247439880, regs = 0xEC4901C0)

-001|__dabt_svc(asm)

 -->|exception

-002|debug_spin_lock_before(inline)  //<<-- data abort

-002|do_raw_spin_lock(lock = 0xF38A9A80)

-003|list_lru_count_node(?, ?)

-004|super_cache_count(shrink = 0xEBC47DE0, sc = 0xD228FEF8)

-005|debug_shrinker_show(s = 0xE7CF63C0, ?)

-006|seq_read(file = 0xE805CA00, buf = 0xB3246000, size = 4096, ppos = 0xD228FF80)

-007|vfs_read(file = 0xFFFFFFFF, buf = 0x0, ?, pos = 0xD228FF80)

-008|SYSC_read(inline)

-008|sys_read(?, buf = -1289461760, count = 4096)

-009|ret_fast_syscall(asm)



Upon acquiring spinlock, all members of raw_spinlock_t(0xF38A9A80) have been scribbled.


It is necessary to examine the variable which contains spinlock as member.


    (register raw_spinlock_t *) [R0] lock = 0xF38A9A80 -> (

      (arch_spinlock_t) [NSD:0xF38A9A80] raw_lock = (

        (u32) [NSD:0xF38A9A80] slock = 0x6F632F34,

        (struct __raw_tickets) [NSD:0xF38A9A80] tickets = (

          (u16) [NSD:0xF38A9A80] owner = 0x2F34, //<<--

          (u16) [NSD:0xF38A9A82] next = 0x6F63)), //<<--

      (unsigned int) [NSD:0xF38A9A84] magic = 0x6E6F6D6D //<<--

      (unsigned int) [NSD:0xF38A9A88] owner_cpu = 0x6F6F675F,

      (void *) [NSD:0xF38A9A8C] owner = 0x5F656C67))



The address of spin_lock is passed from &nlru->lock which is responsible for saving &lru->node[nid].


51unsigned long

52list_lru_count_node(struct list_lru *lru, int nid)

53{

54 unsigned long count = 0;

55 struct list_lru_node *nlru = &lru->node[nid];  <<--

56

57 spin_lock(&nlru->lock);



The list_lru_count_node() is called inside super_cache_count() [128 line.].


108static unsigned long super_cache_count(struct shrinker *shrink,

109           struct shrink_control *sc)

110{

111 struct super_block *sb;

112 long total_objects = 0;

113

114 sb = container_of(shrink, struct super_block, s_shrink);

//snip

124 if (sb->s_op && sb->s_op->nr_cached_objects)

125  total_objects = sb->s_op->nr_cached_objects(sb,

126       sc->nid);

127

128 total_objects += list_lru_count_node(&sb->s_dentry_lru, //<<--

129       sc->nid);


Tracing back to the caller function of super_cache_count() using T32 program, we can find out that debug_shrinker_show() calls super_cache_count().


Here, the second parameter in a call to super_cache_count() is &sc defined as local variable whose memory location belongs to stack area, which looks very *odd*. odd debug information.

205static int debug_shrinker_show(struct seq_file *s, void *unused)

206{

207 struct shrinker *shrinker;

208 struct shrink_control sc;

209

210 sc.gfp_mask = -1;

211 sc.nr_to_scan = 0;

212

213 down_read(&shrinker_rwsem);

214 list_for_each_entry(shrinker, &shrinker_list, list) {

215  int num_objs;

216

217  num_objs = shrinker->count_objects(shrinker, &sc);  //<<-- shrinker->count_objects = super_cache_count


Here, let me wrap up the call trace flow as follows.

1. shrinker->count_objects(shrinker, &sc); // <<-- super_cache_count


2. list_lru_count_node(&sb->s_dentry_lru, sc->nid); // The second parameter sc->nid contains garbage value(since it is defined as local variable without initialization)


3. list_lru_count_node(struct list_lru *lru, int nid) // <<-- nid(sc->nid): garbage value

  3.1 struct list_lru_node *nlru = &lru->node[nid]; // &lru->node[nid]; // access lru->node from  nid(garbage value)

  3.2 spin_lock(&nlru->lock); // &nlru->lock: garbage address


Solution

After visiting linux kernel community and google web-site, we have found that google already generated revert commit

https://chromium-review.googlesource.com/c/345895/


The commit message below seems to contain identical debugging signature what we analyzed(shrink_control.nid is used but not initialzed)


Commit log

ANDROID: Revert "mm: vmscan: Add a debug file for shrinkers"


Kernel panic when type "cat /sys/kernel/debug/shrinker"


Unable to handle kernel paging request at virtual address 0af37d40

pgd = d4dec000

[0af37d40] *pgd=00000000

Internal error: Oops: 5 [#1] PREEMPT SMP ARM

[<c0bb8f24>] (_raw_spin_lock) from [<c020aa08>] (list_lru_count_one+0x14/0x28)

[<c020aa08>] (list_lru_count_one) from [<c02309a8>] (super_cache_count+0x40/0xa0)

[<c02309a8>] (super_cache_count) from [<c01f6ab0>] (debug_shrinker_show+0x50/0x90)

[<c01f6ab0>] (debug_shrinker_show) from [<c024fa5c>] (seq_read+0x1ec/0x48c)

[<c024fa5c>] (seq_read) from [<c022e8f8>] (__vfs_read+0x20/0xd0)

[<c022e8f8>] (__vfs_read) from [<c022f0d0>] (vfs_read+0x7c/0x104)

[<c022f0d0>] (vfs_read) from [<c022f974>] (SyS_read+0x44/0x9c)

[<c022f974>] (SyS_read) from [<c0107580>] (ret_fast_syscall+0x0/0x3c)

Code: e1a04000 e3a00001 ebd66b39 f594f000 (e1943f9f)

---[ end trace 60c74014a63a9688 ]---

Kernel panic - not syncing: Fatal exception


shrink_control.nid is used but not initialzed, same for

shrink_control.memcg.


This reverts commit b0e7a582b2264cdf75874dcd8df915b6b4427755.


Reported-by: Xiaowen Liu <xiaowen.liu@freescale.com>

Signed-off-by: Dmitry Shmidt <dimitrysh@google.com>

(cherry picked from https://android.googlesource.com/kernel/common android-4.4

 commit ad95c12f66df9efae04b15d5c4d0d0ba56ab2620)


BUG=chromium:612862

TEST=build and boot on a few devices

Debugging 

Kernel panic occurs at 68 line inside __list_del_entry() whose caller function is process_one_work().

Code Review at the moment of kernel panic


49void __list_del_entry(struct list_head *entry)

50{

51     struct list_head *prev, *next;

52

53     prev = entry->prev;

54     next = entry->next;

55

56    if (WARN(next == LIST_POISON1,

57                      "list_del corruption, %p->next is LIST_POISON1 (%p)\n",

58                      entry, LIST_POISON1) ||

59       WARN(prev == LIST_POISON2,

60                     "list_del corruption, %p->prev is LIST_POISON2 (%p)\n",

61                     entry, LIST_POISON2) ||

62      WARN(prev->next != entry,

63                    "list_del corruption. prev->next should be %p, "

64                    "but was %p\n", entry, prev->next) ||

65      WARN(next->prev != entry,

66                    "list_del corruption. next->prev should be %p, "

67                   "but was %p\n", entry, next->prev)) {

68                   BUG_ON(PANIC_CORRUPTION); // <<-- kernel panic

69     return;

70 }

71

72     __list_del(prev, next);

73}


Callstacks

-000|do_undefinstr(regs = 0x0)

-001|__und_svc_fault(asm)

-->|exception

-002|__list_del_entry(?) // <<-- kernel panic

-003|process_one_work(worker = 0xEC048600, work = 0xEA2585AC)

-004|worker_thread(__worker = 0x0)

-005|kthread(_create = 0xEB9B3000)

-006|ret_from_fork(asm)

-007|ret_fast_syscall(asm)

The list_del_init() is called inside process_one_work() whose parameter is &work->entry.

Now, it is important to debug the state of &work->entry

static void process_one_work(struct worker *worker, struct work_struct *work)

__releases(&pool->lock)

__acquires(&pool->lock)

{

    struct pool_workqueue *pwq = get_work_pwq(work);

    struct worker_pool *pool = worker->pool;

    bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;

    int work_color;

    struct worker *collision;

//snip

/* claim and dequeue */

    debug_work_deactivate(work);

    hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);

    worker->current_work = work;

    worker->current_func = work->func;

    worker->current_pwq = pwq;

    work_color = get_work_color(work);

    list_del_init(&work->entry);

Going back to process_one_work() function frame using T32, I can have the following debug information

[1],[2]: next element is corrupted

[3]: worker.current_func is found to be irq_affinity_notify which looks abnormal.

process_one_work(

    worker = 0xEC048600 -> (

        entry = (next = 0x0, prev = 0xEC5AE968),

        hentry = (next = 0x0, pprev = 0xEC5AE968),

        current_work = 0xEA2585AC,

        current_func = 0xC016FD40,

        current_pwq = 0xEC5B2400,

        desc_valid = FALSE,

The memory content around 0xEC048600 seems to be corrupted especially for irq_affinity_notify at 0xEC04860C.

NSD:EC0485F8| 5A 5A 5A 5A 0x5A5A5A5A

NSD:EC0485FC| 5A 5A 5A 5A 0x5A5A5A5A

NSD:EC048600| 00 00 00 00 0x0 // <<--

NSD:EC048604| 68 E9 5A EC 0xEC5AE968

NSD:EC048608| AC 85 25 EA 0xEA2585AC

NSD:EC04860C| 40 FD 16 C0 0xC016FD40 \\vmlinux\manage\irq_affinity_notify // <<--

NSD:EC048610| 00 24 5B EC 0xEC5B2400

NSD:EC048614| 00 00 00 00 0x0

NSD:EC048618| 18 86 04 EC 0xEC048618

NSD:EC04861C| 18 86 04 EC 0xEC048618

NSD:EC048620| C0 57 2A EC 0xEC2A57C0

NSD:EC048624| 00 E8 5A EC 0xEC5AE800

NSD:EC048628| E8 E0 04 E9 0xE904E0E8

NSD:EC04862C| F0 E9 5A EC 0xEC5AE9F0

NSD:EC048630| AC 8D FF FF 0xFFFF8DAC

Searching "irq_affinity_notify" key word enables me to the definition of irq_affinity_notify structure.

struct irq_affinity_notify {

    unsigned int irq;

    struct kref kref;

    struct work_struct work;

    void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);

    void (*release)(struct kref *ref);

};



By casting 0xEA2585AC as struct work_struct, the following debug information can be deduced which looks very odd.

The struct work_struct.entry.next is corrupted as NULL and work_func_t is registered as irq_affinity_notify().

(struct work_struct *) [-] (struct work_struct*)0xEA2585AC = 0xEA2585AC = __bss_stop+0x2857B9A4 -> (

  (atomic_long_t) [D:0xEA2585AC] data = ((int) [D:0xEA2585AC] counter = -329571323 = 0xEC5B2405 = '.[

    (struct list_head) [D:0xEA2585B0] entry = (

    (struct list_head *) [D:0xEA2585B0] next = 0xE9E9EAB4 = __bss_stop+0x281C1EAC -> (

      (struct list_head *) [D:0xE9E9EAB4] next = 0x0 = -> NULL,

    (struct list_head *) [D:0xE9E9EAB8] prev = 0x0 = -> NULL),

  (struct list_head *) [D:0xEA2585B4] prev = 0xEC5AE820 = -> ((struct list_head *) [D:0xEC5AE820]

 (work_func_t) [D:0xEA2585B8] func = 0xC016FD40 = irq_affinity_notify -> )


Looking into the symbol information at 0xEA2585AC address, I can have the following symbols below.

(i.e) irq_affinity_notify, pm_qos_irq_notify, pm_qos_irq_release, pm_qos_work_fn

________address|_data________|value_____________|symbol

NSD:EA25859C| FF 00 00 00 0xFF \\vmlinux\Global\up_b_offset+0xCF

NSD:EA2585A0| 9D 00 00 00 0x9D \\vmlinux\Global\up_b_offset+0x6D

NSD:EA2585A4| 9D 00 00 00 0x9D \\vmlinux\Global\up_b_offset+0x6D

NSD:EA2585A8| 03 00 00 00 0x3

NSD:EA2585AC| 05 24 5B EC 0xEC5B2405

NSD:EA2585B0| B4 EA E9 E9 0xE9E9EAB4

NSD:EA2585B4| 20 E8 5A EC 0xEC5AE820

NSD:EA2585B8| 40 FD 16 C0 0xC016FD40 \\vmlinux\manage\irq_affinity_notify

NSD:EA2585BC| 58 90 16 C0 0xC0169058 \\vmlinux\kernel/power/qos\pm_qos_irq_notify

NSD:EA2585C0| 00 90 16 C0 0xC0169000 \\vmlinux\kernel/power/qos\pm_qos_irq_release

NSD:EA2585C4| 00 94 35 77 0x77359400

NSD:EA2585C8| C8 85 25 EA 0xEA2585C8

NSD:EA2585CC| C8 85 25 EA 0xEA2585C8

NSD:EA2585D0| 14 81 86 C1 0xC1868114 \\vmlinux\adreno\device_3d0+0x45C

NSD:EA2585D4| F8 C6 C4 E9 0xE9C4C6F8

NSD:EA2585D8| 01 00 00 00 0x1

NSD:EA2585DC| E0 FF FF FF 0xFFFFFFE0

NSD:EA2585E0| E0 85 25 EA 0xEA2585E0

NSD:EA2585E4| E0 85 25 EA 0xEA2585E0

NSD:EA2585E8| 10 8F 16 C0 0xC0168F10 \\vmlinux\kernel/power/qos\pm_qos_work_fn

NSD:EA2585EC| 00 00 00 00 0x0

NSD:EA2585F0| 00 00 00 00 0x0

NSD:EA2585F4| 00 00 00 00 0x0

NSD:EA2585F8| 02 00 B8 EB 0xEBB80002

NSD:EA2585FC| A8 77 13 C0 0xC01377A8 \\vmlinux\workqueue\delayed_work_timer_fn

NSD:EA258600| DC 85 25 EA 0xEA2585DC

When searching pm_qos_irq_notify in the kernel source code, I can see that pm_qos_irq_notify is registered as req->irq_notify.notify.

void pm_qos_add_request(struct pm_qos_request *req,

int pm_qos_class, s32 value)

{

//snip 

case PM_QOS_REQ_AFFINE_IRQ:

    if (irq_can_set_affinity(req->irq)) {

         int ret = 0;

         struct irq_desc *desc = irq_to_desc(req->irq);

         struct cpumask *mask = desc->irq_data.affinity;

         /* Get the current affinity */

         cpumask_copy(&req->cpus_affine, mask);

         req->irq_notify.irq = req->irq;

         req->irq_notify.notify = pm_qos_irq_notify;

         req->irq_notify.release = pm_qos_irq_release;

I can see that pm_qos_irq_notify() as well as pm_qos_irq_release() is registered as struct irq_affinity_notify.

struct irq_affinity_notify {

    unsigned int irq;

    struct kref kref;

    struct work_struct work;

    void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask); // pm_qos_irq_notify

    void (*release)(struct kref *ref); // pm_qos_irq_release

};


struct pm_qos_request {

    enum pm_qos_req_type type;

    struct cpumask cpus_affine;

#ifdef CONFIG_SMP

    uint32_t irq;

/* Internal structure members */

    struct irq_affinity_notify irq_notify;

#endif

    struct plist_node node;

    int pm_qos_class;

    struct delayed_work work; /* for pm_qos_update_request_timeout */

};

The 0xEA2585A4 address is starting point where struct irq_affinity_notify element is located.

(struct irq_affinity_notify *) [-] (struct irq_affinity_notify*)0xEA2585A4

(unsigned int) [D:0xEA2585A4] irq = 157 = 0x9D = '....',

(struct kref) [D:0xEA2585A8] kref = (

   (atomic_t) [D:0xEA2585A8] refcount = ((int) [D:0xEA2585A8] counter = 3 = 0x3 = '....')),

(struct work_struct) [D:0xEA2585AC] work = (

   (atomic_long_t) [D:0xEA2585AC] data = ((int) [D:0xEA2585AC] counter = -329571323 = 0xEC5B2

   (struct list_head) [D:0xEA2585B0] entry = (

      (struct list_head *) [D:0xEA2585B0] next = 0xE9E9EAB4

      (struct list_head *) [D:0xEA2585B4] prev = 0xEC5AE820

   (work_func_t) [D:0xEA2585B8] func = 0xC016FD40 = irq_affinity_notify -> ),

   (void (*)()) [D:0xEA2585BC] notify = 0xC0169058 = pm_qos_irq_notify -> ,

   (void (*)()) [D:0xEA2585C0] release = 0xC0169000 = pm_qos_irq_release -> )

________address|_data________|value_____________|symbol

NSD:EA25859C| FF 00 00 00 0xFF \\vmlinux\Global\up_b_offset+0xCF

NSD:EA2585A0| 9D 00 00 00 0x9D \\vmlinux\Global\up_b_offset+0x6D

NSD:EA2585A4| 9D 00 00 00 0x9D //struct irq_affinity_notify.irq

NSD:EA2585A8| 03 00 00 00 0x3 //struct irq_affinity_notify.kref

NSD:EA2585AC| 05 24 5B EC 0xEC5B2405 //struct irq_affinity_notify.work.data

NSD:EA2585B0| B4 EA E9 E9 0xE9E9EAB4 //struct irq_affinity_notify.work.entry.next

NSD:EA2585B4| 20 E8 5A EC 0xEC5AE820 //struct irq_affinity_notify.work.entry.prev

NSD:EA2585B8| 40 FD 16 C0 0xC016FD40 \\vmlinux\manage\irq_affinity_notify //struct irq_affinity_notify.work.func

NSD:EA2585BC| 58 90 16 C0 0xC0169058 \\vmlinux\kernel/power/qos\pm_qos_irq_notify

NSD:EA2585C0| 00 90 16 C0 0xC0169000 \\vmlinux\kernel/power/qos\pm_qos_irq_release

NSD:EA2585C4| 00 94 35 77 0x77359400

NSD:EA2585C8| C8 85 25 EA 0xEA2585C8

NSD:EA2585CC| C8 85 25 EA 0xEA2585C8

NSD:EA2585D0| 14 81 86 C1 0xC1868114 \\vmlinux\adreno\device_3d0+0x45C

NSD:EA2585D4| F8 C6 C4 E9 0xE9C4C6F8

NSD:EA2585D8| 01 00 00 00 0x1

NSD:EA2585DC| E0 FF FF FF 0xFFFFFFE0

NSD:EA2585E0| E0 85 25 EA 0xEA2585E0

NSD:EA2585E4| E0 85 25 EA 0xEA2585E0

NSD:EA2585E8| 10 8F 16 C0 0xC0168F10 \\vmlinux\kernel/power/qos\pm_qos_work_fn

NSD:EA2585EC| 00 00 00 00 0x0

NSD:EA2585F0| 00 00 00 00 0x0

NSD:EA2585F4| 00 00 00 00 0x0

NSD:EA2585F8| 02 00 B8 EB 0xEBB80002

NSD:EA2585FC| A8 77 13 C0 0xC01377A8 \\vmlinux\workqueue\delayed_work_timer_fn

NSD:EA258600| DC 85 25 EA 0xEA2585DC


155: 1608 0 0 0 0 0 0 0 GIC mmc0

157: 98 0 0 0 0 0 0 0 GIC mmc1

166: 10 0 0 0 0 0 0 0 GIC msm_otg,msm_hsusb


Taking a looking at the structure associated with struct irq_affinity_notify,

struct pm_qos_request seems to include struct irq_affinity_notify irq_notify.

struct pm_qos_request {

    enum pm_qos_req_type type;

    struct cpumask cpus_affine;

#ifdef CONFIG_SMP

    uint32_t irq;

/* Internal structure members */

    struct irq_affinity_notify irq_notify;

#endif

    struct plist_node node;

    int pm_qos_class;

    struct delayed_work work; /* for pm_qos_update_request_timeout */

};

By casting 0xEA258598 as struct pm_qos_request, the whole picture is drawn as below.

(i.e) type = PM_QOS_REQ_AFFINE_IRQ

(struct pm_qos_request *) [-] (struct pm_qos_request*)0xEA258598 = 0xEA258598 = __bss_sto

  (enum pm_qos_req_type) [D:0xEA258598] type = PM_QOS_REQ_AFFINE_IRQ = 2 = 0x2 = '....',

  (struct cpumask) [D:0xEA25859C] cpus_affine = (

    (long unsigned int [1]) [D:0xEA25859C] bits = (

    [0] = 255 = 0xFF = '....')),

  (uint32_t) [D:0xEA2585A0] irq = 157 = 0x9D = '....',

  (struct irq_affinity_notify) [D:0xEA2585A4] irq_notify = ((unsigned int) [D:0xEA2585A4]  

  (struct plist_node) [D:0xEA2585C4] node = ((int) [D:0xEA2585C4] prio = 2000000000 = 0x7

  (int) [D:0xEA2585D8] pm_qos_class = 1 = 0x1 = '....',

  (struct delayed_work) [D:0xEA2585DC] work = (

   (struct work_struct) [D:0xEA2585DC] work = (

    (atomic_long_t) [D:0xEA2585DC] data = ((int) [D:0xEA2585DC] counter = -32 = 0xFFFFF

    (struct list_head) [D:0xEA2585E0] entry = ((struct list_head *) [D:0xEA2585E0] next

  (work_func_t) [D:0xEA2585E8] func = 0xC0168F10 = pm_qos_work_fn -> ),

  (struct timer_list) [D:0xEA2585EC] timer = ((struct list_head) [D:0xEA2585EC] entry =

 (struct workqueue_struct *) [D:0xEA258620] wq = 0x0 = -> NULL,

 (int) [D:0xEA258624] cpu = 0 = 0x0 = '....'))

(where)

________address|_data________|value_____________|symbol

NSD:EA258590| 00 00 00 00 0x0

NSD:EA258594| 01 00 00 00 0x1

NSD:EA258598| 02 00 00 00 0x2 //enum pm_qos_req_type type;

NSD:EA25859C| FF 00 00 00 0xFF //struct cpumask cpus_affine;

NSD:EA2585A0| 9D 00 00 00 0x9D //uint32_t irq;

NSD:EA2585A4| 9D 00 00 00 0x9D //struct irq_affinity_notify

NSD:EA2585A8| 03 00 00 00 0x3

NSD:EA2585AC| 05 24 5B EC 0xEC5B2405

NSD:EA2585B0| B4 EA E9 E9 0xE9E9EAB4

NSD:EA2585B4| 20 E8 5A EC 0xEC5AE820

NSD:EA2585B8| 40 FD 16 C0 0xC016FD40 \\vmlinux\manage\irq_affinity_notify

NSD:EA2585BC| 58 90 16 C0 0xC0169058 \\vmlinux\kernel/power/qos\pm_qos_irq_notify

NSD:EA2585C0| 00 90 16 C0 0xC0169000 \\vmlinux\kernel/power/qos\pm_qos_irq_release

NSD:EA2585C4| 00 94 35 77 0x77359400

NSD:EA2585C8| C8 85 25 EA 0xEA2585C8

NSD:EA2585CC| C8 85 25 EA 0xEA2585C8

NSD:EA2585D0| 14 81 86 C1 0xC1868114 \\vmlinux\adreno\device_3d0+0x45C

NSD:EA2585D4| F8 C6 C4 E9 0xE9C4C6F8

NSD:EA2585D8| 01 00 00 00 0x1

NSD:EA2585DC| E0 FF FF FF 0xFFFFFFE0

NSD:EA2585E0| E0 85 25 EA 0xEA2585E0

NSD:EA2585E4| E0 85 25 EA 0xEA2585E0

NSD:EA2585E8| 10 8F 16 C0 0xC0168F10 \\vmlinux\kernel/power/qos\pm_qos_work_fn

NSD:EA2585EC| 00 00 00 00 0x0

The (*(((struct pm_qos_request*)0xEA258598))).work.work.data is equal to 0xFFFFFFE0.

[-] (struct pm_qos_request*)0xEA258598 = 0xEA258598 -> (

//snip

D:0xEA2585DC] work = (

[D:0xEA2585DC] work = (

[D:0xEA2585DC] data = ([D:0xEA2585DC] counter = 0xFFFFFFE0), // <<-- WORK_STRUCT_NO_POOL

[D:0xEA2585E0] entry = ([D:0xEA2585E0] next = 0xEA2585E0, [D:0xEA2585E4] prev = 0xEA2585E0),

[D:0xEA2585E8] func = 0xC0168F10),

[D:0xEA2585EC] timer = ([D:0xEA2585EC] entry = ([D:0xEA2585EC] next = 0x0, [D:0xEA2585F0] prev = 0x0), [

[D:0xEA258620] wq = 0x0,

[D:0xEA258624] cpu = 0x0))

So it is needed to apply the following patch to see if the same crash occurs since 0xFFFFFFE0,

where 0xFFFFFFE0 = WORK_STRUCT_NO_POOL = (unsigned long)WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT,

diff --git a/kernel/power/qos.c b/kernel/power/qos.c

index 5f4c006..a122891 100644

--- a/kernel/power/qos.c

+++ b/kernel/power/qos.c

@@ -405,6 +405,9 @@ void pm_qos_update_request(struct pm_qos_request *req,

return;

}

+     if (work_is_canceling(&req->work) )

+        return;

+

      cancel_delayed_work_sync(&req->work);

      __pm_qos_update_request(req, new_value);

}

@@ -427,6 +430,9 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,

"%s called for unknown object.", __func__))

return;

+     if (work_is_canceling(&req->work) )

+         return;

+

      cancel_delayed_work_sync(&req->work);

trace_pm_qos_update_request_timeout(req->pm_qos_class,

@@ -458,6 +464,9 @@ void pm_qos_remove_request(struct pm_qos_request *req)

return;

}

+     if (work_is_canceling(&req->work) )

+         return;

+

       cancel_delayed_work_sync(&req->work);

       trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE);


static bool work_is_canceling(struct work_struct *work)

{

    unsigned long data = atomic_long_read(&work->data);

    return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);

}

(where)

WORK_STRUCT_PWQ = 0x4, WORK_OFFQ_CANCELING = 0x10


In-depth Analysis


The work->data is set as WORK_STRUCT_NO_POOL inside clear_work_data() which is called from

static void clear_work_data(struct work_struct *work)

{

    smp_wmb(); /* see set_work_pool_and_clear_pending() */

    set_work_data(work, WORK_STRUCT_NO_POOL, 0);

}


static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)

{

    static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);

    unsigned long flags;

    int ret;

    do {

        ret = try_to_grab_pending(work, is_dwork, &flags);

        if (unlikely(ret == -ENOENT)) {

               struct cwt_wait cwait;

               init_wait(&cwait.wait);

               cwait.wait.func = cwt_wakefn;

               cwait.work = work;

              prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,

              TASK_UNINTERRUPTIBLE);

              if (work_is_canceling(work))

                         schedule();

              finish_wait(&cancel_waitq, &cwait.wait);

      }

    } while (unlikely(ret < 0));

/* tell other tasks trying to grab @work to back off */ 

    mark_work_canceling(work);

    local_irq_restore(flags);

    flush_work(work); 

    clear_work_data(work); // <<-- 


I can restore callstack using T32 as followings;

[<c001df40>] do_page_fault+0x338/0x3f8 

[<c0008544>] do_DataAbort+0x38/0x98 

[<c0015058>] __dabt_svc+0x38/0x60 

[<c031a5a0>] tty_wakeup+0xc/0x64 

[<c06f82ec>] gs_start_io+0x94/0xf4 

[<c06f8698>] gserial_connect+0xe0/0x180

[<c06f7578>] acm_set_alt+0x88/0x1a8 

[<c06f3c5c>] composite_setup+0xd34/0x1520 

[<c070d154>] android_setup+0x1f4/0x1fc 

[<c03fc6e8>] forward_to_driver+0x64/0x100 

[<c03fd418>] musb_g_ep0_irq+0x7d8/0x1c18 

[<c03fb094>] musb_interrupt+0x94/0xc78 

[<c04024f0>] generic_interrupt+0xc34/0x1218 

[<c009b020>] handle_irq_event_percpu+0xe0/0x2e4

[<c009b268>] handle_irq_event+0x44/0x64

[<c009df54>] handle_fasteoi_irq+0xe8/0x1a4 

[<c009a7dc>] __handle_domain_irq+0x104/0x264 

[<c0008668>] gic_handle_irq+0x2c/0x64 

[<c00153a8>] __irq_usr+0x48/0x60


Kernel panic occurs because R0 is holding 0x0 whose instance is (struct tty_struct *tty).

NSR:C031A594|E1A0C00D tty_wakeup: cpy r12,r13

NSR:C031A598|E92DD830 push {r4-r5,r11-r12,r14,pc}

NSR:C031A59C|E24CB004 sub r11,r12,#0x4 ; r11,r12,#4

NSR:C031A5A0|E59031CC_______________ldr____r3,r0,#0x1CC  //<--


Let me take times to look into tty_wakeup() more.

void tty_wakeup(struct tty_struct *tty)

{

    struct tty_ldisc *ld;


    if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) { // <<-- kernel panic

        ld = tty_ldisc_ref(tty);


        if (ld)

        {

           if (ld->ops->write_wakeup) ld->ops->write_wakeup(tty);

           tty_ldisc_deref(ld);

        }

    }


    wake_up_interruptible_poll(&tty->write_wait, POLLOUT);

}


As kernel log indicates below right before kernel panic occurs, gs_close() is called as mdl_sock_host.

Inside gs_close(), port->port.tty is updated as NULL.

[ 6285.082083 / 06-01 11:43:40.318] -(4)[981:mdl_sock_host]gs_close: ttyGS1 (d3681900,d7e1ca80) ...


<< Code fragment >> 

static void gs_close(struct tty_struct *tty, struct file *file)

{

    struct gs_port *port = tty->driver_data;

    struct gserial *gser;


//snip

    if (gser == NULL)

        gs_buf_free(&port->port_write_buf);

    else

        gs_buf_clear(&port->port_write_buf);

        tty->driver_data = NULL;


        port->port.tty = NULL; // <<--

        port->openclose = false;


Right after gs_close() is called, acm driver calls tty_wakeup() function during enumeration operation, which causes kernel panic.


With following patch, kernel crash was fixed. 

diff --git a/drivers/usb/gadget/u_serial.c b/drivers/usb/gadget/u_serial.c

index 27a7cdc..4b88415 100644

? a/drivers/usb/gadget/u_serial.c

+++ b/drivers/usb/gadget/u_serial.c

@@ -784,8 +784,11 @@ static int gs_start_io(struct gs_port *port)

    port->n_read = 0;

    started = gs_start_rx(port);


+   if (!port->port_usb)

+      return -EIO;

+

/* unblock any pending writes into our circular buffer */


    if (started) {

+        if (started && port->port.tty) {

                tty_wakeup(port->port.tty);

         } else {

           gs_free_requests(ep, head, &port->read_allocated);


The call to tty_wakeup() is only made in case port->port.tty points to valid address.

Analysis 

Kernel log says that program counter indicates invalid address(0xecb29f00) as below.

[  257.879321 / 01-01 00:04:20.499][1] Unable to handle kernel paging request at virtual address ecb29f00

[  257.879343 / 01-01 00:04:20.499][1] pgd = c4ebc000

[  257.879355 / 01-01 00:04:20.499][0] [ecb29f00] *pgd=6ca1141e(bad)

[  257.879372 / 01-01 00:04:20.499][1] Internal error: Oops: 8000000d [#1] PREEMPT SMP ARM

[  257.879384 / 01-01 00:04:20.499][0] Modules linked in: texfat(PO)

[  257.879403 / 01-01 00:04:20.499][1] CPU: 1 PID: 384 Comm: ueventd Tainted: P        W  O   3.18.31-perf-gd069b48-00001-g8a6d6e5 #1

[  257.879416 / 01-01 00:04:20.499][1] task: eccc4d00 ti: c4eaa000 task.ti: c4eaa000

[  257.879429 / 01-01 00:04:20.499][1] PC is at 0xecb29f00

[  257.879447 / 01-01 00:04:20.499][1] LR is at security_context_to_sid_core+0x184/0x1b0

[  257.879462 / 01-01 00:04:20.499][1] pc : [<ecb29f00>]    lr : [<c033ab00>]    psr: 80030013

[  257.879462 / 01-01 00:04:20.499][1] sp : c4eabed8  ip : e14870c0  fp : b0d70e50

[  257.879479 / 01-01 00:04:20.499][1] r10: 00000000  r9 : c4eaa000  r8 : 00000027

[  257.879492 / 01-01 00:04:20.499][1] r7 : c4e2bcc0  r6 : ebef5cc0  r5 : c4e2e1c0  r4 : 00000000

[  257.879504 / 01-01 00:04:20.499][1] r3 : c193e700  r2 : 00000000  r1 : c193e700  r0 : 00000000

[  257.879517 / 01-01 00:04:20.499][1] Flags: Nzcv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment user


As the linked register(R14) holds 0xc033ab00(security_context_to_sid_core+0x184), ueventd process has been running under the following callstacks.

-000|security_context_to_sid_core(scontext = 0xDEAD, scontext_len = 0, sid = 0xB

-001|security_context_to_sid(?, ?, ?, ?)

-002|selinux_inode_setsecurity(inode = 0x0, ?, value = 0x0, size = 0, flags = 0)

-003|selinux_inode_notifysecctx(?, ?, ?)

-004|security_inode_notifysecctx(?, ?, ?)

-005|kernfs_type(inline)

-005|kernfs_refresh_inode(kn = 0xFFFFFFFF, inode = 0x80030013)

-006|kernfs_iop_getattr(?, ?, stat = 0xC4EABF50)

-007|vfs_getattr_nosec(?, ?)

-008|vfs_fstatat(dfd = 0, filename = 0xC4E2BCC0, stat = 0xC4EABF50, ?)

-009|SYSC_fstatat64(inline)

-009|sys_fstatat64(?, ?, statbuf = -1328228292, ?)

-010|ret_fast_syscall(asm)


When program counter is 0xc033a980, its stack address should have been holding 0xC4EABE94.

In case of 0xc033a984 address, the stack address is updated as 0xC4EABE58(0xC4EABE94-0x3c)

0xc033a97c <security_context_to_sid_core>:      push    {r4, r5, r6, r7, r8, r9, r10, r11, lr}

0xc033a980 <security_context_to_sid_core+4>:    subs    r5, r1, #0

0xc033a984 <security_context_to_sid_core+8>:    sub     sp, sp, #60     ; 0x3c


I am suspecting that stack pop operation is not executed actually according to ARM calling conventions inside below functions.

0xc033a97c <security_context_to_sid_core>:      push    {r4, r5, r6, r7, r8, r9, r10, r11, lr}


0xc033a9c0 <security_context_to_sid_core+68>:   bl      0xc038542c <strcmp>

0xc033a9fc <security_context_to_sid_core+128>:  bl      0xc020b974 <__kmalloc>

0xc033aa10 <security_context_to_sid_core+148>:  bl      0xc037cb00 <memcpy>

0xc033aa28 <security_context_to_sid_core+172>:  bl      0xc01e806c <kstrdup>

0xc033aa44 <security_context_to_sid_core+200>:  bl      0xc0ee8d48 <_raw_read_lock>

0xc033aa5c <security_context_to_sid_core+224>:  bl      0xc033a060 <string_to_context_struct>

0xc033aaa0 <security_context_to_sid_core+292>:  bl      0xc03340a0 <sidtab_context_to_sid>

0xc033aac8 <security_context_to_sid_core+332>:  bl      0xc03333e8 <ebitmap_destroy>

0xc033aadc <security_context_to_sid_core+352>:  bl      0xc037d260 <__memzero>

0xc033aaf4 <security_context_to_sid_core+376>:  bl      0xc020b2f8 <kfree>


(For example)

-000|string_to_context_struct(pol = 0xDEAD, sidtabp = 0x0, scontext = 0xBEEF, sc

-001|security_context_to_sid_core(?, scontext_len = 0, sid = 0x0, ?, gfp_flags =

-002|security_context_to_sid(?, ?, ?, ?)

-003|selinux_inode_setsecurity(inode = 0x0, ?, value = 0x0, size = 0, flags = 0)

-004|selinux_inode_notifysecctx(?, ?, ?)

-005|security_inode_notifysecctx(?, ?, ?)

-006|kernfs_type(inline)

-006|kernfs_refresh_inode(kn = 0xFFFFFFFF, inode = 0x80030013)

-007|kernfs_iop_getattr(?, ?, stat = 0xC4EABF50)

-008|vfs_getattr_nosec(?, ?)

-009|vfs_fstatat(dfd = 0, filename = 0xC4E2BCC0, stat = 0xC4EABF50, ?)

-010|SYSC_fstatat64(inline)

-010|sys_fstatat64(?, ?, statbuf = -1328228292, ?)

-011|ret_fast_syscall(asm)


After the above functions are executed, the stack address should have been updated as 0xC4EABE58 instead of 0xc4eabed8.  

[  257.879462 / 01-01 00:04:20.499][1] pc : [<ecb29f00>]    lr : [<c033ab00>]    psr: 80030013

[  257.879462 / 01-01 00:04:20.499][1] sp : c4eabed8  ip : e14870c0  fp : b0d70e50

[  257.879479 / 01-01 00:04:20.499][1] r10: 00000000  r9 : c4eaa000  r8 : 00000027


The stack dump can be compared between this coredump and normal operation.

this ramdump

the stack in case of normal operation

NSD:C4EABE90| 00 00 00 00  0x0

NSD:C4EABE94| 00 E7 93 C1  0xC193E700         \\vmlinux\Global\__tracepoint_kfree

NSD:C4EABE98| 00 00 00 00  0x0

NSD:C4EABE9C| 00 E7 93 C1  0xC193E700         \\vmlinux\Global\__tracepoint_kfree

NSD:C4EABEA0| 00 00 00 00  0x0

NSD:C4EABEA4| C0 E1 E2 C4  0xC4E2E1C0

NSD:C4EABEA8| C0 5C EF EB  0xEBEF5CC0

NSD:C4EABEAC| C0 BC E2 C4  0xC4E2BCC0

NSD:C4EABEB0| 27 00 00 00  0x27               \\vmlinux\Global\cpu_v7_suspend_size+0x3

NSD:C4EABEB4| 00 A0 EA C4  0xC4EAA000 //<<-R14 should have been \\security_context_to_sid+0x14 

NSD:C4EABEB8| 00 00 00 00  0x0  // <<-- new SP

NSD:C4EABEBC| 50 0E D7 B0  0xB0D70E50

NSD:C4EABEC0| C0 70 48 E1  0xE14870C0

NSD:C4EABEC4| D8 BE EA C4  0xC4EABED8 //<<-- should have been R14  \\selinux_inode_setsecurity+0x48

NSD:C4EABEC8| 00 AB 33 C0  0xC033AB00         \\vmlinux\services\security_context_to_sid_core+0x184

NSD:C4EABECC| 00 9F B2 EC  0xECB29F00

NSD:C4EABED0| 13 00 03 80  0x80030013

NSD:C4EABED4| FF FF FF FF  0xFFFFFFFF

NSD:C4EABED8| 50 BF EA C4  0xC4EABF50  //<<-- SP address at the moment of kernel panic

NSD:C4EABEDC| 00 9F B2 EC  0xECB29F00

NSD:C4EABEE0| 50 BF EA C4  0xC4EABF50

NSD:C4EABEE4| 9C A3 32 C0  0xC032A39C

NSD:C4EABEE8| 00 00 00 00  0x0

NSD:C4EABEEC| C0 BC E2 C4  0xC4E2BCC0

NSD:C4EABE90|_00_00_00_00__0x0

NSD:C4EABE94| CC CC 00 00  0xCCCC //<-- R4 where new SP

NSD:C4EABE98| 00 00 00 00  0x0

NSD:C4EABE9C| 00 00 00 00  0x0

NSD:C4EABEA0| 00 00 00 00  0x0

NSD:C4EABEA4| 00 00 00 00  0x0

NSD:C4EABEA8| 00 00 00 00  0x0

NSD:C4EABEAC| 00 00 00 00  0x0

NSD:C4EABEB0| 00 00 00 00  0x0  //<-- R11

NSD:C4EABEB4| E4 C5 33 C0  0xC033C5E4     //<<-R14 \\security_context_to_sid+0x14

NSD:C4EABEB8| AD DE 00 00  0xDEAD  //<<--R0 where new SP

NSD:C4EABEBC| 00 00 00 00  0x0        //<<--R1

NSD:C4EABEC0| EF BE 00 00  0xBEEF    //<<--R2

NSD:C4EABEC4| D8 A2 32 C0  0xC032A2D8       //<<--R14  \\selinux_inode_setsecurity+0x48

NSD:C4EABEC8| 00 AB 33 C0  0xC033AB00   //<-- R0, where new SP      \\security_context_to_sid_core+0x184

NSD:C4EABECC| 00 9F B2 EC  0xECB29F00    //<-- R1

NSD:C4EABED0| 13 00 03 80  0x80030013    //<-- R4

NSD:C4EABED4| FF FF FF FF  0xFFFFFFFF      //<-- R5

NSD:C4EABED8| 50 BF EA C4  0xC4EABF50  //<-- R6

NSD:C4EABEDC| 00 9F B2 EC  0xECB29F00  //<-- R7

NSD:C4EABEE0| 50 BF EA C4  0xC4EABF50  //<-- R8

NSD:C4EABEE4| 9C A3 32 C0  0xC032A39C         \\vmlinux\hooks\selinux_inode_notifysecctx+0x20 //<-- R14

NSD:C4EABEE8| 00 00 00 00  0x0   //<<-- SP

NSD:C4EABEEC| C0 BC E2 C4  0xC4E2BCC0

Weird signature is that PC should have been 0xFFFF_FFFF instead of 0xecb29f00.


Another scenario I can think of is that stack is scribbled inside below functions (i.e: memcpy of out-of-bound) 

0xc033a97c <security_context_to_sid_core>:      push    {r4, r5, r6, r7, r8, r9, r10, r11, lr}


0xc033a9c0 <security_context_to_sid_core+68>:   bl      0xc038542c <strcmp>

0xc033a9fc <security_context_to_sid_core+128>:  bl      0xc020b974 <__kmalloc>

0xc033aa10 <security_context_to_sid_core+148>:  bl      0xc037cb00 <memcpy>

0xc033aa28 <security_context_to_sid_core+172>:  bl      0xc01e806c <kstrdup>

0xc033aa44 <security_context_to_sid_core+200>:  bl      0xc0ee8d48 <_raw_read_lock>

0xc033aa5c <security_context_to_sid_core+224>:  bl      0xc033a060 <string_to_context_struct>

0xc033aaa0 <security_context_to_sid_core+292>:  bl      0xc03340a0 <sidtab_context_to_sid>

0xc033aac8 <security_context_to_sid_core+332>:  bl      0xc03333e8 <ebitmap_destroy>

0xc033aadc <security_context_to_sid_core+352>:  bl      0xc037d260 <__memzero>

0xc033aaf4 <security_context_to_sid_core+376>:  bl      0xc020b2f8 <kfree>


Since the specific target device keeps crashing randomly, we swap the chipset from such device. After that, the crash disappears.


From callstack, "ndroid.systemui" process is executing inside arch_spin_lock() function. And then watchdog reset is triggered.

-000|aee_wdt_atf_entry()

-001|prefetchw(inline)

-001|arch_spin_lock(inline)

-001|do_raw_spin_lock_flags(inline)

-001|__raw_spin_lock_irqsave(inline)

-001|raw_spin_lock_irqsave(lock = 0xDD82725C)

-002|__wake_up(q = 0xDD82725C, mode = 1, nr_exclusive = 1, key = 0x0)

-003|mmc_wait_data_done

-004|msdc_irq_data_complete(host = 0xDD827280, ?, error = 0)

-005|msdc_irq(?, dev_id = 0xDD13B9C8)

-006|static_key_count(inline)

-006|static_key_false(inline)

-006|trace_irq_handler_exit(inline)

-006|handle_irq_event_percpu(?, action = 0xDD13B9C8)

-007|handle_irq_event(desc = 0xC0F38440)

-008|cond_unmask_eoi_irq(inline)

-008|handle_fasteoi_irq(?, desc = 0xC0F38440)

-009|static_key_count(inline)

-009|static_key_false(inline)

-009|trace_irq_exit(inline)

-009|__handle_domain_irq(domain = 0xDE806000, hwirq = 111, lookup = TRUE, ?)

-010|__raw_readl(inline)

-010|gic_handle_irq(regs = 0xDD13BFB0)

-011|__irq_usr(asm)


Now, it is time to analyze the spinlock field first. Looking at spinlock instance 0xDD82725C, the each owner and next filed is as below.

[owner: 0x56AC, next: 0x56AD]

(raw_spinlock_t *)0xDD82725C = 0xDD82725C -> (

  raw_lock = (

    slock = 0x56AD56AB,

       tickets = (

         owner = 0x56AB, // <<--

         next = 0x56AD))) // <<--


The fact that next field is larger than owner means the other process already acquired spinlock beforehand.


         

When scanning memory dump who is accessing  0xDD82725C spinlock instance, several candidate processes can be seen as below.

(where)

(raw_spinlock_t *)0xDD82725C

c2d01b78: dd82725c

c2d01b7c: dd82725c

c44f0994: dd82725c

dd13bd60: dd82725c // "ndroid.systemui"

dd13bd9c: dd82725c // "ndroid.systemui"

dd13bdc8: dd82725c // "ndroid.systemui"

ddbefdb8: dd82725c // <<--"mmcqd/0"

ddbefdd0: dd82725c

e0abd994: dd82725c


From the candidate list above, "mmcqd/0" is accessing  0xDD82725C spinlock instance.

v.v % (struct thread_info*)(0xDDBEFDB8 & ~0x1fff)

  (long unsigned int) swap_in = 0 = 0x0 = '....',

  (long unsigned int) swap_out = 0 = 0x0 = '....',

  (struct task_cputime) cputime_expires = ((cputime_t) utime = 0 = 0x0 = '....', (cputime_t) stime = 0 = 0x0 = '....', (long lo

  (struct list_head [3]) cpu_timers = ([0] = ((struct list_head *) next = 0xDDB49A08 = -> ((struct list_head *) next = 0xDDB49

  (struct cred *) real_cred = 0xDDB71200 = -> ((atomic_t) usage = ((int) counter = 2 = 0x2 = '....'), (kuid_t) uid = ((uid_t)

  (struct cred *) cred = 0xDDB71200 = -> ((atomic_t) usage = ((int) counter = 2 = 0x2 = '....'), (kuid_t) uid = ((uid_t) val =

   (char [16]) comm = "mmcqd/0",


The callstacks of "mmcqd/0" process can be observed as follows. 

-000|schedule()

-001|mmc_wait_for_data_req_done(inline)

-001|mmc_start_req(host = 0xC0F75D0C, areq = 0xC0F644E0, error = 0xC102CB08)

-002|mmc_blk_issue_rw_rq(mq = 0xDDB64408, rqc = 0x0)

-003|mmc_blk_issue_rq(mq = 0xDDB64408, req = 0x0)

-004|mmc_queue_thread(d = 0xDDB64408)

-005|kthread(_create = 0xDDB8BDC0)

-006|ret_from_fork(asm)

-007|ret_fast_syscall(asm)


Looking into mmc_wait_for_data_req_done(), the spinlock is acquired using spin_lock_irqsave(&context_info->lock, flags).

static int mmc_wait_for_data_req_done(struct mmc_host *host,

struct mmc_request *mrq,

struct mmc_async_req *next_req)

{

    struct mmc_command *cmd;

    struct mmc_context_info *context_info = &host->context_info;

    int err;

    unsigned long flags;

    while (1) {

        wait_event_interruptible(context_info->wait,

                                               (context_info->is_done_rcv ||

                                               context_info->is_new_req));

        spin_lock_irqsave(&context_info->lock, flags);

        context_info->is_waiting_last_req = false;


Looking into code snippet more, below functions are executed among callbacks of "ndroid.systemui" processes.

-003|mmc_wait_data_done

-004|msdc_irq_data_complete(host = 0xDD827280, ?, error = 0)


mmc_wait_data_done() 함수를 보면 mrq->host->context_info.is_done_rcv = true로 설정하고,

&mrq->host->context_info.wait wait queue을 깨우는 역할을 수행했다.

static void mmc_wait_data_done(struct mmc_request *mrq){

    mrq->host->context_info.is_done_rcv = true;

     wake_up_interruptible(&mrq->host->context_info.wait);

}


그런데 mmc_wait_for_data_req_done() 함수의 아래 구문은 context_info->is_done_rcv, context_info->is_new_req가 true가

될 때 까지 기다리는 코드다.

wait_event_interruptible(context_info->wait,

  (context_info->is_done_rcv ||

  context_info->is_new_req));


정리해보면, race condition으로 wake_up_interruptible(&mrq->host->context_info.wait); 함수를 통해 wait_queue에 queue된 

&mrq->host->context_info.wait으로 등록된 프로세스를 깨우기 전에 아래 구문이 풀려버려 이미 깨어버린 경우임을 짐작할 수 있다.


잠을 자고 있는 누군가를 깨우려고 손을 대려고 하는데 이미 깨버린 경우와 비슷하네.


In-depth Analysis

내 추정이 맞다면 아래 코드를 삽입하면 바로 커널 패닉이 나야 한다.

(램덤프 force_mmc_wait_data_done_crash)

static void mmc_wait_data_done(struct mmc_request mrq){*

    mrq->host->context_info.is_done_rcv = true;

    + mdelay(1);

    wake_up_interruptible(&mrq->host->context_info.wait);

}


그런데 바로 커널 패닉이 발생했다.

-000|do_DataAbort(addr = 0, fsr = 0, regs = 0x0)

-001|__dabt_svc(asm)

-->|exception

-002|debug_spin_lock_before(inline)

-002|do_raw_spin_lock(lock = 0x02B4)

-003|raw_spin_lock_irqsave(lock = 0x02B4)

-004|__wake_up(q = 0x02B4, mode = 1, nr_exclusive = 1, key = 0x0)

-005|mmc_wait_data_done(mrq = 0xDC9725F8)

-006|msdc_irq_data_complete(host = 0xDC919300, ?, error = 0)

-007|msdc_irq(?, dev_id = 0x1)

-008|handle_irq_event_percpu(?, action = 0xDC91E0C0)

-009|handle_irq_event(desc = 0xC121A440)

-010|cond_unmask_eoi_irq(inline)

-010|handle_fasteoi_irq(?, desc = 0xC121A440)

-011|static_key_count(inline)

-011|static_key_false(inline)

-011|trace_irq_exit(inline)

-011|__handle_domain_irq(domain = 0xDE806000, hwirq = 111, lookup = TRUE, ?)

-012|__raw_readl(inline)

-012|gic_handle_irq(regs = 0xCB773E10)

-013|__irq_svc(asm)

-->|exception

-014|current_thread_info(inline)

-014|check_preemption_disabled(what1 = 0xC0F115AC, what2 = 0xC0EE0B28)

-015|debug_smp_processor_id()

-016|MT_trace_preempt_on()

-017|preempt_count_sub

-018|current_thread_info(inline)

-018|preempt_count(inline)

-018|should_resched(inline)

-018|__percpu_counter_add(fbc = 0xD8AC7178, amount = 0, batch = 32)

-019|__sb_start_write(sb = 0xC0F115AC, level = -1058141400, wait = FALSE)

-020|file_start_write(inline)

-020|vfs_write(file = 0xD366C480, buf = 0xBEA83A4C, ?, pos = 0xCB773F78)

-021|SYSC_write(inline)

-021|sys_write(?, buf = -1096271284, count = 146)

-022|ret_fast_syscall(asm)


mrq->host->context_info.wait 변수를 확인해보면, 이미 mrq->host 변수가 0이다.

(struct mmc_request *)0xDC9725F8 = 0xDC9725F8 -> (

    sbc = 0xDC97262C,

    cmd = 0xDC972660,

    data = 0xDC9726C8,

    stop = 0xDC972694,

    completion = (done = 0x0, wait = (lock = (rlock = (raw_lock = (slock = 0x0,

      done = 0x0,

     host = 0x0)


어머 그런데, mmcqd/0 프로세스가 mmc_wait_for_data_req_done() 코드를 이미 수행했네?

-000|context_switch(inline)

-000|__schedule()

-001|schedule()

-002|mmc_wait_for_data_req_done(inline)

-002|mmc_start_req(host = 0xDC919000, areq = 0xDC972710, ?)

-003|mmc_blk_issue_rw_rq(mq = 0x0, rqc = 0xC64B18C0)

-004|mmc_blk_issue_rq(mq = 0xDC972494, req = 0xC64B18C0)

-005|mmc_queue_thread(d = 0xDC972494)

-006|kthread(_create = 0xDCA24E40)

-007|ret_from_fork(asm)

-008|ret_fast_syscall(asm)


@ftrace log를 좀 더 추가해보자.

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c

index db74413..fcf6692 100644

? a/drivers/mmc/core/core.c

+++ b/drivers/mmc/core/core.c

@@ -1034,6 +1034,9 @@ out:

}

EXPORT_SYMBOL(mmc_start_bkops);

#endif

/*

mmc_wait_data_done() - done callback for data request

@mrq: done data request

@@ -1042,10 +1045,25 @@ EXPORT_SYMBOL(mmc_start_bkops);

*/

static void mmc_wait_data_done(struct mmc_request *mrq)

static void mmc_wait_data_done(struct mmc_request mrq) *{

        mrq->host->context_info.is_done_rcv = true;


+      trace_printk("+msdc [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0));

        mdelay(1);

+      trace_printk("++msdc [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0));

        wake_up_interruptible(&mrq->host->context_info.wait);

+      trace_printk("--msdc [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0));


역시나 GIC mtk-msdc(111) IRQ가 Trigger된 다음에 아래와 같이 커널 패닉이 발생하였네.

(램덤프: neo_more_crash_ftrace)

-000|do_DataAbort(addr = 0, fsr = 0, regs = 0x0)

-001|__dabt_svc(asm)

-->|exception

-002|debug_spin_lock_before(inline)

-002|do_raw_spin_lock(lock = 0x02B4)

-003|raw_spin_lock_irqsave(lock = 0x02B4)

-004|__wake_up(q = 0x02B4, mode = 1, nr_exclusive = 1, key = 0x0)

-005|mmc_wait_data_done(mrq = 0xDC9235F8)

-006|msdc_irq_data_complete(host = 0xDC927300, ?, error = 0)

-007|msdc_irq(?, dev_id = 0x1)

-008|handle_irq_event_percpu(?, action = 0xDC92B180)

-009|handle_irq_event(desc = 0xC121A440)

-010|cond_unmask_eoi_irq(inline)

-010|handle_fasteoi_irq(?, desc = 0xC121A440)

-011|static_key_count(inline)

-011|static_key_false(inline)

-011|trace_irq_exit(inline)

-011|__handle_domain_irq(domain = 0xDE806000, hwirq = 111, lookup = TRUE, ?)

-012|__raw_readl(inline)

-012|gic_handle_irq(regs = 0xC2631E20)

-013|__irq_svc(asm)

-->|exception

-014|__fget(?, mask = 16384)

-015|__fget_light(?, mask = 16384)

-016|__fdget

-017|sockfd_lookup_light(?, err = 0xC2631EC8, fput_needed = 0xC2631ECC)

-018|SYSC_recvfrom(inline)

-018|sys_recvfrom(?, ubuf = -1212542344, ?, flags = 64, addr = 0, addr_len = 0)

-019|ret_fast_syscall(asm)

Ftrace log 분석 좀 해보자.

커널 패닉이 일어나기 직전 상황

[1]: irq=111 name=mtk-msdc 가 Trigger됨

[2]: IRQ context에서 mmc_wait_data_done() 함수가 호출 from(msdc_irq_data_complete)

[2]~[3] 사이 mmcq0 process가 context switching 되지 않고 수행 ( InputDispatcher-1538)

[3]: mdelay(1); 이후 찍히는 ftrace log

[4]: wake_up_interruptible(&mrq->host->context_info.wait); API 호출로 mmcq0 process을 wakeup 시킴

CPU5: mmcqd/0, pid=244

sched_wakeup: comm=mmcqd/0 pid=244 prio=120 success=1 target_cpu=005 state=W

[5]: wake_up_interruptible(&mrq->host->context_info.wait); API 호출이 끝난 후 찍히는 로그(별 의미 없음)


kworker/4:0-25 [004] 149.827488: irq_handler_entry: irq=111 name=mtk-msdc // <<--[1]

kworker/4:0-25 [004] 149.827495: bprint: mmc_wait_data_done: +msdc [F: mmc_wait_data_done, L:1049]: ? ? from(msdc_irq_data_complete) // <<--[2]

InputDispatcher-1538 [005] 149.827496: sched_migrate_task: comm=m.lge.launcher3 pid=2135 prio=120 orig_cpu=6 dest_cpu=4 state=W

InputDispatcher-1538 [005] 149.827519: sched_wakeup: comm=m.lge.launcher3 pid=2135 prio=120 success=1 target_cpu=004 state=W

InputDispatcher-1538 [005] 149.827591: sched_migrate_task: comm=ndroid.systemui pid=1622 prio=120 orig_cpu=5 dest_cpu=6 state=W

InputDispatcher-1538 [005] 149.827608: sched_wakeup: comm=ndroid.systemui pid=1622 prio=120 success=1 target_cpu=006 state=W

InputDispatcher-1538 [005] 149.827675: sched_migrate_task: comm=android.ui pid=1287 prio=118 orig_cpu=4 dest_cpu=6 state=W

InputDispatcher-1538 [005] 149.827694: sched_wakeup: comm=android.ui pid=1287 prio=118 success=1 target_cpu=006 state=W

InputDispatcher-1538 [005] 149.827855: sched_switch: prev_comm=InputDispatcher prev_pid=1538 prev_prio=112 prev_state=S ==> next_comm=swapper/5 next_pid=0 next_prio=120 extra_prev_state=K|m|d 

<idle>-0 [005] 149.827912: sched_switch: prev_comm=swapper/5 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=InputDispatcher next_pid=1538 next_prio=112 extra_prev_state=K|m|d 

InputDispatcher-1538 [005] 149.828064: sched_switch: prev_comm=InputDispatcher prev_pid=1538 prev_prio=112 prev_state=S ==> next_comm=swapper/5 next_pid=0 next_prio=120 extra_prev_state=K|m|d 

<idle>-0 [005] 149.828209: sched_switch: prev_comm=swapper/5 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=InputDispatcher next_pid=1538 next_prio=112 extra_prev_state=K|m|d 

InputDispatcher-1538 [005] 149.828352: sched_switch: prev_comm=InputDispatcher prev_pid=1538 prev_prio=112 prev_state=S ==> next_comm=swapper/5 next_pid=0 next_prio=120 extra_prev_state=K|m|d 

kworker/4:0-25 [004] 149.828497: bprint: mmc_wait_data_done: ++msdc [F: mmc_wait_data_done, L:1058]: ? ? from(msdc_irq_data_complete) // <<--[3]

kworker/4:0-25 [004] 149.828521: sched_wakeup: comm=mmcqd/0 pid=244 prio=120 success=1 target_cpu=005 state=W // <<--[4]

kworker/4:0-25 [004] 149.828525: bprint: mmc_wait_data_done: -msdc [F: mmc_wait_data_done, L:1060]: ? ? from(msdc_irq_data_complete) <<-[5]

<idle>0 [005] 149.828528: sched_switch: prev_comm=swapper/5 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=mmcqd/0 next_pid=244 next_prio=120 extra_prev_state=K|m|d <<-[6]

kworker/4:0-25 [004] 149.828530: irq_handler_exit: irq=111 ret=handled

1046 static void mmc_wait_data_done(struct mmc_request *mrq)

1047*{*

1048 mrq->host->context_info.is_done_rcv = true;

1049 trace_printk("+msdc [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0)); // <<--[2]

//생략

1051 mdelay(1);

//생략

1057

1058 trace_printk("++msdc [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0)); // <<--[3]

1059 wake_up_interruptible(&mrq->host->context_info.wait);

1060 trace_printk("-msdc [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0)); // <<-[5]


커널 패닉 상황

[7]: irq=111 name=mtk-msdc 가 handle 되었음

[8]: [mmcqd/0-244, CPU5]mmc_wait_for_data_req_done() 함수 내[from(mmc_blk_issue_rw_rq)] wait_event_interruptible()에서 빠져나옴.

[9][10]: [mmcqd/0-244, CPU5]mmc_wait_for_data_req_done() 함수를 빠져나옴

[11]: irq=111 name=mtk-msdc 가 Trigger됨 --> CPU4의 m.lge.launcher3-2135

[12]: [mmcqd/0-244, CPU5] 깨어난 mmcqd/0-244 프로세스 활동 시작

&mqrq->brq 변수를 0으로 밀어버림 (where: memset(brq, 0, sizeof(struct mmc_blk_request));

[13]: [m.lge.launcher3-2135 CPU4] irq=111 name=mtk-msdc IRQ로 Trigger되어 실행되어

mmc_wait_data_done() 함수 진입 

[14][15]: [mmcqd/0-244, CPU5] mmc_blk_rw_rq_prep() 함수 퇴장하려함

[16][17][18] [mmcqd/0-244, CPU5] mmc_wait_for_data_req_done() 함수 처리 후 퇴장하려함

[19]: [m.lge.launcher3-2135 CPU4] wake_up_interruptible(&mrq->host->context_info.wait);

[20] 커널 패닉

[12]번 동작에서 0으로 초기화된 &mrq->host 변수를 통해 밀린((where: memset(brq, 0, sizeof(struct mmc_blk_request)) wake_up_interruptible() API 호출

kworker/4:0-25 [004] 149.828530: irq_handler_exit: irq=111 ret=handled // <<--[7]

mmcqd/0-244 [005] 149.828536: bprint: mmc_start_req: ++mmcq [F: mmc_wait_for_data_req_done, L:1236]: ? ? from(mmc_blk_issue_rw_rq) // <<--[8]

kworker/4:0-25 [004] 149.828537: irq_handler_entry: irq=30 name=arch_timer

kworker/4:0-25 [004] 149.828546: irq_handler_exit: irq=30 ret=handled

mmcqd/0-244 [005] 149.828559: bprint: mmc_start_req: mmcq [F: mmc_wait_for_data_req_done, L:1268]: ? ? from(mmc_blk_issue_rw_rq) // <<-[9]

mmcqd/0-244 [005] 149.828561: bprint: mmc_start_req: mmcq release [F: mmc_wait_for_data_req_done, L:1270]: ? ? from(mmc_blk_issue_rw_rq) // <<--[10]

kworker/4:0-25 [004] 149.828584: workqueue_execute_end: work struct 0xcb40cae4

kworker/4:0-25 [004] 149.828630: sched_switch: prev_comm=kworker/4:0 prev_pid=25 prev_prio=120 prev_state=S ==> next_comm=m.lge.launcher3 next_pid=2135 next_prio=120 extra_prev_state=K|m|d 

m.lge.launcher3-2135 [004] 149.828660: irq_handler_entry: irq=111 name=mtk-msdc // <<--[11]

mmcqd/0-244 [005] 149.828661: bprint: mmc_blk_rw_rq_prep: ++mmc_blk_rw_rq_prep [F: mmc_blk_rw_rq_prep, L:1725]: ? ? from(mmc_blk_issue_rw_rq) // <<--[12]

m.lge.launcher3-2135 [004] 149.828666: bprint: mmc_wait_data_done: +msdc [F: mmc_wait_data_done, L:1049]: ? ? from(msdc_irq_data_complete) // <<--[13]

mmcqd/0-244 [005] 149.828670: bprint: mmc_blk_rw_rq_prep: -mmc_blk_rw_rq_prep [F: mmc_blk_rw_rq_prep, L:1905]: ? ? from(mmc_blk_issue_rw_rq) // <<-[14]

mmcqd/0-244 [005] 149.828672: bprint: mmc_blk_rw_rq_prep: mmc_blk_rw_rq_prep [F: mmc_blk_rw_rq_prep, L:1907]: ? ? from(mmc_blk_issue_rw_rq) // <<-[15]

mmcqd/0-244 [005] 149.828681: bprint: mmc_start_req: +mmcq [F: mmc_wait_for_data_req_done, L:1231]: ? ? from(mmc_blk_issue_rw_rq) // <<--[16]

mmcqd/0-244 [005] 149.828683: bprint: mmc_start_req: ++mmcq [F: mmc_wait_for_data_req_done, L:1236]: ? ? from(mmc_blk_issue_rw_rq) // <<--[17]

mmcqd/0-244 [005] 149.828695: bprint: mmc_start_req: -mmcq [F: mmc_wait_for_dat뗥nQ, L:0]: ? ? from(0x726f776bS) // <<--[18]

m.lge.launcher3-2135 [004] 149.829669: bprint: mmc_wait_data_done: ++msdc [F: mmc_wait_data_done, L:1058]: ? ? from(msdc_irq_data_complete) // <<--[19]

m.lge.launcher3-2135 [004] 149.829748: tracing_on: ftrace is disabled caller=oops_enter // <<?[20]


1222 static int mmc_wait_for_data_req_done(struct mmc_host *host,

1223 struct mmc_request *mrq,

1224 struct mmc_async_req *next_req)

1225*{*

1226 struct mmc_command *cmd;

1227 struct mmc_context_info *context_info = &host->context_info;

1228 int err;

1229 unsigned long flags;

1230

1231 trace_printk("+mmcq [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0)); // <<--[16]

1232 while (1) {

1233 wait_event_interruptible(context_info->wait,

1234 (context_info->is_done_rcv ||

1235 context_info->is_new_req));

1236 trace_printk("++mmcq [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0)); // <<--[8], [17]

1237 spin_lock_irqsave(&context_info->lock, flags);

//snip

1268 trace_printk("mmcq [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0)); // <<-[9], [18]

1269 mmc_retune_release(host);

1270 trace_printk("mmcq release [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0));

1271 return err;

1272 }


1701 static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq,

1702 struct mmc_card *card,

1703 int disable_multi,

1704 struct mmc_queue *mq)

1705*{*

1706 u32 readcmd, writecmd;

1707 struct mmc_blk_request *brq = &mqrq->brq;

//생략

1724 memset(brq, 0, sizeof(struct mmc_blk_request));

1725 trace_printk("++mmc_blk_rw_rq_prep [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0)); // <<--[12]

1726 brq->mrq.cmd = &brq->cmd;

//생략

1905 trace_printk("-mmc_blk_rw_rq_prep [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0)); //<<-[14]

1906 mmc_queue_bounce_pre(mqrq);

1907 trace_printk("-mmc_blk_rw_rq_prep [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0));

1908 }


1046 static void mmc_wait_data_done(struct mmc_request *mrq)

1047*{*

1048 mrq->host->context_info.is_done_rcv = true;

1049 trace_printk("+msdc [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0)); // <<--[13]

//생략

1051 mdelay(1);

//생략

1057

1058 trace_printk("++msdc [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0)); // <<--[19]

1059 wake_up_interruptible(&mrq->host->context_info.wait);

1060 trace_printk("--msdc [F: %s, L:%d]: ? ? from(%pS)\n", _func,LINE, (void *)_builtin_return_address(0));


정리하면 wake_up_interruptible() API에 파라미터로 전달되는 &mrq->host->context_info.wait를 다른 값(mmcqd를 깨울 수 있는 wait_queue)으로 바꾸거나,

wake_up_interruptible() 호출 시 조건을 줘야 한다. 혹은 spinlock irq save로 코드 보호


솔류션


분석된 내용에 따라 아래 패치를 적용하였더니 문제가 사라졌다. 

 diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c

index c3abe34..01e4bfc 100644

--- a/drivers/mmc/core/core.c

+++ b/drivers/mmc/core/core.c

@@ -403,8 +403,13 @@ EXPORT_SYMBOL(mmc_start_bkops);

*/

static void mmc_wait_data_done(struct mmc_request *mrq)

{

+       unsigned long flags;

+       struct mmc_context_info *context_info = &mrq->host->context_info;

+

+       spin_lock_irqsave(&context_info->lock, flags);

         mrq->host->context_info.is_done_rcv = true;

         wake_up_interruptible(&mrq->host->context_info.wait);

+       spin_unlock_irqrestore(&context_info->lock, flags);

}

static void mmc_wait_done(struct mmc_request *mrq)

@@ -465,6 +470,7 @@ static int mmc_wait_for_data_req_done(struct mmc_host *host,

         struct mmc_command *cmd;

         struct mmc_context_info *context_info = &host->context_info;

         int err;

+       bool is_done_rcv = false;

         unsigned long flags;

         while (1) {

@@ -472,9 +478,10 @@ static int mmc_wait_for_data_req_done(struct mmc_host *host,

        (context_info->is_done_rcv ||

                     context_info->is_new_req));

                     spin_lock_irqsave(&context_info->lock, flags);

+          is_done_rcv = context_info->is_done_rcv;

            context_info->is_waiting_last_req = false;

            spin_unlock_irqrestore(&context_info->lock, flags);

-          if (context_info->is_done_rcv) {

+         if (is_done_rcv) {

                  context_info->is_done_rcv = false;

                  context_info->is_new_req = false;

                  cmd = mrq->cmd;


By the way, relevant patch has been already released inside Linux Kernel Community. 

https://patchwork.kernel.org/patch/7089351/

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c

index 347108a..db74413 100644

--- a/drivers/mmc/core/core.c

+++ b/drivers/mmc/core/core.c

@@ -1042,8 +1042,10 @@ EXPORT_SYMBOL(mmc_start_bkops);

*/

static void mmc_wait_data_done(struct mmc_request *mrq)

{

-       mrq->host->context_info.is_done_rcv = true;

-       wake_up_interruptible(&mrq->host->context_info.wait);

+      struct mmc_context_info *context_info = &mrq->host->context_info;

+

+      context_info->is_done_rcv = true;

+      wake_up_interruptible(&context_info->wait);

}


Very interesting kernel panic occurs with the following callstacks.

crash> bt -I C0ED8B64 -S E853FD20  e5752c00

PID: 1787   TASK: e5752c00  CPU: 4   COMMAND: "python"

bt: WARNING:  stack address:0xe853fa38, program counter:0xc0ee5b60

 #0 [<c0ed8b64>] (panic) from [<c0125038>]

 #1 [<c0125038>] (__stack_chk_fail) from [<c032b6cc>]

 #2 [<c032b6cc>] (sock_has_perm) from [<c0327d00>]

 #3 [<c0327d00>] (security_socket_recvmsg) from [<c0ceb1c8>]

 #4 [<c0ceb1c8>] (sock_recvmsg) from [<c0cec474>]

 #5 [<c0cec474>] (___sys_recvmsg) from [<c0ced5b4>]

 #6 [<c0ced5b4>] (__sys_recvmsg) from [<c0106820>]


[1]: sock_has_perm() is first called from security_socket_recvmsg().. Stack: 0xE853FDD8

[2]: After a set of registers are pushed into stack area.. Stack: 0xE853FDBC(0xE853FDD8-0x1C)

[3]: The stack is pushed for local variable Stack: 0xE853FD68(0xE853FDBC-0x54)

[4]: R7 is holding 0xC1809948 where __stack_chk_guard is located.

(where: T32)

NSP:C032B6D4|C1809948            dcd     0xC1809948       ; __stack_chk_guard

(where: binary utility)

c032b6d4:       c1809948        .word   0xc1809948


crash> p &__stack_chk_guard

$3 = (unsigned long *) 0xc1809948 <__stack_chk_guard>


[5]: R3 is now holding stack canary magic dump 0xdc8cb01f where  R3 = 0xdc8cb01f =  *0xC1809948

crash> rd 0xC1809948

c1809948:  dc8cb01f    


[6]: The stack canary magic dump(0xdc8cb01f: R3) is stored into 0xE853FDB4(=0xE853FD68+0x4c = SP+0x4c)

[7]: To get stack canary magic dump, access 0xE853FDB4 where "ldr r2, [sp, #76] ; 0x4c" instruction is executed.

In this case, R2 is updated as 0xdc8cb01f.


[8]: R3 should have been 0xdc8cb01f instead of 0x0. The weird debug signature is that R7 holds 0xC1800048, which should have been 0xC1809948.

After the following instruction is executed, R7 is never changed within sock_has_perm().

  • 0xc032b638 <sock_has_perm+8>: ldr r7, [pc, #148]

[9]: kernel panic occurs


#Code

0xc032b630 <sock_has_perm>:     push    {r4, r5, r6, r7, r8, r9, lr} //<<--[2]

0xc032b634 <sock_has_perm+0x4>: sub     sp, sp, #84     ; 0x54  //<<--[3]

0xc032b638 <sock_has_perm+0x8>: ldr     r7, [pc, #148]  ; 0xc032b6d4 <sock_has_perm+0xa4> //<<--[4]

0xc032b63c <sock_has_perm+0xc>: add     r6, sp, #28

0xc032b640 <sock_has_perm+0x10>:        mov     r9, r0

0xc032b644 <sock_has_perm+0x14>:        ldr     r4, [r1, #420]  ; 0x1a4

0xc032b648 <sock_has_perm+0x18>:        mov     r5, r1

0xc032b64c <sock_has_perm+0x1c>:        mov     r8, r2

0xc032b650 <sock_has_perm+0x20>:        ldr     r3, [r7]  //<<--[5]

0xc032b654 <sock_has_perm+36>:  mov     r1, #0

0xc032b658 <sock_has_perm+40>:  mov     r2, #48 ; 0x30

0xc032b65c <sock_has_perm+44>:  mov     r0, r6

0xc032b660 <sock_has_perm+48>:  str     r3, [sp, #76]   ; 0x4c //<<--[6]

// snip

0xc032b6b8 <sock_has_perm+0x88>:        ldr     r2, [sp, #76]   ; 0x4c  //<<--[7]

0xc032b6bc <sock_has_perm+0x8c>:        ldr     r3, [r7]  //<<--[8]

0xc032b6c0 <sock_has_perm+0x90>:        cmp     r2, r3

0xc032b6c4 <sock_has_perm+0x94>:        beq     0xc032b6cc <sock_has_perm+156>

0xc032b6c8 <sock_has_perm+0x98>:        bl      0xc0125028 <__stack_chk_fail>  //<<--[9]


#Stack dump

NSD:E853FD5C| 00 00 00 00  0x0

NSD:E853FD60| 00 00 00 00  0x0

NSD:E853FD64| CC B6 32 C0  0xC032B6CC         \\vmlinux\hooks\sock_has_perm+0x9C

NSD:E853FD68| 74 FD 53 E8  0xE853FD74  //<<--[3]

NSD:E853FD6C| A0 FD 53 E8  0xE853FDA0

//snip

NSD:E853FDAC| 00 00 00 00  0x0

NSD:E853FDB0| 00 00 00 00  0x0

NSD:E853FDB4| 1F B0 8C DC  0xDC8CB01F  //<<--[6] location where stack canary magic is saved.

NSD:E853FDB8| 00 0A 82 ED  0xED820A00

NSD:E853FDBC| 00 9C C9 C3  0xC3C99C00  //<<--[R4]  //<<--[2] New Stack

NSD:E853FDC0| 00 00 01 00  0x10000    //<<--[R5]

NSD:E853FDC4| 7C FF 53 E8  0xE853FF7C   //<<--[R6]

NSD:E853FDC8| 00 00 00 00  0x0

NSD:E853FDCC| 00 2C 75 E5  0xE5752C00

NSD:E853FDD0| 00 00 00 00  0x0 //<<--[R9]

NSD:E853FDD4| 00 7D 32 C0  0xC0327D00  \\vmlinux\security\security_socket_recvmsg+0x14 //<<--[R14]

NSD:E853FDD8| 00 00 00 00  0x0  // <<--[1]


Please remember that  __stack_chk_fail symbol is pushed into the assemble code inside other Kernel APIs for stack corruption sanity-check.

0xc0100974 <do_one_initcall+404>:       bl      0xc0125028 <__stack_chk_fail>

0xc0100d8c <name_to_dev_t+828>: bl      0xc0125028 <__stack_chk_fail>

0xc0107854 <__show_regs+700>:   bl      0xc0125028 <__stack_chk_fail>

0xc01219c0 <sha384_neon_final+108>:     bl      0xc0125028 <__stack_chk_fail>

0xc0133c34 <sys_newuname+352>:  bl      0xc0125028 <__stack_chk_fail>

0xc0133d68 <sys_sethostname+276>:       bl      0xc0125028 <__stack_chk_fail>


In case of this coredump, the Linux Kernel is complied with fstack-protector option.

658 ifdef CONFIG_CC_STACKPROTECTOR_REGULAR

 659   stackp-flag := -fstack-protector

 660   ifeq ($(call cc-option, $(stackp-flag)),)

 661     $(warning Cannot use CONFIG_CC_STACKPROTECTOR_REGULAR: \

 662              -fstack-protector not supported by compiler)

 663   endif



+ Recent posts