Kernel panic log

 2107.232713 / 01-01 11:11:03.809][7] init: cannot find '/system/bin/qrngp' (No such file or directory), disabling 'qrngp'

[ 2107.239317 / 01-01 11:11:03.809][5] Unable to handle kernel NULL pointer dereference at virtual address 00000028

[ 2107.239351 / 01-01 11:11:03.809][5] pgd = e37ec000

[ 2107.239366 / 01-01 11:11:03.809][0] [00000028] *pgd=00000000

[ 2107.239388 / 01-01 11:11:03.809][5] Internal error: Oops: 5 [#1] PREEMPT SMP ARM

[ 2107.239405 / 01-01 11:11:03.809][0] Modules linked in: texfat(PO)

[ 2107.239433 / 01-01 11:11:03.809][5] CPU: 5 PID: 2803 Comm: sensorservice Tainted: P        W  O   3.18.31-perf-gd069b48-00001-g8a6d6e5 #1

[ 2107.239452 / 01-01 11:11:03.809][5] task: e3ffb700 ti: e37f4000 task.ti: e37f4000

[ 2107.239479 / 01-01 11:11:03.809][5] PC is at find_vma_links+0x1c/0x78

[ 2107.239499 / 01-01 11:11:03.809][5] LR is at vma_adjust+0x3a0/0x574


Equivalent callstacks can be restored using the following command.

crash>  bt -I C01002D8 -S  E37F5DD0 0xE3FFB700

PID: 2803   TASK: e3ffb700  CPU: 5   COMMAND: "sensorservice"

bt: WARNING:  stack address:0xe37f5b98, program counter:0xc0ee5b60

 #0 [<c01002d8>] (do_DataAbort) from [<c010ad58>]

    pc : [<c01f980c>]    lr : [<c01fa708>]    psr: 200f0013

    sp : e37f5ec4  ip : 00000034  fp : e8236d9c

    r10: 00000000  r9 : 00000000  r8 : b2c99000

    r7 : c4616c80  r6 : e8236d68  r5 : 7f555034  r4 : 7f555034

    r3 : e37f5f04  r2 : b2c97000  r1 : b2c95000  r0 : 7f555038

    Flags: nzCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM

 #1 [<c010ad58>] (__dabt_svc) from [<c01fa708>]

 #2 [<c01f980c>] (find_vma_links) from [<c01fa708>]

 #3 [<c01fa708>] (vma_adjust) from [<c01fa9e4>]

 #4 [<c01fa9e4>] (__split_vma) from [<c01fb364>]

 #5 [<c01fb364>] (do_munmap) from [<c01fb558>]

 #6 [<c01fb558>] (vm_munmap) from [<c0106820>]


Execution profile at the moment of kernel panic

[1]: R12 is updated as 0x34 where 0x34 = *0x7f555038 = *R0

[2]: Data abort is raised since base address for LDR execution is 0x28(0x34-0xC = R12-0xC)

0xc01f97f0 <find_vma_links>:    push    {r4, r5, lr}

0xc01f97f4 <find_vma_links+0x4>:        add     r0, r0, #4

0xc01f97f8 <find_vma_links+0x8>:        mov     r4, #0

0xc01f97fc <find_vma_links+0xc>:        mov     r5, r4

0xc01f9800 <find_vma_links+0x10>:       ldr     r12, [r0]  //<<--[1]

0xc01f9804 <find_vma_links+0x14>:       cmp     r12, #0

0xc01f9808 <find_vma_links+0x18>:       beq     0xc01f983c <find_vma_links+76>

0xc01f980c <find_vma_links+0x1c>:       ldr     r0, [r12, #-12] //<<--[2]


When tracing back to the context where the call to find_vma_links() is made inside vma_adjust(), R0 is copied from R7(0xC4616C80) whose data structure type is (struct mm_struct*) .

0xc01fa6e8 <vma_adjust+896>:    mov     r0, r7

0xc01fa6ec <vma_adjust+900>:    str     r3, [sp]

0xc01fa6f0 <vma_adjust+904>:    add     r3, sp, #60     ; 0x3c

0xc01fa6f4 <vma_adjust+908>:    str     r3, [sp, #4]

0xc01fa6f8 <vma_adjust+912>:    add     r3, sp, #52     ; 0x34

0xc01fa6fc <vma_adjust+916>:    ldr     r1, [r2]

0xc01fa700 <vma_adjust+920>:    ldr     r2, [r2, #4]

0xc01fa704 <vma_adjust+924>:    bl      0xc01f97f0 <find_vma_links>  //<<--


Identical debug information

crash> struct task_struct.mm e3ffb700

  mm = 0xc4616d80


At logical level, when find_vma_links() is called, R0 should have been 0xC4616C80 instead of 0x7f555038. This signature cannot be understandable from SW point of view.


This issue ended up with replacing defective PMIC with normal one in the problematic target device.


Analysis 

Kernel log says that program counter indicates invalid address(0xecb29f00) as below.

[  257.879321 / 01-01 00:04:20.499][1] Unable to handle kernel paging request at virtual address ecb29f00

[  257.879343 / 01-01 00:04:20.499][1] pgd = c4ebc000

[  257.879355 / 01-01 00:04:20.499][0] [ecb29f00] *pgd=6ca1141e(bad)

[  257.879372 / 01-01 00:04:20.499][1] Internal error: Oops: 8000000d [#1] PREEMPT SMP ARM

[  257.879384 / 01-01 00:04:20.499][0] Modules linked in: texfat(PO)

[  257.879403 / 01-01 00:04:20.499][1] CPU: 1 PID: 384 Comm: ueventd Tainted: P        W  O   3.18.31-perf-gd069b48-00001-g8a6d6e5 #1

[  257.879416 / 01-01 00:04:20.499][1] task: eccc4d00 ti: c4eaa000 task.ti: c4eaa000

[  257.879429 / 01-01 00:04:20.499][1] PC is at 0xecb29f00

[  257.879447 / 01-01 00:04:20.499][1] LR is at security_context_to_sid_core+0x184/0x1b0

[  257.879462 / 01-01 00:04:20.499][1] pc : [<ecb29f00>]    lr : [<c033ab00>]    psr: 80030013

[  257.879462 / 01-01 00:04:20.499][1] sp : c4eabed8  ip : e14870c0  fp : b0d70e50

[  257.879479 / 01-01 00:04:20.499][1] r10: 00000000  r9 : c4eaa000  r8 : 00000027

[  257.879492 / 01-01 00:04:20.499][1] r7 : c4e2bcc0  r6 : ebef5cc0  r5 : c4e2e1c0  r4 : 00000000

[  257.879504 / 01-01 00:04:20.499][1] r3 : c193e700  r2 : 00000000  r1 : c193e700  r0 : 00000000

[  257.879517 / 01-01 00:04:20.499][1] Flags: Nzcv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment user


As the linked register(R14) holds 0xc033ab00(security_context_to_sid_core+0x184), ueventd process has been running under the following callstacks.

-000|security_context_to_sid_core(scontext = 0xDEAD, scontext_len = 0, sid = 0xB

-001|security_context_to_sid(?, ?, ?, ?)

-002|selinux_inode_setsecurity(inode = 0x0, ?, value = 0x0, size = 0, flags = 0)

-003|selinux_inode_notifysecctx(?, ?, ?)

-004|security_inode_notifysecctx(?, ?, ?)

-005|kernfs_type(inline)

-005|kernfs_refresh_inode(kn = 0xFFFFFFFF, inode = 0x80030013)

-006|kernfs_iop_getattr(?, ?, stat = 0xC4EABF50)

-007|vfs_getattr_nosec(?, ?)

-008|vfs_fstatat(dfd = 0, filename = 0xC4E2BCC0, stat = 0xC4EABF50, ?)

-009|SYSC_fstatat64(inline)

-009|sys_fstatat64(?, ?, statbuf = -1328228292, ?)

-010|ret_fast_syscall(asm)


When program counter is 0xc033a980, its stack address should have been holding 0xC4EABE94.

In case of 0xc033a984 address, the stack address is updated as 0xC4EABE58(0xC4EABE94-0x3c)

0xc033a97c <security_context_to_sid_core>:      push    {r4, r5, r6, r7, r8, r9, r10, r11, lr}

0xc033a980 <security_context_to_sid_core+4>:    subs    r5, r1, #0

0xc033a984 <security_context_to_sid_core+8>:    sub     sp, sp, #60     ; 0x3c


I am suspecting that stack pop operation is not executed actually according to ARM calling conventions inside below functions.

0xc033a97c <security_context_to_sid_core>:      push    {r4, r5, r6, r7, r8, r9, r10, r11, lr}


0xc033a9c0 <security_context_to_sid_core+68>:   bl      0xc038542c <strcmp>

0xc033a9fc <security_context_to_sid_core+128>:  bl      0xc020b974 <__kmalloc>

0xc033aa10 <security_context_to_sid_core+148>:  bl      0xc037cb00 <memcpy>

0xc033aa28 <security_context_to_sid_core+172>:  bl      0xc01e806c <kstrdup>

0xc033aa44 <security_context_to_sid_core+200>:  bl      0xc0ee8d48 <_raw_read_lock>

0xc033aa5c <security_context_to_sid_core+224>:  bl      0xc033a060 <string_to_context_struct>

0xc033aaa0 <security_context_to_sid_core+292>:  bl      0xc03340a0 <sidtab_context_to_sid>

0xc033aac8 <security_context_to_sid_core+332>:  bl      0xc03333e8 <ebitmap_destroy>

0xc033aadc <security_context_to_sid_core+352>:  bl      0xc037d260 <__memzero>

0xc033aaf4 <security_context_to_sid_core+376>:  bl      0xc020b2f8 <kfree>


(For example)

-000|string_to_context_struct(pol = 0xDEAD, sidtabp = 0x0, scontext = 0xBEEF, sc

-001|security_context_to_sid_core(?, scontext_len = 0, sid = 0x0, ?, gfp_flags =

-002|security_context_to_sid(?, ?, ?, ?)

-003|selinux_inode_setsecurity(inode = 0x0, ?, value = 0x0, size = 0, flags = 0)

-004|selinux_inode_notifysecctx(?, ?, ?)

-005|security_inode_notifysecctx(?, ?, ?)

-006|kernfs_type(inline)

-006|kernfs_refresh_inode(kn = 0xFFFFFFFF, inode = 0x80030013)

-007|kernfs_iop_getattr(?, ?, stat = 0xC4EABF50)

-008|vfs_getattr_nosec(?, ?)

-009|vfs_fstatat(dfd = 0, filename = 0xC4E2BCC0, stat = 0xC4EABF50, ?)

-010|SYSC_fstatat64(inline)

-010|sys_fstatat64(?, ?, statbuf = -1328228292, ?)

-011|ret_fast_syscall(asm)


After the above functions are executed, the stack address should have been updated as 0xC4EABE58 instead of 0xc4eabed8.  

[  257.879462 / 01-01 00:04:20.499][1] pc : [<ecb29f00>]    lr : [<c033ab00>]    psr: 80030013

[  257.879462 / 01-01 00:04:20.499][1] sp : c4eabed8  ip : e14870c0  fp : b0d70e50

[  257.879479 / 01-01 00:04:20.499][1] r10: 00000000  r9 : c4eaa000  r8 : 00000027


The stack dump can be compared between this coredump and normal operation.

this ramdump

the stack in case of normal operation

NSD:C4EABE90| 00 00 00 00  0x0

NSD:C4EABE94| 00 E7 93 C1  0xC193E700         \\vmlinux\Global\__tracepoint_kfree

NSD:C4EABE98| 00 00 00 00  0x0

NSD:C4EABE9C| 00 E7 93 C1  0xC193E700         \\vmlinux\Global\__tracepoint_kfree

NSD:C4EABEA0| 00 00 00 00  0x0

NSD:C4EABEA4| C0 E1 E2 C4  0xC4E2E1C0

NSD:C4EABEA8| C0 5C EF EB  0xEBEF5CC0

NSD:C4EABEAC| C0 BC E2 C4  0xC4E2BCC0

NSD:C4EABEB0| 27 00 00 00  0x27               \\vmlinux\Global\cpu_v7_suspend_size+0x3

NSD:C4EABEB4| 00 A0 EA C4  0xC4EAA000 //<<-R14 should have been \\security_context_to_sid+0x14 

NSD:C4EABEB8| 00 00 00 00  0x0  // <<-- new SP

NSD:C4EABEBC| 50 0E D7 B0  0xB0D70E50

NSD:C4EABEC0| C0 70 48 E1  0xE14870C0

NSD:C4EABEC4| D8 BE EA C4  0xC4EABED8 //<<-- should have been R14  \\selinux_inode_setsecurity+0x48

NSD:C4EABEC8| 00 AB 33 C0  0xC033AB00         \\vmlinux\services\security_context_to_sid_core+0x184

NSD:C4EABECC| 00 9F B2 EC  0xECB29F00

NSD:C4EABED0| 13 00 03 80  0x80030013

NSD:C4EABED4| FF FF FF FF  0xFFFFFFFF

NSD:C4EABED8| 50 BF EA C4  0xC4EABF50  //<<-- SP address at the moment of kernel panic

NSD:C4EABEDC| 00 9F B2 EC  0xECB29F00

NSD:C4EABEE0| 50 BF EA C4  0xC4EABF50

NSD:C4EABEE4| 9C A3 32 C0  0xC032A39C

NSD:C4EABEE8| 00 00 00 00  0x0

NSD:C4EABEEC| C0 BC E2 C4  0xC4E2BCC0

NSD:C4EABE90|_00_00_00_00__0x0

NSD:C4EABE94| CC CC 00 00  0xCCCC //<-- R4 where new SP

NSD:C4EABE98| 00 00 00 00  0x0

NSD:C4EABE9C| 00 00 00 00  0x0

NSD:C4EABEA0| 00 00 00 00  0x0

NSD:C4EABEA4| 00 00 00 00  0x0

NSD:C4EABEA8| 00 00 00 00  0x0

NSD:C4EABEAC| 00 00 00 00  0x0

NSD:C4EABEB0| 00 00 00 00  0x0  //<-- R11

NSD:C4EABEB4| E4 C5 33 C0  0xC033C5E4     //<<-R14 \\security_context_to_sid+0x14

NSD:C4EABEB8| AD DE 00 00  0xDEAD  //<<--R0 where new SP

NSD:C4EABEBC| 00 00 00 00  0x0        //<<--R1

NSD:C4EABEC0| EF BE 00 00  0xBEEF    //<<--R2

NSD:C4EABEC4| D8 A2 32 C0  0xC032A2D8       //<<--R14  \\selinux_inode_setsecurity+0x48

NSD:C4EABEC8| 00 AB 33 C0  0xC033AB00   //<-- R0, where new SP      \\security_context_to_sid_core+0x184

NSD:C4EABECC| 00 9F B2 EC  0xECB29F00    //<-- R1

NSD:C4EABED0| 13 00 03 80  0x80030013    //<-- R4

NSD:C4EABED4| FF FF FF FF  0xFFFFFFFF      //<-- R5

NSD:C4EABED8| 50 BF EA C4  0xC4EABF50  //<-- R6

NSD:C4EABEDC| 00 9F B2 EC  0xECB29F00  //<-- R7

NSD:C4EABEE0| 50 BF EA C4  0xC4EABF50  //<-- R8

NSD:C4EABEE4| 9C A3 32 C0  0xC032A39C         \\vmlinux\hooks\selinux_inode_notifysecctx+0x20 //<-- R14

NSD:C4EABEE8| 00 00 00 00  0x0   //<<-- SP

NSD:C4EABEEC| C0 BC E2 C4  0xC4E2BCC0

Weird signature is that PC should have been 0xFFFF_FFFF instead of 0xecb29f00.


Another scenario I can think of is that stack is scribbled inside below functions (i.e: memcpy of out-of-bound) 

0xc033a97c <security_context_to_sid_core>:      push    {r4, r5, r6, r7, r8, r9, r10, r11, lr}


0xc033a9c0 <security_context_to_sid_core+68>:   bl      0xc038542c <strcmp>

0xc033a9fc <security_context_to_sid_core+128>:  bl      0xc020b974 <__kmalloc>

0xc033aa10 <security_context_to_sid_core+148>:  bl      0xc037cb00 <memcpy>

0xc033aa28 <security_context_to_sid_core+172>:  bl      0xc01e806c <kstrdup>

0xc033aa44 <security_context_to_sid_core+200>:  bl      0xc0ee8d48 <_raw_read_lock>

0xc033aa5c <security_context_to_sid_core+224>:  bl      0xc033a060 <string_to_context_struct>

0xc033aaa0 <security_context_to_sid_core+292>:  bl      0xc03340a0 <sidtab_context_to_sid>

0xc033aac8 <security_context_to_sid_core+332>:  bl      0xc03333e8 <ebitmap_destroy>

0xc033aadc <security_context_to_sid_core+352>:  bl      0xc037d260 <__memzero>

0xc033aaf4 <security_context_to_sid_core+376>:  bl      0xc020b2f8 <kfree>


Since the specific target device keeps crashing randomly, we swap the chipset from such device. After that, the crash disappears.


Kernel panic occurs with defective device under the following call trace.

crash> bt -I C01002D8 -S  E7AABC08 0xE1804200

PID: 2285   TASK: e1804200  CPU: 5   COMMAND: "python"

bt: WARNING:  stack address:0xe7aabd80, program counter:0xc0ee5b60

 #0 [<c01002d8>] (do_DataAbort) from [<c010ad58>]

    pc : [<c01d7308>]    lr : [<c01d72ec>]    psr: 60020193

    sp : e7aabcf8  ip : c193e69c  fp : edf34bf4

    r10: 00000000  r9 : 0000001f  r8 : 00000002

    r7 : c1938280  r6 : c1938200  r5 : 00000010  r4 : ef4bddb4

    r3 : ef4bddb4  r2 : 00000100  r1 : 00000000  r0 : ef4bdda0

    Flags: nZCv  IRQs off  FIQs on  Mode SVC_32  ISA ARM

 #1 [<c010ad58>] (__dabt_svc) from [<c01d72ec>]

 #2 [<c01d7308>] (rmqueue_bulk.constprop.11) from [<c01d7540>]  //<<-- kernel panic

 #3 [<c01d7540>] (get_page_from_freelist) from [<c01d79c4>]

 #4 [<c01d79c4>] (__alloc_pages_nodemask) from [<c01f7bf4>]

 #5 [<c01f7bf4>] (handle_mm_fault) from [<c011525c>]

 #6 [<c011525c>] (do_page_fault) from [<c01002d8>]

 #7 [<c01002d8>] (do_DataAbort) from [<c010b03c>]


The data abort is raised since page.lru->next(R2) holds invalid address 0x100.

0xc01d72f4 <rmqueue_bulk.constprop.11+0x58>:    cmp     r10, #0

0xc01d72f8 <rmqueue_bulk.constprop.11+0x5c>:    add     r3, r0, #20

0xc01d72fc <rmqueue_bulk.constprop.11+0x60>:    ldreq   r2, [r4]

0xc01d7300 <rmqueue_bulk.constprop.11+0x64>:    ldrne   r2, [r4, #4]

0xc01d7304 <rmqueue_bulk.constprop.11+0x68>:    strne   r3, [r4, #4]

0xc01d7308 <rmqueue_bulk.constprop.11+0x6c>:    streq   r3, [r2, #4]  //<<-- data abort

crash> struct page.lru  0xEF4BDDA0  -px

    lru = {

      next = 0x100,  //<<--

      prev = 0x200

    }


After having code review, I have figured out that attribute of page is pcp(per-cpu page frame cache: buddy system, 0 order page)

static int rmqueue_bulk(struct zone *zone, unsigned int order,

   unsigned long count, struct list_head *list,

   int migratetype, bool cold)

{

 int i;


 spin_lock(&zone->lock);

 for (i = 0; i < count; ++i) {

  struct page *page;


//snip

  if (likely(!cold))

   list_add(&page->lru, list);  //<<--

  else

   list_add_tail(&page->lru, list);


To find out pcp address for CPU5, the following command s are used.

crash> p contig_page_data.node_zones[1].pageset

$5 = (struct per_cpu_pageset *) 0xc177ebdc


crash> struct per_cpu_pages  EDF34BDC

struct per_cpu_pages {

  count = 0x1,

  high = 0xba,

  batch = 0x1f,

  lists = {{

      next = 0xef51fc74,  //<<--MIGRATE_UNMOVABLE

      prev = 0xef51fc74

    }, {

      next = 0xedf34bf0, //<<--MIGRATE_RECLAIMABLE

      prev = 0xedf34bf0

    }, {

      next = 0xef4bdcd4,//<<--MIGRATE_MOVABLE

      prev = 0xef4bddf4

    }, {

      next = 0xedf34c00, //<<--MIGRATE_PCPTYPES

      prev = 0xedf34c00

    }}

}


(where) 0xEDF34BDC = 0xc177ebdc+0x2c7b6000

crash> p  __per_cpu_offset[5]

$7 = 0x2c7b6000


BTW the listed list 0xef4bdcd4 address is found to be corrupted as follows. 

crash> list 0x0 0xef4bdcd4

ef4bdcd4

ef4bdcf4

ef4bdd14

ef4bdd34

ef4bdd54

ef4bdd74

ef4bddb4

100

(where)

 #0 [<c01002d8>] (do_DataAbort) from [<c010ad58>]

    pc : [<c01d7308>]    lr : [<c01d72ec>]    psr: 60020193

    sp : e7aabcf8  ip : c193e69c  fp : edf34bf4

    r10: 00000000  r9 : 0000001f  r8 : 00000002

    r7 : c1938280  r6 : c1938200  r5 : 00000010  r4 : ef4bddb4

    r3 : ef4bddb4  r2 : 00000100  r1 : 00000000  r0 : ef4bdda0

    Flags: nZCv  IRQs off  FIQs on  Mode SVC_32  ISA ARM

 #1 [<c010ad58>] (__dabt_svc) from [<c01d72ec>]

 #2 [<c01d7308>] (rmqueue_bulk.constprop.11) from [<c01d7540>]  //<<-- kernel panic

 #3 [<c01d7540>] (get_page_from_freelist) from [<c01d79c4>]


After the device is disassembled again with another PMIC, the crash disappears.


+ Recent posts