01 背景

小概率出现 kernel 异常重启问题:3 个月内出现了 3 例因 Linux kernel 访问异常地址而导致重启的问题。通过场景分析构造出测试场景,并打开 ramdump 配置进行复测,最终抓到了问题现场。

02 crash 解析 ramdump

crash ./vmlinux /dev/random@0x80000000,DDRCS0-1.bin@0x80970000,DDRCS0-2.bin@0x100970000,DDRCS0-3.bin@0x140970000,DDRCS0-4.bin@0x180970000,DDRCS0-5.bin@0x1c0970000,DDRCS0-6.bin@0x200970000,DDRCS0-7.bin@0x240970000 --machdep vabits_actual=48

2.1 查看出错 log

通过 dmesg 命令抓取 log 缓存区,找到出错的日志和调用栈:

crash> dmesg
...
[ 1134.509848] ==================================================================
[ 1134.509888] BUG: KASAN: user-memory-access in cimdma_swap_buffer+0x2a4/0x4e0 [hobot_cim_dma]
[ 1134.510057] Write of size 1 at addr 0000050100000780 by task ipi6_thread/4052
[ 1134.510080]
[ 1134.510092] CPU: 1 PID: 4052 Comm: ipi6_thread Tainted: P           O      5.10.59-rt52-gbdf2977878dd-dirty #2
[ 1134.510119] Hardware name: Horizon AI Technologies, Inc. HOBOT j5 RHODE B2 & C & Ca & Cb & Cc & Cd & Ce (DT)
[ 1134.510138] Call trace:
[ 1134.510147]  dump_backtrace+0x0/0x2e0
[ 1134.510192]  show_stack+0x14/0x20
[ 1134.510224]  dump_stack+0xf8/0x160
[ 1134.510256]  kasan_report+0x1a8/0x200
[ 1134.510284]  __asan_store1+0x9c/0xa8
[ 1134.510308]  cimdma_swap_buffer+0x2a4/0x4e0 [hobot_cim_dma]
[ 1134.510443]  kthread+0x258/0x260
[ 1134.510475]  ret_from_fork+0x10/0x1c
[ 1134.510504] ==================================================================
...

[ 1134.690628] CPU: 6 PID: 4052 Comm: ipi6_thread Tainted: P    B      O      5.10.59-rt52-gbdf2977878dd-dirty #2
[ 1134.693905] Hardware name: Horizon AI Technologies, Inc. HOBOT j5 RHODE B2 & C & Ca & Cb & Cc & Cd & Ce (DT)
[ 1134.695165] pstate: 40c00005 (nZcv daif +PAN +UAO -TCO BTYPE=--)
[ 1134.695951] pc : cimdma_swap_buffer+0x2a4/0x4e0 [hobot_cim_dma]
[ 1134.696841] lr : cimdma_swap_buffer+0x2a4/0x4e0 [hobot_cim_dma]
[ 1134.697726] sp : ffff0001c0e5fc50
[ 1134.698163] pmr_save: 000000e0
[ 1134.698566] x29: ffff0001c0e5fc50 x28: 0000000000000000
[ 1134.699273] x27: 0000000000000000 x26: 0000000000000000
[ 1134.699977] x25: 000000000000004c x24: 0000050100000780
[ 1134.700682] x23: ffff000180e0b3c0 x22: ffff00020d4c0a50
[ 1134.701390] x21: ffff000180e0b1b0 x20: ffff000180e0b248
[ 1134.702099] x19: ffff000180e0b120 x18: 0000000000000000
[ 1134.702805] x17: 0000000000000000 x16: 0000000000000000
[ 1134.703509] x15: 0000000000000000 x14: 3d3d3d3d3d3d3d3d
[ 1134.704215] x13: 3d3d3d3d3d3d3d3d x12: ffff9400025cf1cf
[ 1134.704923] x11: 1ffff400025cf1ce x10: ffff9400025cf1ce
[ 1134.705631] x9 : dfffa00000000000 x8 : ffffa00012e78e70
[ 1134.706340] x7 : 0000000000000001 x6 : ffffa00012e78e70
[ 1134.707045] x5 : 00006bfffda30e32 x4 : dfffa00000000000
[ 1134.707754] x3 : ffffa00010c3c6a8 x2 : 0000000000000007
[ 1134.708459] x1 : ffff00017d1ec4c0 x0 : 0000000000000001
[ 1134.709166] Call trace:
[ 1134.709497]  cimdma_swap_buffer+0x2a4/0x4e0 [hobot_cim_dma]
[ 1134.710340]  kthread+0x258/0x260
[ 1134.710787]  ret_from_fork+0x10/0x1c
[ 1134.711288] Code: f94033e1 8b190300 387b4839 95d23efc (383c4b19)

通过日志中的信息可知,出错位置是 cimdma_swap_buffer+0x2a4,出错的线程是 ipi6_thread(对应 pipeline 8)。

2.2 定位出错代码行

加载 hobot_cim_dma.ko 符号表,并反汇编 cimdma_swap_buffer, 找到偏移是 0x2a4(676)的指令行:

//加载符号表
crash> mod -s hobot_cim_dma /home/kaikai.sun/cimdma_ramdump/symbols/kernel/hobot_cim_dma.ko
     MODULE       NAME                       BASE           SIZE  OBJECT FILE
ffffa00008d97d80  hobot_cim_dma        ffffa00008d70000   184320  /home/kaikai.sun/cimdma_ramdump/symbols/kernel/hobot_cim_dma.ko

//反汇编
crash> dis -l cimdma_swap_buffer
/home/ycj/work/adnoa/software/adpro_j5_acore_public_origin/kernel/drivers/media/platform/hobot/cim_dma/hobot_cim_dma_ops.c: 1124
0xffffa00008d724a4:    ldr     x0, [sp, #120]
0xffffa00008d724a8:    bl      0xffffa00010202488 <__asan_load8>
0xffffa00008d724ac:    ldr     x0, [x22, #176]
0xffffa00008d724b0:    str     x0, [sp, #96]
0xffffa00008d724b4:    ldr     x0, [sp, #104]
0xffffa00008d724b8:    bl      0xffffa00010202488 <__asan_load8>
0xffffa00008d724bc:    ldr     x0, [sp, #96]
0xffffa00008d724c0:    ldr     x24, [x23, #168]
0xffffa00008d724c4:    add     x0, x0, x26
0xffffa00008d724c8:    bl      0xffffa00010202020 <__asan_load1>
0xffffa00008d724cc:    ldr     x1, [sp, #96]
0xffffa00008d724d0:    add     x0, x24, x25
0xffffa00008d724d4:    ldrb    w25, [x1, w27, uxtw]
0xffffa00008d724d8:    bl      0xffffa000102020c8 <__asan_store1>
0xffffa00008d724dc:    strb    w25, [x24, w28, uxtw]

出错的指令是 strb w25, [x24, w28, uxtw]。log 的寄存器信息中 x24: 0000050100000780,与 KASAN 报告的非法写地址 0000050100000780 一致,说明该指令确实是出错点。

再来看 X24 是怎么赋值的:ldr x24, [x23, #168],结合 log 中的寄存器信息和代码 hobot_cim_dma_ops.c:1124 进行分析。

x23: ffff000180e0b3c0

emb_frame->buffer.info.addr[0][i] = frame->buffer.info.addr[1][i/2];

X23 中保存的值便是 emb_frame 指针。

2.3 查看出错内存信息

查看 emb_frame 的内存信息:

crash> struct vio_frame 0xffff000180e0b3c0
struct vio_frame {
  list = {
    next = 0xffff00020d538000,
    prev = 0xffff00020d538a50
  },
  work = {
    node = {
      next = 0xffff000180e0b3d0,
      prev = 0xffff000180e0b3d0
    },
    func = 0xffff000180e0b3e0,
    worker = 0xffff000180e0b3e0,
    canceling = -2132757520
  },
  group = 0xffff000180e0b3f0,
  buffer = {
    ion_alloced = 30 '\036',
    ion_cached = 0 '\000',
    ion_cachesync = 0 '\000',
    consecutive_mode = 0 '\000',
    ion_mmap = 128 '\200',
    planeSize = {1281, 0, 1},
    info = {
      index = 0,
      planecount = 1,
      share_id = {0, 0, 0},
      planeSize = {1, 0, 1},
      paddr = {8589934595, 0, 4294967296},
      addr = {0x50100000780, 0x1e, 0x6}
    },
....
从上面的结构体解析结果来看,各成员的数值基本上都不对(例如 addr[0] 为 0x50100000780,并不是合法的内核地址)。

crash> ptype /o struct vio_frame
/* offset    |  size */  type = struct vio_frame {
/*    0      |    16 */    struct list_head {
/*    0      |     8 */        struct list_head *next;
/*    8      |     8 */        struct list_head *prev;
....

/*  488      |    12 */    u32 paddr_buffer[3];
/*  500      |     4 */    enum vio_frame_state state;
/*  504      |     4 */    u32 instance;
/*  508      |     4 */    u32 fcount;
/*  512      |     4 */    u32 index;
/*  516      |     2 */    u16 dispatch_cnt;
/*  518      |     1 */    u8 iommu_map;
/*  519      |     1 */    u8 remote_buf;
/*  520      |     8 */    void *ext_data;

                           /* total size (bytes):  528 */
                         }

crash> rd ffff000180e0b3c0 -e ffff000180e0b600
ffff000180e0b3c0:  ffff00020d538000 ffff00020d538a50   ..S.....P.S.....
ffff000180e0b3d0:  ffff000180e0b3d0 ffff000180e0b3d0   ................
ffff000180e0b3e0:  ffff000180e0b3e0 ffff000180e0b3e0   ................
ffff000180e0b3f0:  ffff000180e0b3f0 ffff000180e0b3f0   ................
ffff000180e0b400:  000007800000001e 0000000000000501   ................
ffff000180e0b410:  0000000000000000 0000000000000001   ................
ffff000180e0b420:  0000000100000000 0000000000000000   ................
ffff000180e0b430:  0000000100000000 0000000000000001   ................
ffff000180e0b440:  0000000000000000 0000000000000001   ................
ffff000180e0b450:  0000000200000003 0000000000000000   ................
ffff000180e0b460:  0000000100000000 0000050100000780   ................
ffff000180e0b470:  000000000000001e 0000000000000006   ................
ffff000180e0b480:  0000000000000000 0000000000000001   ................
ffff000180e0b490:  0000000100000001 0000000100000780   ................
ffff000180e0b4a0:  0000000000000000 0000000000000000   ................
ffff000180e0b4b0:  0000000000000000 0000000000000000   ................
ffff000180e0b4c0:  1234567800000000 0000000000000000   ....xV4.........
ffff000180e0b4d0:  0000000000000000 0000000000000000   ................
ffff000180e0b4e0:  0000000000000000 0000000000000000   ................
ffff000180e0b4f0:  0000000000000000 0000000000000000   ................
ffff000180e0b500:  0000000000000000 0000000000000000   ................
ffff000180e0b510:  0000000000000000 0000000000000000   ................
ffff000180e0b520:  0000000000000000 00000000000080ac   ................
ffff000180e0b530:  000064b81c513364 0000000064b81c50   d3Q..d..P..d....
ffff000180e0b540:  00000000000e0011 0006000001010100   ................
ffff000180e0b550:  0000000000010001 0000000000000002   ................
ffff000180e0b560:  000080ad000080ad 0000000064b81c50   ........P..d....
ffff000180e0b570:  0000001c0000001e 0000000000000001   ................
ffff000180e0b580:  0000000064b81c50 0000001b0000001e   P..d............
ffff000180e0b590:  0000000000000001 0000000000000000   ................
ffff000180e0b5a0:  0000000000000000 0000000000000000   ................
ffff000180e0b5b0:  0000000000000001 ffff00017d1ec4c0   ...........}....
ffff000180e0b5c0:  0000000000000000 0000000000000000   ................
ffff000180e0b5d0:  0000000000000000 0000000000000000   ................
ffff000180e0b5e0:  0000000000000000 0000000000000000   ................
ffff000180e0b5f0:  ffff000180e0b5f0 ffff000180e0b5f0   ................

2.4 找到出错地址的保存位置

查看 0xffff000180e0b3c0 前后的内存信息,可以看出其中是用户设置的配置信息,这些信息都保存在 struct cimdma_subdev 中,而 cimdma_subdev 是 struct j5_cimdma_dev 的成员变量。

struct j5_cimdma_dev {
    /* j5 cimdma information */
    struct platform_device *pdev;
    void __iomem *base_reg;
    resource_size_t regs_start;
    resource_size_t regs_end;
    s32 irq;
    unsigned long state;
    struct class *class;
    struct cdev cdev;
    dev_t devno;
    ...
    struct cimdma_subdev subdev[VIO_MAX_STREAM];
    struct vio_group *group[VIO_MAX_STREAM];
    struct vio_group_task gtask[VIO_MAX_STREAM];
    ...
    }
 //通过静态变量g_cimdma找到struct j5_cimdma_dev指针
 crash> g_cimdma
g_cimdma = $2 = (struct j5_cimdma_dev *) 0xffff000180e08080

crash> struct j5_cimdma_dev 0xffff000180e08080 -o
struct j5_cimdma_dev {
  [ffff000180e08080] struct platform_device *pdev;
  [ffff000180e08088] void *base_reg;
  [ffff000180e08090] resource_size_t regs_start;
  [ffff000180e08098] resource_size_t regs_end;
  [ffff000180e080a0] s32 irq;
  [ffff000180e080a8] unsigned long state;
  [ffff000180e080b0] struct class *class;
  [ffff000180e080b8] struct cdev cdev;
  [ffff000180e08120] dev_t devno;
  [ffff000180e08124] atomic_t instance;
  [ffff000180e08128] atomic_t rsccount;
  [ffff000180e0812c] atomic_t open_cnt;
  [ffff000180e08130] u32 sw_drop_count[16];
  [ffff000180e08170] u32 hw_drop_count[16];
  [ffff000180e081b0] raw_spinlock_t raw_slock;
  [ffff000180e081b8] struct mutex mlock;
  [ffff000180e081e0] atomic_t sensor_fcount[8];
  [ffff000180e08200] atomic_t backup_fcount[8];
  [ffff000180e08220] atomic_t enable_cnt[8];
  [ffff000180e08240] u32 cur_output_flag[8];
  [ffff000180e08260] struct cimdma_subdev subdev[16];
  [ffff000180e0dfe0] struct vio_group *group[16];
  [ffff000180e0e060] struct vio_group_task gtask[16];
  [ffff000180e0e760] u32 fusa_enable;
  [ffff000180e0e768] u64 jiffi;
  [ffff000180e0e770] struct vio_stl stl;
  [ffff000180e0e7a8] u32 last_frameid[8];
  [ffff000180e0e7c8] u32 error_cnt[8];
}
SIZE: 26472

由于通过之前的 log 已知出错的通路是 pipeline 8,对应的结构体是 subdev[8],下一步查看 subdev[8]内存信息。

crash> struct cimdma_subdev ffff000180e08260 -o 9
....
struct cimdma_subdev {
  [ffff000180e0b120] struct vio_subdev vdev;
  [ffff000180e0b300] struct j5_cimdma_dev *cimdma;
  [ffff000180e0b308] wait_queue_head_t done_wq;
  [ffff000180e0b348] struct vio_framemgr emb_fmgr;
  [ffff000180e0b400] cim_dma_cfg_t cim_cfg;
  [ffff000180e0b4c8] struct frame_info preint_info;
  [ffff000180e0b548] u8 initial_frameid;
  [ffff000180e0b549] u8 yuv_format;
  [ffff000180e0b54a] u8 embeded_data;
  [ffff000180e0b54b] u8 embeded_dependence;
  [ffff000180e0b54c] u8 embeded_start_cnt;
  [ffff000180e0b54d] u8 pack_mode;
  [ffff000180e0b54e] u8 ipi_index;
  [ffff000180e0b54f] u8 tpg_en;
  [ffff000180e0b550] u8 reqbuf_flag;
  [ffff000180e0b551] u8 stop_flag;
  [ffff000180e0b552] u8 start_flag;
  [ffff000180e0b554] u32 cnt_shift;
  [ffff000180e0b558] u32 irq_status;
  [ffff000180e0b55c] u32 force_drop;
  [ffff000180e0b560] u32 sw_frameid;
  [ffff000180e0b564] u32 last_hw_frameid;
  [ffff000180e0b568] struct fps_debug fps[2];
  [ffff000180e0b598] fps_ctrl_t fps_ctrl;
  [ffff000180e0b5b0] u32 thread_run;
  [ffff000180e0b5b8] struct task_struct *cimdma_thread;
  [ffff000180e0b5c0] wait_queue_head_t cimdma_done_wq;
  [ffff000180e0b600] struct completion stop_complete;
  [ffff000180e0b620] struct vio_drop_mgr drop_mgr;
}
SIZE: 1496

X23:ffff000180e0b3c0 是在[ffff000180e0b348]struct vio_framemgr emb_fmgr 内,下一步查看 emb_fmgr 结构体信息。

crash> struct vio_framemgr ffff000180e0b348 -o
struct vio_framemgr {
  [ffff000180e0b348] u32 id;
  [ffff000180e0b34c] raw_spinlock_t raw_slock;
  [ffff000180e0b350] spinlock_t slock;
  [ffff000180e0b380] ulong sindex;
  [ffff000180e0b388] u32 num_frames;
  [ffff000180e0b38c] u32 num_buffers;
  [ffff000180e0b390] struct vio_frame *frames;
  [ffff000180e0b398] u32 queued_count[5];
  [ffff000180e0b3b0] struct list_head queued_list[5];
}
SIZE: 184

crash> list ffff000180e0b3f0
ffff000180e0b3f0
crash> struct list_head ffff000180e0b3b0 -o 5
struct list_head {
  [ffff000180e0b3b0] struct list_head *next;
  [ffff000180e0b3b8] struct list_head *prev;
}
SIZE: 16

struct list_head {
  [ffff000180e0b3c0] struct list_head *next;
  [ffff000180e0b3c8] struct list_head *prev;
}
SIZE: 16

struct list_head {
  [ffff000180e0b3d0] struct list_head *next;
  [ffff000180e0b3d8] struct list_head *prev;
}
SIZE: 16

struct list_head {
  [ffff000180e0b3e0] struct list_head *next;
  [ffff000180e0b3e8] struct list_head *prev;
}
SIZE: 16

struct list_head {
  [ffff000180e0b3f0] struct list_head *next;
  [ffff000180e0b3f8] struct list_head *prev;
}
SIZE: 16

2.5 定位原因

由此可知 X23:ffff000180e0b3c0 是 queued_list[1]的起始地址,queued_list 是 5 个 list 的 list head,queued_list[1]是 FS_REQUEST queue,对应代码 emb_frame 是从 FS_REQUEST 队列中获取,也就是说 peek_frame 拿到的是 FS_REQUEST 队列的 head。

static void cimdma_separate_embedded_data(struct cimdma_subdev *subdev)
{
....
    emb_fmgr = &subdev->emb_fmgr;
    vio_e_barrier_irqs(emb_fmgr, flags);/*PRQA S 2996*/
    emb_frame = peek_frame(emb_fmgr, FS_REQUEST);
    vio_x_barrier_irqr(emb_fmgr, flags);/*PRQA S 2996*/
    if (emb_frame == NULL) {
        vio_err("[S%d] emb FS_REQUEST queue has no member;\n", group->instance);
        framemgr_print_queues(emb_fmgr);
        return;
    }

    emb_frame->frameinfo.frame_id = frame->frameinfo.frame_id;
    emb_frame->frameinfo.timestamps = frame->frameinfo.timestamps;
    emb_frame->frameinfo.tv_sec = frame->frameinfo.tv_sec;
    emb_frame->frameinfo.tv_usec = frame->frameinfo.tv_usec;
    vio_frame_sync_for_cpu(frame);
    for (i = 0; i < subdev->cim_cfg.embeded_width; i++) {
        if (i % 2 == 0)
            emb_frame->buffer.info.addr[0][i] = frame->buffer.info.addr[1][i/2];
        else
            emb_frame->buffer.info.addr[0][i] = frame->buffer.info.addr[0][i/2];
    }
      ...
   }

通过以上分析可知,emb_fmgr 的链表操作存在问题,怀疑是锁保护出现了异常。重新 review 代码后确认确实如此:peek_frame 读取 emb_fmgr 队列时持有的是 emb_fmgr 的锁,而 trans_frame 修改 emb_fmgr 链表时持有的却是 framemgr 的 spinlock,两处用的不是同一把锁,无法互斥。

   static void cimdma_separate_embedded_data(struct cimdma_subdev *subdev)
   {
    ...
    vio_e_barrier_irqs(framemgr, flags);/*PRQA S 2996*/
    trans_frame(emb_fmgr, emb_frame, FS_COMPLETE);
    vio_x_barrier_irqr(framemgr, flags);/*PRQA S 2996*/
    wake_up(&subdev->done_wq);
    }

修改此处的锁使用错误(对 emb_fmgr 的链表操作改用 emb_fmgr 自己的锁来保护),便能从根本上修复该问题。

03 结论与反思

用锁保护临界资源是解决并发问题的常用手段,但是锁保护的范围是否正确,一直没有有效的手段进行检查。在后续的项目或者芯片平台上,使用锁保护时需要增加注释,说明该锁保护的对象,方便自己和其他同学检查,减少出错概率。


地平线智驾开发者
1 声望2 粉丝

地平线智能驾驶开发者社区旨在连接智能驾驶领域的开发者和对相关技术感兴趣的其他行业开发者、从业者。