XFS導致進程核心棧溢位的解決辦法

2020-06-16 17:39:18

系統環境

系統版本： CentOS release 6.5
kernel版本：2.6.32-431.20.3.el6.x86_64
檔案系統： XFS

問題描述

系統panic，並列印以下calltrace資訊：

kvm: 16396: cpu1 unhandled wrmsr: 0x391 data 2000000f
BUG: scheduling while atomic:qemu-system-x86/27122/0xffff8811
BUG: unable to handle kernel paging request at 00000000dd7ed3a8
IP: [<ffffffff81058e5d>] task_rq_lock+0x4d/0xa8
PGD 0
Oops:0000 [#1] SMP
last sysfs file: /sys/devices/pci0000:00/0000:00:02.2/0000:04:00.0/host0/target0:2:1/0:2:1/block/sdb/queue/logical_block_size
...
[<ffffffff81058e5d>] ? task_rq_lock+0x4d/0xa0
[<ffffffff8106195c>] ? try_to_wake_up+0x3c/0x3e0
[<ffffffff81061d55>] ? wake_up_process+0x15/0x20
[<ffffffff810a0f62>] ? __up+0x2a/0x40
[<ffffffffa03394c2>] ? xfs_buf_unlock+0x32/0x90 [xfs]
[<ffffffffa030297f>] ? xfs_buf_item_unpin+0xcf/0x1a0 [xfs]
[<ffffffffa032f18c>] ? xfs_trans_committed_bulk+0x29c/0x2b0 [xfs]
[<ffffffff81069f15>] ? enqueue_entity+0x125/0x450
[<ffffffff81060aa3>] ? perf_event_task_sched_out+0x33/0x70
[<ffffffff81069973>] ? dequeue_entity+0x113/0x2e0
[<ffffffffa032326d>] ? xlog_cil_committed+0x3d/0x100 [xfs]
[<ffffffffa031f79d>] ? xlog_state_do_callback+0x15d/0x2b0 [xfs]
[<ffffffffa031f96e>] ? xlog_state_done_syncing+0x7e/0xb0 [xfs]
[<ffffffffa03200e9>] ? xlog_iodone+0x59/0xb0 [xfs]
[<ffffffffa033ae50>] ? xfs_buf_iodone_work+0x0/0x50 [xfs]
[<ffffffffa033ae76>] ? xfs_buf_iodone_work+0x26/0x50 [xfs]

截圖如下：

錯誤跟蹤

unable to handle kernel paging request at 00000000dd7ed3a0
00000000dd7ed3a0是使用者空間地址，核心正常是不會存取的，所以，可以定性為核心出了BUG。

IP: [<ffffffff81058e5d>] task_rq_lock + 0x4d/0xa8

由於系統中沒有部署kdump，只能通過objdump靜態分析，進一步跟蹤出錯的指令地址。

    ffffffff81058e10 <task_rq_lock>:
    * interrupts. Note the ordering: we can safely lookup the task_rq without
    * explicitly disabling preemption.
    */
    static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
            __acquires(rq->lock)
    {
    ffffffff81058e10:       55                      push   %rbp
    ffffffff81058e11:       48 89 e5                mov    %rsp,%rbp
    ffffffff81058e14:       48 83 ec 20             sub    $0x20,%rsp
    ffffffff81058e18:       48 89 1c 24             mov    %rbx,(%rsp)
    ffffffff81058e1c:       4c 89 64 24 08          mov    %r12,0x8(%rsp)
    ffffffff81058e21:       4c 89 6c 24 10          mov    %r13,0x10(%rsp)
    ffffffff81058e26:       4c 89 74 24 18          mov    %r14,0x18(%rsp)
    ffffffff81058e2b:       e8 10 1f fb ff          callq  ffffffff8100ad40 <mcount>
    ffffffff81058e30:       48 c7 c3 40 68 01 00    mov    $0x16840,%rbx
    ffffffff81058e37:       49 89 fc                mov    %rdi,%r12
    ffffffff81058e3a:       49 89 f5                mov    %rsi,%r13
    ffffffff81058e3d:       ff 14 25 80 8b a9 81    callq  *0xffffffff81a98b80
    ffffffff81058e44:       48 89 c2                mov    %rax,%rdx
            PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
    }
    static inline void raw_local_irq_disable(void)
    {
            PVOP_VCALLEE0(pv_irq_ops.irq_disable);
    ffffffff81058e47:       ff 14 25 90 8b a9 81    callq  *0xffffffff81a98b90
            struct rq *rq;

            for (;;) {
                    local_irq_save(*flags);
    ffffffff81058e4e:       49 89 55 00             mov    %rdx,0x0(%r13)
                    rq = task_rq(p);
    ffffffff81058e52:       49 8b 44 24 08          mov    0x8(%r12),%rax
    ffffffff81058e57:       49 89 de                mov    %rbx,%r14
    ffffffff81058e5a:       8b 40 18                mov    0x18(%rax),%eax
    ffffffff81058e5d:       4c 03 34 c5 60 cf bf    add    -0x7e4030a0(,%rax,8),%r14
    ffffffff81058e64:       81
                    spin_lock(&rq->lock);
    ffffffff81058e65:       4c 89 f7                mov    %r14,%rdi
    ffffffff81058e68:       e8 a3 23 4d 00          callq  ffffffff8152b210 <_spin_lock>

通過objdump反組合vmlinux，定位出錯的指令，當執行到ffffffff81058e5d這個地址時，系統出錯，找到對應的程式碼段，發現是在task_rq_lock()呼叫task_rq()時出錯。

kernel/sched.c

    #define task_rq(p)              cpu_rq(task_cpu(p))

    /*
    * task_rq_lock - lock the runqueue a given task resides on and disable
    * interrupts. Note the ordering: we can safely lookup the task_rq without
    * explicitly disabling preemption.
    */
    static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
            __acquires(rq->lock)
    {
            struct rq *rq;

            for (;;) {
                    local_irq_save(*flags);
                    rq = task_rq(p);
                    spin_lock(&rq->lock);
                    if (likely(rq == task_rq(p)))
                            return rq;
                    spin_unlock_irqrestore(&rq->lock, *flags);
            }
    }

include/linux/sched.h

    #define task_thread_info(task)  ((struct thread_info *)(task)->stack)

    static inline unsigned int task_cpu(const struct task_struct *p)
    {
            return task_thread_info(p)->cpu;
    }

    union thread_union {
        struct thread_info thread_info;
        unsigned long stack[THREAD_SIZE/sizeof(long)];
    };

看到這裡終於有了眉目，原來進程的thread_info和核心棧stack共處在一個union中，由於核心棧溢位導致thread_info被破壞。再來看看核心棧的大小：
arch/x86/include/asm/page_64_types.h

    #define THREAD_ORDER    1
    #define THREAD_SIZE  (PAGE_SIZE << THREAD_ORDER)
    #define CURRENT_MASK (~(THREAD_SIZE - 1))

在64位元系統中，PAGE_SIZE為4K，THREAD_ORDER為1，因此THREAD_SIZE = 4K << 1 = 8K，即核心棧大小為8K。

thread_info結構和進程的核心態stack結構共存於一個union結構中，結構總大小預設是8KB。XFS進程由於某種原因使用過多的stack空間，導致stack溢位，破壞thread_info結構。

“scheduling while atomic”應該是由於堆疊溢位覆蓋了進程的thread_info結構體中的搶佔計數（preempt count），導致下次被喚醒時搶佔計數非零，出現panic。

原因分析

經objdump分析，XFS導致堆疊溢位有兩種可能性：

一種可能是xfs_iomap_write_direct()函數未使用XFS_BMAPI_STACK_SWITCH標誌，導致xfs_bmapi_allocate分配時，沒有使用xfs_bmapi_allocate_worker分配到一個新的thread上（新的thread能保證有充足的棧），而是直接分配到了進程自己的核心棧，從而導致進程的核心棧溢位。

該bug在kernel-3.4的（commit c999a22 “xfs: introduce an allocation workqueue”）中得到fix。

另有一種觀點認為，使用專門的allocation工作佇列會因為建立執行緒而增加系統開銷，導致IO回寫變慢，並且8K的核心棧對於呼叫深度超過8K的進程依然束手無策，所以kernel-3.16引入了（6538b8e x86_64: expand kernel stack to 16K）這個patch。

核心討論組（ https://lwn.net/Articles/600647/ ）比較了(commit c999a22 “xfs: introduce an allocation workqueue”)將writeback stack分到一個worker thread上和擴充套件核心棧為16K（6538b8e x86_64: expand kernel stack to 16K）這兩種方案，有興趣可以讀一下。

目前centos的2.6.32-520.el6已經將kernel-3.16的這個patch（6538b8e x86_64: expand kernel stack to 16K）從mainline拉了回來。這兩個patch並不衝突，建議先將kernel升級看一下擴充套件核心棧為16K能否解決xfs_iomap_write_direct的問題，如果不能可以進一步把（commit c999a22 “xfs: introduce an allocation workqueue”）拉回來。

另外一個可能的原因是xfs_buf_lock()函數恰好在被一個號誌阻塞之前，執行了一個log force操作，而log force的呼叫比較深，堆疊消耗比較大，導致系統panic。與centos kernel changelog裡的bug號1028831是同一個問題，該bug已經在2.6.32-495.el6中fix。

解決方案

升級kernel版本至≥2.6.32-520.el6，保證相關的patch已經包含進來。

changelog

[2.6.32-520.el6]

[kernel] x86_64: expand kernel stack to 16K (Johannes Weiner) [1045190 1060721]

[2.6.32-495.el6]

[fs] xfs: always do log forces via the workqueue (Eric Sandeen) [1028831]

[fs] xfs: Do background CIL flushes via a workqueue (Eric Sandeen) [1028831]

本文永久更新連結地址：http://www.linuxidc.com/Linux/2016-06/132383.htm