[Bug 1743529] Comment bridged from LTC Bugzilla

bugproxy bugproxy at us.ibm.com
Mon Mar 26 10:21:33 UTC 2018


------- Comment From indira.priya at in.ibm.com 2018-03-26 06:13 EDT-------
Hi,

Verified the issue with the latest ubuntu1804 daily build kernel; the
problem is no longer seen when triggering a crash with the levels below.

root at whip:~#  dpkg -l | grep kexec-tools
ii  kexec-tools                         1:2.0.16-1ubuntu1                                              ppc64el      tools to support fast kexec reboots
root at whip:~# dpkg -l | grep makedumpfile
ii  makedumpfile                        1:1.6.3-1                                                      ppc64el      VMcore extraction tool
root at whip:~# uname -a
Linux whip 4.15.0-12-generic #13 SMP Thu Mar 22 07:28:54 CDT 2018 ppc64le ppc64le ppc64le GNU/Linux

Triggered crash:
*****************
root at whip:/etc/default/grub.d# echo c > /proc/sysrq-trigger
[  183.215596] sysrq: SysRq : This sysrq operation is disabled.
root at whip:/etc/default/grub.d# echo 1 > /proc/sys/kernel/sysrq
root at whip:/etc/default/grub.d# echo c > /proc/sysrq-trigger
[  210.082354] sysrq: SysRq : Trigger a crash
[  210.082396] Unable to handle kernel paging request for data at address 0x00000000
[  210.082518] Faulting instruction address: 0xc0000000007ec4e8
[  210.082581] Oops: Kernel access of bad area, sig: 11 [#1]
[  210.082646] LE SMP NR_CPUS=2048 NUMA PowerNV
[  210.082713] Modules linked in: rpcsec_gss_krb5 nfsv4 nfs fscache rdma_ucm(OE) ib_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_uverbs(OE) ib_umad(OE) esp6_offload esp6 esp4_offload esp4 xfrm_algo mlx5_fpga_tools(OE) mlx4_en(OE) mlx4_ib(OE) mlx4_core(OE) ofpart cmdlinepart vmx_crypto powernv_flash mtd idt_89hpesx crct10dif_vpmsum ipmi_powernv ipmi_devintf ipmi_msghandler at24 uio_pdrv_genirq uio opal_prd ibmpowernv binfmt_misc nfsd auth_rpcgss nfs_acl lockd grace sunrpc sch_fq_codel knem(OE) ip_tables x_tables autofs4 btrfs xor zstd_compress raid6_pq mlx5_ib(OE) ib_core(OE) mlx5_core(OE) nouveau mlxfw(OE) devlink mlx_compat(OE) lpfc ast i2c_algo_bit ttm drm_kms_helper nvmet_fc syscopyarea nvmet cxl sysfillrect sysimgblt nvme_fc fb_sys_fops ahci nvme_fabrics crc32c_vpmsum drm tg3 pnv_php
[  210.083672]  libahci scsi_transport_fc
[  210.083722] CPU: 10 PID: 5235 Comm: bash Tainted: G           OE    4.15.0-12-generic #13
[  210.083792] NIP:  c0000000007ec4e8 LR: c0000000007ed428 CTR: c0000000007ec4c0
[  210.083895] REGS: c000007fb73279f0 TRAP: 0300   Tainted: G           OE     (4.15.0-12-generic)
[  210.084027] MSR:  9000000000009033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 28222222  XER: 20040000
[  210.084154] CFAR: c0000000007ed424 DAR: 0000000000000000 DSISR: 42000000 SOFTE: 1
[  210.084154] GPR00: c0000000007ed428 c000007fb7327c70 c0000000016eaf00 0000000000000063
[  210.084154] GPR04: c000007fdeb7ce18 c000007fdeb94368 9000000000009033 000000000000000a
[  210.084154] GPR08: 0000000000000007 0000000000000001 0000000000000000 9000000000001003
[  210.084154] GPR12: c0000000007ec4c0 c000000003266e00 00000f1697af6b08 0000000000000000
[  210.084154] GPR16: 00000f167ebce9f0 00000f167ec61998 00000f167ec619d0 00000f167ec98204
[  210.084154] GPR20: 0000000000000000 0000000000000001 0000000000000000 00007fffc5069ac4
[  210.084154] GPR24: 00007fffc5069ac0 00000f167ec9afc4 c0000000015e9968 0000000000000002
[  210.084154] GPR28: 0000000000000063 0000000000000007 c000000001572a9c c0000000015e9d08
[  210.085152] NIP [c0000000007ec4e8] sysrq_handle_crash+0x28/0x30
[  210.085269] LR [c0000000007ed428] __handle_sysrq+0xf8/0x2c0
[  210.085328] Call Trace:
[  210.085378] [c000007fb7327c70] [c0000000007ed408] __handle_sysrq+0xd8/0x2c0 (unreliable)
[  210.085482] [c000007fb7327d10] [c0000000007edc34] write_sysrq_trigger+0x64/0x90
[  210.085584] [c000007fb7327d40] [c00000000047de88] proc_reg_write+0x88/0xd0
[  210.085673] [c000007fb7327d70] [c0000000003d11bc] __vfs_write+0x3c/0x70
[  210.085751] [c000007fb7327d90] [c0000000003d1418] vfs_write+0xd8/0x220
[  210.085824] [c000007fb7327de0] [c0000000003d1738] SyS_write+0x68/0x110
[  210.085941] [c000007fb7327e30] [c00000000000b184] system_call+0x58/0x6c
[  210.086030] Instruction dump:
[  210.086067] 4bfff9f1 4bfffe50 3c4c00f0 3842ea40 7c0802a6 60000000 39200001 3d42001c
[  210.086185] 394a6db0 912a0000 7c0004ac 39400000 <992a0000> 4e800020 3c4c00f0 3842ea10
[  210.086293] ---[ end trace 2141bc6e05b3cc02 ]---
[  211.090273]
211.090393] Sending IPI to other CP[  373.057331960,5] OPAL: Switch to big-endian OS
Us
[  211.12[  377.207676398,5] OPAL: Switch to little-endian OS
0361] IPI complete
[  213.393057] kexec: Starting switchover sequence.

[    1.295245] integrity: Unable to open file: /etc/keys/x509_ima.der (-2)

[    1.295249] integrity: Unable to open file: /etc/keys/x509_evm.der (-2)
[    1.353447] vio vio: uevent: failed to send synthetic uevent
[    2.089461] nouveau 0004:04:00.0: unknown chipset (140000a1)
[    2.131257] nouveau 0004:05:00.0: unknown chipset (140000a1)
[    2.131538] nouveau 0035:03:00.0: unknown chipset (140000a1)
[    2.131664] nouveau 0035:04:00.0: unknown chipset (140000a1)
/dev/sda2: recovering journal
/dev/sda2: clean, 335484/122101760 files, 13969682/488376576 blocks
[    6.208502] vio vio: uevent: failed to send synthetic uevent
[  OK  ] Started Show Plymouth Boot Screen.
plymouth-start.service
[  OK  ] Started Forward Password Requests to Plymouth Directory Watch.
[  OK  ] Reached target Local Encrypted Volumes.
systemd-networkd.service
[  OK  ] Started Network Service.
Starting Wait for Network to be Configured...
[  OK  ] Started Network Time Synchronization.
systemd-timesyncd.service
[  OK  ] Reached target System Time Synchronized.
[    8.506483] lpfc 0000:01:00.0: 0:6101 Disabling NVME support: Not supported by firmware: 1 1
[    8.506624] lpfc 0000:01:00.0: 0:2574 IO channels: irqs 4 fcp 4 nvme 0 MRQ: 0
[  OK  ] Listening on Load/Save RF Kill Switch Status /dev/rfkill Watch.
[    9.358738] lpfc 0000:01:00.0: 0:3176 Port Name 0 Physical Link is functional
[    9.574467] lpfc 0000:01:00.1: 1:6101 Disabling NVME support: Not supported by firmware: 1 1
[    9.574550] lpfc 0000:01:00.1: 1:2574 IO channels: irqs 4 fcp 4 nvme 0 MRQ: 0
[  OK  ] Started AppArmor initialization.
apparmor.service
[  OK  ] Reached target System Initialization.
[   10.386796] lpfc 0000:01:00.1: 1:3176 Port Name 1 Physical Link is functional
[   10.646666] lpfc 0000:01:00.0: 0:1303 Link Up Event x1 received Data: x1 x0 x20 x0 x0 x0 0
[  OK  ] Created slice system-mlnx_interface_mgr.slice.
mlnx_interface_mgr at enP48p1s0f0.service
[  OK  ] Started mlnx_interface_mgr - configure enP48p1s0f0.
[  OK  ] Started mlnx_interface_mgr - configure enP48p1s0f1.
mlnx_interface_mgr at enP48p1s0f1.service
openibd.service
[  OK  ] Started openibd - configure Mellanox devices.
[  OK  ] Reached target Network.
[  OK  ] Started Wait for Network to be Configured.
systemd-networkd-wait-online.service
[  OK  ] Reached target Network is Online.
Starting Kernel crash dump capture service...
[   16.055959] kdump-tools[2400]: Starting kdump-tools:  * running makedumpfile -c -d 31 /proc/vmcore /var/crash/201803221639/dump-incomplete
Copying data                                      : [100.0 %] /           eta: 0s
[   40.957946] kdump-tools[2400]: The kernel version is not supported.
[   40.958026] kdump-tools[2400]: The makedumpfile operation may be incomplete.
[   40.958099] kdump-tools[2400]: The dumpfile is saved to /var/crash/201803221639/dump-incomplete.
[   40.958167] kdump-tools[2400]: makedumpfile Completed.
[   40.974357] kdump-tools[2400]:  * kdump-tools: saved vmcore in /var/crash/201803221639
[   41.840111] kdump-tools[2400]:  * running makedumpfile --dump-dmesg /proc/vmcore /var/crash/201803221639/dmesg.201803221639
[   41.878179] kdump-tools[2400]: The kernel version is not supported.
[   41.878303] kdump-tools[2400]: The makedumpfile operation may be incomplete.
[   41.878371] kdump-tools[2400]: The dmesg log is saved to /var/crash/201803221639/dmesg.201803221639.
[   41.878454] kdump-tools[2400]: makedumpfile Completed.
[   41.878536] kdump-tools[2400]:  * kdump-tools: saved dmesg content in /var/crash/201803221639
[   41.966570] kdump-tools[2400]: Thu, 22 Mar 2018 16:39:38 -0500
[   42.071819] kdump-tools[2400]: Rebooting.
[   42.084325] mlx5_core 0030:01:00.1: mlx5_enter_error_state:141:(pid 2441): start
[   42.084399] mlx5_core 0030:01:00.1: mlx5_enter_error_state:159:(pid 2441): end
[   42.094529] mlx5_core 0030:01:00.0: mlx5_enter_error_state:141:(pid 2441): start
[   42.094611] mlx5_core 0030:01:00.0: mlx5_enter_error_state:159:(pid 2441): end
[   45.826681] reboot: Restarting system
[  460.446012693,5] OPAL: Reboot request...
..

--== Welcome to Hostboot hostboot-ca203c9/hbicore.bin ==--

4.04291|secure|SecureROM valid - enabling functionality
4.04295|secure|Booting in non-secure mode.
5.94075|ISTEP  6. 5 - host_init_fsi
6.10500|ISTEP  6. 6 - host_set_ipl_parms
6.12308|ISTEP  6. 7 - host_discover_targets
6.66043|HWAS|PRESENT> DIMM[03]=AAAA000000000000
6.66044|HWAS|PRESENT> Proc[05]=8800000000000000
6.66045|HWAS|PRESENT> Core[07]=EFEFFFFDFDFF0000
6.68759|ISTEP  6. 8 - host_update_master_tpm

CRASH LOGS:
************

root at whip:~# ls -lrt /var/crash
total 40
drwxr-xr-x 2 root root  4096 Mar 22 16:39 201803221639
-rw-r--r-- 1 root root   223 Mar 22 16:43 kexec_cmd
-rw-r----- 1 root root 29741 Mar 22 16:43 linux-image-4.15.0-12-generic-201803221639.crash
root at whip:~# date
Thu Mar 22 16:43:31 CDT 2018
root at whip:~# cd /var/crash
root at whip:/var/crash# cd 201803221639
root at whip:/var/crash/201803221639# ls
dmesg.201803221639  dump.201803221639
root at whip:/var/crash/201803221639# ls -l
total 439040
-rw------- 1 root root    111536 Mar 22 16:39 dmesg.201803221639
-rw------- 1 root root 449549329 Mar 22 16:39 dump.201803221639
root at whip:/var/crash/201803221639#

-- 
You received this bug notification because you are a member of Ubuntu
Sponsors Team, which is subscribed to the bug report.
https://bugs.launchpad.net/bugs/1743529

Title:
  Merge kexec-tools 2.0.16-1 from Debian: System hung with Kernel panic
  -not syncing: Out of memory message when crash is triggered.

Status in The Ubuntu-power-systems project:
  Triaged
Status in kexec-tools package in Ubuntu:
  Fix Released
Status in kexec-tools source package in Xenial:
  New
Status in kexec-tools source package in Artful:
  New
Status in kexec-tools source package in Bionic:
  Fix Released

Bug description:
  == Comment: #0 - INDIRA P. JOGA
  Problem Description:
  ===================
  System hangs with a "Kernel panic - not syncing: Out of memory" message when a crash is triggered

  Steps to re-create:
  ==================
  > Installed ubuntu1804 daily build on Witherspoon test system
  root at whip:~# uname -a
  Linux whip 4.13.0-17-generic #20-Ubuntu SMP Mon Nov 6 10:03:08 UTC 2017 ppc64le ppc64le ppc64le GNU/Linux
  root at whip:~# uname -r
  4.13.0-17-generic

  > root at whip:~# free -h
                total        used        free      shared  buff/cache   available
  Mem:           507G        2.0G        504G         19M        728M        503G
  Swap:          2.0G          0B        2.0G

  
  > Edited the grub file /etc/default/grub.d/kexec-tools.cfg and set the crashkernel parameter to 4096M

  > Updated grub using update-grub command and reboot system.

  root at whip:~# cat /proc/cmdline
  root=UUID=46c6aa02-8215-44cc-b3fc-0bc79c3c8815 ro splash quiet crashkernel=4096M

  > kdump status before triggering crash

  root at whip:~# kdump-config show
  DUMP_MODE:        kdump
  USE_KDUMP:        1
  KDUMP_SYSCTL:     kernel.panic_on_oops=1
  KDUMP_COREDIR:    /var/crash
  crashkernel addr: 
     /var/lib/kdump/vmlinuz: symbolic link to /boot/vmlinux-4.13.0-17-generic
  kdump initrd: 
     /var/lib/kdump/initrd.img: symbolic link to /var/lib/kdump/initrd.img-4.13.0-17-generic
  current state:    ready to kdump

  kexec command:
    /sbin/kexec -p --command-line="root=UUID=46c6aa02-8215-44cc-b3fc-0bc79c3c8815 ro splash quiet irqpoll noirqdistrib nr_cpus=1 nousb systemd.unit=kdump-tools.service ata_piix.prefer_ms_hyperv=0" --initrd=/var/lib/kdump/initrd.img /var/lib/kdump/vmlinuz

  root at whip:~# kdump-config status
  current state   : ready to kdump

  >  Enabled sysrq
  root at whip:~# sysctl -w kernel.sysrq=1
  kernel.sysrq = 1

  > Triggered crash and it hangs with kernel panic- OOM message as below

  root at whip:~# echo c > /proc/sysrq-trigger
  [   85.731415] sysrq: SysRq : Trigger a crash
  [   85.731472] Unable to handle kernel paging request for data at address 0x00000000
  [   85.731584] Faulting instruction address: 0xc00000000078f588
  [   85.731670] Oops: Kernel access of bad area, sig: 11 [#1]
  [   85.731744] SMP NR_CPUS=2048 
  [   85.731745] NUMA 
  [   85.731790] PowerNV
  [   85.731853] Modules linked in: rpcsec_gss_krb5 nfsv4 nfs fscache sctp_diag sctp dccp_diag dccp tcp_diag udp_diag raw_diag inet_diag unix_diag af_packet_diag netlink_diag binfmt_misc vmx_crypto crct10dif_vpmsum ofpart cmdlinepart idt_89hpesx powernv_flash ipmi_powernv opal_prd ibmpowernv mtd ipmi_devintf ipmi_msghandler at24 uio_pdrv_genirq uio dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua nfsd auth_rpcgss sch_fq_codel nfs_acl lockd grace sunrpc ip_tables x_tables autofs4 btrfs xor raid6_pq nouveau bnx2x ast i2c_algo_bit ttm drm_kms_helper mdio libcrc32c crc32c_vpmsum mlx5_core syscopyarea sysfillrect sysimgblt fb_sys_fops tg3 drm ahci mlxfw libahci nvme devlink nvme_core
  [   85.732704] CPU: 10 PID: 4316 Comm: bash Not tainted 4.13.0-17-generic #20-Ubuntu
  [   85.732764] task: c000003fcb141700 task.stack: c000003fc2374000
  [   85.732858] NIP: c00000000078f588 LR: c0000000007904b8 CTR: c00000000078f560
  [   85.732977] REGS: c000003fc23779f0 TRAP: 0300   Not tainted  (4.13.0-17-generic)
  [   85.733066] MSR: 9000000000009033 <SF,HV,EE,ME,IR,DR,RI,LE>
  [   85.733075]   CR: 28422222  XER: 20040000
  [   85.733201] CFAR: c0000000007904b4 DAR: 0000000000000000 DSISR: 42000000 SOFTE: 1 
  [   85.733201] GPR00: c0000000007904b8 c000003fc2377c70 c0000000015f6000 0000000000000063 
  [   85.733201] GPR04: c000003feedfade8 c000003feee12068 9000000000009033 000000000000000a 
  [   85.733201] GPR08: 0000000000000007 0000000000000001 0000000000000000 9000000000001003 
  [   85.733201] GPR12: c00000000078f560 c000000007a66900 0000000010180df8 0000000010189e30 
  [   85.733201] GPR16: 0000000010189ea8 0000000010151210 000000001018bd58 000000001018de48 
  [   85.733201] GPR20: 00000000321168d8 0000000000000001 0000000010164590 0000000010163bb0 
  [   85.733201] GPR24: 00007fffcfa6e7d4 00007fffcfa6e7d0 c0000000014fa570 0000000000000002 
  [   85.733201] GPR28: 0000000000000063 0000000000000004 c0000000014822f4 c0000000014fa910 
  [   85.734116] NIP [c00000000078f588] sysrq_handle_crash+0x28/0x30
  [   85.734211] LR [c0000000007904b8] __handle_sysrq+0xf8/0x2b0
  [   85.734285] Call Trace:
  [   85.734316] [c000003fc2377c70] [c000000000790498] __handle_sysrq+0xd8/0x2b0 (unreliable)
  [   85.734418] [c000003fc2377d10] [c000000000790cb4] write_sysrq_trigger+0x64/0x90
  [   85.734541] [c000003fc2377d40] [c00000000044c0c8] proc_reg_write+0x88/0xd0
  [   85.734656] [c000003fc2377d70] [c00000000039db8c] __vfs_write+0x3c/0x70
  [   85.734741] [c000003fc2377d90] [c00000000039f7c8] vfs_write+0xd8/0x220
  [   85.734845] [c000003fc2377de0] [c0000000003a1648] SyS_write+0x68/0x110
  [   85.734959] [c000003fc2377e30] [c00000000000b184] system_call+0x58/0x6c
  [   85.735053] Instruction dump:
  [   85.735102] 4bfff9f1 4bfffe50 3c4c00e6 38426aa0 7c0802a6 60000000 39200001 3d42001d 
  [   85.735209] 394ad788 912a0000 7c0004ac 39400000 <992a0000> 4e800020 3c4c00e6 38426a70 
  [   85.735319] ---[ end trace 711a5d30c86f0359 ]---
  [   86.742408] 
  [   86.742572] Sending IPI to o[  184.508956788,5] OPAL: Switch to big-endian OS
  ther CPUs
  [   86.743959] IPI[  187.269699704,5] OPAL: Switch to little-endian OS
   complete
  [   86.749730] kexec: waiting for cpu 16 (physical 16) to enter OPAL
  [   88.909562] kexec: Starting switchover sequence.
  [    0.000000] OF: reserved mem: not enough space all defined regions.
  [    0.000000] OF: reserved mem: not enough space all defined regions.
  [    0.000000] OF: reserved mem: not enough space all defined regions.
  [    0.000000] OF: reserved mem: not enough space all defined regions.
   -> smp_release_cpus()
  spinning_secondaries = 159
   <- smp_release_cpus()
  [    1.433082] Unable to open file: /etc/keys/x509_ima.der (-2)
  [    1.433086] Unable to open file: /etc/keys/x509_evm.der (-2)
  [    1.500890] vio vio: uevent: failed to send synthetic uevent
  [    1.670523] nouveau 0004:04:00.0: unknown chipset (140000a1)
  [    1.670632] nouveau 0004:05:00.0: unknown chipset (140000a1)
  [    1.670768] nouveau 0035:03:00.0: unknown chipset (140000a1)
  [    1.670872] nouveau 0035:04:00.0: unknown chipset (140000a1)
  /dev/nvme0n1p2: recovering journal
  /dev/nvme0n1p2: clean, 81988/25001984 files, 3168955/99997696 blocks
  [    4.042999] vio vio: uevent: failed to send synthetic uevent


  
  .  .  .  .[    8.856941] Kernel panic - not syncing: Out of memory and no killable processes...
  [    8.856941] 
  [    8.857009] CPU: 0 PID: 175 Comm: kworker/u8:5 Tainted: G        W       4.13.0-17-generic #20-Ubuntu
  [    8.857131] Workqueue: mkey_cache cache_work_func [mlx5_ib]
  [    8.857188] Call Trace:
  [    8.857215] [c0000000ea90b4f0] [c000000008c5365c] dump_stack+0xb0/0xf4 (unreliable)
  [    8.857297] [c0000000ea90b530] [c0000000080f9e2c] panic+0x144/0x338
  [    8.857381] [c0000000ea90b5c0] [c0000000082bf530] out_of_memory+0x3e0/0x6f0
  [    8.857456] [c0000000ea90b660] [c0000000082c71a8] __alloc_pages_nodemask+0xfe8/0x1080
  [    8.857568] [c0000000ea90b860] [c00000000834d2b0] alloc_pages_current+0xa0/0x140
  [    8.857658] [c0000000ea90b8a0] [c00000000835c834] new_slab+0x3d4/0x810
  [    8.857752] [c0000000ea90b970] [c00000000835e96c] ___slab_alloc+0x3fc/0x7a0
  [    8.857826] [c0000000ea90baa0] [c00000000835ed44] __slab_alloc+0x34/0x60
  [    8.857901] [c0000000ea90bad0] [c00000000835f1a4] kmem_cache_alloc_trace+0x124/0x300
  [    8.857977] [c0000000ea90bb30] [c0080000033863e4] add_keys+0x5c/0x3a0 [mlx5_ib]
  [    8.858083] [c0000000ea90bc20] [c008000003386d84] __cache_work_func+0x12c/0x2b0 [mlx5_ib]
  [    8.858199] [c0000000ea90bca0] [c000000008120a58] process_one_work+0x298/0x5a0
  [    8.858299] [c0000000ea90bd30] [c000000008120de8] worker_thread+0x88/0x620
  [    8.858383] [c0000000ea90bdc0] [c000000008129c5c] kthread+0x1ac/0x1c0
  [    8.858476] [c0000000ea90be30] [c00000000800b4e8] ret_from_kernel_thread+0x5c/0x74
  [    9.666011] ---[ end Kernel panic - not syncing: Out of memory and no killable processes...
  [    9.666

  > System is available for debugging.

  
  == Comment: #1 - INDIRA P. JOGA 
  > Tried with default crash kernel parameter
  crashkernel=2G-4G:320M,4G-32G:512M,32G-64G:1024M,64G-128G:2048M,128G-:4096M  and still facing same kernel panic hung issue with the system

  root at whip:~# cat /proc/cmdline
  root=UUID=46c6aa02-8215-44cc-b3fc-0bc79c3c8815 ro splash quiet crashkernel=2G-4G:320M,4G-32G:512M,32G-64G:1024M,64G-128G:2048M,128G-:4096M

  
  > Triggered crash and it hangs 
  Regards,
  Indira

  
  == Comment: #6 - INDIRA P. JOGA 
  Hi Urvashi,

  Tried triggering a crash with a higher crashkernel parameter (16384M).
  The issue was recreated: it shows similar messages such as
  "uevent: failed to send synthetic uevent", but this time the system
  rebooted and then hung while booting.

  
  > Crashkernel parameter

  root at whip:~# cat /proc/cmdline
  root=UUID=2f422155-c251-4bc1-abd8-47cb17a13e65 ro splash quiet crashkernel=16384M

  
  > Triggered crash 

  > Attached console logs

  Regards,
  Indira

  == Comment: #8 - INDIRA P. JOGA <indira.priya at in.ibm.com> - 2018-01-03 22:45:02 ==
  > kexec-tools and makedumpfile versions from the test system

  kexec-tools/bionic,now 1:2.0.15-0ubuntu1 ppc64el [installed,automatic]
  makedumpfile/bionic,now 1:1.6.2-1ubuntu1 ppc64el [installed,automatic]

  Regards,
  Indira

  
  == Comment: #10 - Hari Krishna Bathini <hbathini at in.ibm.com> - 2018-01-04 04:50:01 ==
  (In reply to comment #6)

  This is due to a checkstop that occurs on systems with GPUs connected.
  The two patches below are needed to fix this:

  
    commit aec4d0f7a2502a13fc21e90ff32dc306b0ad1190
    Author: Hari Bathini <hbathini at linux.vnet.ibm.com>
    Date:   Thu Aug 17 18:01:51 2017 +0530

      kexec-tools: ppc64: avoid adding coherent memory regions to crash memory ranges
      
      Accelerator devices like GPU and FPGA cards contain onboard memory. This
      onboard memory is represented as a memory only NUMA node, integrating it
      with core memory subsystem. Since, the link through which these devices
      are integrated to core memory goes down after a system crash and they are
      meant for user workloads, avoid adding coherent device memory regions to
      crash memory ranges. Without this change, makedumpfile tool tries to save
      unaccessible coherent device memory regions, crashing the system.
      
      Signed-off-by: Hari Bathini <hbathini at linux.vnet.ibm.com>
      Tested-by: Pingfan Liu <piliu at redhat.com>
      Signed-off-by: Simon Horman <horms at verge.net.au>
  --

    commit 69431282f075ab723c4886f20aa248976920aaae
    Author: Hari Bathini <hbathini at linux.vnet.ibm.com>
    Date:   Tue Aug 29 23:08:02 2017 +0530

      kexec-tools: ppc64: fix leak while checking for coherent device memory
      
      Signed-off-by: Hari Bathini <hbathini at linux.vnet.ibm.com>
      Signed-off-by: Simon Horman <horms at verge.net.au>
  --

  
  Thanks
  Hari

  == Comment: #11 - Hari Krishna Bathini <hbathini at in.ibm.com> - 2018-01-04 04:52:40 ==
  urvashi, also check if the below patches are included in the kexec-tools shipped
  with 18.04. If not, they should also be included:

    commit 21eb397a5fc9227cd95d23e8c74a49cf6a293e57
    Author: Hari Bathini <hbathini at linux.vnet.ibm.com>
    Date:   Wed Aug 9 23:47:42 2017 +0530

      kexec-tools: powerpc: fix command line overflow error
      
      Since kernel commit a5980d064fe2 ("powerpc: Bump COMMAND_LINE_SIZE
      to 2048"), powerpc bumped command line size to 2048 but the size
      used here is still the default value of 512. Bump it to 2048 to
      fix command line overflow errors observed when command line length
      is above 512 bytes. Also, get rid of the multiple definitions of
      COMMAND_LINE_SIZE macro in ppc architecture.
      
      Signed-off-by: Hari Bathini <hbathini at linux.vnet.ibm.com>
      Signed-off-by: Simon Horman <horms at verge.net.au>
  --

    commit 47478ea66d4301b12a07862aebc8447a2932f0ed
    Author: Hari Bathini <hbathini at linux.vnet.ibm.com>
    Date:   Wed Jul 26 22:49:41 2017 +0530

      kexec-tools: ppc64: fix how RMA top is deduced
      
      Hang was observed, in purgatory, on a machine configured with
      single LPAR. This was because one of the segments was loaded
      outside the actual Real Memory Area (RMA) due to wrongly
      deduced RMA top value.
      
      Currently, top of real memory area, which is crucial for loading
      kexec/kdump kernel, is obtained by iterating through mem nodes
      and setting its value based on the base and size values of the
      last mem node in the iteration. That can't always be correct as
      the order of iteration may not be same and RMA base & size are
      always based on the first memory property. Fix this by setting
      RMA top value based on the base and size values of the memory
      node that has the smallest base value (first memory property)
      among all the memory nodes.
      
      Also, correct the misnomers rmo_base and rmo_top to rma_base
      and rma_top respectively.
      
      While how RMA top is deduced was broken for sometime, the issue
      may not have been seen so far, for couple of possible reasons:
      
          1. Only one mem node was available.
          2. First memory property has been the last node in
             iteration when multiple mem nodes were present.
      
      Fixes: 02f4088ffded ("kexec fix ppc64 device-tree mem node")
      Reported-by: Ankit Kumar <ankit at linux.vnet.ibm.com>
      Cc: Michael Ellerman <mpe at ellerman.id.au>
      Cc: Geoff Levand <geoff at infradead.org>
      Signed-off-by: Hari Bathini <hbathini at linux.vnet.ibm.com>
      Signed-off-by: Simon Horman <horms at verge.net.au>
  --

  Thanks
  Hari

  == Comment:

  As discussed with Hari,

  commit 21eb397a5fc9227cd95d23e8c74a49cf6a293e57
      kexec-tools: powerpc: fix command line overflow error

  commit 47478ea66d4301b12a07862aebc8447a2932f0ed
      kexec-tools: ppc64: fix how RMA top is deduced 

  Neither of these commits is present in Ubuntu 18.04

  Comment 19 Urvashi Jawere

  Hi Indira,

  Installed updated kexec tools (which includes all the 4 patches) in
  the whip system. Please retest and update us with your findings.

  
  == Comment: #20 - INDIRA P. JOGA <indira.priya at in.ibm.com> - 2018-01-10 10:40:06 ==
  Hi Urvashi,

  I have tried the kdump (triggering crash) test scenario and observed
  the findings below.

  -->> For both the default and 4096M crashkernel parameters, the system
  did not hang when the crash was triggered and it booted up properly,
  but no crash logs were saved under /var/crash.

  Regards,
  Indira

  == Comment: #

  Hi,
  Urvashi/Indira, that is probably because of the below..

  >[  154.241351] Faulting instruction address: 0xc000000000792f88
  > [  154.241447] Oops: Kernel access of bad area, sig: 11 [#1]
  > [  154.241517] SMP NR_CPUS=2048 
  > [  154.241518] NUMA 
  >

  
  The above shows that kdump kernel has failed with OOM.
  Can you try using "crashkernel=8192M" and see if dump capture is successful with that?

  Thanks
  Hari

  == Comment: #24 - INDIRA P. JOGA 
  Hi Hari/Urvashi,

  Tried kdump (triggering crash) test scenario with "crashkernel=8192M"
  and crash logs are saved as below

  > root at whip:~# cat /proc/cmdline
  root=UUID=49ed695c-a4dc-4e55-9ca6-319145265826 ro splash quiet crashkernel=8192M

  root at whip:~# free -h
                total        used        free      shared  buff/cache   available
  Mem:           503G        3.2G        499G         12M        909M        498G
  Swap:          2.0G          0B        2.0G

  > root at whip:~# kdump-config show
  DUMP_MODE:        kdump
  USE_KDUMP:        1
  KDUMP_SYSCTL:     kernel.panic_on_oops=1
  KDUMP_COREDIR:    /var/crash
  crashkernel addr: 
     /var/lib/kdump/vmlinuz: symbolic link to /boot/vmlinux-4.13.0-25-generic
  kdump initrd: 
     /var/lib/kdump/initrd.img: symbolic link to /var/lib/kdump/initrd.img-4.13.0-25-generic
  current state:    ready to kdump

  kexec command:
    /sbin/kexec -p --command-line="root=UUID=49ed695c-a4dc-4e55-9ca6-319145265826 ro splash quiet irqpoll noirqdistrib nr_cpus=1 nousb systemd.unit=kdump-tools.service ata_piix.prefer_ms_hyperv=0" --initrd=/var/lib/kdump/initrd.img /var/lib/kdump/vmlinuz

  > root at whip:~# kdump-config status
  current state   : ready to kdump
  root at whip:~# sysctl -w kernel.sysrq=1
  kernel.sysrq = 1
  root at whip:~# 

  > Triggered crash as below

  root at whip:~# echo c > /proc/sysrq-trigger
  [  533.615228] sysrq: SysRq : Trigger a crash
  [  533.615288] Unable to handle kernel paging request for data at address 0x00000000
  [  533.615397] Faulting instruction address: 0xc000000000792f88
  [  533.615479] Oops: Kernel access of bad area, sig: 11 [#1]
  [  533.615550] SMP NR_CPUS=2048 
  [  533.615551] NUMA 
  [  533.615590] PowerNV
  [  533.615657] Modules linked in: rpcsec_gss_krb5 nfsv4 nfs fscache sctp_diag sctp dccp_diag dccp tcp_diag udp_diag raw_diag inet_diag unix_diag af_packet_diag netlink_diag vmx_crypto crct10dif_vpmsum idt_89hpesx ofpart cmdlinepart ipmi_powernv powernv_flash ipmi_devintf ibmpowernv ipmi_msghandler mtd opal_prd at24 uio_pdrv_genirq uio binfmt_misc dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua sch_fq_codel nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables x_tables autofs4 btrfs xor raid6_pq uas usb_storage nouveau bnx2x ast i2c_algo_bit ttm drm_kms_helper mdio libcrc32c mlx5_core crc32c_vpmsum syscopyarea sysfillrect sysimgblt fb_sys_fops tg3 drm ahci libahci mlxfw nvme devlink nvme_core
  [  533.616497] CPU: 10 PID: 4887 Comm: bash Not tainted 4.13.0-25-generic #29-Ubuntu
  [  533.616600] task: c000003fa20ab200 task.stack: c000003fa80d4000
  [  533.616702] NIP: c000000000792f88 LR: c000000000793eb8 CTR: c000000000792f60
  [  533.616795] REGS: c000003fa80d79f0 TRAP: 0300   Not tainted  (4.13.0-25-generic)
  [  533.616895] MSR: 9000000000009033 <SF,HV,EE,ME,IR,DR,RI,LE>
  [  533.616904]   CR: 28422222  XER: 20040000

  > Crash logs as below

  root at whip:~# ls -lr /var/crash
  total 21528
  -rw-r----- 1 root root 22002927 Jan 12 04:06 _usr_lpp_htx_bin_hxestorage.0.crash
  -rw-r----- 1 root root    28945 Jan 16 01:14 linux-image-4.13.0-25-generic-201801160110.crash
  -rw-r--r-- 1 root root      251 Jan 16 01:14 kexec_cmd
  drwxr-xr-x 2 root root     4096 Jan 16 01:11 201801160110
  root at whip:~# date
  Tue Jan 16 01:14:44 CST 2018

  Regards,
  Indira

To manage notifications about this bug go to:
https://bugs.launchpad.net/ubuntu-power-systems/+bug/1743529/+subscriptions



More information about the Ubuntu-sponsors mailing list