virsh dump MyGuestName /storage/MyGuestName.dump
dump-guest-memory [-z|-l|-s] FILENAME
dump-core domain-id filename
/sys/kernel/kexec_crash_{loaded,size}
echo c > /proc/sysrq-trigger
ipmitool power diag
virsh inject-nmi MyGuestName
- kernel.unknown_nmi_panic=1
nmi_watchdog=1
sysctl kernel.softlockup_panic=1
sysctl vm.panic_on_oom=1
{SOFT,HARD}LOCKUP_DETECTOR / BOOTPARAM_{SOFT,HARD}
LOCKUP_PANIC
{SOFT,CLOCKSOURCE}_WATCHDOG
debuginfo-install kernel
apt-get install linux-image-$(uname -r)-dbg
KERNEL: /var/crash/127.0.0.1-2015-08-20-20:00:00/vmcore
DUMPFILE: vmcore.myserver [PARTIAL DUMP]
CPUS: 24
DATE: Mon Aug 20 20:00:00 2015
UPTIME: 32 days, 17:12:02
LOAD AVERAGE: 1625.88, 1603.11, 1509.73
TASKS: 25639
NODENAME: myserver
RELEASE: 2.6.18-371.8.1.el5
VERSION: #1 SMP Fri Mar 28 05:53:58 EDT 2014
MACHINE: x86_64 (2933 Mhz)
MEMORY: 284 GB
PANIC: "Kernel panic - not syncing: An NMI occurred"
PID: 61015
COMMAND: "java"
TASK: ffff8135b50e5830 [THREAD_INFO: ffff8104bd256000]
CPU: 0
STATE: TASK_RUNNING (PANIC)
Let’s check for a real kernel bug
KERNEL: /usr/lib/debug/lib/modules/2.6.32-431.29.2.el6.x86_64/vmlinux
DUMPFILE: vmcore [PARTIAL DUMP]
CPUS: 64
DATE: Wed Jun 14 11:23:14 2015
UPTIME: 44 days, 04:14:21
LOAD AVERAGE: 0.70, 0.58, 0.55
TASKS: 1917
NODENAME: myredhat65
RELEASE: 2.6.32-431.29.2.el6.x86_64
VERSION: #1 SMP Sun Jul 27 15:55:46 EDT 2014
MACHINE: x86_64 (1997 Mhz)
MEMORY: 64 GB
PANIC: "BUG: unable to handle kernel NULL pointer dereference at (null)"
PID: 2120
COMMAND: "scsi_eh_6"
TASK: ffff880437dcf540 [THREAD_INFO: ffff880435a94000]
CPU: 50
STATE: TASK_RUNNING (PANIC)
crash> bt
PID: 2120 TASK: ffff880437dcf540 CPU: 50 COMMAND: "scsi_eh_6"
#0 [ffff880435a95890] machine_kexec at ffffffff81038f3b
#1 [ffff880435a958f0] crash_kexec at ffffffff810c5af2
#2 [ffff880435a959c0] oops_end at ffffffff8152ca50
#3 [ffff880435a959f0] no_context at ffffffff8104a00b
#4 [ffff880435a95a40] __bad_area_nosemaphore at ffffffff8104a295
#5 [ffff880435a95a90] bad_area_nosemaphore at ffffffff8104a363
#6 [ffff880435a95aa0] __do_page_fault at ffffffff8104aabf
#7 [ffff880435a95bc0] do_page_fault at ffffffff8152e99e
#8 [ffff880435a95bf0] page_fault at ffffffff8152bd55
[exception RIP: scsi_send_eh_cmnd+99]
RIP: ffffffff813860e3 RSP: ffff880435a95ca0 RFLAGS: 00010286
RAX: 0000000000000000 RBX: ffff880c2d600ec0 RCX: 0000000000002710
RDX: ffff880c3002f000 RSI: ffffffff82017288 RDI: ffff880c2d600ec0
RBP: ffff880435a95da0 R8: 0000000000000000 R9: 0000000000000000
R10: 000d8f6a631f7b23 R11: 0000000000000001 R12: 0000000000000001
R13: ffff880435a95e90 R14: 0000000000000000 R15: 0000000000000000
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#9 [ffff880435a95da8] scsi_eh_tur at ffffffff81386672
#10 [ffff880435a95dd8] scsi_eh_test_devices at ffffffff8138675a
#11 [ffff880435a95e28] scsi_error_handler at ffffffff81387d4c
#12 [ffff880435a95ee8] kthread at ffffffff8109abf6
#13 [ffff880435a95f48] kernel_thread at ffffffff8100c20a
crash> gdb set disassemble-flavor intel
crash> dis scsi_send_eh_cmnd
0xffffffff81386080 <scsi_send_eh_cmnd>: push rbp
0xffffffff81386081 <scsi_send_eh_cmnd+1>: mov rbp,rsp
0xffffffff81386084 <scsi_send_eh_cmnd+4>: push r15
0xffffffff81386086 <scsi_send_eh_cmnd+6>: push r14
0xffffffff81386088 <scsi_send_eh_cmnd+8>: push r13
0xffffffff8138608a <scsi_send_eh_cmnd+10>: push r12
0xffffffff8138608c <scsi_send_eh_cmnd+12>: push rbx
0xffffffff8138608d <scsi_send_eh_cmnd+13>: sub rsp,0xd8
0xffffffff81386094 <scsi_send_eh_cmnd+20>: nop DWORD PTR [rax+rax*1+0x0]
0xffffffff81386099 <scsi_send_eh_cmnd+25>: mov rax,QWORD PTR gs:0x28
0xffffffff813860a2 <scsi_send_eh_cmnd+34>: mov QWORD PTR [rbp-0x38],rax
0xffffffff813860a6 <scsi_send_eh_cmnd+38>: xor eax,eax
0xffffffff813860a8 <scsi_send_eh_cmnd+40>: mov QWORD PTR [rbp-0xc8],rsi
0xffffffff813860af <scsi_send_eh_cmnd+47>: mov DWORD PTR [rbp-0xcc],edx
0xffffffff813860b5 <scsi_send_eh_cmnd+53>: mov rbx,rdi
0xffffffff813860b8 <scsi_send_eh_cmnd+56>: mov rax,QWORD PTR [rdi+0x80]
crash> rd -o 0x80 0xffff880c2d600ec0
ffff880c2d600f40: ffff880c372afd00
0xffffffff813860bf <scsi_send_eh_cmnd+63>: mov rdx,QWORD PTR [rdi]
0xffffffff813860c2 <scsi_send_eh_cmnd+66>: mov r14d,r8d
0xffffffff813860c5 <scsi_send_eh_cmnd+69>: mov rax,QWORD PTR [rax+0xb0]
crash> rd -64 -o 0xb0 ffff880c372afd00
ffff880c372afdb0: ffff880c2a4d0400
0xffffffff813860cc <scsi_send_eh_cmnd+76>: mov QWORD PTR [rbp-0xe8],0x0
0xffffffff813860d7 <scsi_send_eh_cmnd+87>: test rax,rax
0xffffffff813860da <scsi_send_eh_cmnd+90>: je 0xffffffff813860ed <scsi_send_eh_cmnd+109>
0xffffffff813860dc <scsi_send_eh_cmnd+92>: mov rax,QWORD PTR [rax+0x2c8]
crash> rd -64 -o 0x2c8 ffff880c2a4d0400
ffff880c2a4d06c8: 0000000000000000
0xffffffff813860e3 <scsi_send_eh_cmnd+99>: mov rax,QWORD PTR [rax]
static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd) {
if (!cmd->request->rq_disk)
return NULL;
return *(struct scsi_driver **)cmd->request->rq_disk->private_data;
}
if (!cmd->request->rq_disk)
crash> struct scsi_cmnd
struct scsi_cmnd {
…
unsigned int transfersize;
struct request *request;
unsigned char *sense_buffer;
…
}
crash> struct request
struct request {
…
struct gendisk *rq_disk;
…
}
crash> struct -xo gendisk
struct gendisk {
…
[0x2c0] struct request_queue *queue;
[0x2c8] void *private_data;
[0x2d0] int flags;
…
}
ffff880c2d600ec0 = scsi_cmnd
ffff880c372afd00 = request
ffff880c2a4d0400 = gendisk
0xffffffff813860dc <scsi_send_eh_cmnd+92>: mov rax,QWORD PTR [rax+0x2c8]
crash> struct gendisk.disk_name ffff880c2a4d0400
disk_name = "sg96000000000000...000"
addr : 0xffff880c3002f000
crash> scsi_device.vendor 0xffff880c3002f000
vendor = 0xffff880c2a3a2ac8 "QUANTUM Scalar i6000 656Q656Q.GS01501 001"
crash> scsi_device.model 0xffff880c3002f000
model = 0xffff880c2a3a2ad0 "Scalar i6000 656Q656Q.GS01501 001"
crash> scsi_device.rev 0xffff880c3002f000
rev = 0xffff880c2a3a2ae0 "656Q656Q.GS01501 001"
/* Return-probe handler: force return value to be 1. */
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
#if defined(__i386__) && !defined(__KERNEL__)
regs->eax = 1;
#else
regs->ax = 1;
#endif
return 0;
}
crash> net
NET_DEVICE NAME IP ADDRESS(ES)
ffff88003e999020 lo 127.0.0.1
ffff88003e228020 eth0 192.168.122.13
crash> struct net_device.mtu
ffff88003e228020
mtu = 1500
crash> struct -o net_device.mtu
ffff88003e228020
struct net_device {
[ffff88003e22818c] unsigned int mtu;
}
crash> rd -32 -D ffff88003e22818c
ffff88003e22818c: 1500
crash> wr -32 ffff88003e22818c 1400
[root@centos6 ~]# ifconfig eth0 | grep -Po 'MTU:[0-9]+'
MTU:1400
Kernel crashdump
Kernel crashdump
Kernel crashdump
Kernel crashdump
Kernel crashdump
Kernel crashdump
Kernel crashdump

Kernel crashdump

  • 4.
    virsh dump MyGuestName /storage/MyGuestName.dump dump-guest-memory [-z|-l|-s] FILENAME dump-core domain-id filename
  • 6.
  • 7.
    echo c >/proc/sysrq-trigger ipmitool power diag virsh inject-nmi MyGuestName - kernel.unknown_nmi_panic=1 nmi_watchdog=1 sysctl kernel.softlockup_panic=1 sysctl vm.panic_on_oom=1
  • 8.
  • 11.
  • 12.
    KERNEL: /var/crash/127.0.0.1-2015-08-20-20:00:00/vmcore DUMPFILE: vmcore.myserver [PARTIAL DUMP] CPUS: 24 DATE: Mon Aug 20 20:00:00 2015 UPTIME: 32 days, 17:12:02 LOAD AVERAGE: 1625.88, 1603.11, 1509.73 TASKS: 25639 NODENAME: myserver RELEASE: 2.6.18-371.8.1.el5 VERSION: #1 SMP Fri Mar 28 05:53:58 EDT 2014 MACHINE: x86_64 (2933 Mhz) MEMORY: 284 GB PANIC: "Kernel panic - not syncing: An NMI occurred" PID: 61015 COMMAND: "java" TASK: ffff8135b50e5830 [THREAD_INFO: ffff8104bd256000] CPU: 0 STATE: TASK_RUNNING (PANIC)
  • 13.
    Let’s check for a real kernel bug KERNEL: /usr/lib/debug/lib/modules/2.6.32-431.29.2.el6.x86_64/vmlinux DUMPFILE: vmcore [PARTIAL DUMP] CPUS: 64 DATE: Wed Jun 14 11:23:14 2015 UPTIME: 44 days, 04:14:21 LOAD AVERAGE: 0.70, 0.58, 0.55 TASKS: 1917 NODENAME: myredhat65 RELEASE: 2.6.32-431.29.2.el6.x86_64 VERSION: #1 SMP Sun Jul 27 15:55:46 EDT 2014 MACHINE: x86_64 (1997 Mhz) MEMORY: 64 GB PANIC: "BUG: unable to handle kernel NULL pointer dereference at (null)" PID: 2120 COMMAND: "scsi_eh_6" TASK: ffff880437dcf540 [THREAD_INFO: ffff880435a94000] CPU: 50 STATE: TASK_RUNNING (PANIC)
  • 14.
    crash> bt PID: 2120 TASK: ffff880437dcf540 CPU: 50 COMMAND: "scsi_eh_6" #0 [ffff880435a95890] machine_kexec at ffffffff81038f3b #1 [ffff880435a958f0] crash_kexec at ffffffff810c5af2 #2 [ffff880435a959c0] oops_end at ffffffff8152ca50 #3 [ffff880435a959f0] no_context at ffffffff8104a00b #4 [ffff880435a95a40] __bad_area_nosemaphore at ffffffff8104a295 #5 [ffff880435a95a90] bad_area_nosemaphore at ffffffff8104a363 #6 [ffff880435a95aa0] __do_page_fault at ffffffff8104aabf #7 [ffff880435a95bc0] do_page_fault at ffffffff8152e99e #8 [ffff880435a95bf0] page_fault at ffffffff8152bd55 [exception RIP: scsi_send_eh_cmnd+99] RIP: ffffffff813860e3 RSP: ffff880435a95ca0 RFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff880c2d600ec0 RCX: 0000000000002710 RDX: ffff880c3002f000 RSI: ffffffff82017288 RDI: ffff880c2d600ec0 RBP: ffff880435a95da0 R8: 0000000000000000 R9: 0000000000000000 R10: 000d8f6a631f7b23 R11: 0000000000000001 R12: 0000000000000001 R13: ffff880435a95e90 R14: 0000000000000000 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #9 [ffff880435a95da8] scsi_eh_tur at ffffffff81386672 #10 [ffff880435a95dd8] scsi_eh_test_devices at ffffffff8138675a #11 [ffff880435a95e28] scsi_error_handler at ffffffff81387d4c #12 [ffff880435a95ee8] kthread at ffffffff8109abf6 #13 [ffff880435a95f48] kernel_thread at ffffffff8100c20a crash> gdb set disassemble-flavor intel crash> dis scsi_send_eh_cmnd 0xffffffff81386080 <scsi_send_eh_cmnd>: push rbp 0xffffffff81386081 <scsi_send_eh_cmnd+1>: mov rbp,rsp 0xffffffff81386084 <scsi_send_eh_cmnd+4>: push r15 0xffffffff81386086 <scsi_send_eh_cmnd+6>: push r14 0xffffffff81386088 <scsi_send_eh_cmnd+8>: push r13 0xffffffff8138608a <scsi_send_eh_cmnd+10>: push r12 0xffffffff8138608c <scsi_send_eh_cmnd+12>: push rbx 0xffffffff8138608d <scsi_send_eh_cmnd+13>: sub rsp,0xd8 0xffffffff81386094 <scsi_send_eh_cmnd+20>: nop DWORD PTR [rax+rax*1+0x0] 0xffffffff81386099 <scsi_send_eh_cmnd+25>: mov rax,QWORD PTR gs:0x28 0xffffffff813860a2
<scsi_send_eh_cmnd+34>: mov QWORD PTR [rbp-0x38],rax 0xffffffff813860a6 <scsi_send_eh_cmnd+38>: xor eax,eax 0xffffffff813860a8 <scsi_send_eh_cmnd+40>: mov QWORD PTR [rbp-0xc8],rsi 0xffffffff813860af <scsi_send_eh_cmnd+47>: mov DWORD PTR [rbp-0xcc],edx 0xffffffff813860b5 <scsi_send_eh_cmnd+53>: mov rbx,rdi 0xffffffff813860b8 <scsi_send_eh_cmnd+56>: mov rax,QWORD PTR [rdi+0x80] crash> rd -o 0x80 0xffff880c2d600ec0 ffff880c2d600f40: ffff880c372afd00 0xffffffff813860bf <scsi_send_eh_cmnd+63>: mov rdx,QWORD PTR [rdi] 0xffffffff813860c2 <scsi_send_eh_cmnd+66>: mov r14d,r8d 0xffffffff813860c5 <scsi_send_eh_cmnd+69>: mov rax,QWORD PTR [rax+0xb0] crash> rd -64 -o 0xb0 ffff880c372afd00 ffff880c372afdb0: ffff880c2a4d0400 0xffffffff813860cc <scsi_send_eh_cmnd+76>: mov QWORD PTR [rbp-0xe8],0x0 0xffffffff813860d7 <scsi_send_eh_cmnd+87>: test rax,rax 0xffffffff813860da <scsi_send_eh_cmnd+90>: je 0xffffffff813860ed <scsi_send_eh_cmnd+109> 0xffffffff813860dc <scsi_send_eh_cmnd+92>: mov rax,QWORD PTR [rax+0x2c8] crash> rd -64 -o 0x2c8 ffff880c2a4d0400 ffff880c2a4d06c8: 0000000000000000 0xffffffff813860e3 <scsi_send_eh_cmnd+99>: mov rax,QWORD PTR [rax] static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd) { if (!cmd->request->rq_disk) return NULL; return *(struct scsi_driver **)cmd->request->rq_disk->private_data; } if (!cmd->request->rq_disk)
  • 15.
    crash> struct scsi_cmnd struct scsi_cmnd { … unsigned int transfersize; struct request *request; unsigned char *sense_buffer; … } crash> struct request struct request { … struct gendisk *rq_disk; … } crash> struct -xo gendisk struct gendisk { … [0x2c0] struct request_queue *queue; [0x2c8] void *private_data; [0x2d0] int flags; … } ffff880c2d600ec0 = scsi_cmnd ffff880c372afd00 = request ffff880c2a4d0400 = gendisk 0xffffffff813860dc <scsi_send_eh_cmnd+92>: mov rax,QWORD PTR [rax+0x2c8] crash> struct gendisk.disk_name ffff880c2a4d0400 disk_name = "sg96000000000000...000" addr : 0xffff880c3002f000 crash> scsi_device.vendor 0xffff880c3002f000 vendor = 0xffff880c2a3a2ac8 "QUANTUM Scalar i6000 656Q656Q.GS01501 001" crash> scsi_device.model 0xffff880c3002f000 model = 0xffff880c2a3a2ad0 "Scalar i6000 656Q656Q.GS01501 001" crash> scsi_device.rev 0xffff880c3002f000 rev = 0xffff880c2a3a2ae0 "656Q656Q.GS01501 001"
  • 17.
    /* Return-probe handler: force return value to be 1. */ static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { #if defined(__i386__) && !defined(__KERNEL__) regs->eax = 1; #else regs->ax = 1; #endif return 0; }
  • 18.
    crash> net NET_DEVICE NAME IP ADDRESS(ES) ffff88003e999020 lo 127.0.0.1 ffff88003e228020 eth0 192.168.122.13 crash> struct net_device.mtu ffff88003e228020 mtu = 1500 crash> struct -o net_device.mtu ffff88003e228020 struct net_device { [ffff88003e22818c] unsigned int mtu; } crash> rd -32 -D ffff88003e22818c ffff88003e22818c: 1500 crash> wr -32 ffff88003e22818c 1400 [root@centos6 ~]# ifconfig eth0 | grep -Po 'MTU:[0-9]+' MTU:1400