diff options
-rw-r--r-- | Documentation/watchdog/hpwdt.txt | 84 | ||||
-rw-r--r-- | drivers/watchdog/hpwdt.c | 59 |
2 files changed, 128 insertions, 15 deletions
diff --git a/Documentation/watchdog/hpwdt.txt b/Documentation/watchdog/hpwdt.txt new file mode 100644 index 0000000..127839e --- /dev/null +++ b/Documentation/watchdog/hpwdt.txt @@ -0,0 +1,84 @@ +Last reviewed: 06/02/2009 + + HP iLO2 NMI Watchdog Driver + NMI sourcing for iLO2 based ProLiant Servers + Documentation and Driver by + Thomas Mingarelli <thomas.mingarelli@hp.com> + + The HP iLO2 NMI Watchdog driver is a kernel module that provides basic + watchdog functionality and the added benefit of NMI sourcing. Both the + watchdog functionality and the NMI sourcing capability need to be enabled + by the user. Remember that the two modes are not dependent on one another. + A user can have the NMI sourcing without the watchdog timer and vice-versa. + + Watchdog functionality is enabled like any other common watchdog driver. That + is, an application needs to be started that kicks off the watchdog timer. A + basic application exists in the Documentation/watchdog/src directory called + watchdog-test.c. Simply compile the C file and kick it off. If the system + gets into a bad state and hangs, the HP ProLiant iLO 2 timer register will + not be updated in a timely fashion and a hardware system reset (also known as + an Automatic Server Recovery (ASR)) event will occur. + + The hpwdt driver also has three (3) module parameters. They are the following: + + soft_margin - allows the user to set the watchdog timer value + allow_kdump - allows the user to save off a kernel dump image after an NMI + nowayout - basic watchdog parameter that does not allow the timer to + be restarted or an impending ASR to be escaped. + + NOTE: More information about watchdog drivers in general, including the ioctl + interface to /dev/watchdog can be found in + Documentation/watchdog/watchdog-api.txt and Documentation/IPMI.txt. + + The NMI sourcing capability is disabled when the driver discovers that the + nmi_watchdog is turned on (nmi_watchdog = 1). 
This is due to the inability to + distinguish between "NMI Watchdog Ticks" and "HW generated NMI events" in the + Linux kernel. What this means is that the hpwdt nmi handler code is called + each time the NMI signal fires off. This could amount to several thousands of + NMIs in a matter of seconds. If a user sees the Linux kernel's "dazed and + confused" message in the logs or if the system gets into a hung state, then + the user should reboot with nmi_watchdog=0. + + 1. If the kernel has not been booted with nmi_watchdog turned off then + edit /boot/grub/menu.lst and place the nmi_watchdog=0 at the end of the + currently booting kernel line. + 2. reboot the server + + Now, the hpwdt can successfully receive and source the NMI and provide a log + message that details the reason for the NMI (as determined by the HP BIOS). + + Below is a list of NMIs the HP BIOS understands along with the associated + code (reason): + + No source found 00h + + Uncorrectable Memory Error 01h + + ASR NMI 1Bh + + PCI Parity Error 20h + + NMI Button Press 27h + + SB_BUS_NMI 28h + + ILO Doorbell NMI 29h + + ILO IOP NMI 2Ah + + ILO Watchdog NMI 2Bh + + Proc Throt NMI 2Ch + + Front Side Bus NMI 2Dh + + PCI Express Error 2Fh + + DMA controller NMI 30h + + Hypertransport/CSI Error 31h + + + + -- Tom Mingarelli + (thomas.mingarelli@hp.com) diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index 3137361..c0b9169 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -19,6 +19,7 @@ #include <linux/interrupt.h> #include <linux/io.h> #include <linux/irq.h> +#include <linux/nmi.h> #include <linux/kernel.h> #include <linux/miscdevice.h> #include <linux/mm.h> @@ -47,7 +48,7 @@ #define PCI_BIOS32_PARAGRAPH_LEN 16 #define PCI_ROM_BASE1 0x000F0000 #define ROM_SIZE 0x10000 -#define HPWDT_VERSION "1.01" +#define HPWDT_VERSION "1.1.1" struct bios32_service_dir { u32 signature; @@ -119,6 +120,7 @@ static int nowayout = WATCHDOG_NOWAYOUT; static char expect_release; static 
unsigned long hpwdt_is_open; static unsigned int allow_kdump; +static int hpwdt_nmi_sourcing; static void __iomem *pci_mem_addr; /* the PCI-memory address */ static unsigned long __iomem *hpwdt_timer_reg; @@ -468,21 +470,22 @@ static int hpwdt_pretimeout(struct notifier_block *nb, unsigned long ulReason, if (ulReason != DIE_NMI && ulReason != DIE_NMI_IPI) return NOTIFY_OK; - spin_lock_irqsave(&rom_lock, rom_pl); - if (!die_nmi_called) - asminline_call(&cmn_regs, cru_rom_addr); - die_nmi_called = 1; - spin_unlock_irqrestore(&rom_lock, rom_pl); - if (cmn_regs.u1.ral == 0) { - printk(KERN_WARNING "hpwdt: An NMI occurred, " - "but unable to determine source.\n"); - } else { - if (allow_kdump) - hpwdt_stop(); - panic("An NMI occurred, please see the Integrated " - "Management Log for details.\n"); + if (hpwdt_nmi_sourcing) { + spin_lock_irqsave(&rom_lock, rom_pl); + if (!die_nmi_called) + asminline_call(&cmn_regs, cru_rom_addr); + die_nmi_called = 1; + spin_unlock_irqrestore(&rom_lock, rom_pl); + if (cmn_regs.u1.ral == 0) { + printk(KERN_WARNING "hpwdt: An NMI occurred, " + "but unable to determine source.\n"); + } else { + if (allow_kdump) + hpwdt_stop(); + panic("An NMI occurred, please see the Integrated " + "Management Log for details.\n"); + } } - return NOTIFY_OK; } @@ -627,12 +630,38 @@ static struct notifier_block die_notifier = { * Init & Exit */ +#ifdef ARCH_HAS_NMI_WATCHDOG +static void __devinit hpwdt_check_nmi_sourcing(struct pci_dev *dev) +{ + /* + * If nmi_watchdog is turned off then we can turn on + * our nmi sourcing capability. + */ + if (!nmi_watchdog_active()) + hpwdt_nmi_sourcing = 1; + else + dev_warn(&dev->dev, "NMI sourcing is disabled. To enable this " + "functionality you must reboot with nmi_watchdog=0.\n"); +} +#else +static void __devinit hpwdt_check_nmi_sourcing(struct pci_dev *dev) +{ + dev_warn(&dev->dev, "NMI sourcing is disabled. 
" + "Your kernel does not support a NMI Watchdog.\n"); +} +#endif + static int __devinit hpwdt_init_one(struct pci_dev *dev, const struct pci_device_id *ent) { int retval; /* + * Check if we can do NMI sourcing or not + */ + hpwdt_check_nmi_sourcing(dev); + + /* * First let's find out if we are on an iLO2 server. We will * not run on a legacy ASM box. * So we only support the G5 ProLiant servers and higher. |