diff options
471 files changed, 13280 insertions, 10998 deletions
@@ -1653,14 +1653,14 @@ S: Chapel Hill, North Carolina 27514-4818 S: USA N: Dave Jones -E: davej@codemonkey.org.uk +E: davej@redhat.com W: http://www.codemonkey.org.uk -D: x86 errata/setup maintenance. -D: AGPGART driver. +D: Assorted VIA x86 support. +D: 2.5 AGPGART overhaul. D: CPUFREQ maintenance. -D: Backport/Forwardport merge monkey. -D: Various Janitor work. -S: United Kingdom +D: Fedora kernel maintainence. +D: Misc/Other. +S: 314 Littleton Rd, Westford, MA 01886, USA N: Martin Josfsson E: gandalf@wlug.westbo.se diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl index 4c63e58..ae15d55 100644 --- a/Documentation/DocBook/kernel-hacking.tmpl +++ b/Documentation/DocBook/kernel-hacking.tmpl @@ -1105,7 +1105,7 @@ static struct block_device_operations opt_fops = { </listitem> <listitem> <para> - Function names as strings (__FUNCTION__). + Function names as strings (__func__). </para> </listitem> <listitem> diff --git a/Documentation/MSI-HOWTO.txt b/Documentation/MSI-HOWTO.txt index a51f693..256defd 100644 --- a/Documentation/MSI-HOWTO.txt +++ b/Documentation/MSI-HOWTO.txt @@ -236,10 +236,8 @@ software system can set different pages for controlling accesses to the MSI-X structure. The implementation of MSI support requires the PCI subsystem, not a device driver, to maintain full control of the MSI-X table/MSI-X PBA (Pending Bit Array) and MMIO address space of the MSI-X -table/MSI-X PBA. A device driver is prohibited from requesting the MMIO -address space of the MSI-X table/MSI-X PBA. Otherwise, the PCI subsystem -will fail enabling MSI-X on its hardware device when it calls the function -pci_enable_msix(). +table/MSI-X PBA. A device driver should not access the MMIO address +space of the MSI-X table/MSI-X PBA. 5.3.2 API pci_enable_msix diff --git a/Documentation/PCI/pci.txt b/Documentation/PCI/pci.txt index 8d4dc62..fd4907a 100644 --- a/Documentation/PCI/pci.txt +++ b/Documentation/PCI/pci.txt @@ -163,6 +163,10 @@ need pass only as many optional fields as necessary: o class and classmask fields default to 0 o driver_data defaults to 0UL. +Note that driver_data must match the value used by any of the pci_device_id +entries defined in the driver. This makes the driver_data field mandatory +if all the pci_device_id entries have a non-zero driver_data value. + Once added, the driver probe routine will be invoked for any unclaimed PCI devices listed in its (newly updated) pci_ids list. diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt index 16c2512..ddeb14b 100644 --- a/Documentation/PCI/pcieaer-howto.txt +++ b/Documentation/PCI/pcieaer-howto.txt @@ -203,22 +203,17 @@ to mmio_enabled. 3.3 helper functions -3.3.1 int pci_find_aer_capability(struct pci_dev *dev); -pci_find_aer_capability locates the PCI Express AER capability -in the device configuration space. If the device doesn't support -PCI-Express AER, the function returns 0. - -3.3.2 int pci_enable_pcie_error_reporting(struct pci_dev *dev); +3.3.1 int pci_enable_pcie_error_reporting(struct pci_dev *dev); pci_enable_pcie_error_reporting enables the device to send error messages to root port when an error is detected. Note that devices don't enable the error reporting by default, so device drivers need call this function to enable it. -3.3.3 int pci_disable_pcie_error_reporting(struct pci_dev *dev); +3.3.2 int pci_disable_pcie_error_reporting(struct pci_dev *dev); pci_disable_pcie_error_reporting disables the device to send error messages to root port when an error is detected. -3.3.4 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev); +3.3.3 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev); pci_cleanup_aer_uncorrect_error_status cleanups the uncorrectable error status register. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 0f1544f..53ba7c7 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -101,6 +101,7 @@ parameter is applicable: X86-64 X86-64 architecture is enabled. More X86-64 boot options can be found in Documentation/x86_64/boot-options.txt . + X86 Either 32bit or 64bit x86 (same as X86-32+X86-64) In addition, the following text indicates that the option: @@ -1588,7 +1589,7 @@ and is between 256 and 4096 characters. It is defined in the file See also Documentation/paride.txt. pci=option[,option...] [PCI] various PCI subsystem options: - off [X86-32] don't probe for the PCI bus + off [X86] don't probe for the PCI bus bios [X86-32] force use of PCI BIOS, don't access the hardware directly. Use this if your machine has a non-standard PCI host bridge. @@ -1596,9 +1597,9 @@ and is between 256 and 4096 characters. It is defined in the file hardware access methods are allowed. Use this if you experience crashes upon bootup and you suspect they are caused by the BIOS. - conf1 [X86-32] Force use of PCI Configuration + conf1 [X86] Force use of PCI Configuration Mechanism 1. - conf2 [X86-32] Force use of PCI Configuration + conf2 [X86] Force use of PCI Configuration Mechanism 2. noaer [PCIE] If the PCIEAER kernel config parameter is enabled, this kernel boot option can be used to @@ -1618,37 +1619,37 @@ and is between 256 and 4096 characters. It is defined in the file this option if the kernel is unable to allocate IRQs or discover secondary PCI buses on your motherboard. - rom [X86-32] Assign address space to expansion ROMs. + rom [X86] Assign address space to expansion ROMs. Use with caution as certain devices share address decoders between ROMs and other resources. - norom [X86-32,X86_64] Do not assign address space to + norom [X86] Do not assign address space to expansion ROMs that do not already have BIOS assigned address ranges. - irqmask=0xMMMM [X86-32] Set a bit mask of IRQs allowed to be + irqmask=0xMMMM [X86] Set a bit mask of IRQs allowed to be assigned automatically to PCI devices. You can make the kernel exclude IRQs of your ISA cards this way. - pirqaddr=0xAAAAA [X86-32] Specify the physical address + pirqaddr=0xAAAAA [X86] Specify the physical address of the PIRQ table (normally generated by the BIOS) if it is outside the F0000h-100000h range. - lastbus=N [X86-32] Scan all buses thru bus #N. Can be + lastbus=N [X86] Scan all buses thru bus #N. Can be useful if the kernel is unable to find your secondary buses and you want to tell it explicitly which ones they are. - assign-busses [X86-32] Always assign all PCI bus + assign-busses [X86] Always assign all PCI bus numbers ourselves, overriding whatever the firmware may have done. - usepirqmask [X86-32] Honor the possible IRQ mask stored + usepirqmask [X86] Honor the possible IRQ mask stored in the BIOS $PIR table. This is needed on some systems with broken BIOSes, notably some HP Pavilion N5400 and Omnibook XE3 notebooks. This will have no effect if ACPI IRQ routing is enabled. - noacpi [X86-32] Do not use ACPI for IRQ routing + noacpi [X86] Do not use ACPI for IRQ routing or for PCI scanning. - use_crs [X86-32] Use _CRS for PCI resource + use_crs [X86] Use _CRS for PCI resource allocation. routeirq Do IRQ routing for all PCI devices. This is normally done in pci_enable_device(), @@ -1677,6 +1678,12 @@ and is between 256 and 4096 characters. It is defined in the file reserved for the CardBus bridge's memory window. The default value is 64 megabytes. + pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power + Management. + off Disable ASPM. + force Enable ASPM even on devices that claim not to support it. + WARNING: Forcing ASPM on may cause system lockups. + pcmv= [HW,PCMCIA] BadgePAD 4 pd. [PARIDE] diff --git a/Documentation/markers.txt b/Documentation/markers.txt index d9f50a1..089f613 100644 --- a/Documentation/markers.txt +++ b/Documentation/markers.txt @@ -50,10 +50,12 @@ Connecting a function (probe) to a marker is done by providing a probe (function to call) for the specific marker through marker_probe_register() and can be activated by calling marker_arm(). Marker deactivation can be done by calling marker_disarm() as many times as marker_arm() has been called. Removing a probe -is done through marker_probe_unregister(); it will disarm the probe and make -sure there is no caller left using the probe when it returns. Probe removal is -preempt-safe because preemption is disabled around the probe call. See the -"Probe example" section below for a sample probe module. +is done through marker_probe_unregister(); it will disarm the probe. +marker_synchronize_unregister() must be called before the end of the module exit +function to make sure there is no caller left using the probe. This, and the +fact that preemption is disabled around the probe call, make sure that probe +removal and module unload are safe. See the "Probe example" section below for a +sample probe module. The marker mechanism supports inserting multiple instances of the same marker. Markers can be put in inline functions, inlined static functions, and diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 49378a9..10a0263 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -95,8 +95,9 @@ On all - write a character to /proc/sysrq-trigger. e.g.: 'p' - Will dump the current registers and flags to your console. -'q' - Will dump a list of all running hrtimers. - WARNING: Does not cover any other timers +'q' - Will dump per CPU lists of all armed hrtimers (but NOT regular + timer_list timers) and detailed information about all + clockevent devices. 'r' - Turns off keyboard raw mode and sets it to XLATE. diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt new file mode 100644 index 0000000..5d354e1 --- /dev/null +++ b/Documentation/tracepoints.txt @@ -0,0 +1,101 @@ + Using the Linux Kernel Tracepoints + + Mathieu Desnoyers + + +This document introduces Linux Kernel Tracepoints and their use. It provides +examples of how to insert tracepoints in the kernel and connect probe functions +to them and provides some examples of probe functions. + + +* Purpose of tracepoints + +A tracepoint placed in code provides a hook to call a function (probe) that you +can provide at runtime. A tracepoint can be "on" (a probe is connected to it) or +"off" (no probe is attached). When a tracepoint is "off" it has no effect, +except for adding a tiny time penalty (checking a condition for a branch) and +space penalty (adding a few bytes for the function call at the end of the +instrumented function and adds a data structure in a separate section). When a +tracepoint is "on", the function you provide is called each time the tracepoint +is executed, in the execution context of the caller. When the function provided +ends its execution, it returns to the caller (continuing from the tracepoint +site). + +You can put tracepoints at important locations in the code. They are +lightweight hooks that can pass an arbitrary number of parameters, +which prototypes are described in a tracepoint declaration placed in a header +file. + +They can be used for tracing and performance accounting. + + +* Usage + +Two elements are required for tracepoints : + +- A tracepoint definition, placed in a header file. +- The tracepoint statement, in C code. + +In order to use tracepoints, you should include linux/tracepoint.h. + +In include/trace/subsys.h : + +#include <linux/tracepoint.h> + +DEFINE_TRACE(subsys_eventname, + TPPTOTO(int firstarg, struct task_struct *p), + TPARGS(firstarg, p)); + +In subsys/file.c (where the tracing statement must be added) : + +#include <trace/subsys.h> + +void somefct(void) +{ + ... + trace_subsys_eventname(arg, task); + ... +} + +Where : +- subsys_eventname is an identifier unique to your event + - subsys is the name of your subsystem. + - eventname is the name of the event to trace. +- TPPTOTO(int firstarg, struct task_struct *p) is the prototype of the function + called by this tracepoint. +- TPARGS(firstarg, p) are the parameters names, same as found in the prototype. + +Connecting a function (probe) to a tracepoint is done by providing a probe +(function to call) for the specific tracepoint through +register_trace_subsys_eventname(). Removing a probe is done through +unregister_trace_subsys_eventname(); it will remove the probe sure there is no +caller left using the probe when it returns. Probe removal is preempt-safe +because preemption is disabled around the probe call. See the "Probe example" +section below for a sample probe module. + +The tracepoint mechanism supports inserting multiple instances of the same +tracepoint, but a single definition must be made of a given tracepoint name over +all the kernel to make sure no type conflict will occur. Name mangling of the +tracepoints is done using the prototypes to make sure typing is correct. +Verification of probe type correctness is done at the registration site by the +compiler. Tracepoints can be put in inline functions, inlined static functions, +and unrolled loops as well as regular functions. + +The naming scheme "subsys_event" is suggested here as a convention intended +to limit collisions. Tracepoint names are global to the kernel: they are +considered as being the same whether they are in the core kernel image or in +modules. + + +* Probe / tracepoint example + +See the example provided in samples/tracepoints/src + +Compile them with your kernel. + +Run, as root : +modprobe tracepoint-example (insmod order is not important) +modprobe tracepoint-probe-example +cat /proc/tracepoint-example (returns an expected error) +rmmod tracepoint-example tracepoint-probe-example +dmesg diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/tracers/mmiotrace.txt index a4afb56..5bbbe20 100644 --- a/Documentation/tracers/mmiotrace.txt +++ b/Documentation/tracers/mmiotrace.txt @@ -36,7 +36,7 @@ $ mount -t debugfs debugfs /debug $ echo mmiotrace > /debug/tracing/current_tracer $ cat /debug/tracing/trace_pipe > mydump.txt & Start X or whatever. -$ echo "X is up" > /debug/tracing/marker +$ echo "X is up" > /debug/tracing/trace_marker $ echo none > /debug/tracing/current_tracer Check for lost events. @@ -59,9 +59,8 @@ The 'cat' process should stay running (sleeping) in the background. Load the driver you want to trace and use it. Mmiotrace will only catch MMIO accesses to areas that are ioremapped while mmiotrace is active. -[Unimplemented feature:] During tracing you can place comments (markers) into the trace by -$ echo "X is up" > /debug/tracing/marker +$ echo "X is up" > /debug/tracing/trace_marker This makes it easier to see which part of the (huge) trace corresponds to which action. It is recommended to place descriptive markers about what you do. diff --git a/MAINTAINERS b/MAINTAINERS index 22303e5..6d51f00 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1198,7 +1198,7 @@ S: Maintained CPU FREQUENCY DRIVERS P: Dave Jones -M: davej@codemonkey.org.uk +M: davej@redhat.com L: cpufreq@vger.kernel.org W: http://www.codemonkey.org.uk/projects/cpufreq/ T: git kernel.org/pub/scm/linux/kernel/git/davej/cpufreq.git diff --git a/arch/alpha/kernel/sys_sable.c b/arch/alpha/kernel/sys_sable.c index 99a7f19..a4555f4 100644 --- a/arch/alpha/kernel/sys_sable.c +++ b/arch/alpha/kernel/sys_sable.c @@ -47,7 +47,7 @@ typedef struct irq_swizzle_struct static irq_swizzle_t *sable_lynx_irq_swizzle; -static void sable_lynx_init_irq(int nr_irqs); +static void sable_lynx_init_irq(int nr_of_irqs); #if defined(CONFIG_ALPHA_GENERIC) || defined(CONFIG_ALPHA_SABLE) @@ -530,11 +530,11 @@ sable_lynx_srm_device_interrupt(unsigned long vector) } static void __init -sable_lynx_init_irq(int nr_irqs) +sable_lynx_init_irq(int nr_of_irqs) { long i; - for (i = 0; i < nr_irqs; ++i) { + for (i = 0; i < nr_of_irqs; ++i) { irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL; irq_desc[i].chip = &sable_lynx_irq_type; } diff --git a/arch/arm/mach-iop13xx/include/mach/time.h b/arch/arm/mach-iop13xx/include/mach/time.h index 49213d9..d6d5252 100644 --- a/arch/arm/mach-iop13xx/include/mach/time.h +++ b/arch/arm/mach-iop13xx/include/mach/time.h @@ -41,7 +41,7 @@ static inline unsigned long iop13xx_core_freq(void) return 1200000000; default: printk("%s: warning unknown frequency, defaulting to 800Mhz\n", - __FUNCTION__); + __func__); } return 800000000; @@ -60,7 +60,7 @@ static inline unsigned long iop13xx_xsi_bus_ratio(void) return 4; default: printk("%s: warning unknown ratio, defaulting to 2\n", - __FUNCTION__); + __func__); } return 2; diff --git a/arch/arm/mach-ixp2000/ixdp2x00.c b/arch/arm/mach-ixp2000/ixdp2x00.c index b0653a8..3045130 100644 --- a/arch/arm/mach-ixp2000/ixdp2x00.c +++ b/arch/arm/mach-ixp2000/ixdp2x00.c @@ -143,7 +143,7 @@ static struct irq_chip ixdp2x00_cpld_irq_chip = { .unmask = ixdp2x00_irq_unmask }; -void __init ixdp2x00_init_irq(volatile unsigned long *stat_reg, volatile unsigned long *mask_reg, unsigned long nr_irqs) +void __init ixdp2x00_init_irq(volatile unsigned long *stat_reg, volatile unsigned long *mask_reg, unsigned long nr_of_irqs) { unsigned int irq; @@ -154,7 +154,7 @@ void __init ixdp2x00_init_irq(volatile unsigned long *stat_reg, volatile unsigne board_irq_stat = stat_reg; board_irq_mask = mask_reg; - board_irq_count = nr_irqs; + board_irq_count = nr_of_irqs; *board_irq_mask = 0xffffffff; diff --git a/arch/arm/mach-omap2/irq.c b/arch/arm/mach-omap2/irq.c index d354e0f..c40fc37 100644 --- a/arch/arm/mach-omap2/irq.c +++ b/arch/arm/mach-omap2/irq.c @@ -119,7 +119,7 @@ static void __init omap_irq_bank_init_one(struct omap_irq_bank *bank) void __init omap_init_irq(void) { - unsigned long nr_irqs = 0; + unsigned long nr_of_irqs = 0; unsigned int nr_banks = 0; int i; @@ -133,14 +133,14 @@ void __init omap_init_irq(void) omap_irq_bank_init_one(bank); - nr_irqs += bank->nr_irqs; + nr_of_irqs += bank->nr_irqs; nr_banks++; } printk(KERN_INFO "Total of %ld interrupts on %d active controller%s\n", - nr_irqs, nr_banks, nr_banks > 1 ? "s" : ""); + nr_of_irqs, nr_banks, nr_banks > 1 ? "s" : ""); - for (i = 0; i < nr_irqs; i++) { + for (i = 0; i < nr_of_irqs; i++) { set_irq_chip(i, &omap_irq_chip); set_irq_handler(i, handle_level_irq); set_irq_flags(i, IRQF_VALID); diff --git a/arch/arm/mach-pxa/include/mach/zylonite.h b/arch/arm/mach-pxa/include/mach/zylonite.h index 0d35ca0..bf6785a 100644 --- a/arch/arm/mach-pxa/include/mach/zylonite.h +++ b/arch/arm/mach-pxa/include/mach/zylonite.h @@ -30,7 +30,7 @@ extern void zylonite_pxa300_init(void); static inline void zylonite_pxa300_init(void) { if (cpu_is_pxa300() || cpu_is_pxa310()) - panic("%s: PXA300/PXA310 not supported\n", __FUNCTION__); + panic("%s: PXA300/PXA310 not supported\n", __func__); } #endif @@ -40,7 +40,7 @@ extern void zylonite_pxa320_init(void); static inline void zylonite_pxa320_init(void) { if (cpu_is_pxa320()) - panic("%s: PXA320 not supported\n", __FUNCTION__); + panic("%s: PXA320 not supported\n", __func__); } #endif diff --git a/arch/arm/mach-sa1100/include/mach/ide.h b/arch/arm/mach-sa1100/include/mach/ide.h deleted file mode 100644 index 4c99c8f..0000000 --- a/arch/arm/mach-sa1100/include/mach/ide.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * arch/arm/mach-sa1100/include/mach/ide.h - * - * Copyright (c) 1998 Hugo Fiennes & Nicolas Pitre - * - * 18-aug-2000: Cleanup by Erik Mouw (J.A.K.Mouw@its.tudelft.nl) - * Get rid of the special ide_init_hwif_ports() functions - * and make a generalised function that can be used by all - * architectures. - */ - -#include <asm/irq.h> -#include <mach/hardware.h> -#include <asm/mach-types.h> - -#error "This code is broken and needs update to match with current ide support" - - -/* - * Set up a hw structure for a specified data port, control port and IRQ. - * This should follow whatever the default interface uses. - */ -static inline void ide_init_hwif_ports(hw_regs_t *hw, unsigned long data_port, - unsigned long ctrl_port, int *irq) -{ - unsigned long reg = data_port; - int i; - int regincr = 1; - - /* The Empeg board has the first two address lines unused */ - if (machine_is_empeg()) - regincr = 1 << 2; - - /* The LART doesn't use A0 for IDE */ - if (machine_is_lart()) - regincr = 1 << 1; - - memset(hw, 0, sizeof(*hw)); - - for (i = 0; i <= 7; i++) { - hw->io_ports_array[i] = reg; - reg += regincr; - } - - hw->io_ports.ctl_addr = ctrl_port; - - if (irq) - *irq = 0; -} - -/* - * This registers the standard ports for this architecture with the IDE - * driver. - */ -static __inline__ void -ide_init_default_hwifs(void) -{ - if (machine_is_lart()) { -#ifdef CONFIG_SA1100_LART - hw_regs_t hw; - - /* Enable GPIO as interrupt line */ - GPDR &= ~LART_GPIO_IDE; - set_irq_type(LART_IRQ_IDE, IRQ_TYPE_EDGE_RISING); - - /* set PCMCIA interface timing */ - MECR = 0x00060006; - - /* init the interface */ - ide_init_hwif_ports(&hw, PCMCIA_IO_0_BASE + 0x0000, PCMCIA_IO_0_BASE + 0x1000, NULL); - hw.irq = LART_IRQ_IDE; - ide_register_hw(&hw); -#endif - } -} diff --git a/arch/avr32/mach-at32ap/extint.c b/arch/avr32/mach-at32ap/extint.c index c36a6d5..310477b 100644 --- a/arch/avr32/mach-at32ap/extint.c +++ b/arch/avr32/mach-at32ap/extint.c @@ -191,7 +191,7 @@ static int __init eic_probe(struct platform_device *pdev) struct eic *eic; struct resource *regs; unsigned int i; - unsigned int nr_irqs; + unsigned int nr_of_irqs; unsigned int int_irq; int ret; u32 pattern; @@ -224,7 +224,7 @@ static int __init eic_probe(struct platform_device *pdev) eic_writel(eic, IDR, ~0UL); eic_writel(eic, MODE, ~0UL); pattern = eic_readl(eic, MODE); - nr_irqs = fls(pattern); + nr_of_irqs = fls(pattern); /* Trigger on low level unless overridden by driver */ eic_writel(eic, EDGE, 0UL); @@ -232,7 +232,7 @@ static int __init eic_probe(struct platform_device *pdev) eic->chip = &eic_chip; - for (i = 0; i < nr_irqs; i++) { + for (i = 0; i < nr_of_irqs; i++) { set_irq_chip_and_handler(eic->first_irq + i, &eic_chip, handle_level_irq); set_irq_chip_data(eic->first_irq + i, eic); @@ -256,7 +256,7 @@ static int __init eic_probe(struct platform_device *pdev) eic->regs, int_irq); dev_info(&pdev->dev, "Handling %u external IRQs, starting with IRQ %u\n", - nr_irqs, eic->first_irq); + nr_of_irqs, eic->first_irq); return 0; diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h index 0149097..ce342fb 100644 --- a/arch/ia64/include/asm/pci.h +++ b/arch/ia64/include/asm/pci.h @@ -95,16 +95,8 @@ extern int pci_mmap_page_range (struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine); #define HAVE_PCI_LEGACY extern int pci_mmap_legacy_page_range(struct pci_bus *bus, - struct vm_area_struct *vma); -extern ssize_t pci_read_legacy_io(struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t count); -extern ssize_t pci_write_legacy_io(struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t count); -extern int pci_mmap_legacy_mem(struct kobject *kobj, - struct bin_attribute *attr, - struct vm_area_struct *vma); + struct vm_area_struct *vma, + enum pci_mmap_state mmap_state); #define pci_get_legacy_mem platform_pci_get_legacy_mem #define pci_legacy_read platform_pci_legacy_read diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index 7545037..211fcfd 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -614,12 +614,17 @@ char *ia64_pci_get_legacy_mem(struct pci_bus *bus) * vector to get the base address. */ int -pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma) +pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state) { unsigned long size = vma->vm_end - vma->vm_start; pgprot_t prot; char *addr; + /* We only support mmap'ing of legacy memory space */ + if (mmap_state != pci_mmap_mem) + return -ENOSYS; + /* * Avoid attribute aliasing. See Documentation/ia64/aliasing.txt * for more details. diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c index fc29948..39cb6da 100644 --- a/arch/m32r/kernel/smpboot.c +++ b/arch/m32r/kernel/smpboot.c @@ -40,6 +40,7 @@ */ #include <linux/module.h> +#include <linux/cpu.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 2bd1f6e..644a70b 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -9,6 +9,8 @@ config PARISC def_bool y select HAVE_IDE select HAVE_OPROFILE + select RTC_CLASS + select RTC_DRV_PARISC help The PA-RISC microprocessor is designed by Hewlett-Packard and used in many of their workstations & servers (HP9000 700 and 800 series, diff --git a/include/asm-parisc/Kbuild b/arch/parisc/include/asm/Kbuild index f88b252..f88b252 100644 --- a/include/asm-parisc/Kbuild +++ b/arch/parisc/include/asm/Kbuild diff --git a/include/asm-parisc/agp.h b/arch/parisc/include/asm/agp.h index 9651660..9651660 100644 --- a/include/asm-parisc/agp.h +++ b/arch/parisc/include/asm/agp.h diff --git a/include/asm-parisc/asmregs.h b/arch/parisc/include/asm/asmregs.h index d93c646..d93c646 100644 --- a/include/asm-parisc/asmregs.h +++ b/arch/parisc/include/asm/asmregs.h diff --git a/include/asm-parisc/assembly.h b/arch/parisc/include/asm/assembly.h index ffb2088..ffb2088 100644 --- a/include/asm-parisc/assembly.h +++ b/arch/parisc/include/asm/assembly.h diff --git a/include/asm-parisc/atomic.h b/arch/parisc/include/asm/atomic.h index 57fcc4a..57fcc4a 100644 --- a/include/asm-parisc/atomic.h +++ b/arch/parisc/include/asm/atomic.h diff --git a/include/asm-parisc/auxvec.h b/arch/parisc/include/asm/auxvec.h index 9c3ac4b..9c3ac4b 100644 --- a/include/asm-parisc/auxvec.h +++ b/arch/parisc/include/asm/auxvec.h diff --git a/include/asm-parisc/bitops.h b/arch/parisc/include/asm/bitops.h index 7a6ea10..7a6ea10 100644 --- a/include/asm-parisc/bitops.h +++ b/arch/parisc/include/asm/bitops.h diff --git a/include/asm-parisc/bug.h b/arch/parisc/include/asm/bug.h index 8cfc553..8cfc553 100644 --- a/include/asm-parisc/bug.h +++ b/arch/parisc/include/asm/bug.h diff --git a/include/asm-parisc/bugs.h b/arch/parisc/include/asm/bugs.h index 9e62843..9e62843 100644 --- a/include/asm-parisc/bugs.h +++ b/arch/parisc/include/asm/bugs.h diff --git a/include/asm-parisc/byteorder.h b/arch/parisc/include/asm/byteorder.h index db14831..db14831 100644 --- a/include/asm-parisc/byteorder.h +++ b/arch/parisc/include/asm/byteorder.h diff --git a/include/asm-parisc/cache.h b/arch/parisc/include/asm/cache.h index 32c2cca..32c2cca 100644 --- a/include/asm-parisc/cache.h +++ b/arch/parisc/include/asm/cache.h diff --git a/include/asm-parisc/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index b7ca6dc..b7ca6dc 100644 --- a/include/asm-parisc/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h diff --git a/include/asm-parisc/checksum.h b/arch/parisc/include/asm/checksum.h index e9639cc..e9639cc 100644 --- a/include/asm-parisc/checksum.h +++ b/arch/parisc/include/asm/checksum.h diff --git a/include/asm-parisc/compat.h b/arch/parisc/include/asm/compat.h index 7f32611..7f32611 100644 --- a/include/asm-parisc/compat.h +++ b/arch/parisc/include/asm/compat.h diff --git a/include/asm-parisc/compat_rt_sigframe.h b/arch/parisc/include/asm/compat_rt_sigframe.h index 81bec28..81bec28 100644 --- a/include/asm-parisc/compat_rt_sigframe.h +++ b/arch/parisc/include/asm/compat_rt_sigframe.h diff --git a/include/asm-parisc/compat_signal.h b/arch/parisc/include/asm/compat_signal.h index 6ad02c3..6ad02c3 100644 --- a/include/asm-parisc/compat_signal.h +++ b/arch/parisc/include/asm/compat_signal.h diff --git a/include/asm-parisc/compat_ucontext.h b/arch/parisc/include/asm/compat_ucontext.h index 2f7292a..2f7292a 100644 --- a/include/asm-parisc/compat_ucontext.h +++ b/arch/parisc/include/asm/compat_ucontext.h diff --git a/include/asm-parisc/cputime.h b/arch/parisc/include/asm/cputime.h index dcdf2fb..dcdf2fb 100644 --- a/include/asm-parisc/cputime.h +++ b/arch/parisc/include/asm/cputime.h diff --git a/include/asm-parisc/current.h b/arch/parisc/include/asm/current.h index 0fb9338..0fb9338 100644 --- a/include/asm-parisc/current.h +++ b/arch/parisc/include/asm/current.h diff --git a/include/asm-parisc/delay.h b/arch/parisc/include/asm/delay.h index 7a75e98..7a75e98 100644 --- a/include/asm-parisc/delay.h +++ b/arch/parisc/include/asm/delay.h diff --git a/include/asm-parisc/device.h b/arch/parisc/include/asm/device.h index d8f9872..d8f9872 100644 --- a/include/asm-parisc/device.h +++ b/arch/parisc/include/asm/device.h diff --git a/include/asm-parisc/div64.h b/arch/parisc/include/asm/div64.h index 6cd978c..6cd978c 100644 --- a/include/asm-parisc/div64.h +++ b/arch/parisc/include/asm/div64.h diff --git a/include/asm-parisc/dma-mapping.h b/arch/parisc/include/asm/dma-mapping.h index 53af696..53af696 100644 --- a/include/asm-parisc/dma-mapping.h +++ b/arch/parisc/include/asm/dma-mapping.h diff --git a/include/asm-parisc/dma.h b/arch/parisc/include/asm/dma.h index 31ad0f0..31ad0f0 100644 --- a/include/asm-parisc/dma.h +++ b/arch/parisc/include/asm/dma.h diff --git a/include/asm-parisc/eisa_bus.h b/arch/parisc/include/asm/eisa_bus.h index 201085f..201085f 100644 --- a/include/asm-parisc/eisa_bus.h +++ b/arch/parisc/include/asm/eisa_bus.h diff --git a/include/asm-parisc/eisa_eeprom.h b/arch/parisc/include/asm/eisa_eeprom.h index 9c9da98..9c9da98 100644 --- a/include/asm-parisc/eisa_eeprom.h +++ b/arch/parisc/include/asm/eisa_eeprom.h diff --git a/include/asm-parisc/elf.h b/arch/parisc/include/asm/elf.h index 7fa6757..7fa6757 100644 --- a/include/asm-parisc/elf.h +++ b/arch/parisc/include/asm/elf.h diff --git a/include/asm-parisc/emergency-restart.h b/arch/parisc/include/asm/emergency-restart.h index 108d8c4..108d8c4 100644 --- a/include/asm-parisc/emergency-restart.h +++ b/arch/parisc/include/asm/emergency-restart.h diff --git a/include/asm-parisc/errno.h b/arch/parisc/include/asm/errno.h index e2f3ddc..e2f3ddc 100644 --- a/include/asm-parisc/errno.h +++ b/arch/parisc/include/asm/errno.h diff --git a/include/asm-parisc/fb.h b/arch/parisc/include/asm/fb.h index 4d503a0..4d503a0 100644 --- a/include/asm-parisc/fb.h +++ b/arch/parisc/include/asm/fb.h diff --git a/include/asm-parisc/fcntl.h b/arch/parisc/include/asm/fcntl.h index 1e1c824..1e1c824 100644 --- a/include/asm-parisc/fcntl.h +++ b/arch/parisc/include/asm/fcntl.h diff --git a/include/asm-parisc/fixmap.h b/arch/parisc/include/asm/fixmap.h index de3fe3a..de3fe3a 100644 --- a/include/asm-parisc/fixmap.h +++ b/arch/parisc/include/asm/fixmap.h diff --git a/include/asm-parisc/floppy.h b/arch/parisc/include/asm/floppy.h index 4ca69f5..4ca69f5 100644 --- a/include/asm-parisc/floppy.h +++ b/arch/parisc/include/asm/floppy.h diff --git a/include/asm-parisc/futex.h b/arch/parisc/include/asm/futex.h index 0c705c3..0c705c3 100644 --- a/include/asm-parisc/futex.h +++ b/arch/parisc/include/asm/futex.h diff --git a/include/asm-parisc/grfioctl.h b/arch/parisc/include/asm/grfioctl.h index 671e060..671e060 100644 --- a/include/asm-parisc/grfioctl.h +++ b/arch/parisc/include/asm/grfioctl.h diff --git a/include/asm-parisc/hardirq.h b/arch/parisc/include/asm/hardirq.h index ce93133..ce93133 100644 --- a/include/asm-parisc/hardirq.h +++ b/arch/parisc/include/asm/hardirq.h diff --git a/include/asm-parisc/hardware.h b/arch/parisc/include/asm/hardware.h index 4e96268..4e96268 100644 --- a/include/asm-parisc/hardware.h +++ b/arch/parisc/include/asm/hardware.h diff --git a/include/asm-parisc/hw_irq.h b/arch/parisc/include/asm/hw_irq.h index 6707f7d..6707f7d 100644 --- a/include/asm-parisc/hw_irq.h +++ b/arch/parisc/include/asm/hw_irq.h diff --git a/include/asm-parisc/ide.h b/arch/parisc/include/asm/ide.h index c246ef7..81700a2 100644 --- a/include/asm-parisc/ide.h +++ b/arch/parisc/include/asm/ide.h @@ -13,10 +13,6 @@ #ifdef __KERNEL__ -#define ide_request_irq(irq,hand,flg,dev,id) request_irq((irq),(hand),(flg),(dev),(id)) -#define ide_free_irq(irq,dev_id) free_irq((irq), (dev_id)) -#define ide_request_region(from,extent,name) request_region((from), (extent), (name)) -#define ide_release_region(from,extent) release_region((from), (extent)) /* Generic I/O and MEMIO string operations. */ #define __ide_insw insw diff --git a/include/asm-parisc/io.h b/arch/parisc/include/asm/io.h index 55ddb18..55ddb18 100644 --- a/include/asm-parisc/io.h +++ b/arch/parisc/include/asm/io.h diff --git a/include/asm-parisc/ioctl.h b/arch/parisc/include/asm/ioctl.h index ec8efa0..ec8efa0 100644 --- a/include/asm-parisc/ioctl.h +++ b/arch/parisc/include/asm/ioctl.h diff --git a/include/asm-parisc/ioctls.h b/arch/parisc/include/asm/ioctls.h index 6747fad..6747fad 100644 --- a/include/asm-parisc/ioctls.h +++ b/arch/parisc/include/asm/ioctls.h diff --git a/include/asm-parisc/ipcbuf.h b/arch/parisc/include/asm/ipcbuf.h index bd956c4..bd956c4 100644 --- a/include/asm-parisc/ipcbuf.h +++ b/arch/parisc/include/asm/ipcbuf.h diff --git a/include/asm-parisc/irq.h b/arch/parisc/include/asm/irq.h index 399c819..399c819 100644 --- a/include/asm-parisc/irq.h +++ b/arch/parisc/include/asm/irq.h diff --git a/include/asm-parisc/irq_regs.h b/arch/parisc/include/asm/irq_regs.h index 3dd9c0b..3dd9c0b 100644 --- a/include/asm-parisc/irq_regs.h +++ b/arch/parisc/include/asm/irq_regs.h diff --git a/include/asm-parisc/kdebug.h b/arch/parisc/include/asm/kdebug.h index 6ece1b0..6ece1b0 100644 --- a/include/asm-parisc/kdebug.h +++ b/arch/parisc/include/asm/kdebug.h diff --git a/include/asm-parisc/kmap_types.h b/arch/parisc/include/asm/kmap_types.h index 806aae3..806aae3 100644 --- a/include/asm-parisc/kmap_types.h +++ b/arch/parisc/include/asm/kmap_types.h diff --git a/include/asm-parisc/led.h b/arch/parisc/include/asm/led.h index c3405ab..c3405ab 100644 --- a/include/asm-parisc/led.h +++ b/arch/parisc/include/asm/led.h diff --git a/include/asm-parisc/linkage.h b/arch/parisc/include/asm/linkage.h index 0b19a72..0b19a72 100644 --- a/include/asm-parisc/linkage.h +++ b/arch/parisc/include/asm/linkage.h diff --git a/include/asm-parisc/local.h b/arch/parisc/include/asm/local.h index c11c530..c11c530 100644 --- a/include/asm-parisc/local.h +++ b/arch/parisc/include/asm/local.h diff --git a/include/asm-parisc/machdep.h b/arch/parisc/include/asm/machdep.h index a231c97..a231c97 100644 --- a/include/asm-parisc/machdep.h +++ b/arch/parisc/include/asm/machdep.h diff --git a/include/asm-parisc/mc146818rtc.h b/arch/parisc/include/asm/mc146818rtc.h index adf4163..adf4163 100644 --- a/include/asm-parisc/mc146818rtc.h +++ b/arch/parisc/include/asm/mc146818rtc.h diff --git a/include/asm-parisc/mckinley.h b/arch/parisc/include/asm/mckinley.h index d1ea6f1..d1ea6f1 100644 --- a/include/asm-parisc/mckinley.h +++ b/arch/parisc/include/asm/mckinley.h diff --git a/include/asm-parisc/mman.h b/arch/parisc/include/asm/mman.h index defe752..defe752 100644 --- a/include/asm-parisc/mman.h +++ b/arch/parisc/include/asm/mman.h diff --git a/include/asm-parisc/mmu.h b/arch/parisc/include/asm/mmu.h index 6a310cf..6a310cf 100644 --- a/include/asm-parisc/mmu.h +++ b/arch/parisc/include/asm/mmu.h diff --git a/include/asm-parisc/mmu_context.h b/arch/parisc/include/asm/mmu_context.h index 85856c7..85856c7 100644 --- a/include/asm-parisc/mmu_context.h +++ b/arch/parisc/include/asm/mmu_context.h diff --git a/include/asm-parisc/mmzone.h b/arch/parisc/include/asm/mmzone.h index 9608d2c..9608d2c 100644 --- a/include/asm-parisc/mmzone.h +++ b/arch/parisc/include/asm/mmzone.h diff --git a/include/asm-parisc/module.h b/arch/parisc/include/asm/module.h index c2cb49e..c2cb49e 100644 --- a/include/asm-parisc/module.h +++ b/arch/parisc/include/asm/module.h diff --git a/include/asm-parisc/msgbuf.h b/arch/parisc/include/asm/msgbuf.h index fe88f26..fe88f26 100644 --- a/include/asm-parisc/msgbuf.h +++ b/arch/parisc/include/asm/msgbuf.h diff --git a/include/asm-parisc/mutex.h b/arch/parisc/include/asm/mutex.h index 458c1f7..458c1f7 100644 --- a/include/asm-parisc/mutex.h +++ b/arch/parisc/include/asm/mutex.h diff --git a/include/asm-parisc/page.h b/arch/parisc/include/asm/page.h index c3941f0..c3941f0 100644 --- a/include/asm-parisc/page.h +++ b/arch/parisc/include/asm/page.h diff --git a/include/asm-parisc/param.h b/arch/parisc/include/asm/param.h index 32e03d8..32e03d8 100644 --- a/include/asm-parisc/param.h +++ b/arch/parisc/include/asm/param.h diff --git a/include/asm-parisc/parisc-device.h b/arch/parisc/include/asm/parisc-device.h index 7aa13f2..7aa13f2 100644 --- a/include/asm-parisc/parisc-device.h +++ b/arch/parisc/include/asm/parisc-device.h diff --git a/include/asm-parisc/parport.h b/arch/parisc/include/asm/parport.h index 00d9cc3..00d9cc3 100644 --- a/include/asm-parisc/parport.h +++ b/arch/parisc/include/asm/parport.h diff --git a/include/asm-parisc/pci.h b/arch/parisc/include/asm/pci.h index 4ba868f..4ba868f 100644 --- a/include/asm-parisc/pci.h +++ b/arch/parisc/include/asm/pci.h diff --git a/include/asm-parisc/pdc.h b/arch/parisc/include/asm/pdc.h index 9eaa794..c584b00 100644 --- a/include/asm-parisc/pdc.h +++ b/arch/parisc/include/asm/pdc.h @@ -332,6 +332,9 @@ #define BOOT_CONSOLE_SPA_OFFSET 0x3c4 #define BOOT_CONSOLE_PATH_OFFSET 0x3a8 +/* size of the pdc_result buffer for firmware.c */ +#define NUM_PDC_RESULT 32 + #if !defined(__ASSEMBLY__) #ifdef __KERNEL__ @@ -600,6 +603,7 @@ int pdc_chassis_info(struct pdc_chassis_info *chassis_info, void *led_info, unsi int pdc_chassis_disp(unsigned long disp); int pdc_chassis_warn(unsigned long *warn); int pdc_coproc_cfg(struct pdc_coproc_cfg *pdc_coproc_info); +int pdc_coproc_cfg_unlocked(struct pdc_coproc_cfg *pdc_coproc_info); int pdc_iodc_read(unsigned long *actcnt, unsigned long hpa, unsigned int index, void *iodc_data, unsigned int iodc_data_size); int pdc_system_map_find_mods(struct pdc_system_map_mod_info *pdc_mod_info, @@ -638,6 +642,7 @@ int pdc_mem_mem_table(struct pdc_memory_table_raddr *r_addr, #endif void set_firmware_width(void); +void set_firmware_width_unlocked(void); int pdc_do_firm_test_reset(unsigned long ftc_bitmap); int pdc_do_reset(void); int pdc_soft_power_info(unsigned long *power_reg); diff --git a/include/asm-parisc/pdc_chassis.h b/arch/parisc/include/asm/pdc_chassis.h index a609273..a609273 100644 --- a/include/asm-parisc/pdc_chassis.h +++ b/arch/parisc/include/asm/pdc_chassis.h diff --git a/include/asm-parisc/pdcpat.h b/arch/parisc/include/asm/pdcpat.h index 47539f1..47539f1 100644 --- a/include/asm-parisc/pdcpat.h +++ b/arch/parisc/include/asm/pdcpat.h diff --git a/include/asm-parisc/percpu.h b/arch/parisc/include/asm/percpu.h index a0dcd19..a0dcd19 100644 --- a/include/asm-parisc/percpu.h +++ b/arch/parisc/include/asm/percpu.h diff --git a/include/asm-parisc/perf.h b/arch/parisc/include/asm/perf.h index a18e119..a18e119 100644 --- a/include/asm-parisc/perf.h +++ b/arch/parisc/include/asm/perf.h diff --git a/include/asm-parisc/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index fc987a1..fc987a1 100644 --- a/include/asm-parisc/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h diff --git a/include/asm-parisc/pgtable.h b/arch/parisc/include/asm/pgtable.h index 470a4b8..470a4b8 100644 --- a/include/asm-parisc/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h diff --git a/include/asm-parisc/poll.h b/arch/parisc/include/asm/poll.h index c98509d..c98509d 100644 --- a/include/asm-parisc/poll.h +++ b/arch/parisc/include/asm/poll.h diff --git a/include/asm-parisc/posix_types.h b/arch/parisc/include/asm/posix_types.h index bb725a6..bb725a6 100644 --- a/include/asm-parisc/posix_types.h +++ b/arch/parisc/include/asm/posix_types.h diff --git a/include/asm-parisc/prefetch.h b/arch/parisc/include/asm/prefetch.h index c5edc60..c5edc60 100644 --- a/include/asm-parisc/prefetch.h +++ b/arch/parisc/include/asm/prefetch.h diff --git a/include/asm-parisc/processor.h b/arch/parisc/include/asm/processor.h index 3c9d348..3c9d348 100644 --- a/include/asm-parisc/processor.h +++ b/arch/parisc/include/asm/processor.h diff --git a/include/asm-parisc/psw.h b/arch/parisc/include/asm/psw.h index 5a3e23c..5a3e23c 100644 --- a/include/asm-parisc/psw.h +++ b/arch/parisc/include/asm/psw.h diff --git a/include/asm-parisc/ptrace.h b/arch/parisc/include/asm/ptrace.h index 3e94c5d..afa5333 100644 --- a/include/asm-parisc/ptrace.h +++ b/arch/parisc/include/asm/ptrace.h @@ -47,6 +47,16 @@ struct pt_regs { #define task_regs(task) ((struct pt_regs *) ((char *)(task) + TASK_REGS)) +#define __ARCH_WANT_COMPAT_SYS_PTRACE + +struct task_struct; +#define arch_has_single_step() 1 +void user_disable_single_step(struct task_struct *task); +void user_enable_single_step(struct task_struct *task); + +#define arch_has_block_step() 1 +void user_enable_block_step(struct task_struct *task); + /* XXX should we use iaoq[1] or iaoq[0] ? */ #define user_mode(regs) (((regs)->iaoq[0] & 3) ? 1 : 0) #define user_space(regs) (((regs)->iasq[1] != 0) ? 1 : 0) diff --git a/include/asm-parisc/real.h b/arch/parisc/include/asm/real.h index 82acb25..82acb25 100644 --- a/include/asm-parisc/real.h +++ b/arch/parisc/include/asm/real.h diff --git a/include/asm-parisc/resource.h b/arch/parisc/include/asm/resource.h index 8b06343..8b06343 100644 --- a/include/asm-parisc/resource.h +++ b/arch/parisc/include/asm/resource.h diff --git a/include/asm-parisc/ropes.h b/arch/parisc/include/asm/ropes.h index 007a8806..09f51d5 100644 --- a/include/asm-parisc/ropes.h +++ b/arch/parisc/include/asm/ropes.h @@ -1,7 +1,7 @@ #ifndef _ASM_PARISC_ROPES_H_ #define _ASM_PARISC_ROPES_H_ -#include <asm-parisc/parisc-device.h> +#include <asm/parisc-device.h> #ifdef CONFIG_64BIT /* "low end" PA8800 machines use ZX1 chipset: PAT PDC and only run 64-bit */ diff --git a/include/asm-parisc/rt_sigframe.h b/arch/parisc/include/asm/rt_sigframe.h index f0dd3b3..f0dd3b3 100644 --- a/include/asm-parisc/rt_sigframe.h +++ b/arch/parisc/include/asm/rt_sigframe.h diff --git a/include/asm-parisc/rtc.h b/arch/parisc/include/asm/rtc.h index 099d641..099d641 100644 --- a/include/asm-parisc/rtc.h +++ b/arch/parisc/include/asm/rtc.h diff --git a/include/asm-parisc/runway.h b/arch/parisc/include/asm/runway.h index 5bea02d..5bea02d 100644 --- a/include/asm-parisc/runway.h +++ b/arch/parisc/include/asm/runway.h diff --git a/include/asm-parisc/scatterlist.h b/arch/parisc/include/asm/scatterlist.h index 62269b3..62269b3 100644 --- a/include/asm-parisc/scatterlist.h +++ b/arch/parisc/include/asm/scatterlist.h diff --git a/include/asm-parisc/sections.h b/arch/parisc/include/asm/sections.h index 9d13c35..9d13c35 100644 --- a/include/asm-parisc/sections.h +++ b/arch/parisc/include/asm/sections.h diff --git a/include/asm-parisc/segment.h b/arch/parisc/include/asm/segment.h index 26794dd..26794dd 100644 --- a/include/asm-parisc/segment.h +++ b/arch/parisc/include/asm/segment.h diff --git a/include/asm-parisc/sembuf.h b/arch/parisc/include/asm/sembuf.h index 1e59ffd..1e59ffd 100644 --- a/include/asm-parisc/sembuf.h +++ b/arch/parisc/include/asm/sembuf.h diff --git a/include/asm-parisc/serial.h b/arch/parisc/include/asm/serial.h index d7e3cc6..d7e3cc6 100644 --- a/include/asm-parisc/serial.h +++ b/arch/parisc/include/asm/serial.h diff --git a/include/asm-parisc/setup.h b/arch/parisc/include/asm/setup.h index 7da2e5b..7da2e5b 100644 --- a/include/asm-parisc/setup.h +++ b/arch/parisc/include/asm/setup.h diff --git a/include/asm-parisc/shmbuf.h b/arch/parisc/include/asm/shmbuf.h index 0a3eada..0a3eada 100644 --- a/include/asm-parisc/shmbuf.h +++ b/arch/parisc/include/asm/shmbuf.h diff --git a/include/asm-parisc/shmparam.h b/arch/parisc/include/asm/shmparam.h index 628ddc2..628ddc2 100644 --- a/include/asm-parisc/shmparam.h +++ b/arch/parisc/include/asm/shmparam.h diff --git a/include/asm-parisc/sigcontext.h b/arch/parisc/include/asm/sigcontext.h index 27ef31b..27ef31b 100644 --- a/include/asm-parisc/sigcontext.h +++ b/arch/parisc/include/asm/sigcontext.h diff --git a/include/asm-parisc/siginfo.h b/arch/parisc/include/asm/siginfo.h index d703472..d703472 100644 --- a/include/asm-parisc/siginfo.h +++ b/arch/parisc/include/asm/siginfo.h diff --git a/include/asm-parisc/signal.h b/arch/parisc/include/asm/signal.h index c203563..c203563 100644 --- a/include/asm-parisc/signal.h +++ b/arch/parisc/include/asm/signal.h diff --git a/include/asm-parisc/smp.h b/arch/parisc/include/asm/smp.h index 398cdba..398cdba 100644 --- a/include/asm-parisc/smp.h +++ b/arch/parisc/include/asm/smp.h diff --git a/include/asm-parisc/socket.h b/arch/parisc/include/asm/socket.h index fba402c..fba402c 100644 --- a/include/asm-parisc/socket.h +++ b/arch/parisc/include/asm/socket.h diff --git a/include/asm-parisc/sockios.h b/arch/parisc/include/asm/sockios.h index dabfbc7..dabfbc7 100644 --- a/include/asm-parisc/sockios.h +++ b/arch/parisc/include/asm/sockios.h diff --git a/include/asm-parisc/spinlock.h b/arch/parisc/include/asm/spinlock.h index f3d2090..f3d2090 100644 --- a/include/asm-parisc/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h diff --git a/include/asm-parisc/spinlock_types.h b/arch/parisc/include/asm/spinlock_types.h index 3f72f47..3f72f47 100644 --- a/include/asm-parisc/spinlock_types.h +++ b/arch/parisc/include/asm/spinlock_types.h diff --git a/include/asm-parisc/stat.h b/arch/parisc/include/asm/stat.h index 9d5fbbc..9d5fbbc 100644 --- a/include/asm-parisc/stat.h +++ b/arch/parisc/include/asm/stat.h diff --git a/include/asm-parisc/statfs.h b/arch/parisc/include/asm/statfs.h index 324bea9..324bea9 100644 --- a/include/asm-parisc/statfs.h +++ b/arch/parisc/include/asm/statfs.h diff --git a/include/asm-parisc/string.h b/arch/parisc/include/asm/string.h index eda01be..eda01be 100644 --- a/include/asm-parisc/string.h +++ b/arch/parisc/include/asm/string.h diff --git a/include/asm-parisc/superio.h b/arch/parisc/include/asm/superio.h index 6598acb..6598acb 100644 --- a/include/asm-parisc/superio.h +++ b/arch/parisc/include/asm/superio.h diff --git a/include/asm-parisc/system.h b/arch/parisc/include/asm/system.h index ee80c92..ee80c92 100644 --- a/include/asm-parisc/system.h +++ b/arch/parisc/include/asm/system.h diff --git a/include/asm-parisc/termbits.h b/arch/parisc/include/asm/termbits.h index d8bbc73..d8bbc73 100644 --- a/include/asm-parisc/termbits.h +++ b/arch/parisc/include/asm/termbits.h diff --git a/include/asm-parisc/termios.h b/arch/parisc/include/asm/termios.h index a2a57a4..a2a57a4 100644 --- a/include/asm-parisc/termios.h +++ b/arch/parisc/include/asm/termios.h diff --git a/include/asm-parisc/thread_info.h b/arch/parisc/include/asm/thread_info.h index 0407959..0407959 100644 --- a/include/asm-parisc/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h diff --git a/include/asm-parisc/timex.h b/arch/parisc/include/asm/timex.h index 3b68d77..3b68d77 100644 --- a/include/asm-parisc/timex.h +++ b/arch/parisc/include/asm/timex.h diff --git a/include/asm-parisc/tlb.h b/arch/parisc/include/asm/tlb.h index 383b1db..383b1db 100644 --- a/include/asm-parisc/tlb.h +++ b/arch/parisc/include/asm/tlb.h diff --git a/include/asm-parisc/tlbflush.h b/arch/parisc/include/asm/tlbflush.h index b72ec66..b72ec66 100644 --- a/include/asm-parisc/tlbflush.h +++ b/arch/parisc/include/asm/tlbflush.h diff --git a/include/asm-parisc/topology.h b/arch/parisc/include/asm/topology.h index d8133eb..d8133eb 100644 --- a/include/asm-parisc/topology.h +++ b/arch/parisc/include/asm/topology.h diff --git a/include/asm-parisc/traps.h b/arch/parisc/include/asm/traps.h index 1945f99..1945f99 100644 --- a/include/asm-parisc/traps.h +++ b/arch/parisc/include/asm/traps.h diff --git a/include/asm-parisc/types.h b/arch/parisc/include/asm/types.h index 7f5a39b..7f5a39b 100644 --- a/include/asm-parisc/types.h +++ b/arch/parisc/include/asm/types.h diff --git a/include/asm-parisc/uaccess.h b/arch/parisc/include/asm/uaccess.h index 4878b95..4878b95 100644 --- a/include/asm-parisc/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h diff --git a/include/asm-parisc/ucontext.h b/arch/parisc/include/asm/ucontext.h index 6c8883e..6c8883e 100644 --- a/include/asm-parisc/ucontext.h +++ b/arch/parisc/include/asm/ucontext.h diff --git a/include/asm-parisc/unaligned.h b/arch/parisc/include/asm/unaligned.h index dfc5d33..dfc5d33 100644 --- a/include/asm-parisc/unaligned.h +++ b/arch/parisc/include/asm/unaligned.h diff --git a/include/asm-parisc/unistd.h b/arch/parisc/include/asm/unistd.h index a7d857f..ef26b00 100644 --- a/include/asm-parisc/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -801,8 +801,14 @@ #define __NR_timerfd_create (__NR_Linux + 306) #define __NR_timerfd_settime (__NR_Linux + 307) #define __NR_timerfd_gettime (__NR_Linux + 308) - -#define __NR_Linux_syscalls (__NR_timerfd_gettime + 1) +#define __NR_signalfd4 (__NR_Linux + 309) +#define __NR_eventfd2 (__NR_Linux + 310) +#define __NR_epoll_create1 (__NR_Linux + 311) +#define __NR_dup3 (__NR_Linux + 312) +#define __NR_pipe2 (__NR_Linux + 313) +#define __NR_inotify_init1 (__NR_Linux + 314) + +#define __NR_Linux_syscalls (__NR_inotify_init1 + 1) #define __IGNORE_select /* newselect */ diff --git a/include/asm-parisc/unwind.h b/arch/parisc/include/asm/unwind.h index 2f7e6e5..52482e4 100644 --- a/include/asm-parisc/unwind.h +++ b/arch/parisc/include/asm/unwind.h @@ -74,4 +74,6 @@ void unwind_frame_init_running(struct unwind_frame_info *info, struct pt_regs *r int unwind_once(struct unwind_frame_info *info); int unwind_to_user(struct unwind_frame_info *info); +int unwind_init(void); + #endif diff --git a/include/asm-parisc/user.h b/arch/parisc/include/asm/user.h index 8022475..8022475 100644 --- a/include/asm-parisc/user.h +++ b/arch/parisc/include/asm/user.h diff --git a/include/asm-parisc/vga.h b/arch/parisc/include/asm/vga.h index 171399a..171399a 100644 --- a/include/asm-parisc/vga.h +++ b/arch/parisc/include/asm/vga.h diff --git a/include/asm-parisc/xor.h b/arch/parisc/include/asm/xor.h index c82eb12..c82eb12 100644 --- a/include/asm-parisc/xor.h +++ b/arch/parisc/include/asm/xor.h diff --git a/arch/parisc/kernel/.gitignore b/arch/parisc/kernel/.gitignore new file mode 100644 index 0000000..c5f676c --- /dev/null +++ b/arch/parisc/kernel/.gitignore @@ -0,0 +1 @@ +vmlinux.lds diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c index 3efc0b7..699cf8e 100644 --- a/arch/parisc/kernel/asm-offsets.c +++ b/arch/parisc/kernel/asm-offsets.c @@ -290,5 +290,8 @@ int main(void) DEFINE(EXCDATA_IP, offsetof(struct exception_data, fault_ip)); DEFINE(EXCDATA_SPACE, offsetof(struct exception_data, fault_space)); DEFINE(EXCDATA_ADDR, offsetof(struct exception_data, fault_addr)); + BLANK(); + DEFINE(ASM_PDC_RESULT_SIZE, NUM_PDC_RESULT * sizeof(unsigned long)); + BLANK(); return 0; } diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c index 7177a6c..03f26bd 100644 --- a/arch/parisc/kernel/firmware.c +++ b/arch/parisc/kernel/firmware.c @@ -71,8 +71,8 @@ #include <asm/processor.h> /* for boot_cpu_data */ static DEFINE_SPINLOCK(pdc_lock); -static unsigned long pdc_result[32] __attribute__ ((aligned (8))); -static unsigned long pdc_result2[32] __attribute__ ((aligned (8))); +extern unsigned long pdc_result[NUM_PDC_RESULT]; +extern unsigned long pdc_result2[NUM_PDC_RESULT]; #ifdef CONFIG_64BIT #define WIDE_FIRMWARE 0x1 @@ -150,26 +150,40 @@ static void convert_to_wide(unsigned long *addr) #endif } +#ifdef CONFIG_64BIT +void __init set_firmware_width_unlocked(void) +{ + int ret; + + ret = mem_pdc_call(PDC_MODEL, PDC_MODEL_CAPABILITIES, + __pa(pdc_result), 0); + convert_to_wide(pdc_result); + if (pdc_result[0] != NARROW_FIRMWARE) + parisc_narrow_firmware = 0; +} + /** * set_firmware_width - Determine if the firmware is wide or narrow. * - * This function must be called before any pdc_* function that uses the convert_to_wide - * function. + * This function must be called before any pdc_* function that uses the + * convert_to_wide function. */ void __init set_firmware_width(void) { -#ifdef CONFIG_64BIT - int retval; unsigned long flags; + spin_lock_irqsave(&pdc_lock, flags); + set_firmware_width_unlocked(); + spin_unlock_irqrestore(&pdc_lock, flags); +} +#else +void __init set_firmware_width_unlocked(void) { + return; +} - spin_lock_irqsave(&pdc_lock, flags); - retval = mem_pdc_call(PDC_MODEL, PDC_MODEL_CAPABILITIES, __pa(pdc_result), 0); - convert_to_wide(pdc_result); - if(pdc_result[0] != NARROW_FIRMWARE) - parisc_narrow_firmware = 0; - spin_unlock_irqrestore(&pdc_lock, flags); -#endif +void __init set_firmware_width(void) { + return; } +#endif /*CONFIG_64BIT*/ /** * pdc_emergency_unlock - Unlock the linux pdc lock @@ -288,6 +302,20 @@ int pdc_chassis_warn(unsigned long *warn) return retval; } +int __init pdc_coproc_cfg_unlocked(struct pdc_coproc_cfg *pdc_coproc_info) +{ + int ret; + + ret = mem_pdc_call(PDC_COPROC, PDC_COPROC_CFG, __pa(pdc_result)); + convert_to_wide(pdc_result); + pdc_coproc_info->ccr_functional = pdc_result[0]; + pdc_coproc_info->ccr_present = pdc_result[1]; + pdc_coproc_info->revision = pdc_result[17]; + pdc_coproc_info->model = pdc_result[18]; + + return ret; +} + /** * pdc_coproc_cfg - To identify coprocessors attached to the processor. * @pdc_coproc_info: Return buffer address. @@ -297,19 +325,14 @@ int pdc_chassis_warn(unsigned long *warn) */ int __init pdc_coproc_cfg(struct pdc_coproc_cfg *pdc_coproc_info) { - int retval; + int ret; unsigned long flags; - spin_lock_irqsave(&pdc_lock, flags); - retval = mem_pdc_call(PDC_COPROC, PDC_COPROC_CFG, __pa(pdc_result)); - convert_to_wide(pdc_result); - pdc_coproc_info->ccr_functional = pdc_result[0]; - pdc_coproc_info->ccr_present = pdc_result[1]; - pdc_coproc_info->revision = pdc_result[17]; - pdc_coproc_info->model = pdc_result[18]; - spin_unlock_irqrestore(&pdc_lock, flags); + spin_lock_irqsave(&pdc_lock, flags); + ret = pdc_coproc_cfg_unlocked(pdc_coproc_info); + spin_unlock_irqrestore(&pdc_lock, flags); - return retval; + return ret; } /** diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S index a84e31e..0e3d9f9 100644 --- a/arch/parisc/kernel/head.S +++ b/arch/parisc/kernel/head.S @@ -121,7 +121,7 @@ $pgt_fill_loop: copy %r0,%r2 /* And the RFI Target address too */ - load32 start_kernel,%r11 + load32 start_parisc,%r11 /* And the initial task pointer */ load32 init_thread_union,%r6 diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index 49c6379..90904f9 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -4,6 +4,7 @@ * Copyright (C) 2000 Hewlett-Packard Co, Linuxcare Inc. * Copyright (C) 2000 Matthew Wilcox <matthew@wil.cx> * Copyright (C) 2000 David Huggins-Daines <dhd@debian.org> + * Copyright (C) 2008 Helge Deller <deller@gmx.de> */ #include <linux/kernel.h> @@ -27,15 +28,149 @@ /* PSW bits we allow the debugger to modify */ #define USER_PSW_BITS (PSW_N | PSW_V | PSW_CB) -#undef DEBUG_PTRACE +/* + * Called by kernel/ptrace.c when detaching.. + * + * Make sure single step bits etc are not set. + */ +void ptrace_disable(struct task_struct *task) +{ + task->ptrace &= ~(PT_SINGLESTEP|PT_BLOCKSTEP); -#ifdef DEBUG_PTRACE -#define DBG(x...) printk(x) -#else -#define DBG(x...) -#endif + /* make sure the trap bits are not set */ + pa_psw(task)->r = 0; + pa_psw(task)->t = 0; + pa_psw(task)->h = 0; + pa_psw(task)->l = 0; +} + +/* + * The following functions are called by ptrace_resume() when + * enabling or disabling single/block tracing. + */ +void user_disable_single_step(struct task_struct *task) +{ + ptrace_disable(task); +} + +void user_enable_single_step(struct task_struct *task) +{ + task->ptrace &= ~PT_BLOCKSTEP; + task->ptrace |= PT_SINGLESTEP; + + if (pa_psw(task)->n) { + struct siginfo si; + + /* Nullified, just crank over the queue. */ + task_regs(task)->iaoq[0] = task_regs(task)->iaoq[1]; + task_regs(task)->iasq[0] = task_regs(task)->iasq[1]; + task_regs(task)->iaoq[1] = task_regs(task)->iaoq[0] + 4; + pa_psw(task)->n = 0; + pa_psw(task)->x = 0; + pa_psw(task)->y = 0; + pa_psw(task)->z = 0; + pa_psw(task)->b = 0; + ptrace_disable(task); + /* Don't wake up the task, but let the + parent know something happened. */ + si.si_code = TRAP_TRACE; + si.si_addr = (void __user *) (task_regs(task)->iaoq[0] & ~3); + si.si_signo = SIGTRAP; + si.si_errno = 0; + force_sig_info(SIGTRAP, &si, task); + /* notify_parent(task, SIGCHLD); */ + return; + } + + /* Enable recovery counter traps. The recovery counter + * itself will be set to zero on a task switch. If the + * task is suspended on a syscall then the syscall return + * path will overwrite the recovery counter with a suitable + * value such that it traps once back in user space. We + * disable interrupts in the tasks PSW here also, to avoid + * interrupts while the recovery counter is decrementing. + */ + pa_psw(task)->r = 1; + pa_psw(task)->t = 0; + pa_psw(task)->h = 0; + pa_psw(task)->l = 0; +} + +void user_enable_block_step(struct task_struct *task) +{ + task->ptrace &= ~PT_SINGLESTEP; + task->ptrace |= PT_BLOCKSTEP; + + /* Enable taken branch trap. */ + pa_psw(task)->r = 0; + pa_psw(task)->t = 1; + pa_psw(task)->h = 0; + pa_psw(task)->l = 0; +} + +long arch_ptrace(struct task_struct *child, long request, long addr, long data) +{ + unsigned long tmp; + long ret = -EIO; -#ifdef CONFIG_64BIT + switch (request) { + + /* Read the word at location addr in the USER area. For ptraced + processes, the kernel saves all regs on a syscall. */ + case PTRACE_PEEKUSR: + if ((addr & (sizeof(long)-1)) || + (unsigned long) addr >= sizeof(struct pt_regs)) + break; + tmp = *(unsigned long *) ((char *) task_regs(child) + addr); + ret = put_user(tmp, (unsigned long *) data); + break; + + /* Write the word at location addr in the USER area. This will need + to change when the kernel no longer saves all regs on a syscall. + FIXME. There is a problem at the moment in that r3-r18 are only + saved if the process is ptraced on syscall entry, and even then + those values are overwritten by actual register values on syscall + exit. */ + case PTRACE_POKEUSR: + /* Some register values written here may be ignored in + * entry.S:syscall_restore_rfi; e.g. iaoq is written with + * r31/r31+4, and not with the values in pt_regs. + */ + if (addr == PT_PSW) { + /* Allow writing to Nullify, Divide-step-correction, + * and carry/borrow bits. + * BEWARE, if you set N, and then single step, it won't + * stop on the nullified instruction. + */ + data &= USER_PSW_BITS; + task_regs(child)->gr[0] &= ~USER_PSW_BITS; + task_regs(child)->gr[0] |= data; + ret = 0; + break; + } + + if ((addr & (sizeof(long)-1)) || + (unsigned long) addr >= sizeof(struct pt_regs)) + break; + if ((addr >= PT_GR1 && addr <= PT_GR31) || + addr == PT_IAOQ0 || addr == PT_IAOQ1 || + (addr >= PT_FR0 && addr <= PT_FR31 + 4) || + addr == PT_SAR) { + *(unsigned long *) ((char *) task_regs(child) + addr) = data; + ret = 0; + } + break; + + default: + ret = ptrace_request(child, request, addr, data); + break; + } + + return ret; +} + + +#ifdef CONFIG_COMPAT /* This function is needed to translate 32 bit pt_regs offsets in to * 64 bit pt_regs offsets. For example, a 32 bit gdb under a 64 bit kernel @@ -61,106 +196,25 @@ static long translate_usr_offset(long offset) else return -1; } -#endif -/* - * Called by kernel/ptrace.c when detaching.. - * - * Make sure single step bits etc are not set. - */ -void ptrace_disable(struct task_struct *child) +long compat_arch_ptrace(struct task_struct *child, compat_long_t request, + compat_ulong_t addr, compat_ulong_t data) { - /* make sure the trap bits are not set */ - pa_psw(child)->r = 0; - pa_psw(child)->t = 0; - pa_psw(child)->h = 0; - pa_psw(child)->l = 0; -} - -long arch_ptrace(struct task_struct *child, long request, long addr, long data) -{ - long ret; -#ifdef DEBUG_PTRACE - long oaddr=addr, odata=data; -#endif + compat_uint_t tmp; + long ret = -EIO; switch (request) { - case PTRACE_PEEKTEXT: /* read word at location addr. */ - case PTRACE_PEEKDATA: { -#ifdef CONFIG_64BIT - if (__is_compat_task(child)) { - int copied; - unsigned int tmp; - - addr &= 0xffffffffL; - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); - ret = -EIO; - if (copied != sizeof(tmp)) - goto out_tsk; - ret = put_user(tmp,(unsigned int *) data); - DBG("sys_ptrace(PEEK%s, %d, %lx, %lx) returning %ld, data %x\n", - request == PTRACE_PEEKTEXT ? "TEXT" : "DATA", - pid, oaddr, odata, ret, tmp); - } - else -#endif - ret = generic_ptrace_peekdata(child, addr, data); - goto out_tsk; - } - /* when I and D space are separate, this will have to be fixed. */ - case PTRACE_POKETEXT: /* write the word at location addr. */ - case PTRACE_POKEDATA: - ret = 0; -#ifdef CONFIG_64BIT - if (__is_compat_task(child)) { - unsigned int tmp = (unsigned int)data; - DBG("sys_ptrace(POKE%s, %d, %lx, %lx)\n", - request == PTRACE_POKETEXT ? "TEXT" : "DATA", - pid, oaddr, odata); - addr &= 0xffffffffL; - if (access_process_vm(child, addr, &tmp, sizeof(tmp), 1) == sizeof(tmp)) - goto out_tsk; - } - else -#endif - { - if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data)) - goto out_tsk; - } - ret = -EIO; - goto out_tsk; - - /* Read the word at location addr in the USER area. For ptraced - processes, the kernel saves all regs on a syscall. */ - case PTRACE_PEEKUSR: { - ret = -EIO; -#ifdef CONFIG_64BIT - if (__is_compat_task(child)) { - unsigned int tmp; - - if (addr & (sizeof(int)-1)) - goto out_tsk; - if ((addr = translate_usr_offset(addr)) < 0) - goto out_tsk; - - tmp = *(unsigned int *) ((char *) task_regs(child) + addr); - ret = put_user(tmp, (unsigned int *) data); - DBG("sys_ptrace(PEEKUSR, %d, %lx, %lx) returning %ld, addr %lx, data %x\n", - pid, oaddr, odata, ret, addr, tmp); - } - else -#endif - { - unsigned long tmp; + case PTRACE_PEEKUSR: + if (addr & (sizeof(compat_uint_t)-1)) + break; + addr = translate_usr_offset(addr); + if (addr < 0) + break; - if ((addr & (sizeof(long)-1)) || (unsigned long) addr >= sizeof(struct pt_regs)) - goto out_tsk; - tmp = *(unsigned long *) ((char *) task_regs(child) + addr); - ret = put_user(tmp, (unsigned long *) data); - } - goto out_tsk; - } + tmp = *(compat_uint_t *) ((char *) task_regs(child) + addr); + ret = put_user(tmp, (compat_uint_t *) (unsigned long) data); + break; /* Write the word at location addr in the USER area. This will need to change when the kernel no longer saves all regs on a syscall. @@ -169,185 +223,46 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) those values are overwritten by actual register values on syscall exit. */ case PTRACE_POKEUSR: - ret = -EIO; /* Some register values written here may be ignored in * entry.S:syscall_restore_rfi; e.g. iaoq is written with * r31/r31+4, and not with the values in pt_regs. */ - /* PT_PSW=0, so this is valid for 32 bit processes under 64 - * bit kernels. - */ if (addr == PT_PSW) { - /* PT_PSW=0, so this is valid for 32 bit processes - * under 64 bit kernels. - * - * Allow writing to Nullify, Divide-step-correction, - * and carry/borrow bits. - * BEWARE, if you set N, and then single step, it won't - * stop on the nullified instruction. + /* Since PT_PSW==0, it is valid for 32 bit processes + * under 64 bit kernels as well. */ - DBG("sys_ptrace(POKEUSR, %d, %lx, %lx)\n", - pid, oaddr, odata); - data &= USER_PSW_BITS; - task_regs(child)->gr[0] &= ~USER_PSW_BITS; - task_regs(child)->gr[0] |= data; - ret = 0; - goto out_tsk; - } -#ifdef CONFIG_64BIT - if (__is_compat_task(child)) { - if (addr & (sizeof(int)-1)) - goto out_tsk; - if ((addr = translate_usr_offset(addr)) < 0) - goto out_tsk; - DBG("sys_ptrace(POKEUSR, %d, %lx, %lx) addr %lx\n", - pid, oaddr, odata, addr); + ret = arch_ptrace(child, request, addr, data); + } else { + if (addr & (sizeof(compat_uint_t)-1)) + break; + addr = translate_usr_offset(addr); + if (addr < 0) + break; if (addr >= PT_FR0 && addr <= PT_FR31 + 4) { /* Special case, fp regs are 64 bits anyway */ - *(unsigned int *) ((char *) task_regs(child) + addr) = data; + *(__u64 *) ((char *) task_regs(child) + addr) = data; ret = 0; } else if ((addr >= PT_GR1+4 && addr <= PT_GR31+4) || addr == PT_IAOQ0+4 || addr == PT_IAOQ1+4 || addr == PT_SAR+4) { /* Zero the top 32 bits */ - *(unsigned int *) ((char *) task_regs(child) + addr - 4) = 0; - *(unsigned int *) ((char *) task_regs(child) + addr) = data; + *(__u32 *) ((char *) task_regs(child) + addr - 4) = 0; + *(__u32 *) ((char *) task_regs(child) + addr) = data; ret = 0; } - goto out_tsk; } - else -#endif - { - if ((addr & (sizeof(long)-1)) || (unsigned long) addr >= sizeof(struct pt_regs)) - goto out_tsk; - if ((addr >= PT_GR1 && addr <= PT_GR31) || - addr == PT_IAOQ0 || addr == PT_IAOQ1 || - (addr >= PT_FR0 && addr <= PT_FR31 + 4) || - addr == PT_SAR) { - *(unsigned long *) ((char *) task_regs(child) + addr) = data; - ret = 0; - } - goto out_tsk; - } - - case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ - case PTRACE_CONT: - ret = -EIO; - DBG("sys_ptrace(%s)\n", - request == PTRACE_SYSCALL ? "SYSCALL" : "CONT"); - if (!valid_signal(data)) - goto out_tsk; - child->ptrace &= ~(PT_SINGLESTEP|PT_BLOCKSTEP); - if (request == PTRACE_SYSCALL) { - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - } else { - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - } - child->exit_code = data; - goto out_wake_notrap; - - case PTRACE_KILL: - /* - * make the child exit. Best I can do is send it a - * sigkill. perhaps it should be put in the status - * that it wants to exit. - */ - ret = 0; - DBG("sys_ptrace(KILL)\n"); - if (child->exit_state == EXIT_ZOMBIE) /* already dead */ - goto out_tsk; - child->exit_code = SIGKILL; - goto out_wake_notrap; - - case PTRACE_SINGLEBLOCK: - DBG("sys_ptrace(SINGLEBLOCK)\n"); - ret = -EIO; - if (!valid_signal(data)) - goto out_tsk; - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - child->ptrace &= ~PT_SINGLESTEP; - child->ptrace |= PT_BLOCKSTEP; - child->exit_code = data; - - /* Enable taken branch trap. */ - pa_psw(child)->r = 0; - pa_psw(child)->t = 1; - pa_psw(child)->h = 0; - pa_psw(child)->l = 0; - goto out_wake; - - case PTRACE_SINGLESTEP: - DBG("sys_ptrace(SINGLESTEP)\n"); - ret = -EIO; - if (!valid_signal(data)) - goto out_tsk; - - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - child->ptrace &= ~PT_BLOCKSTEP; - child->ptrace |= PT_SINGLESTEP; - child->exit_code = data; - - if (pa_psw(child)->n) { - struct siginfo si; - - /* Nullified, just crank over the queue. */ - task_regs(child)->iaoq[0] = task_regs(child)->iaoq[1]; - task_regs(child)->iasq[0] = task_regs(child)->iasq[1]; - task_regs(child)->iaoq[1] = task_regs(child)->iaoq[0] + 4; - pa_psw(child)->n = 0; - pa_psw(child)->x = 0; - pa_psw(child)->y = 0; - pa_psw(child)->z = 0; - pa_psw(child)->b = 0; - ptrace_disable(child); - /* Don't wake up the child, but let the - parent know something happened. */ - si.si_code = TRAP_TRACE; - si.si_addr = (void __user *) (task_regs(child)->iaoq[0] & ~3); - si.si_signo = SIGTRAP; - si.si_errno = 0; - force_sig_info(SIGTRAP, &si, child); - //notify_parent(child, SIGCHLD); - //ret = 0; - goto out_wake; - } - - /* Enable recovery counter traps. The recovery counter - * itself will be set to zero on a task switch. If the - * task is suspended on a syscall then the syscall return - * path will overwrite the recovery counter with a suitable - * value such that it traps once back in user space. We - * disable interrupts in the childs PSW here also, to avoid - * interrupts while the recovery counter is decrementing. - */ - pa_psw(child)->r = 1; - pa_psw(child)->t = 0; - pa_psw(child)->h = 0; - pa_psw(child)->l = 0; - /* give it a chance to run. */ - goto out_wake; - - case PTRACE_GETEVENTMSG: - ret = put_user(child->ptrace_message, (unsigned int __user *) data); - goto out_tsk; + break; default: - ret = ptrace_request(child, request, addr, data); - goto out_tsk; + ret = compat_ptrace_request(child, request, addr, data); + break; } -out_wake_notrap: - ptrace_disable(child); -out_wake: - wake_up_process(child); - ret = 0; -out_tsk: - DBG("arch_ptrace(%ld, %d, %lx, %lx) returning %ld\n", - request, pid, oaddr, odata, ret); return ret; } +#endif + void syscall_trace(void) { diff --git a/arch/parisc/kernel/real2.S b/arch/parisc/kernel/real2.S index 7a92695..5f3d3a1 100644 --- a/arch/parisc/kernel/real2.S +++ b/arch/parisc/kernel/real2.S @@ -8,12 +8,24 @@ * */ +#include <asm/pdc.h> #include <asm/psw.h> #include <asm/assembly.h> +#include <asm/asm-offsets.h> #include <linux/linkage.h> + .section .bss + + .export pdc_result + .export pdc_result2 + .align 8 +pdc_result: + .block ASM_PDC_RESULT_SIZE +pdc_result2: + .block ASM_PDC_RESULT_SIZE + .export real_stack .export real32_stack .export real64_stack diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index 39e7c5a..7d27853 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -44,6 +44,7 @@ #include <asm/pdc_chassis.h> #include <asm/io.h> #include <asm/setup.h> +#include <asm/unwind.h> static char __initdata command_line[COMMAND_LINE_SIZE]; @@ -123,6 +124,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_64BIT extern int parisc_narrow_firmware; #endif + unwind_init(); init_per_cpu(smp_processor_id()); /* Set Modes & Enable FP */ @@ -368,6 +370,31 @@ static int __init parisc_init(void) return 0; } - arch_initcall(parisc_init); +void start_parisc(void) +{ + extern void start_kernel(void); + + int ret, cpunum; + struct pdc_coproc_cfg coproc_cfg; + + cpunum = smp_processor_id(); + + set_firmware_width_unlocked(); + + ret = pdc_coproc_cfg_unlocked(&coproc_cfg); + if (ret >= 0 && coproc_cfg.ccr_functional) { + mtctl(coproc_cfg.ccr_functional, 10); + + cpu_data[cpunum].fp_rev = coproc_cfg.revision; + cpu_data[cpunum].fp_model = coproc_cfg.model; + + asm volatile ("fstd %fr0,8(%sp)"); + } else { + panic("must have an fpu to boot linux"); + } + + start_kernel(); + // not reached +} diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S index c7e59f5..303d2b6 100644 --- a/arch/parisc/kernel/syscall_table.S +++ b/arch/parisc/kernel/syscall_table.S @@ -87,7 +87,7 @@ ENTRY_SAME(setuid) ENTRY_SAME(getuid) ENTRY_COMP(stime) /* 25 */ - ENTRY_SAME(ptrace) + ENTRY_COMP(ptrace) ENTRY_SAME(alarm) /* see stat comment */ ENTRY_COMP(newfstat) @@ -407,6 +407,12 @@ ENTRY_SAME(timerfd_create) ENTRY_COMP(timerfd_settime) ENTRY_COMP(timerfd_gettime) + ENTRY_COMP(signalfd4) + ENTRY_SAME(eventfd2) /* 310 */ + ENTRY_SAME(epoll_create1) + ENTRY_SAME(dup3) + ENTRY_SAME(pipe2) + ENTRY_SAME(inotify_init1) /* Nothing yet */ diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 24be86b..4d09203 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -23,6 +23,7 @@ #include <linux/smp.h> #include <linux/profile.h> #include <linux/clocksource.h> +#include <linux/platform_device.h> #include <asm/uaccess.h> #include <asm/io.h> @@ -215,6 +216,24 @@ void __init start_cpu_itimer(void) cpu_data[cpu].it_value = next_tick; } +struct platform_device rtc_parisc_dev = { + .name = "rtc-parisc", + .id = -1, +}; + +static int __init rtc_init(void) +{ + int ret; + + ret = platform_device_register(&rtc_parisc_dev); + if (ret < 0) + printk(KERN_ERR "unable to register rtc device...\n"); + + /* not necessarily an error */ + return 0; +} +module_init(rtc_init); + void __init time_init(void) { static struct pdc_tod tod_data; @@ -245,4 +264,3 @@ void __init time_init(void) xtime.tv_nsec = 0; } } - diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index 701b2d2..6773c58 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -170,7 +170,7 @@ void unwind_table_remove(struct unwind_table *table) } /* Called from setup_arch to import the kernel unwind info */ -static int unwind_init(void) +int unwind_init(void) { long start, stop; register unsigned long gp __asm__ ("r27"); @@ -417,5 +417,3 @@ int unwind_to_user(struct unwind_frame_info *info) return ret; } - -module_init(unwind_init); diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 64e1445..5ac51e6 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -10,9 +10,13 @@ * 2 of the License, or (at your option) any later version. */ +#ifndef __ASSEMBLY__ +#include <linux/types.h> +#else +#include <asm/types.h> +#endif #include <asm/asm-compat.h> #include <asm/kdump.h> -#include <asm/types.h> /* * On PPC32 page size is 4K. For PPC64 we support either 4K or 64K software diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index ae2ea80..9047af7 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -74,6 +74,13 @@ struct pci_controller { unsigned long pci_io_size; #endif + /* Some machines have a special region to forward the ISA + * "memory" cycles such as VGA memory regions. Left to 0 + * if unsupported + */ + resource_size_t isa_mem_phys; + resource_size_t isa_mem_size; + struct pci_ops *ops; unsigned int __iomem *cfg_addr; void __iomem *cfg_data; diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 0e52c78..39d547f 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -123,6 +123,16 @@ int pci_mmap_page_range(struct pci_dev *pdev, struct vm_area_struct *vma, /* Tell drivers/pci/proc.c that we have pci_mmap_page_range() */ #define HAVE_PCI_MMAP 1 +extern int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val, + size_t count); +extern int pci_legacy_write(struct pci_bus *bus, loff_t port, u32 val, + size_t count); +extern int pci_mmap_legacy_page_range(struct pci_bus *bus, + struct vm_area_struct *vma, + enum pci_mmap_state mmap_state); + +#define HAVE_PCI_LEGACY 1 + #if defined(CONFIG_PPC64) || defined(CONFIG_NOT_COHERENT_CACHE) /* * For 64-bit kernels, pci_unmap_{single,page} is not a nop. @@ -226,5 +236,6 @@ extern void pci_resource_to_user(const struct pci_dev *dev, int bar, extern void pcibios_do_bus_setup(struct pci_bus *bus); extern void pcibios_fixup_of_probed_bus(struct pci_bus *bus); + #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_PCI_H */ diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 734e075..280a90c 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -129,7 +129,7 @@ extern int ptrace_put_reg(struct task_struct *task, int regno, #define CHECK_FULL_REGS(regs) \ do { \ if ((regs)->trap & 1) \ - printk(KERN_CRIT "%s: partial register set\n", __FUNCTION__); \ + printk(KERN_CRIT "%s: partial register set\n", __func__); \ } while (0) #endif /* __powerpc64__ */ diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 01ce8c3..3815d84 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -451,7 +451,8 @@ pgprot_t pci_phys_mem_access_prot(struct file *file, pci_dev_put(pdev); } - DBG("non-PCI map for %lx, prot: %lx\n", offset, prot); + DBG("non-PCI map for %llx, prot: %lx\n", + (unsigned long long)offset, prot); return __pgprot(prot); } @@ -490,6 +491,131 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, return ret; } +/* This provides legacy IO read access on a bus */ +int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val, size_t size) +{ + unsigned long offset; + struct pci_controller *hose = pci_bus_to_host(bus); + struct resource *rp = &hose->io_resource; + void __iomem *addr; + + /* Check if port can be supported by that bus. We only check + * the ranges of the PHB though, not the bus itself as the rules + * for forwarding legacy cycles down bridges are not our problem + * here. So if the host bridge supports it, we do it. + */ + offset = (unsigned long)hose->io_base_virt - _IO_BASE; + offset += port; + + if (!(rp->flags & IORESOURCE_IO)) + return -ENXIO; + if (offset < rp->start || (offset + size) > rp->end) + return -ENXIO; + addr = hose->io_base_virt + port; + + switch(size) { + case 1: + *((u8 *)val) = in_8(addr); + return 1; + case 2: + if (port & 1) + return -EINVAL; + *((u16 *)val) = in_le16(addr); + return 2; + case 4: + if (port & 3) + return -EINVAL; + *((u32 *)val) = in_le32(addr); + return 4; + } + return -EINVAL; +} + +/* This provides legacy IO write access on a bus */ +int pci_legacy_write(struct pci_bus *bus, loff_t port, u32 val, size_t size) +{ + unsigned long offset; + struct pci_controller *hose = pci_bus_to_host(bus); + struct resource *rp = &hose->io_resource; + void __iomem *addr; + + /* Check if port can be supported by that bus. We only check + * the ranges of the PHB though, not the bus itself as the rules + * for forwarding legacy cycles down bridges are not our problem + * here. So if the host bridge supports it, we do it. + */ + offset = (unsigned long)hose->io_base_virt - _IO_BASE; + offset += port; + + if (!(rp->flags & IORESOURCE_IO)) + return -ENXIO; + if (offset < rp->start || (offset + size) > rp->end) + return -ENXIO; + addr = hose->io_base_virt + port; + + /* WARNING: The generic code is idiotic. It gets passed a pointer + * to what can be a 1, 2 or 4 byte quantity and always reads that + * as a u32, which means that we have to correct the location of + * the data read within those 32 bits for size 1 and 2 + */ + switch(size) { + case 1: + out_8(addr, val >> 24); + return 1; + case 2: + if (port & 1) + return -EINVAL; + out_le16(addr, val >> 16); + return 2; + case 4: + if (port & 3) + return -EINVAL; + out_le32(addr, val); + return 4; + } + return -EINVAL; +} + +/* This provides legacy IO or memory mmap access on a bus */ +int pci_mmap_legacy_page_range(struct pci_bus *bus, + struct vm_area_struct *vma, + enum pci_mmap_state mmap_state) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + resource_size_t offset = + ((resource_size_t)vma->vm_pgoff) << PAGE_SHIFT; + resource_size_t size = vma->vm_end - vma->vm_start; + struct resource *rp; + + pr_debug("pci_mmap_legacy_page_range(%04x:%02x, %s @%llx..%llx)\n", + pci_domain_nr(bus), bus->number, + mmap_state == pci_mmap_mem ? "MEM" : "IO", + (unsigned long long)offset, + (unsigned long long)(offset + size - 1)); + + if (mmap_state == pci_mmap_mem) { + if ((offset + size) > hose->isa_mem_size) + return -ENXIO; + offset += hose->isa_mem_phys; + } else { + unsigned long io_offset = (unsigned long)hose->io_base_virt - _IO_BASE; + unsigned long roffset = offset + io_offset; + rp = &hose->io_resource; + if (!(rp->flags & IORESOURCE_IO)) + return -ENXIO; + if (roffset < rp->start || (roffset + size) > rp->end) + return -ENXIO; + offset += hose->io_base_phys; + } + pr_debug(" -> mapping phys %llx\n", (unsigned long long)offset); + + vma->vm_pgoff = offset >> PAGE_SHIFT; + vma->vm_page_prot |= _PAGE_NO_CACHE | _PAGE_GUARDED; + return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +} + void pci_resource_to_user(const struct pci_dev *dev, int bar, const struct resource *rsrc, resource_size_t *start, resource_size_t *end) @@ -592,6 +718,12 @@ void __devinit pci_process_bridge_OF_ranges(struct pci_controller *hose, cpu_addr = of_translate_address(dev, ranges + 3); size = of_read_number(ranges + pna + 3, 2); ranges += np; + + /* If we failed translation or got a zero-sized region + * (some FW try to feed us with non sensical zero sized regions + * such as power3 which look like some kind of attempt at exposing + * the VGA memory hole) + */ if (cpu_addr == OF_BAD_ADDR || size == 0) continue; @@ -665,6 +797,8 @@ void __devinit pci_process_bridge_OF_ranges(struct pci_controller *hose, isa_hole = memno; if (primary || isa_mem_base == 0) isa_mem_base = cpu_addr; + hose->isa_mem_phys = cpu_addr; + hose->isa_mem_size = size; } /* We get the PCI/Mem offset from the first range or diff --git a/arch/powerpc/platforms/cell/spufs/sputrace.c b/arch/powerpc/platforms/cell/spufs/sputrace.c index 92d20e9..2ece399 100644 --- a/arch/powerpc/platforms/cell/spufs/sputrace.c +++ b/arch/powerpc/platforms/cell/spufs/sputrace.c @@ -232,6 +232,7 @@ static void __exit sputrace_exit(void) remove_proc_entry("sputrace", NULL); kfree(sputrace_log); + marker_synchronize_unregister(); } module_init(sputrace_init); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 49349ba..5b9b123 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -26,6 +26,7 @@ config X86 select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB select HAVE_KRETPROBES + select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) @@ -1242,14 +1243,6 @@ config EFI resultant kernel should continue to boot on existing non-EFI platforms. -config IRQBALANCE - def_bool y - prompt "Enable kernel irq balancing" - depends on X86_32 && SMP && X86_IO_APIC - help - The default yes will allow the kernel to do irq load balancing. - Saying no will keep the kernel from doing irq load balancing. - config SECCOMP def_bool y prompt "Enable seccomp to safely compute untrusted bytecode" diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 52d0359..13b8c86 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -287,7 +287,6 @@ CONFIG_MTRR=y # CONFIG_MTRR_SANITIZER is not set CONFIG_X86_PAT=y CONFIG_EFI=y -# CONFIG_IRQBALANCE is not set CONFIG_SECCOMP=y # CONFIG_HZ_100 is not set # CONFIG_HZ_250 is not set diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0d41f03..d7e5a58 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -23,7 +23,7 @@ CFLAGS_hpet.o := $(nostackp) CFLAGS_tsc.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o -obj-y += traps.o irq_$(BITS).o dumpstack_$(BITS).o +obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o @@ -60,8 +60,8 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o -obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o +obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o @@ -108,7 +108,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o - obj-y += bios_uv.o + obj-y += bios_uv.o uv_irq.o uv_sysfs.o obj-y += genx2apic_cluster.o obj-y += genx2apic_phys.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index eb875cd..0d1c26a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1256,7 +1256,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, - NR_IRQ_VECTORS); + nr_irqs); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n"); @@ -1276,7 +1276,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, - NR_IRQ_VECTORS); + nr_irqs); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 426e5d9..c44cd6d 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -10,6 +10,7 @@ #include <linux/dmi.h> #include <linux/cpumask.h> #include <asm/segment.h> +#include <asm/desc.h> #include "realmode/wakeup.h" #include "sleep.h" @@ -98,6 +99,8 @@ int acpi_save_state_mem(void) header->trampoline_segment = setup_trampoline() >> 4; #ifdef CONFIG_SMP stack_start.sp = temp_stack + 4096; + early_gdt_descr.address = + (unsigned long)get_cpu_gdt_table(smp_processor_id()); #endif initial_code = (unsigned long)wakeup_long64; saved_magic = 0x123456789abcdef0; diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic.c index 21c831d..04a7f96 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic.c @@ -23,11 +23,13 @@ #include <linux/mc146818rtc.h> #include <linux/kernel_stat.h> #include <linux/sysdev.h> +#include <linux/ioport.h> #include <linux/cpu.h> #include <linux/clockchips.h> #include <linux/acpi_pmtmr.h> #include <linux/module.h> #include <linux/dmi.h> +#include <linux/dmar.h> #include <asm/atomic.h> #include <asm/smp.h> @@ -36,8 +38,14 @@ #include <asm/desc.h> #include <asm/arch_hooks.h> #include <asm/hpet.h> +#include <asm/pgalloc.h> #include <asm/i8253.h> #include <asm/nmi.h> +#include <asm/idle.h> +#include <asm/proto.h> +#include <asm/timex.h> +#include <asm/apic.h> +#include <asm/i8259.h> #include <mach_apic.h> #include <mach_apicdef.h> @@ -50,16 +58,58 @@ # error SPURIOUS_APIC_VECTOR definition error #endif -unsigned long mp_lapic_addr; - +#ifdef CONFIG_X86_32 /* * Knob to control our willingness to enable the local APIC. * * +1=force-enable */ static int force_enable_local_apic; -int disable_apic; +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + force_enable_local_apic = 1; + return 0; +} +early_param("lapic", parse_lapic); +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase; + +#endif + +#ifdef CONFIG_X86_64 +static int apic_calibrate_pmtmr __initdata; +static __init int setup_apicpmtimer(char *s) +{ + apic_calibrate_pmtmr = 1; + notsc_setup(NULL); + return 0; +} +__setup("apicpmtimer", setup_apicpmtimer); +#endif + +#ifdef CONFIG_X86_64 +#define HAVE_X2APIC +#endif + +#ifdef HAVE_X2APIC +int x2apic; +/* x2apic enabled before OS handover */ +int x2apic_preenabled; +int disable_x2apic; +static __init int setup_nox2apic(char *str) +{ + disable_x2apic = 1; + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + return 0; +} +early_param("nox2apic", setup_nox2apic); +#endif +unsigned long mp_lapic_addr; +int disable_apic; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ static int disable_apic_timer __cpuinitdata; /* Local APIC timer works in C2 */ @@ -110,9 +160,6 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); -/* Local APIC was disabled by the BIOS and enabled by the kernel */ -static int enabled_via_apicbase; - static unsigned long apic_phys; /* @@ -202,6 +249,42 @@ static struct apic_ops xapic_ops = { struct apic_ops __read_mostly *apic_ops = &xapic_ops; EXPORT_SYMBOL_GPL(apic_ops); +#ifdef HAVE_X2APIC +static void x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return; +} + +static u32 safe_x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return 0; +} + +void x2apic_icr_write(u32 low, u32 id) +{ + wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); +} + +u64 x2apic_icr_read(void) +{ + unsigned long val; + + rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); + return val; +} + +static struct apic_ops x2apic_ops = { + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .icr_read = x2apic_icr_read, + .icr_write = x2apic_icr_write, + .wait_icr_idle = x2apic_wait_icr_idle, + .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, +}; +#endif + /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 */ @@ -219,6 +302,7 @@ void __cpuinit enable_NMI_through_LVT0(void) apic_write(APIC_LVT0, v); } +#ifdef CONFIG_X86_32 /** * get_physical_broadcast - Get number of physical broadcast IDs */ @@ -226,6 +310,7 @@ int get_physical_broadcast(void) { return modern_apic() ? 0xff : 0xf; } +#endif /** * lapic_get_maxlvt - get the maximum number of local vector table entries @@ -247,11 +332,7 @@ int lapic_get_maxlvt(void) */ /* Clock divisor */ -#ifdef CONFG_X86_64 -#define APIC_DIVISOR 1 -#else #define APIC_DIVISOR 16 -#endif /* * This function sets up the local APIC timer, with a timeout of @@ -383,7 +464,7 @@ static void lapic_timer_broadcast(cpumask_t mask) * Setup the local APIC timer for this CPU. Copy the initilized values * of the boot CPU and register the clock event in the framework. */ -static void __devinit setup_APIC_timer(void) +static void __cpuinit setup_APIC_timer(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); @@ -453,14 +534,51 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) } } +static int __init calibrate_by_pmtimer(long deltapm, long *delta) +{ + const long pm_100ms = PMTMR_TICKS_PER_SEC / 10; + const long pm_thresh = pm_100ms / 100; + unsigned long mult; + u64 res; + +#ifndef CONFIG_X86_PM_TIMER + return -1; +#endif + + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + /* Check, if the PM timer is available */ + if (!deltapm) + return -1; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + } else { + res = (((u64)deltapm) * mult) >> 22; + do_div(res, 1000000); + printk(KERN_WARNING "APIC calibration not consistent " + "with PM Timer: %ldms instead of 100ms\n", + (long)res); + /* Correct the lapic counter value */ + res = (((u64)(*delta)) * pm_100ms); + do_div(res, deltapm); + printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long)res, *delta); + *delta = (long)res; + } + + return 0; +} + static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); - const long pm_100ms = PMTMR_TICKS_PER_SEC/10; - const long pm_thresh = pm_100ms/100; void (*real_handler)(struct clock_event_device *dev); unsigned long deltaj; - long delta, deltapm; + long delta; int pm_referenced = 0; local_irq_disable(); @@ -470,10 +588,10 @@ static int __init calibrate_APIC_clock(void) global_clock_event->event_handler = lapic_cal_handler; /* - * Setup the APIC counter to 1e9. There is no way the lapic + * Setup the APIC counter to maximum. There is no way the lapic * can underflow in the 100ms detection time frame */ - __setup_APIC_LVTT(1000000000, 0, 0); + __setup_APIC_LVTT(0xffffffff, 0, 0); /* Let the interrupts run */ local_irq_enable(); @@ -490,34 +608,9 @@ static int __init calibrate_APIC_clock(void) delta = lapic_cal_t1 - lapic_cal_t2; apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); - /* Check, if the PM timer is available */ - deltapm = lapic_cal_pm2 - lapic_cal_pm1; - apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); - - if (deltapm) { - unsigned long mult; - u64 res; - - mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); - - if (deltapm > (pm_100ms - pm_thresh) && - deltapm < (pm_100ms + pm_thresh)) { - apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); - } else { - res = (((u64) deltapm) * mult) >> 22; - do_div(res, 1000000); - printk(KERN_WARNING "APIC calibration not consistent " - "with PM Timer: %ldms instead of 100ms\n", - (long)res); - /* Correct the lapic counter value */ - res = (((u64) delta) * pm_100ms); - do_div(res, deltapm); - printk(KERN_INFO "APIC delta adjusted to PM-Timer: " - "%lu (%ld)\n", (unsigned long) res, delta); - delta = (long) res; - } - pm_referenced = 1; - } + /* we trust the PM based calibration if possible */ + pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, + &delta); /* Calculate the scaled math multiplication factor */ lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, @@ -559,7 +652,10 @@ static int __init calibrate_APIC_clock(void) levt->features &= ~CLOCK_EVT_FEAT_DUMMY; - /* We trust the pm timer based calibration */ + /* + * PM timer calibration failed or not turned on + * so lets try APIC timer based calibration + */ if (!pm_referenced) { apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); @@ -652,7 +748,7 @@ void __init setup_boot_APIC_clock(void) setup_APIC_timer(); } -void __devinit setup_secondary_APIC_clock(void) +void __cpuinit setup_secondary_APIC_clock(void) { setup_APIC_timer(); } @@ -718,6 +814,9 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ +#ifdef CONFIG_X86_64 + exit_idle(); +#endif irq_enter(); local_apic_timer_interrupt(); irq_exit(); @@ -991,40 +1090,43 @@ void __init init_bsp_APIC(void) static void __cpuinit lapic_setup_esr(void) { - unsigned long oldvalue, value, maxlvt; - if (lapic_is_integrated() && !esr_disable) { - if (esr_disable) { - /* - * Something untraceable is creating bad interrupts on - * secondary quads ... for the moment, just leave the - * ESR disabled - we can't do anything useful with the - * errors anyway - mbligh - */ - printk(KERN_INFO "Leaving ESR disabled.\n"); - return; - } - /* !82489DX */ - maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - oldvalue = apic_read(APIC_ESR); + unsigned int oldvalue, value, maxlvt; + + if (!lapic_is_integrated()) { + printk(KERN_INFO "No ESR for 82489DX.\n"); + return; + } - /* enables sending errors */ - value = ERROR_APIC_VECTOR; - apic_write(APIC_LVTERR, value); + if (esr_disable) { /* - * spec says clear errors after enabling vector. + * Something untraceable is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, "ESR value before enabling " - "vector: 0x%08lx after: 0x%08lx\n", - oldvalue, value); - } else { - printk(KERN_INFO "No ESR for 82489DX.\n"); + printk(KERN_INFO "Leaving ESR disabled.\n"); + return; } + + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + + /* enables sending errors */ + value = ERROR_APIC_VECTOR; + apic_write(APIC_LVTERR, value); + + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, "ESR value before enabling " + "vector: 0x%08x after: 0x%08x\n", + oldvalue, value); } @@ -1033,24 +1135,27 @@ static void __cpuinit lapic_setup_esr(void) */ void __cpuinit setup_local_APIC(void) { - unsigned long value, integrated; + unsigned int value; int i, j; +#ifdef CONFIG_X86_32 /* Pound the ESR really hard over the head with a big hammer - mbligh */ - if (esr_disable) { + if (lapic_is_integrated() && esr_disable) { apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); } +#endif - integrated = lapic_is_integrated(); + preempt_disable(); /* * Double-check whether this APIC is really registered. + * This is meaningless in clustered apic mode, so we skip it. */ if (!apic_id_registered()) - WARN_ON_ONCE(1); + BUG(); /* * Intel recommends to set DFR, LDR and TPR before enabling @@ -1096,6 +1201,7 @@ void __cpuinit setup_local_APIC(void) */ value |= APIC_SPIV_APIC_ENABLED; +#ifdef CONFIG_X86_32 /* * Some unknown Intel IO/APIC (or APIC) errata is biting us with * certain networking cards. If high frequency interrupts are @@ -1116,8 +1222,13 @@ void __cpuinit setup_local_APIC(void) * See also the comment in end_level_ioapic_irq(). --macro */ - /* Enable focus processor (bit==0) */ + /* + * - enable focus processor (bit==0) + * - 64bit mode always use processor focus + * so no need to set it + */ value &= ~APIC_SPIV_FOCUS_DISABLED; +#endif /* * Set spurious IRQ vector @@ -1154,9 +1265,11 @@ void __cpuinit setup_local_APIC(void) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; - if (!integrated) /* 82489DX */ + if (!lapic_is_integrated()) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; apic_write(APIC_LVT1, value); + + preempt_enable(); } void __cpuinit end_local_APIC_setup(void) @@ -1177,6 +1290,153 @@ void __cpuinit end_local_APIC_setup(void) apic_pm_activate(); } +#ifdef HAVE_X2APIC +void check_x2apic(void) +{ + int msr, msr2; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + + if (msr & X2APIC_ENABLE) { + printk("x2apic enabled by BIOS, switching to x2apic ops\n"); + x2apic_preenabled = x2apic = 1; + apic_ops = &x2apic_ops; + } +} + +void enable_x2apic(void) +{ + int msr, msr2; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + if (!(msr & X2APIC_ENABLE)) { + printk("Enabling x2apic\n"); + wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); + } +} + +void enable_IR_x2apic(void) +{ +#ifdef CONFIG_INTR_REMAP + int ret; + unsigned long flags; + + if (!cpu_has_x2apic) + return; + + if (!x2apic_preenabled && disable_x2apic) { + printk(KERN_INFO + "Skipped enabling x2apic and Interrupt-remapping " + "because of nox2apic\n"); + return; + } + + if (x2apic_preenabled && disable_x2apic) + panic("Bios already enabled x2apic, can't enforce nox2apic"); + + if (!x2apic_preenabled && skip_ioapic_setup) { + printk(KERN_INFO + "Skipped enabling x2apic and Interrupt-remapping " + "because of skipping io-apic setup\n"); + return; + } + + ret = dmar_table_init(); + if (ret) { + printk(KERN_INFO + "dmar_table_init() failed with %d:\n", ret); + + if (x2apic_preenabled) + panic("x2apic enabled by bios. But IR enabling failed"); + else + printk(KERN_INFO + "Not enabling x2apic,Intr-remapping\n"); + return; + } + + local_irq_save(flags); + mask_8259A(); + + ret = save_mask_IO_APIC_setup(); + if (ret) { + printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret); + goto end; + } + + ret = enable_intr_remapping(1); + + if (ret && x2apic_preenabled) { + local_irq_restore(flags); + panic("x2apic enabled by bios. But IR enabling failed"); + } + + if (ret) + goto end_restore; + + if (!x2apic) { + x2apic = 1; + apic_ops = &x2apic_ops; + enable_x2apic(); + } + +end_restore: + if (ret) + /* + * IR enabling failed + */ + restore_IO_APIC_setup(); + else + reinit_intr_remapped_IO_APIC(x2apic_preenabled); + +end: + unmask_8259A(); + local_irq_restore(flags); + + if (!ret) { + if (!x2apic_preenabled) + printk(KERN_INFO + "Enabled x2apic and interrupt-remapping\n"); + else + printk(KERN_INFO + "Enabled Interrupt-remapping\n"); + } else + printk(KERN_ERR + "Failed to enable Interrupt-remapping and x2apic\n"); +#else + if (!cpu_has_x2apic) + return; + + if (x2apic_preenabled) + panic("x2apic enabled prior OS handover," + " enable CONFIG_INTR_REMAP"); + + printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping " + " and x2apic\n"); +#endif + + return; +} +#endif /* HAVE_X2APIC */ + +#ifdef CONFIG_X86_64 +/* + * Detect and enable local APICs on non-SMP boards. + * Original code written by Keir Fraser. + * On AMD64 we trust the BIOS - if it says no APIC it is likely + * not correctly set up (usually the APIC timer won't work etc.) + */ +static int __init detect_init_APIC(void) +{ + if (!cpu_has_apic) { + printk(KERN_INFO "No local APIC present\n"); + return -1; + } + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + boot_cpu_physical_apicid = 0; + return 0; +} +#else /* * Detect and initialize APIC */ @@ -1255,12 +1515,46 @@ no_apic: printk(KERN_INFO "No local APIC present or hardware disabled\n"); return -1; } +#endif + +#ifdef CONFIG_X86_64 +void __init early_init_lapic_mapping(void) +{ + unsigned long phys_addr; + + /* + * If no local APIC can be found then go out + * : it means there is no mpatable and MADT + */ + if (!smp_found_config) + return; + + phys_addr = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, phys_addr); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, phys_addr); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + boot_cpu_physical_apicid = read_apic_id(); +} +#endif /** * init_apic_mappings - initialize APIC mappings */ void __init init_apic_mappings(void) { +#ifdef HAVE_X2APIC + if (x2apic) { + boot_cpu_physical_apicid = read_apic_id(); + return; + } +#endif + /* * If no local APIC can be found then set up a fake all * zeroes page to simulate the local APIC and another @@ -1273,8 +1567,8 @@ void __init init_apic_mappings(void) apic_phys = mp_lapic_addr; set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE, - apic_phys); + apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", + APIC_BASE, apic_phys); /* * Fetch the APIC ID of the BSP in case we have a @@ -1282,18 +1576,27 @@ void __init init_apic_mappings(void) */ if (boot_cpu_physical_apicid == -1U) boot_cpu_physical_apicid = read_apic_id(); - } /* * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ - int apic_version[MAX_APICS]; int __init APIC_init_uniprocessor(void) { +#ifdef CONFIG_X86_64 + if (disable_apic) { + printk(KERN_INFO "Apic disabled\n"); + return -1; + } + if (!cpu_has_apic) { + disable_apic = 1; + printk(KERN_INFO "Apic disabled by BIOS\n"); + return -1; + } +#else if (!smp_found_config && !cpu_has_apic) return -1; @@ -1302,39 +1605,68 @@ int __init APIC_init_uniprocessor(void) */ if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } +#endif - verify_local_APIC(); +#ifdef HAVE_X2APIC + enable_IR_x2apic(); +#endif +#ifdef CONFIG_X86_64 + setup_apic_routing(); +#endif + verify_local_APIC(); connect_bsp_APIC(); +#ifdef CONFIG_X86_64 + apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); +#else /* * Hack: In case of kdump, after a crash, kernel might be booting * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid * might be zero if read from MP tables. Get it from LAPIC. */ -#ifdef CONFIG_CRASH_DUMP +# ifdef CONFIG_CRASH_DUMP boot_cpu_physical_apicid = read_apic_id(); +# endif #endif physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); - setup_local_APIC(); +#ifdef CONFIG_X86_64 + /* + * Now enable IO-APICs, actually call clear_IO_APIC + * We need clear_IO_APIC before enabling vector on BP + */ + if (!skip_ioapic_setup && nr_ioapics) + enable_IO_APIC(); +#endif + #ifdef CONFIG_X86_IO_APIC if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) #endif localise_nmi_watchdog(); end_local_APIC_setup(); + #ifdef CONFIG_X86_IO_APIC - if (smp_found_config) - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +# ifdef CONFIG_X86_64 + else + nr_ioapics = 0; +# endif #endif + +#ifdef CONFIG_X86_64 + setup_boot_APIC_clock(); + check_nmi_watchdog(); +#else setup_boot_clock(); +#endif return 0; } @@ -1348,8 +1680,11 @@ int __init APIC_init_uniprocessor(void) */ void smp_spurious_interrupt(struct pt_regs *regs) { - unsigned long v; + u32 v; +#ifdef CONFIG_X86_64 + exit_idle(); +#endif irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@ -1360,10 +1695,14 @@ void smp_spurious_interrupt(struct pt_regs *regs) if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); +#ifdef CONFIG_X86_64 + add_pda(irq_spurious_count, 1); +#else /* see sw-dev-man vol 3, chapter 7.4.13.5 */ printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); __get_cpu_var(irq_stat).irq_spurious_count++; +#endif irq_exit(); } @@ -1372,8 +1711,11 @@ void smp_spurious_interrupt(struct pt_regs *regs) */ void smp_error_interrupt(struct pt_regs *regs) { - unsigned long v, v1; + u32 v, v1; +#ifdef CONFIG_X86_64 + exit_idle(); +#endif irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); @@ -1392,7 +1734,7 @@ void smp_error_interrupt(struct pt_regs *regs) 6: Received illegal vector 7: Illegal register address */ - printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", + printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", smp_processor_id(), v , v1); irq_exit(); } @@ -1565,6 +1907,13 @@ void __cpuinit generic_processor_info(int apicid, int version) cpu_set(cpu, cpu_present_map); } +#ifdef CONFIG_X86_64 +int hard_smp_processor_id(void) +{ + return read_apic_id(); +} +#endif + /* * Power management */ @@ -1640,7 +1989,7 @@ static int lapic_resume(struct sys_device *dev) local_irq_save(flags); -#ifdef CONFIG_X86_64 +#ifdef HAVE_X2APIC if (x2apic) enable_x2apic(); else @@ -1702,7 +2051,7 @@ static struct sys_device device_lapic = { .cls = &lapic_sysclass, }; -static void __devinit apic_pm_activate(void) +static void __cpuinit apic_pm_activate(void) { apic_pm_state.active = 1; } @@ -1728,16 +2077,87 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ +#ifdef CONFIG_X86_64 /* - * APIC command line parameters + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. Use available data to take a good guess. + * If in doubt, go HPET. */ -static int __init parse_lapic(char *arg) +__cpuinit int apic_is_clustered_box(void) { - force_enable_local_apic = 1; - return 0; + int i, clusters, zeros; + unsigned id; + u16 *bios_cpu_apicid; + DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); + + /* + * there is not this kind of box with AMD CPU yet. + * Some AMD box with quadcore cpu and 8 sockets apicid + * will be [4, 0x23] or [8, 0x27] could be thought to + * vsmp box still need checking... + */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) + return 0; + + bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); + bitmap_zero(clustermap, NUM_APIC_CLUSTERS); + + for (i = 0; i < NR_CPUS; i++) { + /* are we being called early in kernel startup? */ + if (bios_cpu_apicid) { + id = bios_cpu_apicid[i]; + } + else if (i < nr_cpu_ids) { + if (cpu_present(i)) + id = per_cpu(x86_bios_cpu_apicid, i); + else + continue; + } + else + break; + + if (id != BAD_APICID) + __set_bit(APIC_CLUSTERID(id), clustermap); + } + + /* Problem: Partially populated chassis may not have CPUs in some of + * the APIC clusters they have been allocated. Only present CPUs have + * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. + * Since clusters are allocated sequentially, count zeros only if + * they are bounded by ones. + */ + clusters = 0; + zeros = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { + if (test_bit(i, clustermap)) { + clusters += 1 + zeros; + zeros = 0; + } else + ++zeros; + } + + /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are + * not guaranteed to be synced between boards + */ + if (is_vsmp_box() && clusters > 1) + return 1; + + /* + * If clusters > 2, then should be multi-chassis. + * May have to revisit this when multi-core + hyperthreaded CPUs come + * out, but AFAIK this will work even for them. + */ + return (clusters > 2); } -early_param("lapic", parse_lapic); +#endif +/* + * APIC command line parameters + */ static int __init setup_disableapic(char *arg) { disable_apic = 1; @@ -1779,7 +2199,6 @@ static int __init apic_set_verbosity(char *arg) if (!arg) { #ifdef CONFIG_X86_64 skip_ioapic_setup = 0; - ioapic_force = 1; return 0; #endif return -EINVAL; diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c deleted file mode 100644 index 94ddb69..0000000 --- a/arch/x86/kernel/apic_64.c +++ /dev/null @@ -1,1848 +0,0 @@ -/* - * Local APIC handling, local APIC timers - * - * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively. - * Maciej W. Rozycki : Various updates and fixes. - * Mikael Pettersson : Power Management for UP-APIC. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. - */ - -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/interrupt.h> -#include <linux/mc146818rtc.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> -#include <linux/ioport.h> -#include <linux/clockchips.h> -#include <linux/acpi_pmtmr.h> -#include <linux/module.h> -#include <linux/dmar.h> - -#include <asm/atomic.h> -#include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> -#include <asm/hpet.h> -#include <asm/pgalloc.h> -#include <asm/nmi.h> -#include <asm/idle.h> -#include <asm/proto.h> -#include <asm/timex.h> -#include <asm/apic.h> -#include <asm/i8259.h> - -#include <mach_ipi.h> -#include <mach_apic.h> - -/* Disable local APIC timer from the kernel commandline or via dmi quirk */ -static int disable_apic_timer __cpuinitdata; -static int apic_calibrate_pmtmr __initdata; -int disable_apic; -int disable_x2apic; -int x2apic; - -/* x2apic enabled before OS handover */ -int x2apic_preenabled; - -/* Local APIC timer works in C2 */ -int local_apic_timer_c2_ok; -EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); - -/* - * Debug level, exported for io_apic.c - */ -unsigned int apic_verbosity; - -/* Have we found an MP table */ -int smp_found_config; - -static struct resource lapic_resource = { - .name = "Local APIC", - .flags = IORESOURCE_MEM | IORESOURCE_BUSY, -}; - -static unsigned int calibration_result; - -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt); -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt); -static void lapic_timer_broadcast(cpumask_t mask); -static void apic_pm_activate(void); - -/* - * The local apic timer can be used for any function which is CPU local. - */ -static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT - | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_mode = lapic_timer_setup, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, -}; -static DEFINE_PER_CPU(struct clock_event_device, lapic_events); - -static unsigned long apic_phys; - -unsigned long mp_lapic_addr; - -/* - * Get the LAPIC version - */ -static inline int lapic_get_version(void) -{ - return GET_APIC_VERSION(apic_read(APIC_LVR)); -} - -/* - * Check, if the APIC is integrated or a separate chip - */ -static inline int lapic_is_integrated(void) -{ -#ifdef CONFIG_X86_64 - return 1; -#else - return APIC_INTEGRATED(lapic_get_version()); -#endif -} - -/* - * Check, whether this is a modern or a first generation APIC - */ -static int modern_apic(void) -{ - /* AMD systems use old APIC versions, so check the CPU */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 0xf) - return 1; - return lapic_get_version() >= 0x14; -} - -/* - * Paravirt kernels also might be using these below ops. So we still - * use generic apic_read()/apic_write(), which might be pointing to different - * ops in PARAVIRT case. - */ -void xapic_wait_icr_idle(void) -{ - while (apic_read(APIC_ICR) & APIC_ICR_BUSY) - cpu_relax(); -} - -u32 safe_xapic_wait_icr_idle(void) -{ - u32 send_status; - int timeout; - - timeout = 0; - do { - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - if (!send_status) - break; - udelay(100); - } while (timeout++ < 1000); - - return send_status; -} - -void xapic_icr_write(u32 low, u32 id) -{ - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); - apic_write(APIC_ICR, low); -} - -u64 xapic_icr_read(void) -{ - u32 icr1, icr2; - - icr2 = apic_read(APIC_ICR2); - icr1 = apic_read(APIC_ICR); - - return icr1 | ((u64)icr2 << 32); -} - -static struct apic_ops xapic_ops = { - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .icr_read = xapic_icr_read, - .icr_write = xapic_icr_write, - .wait_icr_idle = xapic_wait_icr_idle, - .safe_wait_icr_idle = safe_xapic_wait_icr_idle, -}; - -struct apic_ops __read_mostly *apic_ops = &xapic_ops; -EXPORT_SYMBOL_GPL(apic_ops); - -static void x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return; -} - -static u32 safe_x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return 0; -} - -void x2apic_icr_write(u32 low, u32 id) -{ - wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); -} - -u64 x2apic_icr_read(void) -{ - unsigned long val; - - rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); - return val; -} - -static struct apic_ops x2apic_ops = { - .read = native_apic_msr_read, - .write = native_apic_msr_write, - .icr_read = x2apic_icr_read, - .icr_write = x2apic_icr_write, - .wait_icr_idle = x2apic_wait_icr_idle, - .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, -}; - -/** - * enable_NMI_through_LVT0 - enable NMI through local vector table 0 - */ -void __cpuinit enable_NMI_through_LVT0(void) -{ - unsigned int v; - - /* unmask and set to NMI */ - v = APIC_DM_NMI; - - /* Level triggered for 82489DX (32bit mode) */ - if (!lapic_is_integrated()) - v |= APIC_LVT_LEVEL_TRIGGER; - - apic_write(APIC_LVT0, v); -} - -/** - * lapic_get_maxlvt - get the maximum number of local vector table entries - */ -int lapic_get_maxlvt(void) -{ - unsigned int v; - - v = apic_read(APIC_LVR); - /* - * - we always have APIC integrated on 64bit mode - * - 82489DXs do not report # of LVT entries - */ - return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; -} - -/* - * Local APIC timer - */ - -/* Clock divisor */ -#ifdef CONFG_X86_64 -#define APIC_DIVISOR 1 -#else -#define APIC_DIVISOR 16 -#endif - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. - */ -static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) -{ - unsigned int lvtt_value, tmp_value; - - lvtt_value = LOCAL_TIMER_VECTOR; - if (!oneshot) - lvtt_value |= APIC_LVT_TIMER_PERIODIC; - if (!lapic_is_integrated()) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); - - if (!irqen) - lvtt_value |= APIC_LVT_MASKED; - - apic_write(APIC_LVTT, lvtt_value); - - /* - * Divide PICLK by 16 - */ - tmp_value = apic_read(APIC_TDCR); - apic_write(APIC_TDCR, - (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | - APIC_TDR_DIV_16); - - if (!oneshot) - apic_write(APIC_TMICT, clocks / APIC_DIVISOR); -} - -/* - * Setup extended LVT, AMD specific (K8, family 10h) - * - * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and - * MCE interrupts are supported. Thus MCE offset must be set to 0. - * - * If mask=1, the LVT entry does not generate interrupts while mask=0 - * enables the vector. See also the BKDGs. - */ - -#define APIC_EILVT_LVTOFF_MCE 0 -#define APIC_EILVT_LVTOFF_IBS 1 - -static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) -{ - unsigned long reg = (lvt_off << 4) + APIC_EILVT0; - unsigned int v = (mask << 16) | (msg_type << 8) | vector; - - apic_write(reg, v); -} - -u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_MCE; -} - -u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_IBS; -} -EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); - -/* - * Program the next event, relative to now - */ -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - apic_write(APIC_TMICT, delta); - return 0; -} - -/* - * Setup the lapic timer in periodic or oneshot mode - */ -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - unsigned long flags; - unsigned int v; - - /* Lapic used as dummy for broadcast ? */ - if (evt->features & CLOCK_EVT_FEAT_DUMMY) - return; - - local_irq_save(flags); - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - case CLOCK_EVT_MODE_ONESHOT: - __setup_APIC_LVTT(calibration_result, - mode != CLOCK_EVT_MODE_PERIODIC, 1); - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - v = apic_read(APIC_LVTT); - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, v); - break; - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here */ - break; - } - - local_irq_restore(flags); -} - -/* - * Local APIC timer broadcast function - */ -static void lapic_timer_broadcast(cpumask_t mask) -{ -#ifdef CONFIG_SMP - send_IPI_mask(mask, LOCAL_TIMER_VECTOR); -#endif -} - -/* - * Setup the local APIC timer for this CPU. Copy the initilized values - * of the boot CPU and register the clock event in the framework. - */ -static void setup_APIC_timer(void) -{ - struct clock_event_device *levt = &__get_cpu_var(lapic_events); - - memcpy(levt, &lapic_clockevent, sizeof(*levt)); - levt->cpumask = cpumask_of_cpu(smp_processor_id()); - - clockevents_register_device(levt); -} - -/* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. - */ - -#define TICK_COUNT 100000000 - -static int __init calibrate_APIC_clock(void) -{ - unsigned apic, apic_start; - unsigned long tsc, tsc_start; - int result; - - local_irq_disable(); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - * - * No interrupt enable ! - */ - __setup_APIC_LVTT(250000000, 0, 0); - - apic_start = apic_read(APIC_TMCCT); -#ifdef CONFIG_X86_PM_TIMER - if (apic_calibrate_pmtmr && pmtmr_ioport) { - pmtimer_wait(5000); /* 5ms wait */ - apic = apic_read(APIC_TMCCT); - result = (apic_start - apic) * 1000L / 5; - } else -#endif - { - rdtscll(tsc_start); - - do { - apic = apic_read(APIC_TMCCT); - rdtscll(tsc); - } while ((tsc - tsc_start) < TICK_COUNT && - (apic_start - apic) < TICK_COUNT); - - result = (apic_start - apic) * 1000L * tsc_khz / - (tsc - tsc_start); - } - - local_irq_enable(); - - printk(KERN_DEBUG "APIC timer calibration result %d\n", result); - - printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", - result / 1000 / 1000, result / 1000 % 1000); - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - calibration_result = (result * APIC_DIVISOR) / HZ; - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); - return -1; - } - - return 0; -} - -/* - * Setup the boot APIC - * - * Calibrate and verify the result. - */ -void __init setup_boot_APIC_clock(void) -{ - /* - * The local apic timer can be disabled via the kernel - * commandline or from the CPU detection code. Register the lapic - * timer as a dummy clock event source on SMP systems, so the - * broadcast mechanism is used. On UP systems simply ignore it. - */ - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) { - lapic_clockevent.mult = 1; - setup_APIC_timer(); - } - return; - } - - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); - - if (calibrate_APIC_clock()) { - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) - setup_APIC_timer(); - return; - } - - /* - * If nmi_watchdog is set to IO_APIC, we need the - * PIT/HPET going. Otherwise register lapic as a dummy - * device. - */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); - - /* Setup the lapic or request the broadcast */ - setup_APIC_timer(); -} - -void __cpuinit setup_secondary_APIC_clock(void) -{ - setup_APIC_timer(); -} - -/* - * The guts of the apic timer interrupt - */ -static void local_apic_timer_interrupt(void) -{ - int cpu = smp_processor_id(); - struct clock_event_device *evt = &per_cpu(lapic_events, cpu); - - /* - * Normally we should not be here till LAPIC has been initialized but - * in some cases like kdump, its possible that there is a pending LAPIC - * timer interrupt from previous kernel's context and is delivered in - * new kernel the moment interrupts are enabled. - * - * Interrupts are enabled early and LAPIC is setup much later, hence - * its possible that when we get here evt->event_handler is NULL. - * Check for event_handler being NULL and discard the interrupt as - * spurious. - */ - if (!evt->event_handler) { - printk(KERN_WARNING - "Spurious LAPIC timer interrupt on cpu %d\n", cpu); - /* Switch it off */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); - return; - } - - /* - * the NMI deadlock-detector uses this. - */ -#ifdef CONFIG_X86_64 - add_pda(apic_timer_irqs, 1); -#else - per_cpu(irq_stat, cpu).apic_timer_irqs++; -#endif - - evt->event_handler(evt); -} - -/* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] - */ -void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - exit_idle(); - irq_enter(); - local_apic_timer_interrupt(); - irq_exit(); - - set_irq_regs(old_regs); -} - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - - -/* - * Local APIC start and shutdown - */ - -/** - * clear_local_APIC - shutdown the local APIC - * - * This is called, when a CPU is disabled and before rebooting, so the state of - * the local APIC has no dangling leftovers. Also used to cleanout any BIOS - * leftovers during boot. - */ -void clear_local_APIC(void) -{ - int maxlvt; - u32 v; - - /* APIC hasn't been mapped yet */ - if (!apic_phys) - return; - - maxlvt = lapic_get_maxlvt(); - /* - * Masking an LVT entry can trigger a local APIC error - * if the vector is zero. Mask LVTERR first to prevent this. - */ - if (maxlvt >= 3) { - v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ - apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); - } - /* - * Careful: we have to set masks only first to deassert - * any level-triggered sources. - */ - v = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT1); - apic_write(APIC_LVT1, v | APIC_LVT_MASKED); - if (maxlvt >= 4) { - v = apic_read(APIC_LVTPC); - apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); - } - - /* lets not touch this if we didn't frob it */ -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL) - if (maxlvt >= 5) { - v = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); - } -#endif - /* - * Clean APIC state for other OSs: - */ - apic_write(APIC_LVTT, APIC_LVT_MASKED); - apic_write(APIC_LVT0, APIC_LVT_MASKED); - apic_write(APIC_LVT1, APIC_LVT_MASKED); - if (maxlvt >= 3) - apic_write(APIC_LVTERR, APIC_LVT_MASKED); - if (maxlvt >= 4) - apic_write(APIC_LVTPC, APIC_LVT_MASKED); - - /* Integrated APIC (!82489DX) ? */ - if (lapic_is_integrated()) { - if (maxlvt > 3) - /* Clear ESR due to Pentium errata 3AP and 11AP */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } -} - -/** - * disable_local_APIC - clear and disable the local APIC - */ -void disable_local_APIC(void) -{ - unsigned int value; - - clear_local_APIC(); - - /* - * Disable APIC (implies clearing of registers - * for 82489DX!). - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_SPIV_APIC_ENABLED; - apic_write(APIC_SPIV, value); - -#ifdef CONFIG_X86_32 - /* - * When LAPIC was disabled by the BIOS and enabled by the kernel, - * restore the disabled state. - */ - if (enabled_via_apicbase) { - unsigned int l, h; - - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_ENABLE; - wrmsr(MSR_IA32_APICBASE, l, h); - } -#endif -} - -/* - * If Linux enabled the LAPIC against the BIOS default disable it down before - * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and - * not power-off. Additionally clear all LVT entries before disable_local_APIC - * for the case where Linux didn't enable the LAPIC. - */ -void lapic_shutdown(void) -{ - unsigned long flags; - - if (!cpu_has_apic) - return; - - local_irq_save(flags); - -#ifdef CONFIG_X86_32 - if (!enabled_via_apicbase) - clear_local_APIC(); - else -#endif - disable_local_APIC(); - - - local_irq_restore(flags); -} - -/* - * This is to verify that we're looking at a real local APIC. - * Check these against your board if the CPUs aren't getting - * started for no apparent reason. - */ -int __init verify_local_APIC(void) -{ - unsigned int reg0, reg1; - - /* - * The version register is read-only in a real APIC. - */ - reg0 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); - apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); - reg1 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); - - /* - * The two version reads above should print the same - * numbers. If the second one is different, then we - * poke at a non-APIC. - */ - if (reg1 != reg0) - return 0; - - /* - * Check if the version looks reasonably. - */ - reg1 = GET_APIC_VERSION(reg0); - if (reg1 == 0x00 || reg1 == 0xff) - return 0; - reg1 = lapic_get_maxlvt(); - if (reg1 < 0x02 || reg1 == 0xff) - return 0; - - /* - * The ID register is read/write in a real APIC. - */ - reg0 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); - apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); - reg1 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); - apic_write(APIC_ID, reg0); - if (reg1 != (reg0 ^ APIC_ID_MASK)) - return 0; - - /* - * The next two are just to see if we have sane values. - * They're only really relevant if we're in Virtual Wire - * compatibility mode, but most boxes are anymore. - */ - reg0 = apic_read(APIC_LVT0); - apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); - reg1 = apic_read(APIC_LVT1); - apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); - - return 1; -} - -/** - * sync_Arb_IDs - synchronize APIC bus arbitration IDs - */ -void __init sync_Arb_IDs(void) -{ - /* - * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not - * needed on AMD. - */ - if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, APIC_DEST_ALLINC | - APIC_INT_LEVELTRIG | APIC_DM_INIT); -} - -/* - * An initial setup of the virtual wire mode. - */ -void __init init_bsp_APIC(void) -{ - unsigned int value; - - /* - * Don't do the setup now if we have a SMP BIOS as the - * through-I/O-APIC virtual wire mode might be active. - */ - if (smp_found_config || !cpu_has_apic) - return; - - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - - /* - * Enable APIC. - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - -#ifdef CONFIG_X86_32 - /* This bit is reserved on P4/Xeon and should be cleared */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - (boot_cpu_data.x86 == 15)) - value &= ~APIC_SPIV_FOCUS_DISABLED; - else -#endif - value |= APIC_SPIV_FOCUS_DISABLED; - value |= SPURIOUS_APIC_VECTOR; - apic_write(APIC_SPIV, value); - - /* - * Set up the virtual wire mode. - */ - apic_write(APIC_LVT0, APIC_DM_EXTINT); - value = APIC_DM_NMI; - if (!lapic_is_integrated()) /* 82489DX */ - value |= APIC_LVT_LEVEL_TRIGGER; - apic_write(APIC_LVT1, value); -} - -static void __cpuinit lapic_setup_esr(void) -{ - unsigned long oldvalue, value, maxlvt; - if (lapic_is_integrated() && !esr_disable) { - if (esr_disable) { - /* - * Something untraceable is creating bad interrupts on - * secondary quads ... for the moment, just leave the - * ESR disabled - we can't do anything useful with the - * errors anyway - mbligh - */ - printk(KERN_INFO "Leaving ESR disabled.\n"); - return; - } - /* !82489DX */ - maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - oldvalue = apic_read(APIC_ESR); - - /* enables sending errors */ - value = ERROR_APIC_VECTOR; - apic_write(APIC_LVTERR, value); - /* - * spec says clear errors after enabling vector. - */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, "ESR value before enabling " - "vector: 0x%08lx after: 0x%08lx\n", - oldvalue, value); - } else { - printk(KERN_INFO "No ESR for 82489DX.\n"); - } -} - - -/** - * setup_local_APIC - setup the local APIC - */ -void __cpuinit setup_local_APIC(void) -{ - unsigned int value; - int i, j; - - preempt_disable(); - value = apic_read(APIC_LVR); - - BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f); - - /* - * Double-check whether this APIC is really registered. - * This is meaningless in clustered apic mode, so we skip it. - */ - if (!apic_id_registered()) - BUG(); - - /* - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ - init_apic_ldr(); - - /* - * Set Task Priority to 'accept all'. We never change this - * later on. - */ - value = apic_read(APIC_TASKPRI); - value &= ~APIC_TPRI_MASK; - apic_write(APIC_TASKPRI, value); - - /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. - * - * Most probably by now CPU has serviced that pending interrupt and - * it might not have done the ack_APIC_irq() because it thought, - * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it - * does not clear the ISR bit and cpu thinks it has already serivced - * the interrupt. Hence a vector might get locked. It was noticed - * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. - */ - for (i = APIC_ISR_NR - 1; i >= 0; i--) { - value = apic_read(APIC_ISR + i*0x10); - for (j = 31; j >= 0; j--) { - if (value & (1<<j)) - ack_APIC_irq(); - } - } - - /* - * Now that we are all set up, enable the APIC - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - /* - * Enable APIC - */ - value |= APIC_SPIV_APIC_ENABLED; - - /* We always use processor focus */ - - /* - * Set spurious IRQ vector - */ - value |= SPURIOUS_APIC_VECTOR; - apic_write(APIC_SPIV, value); - - /* - * Set up LVT0, LVT1: - * - * set up through-local-APIC on the BP's LINT0. This is not - * strictly necessary in pure symmetric-IO mode, but sometimes - * we delegate interrupts to the 8259A. - */ - /* - * TODO: set up through-local-APIC from through-I/O-APIC? --macro - */ - value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!smp_processor_id() && !value) { - value = APIC_DM_EXTINT; - apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", - smp_processor_id()); - } else { - value = APIC_DM_EXTINT | APIC_LVT_MASKED; - apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", - smp_processor_id()); - } - apic_write(APIC_LVT0, value); - - /* - * only the BP should see the LINT1 NMI signal, obviously. - */ - if (!smp_processor_id()) - value = APIC_DM_NMI; - else - value = APIC_DM_NMI | APIC_LVT_MASKED; - apic_write(APIC_LVT1, value); - preempt_enable(); -} - -void __cpuinit end_local_APIC_setup(void) -{ - lapic_setup_esr(); - -#ifdef CONFIG_X86_32 - { - unsigned int value; - /* Disable the local apic timer */ - value = apic_read(APIC_LVTT); - value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, value); - } -#endif - - setup_apic_nmi_watchdog(NULL); - apic_pm_activate(); -} - -void check_x2apic(void) -{ - int msr, msr2; - - rdmsr(MSR_IA32_APICBASE, msr, msr2); - - if (msr & X2APIC_ENABLE) { - printk("x2apic enabled by BIOS, switching to x2apic ops\n"); - x2apic_preenabled = x2apic = 1; - apic_ops = &x2apic_ops; - } -} - -void enable_x2apic(void) -{ - int msr, msr2; - - rdmsr(MSR_IA32_APICBASE, msr, msr2); - if (!(msr & X2APIC_ENABLE)) { - printk("Enabling x2apic\n"); - wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); - } -} - -void enable_IR_x2apic(void) -{ -#ifdef CONFIG_INTR_REMAP - int ret; - unsigned long flags; - - if (!cpu_has_x2apic) - return; - - if (!x2apic_preenabled && disable_x2apic) { - printk(KERN_INFO - "Skipped enabling x2apic and Interrupt-remapping " - "because of nox2apic\n"); - return; - } - - if (x2apic_preenabled && disable_x2apic) - panic("Bios already enabled x2apic, can't enforce nox2apic"); - - if (!x2apic_preenabled && skip_ioapic_setup) { - printk(KERN_INFO - "Skipped enabling x2apic and Interrupt-remapping " - "because of skipping io-apic setup\n"); - return; - } - - ret = dmar_table_init(); - if (ret) { - printk(KERN_INFO - "dmar_table_init() failed with %d:\n", ret); - - if (x2apic_preenabled) - panic("x2apic enabled by bios. But IR enabling failed"); - else - printk(KERN_INFO - "Not enabling x2apic,Intr-remapping\n"); - return; - } - - local_irq_save(flags); - mask_8259A(); - save_mask_IO_APIC_setup(); - - ret = enable_intr_remapping(1); - - if (ret && x2apic_preenabled) { - local_irq_restore(flags); - panic("x2apic enabled by bios. But IR enabling failed"); - } - - if (ret) - goto end; - - if (!x2apic) { - x2apic = 1; - apic_ops = &x2apic_ops; - enable_x2apic(); - } -end: - if (ret) - /* - * IR enabling failed - */ - restore_IO_APIC_setup(); - else - reinit_intr_remapped_IO_APIC(x2apic_preenabled); - - unmask_8259A(); - local_irq_restore(flags); - - if (!ret) { - if (!x2apic_preenabled) - printk(KERN_INFO - "Enabled x2apic and interrupt-remapping\n"); - else - printk(KERN_INFO - "Enabled Interrupt-remapping\n"); - } else - printk(KERN_ERR - "Failed to enable Interrupt-remapping and x2apic\n"); -#else - if (!cpu_has_x2apic) - return; - - if (x2apic_preenabled) - panic("x2apic enabled prior OS handover," - " enable CONFIG_INTR_REMAP"); - - printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping " - " and x2apic\n"); -#endif - - return; -} - -/* - * Detect and enable local APICs on non-SMP boards. - * Original code written by Keir Fraser. - * On AMD64 we trust the BIOS - if it says no APIC it is likely - * not correctly set up (usually the APIC timer won't work etc.) - */ -static int __init detect_init_APIC(void) -{ - if (!cpu_has_apic) { - printk(KERN_INFO "No local APIC present\n"); - return -1; - } - - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - boot_cpu_physical_apicid = 0; - return 0; -} - -void __init early_init_lapic_mapping(void) -{ - unsigned long phys_addr; - - /* - * If no local APIC can be found then go out - * : it means there is no mpatable and MADT - */ - if (!smp_found_config) - return; - - phys_addr = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, phys_addr); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, phys_addr); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - boot_cpu_physical_apicid = read_apic_id(); -} - -/** - * init_apic_mappings - initialize APIC mappings - */ -void __init init_apic_mappings(void) -{ - if (x2apic) { - boot_cpu_physical_apicid = read_apic_id(); - return; - } - - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ - if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else - apic_phys = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, apic_phys); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - boot_cpu_physical_apicid = read_apic_id(); -} - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int apic_version[MAX_APICS]; - -int __init APIC_init_uniprocessor(void) -{ - if (disable_apic) { - printk(KERN_INFO "Apic disabled\n"); - return -1; - } - if (!cpu_has_apic) { - disable_apic = 1; - printk(KERN_INFO "Apic disabled by BIOS\n"); - return -1; - } - - enable_IR_x2apic(); - setup_apic_routing(); - - verify_local_APIC(); - - connect_bsp_APIC(); - - physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); - apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); - - setup_local_APIC(); - - /* - * Now enable IO-APICs, actually call clear_IO_APIC - * We need clear_IO_APIC before enabling vector on BP - */ - if (!skip_ioapic_setup && nr_ioapics) - enable_IO_APIC(); - - if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) - localise_nmi_watchdog(); - end_local_APIC_setup(); - - if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); - else - nr_ioapics = 0; - setup_boot_APIC_clock(); - check_nmi_watchdog(); - return 0; -} - -/* - * Local APIC interrupts - */ - -/* - * This interrupt should _never_ happen with our APIC/SMP architecture - */ -asmlinkage void smp_spurious_interrupt(void) -{ - unsigned int v; - exit_idle(); - irq_enter(); - /* - * Check if this really is a spurious interrupt and ACK it - * if it is a vectored one. Just in case... - * Spurious interrupts should not be ACKed. - */ - v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); - if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) - ack_APIC_irq(); - - add_pda(irq_spurious_count, 1); - irq_exit(); -} - -/* - * This interrupt should never happen with our APIC/SMP architecture - */ -asmlinkage void smp_error_interrupt(void) -{ - unsigned int v, v1; - - exit_idle(); - irq_enter(); - /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); - apic_write(APIC_ESR, 0); - v1 = apic_read(APIC_ESR); - ack_APIC_irq(); - atomic_inc(&irq_err_count); - - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); - irq_exit(); -} - -/** - * connect_bsp_APIC - attach the APIC to the interrupt system - */ -void __init connect_bsp_APIC(void) -{ -#ifdef CONFIG_X86_32 - if (pic_mode) { - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - /* - * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's - * local APIC to INT and NMI lines. - */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, " - "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); - } -#endif - enable_apic_mode(); -} - -/** - * disconnect_bsp_APIC - detach the APIC from the interrupt system - * @virt_wire_setup: indicates, whether virtual wire mode is selected - * - * Virtual wire mode is necessary to deliver legacy interrupts even when the - * APIC is disabled. - */ -void disconnect_bsp_APIC(int virt_wire_setup) -{ - unsigned int value; - -#ifdef CONFIG_X86_32 - if (pic_mode) { - /* - * Put the board back into PIC mode (has an effect only on - * certain older boards). Note that APIC interrupts, including - * IPIs, won't work beyond this point! The only exception are - * INIT IPIs. - */ - apic_printk(APIC_VERBOSE, "disabling APIC mode, " - "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); - return; - } -#endif - - /* Go back to Virtual Wire compatibility mode */ - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* - * For LVT0 make it edge triggered, active high, - * external and enabled - */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } - - /* - * For LVT1 make it edge triggered, active high, - * nmi and enabled - */ - value = apic_read(APIC_LVT1); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); -} - -void __cpuinit generic_processor_info(int apicid, int version) -{ - int cpu; - cpumask_t tmp_map; - - /* - * Validate version - */ - if (version == 0x0) { - printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); - version = 0x10; - } - apic_version[apicid] = version; - - if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); - return; - } - - num_processors++; - cpus_complement(tmp_map, cpu_present_map); - cpu = first_cpu(tmp_map); - - physid_set(apicid, phys_cpu_present_map); - if (apicid == boot_cpu_physical_apicid) { - /* - * x86_bios_cpu_apicid is required to have processors listed - * in same order as logical cpu numbers. Hence the first - * entry is BSP, and so on. - */ - cpu = 0; - } - if (apicid > max_physical_apicid) - max_physical_apicid = apicid; - -#ifdef CONFIG_X86_32 - /* - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y - * but we need to work other dependencies like SMP_SUSPEND etc - * before this can be done without some confusion. - * if (CPU_HOTPLUG_ENABLED || num_processors > 8) - * - Ashok Raj <ashok.raj@intel.com> - */ - if (max_physical_apicid >= 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: - def_to_bigsmp = 1; - } - } -#endif - -#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) - /* are we being called early in kernel startup? */ - if (early_per_cpu_ptr(x86_cpu_to_apicid)) { - u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); - u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); - - cpu_to_apicid[cpu] = apicid; - bios_cpu_apicid[cpu] = apicid; - } else { - per_cpu(x86_cpu_to_apicid, cpu) = apicid; - per_cpu(x86_bios_cpu_apicid, cpu) = apicid; - } -#endif - - cpu_set(cpu, cpu_possible_map); - cpu_set(cpu, cpu_present_map); -} - -int hard_smp_processor_id(void) -{ - return read_apic_id(); -} - -/* - * Power management - */ -#ifdef CONFIG_PM - -static struct { - /* - * 'active' is true if the local APIC was enabled by us and - * not the BIOS; this signifies that we are also responsible - * for disabling it before entering apm/acpi suspend - */ - int active; - /* r/w apic fields */ - unsigned int apic_id; - unsigned int apic_taskpri; - unsigned int apic_ldr; - unsigned int apic_dfr; - unsigned int apic_spiv; - unsigned int apic_lvtt; - unsigned int apic_lvtpc; - unsigned int apic_lvt0; - unsigned int apic_lvt1; - unsigned int apic_lvterr; - unsigned int apic_tmict; - unsigned int apic_tdcr; - unsigned int apic_thmr; -} apic_pm_state; - -static int lapic_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = lapic_get_maxlvt(); - - apic_pm_state.apic_id = apic_read(APIC_ID); - apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); - apic_pm_state.apic_ldr = apic_read(APIC_LDR); - apic_pm_state.apic_dfr = apic_read(APIC_DFR); - apic_pm_state.apic_spiv = apic_read(APIC_SPIV); - apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - if (maxlvt >= 4) - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); - apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); - apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); - apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); - apic_pm_state.apic_tmict = apic_read(APIC_TMICT); - apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) - if (maxlvt >= 5) - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); -#endif - - local_irq_save(flags); - disable_local_APIC(); - local_irq_restore(flags); - return 0; -} - -static int lapic_resume(struct sys_device *dev) -{ - unsigned int l, h; - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = lapic_get_maxlvt(); - - local_irq_save(flags); - -#ifdef CONFIG_X86_64 - if (x2apic) - enable_x2apic(); - else -#endif - { - /* - * Make sure the APICBASE points to the right address - * - * FIXME! This will be wrong if we ever support suspend on - * SMP! We'll need to do this as part of the CPU restore! - */ - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); - } - - apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); - apic_write(APIC_ID, apic_pm_state.apic_id); - apic_write(APIC_DFR, apic_pm_state.apic_dfr); - apic_write(APIC_LDR, apic_pm_state.apic_ldr); - apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); - apic_write(APIC_SPIV, apic_pm_state.apic_spiv); - apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); - apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) - if (maxlvt >= 5) - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); -#endif - if (maxlvt >= 4) - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); - apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); - apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); - apic_write(APIC_TMICT, apic_pm_state.apic_tmict); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - - local_irq_restore(flags); - - return 0; -} - -/* - * This device has no shutdown method - fully functioning local APICs - * are needed on every CPU up until machine_halt/restart/poweroff. - */ - -static struct sysdev_class lapic_sysclass = { - .name = "lapic", - .resume = lapic_resume, - .suspend = lapic_suspend, -}; - -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - -static void __cpuinit apic_pm_activate(void) -{ - apic_pm_state.active = 1; -} - -static int __init init_lapic_sysfs(void) -{ - int error; - - if (!cpu_has_apic) - return 0; - /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ - - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; -} -device_initcall(init_lapic_sysfs); - -#else /* CONFIG_PM */ - -static void apic_pm_activate(void) { } - -#endif /* CONFIG_PM */ - -/* - * apic_is_clustered_box() -- Check if we can expect good TSC - * - * Thus far, the major user of this is IBM's Summit2 series: - * - * Clustered boxes may have unsynced TSC problems if they are - * multi-chassis. Use available data to take a good guess. - * If in doubt, go HPET. - */ -__cpuinit int apic_is_clustered_box(void) -{ - int i, clusters, zeros; - unsigned id; - u16 *bios_cpu_apicid; - DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - - /* - * there is not this kind of box with AMD CPU yet. - * Some AMD box with quadcore cpu and 8 sockets apicid - * will be [4, 0x23] or [8, 0x27] could be thought to - * vsmp box still need checking... - */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) - return 0; - - bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); - bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - - for (i = 0; i < NR_CPUS; i++) { - /* are we being called early in kernel startup? */ - if (bios_cpu_apicid) { - id = bios_cpu_apicid[i]; - } - else if (i < nr_cpu_ids) { - if (cpu_present(i)) - id = per_cpu(x86_bios_cpu_apicid, i); - else - continue; - } - else - break; - - if (id != BAD_APICID) - __set_bit(APIC_CLUSTERID(id), clustermap); - } - - /* Problem: Partially populated chassis may not have CPUs in some of - * the APIC clusters they have been allocated. Only present CPUs have - * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. - * Since clusters are allocated sequentially, count zeros only if - * they are bounded by ones. - */ - clusters = 0; - zeros = 0; - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (test_bit(i, clustermap)) { - clusters += 1 + zeros; - zeros = 0; - } else - ++zeros; - } - - /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are - * not guaranteed to be synced between boards - */ - if (is_vsmp_box() && clusters > 1) - return 1; - - /* - * If clusters > 2, then should be multi-chassis. - * May have to revisit this when multi-core + hyperthreaded CPUs come - * out, but AFAIK this will work even for them. - */ - return (clusters > 2); -} - -static __init int setup_nox2apic(char *str) -{ - disable_x2apic = 1; - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_X2APIC); - return 0; -} -early_param("nox2apic", setup_nox2apic); - - -/* - * APIC command line parameters - */ -static int __init setup_disableapic(char *arg) -{ - disable_apic = 1; - setup_clear_cpu_cap(X86_FEATURE_APIC); - return 0; -} -early_param("disableapic", setup_disableapic); - -/* same as disableapic, for compatibility */ -static int __init setup_nolapic(char *arg) -{ - return setup_disableapic(arg); -} -early_param("nolapic", setup_nolapic); - -static int __init parse_lapic_timer_c2_ok(char *arg) -{ - local_apic_timer_c2_ok = 1; - return 0; -} -early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); - -static int __init parse_disable_apic_timer(char *arg) -{ - disable_apic_timer = 1; - return 0; -} -early_param("noapictimer", parse_disable_apic_timer); - -static int __init parse_nolapic_timer(char *arg) -{ - disable_apic_timer = 1; - return 0; -} -early_param("nolapic_timer", parse_nolapic_timer); - -static __init int setup_apicpmtimer(char *s) -{ - apic_calibrate_pmtmr = 1; - notsc_setup(NULL); - return 0; -} -__setup("apicpmtimer", setup_apicpmtimer); - -static int __init apic_set_verbosity(char *arg) -{ - if (!arg) { -#ifdef CONFIG_X86_64 - skip_ioapic_setup = 0; - ioapic_force = 1; - return 0; -#endif - return -EINVAL; - } - - if (strcmp("debug", arg) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", arg) == 0) - apic_verbosity = APIC_VERBOSE; - else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug\n", arg); - return -EINVAL; - } - - return 0; -} -early_param("apic", apic_set_verbosity); - -static int __init lapic_insert_resource(void) -{ - if (!apic_phys) - return -1; - - /* Put local APIC into the resource map. */ - lapic_resource.start = apic_phys; - lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; - insert_resource(&iomem_resource, &lapic_resource); - - return 0; -} - -/* - * need call insert after e820_reserve_resources() - * that is using request_resource - */ -late_initcall(lapic_insert_resource); diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index fdd585f..f0dfe6f 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -1,8 +1,6 @@ /* * BIOS run time interface routines. * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -16,33 +14,128 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson */ +#include <linux/efi.h> +#include <asm/efi.h> +#include <linux/io.h> #include <asm/uv/bios.h> +#include <asm/uv/uv_hub.h> + +struct uv_systab uv_systab; -const char * -x86_bios_strerror(long status) +s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) { - const char *str; - switch (status) { - case 0: str = "Call completed without error"; break; - case -1: str = "Not implemented"; break; - case -2: str = "Invalid argument"; break; - case -3: str = "Call completed with error"; break; - default: str = "Unknown BIOS status code"; break; - } - return str; + struct uv_systab *tab = &uv_systab; + + if (!tab->function) + /* + * BIOS does not support UV systab + */ + return BIOS_STATUS_UNIMPLEMENTED; + + return efi_call6((void *)__va(tab->function), + (u64)which, a1, a2, a3, a4, a5); } -long -x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second, - unsigned long *drift_info) +s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, + u64 a4, u64 a5) { - struct uv_bios_retval isrv; + unsigned long bios_flags; + s64 ret; - BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0); - *ticks_per_second = isrv.v0; - *drift_info = isrv.v1; - return isrv.status; + local_irq_save(bios_flags); + ret = uv_bios_call(which, a1, a2, a3, a4, a5); + local_irq_restore(bios_flags); + + return ret; } -EXPORT_SYMBOL_GPL(x86_bios_freq_base); + +s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, + u64 a4, u64 a5) +{ + s64 ret; + + preempt_disable(); + ret = uv_bios_call(which, a1, a2, a3, a4, a5); + preempt_enable(); + + return ret; +} + + +long sn_partition_id; +EXPORT_SYMBOL_GPL(sn_partition_id); +long uv_coherency_id; +EXPORT_SYMBOL_GPL(uv_coherency_id); +long uv_region_size; +EXPORT_SYMBOL_GPL(uv_region_size); +int uv_type; + + +s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, + long *region) +{ + s64 ret; + u64 v0, v1; + union partition_info_u part; + + ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc, + (u64)(&v0), (u64)(&v1), 0, 0); + if (ret != BIOS_STATUS_SUCCESS) + return ret; + + part.val = v0; + if (uvtype) + *uvtype = part.hub_version; + if (partid) + *partid = part.partition_id; + if (coher) + *coher = part.coherence_id; + if (region) + *region = part.region_size; + return ret; +} + + +s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) +{ + return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, + (u64)ticks_per_second, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_freq_base); + + +#ifdef CONFIG_EFI +void uv_bios_init(void) +{ + struct uv_systab *tab; + + if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || + (efi.uv_systab == (unsigned long)NULL)) { + printk(KERN_CRIT "No EFI UV System Table.\n"); + uv_systab.function = (unsigned long)NULL; + return; + } + + tab = (struct uv_systab *)ioremap(efi.uv_systab, + sizeof(struct uv_systab)); + if (strncmp(tab->signature, "UVST", 4) != 0) + printk(KERN_ERR "bad signature in UV system table!"); + + /* + * Copy table to permanent spot for later use. + */ + memcpy(&uv_systab, tab, sizeof(struct uv_systab)); + iounmap(tab); + + printk(KERN_INFO "EFI UV System Table Revision %d\n", tab->revision); +} +#else /* !CONFIG_EFI */ + +void uv_bios_init(void) { } +#endif + diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 32e7352..8f1e31d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -249,7 +249,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) } numa_set_node(cpu, node); - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); + printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); #endif } diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 06fcce5..b046185 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -1,5 +1,5 @@ /* - * (C) 2001-2004 Dave Jones. <davej@codemonkey.org.uk> + * (C) 2001-2004 Dave Jones. <davej@redhat.com> * (C) 2002 Padraig Brady. <padraig@antefacto.com> * * Licensed under the terms of the GNU GPL License version 2. @@ -1019,7 +1019,7 @@ MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); module_param(revid_errata, int, 0644); MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); -MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); +MODULE_AUTHOR ("Dave Jones <davej@redhat.com>"); MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); MODULE_LICENSE ("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index b5ced80..c1ac579 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -246,7 +246,7 @@ static void __exit powernow_k6_exit(void) } -MODULE_AUTHOR("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); +MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>"); MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 0a61159..7c7d56b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -1,6 +1,6 @@ /* * AMD K7 Powernow driver. - * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs. + * (C) 2003 Dave Jones on behalf of SuSE Labs. * (C) 2003-2004 Dave Jones <davej@redhat.com> * * Licensed under the terms of the GNU GPL License version 2. @@ -692,7 +692,7 @@ static void __exit powernow_exit (void) module_param(acpi_force, int, 0444); MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); -MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); +MODULE_AUTHOR ("Dave Jones <davej@redhat.com>"); MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors."); MODULE_LICENSE ("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 84bb395..008d23b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -7,7 +7,7 @@ * Support : mark.langsdorf@amd.com * * Based on the powernow-k7.c module written by Dave Jones. - * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs + * (C) 2003 Dave Jones on behalf of SuSE Labs * (C) 2004 Dominik Brodowski <linux@brodo.de> * (C) 2004 Pavel Machek <pavel@suse.cz> * Licensed under the terms of the GNU GPL License version 2. diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 191f726..04d0376 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -431,7 +431,7 @@ static void __exit speedstep_exit(void) } -MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); +MODULE_AUTHOR ("Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>"); MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges."); MODULE_LICENSE ("GPL"); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 99468db..cce0b61 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -174,7 +174,7 @@ static void __cpuinit srat_detect_node(void) node = first_node(node_online_map); numa_set_node(cpu, node); - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); + printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); #endif } diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index f390c9f..dd3af6e 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -1,6 +1,6 @@ /* - * Athlon/Hammer specific Machine Check Exception Reporting - * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk> + * Athlon specific Machine Check Exception Reporting + * (C) Copyright 2002 Dave Jones <davej@redhat.com> */ #include <linux/init.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index 774d87c..0ebf3fc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -1,6 +1,6 @@ /* * mce.c - x86 Machine Check Exception Reporting - * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk> + * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@redhat.com> */ #include <linux/init.h> diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index cc1fccd..a74af12 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -1,7 +1,7 @@ /* * Non Fatal Machine Check Exception Reporting * - * (C) Copyright 2002 Dave Jones. <davej@codemonkey.org.uk> + * (C) Copyright 2002 Dave Jones. <davej@redhat.com> * * This file contains routines to check for non-fatal MCEs every 15s * diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 6bff382..9abd48b 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -17,6 +17,8 @@ #include <linux/bitops.h> #include <linux/smp.h> #include <linux/nmi.h> +#include <linux/kprobes.h> + #include <asm/apic.h> #include <asm/intel_arch_perfmon.h> @@ -336,7 +338,8 @@ static void single_msr_unreserve(void) release_perfctr_nmi(wd_ops->perfctr); } -static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +static void __kprobes +single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { /* start the cycle over again */ write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); @@ -401,7 +404,7 @@ static int setup_p6_watchdog(unsigned nmi_hz) return 1; } -static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { /* * P6 based Pentium M need to re-unmask @@ -605,7 +608,7 @@ static void p4_unreserve(void) release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); } -static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { unsigned dummy; /* @@ -784,7 +787,7 @@ unsigned lapic_adjust_nmi_hz(unsigned hz) return hz; } -int lapic_wd_event(unsigned nmi_hz) +int __kprobes lapic_wd_event(unsigned nmi_hz) { struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); u64 ctr; diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 945a31c..1119d24 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -367,6 +367,10 @@ void __init efi_init(void) efi.smbios = config_tables[i].table; printk(" SMBIOS=0x%lx ", config_tables[i].table); } else if (!efi_guidcmp(config_tables[i].guid, + UV_SYSTEM_TABLE_GUID)) { + efi.uv_systab = config_tables[i].table; + printk(" UVsystab=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID)) { efi.hcdp = config_tables[i].table; printk(" HCDP=0x%lx ", config_tables[i].table); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index b21fbfa..c356423 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -629,7 +629,7 @@ ENTRY(interrupt) ENTRY(irq_entries_start) RING0_INT_FRAME vector=0 -.rept NR_IRQS +.rept NR_VECTORS ALIGN .if vector CFI_ADJUST_CFA_OFFSET -4 @@ -1153,20 +1153,6 @@ ENDPROC(xen_failsafe_callback) #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - subl $MCOUNT_INSN_SIZE, %eax - -.globl mcount_call -mcount_call: - call ftrace_stub - - popl %edx - popl %ecx - popl %eax - ret END(mcount) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1db6ce4..09e7145 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -64,32 +64,6 @@ #ifdef CONFIG_FTRACE #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) - - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) - - movq 0x38(%rsp), %rdi - subq $MCOUNT_INSN_SIZE, %rdi - -.globl mcount_call -mcount_call: - call ftrace_stub - - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp - retq END(mcount) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index ab115cd..d073d98 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -11,17 +11,18 @@ #include <linux/spinlock.h> #include <linux/hardirq.h> +#include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/list.h> -#include <asm/alternative.h> #include <asm/ftrace.h> +#include <asm/nops.h> /* Long is fine, even if it is only 4 bytes ;-) */ -static long *ftrace_nop; +static unsigned long *ftrace_nop; union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; @@ -60,11 +61,7 @@ notrace int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { - unsigned replaced; - unsigned old = *(unsigned *)old_code; /* 4 bytes */ - unsigned new = *(unsigned *)new_code; /* 4 bytes */ - unsigned char newch = new_code[4]; - int faulted = 0; + unsigned char replaced[MCOUNT_INSN_SIZE]; /* * Note: Due to modules and __init, code can @@ -72,29 +69,20 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, * as well as code changing. * * No real locking needed, this code is run through - * kstop_machine. + * kstop_machine, or before SMP starts. */ - asm volatile ( - "1: lock\n" - " cmpxchg %3, (%2)\n" - " jnz 2f\n" - " movb %b4, 4(%2)\n" - "2:\n" - ".section .fixup, \"ax\"\n" - "3: movl $1, %0\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : "=r"(faulted), "=a"(replaced) - : "r"(ip), "r"(new), "c"(newch), - "0"(faulted), "a"(old) - : "memory"); - sync_core(); + if (__copy_from_user_inatomic(replaced, (char __user *)ip, MCOUNT_INSN_SIZE)) + return 1; + + if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) + return 2; - if (replaced != old && replaced != new) - faulted = 2; + WARN_ON_ONCE(__copy_to_user_inatomic((char __user *)ip, new_code, + MCOUNT_INSN_SIZE)); - return faulted; + sync_core(); + + return 0; } notrace int ftrace_update_ftrace_func(ftrace_func_t func) @@ -112,30 +100,76 @@ notrace int ftrace_update_ftrace_func(ftrace_func_t func) notrace int ftrace_mcount_set(unsigned long *data) { - unsigned long ip = (long)(&mcount_call); - unsigned long *addr = data; - unsigned char old[MCOUNT_INSN_SIZE], *new; - - /* - * Replace the mcount stub with a pointer to the - * ip recorder function. - */ - memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); - new = ftrace_call_replace(ip, *addr); - *addr = ftrace_modify_code(ip, old, new); - + /* mcount is initialized as a nop */ + *data = 0; return 0; } int __init ftrace_dyn_arch_init(void *data) { - const unsigned char *const *noptable = find_nop_table(); - - /* This is running in kstop_machine */ - - ftrace_mcount_set(data); + extern const unsigned char ftrace_test_p6nop[]; + extern const unsigned char ftrace_test_nop5[]; + extern const unsigned char ftrace_test_jmp[]; + int faulted = 0; - ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE]; + /* + * There is no good nop for all x86 archs. + * We will default to using the P6_NOP5, but first we + * will test to make sure that the nop will actually + * work on this CPU. If it faults, we will then + * go to a lesser efficient 5 byte nop. If that fails + * we then just use a jmp as our nop. This isn't the most + * efficient nop, but we can not use a multi part nop + * since we would then risk being preempted in the middle + * of that nop, and if we enabled tracing then, it might + * cause a system crash. + * + * TODO: check the cpuid to determine the best nop. + */ + asm volatile ( + "jmp ftrace_test_jmp\n" + /* This code needs to stay around */ + ".section .text, \"ax\"\n" + "ftrace_test_jmp:" + "jmp ftrace_test_p6nop\n" + "nop\n" + "nop\n" + "nop\n" /* 2 byte jmp + 3 bytes */ + "ftrace_test_p6nop:" + P6_NOP5 + "jmp 1f\n" + "ftrace_test_nop5:" + ".byte 0x66,0x66,0x66,0x66,0x90\n" + "jmp 1f\n" + ".previous\n" + "1:" + ".section .fixup, \"ax\"\n" + "2: movl $1, %0\n" + " jmp ftrace_test_nop5\n" + "3: movl $2, %0\n" + " jmp 1b\n" + ".previous\n" + _ASM_EXTABLE(ftrace_test_p6nop, 2b) + _ASM_EXTABLE(ftrace_test_nop5, 3b) + : "=r"(faulted) : "0" (faulted)); + + switch (faulted) { + case 0: + pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); + ftrace_nop = (unsigned long *)ftrace_test_p6nop; + break; + case 1: + pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); + ftrace_nop = (unsigned long *)ftrace_test_nop5; + break; + case 2: + pr_info("ftrace: converting mcount calls to jmp . + 5\n"); + ftrace_nop = (unsigned long *)ftrace_test_jmp; + break; + } + + /* The return code is retured via data */ + *(unsigned long *)data = 0; return 0; } diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 9eca5ba..2ec2de8 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -179,8 +179,10 @@ static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) * is an example). */ if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && - (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { + printk(KERN_DEBUG "system APIC only can use physical flat"); return 1; + } #endif return 0; diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 33581d9..bfd5328 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -341,12 +341,12 @@ static __init void map_mmioh_high(int max_pnode) static __init void uv_rtc_init(void) { - long status, ticks_per_sec, drift; + long status; + u64 ticks_per_sec; - status = - x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec, - &drift); - if (status != 0 || ticks_per_sec < 100000) { + status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, + &ticks_per_sec); + if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) { printk(KERN_WARNING "unable to determine platform RTC clock frequency, " "guessing.\n"); @@ -356,7 +356,22 @@ static __init void uv_rtc_init(void) sn_rtc_cycles_per_second = ticks_per_sec; } -static bool uv_system_inited; +/* + * Called on each cpu to initialize the per_cpu UV data area. + * ZZZ hotplug not supported yet + */ +void __cpuinit uv_cpu_init(void) +{ + /* CPU 0 initilization will be done via uv_system_init. */ + if (!uv_blade_info) + return; + + uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; + + if (get_uv_system_type() == UV_NON_UNIQUE_APIC) + set_x2apic_extra_bits(uv_hub_info->pnode); +} + void __init uv_system_init(void) { @@ -412,6 +427,9 @@ void __init uv_system_init(void) gnode_upper = (((unsigned long)node_id.s.node_id) & ~((1 << n_val) - 1)) << m_val; + uv_bios_init(); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, + &uv_coherency_id, &uv_region_size); uv_rtc_init(); for_each_present_cpu(cpu) { @@ -433,7 +451,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; - uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ + uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); @@ -448,21 +466,6 @@ void __init uv_system_init(void) map_mmr_high(max_pnode); map_config_high(max_pnode); map_mmioh_high(max_pnode); - uv_system_inited = true; -} -/* - * Called on each cpu to initialize the per_cpu UV data area. - * ZZZ hotplug not supported yet - */ -void __cpuinit uv_cpu_init(void) -{ - BUG_ON(!uv_system_inited); - - uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; - - if (get_uv_system_type() == UV_NON_UNIQUE_APIC) - set_x2apic_extra_bits(uv_hub_info->pnode); + uv_cpu_init(); } - - diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index acf62fc..77017e8 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1,29 +1,49 @@ #include <linux/clocksource.h> #include <linux/clockchips.h> +#include <linux/interrupt.h> +#include <linux/sysdev.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/hpet.h> #include <linux/init.h> -#include <linux/sysdev.h> +#include <linux/cpu.h> #include <linux/pm.h> +#include <linux/io.h> #include <asm/fixmap.h> -#include <asm/hpet.h> #include <asm/i8253.h> -#include <asm/io.h> +#include <asm/hpet.h> -#define HPET_MASK CLOCKSOURCE_MASK(32) -#define HPET_SHIFT 22 +#define HPET_MASK CLOCKSOURCE_MASK(32) +#define HPET_SHIFT 22 /* FSEC = 10^-15 NSEC = 10^-9 */ -#define FSEC_PER_NSEC 1000000L +#define FSEC_PER_NSEC 1000000L + +#define HPET_DEV_USED_BIT 2 +#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT) +#define HPET_DEV_VALID 0x8 +#define HPET_DEV_FSB_CAP 0x1000 +#define HPET_DEV_PERI_CAP 0x2000 + +#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) /* * HPET address is set in acpi/boot.c, when an ACPI entry exists */ -unsigned long hpet_address; -static void __iomem *hpet_virt_address; +unsigned long hpet_address; +unsigned long hpet_num_timers; +static void __iomem *hpet_virt_address; + +struct hpet_dev { + struct clock_event_device evt; + unsigned int num; + int cpu; + unsigned int irq; + unsigned int flags; + char name[10]; +}; unsigned long hpet_readl(unsigned long a) { @@ -59,7 +79,7 @@ static inline void hpet_clear_mapping(void) static int boot_hpet_disable; int hpet_force_user; -static int __init hpet_setup(char* str) +static int __init hpet_setup(char *str) { if (str) { if (!strncmp("disable", str, 7)) @@ -80,7 +100,7 @@ __setup("nohpet", disable_hpet); static inline int is_hpet_capable(void) { - return (!boot_hpet_disable && hpet_address); + return !boot_hpet_disable && hpet_address; } /* @@ -102,6 +122,9 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled); * timer 0 and timer 1 in case of RTC emulation. */ #ifdef CONFIG_HPET + +static void hpet_reserve_msi_timers(struct hpet_data *hd); + static void hpet_reserve_platform_timers(unsigned long id) { struct hpet __iomem *hpet = hpet_virt_address; @@ -111,10 +134,10 @@ static void hpet_reserve_platform_timers(unsigned long id) nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; - memset(&hd, 0, sizeof (hd)); - hd.hd_phys_address = hpet_address; - hd.hd_address = hpet; - hd.hd_nirqs = nrtimers; + memset(&hd, 0, sizeof(hd)); + hd.hd_phys_address = hpet_address; + hd.hd_address = hpet; + hd.hd_nirqs = nrtimers; hpet_reserve_timer(&hd, 0); #ifdef CONFIG_HPET_EMULATE_RTC @@ -130,10 +153,12 @@ static void hpet_reserve_platform_timers(unsigned long id) hd.hd_irq[1] = HPET_LEGACY_RTC; for (i = 2; i < nrtimers; timer++, i++) { - hd.hd_irq[i] = (readl(&timer->hpet_config) & Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; + hd.hd_irq[i] = (readl(&timer->hpet_config) & + Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; } + hpet_reserve_msi_timers(&hd); + hpet_alloc(&hd); } @@ -227,60 +252,70 @@ static void hpet_legacy_clockevent_register(void) printk(KERN_DEBUG "hpet clockevent registered\n"); } -static void hpet_legacy_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int hpet_setup_msi_irq(unsigned int irq); + +static void hpet_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt, int timer) { unsigned long cfg, cmp, now; uint64_t delta; - switch(mode) { + switch (mode) { case CLOCK_EVT_MODE_PERIODIC: - delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult; - delta >>= hpet_clockevent.shift; + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; + delta >>= evt->shift; now = hpet_readl(HPET_COUNTER); cmp = now + (unsigned long) delta; - cfg = hpet_readl(HPET_T0_CFG); + cfg = hpet_readl(HPET_Tn_CFG(timer)); cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T0_CFG); + hpet_writel(cfg, HPET_Tn_CFG(timer)); /* * The first write after writing TN_SETVAL to the * config register sets the counter value, the second * write sets the period. */ - hpet_writel(cmp, HPET_T0_CMP); + hpet_writel(cmp, HPET_Tn_CMP(timer)); udelay(1); - hpet_writel((unsigned long) delta, HPET_T0_CMP); + hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); break; case CLOCK_EVT_MODE_ONESHOT: - cfg = hpet_readl(HPET_T0_CFG); + cfg = hpet_readl(HPET_Tn_CFG(timer)); cfg &= ~HPET_TN_PERIODIC; cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T0_CFG); + hpet_writel(cfg, HPET_Tn_CFG(timer)); break; case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: - cfg = hpet_readl(HPET_T0_CFG); + cfg = hpet_readl(HPET_Tn_CFG(timer)); cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T0_CFG); + hpet_writel(cfg, HPET_Tn_CFG(timer)); break; case CLOCK_EVT_MODE_RESUME: - hpet_enable_legacy_int(); + if (timer == 0) { + hpet_enable_legacy_int(); + } else { + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + hpet_setup_msi_irq(hdev->irq); + disable_irq(hdev->irq); + irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu)); + enable_irq(hdev->irq); + } break; } } -static int hpet_legacy_next_event(unsigned long delta, - struct clock_event_device *evt) +static int hpet_next_event(unsigned long delta, + struct clock_event_device *evt, int timer) { u32 cnt; cnt = hpet_readl(HPET_COUNTER); cnt += (u32) delta; - hpet_writel(cnt, HPET_T0_CMP); + hpet_writel(cnt, HPET_Tn_CMP(timer)); /* * We need to read back the CMP register to make sure that @@ -292,6 +327,347 @@ static int hpet_legacy_next_event(unsigned long delta, return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } +static void hpet_legacy_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + hpet_set_mode(mode, evt, 0); +} + +static int hpet_legacy_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + return hpet_next_event(delta, evt, 0); +} + +/* + * HPET MSI Support + */ +#ifdef CONFIG_PCI_MSI + +static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); +static struct hpet_dev *hpet_devs; + +void hpet_msi_unmask(unsigned int irq) +{ + struct hpet_dev *hdev = get_irq_data(irq); + unsigned long cfg; + + /* unmask it */ + cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); + cfg |= HPET_TN_FSB; + hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); +} + +void hpet_msi_mask(unsigned int irq) +{ + unsigned long cfg; + struct hpet_dev *hdev = get_irq_data(irq); + + /* mask it */ + cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); + cfg &= ~HPET_TN_FSB; + hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); +} + +void hpet_msi_write(unsigned int irq, struct msi_msg *msg) +{ + struct hpet_dev *hdev = get_irq_data(irq); + + hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num)); + hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4); +} + +void hpet_msi_read(unsigned int irq, struct msi_msg *msg) +{ + struct hpet_dev *hdev = get_irq_data(irq); + + msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num)); + msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4); + msg->address_hi = 0; +} + +static void hpet_msi_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + hpet_set_mode(mode, evt, hdev->num); +} + +static int hpet_msi_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + return hpet_next_event(delta, evt, hdev->num); +} + +static int hpet_setup_msi_irq(unsigned int irq) +{ + if (arch_setup_hpet_msi(irq)) { + destroy_irq(irq); + return -EINVAL; + } + return 0; +} + +static int hpet_assign_irq(struct hpet_dev *dev) +{ + unsigned int irq; + + irq = create_irq(); + if (!irq) + return -EINVAL; + + set_irq_data(irq, dev); + + if (hpet_setup_msi_irq(irq)) + return -EINVAL; + + dev->irq = irq; + return 0; +} + +static irqreturn_t hpet_interrupt_handler(int irq, void *data) +{ + struct hpet_dev *dev = (struct hpet_dev *)data; + struct clock_event_device *hevt = &dev->evt; + + if (!hevt->event_handler) { + printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n", + dev->num); + return IRQ_HANDLED; + } + + hevt->event_handler(hevt); + return IRQ_HANDLED; +} + +static int hpet_setup_irq(struct hpet_dev *dev) +{ + + if (request_irq(dev->irq, hpet_interrupt_handler, + IRQF_SHARED|IRQF_NOBALANCING, dev->name, dev)) + return -1; + + disable_irq(dev->irq); + irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu)); + enable_irq(dev->irq); + + printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", + dev->name, dev->irq); + + return 0; +} + +/* This should be called in specific @cpu */ +static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) +{ + struct clock_event_device *evt = &hdev->evt; + uint64_t hpet_freq; + + WARN_ON(cpu != smp_processor_id()); + if (!(hdev->flags & HPET_DEV_VALID)) + return; + + if (hpet_setup_msi_irq(hdev->irq)) + return; + + hdev->cpu = cpu; + per_cpu(cpu_hpet_dev, cpu) = hdev; + evt->name = hdev->name; + hpet_setup_irq(hdev); + evt->irq = hdev->irq; + + evt->rating = 110; + evt->features = CLOCK_EVT_FEAT_ONESHOT; + if (hdev->flags & HPET_DEV_PERI_CAP) + evt->features |= CLOCK_EVT_FEAT_PERIODIC; + + evt->set_mode = hpet_msi_set_mode; + evt->set_next_event = hpet_msi_next_event; + evt->shift = 32; + + /* + * The period is a femto seconds value. We need to calculate the + * scaled math multiplication factor for nanosecond to hpet tick + * conversion. + */ + hpet_freq = 1000000000000000ULL; + do_div(hpet_freq, hpet_period); + evt->mult = div_sc((unsigned long) hpet_freq, + NSEC_PER_SEC, evt->shift); + /* Calculate the max delta */ + evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt); + /* 5 usec minimum reprogramming delta. */ + evt->min_delta_ns = 5000; + + evt->cpumask = cpumask_of_cpu(hdev->cpu); + clockevents_register_device(evt); +} + +#ifdef CONFIG_HPET +/* Reserve at least one timer for userspace (/dev/hpet) */ +#define RESERVE_TIMERS 1 +#else +#define RESERVE_TIMERS 0 +#endif + +static void hpet_msi_capability_lookup(unsigned int start_timer) +{ + unsigned int id; + unsigned int num_timers; + unsigned int num_timers_used = 0; + int i; + + id = hpet_readl(HPET_ID); + + num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); + num_timers++; /* Value read out starts from 0 */ + + hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); + if (!hpet_devs) + return; + + hpet_num_timers = num_timers; + + for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { + struct hpet_dev *hdev = &hpet_devs[num_timers_used]; + unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); + + /* Only consider HPET timer with MSI support */ + if (!(cfg & HPET_TN_FSB_CAP)) + continue; + + hdev->flags = 0; + if (cfg & HPET_TN_PERIODIC_CAP) + hdev->flags |= HPET_DEV_PERI_CAP; + hdev->num = i; + + sprintf(hdev->name, "hpet%d", i); + if (hpet_assign_irq(hdev)) + continue; + + hdev->flags |= HPET_DEV_FSB_CAP; + hdev->flags |= HPET_DEV_VALID; + num_timers_used++; + if (num_timers_used == num_possible_cpus()) + break; + } + + printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n", + num_timers, num_timers_used); +} + +#ifdef CONFIG_HPET +static void hpet_reserve_msi_timers(struct hpet_data *hd) +{ + int i; + + if (!hpet_devs) + return; + + for (i = 0; i < hpet_num_timers; i++) { + struct hpet_dev *hdev = &hpet_devs[i]; + + if (!(hdev->flags & HPET_DEV_VALID)) + continue; + + hd->hd_irq[hdev->num] = hdev->irq; + hpet_reserve_timer(hd, hdev->num); + } +} +#endif + +static struct hpet_dev *hpet_get_unused_timer(void) +{ + int i; + + if (!hpet_devs) + return NULL; + + for (i = 0; i < hpet_num_timers; i++) { + struct hpet_dev *hdev = &hpet_devs[i]; + + if (!(hdev->flags & HPET_DEV_VALID)) + continue; + if (test_and_set_bit(HPET_DEV_USED_BIT, + (unsigned long *)&hdev->flags)) + continue; + return hdev; + } + return NULL; +} + +struct hpet_work_struct { + struct delayed_work work; + struct completion complete; +}; + +static void hpet_work(struct work_struct *w) +{ + struct hpet_dev *hdev; + int cpu = smp_processor_id(); + struct hpet_work_struct *hpet_work; + + hpet_work = container_of(w, struct hpet_work_struct, work.work); + + hdev = hpet_get_unused_timer(); + if (hdev) + init_one_hpet_msi_clockevent(hdev, cpu); + + complete(&hpet_work->complete); +} + +static int hpet_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + unsigned long cpu = (unsigned long)hcpu; + struct hpet_work_struct work; + struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu); + + switch (action & 0xf) { + case CPU_ONLINE: + INIT_DELAYED_WORK(&work.work, hpet_work); + init_completion(&work.complete); + /* FIXME: add schedule_work_on() */ + schedule_delayed_work_on(cpu, &work.work, 0); + wait_for_completion(&work.complete); + break; + case CPU_DEAD: + if (hdev) { + free_irq(hdev->irq, hdev); + hdev->flags &= ~HPET_DEV_USED; + per_cpu(cpu_hpet_dev, cpu) = NULL; + } + break; + } + return NOTIFY_OK; +} +#else + +static int hpet_setup_msi_irq(unsigned int irq) +{ + return 0; +} +static void hpet_msi_capability_lookup(unsigned int start_timer) +{ + return; +} + +#ifdef CONFIG_HPET +static void hpet_reserve_msi_timers(struct hpet_data *hd) +{ + return; +} +#endif + +static int hpet_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + return NOTIFY_OK; +} + +#endif + /* * Clock source related code */ @@ -427,8 +803,10 @@ int __init hpet_enable(void) if (id & HPET_ID_LEGSUP) { hpet_legacy_clockevent_register(); + hpet_msi_capability_lookup(2); return 1; } + hpet_msi_capability_lookup(0); return 0; out_nohpet: @@ -445,6 +823,8 @@ out_nohpet: */ static __init int hpet_late_init(void) { + int cpu; + if (boot_hpet_disable) return -ENODEV; @@ -460,6 +840,13 @@ static __init int hpet_late_init(void) hpet_reserve_platform_timers(hpet_readl(HPET_ID)); + for_each_online_cpu(cpu) { + hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); + } + + /* This notifier should be called after workqueue is ready */ + hotcpu_notifier(hpet_cpuhp_notify, -20); + return 0; } fs_initcall(hpet_late_init); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic.c index 02063ae..b764d74 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic.c @@ -27,17 +27,21 @@ #include <linux/sched.h> #include <linux/pci.h> #include <linux/mc146818rtc.h> +#include <linux/compiler.h> #include <linux/acpi.h> +#include <linux/module.h> #include <linux/sysdev.h> #include <linux/msi.h> #include <linux/htirq.h> -#include <linux/dmar.h> -#include <linux/jiffies.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/jiffies.h> /* time_after() */ #ifdef CONFIG_ACPI #include <acpi/acpi_bus.h> #endif #include <linux/bootmem.h> #include <linux/dmar.h> +#include <linux/hpet.h> #include <asm/idle.h> #include <asm/io.h> @@ -46,61 +50,28 @@ #include <asm/proto.h> #include <asm/acpi.h> #include <asm/dma.h> +#include <asm/timer.h> #include <asm/i8259.h> #include <asm/nmi.h> #include <asm/msidef.h> #include <asm/hypertransport.h> +#include <asm/setup.h> #include <asm/irq_remapping.h> +#include <asm/hpet.h> +#include <asm/uv/uv_hub.h> +#include <asm/uv/uv_irq.h> #include <mach_ipi.h> #include <mach_apic.h> +#include <mach_apicdef.h> #define __apicdebuginit(type) static type __init -struct irq_cfg { - cpumask_t domain; - cpumask_t old_domain; - unsigned move_cleanup_count; - u8 vector; - u8 move_in_progress : 1; -}; - -/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ -static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { - [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, -}; - -static int assign_irq_vector(int irq, cpumask_t mask); - -int first_system_vector = 0xfe; - -char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; - -int sis_apic_bug; /* not actually supported, dummy for compile */ - -static int no_timer_check; - -static int disable_timer_pin_1 __initdata; - -int timer_through_8259 __initdata; - -/* Where if anywhere is the i8259 connect in external int mode */ -static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; +/* + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes + */ +int sis_apic_bug = -1; static DEFINE_SPINLOCK(ioapic_lock); static DEFINE_SPINLOCK(vector_lock); @@ -110,9 +81,6 @@ static DEFINE_SPINLOCK(vector_lock); */ int nr_ioapic_registers[MAX_IO_APICS]; -/* I/O APIC RTE contents at the OS boot up */ -struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; - /* I/O APIC entries */ struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; @@ -123,11 +91,69 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; +#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); +int skip_ioapic_setup; + +static int __init parse_noapic(char *str) +{ + /* disable IO-APIC */ + disable_ioapic_setup(); + return 0; +} +early_param("noapic", parse_noapic); + +struct irq_pin_list; +struct irq_cfg { + unsigned int irq; + struct irq_pin_list *irq_2_pin; + cpumask_t domain; + cpumask_t old_domain; + unsigned move_cleanup_count; + u8 vector; + u8 move_in_progress : 1; +}; + +/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +static struct irq_cfg irq_cfgx[NR_IRQS] = { + [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, + [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, + [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, + [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, + [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, + [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, + [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, + [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, + [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, + [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, + [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, + [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, + [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, + [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, + [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, + [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, +}; + +#define for_each_irq_cfg(irq, cfg) \ + for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) + +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irq < nr_irqs ? irq_cfgx + irq : NULL; +} + +static struct irq_cfg *irq_cfg_alloc(unsigned int irq) +{ + return irq_cfg(irq); +} + /* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. + * Rough estimation of how many shared IRQs there are, can be changed + * anytime. */ #define MAX_PLUS_SHARED_IRQS NR_IRQS #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) @@ -139,9 +165,36 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); * between pins and IRQs. */ -static struct irq_pin_list { - short apic, pin, next; -} irq_2_pin[PIN_MAP_SIZE]; +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; +static struct irq_pin_list *irq_2_pin_ptr; + +static void __init irq_2_pin_init(void) +{ + struct irq_pin_list *pin = irq_2_pin_head; + int i; + + for (i = 1; i < PIN_MAP_SIZE; i++) + pin[i-1].next = &pin[i]; + + irq_2_pin_ptr = &pin[0]; +} + +static struct irq_pin_list *get_one_free_irq_2_pin(void) +{ + struct irq_pin_list *pin = irq_2_pin_ptr; + + if (!pin) + panic("can not get more irq_2_pin\n"); + + irq_2_pin_ptr = pin->next; + pin->next = NULL; + return pin; +} struct io_apic { unsigned int index; @@ -172,10 +225,15 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i /* * Re-write a value: to be used for read-modify-write * cycles where the read already set up the index register. + * + * Older SiS APIC requires we rewrite the index register */ -static inline void io_apic_modify(unsigned int apic, unsigned int value) +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); + + if (sis_apic_bug) + writel(reg, &io_apic->index); writel(value, &io_apic->data); } @@ -183,16 +241,17 @@ static bool io_apic_level_ack_pending(unsigned int irq) { struct irq_pin_list *entry; unsigned long flags; + struct irq_cfg *cfg = irq_cfg(irq); spin_lock_irqsave(&ioapic_lock, flags); - entry = irq_2_pin + irq; + entry = cfg->irq_2_pin; for (;;) { unsigned int reg; int pin; - pin = entry->pin; - if (pin == -1) + if (!entry) break; + pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ if (reg & IO_APIC_REDIR_REMOTE_IRR) { @@ -201,45 +260,13 @@ static bool io_apic_level_ack_pending(unsigned int irq) } if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } spin_unlock_irqrestore(&ioapic_lock, flags); return false; } -/* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ -static inline void io_apic_sync(unsigned int apic) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - readl(&io_apic->data); -} - -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_pin_list *entry = irq_2_pin + irq; \ - \ - BUG_ON(irq >= NR_IRQS); \ - for (;;) { \ - unsigned int reg; \ - pin = entry->pin; \ - if (pin == -1) \ - break; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, reg); \ - FINAL; \ - if (!entry->next) \ - break; \ - entry = irq_2_pin + entry->next; \ - } \ -} - union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -299,59 +326,71 @@ static void ioapic_mask_entry(int apic, int pin) static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) { int apic, pin; - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg; + struct irq_pin_list *entry; - BUG_ON(irq >= NR_IRQS); + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; for (;;) { unsigned int reg; + + if (!entry) + break; + apic = entry->apic; pin = entry->pin; - if (pin == -1) - break; +#ifdef CONFIG_INTR_REMAP /* * With interrupt-remapping, destination information comes * from interrupt-remapping table entry. */ if (!irq_remapped(irq)) io_apic_write(apic, 0x11 + pin*2, dest); +#else + io_apic_write(apic, 0x11 + pin*2, dest); +#endif reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; - io_apic_modify(apic, reg); + io_apic_modify(apic, 0x10 + pin*2, reg); if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } } +static int assign_irq_vector(int irq, cpumask_t mask); + static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; unsigned long flags; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) return; + cfg = irq_cfg(irq); if (assign_irq_vector(irq, mask)) return; cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); - /* * Only the high 8 bits are valid. */ dest = SET_APIC_LOGICAL_ID(dest); + desc = irq_to_desc(irq); spin_lock_irqsave(&ioapic_lock, flags); __target_IO_APIC_irq(irq, dest, cfg->vector); - irq_desc[irq].affinity = mask; + desc->affinity = mask; spin_unlock_irqrestore(&ioapic_lock, flags); } -#endif +#endif /* CONFIG_SMP */ /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are @@ -360,19 +399,30 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) */ static void add_pin_to_irq(unsigned int irq, int apic, int pin) { - static int first_free_entry = NR_IRQS; - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + /* first time to refer irq_cfg, so with new */ + cfg = irq_cfg_alloc(irq); + entry = cfg->irq_2_pin; + if (!entry) { + entry = get_one_free_irq_2_pin(); + cfg->irq_2_pin = entry; + entry->apic = apic; + entry->pin = pin; + return; + } - BUG_ON(irq >= NR_IRQS); - while (entry->next) - entry = irq_2_pin + entry->next; + while (entry->next) { + /* not again, please */ + if (entry->apic == apic && entry->pin == pin) + return; - if (entry->pin != -1) { - entry->next = first_free_entry; - entry = irq_2_pin + entry->next; - if (++first_free_entry >= PIN_MAP_SIZE) - panic("io_apic.c: ran out of irq_2_pin entries!"); + entry = entry->next; } + + entry->next = get_one_free_irq_2_pin(); + entry = entry->next; entry->apic = apic; entry->pin = pin; } @@ -384,30 +434,86 @@ static void __init replace_pin_at_irq(unsigned int irq, int oldapic, int oldpin, int newapic, int newpin) { - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg = irq_cfg(irq); + struct irq_pin_list *entry = cfg->irq_2_pin; + int replaced = 0; - while (1) { + while (entry) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; - } - if (!entry->next) + replaced = 1; + /* every one is different, right? */ break; - entry = irq_2_pin + entry->next; + } + entry = entry->next; + } + + /* why? call replace before add? */ + if (!replaced) + add_pin_to_irq(irq, newapic, newpin); +} + +static inline void io_apic_modify_irq(unsigned int irq, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + int pin; + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + cfg = irq_cfg(irq); + for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + unsigned int reg; + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin * 2); + reg &= mask_and; + reg |= mask_or; + io_apic_modify(entry->apic, 0x10 + pin * 2, reg); + if (final) + final(entry); } } +static void __unmask_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); +} -#define DO_ACTION(name,R,ACTION, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) +#ifdef CONFIG_X86_64 +void io_apic_sync(struct irq_pin_list *entry) +{ + /* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ + struct io_apic __iomem *io_apic; + io_apic = io_apic_base(entry->apic); + readl(&io_apic->data); +} -/* mask = 1 */ -DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) +static void __mask_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); +} +#else /* CONFIG_X86_32 */ +static void __mask_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); +} -/* mask = 0 */ -DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) +static void __mask_and_edge_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED, NULL); +} + +static void __unmask_and_level_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER, NULL); +} +#endif /* CONFIG_X86_32 */ static void mask_IO_APIC_irq (unsigned int irq) { @@ -450,6 +556,68 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } +#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) +void send_IPI_self(int vector) +{ + unsigned int cfg; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write(APIC_ICR, cfg); +} +#endif /* !CONFIG_SMP && CONFIG_X86_32*/ + +#ifdef CONFIG_X86_32 +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + pirqs_enabled = 1; + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_INTR_REMAP +/* I/O APIC RTE contents at the OS boot up */ +static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; + /* * Saves and masks all the unmasked IO-APIC RTE's */ @@ -474,7 +642,7 @@ int save_mask_IO_APIC_setup(void) kzalloc(sizeof(struct IO_APIC_route_entry) * nr_ioapic_registers[apic], GFP_KERNEL); if (!early_ioapic_entries[apic]) - return -ENOMEM; + goto nomem; } for (apic = 0; apic < nr_ioapics; apic++) @@ -488,17 +656,31 @@ int save_mask_IO_APIC_setup(void) ioapic_write_entry(apic, pin, entry); } } + return 0; + +nomem: + while (apic >= 0) + kfree(early_ioapic_entries[apic--]); + memset(early_ioapic_entries, 0, + ARRAY_SIZE(early_ioapic_entries)); + + return -ENOMEM; } void restore_IO_APIC_setup(void) { int apic, pin; - for (apic = 0; apic < nr_ioapics; apic++) + for (apic = 0; apic < nr_ioapics; apic++) { + if (!early_ioapic_entries[apic]) + break; for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) ioapic_write_entry(apic, pin, early_ioapic_entries[apic][pin]); + kfree(early_ioapic_entries[apic]); + early_ioapic_entries[apic] = NULL; + } } void reinit_intr_remapped_IO_APIC(int intr_remapping) @@ -512,25 +694,7 @@ void reinit_intr_remapped_IO_APIC(int intr_remapping) */ restore_IO_APIC_setup(); } - -int skip_ioapic_setup; -int ioapic_force; - -static int __init parse_noapic(char *str) -{ - disable_ioapic_setup(); - return 0; -} -early_param("noapic", parse_noapic); - -/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ -static int __init disable_timer_pin_setup(char *arg) -{ - disable_timer_pin_1 = 1; - return 1; -} -__setup("disable_timer_pin_1", disable_timer_pin_setup); - +#endif /* * Find the IRQ entry number of a certain pin. @@ -634,22 +798,54 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) best_guess = irq; } } - BUG_ON(best_guess >= NR_IRQS); return best_guess; } +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < 16) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +#endif + /* ISA interrupts are always polarity zero edge triggered, * when listed as conforming in the MP table. */ #define default_ISA_trigger(idx) (0) #define default_ISA_polarity(idx) (0) +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) +#define default_EISA_polarity(idx) default_ISA_polarity(idx) + /* PCI interrupts are always polarity one level triggered, * when listed as conforming in the MP table. */ #define default_PCI_trigger(idx) (1) #define default_PCI_polarity(idx) (1) +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) default_ISA_polarity(idx) + static int MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mp_srcbus; @@ -707,6 +903,36 @@ static int MPBIOS_trigger(int idx) trigger = default_ISA_trigger(idx); else trigger = default_PCI_trigger(idx); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_ISA: /* ISA pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } +#endif break; case 1: /* edge */ { @@ -744,6 +970,7 @@ static inline int irq_trigger(int idx) return MPBIOS_trigger(idx); } +int (*ioapic_renumber_irq)(int ioapic, int irq); static int pin_2_irq(int idx, int apic, int pin) { int irq, i; @@ -765,8 +992,32 @@ static int pin_2_irq(int idx, int apic, int pin) while (i < apic) irq += nr_ioapic_registers[i++]; irq += pin; + /* + * For MPS mode, so far only needed by ES7000 platform + */ + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); } - BUG_ON(irq >= NR_IRQS); + +#ifdef CONFIG_X86_32 + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. + */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } +#endif + return irq; } @@ -801,8 +1052,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) int cpu; struct irq_cfg *cfg; - BUG_ON((unsigned)irq >= NR_IRQS); - cfg = &irq_cfg[irq]; + cfg = irq_cfg(irq); /* Only try and allocate irqs on cpus that are present */ cpus_and(mask, mask, cpu_online_map); @@ -837,8 +1087,13 @@ next: } if (unlikely(current_vector == vector)) continue; +#ifdef CONFIG_X86_64 if (vector == IA32_SYSCALL_VECTOR) goto next; +#else + if (vector == SYSCALL_VECTOR) + goto next; +#endif for_each_cpu_mask_nr(new_cpu, new_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; @@ -875,8 +1130,7 @@ static void __clear_irq_vector(int irq) cpumask_t mask; int cpu, vector; - BUG_ON((unsigned)irq >= NR_IRQS); - cfg = &irq_cfg[irq]; + cfg = irq_cfg(irq); BUG_ON(!cfg->vector); vector = cfg->vector; @@ -893,12 +1147,13 @@ void __setup_vector_irq(int cpu) /* Initialize vector_irq on a new cpu */ /* This function must be called with vector_lock held */ int irq, vector; + struct irq_cfg *cfg; /* Mark the inuse vectors */ - for (irq = 0; irq < NR_IRQS; ++irq) { - if (!cpu_isset(cpu, irq_cfg[irq].domain)) + for_each_irq_cfg(irq, cfg) { + if (!cpu_isset(cpu, cfg->domain)) continue; - vector = irq_cfg[irq].vector; + vector = cfg->vector; per_cpu(vector_irq, cpu)[vector] = irq; } /* Mark the free vectors */ @@ -906,7 +1161,9 @@ void __setup_vector_irq(int cpu) irq = per_cpu(vector_irq, cpu)[vector]; if (irq < 0) continue; - if (!cpu_isset(cpu, irq_cfg[irq].domain)) + + cfg = irq_cfg(irq); + if (!cpu_isset(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } } @@ -916,16 +1173,49 @@ static struct irq_chip ioapic_chip; static struct irq_chip ir_ioapic_chip; #endif +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +#ifdef CONFIG_X86_32 +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} +#else +static inline int IO_APIC_irq_trigger(int irq) +{ + return 1; +} +#endif + static void ioapic_register_intr(int irq, unsigned long trigger) { - if (trigger) - irq_desc[irq].status |= IRQ_LEVEL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + desc->status |= IRQ_LEVEL; else - irq_desc[irq].status &= ~IRQ_LEVEL; + desc->status &= ~IRQ_LEVEL; #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { - irq_desc[irq].status |= IRQ_MOVE_PCNTXT; + desc->status |= IRQ_MOVE_PCNTXT; if (trigger) set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, handle_fasteoi_irq, @@ -936,7 +1226,8 @@ static void ioapic_register_intr(int irq, unsigned long trigger) return; } #endif - if (trigger) + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); @@ -1009,13 +1300,15 @@ static int setup_ioapic_entry(int apic, int irq, static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, int trigger, int polarity) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; struct IO_APIC_route_entry entry; cpumask_t mask; if (!IO_APIC_IRQ(irq)) return; + cfg = irq_cfg(irq); + mask = TARGET_CPUS; if (assign_irq_vector(irq, mask)) return; @@ -1047,37 +1340,49 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, static void __init setup_IO_APIC_irqs(void) { - int apic, pin, idx, irq, first_notcon = 1; + int apic, pin, idx, irq; + int notcon = 0; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - idx = find_irq_entry(apic,pin,mp_INT); - if (idx == -1) { - if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); - continue; - } - if (!first_notcon) { - apic_printk(APIC_VERBOSE, " not connected.\n"); - first_notcon = 1; - } + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - irq = pin_2_irq(idx, apic, pin); - add_pin_to_irq(irq, apic, pin); + idx = find_irq_entry(apic, pin, mp_INT); + if (idx == -1) { + if (!notcon) { + notcon = 1; + apic_printk(APIC_VERBOSE, + KERN_DEBUG " %d-%d", + mp_ioapics[apic].mp_apicid, + pin); + } else + apic_printk(APIC_VERBOSE, " %d-%d", + mp_ioapics[apic].mp_apicid, + pin); + continue; + } + if (notcon) { + apic_printk(APIC_VERBOSE, + " (apicid-pin) not connected\n"); + notcon = 0; + } - setup_IO_APIC_irq(apic, pin, irq, - irq_trigger(idx), irq_polarity(idx)); - } + irq = pin_2_irq(idx, apic, pin); +#ifdef CONFIG_X86_32 + if (multi_timer_check(apic, irq)) + continue; +#endif + add_pin_to_irq(irq, apic, pin); + + setup_IO_APIC_irq(apic, pin, irq, + irq_trigger(idx), irq_polarity(idx)); + } } - if (!first_notcon) - apic_printk(APIC_VERBOSE, " not connected.\n"); + if (notcon) + apic_printk(APIC_VERBOSE, + " (apicid-pin) not connected\n"); } /* @@ -1088,8 +1393,10 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, { struct IO_APIC_route_entry entry; +#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) return; +#endif memset(&entry, 0, sizeof(entry)); @@ -1124,7 +1431,10 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_00 reg_00; union IO_APIC_reg_01 reg_01; union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; unsigned long flags; + struct irq_cfg *cfg; + unsigned int irq; if (apic_verbosity == APIC_QUIET) return; @@ -1147,12 +1457,16 @@ __apicdebuginit(void) print_IO_APIC(void) reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); @@ -1160,11 +1474,27 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - if (reg_01.bits.version >= 0x10) { + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); } + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + } + printk(KERN_DEBUG ".... IRQ redirection table:\n"); printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" @@ -1193,16 +1523,16 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < NR_IRQS; i++) { - struct irq_pin_list *entry = irq_2_pin + i; - if (entry->pin < 0) + for_each_irq_cfg(irq, cfg) { + struct irq_pin_list *entry = cfg->irq_2_pin; + if (!entry) continue; - printk(KERN_DEBUG "IRQ%d ", i); + printk(KERN_DEBUG "IRQ%d ", irq); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } printk("\n"); } @@ -1236,7 +1566,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base) __apicdebuginit(void) print_local_APIC(void *dummy) { unsigned int v, ver, maxlvt; - unsigned long icr; + u64 icr; if (apic_verbosity == APIC_QUIET) return; @@ -1253,20 +1583,31 @@ __apicdebuginit(void) print_local_APIC(void *dummy) v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); - v = apic_read(APIC_ARBPRI); - printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, - v & APIC_ARBPRI_MASK); - v = apic_read(APIC_PROCPRI); - printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (!APIC_XAPIC(ver)) { + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + } + v = apic_read(APIC_PROCPRI); + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + } + + /* + * Remote read supported only in the 82489DX and local APIC for + * Pentium processors. + */ + if (!APIC_INTEGRATED(ver) || maxlvt == 3) { + v = apic_read(APIC_RRR); + printk(KERN_DEBUG "... APIC RRR: %08x\n", v); + } - v = apic_read(APIC_EOI); - printk(KERN_DEBUG "... APIC EOI: %08x\n", v); - v = apic_read(APIC_RRR); - printk(KERN_DEBUG "... APIC RRR: %08x\n", v); v = apic_read(APIC_LDR); printk(KERN_DEBUG "... APIC LDR: %08x\n", v); - v = apic_read(APIC_DFR); - printk(KERN_DEBUG "... APIC DFR: %08x\n", v); + if (!x2apic_enabled()) { + v = apic_read(APIC_DFR); + printk(KERN_DEBUG "... APIC DFR: %08x\n", v); + } v = apic_read(APIC_SPIV); printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); @@ -1277,8 +1618,13 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC IRR field:\n"); print_APIC_bitfield(APIC_IRR); - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } icr = apic_icr_read(); printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); @@ -1312,7 +1658,12 @@ __apicdebuginit(void) print_local_APIC(void *dummy) __apicdebuginit(void) print_all_local_APICs(void) { - on_each_cpu(print_local_APIC, NULL, 1); + int cpu; + + preempt_disable(); + for_each_online_cpu(cpu) + smp_call_function_single(cpu, print_local_APIC, NULL, 1); + preempt_enable(); } __apicdebuginit(void) print_PIC(void) @@ -1359,17 +1710,22 @@ __apicdebuginit(int) print_all_ICs(void) fs_initcall(print_all_ICs); +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; + void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; int i8259_apic, i8259_pin; - int i, apic; + int apic; unsigned long flags; - for (i = 0; i < PIN_MAP_SIZE; i++) { - irq_2_pin[i].pin = -1; - irq_2_pin[i].next = 0; - } +#ifdef CONFIG_X86_32 + int i; + if (!pirqs_enabled) + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; +#endif /* * The number of IO-APIC IRQ registers (== #pins): @@ -1399,6 +1755,10 @@ void __init enable_IO_APIC(void) } found_i8259: /* Look to see what if the MP table has reported the ExtINT */ + /* If we could not find the appropriate pin by looking at the ioapic + * the i8259 probably is not connected the ioapic but give the + * mptable a chance anyway. + */ i8259_pin = find_isa_irq_pin(0, mp_ExtINT); i8259_apic = find_isa_irq_apic(0, mp_ExtINT); /* Trust the MP table if nothing is setup in the hardware */ @@ -1458,6 +1818,133 @@ void disable_IO_APIC(void) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } +#ifdef CONFIG_X86_32 +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 + */ + +static void __init setup_ioapic_ids_from_mpc(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int apic; + int i; + unsigned char old_id; + unsigned long flags; + + if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) + return; + + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (apic = 0; apic < nr_ioapics; apic++) { + + /* Read the register 0 value */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mp_ioapics[apic].mp_apicid; + + if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + apic, mp_ioapics[apic].mp_apicid); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + mp_ioapics[apic].mp_apicid = reg_00.bits.ID; + } + + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(phys_id_present_map, + mp_ioapics[apic].mp_apicid)) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + apic, mp_ioapics[apic].mp_apicid); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + mp_ioapics[apic].mp_apicid = i; + } else { + physid_mask_t tmp; + tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mp_ioapics[apic].mp_apicid); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mp_ioapics[apic].mp_apicid) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mp_dstapic == old_id) + mp_irqs[i].mp_dstapic + = mp_ioapics[apic].mp_apicid; + + /* + * Read the right value from the MPC table and + * write it into the ID register. + */ + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mp_ioapics[apic].mp_apicid); + + reg_00.bits.ID = mp_ioapics[apic].mp_apicid; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0, reg_00.raw); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} +#endif + +int no_timer_check __initdata; + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + /* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation: @@ -1471,6 +1958,9 @@ static int __init timer_irq_works(void) unsigned long t1 = jiffies; unsigned long flags; + if (no_timer_check) + return 1; + local_save_flags(flags); local_irq_enable(); /* Let ten ticks pass... */ @@ -1531,9 +2021,11 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } +#ifdef CONFIG_X86_64 static int ioapic_retrigger_irq(unsigned int irq) { - struct irq_cfg *cfg = &irq_cfg[irq]; + + struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; spin_lock_irqsave(&vector_lock, flags); @@ -1542,6 +2034,14 @@ static int ioapic_retrigger_irq(unsigned int irq) return 1; } +#else +static int ioapic_retrigger_irq(unsigned int irq) +{ + send_IPI_self(irq_cfg(irq)->vector); + + return 1; +} +#endif /* * Level and edge triggered IO-APIC interrupts need different handling, @@ -1580,11 +2080,11 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); */ static void migrate_ioapic_irq(int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; - struct irq_desc *desc = irq_desc + irq; + struct irq_cfg *cfg; + struct irq_desc *desc; cpumask_t tmp, cleanup_mask; struct irte irte; - int modify_ioapic_rte = desc->status & IRQ_LEVEL; + int modify_ioapic_rte; unsigned int dest; unsigned long flags; @@ -1598,9 +2098,12 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); + desc = irq_to_desc(irq); + modify_ioapic_rte = desc->status & IRQ_LEVEL; if (modify_ioapic_rte) { spin_lock_irqsave(&ioapic_lock, flags); __target_IO_APIC_irq(irq, dest, cfg->vector); @@ -1622,18 +2125,19 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask) cfg->move_in_progress = 0; } - irq_desc[irq].affinity = mask; + desc->affinity = mask; } static int migrate_irq_remapped_level(int irq) { int ret = -1; + struct irq_desc *desc = irq_to_desc(irq); mask_IO_APIC_irq(irq); if (io_apic_level_ack_pending(irq)) { /* - * Interrupt in progress. Migrating irq now will change the + * Interrupt in progress. Migrating irq now will change the * vector information in the IO-APIC RTE and that will confuse * the EOI broadcast performed by cpu. * So, delay the irq migration to the next instance. @@ -1643,11 +2147,11 @@ static int migrate_irq_remapped_level(int irq) } /* everthing is clear. we have right of way */ - migrate_ioapic_irq(irq, irq_desc[irq].pending_mask); + migrate_ioapic_irq(irq, desc->pending_mask); ret = 0; - irq_desc[irq].status &= ~IRQ_MOVE_PENDING; - cpus_clear(irq_desc[irq].pending_mask); + desc->status &= ~IRQ_MOVE_PENDING; + cpus_clear(desc->pending_mask); unmask: unmask_IO_APIC_irq(irq); @@ -1656,10 +2160,10 @@ unmask: static void ir_irq_migration(struct work_struct *work) { - int irq; + unsigned int irq; + struct irq_desc *desc; - for (irq = 0; irq < NR_IRQS; irq++) { - struct irq_desc *desc = irq_desc + irq; + for_each_irq_desc(irq, desc) { if (desc->status & IRQ_MOVE_PENDING) { unsigned long flags; @@ -1671,8 +2175,7 @@ static void ir_irq_migration(struct work_struct *work) continue; } - desc->chip->set_affinity(irq, - irq_desc[irq].pending_mask); + desc->chip->set_affinity(irq, desc->pending_mask); spin_unlock_irqrestore(&desc->lock, flags); } } @@ -1683,9 +2186,11 @@ static void ir_irq_migration(struct work_struct *work) */ static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { - if (irq_desc[irq].status & IRQ_LEVEL) { - irq_desc[irq].status |= IRQ_MOVE_PENDING; - irq_desc[irq].pending_mask = mask; + struct irq_desc *desc = irq_to_desc(irq); + + if (desc->status & IRQ_LEVEL) { + desc->status |= IRQ_MOVE_PENDING; + desc->pending_mask = mask; migrate_irq_remapped_level(irq); return; } @@ -1698,7 +2203,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; ack_APIC_irq(); +#ifdef CONFIG_X86_64 exit_idle(); +#endif irq_enter(); me = smp_processor_id(); @@ -1707,11 +2214,12 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) struct irq_desc *desc; struct irq_cfg *cfg; irq = __get_cpu_var(vector_irq)[vector]; - if (irq >= NR_IRQS) + + desc = irq_to_desc(irq); + if (!desc) continue; - desc = irq_desc + irq; - cfg = irq_cfg + irq; + cfg = irq_cfg(irq); spin_lock(&desc->lock); if (!cfg->move_cleanup_count) goto unlock; @@ -1730,7 +2238,7 @@ unlock: static void irq_complete_move(unsigned int irq) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg = irq_cfg(irq); unsigned vector, me; if (likely(!cfg->move_in_progress)) @@ -1769,19 +2277,50 @@ static void ack_apic_edge(unsigned int irq) ack_APIC_irq(); } +atomic_t irq_mis_count; + static void ack_apic_level(unsigned int irq) { +#ifdef CONFIG_X86_32 + unsigned long v; + int i; +#endif int do_unmask_irq = 0; irq_complete_move(irq); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { + if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; mask_IO_APIC_irq(irq); } #endif +#ifdef CONFIG_X86_32 + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ + i = irq_cfg(irq)->vector; + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); +#endif + /* * We must acknowledge the irq before we move it or the acknowledge will * not propagate properly. @@ -1820,31 +2359,41 @@ static void ack_apic_level(unsigned int irq) move_masked_irq(irq); unmask_IO_APIC_irq(irq); } + +#ifdef CONFIG_X86_32 + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(irq); + __unmask_and_level_IO_APIC_irq(irq); + spin_unlock(&ioapic_lock); + } +#endif } static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, + .set_affinity = set_ioapic_affinity_irq, #endif .retrigger = ioapic_retrigger_irq, }; #ifdef CONFIG_INTR_REMAP static struct irq_chip ir_ioapic_chip __read_mostly = { - .name = "IR-IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_x2apic_edge, - .eoi = ack_x2apic_level, + .name = "IR-IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_x2apic_edge, + .eoi = ack_x2apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ir_ioapic_affinity_irq, + .set_affinity = set_ir_ioapic_affinity_irq, #endif .retrigger = ioapic_retrigger_irq, }; @@ -1853,6 +2402,8 @@ static struct irq_chip ir_ioapic_chip __read_mostly = { static inline void init_IO_APIC_traps(void) { int irq; + struct irq_desc *desc; + struct irq_cfg *cfg; /* * NOTE! The local APIC isn't very good at handling @@ -1865,8 +2416,8 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for (irq = 0; irq < NR_IRQS ; irq++) { - if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) { + for_each_irq_cfg(irq, cfg) { + if (IO_APIC_IRQ(irq) && !cfg->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 @@ -1874,27 +2425,33 @@ static inline void init_IO_APIC_traps(void) */ if (irq < 16) make_8259A_irq(irq); - else + else { + desc = irq_to_desc(irq); /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_chip; + desc->chip = &no_irq_chip; + } } } } -static void unmask_lapic_irq(unsigned int irq) +/* + * The local APIC irq-chip implementation: + */ + +static void mask_lapic_irq(unsigned int irq) { unsigned long v; v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); } -static void mask_lapic_irq(unsigned int irq) +static void unmask_lapic_irq(unsigned int irq) { unsigned long v; v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } static void ack_lapic_irq (unsigned int irq) @@ -1911,7 +2468,10 @@ static struct irq_chip lapic_chip __read_mostly = { static void lapic_register_intr(int irq) { - irq_desc[irq].status &= ~IRQ_LEVEL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); } @@ -1919,19 +2479,19 @@ static void lapic_register_intr(int irq) static void __init setup_nmi(void) { /* - * Dirty trick to enable the NMI watchdog ... + * Dirty trick to enable the NMI watchdog ... * We put the 8259A master into AEOI mode and * unmask on all local APICs LVT0 as NMI. * * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') * is from Maciej W. Rozycki - so we do not have to EOI from * the NMI handler or the timer interrupt. - */ - printk(KERN_INFO "activating NMI Watchdog ..."); + */ + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); enable_NMI_through_LVT0(); - printk(" done.\n"); + apic_printk(APIC_VERBOSE, " done.\n"); } /* @@ -1948,12 +2508,17 @@ static inline void __init unlock_ExtINT_logic(void) unsigned char save_control, save_freq_select; pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } apic = find_isa_irq_apic(8, mp_INT); - if (pin == -1) + if (apic == -1) { + WARN_ON_ONCE(1); return; + } entry0 = ioapic_read_entry(apic, pin); - clear_IO_APIC_pin(apic, pin); memset(&entry1, 0, sizeof(entry1)); @@ -1988,23 +2553,38 @@ static inline void __init unlock_ExtINT_logic(void) ioapic_write_entry(apic, pin, entry0); } +static int disable_timer_pin_1 __initdata; +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", disable_timer_pin_setup); + +int timer_through_8259 __initdata; + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. * - * FIXME: really need to revamp this for modern platforms only. + * FIXME: really need to revamp this for all platforms. */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_cfg + 0; + struct irq_cfg *cfg = irq_cfg(0); int apic1, pin1, apic2, pin2; unsigned long flags; + unsigned int ver; int no_pin1 = 0; local_irq_save(flags); + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); + /* * get/set the timer IRQ vector: */ @@ -2013,10 +2593,18 @@ static inline void __init check_timer(void) /* * As IRQ0 is to be enabled in the 8259A, the virtual - * wire has to be disabled in the local APIC. + * wire has to be disabled in the local APIC. Also + * timer interrupts need to be acknowledged manually in + * the 8259A for the i82489DX when using the NMI + * watchdog as that APIC treats NMIs as level-triggered. + * The AEOI mode will finish them in the 8259A + * automatically. */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); +#ifdef CONFIG_X86_32 + timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); +#endif pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); @@ -2035,8 +2623,10 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { +#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) panic("BIOS bug: timer not connected to IO-APIC"); +#endif pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -2054,7 +2644,7 @@ static inline void __init check_timer(void) setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } unmask_IO_APIC_irq(0); - if (!no_timer_check && timer_irq_works()) { + if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); enable_8259A_irq(0); @@ -2063,8 +2653,10 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } +#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) panic("timer doesn't work through Interrupt-remapped IO-APIC"); +#endif clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " @@ -2104,6 +2696,9 @@ static inline void __init check_timer(void) "through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = NMI_NONE; } +#ifdef CONFIG_X86_32 + timer_ack = 0; +#endif apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); @@ -2140,13 +2735,6 @@ out: local_irq_restore(flags); } -static int __init notimercheck(char *s) -{ - no_timer_check = 1; - return 1; -} -__setup("no_timer_check", notimercheck); - /* * Traditionally ISA IRQ2 is the cascade IRQ, and is not available * to devices. However there may be an I/O APIC pin available for @@ -2164,25 +2752,49 @@ __setup("no_timer_check", notimercheck); * the I/O APIC in all cases now. No actual device should request * it anyway. --macro */ -#define PIC_IRQS (1<<2) +#define PIC_IRQS (1 << PIC_CASCADE_IR) void __init setup_IO_APIC(void) { +#ifdef CONFIG_X86_32 + enable_IO_APIC(); +#else /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ +#endif io_apic_irqs = ~PIC_IRQS; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - + /* + * Set up IO-APIC IRQ routing. + */ +#ifdef CONFIG_X86_32 + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); +#endif sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); check_timer(); } +/* + * Called after all the initialization is done. If we didnt find any + * APIC bugs then we can allow the modify fast path + */ + +static int __init io_apic_bug_finalize(void) +{ + if (sis_apic_bug == -1) + sis_apic_bug = 0; + return 0; +} + +late_initcall(io_apic_bug_finalize); + struct sysfs_ioapic_data { struct sys_device dev; struct IO_APIC_route_entry entry[0]; @@ -2270,32 +2882,51 @@ device_initcall(ioapic_init_sysfs); /* * Dynamic irq allocate and deallocation */ -int create_irq(void) +unsigned int create_irq_nr(unsigned int irq_want) { /* Allocate an unused irq */ - int irq; - int new; + unsigned int irq; + unsigned int new; unsigned long flags; + struct irq_cfg *cfg_new; + + irq_want = nr_irqs - 1; - irq = -ENOSPC; + irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = (NR_IRQS - 1); new >= 0; new--) { + for (new = irq_want; new > 0; new--) { if (platform_legacy_irq(new)) continue; - if (irq_cfg[new].vector != 0) + cfg_new = irq_cfg(new); + if (cfg_new && cfg_new->vector != 0) continue; + /* check if need to create one */ + if (!cfg_new) + cfg_new = irq_cfg_alloc(new); if (__assign_irq_vector(new, TARGET_CPUS) == 0) irq = new; break; } spin_unlock_irqrestore(&vector_lock, flags); - if (irq >= 0) { + if (irq > 0) { dynamic_irq_init(irq); } return irq; } +int create_irq(void) +{ + int irq; + + irq = create_irq_nr(nr_irqs - 1); + + if (irq == 0) + irq = -1; + + return irq; +} + void destroy_irq(unsigned int irq) { unsigned long flags; @@ -2316,7 +2947,7 @@ void destroy_irq(unsigned int irq) #ifdef CONFIG_PCI_MSI static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; int err; unsigned dest; cpumask_t tmp; @@ -2326,6 +2957,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms if (err) return err; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, tmp); dest = cpu_mask_to_apicid(tmp); @@ -2383,10 +3015,11 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms #ifdef CONFIG_SMP static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2395,6 +3028,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); @@ -2406,7 +3040,8 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg(irq, &msg); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #ifdef CONFIG_INTR_REMAP @@ -2416,10 +3051,11 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) */ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; unsigned int dest; cpumask_t tmp, cleanup_mask; struct irte irte; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2431,6 +3067,7 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); @@ -2454,7 +3091,8 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) cfg->move_in_progress = 0; } - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif #endif /* CONFIG_SMP */ @@ -2507,7 +3145,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) if (index < 0) { printk(KERN_ERR "Unable to allocate %d IRTE for PCI %s\n", nvec, - pci_name(dev)); + pci_name(dev)); return -ENOSPC; } return index; @@ -2528,7 +3166,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); /* * irq migration in process context */ @@ -2538,16 +3176,34 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) #endif set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); + return 0; } +static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) +{ + unsigned int irq; + + irq = dev->bus->number; + irq <<= 8; + irq |= dev->devfn; + irq <<= 12; + + return irq; +} + int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { - int irq, ret; + unsigned int irq; + int ret; + unsigned int irq_want; - irq = create_irq(); - if (irq < 0) - return irq; + irq_want = build_irq_for_pci_dev(dev) + 0x100; + + irq = create_irq_nr(irq_want); + if (irq == 0) + return -1; #ifdef CONFIG_INTR_REMAP if (!intr_remapping_enabled) @@ -2574,18 +3230,22 @@ error: int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - int irq, ret, sub_handle; + unsigned int irq; + int ret, sub_handle; struct msi_desc *desc; + unsigned int irq_want; + #ifdef CONFIG_INTR_REMAP struct intel_iommu *iommu = 0; int index = 0; #endif + irq_want = build_irq_for_pci_dev(dev) + 0x100; sub_handle = 0; list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq(); - if (irq < 0) - return irq; + irq = create_irq_nr(irq_want--); + if (irq == 0) + return -1; #ifdef CONFIG_INTR_REMAP if (!intr_remapping_enabled) goto no_ir; @@ -2636,10 +3296,11 @@ void arch_teardown_msi_irq(unsigned int irq) #ifdef CONFIG_SMP static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2648,6 +3309,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); @@ -2659,7 +3321,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif /* CONFIG_SMP */ @@ -2689,6 +3352,69 @@ int arch_setup_dmar_msi(unsigned int irq) } #endif +#ifdef CONFIG_HPET_TIMER + +#ifdef CONFIG_SMP +static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + hpet_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + hpet_msi_write(irq, &msg); + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif /* CONFIG_SMP */ + +struct irq_chip hpet_msi_type = { + .name = "HPET_MSI", + .unmask = hpet_msi_unmask, + .mask = hpet_msi_mask, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = hpet_msi_set_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_hpet_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(NULL, irq, &msg); + if (ret < 0) + return ret; + + hpet_msi_write(irq, &msg); + set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, + "edge"); + + return 0; +} +#endif + #endif /* CONFIG_PCI_MSI */ /* * Hypertransport interrupt support @@ -2713,9 +3439,10 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2724,11 +3451,13 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); target_ht_irq(irq, dest, cfg->vector); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif @@ -2745,7 +3474,7 @@ static struct irq_chip ht_irq_chip = { int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; int err; cpumask_t tmp; @@ -2755,6 +3484,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) struct ht_irq_msg msg; unsigned dest; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, tmp); dest = cpu_mask_to_apicid(tmp); @@ -2777,20 +3507,196 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) set_irq_chip_and_handler_name(irq, &ht_irq_chip, handle_edge_irq, "edge"); + + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); } return err; } #endif /* CONFIG_HT_IRQ */ +#ifdef CONFIG_X86_64 +/* + * Re-target the irq to the specified CPU and enable the specified MMR located + * on the specified blade to allow the sending of MSIs to the specified CPU. + */ +int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, + unsigned long mmr_offset) +{ + const cpumask_t *eligible_cpu = get_cpu_mask(cpu); + struct irq_cfg *cfg; + int mmr_pnode; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + unsigned long flags; + int err; + + err = assign_irq_vector(irq, *eligible_cpu); + if (err != 0) + return err; + + spin_lock_irqsave(&vector_lock, flags); + set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, + irq_name); + spin_unlock_irqrestore(&vector_lock, flags); + + cfg = irq_cfg(irq); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + + entry->vector = cfg->vector; + entry->delivery_mode = INT_DELIVERY_MODE; + entry->dest_mode = INT_DEST_MODE; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = cpu_mask_to_apicid(*eligible_cpu); + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + return irq; +} + +/* + * Disable the specified MMR located on the specified blade so that MSIs are + * longer allowed to be sent. + */ +void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + int mmr_pnode; + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + + entry->mask = 1; + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +} +#endif /* CONFIG_X86_64 */ + +int __init io_apic_get_redir_entries (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.entries; +} + +int __init probe_nr_irqs(void) +{ + int idx; + int nr = 0; +#ifndef CONFIG_XEN + int nr_min = 32; +#else + int nr_min = NR_IRQS; +#endif + + for (idx = 0; idx < nr_ioapics; idx++) + nr += io_apic_get_redir_entries(idx) + 1; + + /* double it for hotplug and msi and nmi */ + nr <<= 1; + + /* something wrong ? */ + if (nr < nr_min) + nr = nr_min; + + return nr; +} + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ #ifdef CONFIG_ACPI -#define IO_APIC_MAX_ID 0xFE +#ifdef CONFIG_X86_32 +int __init io_apic_get_unique_id(int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; -int __init io_apic_get_redir_entries (int ioapic) + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. + */ + + if (physids_empty(apic_id_map)) + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!check_apicid_used(apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + tmp = apicid_to_cpu_present(apic_id); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) { + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + return -1; + } + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} + +int __init io_apic_get_version(int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -2799,9 +3705,9 @@ int __init io_apic_get_redir_entries (int ioapic) reg_01.raw = io_apic_read(ioapic, 1); spin_unlock_irqrestore(&ioapic_lock, flags); - return reg_01.bits.entries; + return reg_01.bits.version; } - +#endif int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) { @@ -2853,6 +3759,7 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) void __init setup_ioapic_dest(void) { int pin, ioapic, irq, irq_entry; + struct irq_cfg *cfg; if (skip_ioapic_setup == 1) return; @@ -2868,7 +3775,8 @@ void __init setup_ioapic_dest(void) * when you have too many devices, because at that time only boot * cpu is online. */ - if (!irq_cfg[irq].vector) + cfg = irq_cfg(irq); + if (!cfg->vector) setup_IO_APIC_irq(ioapic, pin, irq, irq_trigger(irq_entry), irq_polarity(irq_entry)); @@ -2926,18 +3834,33 @@ void __init ioapic_init_mappings(void) struct resource *ioapic_res; int i; + irq_2_pin_init(); ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].mp_apicaddr; +#ifdef CONFIG_X86_32 + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } +#endif } else { +#ifdef CONFIG_X86_32 +fake_ioapic_page: +#endif ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %016lx (%016lx)\n", + "mapped IOAPIC to %08lx (%08lx)\n", __fix_to_virt(idx), ioapic_phys); idx++; @@ -2971,4 +3894,3 @@ static int __init ioapic_insert_resources(void) /* Insert the IO APIC resources after PCI initialization has occured to handle * IO APICS that are mapped in on a BAR in PCI space. */ late_initcall(ioapic_insert_resources); - diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c deleted file mode 100644 index e710289..0000000 --- a/arch/x86/kernel/io_apic_32.c +++ /dev/null @@ -1,2908 +0,0 @@ -/* - * Intel IO-APIC support for multi-Pentium hosts. - * - * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo - * - * Many thanks to Stig Venaas for trying out countless experimental - * patches and reporting/debugging problems patiently! - * - * (c) 1999, Multiple IO-APIC support, developed by - * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and - * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, - * further tested and cleaned up by Zach Brown <zab@redhat.com> - * and Ingo Molnar <mingo@redhat.com> - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively - * Paul Diefenbaugh : Added full ACPI support - */ - -#include <linux/mm.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/sched.h> -#include <linux/bootmem.h> -#include <linux/mc146818rtc.h> -#include <linux/compiler.h> -#include <linux/acpi.h> -#include <linux/module.h> -#include <linux/sysdev.h> -#include <linux/pci.h> -#include <linux/msi.h> -#include <linux/htirq.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/jiffies.h> /* time_after() */ - -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/desc.h> -#include <asm/timer.h> -#include <asm/i8259.h> -#include <asm/nmi.h> -#include <asm/msidef.h> -#include <asm/hypertransport.h> -#include <asm/setup.h> - -#include <mach_apic.h> -#include <mach_apicdef.h> - -#define __apicdebuginit(type) static type __init - -int (*ioapic_renumber_irq)(int ioapic, int irq); -atomic_t irq_mis_count; - -/* Where if anywhere is the i8259 connect in external int mode */ -static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; - -static DEFINE_SPINLOCK(ioapic_lock); -DEFINE_SPINLOCK(vector_lock); - -int timer_through_8259 __initdata; - -/* - * Is the SiS APIC rmw bug present ? - * -1 = don't know, 0 = no, 1 = yes - */ -int sis_apic_bug = -1; - -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; - -/* I/O APIC entries */ -struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; -int nr_ioapics; - -/* MP IRQ source entries */ -struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* # of MP IRQ source entries */ -int mp_irq_entries; - -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) -int mp_bus_id_to_type[MAX_MP_BUSSES]; -#endif - -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); - -static int disable_timer_pin_1 __initdata; - -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - -static struct irq_pin_list { - int apic, pin, next; -} irq_2_pin[PIN_MAP_SIZE]; - -struct io_apic { - unsigned int index; - unsigned int unused[3]; - unsigned int data; -}; - -static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) -{ - return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); -} - -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - return readl(&io_apic->data); -} - -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - -/* - * Re-write a value: to be used for read-modify-write - * cycles where the read already set up the index register. - * - * Older SiS APIC requires we rewrite the index register - */ -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - volatile struct io_apic __iomem *io_apic = io_apic_base(apic); - if (sis_apic_bug) - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - -union entry_union { - struct { u32 w1, w2; }; - struct IO_APIC_route_entry entry; -}; - -static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) -{ - union entry_union eu; - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - return eu.entry; -} - -/* - * When we write a new IO APIC routing entry, we need to write the high - * word first! If the mask bit in the low word is clear, we will enable - * the interrupt, and we need to make sure the entry is fully populated - * before that happens. - */ -static void -__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - union entry_union eu; - eu.entry = e; - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); -} - -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - __ioapic_write_entry(apic, pin, e); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -/* - * When we mask an IO APIC routing entry, we need to write the low - * word first, in order to set the mask bit before we change the - * high bits! - */ -static void ioapic_mask_entry(int apic, int pin) -{ - unsigned long flags; - union entry_union eu = { .entry.mask = 1 }; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -/* - * The common case is 1:1 IRQ<->pin mappings. Sometimes there are - * shared ISA-space IRQs, so we have to support them. We are super - * fast in the common case, and fast for shared ISA-space IRQs. - */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) -{ - static int first_free_entry = NR_IRQS; - struct irq_pin_list *entry = irq_2_pin + irq; - - while (entry->next) - entry = irq_2_pin + entry->next; - - if (entry->pin != -1) { - entry->next = first_free_entry; - entry = irq_2_pin + entry->next; - if (++first_free_entry >= PIN_MAP_SIZE) - panic("io_apic.c: whoops"); - } - entry->apic = apic; - entry->pin = pin; -} - -/* - * Reroute an IRQ to a different pin. - */ -static void __init replace_pin_at_irq(unsigned int irq, - int oldapic, int oldpin, - int newapic, int newpin) -{ - struct irq_pin_list *entry = irq_2_pin + irq; - - while (1) { - if (entry->apic == oldapic && entry->pin == oldpin) { - entry->apic = newapic; - entry->pin = newpin; - } - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } -} - -static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) -{ - struct irq_pin_list *entry = irq_2_pin + irq; - unsigned int pin, reg; - - for (;;) { - pin = entry->pin; - if (pin == -1) - break; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - reg &= ~disable; - reg |= enable; - io_apic_modify(entry->apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } -} - -/* mask = 1 */ -static void __mask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); -} - -/* mask = 0 */ -static void __unmask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); -} - -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED); -} - -static void mask_IO_APIC_irq(unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void unmask_IO_APIC_irq(unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) -{ - struct IO_APIC_route_entry entry; - - /* Check delivery_mode to be sure we're not clearing an SMI pin */ - entry = ioapic_read_entry(apic, pin); - if (entry.delivery_mode == dest_SMI) - return; - - /* - * Disable it in the IO-APIC irq-routing table: - */ - ioapic_mask_entry(apic, pin); -} - -static void clear_IO_APIC(void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - clear_IO_APIC_pin(apic, pin); -} - -#ifdef CONFIG_SMP -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) -{ - unsigned long flags; - int pin; - struct irq_pin_list *entry = irq_2_pin + irq; - unsigned int apicid_value; - cpumask_t tmp; - - cpus_and(tmp, cpumask, cpu_online_map); - if (cpus_empty(tmp)) - tmp = TARGET_CPUS; - - cpus_and(cpumask, tmp, CPU_MASK_ALL); - - apicid_value = cpu_mask_to_apicid(cpumask); - /* Prepare to do the io_apic_write */ - apicid_value = apicid_value << 24; - spin_lock_irqsave(&ioapic_lock, flags); - for (;;) { - pin = entry->pin; - if (pin == -1) - break; - io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } - irq_desc[irq].affinity = cpumask; - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#if defined(CONFIG_IRQBALANCE) -# include <asm/processor.h> /* kernel_thread() */ -# include <linux/kernel_stat.h> /* kstat */ -# include <linux/slab.h> /* kmalloc() */ -# include <linux/timer.h> - -#define IRQBALANCE_CHECK_ARCH -999 -#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) -#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) -#define BALANCED_IRQ_MORE_DELTA (HZ/10) -#define BALANCED_IRQ_LESS_DELTA (HZ) - -static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; -static int physical_balance __read_mostly; -static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; - -static struct irq_cpu_info { - unsigned long *last_irq; - unsigned long *irq_delta; - unsigned long irq; -} irq_cpu_data[NR_CPUS]; - -#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) -#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq]) -#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq]) - -#define IDLE_ENOUGH(cpu,now) \ - (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) - -#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) - -#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i))) - -static cpumask_t balance_irq_affinity[NR_IRQS] = { - [0 ... NR_IRQS-1] = CPU_MASK_ALL -}; - -void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) -{ - balance_irq_affinity[irq] = mask; -} - -static unsigned long move(int curr_cpu, cpumask_t allowed_mask, - unsigned long now, int direction) -{ - int search_idle = 1; - int cpu = curr_cpu; - - goto inside; - - do { - if (unlikely(cpu == curr_cpu)) - search_idle = 0; -inside: - if (direction == 1) { - cpu++; - if (cpu >= NR_CPUS) - cpu = 0; - } else { - cpu--; - if (cpu == -1) - cpu = NR_CPUS-1; - } - } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) || - (search_idle && !IDLE_ENOUGH(cpu, now))); - - return cpu; -} - -static inline void balance_irq(int cpu, int irq) -{ - unsigned long now = jiffies; - cpumask_t allowed_mask; - unsigned int new_cpu; - - if (irqbalance_disabled) - return; - - cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); - new_cpu = move(cpu, allowed_mask, now, 1); - if (cpu != new_cpu) - set_pending_irq(irq, cpumask_of_cpu(new_cpu)); -} - -static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) -{ - int i, j; - - for_each_online_cpu(i) { - for (j = 0; j < NR_IRQS; j++) { - if (!irq_desc[j].action) - continue; - /* Is it a significant load ? */ - if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) < - useful_load_threshold) - continue; - balance_irq(i, j); - } - } - balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); - return; -} - -static void do_irq_balance(void) -{ - int i, j; - unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); - unsigned long move_this_load = 0; - int max_loaded = 0, min_loaded = 0; - int load; - unsigned long useful_load_threshold = balanced_irq_interval + 10; - int selected_irq; - int tmp_loaded, first_attempt = 1; - unsigned long tmp_cpu_irq; - unsigned long imbalance = 0; - cpumask_t allowed_mask, target_cpu_mask, tmp; - - for_each_possible_cpu(i) { - int package_index; - CPU_IRQ(i) = 0; - if (!cpu_online(i)) - continue; - package_index = CPU_TO_PACKAGEINDEX(i); - for (j = 0; j < NR_IRQS; j++) { - unsigned long value_now, delta; - /* Is this an active IRQ or balancing disabled ? */ - if (!irq_desc[j].action || irq_balancing_disabled(j)) - continue; - if (package_index == i) - IRQ_DELTA(package_index, j) = 0; - /* Determine the total count per processor per IRQ */ - value_now = (unsigned long) kstat_cpu(i).irqs[j]; - - /* Determine the activity per processor per IRQ */ - delta = value_now - LAST_CPU_IRQ(i, j); - - /* Update last_cpu_irq[][] for the next time */ - LAST_CPU_IRQ(i, j) = value_now; - - /* Ignore IRQs whose rate is less than the clock */ - if (delta < useful_load_threshold) - continue; - /* update the load for the processor or package total */ - IRQ_DELTA(package_index, j) += delta; - - /* Keep track of the higher numbered sibling as well */ - if (i != package_index) - CPU_IRQ(i) += delta; - /* - * We have sibling A and sibling B in the package - * - * cpu_irq[A] = load for cpu A + load for cpu B - * cpu_irq[B] = load for cpu B - */ - CPU_IRQ(package_index) += delta; - } - } - /* Find the least loaded processor package */ - for_each_online_cpu(i) { - if (i != CPU_TO_PACKAGEINDEX(i)) - continue; - if (min_cpu_irq > CPU_IRQ(i)) { - min_cpu_irq = CPU_IRQ(i); - min_loaded = i; - } - } - max_cpu_irq = ULONG_MAX; - -tryanothercpu: - /* - * Look for heaviest loaded processor. - * We may come back to get the next heaviest loaded processor. - * Skip processors with trivial loads. - */ - tmp_cpu_irq = 0; - tmp_loaded = -1; - for_each_online_cpu(i) { - if (i != CPU_TO_PACKAGEINDEX(i)) - continue; - if (max_cpu_irq <= CPU_IRQ(i)) - continue; - if (tmp_cpu_irq < CPU_IRQ(i)) { - tmp_cpu_irq = CPU_IRQ(i); - tmp_loaded = i; - } - } - - if (tmp_loaded == -1) { - /* - * In the case of small number of heavy interrupt sources, - * loading some of the cpus too much. We use Ingo's original - * approach to rotate them around. - */ - if (!first_attempt && imbalance >= useful_load_threshold) { - rotate_irqs_among_cpus(useful_load_threshold); - return; - } - goto not_worth_the_effort; - } - - first_attempt = 0; /* heaviest search */ - max_cpu_irq = tmp_cpu_irq; /* load */ - max_loaded = tmp_loaded; /* processor */ - imbalance = (max_cpu_irq - min_cpu_irq) / 2; - - /* - * if imbalance is less than approx 10% of max load, then - * observe diminishing returns action. - quit - */ - if (imbalance < (max_cpu_irq >> 3)) - goto not_worth_the_effort; - -tryanotherirq: - /* if we select an IRQ to move that can't go where we want, then - * see if there is another one to try. - */ - move_this_load = 0; - selected_irq = -1; - for (j = 0; j < NR_IRQS; j++) { - /* Is this an active IRQ? */ - if (!irq_desc[j].action) - continue; - if (imbalance <= IRQ_DELTA(max_loaded, j)) - continue; - /* Try to find the IRQ that is closest to the imbalance - * without going over. - */ - if (move_this_load < IRQ_DELTA(max_loaded, j)) { - move_this_load = IRQ_DELTA(max_loaded, j); - selected_irq = j; - } - } - if (selected_irq == -1) - goto tryanothercpu; - - imbalance = move_this_load; - - /* For physical_balance case, we accumulated both load - * values in the one of the siblings cpu_irq[], - * to use the same code for physical and logical processors - * as much as possible. - * - * NOTE: the cpu_irq[] array holds the sum of the load for - * sibling A and sibling B in the slot for the lowest numbered - * sibling (A), _AND_ the load for sibling B in the slot for - * the higher numbered sibling. - * - * We seek the least loaded sibling by making the comparison - * (A+B)/2 vs B - */ - load = CPU_IRQ(min_loaded) >> 1; - for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) { - if (load > CPU_IRQ(j)) { - /* This won't change cpu_sibling_map[min_loaded] */ - load = CPU_IRQ(j); - min_loaded = j; - } - } - - cpus_and(allowed_mask, - cpu_online_map, - balance_irq_affinity[selected_irq]); - target_cpu_mask = cpumask_of_cpu(min_loaded); - cpus_and(tmp, target_cpu_mask, allowed_mask); - - if (!cpus_empty(tmp)) { - /* mark for change destination */ - set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); - - /* Since we made a change, come back sooner to - * check for more variation. - */ - balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); - return; - } - goto tryanotherirq; - -not_worth_the_effort: - /* - * if we did not find an IRQ to move, then adjust the time interval - * upward - */ - balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, - balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); - return; -} - -static int balanced_irq(void *unused) -{ - int i; - unsigned long prev_balance_time = jiffies; - long time_remaining = balanced_irq_interval; - - /* push everything to CPU 0 to give us a starting point. */ - for (i = 0 ; i < NR_IRQS ; i++) { - irq_desc[i].pending_mask = cpumask_of_cpu(0); - set_pending_irq(i, cpumask_of_cpu(0)); - } - - set_freezable(); - for ( ; ; ) { - time_remaining = schedule_timeout_interruptible(time_remaining); - try_to_freeze(); - if (time_after(jiffies, - prev_balance_time+balanced_irq_interval)) { - preempt_disable(); - do_irq_balance(); - prev_balance_time = jiffies; - time_remaining = balanced_irq_interval; - preempt_enable(); - } - } - return 0; -} - -static int __init balanced_irq_init(void) -{ - int i; - struct cpuinfo_x86 *c; - cpumask_t tmp; - - cpus_shift_right(tmp, cpu_online_map, 2); - c = &boot_cpu_data; - /* When not overwritten by the command line ask subarchitecture. */ - if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) - irqbalance_disabled = NO_BALANCE_IRQ; - if (irqbalance_disabled) - return 0; - - /* disable irqbalance completely if there is only one processor online */ - if (num_online_cpus() < 2) { - irqbalance_disabled = 1; - return 0; - } - /* - * Enable physical balance only if more than 1 physical processor - * is present - */ - if (smp_num_siblings > 1 && !cpus_empty(tmp)) - physical_balance = 1; - - for_each_online_cpu(i) { - irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); - irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); - if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { - printk(KERN_ERR "balanced_irq_init: out of memory"); - goto failed; - } - } - - printk(KERN_INFO "Starting balanced_irq\n"); - if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) - return 0; - printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); -failed: - for_each_possible_cpu(i) { - kfree(irq_cpu_data[i].irq_delta); - irq_cpu_data[i].irq_delta = NULL; - kfree(irq_cpu_data[i].last_irq); - irq_cpu_data[i].last_irq = NULL; - } - return 0; -} - -int __devinit irqbalance_disable(char *str) -{ - irqbalance_disabled = 1; - return 1; -} - -__setup("noirqbalance", irqbalance_disable); - -late_initcall(balanced_irq_init); -#endif /* CONFIG_IRQBALANCE */ -#endif /* CONFIG_SMP */ - -#ifndef CONFIG_SMP -void send_IPI_self(int vector) -{ - unsigned int cfg; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write(APIC_ICR, cfg); -} -#endif /* !CONFIG_SMP */ - - -/* - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to - * specific CPU-side IRQs. - */ - -#define MAX_PIRQS 8 -static int pirq_entries [MAX_PIRQS]; -static int pirqs_enabled; -int skip_ioapic_setup; - -static int __init ioapic_pirq_setup(char *str) -{ - int i, max; - int ints[MAX_PIRQS+1]; - - get_options(str, ARRAY_SIZE(ints), ints); - - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; - apic_printk(APIC_VERBOSE, KERN_INFO - "PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } - return 1; -} - -__setup("pirq=", ioapic_pirq_setup); - -/* - * Find the IRQ entry number of a certain pin. - */ -static int find_irq_entry(int apic, int pin, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == type && - (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) && - mp_irqs[i].mp_dstirq == pin) - return i; - - return -1; -} - -/* - * Find the pin to which IRQ[irq] (ISA) is connected - */ -static int __init find_isa_irq_pin(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - - return mp_irqs[i].mp_dstirq; - } - return -1; -} - -static int __init find_isa_irq_apic(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - break; - } - if (i < mp_irq_entries) { - int apic; - for (apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) - return apic; - } - } - - return -1; -} - -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " - "slot:%d, pin:%d.\n", bus, slot, pin); - if (test_bit(bus, mp_bus_not_pci)) { - printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].mp_irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].mp_srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); - -/* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be TARGET_CPUS - */ -#ifdef CONFIG_SMP -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - - if (skip_ioapic_setup == 1) - return; - - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - set_ioapic_affinity_irq(irq, TARGET_CPUS); - } - - } -} -#endif - -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < 16) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} -#endif - -/* ISA interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. */ - -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) -#define default_EISA_polarity(idx) default_ISA_polarity(idx) - -/* PCI interrupts are always polarity one level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) - -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) default_ISA_polarity(idx) - -static int MPBIOS_polarity(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int polarity; - - /* - * Determine IRQ line polarity (high active or low active): - */ - switch (mp_irqs[idx].mp_irqflag & 3) { - case 0: /* conforms, ie. bus-type dependent polarity */ - { - polarity = test_bit(bus, mp_bus_not_pci)? - default_ISA_polarity(idx): - default_PCI_polarity(idx); - break; - } - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } - return polarity; -} - -static int MPBIOS_trigger(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int trigger; - - /* - * Determine IRQ trigger mode (edge or level sensitive): - */ - switch ((mp_irqs[idx].mp_irqflag>>2) & 3) { - case 0: /* conforms, ie. bus-type dependent */ - { - trigger = test_bit(bus, mp_bus_not_pci)? - default_ISA_trigger(idx): - default_PCI_trigger(idx); -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) - switch (mp_bus_id_to_type[bus]) { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } -#endif - break; - } - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 0; - break; - } - } - return trigger; -} - -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - -static int pin_2_irq(int idx, int apic, int pin) -{ - int irq, i; - int bus = mp_irqs[idx].mp_srcbus; - - /* - * Debugging check, we are in big trouble if this message pops up! - */ - if (mp_irqs[idx].mp_dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - - if (test_bit(bus, mp_bus_not_pci)) - irq = mp_irqs[idx].mp_srcbusirq; - else { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - - /* - * For MPS mode, so far only needed by ES7000 platform - */ - if (ioapic_renumber_irq) - irq = ioapic_renumber_irq(apic, irq); - } - - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ - if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "disabling PIRQ%d\n", pin-16); - } else { - irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, KERN_DEBUG - "using PIRQ%d -> IRQ %d\n", - pin-16, irq); - } - } - } - return irq; -} - -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} - -/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; - -static int __assign_irq_vector(int irq) -{ - static int current_vector = FIRST_DEVICE_VECTOR, current_offset; - int vector, offset; - - BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); - - if (irq_vector[irq] > 0) - return irq_vector[irq]; - - vector = current_vector; - offset = current_offset; -next: - vector += 8; - if (vector >= first_system_vector) { - offset = (offset + 1) % 8; - vector = FIRST_DEVICE_VECTOR + offset; - } - if (vector == current_vector) - return -ENOSPC; - if (test_and_set_bit(vector, used_vectors)) - goto next; - - current_vector = vector; - current_offset = offset; - irq_vector[irq] = vector; - - return vector; -} - -static int assign_irq_vector(int irq) -{ - unsigned long flags; - int vector; - - spin_lock_irqsave(&vector_lock, flags); - vector = __assign_irq_vector(irq); - spin_unlock_irqrestore(&vector_lock, flags); - - return vector; -} - -static struct irq_chip ioapic_chip; - -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - -static void ioapic_register_intr(int irq, int vector, unsigned long trigger) -{ - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) { - irq_desc[irq].status |= IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); - } else { - irq_desc[irq].status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); - } - set_intr_gate(vector, interrupt[irq]); -} - -static void __init setup_IO_APIC_irqs(void) -{ - struct IO_APIC_route_entry entry; - int apic, pin, idx, irq, first_notcon = 1, vector; - - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - /* - * add it to the IO-APIC irq-routing table: - */ - memset(&entry, 0, sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; /* enable IRQ */ - entry.dest.logical.logical_dest = - cpu_mask_to_apicid(TARGET_CPUS); - - idx = find_irq_entry(apic, pin, mp_INT); - if (idx == -1) { - if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - " IO-APIC (apicid-pin) %d-%d", - mp_ioapics[apic].mp_apicid, - pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", - mp_ioapics[apic].mp_apicid, pin); - continue; - } - - if (!first_notcon) { - apic_printk(APIC_VERBOSE, " not connected.\n"); - first_notcon = 1; - } - - entry.trigger = irq_trigger(idx); - entry.polarity = irq_polarity(idx); - - if (irq_trigger(idx)) { - entry.trigger = 1; - entry.mask = 1; - } - - irq = pin_2_irq(idx, apic, pin); - /* - * skip adding the timer int on secondary nodes, which causes - * a small but painful rift in the time-space continuum - */ - if (multi_timer_check(apic, irq)) - continue; - else - add_pin_to_irq(irq, apic, pin); - - if (!apic && !IO_APIC_IRQ(irq)) - continue; - - if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); - entry.vector = vector; - ioapic_register_intr(irq, vector, IOAPIC_AUTO); - - if (!apic && (irq < 16)) - disable_8259A_irq(irq); - } - ioapic_write_entry(apic, pin, entry); - } - } - - if (!first_notcon) - apic_printk(APIC_VERBOSE, " not connected.\n"); -} - -/* - * Set up the timer pin, possibly with the 8259A-master behind. - */ -static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, - int vector) -{ - struct IO_APIC_route_entry entry; - - memset(&entry, 0, sizeof(entry)); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - entry.dest_mode = INT_DEST_MODE; - entry.mask = 1; /* mask IRQ now */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.delivery_mode = INT_DELIVERY_MODE; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we may have a 8259A-master in AEOI mode ... - */ - ioapic_register_intr(0, vector, IOAPIC_EDGE); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(apic, pin, entry); -} - - -__apicdebuginit(void) print_IO_APIC(void) -{ - int apic, i; - union IO_APIC_reg_00 reg_00; - union IO_APIC_reg_01 reg_01; - union IO_APIC_reg_02 reg_02; - union IO_APIC_reg_03 reg_03; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (i = 0; i < nr_ioapics; i++) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); - - /* - * We are a bit conservative about what we expect. We have to - * know about every hardware change ASAP. - */ - printk(KERN_INFO "testing the IO APIC.......................\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - reg_01.raw = io_apic_read(apic, 1); - if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(apic, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(apic, 3); - spin_unlock_irqrestore(&ioapic_lock, flags); - - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); - printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); - printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); - printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - - printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); - printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - - printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - - /* - * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, - * but the value of reg_02 is read as the previous read register - * value, so ignore it if reg_02 == reg_01. - */ - if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); - printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - } - - /* - * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 - * or reg_03, but the value of reg_0[23] is read as the previous read - * register value, so ignore it if reg_03 == reg_0[12]. - */ - if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && - reg_03.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); - printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); - } - - printk(KERN_DEBUG ".... IRQ redirection table:\n"); - - printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" - " Stat Dest Deli Vect: \n"); - - for (i = 0; i <= reg_01.bits.entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - printk(KERN_DEBUG " %02x %03X %02X ", - i, - entry.dest.logical.logical_dest, - entry.dest.physical.physical_dest - ); - - printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector - ); - } - } - printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < NR_IRQS; i++) { - struct irq_pin_list *entry = irq_2_pin + i; - if (entry->pin < 0) - continue; - printk(KERN_DEBUG "IRQ%d ", i); - for (;;) { - printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } - printk("\n"); - } - - printk(KERN_INFO ".................................... done.\n"); - - return; -} - -__apicdebuginit(void) print_APIC_bitfield(int base) -{ - unsigned int v; - int i, j; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1<<j)) - printk("1"); - else - printk("0"); - } - printk("\n"); - } -} - -__apicdebuginit(void) print_local_APIC(void *dummy) -{ - unsigned int v, ver, maxlvt; - u64 icr; - - if (apic_verbosity == APIC_QUIET) - return; - - printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", - smp_processor_id(), hard_smp_processor_id()); - v = apic_read(APIC_ID); - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, - GET_APIC_ID(v)); - v = apic_read(APIC_LVR); - printk(KERN_INFO "... APIC VERSION: %08x\n", v); - ver = GET_APIC_VERSION(v); - maxlvt = lapic_get_maxlvt(); - - v = apic_read(APIC_TASKPRI); - printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); - - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - v = apic_read(APIC_ARBPRI); - printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, - v & APIC_ARBPRI_MASK); - v = apic_read(APIC_PROCPRI); - printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); - } - - v = apic_read(APIC_EOI); - printk(KERN_DEBUG "... APIC EOI: %08x\n", v); - v = apic_read(APIC_RRR); - printk(KERN_DEBUG "... APIC RRR: %08x\n", v); - v = apic_read(APIC_LDR); - printk(KERN_DEBUG "... APIC LDR: %08x\n", v); - v = apic_read(APIC_DFR); - printk(KERN_DEBUG "... APIC DFR: %08x\n", v); - v = apic_read(APIC_SPIV); - printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); - - printk(KERN_DEBUG "... APIC ISR field:\n"); - print_APIC_bitfield(APIC_ISR); - printk(KERN_DEBUG "... APIC TMR field:\n"); - print_APIC_bitfield(APIC_TMR); - printk(KERN_DEBUG "... APIC IRR field:\n"); - print_APIC_bitfield(APIC_IRR); - - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); - } - - icr = apic_icr_read(); - printk(KERN_DEBUG "... APIC ICR: %08x\n", icr); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32); - - v = apic_read(APIC_LVTT); - printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); - - if (maxlvt > 3) { /* PC is LVT#4. */ - v = apic_read(APIC_LVTPC); - printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); - } - v = apic_read(APIC_LVT0); - printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); - v = apic_read(APIC_LVT1); - printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); - - if (maxlvt > 2) { /* ERR is LVT#3. */ - v = apic_read(APIC_LVTERR); - printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); - } - - v = apic_read(APIC_TMICT); - printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); - v = apic_read(APIC_TMCCT); - printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); - v = apic_read(APIC_TDCR); - printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); - printk("\n"); -} - -__apicdebuginit(void) print_all_local_APICs(void) -{ - on_each_cpu(print_local_APIC, NULL, 1); -} - -__apicdebuginit(void) print_PIC(void) -{ - unsigned int v; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "\nprinting PIC contents\n"); - - spin_lock_irqsave(&i8259A_lock, flags); - - v = inb(0xa1) << 8 | inb(0x21); - printk(KERN_DEBUG "... PIC IMR: %04x\n", v); - - v = inb(0xa0) << 8 | inb(0x20); - printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - - outb(0x0b, 0xa0); - outb(0x0b, 0x20); - v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a, 0xa0); - outb(0x0a, 0x20); - - spin_unlock_irqrestore(&i8259A_lock, flags); - - printk(KERN_DEBUG "... PIC ISR: %04x\n", v); - - v = inb(0x4d1) << 8 | inb(0x4d0); - printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); -} - -__apicdebuginit(int) print_all_ICs(void) -{ - print_PIC(); - print_all_local_APICs(); - print_IO_APIC(); - - return 0; -} - -fs_initcall(print_all_ICs); - - -static void __init enable_IO_APIC(void) -{ - union IO_APIC_reg_01 reg_01; - int i8259_apic, i8259_pin; - int i, apic; - unsigned long flags; - - for (i = 0; i < PIN_MAP_SIZE; i++) { - irq_2_pin[i].pin = -1; - irq_2_pin[i].next = 0; - } - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } - for (apic = 0; apic < nr_ioapics; apic++) { - int pin; - /* See if any of the pins is in ExtINT mode */ - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - struct IO_APIC_route_entry entry; - entry = ioapic_read_entry(apic, pin); - - - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. - */ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { - ioapic_i8259.apic = apic; - ioapic_i8259.pin = pin; - goto found_i8259; - } - } - } - found_i8259: - /* Look to see what if the MP table has reported the ExtINT */ - /* If we could not find the appropriate pin by looking at the ioapic - * the i8259 probably is not connected the ioapic but give the - * mptable a chance anyway. - */ - i8259_pin = find_isa_irq_pin(0, mp_ExtINT); - i8259_apic = find_isa_irq_apic(0, mp_ExtINT); - /* Trust the MP table if nothing is setup in the hardware */ - if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { - printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); - ioapic_i8259.pin = i8259_pin; - ioapic_i8259.apic = i8259_apic; - } - /* Complain if the MP table and the hardware disagree */ - if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && - (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) - { - printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); - } - - /* - * Do not trust the IO-APIC being empty at bootup - */ - clear_IO_APIC(); -} - -/* - * Not an __init, needed by the reboot code - */ -void disable_IO_APIC(void) -{ - /* - * Clear the IO-APIC before rebooting: - */ - clear_IO_APIC(); - - /* - * If the i8259 is routed through an IOAPIC - * Put that IOAPIC in virtual wire mode - * so legacy interrupts can be delivered. - */ - if (ioapic_i8259.pin != -1) { - struct IO_APIC_route_entry entry; - - memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = dest_ExtINT; /* ExtInt */ - entry.vector = 0; - entry.dest.physical.physical_dest = read_apic_id(); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); - } - disconnect_bsp_APIC(ioapic_i8259.pin != -1); -} - -/* - * function to set the IO-APIC physical IDs based on the - * values stored in the MPC table. - * - * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 - */ - -static void __init setup_ioapic_ids_from_mpc(void) -{ - union IO_APIC_reg_00 reg_00; - physid_mask_t phys_id_present_map; - int apic; - int i; - unsigned char old_id; - unsigned long flags; - - if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) - return; - - /* - * Don't check I/O APIC IDs for xAPIC systems. They have - * no meaning without the serial APIC bus. - */ - if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return; - /* - * This is broken; anything with a real cpu count has to - * circumvent this idiocy regardless. - */ - phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); - - /* - * Set the IOAPIC ID to the value stored in the MPC table. - */ - for (apic = 0; apic < nr_ioapics; apic++) { - - /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - old_id = mp_ioapics[apic].mp_apicid; - - if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - apic, mp_ioapics[apic].mp_apicid); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - reg_00.bits.ID); - mp_ioapics[apic].mp_apicid = reg_00.bits.ID; - } - - /* - * Sanity check, is the ID really free? Every APIC in a - * system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(phys_id_present_map, - mp_ioapics[apic].mp_apicid)) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - apic, mp_ioapics[apic].mp_apicid); - for (i = 0; i < get_physical_broadcast(); i++) - if (!physid_isset(i, phys_id_present_map)) - break; - if (i >= get_physical_broadcast()) - panic("Max APIC ID exceeded!\n"); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - i); - physid_set(i, phys_id_present_map); - mp_ioapics[apic].mp_apicid = i; - } else { - physid_mask_t tmp; - tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); - apic_printk(APIC_VERBOSE, "Setting %d in the " - "phys_id_present_map\n", - mp_ioapics[apic].mp_apicid); - physids_or(phys_id_present_map, phys_id_present_map, tmp); - } - - - /* - * We need to adjust the IRQ routing table - * if the ID changed. - */ - if (old_id != mp_ioapics[apic].mp_apicid) - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_dstapic == old_id) - mp_irqs[i].mp_dstapic - = mp_ioapics[apic].mp_apicid; - - /* - * Read the right value from the MPC table and - * write it into the ID register. - */ - apic_printk(APIC_VERBOSE, KERN_INFO - "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mp_apicid); - - reg_00.bits.ID = mp_ioapics[apic].mp_apicid; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* - * Sanity check - */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) - printk("could not set ID!\n"); - else - apic_printk(APIC_VERBOSE, " ok.\n"); - } -} - -int no_timer_check __initdata; - -static int __init notimercheck(char *s) -{ - no_timer_check = 1; - return 1; -} -__setup("no_timer_check", notimercheck); - -/* - * There is a nasty bug in some older SMP boards, their mptable lies - * about the timer IRQ. We do the following to work around the situation: - * - * - timer IRQ defaults to IO-APIC IRQ - * - if this function detects that timer IRQs are defunct, then we fall - * back to ISA timer IRQs - */ -static int __init timer_irq_works(void) -{ - unsigned long t1 = jiffies; - unsigned long flags; - - if (no_timer_check) - return 1; - - local_save_flags(flags); - local_irq_enable(); - /* Let ten ticks pass... */ - mdelay((10 * 1000) / HZ); - local_irq_restore(flags); - - /* - * Expect a few ticks at least, to be sure some possible - * glue logic does not lock up after one or two first - * ticks in a non-ExtINT mode. Also the local APIC - * might have cached one ExtINT interrupt. Finally, at - * least one tick may be lost due to delays. - */ - if (time_after(jiffies, t1 + 4)) - return 1; - - return 0; -} - -/* - * In the SMP+IOAPIC case it might happen that there are an unspecified - * number of pending IRQ events unhandled. These cases are very rare, - * so we 'resend' these IRQs via IPIs, to the same CPU. It's much - * better to do it this way as thus we do not have to be aware of - * 'pending' interrupts in the IRQ path, except at this point. - */ -/* - * Edge triggered needs to resend any interrupt - * that was delayed but this is now handled in the device - * independent code. - */ - -/* - * Startup quirk: - * - * Starting up a edge-triggered IO-APIC interrupt is - * nasty - we need to make sure that we get the edge. - * If it is already asserted for some reason, we need - * return 1 to indicate that is was pending. - * - * This is not complete - we should be able to fake - * an edge even if it isn't on the 8259A... - * - * (We do this for level-triggered IRQs too - it cannot hurt.) - */ -static unsigned int startup_ioapic_irq(unsigned int irq) -{ - int was_pending = 0; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) - was_pending = 1; - } - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return was_pending; -} - -static void ack_ioapic_irq(unsigned int irq) -{ - move_native_irq(irq); - ack_APIC_irq(); -} - -static void ack_ioapic_quirk_irq(unsigned int irq) -{ - unsigned long v; - int i; - - move_native_irq(irq); -/* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ - i = irq_vector[irq]; - - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); - - ack_APIC_irq(); - - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); - spin_unlock(&ioapic_lock); - } -} - -static int ioapic_retrigger_irq(unsigned int irq) -{ - send_IPI_self(irq_vector[irq]); - - return 1; -} - -static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_ioapic_irq, - .eoi = ack_ioapic_quirk_irq, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, -#endif - .retrigger = ioapic_retrigger_irq, -}; - - -static inline void init_IO_APIC_traps(void) -{ - int irq; - - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - for (irq = 0; irq < NR_IRQS ; irq++) { - if (IO_APIC_IRQ(irq) && !irq_vector[irq]) { - /* - * Hmm.. We don't have an entry for this, - * so default to an old-fashioned 8259 - * interrupt if we can.. - */ - if (irq < 16) - make_8259A_irq(irq); - else - /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_chip; - } - } -} - -/* - * The local APIC irq-chip implementation: - */ - -static void ack_lapic_irq(unsigned int irq) -{ - ack_APIC_irq(); -} - -static void mask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); -} - -static void unmask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); -} - -static struct irq_chip lapic_chip __read_mostly = { - .name = "local-APIC", - .mask = mask_lapic_irq, - .unmask = unmask_lapic_irq, - .ack = ack_lapic_irq, -}; - -static void lapic_register_intr(int irq, int vector) -{ - irq_desc[irq].status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, - "edge"); - set_intr_gate(vector, interrupt[irq]); -} - -static void __init setup_nmi(void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(); - - apic_printk(APIC_VERBOSE, " done.\n"); -} - -/* - * This looks a bit hackish but it's about the only one way of sending - * a few INTA cycles to 8259As and any associated glue logic. ICR does - * not support the ExtINT mode, unfortunately. We need to send these - * cycles as some i82489DX-based boards have glue logic that keeps the - * 8259A interrupt line asserted until INTA. --macro - */ -static inline void __init unlock_ExtINT_logic(void) -{ - int apic, pin, i; - struct IO_APIC_route_entry entry0, entry1; - unsigned char save_control, save_freq_select; - - pin = find_isa_irq_pin(8, mp_INT); - if (pin == -1) { - WARN_ON_ONCE(1); - return; - } - apic = find_isa_irq_apic(8, mp_INT); - if (apic == -1) { - WARN_ON_ONCE(1); - return; - } - - entry0 = ioapic_read_entry(apic, pin); - clear_IO_APIC_pin(apic, pin); - - memset(&entry1, 0, sizeof(entry1)); - - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ - entry1.dest.physical.physical_dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = 0; - entry1.vector = 0; - - ioapic_write_entry(apic, pin, entry1); - - save_control = CMOS_READ(RTC_CONTROL); - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, - RTC_FREQ_SELECT); - CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); - - i = 100; - while (i-- > 0) { - mdelay(10); - if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) - i -= 10; - } - - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - clear_IO_APIC_pin(apic, pin); - - ioapic_write_entry(apic, pin, entry0); -} - -/* - * This code may look a bit paranoid, but it's supposed to cooperate with - * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ - * is so screwy. Thanks to Brian Perkins for testing/hacking this beast - * fanatically on his truly buggy board. - */ -static inline void __init check_timer(void) -{ - int apic1, pin1, apic2, pin2; - int no_pin1 = 0; - int vector; - unsigned int ver; - unsigned long flags; - - local_irq_save(flags); - - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - - /* - * get/set the timer IRQ vector: - */ - disable_8259A_irq(0); - vector = assign_irq_vector(0); - set_intr_gate(vector, interrupt[0]); - - /* - * As IRQ0 is to be enabled in the 8259A, the virtual - * wire has to be disabled in the local APIC. Also - * timer interrupts need to be acknowledged manually in - * the 8259A for the i82489DX when using the NMI - * watchdog as that APIC treats NMIs as level-triggered. - * The AEOI mode will finish them in the 8259A - * automatically. - */ - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); - timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); - - pin1 = find_isa_irq_pin(0, mp_INT); - apic1 = find_isa_irq_apic(0, mp_INT); - pin2 = ioapic_i8259.pin; - apic2 = ioapic_i8259.apic; - - apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " - "apic1=%d pin1=%d apic2=%d pin2=%d\n", - vector, apic1, pin1, apic2, pin2); - - /* - * Some BIOS writers are clueless and report the ExtINTA - * I/O APIC input from the cascaded 8259A as the timer - * interrupt input. So just in case, if only one pin - * was found above, try it both directly and through the - * 8259A. - */ - if (pin1 == -1) { - pin1 = pin2; - apic1 = apic2; - no_pin1 = 1; - } else if (pin2 == -1) { - pin2 = pin1; - apic2 = apic1; - } - - if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ - if (no_pin1) { - add_pin_to_irq(0, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, vector); - } - unmask_IO_APIC_irq(0); - if (timer_irq_works()) { - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - enable_8259A_irq(0); - } - if (disable_timer_pin_1 > 0) - clear_IO_APIC_pin(0, pin1); - goto out; - } - clear_IO_APIC_pin(apic1, pin1); - if (!no_pin1) - apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " - "8254 timer not connected to IO-APIC\n"); - - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " - "(IRQ0) through the 8259A ...\n"); - apic_printk(APIC_QUIET, KERN_INFO - "..... (found apic %d pin %d) ...\n", apic2, pin2); - /* - * legacy devices should be connected to IO APIC #0 - */ - replace_pin_at_irq(0, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, vector); - unmask_IO_APIC_irq(0); - enable_8259A_irq(0); - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); - timer_through_8259 = 1; - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - setup_nmi(); - enable_8259A_irq(0); - } - goto out; - } - /* - * Cleanup, just in case ... - */ - disable_8259A_irq(0); - clear_IO_APIC_pin(apic2, pin2); - apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); - } - - if (nmi_watchdog == NMI_IO_APIC) { - apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " - "through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = NMI_NONE; - } - timer_ack = 0; - - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as Virtual Wire IRQ...\n"); - - lapic_register_intr(0, vector); - apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ - enable_8259A_irq(0); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); - goto out; - } - disable_8259A_irq(0); - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); - apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as ExtINT IRQ...\n"); - - init_8259A(0); - make_8259A_irq(0); - apic_write(APIC_LVT0, APIC_DM_EXTINT); - - unlock_ExtINT_logic(); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); - goto out; - } - apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); - panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " - "report. Then try booting with the 'noapic' option.\n"); -out: - local_irq_restore(flags); -} - -/* - * Traditionally ISA IRQ2 is the cascade IRQ, and is not available - * to devices. However there may be an I/O APIC pin available for - * this interrupt regardless. The pin may be left unconnected, but - * typically it will be reused as an ExtINT cascade interrupt for - * the master 8259A. In the MPS case such a pin will normally be - * reported as an ExtINT interrupt in the MP table. With ACPI - * there is no provision for ExtINT interrupts, and in the absence - * of an override it would be treated as an ordinary ISA I/O APIC - * interrupt, that is edge-triggered and unmasked by default. We - * used to do this, but it caused problems on some systems because - * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using - * the same ExtINT cascade interrupt to drive the local APIC of the - * bootstrap processor. Therefore we refrain from routing IRQ2 to - * the I/O APIC in all cases now. No actual device should request - * it anyway. --macro - */ -#define PIC_IRQS (1 << PIC_CASCADE_IR) - -void __init setup_IO_APIC(void) -{ - int i; - - /* Reserve all the system vectors. */ - for (i = first_system_vector; i < NR_VECTORS; i++) - set_bit(i, used_vectors); - - enable_IO_APIC(); - - io_apic_irqs = ~PIC_IRQS; - - printk("ENABLING IO-APIC IRQs\n"); - - /* - * Set up IO-APIC IRQ routing. - */ - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); - sync_Arb_IDs(); - setup_IO_APIC_irqs(); - init_IO_APIC_traps(); - check_timer(); -} - -/* - * Called after all the initialization is done. If we didnt find any - * APIC bugs then we can allow the modify fast path - */ - -static int __init io_apic_bug_finalize(void) -{ - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; -} - -late_initcall(io_apic_bug_finalize); - -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS]; - -static int ioapic_suspend(struct sys_device *dev, pm_message_t state) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - entry[i] = ioapic_read_entry(dev->id, i); - - return 0; -} - -static int ioapic_resume(struct sys_device *dev) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - unsigned long flags; - union IO_APIC_reg_00 reg_00; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; - io_apic_write(dev->id, 0, reg_00.raw); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - ioapic_write_entry(dev->id, i, entry[i]); - - return 0; -} - -static struct sysdev_class ioapic_sysdev_class = { - .name = "ioapic", - .suspend = ioapic_suspend, - .resume = ioapic_resume, -}; - -static int __init ioapic_init_sysfs(void) -{ - struct sys_device *dev; - int i, size, error = 0; - - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; - - for (i = 0; i < nr_ioapics; i++) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - } - - return 0; -} - -device_initcall(ioapic_init_sysfs); - -/* - * Dynamic irq allocate and deallocation - */ -int create_irq(void) -{ - /* Allocate an unused irq */ - int irq, new, vector = 0; - unsigned long flags; - - irq = -ENOSPC; - spin_lock_irqsave(&vector_lock, flags); - for (new = (NR_IRQS - 1); new >= 0; new--) { - if (platform_legacy_irq(new)) - continue; - if (irq_vector[new] != 0) - continue; - vector = __assign_irq_vector(new); - if (likely(vector > 0)) - irq = new; - break; - } - spin_unlock_irqrestore(&vector_lock, flags); - - if (irq >= 0) { - set_intr_gate(vector, interrupt[irq]); - dynamic_irq_init(irq); - } - return irq; -} - -void destroy_irq(unsigned int irq) -{ - unsigned long flags; - - dynamic_irq_cleanup(irq); - - spin_lock_irqsave(&vector_lock, flags); - clear_bit(irq_vector[irq], used_vectors); - irq_vector[irq] = 0; - spin_unlock_irqrestore(&vector_lock, flags); -} - -/* - * MSI message composition - */ -#ifdef CONFIG_PCI_MSI -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) -{ - int vector; - unsigned dest; - - vector = assign_irq_vector(irq); - if (vector >= 0) { - dest = cpu_mask_to_apicid(TARGET_CPUS); - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((INT_DEST_MODE == 0) ? -MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? -MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(vector); - } - return vector; -} - -#ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct msi_msg msg; - unsigned int dest; - cpumask_t tmp; - int vector; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - tmp = TARGET_CPUS; - - vector = assign_irq_vector(irq); - if (vector < 0) - return; - - dest = cpu_mask_to_apicid(mask); - - read_msi_msg(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - write_msi_msg(irq, &msg); - irq_desc[irq].affinity = mask; -} -#endif /* CONFIG_SMP */ - -/* - * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, - * which implement the MSI or MSI-X Capability Structure. - */ -static struct irq_chip msi_chip = { - .name = "PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, - .ack = ack_ioapic_irq, -#ifdef CONFIG_SMP - .set_affinity = set_msi_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) -{ - struct msi_msg msg; - int irq, ret; - irq = create_irq(); - if (irq < 0) - return irq; - - ret = msi_compose_msg(dev, irq, &msg); - if (ret < 0) { - destroy_irq(irq); - return ret; - } - - set_irq_msi(irq, desc); - write_msi_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, - "edge"); - - return 0; -} - -void arch_teardown_msi_irq(unsigned int irq) -{ - destroy_irq(irq); -} - -#endif /* CONFIG_PCI_MSI */ - -/* - * Hypertransport interrupt support - */ -#ifdef CONFIG_HT_IRQ - -#ifdef CONFIG_SMP - -static void target_ht_irq(unsigned int irq, unsigned int dest) -{ - struct ht_irq_msg msg; - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) -{ - unsigned int dest; - cpumask_t tmp; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - tmp = TARGET_CPUS; - - cpus_and(mask, tmp, CPU_MASK_ALL); - - dest = cpu_mask_to_apicid(mask); - - target_ht_irq(irq, dest); - irq_desc[irq].affinity = mask; -} -#endif - -static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .mask = mask_ht_irq, - .unmask = unmask_ht_irq, - .ack = ack_ioapic_irq, -#ifdef CONFIG_SMP - .set_affinity = set_ht_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) -{ - int vector; - - vector = assign_irq_vector(irq); - if (vector >= 0) { - struct ht_irq_msg msg; - unsigned dest; - cpumask_t tmp; - - cpus_clear(tmp); - cpu_set(vector >> 8, tmp); - dest = cpu_mask_to_apicid(tmp); - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(vector) | - ((INT_DEST_MODE == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; - - write_ht_irq_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); - } - return vector; -} -#endif /* CONFIG_HT_IRQ */ - -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI - -int __init io_apic_get_unique_id(int ioapic, int apic_id) -{ - union IO_APIC_reg_00 reg_00; - static physid_mask_t apic_id_map = PHYSID_MASK_NONE; - physid_mask_t tmp; - unsigned long flags; - int i = 0; - - /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only - * supports up to 16 on one shared APIC bus. - * - * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full - * advantage of new APIC bus architecture. - */ - - if (physids_empty(apic_id_map)) - apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - if (apic_id >= get_physical_broadcast()) { - printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); - apic_id = reg_00.bits.ID; - } - - /* - * Every APIC in a system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(apic_id_map, apic_id)) { - - for (i = 0; i < get_physical_broadcast(); i++) { - if (!check_apicid_used(apic_id_map, i)) - break; - } - - if (i == get_physical_broadcast()) - panic("Max apic_id exceeded!\n"); - - printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); - - apic_id = i; - } - - tmp = apicid_to_cpu_present(apic_id); - physids_or(apic_id_map, apic_id_map, tmp); - - if (reg_00.bits.ID != apic_id) { - reg_00.bits.ID = apic_id; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0, reg_00.raw); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* Sanity check */ - if (reg_00.bits.ID != apic_id) { - printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); - return -1; - } - } - - apic_printk(APIC_VERBOSE, KERN_INFO - "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); - - return apic_id; -} - - -int __init io_apic_get_version(int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.version; -} - - -int __init io_apic_get_redir_entries(int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.entries; -} - - -int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low) -{ - struct IO_APIC_route_entry entry; - - if (!IO_APIC_IRQ(irq)) { - printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - /* - * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. - * Note that we mask (disable) IRQs now -- these get enabled when the - * corresponding device driver registers for this IRQ. - */ - - memset(&entry, 0, sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.trigger = edge_level; - entry.polarity = active_high_low; - entry.mask = 1; - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - entry.vector = assign_irq_vector(irq); - - apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " - "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, - mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq, - edge_level, active_high_low); - - ioapic_register_intr(irq, entry.vector, edge_level); - - if (!ioapic && (irq < 16)) - disable_8259A_irq(irq); - - ioapic_write_entry(ioapic, pin, entry); - - return 0; -} - -int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) -{ - int i; - - if (skip_ioapic_setup) - return -1; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == mp_INT && - mp_irqs[i].mp_srcbusirq == bus_irq) - break; - if (i >= mp_irq_entries) - return -1; - - *trigger = irq_trigger(i); - *polarity = irq_polarity(i); - return 0; -} - -#endif /* CONFIG_ACPI */ - -static int __init parse_disable_timer_pin_1(char *arg) -{ - disable_timer_pin_1 = 1; - return 0; -} -early_param("disable_timer_pin_1", parse_disable_timer_pin_1); - -static int __init parse_enable_timer_pin_1(char *arg) -{ - disable_timer_pin_1 = -1; - return 0; -} -early_param("enable_timer_pin_1", parse_enable_timer_pin_1); - -static int __init parse_noapic(char *arg) -{ - /* disable IO-APIC */ - disable_ioapic_setup(); - return 0; -} -early_param("noapic", parse_noapic); - -void __init ioapic_init_mappings(void) -{ - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; - - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mp_apicaddr; - if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " - "disabling IO/APIC support!\n"); - smp_found_config = 0; - skip_ioapic_setup = 1; - goto fake_ioapic_page; - } - } else { -fake_ioapic_page: - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); - ioapic_phys = __pa(ioapic_phys); - } - set_fixmap_nocache(idx, ioapic_phys); - printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - } -} - diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c new file mode 100644 index 0000000..ccf6c50 --- /dev/null +++ b/arch/x86/kernel/irq.c @@ -0,0 +1,189 @@ +/* + * Common interrupt code for 32 and 64 bit + */ +#include <linux/cpu.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/seq_file.h> + +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/smp.h> + +atomic_t irq_err_count; + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But only ack when the APIC is enabled -AK + */ + if (cpu_has_apic) + ack_APIC_irq(); +#endif +} + +#ifdef CONFIG_X86_32 +# define irq_stats(x) (&per_cpu(irq_stat,x)) +#else +# define irq_stats(x) cpu_pda(x) +#endif +/* + * /proc/interrupts printing: + */ +static int show_other_interrupts(struct seq_file *p) +{ + int j; + + seq_printf(p, "NMI: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); + seq_printf(p, " Non-maskable interrupts\n"); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); + seq_printf(p, " Local timer interrupts\n"); +#endif +#ifdef CONFIG_SMP + seq_printf(p, "RES: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "CAL: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); + seq_printf(p, " Function call interrupts\n"); + seq_printf(p, "TLB: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "TRM: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); +# ifdef CONFIG_X86_64 + seq_printf(p, "THR: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); + seq_printf(p, " Threshold APIC interrupts\n"); +# endif +#endif +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "SPU: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); +#endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); +#endif + return 0; +} + +int show_interrupts(struct seq_file *p, void *v) +{ + unsigned long flags, any_count = 0; + int i = *(loff_t *) v, j; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; + + if (i == nr_irqs) + return show_other_interrupts(p); + + /* print header */ + if (i == 0) { + seq_printf(p, " "); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d",j); + seq_putc(p, '\n'); + } + + desc = irq_to_desc(i); + spin_lock_irqsave(&desc->lock, flags); +#ifndef CONFIG_SMP + any_count = kstat_irqs(i); +#else + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); +#endif + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%3d: ", i); +#ifndef CONFIG_SMP + seq_printf(p, "%10u ", kstat_irqs(i)); +#else + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); +#endif + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); + + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } + + seq_putc(p, '\n'); +out: + spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} + +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = irq_stats(cpu)->__nmi_count; + +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->apic_timer_irqs; +#endif +#ifdef CONFIG_SMP + sum += irq_stats(cpu)->irq_resched_count; + sum += irq_stats(cpu)->irq_call_count; + sum += irq_stats(cpu)->irq_tlb_count; +#endif +#ifdef CONFIG_X86_MCE + sum += irq_stats(cpu)->irq_thermal_count; +# ifdef CONFIG_X86_64 + sum += irq_stats(cpu)->irq_threshold_count; +#endif +#endif +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->irq_spurious_count; +#endif + return sum; +} + +u64 arch_irq_stat(void) +{ + u64 sum = atomic_read(&irq_err_count); + +#ifdef CONFIG_X86_IO_APIC + sum += atomic_read(&irq_mis_count); +#endif + return sum; +} diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index b71e02d..a513826 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -25,29 +25,6 @@ EXPORT_PER_CPU_SYMBOL(irq_stat); DEFINE_PER_CPU(struct pt_regs *, irq_regs); EXPORT_PER_CPU_SYMBOL(irq_regs); -/* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. - */ -void ack_bad_irq(unsigned int irq) -{ - printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); - -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But only ack when the APIC is enabled -AK - */ - if (cpu_has_apic) - ack_APIC_irq(); -#endif -} - #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ static int check_stack_overflow(void) @@ -223,20 +200,25 @@ unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs; /* high bit used in ret_from_ code */ - int overflow, irq = ~regs->orig_ax; - struct irq_desc *desc = irq_desc + irq; + int overflow; + unsigned vector = ~regs->orig_ax; + struct irq_desc *desc; + unsigned irq; - if (unlikely((unsigned)irq >= NR_IRQS)) { - printk(KERN_EMERG "%s: cannot handle IRQ %d\n", - __func__, irq); - BUG(); - } old_regs = set_irq_regs(regs); irq_enter(); + irq = __get_cpu_var(vector_irq)[vector]; overflow = check_stack_overflow(); + desc = irq_to_desc(irq); + if (unlikely(!desc)) { + printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n", + __func__, irq, vector, smp_processor_id()); + BUG(); + } + if (!execute_on_irq_stack(overflow, desc, irq)) { if (unlikely(overflow)) print_stack_overflow(); @@ -248,146 +230,6 @@ unsigned int do_IRQ(struct pt_regs *regs) return 1; } -/* - * Interrupt statistics: - */ - -atomic_t irq_err_count; - -/* - * /proc/interrupts printing: - */ - -int show_interrupts(struct seq_file *p, void *v) -{ - int i = *(loff_t *) v, j; - struct irqaction * action; - unsigned long flags; - - if (i == 0) { - seq_printf(p, " "); - for_each_online_cpu(j) - seq_printf(p, "CPU%-8d",j); - seq_putc(p, '\n'); - } - - if (i < NR_IRQS) { - unsigned any_count = 0; - - spin_lock_irqsave(&irq_desc[i].lock, flags); -#ifndef CONFIG_SMP - any_count = kstat_irqs(i); -#else - for_each_online_cpu(j) - any_count |= kstat_cpu(j).irqs[i]; -#endif - action = irq_desc[i].action; - if (!action && !any_count) - goto skip; - seq_printf(p, "%3d: ",i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); -#endif - seq_printf(p, " %8s", irq_desc[i].chip->name); - seq_printf(p, "-%-8s", irq_desc[i].name); - - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } - - seq_putc(p, '\n'); -skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); - } else if (i == NR_IRQS) { - seq_printf(p, "NMI: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", nmi_count(j)); - seq_printf(p, " Non-maskable interrupts\n"); -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "LOC: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); -#endif -#ifdef CONFIG_SMP - seq_printf(p, "RES: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_call_count); - seq_printf(p, " Function call interrupts\n"); - seq_printf(p, "TLB: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); -#endif -#ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); -#endif -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "SPU: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); -#endif - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#if defined(CONFIG_X86_IO_APIC) - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif - } - return 0; -} - -/* - * /proc/stat helpers - */ -u64 arch_irq_stat_cpu(unsigned int cpu) -{ - u64 sum = nmi_count(cpu); - -#ifdef CONFIG_X86_LOCAL_APIC - sum += per_cpu(irq_stat, cpu).apic_timer_irqs; -#endif -#ifdef CONFIG_SMP - sum += per_cpu(irq_stat, cpu).irq_resched_count; - sum += per_cpu(irq_stat, cpu).irq_call_count; - sum += per_cpu(irq_stat, cpu).irq_tlb_count; -#endif -#ifdef CONFIG_X86_MCE - sum += per_cpu(irq_stat, cpu).irq_thermal_count; -#endif -#ifdef CONFIG_X86_LOCAL_APIC - sum += per_cpu(irq_stat, cpu).irq_spurious_count; -#endif - return sum; -} - -u64 arch_irq_stat(void) -{ - u64 sum = atomic_read(&irq_err_count); - -#ifdef CONFIG_X86_IO_APIC - sum += atomic_read(&irq_mis_count); -#endif - return sum; -} - #ifdef CONFIG_HOTPLUG_CPU #include <mach_apic.h> @@ -395,20 +237,22 @@ void fixup_irqs(cpumask_t map) { unsigned int irq; static int warned; + struct irq_desc *desc; - for (irq = 0; irq < NR_IRQS; irq++) { + for_each_irq_desc(irq, desc) { cpumask_t mask; + if (irq == 2) continue; - cpus_and(mask, irq_desc[irq].affinity, map); + cpus_and(mask, desc->affinity, map); if (any_online_cpu(mask) == NR_CPUS) { printk("Breaking affinity for irq %i\n", irq); mask = map; } - if (irq_desc[irq].chip->set_affinity) - irq_desc[irq].chip->set_affinity(irq, mask); - else if (irq_desc[irq].action && !(warned++)) + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, mask); + else if (desc->action && !(warned++)) printk("Cannot set affinity for irq %i\n", irq); } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index f065fe9..60eb84e 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,28 +18,6 @@ #include <asm/idle.h> #include <asm/smp.h> -atomic_t irq_err_count; - -/* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. - */ -void ack_bad_irq(unsigned int irq) -{ - printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq); - /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But don't ack when the APIC is disabled. -AK - */ - if (!disable_apic) - ack_APIC_irq(); -} - #ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: @@ -65,122 +43,6 @@ static inline void stack_overflow_check(struct pt_regs *regs) #endif /* - * Generic, controller-independent functions: - */ - -int show_interrupts(struct seq_file *p, void *v) -{ - int i = *(loff_t *) v, j; - struct irqaction * action; - unsigned long flags; - - if (i == 0) { - seq_printf(p, " "); - for_each_online_cpu(j) - seq_printf(p, "CPU%-8d",j); - seq_putc(p, '\n'); - } - - if (i < NR_IRQS) { - unsigned any_count = 0; - - spin_lock_irqsave(&irq_desc[i].lock, flags); -#ifndef CONFIG_SMP - any_count = kstat_irqs(i); -#else - for_each_online_cpu(j) - any_count |= kstat_cpu(j).irqs[i]; -#endif - action = irq_desc[i].action; - if (!action && !any_count) - goto skip; - seq_printf(p, "%3d: ",i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); -#endif - seq_printf(p, " %8s", irq_desc[i].chip->name); - seq_printf(p, "-%-8s", irq_desc[i].name); - - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } - seq_putc(p, '\n'); -skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); - } else if (i == NR_IRQS) { - seq_printf(p, "NMI: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); - seq_printf(p, " Non-maskable interrupts\n"); - seq_printf(p, "LOC: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); -#ifdef CONFIG_SMP - seq_printf(p, "RES: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); - seq_printf(p, " Function call interrupts\n"); - seq_printf(p, "TLB: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); -#endif -#ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); - seq_printf(p, "THR: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); - seq_printf(p, " Threshold APIC interrupts\n"); -#endif - seq_printf(p, "SPU: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); - } - return 0; -} - -/* - * /proc/stat helpers - */ -u64 arch_irq_stat_cpu(unsigned int cpu) -{ - u64 sum = cpu_pda(cpu)->__nmi_count; - - sum += cpu_pda(cpu)->apic_timer_irqs; -#ifdef CONFIG_SMP - sum += cpu_pda(cpu)->irq_resched_count; - sum += cpu_pda(cpu)->irq_call_count; - sum += cpu_pda(cpu)->irq_tlb_count; -#endif -#ifdef CONFIG_X86_MCE - sum += cpu_pda(cpu)->irq_thermal_count; - sum += cpu_pda(cpu)->irq_threshold_count; -#endif - sum += cpu_pda(cpu)->irq_spurious_count; - return sum; -} - -u64 arch_irq_stat(void) -{ - return atomic_read(&irq_err_count); -} - -/* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific * handlers). @@ -188,6 +50,7 @@ u64 arch_irq_stat(void) asmlinkage unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); + struct irq_desc *desc; /* high bit used in ret_from_ code */ unsigned vector = ~regs->orig_ax; @@ -201,8 +64,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) stack_overflow_check(regs); #endif - if (likely(irq < NR_IRQS)) - generic_handle_irq(irq); + desc = irq_to_desc(irq); + if (likely(desc)) + generic_handle_irq_desc(irq, desc); else { if (!disable_apic) ack_APIC_irq(); @@ -223,8 +87,9 @@ void fixup_irqs(cpumask_t map) { unsigned int irq; static int warned; + struct irq_desc *desc; - for (irq = 0; irq < NR_IRQS; irq++) { + for_each_irq_desc(irq, desc) { cpumask_t mask; int break_affinity = 0; int set_affinity = 1; @@ -233,32 +98,32 @@ void fixup_irqs(cpumask_t map) continue; /* interrupt's are disabled at this point */ - spin_lock(&irq_desc[irq].lock); + spin_lock(&desc->lock); if (!irq_has_action(irq) || - cpus_equal(irq_desc[irq].affinity, map)) { - spin_unlock(&irq_desc[irq].lock); + cpus_equal(desc->affinity, map)) { + spin_unlock(&desc->lock); continue; } - cpus_and(mask, irq_desc[irq].affinity, map); + cpus_and(mask, desc->affinity, map); if (cpus_empty(mask)) { break_affinity = 1; mask = map; } - if (irq_desc[irq].chip->mask) - irq_desc[irq].chip->mask(irq); + if (desc->chip->mask) + desc->chip->mask(irq); - if (irq_desc[irq].chip->set_affinity) - irq_desc[irq].chip->set_affinity(irq, mask); + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, mask); else if (!(warned++)) set_affinity = 0; - if (irq_desc[irq].chip->unmask) - irq_desc[irq].chip->unmask(irq); + if (desc->chip->unmask) + desc->chip->unmask(irq); - spin_unlock(&irq_desc[irq].lock); + spin_unlock(&desc->lock); if (break_affinity && set_affinity) printk("Broke affinity for irq %i\n", irq); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 9200a1e..845aa98 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -69,6 +69,13 @@ void __init init_ISA_irqs (void) * 16 old-style INTA-cycle interrupts: */ for (i = 0; i < 16; i++) { + /* first time call this irq_desc */ + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; + set_irq_chip_and_handler_name(i, &i8259A_chip, handle_level_irq, "XT"); } @@ -83,6 +90,27 @@ static struct irqaction irq2 = { .name = "cascade", }; +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... IRQ0_VECTOR - 1] = -1, + [IRQ0_VECTOR] = 0, + [IRQ1_VECTOR] = 1, + [IRQ2_VECTOR] = 2, + [IRQ3_VECTOR] = 3, + [IRQ4_VECTOR] = 4, + [IRQ5_VECTOR] = 5, + [IRQ6_VECTOR] = 6, + [IRQ7_VECTOR] = 7, + [IRQ8_VECTOR] = 8, + [IRQ9_VECTOR] = 9, + [IRQ10_VECTOR] = 10, + [IRQ11_VECTOR] = 11, + [IRQ12_VECTOR] = 12, + [IRQ13_VECTOR] = 13, + [IRQ14_VECTOR] = 14, + [IRQ15_VECTOR] = 15, + [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 +}; + /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); @@ -98,22 +126,14 @@ void __init native_init_IRQ(void) * us. (some of these will be overridden and become * 'special' SMP interrupts) */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (i >= NR_IRQS) - break; + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* SYSCALL_VECTOR was reserved in trap_init. */ - if (!test_bit(vector, used_vectors)) - set_intr_gate(vector, interrupt[i]); + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i]); } -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) - /* - * IRQ0 must be given a fixed assignment and initialized, - * because it's used before the IO-APIC is set up. - */ - set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. @@ -128,6 +148,9 @@ void __init native_init_IRQ(void) /* IPI for single call function */ set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); #endif #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 5b5be9d..ff02353 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -142,23 +142,19 @@ void __init init_ISA_irqs(void) init_bsp_APIC(); init_8259A(0); - for (i = 0; i < NR_IRQS; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; - - if (i < 16) { - /* - * 16 old-style INTA-cycle interrupts: - */ - set_irq_chip_and_handler_name(i, &i8259A_chip, + for (i = 0; i < 16; i++) { + /* first time call this irq_desc */ + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; + + /* + * 16 old-style INTA-cycle interrupts: + */ + set_irq_chip_and_handler_name(i, &i8259A_chip, handle_level_irq, "XT"); - } else { - /* - * 'high' PCI IRQs filled in on demand - */ - irq_desc[i].chip = &no_irq_chip; - } } } diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index f6a11b9..67465ed 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -35,9 +35,6 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) if (!(word & (1 << 13))) { dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " "disabling irq balancing and affinity\n"); -#ifdef CONFIG_IRQBALANCE - irqbalance_disable(""); -#endif noirqdebug_setup(""); #ifdef CONFIG_PROC_FS no_irq_affinity = 1; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b2c9787..0fa6790 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1073,6 +1073,7 @@ void __init setup_arch(char **cmdline_p) #endif prefill_possible_map(); + #ifdef CONFIG_X86_64 init_cpu_to_node(); #endif @@ -1080,6 +1081,9 @@ void __init setup_arch(char **cmdline_p) init_apic_mappings(); ioapic_init_mappings(); + /* need to wait for io_apic is mapped */ + nr_irqs = probe_nr_irqs(); + kvm_guest_init(); e820_reserve_resources(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 0e67f72..410c88f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -140,25 +140,30 @@ static void __init setup_cpu_pda_map(void) */ void __init setup_per_cpu_areas(void) { - ssize_t size = PERCPU_ENOUGH_ROOM; + ssize_t size, old_size; char *ptr; int cpu; + unsigned long align = 1; /* Setup cpu_pda map */ setup_cpu_pda_map(); /* Copy section for each CPU (we discard the original) */ - size = PERCPU_ENOUGH_ROOM; + old_size = PERCPU_ENOUGH_ROOM; + align = max_t(unsigned long, PAGE_SIZE, align); + size = roundup(old_size, align); printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); for_each_possible_cpu(cpu) { #ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(size); + ptr = __alloc_bootmem(size, align, + __pa(MAX_DMA_ADDRESS)); #else int node = early_cpu_to_node(cpu); if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(size); + ptr = __alloc_bootmem(size, align, + __pa(MAX_DMA_ADDRESS)); printk(KERN_INFO "cpu %d has no node %d or node-local memory\n", cpu, node); @@ -167,7 +172,8 @@ void __init setup_per_cpu_areas(void) cpu, __pa(ptr)); } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); + ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, + __pa(MAX_DMA_ADDRESS)); if (ptr) printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n", cpu, node, __pa(ptr)); @@ -175,7 +181,6 @@ void __init setup_per_cpu_areas(void) #endif per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - } printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7ed9e07..7ece815 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -543,10 +543,10 @@ static inline void __inquire_remote_apic(int apicid) int timeout; u32 status; - printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); + printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid); for (i = 0; i < ARRAY_SIZE(regs); i++) { - printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]); + printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]); /* * Wait for idle. @@ -874,7 +874,7 @@ do_rest: start_ip = setup_trampoline(); /* So we see what's up */ - printk(KERN_INFO "Booting processor %d/%d ip %lx\n", + printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n", cpu, apicid, start_ip); /* diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c new file mode 100644 index 0000000..aeef529 --- /dev/null +++ b/arch/x86/kernel/uv_irq.c @@ -0,0 +1,79 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * SGI UV IRQ functions + * + * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. + */ + +#include <linux/module.h> +#include <linux/irq.h> + +#include <asm/apic.h> +#include <asm/uv/uv_irq.h> + +static void uv_noop(unsigned int irq) +{ +} + +static unsigned int uv_noop_ret(unsigned int irq) +{ + return 0; +} + +static void uv_ack_apic(unsigned int irq) +{ + ack_APIC_irq(); +} + +struct irq_chip uv_irq_chip = { + .name = "UV-CORE", + .startup = uv_noop_ret, + .shutdown = uv_noop, + .enable = uv_noop, + .disable = uv_noop, + .ack = uv_noop, + .mask = uv_noop, + .unmask = uv_noop, + .eoi = uv_ack_apic, + .end = uv_noop, +}; + +/* + * Set up a mapping of an available irq and vector, and enable the specified + * MMR that defines the MSI that is to be sent to the specified CPU when an + * interrupt is raised. + */ +int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, + unsigned long mmr_offset) +{ + int irq; + int ret; + + irq = create_irq(); + if (irq <= 0) + return -EBUSY; + + ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); + if (ret != irq) + destroy_irq(irq); + + return ret; +} +EXPORT_SYMBOL_GPL(uv_setup_irq); + +/* + * Tear down a mapping of an irq and vector, and disable the specified MMR that + * defined the MSI that was to be sent to the specified CPU when an interrupt + * was raised. + * + * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). + */ +void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) +{ + arch_disable_uv_irq(mmr_blade, mmr_offset); + destroy_irq(irq); +} +EXPORT_SYMBOL_GPL(uv_teardown_irq); diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c new file mode 100644 index 0000000..67f9b9d --- /dev/null +++ b/arch/x86/kernel/uv_sysfs.c @@ -0,0 +1,72 @@ +/* + * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson + */ + +#include <linux/sysdev.h> +#include <asm/uv/bios.h> + +struct kobject *sgi_uv_kobj; + +static ssize_t partition_id_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id); +} + +static ssize_t coherence_id_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id()); +} + +static struct kobj_attribute partition_id_attr = + __ATTR(partition_id, S_IRUGO, partition_id_show, NULL); + +static struct kobj_attribute coherence_id_attr = + __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL); + + +static int __init sgi_uv_sysfs_init(void) +{ + unsigned long ret; + + if (!sgi_uv_kobj) + sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); + if (!sgi_uv_kobj) { + printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); + return -EINVAL; + } + + ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); + if (ret) { + printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); + return ret; + } + + ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); + if (ret) { + printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); + return ret; + } + + return 0; +} + +device_initcall(sgi_uv_sysfs_init); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 61a97e6..0c9667f 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -484,10 +484,11 @@ static void disable_cobalt_irq(unsigned int irq) static unsigned int startup_cobalt_irq(unsigned int irq) { unsigned long flags; + struct irq_desc *desc = irq_to_desc(irq); spin_lock_irqsave(&cobalt_lock, flags); - if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) - irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); + if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) + desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); enable_cobalt_irq(irq); spin_unlock_irqrestore(&cobalt_lock, flags); return 0; @@ -506,9 +507,10 @@ static void ack_cobalt_irq(unsigned int irq) static void end_cobalt_irq(unsigned int irq) { unsigned long flags; + struct irq_desc *desc = irq_to_desc(irq); spin_lock_irqsave(&cobalt_lock, flags); - if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS))) + if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS))) enable_cobalt_irq(irq); spin_unlock_irqrestore(&cobalt_lock, flags); } @@ -626,12 +628,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) spin_unlock_irqrestore(&i8259A_lock, flags); - desc = irq_desc + realirq; + desc = irq_to_desc(realirq); /* * handle this 'virtual interrupt' as a Cobalt one now. */ - kstat_cpu(smp_processor_id()).irqs[realirq]++; + kstat_incr_irqs_this_cpu(realirq, desc); if (likely(desc->action != NULL)) handle_IRQ_event(realirq, desc->action); @@ -662,27 +664,29 @@ void init_VISWS_APIC_irqs(void) int i; for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = 0; - irq_desc[i].depth = 1; + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = 0; + desc->depth = 1; if (i == 0) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } else if (i == CO_IRQ_IDE0) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } else if (i == CO_IRQ_IDE1) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } else if (i == CO_IRQ_8259) { - irq_desc[i].chip = &piix4_master_irq_type; + desc->chip = &piix4_master_irq_type; } else if (i < CO_IRQ_APIC0) { - irq_desc[i].chip = &piix4_virtual_irq_type; + desc->chip = &piix4_virtual_irq_type; } else if (IS_CO_APIC(i)) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } } diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 6953859..254ee07 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -235,11 +235,14 @@ static void __devinit vmi_time_init_clockevent(void) void __init vmi_time_init(void) { + unsigned int cpu; /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ vmi_time_init_clockevent(); setup_irq(0, &vmi_clock_action); + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0; } #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 65f0b8a..48ee4f9 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -582,7 +582,7 @@ static void __init lguest_init_IRQ(void) for (i = 0; i < LGUEST_IRQS; i++) { int vector = FIRST_EXTERNAL_VECTOR + i; if (vector != SYSCALL_VECTOR) { - set_intr_gate(vector, interrupt[i]); + set_intr_gate(vector, interrupt[vector]); set_irq_chip_and_handler_name(i, &lguest_irq_controller, handle_level_irq, "level"); diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index df37fc9..3c3b471 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c @@ -41,6 +41,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { { } }; +static cpumask_t vector_allocation_domain(int cpu) +{ + return cpumask_of_cpu(cpu); +} static int probe_bigsmp(void) { diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 6513d41..28459ca 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -75,4 +75,18 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) } #endif +static cpumask_t vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} + struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c index 8cf5839..71a309b 100644 --- a/arch/x86/mach-generic/numaq.c +++ b/arch/x86/mach-generic/numaq.c @@ -38,4 +38,18 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } +static cpumask_t vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} + struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq); diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c index 6ad6b67..6272b5e 100644 --- a/arch/x86/mach-generic/summit.c +++ b/arch/x86/mach-generic/summit.c @@ -23,4 +23,18 @@ static int probe_summit(void) return 0; } +static cpumask_t vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} + struct genapic apic_summit = APIC_INIT("summit", probe_summit); diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 199a5f4..0f6e8a6 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -1483,7 +1483,7 @@ static void disable_local_vic_irq(unsigned int irq) * the interrupt off to another CPU */ static void before_handle_vic_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + irq_desc_t *desc = irq_to_desc(irq); __u8 cpu = smp_processor_id(); _raw_spin_lock(&vic_irq_lock); @@ -1518,7 +1518,7 @@ static void before_handle_vic_irq(unsigned int irq) /* Finish the VIC interrupt: basically mask */ static void after_handle_vic_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + irq_desc_t *desc = irq_to_desc(irq); _raw_spin_lock(&vic_irq_lock); { diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 635b50e..2c4baa8 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -56,13 +56,6 @@ struct remap_trace { static DEFINE_PER_CPU(struct trap_reason, pf_reason); static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); -#if 0 /* XXX: no way gather this info anymore */ -/* Access to this is not per-cpu. */ -static DEFINE_PER_CPU(atomic_t, dropped); -#endif - -static struct dentry *marker_file; - static DEFINE_MUTEX(mmiotrace_mutex); static DEFINE_SPINLOCK(trace_lock); static atomic_t mmiotrace_enabled; @@ -75,7 +68,7 @@ static LIST_HEAD(trace_list); /* struct remap_trace */ * and trace_lock. * - Routines depending on is_enabled() must take trace_lock. * - trace_list users must hold trace_lock. - * - is_enabled() guarantees that mmio_trace_record is allowed. + * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed. * - pre/post callbacks assume the effect of is_enabled() being true. */ @@ -97,44 +90,6 @@ static bool is_enabled(void) return atomic_read(&mmiotrace_enabled); } -#if 0 /* XXX: needs rewrite */ -/* - * Write callback for the debugfs entry: - * Read a marker and write it to the mmio trace log - */ -static ssize_t write_marker(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - char *event = NULL; - struct mm_io_header *headp; - ssize_t len = (count > 65535) ? 65535 : count; - - event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); - if (!event) - return -ENOMEM; - - headp = (struct mm_io_header *)event; - headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); - headp->data_len = len; - - if (copy_from_user(event + sizeof(*headp), buffer, len)) { - kfree(event); - return -EFAULT; - } - - spin_lock_irq(&trace_lock); -#if 0 /* XXX: convert this to use tracing */ - if (is_enabled()) - relay_write(chan, event, sizeof(*headp) + len); - else -#endif - len = -EINVAL; - spin_unlock_irq(&trace_lock); - kfree(event); - return len; -} -#endif - static void print_pte(unsigned long address) { unsigned int level; @@ -307,8 +262,10 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size, map.map_id = trace->id; spin_lock_irq(&trace_lock); - if (!is_enabled()) + if (!is_enabled()) { + kfree(trace); goto not_enabled; + } mmio_trace_mapping(&map); list_add_tail(&trace->list, &trace_list); @@ -377,6 +334,23 @@ void mmiotrace_iounmap(volatile void __iomem *addr) iounmap_trace_core(addr); } +int mmiotrace_printk(const char *fmt, ...) +{ + int ret = 0; + va_list args; + unsigned long flags; + va_start(args, fmt); + + spin_lock_irqsave(&trace_lock, flags); + if (is_enabled()) + ret = mmio_trace_printk(fmt, args); + spin_unlock_irqrestore(&trace_lock, flags); + + va_end(args); + return ret; +} +EXPORT_SYMBOL(mmiotrace_printk); + static void clear_trace_list(void) { struct remap_trace *trace; @@ -462,26 +436,12 @@ static void leave_uniprocessor(void) } #endif -#if 0 /* XXX: out of order */ -static struct file_operations fops_marker = { - .owner = THIS_MODULE, - .write = write_marker -}; -#endif - void enable_mmiotrace(void) { mutex_lock(&mmiotrace_mutex); if (is_enabled()) goto out; -#if 0 /* XXX: tracing does not support text entries */ - marker_file = debugfs_create_file("marker", 0660, dir, NULL, - &fops_marker); - if (!marker_file) - pr_err(NAME "marker file creation failed.\n"); -#endif - if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); enter_uniprocessor(); @@ -506,11 +466,6 @@ void disable_mmiotrace(void) clear_trace_list(); /* guarantees: no more kmmio callbacks */ leave_uniprocessor(); - if (marker_file) { - debugfs_remove(marker_file); - marker_file = NULL; - } - pr_info(NAME "disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index efa1911..df3d5c8 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c @@ -79,25 +79,34 @@ static unsigned int mw32[] = { 0xC7 }; static unsigned int mw64[] = { 0x89, 0x8B }; #endif /* not __i386__ */ -static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, - int *rexr) +struct prefix_bits { + unsigned shorted:1; + unsigned enlarged:1; + unsigned rexr:1; + unsigned rex:1; +}; + +static int skip_prefix(unsigned char *addr, struct prefix_bits *prf) { int i; unsigned char *p = addr; - *shorted = 0; - *enlarged = 0; - *rexr = 0; + prf->shorted = 0; + prf->enlarged = 0; + prf->rexr = 0; + prf->rex = 0; restart: for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { if (*p == prefix_codes[i]) { if (*p == 0x66) - *shorted = 1; + prf->shorted = 1; #ifdef __amd64__ if ((*p & 0xf8) == 0x48) - *enlarged = 1; + prf->enlarged = 1; if ((*p & 0xf4) == 0x44) - *rexr = 1; + prf->rexr = 1; + if ((*p & 0xf0) == 0x40) + prf->rex = 1; #endif p++; goto restart; @@ -135,12 +144,12 @@ enum reason_type get_ins_type(unsigned long ins_addr) { unsigned int opcode; unsigned char *p; - int shorted, enlarged, rexr; + struct prefix_bits prf; int i; enum reason_type rv = OTHERS; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); CHECK_OP_TYPE(opcode, reg_rop, REG_READ); @@ -156,10 +165,11 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr) { unsigned int opcode; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(rw8); i++) @@ -168,7 +178,7 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr) for (i = 0; i < ARRAY_SIZE(rw32); i++) if (rw32[i] == opcode) - return (shorted ? 2 : (enlarged ? 8 : 4)); + return prf.shorted ? 2 : (prf.enlarged ? 8 : 4); printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); return 0; @@ -178,10 +188,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr) { unsigned int opcode; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(mw8); i++) @@ -194,11 +205,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr) for (i = 0; i < ARRAY_SIZE(mw32); i++) if (mw32[i] == opcode) - return shorted ? 2 : 4; + return prf.shorted ? 2 : 4; for (i = 0; i < ARRAY_SIZE(mw64); i++) if (mw64[i] == opcode) - return shorted ? 2 : (enlarged ? 8 : 4); + return prf.shorted ? 2 : (prf.enlarged ? 8 : 4); printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); return 0; @@ -238,7 +249,7 @@ enum { #endif }; -static unsigned char *get_reg_w8(int no, struct pt_regs *regs) +static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs) { unsigned char *rv = NULL; @@ -255,18 +266,6 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs) case arg_DL: rv = (unsigned char *)®s->dx; break; - case arg_AH: - rv = 1 + (unsigned char *)®s->ax; - break; - case arg_BH: - rv = 1 + (unsigned char *)®s->bx; - break; - case arg_CH: - rv = 1 + (unsigned char *)®s->cx; - break; - case arg_DH: - rv = 1 + (unsigned char *)®s->dx; - break; #ifdef __amd64__ case arg_R8: rv = (unsigned char *)®s->r8; @@ -294,9 +293,55 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs) break; #endif default: - printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); break; } + + if (rv) + return rv; + + if (rex) { + /* + * If REX prefix exists, access low bytes of SI etc. + * instead of AH etc. + */ + switch (no) { + case arg_SI: + rv = (unsigned char *)®s->si; + break; + case arg_DI: + rv = (unsigned char *)®s->di; + break; + case arg_BP: + rv = (unsigned char *)®s->bp; + break; + case arg_SP: + rv = (unsigned char *)®s->sp; + break; + default: + break; + } + } else { + switch (no) { + case arg_AH: + rv = 1 + (unsigned char *)®s->ax; + break; + case arg_BH: + rv = 1 + (unsigned char *)®s->bx; + break; + case arg_CH: + rv = 1 + (unsigned char *)®s->cx; + break; + case arg_DH: + rv = 1 + (unsigned char *)®s->dx; + break; + default: + break; + } + } + + if (!rv) + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + return rv; } @@ -368,11 +413,12 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) unsigned char mod_rm; int reg; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; unsigned long rv; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(reg_rop); i++) if (reg_rop[i] == opcode) { @@ -392,10 +438,10 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) do_work: mod_rm = *p; - reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); + reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); switch (get_ins_reg_width(ins_addr)) { case 1: - return *get_reg_w8(reg, regs); + return *get_reg_w8(reg, prf.rex, regs); case 2: return *(unsigned short *)get_reg_w32(reg, regs); @@ -422,11 +468,12 @@ unsigned long get_ins_imm_val(unsigned long ins_addr) unsigned char mod_rm; unsigned char mod; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; unsigned long rv; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(imm_wop); i++) if (imm_wop[i] == opcode) { diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index d877c5b..ab50a8d 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -3,6 +3,7 @@ */ #include <linux/module.h> #include <linux/io.h> +#include <linux/mmiotrace.h> #define MODULE_NAME "testmmiotrace" @@ -13,6 +14,7 @@ MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); static void do_write_test(void __iomem *p) { unsigned int i; + mmiotrace_printk("Write test.\n"); for (i = 0; i < 256; i++) iowrite8(i, p + i); for (i = 1024; i < (5 * 1024); i += 2) @@ -24,6 +26,7 @@ static void do_write_test(void __iomem *p) static void do_read_test(void __iomem *p) { unsigned int i; + mmiotrace_printk("Read test.\n"); for (i = 0; i < 256; i++) ioread8(p + i); for (i = 1024; i < (5 * 1024); i += 2) @@ -39,6 +42,7 @@ static void do_test(void) pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; } + mmiotrace_printk("ioremap returned %p.\n", p); do_write_test(p); do_read_test(p); iounmap(p); diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 006599d..bf69dbe 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -493,7 +493,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq if (pirq <= 4) irq = read_config_nybble(router, 0x56, pirq - 1); dev_info(&dev->dev, - "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n", + "AMD756: dev [%04x:%04x], router PIRQ %d get IRQ %d\n", dev->vendor, dev->device, pirq, irq); return irq; } @@ -501,7 +501,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { dev_info(&dev->dev, - "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n", + "AMD756: dev [%04x:%04x], router PIRQ %d set IRQ %d\n", dev->vendor, dev->device, pirq, irq); if (pirq <= 4) write_config_nybble(router, 0x56, pirq - 1, irq); @@ -590,13 +590,20 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: - case PCI_DEVICE_ID_INTEL_PCH_0: - case PCI_DEVICE_ID_INTEL_PCH_1: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } + + if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) && + (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) { + r->name = "PIIX/ICH"; + r->get = pirq_piix_get; + r->set = pirq_piix_set; + return 1; + } + return 0; } @@ -823,7 +830,7 @@ static void __init pirq_find_router(struct irq_router *r) r->get = NULL; r->set = NULL; - DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", + DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n", rt->rtr_vendor, rt->rtr_device); pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn); @@ -843,7 +850,7 @@ static void __init pirq_find_router(struct irq_router *r) h->probe(r, pirq_router_dev, pirq_router_dev->device)) break; } - dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n", + dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n", pirq_router.name, pirq_router_dev->vendor, pirq_router_dev->device); diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 28b85ab..bb04260 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -21,7 +21,6 @@ void xen_force_evtchn_callback(void) static void __init __xen_init_IRQ(void) { -#ifdef CONFIG_X86_64 int i; /* Create identity vector->irq map */ @@ -31,7 +30,6 @@ static void __init __xen_init_IRQ(void) for_each_possible_cpu(cpu) per_cpu(vector_irq, cpu)[i] = i; } -#endif /* CONFIG_X86_64 */ xen_init_IRQ(); } diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index dd71e3a..5601506 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -241,7 +241,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ - kstat_this_cpu.irqs[irq]++; + kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); out: raw_local_irq_restore(flags); diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index e8362c1..dcbf1be 100644 --- a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -115,34 +115,32 @@ EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); * (start) dependent operations on their target channel * @tx: transaction with dependencies */ -void -async_tx_run_dependencies(struct dma_async_tx_descriptor *tx) +void async_tx_run_dependencies(struct dma_async_tx_descriptor *tx) { - struct dma_async_tx_descriptor *next = tx->next; + struct dma_async_tx_descriptor *dep = tx->next; + struct dma_async_tx_descriptor *dep_next; struct dma_chan *chan; - if (!next) + if (!dep) return; - tx->next = NULL; - chan = next->chan; + chan = dep->chan; /* keep submitting up until a channel switch is detected * in that case we will be called again as a result of * processing the interrupt from async_tx_channel_switch */ - while (next && next->chan == chan) { - struct dma_async_tx_descriptor *_next; - - spin_lock_bh(&next->lock); - next->parent = NULL; - _next = next->next; - if (_next && _next->chan == chan) - next->next = NULL; - spin_unlock_bh(&next->lock); - - next->tx_submit(next); - next = _next; + for (; dep; dep = dep_next) { + spin_lock_bh(&dep->lock); + dep->parent = NULL; + dep_next = dep->next; + if (dep_next && dep_next->chan == chan) + dep->next = NULL; /* ->next will be submitted */ + else + dep_next = NULL; /* submit current dep and terminate */ + spin_unlock_bh(&dep->lock); + + dep->tx_submit(dep); } chan->device->device_issue_pending(chan); diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c index 31dcd91..dc8d1a9 100644 --- a/drivers/char/agp/ali-agp.c +++ b/drivers/char/agp/ali-agp.c @@ -417,6 +417,6 @@ static void __exit agp_ali_cleanup(void) module_init(agp_ali_init); module_exit(agp_ali_cleanup); -MODULE_AUTHOR("Dave Jones <davej@codemonkey.org.uk>"); +MODULE_AUTHOR("Dave Jones <davej@redhat.com>"); MODULE_LICENSE("GPL and additional rights"); diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 2812ee2..52f4361 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -772,6 +772,6 @@ module_init(agp_amd64_init); module_exit(agp_amd64_cleanup); #endif -MODULE_AUTHOR("Dave Jones <davej@codemonkey.org.uk>, Andi Kleen"); +MODULE_AUTHOR("Dave Jones <davej@redhat.com>, Andi Kleen"); module_param(agp_try_unsupported, bool, 0); MODULE_LICENSE("GPL"); diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c index ae2791b..f1537ee 100644 --- a/drivers/char/agp/ati-agp.c +++ b/drivers/char/agp/ati-agp.c @@ -561,6 +561,6 @@ static void __exit agp_ati_cleanup(void) module_init(agp_ati_init); module_exit(agp_ati_cleanup); -MODULE_AUTHOR("Dave Jones <davej@codemonkey.org.uk>"); +MODULE_AUTHOR("Dave Jones <davej@redhat.com>"); MODULE_LICENSE("GPL and additional rights"); diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 3a3cc03..8c617ad 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -349,7 +349,7 @@ static __init int agp_setup(char *s) __setup("agp=", agp_setup); #endif -MODULE_AUTHOR("Dave Jones <davej@codemonkey.org.uk>"); +MODULE_AUTHOR("Dave Jones <davej@redhat.com>"); MODULE_DESCRIPTION("AGP GART driver"); MODULE_LICENSE("GPL and additional rights"); MODULE_ALIAS_MISCDEV(AGPGART_MINOR); diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index 1108665..9cf6e9b 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -2390,5 +2390,5 @@ static void __exit agp_intel_cleanup(void) module_init(agp_intel_init); module_exit(agp_intel_cleanup); -MODULE_AUTHOR("Dave Jones <davej@codemonkey.org.uk>"); +MODULE_AUTHOR("Dave Jones <davej@redhat.com>"); MODULE_LICENSE("GPL and additional rights"); diff --git a/drivers/char/agp/nvidia-agp.c b/drivers/char/agp/nvidia-agp.c index 5bbed3d..16acee2 100644 --- a/drivers/char/agp/nvidia-agp.c +++ b/drivers/char/agp/nvidia-agp.c @@ -1,7 +1,7 @@ /* * Nvidia AGPGART routines. * Based upon a 2.4 agpgart diff by the folks from NVIDIA, and hacked up - * to work in 2.5 by Dave Jones <davej@codemonkey.org.uk> + * to work in 2.5 by Dave Jones <davej@redhat.com> */ #include <linux/module.h> diff --git a/drivers/char/agp/parisc-agp.c b/drivers/char/agp/parisc-agp.c index f2492ec..db60539 100644 --- a/drivers/char/agp/parisc-agp.c +++ b/drivers/char/agp/parisc-agp.c @@ -20,8 +20,8 @@ #include <linux/agp_backend.h> #include <linux/log2.h> -#include <asm-parisc/parisc-device.h> -#include <asm-parisc/ropes.h> +#include <asm/parisc-device.h> +#include <asm/ropes.h> #include "agp.h" diff --git a/drivers/char/agp/via-agp.c b/drivers/char/agp/via-agp.c index 9f4d49e..d3bd243 100644 --- a/drivers/char/agp/via-agp.c +++ b/drivers/char/agp/via-agp.c @@ -595,4 +595,4 @@ module_init(agp_via_init); module_exit(agp_via_cleanup); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Dave Jones <davej@codemonkey.org.uk>"); +MODULE_AUTHOR("Dave Jones <davej@redhat.com>"); diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index f3cfb4c..408f5f9 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -219,7 +219,7 @@ static void hpet_timer_set_irq(struct hpet_dev *devp) for (irq = find_first_bit(&v, HPET_MAX_IRQ); irq < HPET_MAX_IRQ; irq = find_next_bit(&v, HPET_MAX_IRQ, 1 + irq)) { - if (irq >= NR_IRQS) { + if (irq >= nr_irqs) { irq = HPET_MAX_IRQ; break; } diff --git a/drivers/char/random.c b/drivers/char/random.c index c8752ea..705a839 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -558,9 +558,26 @@ struct timer_rand_state { unsigned dont_count_entropy:1; }; -static struct timer_rand_state input_timer_state; static struct timer_rand_state *irq_timer_state[NR_IRQS]; +static struct timer_rand_state *get_timer_rand_state(unsigned int irq) +{ + if (irq >= nr_irqs) + return NULL; + + return irq_timer_state[irq]; +} + +static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state) +{ + if (irq >= nr_irqs) + return; + + irq_timer_state[irq] = state; +} + +static struct timer_rand_state input_timer_state; + /* * This function adds entropy to the entropy "pool" by using timing * delays. It uses the timer_rand_state structure to make an estimate @@ -648,11 +665,15 @@ EXPORT_SYMBOL_GPL(add_input_randomness); void add_interrupt_randomness(int irq) { - if (irq >= NR_IRQS || irq_timer_state[irq] == NULL) + struct timer_rand_state *state; + + state = get_timer_rand_state(irq); + + if (state == NULL) return; DEBUG_ENT("irq event %d\n", irq); - add_timer_randomness(irq_timer_state[irq], 0x100 + irq); + add_timer_randomness(state, 0x100 + irq); } #ifdef CONFIG_BLOCK @@ -912,7 +933,12 @@ void rand_initialize_irq(int irq) { struct timer_rand_state *state; - if (irq >= NR_IRQS || irq_timer_state[irq]) + if (irq >= nr_irqs) + return; + + state = get_timer_rand_state(irq); + + if (state) return; /* @@ -921,7 +947,7 @@ void rand_initialize_irq(int irq) */ state = kzalloc(sizeof(struct timer_rand_state), GFP_KERNEL); if (state) - irq_timer_state[irq] = state; + set_timer_rand_state(irq, state); } #ifdef CONFIG_BLOCK diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index d0c0d64..ce0d9da 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -168,7 +168,7 @@ static void sysrq_handle_show_timers(int key, struct tty_struct *tty) static struct sysrq_key_op sysrq_show_timers_op = { .handler = sysrq_handle_show_timers, .help_msg = "show-all-timers(Q)", - .action_msg = "Show pending hrtimers (no others)", + .action_msg = "Show clockevent devices & pending hrtimers (no others)", }; static void sysrq_handle_mountro(int key, struct tty_struct *tty) diff --git a/drivers/char/vr41xx_giu.c b/drivers/char/vr41xx_giu.c index ffe9b4e..54c8372 100644 --- a/drivers/char/vr41xx_giu.c +++ b/drivers/char/vr41xx_giu.c @@ -641,7 +641,7 @@ static int __devinit giu_probe(struct platform_device *dev) } irq = platform_get_irq(dev, 0); - if (irq < 0 || irq >= NR_IRQS) + if (irq < 0 || irq >= nr_irqs) return -EBUSY; return cascade_irq(irq, giu_get_irq); diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c index 71d2ac4..c201710 100644 --- a/drivers/clocksource/acpi_pm.c +++ b/drivers/clocksource/acpi_pm.c @@ -237,9 +237,12 @@ static int __init parse_pmtmr(char *arg) if (strict_strtoul(arg, 16, &base)) return -EINVAL; - +#ifdef CONFIG_X86_64 + if (base > UINT_MAX) + return -ERANGE; +#endif printk(KERN_INFO "PMTMR IOPort override: 0x%04x -> 0x%04lx\n", - (unsigned int)pmtmr_ioport, base); + pmtmr_ioport, base); pmtmr_ioport = base; return 1; diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index cd30390..904e575 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -48,13 +48,13 @@ config DW_DMAC can be integrated in chips such as the Atmel AT32ap7000. config FSL_DMA - bool "Freescale MPC85xx/MPC83xx DMA support" - depends on PPC + tristate "Freescale Elo and Elo Plus DMA support" + depends on FSL_SOC select DMA_ENGINE ---help--- - Enable support for the Freescale DMA engine. Now, it support - MPC8560/40, MPC8555, MPC8548 and MPC8641 processors. - The MPC8349, MPC8360 is also supported. + Enable support for the Freescale Elo and Elo Plus DMA controllers. + The Elo is the DMA controller on some 82xx and 83xx parts, and the + Elo Plus is the DMA controller on 85xx and 86xx parts. config MV_XOR bool "Marvell XOR engine support" diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index a08d197..d1e381e 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -325,7 +325,12 @@ static enum dma_state_client dmatest_add_channel(struct dma_chan *chan) struct dmatest_thread *thread; unsigned int i; - dtc = kmalloc(sizeof(struct dmatest_chan), GFP_ATOMIC); + /* Have we already been told about this channel? */ + list_for_each_entry(dtc, &dmatest_channels, node) + if (dtc->chan == chan) + return DMA_DUP; + + dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL); if (!dtc) { pr_warning("dmatest: No memory for %s\n", chan->dev.bus_id); return DMA_NAK; diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index c0059ca..0b95dcc 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -370,7 +370,10 @@ static int fsl_dma_alloc_chan_resources(struct dma_chan *chan, struct dma_client *client) { struct fsl_dma_chan *fsl_chan = to_fsl_chan(chan); - LIST_HEAD(tmp_list); + + /* Has this channel already been allocated? */ + if (fsl_chan->desc_pool) + return 1; /* We need the descriptor to be aligned to 32bytes * for meeting FSL DMA specification requirement. @@ -410,6 +413,8 @@ static void fsl_dma_free_chan_resources(struct dma_chan *chan) } spin_unlock_irqrestore(&fsl_chan->desc_lock, flags); dma_pool_destroy(fsl_chan->desc_pool); + + fsl_chan->desc_pool = NULL; } static struct dma_async_tx_descriptor * @@ -786,159 +791,29 @@ static void dma_do_tasklet(unsigned long data) fsl_chan_ld_cleanup(fsl_chan); } -static void fsl_dma_callback_test(void *param) -{ - struct fsl_dma_chan *fsl_chan = param; - if (fsl_chan) - dev_dbg(fsl_chan->dev, "selftest: callback is ok!\n"); -} - -static int fsl_dma_self_test(struct fsl_dma_chan *fsl_chan) -{ - struct dma_chan *chan; - int err = 0; - dma_addr_t dma_dest, dma_src; - dma_cookie_t cookie; - u8 *src, *dest; - int i; - size_t test_size; - struct dma_async_tx_descriptor *tx1, *tx2, *tx3; - - test_size = 4096; - - src = kmalloc(test_size * 2, GFP_KERNEL); - if (!src) { - dev_err(fsl_chan->dev, - "selftest: Cannot alloc memory for test!\n"); - return -ENOMEM; - } - - dest = src + test_size; - - for (i = 0; i < test_size; i++) - src[i] = (u8) i; - - chan = &fsl_chan->common; - - if (fsl_dma_alloc_chan_resources(chan, NULL) < 1) { - dev_err(fsl_chan->dev, - "selftest: Cannot alloc resources for DMA\n"); - err = -ENODEV; - goto out; - } - - /* TX 1 */ - dma_src = dma_map_single(fsl_chan->dev, src, test_size / 2, - DMA_TO_DEVICE); - dma_dest = dma_map_single(fsl_chan->dev, dest, test_size / 2, - DMA_FROM_DEVICE); - tx1 = fsl_dma_prep_memcpy(chan, dma_dest, dma_src, test_size / 2, 0); - async_tx_ack(tx1); - - cookie = fsl_dma_tx_submit(tx1); - fsl_dma_memcpy_issue_pending(chan); - msleep(2); - - if (fsl_dma_is_complete(chan, cookie, NULL, NULL) != DMA_SUCCESS) { - dev_err(fsl_chan->dev, "selftest: Time out!\n"); - err = -ENODEV; - goto free_resources; - } - - /* Test free and re-alloc channel resources */ - fsl_dma_free_chan_resources(chan); - - if (fsl_dma_alloc_chan_resources(chan, NULL) < 1) { - dev_err(fsl_chan->dev, - "selftest: Cannot alloc resources for DMA\n"); - err = -ENODEV; - goto free_resources; - } - - /* Continue to test - * TX 2 - */ - dma_src = dma_map_single(fsl_chan->dev, src + test_size / 2, - test_size / 4, DMA_TO_DEVICE); - dma_dest = dma_map_single(fsl_chan->dev, dest + test_size / 2, - test_size / 4, DMA_FROM_DEVICE); - tx2 = fsl_dma_prep_memcpy(chan, dma_dest, dma_src, test_size / 4, 0); - async_tx_ack(tx2); - - /* TX 3 */ - dma_src = dma_map_single(fsl_chan->dev, src + test_size * 3 / 4, - test_size / 4, DMA_TO_DEVICE); - dma_dest = dma_map_single(fsl_chan->dev, dest + test_size * 3 / 4, - test_size / 4, DMA_FROM_DEVICE); - tx3 = fsl_dma_prep_memcpy(chan, dma_dest, dma_src, test_size / 4, 0); - async_tx_ack(tx3); - - /* Interrupt tx test */ - tx1 = fsl_dma_prep_interrupt(chan, 0); - async_tx_ack(tx1); - cookie = fsl_dma_tx_submit(tx1); - - /* Test exchanging the prepared tx sort */ - cookie = fsl_dma_tx_submit(tx3); - cookie = fsl_dma_tx_submit(tx2); - - if (dma_has_cap(DMA_INTERRUPT, ((struct fsl_dma_device *) - dev_get_drvdata(fsl_chan->dev->parent))->common.cap_mask)) { - tx3->callback = fsl_dma_callback_test; - tx3->callback_param = fsl_chan; - } - fsl_dma_memcpy_issue_pending(chan); - msleep(2); - - if (fsl_dma_is_complete(chan, cookie, NULL, NULL) != DMA_SUCCESS) { - dev_err(fsl_chan->dev, "selftest: Time out!\n"); - err = -ENODEV; - goto free_resources; - } - - err = memcmp(src, dest, test_size); - if (err) { - for (i = 0; (*(src + i) == *(dest + i)) && (i < test_size); - i++); - dev_err(fsl_chan->dev, "selftest: Test failed, data %d/%ld is " - "error! src 0x%x, dest 0x%x\n", - i, (long)test_size, *(src + i), *(dest + i)); - } - -free_resources: - fsl_dma_free_chan_resources(chan); -out: - kfree(src); - return err; -} - -static int __devinit of_fsl_dma_chan_probe(struct of_device *dev, - const struct of_device_id *match) +static int __devinit fsl_dma_chan_probe(struct fsl_dma_device *fdev, + struct device_node *node, u32 feature, const char *compatible) { - struct fsl_dma_device *fdev; struct fsl_dma_chan *new_fsl_chan; int err; - fdev = dev_get_drvdata(dev->dev.parent); - BUG_ON(!fdev); - /* alloc channel */ new_fsl_chan = kzalloc(sizeof(struct fsl_dma_chan), GFP_KERNEL); if (!new_fsl_chan) { - dev_err(&dev->dev, "No free memory for allocating " + dev_err(fdev->dev, "No free memory for allocating " "dma channels!\n"); return -ENOMEM; } /* get dma channel register base */ - err = of_address_to_resource(dev->node, 0, &new_fsl_chan->reg); + err = of_address_to_resource(node, 0, &new_fsl_chan->reg); if (err) { - dev_err(&dev->dev, "Can't get %s property 'reg'\n", - dev->node->full_name); + dev_err(fdev->dev, "Can't get %s property 'reg'\n", + node->full_name); goto err_no_reg; } - new_fsl_chan->feature = *(u32 *)match->data; + new_fsl_chan->feature = feature; if (!fdev->feature) fdev->feature = new_fsl_chan->feature; @@ -948,13 +823,13 @@ static int __devinit of_fsl_dma_chan_probe(struct of_device *dev, */ WARN_ON(fdev->feature != new_fsl_chan->feature); - new_fsl_chan->dev = &dev->dev; + new_fsl_chan->dev = &new_fsl_chan->common.dev; new_fsl_chan->reg_base = ioremap(new_fsl_chan->reg.start, new_fsl_chan->reg.end - new_fsl_chan->reg.start + 1); new_fsl_chan->id = ((new_fsl_chan->reg.start - 0x100) & 0xfff) >> 7; if (new_fsl_chan->id > FSL_DMA_MAX_CHANS_PER_DEVICE) { - dev_err(&dev->dev, "There is no %d channel!\n", + dev_err(fdev->dev, "There is no %d channel!\n", new_fsl_chan->id); err = -EINVAL; goto err_no_chan; @@ -988,29 +863,23 @@ static int __devinit of_fsl_dma_chan_probe(struct of_device *dev, &fdev->common.channels); fdev->common.chancnt++; - new_fsl_chan->irq = irq_of_parse_and_map(dev->node, 0); + new_fsl_chan->irq = irq_of_parse_and_map(node, 0); if (new_fsl_chan->irq != NO_IRQ) { err = request_irq(new_fsl_chan->irq, &fsl_dma_chan_do_interrupt, IRQF_SHARED, "fsldma-channel", new_fsl_chan); if (err) { - dev_err(&dev->dev, "DMA channel %s request_irq error " - "with return %d\n", dev->node->full_name, err); + dev_err(fdev->dev, "DMA channel %s request_irq error " + "with return %d\n", node->full_name, err); goto err_no_irq; } } - err = fsl_dma_self_test(new_fsl_chan); - if (err) - goto err_self_test; - - dev_info(&dev->dev, "#%d (%s), irq %d\n", new_fsl_chan->id, - match->compatible, new_fsl_chan->irq); + dev_info(fdev->dev, "#%d (%s), irq %d\n", new_fsl_chan->id, + compatible, new_fsl_chan->irq); return 0; -err_self_test: - free_irq(new_fsl_chan->irq, new_fsl_chan); err_no_irq: list_del(&new_fsl_chan->common.device_node); err_no_chan: @@ -1020,38 +889,20 @@ err_no_reg: return err; } -const u32 mpc8540_dma_ip_feature = FSL_DMA_IP_85XX | FSL_DMA_BIG_ENDIAN; -const u32 mpc8349_dma_ip_feature = FSL_DMA_IP_83XX | FSL_DMA_LITTLE_ENDIAN; - -static struct of_device_id of_fsl_dma_chan_ids[] = { - { - .compatible = "fsl,eloplus-dma-channel", - .data = (void *)&mpc8540_dma_ip_feature, - }, - { - .compatible = "fsl,elo-dma-channel", - .data = (void *)&mpc8349_dma_ip_feature, - }, - {} -}; - -static struct of_platform_driver of_fsl_dma_chan_driver = { - .name = "of-fsl-dma-channel", - .match_table = of_fsl_dma_chan_ids, - .probe = of_fsl_dma_chan_probe, -}; - -static __init int of_fsl_dma_chan_init(void) +static void fsl_dma_chan_remove(struct fsl_dma_chan *fchan) { - return of_register_platform_driver(&of_fsl_dma_chan_driver); + free_irq(fchan->irq, fchan); + list_del(&fchan->common.device_node); + iounmap(fchan->reg_base); + kfree(fchan); } static int __devinit of_fsl_dma_probe(struct of_device *dev, const struct of_device_id *match) { int err; - unsigned int irq; struct fsl_dma_device *fdev; + struct device_node *child; fdev = kzalloc(sizeof(struct fsl_dma_device), GFP_KERNEL); if (!fdev) { @@ -1085,9 +936,9 @@ static int __devinit of_fsl_dma_probe(struct of_device *dev, fdev->common.device_issue_pending = fsl_dma_memcpy_issue_pending; fdev->common.dev = &dev->dev; - irq = irq_of_parse_and_map(dev->node, 0); - if (irq != NO_IRQ) { - err = request_irq(irq, &fsl_dma_do_interrupt, IRQF_SHARED, + fdev->irq = irq_of_parse_and_map(dev->node, 0); + if (fdev->irq != NO_IRQ) { + err = request_irq(fdev->irq, &fsl_dma_do_interrupt, IRQF_SHARED, "fsldma-device", fdev); if (err) { dev_err(&dev->dev, "DMA device request_irq error " @@ -1097,7 +948,21 @@ static int __devinit of_fsl_dma_probe(struct of_device *dev, } dev_set_drvdata(&(dev->dev), fdev); - of_platform_bus_probe(dev->node, of_fsl_dma_chan_ids, &dev->dev); + + /* We cannot use of_platform_bus_probe() because there is no + * of_platform_bus_remove. Instead, we manually instantiate every DMA + * channel object. + */ + for_each_child_of_node(dev->node, child) { + if (of_device_is_compatible(child, "fsl,eloplus-dma-channel")) + fsl_dma_chan_probe(fdev, child, + FSL_DMA_IP_85XX | FSL_DMA_BIG_ENDIAN, + "fsl,eloplus-dma-channel"); + if (of_device_is_compatible(child, "fsl,elo-dma-channel")) + fsl_dma_chan_probe(fdev, child, + FSL_DMA_IP_83XX | FSL_DMA_LITTLE_ENDIAN, + "fsl,elo-dma-channel"); + } dma_async_device_register(&fdev->common); return 0; @@ -1109,6 +974,30 @@ err_no_reg: return err; } +static int of_fsl_dma_remove(struct of_device *of_dev) +{ + struct fsl_dma_device *fdev; + unsigned int i; + + fdev = dev_get_drvdata(&of_dev->dev); + + dma_async_device_unregister(&fdev->common); + + for (i = 0; i < FSL_DMA_MAX_CHANS_PER_DEVICE; i++) + if (fdev->chan[i]) + fsl_dma_chan_remove(fdev->chan[i]); + + if (fdev->irq != NO_IRQ) + free_irq(fdev->irq, fdev); + + iounmap(fdev->reg_base); + + kfree(fdev); + dev_set_drvdata(&of_dev->dev, NULL); + + return 0; +} + static struct of_device_id of_fsl_dma_ids[] = { { .compatible = "fsl,eloplus-dma", }, { .compatible = "fsl,elo-dma", }, @@ -1116,15 +1005,32 @@ static struct of_device_id of_fsl_dma_ids[] = { }; static struct of_platform_driver of_fsl_dma_driver = { - .name = "of-fsl-dma", + .name = "fsl-elo-dma", .match_table = of_fsl_dma_ids, .probe = of_fsl_dma_probe, + .remove = of_fsl_dma_remove, }; static __init int of_fsl_dma_init(void) { - return of_register_platform_driver(&of_fsl_dma_driver); + int ret; + + pr_info("Freescale Elo / Elo Plus DMA driver\n"); + + ret = of_register_platform_driver(&of_fsl_dma_driver); + if (ret) + pr_err("fsldma: failed to register platform driver\n"); + + return ret; +} + +static void __exit of_fsl_dma_exit(void) +{ + of_unregister_platform_driver(&of_fsl_dma_driver); } -subsys_initcall(of_fsl_dma_chan_init); subsys_initcall(of_fsl_dma_init); +module_exit(of_fsl_dma_exit); + +MODULE_DESCRIPTION("Freescale Elo / Elo Plus DMA driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h index 6faf07b..4f21a51 100644 --- a/drivers/dma/fsldma.h +++ b/drivers/dma/fsldma.h @@ -114,6 +114,7 @@ struct fsl_dma_device { struct dma_device common; struct fsl_dma_chan *chan[FSL_DMA_MAX_CHANS_PER_DEVICE]; u32 feature; /* The same as DMA channels */ + int irq; /* Channel IRQ */ }; /* Define macros for fsl_dma_chan->feature property */ diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index bc8c6e3..1ef68b3 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -971,11 +971,9 @@ static struct ioat_desc_sw *ioat_dma_get_next_descriptor( switch (ioat_chan->device->version) { case IOAT_VER_1_2: return ioat1_dma_get_next_descriptor(ioat_chan); - break; case IOAT_VER_2_0: case IOAT_VER_3_0: return ioat2_dma_get_next_descriptor(ioat_chan); - break; } return NULL; } diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 22edc42..faa1cc6 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1143,7 +1143,7 @@ static void gpiolib_dbg_show(struct seq_file *s, struct gpio_chip *chip) if (!is_out) { int irq = gpio_to_irq(gpio); - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); /* This races with request_irq(), set_irq_type(), * and set_irq_wake() ... but those are "rare". diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 9097500..a8b33c2 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -6,7 +6,7 @@ # menuconfig DRM tristate "Direct Rendering Manager (XFree86 4.1.0 and higher DRI support)" - depends on (AGP || AGP=n) && PCI && !EMULATED_CMPXCHG && SHMEM + depends on (AGP || AGP=n) && PCI && !EMULATED_CMPXCHG && MMU help Kernel-level support for the Direct Rendering Infrastructure (DRI) introduced in XFree86 4.0. If you say Y here, you need to select diff --git a/drivers/gpu/drm/drm_proc.c b/drivers/gpu/drm/drm_proc.c index d490db4..ae73b7f 100644 --- a/drivers/gpu/drm/drm_proc.c +++ b/drivers/gpu/drm/drm_proc.c @@ -522,12 +522,12 @@ static int drm_gem_one_name_info(int id, void *ptr, void *data) struct drm_gem_object *obj = ptr; struct drm_gem_name_info_data *nid = data; - DRM_INFO("name %d size %d\n", obj->name, obj->size); + DRM_INFO("name %d size %zd\n", obj->name, obj->size); if (nid->eof) return 0; nid->len += sprintf(&nid->buf[nid->len], - "%6d%9d%8d%9d\n", + "%6d %8zd %7d %8d\n", obj->name, obj->size, atomic_read(&obj->handlecount.refcount), atomic_read(&obj->refcount.refcount)); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 9ac73dd..dc2e6fd 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -171,6 +171,37 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data, return 0; } +/* + * Try to write quickly with an atomic kmap. Return true on success. + * + * If this fails (which includes a partial write), we'll redo the whole + * thing with the slow version. + * + * This is a workaround for the low performance of iounmap (approximate + * 10% cpu cost on normal 3D workloads). kmap_atomic on HIGHMEM kernels + * happens to let us map card memory without taking IPIs. When the vmap + * rework lands we should be able to dump this hack. + */ +static inline int fast_user_write(unsigned long pfn, char __user *user_data, + int l, int o) +{ +#ifdef CONFIG_HIGHMEM + unsigned long unwritten; + char *vaddr_atomic; + + vaddr_atomic = kmap_atomic_pfn(pfn, KM_USER0); +#if WATCH_PWRITE + DRM_INFO("pwrite i %d o %d l %d pfn %ld vaddr %p\n", + i, o, l, pfn, vaddr_atomic); +#endif + unwritten = __copy_from_user_inatomic_nocache(vaddr_atomic + o, user_data, l); + kunmap_atomic(vaddr_atomic, KM_USER0); + return !unwritten; +#else + return 0; +#endif +} + static int i915_gem_gtt_pwrite(struct drm_device *dev, struct drm_gem_object *obj, struct drm_i915_gem_pwrite *args, @@ -180,12 +211,7 @@ i915_gem_gtt_pwrite(struct drm_device *dev, struct drm_gem_object *obj, ssize_t remain; loff_t offset; char __user *user_data; - char __iomem *vaddr; - char *vaddr_atomic; - int i, o, l; int ret = 0; - unsigned long pfn; - unsigned long unwritten; user_data = (char __user *) (uintptr_t) args->data_ptr; remain = args->size; @@ -209,6 +235,9 @@ i915_gem_gtt_pwrite(struct drm_device *dev, struct drm_gem_object *obj, obj_priv->dirty = 1; while (remain > 0) { + unsigned long pfn; + int i, o, l; + /* Operation in this page * * i = page number @@ -223,25 +252,10 @@ i915_gem_gtt_pwrite(struct drm_device *dev, struct drm_gem_object *obj, pfn = (dev->agp->base >> PAGE_SHIFT) + i; -#ifdef CONFIG_HIGHMEM - /* This is a workaround for the low performance of iounmap - * (approximate 10% cpu cost on normal 3D workloads). - * kmap_atomic on HIGHMEM kernels happens to let us map card - * memory without taking IPIs. When the vmap rework lands - * we should be able to dump this hack. - */ - vaddr_atomic = kmap_atomic_pfn(pfn, KM_USER0); -#if WATCH_PWRITE - DRM_INFO("pwrite i %d o %d l %d pfn %ld vaddr %p\n", - i, o, l, pfn, vaddr_atomic); -#endif - unwritten = __copy_from_user_inatomic_nocache(vaddr_atomic + o, - user_data, l); - kunmap_atomic(vaddr_atomic, KM_USER0); + if (!fast_user_write(pfn, user_data, l, o)) { + unsigned long unwritten; + char __iomem *vaddr; - if (unwritten) -#endif /* CONFIG_HIGHMEM */ - { vaddr = ioremap_wc(pfn << PAGE_SHIFT, PAGE_SIZE); #if WATCH_PWRITE DRM_INFO("pwrite slow i %d o %d l %d " diff --git a/drivers/i2c/busses/i2c-amd756.c b/drivers/i2c/busses/i2c-amd756.c index 1ea3925..424dad6 100644 --- a/drivers/i2c/busses/i2c-amd756.c +++ b/drivers/i2c/busses/i2c-amd756.c @@ -332,10 +332,6 @@ static int __devinit amd756_probe(struct pci_dev *pdev, int error; u8 temp; - /* driver_data might come from user-space, so check it */ - if (id->driver_data >= ARRAY_SIZE(chipname)) - return -EINVAL; - if (amd756_ioport) { dev_err(&pdev->dev, "Only one device supported " "(you have a strange motherboard, btw)\n"); @@ -412,7 +408,6 @@ static struct pci_driver amd756_driver = { .id_table = amd756_ids, .probe = amd756_probe, .remove = __devexit_p(amd756_remove), - .dynids.use_driver_data = 1, }; static int __init amd756_init(void) diff --git a/drivers/i2c/busses/i2c-viapro.c b/drivers/i2c/busses/i2c-viapro.c index 73dc52e..9f194d9 100644 --- a/drivers/i2c/busses/i2c-viapro.c +++ b/drivers/i2c/busses/i2c-viapro.c @@ -332,10 +332,6 @@ static int __devinit vt596_probe(struct pci_dev *pdev, unsigned char temp; int error = -ENODEV; - /* driver_data might come from user-space, so check it */ - if (id->driver_data & 1 || id->driver_data > 0xff) - return -EINVAL; - /* Determine the address of the SMBus areas */ if (force_addr) { vt596_smba = force_addr & 0xfff0; @@ -483,7 +479,6 @@ static struct pci_driver vt596_driver = { .name = "vt596_smbus", .id_table = vt596_ids, .probe = vt596_probe, - .dynids.use_driver_data = 1, }; static int __init i2c_vt596_init(void) diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index 74a369a..a820ca6 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -84,21 +84,40 @@ config BLK_DEV_IDE_SATA If unsure, say N. -config BLK_DEV_IDEDISK - tristate "Include IDE/ATA-2 DISK support" - ---help--- - This will include enhanced support for MFM/RLL/IDE hard disks. If - you have a MFM/RLL/IDE disk, and there is no special reason to use - the old hard disk driver instead, say Y. If you have an SCSI-only - system, you can say N here. +config IDE_GD + tristate "generic ATA/ATAPI disk support" + default y + help + Support for ATA/ATAPI disks (including ATAPI floppy drives). - To compile this driver as a module, choose M here: the - module will be called ide-disk. - Do not compile this driver as a module if your root file system - (the one containing the directory /) is located on the IDE disk. + To compile this driver as a module, choose M here. + The module will be called ide-gd_mod. + + If unsure, say Y. + +config IDE_GD_ATA + bool "ATA disk support" + depends on IDE_GD + default y + help + This will include support for ATA hard disks. If unsure, say Y. +config IDE_GD_ATAPI + bool "ATAPI floppy support" + depends on IDE_GD + select IDE_ATAPI + help + This will include support for ATAPI floppy drives + (i.e. Iomega ZIP or MKE LS-120). + + For information about jumper settings and the question + of when a ZIP drive uses a partition table, see + <http://www.win.tue.nl/~aeb/linux/zip/zip-1.html>. + + If unsure, say N. + config BLK_DEV_IDECS tristate "PCMCIA IDE support" depends on PCMCIA @@ -163,29 +182,6 @@ config BLK_DEV_IDETAPE To compile this driver as a module, choose M here: the module will be called ide-tape. -config BLK_DEV_IDEFLOPPY - tristate "Include IDE/ATAPI FLOPPY support" - select IDE_ATAPI - ---help--- - If you have an IDE floppy drive which uses the ATAPI protocol, - answer Y. ATAPI is a newer protocol used by IDE CD-ROM/tape/floppy - drives, similar to the SCSI protocol. - - The LS-120 and the IDE/ATAPI Iomega ZIP drive are also supported by - this driver. For information about jumper settings and the question - of when a ZIP drive uses a partition table, see - <http://www.win.tue.nl/~aeb/linux/zip/zip-1.html>. - (ATAPI PD-CD/CDR drives are not supported by this driver; support - for PD-CD/CDR drives is available if you answer Y to - "SCSI emulation support", below). - - If you say Y here, the FLOPPY drive will be identified along with - other IDE devices, as "hdb" or "hdc", or something similar (check - the boot messages with dmesg). - - To compile this driver as a module, choose M here: the - module will be called ide-floppy. - config BLK_DEV_IDESCSI tristate "SCSI emulation support (DEPRECATED)" depends on SCSI @@ -332,7 +328,7 @@ config IDEPCI_PCIBUS_ORDER # TODO: split it on per host driver config options (or module parameters) config BLK_DEV_OFFBOARD bool "Boot off-board chipsets first support (DEPRECATED)" - depends on BLK_DEV_IDEPCI && (BLK_DEV_AEC62XX || BLK_DEV_GENERIC || BLK_DEV_HPT34X || BLK_DEV_HPT366 || BLK_DEV_PDC202XX_NEW || BLK_DEV_PDC202XX_OLD || BLK_DEV_TC86C001) + depends on BLK_DEV_IDEPCI && (BLK_DEV_AEC62XX || BLK_DEV_GENERIC || BLK_DEV_HPT366 || BLK_DEV_PDC202XX_NEW || BLK_DEV_PDC202XX_OLD || BLK_DEV_TC86C001) help Normally, IDE controllers built into the motherboard (on-board controllers) are assigned to ide0 and ide1 while those on add-in PCI @@ -482,28 +478,6 @@ config BLK_DEV_CS5535 It is safe to say Y to this question. -config BLK_DEV_HPT34X - tristate "HPT34X chipset support" - depends on BROKEN - select BLK_DEV_IDEDMA_PCI - help - This driver adds up to 4 more EIDE devices sharing a single - interrupt. The HPT343 chipset in its current form is a non-bootable - controller; the HPT345/HPT363 chipset is a bootable (needs BIOS FIX) - PCI UDMA controllers. This driver requires dynamic tuning of the - chipset during the ide-probe at boot time. It is reported to support - DVD II drives, by the manufacturer. - -config HPT34X_AUTODMA - bool "HPT34X AUTODMA support (EXPERIMENTAL)" - depends on BLK_DEV_HPT34X && EXPERIMENTAL - help - This is a dangerous thing to attempt currently! Please read the - comments at the top of <file:drivers/ide/pci/hpt34x.c>. If you say Y - here, then say Y to "Use DMA by default when available" as well. - - If unsure, say N. - config BLK_DEV_HPT366 tristate "HPT36X/37X chipset support" select BLK_DEV_IDEDMA_PCI diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile index ceaf779..093d324 100644 --- a/drivers/ide/Makefile +++ b/drivers/ide/Makefile @@ -37,18 +37,25 @@ obj-$(CONFIG_IDE_H8300) += h8300/ obj-$(CONFIG_IDE_GENERIC) += ide-generic.o obj-$(CONFIG_BLK_DEV_IDEPNP) += ide-pnp.o -ide-disk_mod-y += ide-disk.o ide-disk_ioctl.o +ide-gd_mod-y += ide-gd.o ide-cd_mod-y += ide-cd.o ide-cd_ioctl.o ide-cd_verbose.o -ide-floppy_mod-y += ide-floppy.o ide-floppy_ioctl.o +ifeq ($(CONFIG_IDE_GD_ATA), y) + ide-gd_mod-y += ide-disk.o ide-disk_ioctl.o ifeq ($(CONFIG_IDE_PROC_FS), y) - ide-disk_mod-y += ide-disk_proc.o - ide-floppy_mod-y += ide-floppy_proc.o + ide-gd_mod-y += ide-disk_proc.o +endif +endif + +ifeq ($(CONFIG_IDE_GD_ATAPI), y) + ide-gd_mod-y += ide-floppy.o ide-floppy_ioctl.o +ifeq ($(CONFIG_IDE_PROC_FS), y) + ide-gd_mod-y += ide-floppy_proc.o +endif endif -obj-$(CONFIG_BLK_DEV_IDEDISK) += ide-disk_mod.o +obj-$(CONFIG_IDE_GD) += ide-gd_mod.o obj-$(CONFIG_BLK_DEV_IDECD) += ide-cd_mod.o -obj-$(CONFIG_BLK_DEV_IDEFLOPPY) += ide-floppy_mod.o obj-$(CONFIG_BLK_DEV_IDETAPE) += ide-tape.o ifeq ($(CONFIG_BLK_DEV_IDECS), y) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 2e30571..4e58b9e 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -191,7 +191,7 @@ int ide_set_media_lock(ide_drive_t *drive, struct gendisk *disk, int on) { struct ide_atapi_pc pc; - if (drive->atapi_flags & IDE_AFLAG_NO_DOORLOCK) + if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) return 0; ide_init_pc(&pc); diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 3308b1c..13265a8 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -99,7 +99,7 @@ static void ide_cd_put(struct cdrom_info *cd) /* Mark that we've seen a media change and invalidate our internal buffers. */ static void cdrom_saw_media_change(ide_drive_t *drive) { - drive->atapi_flags |= IDE_AFLAG_MEDIA_CHANGED; + drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED; drive->atapi_flags &= ~IDE_AFLAG_TOC_VALID; } @@ -340,8 +340,8 @@ static int cdrom_decode_status(ide_drive_t *drive, int good_stat, int *stat_ret) } ide_debug_log(IDE_DBG_RQ, "%s: stat: 0x%x, good_stat: 0x%x, " - "rq->cmd_type: 0x%x, err: 0x%x\n", __func__, stat, - good_stat, rq->cmd_type, err); + "rq->cmd[0]: 0x%x, rq->cmd_type: 0x%x, err: 0x%x\n", + __func__, stat, good_stat, rq->cmd[0], rq->cmd_type, err); if (blk_sense_request(rq)) { /* @@ -843,13 +843,10 @@ static void ide_cd_restore_request(ide_drive_t *drive, struct request *rq) rq->q->prep_rq_fn(rq->q, rq); } -/* - * All other packet commands. - */ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct request *rq) { - - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "Call %s, rq->cmd[0]: 0x%x\n", + __func__, rq->cmd[0]); /* * Some of the trailing request sense fields are optional, @@ -876,7 +873,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, if (!sense) sense = &local_sense; - ide_debug_log(IDE_DBG_PC, "Call %s, rq->cmd[0]: 0x%x, write: 0x%x, " + ide_debug_log(IDE_DBG_PC, "Call %s, cmd[0]: 0x%x, write: 0x%x, " "timeout: %d, cmd_flags: 0x%x\n", __func__, cmd[0], write, timeout, cmd_flags); @@ -1177,8 +1174,9 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) unsigned short sectors_per_frame = queue_hardsect_size(drive->queue) >> SECTOR_BITS; - ide_debug_log(IDE_DBG_RQ, "Call %s, write: 0x%x, secs_per_frame: %u\n", - __func__, write, sectors_per_frame); + ide_debug_log(IDE_DBG_RQ, "Call %s, rq->cmd[0]: 0x%x, write: 0x%x, " + "secs_per_frame: %u\n", + __func__, rq->cmd[0], write, sectors_per_frame); if (write) { /* disk has become write protected */ @@ -1221,7 +1219,8 @@ static ide_startstop_t cdrom_do_newpc_cont(ide_drive_t *drive) static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq) { - ide_debug_log(IDE_DBG_PC, "Call %s, rq->cmd_type: 0x%x\n", __func__, + ide_debug_log(IDE_DBG_PC, "Call %s, rq->cmd[0]: 0x%x, " + "rq->cmd_type: 0x%x\n", __func__, rq->cmd[0], rq->cmd_type); if (blk_pc_request(rq)) @@ -1257,9 +1256,6 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq) } } -/* - * cdrom driver request routine. - */ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, sector_t block) { @@ -1267,8 +1263,10 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, ide_handler_t *fn; int xferlen; - ide_debug_log(IDE_DBG_RQ, "Call %s, rq->cmd_type: 0x%x, block: %llu\n", - __func__, rq->cmd_type, (unsigned long long)block); + ide_debug_log(IDE_DBG_RQ, "Call %s, rq->cmd[0]: 0x%x, " + "rq->cmd_type: 0x%x, block: %llu\n", + __func__, rq->cmd[0], rq->cmd_type, + (unsigned long long)block); if (blk_fs_request(rq)) { if (drive->atapi_flags & IDE_AFLAG_SEEKING) { @@ -1412,6 +1410,10 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity, *capacity = 1 + be32_to_cpu(capbuf.lba); *sectors_per_frame = blocklen >> SECTOR_BITS; + + ide_debug_log(IDE_DBG_PROBE, "%s: cap: %lu, sectors_per_frame: %lu\n", + __func__, *capacity, *sectors_per_frame); + return 0; } @@ -1643,6 +1645,9 @@ void ide_cdrom_update_speed(ide_drive_t *drive, u8 *buf) maxspeed = be16_to_cpup((__be16 *)&buf[8 + 8]); } + ide_debug_log(IDE_DBG_PROBE, "%s: curspeed: %u, maxspeed: %u\n", + __func__, curspeed, maxspeed); + cd->current_speed = (curspeed + (176/2)) / 176; cd->max_speed = (maxspeed + (176/2)) / 176; } @@ -1732,7 +1737,7 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive) return 0; if ((buf[8 + 6] & 0x01) == 0) - drive->atapi_flags |= IDE_AFLAG_NO_DOORLOCK; + drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING; if (buf[8 + 6] & 0x08) drive->atapi_flags &= ~IDE_AFLAG_NO_EJECT; if (buf[8 + 3] & 0x01) @@ -1777,7 +1782,7 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive) if ((cdi->mask & CDC_DVD_R) == 0 || (cdi->mask & CDC_DVD_RAM) == 0) printk(KERN_CONT " DVD%s%s", (cdi->mask & CDC_DVD_R) ? "" : "-R", - (cdi->mask & CDC_DVD_RAM) ? "" : "-RAM"); + (cdi->mask & CDC_DVD_RAM) ? "" : "/RAM"); if ((cdi->mask & CDC_CD_R) == 0 || (cdi->mask & CDC_CD_RW) == 0) printk(KERN_CONT " CD%s%s", @@ -1908,6 +1913,16 @@ static const struct ide_proc_devset idecd_settings[] = { IDE_PROC_DEVSET(dsc_overlap, 0, 1), { 0 }, }; + +static ide_proc_entry_t *ide_cd_proc_entries(ide_drive_t *drive) +{ + return idecd_proc; +} + +static const struct ide_proc_devset *ide_cd_proc_devsets(ide_drive_t *drive) +{ + return idecd_settings; +} #endif static const struct cd_list_entry ide_cd_quirks_list[] = { @@ -1986,8 +2001,8 @@ static int ide_cdrom_setup(ide_drive_t *drive) if (!drive->queue->unplug_delay) drive->queue->unplug_delay = 1; - drive->atapi_flags = IDE_AFLAG_MEDIA_CHANGED | IDE_AFLAG_NO_EJECT | - ide_cd_flags(id); + drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED; + drive->atapi_flags = IDE_AFLAG_NO_EJECT | ide_cd_flags(id); if ((drive->atapi_flags & IDE_AFLAG_VERTOS_300_SSD) && fw_rev[4] == '1' && fw_rev[6] <= '2') @@ -2069,8 +2084,8 @@ static ide_driver_t ide_cdrom_driver = { .end_request = ide_end_request, .error = __ide_error, #ifdef CONFIG_IDE_PROC_FS - .proc = idecd_proc, - .settings = idecd_settings, + .proc_entries = ide_cd_proc_entries, + .proc_devsets = ide_cd_proc_devsets, #endif }; diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 74231b4..df3df00 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -86,8 +86,8 @@ int ide_cdrom_check_media_change_real(struct cdrom_device_info *cdi, if (slot_nr == CDSL_CURRENT) { (void) cdrom_check_status(drive, NULL); - retval = (drive->atapi_flags & IDE_AFLAG_MEDIA_CHANGED) ? 1 : 0; - drive->atapi_flags &= ~IDE_AFLAG_MEDIA_CHANGED; + retval = (drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED) ? 1 : 0; + drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED; return retval; } else { return -EINVAL; @@ -136,7 +136,7 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag, sense = &my_sense; /* If the drive cannot lock the door, just pretend. */ - if (drive->atapi_flags & IDE_AFLAG_NO_DOORLOCK) { + if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) { stat = 0; } else { unsigned char cmd[BLK_MAX_CDB]; @@ -157,7 +157,7 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag, (sense->asc == 0x24 || sense->asc == 0x20)) { printk(KERN_ERR "%s: door locking not supported\n", drive->name); - drive->atapi_flags |= IDE_AFLAG_NO_DOORLOCK; + drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING; stat = 0; } diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 3853bde..223750c 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -14,9 +14,6 @@ * This is the IDE/ATA disk driver, as evolved from hd.c and ide.c. */ -#define IDEDISK_VERSION "1.18" - -#include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> @@ -39,46 +36,8 @@ #include <asm/io.h> #include <asm/div64.h> -#if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT) -#define IDE_DISK_MINORS (1 << PARTN_BITS) -#else -#define IDE_DISK_MINORS 0 -#endif - #include "ide-disk.h" -static DEFINE_MUTEX(idedisk_ref_mutex); - -#define to_ide_disk(obj) container_of(obj, struct ide_disk_obj, kref) - -static void ide_disk_release(struct kref *); - -static struct ide_disk_obj *ide_disk_get(struct gendisk *disk) -{ - struct ide_disk_obj *idkp = NULL; - - mutex_lock(&idedisk_ref_mutex); - idkp = ide_disk_g(disk); - if (idkp) { - if (ide_device_get(idkp->drive)) - idkp = NULL; - else - kref_get(&idkp->kref); - } - mutex_unlock(&idedisk_ref_mutex); - return idkp; -} - -static void ide_disk_put(struct ide_disk_obj *idkp) -{ - ide_drive_t *drive = idkp->drive; - - mutex_lock(&idedisk_ref_mutex); - kref_put(&idkp->kref, ide_disk_release); - ide_device_put(drive); - mutex_unlock(&idedisk_ref_mutex); -} - static const u8 ide_rw_cmds[] = { ATA_CMD_READ_MULTI, ATA_CMD_WRITE_MULTI, @@ -374,7 +333,7 @@ static void idedisk_check_hpa(ide_drive_t *drive) } } -static void init_idedisk_capacity(ide_drive_t *drive) +static int ide_disk_get_capacity(ide_drive_t *drive) { u16 *id = drive->id; int lba; @@ -403,11 +362,28 @@ static void init_idedisk_capacity(ide_drive_t *drive) if (ata_id_hpa_enabled(id)) idedisk_check_hpa(drive); } -} -sector_t ide_disk_capacity(ide_drive_t *drive) -{ - return drive->capacity64; + /* limit drive capacity to 137GB if LBA48 cannot be used */ + if ((drive->dev_flags & IDE_DFLAG_LBA48) == 0 && + drive->capacity64 > 1ULL << 28) { + printk(KERN_WARNING "%s: cannot use LBA48 - full capacity " + "%llu sectors (%llu MB)\n", + drive->name, (unsigned long long)drive->capacity64, + sectors_to_MB(drive->capacity64)); + drive->capacity64 = 1ULL << 28; + } + + if ((drive->hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) && + (drive->dev_flags & IDE_DFLAG_LBA48)) { + if (drive->capacity64 > 1ULL << 28) { + printk(KERN_INFO "%s: cannot use LBA48 DMA - PIO mode" + " will be used for accessing sectors " + "> %u\n", drive->name, 1 << 28); + } else + drive->dev_flags &= ~IDE_DFLAG_LBA48; + } + + return 0; } static void idedisk_prepare_flush(struct request_queue *q, struct request *rq) @@ -508,7 +484,7 @@ static void update_ordered(ide_drive_t *drive) * time we have trimmed the drive capacity if LBA48 is * not available so we don't need to recheck that. */ - capacity = ide_disk_capacity(drive); + capacity = ide_gd_capacity(drive); barrier = ata_id_flush_enabled(id) && (drive->dev_flags & IDE_DFLAG_NOFLUSH) == 0 && ((drive->dev_flags & IDE_DFLAG_LBA48) == 0 || @@ -616,7 +592,12 @@ ide_ext_devset_rw(wcache, wcache); ide_ext_devset_rw_sync(nowerr, nowerr); -static void idedisk_setup(ide_drive_t *drive) +static int ide_disk_check(ide_drive_t *drive, const char *s) +{ + return 1; +} + +static void ide_disk_setup(ide_drive_t *drive) { struct ide_disk_obj *idkp = drive->driver_data; ide_hwif_t *hwif = drive->hwif; @@ -652,33 +633,13 @@ static void idedisk_setup(ide_drive_t *drive) drive->queue->max_sectors / 2); /* calculate drive capacity, and select LBA if possible */ - init_idedisk_capacity(drive); - - /* limit drive capacity to 137GB if LBA48 cannot be used */ - if ((drive->dev_flags & IDE_DFLAG_LBA48) == 0 && - drive->capacity64 > 1ULL << 28) { - printk(KERN_WARNING "%s: cannot use LBA48 - full capacity " - "%llu sectors (%llu MB)\n", - drive->name, (unsigned long long)drive->capacity64, - sectors_to_MB(drive->capacity64)); - drive->capacity64 = 1ULL << 28; - } - - if ((hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) && - (drive->dev_flags & IDE_DFLAG_LBA48)) { - if (drive->capacity64 > 1ULL << 28) { - printk(KERN_INFO "%s: cannot use LBA48 DMA - PIO mode" - " will be used for accessing sectors " - "> %u\n", drive->name, 1 << 28); - } else - drive->dev_flags &= ~IDE_DFLAG_LBA48; - } + ide_disk_get_capacity(drive); /* * if possible, give fdisk access to more of the drive, * by correcting bios_cyls: */ - capacity = ide_disk_capacity(drive); + capacity = ide_gd_capacity(drive); if ((drive->dev_flags & IDE_DFLAG_FORCED_GEOM) == 0) { if (ata_id_lba48_enabled(drive->id)) { @@ -718,9 +679,17 @@ static void idedisk_setup(ide_drive_t *drive) drive->dev_flags |= IDE_DFLAG_WCACHE; set_wcache(drive, 1); + + if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 && + (drive->head == 0 || drive->head > 16)) { + printk(KERN_ERR "%s: invalid geometry: %d physical heads?\n", + drive->name, drive->head); + drive->dev_flags &= ~IDE_DFLAG_ATTACH; + } else + drive->dev_flags |= IDE_DFLAG_ATTACH; } -static void ide_cacheflush_p(ide_drive_t *drive) +static void ide_disk_flush(ide_drive_t *drive) { if (ata_id_flush_enabled(drive->id) == 0 || (drive->dev_flags & IDE_DFLAG_WCACHE) == 0) @@ -730,267 +699,40 @@ static void ide_cacheflush_p(ide_drive_t *drive) printk(KERN_INFO "%s: wcache flush failed!\n", drive->name); } -static void ide_disk_remove(ide_drive_t *drive) -{ - struct ide_disk_obj *idkp = drive->driver_data; - struct gendisk *g = idkp->disk; - - ide_proc_unregister_driver(drive, idkp->driver); - - del_gendisk(g); - - ide_cacheflush_p(drive); - - ide_disk_put(idkp); -} - -static void ide_disk_release(struct kref *kref) -{ - struct ide_disk_obj *idkp = to_ide_disk(kref); - ide_drive_t *drive = idkp->drive; - struct gendisk *g = idkp->disk; - - drive->driver_data = NULL; - g->private_data = NULL; - put_disk(g); - kfree(idkp); -} - -static int ide_disk_probe(ide_drive_t *drive); - -/* - * On HPA drives the capacity needs to be - * reinitilized on resume otherwise the disk - * can not be used and a hard reset is required - */ -static void ide_disk_resume(ide_drive_t *drive) +static int ide_disk_init_media(ide_drive_t *drive, struct gendisk *disk) { - if (ata_id_hpa_enabled(drive->id)) - init_idedisk_capacity(drive); -} - -static void ide_device_shutdown(ide_drive_t *drive) -{ -#ifdef CONFIG_ALPHA - /* On Alpha, halt(8) doesn't actually turn the machine off, - it puts you into the sort of firmware monitor. Typically, - it's used to boot another kernel image, so it's not much - different from reboot(8). Therefore, we don't need to - spin down the disk in this case, especially since Alpha - firmware doesn't handle disks in standby mode properly. - On the other hand, it's reasonably safe to turn the power - off when the shutdown process reaches the firmware prompt, - as the firmware initialization takes rather long time - - at least 10 seconds, which should be sufficient for - the disk to expire its write cache. */ - if (system_state != SYSTEM_POWER_OFF) { -#else - if (system_state == SYSTEM_RESTART) { -#endif - ide_cacheflush_p(drive); - return; - } - - printk(KERN_INFO "Shutdown: %s\n", drive->name); - - drive->gendev.bus->suspend(&drive->gendev, PMSG_SUSPEND); + return 0; } -static ide_driver_t idedisk_driver = { - .gen_driver = { - .owner = THIS_MODULE, - .name = "ide-disk", - .bus = &ide_bus_type, - }, - .probe = ide_disk_probe, - .remove = ide_disk_remove, - .resume = ide_disk_resume, - .shutdown = ide_device_shutdown, - .version = IDEDISK_VERSION, - .do_request = ide_do_rw_disk, - .end_request = ide_end_request, - .error = __ide_error, -#ifdef CONFIG_IDE_PROC_FS - .proc = ide_disk_proc, - .settings = ide_disk_settings, -#endif -}; - -static int idedisk_set_doorlock(ide_drive_t *drive, int on) +static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk, + int on) { ide_task_t task; + int ret; + + if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) + return 0; memset(&task, 0, sizeof(task)); task.tf.command = on ? ATA_CMD_MEDIA_LOCK : ATA_CMD_MEDIA_UNLOCK; task.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; - return ide_no_data_taskfile(drive, &task); -} - -static int idedisk_open(struct inode *inode, struct file *filp) -{ - struct gendisk *disk = inode->i_bdev->bd_disk; - struct ide_disk_obj *idkp; - ide_drive_t *drive; - - idkp = ide_disk_get(disk); - if (idkp == NULL) - return -ENXIO; - - drive = idkp->drive; - - idkp->openers++; - - if ((drive->dev_flags & IDE_DFLAG_REMOVABLE) && idkp->openers == 1) { - check_disk_change(inode->i_bdev); - /* - * Ignore the return code from door_lock, - * since the open() has already succeeded, - * and the door_lock is irrelevant at this point. - */ - if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) && - idedisk_set_doorlock(drive, 1)) - drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING; - } - return 0; -} - -static int idedisk_release(struct inode *inode, struct file *filp) -{ - struct gendisk *disk = inode->i_bdev->bd_disk; - struct ide_disk_obj *idkp = ide_disk_g(disk); - ide_drive_t *drive = idkp->drive; - - if (idkp->openers == 1) - ide_cacheflush_p(drive); - - if ((drive->dev_flags & IDE_DFLAG_REMOVABLE) && idkp->openers == 1) { - if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) && - idedisk_set_doorlock(drive, 0)) - drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING; - } + ret = ide_no_data_taskfile(drive, &task); - idkp->openers--; + if (ret) + drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING; - ide_disk_put(idkp); - - return 0; -} - -static int idedisk_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - struct ide_disk_obj *idkp = ide_disk_g(bdev->bd_disk); - ide_drive_t *drive = idkp->drive; - - geo->heads = drive->bios_head; - geo->sectors = drive->bios_sect; - geo->cylinders = (u16)drive->bios_cyl; /* truncate */ - return 0; + return ret; } -static int idedisk_media_changed(struct gendisk *disk) -{ - struct ide_disk_obj *idkp = ide_disk_g(disk); - ide_drive_t *drive = idkp->drive; - - /* do not scan partitions twice if this is a removable device */ - if (drive->dev_flags & IDE_DFLAG_ATTACH) { - drive->dev_flags &= ~IDE_DFLAG_ATTACH; - return 0; - } - - /* if removable, always assume it was changed */ - return !!(drive->dev_flags & IDE_DFLAG_REMOVABLE); -} - -static int idedisk_revalidate_disk(struct gendisk *disk) -{ - struct ide_disk_obj *idkp = ide_disk_g(disk); - set_capacity(disk, ide_disk_capacity(idkp->drive)); - return 0; -} - -static struct block_device_operations idedisk_ops = { - .owner = THIS_MODULE, - .open = idedisk_open, - .release = idedisk_release, - .ioctl = ide_disk_ioctl, - .getgeo = idedisk_getgeo, - .media_changed = idedisk_media_changed, - .revalidate_disk = idedisk_revalidate_disk +const struct ide_disk_ops ide_ata_disk_ops = { + .check = ide_disk_check, + .get_capacity = ide_disk_get_capacity, + .setup = ide_disk_setup, + .flush = ide_disk_flush, + .init_media = ide_disk_init_media, + .set_doorlock = ide_disk_set_doorlock, + .do_request = ide_do_rw_disk, + .end_request = ide_end_request, + .ioctl = ide_disk_ioctl, }; - -MODULE_DESCRIPTION("ATA DISK Driver"); - -static int ide_disk_probe(ide_drive_t *drive) -{ - struct ide_disk_obj *idkp; - struct gendisk *g; - - /* strstr("foo", "") is non-NULL */ - if (!strstr("ide-disk", drive->driver_req)) - goto failed; - - if (drive->media != ide_disk) - goto failed; - - idkp = kzalloc(sizeof(*idkp), GFP_KERNEL); - if (!idkp) - goto failed; - - g = alloc_disk_node(IDE_DISK_MINORS, hwif_to_node(drive->hwif)); - if (!g) - goto out_free_idkp; - - ide_init_disk(g, drive); - - kref_init(&idkp->kref); - - idkp->drive = drive; - idkp->driver = &idedisk_driver; - idkp->disk = g; - - g->private_data = &idkp->driver; - - drive->driver_data = idkp; - - idedisk_setup(drive); - if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 && - (drive->head == 0 || drive->head > 16)) { - printk(KERN_ERR "%s: INVALID GEOMETRY: %d PHYSICAL HEADS?\n", - drive->name, drive->head); - drive->dev_flags &= ~IDE_DFLAG_ATTACH; - } else - drive->dev_flags |= IDE_DFLAG_ATTACH; - - g->minors = IDE_DISK_MINORS; - g->driverfs_dev = &drive->gendev; - g->flags |= GENHD_FL_EXT_DEVT; - if (drive->dev_flags & IDE_DFLAG_REMOVABLE) - g->flags = GENHD_FL_REMOVABLE; - set_capacity(g, ide_disk_capacity(drive)); - g->fops = &idedisk_ops; - add_disk(g); - return 0; - -out_free_idkp: - kfree(idkp); -failed: - return -ENODEV; -} - -static void __exit idedisk_exit(void) -{ - driver_unregister(&idedisk_driver.gen_driver); -} - -static int __init idedisk_init(void) -{ - return driver_register(&idedisk_driver.gen_driver); -} - -MODULE_ALIAS("ide:*m-disk*"); -MODULE_ALIAS("ide-disk"); -module_init(idedisk_init); -module_exit(idedisk_exit); -MODULE_LICENSE("GPL"); diff --git a/drivers/ide/ide-disk.h b/drivers/ide/ide-disk.h index a82fa43..b234b0f 100644 --- a/drivers/ide/ide-disk.h +++ b/drivers/ide/ide-disk.h @@ -1,19 +1,11 @@ #ifndef __IDE_DISK_H #define __IDE_DISK_H -struct ide_disk_obj { - ide_drive_t *drive; - ide_driver_t *driver; - struct gendisk *disk; - struct kref kref; - unsigned int openers; /* protected by BKL for now */ -}; - -#define ide_disk_g(disk) \ - container_of((disk)->private_data, struct ide_disk_obj, driver) +#include "ide-gd.h" +#ifdef CONFIG_IDE_GD_ATA /* ide-disk.c */ -sector_t ide_disk_capacity(ide_drive_t *); +extern const struct ide_disk_ops ide_ata_disk_ops; ide_decl_devset(address); ide_decl_devset(multcount); ide_decl_devset(nowerr); @@ -21,12 +13,17 @@ ide_decl_devset(wcache); ide_decl_devset(acoustic); /* ide-disk_ioctl.c */ -int ide_disk_ioctl(struct inode *, struct file *, unsigned int, unsigned long); +int ide_disk_ioctl(ide_drive_t *, struct inode *, struct file *, unsigned int, + unsigned long); #ifdef CONFIG_IDE_PROC_FS /* ide-disk_proc.c */ extern ide_proc_entry_t ide_disk_proc[]; extern const struct ide_proc_devset ide_disk_settings[]; #endif +#else +#define ide_disk_proc NULL +#define ide_disk_settings NULL +#endif #endif /* __IDE_DISK_H */ diff --git a/drivers/ide/ide-disk_ioctl.c b/drivers/ide/ide-disk_ioctl.c index a6cf1a0..a49698b 100644 --- a/drivers/ide/ide-disk_ioctl.c +++ b/drivers/ide/ide-disk_ioctl.c @@ -13,12 +13,10 @@ static const struct ide_ioctl_devset ide_disk_ioctl_settings[] = { { 0 } }; -int ide_disk_ioctl(struct inode *inode, struct file *file, +int ide_disk_ioctl(ide_drive_t *drive, struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { struct block_device *bdev = inode->i_bdev; - struct ide_disk_obj *idkp = ide_disk_g(bdev->bd_disk); - ide_drive_t *drive = idkp->drive; int err; err = ide_setting_ioctl(drive, bdev, cmd, arg, ide_disk_ioctl_settings); diff --git a/drivers/ide/ide-disk_proc.c b/drivers/ide/ide-disk_proc.c index 4724976..1146f42 100644 --- a/drivers/ide/ide-disk_proc.c +++ b/drivers/ide/ide-disk_proc.c @@ -56,7 +56,7 @@ static int proc_idedisk_read_capacity ide_drive_t*drive = (ide_drive_t *)data; int len; - len = sprintf(page, "%llu\n", (long long)ide_disk_capacity(drive)); + len = sprintf(page, "%llu\n", (long long)ide_gd_capacity(drive)); PROC_IDE_READ_RETURN(page, start, off, count, eof, len); } diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c index 0903782..cac431f 100644 --- a/drivers/ide/ide-dma-sff.c +++ b/drivers/ide/ide-dma-sff.c @@ -130,7 +130,7 @@ int ide_build_dmatable(ide_drive_t *drive, struct request *rq) xcount = bcount & 0xffff; if (is_trm290) xcount = ((xcount >> 2) - 1) << 16; - if (xcount == 0x0000) { + else if (xcount == 0x0000) { if (count++ >= PRD_ENTRIES) goto use_pio_instead; *table++ = cpu_to_le32(0x8000); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index cf0aa25..aeb1ad7 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -15,12 +15,6 @@ * Documentation/ide/ChangeLog.ide-floppy.1996-2002 */ -#define DRV_NAME "ide-floppy" -#define PFX DRV_NAME ": " - -#define IDEFLOPPY_VERSION "1.00" - -#include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> @@ -49,19 +43,6 @@ #include "ide-floppy.h" -/* module parameters */ -static unsigned long debug_mask; -module_param(debug_mask, ulong, 0644); - -/* define to see debug info */ -#define IDEFLOPPY_DEBUG_LOG 0 - -#if IDEFLOPPY_DEBUG_LOG -#define ide_debug_log(lvl, fmt, args...) __ide_debug_log(lvl, fmt, args) -#else -#define ide_debug_log(lvl, fmt, args...) do {} while (0) -#endif - /* * After each failed packet command we issue a request sense command and retry * the packet command IDEFLOPPY_MAX_PC_RETRIES times. @@ -83,43 +64,13 @@ module_param(debug_mask, ulong, 0644); /* Error code returned in rq->errors to the higher part of the driver. */ #define IDEFLOPPY_ERROR_GENERAL 101 -static DEFINE_MUTEX(idefloppy_ref_mutex); - -static void idefloppy_cleanup_obj(struct kref *); - -static struct ide_floppy_obj *ide_floppy_get(struct gendisk *disk) -{ - struct ide_floppy_obj *floppy = NULL; - - mutex_lock(&idefloppy_ref_mutex); - floppy = ide_drv_g(disk, ide_floppy_obj); - if (floppy) { - if (ide_device_get(floppy->drive)) - floppy = NULL; - else - kref_get(&floppy->kref); - } - mutex_unlock(&idefloppy_ref_mutex); - return floppy; -} - -static void ide_floppy_put(struct ide_floppy_obj *floppy) -{ - ide_drive_t *drive = floppy->drive; - - mutex_lock(&idefloppy_ref_mutex); - kref_put(&floppy->kref, idefloppy_cleanup_obj); - ide_device_put(drive); - mutex_unlock(&idefloppy_ref_mutex); -} - /* * Used to finish servicing a request. For read/write requests, we will call * ide_end_request to pass to the next buffer. */ -static int idefloppy_end_request(ide_drive_t *drive, int uptodate, int nsecs) +static int ide_floppy_end_request(ide_drive_t *drive, int uptodate, int nsecs) { - idefloppy_floppy_t *floppy = drive->driver_data; + struct ide_disk_obj *floppy = drive->driver_data; struct request *rq = HWGROUP(drive)->rq; int error; @@ -161,12 +112,12 @@ static void idefloppy_update_buffers(ide_drive_t *drive, struct bio *bio = rq->bio; while ((bio = rq->bio) != NULL) - idefloppy_end_request(drive, 1, 0); + ide_floppy_end_request(drive, 1, 0); } static void ide_floppy_callback(ide_drive_t *drive, int dsc) { - idefloppy_floppy_t *floppy = drive->driver_data; + struct ide_disk_obj *floppy = drive->driver_data; struct ide_atapi_pc *pc = drive->pc; int uptodate = pc->error ? 0 : 1; @@ -200,10 +151,10 @@ static void ide_floppy_callback(ide_drive_t *drive, int dsc) "Aborting request!\n"); } - idefloppy_end_request(drive, uptodate, 0); + ide_floppy_end_request(drive, uptodate, 0); } -static void ide_floppy_report_error(idefloppy_floppy_t *floppy, +static void ide_floppy_report_error(struct ide_disk_obj *floppy, struct ide_atapi_pc *pc) { /* supress error messages resulting from Medium not present */ @@ -222,7 +173,7 @@ static void ide_floppy_report_error(idefloppy_floppy_t *floppy, static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, struct ide_atapi_pc *pc) { - idefloppy_floppy_t *floppy = drive->driver_data; + struct ide_disk_obj *floppy = drive->driver_data; if (floppy->failed_pc == NULL && pc->c[0] != GPCMD_REQUEST_SENSE) @@ -286,7 +237,7 @@ static void idefloppy_create_rw_cmd(ide_drive_t *drive, struct ide_atapi_pc *pc, struct request *rq, unsigned long sector) { - idefloppy_floppy_t *floppy = drive->driver_data; + struct ide_disk_obj *floppy = drive->driver_data; int block = sector / floppy->bs_factor; int blocks = rq->nr_sectors / floppy->bs_factor; int cmd = rq_data_dir(rq); @@ -310,7 +261,7 @@ static void idefloppy_create_rw_cmd(ide_drive_t *drive, pc->flags |= PC_FLAG_DMA_OK; } -static void idefloppy_blockpc_cmd(idefloppy_floppy_t *floppy, +static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy, struct ide_atapi_pc *pc, struct request *rq) { ide_init_pc(pc); @@ -329,13 +280,12 @@ static void idefloppy_blockpc_cmd(idefloppy_floppy_t *floppy, pc->req_xfer = pc->buf_size = rq->data_len; } -static ide_startstop_t idefloppy_do_request(ide_drive_t *drive, - struct request *rq, sector_t block_s) +static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, + struct request *rq, sector_t block) { - idefloppy_floppy_t *floppy = drive->driver_data; + struct ide_disk_obj *floppy = drive->driver_data; ide_hwif_t *hwif = drive->hwif; struct ide_atapi_pc *pc; - unsigned long block = (unsigned long)block_s; ide_debug_log(IDE_DBG_FUNC, "%s: dev: %s, cmd: 0x%x, cmd_type: %x, " "errors: %d\n", @@ -353,7 +303,7 @@ static ide_startstop_t idefloppy_do_request(ide_drive_t *drive, else printk(KERN_ERR PFX "%s: I/O error\n", drive->name); - idefloppy_end_request(drive, 0, 0); + ide_floppy_end_request(drive, 0, 0); return ide_stopped; } if (blk_fs_request(rq)) { @@ -361,11 +311,11 @@ static ide_startstop_t idefloppy_do_request(ide_drive_t *drive, (rq->nr_sectors % floppy->bs_factor)) { printk(KERN_ERR PFX "%s: unsupported r/w rq size\n", drive->name); - idefloppy_end_request(drive, 0, 0); + ide_floppy_end_request(drive, 0, 0); return ide_stopped; } pc = &floppy->queued_pc; - idefloppy_create_rw_cmd(drive, pc, rq, block); + idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block); } else if (blk_special_request(rq)) { pc = (struct ide_atapi_pc *) rq->buffer; } else if (blk_pc_request(rq)) { @@ -373,7 +323,7 @@ static ide_startstop_t idefloppy_do_request(ide_drive_t *drive, idefloppy_blockpc_cmd(floppy, pc, rq); } else { blk_dump_rq_flags(rq, PFX "unsupported command in queue"); - idefloppy_end_request(drive, 0, 0); + ide_floppy_end_request(drive, 0, 0); return ide_stopped; } @@ -394,7 +344,7 @@ static ide_startstop_t idefloppy_do_request(ide_drive_t *drive, */ static int ide_floppy_get_flexible_disk_page(ide_drive_t *drive) { - idefloppy_floppy_t *floppy = drive->driver_data; + struct ide_disk_obj *floppy = drive->driver_data; struct gendisk *disk = floppy->disk; struct ide_atapi_pc pc; u8 *page; @@ -410,11 +360,11 @@ static int ide_floppy_get_flexible_disk_page(ide_drive_t *drive) } if (pc.buf[3] & 0x80) - drive->atapi_flags |= IDE_AFLAG_WP; + drive->dev_flags |= IDE_DFLAG_WP; else - drive->atapi_flags &= ~IDE_AFLAG_WP; + drive->dev_flags &= ~IDE_DFLAG_WP; - set_disk_ro(disk, !!(drive->atapi_flags & IDE_AFLAG_WP)); + set_disk_ro(disk, !!(drive->dev_flags & IDE_DFLAG_WP)); page = &pc.buf[8]; @@ -445,7 +395,9 @@ static int ide_floppy_get_flexible_disk_page(ide_drive_t *drive) drive->name, lba_capacity, capacity); floppy->blocks = floppy->block_size ? capacity / floppy->block_size : 0; + drive->capacity64 = floppy->blocks * floppy->bs_factor; } + return 0; } @@ -455,7 +407,7 @@ static int ide_floppy_get_flexible_disk_page(ide_drive_t *drive) */ static int ide_floppy_get_capacity(ide_drive_t *drive) { - idefloppy_floppy_t *floppy = drive->driver_data; + struct ide_disk_obj *floppy = drive->driver_data; struct gendisk *disk = floppy->disk; struct ide_atapi_pc pc; u8 *cap_desc; @@ -466,7 +418,7 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) drive->bios_head = drive->bios_sect = 0; floppy->blocks = 0; floppy->bs_factor = 1; - set_capacity(floppy->disk, 0); + drive->capacity64 = 0; ide_floppy_create_read_capacity_cmd(&pc); if (ide_queue_pc_tail(drive, disk, &pc)) { @@ -523,6 +475,8 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) "non 512 bytes block size not " "fully supported\n", drive->name); + drive->capacity64 = + floppy->blocks * floppy->bs_factor; rc = 0; } break; @@ -547,21 +501,12 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) if (!(drive->atapi_flags & IDE_AFLAG_CLIK_DRIVE)) (void) ide_floppy_get_flexible_disk_page(drive); - set_capacity(disk, floppy->blocks * floppy->bs_factor); - return rc; } -sector_t ide_floppy_capacity(ide_drive_t *drive) -{ - idefloppy_floppy_t *floppy = drive->driver_data; - unsigned long capacity = floppy->blocks * floppy->bs_factor; - - return capacity; -} - -static void idefloppy_setup(ide_drive_t *drive, idefloppy_floppy_t *floppy) +static void ide_floppy_setup(ide_drive_t *drive) { + struct ide_disk_obj *floppy = drive->driver_data; u16 *id = drive->id; drive->pc_callback = ide_floppy_callback; @@ -592,252 +537,42 @@ static void idefloppy_setup(ide_drive_t *drive, idefloppy_floppy_t *floppy) blk_queue_max_sectors(drive->queue, 64); drive->atapi_flags |= IDE_AFLAG_CLIK_DRIVE; /* IOMEGA Clik! drives do not support lock/unlock commands */ - drive->atapi_flags |= IDE_AFLAG_NO_DOORLOCK; + drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING; } (void) ide_floppy_get_capacity(drive); ide_proc_register_driver(drive, floppy->driver); -} -static void ide_floppy_remove(ide_drive_t *drive) -{ - idefloppy_floppy_t *floppy = drive->driver_data; - struct gendisk *g = floppy->disk; - - ide_proc_unregister_driver(drive, floppy->driver); - - del_gendisk(g); - - ide_floppy_put(floppy); + drive->dev_flags |= IDE_DFLAG_ATTACH; } -static void idefloppy_cleanup_obj(struct kref *kref) +static void ide_floppy_flush(ide_drive_t *drive) { - struct ide_floppy_obj *floppy = to_ide_drv(kref, ide_floppy_obj); - ide_drive_t *drive = floppy->drive; - struct gendisk *g = floppy->disk; - - drive->driver_data = NULL; - g->private_data = NULL; - put_disk(g); - kfree(floppy); } -static int ide_floppy_probe(ide_drive_t *); - -static ide_driver_t idefloppy_driver = { - .gen_driver = { - .owner = THIS_MODULE, - .name = "ide-floppy", - .bus = &ide_bus_type, - }, - .probe = ide_floppy_probe, - .remove = ide_floppy_remove, - .version = IDEFLOPPY_VERSION, - .do_request = idefloppy_do_request, - .end_request = idefloppy_end_request, - .error = __ide_error, -#ifdef CONFIG_IDE_PROC_FS - .proc = ide_floppy_proc, - .settings = ide_floppy_settings, -#endif -}; - -static int idefloppy_open(struct inode *inode, struct file *filp) +static int ide_floppy_init_media(ide_drive_t *drive, struct gendisk *disk) { - struct gendisk *disk = inode->i_bdev->bd_disk; - struct ide_floppy_obj *floppy; - ide_drive_t *drive; int ret = 0; - floppy = ide_floppy_get(disk); - if (!floppy) - return -ENXIO; - - drive = floppy->drive; - - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); - - floppy->openers++; - - if (floppy->openers == 1) { - drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS; - /* Just in case */ - - if (ide_do_test_unit_ready(drive, disk)) - ide_do_start_stop(drive, disk, 1); - - if (ide_floppy_get_capacity(drive) - && (filp->f_flags & O_NDELAY) == 0 - /* - * Allow O_NDELAY to open a drive without a disk, or with an - * unreadable disk, so that we can get the format capacity - * of the drive or begin the format - Sam - */ - ) { - ret = -EIO; - goto out_put_floppy; - } - - if ((drive->atapi_flags & IDE_AFLAG_WP) && (filp->f_mode & 2)) { - ret = -EROFS; - goto out_put_floppy; - } - - drive->atapi_flags |= IDE_AFLAG_MEDIA_CHANGED; - ide_set_media_lock(drive, disk, 1); - check_disk_change(inode->i_bdev); - } else if (drive->atapi_flags & IDE_AFLAG_FORMAT_IN_PROGRESS) { - ret = -EBUSY; - goto out_put_floppy; - } - return 0; - -out_put_floppy: - floppy->openers--; - ide_floppy_put(floppy); - return ret; -} - -static int idefloppy_release(struct inode *inode, struct file *filp) -{ - struct gendisk *disk = inode->i_bdev->bd_disk; - struct ide_floppy_obj *floppy = ide_drv_g(disk, ide_floppy_obj); - ide_drive_t *drive = floppy->drive; - - ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); - - if (floppy->openers == 1) { - ide_set_media_lock(drive, disk, 0); - drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS; - } - - floppy->openers--; - - ide_floppy_put(floppy); - - return 0; -} - -static int idefloppy_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - struct ide_floppy_obj *floppy = ide_drv_g(bdev->bd_disk, - ide_floppy_obj); - ide_drive_t *drive = floppy->drive; + if (ide_do_test_unit_ready(drive, disk)) + ide_do_start_stop(drive, disk, 1); - geo->heads = drive->bios_head; - geo->sectors = drive->bios_sect; - geo->cylinders = (u16)drive->bios_cyl; /* truncate */ - return 0; -} + ret = ide_floppy_get_capacity(drive); -static int idefloppy_media_changed(struct gendisk *disk) -{ - struct ide_floppy_obj *floppy = ide_drv_g(disk, ide_floppy_obj); - ide_drive_t *drive = floppy->drive; - int ret; + set_capacity(disk, ide_gd_capacity(drive)); - /* do not scan partitions twice if this is a removable device */ - if (drive->dev_flags & IDE_DFLAG_ATTACH) { - drive->dev_flags &= ~IDE_DFLAG_ATTACH; - return 0; - } - ret = !!(drive->atapi_flags & IDE_AFLAG_MEDIA_CHANGED); - drive->atapi_flags &= ~IDE_AFLAG_MEDIA_CHANGED; return ret; } -static int idefloppy_revalidate_disk(struct gendisk *disk) -{ - struct ide_floppy_obj *floppy = ide_drv_g(disk, ide_floppy_obj); - set_capacity(disk, ide_floppy_capacity(floppy->drive)); - return 0; -} - -static struct block_device_operations idefloppy_ops = { - .owner = THIS_MODULE, - .open = idefloppy_open, - .release = idefloppy_release, - .ioctl = ide_floppy_ioctl, - .getgeo = idefloppy_getgeo, - .media_changed = idefloppy_media_changed, - .revalidate_disk = idefloppy_revalidate_disk +const struct ide_disk_ops ide_atapi_disk_ops = { + .check = ide_check_atapi_device, + .get_capacity = ide_floppy_get_capacity, + .setup = ide_floppy_setup, + .flush = ide_floppy_flush, + .init_media = ide_floppy_init_media, + .set_doorlock = ide_set_media_lock, + .do_request = ide_floppy_do_request, + .end_request = ide_floppy_end_request, + .ioctl = ide_floppy_ioctl, }; - -static int ide_floppy_probe(ide_drive_t *drive) -{ - idefloppy_floppy_t *floppy; - struct gendisk *g; - - if (!strstr("ide-floppy", drive->driver_req)) - goto failed; - - if (drive->media != ide_floppy) - goto failed; - - if (!ide_check_atapi_device(drive, DRV_NAME)) { - printk(KERN_ERR PFX "%s: not supported by this version of " - DRV_NAME "\n", drive->name); - goto failed; - } - floppy = kzalloc(sizeof(idefloppy_floppy_t), GFP_KERNEL); - if (!floppy) { - printk(KERN_ERR PFX "%s: Can't allocate a floppy structure\n", - drive->name); - goto failed; - } - - g = alloc_disk(1 << PARTN_BITS); - if (!g) - goto out_free_floppy; - - ide_init_disk(g, drive); - - kref_init(&floppy->kref); - - floppy->drive = drive; - floppy->driver = &idefloppy_driver; - floppy->disk = g; - - g->private_data = &floppy->driver; - - drive->driver_data = floppy; - - drive->debug_mask = debug_mask; - - idefloppy_setup(drive, floppy); - drive->dev_flags |= IDE_DFLAG_ATTACH; - - g->minors = 1 << PARTN_BITS; - g->driverfs_dev = &drive->gendev; - if (drive->dev_flags & IDE_DFLAG_REMOVABLE) - g->flags = GENHD_FL_REMOVABLE; - g->fops = &idefloppy_ops; - add_disk(g); - return 0; - -out_free_floppy: - kfree(floppy); -failed: - return -ENODEV; -} - -static void __exit idefloppy_exit(void) -{ - driver_unregister(&idefloppy_driver.gen_driver); -} - -static int __init idefloppy_init(void) -{ - printk(KERN_INFO DRV_NAME " driver " IDEFLOPPY_VERSION "\n"); - return driver_register(&idefloppy_driver.gen_driver); -} - -MODULE_ALIAS("ide:*m-floppy*"); -MODULE_ALIAS("ide-floppy"); -module_init(idefloppy_init); -module_exit(idefloppy_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("ATAPI FLOPPY Driver"); - diff --git a/drivers/ide/ide-floppy.h b/drivers/ide/ide-floppy.h index 17cf865..c17124d 100644 --- a/drivers/ide/ide-floppy.h +++ b/drivers/ide/ide-floppy.h @@ -1,37 +1,9 @@ #ifndef __IDE_FLOPPY_H #define __IDE_FLOPPY_H -/* - * Most of our global data which we need to save even as we leave the driver - * due to an interrupt or a timer event is stored in a variable of type - * idefloppy_floppy_t, defined below. - */ -typedef struct ide_floppy_obj { - ide_drive_t *drive; - ide_driver_t *driver; - struct gendisk *disk; - struct kref kref; - unsigned int openers; /* protected by BKL for now */ - - /* Last failed packet command */ - struct ide_atapi_pc *failed_pc; - /* used for blk_{fs,pc}_request() requests */ - struct ide_atapi_pc queued_pc; - - /* Last error information */ - u8 sense_key, asc, ascq; - - int progress_indication; - - /* Device information */ - /* Current format */ - int blocks, block_size, bs_factor; - /* Last format capacity descriptor */ - u8 cap_desc[8]; - /* Copy of the flexible disk page */ - u8 flexible_disk_page[32]; -} idefloppy_floppy_t; +#include "ide-gd.h" +#ifdef CONFIG_IDE_GD_ATAPI /* * Pages of the SELECT SENSE / MODE SENSE packet commands. * See SFF-8070i spec. @@ -46,17 +18,22 @@ typedef struct ide_floppy_obj { #define IDEFLOPPY_IOCTL_FORMAT_GET_PROGRESS 0x4603 /* ide-floppy.c */ +extern const struct ide_disk_ops ide_atapi_disk_ops; void ide_floppy_create_mode_sense_cmd(struct ide_atapi_pc *, u8); void ide_floppy_create_read_capacity_cmd(struct ide_atapi_pc *); -sector_t ide_floppy_capacity(ide_drive_t *); /* ide-floppy_ioctl.c */ -int ide_floppy_ioctl(struct inode *, struct file *, unsigned, unsigned long); +int ide_floppy_ioctl(ide_drive_t *, struct inode *, struct file *, unsigned int, + unsigned long); #ifdef CONFIG_IDE_PROC_FS /* ide-floppy_proc.c */ extern ide_proc_entry_t ide_floppy_proc[]; extern const struct ide_proc_devset ide_floppy_settings[]; #endif +#else +#define ide_floppy_proc NULL +#define ide_floppy_settings NULL +#endif #endif /*__IDE_FLOPPY_H */ diff --git a/drivers/ide/ide-floppy_ioctl.c b/drivers/ide/ide-floppy_ioctl.c index a3a7a08..409e4c1 100644 --- a/drivers/ide/ide-floppy_ioctl.c +++ b/drivers/ide/ide-floppy_ioctl.c @@ -33,7 +33,7 @@ static int ide_floppy_get_format_capacities(ide_drive_t *drive, int __user *arg) { - struct ide_floppy_obj *floppy = drive->driver_data; + struct ide_disk_obj *floppy = drive->driver_data; struct ide_atapi_pc pc; u8 header_len, desc_cnt; int i, blocks, length, u_array_size, u_index; @@ -113,7 +113,7 @@ static void ide_floppy_create_format_unit_cmd(struct ide_atapi_pc *pc, int b, static int ide_floppy_get_sfrp_bit(ide_drive_t *drive) { - idefloppy_floppy_t *floppy = drive->driver_data; + struct ide_disk_obj *floppy = drive->driver_data; struct ide_atapi_pc pc; drive->atapi_flags &= ~IDE_AFLAG_SRFP; @@ -132,17 +132,17 @@ static int ide_floppy_get_sfrp_bit(ide_drive_t *drive) static int ide_floppy_format_unit(ide_drive_t *drive, int __user *arg) { - idefloppy_floppy_t *floppy = drive->driver_data; + struct ide_disk_obj *floppy = drive->driver_data; struct ide_atapi_pc pc; int blocks, length, flags, err = 0; if (floppy->openers > 1) { /* Don't format if someone is using the disk */ - drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS; + drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS; return -EBUSY; } - drive->atapi_flags |= IDE_AFLAG_FORMAT_IN_PROGRESS; + drive->dev_flags |= IDE_DFLAG_FORMAT_IN_PROGRESS; /* * Send ATAPI_FORMAT_UNIT to the drive. @@ -174,7 +174,7 @@ static int ide_floppy_format_unit(ide_drive_t *drive, int __user *arg) out: if (err) - drive->atapi_flags &= ~IDE_AFLAG_FORMAT_IN_PROGRESS; + drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS; return err; } @@ -190,7 +190,7 @@ out: static int ide_floppy_get_format_progress(ide_drive_t *drive, int __user *arg) { - idefloppy_floppy_t *floppy = drive->driver_data; + struct ide_disk_obj *floppy = drive->driver_data; struct ide_atapi_pc pc; int progress_indication = 0x10000; @@ -226,7 +226,7 @@ static int ide_floppy_get_format_progress(ide_drive_t *drive, int __user *arg) static int ide_floppy_lockdoor(ide_drive_t *drive, struct ide_atapi_pc *pc, unsigned long arg, unsigned int cmd) { - idefloppy_floppy_t *floppy = drive->driver_data; + struct ide_disk_obj *floppy = drive->driver_data; struct gendisk *disk = floppy->disk; int prevent = (arg && cmd != CDROMEJECT) ? 1 : 0; @@ -260,13 +260,10 @@ static int ide_floppy_format_ioctl(ide_drive_t *drive, struct file *file, } } -int ide_floppy_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) +int ide_floppy_ioctl(ide_drive_t *drive, struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg) { struct block_device *bdev = inode->i_bdev; - struct ide_floppy_obj *floppy = ide_drv_g(bdev->bd_disk, - ide_floppy_obj); - ide_drive_t *drive = floppy->drive; struct ide_atapi_pc pc; void __user *argp = (void __user *)arg; int err; diff --git a/drivers/ide/ide-floppy_proc.c b/drivers/ide/ide-floppy_proc.c index 76f0c6c..3ec762c 100644 --- a/drivers/ide/ide-floppy_proc.c +++ b/drivers/ide/ide-floppy_proc.c @@ -9,7 +9,7 @@ static int proc_idefloppy_read_capacity(char *page, char **start, off_t off, ide_drive_t*drive = (ide_drive_t *)data; int len; - len = sprintf(page, "%llu\n", (long long)ide_floppy_capacity(drive)); + len = sprintf(page, "%llu\n", (long long)ide_gd_capacity(drive)); PROC_IDE_READ_RETURN(page, start, off, count, eof, len); } diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c new file mode 100644 index 0000000..d44898f --- /dev/null +++ b/drivers/ide/ide-gd.c @@ -0,0 +1,398 @@ +#include <linux/module.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/genhd.h> +#include <linux/mutex.h> +#include <linux/ide.h> +#include <linux/hdreg.h> + +#if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT) +#define IDE_DISK_MINORS (1 << PARTN_BITS) +#else +#define IDE_DISK_MINORS 0 +#endif + +#include "ide-disk.h" +#include "ide-floppy.h" + +#define IDE_GD_VERSION "1.18" + +/* module parameters */ +static unsigned long debug_mask; +module_param(debug_mask, ulong, 0644); + +static DEFINE_MUTEX(ide_disk_ref_mutex); + +static void ide_disk_release(struct kref *); + +static struct ide_disk_obj *ide_disk_get(struct gendisk *disk) +{ + struct ide_disk_obj *idkp = NULL; + + mutex_lock(&ide_disk_ref_mutex); + idkp = ide_drv_g(disk, ide_disk_obj); + if (idkp) { + if (ide_device_get(idkp->drive)) + idkp = NULL; + else + kref_get(&idkp->kref); + } + mutex_unlock(&ide_disk_ref_mutex); + return idkp; +} + +static void ide_disk_put(struct ide_disk_obj *idkp) +{ + ide_drive_t *drive = idkp->drive; + + mutex_lock(&ide_disk_ref_mutex); + kref_put(&idkp->kref, ide_disk_release); + ide_device_put(drive); + mutex_unlock(&ide_disk_ref_mutex); +} + +sector_t ide_gd_capacity(ide_drive_t *drive) +{ + return drive->capacity64; +} + +static int ide_gd_probe(ide_drive_t *); + +static void ide_gd_remove(ide_drive_t *drive) +{ + struct ide_disk_obj *idkp = drive->driver_data; + struct gendisk *g = idkp->disk; + + ide_proc_unregister_driver(drive, idkp->driver); + + del_gendisk(g); + + drive->disk_ops->flush(drive); + + ide_disk_put(idkp); +} + +static void ide_disk_release(struct kref *kref) +{ + struct ide_disk_obj *idkp = to_ide_drv(kref, ide_disk_obj); + ide_drive_t *drive = idkp->drive; + struct gendisk *g = idkp->disk; + + drive->disk_ops = NULL; + drive->driver_data = NULL; + g->private_data = NULL; + put_disk(g); + kfree(idkp); +} + +/* + * On HPA drives the capacity needs to be + * reinitilized on resume otherwise the disk + * can not be used and a hard reset is required + */ +static void ide_gd_resume(ide_drive_t *drive) +{ + if (ata_id_hpa_enabled(drive->id)) + (void)drive->disk_ops->get_capacity(drive); +} + +static void ide_gd_shutdown(ide_drive_t *drive) +{ +#ifdef CONFIG_ALPHA + /* On Alpha, halt(8) doesn't actually turn the machine off, + it puts you into the sort of firmware monitor. Typically, + it's used to boot another kernel image, so it's not much + different from reboot(8). Therefore, we don't need to + spin down the disk in this case, especially since Alpha + firmware doesn't handle disks in standby mode properly. + On the other hand, it's reasonably safe to turn the power + off when the shutdown process reaches the firmware prompt, + as the firmware initialization takes rather long time - + at least 10 seconds, which should be sufficient for + the disk to expire its write cache. */ + if (system_state != SYSTEM_POWER_OFF) { +#else + if (system_state == SYSTEM_RESTART) { +#endif + drive->disk_ops->flush(drive); + return; + } + + printk(KERN_INFO "Shutdown: %s\n", drive->name); + + drive->gendev.bus->suspend(&drive->gendev, PMSG_SUSPEND); +} + +#ifdef CONFIG_IDE_PROC_FS +static ide_proc_entry_t *ide_disk_proc_entries(ide_drive_t *drive) +{ + return (drive->media == ide_disk) ? ide_disk_proc : ide_floppy_proc; +} + +static const struct ide_proc_devset *ide_disk_proc_devsets(ide_drive_t *drive) +{ + return (drive->media == ide_disk) ? ide_disk_settings + : ide_floppy_settings; +} +#endif + +static ide_startstop_t ide_gd_do_request(ide_drive_t *drive, + struct request *rq, sector_t sector) +{ + return drive->disk_ops->do_request(drive, rq, sector); +} + +static int ide_gd_end_request(ide_drive_t *drive, int uptodate, int nrsecs) +{ + return drive->disk_ops->end_request(drive, uptodate, nrsecs); +} + +static ide_driver_t ide_gd_driver = { + .gen_driver = { + .owner = THIS_MODULE, + .name = "ide-gd", + .bus = &ide_bus_type, + }, + .probe = ide_gd_probe, + .remove = ide_gd_remove, + .resume = ide_gd_resume, + .shutdown = ide_gd_shutdown, + .version = IDE_GD_VERSION, + .do_request = ide_gd_do_request, + .end_request = ide_gd_end_request, + .error = __ide_error, +#ifdef CONFIG_IDE_PROC_FS + .proc_entries = ide_disk_proc_entries, + .proc_devsets = ide_disk_proc_devsets, +#endif +}; + +static int ide_gd_open(struct inode *inode, struct file *filp) +{ + struct gendisk *disk = inode->i_bdev->bd_disk; + struct ide_disk_obj *idkp; + ide_drive_t *drive; + int ret = 0; + + idkp = ide_disk_get(disk); + if (idkp == NULL) + return -ENXIO; + + drive = idkp->drive; + + ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + + idkp->openers++; + + if ((drive->dev_flags & IDE_DFLAG_REMOVABLE) && idkp->openers == 1) { + drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS; + /* Just in case */ + + ret = drive->disk_ops->init_media(drive, disk); + + /* + * Allow O_NDELAY to open a drive without a disk, or with an + * unreadable disk, so that we can get the format capacity + * of the drive or begin the format - Sam + */ + if (ret && (filp->f_flags & O_NDELAY) == 0) { + ret = -EIO; + goto out_put_idkp; + } + + if ((drive->dev_flags & IDE_DFLAG_WP) && (filp->f_mode & 2)) { + ret = -EROFS; + goto out_put_idkp; + } + + /* + * Ignore the return code from door_lock, + * since the open() has already succeeded, + * and the door_lock is irrelevant at this point. + */ + drive->disk_ops->set_doorlock(drive, disk, 1); + drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED; + check_disk_change(inode->i_bdev); + } else if (drive->dev_flags & IDE_DFLAG_FORMAT_IN_PROGRESS) { + ret = -EBUSY; + goto out_put_idkp; + } + return 0; + +out_put_idkp: + idkp->openers--; + ide_disk_put(idkp); + return ret; +} + +static int ide_gd_release(struct inode *inode, struct file *filp) +{ + struct gendisk *disk = inode->i_bdev->bd_disk; + struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); + ide_drive_t *drive = idkp->drive; + + ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__); + + if (idkp->openers == 1) + drive->disk_ops->flush(drive); + + if ((drive->dev_flags & IDE_DFLAG_REMOVABLE) && idkp->openers == 1) { + drive->disk_ops->set_doorlock(drive, disk, 0); + drive->dev_flags &= ~IDE_DFLAG_FORMAT_IN_PROGRESS; + } + + idkp->openers--; + + ide_disk_put(idkp); + + return 0; +} + +static int ide_gd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct ide_disk_obj *idkp = ide_drv_g(bdev->bd_disk, ide_disk_obj); + ide_drive_t *drive = idkp->drive; + + geo->heads = drive->bios_head; + geo->sectors = drive->bios_sect; + geo->cylinders = (u16)drive->bios_cyl; /* truncate */ + return 0; +} + +static int ide_gd_media_changed(struct gendisk *disk) +{ + struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); + ide_drive_t *drive = idkp->drive; + int ret; + + /* do not scan partitions twice if this is a removable device */ + if (drive->dev_flags & IDE_DFLAG_ATTACH) { + drive->dev_flags &= ~IDE_DFLAG_ATTACH; + return 0; + } + + ret = !!(drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED); + drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED; + + return ret; +} + +static int ide_gd_revalidate_disk(struct gendisk *disk) +{ + struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj); + set_capacity(disk, ide_gd_capacity(idkp->drive)); + return 0; +} + +static int ide_gd_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct block_device *bdev = inode->i_bdev; + struct ide_disk_obj *idkp = ide_drv_g(bdev->bd_disk, ide_disk_obj); + ide_drive_t *drive = idkp->drive; + + return drive->disk_ops->ioctl(drive, inode, file, cmd, arg); +} + +static struct block_device_operations ide_gd_ops = { + .owner = THIS_MODULE, + .open = ide_gd_open, + .release = ide_gd_release, + .ioctl = ide_gd_ioctl, + .getgeo = ide_gd_getgeo, + .media_changed = ide_gd_media_changed, + .revalidate_disk = ide_gd_revalidate_disk +}; + +static int ide_gd_probe(ide_drive_t *drive) +{ + const struct ide_disk_ops *disk_ops = NULL; + struct ide_disk_obj *idkp; + struct gendisk *g; + + /* strstr("foo", "") is non-NULL */ + if (!strstr("ide-gd", drive->driver_req)) + goto failed; + +#ifdef CONFIG_IDE_GD_ATA + if (drive->media == ide_disk) + disk_ops = &ide_ata_disk_ops; +#endif +#ifdef CONFIG_IDE_GD_ATAPI + if (drive->media == ide_floppy) + disk_ops = &ide_atapi_disk_ops; +#endif + if (disk_ops == NULL) + goto failed; + + if (disk_ops->check(drive, DRV_NAME) == 0) { + printk(KERN_ERR PFX "%s: not supported by this driver\n", + drive->name); + goto failed; + } + + idkp = kzalloc(sizeof(*idkp), GFP_KERNEL); + if (!idkp) { + printk(KERN_ERR PFX "%s: can't allocate a disk structure\n", + drive->name); + goto failed; + } + + g = alloc_disk_node(IDE_DISK_MINORS, hwif_to_node(drive->hwif)); + if (!g) + goto out_free_idkp; + + ide_init_disk(g, drive); + + kref_init(&idkp->kref); + + idkp->drive = drive; + idkp->driver = &ide_gd_driver; + idkp->disk = g; + + g->private_data = &idkp->driver; + + drive->driver_data = idkp; + drive->debug_mask = debug_mask; + drive->disk_ops = disk_ops; + + disk_ops->setup(drive); + + set_capacity(g, ide_gd_capacity(drive)); + + g->minors = IDE_DISK_MINORS; + g->driverfs_dev = &drive->gendev; + g->flags |= GENHD_FL_EXT_DEVT; + if (drive->dev_flags & IDE_DFLAG_REMOVABLE) + g->flags = GENHD_FL_REMOVABLE; + g->fops = &ide_gd_ops; + add_disk(g); + return 0; + +out_free_idkp: + kfree(idkp); +failed: + return -ENODEV; +} + +static int __init ide_gd_init(void) +{ + printk(KERN_INFO DRV_NAME " driver " IDE_GD_VERSION "\n"); + return driver_register(&ide_gd_driver.gen_driver); +} + +static void __exit ide_gd_exit(void) +{ + driver_unregister(&ide_gd_driver.gen_driver); +} + +MODULE_ALIAS("ide:*m-disk*"); +MODULE_ALIAS("ide-disk"); +MODULE_ALIAS("ide:*m-floppy*"); +MODULE_ALIAS("ide-floppy"); +module_init(ide_gd_init); +module_exit(ide_gd_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("generic ATA/ATAPI disk driver"); diff --git a/drivers/ide/ide-gd.h b/drivers/ide/ide-gd.h new file mode 100644 index 0000000..7d3d101 --- /dev/null +++ b/drivers/ide/ide-gd.h @@ -0,0 +1,44 @@ +#ifndef __IDE_GD_H +#define __IDE_GD_H + +#define DRV_NAME "ide-gd" +#define PFX DRV_NAME ": " + +/* define to see debug info */ +#define IDE_GD_DEBUG_LOG 0 + +#if IDE_GD_DEBUG_LOG +#define ide_debug_log(lvl, fmt, args...) __ide_debug_log(lvl, fmt, args) +#else +#define ide_debug_log(lvl, fmt, args...) do {} while (0) +#endif + +struct ide_disk_obj { + ide_drive_t *drive; + ide_driver_t *driver; + struct gendisk *disk; + struct kref kref; + unsigned int openers; /* protected by BKL for now */ + + /* Last failed packet command */ + struct ide_atapi_pc *failed_pc; + /* used for blk_{fs,pc}_request() requests */ + struct ide_atapi_pc queued_pc; + + /* Last error information */ + u8 sense_key, asc, ascq; + + int progress_indication; + + /* Device information */ + /* Current format */ + int blocks, block_size, bs_factor; + /* Last format capacity descriptor */ + u8 cap_desc[8]; + /* Copy of the flexible disk page */ + u8 flexible_disk_page[32]; +}; + +sector_t ide_gd_capacity(ide_drive_t *); + +#endif /* __IDE_GD_H */ diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index b762deb..bb7a1ed 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -755,7 +755,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) udelay(1); SELECT_DRIVE(drive); - SELECT_MASK(drive, 0); + SELECT_MASK(drive, 1); udelay(1); tp_ops->set_irq(hwif, 0); diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 19f8c77..1649ea5 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -208,6 +208,7 @@ static inline void do_identify (ide_drive_t *drive, u8 cmd) drive->ready_stat = 0; if (ata_id_cdb_intr(id)) drive->atapi_flags |= IDE_AFLAG_DRQ_INTERRUPT; + drive->dev_flags |= IDE_DFLAG_DOORLOCKING; /* we don't do head unloading on ATAPI devices */ drive->dev_flags |= IDE_DFLAG_NO_UNLOAD; return; diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c index b269264..c31d0dd 100644 --- a/drivers/ide/ide-proc.c +++ b/drivers/ide/ide-proc.c @@ -567,10 +567,10 @@ static void ide_remove_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t void ide_proc_register_driver(ide_drive_t *drive, ide_driver_t *driver) { mutex_lock(&ide_setting_mtx); - drive->settings = driver->settings; + drive->settings = driver->proc_devsets(drive); mutex_unlock(&ide_setting_mtx); - ide_add_proc_entries(drive->proc, driver->proc, drive); + ide_add_proc_entries(drive->proc, driver->proc_entries(drive), drive); } EXPORT_SYMBOL(ide_proc_register_driver); @@ -591,7 +591,7 @@ void ide_proc_unregister_driver(ide_drive_t *drive, ide_driver_t *driver) { unsigned long flags; - ide_remove_proc_entries(drive->proc, driver->proc); + ide_remove_proc_entries(drive->proc, driver->proc_entries(drive)); mutex_lock(&ide_setting_mtx); spin_lock_irqsave(&ide_lock, flags); diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index d879c77..b2b2e5e 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -2108,7 +2108,7 @@ static void idetape_get_mode_sense_results(ide_drive_t *drive) /* device lacks locking support according to capabilities page */ if ((caps[6] & 1) == 0) - drive->atapi_flags |= IDE_AFLAG_NO_DOORLOCK; + drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING; if (caps[7] & 0x02) tape->blk_size = 512; @@ -2298,6 +2298,16 @@ static ide_proc_entry_t idetape_proc[] = { { "name", S_IFREG|S_IRUGO, proc_idetape_read_name, NULL }, { NULL, 0, NULL, NULL } }; + +static ide_proc_entry_t *ide_tape_proc_entries(ide_drive_t *drive) +{ + return idetape_proc; +} + +static const struct ide_proc_devset *ide_tape_proc_devsets(ide_drive_t *drive) +{ + return idetape_settings; +} #endif static int ide_tape_probe(ide_drive_t *); @@ -2315,8 +2325,8 @@ static ide_driver_t idetape_driver = { .end_request = idetape_end_request, .error = __ide_error, #ifdef CONFIG_IDE_PROC_FS - .proc = idetape_proc, - .settings = idetape_settings, + .proc_entries = ide_tape_proc_entries, + .proc_devsets = ide_tape_proc_devsets, #endif }; diff --git a/drivers/ide/pci/Makefile b/drivers/ide/pci/Makefile index 02e6ee7..ab44a1f 100644 --- a/drivers/ide/pci/Makefile +++ b/drivers/ide/pci/Makefile @@ -11,7 +11,6 @@ obj-$(CONFIG_BLK_DEV_CS5535) += cs5535.o obj-$(CONFIG_BLK_DEV_SC1200) += sc1200.o obj-$(CONFIG_BLK_DEV_CY82C693) += cy82c693.o obj-$(CONFIG_BLK_DEV_DELKIN) += delkin_cb.o -obj-$(CONFIG_BLK_DEV_HPT34X) += hpt34x.o obj-$(CONFIG_BLK_DEV_HPT366) += hpt366.o obj-$(CONFIG_BLK_DEV_IT8213) += it8213.o obj-$(CONFIG_BLK_DEV_IT821X) += it821x.o diff --git a/drivers/ide/pci/delkin_cb.c b/drivers/ide/pci/delkin_cb.c index 8689a70..8f1b2d9 100644 --- a/drivers/ide/pci/delkin_cb.c +++ b/drivers/ide/pci/delkin_cb.c @@ -46,10 +46,27 @@ static const struct ide_port_ops delkin_cb_port_ops = { .quirkproc = ide_undecoded_slave, }; +static unsigned int delkin_cb_init_chipset(struct pci_dev *dev) +{ + unsigned long base = pci_resource_start(dev, 0); + int i; + + outb(0x02, base + 0x1e); /* set nIEN to block interrupts */ + inb(base + 0x17); /* read status to clear interrupts */ + + for (i = 0; i < sizeof(setup); ++i) { + if (setup[i]) + outb(setup[i], base + i); + } + + return 0; +} + static const struct ide_port_info delkin_cb_port_info = { .port_ops = &delkin_cb_port_ops, .host_flags = IDE_HFLAG_IO_32BIT | IDE_HFLAG_UNMASK_IRQS | IDE_HFLAG_NO_DMA, + .init_chipset = delkin_cb_init_chipset, }; static int __devinit @@ -57,7 +74,7 @@ delkin_cb_probe (struct pci_dev *dev, const struct pci_device_id *id) { struct ide_host *host; unsigned long base; - int i, rc; + int rc; hw_regs_t hw, *hws[] = { &hw, NULL, NULL, NULL }; rc = pci_enable_device(dev); @@ -72,12 +89,8 @@ delkin_cb_probe (struct pci_dev *dev, const struct pci_device_id *id) return rc; } base = pci_resource_start(dev, 0); - outb(0x02, base + 0x1e); /* set nIEN to block interrupts */ - inb(base + 0x17); /* read status to clear interrupts */ - for (i = 0; i < sizeof(setup); ++i) { - if (setup[i]) - outb(setup[i], base + i); - } + + delkin_cb_init_chipset(dev); memset(&hw, 0, sizeof(hw)); ide_std_init_ports(&hw, base + 0x10, base + 0x1e); @@ -110,6 +123,40 @@ delkin_cb_remove (struct pci_dev *dev) pci_disable_device(dev); } +#ifdef CONFIG_PM +static int delkin_cb_suspend(struct pci_dev *dev, pm_message_t state) +{ + pci_save_state(dev); + pci_disable_device(dev); + pci_set_power_state(dev, pci_choose_state(dev, state)); + + return 0; +} + +static int delkin_cb_resume(struct pci_dev *dev) +{ + struct ide_host *host = pci_get_drvdata(dev); + int rc; + + pci_set_power_state(dev, PCI_D0); + + rc = pci_enable_device(dev); + if (rc) + return rc; + + pci_restore_state(dev); + pci_set_master(dev); + + if (host->init_chipset) + host->init_chipset(dev); + + return 0; +} +#else +#define delkin_cb_suspend NULL +#define delkin_cb_resume NULL +#endif + static struct pci_device_id delkin_cb_pci_tbl[] __devinitdata = { { 0x1145, 0xf021, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, { 0x1145, 0xf024, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, @@ -122,6 +169,8 @@ static struct pci_driver delkin_cb_pci_driver = { .id_table = delkin_cb_pci_tbl, .probe = delkin_cb_probe, .remove = delkin_cb_remove, + .suspend = delkin_cb_suspend, + .resume = delkin_cb_resume, }; static int __init delkin_cb_init(void) diff --git a/drivers/ide/pci/hpt34x.c b/drivers/ide/pci/hpt34x.c deleted file mode 100644 index fb1a3aa..0000000 --- a/drivers/ide/pci/hpt34x.c +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (C) 1998-2000 Andre Hedrick <andre@linux-ide.org> - * - * May be copied or modified under the terms of the GNU General Public License - * - * - * 00:12.0 Unknown mass storage controller: - * Triones Technologies, Inc. - * Unknown device 0003 (rev 01) - * - * hde: UDMA 2 (0x0000 0x0002) (0x0000 0x0010) - * hdf: UDMA 2 (0x0002 0x0012) (0x0010 0x0030) - * hde: DMA 2 (0x0000 0x0002) (0x0000 0x0010) - * hdf: DMA 2 (0x0002 0x0012) (0x0010 0x0030) - * hdg: DMA 1 (0x0012 0x0052) (0x0030 0x0070) - * hdh: DMA 1 (0x0052 0x0252) (0x0070 0x00f0) - * - * ide-pci.c reference - * - * Since there are two cards that report almost identically, - * the only discernable difference is the values reported in pcicmd. - * Booting-BIOS card or HPT363 :: pcicmd == 0x07 - * Non-bootable card or HPT343 :: pcicmd == 0x05 - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/ioport.h> -#include <linux/interrupt.h> -#include <linux/pci.h> -#include <linux/init.h> -#include <linux/ide.h> - -#define DRV_NAME "hpt34x" - -#define HPT343_DEBUG_DRIVE_INFO 0 - -static void hpt34x_set_mode(ide_drive_t *drive, const u8 speed) -{ - struct pci_dev *dev = to_pci_dev(drive->hwif->dev); - u32 reg1= 0, tmp1 = 0, reg2 = 0, tmp2 = 0; - u8 hi_speed, lo_speed; - - hi_speed = speed >> 4; - lo_speed = speed & 0x0f; - - if (hi_speed & 7) { - hi_speed = (hi_speed & 4) ? 0x01 : 0x10; - } else { - lo_speed <<= 5; - lo_speed >>= 5; - } - - pci_read_config_dword(dev, 0x44, ®1); - pci_read_config_dword(dev, 0x48, ®2); - tmp1 = ((lo_speed << (3*drive->dn)) | (reg1 & ~(7 << (3*drive->dn)))); - tmp2 = ((hi_speed << drive->dn) | (reg2 & ~(0x11 << drive->dn))); - pci_write_config_dword(dev, 0x44, tmp1); - pci_write_config_dword(dev, 0x48, tmp2); - -#if HPT343_DEBUG_DRIVE_INFO - printk("%s: %s drive%d (0x%04x 0x%04x) (0x%04x 0x%04x)" \ - " (0x%02x 0x%02x)\n", - drive->name, ide_xfer_verbose(speed), - drive->dn, reg1, tmp1, reg2, tmp2, - hi_speed, lo_speed); -#endif /* HPT343_DEBUG_DRIVE_INFO */ -} - -static void hpt34x_set_pio_mode(ide_drive_t *drive, const u8 pio) -{ - hpt34x_set_mode(drive, XFER_PIO_0 + pio); -} - -/* - * If the BIOS does not set the IO base addaress to XX00, 343 will fail. - */ -#define HPT34X_PCI_INIT_REG 0x80 - -static unsigned int init_chipset_hpt34x(struct pci_dev *dev) -{ - int i = 0; - unsigned long hpt34xIoBase = pci_resource_start(dev, 4); - unsigned long hpt_addr[4] = { 0x20, 0x34, 0x28, 0x3c }; - unsigned long hpt_addr_len[4] = { 7, 3, 7, 3 }; - u16 cmd; - unsigned long flags; - - local_irq_save(flags); - - pci_write_config_byte(dev, HPT34X_PCI_INIT_REG, 0x00); - pci_read_config_word(dev, PCI_COMMAND, &cmd); - - if (cmd & PCI_COMMAND_MEMORY) - pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0xF0); - else - pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0x20); - - /* - * Since 20-23 can be assigned and are R/W, we correct them. - */ - pci_write_config_word(dev, PCI_COMMAND, cmd & ~PCI_COMMAND_IO); - for(i=0; i<4; i++) { - dev->resource[i].start = (hpt34xIoBase + hpt_addr[i]); - dev->resource[i].end = dev->resource[i].start + hpt_addr_len[i]; - dev->resource[i].flags = IORESOURCE_IO; - pci_write_config_dword(dev, - (PCI_BASE_ADDRESS_0 + (i * 4)), - dev->resource[i].start); - } - pci_write_config_word(dev, PCI_COMMAND, cmd); - - local_irq_restore(flags); - - return dev->irq; -} - -static const struct ide_port_ops hpt34x_port_ops = { - .set_pio_mode = hpt34x_set_pio_mode, - .set_dma_mode = hpt34x_set_mode, -}; - -#define IDE_HFLAGS_HPT34X \ - (IDE_HFLAG_NO_ATAPI_DMA | \ - IDE_HFLAG_NO_DSC | \ - IDE_HFLAG_NO_AUTODMA) - -static const struct ide_port_info hpt34x_chipsets[] __devinitdata = { - { /* 0: HPT343 */ - .name = DRV_NAME, - .init_chipset = init_chipset_hpt34x, - .port_ops = &hpt34x_port_ops, - .host_flags = IDE_HFLAGS_HPT34X | IDE_HFLAG_NON_BOOTABLE, - .pio_mask = ATA_PIO5, - }, - { /* 1: HPT345 */ - .name = DRV_NAME, - .init_chipset = init_chipset_hpt34x, - .port_ops = &hpt34x_port_ops, - .host_flags = IDE_HFLAGS_HPT34X | IDE_HFLAG_OFF_BOARD, - .pio_mask = ATA_PIO5, -#ifdef CONFIG_HPT34X_AUTODMA - .swdma_mask = ATA_SWDMA2, - .mwdma_mask = ATA_MWDMA2, - .udma_mask = ATA_UDMA2, -#endif - } -}; - -static int __devinit hpt34x_init_one(struct pci_dev *dev, const struct pci_device_id *id) -{ - const struct ide_port_info *d; - u16 pcicmd = 0; - - pci_read_config_word(dev, PCI_COMMAND, &pcicmd); - - d = &hpt34x_chipsets[(pcicmd & PCI_COMMAND_MEMORY) ? 1 : 0]; - - return ide_pci_init_one(dev, d, NULL); -} - -static const struct pci_device_id hpt34x_pci_tbl[] = { - { PCI_VDEVICE(TTI, PCI_DEVICE_ID_TTI_HPT343), 0 }, - { 0, }, -}; -MODULE_DEVICE_TABLE(pci, hpt34x_pci_tbl); - -static struct pci_driver hpt34x_pci_driver = { - .name = "HPT34x_IDE", - .id_table = hpt34x_pci_tbl, - .probe = hpt34x_init_one, - .remove = ide_pci_remove, - .suspend = ide_pci_suspend, - .resume = ide_pci_resume, -}; - -static int __init hpt34x_ide_init(void) -{ - return ide_pci_register_driver(&hpt34x_pci_driver); -} - -static void __exit hpt34x_ide_exit(void) -{ - pci_unregister_driver(&hpt34x_pci_driver); -} - -module_init(hpt34x_ide_init); -module_exit(hpt34x_ide_exit); - -MODULE_AUTHOR("Andre Hedrick"); -MODULE_DESCRIPTION("PCI driver module for Highpoint 34x IDE"); -MODULE_LICENSE("GPL"); diff --git a/drivers/ide/pci/hpt366.c b/drivers/ide/pci/hpt366.c index 9cf171c..a7909e9 100644 --- a/drivers/ide/pci/hpt366.c +++ b/drivers/ide/pci/hpt366.c @@ -3,7 +3,7 @@ * Portions Copyright (C) 2001 Sun Microsystems, Inc. * Portions Copyright (C) 2003 Red Hat Inc * Portions Copyright (C) 2007 Bartlomiej Zolnierkiewicz - * Portions Copyright (C) 2005-2007 MontaVista Software, Inc. + * Portions Copyright (C) 2005-2008 MontaVista Software, Inc. * * Thanks to HighPoint Technologies for their assistance, and hardware. * Special Thanks to Jon Burchmore in SanDiego for the deep pockets, his @@ -748,26 +748,24 @@ static void hpt3xx_maskproc(ide_drive_t *drive, int mask) struct pci_dev *dev = to_pci_dev(hwif->dev); struct hpt_info *info = hpt3xx_get_info(hwif->dev); - if (drive->quirk_list) { - if (info->chip_type >= HPT370) { - u8 scr1 = 0; - - pci_read_config_byte(dev, 0x5a, &scr1); - if (((scr1 & 0x10) >> 4) != mask) { - if (mask) - scr1 |= 0x10; - else - scr1 &= ~0x10; - pci_write_config_byte(dev, 0x5a, scr1); - } - } else { + if (drive->quirk_list == 0) + return; + + if (info->chip_type >= HPT370) { + u8 scr1 = 0; + + pci_read_config_byte(dev, 0x5a, &scr1); + if (((scr1 & 0x10) >> 4) != mask) { if (mask) - disable_irq(hwif->irq); + scr1 |= 0x10; else - enable_irq (hwif->irq); + scr1 &= ~0x10; + pci_write_config_byte(dev, 0x5a, scr1); } - } else - outb(ATA_DEVCTL_OBS | (mask ? 2 : 0), hwif->io_ports.ctl_addr); + } else if (mask) + disable_irq(hwif->irq); + else + enable_irq(hwif->irq); } /* @@ -1289,7 +1287,6 @@ static u8 hpt3xx_cable_detect(ide_hwif_t *hwif) static void __devinit init_hwif_hpt366(ide_hwif_t *hwif) { - struct pci_dev *dev = to_pci_dev(hwif->dev); struct hpt_info *info = hpt3xx_get_info(hwif->dev); int serialize = HPT_SERIALIZE_IO; u8 chip_type = info->chip_type; diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c index 9ce1d80..49f163a 100644 --- a/drivers/ide/pci/scc_pata.c +++ b/drivers/ide/pci/scc_pata.c @@ -617,7 +617,6 @@ static int __devinit init_setup_scc(struct pci_dev *dev, unsigned long intmask_port; unsigned long mode_port; unsigned long ecmode_port; - unsigned long dma_status_port; u32 reg = 0; struct scc_ports *ports; int rc; @@ -637,7 +636,6 @@ static int __devinit init_setup_scc(struct pci_dev *dev, intmask_port = dma_base + 0x010; mode_port = ctl_base + 0x024; ecmode_port = ctl_base + 0xf00; - dma_status_port = dma_base + 0x004; /* controller initialization */ reg = 0; @@ -843,8 +841,6 @@ static u8 scc_cable_detect(ide_hwif_t *hwif) static void __devinit init_hwif_scc(ide_hwif_t *hwif) { - struct scc_ports *ports = ide_get_hwifdata(hwif); - /* PTERADD */ out_be32((void __iomem *)(hwif->dma_base + 0x018), hwif->dmatable_dma); diff --git a/drivers/ide/pci/sgiioc4.c b/drivers/ide/pci/sgiioc4.c index dd63454..8af9b23 100644 --- a/drivers/ide/pci/sgiioc4.c +++ b/drivers/ide/pci/sgiioc4.c @@ -101,18 +101,8 @@ sgiioc4_init_hwif_ports(hw_regs_t * hw, unsigned long data_port, for (i = 0; i <= 7; i++) hw->io_ports_array[i] = reg + i * 4; - if (ctrl_port) - hw->io_ports.ctl_addr = ctrl_port; - - if (irq_port) - hw->io_ports.irq_addr = irq_port; -} - -static void -sgiioc4_maskproc(ide_drive_t * drive, int mask) -{ - writeb(ATA_DEVCTL_OBS | (mask ? 2 : 0), - (void __iomem *)drive->hwif->io_ports.ctl_addr); + hw->io_ports.ctl_addr = ctrl_port; + hw->io_ports.irq_addr = irq_port; } static int @@ -310,16 +300,14 @@ static u8 sgiioc4_read_status(ide_hwif_t *hwif) unsigned long port = hwif->io_ports.status_addr; u8 reg = (u8) readb((void __iomem *) port); - if ((port & 0xFFF) == 0x11C) { /* Status register of IOC4 */ - if (!(reg & ATA_BUSY)) { /* Not busy... check for interrupt */ - unsigned long other_ir = port - 0x110; - unsigned int intr_reg = (u32) readl((void __iomem *) other_ir); + if (!(reg & ATA_BUSY)) { /* Not busy... check for interrupt */ + unsigned long other_ir = port - 0x110; + unsigned int intr_reg = (u32) readl((void __iomem *) other_ir); - /* Clear the Interrupt, Error bits on the IOC4 */ - if (intr_reg & 0x03) { - writel(0x03, (void __iomem *) other_ir); - intr_reg = (u32) readl((void __iomem *) other_ir); - } + /* Clear the Interrupt, Error bits on the IOC4 */ + if (intr_reg & 0x03) { + writel(0x03, (void __iomem *) other_ir); + intr_reg = (u32) readl((void __iomem *) other_ir); } } @@ -332,13 +320,9 @@ ide_dma_sgiioc4(ide_hwif_t *hwif, const struct ide_port_info *d) { struct pci_dev *dev = to_pci_dev(hwif->dev); unsigned long dma_base = pci_resource_start(dev, 0) + IOC4_DMA_OFFSET; - void __iomem *virt_dma_base; int num_ports = sizeof (ioc4_dma_regs_t); void *pad; - if (dma_base == 0) - return -1; - printk(KERN_INFO " %s: MMIO-DMA\n", hwif->name); if (request_mem_region(dma_base, num_ports, hwif->name) == NULL) { @@ -348,14 +332,8 @@ ide_dma_sgiioc4(ide_hwif_t *hwif, const struct ide_port_info *d) return -1; } - virt_dma_base = ioremap(dma_base, num_ports); - if (virt_dma_base == NULL) { - printk(KERN_ERR "%s(%s) -- ERROR: unable to map addresses " - "0x%lx to 0x%lx\n", __func__, hwif->name, - dma_base, dma_base + num_ports - 1); - goto dma_remap_failure; - } - hwif->dma_base = (unsigned long) virt_dma_base; + hwif->dma_base = (unsigned long)hwif->io_ports.irq_addr + + IOC4_DMA_OFFSET; hwif->sg_max_nents = IOC4_PRD_ENTRIES; @@ -379,9 +357,6 @@ ide_dma_sgiioc4(ide_hwif_t *hwif, const struct ide_port_info *d) printk(KERN_INFO "%s: changing from DMA to PIO mode", hwif->name); dma_pci_alloc_failure: - iounmap(virt_dma_base); - -dma_remap_failure: release_mem_region(dma_base, num_ports); return -1; @@ -563,8 +538,6 @@ static const struct ide_port_ops sgiioc4_port_ops = { .set_dma_mode = sgiioc4_set_dma_mode, /* reset DMA engine, clear IRQs */ .resetproc = sgiioc4_resetproc, - /* mask on/off NIEN register */ - .maskproc = sgiioc4_maskproc, }; static const struct ide_dma_ops sgiioc4_dma_ops = { diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index e3e4042..c7ff1e1 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -179,7 +179,7 @@ config LEDS_TRIGGER_TIMER config LEDS_TRIGGER_IDE_DISK bool "LED IDE Disk Trigger" - depends on LEDS_TRIGGERS && BLK_DEV_IDEDISK + depends on LEDS_TRIGGERS && IDE_GD_ATA help This allows LEDs to be controlled by IDE disk activity. If unsure, say Y. diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c index ba5aa20..e4c0db4 100644 --- a/drivers/mfd/asic3.c +++ b/drivers/mfd/asic3.c @@ -123,7 +123,7 @@ static void asic3_irq_demux(unsigned int irq, struct irq_desc *desc) irqnr = asic->irq_base + (ASIC3_GPIOS_PER_BANK * bank) + i; - desc = irq_desc + irqnr; + desc = irq_to_desc(irqnr); desc->handle_irq(irqnr, desc); if (asic->irq_bothedge[bank] & bit) asic3_irq_flip_edge(asic, base, @@ -136,7 +136,7 @@ static void asic3_irq_demux(unsigned int irq, struct irq_desc *desc) for (i = ASIC3_NUM_GPIOS; i < ASIC3_NR_IRQS; i++) { /* They start at bit 4 and go up */ if (status & (1 << (i - ASIC3_NUM_GPIOS + 4))) { - desc = irq_desc + asic->irq_base + i; + desc = irq_to_desc(asic->irq_base + i); desc->handle_irq(asic->irq_base + i, desc); } diff --git a/drivers/mfd/htc-egpio.c b/drivers/mfd/htc-egpio.c index 50dff6e..1a4d046 100644 --- a/drivers/mfd/htc-egpio.c +++ b/drivers/mfd/htc-egpio.c @@ -112,7 +112,7 @@ static void egpio_handler(unsigned int irq, struct irq_desc *desc) /* Run irq handler */ pr_debug("got IRQ %d\n", irqpin); irq = ei->irq_start + irqpin; - desc = &irq_desc[irq]; + desc = irq_to_desc(irq); desc->handle_irq(irq, desc); } } diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c index 491ee16..9ba295d 100644 --- a/drivers/net/3c59x.c +++ b/drivers/net/3c59x.c @@ -90,7 +90,7 @@ static int vortex_debug = 1; #include <linux/eisa.h> #include <linux/bitops.h> #include <linux/jiffies.h> -#include <asm/irq.h> /* For NR_IRQS only. */ +#include <asm/irq.h> /* For nr_irqs only. */ #include <asm/io.h> #include <asm/uaccess.h> @@ -1221,7 +1221,7 @@ static int __devinit vortex_probe1(struct device *gendev, if (print_info) printk(", IRQ %d\n", dev->irq); /* Tell them about an invalid IRQ. */ - if (dev->irq <= 0 || dev->irq >= NR_IRQS) + if (dev->irq <= 0 || dev->irq >= nr_irqs) printk(KERN_WARNING " *** Warning: IRQ %d is unlikely to work! ***\n", dev->irq); diff --git a/drivers/net/hamradio/baycom_ser_fdx.c b/drivers/net/hamradio/baycom_ser_fdx.c index 17ac697..b6a816e 100644 --- a/drivers/net/hamradio/baycom_ser_fdx.c +++ b/drivers/net/hamradio/baycom_ser_fdx.c @@ -416,10 +416,10 @@ static int ser12_open(struct net_device *dev) if (!dev || !bc) return -ENXIO; if (!dev->base_addr || dev->base_addr > 0xffff-SER12_EXTENT || - dev->irq < 2 || dev->irq > NR_IRQS) { + dev->irq < 2 || dev->irq > nr_irqs) { printk(KERN_INFO "baycom_ser_fdx: invalid portnumber (max %u) " "or irq (2 <= irq <= %d)\n", - 0xffff-SER12_EXTENT, NR_IRQS); + 0xffff-SER12_EXTENT, nr_irqs); return -ENXIO; } if (bc->baud < 300 || bc->baud > 4800) { diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c index 45ae9d1..c17e39b 100644 --- a/drivers/net/hamradio/scc.c +++ b/drivers/net/hamradio/scc.c @@ -1465,7 +1465,7 @@ static void z8530_init(void) printk(KERN_INFO "Init Z8530 driver: %u channels, IRQ", Nchips*2); flag=" "; - for (k = 0; k < NR_IRQS; k++) + for (k = 0; k < nr_irqs; k++) if (Ivec[k].used) { printk("%s%d", flag, k); @@ -1728,7 +1728,7 @@ static int scc_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (hwcfg.irq == 2) hwcfg.irq = 9; - if (hwcfg.irq < 0 || hwcfg.irq >= NR_IRQS) + if (hwcfg.irq < 0 || hwcfg.irq >= nr_irqs) return -EINVAL; if (!Ivec[hwcfg.irq].used && hwcfg.irq) @@ -2148,7 +2148,7 @@ static void __exit scc_cleanup_driver(void) } /* To unload the port must be closed so no real IRQ pending */ - for (k=0; k < NR_IRQS ; k++) + for (k = 0; k < nr_irqs ; k++) if (Ivec[k].used) free_irq(k, NULL); local_irq_enable(); diff --git a/drivers/net/usb/pegasus.c b/drivers/net/usb/pegasus.c index 38b90e7..7914867 100644 --- a/drivers/net/usb/pegasus.c +++ b/drivers/net/usb/pegasus.c @@ -168,7 +168,7 @@ static int get_registers(pegasus_t * pegasus, __u16 indx, __u16 size, netif_device_detach(pegasus->net); if (netif_msg_drv(pegasus) && printk_ratelimit()) dev_err(&pegasus->intf->dev, "%s, status %d\n", - __FUNCTION__, ret); + __func__, ret); goto out; } @@ -192,7 +192,7 @@ static int set_registers(pegasus_t * pegasus, __u16 indx, __u16 size, if (!buffer) { if (netif_msg_drv(pegasus)) dev_warn(&pegasus->intf->dev, "out of memory in %s\n", - __FUNCTION__); + __func__); return -ENOMEM; } memcpy(buffer, data, size); diff --git a/drivers/net/wan/sbni.c b/drivers/net/wan/sbni.c index f972fef..ee51b6a 100644 --- a/drivers/net/wan/sbni.c +++ b/drivers/net/wan/sbni.c @@ -318,7 +318,7 @@ sbni_pci_probe( struct net_device *dev ) continue; } - if( pci_irq_line <= 0 || pci_irq_line >= NR_IRQS ) + if (pci_irq_line <= 0 || pci_irq_line >= nr_irqs) printk( KERN_WARNING " WARNING: The PCI BIOS assigned " "this PCI card to IRQ %d, which is unlikely " "to work!.\n" diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index b30e38f..dcc1e99 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -66,15 +66,8 @@ #undef DEBUG_CCIO_RUN_SG #ifdef CONFIG_PROC_FS -/* - * CCIO_SEARCH_TIME can help measure how fast the bitmap search is. - * impacts performance though - ditch it if you don't use it. - */ -#define CCIO_SEARCH_TIME -#undef CCIO_MAP_STATS -#else -#undef CCIO_SEARCH_TIME -#undef CCIO_MAP_STATS +/* depends on proc fs support. But costs CPU performance. */ +#undef CCIO_COLLECT_STATS #endif #include <linux/proc_fs.h> @@ -239,12 +232,10 @@ struct ioc { u32 res_size; /* size of resource map in bytes */ spinlock_t res_lock; -#ifdef CCIO_SEARCH_TIME +#ifdef CCIO_COLLECT_STATS #define CCIO_SEARCH_SAMPLE 0x100 unsigned long avg_search[CCIO_SEARCH_SAMPLE]; unsigned long avg_idx; /* current index into avg_search */ -#endif -#ifdef CCIO_MAP_STATS unsigned long used_pages; unsigned long msingle_calls; unsigned long msingle_pages; @@ -351,7 +342,7 @@ ccio_alloc_range(struct ioc *ioc, struct device *dev, size_t size) unsigned int pages_needed = size >> IOVP_SHIFT; unsigned int res_idx; unsigned long boundary_size; -#ifdef CCIO_SEARCH_TIME +#ifdef CCIO_COLLECT_STATS unsigned long cr_start = mfctl(16); #endif @@ -406,7 +397,7 @@ resource_found: DBG_RES("%s() res_idx %d res_hint: %d\n", __func__, res_idx, ioc->res_hint); -#ifdef CCIO_SEARCH_TIME +#ifdef CCIO_COLLECT_STATS { unsigned long cr_end = mfctl(16); unsigned long tmp = cr_end - cr_start; @@ -416,7 +407,7 @@ resource_found: ioc->avg_search[ioc->avg_idx++] = cr_start; ioc->avg_idx &= CCIO_SEARCH_SAMPLE - 1; #endif -#ifdef CCIO_MAP_STATS +#ifdef CCIO_COLLECT_STATS ioc->used_pages += pages_needed; #endif /* @@ -452,7 +443,7 @@ ccio_free_range(struct ioc *ioc, dma_addr_t iova, unsigned long pages_mapped) DBG_RES("%s(): res_idx: %d pages_mapped %d\n", __func__, res_idx, pages_mapped); -#ifdef CCIO_MAP_STATS +#ifdef CCIO_COLLECT_STATS ioc->used_pages -= pages_mapped; #endif @@ -764,7 +755,7 @@ ccio_map_single(struct device *dev, void *addr, size_t size, size = ALIGN(size + offset, IOVP_SIZE); spin_lock_irqsave(&ioc->res_lock, flags); -#ifdef CCIO_MAP_STATS +#ifdef CCIO_COLLECT_STATS ioc->msingle_calls++; ioc->msingle_pages += size >> IOVP_SHIFT; #endif @@ -828,7 +819,7 @@ ccio_unmap_single(struct device *dev, dma_addr_t iova, size_t size, spin_lock_irqsave(&ioc->res_lock, flags); -#ifdef CCIO_MAP_STATS +#ifdef CCIO_COLLECT_STATS ioc->usingle_calls++; ioc->usingle_pages += size >> IOVP_SHIFT; #endif @@ -894,7 +885,7 @@ ccio_free_consistent(struct device *dev, size_t size, void *cpu_addr, */ #define PIDE_FLAG 0x80000000UL -#ifdef CCIO_MAP_STATS +#ifdef CCIO_COLLECT_STATS #define IOMMU_MAP_STATS #endif #include "iommu-helpers.h" @@ -938,7 +929,7 @@ ccio_map_sg(struct device *dev, struct scatterlist *sglist, int nents, spin_lock_irqsave(&ioc->res_lock, flags); -#ifdef CCIO_MAP_STATS +#ifdef CCIO_COLLECT_STATS ioc->msg_calls++; #endif @@ -997,13 +988,13 @@ ccio_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, DBG_RUN_SG("%s() START %d entries, %08lx,%x\n", __func__, nents, sg_virt_addr(sglist), sglist->length); -#ifdef CCIO_MAP_STATS +#ifdef CCIO_COLLECT_STATS ioc->usg_calls++; #endif while(sg_dma_len(sglist) && nents--) { -#ifdef CCIO_MAP_STATS +#ifdef CCIO_COLLECT_STATS ioc->usg_pages += sg_dma_len(sglist) >> PAGE_SHIFT; #endif ccio_unmap_single(dev, sg_dma_address(sglist), @@ -1048,7 +1039,7 @@ static int ccio_proc_info(struct seq_file *m, void *p) len += seq_printf(m, "IO PDIR size : %d bytes (%d entries)\n", total_pages * 8, total_pages); -#ifdef CCIO_MAP_STATS +#ifdef CCIO_COLLECT_STATS len += seq_printf(m, "IO PDIR entries : %ld free %ld used (%d%%)\n", total_pages - ioc->used_pages, ioc->used_pages, (int)(ioc->used_pages * 100 / total_pages)); @@ -1057,7 +1048,7 @@ static int ccio_proc_info(struct seq_file *m, void *p) len += seq_printf(m, "Resource bitmap : %d bytes (%d pages)\n", ioc->res_size, total_pages); -#ifdef CCIO_SEARCH_TIME +#ifdef CCIO_COLLECT_STATS min = max = ioc->avg_search[0]; for(j = 0; j < CCIO_SEARCH_SAMPLE; ++j) { avg += ioc->avg_search[j]; @@ -1070,7 +1061,7 @@ static int ccio_proc_info(struct seq_file *m, void *p) len += seq_printf(m, " Bitmap search : %ld/%ld/%ld (min/avg/max CPU Cycles)\n", min, avg, max); #endif -#ifdef CCIO_MAP_STATS +#ifdef CCIO_COLLECT_STATS len += seq_printf(m, "pci_map_single(): %8ld calls %8ld pages (avg %d/1000)\n", ioc->msingle_calls, ioc->msingle_pages, (int)((ioc->msingle_pages * 1000)/ioc->msingle_calls)); @@ -1088,7 +1079,7 @@ static int ccio_proc_info(struct seq_file *m, void *p) len += seq_printf(m, "pci_unmap_sg() : %8ld calls %8ld pages (avg %d/1000)\n\n\n", ioc->usg_calls, ioc->usg_pages, (int)((ioc->usg_pages * 1000)/ioc->usg_calls)); -#endif /* CCIO_MAP_STATS */ +#endif /* CCIO_COLLECT_STATS */ ioc = ioc->next; } diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c index fd56128..3bc54b3 100644 --- a/drivers/parisc/dino.c +++ b/drivers/parisc/dino.c @@ -298,7 +298,8 @@ struct pci_port_ops dino_port_ops = { static void dino_disable_irq(unsigned int irq) { - struct dino_device *dino_dev = irq_desc[irq].chip_data; + struct irq_desc *desc = irq_to_desc(irq); + struct dino_device *dino_dev = desc->chip_data; int local_irq = gsc_find_local_irq(irq, dino_dev->global_irq, DINO_LOCAL_IRQS); DBG(KERN_WARNING "%s(0x%p, %d)\n", __func__, dino_dev, irq); @@ -310,7 +311,8 @@ static void dino_disable_irq(unsigned int irq) static void dino_enable_irq(unsigned int irq) { - struct dino_device *dino_dev = irq_desc[irq].chip_data; + struct irq_desc *desc = irq_to_desc(irq); + struct dino_device *dino_dev = desc->chip_data; int local_irq = gsc_find_local_irq(irq, dino_dev->global_irq, DINO_LOCAL_IRQS); u32 tmp; diff --git a/drivers/parisc/eisa.c b/drivers/parisc/eisa.c index 771cef5..7891db5 100644 --- a/drivers/parisc/eisa.c +++ b/drivers/parisc/eisa.c @@ -346,10 +346,10 @@ static int __init eisa_probe(struct parisc_device *dev) } /* Reserve IRQ2 */ - irq_desc[2].action = &irq2_action; + irq_to_desc(2)->action = &irq2_action; for (i = 0; i < 16; i++) { - irq_desc[i].chip = &eisa_interrupt_type; + irq_to_desc(i)->chip = &eisa_interrupt_type; } EISA_bus = 1; diff --git a/drivers/parisc/gsc.c b/drivers/parisc/gsc.c index f7d088b..e76db9e 100644 --- a/drivers/parisc/gsc.c +++ b/drivers/parisc/gsc.c @@ -108,7 +108,8 @@ int gsc_find_local_irq(unsigned int irq, int *global_irqs, int limit) static void gsc_asic_disable_irq(unsigned int irq) { - struct gsc_asic *irq_dev = irq_desc[irq].chip_data; + struct irq_desc *desc = irq_to_desc(irq); + struct gsc_asic *irq_dev = desc->chip_data; int local_irq = gsc_find_local_irq(irq, irq_dev->global_irq, 32); u32 imr; @@ -123,7 +124,8 @@ static void gsc_asic_disable_irq(unsigned int irq) static void gsc_asic_enable_irq(unsigned int irq) { - struct gsc_asic *irq_dev = irq_desc[irq].chip_data; + struct irq_desc *desc = irq_to_desc(irq); + struct gsc_asic *irq_dev = desc->chip_data; int local_irq = gsc_find_local_irq(irq, irq_dev->global_irq, 32); u32 imr; @@ -159,12 +161,14 @@ static struct hw_interrupt_type gsc_asic_interrupt_type = { int gsc_assign_irq(struct hw_interrupt_type *type, void *data) { static int irq = GSC_IRQ_BASE; + struct irq_desc *desc; if (irq > GSC_IRQ_MAX) return NO_IRQ; - irq_desc[irq].chip = type; - irq_desc[irq].chip_data = data; + desc = irq_to_desc(irq); + desc->chip = type; + desc->chip_data = data; return irq++; } diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c index 6fb3f79..7beffca 100644 --- a/drivers/parisc/iosapic.c +++ b/drivers/parisc/iosapic.c @@ -619,7 +619,9 @@ iosapic_set_irt_data( struct vector_info *vi, u32 *dp0, u32 *dp1) static struct vector_info *iosapic_get_vector(unsigned int irq) { - return irq_desc[irq].chip_data; + struct irq_desc *desc = irq_to_desc(irq); + + return desc->chip_data; } static void iosapic_disable_irq(unsigned int irq) diff --git a/drivers/parisc/superio.c b/drivers/parisc/superio.c index 1e8d2d1..1e93c83 100644 --- a/drivers/parisc/superio.c +++ b/drivers/parisc/superio.c @@ -363,7 +363,9 @@ int superio_fixup_irq(struct pci_dev *pcidev) #endif for (i = 0; i < 16; i++) { - irq_desc[i].chip = &superio_interrupt_type; + struct irq_desc *desc = irq_to_desc(i); + + desc->chip = &superio_interrupt_type; } /* diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 529d9d7..999cc40 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -151,6 +151,13 @@ void pci_bus_add_devices(struct pci_bus *bus) if (retval) dev_err(&dev->dev, "Error creating cpuaffinity" " file, continuing...\n"); + + retval = device_create_file(&child_bus->dev, + &dev_attr_cpulistaffinity); + if (retval) + dev_err(&dev->dev, + "Error creating cpulistaffinity" + " file, continuing...\n"); } } } diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 7b37511..691b3ad 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -211,7 +211,7 @@ static int __init dmar_parse_dev(struct dmar_drhd_unit *dmaru) include_all = 1; } - if (ret || (dmaru->devices_cnt == 0 && !dmaru->include_all)) { + if (ret) { list_del(&dmaru->list); kfree(dmaru); } @@ -289,6 +289,24 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header) } } +/** + * dmar_table_detect - checks to see if the platform supports DMAR devices + */ +static int __init dmar_table_detect(void) +{ + acpi_status status = AE_OK; + + /* if we could find DMAR table, then there are DMAR devices */ + status = acpi_get_table(ACPI_SIG_DMAR, 0, + (struct acpi_table_header **)&dmar_tbl); + + if (ACPI_SUCCESS(status) && !dmar_tbl) { + printk (KERN_WARNING PREFIX "Unable to map DMAR\n"); + status = AE_NOT_FOUND; + } + + return (ACPI_SUCCESS(status) ? 1 : 0); +} /** * parse_dmar_table - parses the DMA reporting table @@ -300,6 +318,12 @@ parse_dmar_table(void) struct acpi_dmar_header *entry_header; int ret = 0; + /* + * Do it again, earlier dmar_tbl mapping could be mapped with + * fixed map. + */ + dmar_table_detect(); + dmar = (struct acpi_table_dmar *)dmar_tbl; if (!dmar) return -ENODEV; @@ -373,10 +397,10 @@ dmar_find_matched_drhd_unit(struct pci_dev *dev) int __init dmar_dev_scope_init(void) { - struct dmar_drhd_unit *drhd; + struct dmar_drhd_unit *drhd, *drhd_n; int ret = -ENODEV; - for_each_drhd_unit(drhd) { + list_for_each_entry_safe(drhd, drhd_n, &dmar_drhd_units, list) { ret = dmar_parse_dev(drhd); if (ret) return ret; @@ -384,8 +408,8 @@ int __init dmar_dev_scope_init(void) #ifdef CONFIG_DMAR { - struct dmar_rmrr_unit *rmrr; - for_each_rmrr_units(rmrr) { + struct dmar_rmrr_unit *rmrr, *rmrr_n; + list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) { ret = rmrr_parse_dev(rmrr); if (ret) return ret; @@ -430,30 +454,11 @@ int __init dmar_table_init(void) return 0; } -/** - * early_dmar_detect - checks to see if the platform supports DMAR devices - */ -int __init early_dmar_detect(void) -{ - acpi_status status = AE_OK; - - /* if we could find DMAR table, then there are DMAR devices */ - status = acpi_get_table(ACPI_SIG_DMAR, 0, - (struct acpi_table_header **)&dmar_tbl); - - if (ACPI_SUCCESS(status) && !dmar_tbl) { - printk (KERN_WARNING PREFIX "Unable to map DMAR\n"); - status = AE_NOT_FOUND; - } - - return (ACPI_SUCCESS(status) ? 1 : 0); -} - void __init detect_intel_iommu(void) { int ret; - ret = early_dmar_detect(); + ret = dmar_table_detect(); { #ifdef CONFIG_INTR_REMAP @@ -470,13 +475,13 @@ void __init detect_intel_iommu(void) "Queued invalidation will be enabled to support " "x2apic and Intr-remapping.\n"); #endif - #ifdef CONFIG_DMAR if (ret && !no_iommu && !iommu_detected && !swiotlb && !dmar_disabled) iommu_detected = 1; #endif } + dmar_tbl = NULL; } diff --git a/drivers/pci/hotplug/ibmphp_ebda.c b/drivers/pci/hotplug/ibmphp_ebda.c index 7d27631..8cfd1c4 100644 --- a/drivers/pci/hotplug/ibmphp_ebda.c +++ b/drivers/pci/hotplug/ibmphp_ebda.c @@ -123,10 +123,8 @@ static struct ebda_pci_rsrc *alloc_ebda_pci_rsrc (void) static void __init print_bus_info (void) { struct bus_info *ptr; - struct list_head *ptr1; - list_for_each (ptr1, &bus_info_head) { - ptr = list_entry (ptr1, struct bus_info, bus_info_list); + list_for_each_entry(ptr, &bus_info_head, bus_info_list) { debug ("%s - slot_min = %x\n", __func__, ptr->slot_min); debug ("%s - slot_max = %x\n", __func__, ptr->slot_max); debug ("%s - slot_count = %x\n", __func__, ptr->slot_count); @@ -146,10 +144,8 @@ static void __init print_bus_info (void) static void print_lo_info (void) { struct rio_detail *ptr; - struct list_head *ptr1; debug ("print_lo_info ----\n"); - list_for_each (ptr1, &rio_lo_head) { - ptr = list_entry (ptr1, struct rio_detail, rio_detail_list); + list_for_each_entry(ptr, &rio_lo_head, rio_detail_list) { debug ("%s - rio_node_id = %x\n", __func__, ptr->rio_node_id); debug ("%s - rio_type = %x\n", __func__, ptr->rio_type); debug ("%s - owner_id = %x\n", __func__, ptr->owner_id); @@ -163,10 +159,8 @@ static void print_lo_info (void) static void print_vg_info (void) { struct rio_detail *ptr; - struct list_head *ptr1; debug ("%s ---\n", __func__); - list_for_each (ptr1, &rio_vg_head) { - ptr = list_entry (ptr1, struct rio_detail, rio_detail_list); + list_for_each_entry(ptr, &rio_vg_head, rio_detail_list) { debug ("%s - rio_node_id = %x\n", __func__, ptr->rio_node_id); debug ("%s - rio_type = %x\n", __func__, ptr->rio_type); debug ("%s - owner_id = %x\n", __func__, ptr->owner_id); @@ -180,10 +174,8 @@ static void print_vg_info (void) static void __init print_ebda_pci_rsrc (void) { struct ebda_pci_rsrc *ptr; - struct list_head *ptr1; - list_for_each (ptr1, &ibmphp_ebda_pci_rsrc_head) { - ptr = list_entry (ptr1, struct ebda_pci_rsrc, ebda_pci_rsrc_list); + list_for_each_entry(ptr, &ibmphp_ebda_pci_rsrc_head, ebda_pci_rsrc_list) { debug ("%s - rsrc type: %x bus#: %x dev_func: %x start addr: %x end addr: %x\n", __func__, ptr->rsrc_type ,ptr->bus_num, ptr->dev_fun,ptr->start_addr, ptr->end_addr); } @@ -192,10 +184,8 @@ static void __init print_ebda_pci_rsrc (void) static void __init print_ibm_slot (void) { struct slot *ptr; - struct list_head *ptr1; - list_for_each (ptr1, &ibmphp_slot_head) { - ptr = list_entry (ptr1, struct slot, ibm_slot_list); + list_for_each_entry(ptr, &ibmphp_slot_head, ibm_slot_list) { debug ("%s - slot_number: %x\n", __func__, ptr->number); } } @@ -203,10 +193,8 @@ static void __init print_ibm_slot (void) static void __init print_opt_vg (void) { struct opt_rio *ptr; - struct list_head *ptr1; debug ("%s ---\n", __func__); - list_for_each (ptr1, &opt_vg_head) { - ptr = list_entry (ptr1, struct opt_rio, opt_rio_list); + list_for_each_entry(ptr, &opt_vg_head, opt_rio_list) { debug ("%s - rio_type %x\n", __func__, ptr->rio_type); debug ("%s - chassis_num: %x\n", __func__, ptr->chassis_num); debug ("%s - first_slot_num: %x\n", __func__, ptr->first_slot_num); @@ -217,13 +205,9 @@ static void __init print_opt_vg (void) static void __init print_ebda_hpc (void) { struct controller *hpc_ptr; - struct list_head *ptr1; u16 index; - list_for_each (ptr1, &ebda_hpc_head) { - - hpc_ptr = list_entry (ptr1, struct controller, ebda_hpc_list); - + list_for_each_entry(hpc_ptr, &ebda_hpc_head, ebda_hpc_list) { for (index = 0; index < hpc_ptr->slot_count; index++) { debug ("%s - physical slot#: %x\n", __func__, hpc_ptr->slots[index].slot_num); debug ("%s - pci bus# of the slot: %x\n", __func__, hpc_ptr->slots[index].slot_bus_num); @@ -460,9 +444,7 @@ static int __init ebda_rio_table (void) static struct opt_rio *search_opt_vg (u8 chassis_num) { struct opt_rio *ptr; - struct list_head *ptr1; - list_for_each (ptr1, &opt_vg_head) { - ptr = list_entry (ptr1, struct opt_rio, opt_rio_list); + list_for_each_entry(ptr, &opt_vg_head, opt_rio_list) { if (ptr->chassis_num == chassis_num) return ptr; } @@ -473,10 +455,8 @@ static int __init combine_wpg_for_chassis (void) { struct opt_rio *opt_rio_ptr = NULL; struct rio_detail *rio_detail_ptr = NULL; - struct list_head *list_head_ptr = NULL; - list_for_each (list_head_ptr, &rio_vg_head) { - rio_detail_ptr = list_entry (list_head_ptr, struct rio_detail, rio_detail_list); + list_for_each_entry(rio_detail_ptr, &rio_vg_head, rio_detail_list) { opt_rio_ptr = search_opt_vg (rio_detail_ptr->chassis_num); if (!opt_rio_ptr) { opt_rio_ptr = kzalloc(sizeof(struct opt_rio), GFP_KERNEL); @@ -497,14 +477,12 @@ static int __init combine_wpg_for_chassis (void) } /* - * reorgnizing linked list of expansion box + * reorganizing linked list of expansion box */ static struct opt_rio_lo *search_opt_lo (u8 chassis_num) { struct opt_rio_lo *ptr; - struct list_head *ptr1; - list_for_each (ptr1, &opt_lo_head) { - ptr = list_entry (ptr1, struct opt_rio_lo, opt_rio_lo_list); + list_for_each_entry(ptr, &opt_lo_head, opt_rio_lo_list) { if (ptr->chassis_num == chassis_num) return ptr; } @@ -515,10 +493,8 @@ static int combine_wpg_for_expansion (void) { struct opt_rio_lo *opt_rio_lo_ptr = NULL; struct rio_detail *rio_detail_ptr = NULL; - struct list_head *list_head_ptr = NULL; - list_for_each (list_head_ptr, &rio_lo_head) { - rio_detail_ptr = list_entry (list_head_ptr, struct rio_detail, rio_detail_list); + list_for_each_entry(rio_detail_ptr, &rio_lo_head, rio_detail_list) { opt_rio_lo_ptr = search_opt_lo (rio_detail_ptr->chassis_num); if (!opt_rio_lo_ptr) { opt_rio_lo_ptr = kzalloc(sizeof(struct opt_rio_lo), GFP_KERNEL); @@ -550,20 +526,17 @@ static int first_slot_num (u8 slot_num, u8 first_slot, u8 var) { struct opt_rio *opt_vg_ptr = NULL; struct opt_rio_lo *opt_lo_ptr = NULL; - struct list_head *ptr = NULL; int rc = 0; if (!var) { - list_for_each (ptr, &opt_vg_head) { - opt_vg_ptr = list_entry (ptr, struct opt_rio, opt_rio_list); + list_for_each_entry(opt_vg_ptr, &opt_vg_head, opt_rio_list) { if ((first_slot < opt_vg_ptr->first_slot_num) && (slot_num >= opt_vg_ptr->first_slot_num)) { rc = -ENODEV; break; } } } else { - list_for_each (ptr, &opt_lo_head) { - opt_lo_ptr = list_entry (ptr, struct opt_rio_lo, opt_rio_lo_list); + list_for_each_entry(opt_lo_ptr, &opt_lo_head, opt_rio_lo_list) { if ((first_slot < opt_lo_ptr->first_slot_num) && (slot_num >= opt_lo_ptr->first_slot_num)) { rc = -ENODEV; break; @@ -576,10 +549,8 @@ static int first_slot_num (u8 slot_num, u8 first_slot, u8 var) static struct opt_rio_lo * find_rxe_num (u8 slot_num) { struct opt_rio_lo *opt_lo_ptr; - struct list_head *ptr; - list_for_each (ptr, &opt_lo_head) { - opt_lo_ptr = list_entry (ptr, struct opt_rio_lo, opt_rio_lo_list); + list_for_each_entry(opt_lo_ptr, &opt_lo_head, opt_rio_lo_list) { //check to see if this slot_num belongs to expansion box if ((slot_num >= opt_lo_ptr->first_slot_num) && (!first_slot_num (slot_num, opt_lo_ptr->first_slot_num, 1))) return opt_lo_ptr; @@ -590,10 +561,8 @@ static struct opt_rio_lo * find_rxe_num (u8 slot_num) static struct opt_rio * find_chassis_num (u8 slot_num) { struct opt_rio *opt_vg_ptr; - struct list_head *ptr; - list_for_each (ptr, &opt_vg_head) { - opt_vg_ptr = list_entry (ptr, struct opt_rio, opt_rio_list); + list_for_each_entry(opt_vg_ptr, &opt_vg_head, opt_rio_list) { //check to see if this slot_num belongs to chassis if ((slot_num >= opt_vg_ptr->first_slot_num) && (!first_slot_num (slot_num, opt_vg_ptr->first_slot_num, 0))) return opt_vg_ptr; @@ -607,11 +576,9 @@ static struct opt_rio * find_chassis_num (u8 slot_num) static u8 calculate_first_slot (u8 slot_num) { u8 first_slot = 1; - struct list_head * list; struct slot * slot_cur; - list_for_each (list, &ibmphp_slot_head) { - slot_cur = list_entry (list, struct slot, ibm_slot_list); + list_for_each_entry(slot_cur, &ibmphp_slot_head, ibm_slot_list) { if (slot_cur->ctrl) { if ((slot_cur->ctrl->ctlr_type != 4) && (slot_cur->ctrl->ending_slot_num > first_slot) && (slot_num > slot_cur->ctrl->ending_slot_num)) first_slot = slot_cur->ctrl->ending_slot_num; @@ -767,7 +734,6 @@ static int __init ebda_rsrc_controller (void) struct bus_info *bus_info_ptr1, *bus_info_ptr2; int rc; struct slot *tmp_slot; - struct list_head *list; addr = hpc_list_ptr->phys_addr; for (ctlr = 0; ctlr < hpc_list_ptr->num_ctlrs; ctlr++) { @@ -997,9 +963,7 @@ static int __init ebda_rsrc_controller (void) } /* each hpc */ - list_for_each (list, &ibmphp_slot_head) { - tmp_slot = list_entry (list, struct slot, ibm_slot_list); - + list_for_each_entry(tmp_slot, &ibmphp_slot_head, ibm_slot_list) { snprintf (tmp_slot->hotplug_slot->name, 30, "%s", create_file_name (tmp_slot)); pci_hp_register(tmp_slot->hotplug_slot, pci_find_bus(0, tmp_slot->bus), tmp_slot->device); @@ -1101,10 +1065,8 @@ u16 ibmphp_get_total_controllers (void) struct slot *ibmphp_get_slot_from_physical_num (u8 physical_num) { struct slot *slot; - struct list_head *list; - list_for_each (list, &ibmphp_slot_head) { - slot = list_entry (list, struct slot, ibm_slot_list); + list_for_each_entry(slot, &ibmphp_slot_head, ibm_slot_list) { if (slot->number == physical_num) return slot; } @@ -1120,10 +1082,8 @@ struct slot *ibmphp_get_slot_from_physical_num (u8 physical_num) struct bus_info *ibmphp_find_same_bus_num (u32 num) { struct bus_info *ptr; - struct list_head *ptr1; - list_for_each (ptr1, &bus_info_head) { - ptr = list_entry (ptr1, struct bus_info, bus_info_list); + list_for_each_entry(ptr, &bus_info_head, bus_info_list) { if (ptr->busno == num) return ptr; } @@ -1136,10 +1096,8 @@ struct bus_info *ibmphp_find_same_bus_num (u32 num) int ibmphp_get_bus_index (u8 num) { struct bus_info *ptr; - struct list_head *ptr1; - list_for_each (ptr1, &bus_info_head) { - ptr = list_entry (ptr1, struct bus_info, bus_info_list); + list_for_each_entry(ptr, &bus_info_head, bus_info_list) { if (ptr->busno == num) return ptr->index; } @@ -1212,11 +1170,9 @@ static struct pci_driver ibmphp_driver = { int ibmphp_register_pci (void) { struct controller *ctrl; - struct list_head *tmp; int rc = 0; - list_for_each (tmp, &ebda_hpc_head) { - ctrl = list_entry (tmp, struct controller, ebda_hpc_list); + list_for_each_entry(ctrl, &ebda_hpc_head, ebda_hpc_list) { if (ctrl->ctlr_type == 1) { rc = pci_register_driver(&ibmphp_driver); break; @@ -1227,12 +1183,10 @@ int ibmphp_register_pci (void) static int ibmphp_probe (struct pci_dev * dev, const struct pci_device_id *ids) { struct controller *ctrl; - struct list_head *tmp; debug ("inside ibmphp_probe\n"); - list_for_each (tmp, &ebda_hpc_head) { - ctrl = list_entry (tmp, struct controller, ebda_hpc_list); + list_for_each_entry(ctrl, &ebda_hpc_head, ebda_hpc_list) { if (ctrl->ctlr_type == 1) { if ((dev->devfn == ctrl->u.pci_ctlr.dev_fun) && (dev->bus->number == ctrl->u.pci_ctlr.bus)) { ctrl->ctrl_dev = dev; diff --git a/drivers/pci/hotplug/pci_hotplug_core.c b/drivers/pci/hotplug/pci_hotplug_core.c index 5f85b1b..2e6c447 100644 --- a/drivers/pci/hotplug/pci_hotplug_core.c +++ b/drivers/pci/hotplug/pci_hotplug_core.c @@ -102,13 +102,13 @@ static int get_##name (struct hotplug_slot *slot, type *value) \ { \ struct hotplug_slot_ops *ops = slot->ops; \ int retval = 0; \ - if (try_module_get(ops->owner)) { \ - if (ops->get_##name) \ - retval = ops->get_##name(slot, value); \ - else \ - *value = slot->info->name; \ - module_put(ops->owner); \ - } \ + if (!try_module_get(ops->owner)) \ + return -ENODEV; \ + if (ops->get_##name) \ + retval = ops->get_##name(slot, value); \ + else \ + *value = slot->info->name; \ + module_put(ops->owner); \ return retval; \ } diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 9e6cec6..c367978 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -57,6 +57,19 @@ extern struct workqueue_struct *pciehp_wq; #define warn(format, arg...) \ printk(KERN_WARNING "%s: " format, MY_NAME , ## arg) +#define ctrl_dbg(ctrl, format, arg...) \ + do { \ + if (pciehp_debug) \ + dev_printk(, &ctrl->pcie->device, \ + format, ## arg); \ + } while (0) +#define ctrl_err(ctrl, format, arg...) \ + dev_err(&ctrl->pcie->device, format, ## arg) +#define ctrl_info(ctrl, format, arg...) \ + dev_info(&ctrl->pcie->device, format, ## arg) +#define ctrl_warn(ctrl, format, arg...) \ + dev_warn(&ctrl->pcie->device, format, ## arg) + #define SLOT_NAME_SIZE 10 struct slot { u8 bus; @@ -87,6 +100,7 @@ struct controller { int num_slots; /* Number of slots on ctlr */ int slot_num_inc; /* 1 or -1 */ struct pci_dev *pci_dev; + struct pcie_device *pcie; /* PCI Express port service */ struct list_head slot_list; struct hpc_ops *hpc_ops; wait_queue_head_t queue; /* sleep & wake process */ @@ -170,7 +184,7 @@ static inline struct slot *pciehp_find_slot(struct controller *ctrl, u8 device) return slot; } - err("%s: slot (device=0x%x) not found\n", __func__, device); + ctrl_err(ctrl, "%s: slot (device=0x%x) not found\n", __func__, device); return NULL; } diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 4fd5355..c748a19 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -144,9 +144,10 @@ set_lock_exit: * sysfs interface which allows the user to toggle the Electro Mechanical * Interlock. Valid values are either 0 or 1. 0 == unlock, 1 == lock */ -static ssize_t lock_write_file(struct hotplug_slot *slot, const char *buf, - size_t count) +static ssize_t lock_write_file(struct hotplug_slot *hotplug_slot, + const char *buf, size_t count) { + struct slot *slot = hotplug_slot->private; unsigned long llock; u8 lock; int retval = 0; @@ -157,10 +158,11 @@ static ssize_t lock_write_file(struct hotplug_slot *slot, const char *buf, switch (lock) { case 0: case 1: - retval = set_lock_status(slot, lock); + retval = set_lock_status(hotplug_slot, lock); break; default: - err ("%d is an invalid lock value\n", lock); + ctrl_err(slot->ctrl, "%d is an invalid lock value\n", + lock); retval = -EINVAL; } if (retval) @@ -180,7 +182,10 @@ static struct hotplug_slot_attribute hotplug_slot_attr_lock = { */ static void release_slot(struct hotplug_slot *hotplug_slot) { - dbg("%s - physical_slot = %s\n", __func__, hotplug_slot->name); + struct slot *slot = hotplug_slot->private; + + ctrl_dbg(slot->ctrl, "%s - physical_slot = %s\n", + __func__, hotplug_slot->name); kfree(hotplug_slot->info); kfree(hotplug_slot); @@ -215,9 +220,9 @@ static int init_slots(struct controller *ctrl) get_adapter_status(hotplug_slot, &info->adapter_status); slot->hotplug_slot = hotplug_slot; - dbg("Registering bus=%x dev=%x hp_slot=%x sun=%x " - "slot_device_offset=%x\n", slot->bus, slot->device, - slot->hp_slot, slot->number, ctrl->slot_device_offset); + ctrl_dbg(ctrl, "Registering bus=%x dev=%x hp_slot=%x sun=%x " + "slot_device_offset=%x\n", slot->bus, slot->device, + slot->hp_slot, slot->number, ctrl->slot_device_offset); duplicate_name: retval = pci_hp_register(hotplug_slot, ctrl->pci_dev->subordinate, @@ -233,9 +238,11 @@ duplicate_name: if (len < SLOT_NAME_SIZE) goto duplicate_name; else - err("duplicate slot name overflow\n"); + ctrl_err(ctrl, "duplicate slot name " + "overflow\n"); } - err("pci_hp_register failed with error %d\n", retval); + ctrl_err(ctrl, "pci_hp_register failed with error %d\n", + retval); goto error_info; } /* create additional sysfs entries */ @@ -244,7 +251,8 @@ duplicate_name: &hotplug_slot_attr_lock.attr); if (retval) { pci_hp_deregister(hotplug_slot); - err("cannot create additional sysfs entries\n"); + ctrl_err(ctrl, "cannot create additional sysfs " + "entries\n"); goto error_info; } } @@ -278,7 +286,8 @@ static int set_attention_status(struct hotplug_slot *hotplug_slot, u8 status) { struct slot *slot = hotplug_slot->private; - dbg("%s - physical_slot = %s\n", __func__, hotplug_slot->name); + ctrl_dbg(slot->ctrl, "%s - physical_slot = %s\n", + __func__, hotplug_slot->name); hotplug_slot->info->attention_status = status; @@ -293,7 +302,8 @@ static int enable_slot(struct hotplug_slot *hotplug_slot) { struct slot *slot = hotplug_slot->private; - dbg("%s - physical_slot = %s\n", __func__, hotplug_slot->name); + ctrl_dbg(slot->ctrl, "%s - physical_slot = %s\n", + __func__, hotplug_slot->name); return pciehp_sysfs_enable_slot(slot); } @@ -303,7 +313,8 @@ static int disable_slot(struct hotplug_slot *hotplug_slot) { struct slot *slot = hotplug_slot->private; - dbg("%s - physical_slot = %s\n", __func__, hotplug_slot->name); + ctrl_dbg(slot->ctrl, "%s - physical_slot = %s\n", + __func__, hotplug_slot->name); return pciehp_sysfs_disable_slot(slot); } @@ -313,7 +324,8 @@ static int get_power_status(struct hotplug_slot *hotplug_slot, u8 *value) struct slot *slot = hotplug_slot->private; int retval; - dbg("%s - physical_slot = %s\n", __func__, hotplug_slot->name); + ctrl_dbg(slot->ctrl, "%s - physical_slot = %s\n", + __func__, hotplug_slot->name); retval = slot->hpc_ops->get_power_status(slot, value); if (retval < 0) @@ -327,7 +339,8 @@ static int get_attention_status(struct hotplug_slot *hotplug_slot, u8 *value) struct slot *slot = hotplug_slot->private; int retval; - dbg("%s - physical_slot = %s\n", __func__, hotplug_slot->name); + ctrl_dbg(slot->ctrl, "%s - physical_slot = %s\n", + __func__, hotplug_slot->name); retval = slot->hpc_ops->get_attention_status(slot, value); if (retval < 0) @@ -341,7 +354,8 @@ static int get_latch_status(struct hotplug_slot *hotplug_slot, u8 *value) struct slot *slot = hotplug_slot->private; int retval; - dbg("%s - physical_slot = %s\n", __func__, hotplug_slot->name); + ctrl_dbg(slot->ctrl, "%s - physical_slot = %s\n", + __func__, hotplug_slot->name); retval = slot->hpc_ops->get_latch_status(slot, value); if (retval < 0) @@ -355,7 +369,8 @@ static int get_adapter_status(struct hotplug_slot *hotplug_slot, u8 *value) struct slot *slot = hotplug_slot->private; int retval; - dbg("%s - physical_slot = %s\n", __func__, hotplug_slot->name); + ctrl_dbg(slot->ctrl, "%s - physical_slot = %s\n", + __func__, hotplug_slot->name); retval = slot->hpc_ops->get_adapter_status(slot, value); if (retval < 0) @@ -370,7 +385,8 @@ static int get_max_bus_speed(struct hotplug_slot *hotplug_slot, struct slot *slot = hotplug_slot->private; int retval; - dbg("%s - physical_slot = %s\n", __func__, hotplug_slot->name); + ctrl_dbg(slot->ctrl, "%s - physical_slot = %s\n", + __func__, hotplug_slot->name); retval = slot->hpc_ops->get_max_bus_speed(slot, value); if (retval < 0) @@ -384,7 +400,8 @@ static int get_cur_bus_speed(struct hotplug_slot *hotplug_slot, enum pci_bus_spe struct slot *slot = hotplug_slot->private; int retval; - dbg("%s - physical_slot = %s\n", __func__, hotplug_slot->name); + ctrl_dbg(slot->ctrl, "%s - physical_slot = %s\n", + __func__, hotplug_slot->name); retval = slot->hpc_ops->get_cur_bus_speed(slot, value); if (retval < 0) @@ -402,14 +419,15 @@ static int pciehp_probe(struct pcie_device *dev, const struct pcie_port_service_ struct pci_dev *pdev = dev->port; if (pciehp_force) - dbg("Bypassing BIOS check for pciehp use on %s\n", - pci_name(pdev)); + dev_info(&dev->device, + "Bypassing BIOS check for pciehp use on %s\n", + pci_name(pdev)); else if (pciehp_get_hp_hw_control_from_firmware(pdev)) goto err_out_none; ctrl = pcie_init(dev); if (!ctrl) { - dbg("%s: controller initialization failed\n", PCIE_MODULE_NAME); + dev_err(&dev->device, "controller initialization failed\n"); goto err_out_none; } set_service_data(dev, ctrl); @@ -418,11 +436,10 @@ static int pciehp_probe(struct pcie_device *dev, const struct pcie_port_service_ rc = init_slots(ctrl); if (rc) { if (rc == -EBUSY) - warn("%s: slot already registered by another " - "hotplug driver\n", PCIE_MODULE_NAME); + ctrl_warn(ctrl, "slot already registered by another " + "hotplug driver\n"); else - err("%s: slot initialization failed\n", - PCIE_MODULE_NAME); + ctrl_err(ctrl, "slot initialization failed\n"); goto err_out_release_ctlr; } @@ -461,13 +478,13 @@ static void pciehp_remove (struct pcie_device *dev) #ifdef CONFIG_PM static int pciehp_suspend (struct pcie_device *dev, pm_message_t state) { - printk("%s ENTRY\n", __func__); + dev_info(&dev->device, "%s ENTRY\n", __func__); return 0; } static int pciehp_resume (struct pcie_device *dev) { - printk("%s ENTRY\n", __func__); + dev_info(&dev->device, "%s ENTRY\n", __func__); if (pciehp_force) { struct controller *ctrl = get_service_data(dev); struct slot *t_slot; @@ -497,10 +514,9 @@ static struct pcie_port_service_id port_pci_ids[] = { { .driver_data = 0, }, { /* end: all zeroes */ } }; -static const char device_name[] = "hpdriver"; static struct pcie_port_service_driver hpdriver_portdrv = { - .name = (char *)device_name, + .name = PCIE_MODULE_NAME, .id_table = &port_pci_ids[0], .probe = pciehp_probe, diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c index 96a5d55..acb7f9e 100644 --- a/drivers/pci/hotplug/pciehp_ctrl.c +++ b/drivers/pci/hotplug/pciehp_ctrl.c @@ -58,14 +58,15 @@ static int queue_interrupt_event(struct slot *p_slot, u32 event_type) u8 pciehp_handle_attention_button(struct slot *p_slot) { u32 event_type; + struct controller *ctrl = p_slot->ctrl; /* Attention Button Change */ - dbg("pciehp: Attention button interrupt received.\n"); + ctrl_dbg(ctrl, "Attention button interrupt received.\n"); /* * Button pressed - See if need to TAKE ACTION!!! */ - info("Button pressed on Slot(%s)\n", p_slot->name); + ctrl_info(ctrl, "Button pressed on Slot(%s)\n", p_slot->name); event_type = INT_BUTTON_PRESS; queue_interrupt_event(p_slot, event_type); @@ -77,22 +78,23 @@ u8 pciehp_handle_switch_change(struct slot *p_slot) { u8 getstatus; u32 event_type; + struct controller *ctrl = p_slot->ctrl; /* Switch Change */ - dbg("pciehp: Switch interrupt received.\n"); + ctrl_dbg(ctrl, "Switch interrupt received.\n"); p_slot->hpc_ops->get_latch_status(p_slot, &getstatus); if (getstatus) { /* * Switch opened */ - info("Latch open on Slot(%s)\n", p_slot->name); + ctrl_info(ctrl, "Latch open on Slot(%s)\n", p_slot->name); event_type = INT_SWITCH_OPEN; } else { /* * Switch closed */ - info("Latch close on Slot(%s)\n", p_slot->name); + ctrl_info(ctrl, "Latch close on Slot(%s)\n", p_slot->name); event_type = INT_SWITCH_CLOSE; } @@ -105,9 +107,10 @@ u8 pciehp_handle_presence_change(struct slot *p_slot) { u32 event_type; u8 presence_save; + struct controller *ctrl = p_slot->ctrl; /* Presence Change */ - dbg("pciehp: Presence/Notify input change.\n"); + ctrl_dbg(ctrl, "Presence/Notify input change.\n"); /* Switch is open, assume a presence change * Save the presence state @@ -117,13 +120,13 @@ u8 pciehp_handle_presence_change(struct slot *p_slot) /* * Card Present */ - info("Card present on Slot(%s)\n", p_slot->name); + ctrl_info(ctrl, "Card present on Slot(%s)\n", p_slot->name); event_type = INT_PRESENCE_ON; } else { /* * Not Present */ - info("Card not present on Slot(%s)\n", p_slot->name); + ctrl_info(ctrl, "Card not present on Slot(%s)\n", p_slot->name); event_type = INT_PRESENCE_OFF; } @@ -135,23 +138,25 @@ u8 pciehp_handle_presence_change(struct slot *p_slot) u8 pciehp_handle_power_fault(struct slot *p_slot) { u32 event_type; + struct controller *ctrl = p_slot->ctrl; /* power fault */ - dbg("pciehp: Power fault interrupt received.\n"); + ctrl_dbg(ctrl, "Power fault interrupt received.\n"); if ( !(p_slot->hpc_ops->query_power_fault(p_slot))) { /* * power fault Cleared */ - info("Power fault cleared on Slot(%s)\n", p_slot->name); + ctrl_info(ctrl, "Power fault cleared on Slot(%s)\n", + p_slot->name); event_type = INT_POWER_FAULT_CLEAR; } else { /* * power fault */ - info("Power fault on Slot(%s)\n", p_slot->name); + ctrl_info(ctrl, "Power fault on Slot(%s)\n", p_slot->name); event_type = INT_POWER_FAULT; - info("power fault bit %x set\n", 0); + ctrl_info(ctrl, "power fault bit %x set\n", 0); } queue_interrupt_event(p_slot, event_type); @@ -168,8 +173,9 @@ static void set_slot_off(struct controller *ctrl, struct slot * pslot) /* turn off slot, turn on Amber LED, turn off Green LED if supported*/ if (POWER_CTRL(ctrl)) { if (pslot->hpc_ops->power_off_slot(pslot)) { - err("%s: Issue of Slot Power Off command failed\n", - __func__); + ctrl_err(ctrl, + "%s: Issue of Slot Power Off command failed\n", + __func__); return; } } @@ -186,8 +192,8 @@ static void set_slot_off(struct controller *ctrl, struct slot * pslot) if (ATTN_LED(ctrl)) { if (pslot->hpc_ops->set_attention_status(pslot, 1)) { - err("%s: Issue of Set Attention Led command failed\n", - __func__); + ctrl_err(ctrl, "%s: Issue of Set Attention " + "Led command failed\n", __func__); return; } } @@ -205,9 +211,9 @@ static int board_added(struct slot *p_slot) int retval = 0; struct controller *ctrl = p_slot->ctrl; - dbg("%s: slot device, slot offset, hp slot = %d, %d ,%d\n", - __func__, p_slot->device, - ctrl->slot_device_offset, p_slot->hp_slot); + ctrl_dbg(ctrl, "%s: slot device, slot offset, hp slot = %d, %d ,%d\n", + __func__, p_slot->device, ctrl->slot_device_offset, + p_slot->hp_slot); if (POWER_CTRL(ctrl)) { /* Power on slot */ @@ -225,22 +231,22 @@ static int board_added(struct slot *p_slot) /* Check link training status */ retval = p_slot->hpc_ops->check_lnk_status(ctrl); if (retval) { - err("%s: Failed to check link status\n", __func__); + ctrl_err(ctrl, "%s: Failed to check link status\n", __func__); set_slot_off(ctrl, p_slot); return retval; } /* Check for a power fault */ if (p_slot->hpc_ops->query_power_fault(p_slot)) { - dbg("%s: power fault detected\n", __func__); + ctrl_dbg(ctrl, "%s: power fault detected\n", __func__); retval = POWER_FAILURE; goto err_exit; } retval = pciehp_configure_device(p_slot); if (retval) { - err("Cannot add device 0x%x:%x\n", p_slot->bus, - p_slot->device); + ctrl_err(ctrl, "Cannot add device 0x%x:%x\n", + p_slot->bus, p_slot->device); goto err_exit; } @@ -272,14 +278,14 @@ static int remove_board(struct slot *p_slot) if (retval) return retval; - dbg("In %s, hp_slot = %d\n", __func__, p_slot->hp_slot); + ctrl_dbg(ctrl, "In %s, hp_slot = %d\n", __func__, p_slot->hp_slot); if (POWER_CTRL(ctrl)) { /* power off slot */ retval = p_slot->hpc_ops->power_off_slot(p_slot); if (retval) { - err("%s: Issue of Slot Disable command failed\n", - __func__); + ctrl_err(ctrl, "%s: Issue of Slot Disable command " + "failed\n", __func__); return retval; } } @@ -320,8 +326,8 @@ static void pciehp_power_thread(struct work_struct *work) switch (p_slot->state) { case POWEROFF_STATE: mutex_unlock(&p_slot->lock); - dbg("%s: disabling bus:device(%x:%x)\n", - __func__, p_slot->bus, p_slot->device); + ctrl_dbg(p_slot->ctrl, "%s: disabling bus:device(%x:%x)\n", + __func__, p_slot->bus, p_slot->device); pciehp_disable_slot(p_slot); mutex_lock(&p_slot->lock); p_slot->state = STATIC_STATE; @@ -349,7 +355,8 @@ void pciehp_queue_pushbutton_work(struct work_struct *work) info = kmalloc(sizeof(*info), GFP_KERNEL); if (!info) { - err("%s: Cannot allocate memory\n", __func__); + ctrl_err(p_slot->ctrl, "%s: Cannot allocate memory\n", + __func__); return; } info->p_slot = p_slot; @@ -403,12 +410,14 @@ static void handle_button_press_event(struct slot *p_slot) p_slot->hpc_ops->get_power_status(p_slot, &getstatus); if (getstatus) { p_slot->state = BLINKINGOFF_STATE; - info("PCI slot #%s - powering off due to button " - "press.\n", p_slot->name); + ctrl_info(ctrl, + "PCI slot #%s - powering off due to button " + "press.\n", p_slot->name); } else { p_slot->state = BLINKINGON_STATE; - info("PCI slot #%s - powering on due to button " - "press.\n", p_slot->name); + ctrl_info(ctrl, + "PCI slot #%s - powering on due to button " + "press.\n", p_slot->name); } /* blink green LED and turn off amber */ if (PWR_LED(ctrl)) @@ -425,8 +434,8 @@ static void handle_button_press_event(struct slot *p_slot) * press the attention again before the 5 sec. limit * expires to cancel hot-add or hot-remove */ - info("Button cancel on Slot(%s)\n", p_slot->name); - dbg("%s: button cancel\n", __func__); + ctrl_info(ctrl, "Button cancel on Slot(%s)\n", p_slot->name); + ctrl_dbg(ctrl, "%s: button cancel\n", __func__); cancel_delayed_work(&p_slot->work); if (p_slot->state == BLINKINGOFF_STATE) { if (PWR_LED(ctrl)) @@ -437,8 +446,8 @@ static void handle_button_press_event(struct slot *p_slot) } if (ATTN_LED(ctrl)) p_slot->hpc_ops->set_attention_status(p_slot, 0); - info("PCI slot #%s - action canceled due to button press\n", - p_slot->name); + ctrl_info(ctrl, "PCI slot #%s - action canceled " + "due to button press\n", p_slot->name); p_slot->state = STATIC_STATE; break; case POWEROFF_STATE: @@ -448,11 +457,11 @@ static void handle_button_press_event(struct slot *p_slot) * this means that the previous attention button action * to hot-add or hot-remove is undergoing */ - info("Button ignore on Slot(%s)\n", p_slot->name); + ctrl_info(ctrl, "Button ignore on Slot(%s)\n", p_slot->name); update_slot_info(p_slot); break; default: - warn("Not a valid state\n"); + ctrl_warn(ctrl, "Not a valid state\n"); break; } } @@ -467,7 +476,8 @@ static void handle_surprise_event(struct slot *p_slot) info = kmalloc(sizeof(*info), GFP_KERNEL); if (!info) { - err("%s: Cannot allocate memory\n", __func__); + ctrl_err(p_slot->ctrl, "%s: Cannot allocate memory\n", + __func__); return; } info->p_slot = p_slot; @@ -505,7 +515,7 @@ static void interrupt_event_handler(struct work_struct *work) case INT_PRESENCE_OFF: if (!HP_SUPR_RM(ctrl)) break; - dbg("Surprise Removal\n"); + ctrl_dbg(ctrl, "Surprise Removal\n"); update_slot_info(p_slot); handle_surprise_event(p_slot); break; @@ -522,22 +532,23 @@ int pciehp_enable_slot(struct slot *p_slot) { u8 getstatus = 0; int rc; + struct controller *ctrl = p_slot->ctrl; /* Check to see if (latch closed, card present, power off) */ mutex_lock(&p_slot->ctrl->crit_sect); rc = p_slot->hpc_ops->get_adapter_status(p_slot, &getstatus); if (rc || !getstatus) { - info("%s: no adapter on slot(%s)\n", __func__, - p_slot->name); + ctrl_info(ctrl, "%s: no adapter on slot(%s)\n", + __func__, p_slot->name); mutex_unlock(&p_slot->ctrl->crit_sect); return -ENODEV; } if (MRL_SENS(p_slot->ctrl)) { rc = p_slot->hpc_ops->get_latch_status(p_slot, &getstatus); if (rc || getstatus) { - info("%s: latch open on slot(%s)\n", __func__, - p_slot->name); + ctrl_info(ctrl, "%s: latch open on slot(%s)\n", + __func__, p_slot->name); mutex_unlock(&p_slot->ctrl->crit_sect); return -ENODEV; } @@ -546,8 +557,8 @@ int pciehp_enable_slot(struct slot *p_slot) if (POWER_CTRL(p_slot->ctrl)) { rc = p_slot->hpc_ops->get_power_status(p_slot, &getstatus); if (rc || getstatus) { - info("%s: already enabled on slot(%s)\n", __func__, - p_slot->name); + ctrl_info(ctrl, "%s: already enabled on slot(%s)\n", + __func__, p_slot->name); mutex_unlock(&p_slot->ctrl->crit_sect); return -EINVAL; } @@ -571,6 +582,7 @@ int pciehp_disable_slot(struct slot *p_slot) { u8 getstatus = 0; int ret = 0; + struct controller *ctrl = p_slot->ctrl; if (!p_slot->ctrl) return 1; @@ -581,8 +593,8 @@ int pciehp_disable_slot(struct slot *p_slot) if (!HP_SUPR_RM(p_slot->ctrl)) { ret = p_slot->hpc_ops->get_adapter_status(p_slot, &getstatus); if (ret || !getstatus) { - info("%s: no adapter on slot(%s)\n", __func__, - p_slot->name); + ctrl_info(ctrl, "%s: no adapter on slot(%s)\n", + __func__, p_slot->name); mutex_unlock(&p_slot->ctrl->crit_sect); return -ENODEV; } @@ -591,8 +603,8 @@ int pciehp_disable_slot(struct slot *p_slot) if (MRL_SENS(p_slot->ctrl)) { ret = p_slot->hpc_ops->get_latch_status(p_slot, &getstatus); if (ret || getstatus) { - info("%s: latch open on slot(%s)\n", __func__, - p_slot->name); + ctrl_info(ctrl, "%s: latch open on slot(%s)\n", + __func__, p_slot->name); mutex_unlock(&p_slot->ctrl->crit_sect); return -ENODEV; } @@ -601,8 +613,8 @@ int pciehp_disable_slot(struct slot *p_slot) if (POWER_CTRL(p_slot->ctrl)) { ret = p_slot->hpc_ops->get_power_status(p_slot, &getstatus); if (ret || !getstatus) { - info("%s: already disabled slot(%s)\n", __func__, - p_slot->name); + ctrl_info(ctrl, "%s: already disabled slot(%s)\n", + __func__, p_slot->name); mutex_unlock(&p_slot->ctrl->crit_sect); return -EINVAL; } @@ -618,6 +630,7 @@ int pciehp_disable_slot(struct slot *p_slot) int pciehp_sysfs_enable_slot(struct slot *p_slot) { int retval = -ENODEV; + struct controller *ctrl = p_slot->ctrl; mutex_lock(&p_slot->lock); switch (p_slot->state) { @@ -631,15 +644,15 @@ int pciehp_sysfs_enable_slot(struct slot *p_slot) p_slot->state = STATIC_STATE; break; case POWERON_STATE: - info("Slot %s is already in powering on state\n", - p_slot->name); + ctrl_info(ctrl, "Slot %s is already in powering on state\n", + p_slot->name); break; case BLINKINGOFF_STATE: case POWEROFF_STATE: - info("Already enabled on slot %s\n", p_slot->name); + ctrl_info(ctrl, "Already enabled on slot %s\n", p_slot->name); break; default: - err("Not a valid state on slot %s\n", p_slot->name); + ctrl_err(ctrl, "Not a valid state on slot %s\n", p_slot->name); break; } mutex_unlock(&p_slot->lock); @@ -650,6 +663,7 @@ int pciehp_sysfs_enable_slot(struct slot *p_slot) int pciehp_sysfs_disable_slot(struct slot *p_slot) { int retval = -ENODEV; + struct controller *ctrl = p_slot->ctrl; mutex_lock(&p_slot->lock); switch (p_slot->state) { @@ -663,15 +677,15 @@ int pciehp_sysfs_disable_slot(struct slot *p_slot) p_slot->state = STATIC_STATE; break; case POWEROFF_STATE: - info("Slot %s is already in powering off state\n", - p_slot->name); + ctrl_info(ctrl, "Slot %s is already in powering off state\n", + p_slot->name); break; case BLINKINGON_STATE: case POWERON_STATE: - info("Already disabled on slot %s\n", p_slot->name); + ctrl_info(ctrl, "Already disabled on slot %s\n", p_slot->name); break; default: - err("Not a valid state on slot %s\n", p_slot->name); + ctrl_err(ctrl, "Not a valid state on slot %s\n", p_slot->name); break; } mutex_unlock(&p_slot->lock); diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 9d934dd..8e9530c 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -223,7 +223,7 @@ static void start_int_poll_timer(struct controller *ctrl, int sec) static inline int pciehp_request_irq(struct controller *ctrl) { - int retval, irq = ctrl->pci_dev->irq; + int retval, irq = ctrl->pcie->irq; /* Install interrupt polling timer. Start with 10 sec delay */ if (pciehp_poll_mode) { @@ -235,7 +235,8 @@ static inline int pciehp_request_irq(struct controller *ctrl) /* Installs the interrupt handler */ retval = request_irq(irq, pcie_isr, IRQF_SHARED, MY_NAME, ctrl); if (retval) - err("Cannot get irq %d for the hotplug controller\n", irq); + ctrl_err(ctrl, "Cannot get irq %d for the hotplug controller\n", + irq); return retval; } @@ -244,7 +245,7 @@ static inline void pciehp_free_irq(struct controller *ctrl) if (pciehp_poll_mode) del_timer_sync(&ctrl->poll_timer); else - free_irq(ctrl->pci_dev->irq, ctrl); + free_irq(ctrl->pcie->irq, ctrl); } static int pcie_poll_cmd(struct controller *ctrl) @@ -282,7 +283,7 @@ static void pcie_wait_cmd(struct controller *ctrl, int poll) else rc = wait_event_timeout(ctrl->queue, !ctrl->cmd_busy, timeout); if (!rc) - dbg("Command not completed in 1000 msec\n"); + ctrl_dbg(ctrl, "Command not completed in 1000 msec\n"); } /** @@ -301,7 +302,8 @@ static int pcie_write_cmd(struct controller *ctrl, u16 cmd, u16 mask) retval = pciehp_readw(ctrl, SLOTSTATUS, &slot_status); if (retval) { - err("%s: Cannot read SLOTSTATUS register\n", __func__); + ctrl_err(ctrl, "%s: Cannot read SLOTSTATUS register\n", + __func__); goto out; } @@ -312,26 +314,28 @@ static int pcie_write_cmd(struct controller *ctrl, u16 cmd, u16 mask) * proceed forward to issue the next command according * to spec. Just print out the error message. */ - dbg("%s: CMD_COMPLETED not clear after 1 sec.\n", - __func__); + ctrl_dbg(ctrl, + "%s: CMD_COMPLETED not clear after 1 sec.\n", + __func__); } else if (!NO_CMD_CMPL(ctrl)) { /* * This controller semms to notify of command completed * event even though it supports none of power * controller, attention led, power led and EMI. */ - dbg("%s: Unexpected CMD_COMPLETED. Need to wait for " - "command completed event.\n", __func__); + ctrl_dbg(ctrl, "%s: Unexpected CMD_COMPLETED. Need to " + "wait for command completed event.\n", + __func__); ctrl->no_cmd_complete = 0; } else { - dbg("%s: Unexpected CMD_COMPLETED. Maybe the " - "controller is broken.\n", __func__); + ctrl_dbg(ctrl, "%s: Unexpected CMD_COMPLETED. Maybe " + "the controller is broken.\n", __func__); } } retval = pciehp_readw(ctrl, SLOTCTRL, &slot_ctrl); if (retval) { - err("%s: Cannot read SLOTCTRL register\n", __func__); + ctrl_err(ctrl, "%s: Cannot read SLOTCTRL register\n", __func__); goto out; } @@ -341,7 +345,8 @@ static int pcie_write_cmd(struct controller *ctrl, u16 cmd, u16 mask) smp_mb(); retval = pciehp_writew(ctrl, SLOTCTRL, slot_ctrl); if (retval) - err("%s: Cannot write to SLOTCTRL register\n", __func__); + ctrl_err(ctrl, "%s: Cannot write to SLOTCTRL register\n", + __func__); /* * Wait for command completion. @@ -370,14 +375,15 @@ static int hpc_check_lnk_status(struct controller *ctrl) retval = pciehp_readw(ctrl, LNKSTATUS, &lnk_status); if (retval) { - err("%s: Cannot read LNKSTATUS register\n", __func__); + ctrl_err(ctrl, "%s: Cannot read LNKSTATUS register\n", + __func__); return retval; } - dbg("%s: lnk_status = %x\n", __func__, lnk_status); + ctrl_dbg(ctrl, "%s: lnk_status = %x\n", __func__, lnk_status); if ( (lnk_status & LNK_TRN) || (lnk_status & LNK_TRN_ERR) || !(lnk_status & NEG_LINK_WD)) { - err("%s : Link Training Error occurs \n", __func__); + ctrl_err(ctrl, "%s : Link Training Error occurs \n", __func__); retval = -1; return retval; } @@ -394,12 +400,12 @@ static int hpc_get_attention_status(struct slot *slot, u8 *status) retval = pciehp_readw(ctrl, SLOTCTRL, &slot_ctrl); if (retval) { - err("%s: Cannot read SLOTCTRL register\n", __func__); + ctrl_err(ctrl, "%s: Cannot read SLOTCTRL register\n", __func__); return retval; } - dbg("%s: SLOTCTRL %x, value read %x\n", - __func__, ctrl->cap_base + SLOTCTRL, slot_ctrl); + ctrl_dbg(ctrl, "%s: SLOTCTRL %x, value read %x\n", + __func__, ctrl->cap_base + SLOTCTRL, slot_ctrl); atten_led_state = (slot_ctrl & ATTN_LED_CTRL) >> 6; @@ -433,11 +439,11 @@ static int hpc_get_power_status(struct slot *slot, u8 *status) retval = pciehp_readw(ctrl, SLOTCTRL, &slot_ctrl); if (retval) { - err("%s: Cannot read SLOTCTRL register\n", __func__); + ctrl_err(ctrl, "%s: Cannot read SLOTCTRL register\n", __func__); return retval; } - dbg("%s: SLOTCTRL %x value read %x\n", - __func__, ctrl->cap_base + SLOTCTRL, slot_ctrl); + ctrl_dbg(ctrl, "%s: SLOTCTRL %x value read %x\n", + __func__, ctrl->cap_base + SLOTCTRL, slot_ctrl); pwr_state = (slot_ctrl & PWR_CTRL) >> 10; @@ -464,7 +470,8 @@ static int hpc_get_latch_status(struct slot *slot, u8 *status) retval = pciehp_readw(ctrl, SLOTSTATUS, &slot_status); if (retval) { - err("%s: Cannot read SLOTSTATUS register\n", __func__); + ctrl_err(ctrl, "%s: Cannot read SLOTSTATUS register\n", + __func__); return retval; } @@ -482,7 +489,8 @@ static int hpc_get_adapter_status(struct slot *slot, u8 *status) retval = pciehp_readw(ctrl, SLOTSTATUS, &slot_status); if (retval) { - err("%s: Cannot read SLOTSTATUS register\n", __func__); + ctrl_err(ctrl, "%s: Cannot read SLOTSTATUS register\n", + __func__); return retval; } card_state = (u8)((slot_status & PRSN_STATE) >> 6); @@ -500,7 +508,7 @@ static int hpc_query_power_fault(struct slot *slot) retval = pciehp_readw(ctrl, SLOTSTATUS, &slot_status); if (retval) { - err("%s: Cannot check for power fault\n", __func__); + ctrl_err(ctrl, "%s: Cannot check for power fault\n", __func__); return retval; } pwr_fault = (u8)((slot_status & PWR_FAULT_DETECTED) >> 1); @@ -516,7 +524,7 @@ static int hpc_get_emi_status(struct slot *slot, u8 *status) retval = pciehp_readw(ctrl, SLOTSTATUS, &slot_status); if (retval) { - err("%s : Cannot check EMI status\n", __func__); + ctrl_err(ctrl, "%s : Cannot check EMI status\n", __func__); return retval; } *status = (slot_status & EMI_STATE) >> EMI_STATUS_BIT; @@ -560,8 +568,8 @@ static int hpc_set_attention_status(struct slot *slot, u8 value) return -1; } rc = pcie_write_cmd(ctrl, slot_cmd, cmd_mask); - dbg("%s: SLOTCTRL %x write cmd %x\n", - __func__, ctrl->cap_base + SLOTCTRL, slot_cmd); + ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", + __func__, ctrl->cap_base + SLOTCTRL, slot_cmd); return rc; } @@ -575,8 +583,8 @@ static void hpc_set_green_led_on(struct slot *slot) slot_cmd = 0x0100; cmd_mask = PWR_LED_CTRL; pcie_write_cmd(ctrl, slot_cmd, cmd_mask); - dbg("%s: SLOTCTRL %x write cmd %x\n", - __func__, ctrl->cap_base + SLOTCTRL, slot_cmd); + ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", + __func__, ctrl->cap_base + SLOTCTRL, slot_cmd); } static void hpc_set_green_led_off(struct slot *slot) @@ -588,8 +596,8 @@ static void hpc_set_green_led_off(struct slot *slot) slot_cmd = 0x0300; cmd_mask = PWR_LED_CTRL; pcie_write_cmd(ctrl, slot_cmd, cmd_mask); - dbg("%s: SLOTCTRL %x write cmd %x\n", - __func__, ctrl->cap_base + SLOTCTRL, slot_cmd); + ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", + __func__, ctrl->cap_base + SLOTCTRL, slot_cmd); } static void hpc_set_green_led_blink(struct slot *slot) @@ -601,8 +609,8 @@ static void hpc_set_green_led_blink(struct slot *slot) slot_cmd = 0x0200; cmd_mask = PWR_LED_CTRL; pcie_write_cmd(ctrl, slot_cmd, cmd_mask); - dbg("%s: SLOTCTRL %x write cmd %x\n", - __func__, ctrl->cap_base + SLOTCTRL, slot_cmd); + ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", + __func__, ctrl->cap_base + SLOTCTRL, slot_cmd); } static int hpc_power_on_slot(struct slot * slot) @@ -613,20 +621,22 @@ static int hpc_power_on_slot(struct slot * slot) u16 slot_status; int retval = 0; - dbg("%s: slot->hp_slot %x\n", __func__, slot->hp_slot); + ctrl_dbg(ctrl, "%s: slot->hp_slot %x\n", __func__, slot->hp_slot); /* Clear sticky power-fault bit from previous power failures */ retval = pciehp_readw(ctrl, SLOTSTATUS, &slot_status); if (retval) { - err("%s: Cannot read SLOTSTATUS register\n", __func__); + ctrl_err(ctrl, "%s: Cannot read SLOTSTATUS register\n", + __func__); return retval; } slot_status &= PWR_FAULT_DETECTED; if (slot_status) { retval = pciehp_writew(ctrl, SLOTSTATUS, slot_status); if (retval) { - err("%s: Cannot write to SLOTSTATUS register\n", - __func__); + ctrl_err(ctrl, + "%s: Cannot write to SLOTSTATUS register\n", + __func__); return retval; } } @@ -644,11 +654,12 @@ static int hpc_power_on_slot(struct slot * slot) retval = pcie_write_cmd(ctrl, slot_cmd, cmd_mask); if (retval) { - err("%s: Write %x command failed!\n", __func__, slot_cmd); + ctrl_err(ctrl, "%s: Write %x command failed!\n", + __func__, slot_cmd); return -1; } - dbg("%s: SLOTCTRL %x write cmd %x\n", - __func__, ctrl->cap_base + SLOTCTRL, slot_cmd); + ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", + __func__, ctrl->cap_base + SLOTCTRL, slot_cmd); return retval; } @@ -694,7 +705,7 @@ static int hpc_power_off_slot(struct slot * slot) int retval = 0; int changed; - dbg("%s: slot->hp_slot %x\n", __func__, slot->hp_slot); + ctrl_dbg(ctrl, "%s: slot->hp_slot %x\n", __func__, slot->hp_slot); /* * Set Bad DLLP Mask bit in Correctable Error Mask @@ -722,12 +733,12 @@ static int hpc_power_off_slot(struct slot * slot) retval = pcie_write_cmd(ctrl, slot_cmd, cmd_mask); if (retval) { - err("%s: Write command failed!\n", __func__); + ctrl_err(ctrl, "%s: Write command failed!\n", __func__); retval = -1; goto out; } - dbg("%s: SLOTCTRL %x write cmd %x\n", - __func__, ctrl->cap_base + SLOTCTRL, slot_cmd); + ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", + __func__, ctrl->cap_base + SLOTCTRL, slot_cmd); out: if (changed) pcie_unmask_bad_dllp(ctrl); @@ -749,7 +760,8 @@ static irqreturn_t pcie_isr(int irq, void *dev_id) intr_loc = 0; do { if (pciehp_readw(ctrl, SLOTSTATUS, &detected)) { - err("%s: Cannot read SLOTSTATUS\n", __func__); + ctrl_err(ctrl, "%s: Cannot read SLOTSTATUS\n", + __func__); return IRQ_NONE; } @@ -760,12 +772,13 @@ static irqreturn_t pcie_isr(int irq, void *dev_id) if (!intr_loc) return IRQ_NONE; if (detected && pciehp_writew(ctrl, SLOTSTATUS, detected)) { - err("%s: Cannot write to SLOTSTATUS\n", __func__); + ctrl_err(ctrl, "%s: Cannot write to SLOTSTATUS\n", + __func__); return IRQ_NONE; } } while (detected); - dbg("%s: intr_loc %x\n", __FUNCTION__, intr_loc); + ctrl_dbg(ctrl, "%s: intr_loc %x\n", __func__, intr_loc); /* Check Command Complete Interrupt Pending */ if (intr_loc & CMD_COMPLETED) { @@ -807,7 +820,7 @@ static int hpc_get_max_lnk_speed(struct slot *slot, enum pci_bus_speed *value) retval = pciehp_readl(ctrl, LNKCAP, &lnk_cap); if (retval) { - err("%s: Cannot read LNKCAP register\n", __func__); + ctrl_err(ctrl, "%s: Cannot read LNKCAP register\n", __func__); return retval; } @@ -821,7 +834,7 @@ static int hpc_get_max_lnk_speed(struct slot *slot, enum pci_bus_speed *value) } *value = lnk_speed; - dbg("Max link speed = %d\n", lnk_speed); + ctrl_dbg(ctrl, "Max link speed = %d\n", lnk_speed); return retval; } @@ -836,7 +849,7 @@ static int hpc_get_max_lnk_width(struct slot *slot, retval = pciehp_readl(ctrl, LNKCAP, &lnk_cap); if (retval) { - err("%s: Cannot read LNKCAP register\n", __func__); + ctrl_err(ctrl, "%s: Cannot read LNKCAP register\n", __func__); return retval; } @@ -871,7 +884,7 @@ static int hpc_get_max_lnk_width(struct slot *slot, } *value = lnk_wdth; - dbg("Max link width = %d\n", lnk_wdth); + ctrl_dbg(ctrl, "Max link width = %d\n", lnk_wdth); return retval; } @@ -885,7 +898,8 @@ static int hpc_get_cur_lnk_speed(struct slot *slot, enum pci_bus_speed *value) retval = pciehp_readw(ctrl, LNKSTATUS, &lnk_status); if (retval) { - err("%s: Cannot read LNKSTATUS register\n", __func__); + ctrl_err(ctrl, "%s: Cannot read LNKSTATUS register\n", + __func__); return retval; } @@ -899,7 +913,7 @@ static int hpc_get_cur_lnk_speed(struct slot *slot, enum pci_bus_speed *value) } *value = lnk_speed; - dbg("Current link speed = %d\n", lnk_speed); + ctrl_dbg(ctrl, "Current link speed = %d\n", lnk_speed); return retval; } @@ -914,7 +928,8 @@ static int hpc_get_cur_lnk_width(struct slot *slot, retval = pciehp_readw(ctrl, LNKSTATUS, &lnk_status); if (retval) { - err("%s: Cannot read LNKSTATUS register\n", __func__); + ctrl_err(ctrl, "%s: Cannot read LNKSTATUS register\n", + __func__); return retval; } @@ -949,7 +964,7 @@ static int hpc_get_cur_lnk_width(struct slot *slot, } *value = lnk_wdth; - dbg("Current link width = %d\n", lnk_wdth); + ctrl_dbg(ctrl, "Current link width = %d\n", lnk_wdth); return retval; } @@ -998,7 +1013,8 @@ int pcie_enable_notification(struct controller *ctrl) PWR_FAULT_DETECT_ENABLE | HP_INTR_ENABLE | CMD_CMPL_INTR_ENABLE; if (pcie_write_cmd(ctrl, cmd, mask)) { - err("%s: Cannot enable software notification\n", __func__); + ctrl_err(ctrl, "%s: Cannot enable software notification\n", + __func__); return -1; } return 0; @@ -1010,7 +1026,8 @@ static void pcie_disable_notification(struct controller *ctrl) mask = PRSN_DETECT_ENABLE | ATTN_BUTTN_ENABLE | MRL_DETECT_ENABLE | PWR_FAULT_DETECT_ENABLE | HP_INTR_ENABLE | CMD_CMPL_INTR_ENABLE; if (pcie_write_cmd(ctrl, 0, mask)) - warn("%s: Cannot disable software notification\n", __func__); + ctrl_warn(ctrl, "%s: Cannot disable software notification\n", + __func__); } static int pcie_init_notification(struct controller *ctrl) @@ -1071,34 +1088,45 @@ static inline void dbg_ctrl(struct controller *ctrl) if (!pciehp_debug) return; - dbg("Hotplug Controller:\n"); - dbg(" Seg/Bus/Dev/Func/IRQ : %s IRQ %d\n", pci_name(pdev), pdev->irq); - dbg(" Vendor ID : 0x%04x\n", pdev->vendor); - dbg(" Device ID : 0x%04x\n", pdev->device); - dbg(" Subsystem ID : 0x%04x\n", pdev->subsystem_device); - dbg(" Subsystem Vendor ID : 0x%04x\n", pdev->subsystem_vendor); - dbg(" PCIe Cap offset : 0x%02x\n", ctrl->cap_base); + ctrl_info(ctrl, "Hotplug Controller:\n"); + ctrl_info(ctrl, " Seg/Bus/Dev/Func/IRQ : %s IRQ %d\n", + pci_name(pdev), pdev->irq); + ctrl_info(ctrl, " Vendor ID : 0x%04x\n", pdev->vendor); + ctrl_info(ctrl, " Device ID : 0x%04x\n", pdev->device); + ctrl_info(ctrl, " Subsystem ID : 0x%04x\n", + pdev->subsystem_device); + ctrl_info(ctrl, " Subsystem Vendor ID : 0x%04x\n", + pdev->subsystem_vendor); + ctrl_info(ctrl, " PCIe Cap offset : 0x%02x\n", ctrl->cap_base); for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { if (!pci_resource_len(pdev, i)) continue; - dbg(" PCI resource [%d] : 0x%llx@0x%llx\n", i, - (unsigned long long)pci_resource_len(pdev, i), - (unsigned long long)pci_resource_start(pdev, i)); + ctrl_info(ctrl, " PCI resource [%d] : 0x%llx@0x%llx\n", + i, (unsigned long long)pci_resource_len(pdev, i), + (unsigned long long)pci_resource_start(pdev, i)); } - dbg("Slot Capabilities : 0x%08x\n", ctrl->slot_cap); - dbg(" Physical Slot Number : %d\n", ctrl->first_slot); - dbg(" Attention Button : %3s\n", ATTN_BUTTN(ctrl) ? "yes" : "no"); - dbg(" Power Controller : %3s\n", POWER_CTRL(ctrl) ? "yes" : "no"); - dbg(" MRL Sensor : %3s\n", MRL_SENS(ctrl) ? "yes" : "no"); - dbg(" Attention Indicator : %3s\n", ATTN_LED(ctrl) ? "yes" : "no"); - dbg(" Power Indicator : %3s\n", PWR_LED(ctrl) ? "yes" : "no"); - dbg(" Hot-Plug Surprise : %3s\n", HP_SUPR_RM(ctrl) ? "yes" : "no"); - dbg(" EMI Present : %3s\n", EMI(ctrl) ? "yes" : "no"); - dbg(" Command Completed : %3s\n", NO_CMD_CMPL(ctrl)? "no" : "yes"); + ctrl_info(ctrl, "Slot Capabilities : 0x%08x\n", ctrl->slot_cap); + ctrl_info(ctrl, " Physical Slot Number : %d\n", ctrl->first_slot); + ctrl_info(ctrl, " Attention Button : %3s\n", + ATTN_BUTTN(ctrl) ? "yes" : "no"); + ctrl_info(ctrl, " Power Controller : %3s\n", + POWER_CTRL(ctrl) ? "yes" : "no"); + ctrl_info(ctrl, " MRL Sensor : %3s\n", + MRL_SENS(ctrl) ? "yes" : "no"); + ctrl_info(ctrl, " Attention Indicator : %3s\n", + ATTN_LED(ctrl) ? "yes" : "no"); + ctrl_info(ctrl, " Power Indicator : %3s\n", + PWR_LED(ctrl) ? "yes" : "no"); + ctrl_info(ctrl, " Hot-Plug Surprise : %3s\n", + HP_SUPR_RM(ctrl) ? "yes" : "no"); + ctrl_info(ctrl, " EMI Present : %3s\n", + EMI(ctrl) ? "yes" : "no"); + ctrl_info(ctrl, " Command Completed : %3s\n", + NO_CMD_CMPL(ctrl) ? "no" : "yes"); pciehp_readw(ctrl, SLOTSTATUS, ®16); - dbg("Slot Status : 0x%04x\n", reg16); + ctrl_info(ctrl, "Slot Status : 0x%04x\n", reg16); pciehp_readw(ctrl, SLOTCTRL, ®16); - dbg("Slot Control : 0x%04x\n", reg16); + ctrl_info(ctrl, "Slot Control : 0x%04x\n", reg16); } struct controller *pcie_init(struct pcie_device *dev) @@ -1109,19 +1137,21 @@ struct controller *pcie_init(struct pcie_device *dev) ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); if (!ctrl) { - err("%s : out of memory\n", __func__); + dev_err(&dev->device, "%s : out of memory\n", __func__); goto abort; } INIT_LIST_HEAD(&ctrl->slot_list); + ctrl->pcie = dev; ctrl->pci_dev = pdev; ctrl->cap_base = pci_find_capability(pdev, PCI_CAP_ID_EXP); if (!ctrl->cap_base) { - err("%s: Cannot find PCI Express capability\n", __func__); + ctrl_err(ctrl, "%s: Cannot find PCI Express capability\n", + __func__); goto abort; } if (pciehp_readl(ctrl, SLOTCAP, &slot_cap)) { - err("%s: Cannot read SLOTCAP register\n", __func__); + ctrl_err(ctrl, "%s: Cannot read SLOTCAP register\n", __func__); goto abort; } @@ -1161,9 +1191,9 @@ struct controller *pcie_init(struct pcie_device *dev) goto abort_ctrl; } - info("HPC vendor_id %x device_id %x ss_vid %x ss_did %x\n", - pdev->vendor, pdev->device, - pdev->subsystem_vendor, pdev->subsystem_device); + ctrl_info(ctrl, "HPC vendor_id %x device_id %x ss_vid %x ss_did %x\n", + pdev->vendor, pdev->device, pdev->subsystem_vendor, + pdev->subsystem_device); if (pcie_init_slot(ctrl)) goto abort_ctrl; diff --git a/drivers/pci/hotplug/pciehp_pci.c b/drivers/pci/hotplug/pciehp_pci.c index 6040dcc..ffd1114 100644 --- a/drivers/pci/hotplug/pciehp_pci.c +++ b/drivers/pci/hotplug/pciehp_pci.c @@ -198,18 +198,20 @@ int pciehp_configure_device(struct slot *p_slot) struct pci_dev *dev; struct pci_bus *parent = p_slot->ctrl->pci_dev->subordinate; int num, fn; + struct controller *ctrl = p_slot->ctrl; dev = pci_get_slot(parent, PCI_DEVFN(p_slot->device, 0)); if (dev) { - err("Device %s already exists at %x:%x, cannot hot-add\n", - pci_name(dev), p_slot->bus, p_slot->device); + ctrl_err(ctrl, + "Device %s already exists at %x:%x, cannot hot-add\n", + pci_name(dev), p_slot->bus, p_slot->device); pci_dev_put(dev); return -EINVAL; } num = pci_scan_slot(parent, PCI_DEVFN(p_slot->device, 0)); if (num == 0) { - err("No new device found\n"); + ctrl_err(ctrl, "No new device found\n"); return -ENODEV; } @@ -218,8 +220,8 @@ int pciehp_configure_device(struct slot *p_slot) if (!dev) continue; if ((dev->class >> 16) == PCI_BASE_CLASS_DISPLAY) { - err("Cannot hot-add display device %s\n", - pci_name(dev)); + ctrl_err(ctrl, "Cannot hot-add display device %s\n", + pci_name(dev)); pci_dev_put(dev); continue; } @@ -244,9 +246,10 @@ int pciehp_unconfigure_device(struct slot *p_slot) u8 presence = 0; struct pci_bus *parent = p_slot->ctrl->pci_dev->subordinate; u16 command; + struct controller *ctrl = p_slot->ctrl; - dbg("%s: bus/dev = %x/%x\n", __func__, p_slot->bus, - p_slot->device); + ctrl_dbg(ctrl, "%s: bus/dev = %x/%x\n", __func__, + p_slot->bus, p_slot->device); ret = p_slot->hpc_ops->get_adapter_status(p_slot, &presence); if (ret) presence = 0; @@ -257,16 +260,17 @@ int pciehp_unconfigure_device(struct slot *p_slot) if (!temp) continue; if ((temp->class >> 16) == PCI_BASE_CLASS_DISPLAY) { - err("Cannot remove display device %s\n", - pci_name(temp)); + ctrl_err(ctrl, "Cannot remove display device %s\n", + pci_name(temp)); pci_dev_put(temp); continue; } if (temp->hdr_type == PCI_HEADER_TYPE_BRIDGE && presence) { pci_read_config_byte(temp, PCI_BRIDGE_CONTROL, &bctl); if (bctl & PCI_BRIDGE_CTL_VGA) { - err("Cannot remove display device %s\n", - pci_name(temp)); + ctrl_err(ctrl, + "Cannot remove display device %s\n", + pci_name(temp)); pci_dev_put(temp); continue; } diff --git a/drivers/pci/hotplug/rpaphp.h b/drivers/pci/hotplug/rpaphp.h index 7d5921b..419919a 100644 --- a/drivers/pci/hotplug/rpaphp.h +++ b/drivers/pci/hotplug/rpaphp.h @@ -46,10 +46,10 @@ #define PRESENT 1 /* Card in slot */ #define MY_NAME "rpaphp" -extern int debug; +extern int rpaphp_debug; #define dbg(format, arg...) \ do { \ - if (debug) \ + if (rpaphp_debug) \ printk(KERN_DEBUG "%s: " format, \ MY_NAME , ## arg); \ } while (0) diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c index 1f84f40..95d02a0 100644 --- a/drivers/pci/hotplug/rpaphp_core.c +++ b/drivers/pci/hotplug/rpaphp_core.c @@ -37,7 +37,7 @@ /* and pci_do_scan_bus */ #include "rpaphp.h" -int debug; +int rpaphp_debug; LIST_HEAD(rpaphp_slot_head); #define DRIVER_VERSION "0.1" @@ -50,7 +50,7 @@ MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); -module_param(debug, bool, 0644); +module_param_named(debug, rpaphp_debug, bool, 0644); /** * set_attention_status - set attention LED diff --git a/drivers/pci/hotplug/rpaphp_pci.c b/drivers/pci/hotplug/rpaphp_pci.c index 5acfd4f..513e1e2 100644 --- a/drivers/pci/hotplug/rpaphp_pci.c +++ b/drivers/pci/hotplug/rpaphp_pci.c @@ -123,7 +123,7 @@ int rpaphp_enable_slot(struct slot *slot) slot->state = CONFIGURED; } - if (debug) { + if (rpaphp_debug) { struct pci_dev *dev; dbg("%s: pci_devs of slot[%s]\n", __func__, slot->dn->full_name); list_for_each_entry (dev, &bus->devices, bus_list) diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c index 279c940..bf7d6ce 100644 --- a/drivers/pci/htirq.c +++ b/drivers/pci/htirq.c @@ -126,7 +126,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) cfg->msg.address_hi = 0xffffffff; irq = create_irq(); - if (irq < 0) { + + if (irq <= 0) { kfree(cfg); return -EBUSY; } diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index 738d4c8..2de5a32 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -1,3 +1,4 @@ +#include <linux/interrupt.h> #include <linux/dmar.h> #include <linux/spinlock.h> #include <linux/jiffies.h> @@ -11,41 +12,64 @@ static struct ioapic_scope ir_ioapic[MAX_IO_APICS]; static int ir_ioapic_num; int intr_remapping_enabled; -static struct { +struct irq_2_iommu { struct intel_iommu *iommu; u16 irte_index; u16 sub_handle; u8 irte_mask; -} irq_2_iommu[NR_IRQS]; +}; + +static struct irq_2_iommu irq_2_iommuX[NR_IRQS]; + +static struct irq_2_iommu *irq_2_iommu(unsigned int irq) +{ + return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL; +} + +static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) +{ + return irq_2_iommu(irq); +} static DEFINE_SPINLOCK(irq_2_ir_lock); -int irq_remapped(int irq) +static struct irq_2_iommu *valid_irq_2_iommu(unsigned int irq) { - if (irq > NR_IRQS) - return 0; + struct irq_2_iommu *irq_iommu; + + irq_iommu = irq_2_iommu(irq); + + if (!irq_iommu) + return NULL; + + if (!irq_iommu->iommu) + return NULL; - if (!irq_2_iommu[irq].iommu) - return 0; + return irq_iommu; +} - return 1; +int irq_remapped(int irq) +{ + return valid_irq_2_iommu(irq) != NULL; } int get_irte(int irq, struct irte *entry) { int index; + struct irq_2_iommu *irq_iommu; - if (!entry || irq > NR_IRQS) + if (!entry) return -1; spin_lock(&irq_2_ir_lock); - if (!irq_2_iommu[irq].iommu) { + irq_iommu = valid_irq_2_iommu(irq); + if (!irq_iommu) { spin_unlock(&irq_2_ir_lock); return -1; } - index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle; - *entry = *(irq_2_iommu[irq].iommu->ir_table->base + index); + index = irq_iommu->irte_index + irq_iommu->sub_handle; + *entry = *(irq_iommu->iommu->ir_table->base + index); spin_unlock(&irq_2_ir_lock); return 0; @@ -54,6 +78,7 @@ int get_irte(int irq, struct irte *entry) int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) { struct ir_table *table = iommu->ir_table; + struct irq_2_iommu *irq_iommu; u16 index, start_index; unsigned int mask = 0; int i; @@ -61,6 +86,10 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) if (!count) return -1; + /* protect irq_2_iommu_alloc later */ + if (irq >= nr_irqs) + return -1; + /* * start the IRTE search from index 0. */ @@ -100,10 +129,11 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) for (i = index; i < index + count; i++) table->base[i].present = 1; - irq_2_iommu[irq].iommu = iommu; - irq_2_iommu[irq].irte_index = index; - irq_2_iommu[irq].sub_handle = 0; - irq_2_iommu[irq].irte_mask = mask; + irq_iommu = irq_2_iommu_alloc(irq); + irq_iommu->iommu = iommu; + irq_iommu->irte_index = index; + irq_iommu->sub_handle = 0; + irq_iommu->irte_mask = mask; spin_unlock(&irq_2_ir_lock); @@ -124,31 +154,33 @@ static void qi_flush_iec(struct intel_iommu *iommu, int index, int mask) int map_irq_to_irte_handle(int irq, u16 *sub_handle) { int index; + struct irq_2_iommu *irq_iommu; spin_lock(&irq_2_ir_lock); - if (irq >= NR_IRQS || !irq_2_iommu[irq].iommu) { + irq_iommu = valid_irq_2_iommu(irq); + if (!irq_iommu) { spin_unlock(&irq_2_ir_lock); return -1; } - *sub_handle = irq_2_iommu[irq].sub_handle; - index = irq_2_iommu[irq].irte_index; + *sub_handle = irq_iommu->sub_handle; + index = irq_iommu->irte_index; spin_unlock(&irq_2_ir_lock); return index; } int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle) { + struct irq_2_iommu *irq_iommu; + spin_lock(&irq_2_ir_lock); - if (irq >= NR_IRQS || irq_2_iommu[irq].iommu) { - spin_unlock(&irq_2_ir_lock); - return -1; - } - irq_2_iommu[irq].iommu = iommu; - irq_2_iommu[irq].irte_index = index; - irq_2_iommu[irq].sub_handle = subhandle; - irq_2_iommu[irq].irte_mask = 0; + irq_iommu = irq_2_iommu_alloc(irq); + + irq_iommu->iommu = iommu; + irq_iommu->irte_index = index; + irq_iommu->sub_handle = subhandle; + irq_iommu->irte_mask = 0; spin_unlock(&irq_2_ir_lock); @@ -157,16 +189,19 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle) int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index) { + struct irq_2_iommu *irq_iommu; + spin_lock(&irq_2_ir_lock); - if (irq >= NR_IRQS || !irq_2_iommu[irq].iommu) { + irq_iommu = valid_irq_2_iommu(irq); + if (!irq_iommu) { spin_unlock(&irq_2_ir_lock); return -1; } - irq_2_iommu[irq].iommu = NULL; - irq_2_iommu[irq].irte_index = 0; - irq_2_iommu[irq].sub_handle = 0; - irq_2_iommu[irq].irte_mask = 0; + irq_iommu->iommu = NULL; + irq_iommu->irte_index = 0; + irq_iommu->sub_handle = 0; + irq_2_iommu(irq)->irte_mask = 0; spin_unlock(&irq_2_ir_lock); @@ -178,16 +213,18 @@ int modify_irte(int irq, struct irte *irte_modified) int index; struct irte *irte; struct intel_iommu *iommu; + struct irq_2_iommu *irq_iommu; spin_lock(&irq_2_ir_lock); - if (irq >= NR_IRQS || !irq_2_iommu[irq].iommu) { + irq_iommu = valid_irq_2_iommu(irq); + if (!irq_iommu) { spin_unlock(&irq_2_ir_lock); return -1; } - iommu = irq_2_iommu[irq].iommu; + iommu = irq_iommu->iommu; - index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle; + index = irq_iommu->irte_index + irq_iommu->sub_handle; irte = &iommu->ir_table->base[index]; set_64bit((unsigned long *)irte, irte_modified->low | (1 << 1)); @@ -203,18 +240,20 @@ int flush_irte(int irq) { int index; struct intel_iommu *iommu; + struct irq_2_iommu *irq_iommu; spin_lock(&irq_2_ir_lock); - if (irq >= NR_IRQS || !irq_2_iommu[irq].iommu) { + irq_iommu = valid_irq_2_iommu(irq); + if (!irq_iommu) { spin_unlock(&irq_2_ir_lock); return -1; } - iommu = irq_2_iommu[irq].iommu; + iommu = irq_iommu->iommu; - index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle; + index = irq_iommu->irte_index + irq_iommu->sub_handle; - qi_flush_iec(iommu, index, irq_2_iommu[irq].irte_mask); + qi_flush_iec(iommu, index, irq_iommu->irte_mask); spin_unlock(&irq_2_ir_lock); return 0; @@ -246,28 +285,30 @@ int free_irte(int irq) int index, i; struct irte *irte; struct intel_iommu *iommu; + struct irq_2_iommu *irq_iommu; spin_lock(&irq_2_ir_lock); - if (irq >= NR_IRQS || !irq_2_iommu[irq].iommu) { + irq_iommu = valid_irq_2_iommu(irq); + if (!irq_iommu) { spin_unlock(&irq_2_ir_lock); return -1; } - iommu = irq_2_iommu[irq].iommu; + iommu = irq_iommu->iommu; - index = irq_2_iommu[irq].irte_index + irq_2_iommu[irq].sub_handle; + index = irq_iommu->irte_index + irq_iommu->sub_handle; irte = &iommu->ir_table->base[index]; - if (!irq_2_iommu[irq].sub_handle) { - for (i = 0; i < (1 << irq_2_iommu[irq].irte_mask); i++) + if (!irq_iommu->sub_handle) { + for (i = 0; i < (1 << irq_iommu->irte_mask); i++) set_64bit((unsigned long *)irte, 0); - qi_flush_iec(iommu, index, irq_2_iommu[irq].irte_mask); + qi_flush_iec(iommu, index, irq_iommu->irte_mask); } - irq_2_iommu[irq].iommu = NULL; - irq_2_iommu[irq].irte_index = 0; - irq_2_iommu[irq].sub_handle = 0; - irq_2_iommu[irq].irte_mask = 0; + irq_iommu->iommu = NULL; + irq_iommu->irte_index = 0; + irq_iommu->sub_handle = 0; + irq_iommu->irte_mask = 0; spin_unlock(&irq_2_ir_lock); diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 4a10b56..d281201 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -378,23 +378,21 @@ static int msi_capability_init(struct pci_dev *dev) entry->msi_attrib.masked = 1; entry->msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ entry->msi_attrib.pos = pos; - if (is_mask_bit_support(control)) { + if (entry->msi_attrib.maskbit) { entry->mask_base = (void __iomem *)(long)msi_mask_bits_reg(pos, - is_64bit_address(control)); + entry->msi_attrib.is_64); } entry->dev = dev; if (entry->msi_attrib.maskbit) { unsigned int maskbits, temp; /* All MSIs are unmasked by default, Mask them all */ pci_read_config_dword(dev, - msi_mask_bits_reg(pos, is_64bit_address(control)), + msi_mask_bits_reg(pos, entry->msi_attrib.is_64), &maskbits); temp = (1 << multi_msi_capable(control)); temp = ((temp - 1) & ~temp); maskbits |= temp; - pci_write_config_dword(dev, - msi_mask_bits_reg(pos, is_64bit_address(control)), - maskbits); + pci_write_config_dword(dev, entry->msi_attrib.is_64, maskbits); entry->msi_attrib.maskbits_mask = temp; } list_add_tail(&entry->list, &dev->msi_list); diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index a13f534..b4cdd69 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -43,18 +43,32 @@ store_new_id(struct device_driver *driver, const char *buf, size_t count) { struct pci_dynid *dynid; struct pci_driver *pdrv = to_pci_driver(driver); + const struct pci_device_id *ids = pdrv->id_table; __u32 vendor, device, subvendor=PCI_ANY_ID, subdevice=PCI_ANY_ID, class=0, class_mask=0; unsigned long driver_data=0; int fields=0; - int retval = 0; + int retval; - fields = sscanf(buf, "%x %x %x %x %x %x %lux", + fields = sscanf(buf, "%x %x %x %x %x %x %lx", &vendor, &device, &subvendor, &subdevice, &class, &class_mask, &driver_data); if (fields < 2) return -EINVAL; + /* Only accept driver_data values that match an existing id_table + entry */ + retval = -EINVAL; + while (ids->vendor || ids->subvendor || ids->class_mask) { + if (driver_data == ids->driver_data) { + retval = 0; + break; + } + ids++; + } + if (retval) /* No match */ + return retval; + dynid = kzalloc(sizeof(*dynid), GFP_KERNEL); if (!dynid) return -ENOMEM; @@ -65,8 +79,7 @@ store_new_id(struct device_driver *driver, const char *buf, size_t count) dynid->id.subdevice = subdevice; dynid->id.class = class; dynid->id.class_mask = class_mask; - dynid->id.driver_data = pdrv->dynids.use_driver_data ? - driver_data : 0UL; + dynid->id.driver_data = driver_data; spin_lock(&pdrv->dynids.lock); list_add_tail(&dynid->node, &pdrv->dynids.list); diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 77baff0..110022d 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -423,7 +423,7 @@ pci_write_vpd(struct kobject *kobj, struct bin_attribute *bin_attr, * Reads 1, 2, or 4 bytes from legacy I/O port space using an arch specific * callback routine (pci_legacy_read). */ -ssize_t +static ssize_t pci_read_legacy_io(struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -448,7 +448,7 @@ pci_read_legacy_io(struct kobject *kobj, struct bin_attribute *bin_attr, * Writes 1, 2, or 4 bytes from legacy I/O port space using an arch specific * callback routine (pci_legacy_write). */ -ssize_t +static ssize_t pci_write_legacy_io(struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -468,11 +468,11 @@ pci_write_legacy_io(struct kobject *kobj, struct bin_attribute *bin_attr, * @attr: struct bin_attribute for this file * @vma: struct vm_area_struct passed to mmap * - * Uses an arch specific callback, pci_mmap_legacy_page_range, to mmap + * Uses an arch specific callback, pci_mmap_legacy_mem_page_range, to mmap * legacy memory space (first meg of bus space) into application virtual * memory space. */ -int +static int pci_mmap_legacy_mem(struct kobject *kobj, struct bin_attribute *attr, struct vm_area_struct *vma) { @@ -480,7 +480,90 @@ pci_mmap_legacy_mem(struct kobject *kobj, struct bin_attribute *attr, struct device, kobj)); - return pci_mmap_legacy_page_range(bus, vma); + return pci_mmap_legacy_page_range(bus, vma, pci_mmap_mem); +} + +/** + * pci_mmap_legacy_io - map legacy PCI IO into user memory space + * @kobj: kobject corresponding to device to be mapped + * @attr: struct bin_attribute for this file + * @vma: struct vm_area_struct passed to mmap + * + * Uses an arch specific callback, pci_mmap_legacy_io_page_range, to mmap + * legacy IO space (first meg of bus space) into application virtual + * memory space. Returns -ENOSYS if the operation isn't supported + */ +static int +pci_mmap_legacy_io(struct kobject *kobj, struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + struct pci_bus *bus = to_pci_bus(container_of(kobj, + struct device, + kobj)); + + return pci_mmap_legacy_page_range(bus, vma, pci_mmap_io); +} + +/** + * pci_create_legacy_files - create legacy I/O port and memory files + * @b: bus to create files under + * + * Some platforms allow access to legacy I/O port and ISA memory space on + * a per-bus basis. This routine creates the files and ties them into + * their associated read, write and mmap files from pci-sysfs.c + * + * On error unwind, but don't propogate the error to the caller + * as it is ok to set up the PCI bus without these files. + */ +void pci_create_legacy_files(struct pci_bus *b) +{ + int error; + + b->legacy_io = kzalloc(sizeof(struct bin_attribute) * 2, + GFP_ATOMIC); + if (!b->legacy_io) + goto kzalloc_err; + + b->legacy_io->attr.name = "legacy_io"; + b->legacy_io->size = 0xffff; + b->legacy_io->attr.mode = S_IRUSR | S_IWUSR; + b->legacy_io->read = pci_read_legacy_io; + b->legacy_io->write = pci_write_legacy_io; + b->legacy_io->mmap = pci_mmap_legacy_io; + error = device_create_bin_file(&b->dev, b->legacy_io); + if (error) + goto legacy_io_err; + + /* Allocated above after the legacy_io struct */ + b->legacy_mem = b->legacy_io + 1; + b->legacy_mem->attr.name = "legacy_mem"; + b->legacy_mem->size = 1024*1024; + b->legacy_mem->attr.mode = S_IRUSR | S_IWUSR; + b->legacy_mem->mmap = pci_mmap_legacy_mem; + error = device_create_bin_file(&b->dev, b->legacy_mem); + if (error) + goto legacy_mem_err; + + return; + +legacy_mem_err: + device_remove_bin_file(&b->dev, b->legacy_io); +legacy_io_err: + kfree(b->legacy_io); + b->legacy_io = NULL; +kzalloc_err: + printk(KERN_WARNING "pci: warning: could not create legacy I/O port " + "and ISA memory resources to sysfs\n"); + return; +} + +void pci_remove_legacy_files(struct pci_bus *b) +{ + if (b->legacy_io) { + device_remove_bin_file(&b->dev, b->legacy_io); + device_remove_bin_file(&b->dev, b->legacy_mem); + kfree(b->legacy_io); /* both are allocated here */ + } } #endif /* HAVE_PCI_LEGACY */ @@ -715,7 +798,7 @@ static struct bin_attribute pci_config_attr = { .name = "config", .mode = S_IRUGO | S_IWUSR, }, - .size = 256, + .size = PCI_CFG_SPACE_SIZE, .read = pci_read_config, .write = pci_write_config, }; @@ -725,7 +808,7 @@ static struct bin_attribute pcie_config_attr = { .name = "config", .mode = S_IRUGO | S_IWUSR, }, - .size = 4096, + .size = PCI_CFG_SPACE_EXP_SIZE, .read = pci_read_config, .write = pci_write_config, }; @@ -735,86 +818,103 @@ int __attribute__ ((weak)) pcibios_add_platform_entries(struct pci_dev *dev) return 0; } +static int pci_create_capabilities_sysfs(struct pci_dev *dev) +{ + int retval; + struct bin_attribute *attr; + + /* If the device has VPD, try to expose it in sysfs. */ + if (dev->vpd) { + attr = kzalloc(sizeof(*attr), GFP_ATOMIC); + if (!attr) + return -ENOMEM; + + attr->size = dev->vpd->len; + attr->attr.name = "vpd"; + attr->attr.mode = S_IRUSR | S_IWUSR; + attr->read = pci_read_vpd; + attr->write = pci_write_vpd; + retval = sysfs_create_bin_file(&dev->dev.kobj, attr); + if (retval) { + kfree(dev->vpd->attr); + return retval; + } + dev->vpd->attr = attr; + } + + /* Active State Power Management */ + pcie_aspm_create_sysfs_dev_files(dev); + + return 0; +} + int __must_check pci_create_sysfs_dev_files (struct pci_dev *pdev) { - struct bin_attribute *attr = NULL; int retval; + int rom_size = 0; + struct bin_attribute *attr; if (!sysfs_initialized) return -EACCES; - if (pdev->cfg_size < 4096) + if (pdev->cfg_size < PCI_CFG_SPACE_EXP_SIZE) retval = sysfs_create_bin_file(&pdev->dev.kobj, &pci_config_attr); else retval = sysfs_create_bin_file(&pdev->dev.kobj, &pcie_config_attr); if (retval) goto err; - /* If the device has VPD, try to expose it in sysfs. */ - if (pdev->vpd) { - attr = kzalloc(sizeof(*attr), GFP_ATOMIC); - if (attr) { - pdev->vpd->attr = attr; - attr->size = pdev->vpd->len; - attr->attr.name = "vpd"; - attr->attr.mode = S_IRUSR | S_IWUSR; - attr->read = pci_read_vpd; - attr->write = pci_write_vpd; - retval = sysfs_create_bin_file(&pdev->dev.kobj, attr); - if (retval) - goto err_vpd; - } else { - retval = -ENOMEM; - goto err_config_file; - } - } - retval = pci_create_resource_files(pdev); if (retval) - goto err_vpd_file; + goto err_config_file; + + if (pci_resource_len(pdev, PCI_ROM_RESOURCE)) + rom_size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + else if (pdev->resource[PCI_ROM_RESOURCE].flags & IORESOURCE_ROM_SHADOW) + rom_size = 0x20000; /* If the device has a ROM, try to expose it in sysfs. */ - if (pci_resource_len(pdev, PCI_ROM_RESOURCE) || - (pdev->resource[PCI_ROM_RESOURCE].flags & IORESOURCE_ROM_SHADOW)) { + if (rom_size) { attr = kzalloc(sizeof(*attr), GFP_ATOMIC); - if (attr) { - pdev->rom_attr = attr; - attr->size = pci_resource_len(pdev, PCI_ROM_RESOURCE); - attr->attr.name = "rom"; - attr->attr.mode = S_IRUSR; - attr->read = pci_read_rom; - attr->write = pci_write_rom; - retval = sysfs_create_bin_file(&pdev->dev.kobj, attr); - if (retval) - goto err_rom; - } else { + if (!attr) { retval = -ENOMEM; goto err_resource_files; } + attr->size = rom_size; + attr->attr.name = "rom"; + attr->attr.mode = S_IRUSR; + attr->read = pci_read_rom; + attr->write = pci_write_rom; + retval = sysfs_create_bin_file(&pdev->dev.kobj, attr); + if (retval) { + kfree(attr); + goto err_resource_files; + } + pdev->rom_attr = attr; } + /* add platform-specific attributes */ - if (pcibios_add_platform_entries(pdev)) + retval = pcibios_add_platform_entries(pdev); + if (retval) goto err_rom_file; - pcie_aspm_create_sysfs_dev_files(pdev); + /* add sysfs entries for various capabilities */ + retval = pci_create_capabilities_sysfs(pdev); + if (retval) + goto err_rom_file; return 0; err_rom_file: - if (pci_resource_len(pdev, PCI_ROM_RESOURCE)) + if (rom_size) { sysfs_remove_bin_file(&pdev->dev.kobj, pdev->rom_attr); -err_rom: - kfree(pdev->rom_attr); + kfree(pdev->rom_attr); + pdev->rom_attr = NULL; + } err_resource_files: pci_remove_resource_files(pdev); -err_vpd_file: - if (pdev->vpd) { - sysfs_remove_bin_file(&pdev->dev.kobj, pdev->vpd->attr); -err_vpd: - kfree(pdev->vpd->attr); - } err_config_file: - if (pdev->cfg_size < 4096) + if (pdev->cfg_size < PCI_CFG_SPACE_EXP_SIZE) sysfs_remove_bin_file(&pdev->dev.kobj, &pci_config_attr); else sysfs_remove_bin_file(&pdev->dev.kobj, &pcie_config_attr); @@ -822,6 +922,16 @@ err: return retval; } +static void pci_remove_capabilities_sysfs(struct pci_dev *dev) +{ + if (dev->vpd && dev->vpd->attr) { + sysfs_remove_bin_file(&dev->dev.kobj, dev->vpd->attr); + kfree(dev->vpd->attr); + } + + pcie_aspm_remove_sysfs_dev_files(dev); +} + /** * pci_remove_sysfs_dev_files - cleanup PCI specific sysfs files * @pdev: device whose entries we should free @@ -830,27 +940,28 @@ err: */ void pci_remove_sysfs_dev_files(struct pci_dev *pdev) { + int rom_size = 0; + if (!sysfs_initialized) return; - pcie_aspm_remove_sysfs_dev_files(pdev); + pci_remove_capabilities_sysfs(pdev); - if (pdev->vpd) { - sysfs_remove_bin_file(&pdev->dev.kobj, pdev->vpd->attr); - kfree(pdev->vpd->attr); - } - if (pdev->cfg_size < 4096) + if (pdev->cfg_size < PCI_CFG_SPACE_EXP_SIZE) sysfs_remove_bin_file(&pdev->dev.kobj, &pci_config_attr); else sysfs_remove_bin_file(&pdev->dev.kobj, &pcie_config_attr); pci_remove_resource_files(pdev); - if (pci_resource_len(pdev, PCI_ROM_RESOURCE)) { - if (pdev->rom_attr) { - sysfs_remove_bin_file(&pdev->dev.kobj, pdev->rom_attr); - kfree(pdev->rom_attr); - } + if (pci_resource_len(pdev, PCI_ROM_RESOURCE)) + rom_size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + else if (pdev->resource[PCI_ROM_RESOURCE].flags & IORESOURCE_ROM_SHADOW) + rom_size = 0x20000; + + if (rom_size && pdev->rom_attr) { + sysfs_remove_bin_file(&pdev->dev.kobj, pdev->rom_attr); + kfree(pdev->rom_attr); } } diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index dbe9f39..4db261e 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -213,10 +213,13 @@ int pci_bus_find_capability(struct pci_bus *bus, unsigned int devfn, int cap) int pci_find_ext_capability(struct pci_dev *dev, int cap) { u32 header; - int ttl = 480; /* 3840 bytes, minimum 8 bytes per capability */ - int pos = 0x100; + int ttl; + int pos = PCI_CFG_SPACE_SIZE; - if (dev->cfg_size <= 256) + /* minimum 8 bytes per capability */ + ttl = (PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) / 8; + + if (dev->cfg_size <= PCI_CFG_SPACE_SIZE) return 0; if (pci_read_config_dword(dev, pos, &header) != PCIBIOS_SUCCESSFUL) @@ -234,7 +237,7 @@ int pci_find_ext_capability(struct pci_dev *dev, int cap) return pos; pos = PCI_EXT_CAP_NEXT(header); - if (pos < 0x100) + if (pos < PCI_CFG_SPACE_SIZE) break; if (pci_read_config_dword(dev, pos, &header) != PCIBIOS_SUCCESSFUL) @@ -1127,6 +1130,27 @@ int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable) } /** + * pci_wake_from_d3 - enable/disable device to wake up from D3_hot or D3_cold + * @dev: PCI device to prepare + * @enable: True to enable wake-up event generation; false to disable + * + * Many drivers want the device to wake up the system from D3_hot or D3_cold + * and this function allows them to set that up cleanly - pci_enable_wake() + * should not be called twice in a row to enable wake-up due to PCI PM vs ACPI + * ordering constraints. + * + * This function only returns error code if the device is not capable of + * generating PME# from both D3_hot and D3_cold, and the platform is unable to + * enable wake-up power for it. + */ +int pci_wake_from_d3(struct pci_dev *dev, bool enable) +{ + return pci_pme_capable(dev, PCI_D3cold) ? + pci_enable_wake(dev, PCI_D3cold, enable) : + pci_enable_wake(dev, PCI_D3hot, enable); +} + +/** * pci_target_state - find an appropriate low power state for a given PCI dev * @dev: PCI device * @@ -1242,25 +1266,25 @@ void pci_pm_init(struct pci_dev *dev) dev->d1_support = false; dev->d2_support = false; if (!pci_no_d1d2(dev)) { - if (pmc & PCI_PM_CAP_D1) { - dev_printk(KERN_DEBUG, &dev->dev, "supports D1\n"); + if (pmc & PCI_PM_CAP_D1) dev->d1_support = true; - } - if (pmc & PCI_PM_CAP_D2) { - dev_printk(KERN_DEBUG, &dev->dev, "supports D2\n"); + if (pmc & PCI_PM_CAP_D2) dev->d2_support = true; - } + + if (dev->d1_support || dev->d2_support) + dev_printk(KERN_DEBUG, &dev->dev, "supports%s%s\n", + dev->d1_support ? " D1" : "", + dev->d2_support ? " D2" : ""); } pmc &= PCI_PM_CAP_PME_MASK; if (pmc) { - dev_printk(KERN_INFO, &dev->dev, - "PME# supported from%s%s%s%s%s\n", - (pmc & PCI_PM_CAP_PME_D0) ? " D0" : "", - (pmc & PCI_PM_CAP_PME_D1) ? " D1" : "", - (pmc & PCI_PM_CAP_PME_D2) ? " D2" : "", - (pmc & PCI_PM_CAP_PME_D3) ? " D3hot" : "", - (pmc & PCI_PM_CAP_PME_D3cold) ? " D3cold" : ""); + dev_info(&dev->dev, "PME# supported from%s%s%s%s%s\n", + (pmc & PCI_PM_CAP_PME_D0) ? " D0" : "", + (pmc & PCI_PM_CAP_PME_D1) ? " D1" : "", + (pmc & PCI_PM_CAP_PME_D2) ? " D2" : "", + (pmc & PCI_PM_CAP_PME_D3) ? " D3hot" : "", + (pmc & PCI_PM_CAP_PME_D3cold) ? " D3cold" : ""); dev->pme_support = pmc >> PCI_PM_CAP_PME_SHIFT; /* * Make device's PM flags reflect the wake-up capability, but @@ -1275,6 +1299,38 @@ void pci_pm_init(struct pci_dev *dev) } } +/** + * pci_enable_ari - enable ARI forwarding if hardware support it + * @dev: the PCI device + */ +void pci_enable_ari(struct pci_dev *dev) +{ + int pos; + u32 cap; + u16 ctrl; + + if (!dev->is_pcie) + return; + + if (dev->pcie_type != PCI_EXP_TYPE_ROOT_PORT && + dev->pcie_type != PCI_EXP_TYPE_DOWNSTREAM) + return; + + pos = pci_find_capability(dev, PCI_CAP_ID_EXP); + if (!pos) + return; + + pci_read_config_dword(dev, pos + PCI_EXP_DEVCAP2, &cap); + if (!(cap & PCI_EXP_DEVCAP2_ARI)) + return; + + pci_read_config_word(dev, pos + PCI_EXP_DEVCTL2, &ctrl); + ctrl |= PCI_EXP_DEVCTL2_ARI; + pci_write_config_word(dev, pos + PCI_EXP_DEVCTL2, ctrl); + + dev->ari_enabled = 1; +} + int pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge) { @@ -1942,6 +1998,7 @@ EXPORT_SYMBOL(pci_restore_state); EXPORT_SYMBOL(pci_pme_capable); EXPORT_SYMBOL(pci_pme_active); EXPORT_SYMBOL(pci_enable_wake); +EXPORT_SYMBOL(pci_wake_from_d3); EXPORT_SYMBOL(pci_target_state); EXPORT_SYMBOL(pci_prepare_to_sleep); EXPORT_SYMBOL(pci_back_from_sleep); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index d807cd7..b205ab8 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -1,3 +1,9 @@ +#ifndef DRIVERS_PCI_H +#define DRIVERS_PCI_H + +#define PCI_CFG_SPACE_SIZE 256 +#define PCI_CFG_SPACE_EXP_SIZE 4096 + /* Functions internal to the PCI core code */ extern int pci_uevent(struct device *dev, struct kobj_uevent_env *env); @@ -76,7 +82,13 @@ static inline int pci_proc_detach_bus(struct pci_bus *bus) { return 0; } /* Functions for PCI Hotplug drivers to use */ extern unsigned int pci_do_scan_bus(struct pci_bus *bus); +#ifdef HAVE_PCI_LEGACY +extern void pci_create_legacy_files(struct pci_bus *bus); extern void pci_remove_legacy_files(struct pci_bus *bus); +#else +static inline void pci_create_legacy_files(struct pci_bus *bus) { return; } +static inline void pci_remove_legacy_files(struct pci_bus *bus) { return; } +#endif /* Lock for read/write access to pci device and bus lists */ extern struct rw_semaphore pci_bus_sem; @@ -109,6 +121,7 @@ static inline int pci_no_d1d2(struct pci_dev *dev) extern int pcie_mch_quirk; extern struct device_attribute pci_dev_attrs[]; extern struct device_attribute dev_attr_cpuaffinity; +extern struct device_attribute dev_attr_cpulistaffinity; /** * pci_match_one_device - Tell if a PCI device structure has a matching @@ -144,3 +157,16 @@ struct pci_slot_attribute { }; #define to_pci_slot_attr(s) container_of(s, struct pci_slot_attribute, attr) +extern void pci_enable_ari(struct pci_dev *dev); +/** + * pci_ari_enabled - query ARI forwarding status + * @dev: the PCI device + * + * Returns 1 if ARI forwarding is enabled, or 0 if not enabled; + */ +static inline int pci_ari_enabled(struct pci_dev *dev) +{ + return dev->ari_enabled; +} + +#endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c index 77036f4..e390707 100644 --- a/drivers/pci/pcie/aer/aerdrv.c +++ b/drivers/pci/pcie/aer/aerdrv.c @@ -105,7 +105,7 @@ static irqreturn_t aer_irq(int irq, void *context) unsigned long flags; int pos; - pos = pci_find_aer_capability(pdev->port); + pos = pci_find_ext_capability(pdev->port, PCI_EXT_CAP_ID_ERR); /* * Must lock access to Root Error Status Reg, Root Error ID Reg, * and Root error producer/consumer index @@ -252,7 +252,7 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) u32 status; int pos; - pos = pci_find_aer_capability(dev); + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); /* Disable Root's interrupt in response to error messages */ pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, 0); @@ -316,7 +316,7 @@ static void aer_error_resume(struct pci_dev *dev) pci_write_config_word(dev, pos + PCI_EXP_DEVSTA, reg16); /* Clean AER Root Error Status */ - pos = pci_find_aer_capability(dev); + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &mask); if (dev->error_state == pci_channel_io_normal) diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c index ee5e7b5..dfc63d0 100644 --- a/drivers/pci/pcie/aer/aerdrv_core.c +++ b/drivers/pci/pcie/aer/aerdrv_core.c @@ -28,41 +28,15 @@ static int forceload; module_param(forceload, bool, 0); -#define PCI_CFG_SPACE_SIZE (0x100) -int pci_find_aer_capability(struct pci_dev *dev) -{ - int pos; - u32 reg32 = 0; - - /* Check if it's a pci-express device */ - pos = pci_find_capability(dev, PCI_CAP_ID_EXP); - if (!pos) - return 0; - - /* Check if it supports pci-express AER */ - pos = PCI_CFG_SPACE_SIZE; - while (pos) { - if (pci_read_config_dword(dev, pos, ®32)) - return 0; - - /* some broken boards return ~0 */ - if (reg32 == 0xffffffff) - return 0; - - if (PCI_EXT_CAP_ID(reg32) == PCI_EXT_CAP_ID_ERR) - break; - - pos = reg32 >> 20; - } - - return pos; -} - int pci_enable_pcie_error_reporting(struct pci_dev *dev) { u16 reg16 = 0; int pos; + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); + if (!pos) + return -EIO; + pos = pci_find_capability(dev, PCI_CAP_ID_EXP); if (!pos) return -EIO; @@ -102,7 +76,7 @@ int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) int pos; u32 status, mask; - pos = pci_find_aer_capability(dev); + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); if (!pos) return -EIO; @@ -123,7 +97,7 @@ int pci_cleanup_aer_correct_error_status(struct pci_dev *dev) int pos; u32 status; - pos = pci_find_aer_capability(dev); + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); if (!pos) return -EIO; @@ -502,7 +476,7 @@ static void handle_error_source(struct pcie_device * aerdev, * Correctable error does not need software intevention. * No need to go through error recovery process. */ - pos = pci_find_aer_capability(dev); + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); if (pos) pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, info.status); @@ -542,7 +516,7 @@ void aer_enable_rootport(struct aer_rpc *rpc) reg16 &= ~(SYSTEM_ERROR_INTR_ON_MESG_MASK); pci_write_config_word(pdev, pos + PCI_EXP_RTCTL, reg16); - aer_pos = pci_find_aer_capability(pdev); + aer_pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR); /* Clear error status */ pci_read_config_dword(pdev, aer_pos + PCI_ERR_ROOT_STATUS, ®32); pci_write_config_dword(pdev, aer_pos + PCI_ERR_ROOT_STATUS, reg32); @@ -579,7 +553,7 @@ static void disable_root_aer(struct aer_rpc *rpc) u32 reg32; int pos; - pos = pci_find_aer_capability(pdev); + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR); /* Disable Root's interrupt in response to error messages */ pci_write_config_dword(pdev, pos + PCI_ERR_ROOT_COMMAND, 0); @@ -618,7 +592,7 @@ static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) { int pos; - pos = pci_find_aer_capability(dev); + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); /* The device might not support AER */ if (!pos) @@ -755,7 +729,6 @@ int aer_init(struct pcie_device *dev) return AER_SUCCESS; } -EXPORT_SYMBOL_GPL(pci_find_aer_capability); EXPORT_SYMBOL_GPL(pci_enable_pcie_error_reporting); EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting); EXPORT_SYMBOL_GPL(pci_cleanup_aer_uncorrect_error_status); diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 851f5b8..8f63f4c 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -528,9 +528,9 @@ static int pcie_aspm_sanity_check(struct pci_dev *pdev) pci_read_config_dword(child_dev, child_pos + PCI_EXP_DEVCAP, ®32); if (!(reg32 & PCI_EXP_DEVCAP_RBER) && !aspm_force) { - printk("Pre-1.1 PCIe device detected, " - "disable ASPM for %s. It can be enabled forcedly" - " with 'pcie_aspm=force'\n", pci_name(pdev)); + dev_printk(KERN_INFO, &child_dev->dev, "disabling ASPM" + " on pre-1.1 PCIe device. You can enable it" + " with 'pcie_aspm=force'\n"); return -EINVAL; } } diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index 3656e03..2529f3f 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -25,7 +25,6 @@ #define PCIE_CAPABILITIES_REG 0x2 #define PCIE_SLOT_CAPABILITIES_REG 0x14 #define PCIE_PORT_DEVICE_MAXSERVICES 4 -#define PCI_CFG_SPACE_SIZE 256 #define get_descriptor_id(type, service) (((type - 4) << 4) | service) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 890f0d2..2e091e0 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -195,24 +195,11 @@ static int get_port_device_capability(struct pci_dev *dev) /* PME Capable - root port capability */ if (((reg16 >> 4) & PORT_TYPE_MASK) == PCIE_RC_PORT) services |= PCIE_PORT_SERVICE_PME; - - pos = PCI_CFG_SPACE_SIZE; - while (pos) { - pci_read_config_dword(dev, pos, ®32); - switch (reg32 & 0xffff) { - case PCI_EXT_CAP_ID_ERR: - services |= PCIE_PORT_SERVICE_AER; - pos = reg32 >> 20; - break; - case PCI_EXT_CAP_ID_VC: - services |= PCIE_PORT_SERVICE_VC; - pos = reg32 >> 20; - break; - default: - pos = 0; - break; - } - } + + if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR)) + services |= PCIE_PORT_SERVICE_AER; + if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_VC)) + services |= PCIE_PORT_SERVICE_VC; return services; } diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 367c9c2..584422d 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -91,7 +91,7 @@ static int __devinit pcie_portdrv_probe (struct pci_dev *dev, pci_set_master(dev); if (!dev->irq && dev->pin) { - dev_warn(&dev->dev, "device [%04x/%04x] has invalid IRQ; " + dev_warn(&dev->dev, "device [%04x:%04x] has invalid IRQ; " "check vendor BIOS\n", dev->vendor, dev->device); } if (pcie_port_device_register(dev)) { diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index d3db8b2..aaaf0a1 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -14,8 +14,6 @@ #define CARDBUS_LATENCY_TIMER 176 /* secondary latency timer */ #define CARDBUS_RESERVE_BUSNR 3 -#define PCI_CFG_SPACE_SIZE 256 -#define PCI_CFG_SPACE_EXP_SIZE 4096 /* Ugh. Need to stop exporting this to modules. */ LIST_HEAD(pci_root_buses); @@ -44,72 +42,6 @@ int no_pci_devices(void) } EXPORT_SYMBOL(no_pci_devices); -#ifdef HAVE_PCI_LEGACY -/** - * pci_create_legacy_files - create legacy I/O port and memory files - * @b: bus to create files under - * - * Some platforms allow access to legacy I/O port and ISA memory space on - * a per-bus basis. This routine creates the files and ties them into - * their associated read, write and mmap files from pci-sysfs.c - * - * On error unwind, but don't propogate the error to the caller - * as it is ok to set up the PCI bus without these files. - */ -static void pci_create_legacy_files(struct pci_bus *b) -{ - int error; - - b->legacy_io = kzalloc(sizeof(struct bin_attribute) * 2, - GFP_ATOMIC); - if (!b->legacy_io) - goto kzalloc_err; - - b->legacy_io->attr.name = "legacy_io"; - b->legacy_io->size = 0xffff; - b->legacy_io->attr.mode = S_IRUSR | S_IWUSR; - b->legacy_io->read = pci_read_legacy_io; - b->legacy_io->write = pci_write_legacy_io; - error = device_create_bin_file(&b->dev, b->legacy_io); - if (error) - goto legacy_io_err; - - /* Allocated above after the legacy_io struct */ - b->legacy_mem = b->legacy_io + 1; - b->legacy_mem->attr.name = "legacy_mem"; - b->legacy_mem->size = 1024*1024; - b->legacy_mem->attr.mode = S_IRUSR | S_IWUSR; - b->legacy_mem->mmap = pci_mmap_legacy_mem; - error = device_create_bin_file(&b->dev, b->legacy_mem); - if (error) - goto legacy_mem_err; - - return; - -legacy_mem_err: - device_remove_bin_file(&b->dev, b->legacy_io); -legacy_io_err: - kfree(b->legacy_io); - b->legacy_io = NULL; -kzalloc_err: - printk(KERN_WARNING "pci: warning: could not create legacy I/O port " - "and ISA memory resources to sysfs\n"); - return; -} - -void pci_remove_legacy_files(struct pci_bus *b) -{ - if (b->legacy_io) { - device_remove_bin_file(&b->dev, b->legacy_io); - device_remove_bin_file(&b->dev, b->legacy_mem); - kfree(b->legacy_io); /* both are allocated here */ - } -} -#else /* !HAVE_PCI_LEGACY */ -static inline void pci_create_legacy_files(struct pci_bus *bus) { return; } -void pci_remove_legacy_files(struct pci_bus *bus) { return; } -#endif /* HAVE_PCI_LEGACY */ - /* * PCI Bus Class Devices */ @@ -219,7 +151,7 @@ static inline enum pci_bar_type decode_bar(struct resource *res, u32 bar) res->flags = bar & ~PCI_BASE_ADDRESS_MEM_MASK; - if (res->flags == PCI_BASE_ADDRESS_MEM_TYPE_64) + if (res->flags & PCI_BASE_ADDRESS_MEM_TYPE_64) return pci_bar_mem64; return pci_bar_mem32; } @@ -304,8 +236,8 @@ static int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, } else { res->start = l64; res->end = l64 + sz64; - printk(KERN_DEBUG "PCI: %s reg %x 64bit mmio: %pR\n", - pci_name(dev), pos, res); + dev_printk(KERN_DEBUG, &dev->dev, + "reg %x 64bit mmio: %pR\n", pos, res); } } else { sz = pci_size(l, sz, mask); @@ -315,10 +247,10 @@ static int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, res->start = l; res->end = l + sz; - printk(KERN_DEBUG "PCI: %s reg %x %s: %pR\n", - pci_name(dev), pos, - (res->flags & IORESOURCE_IO) ? "io port":"32bit mmio", - res); + + dev_printk(KERN_DEBUG, &dev->dev, "reg %x %s: %pR\n", pos, + (res->flags & IORESOURCE_IO) ? "io port" : "32bit mmio", + res); } out: @@ -389,8 +321,7 @@ void __devinit pci_read_bridge_bases(struct pci_bus *child) res->start = base; if (!res->end) res->end = limit + 0xfff; - printk(KERN_DEBUG "PCI: bridge %s io port: %pR\n", - pci_name(dev), res); + dev_printk(KERN_DEBUG, &dev->dev, "bridge io port: %pR\n", res); } res = child->resource[1]; @@ -402,8 +333,8 @@ void __devinit pci_read_bridge_bases(struct pci_bus *child) res->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM; res->start = base; res->end = limit + 0xfffff; - printk(KERN_DEBUG "PCI: bridge %s 32bit mmio: %pR\n", - pci_name(dev), res); + dev_printk(KERN_DEBUG, &dev->dev, "bridge 32bit mmio: %pR\n", + res); } res = child->resource[2]; @@ -439,9 +370,9 @@ void __devinit pci_read_bridge_bases(struct pci_bus *child) res->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM | IORESOURCE_PREFETCH; res->start = base; res->end = limit + 0xfffff; - printk(KERN_DEBUG "PCI: bridge %s %sbit mmio pref: %pR\n", - pci_name(dev), - (res->flags & PCI_PREF_RANGE_TYPE_64) ? "64":"32", res); + dev_printk(KERN_DEBUG, &dev->dev, "bridge %sbit mmio pref: %pR\n", + (res->flags & PCI_PREF_RANGE_TYPE_64) ? "64" : "32", + res); } } @@ -762,7 +693,7 @@ static int pci_setup_device(struct pci_dev * dev) dev->class = class; class >>= 8; - dev_dbg(&dev->dev, "found [%04x/%04x] class %06x header type %02x\n", + dev_dbg(&dev->dev, "found [%04x:%04x] class %06x header type %02x\n", dev->vendor, dev->device, class, dev->hdr_type); /* "Unknown power state" */ @@ -844,6 +775,11 @@ static int pci_setup_device(struct pci_dev * dev) return 0; } +static void pci_release_capabilities(struct pci_dev *dev) +{ + pci_vpd_release(dev); +} + /** * pci_release_dev - free a pci device structure when all users of it are finished. * @dev: device that's been disconnected @@ -856,7 +792,7 @@ static void pci_release_dev(struct device *dev) struct pci_dev *pci_dev; pci_dev = to_pci_dev(dev); - pci_vpd_release(pci_dev); + pci_release_capabilities(pci_dev); kfree(pci_dev); } @@ -887,8 +823,9 @@ static void set_pcie_port_type(struct pci_dev *pdev) int pci_cfg_space_size_ext(struct pci_dev *dev) { u32 status; + int pos = PCI_CFG_SPACE_SIZE; - if (pci_read_config_dword(dev, 256, &status) != PCIBIOS_SUCCESSFUL) + if (pci_read_config_dword(dev, pos, &status) != PCIBIOS_SUCCESSFUL) goto fail; if (status == 0xffffffff) goto fail; @@ -936,8 +873,6 @@ struct pci_dev *alloc_pci_dev(void) INIT_LIST_HEAD(&dev->bus_list); - pci_msi_init_pci_dev(dev); - return dev; } EXPORT_SYMBOL(alloc_pci_dev); @@ -949,6 +884,7 @@ EXPORT_SYMBOL(alloc_pci_dev); static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn) { struct pci_dev *dev; + struct pci_slot *slot; u32 l; u8 hdr_type; int delay = 1; @@ -997,6 +933,10 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn) dev->error_state = pci_channel_io_normal; set_pcie_port_type(dev); + list_for_each_entry(slot, &bus->slots, list) + if (PCI_SLOT(devfn) == slot->number) + dev->slot = slot; + /* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer) set this higher, assuming the system even supports it. */ dev->dma_mask = 0xffffffff; @@ -1005,9 +945,22 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn) return NULL; } + return dev; +} + +static void pci_init_capabilities(struct pci_dev *dev) +{ + /* MSI/MSI-X list */ + pci_msi_init_pci_dev(dev); + + /* Power Management */ + pci_pm_init(dev); + + /* Vital Product Data */ pci_vpd_pci22_init(dev); - return dev; + /* Alternative Routing-ID Forwarding */ + pci_enable_ari(dev); } void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) @@ -1026,8 +979,8 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) /* Fix up broken headers */ pci_fixup_device(pci_fixup_header, dev); - /* Initialize power management of the device */ - pci_pm_init(dev); + /* Initialize various capabilities */ + pci_init_capabilities(dev); /* * Add the device to our list of discovered devices diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 832175d..96cf8ec 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -24,6 +24,14 @@ #include <linux/kallsyms.h> #include "pci.h" +int isa_dma_bridge_buggy; +EXPORT_SYMBOL(isa_dma_bridge_buggy); +int pci_pci_problems; +EXPORT_SYMBOL(pci_pci_problems); +int pcie_mch_quirk; +EXPORT_SYMBOL(pcie_mch_quirk); + +#ifdef CONFIG_PCI_QUIRKS /* The Mellanox Tavor device gives false positive parity errors * Mark this device with a broken_parity_status, to allow * PCI scanning code to "skip" this now blacklisted device. @@ -76,8 +84,6 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82441, quirk_p This appears to be BIOS not version dependent. So presumably there is a chipset level fix */ -int isa_dma_bridge_buggy; -EXPORT_SYMBOL(isa_dma_bridge_buggy); static void __devinit quirk_isa_dma_hangs(struct pci_dev *dev) { @@ -98,9 +104,6 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NEC, PCI_DEVICE_ID_NEC_CBUS_1, quirk_isa_d DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NEC, PCI_DEVICE_ID_NEC_CBUS_2, quirk_isa_dma_hangs); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NEC, PCI_DEVICE_ID_NEC_CBUS_3, quirk_isa_dma_hangs); -int pci_pci_problems; -EXPORT_SYMBOL(pci_pci_problems); - /* * Chipsets where PCI->PCI transfers vanish or hang */ @@ -1376,9 +1379,6 @@ static void __init quirk_alder_ioapic(struct pci_dev *pdev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_EESSC, quirk_alder_ioapic); #endif -int pcie_mch_quirk; -EXPORT_SYMBOL(pcie_mch_quirk); - static void __devinit quirk_pcie_mch(struct pci_dev *pdev) { pcie_mch_quirk = 1; @@ -1569,84 +1569,6 @@ static void __devinit fixup_rev1_53c810(struct pci_dev* dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C810, fixup_rev1_53c810); -static void pci_do_fixups(struct pci_dev *dev, struct pci_fixup *f, struct pci_fixup *end) -{ - while (f < end) { - if ((f->vendor == dev->vendor || f->vendor == (u16) PCI_ANY_ID) && - (f->device == dev->device || f->device == (u16) PCI_ANY_ID)) { -#ifdef DEBUG - dev_dbg(&dev->dev, "calling %pF\n", f->hook); -#endif - f->hook(dev); - } - f++; - } -} - -extern struct pci_fixup __start_pci_fixups_early[]; -extern struct pci_fixup __end_pci_fixups_early[]; -extern struct pci_fixup __start_pci_fixups_header[]; -extern struct pci_fixup __end_pci_fixups_header[]; -extern struct pci_fixup __start_pci_fixups_final[]; -extern struct pci_fixup __end_pci_fixups_final[]; -extern struct pci_fixup __start_pci_fixups_enable[]; -extern struct pci_fixup __end_pci_fixups_enable[]; -extern struct pci_fixup __start_pci_fixups_resume[]; -extern struct pci_fixup __end_pci_fixups_resume[]; -extern struct pci_fixup __start_pci_fixups_resume_early[]; -extern struct pci_fixup __end_pci_fixups_resume_early[]; -extern struct pci_fixup __start_pci_fixups_suspend[]; -extern struct pci_fixup __end_pci_fixups_suspend[]; - - -void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev) -{ - struct pci_fixup *start, *end; - - switch(pass) { - case pci_fixup_early: - start = __start_pci_fixups_early; - end = __end_pci_fixups_early; - break; - - case pci_fixup_header: - start = __start_pci_fixups_header; - end = __end_pci_fixups_header; - break; - - case pci_fixup_final: - start = __start_pci_fixups_final; - end = __end_pci_fixups_final; - break; - - case pci_fixup_enable: - start = __start_pci_fixups_enable; - end = __end_pci_fixups_enable; - break; - - case pci_fixup_resume: - start = __start_pci_fixups_resume; - end = __end_pci_fixups_resume; - break; - - case pci_fixup_resume_early: - start = __start_pci_fixups_resume_early; - end = __end_pci_fixups_resume_early; - break; - - case pci_fixup_suspend: - start = __start_pci_fixups_suspend; - end = __end_pci_fixups_suspend; - break; - - default: - /* stupid compiler warning, you would think with an enum... */ - return; - } - pci_do_fixups(dev, start, end); -} -EXPORT_SYMBOL(pci_fixup_device); - /* Enable 1k I/O space granularity on the Intel P64H2 */ static void __devinit quirk_p64h2_1k_io(struct pci_dev *dev) { @@ -2020,3 +1942,82 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x4375, quirk_msi_intx_disable_bug); #endif /* CONFIG_PCI_MSI */ + +static void pci_do_fixups(struct pci_dev *dev, struct pci_fixup *f, struct pci_fixup *end) +{ + while (f < end) { + if ((f->vendor == dev->vendor || f->vendor == (u16) PCI_ANY_ID) && + (f->device == dev->device || f->device == (u16) PCI_ANY_ID)) { + dev_dbg(&dev->dev, "calling %pF\n", f->hook); + f->hook(dev); + } + f++; + } +} + +extern struct pci_fixup __start_pci_fixups_early[]; +extern struct pci_fixup __end_pci_fixups_early[]; +extern struct pci_fixup __start_pci_fixups_header[]; +extern struct pci_fixup __end_pci_fixups_header[]; +extern struct pci_fixup __start_pci_fixups_final[]; +extern struct pci_fixup __end_pci_fixups_final[]; +extern struct pci_fixup __start_pci_fixups_enable[]; +extern struct pci_fixup __end_pci_fixups_enable[]; +extern struct pci_fixup __start_pci_fixups_resume[]; +extern struct pci_fixup __end_pci_fixups_resume[]; +extern struct pci_fixup __start_pci_fixups_resume_early[]; +extern struct pci_fixup __end_pci_fixups_resume_early[]; +extern struct pci_fixup __start_pci_fixups_suspend[]; +extern struct pci_fixup __end_pci_fixups_suspend[]; + + +void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev) +{ + struct pci_fixup *start, *end; + + switch(pass) { + case pci_fixup_early: + start = __start_pci_fixups_early; + end = __end_pci_fixups_early; + break; + + case pci_fixup_header: + start = __start_pci_fixups_header; + end = __end_pci_fixups_header; + break; + + case pci_fixup_final: + start = __start_pci_fixups_final; + end = __end_pci_fixups_final; + break; + + case pci_fixup_enable: + start = __start_pci_fixups_enable; + end = __end_pci_fixups_enable; + break; + + case pci_fixup_resume: + start = __start_pci_fixups_resume; + end = __end_pci_fixups_resume; + break; + + case pci_fixup_resume_early: + start = __start_pci_fixups_resume_early; + end = __end_pci_fixups_resume_early; + break; + + case pci_fixup_suspend: + start = __start_pci_fixups_suspend; + end = __end_pci_fixups_suspend; + break; + + default: + /* stupid compiler warning, you would think with an enum... */ + return; + } + pci_do_fixups(dev, start, end); +} +#else +void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev) {} +#endif +EXPORT_SYMBOL(pci_fixup_device); diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index bdc2a44..042e089 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -73,6 +73,7 @@ void pci_remove_bus(struct pci_bus *pci_bus) up_write(&pci_bus_sem); pci_remove_legacy_files(pci_bus); device_remove_file(&pci_bus->dev, &dev_attr_cpuaffinity); + device_remove_file(&pci_bus->dev, &dev_attr_cpulistaffinity); device_unregister(&pci_bus->dev); } EXPORT_SYMBOL(pci_remove_bus); @@ -114,13 +115,9 @@ void pci_remove_behind_bridge(struct pci_dev *dev) { struct list_head *l, *n; - if (dev->subordinate) { - list_for_each_safe(l, n, &dev->subordinate->devices) { - struct pci_dev *dev = pci_dev_b(l); - - pci_remove_bus_device(dev); - } - } + if (dev->subordinate) + list_for_each_safe(l, n, &dev->subordinate->devices) + pci_remove_bus_device(pci_dev_b(l)); } static void pci_stop_bus_devices(struct pci_bus *bus) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 471a429..ea979f2 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -299,7 +299,7 @@ static void pbus_size_io(struct pci_bus *bus) if (r->parent || !(r->flags & IORESOURCE_IO)) continue; - r_size = r->end - r->start + 1; + r_size = resource_size(r); if (r_size < 0x400) /* Might be re-aligned for ISA */ @@ -350,7 +350,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long if (r->parent || (r->flags & mask) != type) continue; - r_size = r->end - r->start + 1; + r_size = resource_size(r); /* For bridges size != alignment */ align = resource_alignment(r); order = __ffs(align) - 20; diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index d4b5c69..2dbd96c 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -129,7 +129,7 @@ int pci_assign_resource(struct pci_dev *dev, int resno) resource_size_t size, min, align; int ret; - size = res->end - res->start + 1; + size = resource_size(res); min = (res->flags & IORESOURCE_IO) ? PCIBIOS_MIN_IO : PCIBIOS_MIN_MEM; align = resource_alignment(res); diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c index 7e5b85c..0c6db03 100644 --- a/drivers/pci/slot.c +++ b/drivers/pci/slot.c @@ -49,11 +49,16 @@ static ssize_t address_read_file(struct pci_slot *slot, char *buf) static void pci_slot_release(struct kobject *kobj) { + struct pci_dev *dev; struct pci_slot *slot = to_pci_slot(kobj); pr_debug("%s: releasing pci_slot on %x:%d\n", __func__, slot->bus->number, slot->number); + list_for_each_entry(dev, &slot->bus->devices, bus_list) + if (PCI_SLOT(dev->devfn) == slot->number) + dev->slot = NULL; + list_del(&slot->list); kfree(slot); @@ -108,6 +113,7 @@ static struct kobj_type pci_slot_ktype = { struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr, const char *name) { + struct pci_dev *dev; struct pci_slot *slot; int err; @@ -150,6 +156,10 @@ placeholder: INIT_LIST_HEAD(&slot->list); list_add(&slot->list, &parent->slots); + list_for_each_entry(dev, &parent->devices, bus_list) + if (PCI_SLOT(dev->devfn) == slot_nr) + dev->slot = slot; + /* Don't care if debug printk has a -1 for slot_nr */ pr_debug("%s: created pci_slot on %04x:%02x:%02x\n", __func__, pci_domain_nr(parent), parent->number, slot_nr); diff --git a/drivers/pcmcia/at91_cf.c b/drivers/pcmcia/at91_cf.c index a0ffb8e..9e1140f 100644 --- a/drivers/pcmcia/at91_cf.c +++ b/drivers/pcmcia/at91_cf.c @@ -273,7 +273,7 @@ static int __init at91_cf_probe(struct platform_device *pdev) goto fail0d; cf->socket.pci_irq = board->irq_pin; } else - cf->socket.pci_irq = NR_IRQS + 1; + cf->socket.pci_irq = nr_irqs + 1; /* pcmcia layer only remaps "real" memory not iospace */ cf->socket.io_offset = (unsigned long) diff --git a/drivers/pcmcia/hd64465_ss.c b/drivers/pcmcia/hd64465_ss.c index 117dc12..9ef69cd 100644 --- a/drivers/pcmcia/hd64465_ss.c +++ b/drivers/pcmcia/hd64465_ss.c @@ -233,15 +233,18 @@ static struct hw_interrupt_type hd64465_ss_irq_type = { */ static void hs_map_irq(hs_socket_t *sp, unsigned int irq) { + struct irq_desc *desc; + DPRINTK("hs_map_irq(sock=%d irq=%d)\n", sp->number, irq); if (irq >= HS_NUM_MAPPED_IRQS) return; + desc = irq_to_desc(irq); hs_mapped_irq[irq].sock = sp; /* insert ourselves as the irq controller */ - hs_mapped_irq[irq].old_handler = irq_desc[irq].chip; - irq_desc[irq].chip = &hd64465_ss_irq_type; + hs_mapped_irq[irq].old_handler = desc->chip; + desc->chip = &hd64465_ss_irq_type; } @@ -250,13 +253,16 @@ static void hs_map_irq(hs_socket_t *sp, unsigned int irq) */ static void hs_unmap_irq(hs_socket_t *sp, unsigned int irq) { + struct irq_desc *desc; + DPRINTK("hs_unmap_irq(sock=%d irq=%d)\n", sp->number, irq); if (irq >= HS_NUM_MAPPED_IRQS) return; + desc = irq_to_desc(irq); /* restore the original irq controller */ - irq_desc[irq].chip = hs_mapped_irq[irq].old_handler; + desc->chip = hs_mapped_irq[irq].old_handler; } /*============================================================*/ diff --git a/drivers/pcmcia/vrc4171_card.c b/drivers/pcmcia/vrc4171_card.c index eee2f1c..b2c4124 100644 --- a/drivers/pcmcia/vrc4171_card.c +++ b/drivers/pcmcia/vrc4171_card.c @@ -639,7 +639,7 @@ static int __devinit vrc4171_card_setup(char *options) int irq; options += 4; irq = simple_strtoul(options, &options, 0); - if (irq >= 0 && irq < NR_IRQS) + if (irq >= 0 && irq < nr_irqs) vrc4171_irq = irq; if (*options != ',') diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index f660ef3..814f49f 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -610,6 +610,14 @@ config RTC_DRV_RS5C313 help If you say yes here you get support for the Ricoh RS5C313 RTC chips. +config RTC_DRV_PARISC + tristate "PA-RISC firmware RTC support" + depends on PARISC + help + Say Y or M here to enable RTC support on PA-RISC systems using + firmware calls. If you do not know what you are doing, you should + just say Y. + config RTC_DRV_PPC tristate "PowerPC machine dependent RTC support" depends on PPC diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index d05928b..d6a9ac7 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -51,6 +51,7 @@ obj-$(CONFIG_RTC_DRV_PCF8563) += rtc-pcf8563.o obj-$(CONFIG_RTC_DRV_PCF8583) += rtc-pcf8583.o obj-$(CONFIG_RTC_DRV_PL030) += rtc-pl030.o obj-$(CONFIG_RTC_DRV_PL031) += rtc-pl031.o +obj-$(CONFIG_RTC_DRV_PARISC) += rtc-parisc.o obj-$(CONFIG_RTC_DRV_PPC) += rtc-ppc.o obj-$(CONFIG_RTC_DRV_R9701) += rtc-r9701.o obj-$(CONFIG_RTC_DRV_RS5C313) += rtc-rs5c313.o diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c new file mode 100644 index 0000000..346d633 --- /dev/null +++ b/drivers/rtc/rtc-parisc.c @@ -0,0 +1,111 @@ +/* rtc-parisc: RTC for HP PA-RISC firmware + * + * Copyright (C) 2008 Kyle McMartin <kyle@mcmartin.ca> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/time.h> +#include <linux/platform_device.h> + +#include <asm/rtc.h> + +/* as simple as can be, and no simpler. */ +struct parisc_rtc { + struct rtc_device *rtc; + spinlock_t lock; +}; + +static int parisc_get_time(struct device *dev, struct rtc_time *tm) +{ + struct parisc_rtc *p = dev_get_drvdata(dev); + unsigned long flags, ret; + + spin_lock_irqsave(&p->lock, flags); + ret = get_rtc_time(tm); + spin_unlock_irqrestore(&p->lock, flags); + + if (ret & RTC_BATT_BAD) + return -EOPNOTSUPP; + + return 0; +} + +static int parisc_set_time(struct device *dev, struct rtc_time *tm) +{ + struct parisc_rtc *p = dev_get_drvdata(dev); + unsigned long flags, ret; + + spin_lock_irqsave(&p->lock, flags); + ret = set_rtc_time(tm); + spin_unlock_irqrestore(&p->lock, flags); + + if (ret < 0) + return -EOPNOTSUPP; + + return 0; +} + +static const struct rtc_class_ops parisc_rtc_ops = { + .read_time = parisc_get_time, + .set_time = parisc_set_time, +}; + +static int __devinit parisc_rtc_probe(struct platform_device *dev) +{ + struct parisc_rtc *p; + + p = kzalloc(sizeof (*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + spin_lock_init(&p->lock); + + p->rtc = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops, + THIS_MODULE); + if (IS_ERR(p->rtc)) { + int err = PTR_ERR(p->rtc); + kfree(p); + return err; + } + + platform_set_drvdata(dev, p); + + return 0; +} + +static int __devexit parisc_rtc_remove(struct platform_device *dev) +{ + struct parisc_rtc *p = platform_get_drvdata(dev); + + rtc_device_unregister(p->rtc); + kfree(p); + + return 0; +} + +static struct platform_driver parisc_rtc_driver = { + .driver = { + .name = "rtc-parisc", + .owner = THIS_MODULE, + }, + .probe = parisc_rtc_probe, + .remove = __devexit_p(parisc_rtc_remove), +}; + +static int __init parisc_rtc_init(void) +{ + return platform_driver_register(&parisc_rtc_driver); +} + +static void __exit parisc_rtc_fini(void) +{ + platform_driver_unregister(&parisc_rtc_driver); +} + +module_init(parisc_rtc_init); +module_exit(parisc_rtc_fini); + +MODULE_AUTHOR("Kyle McMartin <kyle@mcmartin.ca>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("HP PA-RISC RTC driver"); diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c index 884b635..834dcc6 100644 --- a/drivers/rtc/rtc-vr41xx.c +++ b/drivers/rtc/rtc-vr41xx.c @@ -360,7 +360,7 @@ static int __devinit rtc_probe(struct platform_device *pdev) spin_unlock_irq(&rtc_lock); aie_irq = platform_get_irq(pdev, 0); - if (aie_irq < 0 || aie_irq >= NR_IRQS) { + if (aie_irq < 0 || aie_irq >= nr_irqs) { retval = -EBUSY; goto err_device_unregister; } @@ -371,7 +371,7 @@ static int __devinit rtc_probe(struct platform_device *pdev) goto err_device_unregister; pie_irq = platform_get_irq(pdev, 1); - if (pie_irq < 0 || pie_irq >= NR_IRQS) + if (pie_irq < 0 || pie_irq >= nr_irqs) goto err_free_irq; retval = request_irq(pie_irq, rtclong1_interrupt, IRQF_DISABLED, diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c index b5a868d..1e5478a 100644 --- a/drivers/scsi/aha152x.c +++ b/drivers/scsi/aha152x.c @@ -337,7 +337,7 @@ CMD_INC_RESID(struct scsi_cmnd *cmd, int inc) #else #define IRQ_MIN 9 #if defined(__PPC) -#define IRQ_MAX (NR_IRQS-1) +#define IRQ_MAX (nr_irqs-1) #else #define IRQ_MAX 12 #endif diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index 740bad4..afc96e8 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -343,6 +343,11 @@ static ide_startstop_t idescsi_do_request (ide_drive_t *drive, struct request *r } #ifdef CONFIG_IDE_PROC_FS +static ide_proc_entry_t idescsi_proc[] = { + { "capacity", S_IFREG|S_IRUGO, proc_ide_read_capacity, NULL }, + { NULL, 0, NULL, NULL } +}; + #define ide_scsi_devset_get(name, field) \ static int get_##name(ide_drive_t *drive) \ { \ @@ -378,6 +383,16 @@ static const struct ide_proc_devset idescsi_settings[] = { IDE_PROC_DEVSET(transform, 0, 3), { 0 }, }; + +static ide_proc_entry_t *ide_scsi_proc_entries(ide_drive_t *drive) +{ + return idescsi_proc; +} + +static const struct ide_proc_devset *ide_scsi_proc_devsets(ide_drive_t *drive) +{ + return idescsi_settings; +} #endif /* @@ -419,13 +434,6 @@ static void ide_scsi_remove(ide_drive_t *drive) static int ide_scsi_probe(ide_drive_t *); -#ifdef CONFIG_IDE_PROC_FS -static ide_proc_entry_t idescsi_proc[] = { - { "capacity", S_IFREG|S_IRUGO, proc_ide_read_capacity, NULL }, - { NULL, 0, NULL, NULL } -}; -#endif - static ide_driver_t idescsi_driver = { .gen_driver = { .owner = THIS_MODULE, @@ -439,8 +447,8 @@ static ide_driver_t idescsi_driver = { .end_request = idescsi_end_request, .error = idescsi_atapi_error, #ifdef CONFIG_IDE_PROC_FS - .proc = idescsi_proc, - .settings = idescsi_settings, + .proc_entries = ide_scsi_proc_entries, + .proc_devsets = ide_scsi_proc_devsets, #endif }; diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index d30eb7b..098739d 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -7859,7 +7859,6 @@ static struct pci_driver ipr_driver = { .remove = ipr_remove, .shutdown = ipr_shutdown, .err_handler = &ipr_err_handler, - .dynids.use_driver_data = 1 }; /** diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index 83c8192..f25f41a 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -2108,7 +2108,7 @@ struct scsi_qla_host; struct qla_msix_entry { int have_irq; - uint16_t msix_vector; + uint32_t msix_vector; uint16_t msix_entry; }; diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 2aed472..21dd182 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1566,9 +1566,8 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) goto probe_out; } - if (pci_find_aer_capability(pdev)) - if (pci_enable_pcie_error_reporting(pdev)) - goto probe_out; + /* This may fail but that's ok */ + pci_enable_pcie_error_reporting(pdev); host = scsi_host_alloc(sht, sizeof(scsi_qla_host_t)); if (host == NULL) { diff --git a/drivers/serial/68328serial.c b/drivers/serial/68328serial.c index 381b12a..d935b2d 100644 --- a/drivers/serial/68328serial.c +++ b/drivers/serial/68328serial.c @@ -66,7 +66,6 @@ #endif static struct m68k_serial m68k_soft[NR_PORTS]; -struct m68k_serial *IRQ_ports[NR_IRQS]; static unsigned int uart_irqs[NR_PORTS] = UART_IRQ_DEFNS; @@ -375,15 +374,11 @@ clear_and_return: */ irqreturn_t rs_interrupt(int irq, void *dev_id) { - struct m68k_serial * info; + struct m68k_serial *info = dev_id; m68328_uart *uart; unsigned short rx; unsigned short tx; - info = IRQ_ports[irq]; - if(!info) - return IRQ_NONE; - uart = &uart_addr[info->line]; rx = uart->urx.w; @@ -1383,8 +1378,6 @@ rs68328_init(void) info->port, info->irq); printk(" is a builtin MC68328 UART\n"); - IRQ_ports[info->irq] = info; /* waste of space */ - #ifdef CONFIG_M68VZ328 if (i > 0 ) PJSEL &= 0xCF; /* PSW enable second port output */ @@ -1393,7 +1386,7 @@ rs68328_init(void) if (request_irq(uart_irqs[i], rs_interrupt, IRQF_DISABLED, - "M68328_UART", NULL)) + "M68328_UART", info)) panic("Unable to attach 68328 serial interrupt\n"); } local_irq_restore(flags); diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c index 1528de2..303272a 100644 --- a/drivers/serial/8250.c +++ b/drivers/serial/8250.c @@ -156,11 +156,15 @@ struct uart_8250_port { }; struct irq_info { - spinlock_t lock; + struct hlist_node node; + int irq; + spinlock_t lock; /* Protects list not the hash */ struct list_head *head; }; -static struct irq_info irq_lists[NR_IRQS]; +#define NR_IRQ_HASH 32 /* Can be adjusted later */ +static struct hlist_head irq_lists[NR_IRQ_HASH]; +static DEFINE_MUTEX(hash_mutex); /* Used to walk the hash */ /* * Here we define the default xmit fifo size used for each type of UART. @@ -1545,15 +1549,43 @@ static void serial_do_unlink(struct irq_info *i, struct uart_8250_port *up) BUG_ON(i->head != &up->list); i->head = NULL; } - spin_unlock_irq(&i->lock); + /* List empty so throw away the hash node */ + if (i->head == NULL) { + hlist_del(&i->node); + kfree(i); + } } static int serial_link_irq_chain(struct uart_8250_port *up) { - struct irq_info *i = irq_lists + up->port.irq; + struct hlist_head *h; + struct hlist_node *n; + struct irq_info *i; int ret, irq_flags = up->port.flags & UPF_SHARE_IRQ ? IRQF_SHARED : 0; + mutex_lock(&hash_mutex); + + h = &irq_lists[up->port.irq % NR_IRQ_HASH]; + + hlist_for_each(n, h) { + i = hlist_entry(n, struct irq_info, node); + if (i->irq == up->port.irq) + break; + } + + if (n == NULL) { + i = kzalloc(sizeof(struct irq_info), GFP_KERNEL); + if (i == NULL) { + mutex_unlock(&hash_mutex); + return -ENOMEM; + } + spin_lock_init(&i->lock); + i->irq = up->port.irq; + hlist_add_head(&i->node, h); + } + mutex_unlock(&hash_mutex); + spin_lock_irq(&i->lock); if (i->head) { @@ -1577,14 +1609,28 @@ static int serial_link_irq_chain(struct uart_8250_port *up) static void serial_unlink_irq_chain(struct uart_8250_port *up) { - struct irq_info *i = irq_lists + up->port.irq; + struct irq_info *i; + struct hlist_node *n; + struct hlist_head *h; + mutex_lock(&hash_mutex); + + h = &irq_lists[up->port.irq % NR_IRQ_HASH]; + + hlist_for_each(n, h) { + i = hlist_entry(n, struct irq_info, node); + if (i->irq == up->port.irq) + break; + } + + BUG_ON(n == NULL); BUG_ON(i->head == NULL); if (list_empty(i->head)) free_irq(up->port.irq, i); serial_do_unlink(i, up); + mutex_unlock(&hash_mutex); } /* Base timer interval for polling */ @@ -2447,7 +2493,7 @@ static void serial8250_config_port(struct uart_port *port, int flags) static int serial8250_verify_port(struct uart_port *port, struct serial_struct *ser) { - if (ser->irq >= NR_IRQS || ser->irq < 0 || + if (ser->irq >= nr_irqs || ser->irq < 0 || ser->baud_base < 9600 || ser->type < PORT_UNKNOWN || ser->type >= ARRAY_SIZE(uart_config) || ser->type == PORT_CIRRUS || ser->type == PORT_STARTECH) @@ -2967,7 +3013,7 @@ EXPORT_SYMBOL(serial8250_unregister_port); static int __init serial8250_init(void) { - int ret, i; + int ret; if (nr_uarts > UART_NR) nr_uarts = UART_NR; @@ -2976,9 +3022,6 @@ static int __init serial8250_init(void) "%d ports, IRQ sharing %sabled\n", nr_uarts, share_irqs ? "en" : "dis"); - for (i = 0; i < NR_IRQS; i++) - spin_lock_init(&irq_lists[i].lock); - #ifdef CONFIG_SPARC ret = sunserial_register_minors(&serial8250_reg, UART_NR); #else @@ -3006,15 +3049,15 @@ static int __init serial8250_init(void) goto out; platform_device_del(serial8250_isa_devs); - put_dev: +put_dev: platform_device_put(serial8250_isa_devs); - unreg_uart_drv: +unreg_uart_drv: #ifdef CONFIG_SPARC sunserial_unregister_minors(&serial8250_reg, UART_NR); #else uart_unregister_driver(&serial8250_reg); #endif - out: +out: return ret; } diff --git a/drivers/serial/amba-pl010.c b/drivers/serial/amba-pl010.c index 90b56c2..7156268 100644 --- a/drivers/serial/amba-pl010.c +++ b/drivers/serial/amba-pl010.c @@ -512,7 +512,7 @@ static int pl010_verify_port(struct uart_port *port, struct serial_struct *ser) int ret = 0; if (ser->type != PORT_UNKNOWN && ser->type != PORT_AMBA) ret = -EINVAL; - if (ser->irq < 0 || ser->irq >= NR_IRQS) + if (ser->irq < 0 || ser->irq >= nr_irqs) ret = -EINVAL; if (ser->baud_base < 9600) ret = -EINVAL; diff --git a/drivers/serial/amba-pl011.c b/drivers/serial/amba-pl011.c index 9d08f27..b718004 100644 --- a/drivers/serial/amba-pl011.c +++ b/drivers/serial/amba-pl011.c @@ -572,7 +572,7 @@ static int pl010_verify_port(struct uart_port *port, struct serial_struct *ser) int ret = 0; if (ser->type != PORT_UNKNOWN && ser->type != PORT_AMBA) ret = -EINVAL; - if (ser->irq < 0 || ser->irq >= NR_IRQS) + if (ser->irq < 0 || ser->irq >= nr_irqs) ret = -EINVAL; if (ser->baud_base < 9600) ret = -EINVAL; diff --git a/drivers/serial/cpm_uart/cpm_uart_core.c b/drivers/serial/cpm_uart/cpm_uart_core.c index a6c4d74..bde4b4b 100644 --- a/drivers/serial/cpm_uart/cpm_uart_core.c +++ b/drivers/serial/cpm_uart/cpm_uart_core.c @@ -623,7 +623,7 @@ static int cpm_uart_verify_port(struct uart_port *port, if (ser->type != PORT_UNKNOWN && ser->type != PORT_CPM) ret = -EINVAL; - if (ser->irq < 0 || ser->irq >= NR_IRQS) + if (ser->irq < 0 || ser->irq >= nr_irqs) ret = -EINVAL; if (ser->baud_base < 9600) ret = -EINVAL; diff --git a/drivers/serial/m32r_sio.c b/drivers/serial/m32r_sio.c index 23d0305..611c97a 100644 --- a/drivers/serial/m32r_sio.c +++ b/drivers/serial/m32r_sio.c @@ -922,7 +922,7 @@ static void m32r_sio_config_port(struct uart_port *port, int flags) static int m32r_sio_verify_port(struct uart_port *port, struct serial_struct *ser) { - if (ser->irq >= NR_IRQS || ser->irq < 0 || + if (ser->irq >= nr_irqs || ser->irq < 0 || ser->baud_base < 9600 || ser->type < PORT_UNKNOWN || ser->type >= ARRAY_SIZE(uart_config)) return -EINVAL; @@ -1162,7 +1162,7 @@ static int __init m32r_sio_init(void) printk(KERN_INFO "Serial: M32R SIO driver\n"); - for (i = 0; i < NR_IRQS; i++) + for (i = 0; i < nr_irqs; i++) spin_lock_init(&irq_lists[i].lock); ret = uart_register_driver(&m32r_sio_reg); diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c index 6bdf336..874786a 100644 --- a/drivers/serial/serial_core.c +++ b/drivers/serial/serial_core.c @@ -741,7 +741,7 @@ static int uart_set_info(struct uart_state *state, if (port->ops->verify_port) retval = port->ops->verify_port(port, &new_serial); - if ((new_serial.irq >= NR_IRQS) || (new_serial.irq < 0) || + if ((new_serial.irq >= nr_irqs) || (new_serial.irq < 0) || (new_serial.baud_base < 9600)) retval = -EINVAL; diff --git a/drivers/serial/serial_lh7a40x.c b/drivers/serial/serial_lh7a40x.c index cb49a5a..61dc8b3 100644 --- a/drivers/serial/serial_lh7a40x.c +++ b/drivers/serial/serial_lh7a40x.c @@ -460,7 +460,7 @@ static int lh7a40xuart_verify_port (struct uart_port* port, if (ser->type != PORT_UNKNOWN && ser->type != PORT_LH7A40X) ret = -EINVAL; - if (ser->irq < 0 || ser->irq >= NR_IRQS) + if (ser->irq < 0 || ser->irq >= nr_irqs) ret = -EINVAL; if (ser->baud_base < 9600) /* *** FIXME: is this true? */ ret = -EINVAL; diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index 3b9d2d8..f0658d2 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ -1149,7 +1149,7 @@ static int sci_verify_port(struct uart_port *port, struct serial_struct *ser) { struct sci_port *s = &sci_ports[port->line]; - if (ser->irq != s->irqs[SCIx_TXI_IRQ] || ser->irq > NR_IRQS) + if (ser->irq != s->irqs[SCIx_TXI_IRQ] || ser->irq > nr_irqs) return -EINVAL; if (ser->baud_base < 2400) /* No paper tape reader for Mitch.. */ diff --git a/drivers/serial/ucc_uart.c b/drivers/serial/ucc_uart.c index 539c933..315a933 100644 --- a/drivers/serial/ucc_uart.c +++ b/drivers/serial/ucc_uart.c @@ -1066,7 +1066,7 @@ static int qe_uart_verify_port(struct uart_port *port, if (ser->type != PORT_UNKNOWN && ser->type != PORT_CPM) return -EINVAL; - if (ser->irq < 0 || ser->irq >= NR_IRQS) + if (ser->irq < 0 || ser->irq >= nr_irqs) return -EINVAL; if (ser->baud_base < 9600) diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 5dccf05..f9b4647 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -47,6 +47,9 @@ static struct uio_class { struct class *class; } *uio_class; +/* Protect idr accesses */ +static DEFINE_MUTEX(minor_lock); + /* * attributes */ @@ -239,7 +242,6 @@ static void uio_dev_del_attributes(struct uio_device *idev) static int uio_get_minor(struct uio_device *idev) { - static DEFINE_MUTEX(minor_lock); int retval = -ENOMEM; int id; @@ -261,7 +263,9 @@ exit: static void uio_free_minor(struct uio_device *idev) { + mutex_lock(&minor_lock); idr_remove(&uio_idr, idev->minor); + mutex_unlock(&minor_lock); } /** @@ -305,8 +309,9 @@ static int uio_open(struct inode *inode, struct file *filep) struct uio_listener *listener; int ret = 0; - lock_kernel(); + mutex_lock(&minor_lock); idev = idr_find(&uio_idr, iminor(inode)); + mutex_unlock(&minor_lock); if (!idev) { ret = -ENODEV; goto out; @@ -332,18 +337,15 @@ static int uio_open(struct inode *inode, struct file *filep) if (ret) goto err_infoopen; } - unlock_kernel(); return 0; err_infoopen: - kfree(listener); -err_alloc_listener: +err_alloc_listener: module_put(idev->owner); out: - unlock_kernel(); return ret; } diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c index d343afa..15a803b 100644 --- a/drivers/usb/host/ehci-hcd.c +++ b/drivers/usb/host/ehci-hcd.c @@ -1111,8 +1111,8 @@ clean0: #ifdef DEBUG debugfs_remove(ehci_debug_root); ehci_debug_root = NULL; -#endif err_debug: +#endif clear_bit(USB_EHCI_LOADED, &usb_hcds_loaded); return retval; } diff --git a/drivers/watchdog/ib700wdt.c b/drivers/watchdog/ib700wdt.c index 05a2810..8782ec1 100644 --- a/drivers/watchdog/ib700wdt.c +++ b/drivers/watchdog/ib700wdt.c @@ -154,7 +154,7 @@ static int ibwdt_set_heartbeat(int t) return -EINVAL; for (i = 0x0F; i > -1; i--) - if (wd_times[i] > t) + if (wd_times[i] >= t) break; wd_margin = i; return 0; diff --git a/drivers/xen/events.c b/drivers/xen/events.c index c3290bc..9ce1ab6 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -125,7 +125,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) BUG_ON(irq == -1); #ifdef CONFIG_SMP - irq_desc[irq].affinity = cpumask_of_cpu(cpu); + irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu); #endif __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); @@ -137,10 +137,12 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) static void init_evtchn_cpu_bindings(void) { #ifdef CONFIG_SMP + struct irq_desc *desc; int i; + /* By default all event channels notify CPU#0. */ - for (i = 0; i < NR_IRQS; i++) - irq_desc[i].affinity = cpumask_of_cpu(0); + for_each_irq_desc(i, desc) + desc->affinity = cpumask_of_cpu(0); #endif memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); @@ -229,12 +231,12 @@ static int find_unbound_irq(void) int irq; /* Only allocate from dynirq range */ - for (irq = 0; irq < NR_IRQS; irq++) + for_each_irq_nr(irq) if (irq_bindcount[irq] == 0) break; - if (irq == NR_IRQS) - panic("No available IRQ to bind to: increase NR_IRQS!\n"); + if (irq == nr_irqs) + panic("No available IRQ to bind to: increase nr_irqs!\n"); return irq; } @@ -790,7 +792,7 @@ void xen_irq_resume(void) mask_evtchn(evtchn); /* No IRQ <-> event-channel mappings. */ - for (irq = 0; irq < NR_IRQS; irq++) + for_each_irq_nr(irq) irq_info[irq].evtchn = 0; /* zap event-channel binding */ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) @@ -822,7 +824,7 @@ void __init xen_init_IRQ(void) mask_evtchn(i); /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ - for (i = 0; i < NR_IRQS; i++) + for_each_irq_nr(i) irq_bindcount[i] = 0; irq_ctx_init(smp_processor_id()); @@ -403,7 +403,7 @@ config AUTOFS4_FS N here. config FUSE_FS - tristate "Filesystem in Userspace support" + tristate "FUSE (Filesystem in Userspace) support" help With FUSE it is possible to implement a fully functional filesystem in a userspace program. diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e215906..8fcfa39 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1341,20 +1341,15 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_pgrp = task_pgrp_vnr(p); prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { + struct task_cputime cputime; + /* - * This is the record for the group leader. Add in the - * cumulative times of previous dead threads. This total - * won't include the time of each live thread whose state - * is included in the core dump. The final total reported - * to our parent process when it calls wait4 will include - * those sums as well as the little bit more time it takes - * this and each other thread to finish dying after the - * core dump synchronization phase. + * This is the record for the group leader. It shows the + * group-wide total, not its individual thread total. */ - cputime_to_timeval(cputime_add(p->utime, p->signal->utime), - &prstatus->pr_utime); - cputime_to_timeval(cputime_add(p->stime, p->signal->stime), - &prstatus->pr_stime); + thread_group_cputime(p, &cputime); + cputime_to_timeval(cputime.utime, &prstatus->pr_utime); + cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { cputime_to_timeval(p->utime, &prstatus->pr_utime); cputime_to_timeval(p->stime, &prstatus->pr_stime); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 0e8367c..5b5424c 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1390,20 +1390,15 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_pgrp = task_pgrp_vnr(p); prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { + struct task_cputime cputime; + /* - * This is the record for the group leader. Add in the - * cumulative times of previous dead threads. This total - * won't include the time of each live thread whose state - * is included in the core dump. The final total reported - * to our parent process when it calls wait4 will include - * those sums as well as the little bit more time it takes - * this and each other thread to finish dying after the - * core dump synchronization phase. + * This is the record for the group leader. It shows the + * group-wide total, not its individual thread total. */ - cputime_to_timeval(cputime_add(p->utime, p->signal->utime), - &prstatus->pr_utime); - cputime_to_timeval(cputime_add(p->stime, p->signal->stime), - &prstatus->pr_stime); + thread_group_cputime(p, &cputime); + cputime_to_timeval(cputime.utime, &prstatus->pr_utime); + cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { cputime_to_timeval(p->utime, &prstatus->pr_utime); cputime_to_timeval(p->stime, &prstatus->pr_stime); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2bada6b..34930a9 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -101,6 +101,8 @@ void fuse_finish_open(struct inode *inode, struct file *file, file->f_op = &fuse_direct_io_file_operations; if (!(outarg->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); + if (outarg->open_flags & FOPEN_NONSEEKABLE) + nonseekable_open(inode, file); ff->fh = outarg->fh; file->private_data = fuse_file_get(ff); } @@ -1448,6 +1450,9 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) mutex_lock(&inode->i_mutex); switch (origin) { case SEEK_END: + retval = fuse_update_attributes(inode, NULL, file, NULL); + if (retval) + return retval; offset += i_size_read(inode); break; case SEEK_CUR: diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 3a87607..35accfd 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -6,6 +6,9 @@ See the file COPYING. */ +#ifndef _FS_FUSE_I_H +#define _FS_FUSE_I_H + #include <linux/fuse.h> #include <linux/fs.h> #include <linux/mount.h> @@ -655,3 +658,5 @@ void fuse_set_nowrite(struct inode *inode); void fuse_release_nowrite(struct inode *inode); u64 fuse_get_attr_version(struct fuse_conn *fc); + +#endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6a84388..54b1f0e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -865,7 +865,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (is_bdev) { fc->destroy_req = fuse_request_alloc(); if (!fc->destroy_req) - goto err_put_root; + goto err_free_init_req; } mutex_lock(&fuse_mutex); @@ -895,6 +895,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) err_unlock: mutex_unlock(&fuse_mutex); + err_free_init_req: fuse_request_free(init_req); err_put_root: dput(root_dentry); diff --git a/fs/proc/array.c b/fs/proc/array.c index f4bc0e7..bb9f4b0 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -388,20 +388,20 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, /* add up live thread stats at the group level */ if (whole) { + struct task_cputime cputime; struct task_struct *t = task; do { min_flt += t->min_flt; maj_flt += t->maj_flt; - utime = cputime_add(utime, task_utime(t)); - stime = cputime_add(stime, task_stime(t)); gtime = cputime_add(gtime, task_gtime(t)); t = next_thread(t); } while (t != task); min_flt += sig->min_flt; maj_flt += sig->maj_flt; - utime = cputime_add(utime, sig->utime); - stime = cputime_add(stime, sig->stime); + thread_group_cputime(task, &cputime); + utime = cputime.utime; + stime = cputime.stime; gtime = cputime_add(gtime, sig->gtime); } diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 61b25f4..7ea52c7 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -30,6 +30,7 @@ #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/pagemap.h> +#include <linux/irq.h> #include <linux/interrupt.h> #include <linux/swap.h> #include <linux/slab.h> @@ -521,17 +522,13 @@ static const struct file_operations proc_vmalloc_operations = { static int show_stat(struct seq_file *p, void *v) { - int i; + int i, j; unsigned long jif; cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; cputime64_t guest; u64 sum = 0; struct timespec boottime; - unsigned int *per_irq_sum; - - per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL); - if (!per_irq_sum) - return -ENOMEM; + unsigned int per_irq_sum; user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; @@ -540,8 +537,6 @@ static int show_stat(struct seq_file *p, void *v) jif = boottime.tv_sec; for_each_possible_cpu(i) { - int j; - user = cputime64_add(user, kstat_cpu(i).cpustat.user); nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); system = cputime64_add(system, kstat_cpu(i).cpustat.system); @@ -551,11 +546,10 @@ static int show_stat(struct seq_file *p, void *v) softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); - for (j = 0; j < NR_IRQS; j++) { - unsigned int temp = kstat_cpu(i).irqs[j]; - sum += temp; - per_irq_sum[j] += temp; - } + + for_each_irq_nr(j) + sum += kstat_irqs_cpu(j, i); + sum += arch_irq_stat_cpu(i); } sum += arch_irq_stat(); @@ -597,8 +591,15 @@ static int show_stat(struct seq_file *p, void *v) } seq_printf(p, "intr %llu", (unsigned long long)sum); - for (i = 0; i < NR_IRQS; i++) - seq_printf(p, " %u", per_irq_sum[i]); + /* sum again ? it could be updated? */ + for_each_irq_nr(j) { + per_irq_sum = 0; + + for_each_possible_cpu(i) + per_irq_sum += kstat_irqs_cpu(j, i); + + seq_printf(p, " %u", per_irq_sum); + } seq_printf(p, "\nctxt %llu\n" @@ -612,7 +613,6 @@ static int show_stat(struct seq_file *p, void *v) nr_running(), nr_iowait()); - kfree(per_irq_sum); return 0; } @@ -651,15 +651,14 @@ static const struct file_operations proc_stat_operations = { */ static void *int_seq_start(struct seq_file *f, loff_t *pos) { - return (*pos <= NR_IRQS) ? pos : NULL; + return (*pos <= nr_irqs) ? pos : NULL; } + static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) { (*pos)++; - if (*pos > NR_IRQS) - return NULL; - return pos; + return (*pos <= nr_irqs) ? pos : NULL; } static void int_seq_stop(struct seq_file *f, void *v) @@ -667,7 +666,6 @@ static void int_seq_stop(struct seq_file *f, void *v) /* Nothing to do */ } - static const struct seq_operations int_seq_ops = { .start = int_seq_start, .next = int_seq_next, diff --git a/include/asm-frv/ide.h b/include/asm-frv/ide.h index 7ebcc56..3610766 100644 --- a/include/asm-frv/ide.h +++ b/include/asm-frv/ide.h @@ -18,15 +18,7 @@ #include <asm/io.h> #include <asm/irq.h> -/****************************************************************************/ -/* - * some bits needed for parts of the IDE subsystem to compile - */ -#define __ide_mm_insw(port, addr, n) insw((unsigned long) (port), addr, n) -#define __ide_mm_insl(port, addr, n) insl((unsigned long) (port), addr, n) -#define __ide_mm_outsw(port, addr, n) outsw((unsigned long) (port), addr, n) -#define __ide_mm_outsl(port, addr, n) outsl((unsigned long) (port), addr, n) - +#include <asm-generic/ide_iops.h> #endif /* __KERNEL__ */ #endif /* _ASM_IDE_H */ diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 0f6dabd..12c07c1 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -41,7 +41,7 @@ extern void warn_slowpath(const char *file, const int line, #define __WARN() warn_on_slowpath(__FILE__, __LINE__) #define __WARN_printf(arg...) warn_slowpath(__FILE__, __LINE__, arg) #else -#define __WARN_printf(arg...) __WARN() +#define __WARN_printf(arg...) do { printk(arg); __WARN(); } while (0) #endif #ifndef WARN_ON diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 74c5faf..8074460 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -37,6 +37,13 @@ #define MEM_DISCARD(sec) *(.mem##sec) #endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD +#define MCOUNT_REC() VMLINUX_SYMBOL(__start_mcount_loc) = .; \ + *(__mcount_loc) \ + VMLINUX_SYMBOL(__stop_mcount_loc) = .; +#else +#define MCOUNT_REC() +#endif /* .data section */ #define DATA_DATA \ @@ -52,7 +59,10 @@ . = ALIGN(8); \ VMLINUX_SYMBOL(__start___markers) = .; \ *(__markers) \ - VMLINUX_SYMBOL(__stop___markers) = .; + VMLINUX_SYMBOL(__stop___markers) = .; \ + VMLINUX_SYMBOL(__start___tracepoints) = .; \ + *(__tracepoints) \ + VMLINUX_SYMBOL(__stop___tracepoints) = .; #define RO_DATA(align) \ . = ALIGN((align)); \ @@ -61,6 +71,7 @@ *(.rodata) *(.rodata.*) \ *(__vermagic) /* Kernel version magic */ \ *(__markers_strings) /* Markers: strings */ \ + *(__tracepoints_strings)/* Tracepoints: strings */ \ } \ \ .rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \ @@ -188,6 +199,7 @@ /* __*init sections */ \ __init_rodata : AT(ADDR(__init_rodata) - LOAD_OFFSET) { \ *(.ref.rodata) \ + MCOUNT_REC() \ DEV_KEEP(init.rodata) \ DEV_KEEP(exit.rodata) \ CPU_KEEP(init.rodata) \ diff --git a/include/asm-m68k/ide.h b/include/asm-m68k/ide.h index 1daf6cb..b996a3c 100644 --- a/include/asm-m68k/ide.h +++ b/include/asm-m68k/ide.h @@ -92,15 +92,6 @@ #define outsw_swapw(port, addr, n) raw_outsw_swapw((u16 *)port, addr, n) #endif - -/* Q40 and Atari have byteswapped IDE busses and since many interesting - * values in the identification string are text, chars and words they - * happened to be almost correct without swapping.. However *_capacity - * is needed for drives over 8 GB. RZ */ -#if defined(CONFIG_Q40) || defined(CONFIG_ATARI) -#define M68K_IDE_SWAPW (MACH_IS_Q40 || MACH_IS_ATARI) -#endif - #ifdef CONFIG_BLK_DEV_FALCON_IDE #define IDE_ARCH_LOCK diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h index d76a083..ef1d72d 100644 --- a/include/asm-x86/apic.h +++ b/include/asm-x86/apic.h @@ -40,8 +40,6 @@ extern void generic_apic_probe(void); extern unsigned int apic_verbosity; extern int local_apic_timer_c2_ok; -extern int ioapic_force; - extern int disable_apic; /* * Basic functions accessing APICs. @@ -100,6 +98,20 @@ extern void check_x2apic(void); extern void enable_x2apic(void); extern void enable_IR_x2apic(void); extern void x2apic_icr_write(u32 low, u32 id); +static inline int x2apic_enabled(void) +{ + int msr, msr2; + + if (!cpu_has_x2apic) + return 0; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + if (msr & X2APIC_ENABLE) + return 1; + return 0; +} +#else +#define x2apic_enabled() 0 #endif struct apic_ops { diff --git a/include/asm-x86/bigsmp/apic.h b/include/asm-x86/bigsmp/apic.h index 0a9cd7c..1d9543b 100644 --- a/include/asm-x86/bigsmp/apic.h +++ b/include/asm-x86/bigsmp/apic.h @@ -9,22 +9,17 @@ static inline int apic_id_registered(void) return (1); } -/* Round robin the irqs amoung the online cpus */ static inline cpumask_t target_cpus(void) { - static unsigned long cpu = NR_CPUS; - do { - if (cpu >= NR_CPUS) - cpu = first_cpu(cpu_online_map); - else - cpu = next_cpu(cpu, cpu_online_map); - } while (cpu >= NR_CPUS); - return cpumask_of_cpu(cpu); +#ifdef CONFIG_SMP + return cpu_online_map; +#else + return cpumask_of_cpu(0); +#endif } #undef APIC_DEST_LOGICAL #define APIC_DEST_LOGICAL 0 -#define TARGET_CPUS (target_cpus()) #define APIC_DFR_VALUE (APIC_DFR_FLAT) #define INT_DELIVERY_MODE (dest_Fixed) #define INT_DEST_MODE (0) /* phys delivery to target proc */ diff --git a/include/asm-x86/efi.h b/include/asm-x86/efi.h index ed2de22..313438e 100644 --- a/include/asm-x86/efi.h +++ b/include/asm-x86/efi.h @@ -94,4 +94,17 @@ extern void efi_reserve_early(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); +#ifndef CONFIG_EFI +/* + * IF EFI is not configured, have the EFI calls return -ENOSYS. + */ +#define efi_call0(_f) (-ENOSYS) +#define efi_call1(_f, _a1) (-ENOSYS) +#define efi_call2(_f, _a1, _a2) (-ENOSYS) +#define efi_call3(_f, _a1, _a2, _a3) (-ENOSYS) +#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS) +#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS) +#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS) +#endif /* CONFIG_EFI */ + #endif /* ASM_X86__EFI_H */ diff --git a/include/asm-x86/es7000/apic.h b/include/asm-x86/es7000/apic.h index aae50c2..380f0b4 100644 --- a/include/asm-x86/es7000/apic.h +++ b/include/asm-x86/es7000/apic.h @@ -17,7 +17,6 @@ static inline cpumask_t target_cpus(void) return cpumask_of_cpu(smp_processor_id()); #endif } -#define TARGET_CPUS (target_cpus()) #if defined CONFIG_ES7000_CLUSTERED_APIC #define APIC_DFR_VALUE (APIC_DFR_CLUSTER) @@ -81,7 +80,7 @@ static inline void setup_apic_routing(void) int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", (apic_version[apic] == 0x14) ? - "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(TARGET_CPUS)[0]); + "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]); } static inline int multi_timer_check(int apic, int irq) diff --git a/include/asm-x86/ftrace.h b/include/asm-x86/ftrace.h index be0e004..1bb6f9bb 100644 --- a/include/asm-x86/ftrace.h +++ b/include/asm-x86/ftrace.h @@ -7,6 +7,16 @@ #ifndef __ASSEMBLY__ extern void mcount(void); + +static inline unsigned long ftrace_call_adjust(unsigned long addr) +{ + /* + * call mcount is "e8 <4 byte offset>" + * The addr points to the 4 byte offset and the caller of this + * function wants the pointer to e8. Simply subtract one. + */ + return addr - 1; +} #endif #endif /* CONFIG_FTRACE */ diff --git a/include/asm-x86/genapic_32.h b/include/asm-x86/genapic_32.h index 34280f0..6fe4f81 100644 --- a/include/asm-x86/genapic_32.h +++ b/include/asm-x86/genapic_32.h @@ -57,6 +57,7 @@ struct genapic { unsigned (*get_apic_id)(unsigned long x); unsigned long apic_id_mask; unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask); + cpumask_t (*vector_allocation_domain)(int cpu); #ifdef CONFIG_SMP /* ipi */ @@ -104,6 +105,7 @@ struct genapic { APICFUNC(get_apic_id) \ .apic_id_mask = APIC_ID_MASK, \ APICFUNC(cpu_mask_to_apicid) \ + APICFUNC(vector_allocation_domain) \ APICFUNC(acpi_madt_oem_check) \ IPIFUNC(send_IPI_mask) \ IPIFUNC(send_IPI_allbutself) \ diff --git a/include/asm-x86/hpet.h b/include/asm-x86/hpet.h index cbbbb6d..58b273f 100644 --- a/include/asm-x86/hpet.h +++ b/include/asm-x86/hpet.h @@ -1,6 +1,8 @@ #ifndef ASM_X86__HPET_H #define ASM_X86__HPET_H +#include <linux/msi.h> + #ifdef CONFIG_HPET_TIMER #define HPET_MMAP_SIZE 1024 @@ -10,6 +12,11 @@ #define HPET_CFG 0x010 #define HPET_STATUS 0x020 #define HPET_COUNTER 0x0f0 + +#define HPET_Tn_CFG(n) (0x100 + 0x20 * n) +#define HPET_Tn_CMP(n) (0x108 + 0x20 * n) +#define HPET_Tn_ROUTE(n) (0x110 + 0x20 * n) + #define HPET_T0_CFG 0x100 #define HPET_T0_CMP 0x108 #define HPET_T0_ROUTE 0x110 @@ -65,6 +72,20 @@ extern void hpet_disable(void); extern unsigned long hpet_readl(unsigned long a); extern void force_hpet_resume(void); +extern void hpet_msi_unmask(unsigned int irq); +extern void hpet_msi_mask(unsigned int irq); +extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg); +extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg); + +#ifdef CONFIG_PCI_MSI +extern int arch_setup_hpet_msi(unsigned int irq); +#else +static inline int arch_setup_hpet_msi(unsigned int irq) +{ + return -EINVAL; +} +#endif + #ifdef CONFIG_HPET_EMULATE_RTC #include <linux/interrupt.h> diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h index 50f6e03..749d042 100644 --- a/include/asm-x86/hw_irq.h +++ b/include/asm-x86/hw_irq.h @@ -96,13 +96,8 @@ extern asmlinkage void qic_call_function_interrupt(void); /* SMP */ extern void smp_apic_timer_interrupt(struct pt_regs *); -#ifdef CONFIG_X86_32 extern void smp_spurious_interrupt(struct pt_regs *); extern void smp_error_interrupt(struct pt_regs *); -#else -extern asmlinkage void smp_spurious_interrupt(void); -extern asmlinkage void smp_error_interrupt(void); -#endif #ifdef CONFIG_X86_SMP extern void smp_reschedule_interrupt(struct pt_regs *); extern void smp_call_function_interrupt(struct pt_regs *); @@ -115,13 +110,13 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); #endif #ifdef CONFIG_X86_32 -extern void (*const interrupt[NR_IRQS])(void); -#else +extern void (*const interrupt[NR_VECTORS])(void); +#endif + typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); -#endif -#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_X86_64) +#ifdef CONFIG_X86_IO_APIC extern void lock_vector_lock(void); extern void unlock_vector_lock(void); extern void __setup_vector_irq(int cpu); diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h index 8ec68a5..d35cbd7 100644 --- a/include/asm-x86/io_apic.h +++ b/include/asm-x86/io_apic.h @@ -4,6 +4,7 @@ #include <linux/types.h> #include <asm/mpspec.h> #include <asm/apicdef.h> +#include <asm/irq_vectors.h> /* * Intel IO-APIC support for SMP and UP systems. @@ -87,24 +88,8 @@ struct IO_APIC_route_entry { mask : 1, /* 0: enabled, 1: disabled */ __reserved_2 : 15; -#ifdef CONFIG_X86_32 - union { - struct { - __u32 __reserved_1 : 24, - physical_dest : 4, - __reserved_2 : 4; - } physical; - - struct { - __u32 __reserved_1 : 24, - logical_dest : 8; - } logical; - } dest; -#else __u32 __reserved_3 : 24, dest : 8; -#endif - } __attribute__ ((packed)); struct IR_IO_APIC_route_entry { @@ -203,10 +188,17 @@ extern void restore_IO_APIC_setup(void); extern void reinit_intr_remapped_IO_APIC(int); #endif +extern int probe_nr_irqs(void); + #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 static const int timer_through_8259 = 0; static inline void ioapic_init_mappings(void) { } + +static inline int probe_nr_irqs(void) +{ + return NR_IRQS; +} #endif #endif /* ASM_X86__IO_APIC_H */ diff --git a/include/asm-x86/irq_vectors.h b/include/asm-x86/irq_vectors.h index c5d2d76..a8d065d 100644 --- a/include/asm-x86/irq_vectors.h +++ b/include/asm-x86/irq_vectors.h @@ -19,19 +19,14 @@ /* * Reserve the lowest usable priority level 0x20 - 0x2f for triggering - * cleanup after irq migration on 64 bit. + * cleanup after irq migration. */ #define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR /* - * Vectors 0x20-0x2f are used for ISA interrupts on 32 bit. - * Vectors 0x30-0x3f are used for ISA interrupts on 64 bit. + * Vectors 0x30-0x3f are used for ISA interrupts. */ -#ifdef CONFIG_X86_32 -#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR) -#else #define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) -#endif #define IRQ1_VECTOR (IRQ0_VECTOR + 1) #define IRQ2_VECTOR (IRQ0_VECTOR + 2) #define IRQ3_VECTOR (IRQ0_VECTOR + 3) @@ -96,11 +91,7 @@ * start at 0x31(0x41) to spread out vectors evenly between priority * levels. (0x80 is the syscall vector) */ -#ifdef CONFIG_X86_32 -# define FIRST_DEVICE_VECTOR 0x31 -#else -# define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) -#endif +#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) #define NR_VECTORS 256 @@ -116,7 +107,6 @@ # else # define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) # endif -# define NR_IRQ_VECTORS NR_IRQS #elif !defined(CONFIG_X86_VOYAGER) @@ -124,23 +114,15 @@ # define NR_IRQS 224 -# if (224 >= 32 * NR_CPUS) -# define NR_IRQ_VECTORS NR_IRQS -# else -# define NR_IRQ_VECTORS (32 * NR_CPUS) -# endif - # else /* IO_APIC || PARAVIRT */ # define NR_IRQS 16 -# define NR_IRQ_VECTORS NR_IRQS # endif #else /* !VISWS && !VOYAGER */ # define NR_IRQS 224 -# define NR_IRQ_VECTORS NR_IRQS #endif /* VISWS */ diff --git a/include/asm-x86/mach-default/entry_arch.h b/include/asm-x86/mach-default/entry_arch.h index 9283b60..6b1add8 100644 --- a/include/asm-x86/mach-default/entry_arch.h +++ b/include/asm-x86/mach-default/entry_arch.h @@ -14,6 +14,7 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) +BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) #endif /* diff --git a/include/asm-x86/mach-default/mach_apic.h b/include/asm-x86/mach-default/mach_apic.h index 2a330a4..3c66f2c 100644 --- a/include/asm-x86/mach-default/mach_apic.h +++ b/include/asm-x86/mach-default/mach_apic.h @@ -85,6 +85,20 @@ static inline int apicid_to_node(int logical_apicid) return 0; #endif } + +static inline cpumask_t vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} #endif static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) @@ -138,6 +152,5 @@ static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) static inline void enable_apic_mode(void) { } - #endif /* CONFIG_X86_LOCAL_APIC */ #endif /* ASM_X86__MACH_DEFAULT__MACH_APIC_H */ diff --git a/include/asm-x86/mach-generic/irq_vectors_limits.h b/include/asm-x86/mach-generic/irq_vectors_limits.h deleted file mode 100644 index f7870e1..0000000 --- a/include/asm-x86/mach-generic/irq_vectors_limits.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef ASM_X86__MACH_GENERIC__IRQ_VECTORS_LIMITS_H -#define ASM_X86__MACH_GENERIC__IRQ_VECTORS_LIMITS_H - -/* - * For Summit or generic (i.e. installer) kernels, we have lots of I/O APICs, - * even with uni-proc kernels, so use a big array. - * - * This value should be the same in both the generic and summit subarches. - * Change one, change 'em both. - */ -#define NR_IRQS 224 -#define NR_IRQ_VECTORS 1024 - -#endif /* ASM_X86__MACH_GENERIC__IRQ_VECTORS_LIMITS_H */ diff --git a/include/asm-x86/mach-generic/mach_apic.h b/include/asm-x86/mach-generic/mach_apic.h index 5d010c6..5085b52 100644 --- a/include/asm-x86/mach-generic/mach_apic.h +++ b/include/asm-x86/mach-generic/mach_apic.h @@ -24,6 +24,7 @@ #define check_phys_apicid_present (genapic->check_phys_apicid_present) #define check_apicid_used (genapic->check_apicid_used) #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) +#define vector_allocation_domain (genapic->vector_allocation_domain) #define enable_apic_mode (genapic->enable_apic_mode) #define phys_pkg_id (genapic->phys_pkg_id) diff --git a/include/asm-x86/numaq/apic.h b/include/asm-x86/numaq/apic.h index a8344ba..0bf2a06 100644 --- a/include/asm-x86/numaq/apic.h +++ b/include/asm-x86/numaq/apic.h @@ -12,8 +12,6 @@ static inline cpumask_t target_cpus(void) return CPU_MASK_ALL; } -#define TARGET_CPUS (target_cpus()) - #define NO_BALANCE_IRQ (1) #define esr_disable (1) diff --git a/include/asm-x86/summit/apic.h b/include/asm-x86/summit/apic.h index 394b00b..9b3070f 100644 --- a/include/asm-x86/summit/apic.h +++ b/include/asm-x86/summit/apic.h @@ -22,7 +22,6 @@ static inline cpumask_t target_cpus(void) */ return cpumask_of_cpu(0); } -#define TARGET_CPUS (target_cpus()) #define INT_DELIVERY_MODE (dest_LowestPrio) #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ diff --git a/include/asm-x86/summit/irq_vectors_limits.h b/include/asm-x86/summit/irq_vectors_limits.h deleted file mode 100644 index 890ce3f..0000000 --- a/include/asm-x86/summit/irq_vectors_limits.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _ASM_IRQ_VECTORS_LIMITS_H -#define _ASM_IRQ_VECTORS_LIMITS_H - -/* - * For Summit or generic (i.e. installer) kernels, we have lots of I/O APICs, - * even with uni-proc kernels, so use a big array. - * - * This value should be the same in both the generic and summit subarches. - * Change one, change 'em both. - */ -#define NR_IRQS 224 -#define NR_IRQ_VECTORS 1024 - -#endif /* _ASM_IRQ_VECTORS_LIMITS_H */ diff --git a/include/asm-x86/uv/bios.h b/include/asm-x86/uv/bios.h index 7cd6d7e..215f196 100644 --- a/include/asm-x86/uv/bios.h +++ b/include/asm-x86/uv/bios.h @@ -2,9 +2,7 @@ #define ASM_X86__UV__BIOS_H /* - * BIOS layer definitions. - * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * UV BIOS layer definitions. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,50 +17,78 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson */ #include <linux/rtc.h> -#define BIOS_FREQ_BASE 0x01000001 +/* + * Values for the BIOS calls. It is passed as the first * argument in the + * BIOS call. Passing any other value in the first argument will result + * in a BIOS_STATUS_UNIMPLEMENTED return status. + */ +enum uv_bios_cmd { + UV_BIOS_COMMON, + UV_BIOS_GET_SN_INFO, + UV_BIOS_FREQ_BASE +}; +/* + * Status values returned from a BIOS call. + */ enum { - BIOS_FREQ_BASE_PLATFORM = 0, - BIOS_FREQ_BASE_INTERVAL_TIMER = 1, - BIOS_FREQ_BASE_REALTIME_CLOCK = 2 + BIOS_STATUS_SUCCESS = 0, + BIOS_STATUS_UNIMPLEMENTED = -ENOSYS, + BIOS_STATUS_EINVAL = -EINVAL, + BIOS_STATUS_UNAVAIL = -EBUSY }; -# define BIOS_CALL(result, a0, a1, a2, a3, a4, a5, a6, a7) \ - do { \ - /* XXX - the real call goes here */ \ - result.status = BIOS_STATUS_UNIMPLEMENTED; \ - isrv.v0 = 0; \ - isrv.v1 = 0; \ - } while (0) +/* + * The UV system table describes specific firmware + * capabilities available to the Linux kernel at runtime. + */ +struct uv_systab { + char signature[4]; /* must be "UVST" */ + u32 revision; /* distinguish different firmware revs */ + u64 function; /* BIOS runtime callback function ptr */ +}; enum { - BIOS_STATUS_SUCCESS = 0, - BIOS_STATUS_UNIMPLEMENTED = -1, - BIOS_STATUS_EINVAL = -2, - BIOS_STATUS_ERROR = -3 + BIOS_FREQ_BASE_PLATFORM = 0, + BIOS_FREQ_BASE_INTERVAL_TIMER = 1, + BIOS_FREQ_BASE_REALTIME_CLOCK = 2 }; -struct uv_bios_retval { - /* - * A zero status value indicates call completed without error. - * A negative status value indicates reason of call failure. - * A positive status value indicates success but an - * informational value should be printed (e.g., "reboot for - * change to take effect"). - */ - s64 status; - u64 v0; - u64 v1; - u64 v2; +union partition_info_u { + u64 val; + struct { + u64 hub_version : 8, + partition_id : 16, + coherence_id : 16, + region_size : 24; + }; }; -extern long -x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second, - unsigned long *drift_info); -extern const char *x86_bios_strerror(long status); +/* + * bios calls have 6 parameters + */ +extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64); +extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); +extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); + +extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); +extern s64 uv_bios_freq_base(u64, u64 *); + +extern void uv_bios_init(void); + +extern int uv_type; +extern long sn_partition_id; +extern long uv_coherency_id; +extern long uv_region_size; +#define partition_coherence_id() (uv_coherency_id) + +extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ #endif /* ASM_X86__UV__BIOS_H */ diff --git a/include/asm-x86/uv/uv_irq.h b/include/asm-x86/uv/uv_irq.h new file mode 100644 index 0000000..8bf5f32 --- /dev/null +++ b/include/asm-x86/uv/uv_irq.h @@ -0,0 +1,36 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * SGI UV IRQ definitions + * + * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. + */ + +#ifndef ASM_X86__UV__UV_IRQ_H +#define ASM_X86__UV__UV_IRQ_H + +/* If a generic version of this structure gets defined, eliminate this one. */ +struct uv_IO_APIC_route_entry { + __u64 vector : 8, + delivery_mode : 3, + dest_mode : 1, + delivery_status : 1, + polarity : 1, + __reserved_1 : 1, + trigger : 1, + mask : 1, + __reserved_2 : 15, + dest : 32; +}; + +extern struct irq_chip uv_irq_chip; + +extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long); +extern void arch_disable_uv_irq(int, unsigned long); + +extern int uv_setup_irq(char *, int, int, unsigned long); +extern void uv_teardown_irq(unsigned int, int, unsigned long); + +#endif /* ASM_X86__UV__UV_IRQ_H */ diff --git a/include/linux/aer.h b/include/linux/aer.h index f251814..f7df1ee 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -10,7 +10,6 @@ #if defined(CONFIG_PCIEAER) /* pci-e port driver needs this function to enable aer */ extern int pci_enable_pcie_error_reporting(struct pci_dev *dev); -extern int pci_find_aer_capability(struct pci_dev *dev); extern int pci_disable_pcie_error_reporting(struct pci_dev *dev); extern int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev); #else @@ -18,10 +17,6 @@ static inline int pci_enable_pcie_error_reporting(struct pci_dev *dev) { return -EINVAL; } -static inline int pci_find_aer_capability(struct pci_dev *dev) -{ - return 0; -} static inline int pci_disable_pcie_error_reporting(struct pci_dev *dev) { return -EINVAL; diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 55e434f..f88d32f 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -45,7 +45,8 @@ struct clocksource; * @read: returns a cycle value * @mask: bitmask for two's complement * subtraction of non 64 bit counters - * @mult: cycle to nanosecond multiplier + * @mult: cycle to nanosecond multiplier (adjusted by NTP) + * @mult_orig: cycle to nanosecond multiplier (unadjusted by NTP) * @shift: cycle to nanosecond divisor (power of two) * @flags: flags describing special properties * @vread: vsyscall based read @@ -63,6 +64,7 @@ struct clocksource { cycle_t (*read)(void); cycle_t mask; u32 mult; + u32 mult_orig; u32 shift; unsigned long flags; cycle_t (*vread)(void); @@ -77,6 +79,7 @@ struct clocksource { /* timekeeping specific data, ignore */ cycle_t cycle_interval; u64 xtime_interval; + u32 raw_interval; /* * Second part is written at each timer interrupt * Keep it in a different cache line to dirty no @@ -85,6 +88,7 @@ struct clocksource { cycle_t cycle_last ____cacheline_aligned_in_smp; u64 xtime_nsec; s64 error; + struct timespec raw_time; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG /* Watchdog related data, used by the framework */ @@ -201,17 +205,19 @@ static inline void clocksource_calculate_interval(struct clocksource *c, { u64 tmp; - /* XXX - All of this could use a whole lot of optimization */ + /* Do the ns -> cycle conversion first, using original mult */ tmp = length_nsec; tmp <<= c->shift; - tmp += c->mult/2; - do_div(tmp, c->mult); + tmp += c->mult_orig/2; + do_div(tmp, c->mult_orig); c->cycle_interval = (cycle_t)tmp; if (c->cycle_interval == 0) c->cycle_interval = 1; + /* Go back from cycles -> shifted ns, this time use ntp adjused mult */ c->xtime_interval = (u64)c->cycle_interval * c->mult; + c->raw_interval = ((u64)c->cycle_interval * c->mult_orig) >> c->shift; } diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 8322141..98115d9 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -44,6 +44,8 @@ extern void __chk_io_ptr(const volatile void __iomem *); # error Sorry, your compiler is too old/not recognized. #endif +#define notrace __attribute__((no_instrument_function)) + /* Intel compiler defines __GNUC__. So we will overwrite implementations * coming from above header files here */ diff --git a/include/linux/dmar.h b/include/linux/dmar.h index c360c55..f1984fc 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -45,7 +45,6 @@ extern struct list_head dmar_drhd_units; list_for_each_entry(drhd, &dmar_drhd_units, list) extern int dmar_table_init(void); -extern int early_dmar_detect(void); extern int dmar_dev_scope_init(void); /* Intel IOMMU detection */ diff --git a/include/linux/efi.h b/include/linux/efi.h index 807373d..bb66feb 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -208,6 +208,9 @@ typedef efi_status_t efi_set_virtual_address_map_t (unsigned long memory_map_siz #define EFI_GLOBAL_VARIABLE_GUID \ EFI_GUID( 0x8be4df61, 0x93ca, 0x11d2, 0xaa, 0x0d, 0x00, 0xe0, 0x98, 0x03, 0x2b, 0x8c ) +#define UV_SYSTEM_TABLE_GUID \ + EFI_GUID( 0x3b13a7d4, 0x633e, 0x11dd, 0x93, 0xec, 0xda, 0x25, 0x56, 0xd8, 0x95, 0x93 ) + typedef struct { efi_guid_t guid; unsigned long table; @@ -255,6 +258,7 @@ extern struct efi { unsigned long boot_info; /* boot info table */ unsigned long hcdp; /* HCDP table */ unsigned long uga; /* UGA table */ + unsigned long uv_systab; /* UV system table */ efi_get_time_t *get_time; efi_set_time_t *set_time; efi_get_wakeup_time_t *get_wakeup_time; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index bb38406..a3d4615 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1,10 +1,14 @@ #ifndef _LINUX_FTRACE_H #define _LINUX_FTRACE_H -#ifdef CONFIG_FTRACE - #include <linux/linkage.h> #include <linux/fs.h> +#include <linux/ktime.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kallsyms.h> + +#ifdef CONFIG_FTRACE extern int ftrace_enabled; extern int @@ -36,6 +40,7 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1); # define register_ftrace_function(ops) do { } while (0) # define unregister_ftrace_function(ops) do { } while (0) # define clear_ftrace_function(ops) do { } while (0) +static inline void ftrace_kill_atomic(void) { } #endif /* CONFIG_FTRACE */ #ifdef CONFIG_DYNAMIC_FTRACE @@ -76,8 +81,10 @@ extern void mcount_call(void); extern int skip_trace(unsigned long ip); -void ftrace_disable_daemon(void); -void ftrace_enable_daemon(void); +extern void ftrace_release(void *start, unsigned long size); + +extern void ftrace_disable_daemon(void); +extern void ftrace_enable_daemon(void); #else # define skip_trace(ip) ({ 0; }) @@ -85,6 +92,7 @@ void ftrace_enable_daemon(void); # define ftrace_set_filter(buf, len, reset) do { } while (0) # define ftrace_disable_daemon() do { } while (0) # define ftrace_enable_daemon() do { } while (0) +static inline void ftrace_release(void *start, unsigned long size) { } #endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ @@ -98,9 +106,11 @@ static inline void tracer_disable(void) #endif } -/* Ftrace disable/restore without lock. Some synchronization mechanism +/* + * Ftrace disable/restore without lock. Some synchronization mechanism * must be used to prevent ftrace_enabled to be changed between - * disable/restore. */ + * disable/restore. + */ static inline int __ftrace_enabled_save(void) { #ifdef CONFIG_FTRACE @@ -157,9 +167,71 @@ static inline void __ftrace_enabled_restore(int enabled) #ifdef CONFIG_TRACING extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); + +/** + * ftrace_printk - printf formatting in the ftrace buffer + * @fmt: the printf format for printing + * + * Note: __ftrace_printk is an internal function for ftrace_printk and + * the @ip is passed in via the ftrace_printk macro. + * + * This function allows a kernel developer to debug fast path sections + * that printk is not appropriate for. By scattering in various + * printk like tracing in the code, a developer can quickly see + * where problems are occurring. + * + * This is intended as a debugging tool for the developer only. + * Please refrain from leaving ftrace_printks scattered around in + * your code. + */ +# define ftrace_printk(fmt...) __ftrace_printk(_THIS_IP_, fmt) +extern int +__ftrace_printk(unsigned long ip, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void ftrace_dump(void); #else static inline void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } +static inline int +ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0))); + +static inline int +ftrace_printk(const char *fmt, ...) +{ + return 0; +} +static inline void ftrace_dump(void) { } #endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD +extern void ftrace_init(void); +extern void ftrace_init_module(unsigned long *start, unsigned long *end); +#else +static inline void ftrace_init(void) { } +static inline void +ftrace_init_module(unsigned long *start, unsigned long *end) { } +#endif + + +struct boot_trace { + pid_t caller; + char func[KSYM_NAME_LEN]; + int result; + unsigned long long duration; /* usecs */ + ktime_t calltime; + ktime_t rettime; +}; + +#ifdef CONFIG_BOOT_TRACER +extern void trace_boot(struct boot_trace *it, initcall_t fn); +extern void start_boot_trace(void); +extern void stop_boot_trace(void); +#else +static inline void trace_boot(struct boot_trace *it, initcall_t fn) { } +static inline void start_boot_trace(void) { } +static inline void stop_boot_trace(void) { } +#endif + + + #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 265635d..350fe97 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -17,8 +17,14 @@ * - add lock_owner field to fuse_setattr_in, fuse_read_in and fuse_write_in * - add blksize field to fuse_attr * - add file flags field to fuse_read_in and fuse_write_in + * + * 7.10 + * - add nonseekable open flag */ +#ifndef _LINUX_FUSE_H +#define _LINUX_FUSE_H + #include <asm/types.h> #include <linux/major.h> @@ -26,7 +32,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 9 +#define FUSE_KERNEL_MINOR_VERSION 10 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -98,9 +104,11 @@ struct fuse_file_lock { * * FOPEN_DIRECT_IO: bypass page cache for this open file * FOPEN_KEEP_CACHE: don't invalidate the data cache on open + * FOPEN_NONSEEKABLE: the file is not seekable */ #define FOPEN_DIRECT_IO (1 << 0) #define FOPEN_KEEP_CACHE (1 << 1) +#define FOPEN_NONSEEKABLE (1 << 2) /** * INIT request/reply flags @@ -409,3 +417,5 @@ struct fuse_dirent { #define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1)) #define FUSE_DIRENT_SIZE(d) \ FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen) + +#endif /* _LINUX_FUSE_H */ diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2f245fe..9a4e35c 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -125,12 +125,12 @@ struct hrtimer { enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; unsigned long state; - enum hrtimer_cb_mode cb_mode; struct list_head cb_entry; + enum hrtimer_cb_mode cb_mode; #ifdef CONFIG_TIMER_STATS + int start_pid; void *start_site; char start_comm[16]; - int start_pid; #endif }; @@ -155,10 +155,8 @@ struct hrtimer_sleeper { * @first: pointer to the timer node which expires first * @resolution: the resolution of the clock, in nanoseconds * @get_time: function to retrieve the current time of the clock - * @get_softirq_time: function to retrieve the current time from the softirq * @softirq_time: the time when running the hrtimer queue in the softirq * @offset: offset of this clock to the monotonic base - * @reprogram: function to reprogram the timer event */ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; @@ -167,13 +165,9 @@ struct hrtimer_clock_base { struct rb_node *first; ktime_t resolution; ktime_t (*get_time)(void); - ktime_t (*get_softirq_time)(void); ktime_t softirq_time; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t offset; - int (*reprogram)(struct hrtimer *t, - struct hrtimer_clock_base *b, - ktime_t n); #endif }; diff --git a/include/linux/ide.h b/include/linux/ide.h index c47e371..89e53cf 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -461,12 +461,26 @@ struct ide_acpi_drive_link; struct ide_acpi_hwif_link; #endif +struct ide_drive_s; + +struct ide_disk_ops { + int (*check)(struct ide_drive_s *, const char *); + int (*get_capacity)(struct ide_drive_s *); + void (*setup)(struct ide_drive_s *); + void (*flush)(struct ide_drive_s *); + int (*init_media)(struct ide_drive_s *, struct gendisk *); + int (*set_doorlock)(struct ide_drive_s *, struct gendisk *, + int); + ide_startstop_t (*do_request)(struct ide_drive_s *, struct request *, + sector_t); + int (*end_request)(struct ide_drive_s *, int, int); + int (*ioctl)(struct ide_drive_s *, struct inode *, + struct file *, unsigned int, unsigned long); +}; + /* ATAPI device flags */ enum { IDE_AFLAG_DRQ_INTERRUPT = (1 << 0), - IDE_AFLAG_MEDIA_CHANGED = (1 << 1), - /* Drive cannot lock the door. */ - IDE_AFLAG_NO_DOORLOCK = (1 << 2), /* ide-cd */ /* Drive cannot eject the disc. */ @@ -498,14 +512,10 @@ enum { IDE_AFLAG_LE_SPEED_FIELDS = (1 << 17), /* ide-floppy */ - /* Format in progress */ - IDE_AFLAG_FORMAT_IN_PROGRESS = (1 << 18), /* Avoid commands not supported in Clik drive */ IDE_AFLAG_CLIK_DRIVE = (1 << 19), /* Requires BH algorithm for packets */ IDE_AFLAG_ZIP_DRIVE = (1 << 20), - /* Write protect */ - IDE_AFLAG_WP = (1 << 21), /* Supports format progress report */ IDE_AFLAG_SRFP = (1 << 22), @@ -578,7 +588,11 @@ enum { /* don't unload heads */ IDE_DFLAG_NO_UNLOAD = (1 << 27), /* heads unloaded, please don't reset port */ - IDE_DFLAG_PARKED = (1 << 28) + IDE_DFLAG_PARKED = (1 << 28), + IDE_DFLAG_MEDIA_CHANGED = (1 << 29), + /* write protect */ + IDE_DFLAG_WP = (1 << 30), + IDE_DFLAG_FORMAT_IN_PROGRESS = (1 << 31), }; struct ide_drive_s { @@ -597,6 +611,8 @@ struct ide_drive_s { #endif struct hwif_s *hwif; /* actually (ide_hwif_t *) */ + const struct ide_disk_ops *disk_ops; + unsigned long dev_flags; unsigned long sleep; /* sleep until this time */ @@ -1123,8 +1139,8 @@ struct ide_driver_s { void (*resume)(ide_drive_t *); void (*shutdown)(ide_drive_t *); #ifdef CONFIG_IDE_PROC_FS - ide_proc_entry_t *proc; - const struct ide_proc_devset *settings; + ide_proc_entry_t * (*proc_entries)(ide_drive_t *); + const struct ide_proc_devset * (*proc_devsets)(ide_drive_t *); #endif }; diff --git a/include/linux/init.h b/include/linux/init.h index ad63824..0c12646 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -40,7 +40,7 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(.init.text) __cold +#define __init __section(.init.text) __cold notrace #define __initdata __section(.init.data) #define __initconst __section(.init.rodata) #define __exitdata __section(.exit.data) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 35a61dc..f58a0cf 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -8,6 +8,7 @@ #include <linux/preempt.h> #include <linux/cpumask.h> #include <linux/irqreturn.h> +#include <linux/irqnr.h> #include <linux/hardirq.h> #include <linux/sched.h> #include <linux/irqflags.h> diff --git a/include/linux/irq.h b/include/linux/irq.h index 8d9411b..d058c57 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -18,6 +18,7 @@ #include <linux/spinlock.h> #include <linux/cpumask.h> #include <linux/irqreturn.h> +#include <linux/irqnr.h> #include <linux/errno.h> #include <asm/irq.h> @@ -152,6 +153,7 @@ struct irq_chip { * @name: flow handler name for /proc/interrupts output */ struct irq_desc { + unsigned int irq; irq_flow_handler_t handle_irq; struct irq_chip *chip; struct msi_desc *msi_desc; @@ -170,7 +172,7 @@ struct irq_desc { cpumask_t affinity; unsigned int cpu; #endif -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) +#ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_t pending_mask; #endif #ifdef CONFIG_PROC_FS @@ -179,8 +181,14 @@ struct irq_desc { const char *name; } ____cacheline_internodealigned_in_smp; + extern struct irq_desc irq_desc[NR_IRQS]; +static inline struct irq_desc *irq_to_desc(unsigned int irq) +{ + return (irq < nr_irqs) ? irq_desc + irq : NULL; +} + /* * Migration helpers for obsolete names, they will go away: */ @@ -198,19 +206,15 @@ extern int setup_irq(unsigned int irq, struct irqaction *new); #ifdef CONFIG_GENERIC_HARDIRQS -#ifndef handle_dynamic_tick -# define handle_dynamic_tick(a) do { } while (0) -#endif - #ifdef CONFIG_SMP -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) +#ifdef CONFIG_GENERIC_PENDING_IRQ void set_pending_irq(unsigned int irq, cpumask_t mask); void move_native_irq(int irq); void move_masked_irq(int irq); -#else /* CONFIG_GENERIC_PENDING_IRQ || CONFIG_IRQBALANCE */ +#else /* CONFIG_GENERIC_PENDING_IRQ */ static inline void move_irq(int irq) { @@ -237,19 +241,14 @@ static inline void set_pending_irq(unsigned int irq, cpumask_t mask) #endif /* CONFIG_SMP */ -#ifdef CONFIG_IRQBALANCE -extern void set_balance_irq_affinity(unsigned int irq, cpumask_t mask); -#else -static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) -{ -} -#endif - extern int no_irq_affinity; static inline int irq_balancing_disabled(unsigned int irq) { - return irq_desc[irq].status & IRQ_NO_BALANCING_MASK; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + return desc->status & IRQ_NO_BALANCING_MASK; } /* Handle irq action chains: */ @@ -279,10 +278,8 @@ extern unsigned int __do_IRQ(unsigned int irq); * irqchip-style controller then we call the ->handle_irq() handler, * and it calls __do_IRQ() if it's attached to an irqtype-style controller. */ -static inline void generic_handle_irq(unsigned int irq) +static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc) { - struct irq_desc *desc = irq_desc + irq; - #ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ desc->handle_irq(irq, desc); #else @@ -293,6 +290,11 @@ static inline void generic_handle_irq(unsigned int irq) #endif } +static inline void generic_handle_irq(unsigned int irq) +{ + generic_handle_irq_desc(irq, irq_to_desc(irq)); +} + /* Handling of unhandled and spurious interrupts: */ extern void note_interrupt(unsigned int irq, struct irq_desc *desc, int action_ret); @@ -325,7 +327,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, static inline void __set_irq_handler_unlocked(int irq, irq_flow_handler_t handler) { - irq_desc[irq].handle_irq = handler; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->handle_irq = handler; } /* @@ -353,13 +358,14 @@ extern void set_irq_noprobe(unsigned int irq); extern void set_irq_probe(unsigned int irq); /* Handle dynamic irq creation and destruction */ +extern unsigned int create_irq_nr(unsigned int irq_want); extern int create_irq(void); extern void destroy_irq(unsigned int irq); /* Test to see if a driver has successfully requested an irq */ static inline int irq_has_action(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); return desc->action != NULL; } @@ -374,10 +380,10 @@ extern int set_irq_chip_data(unsigned int irq, void *data); extern int set_irq_type(unsigned int irq, unsigned int type); extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); -#define get_irq_chip(irq) (irq_desc[irq].chip) -#define get_irq_chip_data(irq) (irq_desc[irq].chip_data) -#define get_irq_data(irq) (irq_desc[irq].handler_data) -#define get_irq_msi(irq) (irq_desc[irq].msi_desc) +#define get_irq_chip(irq) (irq_to_desc(irq)->chip) +#define get_irq_chip_data(irq) (irq_to_desc(irq)->chip_data) +#define get_irq_data(irq) (irq_to_desc(irq)->handler_data) +#define get_irq_msi(irq) (irq_to_desc(irq)->msi_desc) #endif /* CONFIG_GENERIC_HARDIRQS */ diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h new file mode 100644 index 0000000..3171ddc --- /dev/null +++ b/include/linux/irqnr.h @@ -0,0 +1,24 @@ +#ifndef _LINUX_IRQNR_H +#define _LINUX_IRQNR_H + +#ifndef CONFIG_GENERIC_HARDIRQS +#include <asm/irq.h> +# define nr_irqs NR_IRQS + +# define for_each_irq_desc(irq, desc) \ + for (irq = 0; irq < nr_irqs; irq++) +#else +extern int nr_irqs; + +# define for_each_irq_desc(irq, desc) \ + for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++) + +# define for_each_irq_desc_reverse(irq, desc) \ + for (irq = nr_irqs -1, desc = irq_desc + (nr_irqs -1 ); \ + irq > 0; irq--, desc--) +#endif + +#define for_each_irq_nr(irq) \ + for (irq = 0; irq < nr_irqs; irq++) + +#endif diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5a566b7..94d17ff 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -496,4 +496,9 @@ struct sysinfo { #define NUMA_BUILD 0 #endif +/* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ +#ifdef CONFIG_FTRACE_MCOUNT_RECORD +# define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD +#endif + #endif diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index cf9f40a..4a145ca 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -39,19 +39,34 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); +struct irq_desc; + +static inline void kstat_incr_irqs_this_cpu(unsigned int irq, + struct irq_desc *desc) +{ + kstat_this_cpu.irqs[irq]++; +} + +static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) +{ + return kstat_cpu(cpu).irqs[irq]; +} + /* * Number of interrupts per specific IRQ source, since bootup */ -static inline int kstat_irqs(int irq) +static inline unsigned int kstat_irqs(unsigned int irq) { - int cpu, sum = 0; + unsigned int sum = 0; + int cpu; for_each_possible_cpu(cpu) - sum += kstat_cpu(cpu).irqs[irq]; + sum += kstat_irqs_cpu(irq, cpu); return sum; } +extern unsigned long long task_delta_exec(struct task_struct *); extern void account_user_time(struct task_struct *, cputime_t); extern void account_user_time_scaled(struct task_struct *, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t); diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 0be7795..497b1d1 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -29,6 +29,7 @@ * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi * <prasanna@in.ibm.com> added function-return probes. */ +#include <linux/linkage.h> #include <linux/list.h> #include <linux/notifier.h> #include <linux/smp.h> @@ -47,7 +48,7 @@ #define KPROBE_HIT_SSDONE 0x00000008 /* Attach to insert probes on any functions which should be ignored*/ -#define __kprobes __attribute__((__section__(".kprobes.text"))) +#define __kprobes __attribute__((__section__(".kprobes.text"))) notrace struct kprobe; struct pt_regs; @@ -256,7 +257,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); #else /* CONFIG_KPROBES */ -#define __kprobes /**/ +#define __kprobes notrace struct jprobe; struct kretprobe; diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 56ba373..9fd1f85 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -4,8 +4,6 @@ #include <linux/compiler.h> #include <asm/linkage.h> -#define notrace __attribute__((no_instrument_function)) - #ifdef __cplusplus #define CPP_ASMLINKAGE extern "C" #else diff --git a/include/linux/marker.h b/include/linux/marker.h index 1290653..889196c 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -160,4 +160,11 @@ extern int marker_probe_unregister_private_data(marker_probe_func *probe, extern void *marker_get_private_data(const char *name, marker_probe_func *probe, int num); +/* + * marker_synchronize_unregister must be called between the last marker probe + * unregistration and the end of module exit to make sure there is no caller + * executing a probe when it is freed. + */ +#define marker_synchronize_unregister() synchronize_sched() + #endif diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 61d19e1..139d7c8 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -34,11 +34,15 @@ extern void unregister_kmmio_probe(struct kmmio_probe *p); /* Called from page fault handler. */ extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); -/* Called from ioremap.c */ #ifdef CONFIG_MMIOTRACE +/* Called from ioremap.c */ extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, void __iomem *addr); extern void mmiotrace_iounmap(volatile void __iomem *addr); + +/* For anyone to insert markers. Remember trailing newline. */ +extern int mmiotrace_printk(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); #else static inline void mmiotrace_ioremap(resource_size_t offset, unsigned long size, void __iomem *addr) @@ -48,15 +52,22 @@ static inline void mmiotrace_ioremap(resource_size_t offset, static inline void mmiotrace_iounmap(volatile void __iomem *addr) { } -#endif /* CONFIG_MMIOTRACE_HOOKS */ + +static inline int mmiotrace_printk(const char *fmt, ...) + __attribute__ ((format (printf, 1, 0))); + +static inline int mmiotrace_printk(const char *fmt, ...) +{ + return 0; +} +#endif /* CONFIG_MMIOTRACE */ enum mm_io_opcode { MMIO_READ = 0x1, /* struct mmiotrace_rw */ MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ MMIO_PROBE = 0x3, /* struct mmiotrace_map */ MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ - MMIO_MARKER = 0x5, /* raw char data */ - MMIO_UNKNOWN_OP = 0x6, /* struct mmiotrace_rw */ + MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */ }; struct mmiotrace_rw { @@ -81,5 +92,6 @@ extern void enable_mmiotrace(void); extern void disable_mmiotrace(void); extern void mmio_trace_rw(struct mmiotrace_rw *rw); extern void mmio_trace_mapping(struct mmiotrace_map *map); +extern int mmio_trace_printk(const char *fmt, va_list args); #endif /* MMIOTRACE_H */ diff --git a/include/linux/module.h b/include/linux/module.h index a41555c..5d2970c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -16,6 +16,7 @@ #include <linux/kobject.h> #include <linux/moduleparam.h> #include <linux/marker.h> +#include <linux/tracepoint.h> #include <asm/local.h> #include <asm/module.h> @@ -331,6 +332,10 @@ struct module struct marker *markers; unsigned int num_markers; #endif +#ifdef CONFIG_TRACEPOINTS + struct tracepoint *tracepoints; + unsigned int num_tracepoints; +#endif #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ @@ -453,6 +458,9 @@ extern void print_modules(void); extern void module_update_markers(void); +extern void module_update_tracepoints(void); +extern int module_get_iter_tracepoints(struct tracepoint_iter *iter); + #else /* !CONFIG_MODULES... */ #define EXPORT_SYMBOL(sym) #define EXPORT_SYMBOL_GPL(sym) @@ -557,6 +565,15 @@ static inline void module_update_markers(void) { } +static inline void module_update_tracepoints(void) +{ +} + +static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter) +{ + return 0; +} + #endif /* CONFIG_MODULES */ struct device_driver; diff --git a/include/linux/pci.h b/include/linux/pci.h index acf8f24..085187b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -214,6 +214,7 @@ struct pci_dev { unsigned int broken_parity_status:1; /* Device generates false positive parity */ unsigned int msi_enabled:1; unsigned int msix_enabled:1; + unsigned int ari_enabled:1; /* ARI forwarding */ unsigned int is_managed:1; unsigned int is_pcie:1; pci_dev_flags_t dev_flags; @@ -347,7 +348,6 @@ struct pci_bus_region { struct pci_dynids { spinlock_t lock; /* protects list, index */ struct list_head list; /* for IDs added at runtime */ - unsigned int use_driver_data:1; /* pci_device_id->driver_data is used */ }; /* ---------------------------------------------------------------- */ @@ -456,8 +456,8 @@ struct pci_driver { /** * PCI_VDEVICE - macro used to describe a specific pci device in short form - * @vend: the vendor name - * @dev: the 16 bit PCI Device ID + * @vendor: the vendor name + * @device: the 16 bit PCI Device ID * * This macro is used to create a struct pci_device_id that matches a * specific PCI device. The subvendor, and subdevice fields will be set @@ -645,6 +645,7 @@ pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state); bool pci_pme_capable(struct pci_dev *dev, pci_power_t state); void pci_pme_active(struct pci_dev *dev, bool enable); int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable); +int pci_wake_from_d3(struct pci_dev *dev, bool enable); pci_power_t pci_target_state(struct pci_dev *dev); int pci_prepare_to_sleep(struct pci_dev *dev); int pci_back_from_sleep(struct pci_dev *dev); @@ -725,7 +726,7 @@ enum pci_dma_burst_strategy { }; struct msix_entry { - u16 vector; /* kernel uses to write allocated vector */ + u32 vector; /* kernel uses to write allocated vector */ u16 entry; /* driver uses to specify entry, OS writes */ }; @@ -1118,5 +1119,20 @@ static inline void pci_mmcfg_early_init(void) { } static inline void pci_mmcfg_late_init(void) { } #endif +#ifdef CONFIG_HAS_IOMEM +static inline void * pci_ioremap_bar(struct pci_dev *pdev, int bar) +{ + /* + * Make sure the BAR is actually a memory resource, not an IO resource + */ + if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) { + WARN_ON(1); + return NULL; + } + return ioremap_nocache(pci_resource_start(pdev, bar), + pci_resource_len(pdev, bar)); +} +#endif + #endif /* __KERNEL__ */ #endif /* LINUX_PCI_H */ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 8edddc2..e5d344b 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2454,9 +2454,9 @@ #define PCI_DEVICE_ID_INTEL_ICH10_3 0x3a1a #define PCI_DEVICE_ID_INTEL_ICH10_4 0x3a30 #define PCI_DEVICE_ID_INTEL_ICH10_5 0x3a60 -#define PCI_DEVICE_ID_INTEL_PCH_0 0x3b10 -#define PCI_DEVICE_ID_INTEL_PCH_1 0x3b11 -#define PCI_DEVICE_ID_INTEL_PCH_2 0x3b30 +#define PCI_DEVICE_ID_INTEL_PCH_LPC_MIN 0x3b00 +#define PCI_DEVICE_ID_INTEL_PCH_LPC_MAX 0x3b1f +#define PCI_DEVICE_ID_INTEL_PCH_SMBUS 0x3b30 #define PCI_DEVICE_ID_INTEL_IOAT_SNB 0x402f #define PCI_DEVICE_ID_INTEL_5100_16 0x65f0 #define PCI_DEVICE_ID_INTEL_5100_21 0x65f5 diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index 450684f..eb6686b 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -419,6 +419,10 @@ #define PCI_EXP_RTCTL_CRSSVE 0x10 /* CRS Software Visibility Enable */ #define PCI_EXP_RTCAP 30 /* Root Capabilities */ #define PCI_EXP_RTSTA 32 /* Root Status */ +#define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */ +#define PCI_EXP_DEVCAP2_ARI 0x20 /* Alternative Routing-ID */ +#define PCI_EXP_DEVCTL2 40 /* Device Control 2 */ +#define PCI_EXP_DEVCTL2_ARI 0x20 /* Alternative Routing-ID */ /* Extended Capabilities (PCI-X 2.0 and Express) */ #define PCI_EXT_CAP_ID(header) (header & 0x0000ffff) @@ -429,6 +433,7 @@ #define PCI_EXT_CAP_ID_VC 2 #define PCI_EXT_CAP_ID_DSN 3 #define PCI_EXT_CAP_ID_PWR 4 +#define PCI_EXT_CAP_ID_ARI 14 /* Advanced Error Reporting */ #define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */ @@ -536,5 +541,14 @@ #define HT_CAPTYPE_GEN3 0xD0 /* Generation 3 hypertransport configuration */ #define HT_CAPTYPE_PM 0xE0 /* Hypertransport powermanagement configuration */ +/* Alternative Routing-ID Interpretation */ +#define PCI_ARI_CAP 0x04 /* ARI Capability Register */ +#define PCI_ARI_CAP_MFVC 0x0001 /* MFVC Function Groups Capability */ +#define PCI_ARI_CAP_ACS 0x0002 /* ACS Function Groups Capability */ +#define PCI_ARI_CAP_NFN(x) (((x) >> 8) & 0xff) /* Next Function Number */ +#define PCI_ARI_CTRL 0x06 /* ARI Control Register */ +#define PCI_ARI_CTRL_MFVC 0x0001 /* MFVC Function Groups Enable */ +#define PCI_ARI_CTRL_ACS 0x0002 /* ACS Function Groups Enable */ +#define PCI_ARI_CTRL_FG(x) (((x) >> 4) & 7) /* Function Group */ #endif /* LINUX_PCI_REGS_H */ diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index a7dd38f..a7c7213 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -45,8 +45,6 @@ struct k_itimer { int it_requeue_pending; /* waiting to requeue this timer */ #define REQUEUE_PENDING 1 int it_sigev_notify; /* notify word of sigevent struct */ - int it_sigev_signo; /* signo word of sigevent struct */ - sigval_t it_sigev_value; /* value word of sigevent struct */ struct task_struct *it_process; /* process to send signal to */ struct sigqueue *sigq; /* signal queue entry. */ union { @@ -115,4 +113,6 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, long clock_nanosleep_restart(struct restart_block *restart_block); +void update_rlimit_cpu(unsigned long rlim_new); + #endif diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h new file mode 100644 index 0000000..536b0ca --- /dev/null +++ b/include/linux/ring_buffer.h @@ -0,0 +1,127 @@ +#ifndef _LINUX_RING_BUFFER_H +#define _LINUX_RING_BUFFER_H + +#include <linux/mm.h> +#include <linux/seq_file.h> + +struct ring_buffer; +struct ring_buffer_iter; + +/* + * Don't reference this struct directly, use functions below. + */ +struct ring_buffer_event { + u32 type:2, len:3, time_delta:27; + u32 array[]; +}; + +/** + * enum ring_buffer_type - internal ring buffer types + * + * @RINGBUF_TYPE_PADDING: Left over page padding + * array is ignored + * size is variable depending on how much + * padding is needed + * + * @RINGBUF_TYPE_TIME_EXTEND: Extend the time delta + * array[0] = time delta (28 .. 59) + * size = 8 bytes + * + * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock + * array[0] = tv_nsec + * array[1] = tv_sec + * size = 16 bytes + * + * @RINGBUF_TYPE_DATA: Data record + * If len is zero: + * array[0] holds the actual length + * array[1..(length+3)/4-1] holds data + * else + * length = len << 2 + * array[0..(length+3)/4] holds data + */ +enum ring_buffer_type { + RINGBUF_TYPE_PADDING, + RINGBUF_TYPE_TIME_EXTEND, + /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */ + RINGBUF_TYPE_TIME_STAMP, + RINGBUF_TYPE_DATA, +}; + +unsigned ring_buffer_event_length(struct ring_buffer_event *event); +void *ring_buffer_event_data(struct ring_buffer_event *event); + +/** + * ring_buffer_event_time_delta - return the delta timestamp of the event + * @event: the event to get the delta timestamp of + * + * The delta timestamp is the 27 bit timestamp since the last event. + */ +static inline unsigned +ring_buffer_event_time_delta(struct ring_buffer_event *event) +{ + return event->time_delta; +} + +/* + * size is in bytes for each per CPU buffer. + */ +struct ring_buffer * +ring_buffer_alloc(unsigned long size, unsigned flags); +void ring_buffer_free(struct ring_buffer *buffer); + +int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size); + +struct ring_buffer_event * +ring_buffer_lock_reserve(struct ring_buffer *buffer, + unsigned long length, + unsigned long *flags); +int ring_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags); +int ring_buffer_write(struct ring_buffer *buffer, + unsigned long length, void *data); + +struct ring_buffer_event * +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts); +struct ring_buffer_event * +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts); + +struct ring_buffer_iter * +ring_buffer_read_start(struct ring_buffer *buffer, int cpu); +void ring_buffer_read_finish(struct ring_buffer_iter *iter); + +struct ring_buffer_event * +ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts); +struct ring_buffer_event * +ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts); +void ring_buffer_iter_reset(struct ring_buffer_iter *iter); +int ring_buffer_iter_empty(struct ring_buffer_iter *iter); + +unsigned long ring_buffer_size(struct ring_buffer *buffer); + +void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu); +void ring_buffer_reset(struct ring_buffer *buffer); + +int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, + struct ring_buffer *buffer_b, int cpu); + +int ring_buffer_empty(struct ring_buffer *buffer); +int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu); + +void ring_buffer_record_disable(struct ring_buffer *buffer); +void ring_buffer_record_enable(struct ring_buffer *buffer); +void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu); +void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu); + +unsigned long ring_buffer_entries(struct ring_buffer *buffer); +unsigned long ring_buffer_overruns(struct ring_buffer *buffer); + +u64 ring_buffer_time_stamp(int cpu); +void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); + +enum ring_buffer_flags { + RB_FL_OVERWRITE = 1 << 0, +}; + +#endif /* _LINUX_RING_BUFFER_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f52dbd3..5c38db5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -434,6 +434,39 @@ struct pacct_struct { unsigned long ac_minflt, ac_majflt; }; +/** + * struct task_cputime - collected CPU time counts + * @utime: time spent in user mode, in &cputime_t units + * @stime: time spent in kernel mode, in &cputime_t units + * @sum_exec_runtime: total time spent on the CPU, in nanoseconds + * + * This structure groups together three kinds of CPU time that are + * tracked for threads and thread groups. Most things considering + * CPU time want to group these counts together and treat all three + * of them in parallel. + */ +struct task_cputime { + cputime_t utime; + cputime_t stime; + unsigned long long sum_exec_runtime; +}; +/* Alternate field names when used to cache expirations. */ +#define prof_exp stime +#define virt_exp utime +#define sched_exp sum_exec_runtime + +/** + * struct thread_group_cputime - thread group interval timer counts + * @totals: thread group interval timers; substructure for + * uniprocessor kernel, per-cpu for SMP kernel. + * + * This structure contains the version of task_cputime, above, that is + * used for thread group CPU clock calculations. + */ +struct thread_group_cputime { + struct task_cputime *totals; +}; + /* * NOTE! "signal_struct" does not have it's own * locking, because a shared signal_struct always @@ -479,6 +512,17 @@ struct signal_struct { cputime_t it_prof_expires, it_virt_expires; cputime_t it_prof_incr, it_virt_incr; + /* + * Thread group totals for process CPU clocks. + * See thread_group_cputime(), et al, for details. + */ + struct thread_group_cputime cputime; + + /* Earliest-expiration cache. */ + struct task_cputime cputime_expires; + + struct list_head cpu_timers[3]; + /* job control IDs */ /* @@ -509,7 +553,7 @@ struct signal_struct { * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ - cputime_t utime, stime, cutime, cstime; + cputime_t cutime, cstime; cputime_t gtime; cputime_t cgtime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; @@ -518,14 +562,6 @@ struct signal_struct { struct task_io_accounting ioac; /* - * Cumulative ns of scheduled CPU time for dead threads in the - * group, not including a zombie group leader. (This only differs - * from jiffies_to_ns(utime + stime) if sched_clock uses something - * other than jiffies.) - */ - unsigned long long sum_sched_runtime; - - /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs * to get both rlim_cur and rlim_max atomically, and either one @@ -536,8 +572,6 @@ struct signal_struct { */ struct rlimit rlim[RLIM_NLIMITS]; - struct list_head cpu_timers[3]; - /* keep the process-shared keyrings here so that they do the right * thing in threads created with CLONE_THREAD */ #ifdef CONFIG_KEYS @@ -1146,8 +1180,7 @@ struct task_struct { /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; - cputime_t it_prof_expires, it_virt_expires; - unsigned long long it_sched_expires; + struct task_cputime cputime_expires; struct list_head cpu_timers[3]; /* process credentials */ @@ -1597,6 +1630,7 @@ extern unsigned long long cpu_clock(int cpu); extern unsigned long long task_sched_runtime(struct task_struct *task); +extern unsigned long long thread_group_sched_runtime(struct task_struct *task); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP @@ -2094,6 +2128,30 @@ static inline int spin_needbreak(spinlock_t *lock) } /* + * Thread group CPU time accounting. + */ + +extern int thread_group_cputime_alloc(struct task_struct *); +extern void thread_group_cputime(struct task_struct *, struct task_cputime *); + +static inline void thread_group_cputime_init(struct signal_struct *sig) +{ + sig->cputime.totals = NULL; +} + +static inline int thread_group_cputime_clone_thread(struct task_struct *curr) +{ + if (curr->signal->cputime.totals) + return 0; + return thread_group_cputime_alloc(curr); +} + +static inline void thread_group_cputime_free(struct signal_struct *sig) +{ + free_percpu(sig->cputime.totals); +} + +/* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. * This is required every time the blocked sigset_t changes. diff --git a/include/linux/tick.h b/include/linux/tick.h index 98921a3..b6ec818 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -96,9 +96,11 @@ extern cpumask_t *tick_get_broadcast_oneshot_mask(void); extern void tick_clock_notify(void); extern int tick_check_oneshot_change(int allow_nohz); extern struct tick_sched *tick_get_tick_sched(int cpu); +extern void tick_check_idle(int cpu); # else static inline void tick_clock_notify(void) { } static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +static inline void tick_check_idle(int cpu) { } # endif #else /* CONFIG_GENERIC_CLOCKEVENTS */ @@ -106,26 +108,23 @@ static inline void tick_init(void) { } static inline void tick_cancel_sched_timer(int cpu) { } static inline void tick_clock_notify(void) { } static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +static inline void tick_check_idle(int cpu) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ extern void tick_nohz_stop_sched_tick(int inidle); extern void tick_nohz_restart_sched_tick(void); -extern void tick_nohz_update_jiffies(void); extern ktime_t tick_nohz_get_sleep_length(void); -extern void tick_nohz_stop_idle(int cpu); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); # else static inline void tick_nohz_stop_sched_tick(int inidle) { } static inline void tick_nohz_restart_sched_tick(void) { } -static inline void tick_nohz_update_jiffies(void) { } static inline ktime_t tick_nohz_get_sleep_length(void) { ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; return len; } -static inline void tick_nohz_stop_idle(int cpu) { } static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } # endif /* !NO_HZ */ diff --git a/include/linux/time.h b/include/linux/time.h index 51e883d..4f1c9db 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -119,6 +119,7 @@ extern int do_setitimer(int which, struct itimerval *value, extern unsigned int alarm_setitimer(unsigned int seconds); extern int do_getitimer(int which, struct itimerval *value); extern void getnstimeofday(struct timespec *tv); +extern void getrawmonotonic(struct timespec *ts); extern void getboottime(struct timespec *ts); extern void monotonic_to_bootbased(struct timespec *ts); @@ -127,6 +128,9 @@ extern int timekeeping_valid_for_hres(void); extern void update_wall_time(void); extern void update_xtime_cache(u64 nsec); +struct tms; +extern void do_sys_times(struct tms *); + /** * timespec_to_ns - Convert timespec to nanoseconds * @ts: pointer to the timespec variable to be converted @@ -216,6 +220,7 @@ struct itimerval { #define CLOCK_MONOTONIC 1 #define CLOCK_PROCESS_CPUTIME_ID 2 #define CLOCK_THREAD_CPUTIME_ID 3 +#define CLOCK_MONOTONIC_RAW 4 /* * The IDs of various hardware clocks: diff --git a/include/linux/timex.h b/include/linux/timex.h index fc6035d..9007313 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -82,7 +82,7 @@ */ #define SHIFT_USEC 16 /* frequency offset scale (shift) */ #define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) -#define PPM_SCALE_INV_SHIFT 20 +#define PPM_SCALE_INV_SHIFT 19 #define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \ PPM_SCALE + 1) @@ -141,8 +141,15 @@ struct timex { #define ADJ_MICRO 0x1000 /* select microsecond resolution */ #define ADJ_NANO 0x2000 /* select nanosecond resolution */ #define ADJ_TICK 0x4000 /* tick value */ + +#ifdef __KERNEL__ +#define ADJ_ADJTIME 0x8000 /* switch between adjtime/adjtimex modes */ +#define ADJ_OFFSET_SINGLESHOT 0x0001 /* old-fashioned adjtime */ +#define ADJ_OFFSET_READONLY 0x2000 /* read-only adjtime */ +#else #define ADJ_OFFSET_SINGLESHOT 0x8001 /* old-fashioned adjtime */ -#define ADJ_OFFSET_SS_READ 0xa001 /* read-only adjtime */ +#define ADJ_OFFSET_SS_READ 0xa001 /* read-only adjtime */ +#endif /* xntp 3.4 compatibility names */ #define MOD_OFFSET ADJ_OFFSET diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h new file mode 100644 index 0000000..c5bb39c --- /dev/null +++ b/include/linux/tracepoint.h @@ -0,0 +1,137 @@ +#ifndef _LINUX_TRACEPOINT_H +#define _LINUX_TRACEPOINT_H + +/* + * Kernel Tracepoint API. + * + * See Documentation/tracepoint.txt. + * + * (C) Copyright 2008 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + * + * Heavily inspired from the Linux Kernel Markers. + * + * This file is released under the GPLv2. + * See the file COPYING for more details. + */ + +#include <linux/types.h> +#include <linux/rcupdate.h> + +struct module; +struct tracepoint; + +struct tracepoint { + const char *name; /* Tracepoint name */ + int state; /* State. */ + void **funcs; +} __attribute__((aligned(8))); + + +#define TPPROTO(args...) args +#define TPARGS(args...) args + +#ifdef CONFIG_TRACEPOINTS + +/* + * it_func[0] is never NULL because there is at least one element in the array + * when the array itself is non NULL. + */ +#define __DO_TRACE(tp, proto, args) \ + do { \ + void **it_func; \ + \ + rcu_read_lock_sched(); \ + it_func = rcu_dereference((tp)->funcs); \ + if (it_func) { \ + do { \ + ((void(*)(proto))(*it_func))(args); \ + } while (*(++it_func)); \ + } \ + rcu_read_unlock_sched(); \ + } while (0) + +/* + * Make sure the alignment of the structure in the __tracepoints section will + * not add unwanted padding between the beginning of the section and the + * structure. Force alignment to the same alignment as the section start. + */ +#define DEFINE_TRACE(name, proto, args) \ + static inline void trace_##name(proto) \ + { \ + static const char __tpstrtab_##name[] \ + __attribute__((section("__tracepoints_strings"))) \ + = #name ":" #proto; \ + static struct tracepoint __tracepoint_##name \ + __attribute__((section("__tracepoints"), aligned(8))) = \ + { __tpstrtab_##name, 0, NULL }; \ + if (unlikely(__tracepoint_##name.state)) \ + __DO_TRACE(&__tracepoint_##name, \ + TPPROTO(proto), TPARGS(args)); \ + } \ + static inline int register_trace_##name(void (*probe)(proto)) \ + { \ + return tracepoint_probe_register(#name ":" #proto, \ + (void *)probe); \ + } \ + static inline void unregister_trace_##name(void (*probe)(proto))\ + { \ + tracepoint_probe_unregister(#name ":" #proto, \ + (void *)probe); \ + } + +extern void tracepoint_update_probe_range(struct tracepoint *begin, + struct tracepoint *end); + +#else /* !CONFIG_TRACEPOINTS */ +#define DEFINE_TRACE(name, proto, args) \ + static inline void _do_trace_##name(struct tracepoint *tp, proto) \ + { } \ + static inline void trace_##name(proto) \ + { } \ + static inline int register_trace_##name(void (*probe)(proto)) \ + { \ + return -ENOSYS; \ + } \ + static inline void unregister_trace_##name(void (*probe)(proto))\ + { } + +static inline void tracepoint_update_probe_range(struct tracepoint *begin, + struct tracepoint *end) +{ } +#endif /* CONFIG_TRACEPOINTS */ + +/* + * Connect a probe to a tracepoint. + * Internal API, should not be used directly. + */ +extern int tracepoint_probe_register(const char *name, void *probe); + +/* + * Disconnect a probe from a tracepoint. + * Internal API, should not be used directly. + */ +extern int tracepoint_probe_unregister(const char *name, void *probe); + +struct tracepoint_iter { + struct module *module; + struct tracepoint *tracepoint; +}; + +extern void tracepoint_iter_start(struct tracepoint_iter *iter); +extern void tracepoint_iter_next(struct tracepoint_iter *iter); +extern void tracepoint_iter_stop(struct tracepoint_iter *iter); +extern void tracepoint_iter_reset(struct tracepoint_iter *iter); +extern int tracepoint_get_iter_range(struct tracepoint **tracepoint, + struct tracepoint *begin, struct tracepoint *end); + +/* + * tracepoint_synchronize_unregister must be called between the last tracepoint + * probe unregistration and the end of module exit to make sure there is no + * caller executing a probe when it is freed. + */ +static inline void tracepoint_synchronize_unregister(void) +{ + synchronize_sched(); +} + +#endif diff --git a/include/trace/sched.h b/include/trace/sched.h new file mode 100644 index 0000000..ad47369 --- /dev/null +++ b/include/trace/sched.h @@ -0,0 +1,56 @@ +#ifndef _TRACE_SCHED_H +#define _TRACE_SCHED_H + +#include <linux/sched.h> +#include <linux/tracepoint.h> + +DEFINE_TRACE(sched_kthread_stop, + TPPROTO(struct task_struct *t), + TPARGS(t)); + +DEFINE_TRACE(sched_kthread_stop_ret, + TPPROTO(int ret), + TPARGS(ret)); + +DEFINE_TRACE(sched_wait_task, + TPPROTO(struct rq *rq, struct task_struct *p), + TPARGS(rq, p)); + +DEFINE_TRACE(sched_wakeup, + TPPROTO(struct rq *rq, struct task_struct *p), + TPARGS(rq, p)); + +DEFINE_TRACE(sched_wakeup_new, + TPPROTO(struct rq *rq, struct task_struct *p), + TPARGS(rq, p)); + +DEFINE_TRACE(sched_switch, + TPPROTO(struct rq *rq, struct task_struct *prev, + struct task_struct *next), + TPARGS(rq, prev, next)); + +DEFINE_TRACE(sched_migrate_task, + TPPROTO(struct rq *rq, struct task_struct *p, int dest_cpu), + TPARGS(rq, p, dest_cpu)); + +DEFINE_TRACE(sched_process_free, + TPPROTO(struct task_struct *p), + TPARGS(p)); + +DEFINE_TRACE(sched_process_exit, + TPPROTO(struct task_struct *p), + TPARGS(p)); + +DEFINE_TRACE(sched_process_wait, + TPPROTO(struct pid *pid), + TPARGS(pid)); + +DEFINE_TRACE(sched_process_fork, + TPPROTO(struct task_struct *parent, struct task_struct *child), + TPARGS(parent, child)); + +DEFINE_TRACE(sched_signal_send, + TPPROTO(int sig, struct task_struct *p), + TPARGS(sig, p)); + +#endif diff --git a/init/Kconfig b/init/Kconfig index 8828ed0..113c74c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -737,6 +737,14 @@ config VM_EVENT_COUNTERS on EMBEDDED systems. /proc/vmstat will only show page counts if VM event counters are disabled. +config PCI_QUIRKS + default y + bool "Enable PCI quirk workarounds" if EMBEDDED && PCI + help + This enables workarounds for various PCI chipset + bugs/quirks. Disable this only if your target machine is + unaffected by PCI quirks. + config SLUB_DEBUG default y bool "Enable SLUB debugging support" if EMBEDDED @@ -786,6 +794,13 @@ config PROFILING Say Y here to enable the extended profiling support mechanisms used by profilers such as OProfile. +# +# Place an empty function call at each tracepoint site. Can be +# dynamically changed for a probe function. +# +config TRACEPOINTS + bool + config MARKERS bool "Activate markers" help diff --git a/init/main.c b/init/main.c index 4371d11..3e17a3b 100644 --- a/init/main.c +++ b/init/main.c @@ -61,6 +61,7 @@ #include <linux/sched.h> #include <linux/signal.h> #include <linux/idr.h> +#include <linux/ftrace.h> #include <asm/io.h> #include <asm/bugs.h> @@ -689,6 +690,8 @@ asmlinkage void __init start_kernel(void) acpi_early_init(); /* before LAPIC and SMP init */ + ftrace_init(); + /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -705,30 +708,31 @@ __setup("initcall_debug", initcall_debug_setup); int do_one_initcall(initcall_t fn) { int count = preempt_count(); - ktime_t t0, t1, delta; + ktime_t delta; char msgbuf[64]; - int result; + struct boot_trace it; if (initcall_debug) { - printk("calling %pF @ %i\n", fn, task_pid_nr(current)); - t0 = ktime_get(); + it.caller = task_pid_nr(current); + printk("calling %pF @ %i\n", fn, it.caller); + it.calltime = ktime_get(); } - result = fn(); + it.result = fn(); if (initcall_debug) { - t1 = ktime_get(); - delta = ktime_sub(t1, t0); - - printk("initcall %pF returned %d after %Ld msecs\n", - fn, result, - (unsigned long long) delta.tv64 >> 20); + it.rettime = ktime_get(); + delta = ktime_sub(it.rettime, it.calltime); + it.duration = (unsigned long long) delta.tv64 >> 10; + printk("initcall %pF returned %d after %Ld usecs\n", fn, + it.result, it.duration); + trace_boot(&it, fn); } msgbuf[0] = 0; - if (result && result != -ENODEV && initcall_debug) - sprintf(msgbuf, "error code %d ", result); + if (it.result && it.result != -ENODEV && initcall_debug) + sprintf(msgbuf, "error code %d ", it.result); if (preempt_count() != count) { strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf)); @@ -742,7 +746,7 @@ int do_one_initcall(initcall_t fn) printk("initcall %pF returned with %s\n", fn, msgbuf); } - return result; + return it.result; } @@ -857,6 +861,7 @@ static int __init kernel_init(void * unused) smp_prepare_cpus(setup_max_cpus); do_pre_smp_initcalls(); + start_boot_trace(); smp_init(); sched_init_smp(); @@ -883,6 +888,7 @@ static int __init kernel_init(void * unused) * we're essentially up and running. Get rid of the * initmem segments and start the user-mode stuff.. */ + stop_boot_trace(); init_post(); return 0; } diff --git a/kernel/Makefile b/kernel/Makefile index 066550a..305f11d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -85,6 +85,7 @@ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o +obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FTRACE) += trace/ diff --git a/kernel/compat.c b/kernel/compat.c index 143990e..8eafe3e 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -23,6 +23,7 @@ #include <linux/timex.h> #include <linux/migrate.h> #include <linux/posix-timers.h> +#include <linux/times.h> #include <asm/uaccess.h> @@ -208,49 +209,23 @@ asmlinkage long compat_sys_setitimer(int which, return 0; } +static compat_clock_t clock_t_to_compat_clock_t(clock_t x) +{ + return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); +} + asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) { - /* - * In the SMP world we might just be unlucky and have one of - * the times increment as we use it. Since the value is an - * atomically safe type this is just fine. Conceptually its - * as if the syscall took an instant longer to occur. - */ if (tbuf) { + struct tms tms; struct compat_tms tmp; - struct task_struct *tsk = current; - struct task_struct *t; - cputime_t utime, stime, cutime, cstime; - - read_lock(&tasklist_lock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); - - /* - * While we have tasklist_lock read-locked, no dying thread - * can be updating current->signal->[us]time. Instead, - * we got their counts included in the live thread loop. - * However, another thread can come in right now and - * do a wait call that updates current->signal->c[us]time. - * To make sure we always see that pair updated atomically, - * we take the siglock around fetching them. - */ - spin_lock_irq(&tsk->sighand->siglock); - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); - - tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime)); - tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime)); - tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime)); - tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime)); + + do_sys_times(&tms); + /* Convert our struct tms to the compat version. */ + tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); + tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); + tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); + tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } diff --git a/kernel/exit.c b/kernel/exit.c index 0ef4673..80137a5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,6 +47,7 @@ #include <linux/blkdev.h> #include <linux/task_io_accounting_ops.h> #include <linux/tracehook.h> +#include <trace/sched.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -112,8 +113,6 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, task_utime(tsk)); - sig->stime = cputime_add(sig->stime, task_stime(tsk)); sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; @@ -122,7 +121,6 @@ static void __exit_signal(struct task_struct *tsk) sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -149,7 +147,10 @@ static void __exit_signal(struct task_struct *tsk) static void delayed_put_task_struct(struct rcu_head *rhp) { - put_task_struct(container_of(rhp, struct task_struct, rcu)); + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + trace_sched_process_free(tsk); + put_task_struct(tsk); } @@ -1073,6 +1074,8 @@ NORET_TYPE void do_exit(long code) if (group_dead) acct_process(); + trace_sched_process_exit(tsk); + exit_sem(tsk); exit_files(tsk); exit_fs(tsk); @@ -1301,6 +1304,7 @@ static int wait_task_zombie(struct task_struct *p, int options, if (likely(!traced)) { struct signal_struct *psig; struct signal_struct *sig; + struct task_cputime cputime; /* * The resource counters for the group leader are in its @@ -1316,20 +1320,23 @@ static int wait_task_zombie(struct task_struct *p, int options, * need to protect the access to p->parent->signal fields, * as other threads in the parent group can be right * here reaping other children at the same time. + * + * We use thread_group_cputime() to get times for the thread + * group, which consolidates times for all threads in the + * group including the group leader. */ spin_lock_irq(&p->parent->sighand->siglock); psig = p->parent->signal; sig = p->signal; + thread_group_cputime(p, &cputime); psig->cutime = cputime_add(psig->cutime, - cputime_add(p->utime, - cputime_add(sig->utime, - sig->cutime))); + cputime_add(cputime.utime, + sig->cutime)); psig->cstime = cputime_add(psig->cstime, - cputime_add(p->stime, - cputime_add(sig->stime, - sig->cstime))); + cputime_add(cputime.stime, + sig->cstime)); psig->cgtime = cputime_add(psig->cgtime, cputime_add(p->gtime, @@ -1674,6 +1681,8 @@ static long do_wait(enum pid_type type, struct pid *pid, int options, struct task_struct *tsk; int retval; + trace_sched_process_wait(pid); + add_wait_queue(¤t->signal->wait_chldexit,&wait); repeat: /* diff --git a/kernel/fork.c b/kernel/fork.c index 30de644..4d09355 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -58,6 +58,7 @@ #include <linux/tty.h> #include <linux/proc_fs.h> #include <linux/blkdev.h> +#include <trace/sched.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -759,15 +760,44 @@ void __cleanup_sighand(struct sighand_struct *sighand) kmem_cache_free(sighand_cachep, sighand); } + +/* + * Initialize POSIX timer handling for a thread group. + */ +static void posix_cpu_timers_init_group(struct signal_struct *sig) +{ + /* Thread group counters. */ + thread_group_cputime_init(sig); + + /* Expiration times and increments. */ + sig->it_virt_expires = cputime_zero; + sig->it_virt_incr = cputime_zero; + sig->it_prof_expires = cputime_zero; + sig->it_prof_incr = cputime_zero; + + /* Cached expiration times. */ + sig->cputime_expires.prof_exp = cputime_zero; + sig->cputime_expires.virt_exp = cputime_zero; + sig->cputime_expires.sched_exp = 0; + + /* The timer lists. */ + INIT_LIST_HEAD(&sig->cpu_timers[0]); + INIT_LIST_HEAD(&sig->cpu_timers[1]); + INIT_LIST_HEAD(&sig->cpu_timers[2]); +} + static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; int ret; if (clone_flags & CLONE_THREAD) { - atomic_inc(¤t->signal->count); - atomic_inc(¤t->signal->live); - return 0; + ret = thread_group_cputime_clone_thread(current); + if (likely(!ret)) { + atomic_inc(¤t->signal->count); + atomic_inc(¤t->signal->live); + } + return ret; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; @@ -795,40 +825,25 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->it_virt_expires = cputime_zero; - sig->it_virt_incr = cputime_zero; - sig->it_prof_expires = cputime_zero; - sig->it_prof_incr = cputime_zero; - sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = NULL; sig->tty = NULL; - sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; + sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; task_io_accounting_init(&sig->ioac); - sig->sum_sched_runtime = 0; - INIT_LIST_HEAD(&sig->cpu_timers[0]); - INIT_LIST_HEAD(&sig->cpu_timers[1]); - INIT_LIST_HEAD(&sig->cpu_timers[2]); taskstats_tgid_init(sig); task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { - /* - * New sole thread in the process gets an expiry time - * of the whole CPU time limit. - */ - tsk->it_prof_expires = - secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); - } + posix_cpu_timers_init_group(sig); + acct_init_pacct(&sig->pacct); tty_audit_fork(sig); @@ -838,6 +853,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_signal(struct signal_struct *sig) { + thread_group_cputime_free(sig); exit_thread_group_keys(sig); tty_kref_put(sig->tty); kmem_cache_free(signal_cachep, sig); @@ -888,6 +904,19 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif /* CONFIG_MM_OWNER */ /* + * Initialize POSIX timer handling for a single task. + */ +static void posix_cpu_timers_init(struct task_struct *tsk) +{ + tsk->cputime_expires.prof_exp = cputime_zero; + tsk->cputime_expires.virt_exp = cputime_zero; + tsk->cputime_expires.sched_exp = 0; + INIT_LIST_HEAD(&tsk->cpu_timers[0]); + INIT_LIST_HEAD(&tsk->cpu_timers[1]); + INIT_LIST_HEAD(&tsk->cpu_timers[2]); +} + +/* * This creates a new process as a copy of the old one, * but does not actually start it yet. * @@ -997,12 +1026,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, task_io_accounting_init(&p->ioac); acct_clear_integrals(p); - p->it_virt_expires = cputime_zero; - p->it_prof_expires = cputime_zero; - p->it_sched_expires = 0; - INIT_LIST_HEAD(&p->cpu_timers[0]); - INIT_LIST_HEAD(&p->cpu_timers[1]); - INIT_LIST_HEAD(&p->cpu_timers[2]); + posix_cpu_timers_init(p); p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); @@ -1203,21 +1227,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) { p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); - - if (!cputime_eq(current->signal->it_virt_expires, - cputime_zero) || - !cputime_eq(current->signal->it_prof_expires, - cputime_zero) || - current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY || - !list_empty(¤t->signal->cpu_timers[0]) || - !list_empty(¤t->signal->cpu_timers[1]) || - !list_empty(¤t->signal->cpu_timers[2])) { - /* - * Have child wake up on its first tick to check - * for process CPU timers. - */ - p->it_prof_expires = jiffies_to_cputime(1); - } } if (likely(p->pid)) { @@ -1364,6 +1373,8 @@ long do_fork(unsigned long clone_flags, if (!IS_ERR(p)) { struct completion vfork; + trace_sched_process_fork(current, p); + nr = task_pid_vnr(p); if (clone_flags & CLONE_PARENT_SETTID) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cdec83e..95978f4 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1403,9 +1403,7 @@ void hrtimer_run_queues(void) if (!base->first) continue; - if (base->get_softirq_time) - base->softirq_time = base->get_softirq_time(); - else if (gettime) { + if (gettime) { hrtimer_get_softirq_time(cpu_base); gettime = 0; } @@ -1688,9 +1686,11 @@ static void migrate_hrtimers(int cpu) new_base = &get_cpu_var(hrtimer_bases); tick_cancel_sched_timer(cpu); - - local_irq_disable(); - spin_lock(&new_base->lock); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { @@ -1703,8 +1703,7 @@ static void migrate_hrtimers(int cpu) raise = 1; spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - local_irq_enable(); + spin_unlock_irq(&new_base->lock); put_cpu_var(hrtimer_bases); if (raise) diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 533068c..cc0f732 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -30,17 +30,16 @@ static DEFINE_MUTEX(probing_active); unsigned long probe_irq_on(void) { struct irq_desc *desc; - unsigned long mask; - unsigned int i; + unsigned long mask = 0; + unsigned int status; + int i; mutex_lock(&probing_active); /* * something may have generated an irq long ago and we want to * flush such a longstanding irq before considering it as spurious. */ - for (i = NR_IRQS-1; i > 0; i--) { - desc = irq_desc + i; - + for_each_irq_desc_reverse(i, desc) { spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { /* @@ -68,9 +67,7 @@ unsigned long probe_irq_on(void) * (we must startup again here because if a longstanding irq * happened in the previous stage, it may have masked itself) */ - for (i = NR_IRQS-1; i > 0; i--) { - desc = irq_desc + i; - + for_each_irq_desc_reverse(i, desc) { spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; @@ -88,11 +85,7 @@ unsigned long probe_irq_on(void) /* * Now filter out any obviously spurious interrupts */ - mask = 0; - for (i = 0; i < NR_IRQS; i++) { - unsigned int status; - - desc = irq_desc + i; + for_each_irq_desc(i, desc) { spin_lock_irq(&desc->lock); status = desc->status; @@ -126,14 +119,11 @@ EXPORT_SYMBOL(probe_irq_on); */ unsigned int probe_irq_mask(unsigned long val) { - unsigned int mask; + unsigned int status, mask = 0; + struct irq_desc *desc; int i; - mask = 0; - for (i = 0; i < NR_IRQS; i++) { - struct irq_desc *desc = irq_desc + i; - unsigned int status; - + for_each_irq_desc(i, desc) { spin_lock_irq(&desc->lock); status = desc->status; @@ -171,20 +161,19 @@ EXPORT_SYMBOL(probe_irq_mask); */ int probe_irq_off(unsigned long val) { - int i, irq_found = 0, nr_irqs = 0; - - for (i = 0; i < NR_IRQS; i++) { - struct irq_desc *desc = irq_desc + i; - unsigned int status; + int i, irq_found = 0, nr_of_irqs = 0; + struct irq_desc *desc; + unsigned int status; + for_each_irq_desc(i, desc) { spin_lock_irq(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { if (!(status & IRQ_WAITING)) { - if (!nr_irqs) + if (!nr_of_irqs) irq_found = i; - nr_irqs++; + nr_of_irqs++; } desc->status = status & ~IRQ_AUTODETECT; desc->chip->shutdown(i); @@ -193,7 +182,7 @@ int probe_irq_off(unsigned long val) } mutex_unlock(&probing_active); - if (nr_irqs > 1) + if (nr_of_irqs > 1) irq_found = -irq_found; return irq_found; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3cd441e..4895fde 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -24,16 +24,15 @@ */ void dynamic_irq_init(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; } /* Ensure we don't have left over values from a previous use of this irq */ - desc = irq_desc + irq; spin_lock_irqsave(&desc->lock, flags); desc->status = IRQ_DISABLED; desc->chip = &no_irq_chip; @@ -57,15 +56,14 @@ void dynamic_irq_init(unsigned int irq) */ void dynamic_irq_cleanup(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); return; } - desc = irq_desc + irq; spin_lock_irqsave(&desc->lock, flags); if (desc->action) { spin_unlock_irqrestore(&desc->lock, flags); @@ -89,10 +87,10 @@ void dynamic_irq_cleanup(unsigned int irq) */ int set_irq_chip(unsigned int irq, struct irq_chip *chip) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); return -EINVAL; } @@ -100,7 +98,6 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) if (!chip) chip = &no_irq_chip; - desc = irq_desc + irq; spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->chip = chip; @@ -111,27 +108,27 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) EXPORT_SYMBOL(set_irq_chip); /** - * set_irq_type - set the irq type for an irq + * set_irq_type - set the irq trigger type for an irq * @irq: irq number - * @type: interrupt type - see include/linux/interrupt.h + * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ int set_irq_type(unsigned int irq, unsigned int type) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret = -ENXIO; - if (irq >= NR_IRQS) { + if (!desc) { printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); return -ENODEV; } - desc = irq_desc + irq; - if (desc->chip->set_type) { - spin_lock_irqsave(&desc->lock, flags); - ret = desc->chip->set_type(irq, type); - spin_unlock_irqrestore(&desc->lock, flags); - } + if (type == IRQ_TYPE_NONE) + return 0; + + spin_lock_irqsave(&desc->lock, flags); + ret = __irq_set_trigger(desc, irq, flags); + spin_unlock_irqrestore(&desc->lock, flags); return ret; } EXPORT_SYMBOL(set_irq_type); @@ -145,16 +142,15 @@ EXPORT_SYMBOL(set_irq_type); */ int set_irq_data(unsigned int irq, void *data) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { printk(KERN_ERR "Trying to install controller data for IRQ%d\n", irq); return -EINVAL; } - desc = irq_desc + irq; spin_lock_irqsave(&desc->lock, flags); desc->handler_data = data; spin_unlock_irqrestore(&desc->lock, flags); @@ -171,15 +167,15 @@ EXPORT_SYMBOL(set_irq_data); */ int set_irq_msi(unsigned int irq, struct msi_desc *entry) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { printk(KERN_ERR "Trying to install msi data for IRQ%d\n", irq); return -EINVAL; } - desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock, flags); desc->msi_desc = entry; if (entry) @@ -197,10 +193,16 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) */ int set_irq_chip_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS || !desc->chip) { + if (!desc) { + printk(KERN_ERR + "Trying to install chip data for IRQ%d\n", irq); + return -EINVAL; + } + + if (!desc->chip) { printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); return -EINVAL; } @@ -218,7 +220,7 @@ EXPORT_SYMBOL(set_irq_chip_data); */ static void default_enable(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); desc->chip->unmask(irq); desc->status &= ~IRQ_MASKED; @@ -236,8 +238,9 @@ static void default_disable(unsigned int irq) */ static unsigned int default_startup(unsigned int irq) { - irq_desc[irq].chip->enable(irq); + struct irq_desc *desc = irq_to_desc(irq); + desc->chip->enable(irq); return 0; } @@ -246,7 +249,7 @@ static unsigned int default_startup(unsigned int irq) */ static void default_shutdown(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); desc->chip->mask(irq); desc->status |= IRQ_MASKED; @@ -305,14 +308,13 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { struct irqaction *action; irqreturn_t action_ret; - const unsigned int cpu = smp_processor_id(); spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; + kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) @@ -344,7 +346,6 @@ out_unlock: void handle_level_irq(unsigned int irq, struct irq_desc *desc) { - unsigned int cpu = smp_processor_id(); struct irqaction *action; irqreturn_t action_ret; @@ -354,7 +355,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; + kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available @@ -392,7 +393,6 @@ out_unlock: void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { - unsigned int cpu = smp_processor_id(); struct irqaction *action; irqreturn_t action_ret; @@ -402,7 +402,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) goto out; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; + kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available @@ -451,8 +451,6 @@ out: void handle_edge_irq(unsigned int irq, struct irq_desc *desc) { - const unsigned int cpu = smp_processor_id(); - spin_lock(&desc->lock); desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); @@ -468,8 +466,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) mask_ack_irq(desc, irq); goto out_unlock; } - - kstat_cpu(cpu).irqs[irq]++; + kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ desc->chip->ack(irq); @@ -524,7 +521,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { irqreturn_t action_ret; - kstat_this_cpu.irqs[irq]++; + kstat_incr_irqs_this_cpu(irq, desc); if (desc->chip->ack) desc->chip->ack(irq); @@ -541,17 +538,15 @@ void __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { printk(KERN_ERR "Trying to install type control for IRQ%d\n", irq); return; } - desc = irq_desc + irq; - if (!handle) handle = handle_bad_irq; else if (desc->chip == &no_irq_chip) { @@ -583,7 +578,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->status &= ~IRQ_DISABLED; desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; desc->depth = 0; - desc->chip->unmask(irq); + desc->chip->startup(irq); } spin_unlock_irqrestore(&desc->lock, flags); } @@ -606,17 +601,14 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, void __init set_irq_noprobe(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); - return; } - desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_NOPROBE; spin_unlock_irqrestore(&desc->lock, flags); @@ -624,17 +616,14 @@ void __init set_irq_noprobe(unsigned int irq) void __init set_irq_probe(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); - return; } - desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock, flags); desc->status &= ~IRQ_NOPROBE; spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 5fa6198..c815b42 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -25,11 +25,10 @@ * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ -void -handle_bad_irq(unsigned int irq, struct irq_desc *desc) +void handle_bad_irq(unsigned int irq, struct irq_desc *desc) { print_irq_desc(irq, desc); - kstat_this_cpu.irqs[irq]++; + kstat_incr_irqs_this_cpu(irq, desc); ack_bad_irq(irq); } @@ -47,6 +46,9 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc) * * Controller mappings for all interrupt sources: */ +int nr_irqs = NR_IRQS; +EXPORT_SYMBOL_GPL(nr_irqs); + struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, @@ -66,7 +68,9 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { */ static void ack_bad(unsigned int irq) { - print_irq_desc(irq, irq_desc + irq); + struct irq_desc *desc = irq_to_desc(irq); + + print_irq_desc(irq, desc); ack_bad_irq(irq); } @@ -131,8 +135,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; - handle_dynamic_tick(action); - if (!(action->flags & IRQF_DISABLED)) local_irq_enable_in_hardirq(); @@ -165,11 +167,12 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) */ unsigned int __do_IRQ(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; unsigned int status; - kstat_this_cpu.irqs[irq]++; + kstat_incr_irqs_this_cpu(irq, desc); + if (CHECK_IRQ_PER_CPU(desc->status)) { irqreturn_t action_ret; @@ -256,8 +259,8 @@ out: } #endif -#ifdef CONFIG_TRACE_IRQFLAGS +#ifdef CONFIG_TRACE_IRQFLAGS /* * lockdep: we want to handle all irq_desc locks as a single lock-class: */ @@ -265,10 +268,10 @@ static struct lock_class_key irq_desc_lock_class; void early_init_irq_lock_class(void) { + struct irq_desc *desc; int i; - for (i = 0; i < NR_IRQS; i++) - lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); + for_each_irq_desc(i, desc) + lockdep_set_class(&desc->lock, &irq_desc_lock_class); } - #endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 08a849a..c9767e6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -10,12 +10,15 @@ extern void irq_chip_set_defaults(struct irq_chip *chip); /* Set default handler: */ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); +extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, + unsigned long flags); + #ifdef CONFIG_PROC_FS -extern void register_irq_proc(unsigned int irq); +extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); extern void register_handler_proc(unsigned int irq, struct irqaction *action); extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); #else -static inline void register_irq_proc(unsigned int irq) { } +static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } static inline void register_handler_proc(unsigned int irq, struct irqaction *action) { } static inline void unregister_handler_proc(unsigned int irq, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 60c49e3..c498a1b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -31,10 +31,10 @@ cpumask_t irq_default_affinity = CPU_MASK_ALL; */ void synchronize_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned int status; - if (irq >= NR_IRQS) + if (!desc) return; do { @@ -64,7 +64,7 @@ EXPORT_SYMBOL(synchronize_irq); */ int irq_can_set_affinity(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || !desc->chip->set_affinity) @@ -81,18 +81,17 @@ int irq_can_set_affinity(unsigned int irq) */ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (!desc->chip->set_affinity) return -EINVAL; - set_balance_irq_affinity(irq, cpumask); - #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT) { + if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { unsigned long flags; spin_lock_irqsave(&desc->lock, flags); + desc->affinity = cpumask; desc->chip->set_affinity(irq, cpumask); spin_unlock_irqrestore(&desc->lock, flags); } else @@ -111,16 +110,17 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) int irq_select_affinity(unsigned int irq) { cpumask_t mask; + struct irq_desc *desc; if (!irq_can_set_affinity(irq)) return 0; cpus_and(mask, cpu_online_map, irq_default_affinity); - irq_desc[irq].affinity = mask; - irq_desc[irq].chip->set_affinity(irq, mask); + desc = irq_to_desc(irq); + desc->affinity = mask; + desc->chip->set_affinity(irq, mask); - set_balance_irq_affinity(irq, mask); return 0; } #endif @@ -140,10 +140,10 @@ int irq_select_affinity(unsigned int irq) */ void disable_irq_nosync(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) + if (!desc) return; spin_lock_irqsave(&desc->lock, flags); @@ -169,9 +169,9 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); - if (irq >= NR_IRQS) + if (!desc) return; disable_irq_nosync(irq); @@ -211,10 +211,10 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq) */ void enable_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) + if (!desc) return; spin_lock_irqsave(&desc->lock, flags); @@ -223,9 +223,9 @@ void enable_irq(unsigned int irq) } EXPORT_SYMBOL(enable_irq); -int set_irq_wake_real(unsigned int irq, unsigned int on) +static int set_irq_wake_real(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; if (desc->chip->set_wake) @@ -248,7 +248,7 @@ int set_irq_wake_real(unsigned int irq, unsigned int on) */ int set_irq_wake(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret = 0; @@ -288,12 +288,16 @@ EXPORT_SYMBOL(set_irq_wake); */ int can_request_irq(unsigned int irq, unsigned long irqflags) { + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; - if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST) + if (!desc) + return 0; + + if (desc->status & IRQ_NOREQUEST) return 0; - action = irq_desc[irq].action; + action = desc->action; if (action) if (irqflags & action->flags & IRQF_SHARED) action = NULL; @@ -312,10 +316,11 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc) desc->handle_irq = NULL; } -static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, +int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags) { int ret; + struct irq_chip *chip = desc->chip; if (!chip || !chip->set_type) { /* @@ -333,6 +338,11 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, pr_err("setting trigger mode %d for irq %u failed (%pF)\n", (int)(flags & IRQF_TRIGGER_MASK), irq, chip->set_type); + else { + /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ + desc->status &= ~IRQ_TYPE_SENSE_MASK; + desc->status |= flags & IRQ_TYPE_SENSE_MASK; + } return ret; } @@ -341,16 +351,16 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. */ -int setup_irq(unsigned int irq, struct irqaction *new) +static int +__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) { - struct irq_desc *desc = irq_desc + irq; struct irqaction *old, **p; const char *old_name = NULL; unsigned long flags; int shared = 0; int ret; - if (irq >= NR_IRQS) + if (!desc) return -EINVAL; if (desc->chip == &no_irq_chip) @@ -411,7 +421,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { - ret = __irq_set_trigger(desc->chip, irq, new->flags); + ret = __irq_set_trigger(desc, irq, new->flags); if (ret) { spin_unlock_irqrestore(&desc->lock, flags); @@ -430,16 +440,21 @@ int setup_irq(unsigned int irq, struct irqaction *new) if (!(desc->status & IRQ_NOAUTOEN)) { desc->depth = 0; desc->status &= ~IRQ_DISABLED; - if (desc->chip->startup) - desc->chip->startup(irq); - else - desc->chip->enable(irq); + desc->chip->startup(irq); } else /* Undo nested disables: */ desc->depth = 1; /* Set default affinity mask once everything is setup */ irq_select_affinity(irq); + + } else if ((new->flags & IRQF_TRIGGER_MASK) + && (new->flags & IRQF_TRIGGER_MASK) + != (desc->status & IRQ_TYPE_SENSE_MASK)) { + /* hope the handler works with the actual trigger mode... */ + pr_warning("IRQ %d uses trigger mode %d; requested %d\n", + irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), + (int)(new->flags & IRQF_TRIGGER_MASK)); } *p = new; @@ -464,7 +479,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) spin_unlock_irqrestore(&desc->lock, flags); new->irq = irq; - register_irq_proc(irq); + register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); @@ -484,6 +499,20 @@ mismatch: } /** + * setup_irq - setup an interrupt + * @irq: Interrupt line to setup + * @act: irqaction for the interrupt + * + * Used to statically setup interrupts in the early boot process. + */ +int setup_irq(unsigned int irq, struct irqaction *act) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return __setup_irq(irq, desc, act); +} + +/** * free_irq - free an interrupt * @irq: Interrupt line to free * @dev_id: Device identity to free @@ -499,15 +528,15 @@ mismatch: */ void free_irq(unsigned int irq, void *dev_id) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction **p; unsigned long flags; WARN_ON(in_interrupt()); - if (irq >= NR_IRQS) + + if (!desc) return; - desc = irq_desc + irq; spin_lock_irqsave(&desc->lock, flags); p = &desc->action; for (;;) { @@ -596,12 +625,14 @@ EXPORT_SYMBOL(free_irq); * IRQF_SHARED Interrupt is shared * IRQF_DISABLED Disable local interrupts while processing * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy + * IRQF_TRIGGER_* Specify active edge(s) or level * */ int request_irq(unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) { struct irqaction *action; + struct irq_desc *desc; int retval; #ifdef CONFIG_LOCKDEP @@ -618,9 +649,12 @@ int request_irq(unsigned int irq, irq_handler_t handler, */ if ((irqflags & IRQF_SHARED) && !dev_id) return -EINVAL; - if (irq >= NR_IRQS) + + desc = irq_to_desc(irq); + if (!desc) return -EINVAL; - if (irq_desc[irq].status & IRQ_NOREQUEST) + + if (desc->status & IRQ_NOREQUEST) return -EINVAL; if (!handler) return -EINVAL; @@ -636,26 +670,29 @@ int request_irq(unsigned int irq, irq_handler_t handler, action->next = NULL; action->dev_id = dev_id; + retval = __setup_irq(irq, desc, action); + if (retval) + kfree(action); + #ifdef CONFIG_DEBUG_SHIRQ if (irqflags & IRQF_SHARED) { /* * It's a shared IRQ -- the driver ought to be prepared for it * to happen immediately, so let's make sure.... - * We do this before actually registering it, to make sure that - * a 'real' IRQ doesn't run in parallel with our fake + * We disable the irq to make sure that a 'real' IRQ doesn't + * run in parallel with our fake. */ unsigned long flags; + disable_irq(irq); local_irq_save(flags); + handler(irq, dev_id); + local_irq_restore(flags); + enable_irq(irq); } #endif - - retval = setup_irq(irq, action); - if (retval) - kfree(action); - return retval; } EXPORT_SYMBOL(request_irq); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 77b7acc..90b920d 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -3,18 +3,18 @@ void set_pending_irq(unsigned int irq, cpumask_t mask) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_MOVE_PENDING; - irq_desc[irq].pending_mask = mask; + desc->pending_mask = mask; spin_unlock_irqrestore(&desc->lock, flags); } void move_masked_irq(int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); cpumask_t tmp; if (likely(!(desc->status & IRQ_MOVE_PENDING))) @@ -30,7 +30,7 @@ void move_masked_irq(int irq) desc->status &= ~IRQ_MOVE_PENDING; - if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) + if (unlikely(cpus_empty(desc->pending_mask))) return; if (!desc->chip->set_affinity) @@ -38,7 +38,7 @@ void move_masked_irq(int irq) assert_spin_locked(&desc->lock); - cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map); + cpus_and(tmp, desc->pending_mask, cpu_online_map); /* * If there was a valid mask to work with, please @@ -55,12 +55,12 @@ void move_masked_irq(int irq) if (likely(!cpus_empty(tmp))) { desc->chip->set_affinity(irq,tmp); } - cpus_clear(irq_desc[irq].pending_mask); + cpus_clear(desc->pending_mask); } void move_native_irq(int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a09dd29..fac014a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { - struct irq_desc *desc = irq_desc + (long)m->private; + struct irq_desc *desc = irq_to_desc((long)m->private); cpumask_t *mask = &desc->affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -43,7 +43,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, cpumask_t new_value; int err; - if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || + if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) return -EIO; @@ -132,20 +132,20 @@ static const struct file_operations default_affinity_proc_fops = { static int irq_spurious_read(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct irq_desc *d = &irq_desc[(long) data]; + struct irq_desc *desc = irq_to_desc((long) data); return sprintf(page, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n", - d->irq_count, - d->irqs_unhandled, - jiffies_to_msecs(d->last_unhandled)); + desc->irq_count, + desc->irqs_unhandled, + jiffies_to_msecs(desc->last_unhandled)); } #define MAX_NAMELEN 128 static int name_unique(unsigned int irq, struct irqaction *new_action) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; unsigned long flags; int ret = 1; @@ -165,8 +165,9 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) void register_handler_proc(unsigned int irq, struct irqaction *action) { char name [MAX_NAMELEN]; + struct irq_desc *desc = irq_to_desc(irq); - if (!irq_desc[irq].dir || action->dir || !action->name || + if (!desc->dir || action->dir || !action->name || !name_unique(irq, action)) return; @@ -174,36 +175,34 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) snprintf(name, MAX_NAMELEN, "%s", action->name); /* create /proc/irq/1234/handler/ */ - action->dir = proc_mkdir(name, irq_desc[irq].dir); + action->dir = proc_mkdir(name, desc->dir); } #undef MAX_NAMELEN #define MAX_NAMELEN 10 -void register_irq_proc(unsigned int irq) +void register_irq_proc(unsigned int irq, struct irq_desc *desc) { char name [MAX_NAMELEN]; struct proc_dir_entry *entry; - if (!root_irq_dir || - (irq_desc[irq].chip == &no_irq_chip) || - irq_desc[irq].dir) + if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) return; memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ - irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); + desc->dir = proc_mkdir(name, root_irq_dir); #ifdef CONFIG_SMP /* create /proc/irq/<irq>/smp_affinity */ - proc_create_data("smp_affinity", 0600, irq_desc[irq].dir, + proc_create_data("smp_affinity", 0600, desc->dir, &irq_affinity_proc_fops, (void *)(long)irq); #endif - entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); + entry = create_proc_entry("spurious", 0444, desc->dir); if (entry) { entry->data = (void *)(long)irq; entry->read_proc = irq_spurious_read; @@ -214,8 +213,11 @@ void register_irq_proc(unsigned int irq) void unregister_handler_proc(unsigned int irq, struct irqaction *action) { - if (action->dir) - remove_proc_entry(action->dir->name, irq_desc[irq].dir); + if (action->dir) { + struct irq_desc *desc = irq_to_desc(irq); + + remove_proc_entry(action->dir->name, desc->dir); + } } void register_default_affinity_proc(void) @@ -228,7 +230,8 @@ void register_default_affinity_proc(void) void init_irq_proc(void) { - int i; + unsigned int irq; + struct irq_desc *desc; /* create /proc/irq */ root_irq_dir = proc_mkdir("irq", NULL); @@ -240,7 +243,7 @@ void init_irq_proc(void) /* * Create entries for all existing IRQs. */ - for (i = 0; i < NR_IRQS; i++) - register_irq_proc(i); + for_each_irq_desc(irq, desc) + register_irq_proc(irq, desc); } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index a804679..89c7117 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -33,10 +33,10 @@ static void resend_irqs(unsigned long arg) struct irq_desc *desc; int irq; - while (!bitmap_empty(irqs_resend, NR_IRQS)) { - irq = find_first_bit(irqs_resend, NR_IRQS); + while (!bitmap_empty(irqs_resend, nr_irqs)) { + irq = find_first_bit(irqs_resend, nr_irqs); clear_bit(irq, irqs_resend); - desc = irq_desc + irq; + desc = irq_to_desc(irq); local_irq_disable(); desc->handle_irq(irq, desc); local_irq_enable(); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index c66d3f1..dd364c1 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -12,83 +12,122 @@ #include <linux/kallsyms.h> #include <linux/interrupt.h> #include <linux/moduleparam.h> +#include <linux/timer.h> static int irqfixup __read_mostly; +#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) +static void poll_spurious_irqs(unsigned long dummy); +static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); + /* * Recovery handler for misrouted interrupts. */ -static int misrouted_irq(int irq) +static int try_one_irq(int irq, struct irq_desc *desc) { - int i; - int ok = 0; - int work = 0; /* Did we do work for a real IRQ */ - - for (i = 1; i < NR_IRQS; i++) { - struct irq_desc *desc = irq_desc + i; - struct irqaction *action; - - if (i == irq) /* Already tried */ - continue; + struct irqaction *action; + int ok = 0, work = 0; - spin_lock(&desc->lock); - /* Already running on another processor */ - if (desc->status & IRQ_INPROGRESS) { - /* - * Already running: If it is shared get the other - * CPU to go looking for our mystery interrupt too - */ - if (desc->action && (desc->action->flags & IRQF_SHARED)) - desc->status |= IRQ_PENDING; - spin_unlock(&desc->lock); - continue; - } - /* Honour the normal IRQ locking */ - desc->status |= IRQ_INPROGRESS; - action = desc->action; + spin_lock(&desc->lock); + /* Already running on another processor */ + if (desc->status & IRQ_INPROGRESS) { + /* + * Already running: If it is shared get the other + * CPU to go looking for our mystery interrupt too + */ + if (desc->action && (desc->action->flags & IRQF_SHARED)) + desc->status |= IRQ_PENDING; spin_unlock(&desc->lock); + return ok; + } + /* Honour the normal IRQ locking */ + desc->status |= IRQ_INPROGRESS; + action = desc->action; + spin_unlock(&desc->lock); - while (action) { - /* Only shared IRQ handlers are safe to call */ - if (action->flags & IRQF_SHARED) { - if (action->handler(i, action->dev_id) == - IRQ_HANDLED) - ok = 1; - } - action = action->next; + while (action) { + /* Only shared IRQ handlers are safe to call */ + if (action->flags & IRQF_SHARED) { + if (action->handler(irq, action->dev_id) == + IRQ_HANDLED) + ok = 1; } - local_irq_disable(); - /* Now clean up the flags */ - spin_lock(&desc->lock); - action = desc->action; + action = action->next; + } + local_irq_disable(); + /* Now clean up the flags */ + spin_lock(&desc->lock); + action = desc->action; + /* + * While we were looking for a fixup someone queued a real + * IRQ clashing with our walk: + */ + while ((desc->status & IRQ_PENDING) && action) { /* - * While we were looking for a fixup someone queued a real - * IRQ clashing with our walk: - */ - while ((desc->status & IRQ_PENDING) && action) { - /* - * Perform real IRQ processing for the IRQ we deferred - */ - work = 1; - spin_unlock(&desc->lock); - handle_IRQ_event(i, action); - spin_lock(&desc->lock); - desc->status &= ~IRQ_PENDING; - } - desc->status &= ~IRQ_INPROGRESS; - /* - * If we did actual work for the real IRQ line we must let the - * IRQ controller clean up too + * Perform real IRQ processing for the IRQ we deferred */ - if (work && desc->chip && desc->chip->end) - desc->chip->end(i); + work = 1; spin_unlock(&desc->lock); + handle_IRQ_event(irq, action); + spin_lock(&desc->lock); + desc->status &= ~IRQ_PENDING; + } + desc->status &= ~IRQ_INPROGRESS; + /* + * If we did actual work for the real IRQ line we must let the + * IRQ controller clean up too + */ + if (work && desc->chip && desc->chip->end) + desc->chip->end(irq); + spin_unlock(&desc->lock); + + return ok; +} + +static int misrouted_irq(int irq) +{ + struct irq_desc *desc; + int i, ok = 0; + + for_each_irq_desc(i, desc) { + if (!i) + continue; + + if (i == irq) /* Already tried */ + continue; + + if (try_one_irq(i, desc)) + ok = 1; } /* So the caller can adjust the irq error counts */ return ok; } +static void poll_spurious_irqs(unsigned long dummy) +{ + struct irq_desc *desc; + int i; + + for_each_irq_desc(i, desc) { + unsigned int status; + + if (!i) + continue; + + /* Racy but it doesn't matter */ + status = desc->status; + barrier(); + if (!(status & IRQ_SPURIOUS_DISABLED)) + continue; + + try_one_irq(i, desc); + } + + mod_timer(&poll_spurious_irq_timer, + jiffies + POLL_SPURIOUS_IRQ_INTERVAL); +} + /* * If 99,900 of the previous 100,000 interrupts have not been handled * then assume that the IRQ is stuck in some manner. Drop a diagnostic @@ -137,7 +176,9 @@ report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) } } -static inline int try_misrouted_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) +static inline int +try_misrouted_irq(unsigned int irq, struct irq_desc *desc, + irqreturn_t action_ret) { struct irqaction *action; @@ -212,6 +253,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; desc->depth++; desc->chip->disable(irq); + + mod_timer(&poll_spurious_irq_timer, + jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } desc->irqs_unhandled = 0; } @@ -241,7 +285,7 @@ static int __init irqfixup_setup(char *str) __setup("irqfixup", irqfixup_setup); module_param(irqfixup, int, 0644); -MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode"); +MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode"); static int __init irqpoll_setup(char *str) { diff --git a/kernel/itimer.c b/kernel/itimer.c index ab98274..db7c358 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -55,17 +55,15 @@ int do_getitimer(int which, struct itimerval *value) spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_virt_expires; cinterval = tsk->signal->it_virt_incr; if (!cputime_eq(cval, cputime_zero)) { - struct task_struct *t = tsk; - cputime_t utime = tsk->signal->utime; - do { - utime = cputime_add(utime, t->utime); - t = next_thread(t); - } while (t != tsk); + struct task_cputime cputime; + cputime_t utime; + + thread_group_cputime(tsk, &cputime); + utime = cputime.utime; if (cputime_le(cval, utime)) { /* about to fire */ cval = jiffies_to_cputime(1); } else { @@ -73,25 +71,19 @@ int do_getitimer(int which, struct itimerval *value) } } spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); cputime_to_timeval(cval, &value->it_value); cputime_to_timeval(cinterval, &value->it_interval); break; case ITIMER_PROF: - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_prof_expires; cinterval = tsk->signal->it_prof_incr; if (!cputime_eq(cval, cputime_zero)) { - struct task_struct *t = tsk; - cputime_t ptime = cputime_add(tsk->signal->utime, - tsk->signal->stime); - do { - ptime = cputime_add(ptime, - cputime_add(t->utime, - t->stime)); - t = next_thread(t); - } while (t != tsk); + struct task_cputime times; + cputime_t ptime; + + thread_group_cputime(tsk, ×); + ptime = cputime_add(times.utime, times.stime); if (cputime_le(cval, ptime)) { /* about to fire */ cval = jiffies_to_cputime(1); } else { @@ -99,7 +91,6 @@ int do_getitimer(int which, struct itimerval *value) } } spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); cputime_to_timeval(cval, &value->it_value); cputime_to_timeval(cinterval, &value->it_interval); break; @@ -185,7 +176,6 @@ again: case ITIMER_VIRTUAL: nval = timeval_to_cputime(&value->it_value); ninterval = timeval_to_cputime(&value->it_interval); - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_virt_expires; cinterval = tsk->signal->it_virt_incr; @@ -200,7 +190,6 @@ again: tsk->signal->it_virt_expires = nval; tsk->signal->it_virt_incr = ninterval; spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); if (ovalue) { cputime_to_timeval(cval, &ovalue->it_value); cputime_to_timeval(cinterval, &ovalue->it_interval); @@ -209,7 +198,6 @@ again: case ITIMER_PROF: nval = timeval_to_cputime(&value->it_value); ninterval = timeval_to_cputime(&value->it_interval); - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_prof_expires; cinterval = tsk->signal->it_prof_incr; @@ -224,7 +212,6 @@ again: tsk->signal->it_prof_expires = nval; tsk->signal->it_prof_incr = ninterval; spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); if (ovalue) { cputime_to_timeval(cval, &ovalue->it_value); cputime_to_timeval(cinterval, &ovalue->it_interval); diff --git a/kernel/kexec.c b/kernel/kexec.c index 777ac45..ac0fde7 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -30,6 +30,7 @@ #include <linux/pm.h> #include <linux/cpu.h> #include <linux/console.h> +#include <linux/vmalloc.h> #include <asm/page.h> #include <asm/uaccess.h> diff --git a/kernel/kthread.c b/kernel/kthread.c index 14ec64f..8e7a7ce 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -13,6 +13,7 @@ #include <linux/file.h> #include <linux/module.h> #include <linux/mutex.h> +#include <trace/sched.h> #define KTHREAD_NICE_LEVEL (-5) @@ -205,6 +206,8 @@ int kthread_stop(struct task_struct *k) /* It could exit after stop_info.k set, but before wake_up_process. */ get_task_struct(k); + trace_sched_kthread_stop(k); + /* Must init completion *before* thread sees kthread_stop_info.k */ init_completion(&kthread_stop_info.done); smp_wmb(); @@ -220,6 +223,8 @@ int kthread_stop(struct task_struct *k) ret = kthread_stop_info.err; mutex_unlock(&kthread_stop_lock); + trace_sched_kthread_stop_ret(ret); + return ret; } EXPORT_SYMBOL(kthread_stop); diff --git a/kernel/marker.c b/kernel/marker.c index 7d1faec..e9c6b2b 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -62,7 +62,7 @@ struct marker_entry { int refcount; /* Number of times armed. 0 if disarmed. */ struct rcu_head rcu; void *oldptr; - unsigned char rcu_pending:1; + int rcu_pending; unsigned char ptype:1; char name[0]; /* Contains name'\0'format'\0' */ }; @@ -103,11 +103,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) char ptype; /* - * preempt_disable does two things : disabling preemption to make sure - * the teardown of the callbacks can be done correctly when they are in - * modules and they insure RCU read coherency. + * rcu_read_lock_sched does two things : disabling preemption to make + * sure the teardown of the callbacks can be done correctly when they + * are in modules and they insure RCU read coherency. */ - preempt_disable(); + rcu_read_lock_sched(); ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; @@ -145,7 +145,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) va_end(args); } } - preempt_enable(); + rcu_read_unlock_sched(); } EXPORT_SYMBOL_GPL(marker_probe_cb); @@ -162,7 +162,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) va_list args; /* not initialized */ char ptype; - preempt_disable(); + rcu_read_lock_sched(); ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; @@ -195,7 +195,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) multi[i].func(multi[i].probe_private, call_private, mdata->format, &args); } - preempt_enable(); + rcu_read_unlock_sched(); } EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); @@ -560,7 +560,7 @@ static int set_marker(struct marker_entry **entry, struct marker *elem, * Disable a marker and its probe callback. * Note: only waiting an RCU period after setting elem->call to the empty * function insures that the original callback is not used anymore. This insured - * by preempt_disable around the call site. + * by rcu_read_lock_sched around the call site. */ static void disable_marker(struct marker *elem) { @@ -653,11 +653,17 @@ int marker_probe_register(const char *name, const char *format, entry = get_marker(name); if (!entry) { entry = add_marker(name, format); - if (IS_ERR(entry)) { + if (IS_ERR(entry)) ret = PTR_ERR(entry); - goto end; - } + } else if (format) { + if (!entry->format) + ret = marker_set_format(&entry, format); + else if (strcmp(entry->format, format)) + ret = -EPERM; } + if (ret) + goto end; + /* * If we detect that a call_rcu is pending for this marker, * make sure it's executed now. @@ -674,6 +680,8 @@ int marker_probe_register(const char *name, const char *format, mutex_lock(&markers_mutex); entry = get_marker(name); WARN_ON(!entry); + if (entry->rcu_pending) + rcu_barrier_sched(); entry->oldptr = old; entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ @@ -717,6 +725,8 @@ int marker_probe_unregister(const char *name, entry = get_marker(name); if (!entry) goto end; + if (entry->rcu_pending) + rcu_barrier_sched(); entry->oldptr = old; entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ @@ -795,6 +805,8 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, mutex_lock(&markers_mutex); entry = get_marker_from_private_data(probe, probe_private); WARN_ON(!entry); + if (entry->rcu_pending) + rcu_barrier_sched(); entry->oldptr = old; entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ diff --git a/kernel/module.c b/kernel/module.c index 25bc9ac..0d8d21e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -46,6 +46,8 @@ #include <asm/cacheflush.h> #include <linux/license.h> #include <asm/sections.h> +#include <linux/tracepoint.h> +#include <linux/ftrace.h> #if 0 #define DEBUGP printk @@ -1430,6 +1432,9 @@ static void free_module(struct module *mod) /* Module unload stuff */ module_unload_free(mod); + /* release any pointers to mcount in this module */ + ftrace_release(mod->module_core, mod->core_size); + /* This may be NULL, but that's OK */ module_free(mod, mod->module_init); kfree(mod->args); @@ -1861,9 +1866,13 @@ static noinline struct module *load_module(void __user *umod, unsigned int markersindex; unsigned int markersstringsindex; unsigned int verboseindex; + unsigned int tracepointsindex; + unsigned int tracepointsstringsindex; + unsigned int mcountindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ + void *mseg; struct exception_table_entry *extable; mm_segment_t old_fs; @@ -2156,6 +2165,12 @@ static noinline struct module *load_module(void __user *umod, markersstringsindex = find_sec(hdr, sechdrs, secstrings, "__markers_strings"); verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose"); + tracepointsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints"); + tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings, + "__tracepoints_strings"); + + mcountindex = find_sec(hdr, sechdrs, secstrings, + "__mcount_loc"); /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { @@ -2183,6 +2198,12 @@ static noinline struct module *load_module(void __user *umod, mod->num_markers = sechdrs[markersindex].sh_size / sizeof(*mod->markers); #endif +#ifdef CONFIG_TRACEPOINTS + mod->tracepoints = (void *)sechdrs[tracepointsindex].sh_addr; + mod->num_tracepoints = + sechdrs[tracepointsindex].sh_size / sizeof(*mod->tracepoints); +#endif + /* Find duplicate symbols */ err = verify_export_symbols(mod); @@ -2201,12 +2222,22 @@ static noinline struct module *load_module(void __user *umod, add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); + if (!mod->taints) { #ifdef CONFIG_MARKERS - if (!mod->taints) marker_update_probe_range(mod->markers, mod->markers + mod->num_markers); #endif dynamic_printk_setup(sechdrs, verboseindex); +#ifdef CONFIG_TRACEPOINTS + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); +#endif + } + + /* sechdrs[0].sh_size is always zero */ + mseg = (void *)sechdrs[mcountindex].sh_addr; + ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size); + err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2276,6 +2307,7 @@ static noinline struct module *load_module(void __user *umod, cleanup: kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); + ftrace_release(mod->module_core, mod->core_size); free_unload: module_unload_free(mod); module_free(mod, mod->module_init); @@ -2759,3 +2791,50 @@ void module_update_markers(void) mutex_unlock(&module_mutex); } #endif + +#ifdef CONFIG_TRACEPOINTS +void module_update_tracepoints(void) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) + if (!mod->taints) + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); + mutex_unlock(&module_mutex); +} + +/* + * Returns 0 if current not found. + * Returns 1 if current found. + */ +int module_get_iter_tracepoints(struct tracepoint_iter *iter) +{ + struct module *iter_mod; + int found = 0; + + mutex_lock(&module_mutex); + list_for_each_entry(iter_mod, &modules, list) { + if (!iter_mod->taints) { + /* + * Sorted module list + */ + if (iter_mod < iter->module) + continue; + else if (iter_mod > iter->module) + iter->tracepoint = NULL; + found = tracepoint_get_iter_range(&iter->tracepoint, + iter_mod->tracepoints, + iter_mod->tracepoints + + iter_mod->num_tracepoints); + if (found) { + iter->module = iter_mod; + break; + } + } + } + mutex_unlock(&module_mutex); + return found; +} +#endif diff --git a/kernel/notifier.c b/kernel/notifier.c index 823be11..4282c0a 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -550,7 +550,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); static ATOMIC_NOTIFIER_HEAD(die_chain); -int notify_die(enum die_val val, const char *str, +int notrace notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c42a03a..153dcb2 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -7,6 +7,93 @@ #include <linux/errno.h> #include <linux/math64.h> #include <asm/uaccess.h> +#include <linux/kernel_stat.h> + +/* + * Allocate the thread_group_cputime structure appropriately and fill in the + * current values of the fields. Called from copy_signal() via + * thread_group_cputime_clone_thread() when adding a second or subsequent + * thread to a thread group. Assumes interrupts are enabled when called. + */ +int thread_group_cputime_alloc(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + struct task_cputime *cputime; + + /* + * If we have multiple threads and we don't already have a + * per-CPU task_cputime struct (checked in the caller), allocate + * one and fill it in with the times accumulated so far. We may + * race with another thread so recheck after we pick up the sighand + * lock. + */ + cputime = alloc_percpu(struct task_cputime); + if (cputime == NULL) + return -ENOMEM; + spin_lock_irq(&tsk->sighand->siglock); + if (sig->cputime.totals) { + spin_unlock_irq(&tsk->sighand->siglock); + free_percpu(cputime); + return 0; + } + sig->cputime.totals = cputime; + cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id()); + cputime->utime = tsk->utime; + cputime->stime = tsk->stime; + cputime->sum_exec_runtime = tsk->se.sum_exec_runtime; + spin_unlock_irq(&tsk->sighand->siglock); + return 0; +} + +/** + * thread_group_cputime - Sum the thread group time fields across all CPUs. + * + * @tsk: The task we use to identify the thread group. + * @times: task_cputime structure in which we return the summed fields. + * + * Walk the list of CPUs to sum the per-CPU time fields in the thread group + * time structure. + */ +void thread_group_cputime( + struct task_struct *tsk, + struct task_cputime *times) +{ + struct signal_struct *sig; + int i; + struct task_cputime *tot; + + sig = tsk->signal; + if (unlikely(!sig) || !sig->cputime.totals) { + times->utime = tsk->utime; + times->stime = tsk->stime; + times->sum_exec_runtime = tsk->se.sum_exec_runtime; + return; + } + times->stime = times->utime = cputime_zero; + times->sum_exec_runtime = 0; + for_each_possible_cpu(i) { + tot = per_cpu_ptr(tsk->signal->cputime.totals, i); + times->utime = cputime_add(times->utime, tot->utime); + times->stime = cputime_add(times->stime, tot->stime); + times->sum_exec_runtime += tot->sum_exec_runtime; + } +} + +/* + * Called after updating RLIMIT_CPU to set timer expiration if necessary. + */ +void update_rlimit_cpu(unsigned long rlim_new) +{ + cputime_t cputime; + + cputime = secs_to_cputime(rlim_new); + if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || + cputime_lt(current->signal->it_prof_expires, cputime)) { + spin_lock_irq(¤t->sighand->siglock); + set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); + spin_unlock_irq(¤t->sighand->siglock); + } +} static int check_clock(const clockid_t which_clock) { @@ -158,10 +245,6 @@ static inline cputime_t virt_ticks(struct task_struct *p) { return p->utime; } -static inline unsigned long long sched_ns(struct task_struct *p) -{ - return task_sched_runtime(p); -} int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { @@ -211,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, cpu->cpu = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = sched_ns(p); + cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p); break; } return 0; @@ -220,59 +303,30 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. - * Must be called with tasklist_lock held for reading, and p->sighand->siglock. */ -static int cpu_clock_sample_group_locked(unsigned int clock_idx, - struct task_struct *p, - union cpu_time_count *cpu) +static int cpu_clock_sample_group(const clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) { - struct task_struct *t = p; - switch (clock_idx) { + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + switch (which_clock) { default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = cputime_add(p->signal->utime, p->signal->stime); - do { - cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t)); - t = next_thread(t); - } while (t != p); + cpu->cpu = cputime_add(cputime.utime, cputime.stime); break; case CPUCLOCK_VIRT: - cpu->cpu = p->signal->utime; - do { - cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t)); - t = next_thread(t); - } while (t != p); + cpu->cpu = cputime.utime; break; case CPUCLOCK_SCHED: - cpu->sched = p->signal->sum_sched_runtime; - /* Add in each other live thread. */ - while ((t = next_thread(t)) != p) { - cpu->sched += t->se.sum_exec_runtime; - } - cpu->sched += sched_ns(p); + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); break; } return 0; } -/* - * Sample a process (thread group) clock for the given group_leader task. - * Must be called with tasklist_lock held for reading. - */ -static int cpu_clock_sample_group(const clockid_t which_clock, - struct task_struct *p, - union cpu_time_count *cpu) -{ - int ret; - unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p, - cpu); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - return ret; -} - int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { @@ -471,80 +525,11 @@ void posix_cpu_timers_exit(struct task_struct *tsk) } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - cleanup_timers(tsk->signal->cpu_timers, - cputime_add(tsk->utime, tsk->signal->utime), - cputime_add(tsk->stime, tsk->signal->stime), - tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime); -} + struct task_cputime cputime; - -/* - * Set the expiry times of all the threads in the process so one of them - * will go off before the process cumulative expiry total is reached. - */ -static void process_timer_rebalance(struct task_struct *p, - unsigned int clock_idx, - union cpu_time_count expires, - union cpu_time_count val) -{ - cputime_t ticks, left; - unsigned long long ns, nsleft; - struct task_struct *t = p; - unsigned int nthreads = atomic_read(&p->signal->live); - - if (!nthreads) - return; - - switch (clock_idx) { - default: - BUG(); - break; - case CPUCLOCK_PROF: - left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), - nthreads); - do { - if (likely(!(t->flags & PF_EXITING))) { - ticks = cputime_add(prof_ticks(t), left); - if (cputime_eq(t->it_prof_expires, - cputime_zero) || - cputime_gt(t->it_prof_expires, ticks)) { - t->it_prof_expires = ticks; - } - } - t = next_thread(t); - } while (t != p); - break; - case CPUCLOCK_VIRT: - left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), - nthreads); - do { - if (likely(!(t->flags & PF_EXITING))) { - ticks = cputime_add(virt_ticks(t), left); - if (cputime_eq(t->it_virt_expires, - cputime_zero) || - cputime_gt(t->it_virt_expires, ticks)) { - t->it_virt_expires = ticks; - } - } - t = next_thread(t); - } while (t != p); - break; - case CPUCLOCK_SCHED: - nsleft = expires.sched - val.sched; - do_div(nsleft, nthreads); - nsleft = max_t(unsigned long long, nsleft, 1); - do { - if (likely(!(t->flags & PF_EXITING))) { - ns = t->se.sum_exec_runtime + nsleft; - if (t->it_sched_expires == 0 || - t->it_sched_expires > ns) { - t->it_sched_expires = ns; - } - } - t = next_thread(t); - } while (t != p); - break; - } + thread_group_cputime(tsk, &cputime); + cleanup_timers(tsk->signal->cpu_timers, + cputime.utime, cputime.stime, cputime.sum_exec_runtime); } static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) @@ -608,29 +593,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) default: BUG(); case CPUCLOCK_PROF: - if (cputime_eq(p->it_prof_expires, + if (cputime_eq(p->cputime_expires.prof_exp, cputime_zero) || - cputime_gt(p->it_prof_expires, + cputime_gt(p->cputime_expires.prof_exp, nt->expires.cpu)) - p->it_prof_expires = nt->expires.cpu; + p->cputime_expires.prof_exp = + nt->expires.cpu; break; case CPUCLOCK_VIRT: - if (cputime_eq(p->it_virt_expires, + if (cputime_eq(p->cputime_expires.virt_exp, cputime_zero) || - cputime_gt(p->it_virt_expires, + cputime_gt(p->cputime_expires.virt_exp, nt->expires.cpu)) - p->it_virt_expires = nt->expires.cpu; + p->cputime_expires.virt_exp = + nt->expires.cpu; break; case CPUCLOCK_SCHED: - if (p->it_sched_expires == 0 || - p->it_sched_expires > nt->expires.sched) - p->it_sched_expires = nt->expires.sched; + if (p->cputime_expires.sched_exp == 0 || + p->cputime_expires.sched_exp > + nt->expires.sched) + p->cputime_expires.sched_exp = + nt->expires.sched; break; } } else { /* - * For a process timer, we must balance - * all the live threads' expirations. + * For a process timer, set the cached expiration time. */ switch (CPUCLOCK_WHICH(timer->it_clock)) { default: @@ -641,7 +629,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) cputime_lt(p->signal->it_virt_expires, timer->it.cpu.expires.cpu)) break; - goto rebalance; + p->signal->cputime_expires.virt_exp = + timer->it.cpu.expires.cpu; + break; case CPUCLOCK_PROF: if (!cputime_eq(p->signal->it_prof_expires, cputime_zero) && @@ -652,13 +642,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) if (i != RLIM_INFINITY && i <= cputime_to_secs(timer->it.cpu.expires.cpu)) break; - goto rebalance; + p->signal->cputime_expires.prof_exp = + timer->it.cpu.expires.cpu; + break; case CPUCLOCK_SCHED: - rebalance: - process_timer_rebalance( - timer->it.cpu.task, - CPUCLOCK_WHICH(timer->it_clock), - timer->it.cpu.expires, now); + p->signal->cputime_expires.sched_exp = + timer->it.cpu.expires.sched; break; } } @@ -969,13 +958,13 @@ static void check_thread_timers(struct task_struct *tsk, struct signal_struct *const sig = tsk->signal; maxfire = 20; - tsk->it_prof_expires = cputime_zero; + tsk->cputime_expires.prof_exp = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { - tsk->it_prof_expires = t->expires.cpu; + tsk->cputime_expires.prof_exp = t->expires.cpu; break; } t->firing = 1; @@ -984,13 +973,13 @@ static void check_thread_timers(struct task_struct *tsk, ++timers; maxfire = 20; - tsk->it_virt_expires = cputime_zero; + tsk->cputime_expires.virt_exp = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { - tsk->it_virt_expires = t->expires.cpu; + tsk->cputime_expires.virt_exp = t->expires.cpu; break; } t->firing = 1; @@ -999,13 +988,13 @@ static void check_thread_timers(struct task_struct *tsk, ++timers; maxfire = 20; - tsk->it_sched_expires = 0; + tsk->cputime_expires.sched_exp = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { - tsk->it_sched_expires = t->expires.sched; + tsk->cputime_expires.sched_exp = t->expires.sched; break; } t->firing = 1; @@ -1055,10 +1044,10 @@ static void check_process_timers(struct task_struct *tsk, { int maxfire; struct signal_struct *const sig = tsk->signal; - cputime_t utime, stime, ptime, virt_expires, prof_expires; + cputime_t utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; - struct task_struct *t; struct list_head *timers = sig->cpu_timers; + struct task_cputime cputime; /* * Don't sample the current process CPU clocks if there are no timers. @@ -1074,18 +1063,10 @@ static void check_process_timers(struct task_struct *tsk, /* * Collect the current process totals. */ - utime = sig->utime; - stime = sig->stime; - sum_sched_runtime = sig->sum_sched_runtime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - sum_sched_runtime += t->se.sum_exec_runtime; - t = next_thread(t); - } while (t != tsk); - ptime = cputime_add(utime, stime); - + thread_group_cputime(tsk, &cputime); + utime = cputime.utime; + ptime = cputime_add(utime, cputime.stime); + sum_sched_runtime = cputime.sum_exec_runtime; maxfire = 20; prof_expires = cputime_zero; while (!list_empty(timers)) { @@ -1193,60 +1174,18 @@ static void check_process_timers(struct task_struct *tsk, } } - if (!cputime_eq(prof_expires, cputime_zero) || - !cputime_eq(virt_expires, cputime_zero) || - sched_expires != 0) { - /* - * Rebalance the threads' expiry times for the remaining - * process CPU timers. - */ - - cputime_t prof_left, virt_left, ticks; - unsigned long long sched_left, sched; - const unsigned int nthreads = atomic_read(&sig->live); - - if (!nthreads) - return; - - prof_left = cputime_sub(prof_expires, utime); - prof_left = cputime_sub(prof_left, stime); - prof_left = cputime_div_non_zero(prof_left, nthreads); - virt_left = cputime_sub(virt_expires, utime); - virt_left = cputime_div_non_zero(virt_left, nthreads); - if (sched_expires) { - sched_left = sched_expires - sum_sched_runtime; - do_div(sched_left, nthreads); - sched_left = max_t(unsigned long long, sched_left, 1); - } else { - sched_left = 0; - } - t = tsk; - do { - if (unlikely(t->flags & PF_EXITING)) - continue; - - ticks = cputime_add(cputime_add(t->utime, t->stime), - prof_left); - if (!cputime_eq(prof_expires, cputime_zero) && - (cputime_eq(t->it_prof_expires, cputime_zero) || - cputime_gt(t->it_prof_expires, ticks))) { - t->it_prof_expires = ticks; - } - - ticks = cputime_add(t->utime, virt_left); - if (!cputime_eq(virt_expires, cputime_zero) && - (cputime_eq(t->it_virt_expires, cputime_zero) || - cputime_gt(t->it_virt_expires, ticks))) { - t->it_virt_expires = ticks; - } - - sched = t->se.sum_exec_runtime + sched_left; - if (sched_expires && (t->it_sched_expires == 0 || - t->it_sched_expires > sched)) { - t->it_sched_expires = sched; - } - } while ((t = next_thread(t)) != tsk); - } + if (!cputime_eq(prof_expires, cputime_zero) && + (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || + cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) + sig->cputime_expires.prof_exp = prof_expires; + if (!cputime_eq(virt_expires, cputime_zero) && + (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) || + cputime_gt(sig->cputime_expires.virt_exp, virt_expires))) + sig->cputime_expires.virt_exp = virt_expires; + if (sched_expires != 0 && + (sig->cputime_expires.sched_exp == 0 || + sig->cputime_expires.sched_exp > sched_expires)) + sig->cputime_expires.sched_exp = sched_expires; } /* @@ -1314,6 +1253,86 @@ out: ++timer->it_requeue_pending; } +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (cputime_eq(cputime->utime, cputime_zero) && + cputime_eq(cputime->stime, cputime_zero) && + cputime->sum_exec_runtime == 0) + return 1; + return 0; +} + +/** + * task_cputime_expired - Compare two task_cputime entities. + * + * @sample: The task_cputime structure to be checked for expiration. + * @expires: Expiration times, against which @sample will be checked. + * + * Checks @sample against @expires to see if any field of @sample has expired. + * Returns true if any field of the former is greater than the corresponding + * field of the latter if the latter field is set. Otherwise returns false. + */ +static inline int task_cputime_expired(const struct task_cputime *sample, + const struct task_cputime *expires) +{ + if (!cputime_eq(expires->utime, cputime_zero) && + cputime_ge(sample->utime, expires->utime)) + return 1; + if (!cputime_eq(expires->stime, cputime_zero) && + cputime_ge(cputime_add(sample->utime, sample->stime), + expires->stime)) + return 1; + if (expires->sum_exec_runtime != 0 && + sample->sum_exec_runtime >= expires->sum_exec_runtime) + return 1; + return 0; +} + +/** + * fastpath_timer_check - POSIX CPU timers fast path. + * + * @tsk: The task (thread) being checked. + * + * Check the task and thread group timers. If both are zero (there are no + * timers set) return false. Otherwise snapshot the task and thread group + * timers and compare them with the corresponding expiration times. Return + * true if a timer has expired, else return false. + */ +static inline int fastpath_timer_check(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + + if (unlikely(!sig)) + return 0; + + if (!task_cputime_zero(&tsk->cputime_expires)) { + struct task_cputime task_sample = { + .utime = tsk->utime, + .stime = tsk->stime, + .sum_exec_runtime = tsk->se.sum_exec_runtime + }; + + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + return 1; + } + if (!task_cputime_zero(&sig->cputime_expires)) { + struct task_cputime group_sample; + + thread_group_cputime(tsk, &group_sample); + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) + return 1; + } + return 0; +} + /* * This is called from the timer interrupt handler. The irq handler has * already updated our counts. We need to check if any timers fire now. @@ -1326,42 +1345,31 @@ void run_posix_cpu_timers(struct task_struct *tsk) BUG_ON(!irqs_disabled()); -#define UNEXPIRED(clock) \ - (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ - cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) - - if (UNEXPIRED(prof) && UNEXPIRED(virt) && - (tsk->it_sched_expires == 0 || - tsk->se.sum_exec_runtime < tsk->it_sched_expires)) + /* + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. + */ + if (!fastpath_timer_check(tsk)) return; -#undef UNEXPIRED - + spin_lock(&tsk->sighand->siglock); /* - * Double-check with locks held. + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. */ - read_lock(&tasklist_lock); - if (likely(tsk->signal != NULL)) { - spin_lock(&tsk->sighand->siglock); + check_thread_timers(tsk, &firing); + check_process_timers(tsk, &firing); - /* - * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] - * all the timers that are firing, and put them on the firing list. - */ - check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); - - /* - * We must release these locks before taking any timer's lock. - * There is a potential race with timer deletion here, as the - * siglock now protects our private firing list. We have set - * the firing flag in each timer, so that a deletion attempt - * that gets the timer lock before we do will give it up and - * spin until we've taken care of that timer below. - */ - spin_unlock(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + spin_unlock(&tsk->sighand->siglock); /* * Now that all the timers on our list have the firing flag, @@ -1389,10 +1397,9 @@ void run_posix_cpu_timers(struct task_struct *tsk) /* * Set one of the process-wide special case CPU timers. - * The tasklist_lock and tsk->sighand->siglock must be held by the caller. - * The oldval argument is null for the RLIMIT_CPU timer, where *newval is - * absolute; non-null for ITIMER_*, where *newval is relative and we update - * it to be absolute, *oldval is absolute and we update it to be relative. + * The tsk->sighand->siglock must be held by the caller. + * The *newval argument is relative and we update it to be absolute, *oldval + * is absolute and we update it to be relative. */ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) @@ -1401,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); - cpu_clock_sample_group_locked(clock_idx, tsk, &now); + cpu_clock_sample_group(clock_idx, tsk, &now); if (oldval) { if (!cputime_eq(*oldval, cputime_zero)) { @@ -1435,13 +1442,14 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_ge(list_first_entry(head, struct cpu_timer_list, entry)->expires.cpu, *newval)) { - /* - * Rejigger each thread's expiry time so that one will - * notice before we hit the process-cumulative expiry time. - */ - union cpu_time_count expires = { .sched = 0 }; - expires.cpu = *newval; - process_timer_rebalance(tsk, clock_idx, expires, now); + switch (clock_idx) { + case CPUCLOCK_PROF: + tsk->signal->cputime_expires.prof_exp = *newval; + break; + case CPUCLOCK_VIRT: + tsk->signal->cputime_expires.virt_exp = *newval; + break; + } } } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5131e54..b931d7c 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -223,6 +223,15 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) } /* + * Get monotonic time for posix timers + */ +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +{ + getrawmonotonic(tp); + return 0; +} + +/* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) @@ -235,9 +244,15 @@ static __init int init_posix_timers(void) .clock_get = posix_ktime_get_ts, .clock_set = do_posix_clock_nosettime, }; + struct k_clock clock_monotonic_raw = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_monotonic_raw, + .clock_set = do_posix_clock_nosettime, + }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); + register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, @@ -298,6 +313,7 @@ void do_schedule_next_timer(struct siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { + int shared, ret; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->do_schedule_next_timer(). @@ -311,25 +327,10 @@ int posix_timer_event(struct k_itimer *timr, int si_private) */ timr->sigq->info.si_sys_private = si_private; - timr->sigq->info.si_signo = timr->it_sigev_signo; - timr->sigq->info.si_code = SI_TIMER; - timr->sigq->info.si_tid = timr->it_id; - timr->sigq->info.si_value = timr->it_sigev_value; - - if (timr->it_sigev_notify & SIGEV_THREAD_ID) { - struct task_struct *leader; - int ret = send_sigqueue(timr->sigq, timr->it_process, 0); - - if (likely(ret >= 0)) - return ret; - - timr->it_sigev_notify = SIGEV_SIGNAL; - leader = timr->it_process->group_leader; - put_task_struct(timr->it_process); - timr->it_process = leader; - } - - return send_sigqueue(timr->sigq, timr->it_process, 1); + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); + ret = send_sigqueue(timr->sigq, timr->it_process, shared); + /* If we failed to send the signal the timer stops. */ + return ret > 0; } EXPORT_SYMBOL_GPL(posix_timer_event); @@ -468,11 +469,9 @@ sys_timer_create(const clockid_t which_clock, struct sigevent __user *timer_event_spec, timer_t __user * created_timer_id) { - int error = 0; - struct k_itimer *new_timer = NULL; - int new_timer_id; - struct task_struct *process = NULL; - unsigned long flags; + struct k_itimer *new_timer; + int error, new_timer_id; + struct task_struct *process; sigevent_t event; int it_id_set = IT_ID_NOT_SET; @@ -490,12 +489,11 @@ sys_timer_create(const clockid_t which_clock, goto out; } spin_lock_irq(&idr_lock); - error = idr_get_new(&posix_timers_id, (void *) new_timer, - &new_timer_id); + error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); spin_unlock_irq(&idr_lock); - if (error == -EAGAIN) - goto retry; - else if (error) { + if (error) { + if (error == -EAGAIN) + goto retry; /* * Weird looking, but we return EAGAIN if the IDR is * full (proper POSIX return value for this) @@ -526,67 +524,43 @@ sys_timer_create(const clockid_t which_clock, error = -EFAULT; goto out; } - new_timer->it_sigev_notify = event.sigev_notify; - new_timer->it_sigev_signo = event.sigev_signo; - new_timer->it_sigev_value = event.sigev_value; - - read_lock(&tasklist_lock); - if ((process = good_sigevent(&event))) { - /* - * We may be setting up this process for another - * thread. It may be exiting. To catch this - * case the we check the PF_EXITING flag. If - * the flag is not set, the siglock will catch - * him before it is too late (in exit_itimers). - * - * The exec case is a bit more invloved but easy - * to code. If the process is in our thread - * group (and it must be or we would not allow - * it here) and is doing an exec, it will cause - * us to be killed. In this case it will wait - * for us to die which means we can finish this - * linkage with our last gasp. I.e. no code :) - */ - spin_lock_irqsave(&process->sighand->siglock, flags); - if (!(process->flags & PF_EXITING)) { - new_timer->it_process = process; - list_add(&new_timer->list, - &process->signal->posix_timers); - if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - get_task_struct(process); - spin_unlock_irqrestore(&process->sighand->siglock, flags); - } else { - spin_unlock_irqrestore(&process->sighand->siglock, flags); - process = NULL; - } - } - read_unlock(&tasklist_lock); + rcu_read_lock(); + process = good_sigevent(&event); + if (process) + get_task_struct(process); + rcu_read_unlock(); if (!process) { error = -EINVAL; goto out; } } else { - new_timer->it_sigev_notify = SIGEV_SIGNAL; - new_timer->it_sigev_signo = SIGALRM; - new_timer->it_sigev_value.sival_int = new_timer->it_id; + event.sigev_notify = SIGEV_SIGNAL; + event.sigev_signo = SIGALRM; + event.sigev_value.sival_int = new_timer->it_id; process = current->group_leader; - spin_lock_irqsave(&process->sighand->siglock, flags); - new_timer->it_process = process; - list_add(&new_timer->list, &process->signal->posix_timers); - spin_unlock_irqrestore(&process->sighand->siglock, flags); + get_task_struct(process); } + new_timer->it_sigev_notify = event.sigev_notify; + new_timer->sigq->info.si_signo = event.sigev_signo; + new_timer->sigq->info.si_value = event.sigev_value; + new_timer->sigq->info.si_tid = new_timer->it_id; + new_timer->sigq->info.si_code = SI_TIMER; + + spin_lock_irq(¤t->sighand->siglock); + new_timer->it_process = process; + list_add(&new_timer->list, ¤t->signal->posix_timers); + spin_unlock_irq(¤t->sighand->siglock); + + return 0; /* * In the case of the timer belonging to another task, after * the task is unlocked, the timer is owned by the other task * and may cease to exist at any time. Don't use or modify * new_timer after the unlock call. */ - out: - if (error) - release_posix_timer(new_timer, it_id_set); - + release_posix_timer(new_timer, it_id_set); return error; } @@ -597,7 +571,7 @@ out: * the find to the timer lock. To avoid a dead lock, the timer id MUST * be release with out holding the timer lock. */ -static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; /* @@ -605,23 +579,20 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) * flags part over to the timer lock. Must not let interrupts in * while we are moving the lock. */ - spin_lock_irqsave(&idr_lock, *flags); - timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); + timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { spin_lock(&timr->it_lock); - - if ((timr->it_id != timer_id) || !(timr->it_process) || - !same_thread_group(timr->it_process, current)) { - spin_unlock(&timr->it_lock); - spin_unlock_irqrestore(&idr_lock, *flags); - timr = NULL; - } else + if (timr->it_process && + same_thread_group(timr->it_process, current)) { spin_unlock(&idr_lock); - } else - spin_unlock_irqrestore(&idr_lock, *flags); + return timr; + } + spin_unlock(&timr->it_lock); + } + spin_unlock_irqrestore(&idr_lock, *flags); - return timr; + return NULL; } /* @@ -862,8 +833,7 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); + put_task_struct(timer->it_process); timer->it_process = NULL; unlock_timer(timer, flags); @@ -890,8 +860,7 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); + put_task_struct(timer->it_process); timer->it_process = NULL; unlock_timer(timer, flags); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 90b5b12..85cb905 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -42,10 +42,10 @@ #include <linux/freezer.h> #include <linux/cpu.h> #include <linux/delay.h> -#include <linux/byteorder/swabb.h> #include <linux/stat.h> #include <linux/srcu.h> #include <linux/slab.h> +#include <asm/byteorder.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " diff --git a/kernel/sched.c b/kernel/sched.c index 6f23059..d906f72 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -71,6 +71,7 @@ #include <linux/debugfs.h> #include <linux/ctype.h> #include <linux/ftrace.h> +#include <trace/sched.h> #include <asm/tlb.h> #include <asm/irq_regs.h> @@ -1936,6 +1937,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * just go back and repeat. */ rq = task_rq_lock(p, &flags); + trace_sched_wait_task(rq, p); running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; @@ -2297,9 +2299,7 @@ out_activate: success = 1; out_running: - trace_mark(kernel_sched_wakeup, - "pid %d state %ld ## rq %p task %p rq->curr %p", - p->pid, p->state, rq, p, rq->curr); + trace_sched_wakeup(rq, p); check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; @@ -2432,9 +2432,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - trace_mark(kernel_sched_wakeup_new, - "pid %d state %ld ## rq %p task %p rq->curr %p", - p->pid, p->state, rq, p, rq->curr); + trace_sched_wakeup_new(rq, p); check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2607,11 +2605,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - trace_mark(kernel_sched_schedule, - "prev_pid %d next_pid %d prev_state %ld " - "## rq %p prev %p next %p", - prev->pid, next->pid, prev->state, - rq, prev, next); + trace_sched_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* @@ -2851,6 +2845,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) || unlikely(!cpu_active(dest_cpu))) goto out; + trace_sched_migrate_task(rq, p, dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ @@ -4052,23 +4047,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * Return p->sum_exec_runtime plus any more ns on the sched_clock - * that have not yet been banked in case the task is currently running. + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running. */ -unsigned long long task_sched_runtime(struct task_struct *p) +unsigned long long task_delta_exec(struct task_struct *p) { unsigned long flags; - u64 ns, delta_exec; struct rq *rq; + u64 ns = 0; rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime; + if (task_current(rq, p)) { + u64 delta_exec; + update_rq_clock(rq); delta_exec = rq->clock - p->se.exec_start; if ((s64)delta_exec > 0) - ns += delta_exec; + ns = delta_exec; } + task_rq_unlock(rq, &flags); return ns; @@ -4085,6 +4083,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) cputime64_t tmp; p->utime = cputime_add(p->utime, cputime); + account_group_user_time(p, cputime); /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@ -4109,6 +4108,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) tmp = cputime_to_cputime64(cputime); p->utime = cputime_add(p->utime, cputime); + account_group_user_time(p, cputime); p->gtime = cputime_add(p->gtime, cputime); cpustat->user = cputime64_add(cpustat->user, tmp); @@ -4144,6 +4144,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, } p->stime = cputime_add(p->stime, cputime); + account_group_system_time(p, cputime); /* Add system time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@ -4185,6 +4186,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal) if (p == rq->idle) { p->stime = cputime_add(p->stime, steal); + account_group_system_time(p, steal); if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 18fd171..f604dae 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -449,6 +449,7 @@ static void update_curr(struct cfs_rq *cfs_rq) struct task_struct *curtask = task_of(curr); cpuacct_charge(curtask, delta_exec); + account_group_exec_runtime(curtask, delta_exec); } } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index cdf5740..b446dc8 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -526,6 +526,8 @@ static void update_curr_rt(struct rq *rq) schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); @@ -1458,7 +1460,7 @@ static void watchdog(struct rq *rq, struct task_struct *p) p->rt.timeout++; next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); if (p->rt.timeout > next) - p->it_sched_expires = p->se.sum_exec_runtime; + p->cputime_expires.sched_exp = p->se.sum_exec_runtime; } } diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 8385d43..b8c1569 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -270,3 +270,89 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) #define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +/* + * The following are functions that support scheduler-internal time accounting. + * These functions are generally called at the timer tick. None of this depends + * on CONFIG_SCHEDSTATS. + */ + +/** + * account_group_user_time - Maintain utime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the utime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the utime field there. + */ +static inline void account_group_user_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (unlikely(!sig)) + return; + if (sig->cputime.totals) { + struct task_cputime *times; + + times = per_cpu_ptr(sig->cputime.totals, get_cpu()); + times->utime = cputime_add(times->utime, cputime); + put_cpu_no_resched(); + } +} + +/** + * account_group_system_time - Maintain stime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the stime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the stime field there. + */ +static inline void account_group_system_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (unlikely(!sig)) + return; + if (sig->cputime.totals) { + struct task_cputime *times; + + times = per_cpu_ptr(sig->cputime.totals, get_cpu()); + times->stime = cputime_add(times->stime, cputime); + put_cpu_no_resched(); + } +} + +/** + * account_group_exec_runtime - Maintain exec runtime for a thread group. + * + * @tsk: Pointer to task structure. + * @ns: Time value by which to increment the sum_exec_runtime field + * of the thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the sum_exec_runtime field there. + */ +static inline void account_group_exec_runtime(struct task_struct *tsk, + unsigned long long ns) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (unlikely(!sig)) + return; + if (sig->cputime.totals) { + struct task_cputime *times; + + times = per_cpu_ptr(sig->cputime.totals, get_cpu()); + times->sum_exec_runtime += ns; + put_cpu_no_resched(); + } +} diff --git a/kernel/signal.c b/kernel/signal.c index e661b01..105217d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -27,6 +27,7 @@ #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <linux/nsproxy.h> +#include <trace/sched.h> #include <asm/param.h> #include <asm/uaccess.h> @@ -803,6 +804,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, struct sigpending *pending; struct sigqueue *q; + trace_sched_signal_send(sig, t); + assert_spin_locked(&t->sighand->siglock); if (!prepare_signal(sig, t)) return 0; @@ -1338,6 +1341,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) struct siginfo info; unsigned long flags; struct sighand_struct *psig; + struct task_cputime cputime; int ret = sig; BUG_ON(sig == -1); @@ -1368,10 +1372,9 @@ int do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = tsk->uid; - info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, - tsk->signal->utime)); - info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, - tsk->signal->stime)); + thread_group_cputime(tsk, &cputime); + info.si_utime = cputime_to_jiffies(cputime.utime); + info.si_stime = cputime_to_jiffies(cputime.stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) diff --git a/kernel/softirq.c b/kernel/softirq.c index 83ba21a..7110dae 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -267,16 +267,12 @@ asmlinkage void do_softirq(void) */ void irq_enter(void) { -#ifdef CONFIG_NO_HZ int cpu = smp_processor_id(); + if (idle_cpu(cpu) && !in_interrupt()) - tick_nohz_stop_idle(cpu); -#endif + tick_check_idle(cpu); + __irq_enter(); -#ifdef CONFIG_NO_HZ - if (idle_cpu(cpu)) - tick_nohz_update_jiffies(); -#endif } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED diff --git a/kernel/sys.c b/kernel/sys.c index 0bc8fa3..53879cd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -853,38 +853,28 @@ asmlinkage long sys_setfsgid(gid_t gid) return old_fsgid; } +void do_sys_times(struct tms *tms) +{ + struct task_cputime cputime; + cputime_t cutime, cstime; + + spin_lock_irq(¤t->sighand->siglock); + thread_group_cputime(current, &cputime); + cutime = current->signal->cutime; + cstime = current->signal->cstime; + spin_unlock_irq(¤t->sighand->siglock); + tms->tms_utime = cputime_to_clock_t(cputime.utime); + tms->tms_stime = cputime_to_clock_t(cputime.stime); + tms->tms_cutime = cputime_to_clock_t(cutime); + tms->tms_cstime = cputime_to_clock_t(cstime); +} + asmlinkage long sys_times(struct tms __user * tbuf) { - /* - * In the SMP world we might just be unlucky and have one of - * the times increment as we use it. Since the value is an - * atomically safe type this is just fine. Conceptually its - * as if the syscall took an instant longer to occur. - */ if (tbuf) { struct tms tmp; - struct task_struct *tsk = current; - struct task_struct *t; - cputime_t utime, stime, cutime, cstime; - - spin_lock_irq(&tsk->sighand->siglock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); - - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - - tmp.tms_utime = cputime_to_clock_t(utime); - tmp.tms_stime = cputime_to_clock_t(stime); - tmp.tms_cutime = cputime_to_clock_t(cutime); - tmp.tms_cstime = cputime_to_clock_t(cstime); + + do_sys_times(&tmp); if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } @@ -1449,7 +1439,6 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) { struct rlimit new_rlim, *old_rlim; - unsigned long it_prof_secs; int retval; if (resource >= RLIM_NLIMITS) @@ -1503,18 +1492,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) if (new_rlim.rlim_cur == RLIM_INFINITY) goto out; - it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); - if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) { - unsigned long rlim_cur = new_rlim.rlim_cur; - cputime_t cputime; - - cputime = secs_to_cputime(rlim_cur); - read_lock(&tasklist_lock); - spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(¤t->sighand->siglock); - read_unlock(&tasklist_lock); - } + update_rlimit_cpu(new_rlim.rlim_cur); out: return 0; } @@ -1552,11 +1530,8 @@ out: * */ -static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, - cputime_t *utimep, cputime_t *stimep) +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) { - *utimep = cputime_add(*utimep, t->utime); - *stimep = cputime_add(*stimep, t->stime); r->ru_nvcsw += t->nvcsw; r->ru_nivcsw += t->nivcsw; r->ru_minflt += t->min_flt; @@ -1570,12 +1545,13 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; cputime_t utime, stime; + struct task_cputime cputime; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { - accumulate_thread_rusage(p, r, &utime, &stime); + accumulate_thread_rusage(p, r); goto out; } @@ -1598,8 +1574,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) break; case RUSAGE_SELF: - utime = cputime_add(utime, p->signal->utime); - stime = cputime_add(stime, p->signal->stime); + thread_group_cputime(p, &cputime); + utime = cputime_add(utime, cputime.utime); + stime = cputime_add(stime, cputime.stime); r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; @@ -1608,7 +1585,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_oublock += p->signal->oublock; t = p; do { - accumulate_thread_rusage(t, r, &utime, &stime); + accumulate_thread_rusage(t, r); t = next_thread(t); } while (t != p); break; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 093d4ac..9ed2eec 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -325,6 +325,9 @@ int clocksource_register(struct clocksource *c) unsigned long flags; int ret; + /* save mult_orig on registration */ + c->mult_orig = c->mult; + spin_lock_irqsave(&clocksource_lock, flags); ret = clocksource_enqueue(c); if (!ret) diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 4c256fd..1ca9955 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -61,6 +61,7 @@ struct clocksource clocksource_jiffies = { .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT, .shift = JIFFIES_SHIFT, }; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 1ad46f3..1a20715 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -10,13 +10,13 @@ #include <linux/mm.h> #include <linux/time.h> -#include <linux/timer.h> #include <linux/timex.h> #include <linux/jiffies.h> #include <linux/hrtimer.h> #include <linux/capability.h> #include <linux/math64.h> #include <linux/clocksource.h> +#include <linux/workqueue.h> #include <asm/timex.h> /* @@ -218,11 +218,11 @@ void second_overflow(void) /* Disable the cmos update - used by virtualization and embedded */ int no_sync_cmos_clock __read_mostly; -static void sync_cmos_clock(unsigned long dummy); +static void sync_cmos_clock(struct work_struct *work); -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); +static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); -static void sync_cmos_clock(unsigned long dummy) +static void sync_cmos_clock(struct work_struct *work) { struct timespec now, next; int fail = 1; @@ -258,13 +258,13 @@ static void sync_cmos_clock(unsigned long dummy) next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } - mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); + schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); } static void notify_cmos_timer(void) { if (!no_sync_cmos_clock) - mod_timer(&sync_cmos_timer, jiffies + 1); + schedule_delayed_work(&sync_cmos_work, 0); } #else @@ -277,38 +277,50 @@ static inline void notify_cmos_timer(void) { } int do_adjtimex(struct timex *txc) { struct timespec ts; - long save_adjust, sec; int result; - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - - /* Now we validate the data before disabling interrupts */ - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { + /* Validate the data before disabling interrupts */ + if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ - if (txc->modes & ~ADJ_OFFSET_SS_READ) + if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; + if (!(txc->modes & ADJ_OFFSET_READONLY) && + !capable(CAP_SYS_TIME)) + return -EPERM; + } else { + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + + /* if the quartz is off by more than 10% something is VERY wrong! */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; + + if (txc->modes & ADJ_STATUS && time_state != TIME_OK) + hrtimer_cancel(&leap_timer); } - /* if the quartz is off by more than 10% something is VERY wrong ! */ - if (txc->modes & ADJ_TICK) - if (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ) - return -EINVAL; - - if (time_state != TIME_OK && txc->modes & ADJ_STATUS) - hrtimer_cancel(&leap_timer); getnstimeofday(&ts); write_seqlock_irq(&xtime_lock); - /* Save for later - semantics of adjtime is to return old value */ - save_adjust = time_adjust; - /* If there are input parameters, then process them */ + if (txc->modes & ADJ_ADJTIME) { + long save_adjust = time_adjust; + + if (!(txc->modes & ADJ_OFFSET_READONLY)) { + /* adjtime() is independent from ntp_adjtime() */ + time_adjust = txc->offset; + ntp_update_frequency(); + } + txc->offset = save_adjust; + goto adj_done; + } if (txc->modes) { + long sec; + if (txc->modes & ADJ_STATUS) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { @@ -375,13 +387,8 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_TAI && txc->constant > 0) time_tai = txc->constant; - if (txc->modes & ADJ_OFFSET) { - if (txc->modes == ADJ_OFFSET_SINGLESHOT) - /* adjtime() is independent from ntp_adjtime() */ - time_adjust = txc->offset; - else - ntp_update_offset(txc->offset); - } + if (txc->modes & ADJ_OFFSET) + ntp_update_offset(txc->offset); if (txc->modes & ADJ_TICK) tick_usec = txc->tick; @@ -389,22 +396,18 @@ int do_adjtimex(struct timex *txc) ntp_update_frequency(); } + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, + NTP_SCALE_SHIFT); + if (!(time_status & STA_NANO)) + txc->offset /= NSEC_PER_USEC; + +adj_done: result = time_state; /* mostly `TIME_OK' */ if (time_status & (STA_UNSYNC|STA_CLOCKERR)) result = TIME_ERROR; - if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || - (txc->modes == ADJ_OFFSET_SS_READ)) - txc->offset = save_adjust; - else { - txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, - NTP_SCALE_SHIFT); - if (!(time_status & STA_NANO)) - txc->offset /= NSEC_PER_USEC; - } - txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) * - (s64)PPM_SCALE_INV, - NTP_SCALE_SHIFT); + txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * + (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index cb01cd8..f98a1b7 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -384,6 +384,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) } /* + * Called from irq_enter() when idle was interrupted to reenable the + * per cpu device. + */ +void tick_check_oneshot_broadcast(int cpu) +{ + if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + + clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); + } +} + +/* * Handle oneshot mode broadcasting */ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 4692487..b1c05bf 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -36,6 +36,7 @@ extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); extern int tick_broadcast_oneshot_active(void); +extern void tick_check_oneshot_broadcast(int cpu); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { @@ -45,6 +46,7 @@ static inline void tick_broadcast_oneshot_control(unsigned long reason) { } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline void tick_check_oneshot_broadcast(int cpu) { } # endif /* !BROADCAST */ #else /* !ONESHOT */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b711ffc..0581c11 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -155,7 +155,7 @@ void tick_nohz_update_jiffies(void) touch_softlockup_watchdog(); } -void tick_nohz_stop_idle(int cpu) +static void tick_nohz_stop_idle(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); @@ -377,6 +377,32 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ + hrtimer_cancel(&ts->sched_timer); + ts->sched_timer.expires = ts->idle_tick; + + while (1) { + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, + ts->sched_timer.expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + } else { + if (!tick_program_event(ts->sched_timer.expires, 0)) + break; + } + /* Update jiffies and reread time */ + tick_do_update_jiffies64(now); + now = ktime_get(); + } +} + /** * tick_nohz_restart_sched_tick - restart the idle tick from the idle task * @@ -430,28 +456,7 @@ void tick_nohz_restart_sched_tick(void) */ ts->tick_stopped = 0; ts->idle_exittime = now; - hrtimer_cancel(&ts->sched_timer); - ts->sched_timer.expires = ts->idle_tick; - - while (1) { - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, - ts->sched_timer.expires, - HRTIMER_MODE_ABS); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - } else { - if (!tick_program_event(ts->sched_timer.expires, 0)) - break; - } - /* Update jiffies and reread time */ - tick_do_update_jiffies64(now); - now = ktime_get(); - } + tick_nohz_restart(ts, now); local_irq_enable(); } @@ -503,10 +508,6 @@ static void tick_nohz_handler(struct clock_event_device *dev) update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return; - while (tick_nohz_reprogram(ts, now)) { now = ktime_get(); tick_do_update_jiffies64(now); @@ -552,6 +553,27 @@ static void tick_nohz_switch_to_nohz(void) smp_processor_id()); } +/* + * When NOHZ is enabled and the tick is stopped, we need to kick the + * tick timer from irq_enter() so that the jiffies update is kept + * alive during long running softirqs. That's ugly as hell, but + * correctness is key even if we need to fix the offending softirq in + * the first place. + * + * Note, this is different to tick_nohz_restart. We just kick the + * timer and do not touch the other magic bits which need to be done + * when idle is left. + */ +static void tick_nohz_kick_tick(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (!ts->tick_stopped) + return; + + tick_nohz_restart(ts, ktime_get()); +} + #else static inline void tick_nohz_switch_to_nohz(void) { } @@ -559,6 +581,19 @@ static inline void tick_nohz_switch_to_nohz(void) { } #endif /* NO_HZ */ /* + * Called from irq_enter to notify about the possible interruption of idle() + */ +void tick_check_idle(int cpu) +{ + tick_check_oneshot_broadcast(cpu); +#ifdef CONFIG_NO_HZ + tick_nohz_stop_idle(cpu); + tick_nohz_update_jiffies(); + tick_nohz_kick_tick(cpu); +#endif +} + +/* * High resolution timer specific code */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -611,10 +646,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) profile_tick(CPU_PROFILING); } - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return HRTIMER_NORESTART; - hrtimer_forward(timer, now, tick_period); return HRTIMER_RESTART; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e91c29f..e7acfb4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -58,27 +58,26 @@ struct clocksource *clock; #ifdef CONFIG_GENERIC_TIME /** - * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * clocksource_forward_now - update clock to the current time * - * private function, must hold xtime_lock lock when being - * called. Returns the number of nanoseconds since the - * last call to update_wall_time() (adjusted by NTP scaling) + * Forward the current clock to update its state since the last call to + * update_wall_time(). This is useful before significant clock changes, + * as it avoids having to deal with this time offset explicitly. */ -static inline s64 __get_nsec_offset(void) +static void clocksource_forward_now(void) { cycle_t cycle_now, cycle_delta; - s64 ns_offset; + s64 nsec; - /* read clocksource: */ cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + clock->cycle_last = cycle_now; - /* convert to nanoseconds: */ - ns_offset = cyc2ns(clock, cycle_delta); + nsec = cyc2ns(clock, cycle_delta); + timespec_add_ns(&xtime, nsec); - return ns_offset; + nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + clock->raw_time.tv_nsec += nsec; } /** @@ -89,6 +88,7 @@ static inline s64 __get_nsec_offset(void) */ void getnstimeofday(struct timespec *ts) { + cycle_t cycle_now, cycle_delta; unsigned long seq; s64 nsecs; @@ -96,7 +96,15 @@ void getnstimeofday(struct timespec *ts) seq = read_seqbegin(&xtime_lock); *ts = xtime; - nsecs = __get_nsec_offset(); + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = cyc2ns(clock, cycle_delta); } while (read_seqretry(&xtime_lock, seq)); @@ -129,22 +137,22 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(struct timespec *tv) { + struct timespec ts_delta; unsigned long flags; - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; write_seqlock_irqsave(&xtime_lock, flags); - nsec -= __get_nsec_offset(); + clocksource_forward_now(); + + ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; + ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; + wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + xtime = *tv; - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); update_xtime_cache(0); clock->error = 0; @@ -170,22 +178,19 @@ EXPORT_SYMBOL(do_settimeofday); static void change_clocksource(void) { struct clocksource *new; - cycle_t now; - u64 nsec; new = clocksource_get_next(); if (clock == new) return; - new->cycle_last = 0; - now = clocksource_read(new); - nsec = __get_nsec_offset(); - timespec_add_ns(&xtime, nsec); + clocksource_forward_now(); - clock = new; - clock->cycle_last = now; + new->raw_time = clock->raw_time; + clock = new; + clock->cycle_last = 0; + clock->cycle_last = clocksource_read(new); clock->error = 0; clock->xtime_nsec = 0; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@ -200,11 +205,44 @@ static void change_clocksource(void) */ } #else +static inline void clocksource_forward_now(void) { } static inline void change_clocksource(void) { } -static inline s64 __get_nsec_offset(void) { return 0; } #endif /** + * getrawmonotonic - Returns the raw monotonic time in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the raw monotonic time (completely un-modified by ntp) + */ +void getrawmonotonic(struct timespec *ts) +{ + unsigned long seq; + s64 nsecs; + cycle_t cycle_now, cycle_delta; + + do { + seq = read_seqbegin(&xtime_lock); + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + + *ts = clock->raw_time; + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} +EXPORT_SYMBOL(getrawmonotonic); + + +/** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) @@ -265,8 +303,6 @@ void __init timekeeping_init(void) static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; -/* xtime offset when we went into suspend */ -static s64 timekeeping_suspend_nsecs; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. @@ -292,8 +328,6 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic.tv_sec -= sleep_length; total_sleep_time += sleep_length; } - /* Make sure that we have the correct xtime reference */ - timespec_add_ns(&xtime, timekeeping_suspend_nsecs); update_xtime_cache(0); /* re-base the last cycle value */ clock->cycle_last = 0; @@ -319,8 +353,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) timekeeping_suspend_time = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); - /* Get the current xtime offset */ - timekeeping_suspend_nsecs = __get_nsec_offset(); + clocksource_forward_now(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -454,23 +487,29 @@ void update_wall_time(void) #else offset = clock->cycle_interval; #endif - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. */ while (offset >= clock->cycle_interval) { /* accumulate one interval */ - clock->xtime_nsec += clock->xtime_interval; - clock->cycle_last += clock->cycle_interval; offset -= clock->cycle_interval; + clock->cycle_last += clock->cycle_interval; + clock->xtime_nsec += clock->xtime_interval; if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; xtime.tv_sec++; second_overflow(); } + clock->raw_time.tv_nsec += clock->raw_interval; + if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { + clock->raw_time.tv_nsec -= NSEC_PER_SEC; + clock->raw_time.tv_sec++; + } + /* accumulate error between NTP and clock interval */ clock->error += tick_length; clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); @@ -479,9 +518,12 @@ void update_wall_time(void) /* correct the clock when NTP error is too big */ clocksource_adjust(offset); - /* store full nanoseconds into xtime */ - xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; + /* store full nanoseconds into xtime after rounding it up and + * add the remainder to the error difference. + */ + xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); update_xtime_cache(cyc2ns(clock, offset)); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index a40e20f..f642691 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -47,13 +47,14 @@ static void print_name_offset(struct seq_file *m, void *sym) } static void -print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) +print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, + int idx, u64 now) { #ifdef CONFIG_TIMER_STATS char tmp[TASK_COMM_LEN + 1]; #endif SEQ_printf(m, " #%d: ", idx); - print_name_offset(m, timer); + print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); SEQ_printf(m, ", S:%02lx", timer->state); @@ -99,7 +100,7 @@ next_one: tmp = *timer; spin_unlock_irqrestore(&base->cpu_base->lock, flags); - print_timer(m, &tmp, i, now); + print_timer(m, timer, &tmp, i, now); next++; goto next_one; } @@ -109,6 +110,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %Lu nsecs\n", @@ -183,12 +185,16 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #ifdef CONFIG_GENERIC_CLOCKEVENTS static void -print_tickdevice(struct seq_file *m, struct tick_device *td) +print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; SEQ_printf(m, "\n"); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); + if (cpu < 0) + SEQ_printf(m, "Broadcast device\n"); + else + SEQ_printf(m, "Per CPU device: %d\n", cpu); SEQ_printf(m, "Clock Event Device: "); if (!dev) { @@ -222,7 +228,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) int cpu; #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST - print_tickdevice(m, tick_get_broadcast_device()); + print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", tick_get_broadcast_mask()->bits[0]); #ifdef CONFIG_TICK_ONESHOT @@ -232,7 +238,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) SEQ_printf(m, "\n"); #endif for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu)); + print_tickdevice(m, tick_get_device(cpu), cpu); SEQ_printf(m, "\n"); } #else @@ -244,7 +250,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.3\n"); + SEQ_printf(m, "Timer List Version: v0.4\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); diff --git a/kernel/timer.c b/kernel/timer.c index 510fe69..56becf3 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1436,9 +1436,11 @@ static void __cpuinit migrate_timers(int cpu) BUG_ON(cpu_online(cpu)); old_base = per_cpu(tvec_bases, cpu); new_base = get_cpu_var(tvec_bases); - - local_irq_disable(); - spin_lock(&new_base->lock); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base->running_timer); @@ -1453,8 +1455,7 @@ static void __cpuinit migrate_timers(int cpu) } spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - local_irq_enable(); + spin_unlock_irq(&new_base->lock); put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 263e9e6..1cb3e1f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1,23 +1,37 @@ # # Architectures that offer an FTRACE implementation should select HAVE_FTRACE: # + +config NOP_TRACER + bool + config HAVE_FTRACE bool + select NOP_TRACER config HAVE_DYNAMIC_FTRACE bool +config HAVE_FTRACE_MCOUNT_RECORD + bool + config TRACER_MAX_TRACE bool +config RING_BUFFER + bool + config TRACING bool select DEBUG_FS + select RING_BUFFER select STACKTRACE + select TRACEPOINTS config FTRACE bool "Kernel Function Tracer" depends on HAVE_FTRACE + depends on DEBUG_KERNEL select FRAME_POINTER select TRACING select CONTEXT_SWITCH_TRACER @@ -36,6 +50,7 @@ config IRQSOFF_TRACER depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACE_IRQFLAGS select TRACING select TRACER_MAX_TRACE @@ -59,6 +74,7 @@ config PREEMPT_TRACER depends on GENERIC_TIME depends on PREEMPT depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACING select TRACER_MAX_TRACE help @@ -86,6 +102,7 @@ config SYSPROF_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE @@ -96,16 +113,56 @@ config SCHED_TRACER config CONTEXT_SWITCH_TRACER bool "Trace process context switches" depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACING select MARKERS help This tracer gets called from the context switch and records all switching of tasks. +config BOOT_TRACER + bool "Trace boot initcalls" + depends on HAVE_FTRACE + depends on DEBUG_KERNEL + select TRACING + help + This tracer helps developers to optimize boot times: it records + the timings of the initcalls and traces key events and the identity + of tasks that can cause boot delays, such as context-switches. + + Its aim is to be parsed by the /scripts/bootgraph.pl tool to + produce pretty graphics about boot inefficiencies, giving a visual + representation of the delays during initcalls - but the raw + /debug/tracing/trace text output is readable too. + + ( Note that tracing self tests can't be enabled if this tracer is + selected, because the self-tests are an initcall as well and that + would invalidate the boot trace. ) + +config STACK_TRACER + bool "Trace max stack" + depends on HAVE_FTRACE + depends on DEBUG_KERNEL + select FTRACE + select STACKTRACE + help + This special tracer records the maximum stack footprint of the + kernel and displays it in debugfs/tracing/stack_trace. + + This tracer works by hooking into every function call that the + kernel executes, and keeping a maximum stack depth value and + stack-trace saved. Because this logic has to execute in every + kernel function, all the time, this option can slow down the + kernel measurably and is generally intended for kernel + developers only. + + Say N if unsure. + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FTRACE depends on HAVE_DYNAMIC_FTRACE + depends on DEBUG_KERNEL default y help This option will modify all the calls to ftrace dynamically @@ -121,12 +178,17 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. +config FTRACE_MCOUNT_RECORD + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_FTRACE_MCOUNT_RECORD + config FTRACE_SELFTEST bool config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING + depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. On bootup diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 71d17de..a85dfba 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o endif obj-$(CONFIG_FTRACE) += libftrace.o +obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o @@ -19,6 +20,9 @@ obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_NOP_TRACER) += trace_nop.o +obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o +obj-$(CONFIG_BOOT_TRACER) += trace_boot.o libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f6e3af3..4dda4f6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -81,7 +81,7 @@ void clear_ftrace_function(void) static int __register_ftrace_function(struct ftrace_ops *ops) { - /* Should never be called by interrupts */ + /* should not be called from interrupt context */ spin_lock(&ftrace_lock); ops->next = ftrace_list; @@ -115,6 +115,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) struct ftrace_ops **p; int ret = 0; + /* should not be called from interrupt context */ spin_lock(&ftrace_lock); /* @@ -153,6 +154,30 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE +#ifndef CONFIG_FTRACE_MCOUNT_RECORD +/* + * The hash lock is only needed when the recording of the mcount + * callers are dynamic. That is, by the caller themselves and + * not recorded via the compilation. + */ +static DEFINE_SPINLOCK(ftrace_hash_lock); +#define ftrace_hash_lock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags) +#define ftrace_hash_unlock(flags) \ + spin_unlock_irqrestore(&ftrace_hash_lock, flags) +#else +/* This is protected via the ftrace_lock with MCOUNT_RECORD. */ +#define ftrace_hash_lock(flags) do { (void)(flags); } while (0) +#define ftrace_hash_unlock(flags) do { } while(0) +#endif + +/* + * Since MCOUNT_ADDR may point to mcount itself, we do not want + * to get it confused by reading a reference in the code as we + * are parsing on objcopy output of text. Use a variable for + * it instead. + */ +static unsigned long mcount_addr = MCOUNT_ADDR; + static struct task_struct *ftraced_task; enum { @@ -171,7 +196,6 @@ static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); -static DEFINE_SPINLOCK(ftrace_shutdown_lock); static DEFINE_MUTEX(ftraced_lock); static DEFINE_MUTEX(ftrace_regex_lock); @@ -294,13 +318,37 @@ static inline void ftrace_del_hash(struct dyn_ftrace *node) static void ftrace_free_rec(struct dyn_ftrace *rec) { - /* no locking, only called from kstop_machine */ - rec->ip = (unsigned long)ftrace_free_records; ftrace_free_records = rec; rec->flags |= FTRACE_FL_FREE; } +void ftrace_release(void *start, unsigned long size) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long s = (unsigned long)start; + unsigned long e = s + size; + int i; + + if (ftrace_disabled || !start) + return; + + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + if ((rec->ip >= s) && (rec->ip < e)) + ftrace_free_rec(rec); + } + } + spin_unlock(&ftrace_lock); + +} + static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { struct dyn_ftrace *rec; @@ -338,7 +386,6 @@ ftrace_record_ip(unsigned long ip) unsigned long flags; unsigned long key; int resched; - int atomic; int cpu; if (!ftrace_enabled || ftrace_disabled) @@ -368,9 +415,7 @@ ftrace_record_ip(unsigned long ip) if (ftrace_ip_in_hash(ip, key)) goto out; - atomic = irqs_disabled(); - - spin_lock_irqsave(&ftrace_shutdown_lock, flags); + ftrace_hash_lock(flags); /* This ip may have hit the hash before the lock */ if (ftrace_ip_in_hash(ip, key)) @@ -387,7 +432,7 @@ ftrace_record_ip(unsigned long ip) ftraced_trigger = 1; out_unlock: - spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); + ftrace_hash_unlock(flags); out: per_cpu(ftrace_shutdown_disable_cpu, cpu)--; @@ -531,6 +576,16 @@ static void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } +static void print_ip_ins(const char *fmt, unsigned char *p) +{ + int i; + + printk(KERN_CONT "%s", fmt); + + for (i = 0; i < MCOUNT_INSN_SIZE; i++) + printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); +} + static int ftrace_code_disable(struct dyn_ftrace *rec) { @@ -541,10 +596,27 @@ ftrace_code_disable(struct dyn_ftrace *rec) ip = rec->ip; nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, MCOUNT_ADDR); + call = ftrace_call_replace(ip, mcount_addr); failed = ftrace_modify_code(ip, call, nop); if (failed) { + switch (failed) { + case 1: + WARN_ON_ONCE(1); + pr_info("ftrace faulted on modifying "); + print_ip_sym(ip); + break; + case 2: + WARN_ON_ONCE(1); + pr_info("ftrace failed to modify "); + print_ip_sym(ip); + print_ip_ins(" expected: ", call); + print_ip_ins(" actual: ", (unsigned char *)ip); + print_ip_ins(" replace: ", nop); + printk(KERN_CONT "\n"); + break; + } + rec->flags |= FTRACE_FL_FAILED; return 0; } @@ -792,47 +864,7 @@ static int ftrace_update_code(void) return 1; } -static int ftraced(void *ignore) -{ - unsigned long usecs; - - while (!kthread_should_stop()) { - - set_current_state(TASK_INTERRUPTIBLE); - - /* check once a second */ - schedule_timeout(HZ); - - if (unlikely(ftrace_disabled)) - continue; - - mutex_lock(&ftrace_sysctl_lock); - mutex_lock(&ftraced_lock); - if (!ftraced_suspend && !ftraced_stop && - ftrace_update_code()) { - usecs = nsecs_to_usecs(ftrace_update_time); - if (ftrace_update_tot_cnt > 100000) { - ftrace_update_tot_cnt = 0; - pr_info("hm, dftrace overflow: %lu change%s" - " (%lu total) in %lu usec%s\n", - ftrace_update_cnt, - ftrace_update_cnt != 1 ? "s" : "", - ftrace_update_tot_cnt, - usecs, usecs != 1 ? "s" : ""); - ftrace_disabled = 1; - WARN_ON_ONCE(1); - } - } - mutex_unlock(&ftraced_lock); - mutex_unlock(&ftrace_sysctl_lock); - - ftrace_shutdown_replenish(); - } - __set_current_state(TASK_RUNNING); - return 0; -} - -static int __init ftrace_dyn_table_alloc(void) +static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) { struct ftrace_page *pg; int cnt; @@ -859,7 +891,9 @@ static int __init ftrace_dyn_table_alloc(void) pg = ftrace_pages = ftrace_pages_start; - cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + cnt = num_to_init / ENTRIES_PER_PAGE; + pr_info("ftrace: allocating %ld hash entries in %d pages\n", + num_to_init, cnt); for (i = 0; i < cnt; i++) { pg->next = (void *)get_zeroed_page(GFP_KERNEL); @@ -901,6 +935,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); retry: if (iter->idx >= iter->pg->index) { if (iter->pg->next) { @@ -910,15 +946,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } else { rec = &iter->pg->records[iter->idx++]; - if ((!(iter->flags & FTRACE_ITER_FAILURES) && + if ((rec->flags & FTRACE_FL_FREE) || + + (!(iter->flags & FTRACE_ITER_FAILURES) && (rec->flags & FTRACE_FL_FAILED)) || ((iter->flags & FTRACE_ITER_FAILURES) && - (!(rec->flags & FTRACE_FL_FAILED) || - (rec->flags & FTRACE_FL_FREE))) || - - ((iter->flags & FTRACE_ITER_FILTER) && - !(rec->flags & FTRACE_FL_FILTER)) || + !(rec->flags & FTRACE_FL_FAILED)) || ((iter->flags & FTRACE_ITER_NOTRACE) && !(rec->flags & FTRACE_FL_NOTRACE))) { @@ -926,6 +960,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) goto retry; } } + spin_unlock(&ftrace_lock); iter->pos = *pos; @@ -1039,8 +1074,8 @@ static void ftrace_filter_reset(int enable) unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i; - /* keep kstop machine from running */ - preempt_disable(); + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); if (enable) ftrace_filtered = 0; pg = ftrace_pages_start; @@ -1053,7 +1088,7 @@ static void ftrace_filter_reset(int enable) } pg = pg->next; } - preempt_enable(); + spin_unlock(&ftrace_lock); } static int @@ -1165,8 +1200,8 @@ ftrace_match(unsigned char *buff, int len, int enable) } } - /* keep kstop machine from running */ - preempt_disable(); + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); if (enable) ftrace_filtered = 1; pg = ftrace_pages_start; @@ -1203,7 +1238,7 @@ ftrace_match(unsigned char *buff, int len, int enable) } pg = pg->next; } - preempt_enable(); + spin_unlock(&ftrace_lock); } static ssize_t @@ -1556,6 +1591,114 @@ static __init int ftrace_init_debugfs(void) fs_initcall(ftrace_init_debugfs); +#ifdef CONFIG_FTRACE_MCOUNT_RECORD +static int ftrace_convert_nops(unsigned long *start, + unsigned long *end) +{ + unsigned long *p; + unsigned long addr; + unsigned long flags; + + p = start; + while (p < end) { + addr = ftrace_call_adjust(*p++); + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + ftrace_record_ip(addr); + spin_unlock(&ftrace_lock); + ftrace_shutdown_replenish(); + } + + /* p is ignored */ + local_irq_save(flags); + __ftrace_update_code(p); + local_irq_restore(flags); + + return 0; +} + +void ftrace_init_module(unsigned long *start, unsigned long *end) +{ + if (ftrace_disabled || start == end) + return; + ftrace_convert_nops(start, end); +} + +extern unsigned long __start_mcount_loc[]; +extern unsigned long __stop_mcount_loc[]; + +void __init ftrace_init(void) +{ + unsigned long count, addr, flags; + int ret; + + /* Keep the ftrace pointer to the stub */ + addr = (unsigned long)ftrace_stub; + + local_irq_save(flags); + ftrace_dyn_arch_init(&addr); + local_irq_restore(flags); + + /* ftrace_dyn_arch_init places the return code in addr */ + if (addr) + goto failed; + + count = __stop_mcount_loc - __start_mcount_loc; + + ret = ftrace_dyn_table_alloc(count); + if (ret) + goto failed; + + last_ftrace_enabled = ftrace_enabled = 1; + + ret = ftrace_convert_nops(__start_mcount_loc, + __stop_mcount_loc); + + return; + failed: + ftrace_disabled = 1; +} +#else /* CONFIG_FTRACE_MCOUNT_RECORD */ +static int ftraced(void *ignore) +{ + unsigned long usecs; + + while (!kthread_should_stop()) { + + set_current_state(TASK_INTERRUPTIBLE); + + /* check once a second */ + schedule_timeout(HZ); + + if (unlikely(ftrace_disabled)) + continue; + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + if (!ftraced_suspend && !ftraced_stop && + ftrace_update_code()) { + usecs = nsecs_to_usecs(ftrace_update_time); + if (ftrace_update_tot_cnt > 100000) { + ftrace_update_tot_cnt = 0; + pr_info("hm, dftrace overflow: %lu change%s" + " (%lu total) in %lu usec%s\n", + ftrace_update_cnt, + ftrace_update_cnt != 1 ? "s" : "", + ftrace_update_tot_cnt, + usecs, usecs != 1 ? "s" : ""); + ftrace_disabled = 1; + WARN_ON_ONCE(1); + } + } + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + ftrace_shutdown_replenish(); + } + __set_current_state(TASK_RUNNING); + return 0; +} + static int __init ftrace_dynamic_init(void) { struct task_struct *p; @@ -1572,7 +1715,7 @@ static int __init ftrace_dynamic_init(void) goto failed; } - ret = ftrace_dyn_table_alloc(); + ret = ftrace_dyn_table_alloc(NR_TO_INIT); if (ret) goto failed; @@ -1593,6 +1736,8 @@ static int __init ftrace_dynamic_init(void) } core_initcall(ftrace_dynamic_init); +#endif /* CONFIG_FTRACE_MCOUNT_RECORD */ + #else # define ftrace_startup() do { } while (0) # define ftrace_shutdown() do { } while (0) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c new file mode 100644 index 0000000..94af1fe --- /dev/null +++ b/kernel/trace/ring_buffer.c @@ -0,0 +1,2014 @@ +/* + * Generic ring buffer + * + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/ring_buffer.h> +#include <linux/spinlock.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/mutex.h> +#include <linux/sched.h> /* used for sched_clock() (for now) */ +#include <linux/init.h> +#include <linux/hash.h> +#include <linux/list.h> +#include <linux/fs.h> + +/* Up this if you want to test the TIME_EXTENTS and normalization */ +#define DEBUG_SHIFT 0 + +/* FIXME!!! */ +u64 ring_buffer_time_stamp(int cpu) +{ + /* shift to debug/test normalization and TIME_EXTENTS */ + return sched_clock() << DEBUG_SHIFT; +} + +void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) +{ + /* Just stupid testing the normalize function and deltas */ + *ts >>= DEBUG_SHIFT; +} + +#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) +#define RB_ALIGNMENT_SHIFT 2 +#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT) +#define RB_MAX_SMALL_DATA 28 + +enum { + RB_LEN_TIME_EXTEND = 8, + RB_LEN_TIME_STAMP = 16, +}; + +/* inline for ring buffer fast paths */ +static inline unsigned +rb_event_length(struct ring_buffer_event *event) +{ + unsigned length; + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + /* undefined */ + return -1; + + case RINGBUF_TYPE_TIME_EXTEND: + return RB_LEN_TIME_EXTEND; + + case RINGBUF_TYPE_TIME_STAMP: + return RB_LEN_TIME_STAMP; + + case RINGBUF_TYPE_DATA: + if (event->len) + length = event->len << RB_ALIGNMENT_SHIFT; + else + length = event->array[0]; + return length + RB_EVNT_HDR_SIZE; + default: + BUG(); + } + /* not hit */ + return 0; +} + +/** + * ring_buffer_event_length - return the length of the event + * @event: the event to get the length of + */ +unsigned ring_buffer_event_length(struct ring_buffer_event *event) +{ + return rb_event_length(event); +} + +/* inline for ring buffer fast paths */ +static inline void * +rb_event_data(struct ring_buffer_event *event) +{ + BUG_ON(event->type != RINGBUF_TYPE_DATA); + /* If length is in len field, then array[0] has the data */ + if (event->len) + return (void *)&event->array[0]; + /* Otherwise length is in array[0] and array[1] has the data */ + return (void *)&event->array[1]; +} + +/** + * ring_buffer_event_data - return the data of the event + * @event: the event to get the data from + */ +void *ring_buffer_event_data(struct ring_buffer_event *event) +{ + return rb_event_data(event); +} + +#define for_each_buffer_cpu(buffer, cpu) \ + for_each_cpu_mask(cpu, buffer->cpumask) + +#define TS_SHIFT 27 +#define TS_MASK ((1ULL << TS_SHIFT) - 1) +#define TS_DELTA_TEST (~TS_MASK) + +/* + * This hack stolen from mm/slob.c. + * We can store per page timing information in the page frame of the page. + * Thanks to Peter Zijlstra for suggesting this idea. + */ +struct buffer_page { + u64 time_stamp; /* page time stamp */ + local_t write; /* index for next write */ + local_t commit; /* write commited index */ + unsigned read; /* index for next read */ + struct list_head list; /* list of free pages */ + void *page; /* Actual data page */ +}; + +/* + * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing + * this issue out. + */ +static inline void free_buffer_page(struct buffer_page *bpage) +{ + if (bpage->page) + __free_page(bpage->page); + kfree(bpage); +} + +/* + * We need to fit the time_stamp delta into 27 bits. + */ +static inline int test_time_stamp(u64 delta) +{ + if (delta & TS_DELTA_TEST) + return 1; + return 0; +} + +#define BUF_PAGE_SIZE PAGE_SIZE + +/* + * head_page == tail_page && head == tail then buffer is empty. + */ +struct ring_buffer_per_cpu { + int cpu; + struct ring_buffer *buffer; + spinlock_t lock; + struct lock_class_key lock_key; + struct list_head pages; + struct buffer_page *head_page; /* read from head */ + struct buffer_page *tail_page; /* write to tail */ + struct buffer_page *commit_page; /* commited pages */ + struct buffer_page *reader_page; + unsigned long overrun; + unsigned long entries; + u64 write_stamp; + u64 read_stamp; + atomic_t record_disabled; +}; + +struct ring_buffer { + unsigned long size; + unsigned pages; + unsigned flags; + int cpus; + cpumask_t cpumask; + atomic_t record_disabled; + + struct mutex mutex; + + struct ring_buffer_per_cpu **buffers; +}; + +struct ring_buffer_iter { + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long head; + struct buffer_page *head_page; + u64 read_stamp; +}; + +#define RB_WARN_ON(buffer, cond) \ + do { \ + if (unlikely(cond)) { \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + } \ + } while (0) + +#define RB_WARN_ON_RET(buffer, cond) \ + do { \ + if (unlikely(cond)) { \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + return -1; \ + } \ + } while (0) + +#define RB_WARN_ON_ONCE(buffer, cond) \ + do { \ + static int once; \ + if (unlikely(cond) && !once) { \ + once++; \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + } \ + } while (0) + +/** + * check_pages - integrity check of buffer pages + * @cpu_buffer: CPU buffer with pages to test + * + * As a safty measure we check to make sure the data pages have not + * been corrupted. + */ +static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *head = &cpu_buffer->pages; + struct buffer_page *page, *tmp; + + RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); + RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); + + list_for_each_entry_safe(page, tmp, head, list) { + RB_WARN_ON_RET(cpu_buffer, + page->list.next->prev != &page->list); + RB_WARN_ON_RET(cpu_buffer, + page->list.prev->next != &page->list); + } + + return 0; +} + +static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, + unsigned nr_pages) +{ + struct list_head *head = &cpu_buffer->pages; + struct buffer_page *page, *tmp; + unsigned long addr; + LIST_HEAD(pages); + unsigned i; + + for (i = 0; i < nr_pages; i++) { + page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); + if (!page) + goto free_pages; + list_add(&page->list, &pages); + + addr = __get_free_page(GFP_KERNEL); + if (!addr) + goto free_pages; + page->page = (void *)addr; + } + + list_splice(&pages, head); + + rb_check_pages(cpu_buffer); + + return 0; + + free_pages: + list_for_each_entry_safe(page, tmp, &pages, list) { + list_del_init(&page->list); + free_buffer_page(page); + } + return -ENOMEM; +} + +static struct ring_buffer_per_cpu * +rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *page; + unsigned long addr; + int ret; + + cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!cpu_buffer) + return NULL; + + cpu_buffer->cpu = cpu; + cpu_buffer->buffer = buffer; + spin_lock_init(&cpu_buffer->lock); + INIT_LIST_HEAD(&cpu_buffer->pages); + + page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!page) + goto fail_free_buffer; + + cpu_buffer->reader_page = page; + addr = __get_free_page(GFP_KERNEL); + if (!addr) + goto fail_free_reader; + page->page = (void *)addr; + + INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + + ret = rb_allocate_pages(cpu_buffer, buffer->pages); + if (ret < 0) + goto fail_free_reader; + + cpu_buffer->head_page + = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + + return cpu_buffer; + + fail_free_reader: + free_buffer_page(cpu_buffer->reader_page); + + fail_free_buffer: + kfree(cpu_buffer); + return NULL; +} + +static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *head = &cpu_buffer->pages; + struct buffer_page *page, *tmp; + + list_del_init(&cpu_buffer->reader_page->list); + free_buffer_page(cpu_buffer->reader_page); + + list_for_each_entry_safe(page, tmp, head, list) { + list_del_init(&page->list); + free_buffer_page(page); + } + kfree(cpu_buffer); +} + +/* + * Causes compile errors if the struct buffer_page gets bigger + * than the struct page. + */ +extern int ring_buffer_page_too_big(void); + +/** + * ring_buffer_alloc - allocate a new ring_buffer + * @size: the size in bytes that is needed. + * @flags: attributes to set for the ring buffer. + * + * Currently the only flag that is available is the RB_FL_OVERWRITE + * flag. This flag means that the buffer will overwrite old data + * when the buffer wraps. If this flag is not set, the buffer will + * drop data when the tail hits the head. + */ +struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) +{ + struct ring_buffer *buffer; + int bsize; + int cpu; + + /* Paranoid! Optimizes out when all is well */ + if (sizeof(struct buffer_page) > sizeof(struct page)) + ring_buffer_page_too_big(); + + + /* keep it in its own cache line */ + buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), + GFP_KERNEL); + if (!buffer) + return NULL; + + buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + buffer->flags = flags; + + /* need at least two pages */ + if (buffer->pages == 1) + buffer->pages++; + + buffer->cpumask = cpu_possible_map; + buffer->cpus = nr_cpu_ids; + + bsize = sizeof(void *) * nr_cpu_ids; + buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), + GFP_KERNEL); + if (!buffer->buffers) + goto fail_free_buffer; + + for_each_buffer_cpu(buffer, cpu) { + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, cpu); + if (!buffer->buffers[cpu]) + goto fail_free_buffers; + } + + mutex_init(&buffer->mutex); + + return buffer; + + fail_free_buffers: + for_each_buffer_cpu(buffer, cpu) { + if (buffer->buffers[cpu]) + rb_free_cpu_buffer(buffer->buffers[cpu]); + } + kfree(buffer->buffers); + + fail_free_buffer: + kfree(buffer); + return NULL; +} + +/** + * ring_buffer_free - free a ring buffer. + * @buffer: the buffer to free. + */ +void +ring_buffer_free(struct ring_buffer *buffer) +{ + int cpu; + + for_each_buffer_cpu(buffer, cpu) + rb_free_cpu_buffer(buffer->buffers[cpu]); + + kfree(buffer); +} + +static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); + +static void +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) +{ + struct buffer_page *page; + struct list_head *p; + unsigned i; + + atomic_inc(&cpu_buffer->record_disabled); + synchronize_sched(); + + for (i = 0; i < nr_pages; i++) { + BUG_ON(list_empty(&cpu_buffer->pages)); + p = cpu_buffer->pages.next; + page = list_entry(p, struct buffer_page, list); + list_del_init(&page->list); + free_buffer_page(page); + } + BUG_ON(list_empty(&cpu_buffer->pages)); + + rb_reset_cpu(cpu_buffer); + + rb_check_pages(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); + +} + +static void +rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *pages, unsigned nr_pages) +{ + struct buffer_page *page; + struct list_head *p; + unsigned i; + + atomic_inc(&cpu_buffer->record_disabled); + synchronize_sched(); + + for (i = 0; i < nr_pages; i++) { + BUG_ON(list_empty(pages)); + p = pages->next; + page = list_entry(p, struct buffer_page, list); + list_del_init(&page->list); + list_add_tail(&page->list, &cpu_buffer->pages); + } + rb_reset_cpu(cpu_buffer); + + rb_check_pages(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); +} + +/** + * ring_buffer_resize - resize the ring buffer + * @buffer: the buffer to resize. + * @size: the new size. + * + * The tracer is responsible for making sure that the buffer is + * not being used while changing the size. + * Note: We may be able to change the above requirement by using + * RCU synchronizations. + * + * Minimum size is 2 * BUF_PAGE_SIZE. + * + * Returns -1 on failure. + */ +int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned nr_pages, rm_pages, new_pages; + struct buffer_page *page, *tmp; + unsigned long buffer_size; + unsigned long addr; + LIST_HEAD(pages); + int i, cpu; + + size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + size *= BUF_PAGE_SIZE; + buffer_size = buffer->pages * BUF_PAGE_SIZE; + + /* we need a minimum of two pages */ + if (size < BUF_PAGE_SIZE * 2) + size = BUF_PAGE_SIZE * 2; + + if (size == buffer_size) + return size; + + mutex_lock(&buffer->mutex); + + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + + if (size < buffer_size) { + + /* easy case, just free pages */ + BUG_ON(nr_pages >= buffer->pages); + + rm_pages = buffer->pages - nr_pages; + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + rb_remove_pages(cpu_buffer, rm_pages); + } + goto out; + } + + /* + * This is a bit more difficult. We only want to add pages + * when we can allocate enough for all CPUs. We do this + * by allocating all the pages and storing them on a local + * link list. If we succeed in our allocation, then we + * add these pages to the cpu_buffers. Otherwise we just free + * them all and return -ENOMEM; + */ + BUG_ON(nr_pages <= buffer->pages); + new_pages = nr_pages - buffer->pages; + + for_each_buffer_cpu(buffer, cpu) { + for (i = 0; i < new_pages; i++) { + page = kzalloc_node(ALIGN(sizeof(*page), + cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!page) + goto free_pages; + list_add(&page->list, &pages); + addr = __get_free_page(GFP_KERNEL); + if (!addr) + goto free_pages; + page->page = (void *)addr; + } + } + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + rb_insert_pages(cpu_buffer, &pages, new_pages); + } + + BUG_ON(!list_empty(&pages)); + + out: + buffer->pages = nr_pages; + mutex_unlock(&buffer->mutex); + + return size; + + free_pages: + list_for_each_entry_safe(page, tmp, &pages, list) { + list_del_init(&page->list); + free_buffer_page(page); + } + return -ENOMEM; +} + +static inline int rb_null_event(struct ring_buffer_event *event) +{ + return event->type == RINGBUF_TYPE_PADDING; +} + +static inline void *__rb_page_index(struct buffer_page *page, unsigned index) +{ + return page->page + index; +} + +static inline struct ring_buffer_event * +rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) +{ + return __rb_page_index(cpu_buffer->reader_page, + cpu_buffer->reader_page->read); +} + +static inline struct ring_buffer_event * +rb_head_event(struct ring_buffer_per_cpu *cpu_buffer) +{ + return __rb_page_index(cpu_buffer->head_page, + cpu_buffer->head_page->read); +} + +static inline struct ring_buffer_event * +rb_iter_head_event(struct ring_buffer_iter *iter) +{ + return __rb_page_index(iter->head_page, iter->head); +} + +static inline unsigned rb_page_write(struct buffer_page *bpage) +{ + return local_read(&bpage->write); +} + +static inline unsigned rb_page_commit(struct buffer_page *bpage) +{ + return local_read(&bpage->commit); +} + +/* Size is determined by what has been commited */ +static inline unsigned rb_page_size(struct buffer_page *bpage) +{ + return rb_page_commit(bpage); +} + +static inline unsigned +rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) +{ + return rb_page_commit(cpu_buffer->commit_page); +} + +static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) +{ + return rb_page_commit(cpu_buffer->head_page); +} + +/* + * When the tail hits the head and the buffer is in overwrite mode, + * the head jumps to the next page and all content on the previous + * page is discarded. But before doing so, we update the overrun + * variable of the buffer. + */ +static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct ring_buffer_event *event; + unsigned long head; + + for (head = 0; head < rb_head_size(cpu_buffer); + head += rb_event_length(event)) { + + event = __rb_page_index(cpu_buffer->head_page, head); + BUG_ON(rb_null_event(event)); + /* Only count data entries */ + if (event->type != RINGBUF_TYPE_DATA) + continue; + cpu_buffer->overrun++; + cpu_buffer->entries--; + } +} + +static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page **page) +{ + struct list_head *p = (*page)->list.next; + + if (p == &cpu_buffer->pages) + p = p->next; + + *page = list_entry(p, struct buffer_page, list); +} + +static inline unsigned +rb_event_index(struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + + return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); +} + +static inline int +rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; + + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static inline void +rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; + + index = rb_event_index(event); + addr &= PAGE_MASK; + + while (cpu_buffer->commit_page->page != (void *)addr) { + RB_WARN_ON(cpu_buffer, + cpu_buffer->commit_page == cpu_buffer->tail_page); + cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->write; + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + } + + /* Now set the commit to the event's index */ + local_set(&cpu_buffer->commit_page->commit, index); +} + +static inline void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ + /* + * We only race with interrupts and NMIs on this CPU. + * If we own the commit event, then we can commit + * all others that interrupted us, since the interruptions + * are in stack format (they finish before they come + * back to us). This allows us to do a simple loop to + * assign the commit to the tail. + */ + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->write; + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + /* add barrier to keep gcc from optimizing too much */ + barrier(); + } + while (rb_commit_index(cpu_buffer) != + rb_page_write(cpu_buffer->commit_page)) { + cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->write; + barrier(); + } +} + +static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp; + cpu_buffer->reader_page->read = 0; +} + +static inline void rb_inc_iter(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + + /* + * The iterator could be on the reader page (it starts there). + * But the head could have moved, since the reader was + * found. Check for this case and assign the iterator + * to the head page instead of next. + */ + if (iter->head_page == cpu_buffer->reader_page) + iter->head_page = cpu_buffer->head_page; + else + rb_inc_page(cpu_buffer, &iter->head_page); + + iter->read_stamp = iter->head_page->time_stamp; + iter->head = 0; +} + +/** + * ring_buffer_update_event - update event type and data + * @event: the even to update + * @type: the type of event + * @length: the size of the event field in the ring buffer + * + * Update the type and data fields of the event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. + */ +static inline void +rb_update_event(struct ring_buffer_event *event, + unsigned type, unsigned length) +{ + event->type = type; + + switch (type) { + + case RINGBUF_TYPE_PADDING: + break; + + case RINGBUF_TYPE_TIME_EXTEND: + event->len = + (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1)) + >> RB_ALIGNMENT_SHIFT; + break; + + case RINGBUF_TYPE_TIME_STAMP: + event->len = + (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1)) + >> RB_ALIGNMENT_SHIFT; + break; + + case RINGBUF_TYPE_DATA: + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA) { + event->len = 0; + event->array[0] = length; + } else + event->len = + (length + (RB_ALIGNMENT-1)) + >> RB_ALIGNMENT_SHIFT; + break; + default: + BUG(); + } +} + +static inline unsigned rb_calculate_event_length(unsigned length) +{ + struct ring_buffer_event event; /* Used only for sizeof array */ + + /* zero length can cause confusions */ + if (!length) + length = 1; + + if (length > RB_MAX_SMALL_DATA) + length += sizeof(event.array[0]); + + length += RB_EVNT_HDR_SIZE; + length = ALIGN(length, RB_ALIGNMENT); + + return length; +} + +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + unsigned type, unsigned long length, u64 *ts) +{ + struct buffer_page *tail_page, *head_page, *reader_page; + unsigned long tail, write; + struct ring_buffer *buffer = cpu_buffer->buffer; + struct ring_buffer_event *event; + unsigned long flags; + + tail_page = cpu_buffer->tail_page; + write = local_add_return(length, &tail_page->write); + tail = write - length; + + /* See if we shot pass the end of this buffer page */ + if (write > BUF_PAGE_SIZE) { + struct buffer_page *next_page = tail_page; + + spin_lock_irqsave(&cpu_buffer->lock, flags); + + rb_inc_page(cpu_buffer, &next_page); + + head_page = cpu_buffer->head_page; + reader_page = cpu_buffer->reader_page; + + /* we grabbed the lock before incrementing */ + RB_WARN_ON(cpu_buffer, next_page == reader_page); + + /* + * If for some reason, we had an interrupt storm that made + * it all the way around the buffer, bail, and warn + * about it. + */ + if (unlikely(next_page == cpu_buffer->commit_page)) { + WARN_ON_ONCE(1); + goto out_unlock; + } + + if (next_page == head_page) { + if (!(buffer->flags & RB_FL_OVERWRITE)) { + /* reset write */ + if (tail <= BUF_PAGE_SIZE) + local_set(&tail_page->write, tail); + goto out_unlock; + } + + /* tail_page has not moved yet? */ + if (tail_page == cpu_buffer->tail_page) { + /* count overflows */ + rb_update_overflow(cpu_buffer); + + rb_inc_page(cpu_buffer, &head_page); + cpu_buffer->head_page = head_page; + cpu_buffer->head_page->read = 0; + } + } + + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + local_set(&next_page->write, 0); + local_set(&next_page->commit, 0); + cpu_buffer->tail_page = next_page; + + /* reread the time stamp */ + *ts = ring_buffer_time_stamp(cpu_buffer->cpu); + cpu_buffer->tail_page->time_stamp = *ts; + } + + /* + * The actual tail page has moved forward. + */ + if (tail < BUF_PAGE_SIZE) { + /* Mark the rest of the page with padding */ + event = __rb_page_index(tail_page, tail); + event->type = RINGBUF_TYPE_PADDING; + } + + if (tail <= BUF_PAGE_SIZE) + /* Set the write back to the previous setting */ + local_set(&tail_page->write, tail); + + /* + * If this was a commit entry that failed, + * increment that too + */ + if (tail_page == cpu_buffer->commit_page && + tail == rb_commit_index(cpu_buffer)) { + rb_set_commit_to_write(cpu_buffer); + } + + spin_unlock_irqrestore(&cpu_buffer->lock, flags); + + /* fail and let the caller try again */ + return ERR_PTR(-EAGAIN); + } + + /* We reserved something on the buffer */ + + BUG_ON(write > BUF_PAGE_SIZE); + + event = __rb_page_index(tail_page, tail); + rb_update_event(event, type, length); + + /* + * If this is a commit and the tail is zero, then update + * this page's time stamp. + */ + if (!tail && rb_is_commit(cpu_buffer, event)) + cpu_buffer->commit_page->time_stamp = *ts; + + return event; + + out_unlock: + spin_unlock_irqrestore(&cpu_buffer->lock, flags); + return NULL; +} + +static int +rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, + u64 *ts, u64 *delta) +{ + struct ring_buffer_event *event; + static int once; + int ret; + + if (unlikely(*delta > (1ULL << 59) && !once++)) { + printk(KERN_WARNING "Delta way too big! %llu" + " ts=%llu write stamp = %llu\n", + *delta, *ts, cpu_buffer->write_stamp); + WARN_ON(1); + } + + /* + * The delta is too big, we to add a + * new timestamp. + */ + event = __rb_reserve_next(cpu_buffer, + RINGBUF_TYPE_TIME_EXTEND, + RB_LEN_TIME_EXTEND, + ts); + if (!event) + return -EBUSY; + + if (PTR_ERR(event) == -EAGAIN) + return -EAGAIN; + + /* Only a commited time event can update the write stamp */ + if (rb_is_commit(cpu_buffer, event)) { + /* + * If this is the first on the page, then we need to + * update the page itself, and just put in a zero. + */ + if (rb_event_index(event)) { + event->time_delta = *delta & TS_MASK; + event->array[0] = *delta >> TS_SHIFT; + } else { + cpu_buffer->commit_page->time_stamp = *ts; + event->time_delta = 0; + event->array[0] = 0; + } + cpu_buffer->write_stamp = *ts; + /* let the caller know this was the commit */ + ret = 1; + } else { + /* Darn, this is just wasted space */ + event->time_delta = 0; + event->array[0] = 0; + ret = 0; + } + + *delta = 0; + + return ret; +} + +static struct ring_buffer_event * +rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, + unsigned type, unsigned long length) +{ + struct ring_buffer_event *event; + u64 ts, delta; + int commit = 0; + + again: + ts = ring_buffer_time_stamp(cpu_buffer->cpu); + + /* + * Only the first commit can update the timestamp. + * Yes there is a race here. If an interrupt comes in + * just after the conditional and it traces too, then it + * will also check the deltas. More than one timestamp may + * also be made. But only the entry that did the actual + * commit will be something other than zero. + */ + if (cpu_buffer->tail_page == cpu_buffer->commit_page && + rb_page_write(cpu_buffer->tail_page) == + rb_commit_index(cpu_buffer)) { + + delta = ts - cpu_buffer->write_stamp; + + /* make sure this delta is calculated here */ + barrier(); + + /* Did the write stamp get updated already? */ + if (unlikely(ts < cpu_buffer->write_stamp)) + goto again; + + if (test_time_stamp(delta)) { + + commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); + + if (commit == -EBUSY) + return NULL; + + if (commit == -EAGAIN) + goto again; + + RB_WARN_ON(cpu_buffer, commit < 0); + } + } else + /* Non commits have zero deltas */ + delta = 0; + + event = __rb_reserve_next(cpu_buffer, type, length, &ts); + if (PTR_ERR(event) == -EAGAIN) + goto again; + + if (!event) { + if (unlikely(commit)) + /* + * Ouch! We needed a timestamp and it was commited. But + * we didn't get our event reserved. + */ + rb_set_commit_to_write(cpu_buffer); + return NULL; + } + + /* + * If the timestamp was commited, make the commit our entry + * now so that we will update it when needed. + */ + if (commit) + rb_set_commit_event(cpu_buffer, event); + else if (!rb_is_commit(cpu_buffer, event)) + delta = 0; + + event->time_delta = delta; + + return event; +} + +static DEFINE_PER_CPU(int, rb_need_resched); + +/** + * ring_buffer_lock_reserve - reserve a part of the buffer + * @buffer: the ring buffer to reserve from + * @length: the length of the data to reserve (excluding event header) + * @flags: a pointer to save the interrupt flags + * + * Returns a reseverd event on the ring buffer to copy directly to. + * The user of this interface will need to get the body to write into + * and can use the ring_buffer_event_data() interface. + * + * The length is the length of the data needed, not the event length + * which also includes the event header. + * + * Must be paired with ring_buffer_unlock_commit, unless NULL is returned. + * If NULL is returned, then nothing has been allocated or locked. + */ +struct ring_buffer_event * +ring_buffer_lock_reserve(struct ring_buffer *buffer, + unsigned long length, + unsigned long *flags) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + int cpu, resched; + + if (atomic_read(&buffer->record_disabled)) + return NULL; + + /* If we are tracing schedule, we don't want to recurse */ + resched = need_resched(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + + if (!cpu_isset(cpu, buffer->cpumask)) + goto out; + + cpu_buffer = buffer->buffers[cpu]; + + if (atomic_read(&cpu_buffer->record_disabled)) + goto out; + + length = rb_calculate_event_length(length); + if (length > BUF_PAGE_SIZE) + goto out; + + event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); + if (!event) + goto out; + + /* + * Need to store resched state on this cpu. + * Only the first needs to. + */ + + if (preempt_count() == 1) + per_cpu(rb_need_resched, cpu) = resched; + + return event; + + out: + if (resched) + preempt_enable_notrace(); + else + preempt_enable_notrace(); + return NULL; +} + +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + cpu_buffer->entries++; + + /* Only process further if we own the commit */ + if (!rb_is_commit(cpu_buffer, event)) + return; + + cpu_buffer->write_stamp += event->time_delta; + + rb_set_commit_to_write(cpu_buffer); +} + +/** + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * @event: The event pointer to commit. + * @flags: the interrupt flags received from ring_buffer_lock_reserve. + * + * This commits the data to the ring buffer, and releases any locks held. + * + * Must be paired with ring_buffer_lock_reserve. + */ +int ring_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu = raw_smp_processor_id(); + + cpu_buffer = buffer->buffers[cpu]; + + rb_commit(cpu_buffer, event); + + /* + * Only the last preempt count needs to restore preemption. + */ + if (preempt_count() == 1) { + if (per_cpu(rb_need_resched, cpu)) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); + } else + preempt_enable_no_resched_notrace(); + + return 0; +} + +/** + * ring_buffer_write - write data to the buffer without reserving + * @buffer: The ring buffer to write to. + * @length: The length of the data being written (excluding the event header) + * @data: The data to write to the buffer. + * + * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as + * one function. If you already have the data to write to the buffer, it + * may be easier to simply call this function. + * + * Note, like ring_buffer_lock_reserve, the length is the length of the data + * and not the length of the event which would hold the header. + */ +int ring_buffer_write(struct ring_buffer *buffer, + unsigned long length, + void *data) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + unsigned long event_length; + void *body; + int ret = -EBUSY; + int cpu, resched; + + if (atomic_read(&buffer->record_disabled)) + return -EBUSY; + + resched = need_resched(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + + if (!cpu_isset(cpu, buffer->cpumask)) + goto out; + + cpu_buffer = buffer->buffers[cpu]; + + if (atomic_read(&cpu_buffer->record_disabled)) + goto out; + + event_length = rb_calculate_event_length(length); + event = rb_reserve_next_event(cpu_buffer, + RINGBUF_TYPE_DATA, event_length); + if (!event) + goto out; + + body = rb_event_data(event); + + memcpy(body, data, length); + + rb_commit(cpu_buffer, event); + + ret = 0; + out: + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); + + return ret; +} + +static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *reader = cpu_buffer->reader_page; + struct buffer_page *head = cpu_buffer->head_page; + struct buffer_page *commit = cpu_buffer->commit_page; + + return reader->read == rb_page_commit(reader) && + (commit == reader || + (commit == head && + head->read == rb_page_commit(commit))); +} + +/** + * ring_buffer_record_disable - stop all writes into the buffer + * @buffer: The ring buffer to stop writes to. + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * The caller should call synchronize_sched() after this. + */ +void ring_buffer_record_disable(struct ring_buffer *buffer) +{ + atomic_inc(&buffer->record_disabled); +} + +/** + * ring_buffer_record_enable - enable writes to the buffer + * @buffer: The ring buffer to enable writes + * + * Note, multiple disables will need the same number of enables + * to truely enable the writing (much like preempt_disable). + */ +void ring_buffer_record_enable(struct ring_buffer *buffer) +{ + atomic_dec(&buffer->record_disabled); +} + +/** + * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer + * @buffer: The ring buffer to stop writes to. + * @cpu: The CPU buffer to stop + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * The caller should call synchronize_sched() after this. + */ +void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpu_isset(cpu, buffer->cpumask)) + return; + + cpu_buffer = buffer->buffers[cpu]; + atomic_inc(&cpu_buffer->record_disabled); +} + +/** + * ring_buffer_record_enable_cpu - enable writes to the buffer + * @buffer: The ring buffer to enable writes + * @cpu: The CPU to enable. + * + * Note, multiple disables will need the same number of enables + * to truely enable the writing (much like preempt_disable). + */ +void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpu_isset(cpu, buffer->cpumask)) + return; + + cpu_buffer = buffer->buffers[cpu]; + atomic_dec(&cpu_buffer->record_disabled); +} + +/** + * ring_buffer_entries_cpu - get the number of entries in a cpu buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the entries from. + */ +unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpu_isset(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + return cpu_buffer->entries; +} + +/** + * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpu_isset(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + return cpu_buffer->overrun; +} + +/** + * ring_buffer_entries - get the number of entries in a buffer + * @buffer: The ring buffer + * + * Returns the total number of entries in the ring buffer + * (all CPU entries) + */ +unsigned long ring_buffer_entries(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long entries = 0; + int cpu; + + /* if you care about this being correct, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + entries += cpu_buffer->entries; + } + + return entries; +} + +/** + * ring_buffer_overrun_cpu - get the number of overruns in buffer + * @buffer: The ring buffer + * + * Returns the total number of overruns in the ring buffer + * (all CPU entries) + */ +unsigned long ring_buffer_overruns(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long overruns = 0; + int cpu; + + /* if you care about this being correct, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + overruns += cpu_buffer->overrun; + } + + return overruns; +} + +/** + * ring_buffer_iter_reset - reset an iterator + * @iter: The iterator to reset + * + * Resets the iterator, so that it will start from the beginning + * again. + */ +void ring_buffer_iter_reset(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + + /* Iterator usage is expected to have record disabled */ + if (list_empty(&cpu_buffer->reader_page->list)) { + iter->head_page = cpu_buffer->head_page; + iter->head = cpu_buffer->head_page->read; + } else { + iter->head_page = cpu_buffer->reader_page; + iter->head = cpu_buffer->reader_page->read; + } + if (iter->head) + iter->read_stamp = cpu_buffer->read_stamp; + else + iter->read_stamp = iter->head_page->time_stamp; +} + +/** + * ring_buffer_iter_empty - check if an iterator has no more to read + * @iter: The iterator to check + */ +int ring_buffer_iter_empty(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + cpu_buffer = iter->cpu_buffer; + + return iter->head_page == cpu_buffer->commit_page && + iter->head == rb_commit_index(cpu_buffer); +} + +static void +rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + u64 delta; + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + return; + + case RINGBUF_TYPE_TIME_EXTEND: + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->read_stamp += delta; + return; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + return; + + case RINGBUF_TYPE_DATA: + cpu_buffer->read_stamp += event->time_delta; + return; + + default: + BUG(); + } + return; +} + +static void +rb_update_iter_read_stamp(struct ring_buffer_iter *iter, + struct ring_buffer_event *event) +{ + u64 delta; + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + return; + + case RINGBUF_TYPE_TIME_EXTEND: + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + iter->read_stamp += delta; + return; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + return; + + case RINGBUF_TYPE_DATA: + iter->read_stamp += event->time_delta; + return; + + default: + BUG(); + } + return; +} + +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *reader = NULL; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->lock, flags); + + again: + reader = cpu_buffer->reader_page; + + /* If there's more to read, return this page */ + if (cpu_buffer->reader_page->read < rb_page_size(reader)) + goto out; + + /* Never should we have an index greater than the size */ + RB_WARN_ON(cpu_buffer, + cpu_buffer->reader_page->read > rb_page_size(reader)); + + /* check if we caught up to the tail */ + reader = NULL; + if (cpu_buffer->commit_page == cpu_buffer->reader_page) + goto out; + + /* + * Splice the empty reader page into the list around the head. + * Reset the reader page to size zero. + */ + + reader = cpu_buffer->head_page; + cpu_buffer->reader_page->list.next = reader->list.next; + cpu_buffer->reader_page->list.prev = reader->list.prev; + + local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->commit, 0); + + /* Make the reader page now replace the head */ + reader->list.prev->next = &cpu_buffer->reader_page->list; + reader->list.next->prev = &cpu_buffer->reader_page->list; + + /* + * If the tail is on the reader, then we must set the head + * to the inserted page, otherwise we set it one before. + */ + cpu_buffer->head_page = cpu_buffer->reader_page; + + if (cpu_buffer->commit_page != reader) + rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + + /* Finally update the reader page to the new head */ + cpu_buffer->reader_page = reader; + rb_reset_reader_page(cpu_buffer); + + goto again; + + out: + spin_unlock_irqrestore(&cpu_buffer->lock, flags); + + return reader; +} + +static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct ring_buffer_event *event; + struct buffer_page *reader; + unsigned length; + + reader = rb_get_reader_page(cpu_buffer); + + /* This function should not be called when buffer is empty */ + BUG_ON(!reader); + + event = rb_reader_event(cpu_buffer); + + if (event->type == RINGBUF_TYPE_DATA) + cpu_buffer->entries--; + + rb_update_read_stamp(cpu_buffer, event); + + length = rb_event_length(event); + cpu_buffer->reader_page->read += length; +} + +static void rb_advance_iter(struct ring_buffer_iter *iter) +{ + struct ring_buffer *buffer; + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + unsigned length; + + cpu_buffer = iter->cpu_buffer; + buffer = cpu_buffer->buffer; + + /* + * Check if we are at the end of the buffer. + */ + if (iter->head >= rb_page_size(iter->head_page)) { + BUG_ON(iter->head_page == cpu_buffer->commit_page); + rb_inc_iter(iter); + return; + } + + event = rb_iter_head_event(iter); + + length = rb_event_length(event); + + /* + * This should not be called to advance the header if we are + * at the tail of the buffer. + */ + BUG_ON((iter->head_page == cpu_buffer->commit_page) && + (iter->head + length > rb_commit_index(cpu_buffer))); + + rb_update_iter_read_stamp(iter, event); + + iter->head += length; + + /* check for end of page padding */ + if ((iter->head >= rb_page_size(iter->head_page)) && + (iter->head_page != cpu_buffer->commit_page)) + rb_advance_iter(iter); +} + +/** + * ring_buffer_peek - peek at the next event to be read + * @buffer: The ring buffer to read + * @cpu: The cpu to peak at + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not consume the data. + */ +struct ring_buffer_event * +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + struct buffer_page *reader; + + if (!cpu_isset(cpu, buffer->cpumask)) + return NULL; + + cpu_buffer = buffer->buffers[cpu]; + + again: + reader = rb_get_reader_page(cpu_buffer); + if (!reader) + return NULL; + + event = rb_reader_event(cpu_buffer); + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + RB_WARN_ON(cpu_buffer, 1); + rb_advance_reader(cpu_buffer); + return NULL; + + case RINGBUF_TYPE_TIME_EXTEND: + /* Internal data, OK to advance */ + rb_advance_reader(cpu_buffer); + goto again; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + rb_advance_reader(cpu_buffer); + goto again; + + case RINGBUF_TYPE_DATA: + if (ts) { + *ts = cpu_buffer->read_stamp + event->time_delta; + ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + } + return event; + + default: + BUG(); + } + + return NULL; +} + +/** + * ring_buffer_iter_peek - peek at the next event to be read + * @iter: The ring buffer iterator + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not increment the iterator. + */ +struct ring_buffer_event * +ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +{ + struct ring_buffer *buffer; + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + + if (ring_buffer_iter_empty(iter)) + return NULL; + + cpu_buffer = iter->cpu_buffer; + buffer = cpu_buffer->buffer; + + again: + if (rb_per_cpu_empty(cpu_buffer)) + return NULL; + + event = rb_iter_head_event(iter); + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + rb_inc_iter(iter); + goto again; + + case RINGBUF_TYPE_TIME_EXTEND: + /* Internal data, OK to advance */ + rb_advance_iter(iter); + goto again; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + rb_advance_iter(iter); + goto again; + + case RINGBUF_TYPE_DATA: + if (ts) { + *ts = iter->read_stamp + event->time_delta; + ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + } + return event; + + default: + BUG(); + } + + return NULL; +} + +/** + * ring_buffer_consume - return an event and consume it + * @buffer: The ring buffer to get the next event from + * + * Returns the next event in the ring buffer, and that event is consumed. + * Meaning, that sequential reads will keep returning a different event, + * and eventually empty the ring buffer if the producer is slower. + */ +struct ring_buffer_event * +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + + if (!cpu_isset(cpu, buffer->cpumask)) + return NULL; + + event = ring_buffer_peek(buffer, cpu, ts); + if (!event) + return NULL; + + cpu_buffer = buffer->buffers[cpu]; + rb_advance_reader(cpu_buffer); + + return event; +} + +/** + * ring_buffer_read_start - start a non consuming read of the buffer + * @buffer: The ring buffer to read from + * @cpu: The cpu buffer to iterate over + * + * This starts up an iteration through the buffer. It also disables + * the recording to the buffer until the reading is finished. + * This prevents the reading from being corrupted. This is not + * a consuming read, so a producer is not expected. + * + * Must be paired with ring_buffer_finish. + */ +struct ring_buffer_iter * +ring_buffer_read_start(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_iter *iter; + unsigned long flags; + + if (!cpu_isset(cpu, buffer->cpumask)) + return NULL; + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return NULL; + + cpu_buffer = buffer->buffers[cpu]; + + iter->cpu_buffer = cpu_buffer; + + atomic_inc(&cpu_buffer->record_disabled); + synchronize_sched(); + + spin_lock_irqsave(&cpu_buffer->lock, flags); + ring_buffer_iter_reset(iter); + spin_unlock_irqrestore(&cpu_buffer->lock, flags); + + return iter; +} + +/** + * ring_buffer_finish - finish reading the iterator of the buffer + * @iter: The iterator retrieved by ring_buffer_start + * + * This re-enables the recording to the buffer, and frees the + * iterator. + */ +void +ring_buffer_read_finish(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + + atomic_dec(&cpu_buffer->record_disabled); + kfree(iter); +} + +/** + * ring_buffer_read - read the next item in the ring buffer by the iterator + * @iter: The ring buffer iterator + * @ts: The time stamp of the event read. + * + * This reads the next event in the ring buffer and increments the iterator. + */ +struct ring_buffer_event * +ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) +{ + struct ring_buffer_event *event; + + event = ring_buffer_iter_peek(iter, ts); + if (!event) + return NULL; + + rb_advance_iter(iter); + + return event; +} + +/** + * ring_buffer_size - return the size of the ring buffer (in bytes) + * @buffer: The ring buffer. + */ +unsigned long ring_buffer_size(struct ring_buffer *buffer) +{ + return BUF_PAGE_SIZE * buffer->pages; +} + +static void +rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) +{ + cpu_buffer->head_page + = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + local_set(&cpu_buffer->head_page->write, 0); + local_set(&cpu_buffer->head_page->commit, 0); + + cpu_buffer->head_page->read = 0; + + cpu_buffer->tail_page = cpu_buffer->head_page; + cpu_buffer->commit_page = cpu_buffer->head_page; + + INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->commit, 0); + cpu_buffer->reader_page->read = 0; + + cpu_buffer->overrun = 0; + cpu_buffer->entries = 0; +} + +/** + * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer + * @buffer: The ring buffer to reset a per cpu buffer of + * @cpu: The CPU buffer to be reset + */ +void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + unsigned long flags; + + if (!cpu_isset(cpu, buffer->cpumask)) + return; + + spin_lock_irqsave(&cpu_buffer->lock, flags); + + rb_reset_cpu(cpu_buffer); + + spin_unlock_irqrestore(&cpu_buffer->lock, flags); +} + +/** + * ring_buffer_reset - reset a ring buffer + * @buffer: The ring buffer to reset all cpu buffers + */ +void ring_buffer_reset(struct ring_buffer *buffer) +{ + int cpu; + + for_each_buffer_cpu(buffer, cpu) + ring_buffer_reset_cpu(buffer, cpu); +} + +/** + * rind_buffer_empty - is the ring buffer empty? + * @buffer: The ring buffer to test + */ +int ring_buffer_empty(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* yes this is racy, but if you don't like the race, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + if (!rb_per_cpu_empty(cpu_buffer)) + return 0; + } + return 1; +} + +/** + * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? + * @buffer: The ring buffer + * @cpu: The CPU buffer to test + */ +int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpu_isset(cpu, buffer->cpumask)) + return 1; + + cpu_buffer = buffer->buffers[cpu]; + return rb_per_cpu_empty(cpu_buffer); +} + +/** + * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers + * @buffer_a: One buffer to swap with + * @buffer_b: The other buffer to swap with + * + * This function is useful for tracers that want to take a "snapshot" + * of a CPU buffer and has another back up buffer lying around. + * it is expected that the tracer handles the cpu buffer not being + * used at the moment. + */ +int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, + struct ring_buffer *buffer_b, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer_a; + struct ring_buffer_per_cpu *cpu_buffer_b; + + if (!cpu_isset(cpu, buffer_a->cpumask) || + !cpu_isset(cpu, buffer_b->cpumask)) + return -EINVAL; + + /* At least make sure the two buffers are somewhat the same */ + if (buffer_a->size != buffer_b->size || + buffer_a->pages != buffer_b->pages) + return -EINVAL; + + cpu_buffer_a = buffer_a->buffers[cpu]; + cpu_buffer_b = buffer_b->buffers[cpu]; + + /* + * We can't do a synchronize_sched here because this + * function can be called in atomic context. + * Normally this will be called from the same CPU as cpu. + * If not it's up to the caller to protect this. + */ + atomic_inc(&cpu_buffer_a->record_disabled); + atomic_inc(&cpu_buffer_b->record_disabled); + + buffer_a->buffers[cpu] = cpu_buffer_b; + buffer_b->buffers[cpu] = cpu_buffer_a; + + cpu_buffer_b->buffer = buffer_a; + cpu_buffer_a->buffer = buffer_b; + + atomic_dec(&cpu_buffer_a->record_disabled); + atomic_dec(&cpu_buffer_b->record_disabled); + + return 0; +} + diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8f3fb3d..d345d64 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -14,6 +14,7 @@ #include <linux/utsrelease.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> +#include <linux/notifier.h> #include <linux/debugfs.h> #include <linux/pagemap.h> #include <linux/hardirq.h> @@ -22,6 +23,7 @@ #include <linux/ftrace.h> #include <linux/module.h> #include <linux/percpu.h> +#include <linux/kdebug.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/poll.h> @@ -31,25 +33,36 @@ #include <linux/writeback.h> #include <linux/stacktrace.h> +#include <linux/ring_buffer.h> #include "trace.h" +#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) + unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; -static unsigned long __read_mostly tracing_nr_buffers; +static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); + +static inline void ftrace_disable_cpu(void) +{ + preempt_disable(); + local_inc(&__get_cpu_var(ftrace_cpu_disabled)); +} + +static inline void ftrace_enable_cpu(void) +{ + local_dec(&__get_cpu_var(ftrace_cpu_disabled)); + preempt_enable(); +} + static cpumask_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ for_each_cpu_mask(cpu, tracing_buffer_mask) -static int trace_alloc_page(void); -static int trace_free_page(void); - static int tracing_disabled = 1; -static unsigned long tracing_pages_allocated; - long ns2usecs(cycle_t nsec) { @@ -60,7 +73,9 @@ ns2usecs(cycle_t nsec) cycle_t ftrace_now(int cpu) { - return cpu_clock(cpu); + u64 ts = ring_buffer_time_stamp(cpu); + ring_buffer_normalize_time_stamp(cpu, &ts); + return ts; } /* @@ -100,11 +115,18 @@ static int tracer_enabled = 1; int ftrace_function_enabled; /* - * trace_nr_entries is the number of entries that is allocated - * for a buffer. Note, the number of entries is always rounded - * to ENTRIES_PER_PAGE. + * trace_buf_size is the size in bytes that is allocated + * for a buffer. Note, the number of bytes is always rounded + * to page size. + * + * This number is purposely set to a low number of 16384. + * If the dump on oops happens, it will be much appreciated + * to not have to wait for all that output. Anyway this can be + * boot time and run time configurable. */ -static unsigned long trace_nr_entries = 65536UL; +#define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */ + +static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; /* trace_types holds a link list of available tracers. */ static struct tracer *trace_types __read_mostly; @@ -133,24 +155,6 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds iter_ctrl options */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; -static notrace void no_trace_init(struct trace_array *tr) -{ - int cpu; - - ftrace_function_enabled = 0; - if(tr->ctrl) - for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); - tracer_enabled = 0; -} - -/* dummy trace to disable tracing */ -static struct tracer no_tracer __read_mostly = { - .name = "none", - .init = no_trace_init -}; - - /** * trace_wake_up - wake up tasks waiting for trace input * @@ -167,23 +171,21 @@ void trace_wake_up(void) wake_up(&trace_wait); } -#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) - -static int __init set_nr_entries(char *str) +static int __init set_buf_size(char *str) { - unsigned long nr_entries; + unsigned long buf_size; int ret; if (!str) return 0; - ret = strict_strtoul(str, 0, &nr_entries); + ret = strict_strtoul(str, 0, &buf_size); /* nr_entries can not be zero */ - if (ret < 0 || nr_entries == 0) + if (ret < 0 || buf_size == 0) return 0; - trace_nr_entries = nr_entries; + trace_buf_size = buf_size; return 1; } -__setup("trace_entries=", set_nr_entries); +__setup("trace_buf_size=", set_buf_size); unsigned long nsecs_to_usecs(unsigned long nsecs) { @@ -191,21 +193,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) } /* - * trace_flag_type is an enumeration that holds different - * states when a trace occurs. These are: - * IRQS_OFF - interrupts were disabled - * NEED_RESCED - reschedule is requested - * HARDIRQ - inside an interrupt handler - * SOFTIRQ - inside a softirq handler - */ -enum trace_flag_type { - TRACE_FLAG_IRQS_OFF = 0x01, - TRACE_FLAG_NEED_RESCHED = 0x02, - TRACE_FLAG_HARDIRQ = 0x04, - TRACE_FLAG_SOFTIRQ = 0x08, -}; - -/* * TRACE_ITER_SYM_MASK masks the options in trace_flags that * control the output of kernel symbols. */ @@ -224,6 +211,7 @@ static const char *trace_options[] = { "block", "stacktrace", "sched-tree", + "ftrace_printk", NULL }; @@ -266,54 +254,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(current); } -#define CHECK_COND(cond) \ - if (unlikely(cond)) { \ - tracing_disabled = 1; \ - WARN_ON(1); \ - return -1; \ - } - -/** - * check_pages - integrity check of trace buffers - * - * As a safty measure we check to make sure the data pages have not - * been corrupted. - */ -int check_pages(struct trace_array_cpu *data) -{ - struct page *page, *tmp; - - CHECK_COND(data->trace_pages.next->prev != &data->trace_pages); - CHECK_COND(data->trace_pages.prev->next != &data->trace_pages); - - list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { - CHECK_COND(page->lru.next->prev != &page->lru); - CHECK_COND(page->lru.prev->next != &page->lru); - } - - return 0; -} - -/** - * head_page - page address of the first page in per_cpu buffer. - * - * head_page returns the page address of the first page in - * a per_cpu buffer. This also preforms various consistency - * checks to make sure the buffer has not been corrupted. - */ -void *head_page(struct trace_array_cpu *data) -{ - struct page *page; - - if (list_empty(&data->trace_pages)) - return NULL; - - page = list_entry(data->trace_pages.next, struct page, lru); - BUG_ON(&page->lru == &data->trace_pages); - - return page_address(page); -} - /** * trace_seq_printf - sequence printing of trace information * @s: trace sequence descriptor @@ -395,28 +335,23 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) return len; } -#define HEX_CHARS 17 -static const char hex2asc[] = "0123456789abcdef"; +#define MAX_MEMHEX_BYTES 8 +#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) static int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) { unsigned char hex[HEX_CHARS]; unsigned char *data = mem; - unsigned char byte; int i, j; - BUG_ON(len >= HEX_CHARS); - #ifdef __BIG_ENDIAN for (i = 0, j = 0; i < len; i++) { #else for (i = len-1, j = 0; i >= 0; i--) { #endif - byte = data[i]; - - hex[j++] = hex2asc[byte & 0x0f]; - hex[j++] = hex2asc[byte >> 4]; + hex[j++] = hex_asc_hi(data[i]); + hex[j++] = hex_asc_lo(data[i]); } hex[j++] = ' '; @@ -460,34 +395,6 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s) trace_seq_reset(s); } -/* - * flip the trace buffers between two trace descriptors. - * This usually is the buffers between the global_trace and - * the max_tr to record a snapshot of a current trace. - * - * The ftrace_max_lock must be held. - */ -static void -flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) -{ - struct list_head flip_pages; - - INIT_LIST_HEAD(&flip_pages); - - memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx, - sizeof(struct trace_array_cpu) - - offsetof(struct trace_array_cpu, trace_head_idx)); - - check_pages(tr1); - check_pages(tr2); - list_splice_init(&tr1->trace_pages, &flip_pages); - list_splice_init(&tr2->trace_pages, &tr1->trace_pages); - list_splice_init(&flip_pages, &tr2->trace_pages); - BUG_ON(!list_empty(&flip_pages)); - check_pages(tr1); - check_pages(tr2); -} - /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer @@ -500,17 +407,17 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct trace_array_cpu *data; - int i; + struct ring_buffer *buf = tr->buffer; WARN_ON_ONCE(!irqs_disabled()); __raw_spin_lock(&ftrace_max_lock); - /* clear out all the previous traces */ - for_each_tracing_cpu(i) { - data = tr->data[i]; - flip_trace(max_tr.data[i], data); - tracing_reset(data); - } + + tr->buffer = max_tr.buffer; + max_tr.buffer = buf; + + ftrace_disable_cpu(); + ring_buffer_reset(tr->buffer); + ftrace_enable_cpu(); __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); @@ -527,16 +434,19 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct trace_array_cpu *data = tr->data[cpu]; - int i; + int ret; WARN_ON_ONCE(!irqs_disabled()); __raw_spin_lock(&ftrace_max_lock); - for_each_tracing_cpu(i) - tracing_reset(max_tr.data[i]); - flip_trace(max_tr.data[cpu], data); - tracing_reset(data); + ftrace_disable_cpu(); + + ring_buffer_reset(max_tr.buffer); + ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); + + ftrace_enable_cpu(); + + WARN_ON_ONCE(ret); __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); @@ -573,7 +483,6 @@ int register_tracer(struct tracer *type) #ifdef CONFIG_FTRACE_STARTUP_TEST if (type->selftest) { struct tracer *saved_tracer = current_trace; - struct trace_array_cpu *data; struct trace_array *tr = &global_trace; int saved_ctrl = tr->ctrl; int i; @@ -585,10 +494,7 @@ int register_tracer(struct tracer *type) * If we fail, we do not register this tracer. */ for_each_tracing_cpu(i) { - data = tr->data[i]; - if (!head_page(data)) - continue; - tracing_reset(data); + tracing_reset(tr, i); } current_trace = type; tr->ctrl = 0; @@ -604,10 +510,7 @@ int register_tracer(struct tracer *type) } /* Only reset on passing, to avoid touching corrupted buffers */ for_each_tracing_cpu(i) { - data = tr->data[i]; - if (!head_page(data)) - continue; - tracing_reset(data); + tracing_reset(tr, i); } printk(KERN_CONT "PASSED\n"); } @@ -653,13 +556,11 @@ void unregister_tracer(struct tracer *type) mutex_unlock(&trace_types_lock); } -void tracing_reset(struct trace_array_cpu *data) +void tracing_reset(struct trace_array *tr, int cpu) { - data->trace_idx = 0; - data->overrun = 0; - data->trace_head = data->trace_tail = head_page(data); - data->trace_head_idx = 0; - data->trace_tail_idx = 0; + ftrace_disable_cpu(); + ring_buffer_reset_cpu(tr->buffer, cpu); + ftrace_enable_cpu(); } #define SAVED_CMDLINES 128 @@ -745,82 +646,16 @@ void tracing_record_cmdline(struct task_struct *tsk) trace_save_cmdline(tsk); } -static inline struct list_head * -trace_next_list(struct trace_array_cpu *data, struct list_head *next) -{ - /* - * Roundrobin - but skip the head (which is not a real page): - */ - next = next->next; - if (unlikely(next == &data->trace_pages)) - next = next->next; - BUG_ON(next == &data->trace_pages); - - return next; -} - -static inline void * -trace_next_page(struct trace_array_cpu *data, void *addr) -{ - struct list_head *next; - struct page *page; - - page = virt_to_page(addr); - - next = trace_next_list(data, &page->lru); - page = list_entry(next, struct page, lru); - - return page_address(page); -} - -static inline struct trace_entry * -tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) -{ - unsigned long idx, idx_next; - struct trace_entry *entry; - - data->trace_idx++; - idx = data->trace_head_idx; - idx_next = idx + 1; - - BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE); - - entry = data->trace_head + idx * TRACE_ENTRY_SIZE; - - if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { - data->trace_head = trace_next_page(data, data->trace_head); - idx_next = 0; - } - - if (data->trace_head == data->trace_tail && - idx_next == data->trace_tail_idx) { - /* overrun */ - data->overrun++; - data->trace_tail_idx++; - if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { - data->trace_tail = - trace_next_page(data, data->trace_tail); - data->trace_tail_idx = 0; - } - } - - data->trace_head_idx = idx_next; - - return entry; -} - -static inline void -tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) +void +tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, + int pc) { struct task_struct *tsk = current; - unsigned long pc; - - pc = preempt_count(); - entry->preempt_count = pc & 0xff; - entry->pid = (tsk) ? tsk->pid : 0; - entry->t = ftrace_now(raw_smp_processor_id()); - entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | + entry->preempt_count = pc & 0xff; + entry->pid = (tsk) ? tsk->pid : 0; + entry->flags = + (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); @@ -828,145 +663,139 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) void trace_function(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags) + unsigned long ip, unsigned long parent_ip, unsigned long flags, + int pc) { - struct trace_entry *entry; + struct ring_buffer_event *event; + struct ftrace_entry *entry; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_FN; - entry->fn.ip = ip; - entry->fn.parent_ip = parent_ip; - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); + /* If we are reading the ring buffer, don't trace */ + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_FN; + entry->ip = ip; + entry->parent_ip = parent_ip; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } void ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags) + unsigned long ip, unsigned long parent_ip, unsigned long flags, + int pc) { if (likely(!atomic_read(&data->disabled))) - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, pc); } -#ifdef CONFIG_MMIOTRACE -void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, - struct mmiotrace_rw *rw) +static void ftrace_trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip, int pc) { - struct trace_entry *entry; + struct ring_buffer_event *event; + struct stack_entry *entry; + struct stack_trace trace; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_MMIO_RW; - entry->mmiorw = *rw; - - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); - - trace_wake_up(); -} - -void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, - struct mmiotrace_map *map) -{ - struct trace_entry *entry; - unsigned long irq_flags; + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_STACK; - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_MMIO_MAP; - entry->mmiomap = *map; + memset(&entry->caller, 0, sizeof(entry->caller)); - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); + trace.nr_entries = 0; + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.skip = skip; + trace.entries = entry->caller; - trace_wake_up(); + save_stack_trace(&trace); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } -#endif void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, int skip) { - struct trace_entry *entry; - struct stack_trace trace; - - if (!(trace_flags & TRACE_ITER_STACKTRACE)) - return; - - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_STACK; - - memset(&entry->stack, 0, sizeof(entry->stack)); - - trace.nr_entries = 0; - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.skip = skip; - trace.entries = entry->stack.caller; - - save_stack_trace(&trace); + ftrace_trace_stack(tr, data, flags, skip, preempt_count()); } -void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) +static void +ftrace_trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3, + int pc) { + struct ring_buffer_event *event; struct trace_array_cpu *data = __data; struct trace_array *tr = __tr; - struct trace_entry *entry; + struct special_entry *entry; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_SPECIAL; - entry->special.arg1 = arg1; - entry->special.arg2 = arg2; - entry->special.arg3 = arg3; - __trace_stack(tr, data, irq_flags, 4); - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, pc); + entry->ent.type = TRACE_SPECIAL; + entry->arg1 = arg1; + entry->arg2 = arg2; + entry->arg3 = arg3; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ftrace_trace_stack(tr, data, irq_flags, 4, pc); trace_wake_up(); } void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count()); +} + +void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *prev, struct task_struct *next, - unsigned long flags) + unsigned long flags, int pc) { - struct trace_entry *entry; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_CTX; - entry->ctx.prev_pid = prev->pid; - entry->ctx.prev_prio = prev->prio; - entry->ctx.prev_state = prev->state; - entry->ctx.next_pid = next->pid; - entry->ctx.next_prio = next->prio; - entry->ctx.next_state = next->state; - __trace_stack(tr, data, flags, 5); - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_CTX; + entry->prev_pid = prev->pid; + entry->prev_prio = prev->prio; + entry->prev_state = prev->state; + entry->next_pid = next->pid; + entry->next_prio = next->prio; + entry->next_state = next->state; + entry->next_cpu = task_cpu(next); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ftrace_trace_stack(tr, data, flags, 5, pc); } void @@ -974,25 +803,28 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *wakee, struct task_struct *curr, - unsigned long flags) + unsigned long flags, int pc) { - struct trace_entry *entry; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_WAKE; - entry->ctx.prev_pid = curr->pid; - entry->ctx.prev_prio = curr->prio; - entry->ctx.prev_state = curr->state; - entry->ctx.next_pid = wakee->pid; - entry->ctx.next_prio = wakee->prio; - entry->ctx.next_state = wakee->state; - __trace_stack(tr, data, flags, 6); - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_WAKE; + entry->prev_pid = curr->pid; + entry->prev_prio = curr->prio; + entry->prev_state = curr->state; + entry->next_pid = wakee->pid; + entry->next_prio = wakee->prio; + entry->next_state = wakee->state; + entry->next_cpu = task_cpu(wakee); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ftrace_trace_stack(tr, data, flags, 6, pc); trace_wake_up(); } @@ -1002,23 +834,21 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - unsigned long flags; - long disabled; int cpu; + int pc; - if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl) + if (tracing_disabled || !tr->ctrl) return; - local_irq_save(flags); + pc = preempt_count(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) - __trace_special(tr, data, arg1, arg2, arg3); + if (likely(!atomic_read(&data->disabled))) + ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); - atomic_dec(&data->disabled); - local_irq_restore(flags); + preempt_enable_notrace(); } #ifdef CONFIG_FTRACE @@ -1029,7 +859,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; long disabled; - int cpu; + int cpu, resched; + int pc; if (unlikely(!ftrace_function_enabled)) return; @@ -1037,16 +868,22 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) if (skip_trace(ip)) return; - local_irq_save(flags); + pc = preempt_count(); + resched = need_resched(); + preempt_disable_notrace(); + local_save_flags(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); - local_irq_restore(flags); + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __read_mostly = @@ -1073,111 +910,96 @@ enum trace_file_type { TRACE_FILE_LAT_FMT = 1, }; -static struct trace_entry * -trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, - struct trace_iterator *iter, int cpu) +static void trace_iterator_increment(struct trace_iterator *iter, int cpu) { - struct page *page; - struct trace_entry *array; + /* Don't allow ftrace to trace into the ring buffers */ + ftrace_disable_cpu(); - if (iter->next_idx[cpu] >= tr->entries || - iter->next_idx[cpu] >= data->trace_idx || - (data->trace_head == data->trace_tail && - data->trace_head_idx == data->trace_tail_idx)) - return NULL; + iter->idx++; + if (iter->buffer_iter[iter->cpu]) + ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); - if (!iter->next_page[cpu]) { - /* Initialize the iterator for this cpu trace buffer */ - WARN_ON(!data->trace_tail); - page = virt_to_page(data->trace_tail); - iter->next_page[cpu] = &page->lru; - iter->next_page_idx[cpu] = data->trace_tail_idx; - } + ftrace_enable_cpu(); +} + +static struct trace_entry * +peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) +{ + struct ring_buffer_event *event; + struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; - page = list_entry(iter->next_page[cpu], struct page, lru); - BUG_ON(&data->trace_pages == &page->lru); + /* Don't allow ftrace to trace into the ring buffers */ + ftrace_disable_cpu(); + + if (buf_iter) + event = ring_buffer_iter_peek(buf_iter, ts); + else + event = ring_buffer_peek(iter->tr->buffer, cpu, ts); - array = page_address(page); + ftrace_enable_cpu(); - WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE); - return &array[iter->next_page_idx[cpu]]; + return event ? ring_buffer_event_data(event) : NULL; } static struct trace_entry * -find_next_entry(struct trace_iterator *iter, int *ent_cpu) +__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { - struct trace_array *tr = iter->tr; + struct ring_buffer *buffer = iter->tr->buffer; struct trace_entry *ent, *next = NULL; + u64 next_ts = 0, ts; int next_cpu = -1; int cpu; for_each_tracing_cpu(cpu) { - if (!head_page(tr->data[cpu])) + + if (ring_buffer_empty_cpu(buffer, cpu)) continue; - ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); + + ent = peek_next_entry(iter, cpu, &ts); + /* * Pick the entry with the smallest timestamp: */ - if (ent && (!next || ent->t < next->t)) { + if (ent && (!next || ts < next_ts)) { next = ent; next_cpu = cpu; + next_ts = ts; } } if (ent_cpu) *ent_cpu = next_cpu; + if (ent_ts) + *ent_ts = next_ts; + return next; } -static void trace_iterator_increment(struct trace_iterator *iter) +/* Find the next real entry, without updating the iterator itself */ +static struct trace_entry * +find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { - iter->idx++; - iter->next_idx[iter->cpu]++; - iter->next_page_idx[iter->cpu]++; - - if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) { - struct trace_array_cpu *data = iter->tr->data[iter->cpu]; - - iter->next_page_idx[iter->cpu] = 0; - iter->next_page[iter->cpu] = - trace_next_list(data, iter->next_page[iter->cpu]); - } + return __find_next_entry(iter, ent_cpu, ent_ts); } -static void trace_consume(struct trace_iterator *iter) +/* Find the next real entry, and increment the iterator to the next entry */ +static void *find_next_entry_inc(struct trace_iterator *iter) { - struct trace_array_cpu *data = iter->tr->data[iter->cpu]; + iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); - data->trace_tail_idx++; - if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { - data->trace_tail = trace_next_page(data, data->trace_tail); - data->trace_tail_idx = 0; - } + if (iter->ent) + trace_iterator_increment(iter, iter->cpu); - /* Check if we empty it, then reset the index */ - if (data->trace_head == data->trace_tail && - data->trace_head_idx == data->trace_tail_idx) - data->trace_idx = 0; + return iter->ent ? iter : NULL; } -static void *find_next_entry_inc(struct trace_iterator *iter) +static void trace_consume(struct trace_iterator *iter) { - struct trace_entry *next; - int next_cpu = -1; - - next = find_next_entry(iter, &next_cpu); - - iter->prev_ent = iter->ent; - iter->prev_cpu = iter->cpu; - - iter->ent = next; - iter->cpu = next_cpu; - - if (next) - trace_iterator_increment(iter); - - return next ? iter : NULL; + /* Don't allow ftrace to trace into the ring buffers */ + ftrace_disable_cpu(); + ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); + ftrace_enable_cpu(); } static void *s_next(struct seq_file *m, void *v, loff_t *pos) @@ -1210,7 +1032,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) struct trace_iterator *iter = m->private; void *p = NULL; loff_t l = 0; - int i; + int cpu; mutex_lock(&trace_types_lock); @@ -1229,14 +1051,15 @@ static void *s_start(struct seq_file *m, loff_t *pos) iter->ent = NULL; iter->cpu = 0; iter->idx = -1; - iter->prev_ent = NULL; - iter->prev_cpu = -1; - for_each_tracing_cpu(i) { - iter->next_idx[i] = 0; - iter->next_page[i] = NULL; + ftrace_disable_cpu(); + + for_each_tracing_cpu(cpu) { + ring_buffer_iter_reset(iter->buffer_iter[cpu]); } + ftrace_enable_cpu(); + for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; @@ -1330,21 +1153,21 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n"); - seq_puts(m, "# / _-----=> irqs-off \n"); - seq_puts(m, "# | / _----=> need-resched \n"); - seq_puts(m, "# || / _---=> hardirq/softirq \n"); - seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / \n"); - seq_puts(m, "# ||||| delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# _------=> CPU# \n"); + seq_puts(m, "# / _-----=> irqs-off \n"); + seq_puts(m, "# | / _----=> need-resched \n"); + seq_puts(m, "# || / _---=> hardirq/softirq \n"); + seq_puts(m, "# ||| / _--=> preempt-depth \n"); + seq_puts(m, "# |||| / \n"); + seq_puts(m, "# ||||| delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) { - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | | |\n"); + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | | |\n"); } @@ -1355,23 +1178,16 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct trace_array *tr = iter->tr; struct trace_array_cpu *data = tr->data[tr->cpu]; struct tracer *type = current_trace; - unsigned long total = 0; - unsigned long entries = 0; - int cpu; + unsigned long total; + unsigned long entries; const char *name = "preemption"; if (type) name = type->name; - for_each_tracing_cpu(cpu) { - if (head_page(tr->data[cpu])) { - total += tr->data[cpu]->trace_idx; - if (tr->data[cpu]->trace_idx > tr->entries) - entries += tr->entries; - else - entries += tr->data[cpu]->trace_idx; - } - } + entries = ring_buffer_entries(iter->tr->buffer); + total = entries + + ring_buffer_overruns(iter->tr->buffer); seq_printf(m, "%s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -1428,7 +1244,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) comm = trace_find_cmdline(entry->pid); trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); - trace_seq_printf(s, "%d", cpu); + trace_seq_printf(s, "%3d", cpu); trace_seq_printf(s, "%c%c", (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); @@ -1457,7 +1273,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) unsigned long preempt_mark_thresh = 100; static void -lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, +lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, unsigned long rel_usecs) { trace_seq_printf(s, " %4lldus", abs_usecs); @@ -1471,34 +1287,76 @@ lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; -static int +/* + * The message is supposed to contain an ending newline. + * If the printing stops prematurely, try to add a newline of our own. + */ +void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) +{ + struct trace_entry *ent; + struct trace_field_cont *cont; + bool ok = true; + + ent = peek_next_entry(iter, iter->cpu, NULL); + if (!ent || ent->type != TRACE_CONT) { + trace_seq_putc(s, '\n'); + return; + } + + do { + cont = (struct trace_field_cont *)ent; + if (ok) + ok = (trace_seq_printf(s, "%s", cont->buf) > 0); + + ftrace_disable_cpu(); + + if (iter->buffer_iter[iter->cpu]) + ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + else + ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); + + ftrace_enable_cpu(); + + ent = peek_next_entry(iter, iter->cpu, NULL); + } while (ent && ent->type == TRACE_CONT); + + if (!ok) + trace_seq_putc(s, '\n'); +} + +static enum print_line_t print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) { struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_entry *next_entry = find_next_entry(iter, NULL); + struct trace_entry *next_entry; unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); struct trace_entry *entry = iter->ent; unsigned long abs_usecs; unsigned long rel_usecs; + u64 next_ts; char *comm; int S, T; int i; unsigned state; + if (entry->type == TRACE_CONT) + return TRACE_TYPE_HANDLED; + + next_entry = find_next_entry(iter, NULL, &next_ts); if (!next_entry) - next_entry = entry; - rel_usecs = ns2usecs(next_entry->t - entry->t); - abs_usecs = ns2usecs(entry->t - iter->tr->time_start); + next_ts = iter->ts; + rel_usecs = ns2usecs(next_ts - iter->ts); + abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); if (verbose) { comm = trace_find_cmdline(entry->pid); - trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]" + trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]" " %ld.%03ldms (+%ld.%03ldms): ", comm, entry->pid, cpu, entry->flags, entry->preempt_count, trace_idx, - ns2usecs(entry->t), + ns2usecs(iter->ts), abs_usecs/1000, abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); @@ -1507,52 +1365,85 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) lat_print_timestamp(s, abs_usecs, rel_usecs); } switch (entry->type) { - case TRACE_FN: - seq_print_ip_sym(s, entry->fn.ip, sym_flags); + case TRACE_FN: { + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + seq_print_ip_sym(s, field->ip, sym_flags); trace_seq_puts(s, " ("); - if (kretprobed(entry->fn.parent_ip)) + if (kretprobed(field->parent_ip)) trace_seq_puts(s, KRETPROBE_MSG); else - seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); + seq_print_ip_sym(s, field->parent_ip, sym_flags); trace_seq_puts(s, ")\n"); break; + } case TRACE_CTX: - case TRACE_WAKE: - T = entry->ctx.next_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.next_state] : 'X'; + case TRACE_WAKE: { + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); + + T = field->next_state < sizeof(state_to_char) ? + state_to_char[field->next_state] : 'X'; - state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0; + state = field->prev_state ? + __ffs(field->prev_state) + 1 : 0; S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; - comm = trace_find_cmdline(entry->ctx.next_pid); - trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, + comm = trace_find_cmdline(field->next_pid); + trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + field->prev_pid, + field->prev_prio, S, entry->type == TRACE_CTX ? "==>" : " +", - entry->ctx.next_pid, - entry->ctx.next_prio, + field->next_cpu, + field->next_pid, + field->next_prio, T, comm); break; - case TRACE_SPECIAL: + } + case TRACE_SPECIAL: { + struct special_entry *field; + + trace_assign_type(field, entry); + trace_seq_printf(s, "# %ld %ld %ld\n", - entry->special.arg1, - entry->special.arg2, - entry->special.arg3); + field->arg1, + field->arg2, + field->arg3); break; - case TRACE_STACK: + } + case TRACE_STACK: { + struct stack_entry *field; + + trace_assign_type(field, entry); + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (i) trace_seq_puts(s, " <= "); - seq_print_ip_sym(s, entry->stack.caller[i], sym_flags); + seq_print_ip_sym(s, field->caller[i], sym_flags); } trace_seq_puts(s, "\n"); break; + } + case TRACE_PRINT: { + struct print_entry *field; + + trace_assign_type(field, entry); + + seq_print_ip_sym(s, field->ip, sym_flags); + trace_seq_printf(s, ": %s", field->buf); + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + break; + } default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } - return 1; + return TRACE_TYPE_HANDLED; } -static int print_trace_fmt(struct trace_iterator *iter) +static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); @@ -1567,90 +1458,126 @@ static int print_trace_fmt(struct trace_iterator *iter) entry = iter->ent; + if (entry->type == TRACE_CONT) + return TRACE_TYPE_HANDLED; + comm = trace_find_cmdline(iter->ent->pid); - t = ns2usecs(entry->t); + t = ns2usecs(iter->ts); usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); if (!ret) - return 0; - ret = trace_seq_printf(s, "[%02d] ", iter->cpu); + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_printf(s, "[%03d] ", iter->cpu); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; switch (entry->type) { - case TRACE_FN: - ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags); + case TRACE_FN: { + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + ret = seq_print_ip_sym(s, field->ip, sym_flags); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; if ((sym_flags & TRACE_ITER_PRINT_PARENT) && - entry->fn.parent_ip) { + field->parent_ip) { ret = trace_seq_printf(s, " <-"); if (!ret) - return 0; - if (kretprobed(entry->fn.parent_ip)) + return TRACE_TYPE_PARTIAL_LINE; + if (kretprobed(field->parent_ip)) ret = trace_seq_puts(s, KRETPROBE_MSG); else - ret = seq_print_ip_sym(s, entry->fn.parent_ip, + ret = seq_print_ip_sym(s, + field->parent_ip, sym_flags); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; } ret = trace_seq_printf(s, "\n"); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; + } case TRACE_CTX: - case TRACE_WAKE: - S = entry->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.prev_state] : 'X'; - T = entry->ctx.next_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.next_state] : 'X'; - ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, + case TRACE_WAKE: { + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); + + S = field->prev_state < sizeof(state_to_char) ? + state_to_char[field->prev_state] : 'X'; + T = field->next_state < sizeof(state_to_char) ? + state_to_char[field->next_state] : 'X'; + ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", + field->prev_pid, + field->prev_prio, S, entry->type == TRACE_CTX ? "==>" : " +", - entry->ctx.next_pid, - entry->ctx.next_prio, + field->next_cpu, + field->next_pid, + field->next_prio, T); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; - case TRACE_SPECIAL: + } + case TRACE_SPECIAL: { + struct special_entry *field; + + trace_assign_type(field, entry); + ret = trace_seq_printf(s, "# %ld %ld %ld\n", - entry->special.arg1, - entry->special.arg2, - entry->special.arg3); + field->arg1, + field->arg2, + field->arg3); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; - case TRACE_STACK: + } + case TRACE_STACK: { + struct stack_entry *field; + + trace_assign_type(field, entry); + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (i) { ret = trace_seq_puts(s, " <= "); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; } - ret = seq_print_ip_sym(s, entry->stack.caller[i], + ret = seq_print_ip_sym(s, field->caller[i], sym_flags); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; } ret = trace_seq_puts(s, "\n"); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; } - return 1; + case TRACE_PRINT: { + struct print_entry *field; + + trace_assign_type(field, entry); + + seq_print_ip_sym(s, field->ip, sym_flags); + trace_seq_printf(s, ": %s", field->buf); + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + break; + } + } + return TRACE_TYPE_HANDLED; } -static int print_raw_fmt(struct trace_iterator *iter) +static enum print_line_t print_raw_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; @@ -1659,47 +1586,77 @@ static int print_raw_fmt(struct trace_iterator *iter) entry = iter->ent; + if (entry->type == TRACE_CONT) + return TRACE_TYPE_HANDLED; + ret = trace_seq_printf(s, "%d %d %llu ", - entry->pid, iter->cpu, entry->t); + entry->pid, iter->cpu, iter->ts); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; switch (entry->type) { - case TRACE_FN: + case TRACE_FN: { + struct ftrace_entry *field; + + trace_assign_type(field, entry); + ret = trace_seq_printf(s, "%x %x\n", - entry->fn.ip, entry->fn.parent_ip); + field->ip, + field->parent_ip); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; + } case TRACE_CTX: - case TRACE_WAKE: - S = entry->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.prev_state] : 'X'; - T = entry->ctx.next_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.next_state] : 'X'; + case TRACE_WAKE: { + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); + + S = field->prev_state < sizeof(state_to_char) ? + state_to_char[field->prev_state] : 'X'; + T = field->next_state < sizeof(state_to_char) ? + state_to_char[field->next_state] : 'X'; if (entry->type == TRACE_WAKE) S = '+'; - ret = trace_seq_printf(s, "%d %d %c %d %d %c\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, + ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", + field->prev_pid, + field->prev_prio, S, - entry->ctx.next_pid, - entry->ctx.next_prio, + field->next_cpu, + field->next_pid, + field->next_prio, T); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; + } case TRACE_SPECIAL: - case TRACE_STACK: + case TRACE_STACK: { + struct special_entry *field; + + trace_assign_type(field, entry); + ret = trace_seq_printf(s, "# %ld %ld %ld\n", - entry->special.arg1, - entry->special.arg2, - entry->special.arg3); + field->arg1, + field->arg2, + field->arg3); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; } - return 1; + case TRACE_PRINT: { + struct print_entry *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "# %lx %s", field->ip, field->buf); + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + break; + } + } + return TRACE_TYPE_HANDLED; } #define SEQ_PUT_FIELD_RET(s, x) \ @@ -1710,11 +1667,12 @@ do { \ #define SEQ_PUT_HEX_FIELD_RET(s, x) \ do { \ + BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ return 0; \ } while (0) -static int print_hex_fmt(struct trace_iterator *iter) +static enum print_line_t print_hex_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; @@ -1723,97 +1681,139 @@ static int print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; + if (entry->type == TRACE_CONT) + return TRACE_TYPE_HANDLED; + SEQ_PUT_HEX_FIELD_RET(s, entry->pid); SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); - SEQ_PUT_HEX_FIELD_RET(s, entry->t); + SEQ_PUT_HEX_FIELD_RET(s, iter->ts); switch (entry->type) { - case TRACE_FN: - SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip); - SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + case TRACE_FN: { + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_HEX_FIELD_RET(s, field->ip); + SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); break; + } case TRACE_CTX: - case TRACE_WAKE: - S = entry->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.prev_state] : 'X'; - T = entry->ctx.next_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.next_state] : 'X'; + case TRACE_WAKE: { + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); + + S = field->prev_state < sizeof(state_to_char) ? + state_to_char[field->prev_state] : 'X'; + T = field->next_state < sizeof(state_to_char) ? + state_to_char[field->next_state] : 'X'; if (entry->type == TRACE_WAKE) S = '+'; - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio); + SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); SEQ_PUT_HEX_FIELD_RET(s, S); - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid); - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio); - SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); + SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); SEQ_PUT_HEX_FIELD_RET(s, T); break; + } case TRACE_SPECIAL: - case TRACE_STACK: - SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); - SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); - SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); + case TRACE_STACK: { + struct special_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_HEX_FIELD_RET(s, field->arg1); + SEQ_PUT_HEX_FIELD_RET(s, field->arg2); + SEQ_PUT_HEX_FIELD_RET(s, field->arg3); break; } + } SEQ_PUT_FIELD_RET(s, newline); - return 1; + return TRACE_TYPE_HANDLED; } -static int print_bin_fmt(struct trace_iterator *iter) +static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; entry = iter->ent; + if (entry->type == TRACE_CONT) + return TRACE_TYPE_HANDLED; + SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, entry->cpu); - SEQ_PUT_FIELD_RET(s, entry->t); + SEQ_PUT_FIELD_RET(s, iter->cpu); + SEQ_PUT_FIELD_RET(s, iter->ts); switch (entry->type) { - case TRACE_FN: - SEQ_PUT_FIELD_RET(s, entry->fn.ip); - SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip); + case TRACE_FN: { + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_FIELD_RET(s, field->ip); + SEQ_PUT_FIELD_RET(s, field->parent_ip); break; - case TRACE_CTX: - SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid); - SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio); - SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state); - SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); - SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); - SEQ_PUT_FIELD_RET(s, entry->ctx.next_state); + } + case TRACE_CTX: { + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_FIELD_RET(s, field->prev_pid); + SEQ_PUT_FIELD_RET(s, field->prev_prio); + SEQ_PUT_FIELD_RET(s, field->prev_state); + SEQ_PUT_FIELD_RET(s, field->next_pid); + SEQ_PUT_FIELD_RET(s, field->next_prio); + SEQ_PUT_FIELD_RET(s, field->next_state); break; + } case TRACE_SPECIAL: - case TRACE_STACK: - SEQ_PUT_FIELD_RET(s, entry->special.arg1); - SEQ_PUT_FIELD_RET(s, entry->special.arg2); - SEQ_PUT_FIELD_RET(s, entry->special.arg3); + case TRACE_STACK: { + struct special_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_FIELD_RET(s, field->arg1); + SEQ_PUT_FIELD_RET(s, field->arg2); + SEQ_PUT_FIELD_RET(s, field->arg3); break; } + } return 1; } static int trace_empty(struct trace_iterator *iter) { - struct trace_array_cpu *data; int cpu; for_each_tracing_cpu(cpu) { - data = iter->tr->data[cpu]; - - if (head_page(data) && data->trace_idx && - (data->trace_tail != data->trace_head || - data->trace_tail_idx != data->trace_head_idx)) - return 0; + if (iter->buffer_iter[cpu]) { + if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + return 0; + } else { + if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + return 0; + } } + return 1; } -static int print_trace_line(struct trace_iterator *iter) +static enum print_line_t print_trace_line(struct trace_iterator *iter) { - if (iter->trace && iter->trace->print_line) - return iter->trace->print_line(iter); + enum print_line_t ret; + + if (iter->trace && iter->trace->print_line) { + ret = iter->trace->print_line(iter); + if (ret != TRACE_TYPE_UNHANDLED) + return ret; + } if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); @@ -1869,6 +1869,8 @@ static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, int *ret) { struct trace_iterator *iter; + struct seq_file *m; + int cpu; if (tracing_disabled) { *ret = -ENODEV; @@ -1889,28 +1891,45 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) iter->trace = current_trace; iter->pos = -1; + for_each_tracing_cpu(cpu) { + + iter->buffer_iter[cpu] = + ring_buffer_read_start(iter->tr->buffer, cpu); + + if (!iter->buffer_iter[cpu]) + goto fail_buffer; + } + /* TODO stop tracer */ *ret = seq_open(file, &tracer_seq_ops); - if (!*ret) { - struct seq_file *m = file->private_data; - m->private = iter; + if (*ret) + goto fail_buffer; - /* stop the trace while dumping */ - if (iter->tr->ctrl) { - tracer_enabled = 0; - ftrace_function_enabled = 0; - } + m = file->private_data; + m->private = iter; - if (iter->trace && iter->trace->open) - iter->trace->open(iter); - } else { - kfree(iter); - iter = NULL; + /* stop the trace while dumping */ + if (iter->tr->ctrl) { + tracer_enabled = 0; + ftrace_function_enabled = 0; } + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + mutex_unlock(&trace_types_lock); out: return iter; + + fail_buffer: + for_each_tracing_cpu(cpu) { + if (iter->buffer_iter[cpu]) + ring_buffer_read_finish(iter->buffer_iter[cpu]); + } + mutex_unlock(&trace_types_lock); + + return ERR_PTR(-ENOMEM); } int tracing_open_generic(struct inode *inode, struct file *filp) @@ -1926,8 +1945,14 @@ int tracing_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct trace_iterator *iter = m->private; + int cpu; mutex_lock(&trace_types_lock); + for_each_tracing_cpu(cpu) { + if (iter->buffer_iter[cpu]) + ring_buffer_read_finish(iter->buffer_iter[cpu]); + } + if (iter->trace && iter->trace->close) iter->trace->close(iter); @@ -2352,9 +2377,11 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, struct tracer *t; char buf[max_tracer_type_len+1]; int i; + size_t ret; if (cnt > max_tracer_type_len) cnt = max_tracer_type_len; + ret = cnt; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -2370,7 +2397,11 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (strcmp(t->name, buf) == 0) break; } - if (!t || t == current_trace) + if (!t) { + ret = -EINVAL; + goto out; + } + if (t == current_trace) goto out; if (current_trace && current_trace->reset) @@ -2383,9 +2414,10 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&trace_types_lock); - filp->f_pos += cnt; + if (ret == cnt) + filp->f_pos += cnt; - return cnt; + return ret; } static ssize_t @@ -2500,20 +2532,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; - struct trace_array_cpu *data; - static cpumask_t mask; - unsigned long flags; -#ifdef CONFIG_FTRACE - int ftrace_save; -#endif - int cpu; ssize_t sret; /* return any leftover data */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (sret != -EBUSY) return sret; - sret = 0; trace_seq_reset(&iter->seq); @@ -2524,6 +2548,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, goto out; } +waitagain: + sret = 0; while (trace_empty(iter)) { if ((filp->f_flags & O_NONBLOCK)) { @@ -2588,46 +2614,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, offsetof(struct trace_iterator, seq)); iter->pos = -1; - /* - * We need to stop all tracing on all CPUS to read the - * the next buffer. This is a bit expensive, but is - * not done often. We fill all what we can read, - * and then release the locks again. - */ - - cpus_clear(mask); - local_irq_save(flags); -#ifdef CONFIG_FTRACE - ftrace_save = ftrace_enabled; - ftrace_enabled = 0; -#endif - smp_wmb(); - for_each_tracing_cpu(cpu) { - data = iter->tr->data[cpu]; - - if (!head_page(data) || !data->trace_idx) - continue; - - atomic_inc(&data->disabled); - cpu_set(cpu, mask); - } - - for_each_cpu_mask(cpu, mask) { - data = iter->tr->data[cpu]; - __raw_spin_lock(&data->lock); - - if (data->overrun > iter->last_overrun[cpu]) - iter->overrun[cpu] += - data->overrun - iter->last_overrun[cpu]; - iter->last_overrun[cpu] = data->overrun; - } - while (find_next_entry_inc(iter) != NULL) { - int ret; + enum print_line_t ret; int len = iter->seq.len; ret = print_trace_line(iter); - if (!ret) { + if (ret == TRACE_TYPE_PARTIAL_LINE) { /* don't print partial lines */ iter->seq.len = len; break; @@ -2639,26 +2631,17 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, break; } - for_each_cpu_mask(cpu, mask) { - data = iter->tr->data[cpu]; - __raw_spin_unlock(&data->lock); - } - - for_each_cpu_mask(cpu, mask) { - data = iter->tr->data[cpu]; - atomic_dec(&data->disabled); - } -#ifdef CONFIG_FTRACE - ftrace_enabled = ftrace_save; -#endif - local_irq_restore(flags); - /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (iter->seq.readpos >= iter->seq.len) trace_seq_reset(&iter->seq); + + /* + * If there was nothing to send to user, inspite of consuming trace + * entries, go back to wait for more entries. + */ if (sret == -EBUSY) - sret = 0; + goto waitagain; out: mutex_unlock(&trace_types_lock); @@ -2684,7 +2667,8 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, { unsigned long val; char buf[64]; - int i, ret; + int ret; + struct trace_array *tr = filp->private_data; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2704,59 +2688,38 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); - if (current_trace != &no_tracer) { + if (tr->ctrl) { cnt = -EBUSY; - pr_info("ftrace: set current_tracer to none" + pr_info("ftrace: please disable tracing" " before modifying buffer size\n"); goto out; } - if (val > global_trace.entries) { - long pages_requested; - unsigned long freeable_pages; - - /* make sure we have enough memory before mapping */ - pages_requested = - (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE; - - /* account for each buffer (and max_tr) */ - pages_requested *= tracing_nr_buffers * 2; - - /* Check for overflow */ - if (pages_requested < 0) { - cnt = -ENOMEM; - goto out; - } - - freeable_pages = determine_dirtyable_memory(); - - /* we only allow to request 1/4 of useable memory */ - if (pages_requested > - ((freeable_pages + tracing_pages_allocated) / 4)) { - cnt = -ENOMEM; + if (val != global_trace.entries) { + ret = ring_buffer_resize(global_trace.buffer, val); + if (ret < 0) { + cnt = ret; goto out; } - while (global_trace.entries < val) { - if (trace_alloc_page()) { - cnt = -ENOMEM; - goto out; + ret = ring_buffer_resize(max_tr.buffer, val); + if (ret < 0) { + int r; + cnt = ret; + r = ring_buffer_resize(global_trace.buffer, + global_trace.entries); + if (r < 0) { + /* AARGH! We are left with different + * size max buffer!!!! */ + WARN_ON(1); + tracing_disabled = 1; } - /* double check that we don't go over the known pages */ - if (tracing_pages_allocated > pages_requested) - break; + goto out; } - } else { - /* include the number of entries in val (inc of page entries) */ - while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1)) - trace_free_page(); + global_trace.entries = val; } - /* check integrity */ - for_each_tracing_cpu(i) - check_pages(global_trace.data[i]); - filp->f_pos += cnt; /* If check pages failed, return ENOMEM */ @@ -2769,6 +2732,52 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, return cnt; } +static int mark_printk(const char *fmt, ...) +{ + int ret; + va_list args; + va_start(args, fmt); + ret = trace_vprintk(0, fmt, args); + va_end(args); + return ret; +} + +static ssize_t +tracing_mark_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + char *buf; + char *end; + struct trace_array *tr = &global_trace; + + if (!tr->ctrl || tracing_disabled) + return -EINVAL; + + if (cnt > TRACE_BUF_SIZE) + cnt = TRACE_BUF_SIZE; + + buf = kmalloc(cnt + 1, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + + if (copy_from_user(buf, ubuf, cnt)) { + kfree(buf); + return -EFAULT; + } + + /* Cut from the first nil or newline. */ + buf[cnt] = '\0'; + end = strchr(buf, '\n'); + if (end) + *end = '\0'; + + cnt = mark_printk("%s\n", buf); + kfree(buf); + *fpos += cnt; + + return cnt; +} + static struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -2800,6 +2809,11 @@ static struct file_operations tracing_entries_fops = { .write = tracing_entries_write, }; +static struct file_operations tracing_mark_fops = { + .open = tracing_open_generic, + .write = tracing_mark_write, +}; + #ifdef CONFIG_DYNAMIC_FTRACE static ssize_t @@ -2846,7 +2860,7 @@ struct dentry *tracing_init_dentry(void) #include "trace_selftest.c" #endif -static __init void tracer_init_debugfs(void) +static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; struct dentry *entry; @@ -2881,12 +2895,12 @@ static __init void tracer_init_debugfs(void) entry = debugfs_create_file("available_tracers", 0444, d_tracer, &global_trace, &show_traces_fops); if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); + pr_warning("Could not create debugfs 'available_tracers' entry\n"); entry = debugfs_create_file("current_tracer", 0444, d_tracer, &global_trace, &set_tracer_fops); if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); + pr_warning("Could not create debugfs 'current_tracer' entry\n"); entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, &tracing_max_latency, @@ -2899,7 +2913,7 @@ static __init void tracer_init_debugfs(void) &tracing_thresh, &tracing_max_lat_fops); if (!entry) pr_warning("Could not create debugfs " - "'tracing_threash' entry\n"); + "'tracing_thresh' entry\n"); entry = debugfs_create_file("README", 0644, d_tracer, NULL, &tracing_readme_fops); if (!entry) @@ -2909,13 +2923,19 @@ static __init void tracer_init_debugfs(void) NULL, &tracing_pipe_fops); if (!entry) pr_warning("Could not create debugfs " - "'tracing_threash' entry\n"); + "'trace_pipe' entry\n"); entry = debugfs_create_file("trace_entries", 0644, d_tracer, &global_trace, &tracing_entries_fops); if (!entry) pr_warning("Could not create debugfs " - "'tracing_threash' entry\n"); + "'trace_entries' entry\n"); + + entry = debugfs_create_file("trace_marker", 0220, d_tracer, + NULL, &tracing_mark_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'trace_marker' entry\n"); #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, @@ -2928,230 +2948,263 @@ static __init void tracer_init_debugfs(void) #ifdef CONFIG_SYSPROF_TRACER init_tracer_sysprof_debugfs(d_tracer); #endif + return 0; } -static int trace_alloc_page(void) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { + static DEFINE_SPINLOCK(trace_buf_lock); + static char trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - struct page *page, *tmp; - LIST_HEAD(pages); - void *array; - unsigned pages_allocated = 0; - int i; + struct print_entry *entry; + unsigned long flags, irq_flags; + int cpu, len = 0, size, pc; - /* first allocate a page for each CPU */ - for_each_tracing_cpu(i) { - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_pages; - } + if (!tr->ctrl || tracing_disabled) + return 0; - pages_allocated++; - page = virt_to_page(array); - list_add(&page->lru, &pages); + pc = preempt_count(); + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; -/* Only allocate if we are actually using the max trace */ -#ifdef CONFIG_TRACER_MAX_TRACE - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_pages; - } - pages_allocated++; - page = virt_to_page(array); - list_add(&page->lru, &pages); -#endif - } + if (unlikely(atomic_read(&data->disabled))) + goto out; - /* Now that we successfully allocate a page per CPU, add them */ - for_each_tracing_cpu(i) { - data = global_trace.data[i]; - page = list_entry(pages.next, struct page, lru); - list_del_init(&page->lru); - list_add_tail(&page->lru, &data->trace_pages); - ClearPageLRU(page); + spin_lock_irqsave(&trace_buf_lock, flags); + len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); -#ifdef CONFIG_TRACER_MAX_TRACE - data = max_tr.data[i]; - page = list_entry(pages.next, struct page, lru); - list_del_init(&page->lru); - list_add_tail(&page->lru, &data->trace_pages); - SetPageLRU(page); -#endif - } - tracing_pages_allocated += pages_allocated; - global_trace.entries += ENTRIES_PER_PAGE; + len = min(len, TRACE_BUF_SIZE-1); + trace_buf[len] = 0; - return 0; + size = sizeof(*entry) + len + 1; + event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_PRINT; + entry->ip = ip; - free_pages: - list_for_each_entry_safe(page, tmp, &pages, lru) { - list_del_init(&page->lru); - __free_page(page); - } - return -ENOMEM; + memcpy(&entry->buf, trace_buf, len); + entry->buf[len] = 0; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + out_unlock: + spin_unlock_irqrestore(&trace_buf_lock, flags); + + out: + preempt_enable_notrace(); + + return len; } +EXPORT_SYMBOL_GPL(trace_vprintk); -static int trace_free_page(void) +int __ftrace_printk(unsigned long ip, const char *fmt, ...) { - struct trace_array_cpu *data; - struct page *page; - struct list_head *p; - int i; - int ret = 0; + int ret; + va_list ap; - /* free one page from each buffer */ - for_each_tracing_cpu(i) { - data = global_trace.data[i]; - p = data->trace_pages.next; - if (p == &data->trace_pages) { - /* should never happen */ - WARN_ON(1); - tracing_disabled = 1; - ret = -1; - break; - } - page = list_entry(p, struct page, lru); - ClearPageLRU(page); - list_del(&page->lru); - tracing_pages_allocated--; - tracing_pages_allocated--; - __free_page(page); + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; - tracing_reset(data); + va_start(ap, fmt); + ret = trace_vprintk(ip, fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__ftrace_printk); -#ifdef CONFIG_TRACER_MAX_TRACE - data = max_tr.data[i]; - p = data->trace_pages.next; - if (p == &data->trace_pages) { - /* should never happen */ - WARN_ON(1); - tracing_disabled = 1; - ret = -1; - break; - } - page = list_entry(p, struct page, lru); - ClearPageLRU(page); - list_del(&page->lru); - __free_page(page); +static int trace_panic_handler(struct notifier_block *this, + unsigned long event, void *unused) +{ + ftrace_dump(); + return NOTIFY_OK; +} - tracing_reset(data); -#endif - } - global_trace.entries -= ENTRIES_PER_PAGE; +static struct notifier_block trace_panic_notifier = { + .notifier_call = trace_panic_handler, + .next = NULL, + .priority = 150 /* priority: INT_MAX >= x >= 0 */ +}; - return ret; +static int trace_die_handler(struct notifier_block *self, + unsigned long val, + void *data) +{ + switch (val) { + case DIE_OOPS: + ftrace_dump(); + break; + default: + break; + } + return NOTIFY_OK; } -__init static int tracer_alloc_buffers(void) +static struct notifier_block trace_die_notifier = { + .notifier_call = trace_die_handler, + .priority = 200 +}; + +/* + * printk is set to max of 1024, we really don't need it that big. + * Nothing should be printing 1000 characters anyway. + */ +#define TRACE_MAX_PRINT 1000 + +/* + * Define here KERN_TRACE so that we have one place to modify + * it if we decide to change what log level the ftrace dump + * should be at. + */ +#define KERN_TRACE KERN_INFO + +static void +trace_printk_seq(struct trace_seq *s) { - struct trace_array_cpu *data; - void *array; - struct page *page; - int pages = 0; - int ret = -ENOMEM; - int i; + /* Probably should print a warning here. */ + if (s->len >= 1000) + s->len = 1000; - /* TODO: make the number of buffers hot pluggable with CPUS */ - tracing_nr_buffers = num_possible_cpus(); - tracing_buffer_mask = cpu_possible_map; + /* should be zero ended, but we are paranoid. */ + s->buffer[s->len] = 0; - /* Allocate the first page for all buffers */ - for_each_tracing_cpu(i) { - data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); - max_tr.data[i] = &per_cpu(max_data, i); + printk(KERN_TRACE "%s", s->buffer); - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_buffers; - } + trace_seq_reset(s); +} + + +void ftrace_dump(void) +{ + static DEFINE_SPINLOCK(ftrace_dump_lock); + /* use static because iter can be a bit big for the stack */ + static struct trace_iterator iter; + static cpumask_t mask; + static int dump_ran; + unsigned long flags; + int cnt = 0, cpu; - /* set the array to the list */ - INIT_LIST_HEAD(&data->trace_pages); - page = virt_to_page(array); - list_add(&page->lru, &data->trace_pages); - /* use the LRU flag to differentiate the two buffers */ - ClearPageLRU(page); + /* only one dump */ + spin_lock_irqsave(&ftrace_dump_lock, flags); + if (dump_ran) + goto out; - data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + dump_ran = 1; -/* Only allocate if we are actually using the max trace */ -#ifdef CONFIG_TRACER_MAX_TRACE - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_buffers; - } + /* No turning back! */ + ftrace_kill_atomic(); - INIT_LIST_HEAD(&max_tr.data[i]->trace_pages); - page = virt_to_page(array); - list_add(&page->lru, &max_tr.data[i]->trace_pages); - SetPageLRU(page); -#endif + for_each_tracing_cpu(cpu) { + atomic_inc(&global_trace.data[cpu]->disabled); } + printk(KERN_TRACE "Dumping ftrace buffer:\n"); + + iter.tr = &global_trace; + iter.trace = current_trace; + /* - * Since we allocate by orders of pages, we may be able to - * round up a bit. + * We need to stop all tracing on all CPUS to read the + * the next buffer. This is a bit expensive, but is + * not done often. We fill all what we can read, + * and then release the locks again. */ - global_trace.entries = ENTRIES_PER_PAGE; - pages++; - while (global_trace.entries < trace_nr_entries) { - if (trace_alloc_page()) - break; - pages++; + cpus_clear(mask); + + while (!trace_empty(&iter)) { + + if (!cnt) + printk(KERN_TRACE "---------------------------------\n"); + + cnt++; + + /* reset all but tr, trace, and overruns */ + memset(&iter.seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter.iter_flags |= TRACE_FILE_LAT_FMT; + iter.pos = -1; + + if (find_next_entry_inc(&iter) != NULL) { + print_trace_line(&iter); + trace_consume(&iter); + } + + trace_printk_seq(&iter.seq); } - max_tr.entries = global_trace.entries; - pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n", - pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE); - pr_info(" actual entries %ld\n", global_trace.entries); + if (!cnt) + printk(KERN_TRACE " (ftrace buffer empty)\n"); + else + printk(KERN_TRACE "---------------------------------\n"); + + out: + spin_unlock_irqrestore(&ftrace_dump_lock, flags); +} + +__init static int tracer_alloc_buffers(void) +{ + struct trace_array_cpu *data; + int i; + + /* TODO: make the number of buffers hot pluggable with CPUS */ + tracing_buffer_mask = cpu_possible_map; + + global_trace.buffer = ring_buffer_alloc(trace_buf_size, + TRACE_BUFFER_FLAGS); + if (!global_trace.buffer) { + printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); + WARN_ON(1); + return 0; + } + global_trace.entries = ring_buffer_size(global_trace.buffer); - tracer_init_debugfs(); +#ifdef CONFIG_TRACER_MAX_TRACE + max_tr.buffer = ring_buffer_alloc(trace_buf_size, + TRACE_BUFFER_FLAGS); + if (!max_tr.buffer) { + printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); + WARN_ON(1); + ring_buffer_free(global_trace.buffer); + return 0; + } + max_tr.entries = ring_buffer_size(max_tr.buffer); + WARN_ON(max_tr.entries != global_trace.entries); +#endif + + /* Allocate the first page for all buffers */ + for_each_tracing_cpu(i) { + data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); + max_tr.data[i] = &per_cpu(max_data, i); + } trace_init_cmdlines(); - register_tracer(&no_tracer); - current_trace = &no_tracer; + register_tracer(&nop_trace); +#ifdef CONFIG_BOOT_TRACER + register_tracer(&boot_tracer); + current_trace = &boot_tracer; + current_trace->init(&global_trace); +#else + current_trace = &nop_trace; +#endif /* All seems OK, enable tracing */ global_trace.ctrl = tracer_enabled; tracing_disabled = 0; - return 0; + atomic_notifier_chain_register(&panic_notifier_list, + &trace_panic_notifier); - free_buffers: - for (i-- ; i >= 0; i--) { - struct page *page, *tmp; - struct trace_array_cpu *data = global_trace.data[i]; + register_die_notifier(&trace_die_notifier); - if (data) { - list_for_each_entry_safe(page, tmp, - &data->trace_pages, lru) { - list_del_init(&page->lru); - __free_page(page); - } - } - -#ifdef CONFIG_TRACER_MAX_TRACE - data = max_tr.data[i]; - if (data) { - list_for_each_entry_safe(page, tmp, - &data->trace_pages, lru) { - list_del_init(&page->lru); - __free_page(page); - } - } -#endif - } - return ret; + return 0; } -fs_initcall(tracer_alloc_buffers); +early_initcall(tracer_alloc_buffers); +fs_initcall(tracer_init_debugfs); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f69f867..f1f9957 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -5,7 +5,9 @@ #include <asm/atomic.h> #include <linux/sched.h> #include <linux/clocksource.h> +#include <linux/ring_buffer.h> #include <linux/mmiotrace.h> +#include <linux/ftrace.h> enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -13,38 +15,60 @@ enum trace_type { TRACE_FN, TRACE_CTX, TRACE_WAKE, + TRACE_CONT, TRACE_STACK, + TRACE_PRINT, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, + TRACE_BOOT, __TRACE_LAST_TYPE }; /* + * The trace entry - the most basic unit of tracing. This is what + * is printed in the end as a single line in the trace output, such as: + * + * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { + unsigned char type; + unsigned char cpu; + unsigned char flags; + unsigned char preempt_count; + int pid; +}; + +/* * Function trace entry - function address and parent function addres: */ struct ftrace_entry { + struct trace_entry ent; unsigned long ip; unsigned long parent_ip; }; +extern struct tracer boot_tracer; /* * Context switch trace entry - which task (and prio) we switched from/to: */ struct ctx_switch_entry { + struct trace_entry ent; unsigned int prev_pid; unsigned char prev_prio; unsigned char prev_state; unsigned int next_pid; unsigned char next_prio; unsigned char next_state; + unsigned int next_cpu; }; /* * Special (free-form) trace entry: */ struct special_entry { + struct trace_entry ent; unsigned long arg1; unsigned long arg2; unsigned long arg3; @@ -57,33 +81,60 @@ struct special_entry { #define FTRACE_STACK_ENTRIES 8 struct stack_entry { + struct trace_entry ent; unsigned long caller[FTRACE_STACK_ENTRIES]; }; /* - * The trace entry - the most basic unit of tracing. This is what - * is printed in the end as a single line in the trace output, such as: - * - * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + * ftrace_printk entry: */ -struct trace_entry { - char type; - char cpu; - char flags; - char preempt_count; - int pid; - cycle_t t; - union { - struct ftrace_entry fn; - struct ctx_switch_entry ctx; - struct special_entry special; - struct stack_entry stack; - struct mmiotrace_rw mmiorw; - struct mmiotrace_map mmiomap; - }; +struct print_entry { + struct trace_entry ent; + unsigned long ip; + char buf[]; +}; + +#define TRACE_OLD_SIZE 88 + +struct trace_field_cont { + unsigned char type; + /* Temporary till we get rid of this completely */ + char buf[TRACE_OLD_SIZE - 1]; +}; + +struct trace_mmiotrace_rw { + struct trace_entry ent; + struct mmiotrace_rw rw; }; -#define TRACE_ENTRY_SIZE sizeof(struct trace_entry) +struct trace_mmiotrace_map { + struct trace_entry ent; + struct mmiotrace_map map; +}; + +struct trace_boot { + struct trace_entry ent; + struct boot_trace initcall; +}; + +/* + * trace_flag_type is an enumeration that holds different + * states when a trace occurs. These are: + * IRQS_OFF - interrupts were disabled + * NEED_RESCED - reschedule is requested + * HARDIRQ - inside an interrupt handler + * SOFTIRQ - inside a softirq handler + * CONT - multiple entries hold the trace item + */ +enum trace_flag_type { + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED = 0x02, + TRACE_FLAG_HARDIRQ = 0x04, + TRACE_FLAG_SOFTIRQ = 0x08, + TRACE_FLAG_CONT = 0x10, +}; + +#define TRACE_BUF_SIZE 1024 /* * The CPU trace array - it consists of thousands of trace entries @@ -91,16 +142,9 @@ struct trace_entry { * the trace, etc.) */ struct trace_array_cpu { - struct list_head trace_pages; atomic_t disabled; - raw_spinlock_t lock; - struct lock_class_key lock_key; /* these fields get copied into max-trace: */ - unsigned trace_head_idx; - unsigned trace_tail_idx; - void *trace_head; /* producer */ - void *trace_tail; /* consumer */ unsigned long trace_idx; unsigned long overrun; unsigned long saved_latency; @@ -124,6 +168,7 @@ struct trace_iterator; * They have on/off state as well: */ struct trace_array { + struct ring_buffer *buffer; unsigned long entries; long ctrl; int cpu; @@ -132,6 +177,56 @@ struct trace_array { struct trace_array_cpu *data[NR_CPUS]; }; +#define FTRACE_CMP_TYPE(var, type) \ + __builtin_types_compatible_p(typeof(var), type *) + +#undef IF_ASSIGN +#define IF_ASSIGN(var, entry, etype, id) \ + if (FTRACE_CMP_TYPE(var, etype)) { \ + var = (typeof(var))(entry); \ + WARN_ON(id && (entry)->type != id); \ + break; \ + } + +/* Will cause compile errors if type is not found. */ +extern void __ftrace_bad_type(void); + +/* + * The trace_assign_type is a verifier that the entry type is + * the same as the type being assigned. To add new types simply + * add a line with the following format: + * + * IF_ASSIGN(var, ent, type, id); + * + * Where "type" is the trace type that includes the trace_entry + * as the "ent" item. And "id" is the trace identifier that is + * used in the trace_type enum. + * + * If the type can have more than one id, then use zero. + */ +#define trace_assign_type(var, ent) \ + do { \ + IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \ + IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ + IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ + IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ + IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct special_entry, 0); \ + IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ + TRACE_MMIO_RW); \ + IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ + TRACE_MMIO_MAP); \ + IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \ + __ftrace_bad_type(); \ + } while (0) + +/* Return values for print_line callback */ +enum print_line_t { + TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ + TRACE_TYPE_HANDLED = 1, + TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ +}; + /* * A specific tracer, represented by methods that operate on a trace array: */ @@ -152,7 +247,7 @@ struct tracer { int (*selftest)(struct tracer *trace, struct trace_array *tr); #endif - int (*print_line)(struct trace_iterator *iter); + enum print_line_t (*print_line)(struct trace_iterator *iter); struct tracer *next; int print_max; }; @@ -171,57 +266,58 @@ struct trace_iterator { struct trace_array *tr; struct tracer *trace; void *private; - long last_overrun[NR_CPUS]; - long overrun[NR_CPUS]; + struct ring_buffer_iter *buffer_iter[NR_CPUS]; /* The below is zeroed out in pipe_read */ struct trace_seq seq; struct trace_entry *ent; int cpu; - - struct trace_entry *prev_ent; - int prev_cpu; + u64 ts; unsigned long iter_flags; loff_t pos; - unsigned long next_idx[NR_CPUS]; - struct list_head *next_page[NR_CPUS]; - unsigned next_page_idx[NR_CPUS]; long idx; }; -void tracing_reset(struct trace_array_cpu *data); +void trace_wake_up(void); +void tracing_reset(struct trace_array *tr, int cpu); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); +struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, + struct trace_array_cpu *data); +void tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags, + int pc); + void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, - unsigned long flags); + unsigned long flags, int pc); void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *prev, struct task_struct *next, - unsigned long flags); + unsigned long flags, int pc); void tracing_record_cmdline(struct task_struct *tsk); void tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *wakee, struct task_struct *cur, - unsigned long flags); + unsigned long flags, int pc); void trace_special(struct trace_array *tr, struct trace_array_cpu *data, unsigned long arg1, unsigned long arg2, - unsigned long arg3); + unsigned long arg3, int pc); void trace_function(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, - unsigned long flags); + unsigned long flags, int pc); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); @@ -268,51 +364,33 @@ extern unsigned long ftrace_update_tot_cnt; extern int DYN_FTRACE_TEST_NAME(void); #endif -#ifdef CONFIG_MMIOTRACE -extern void __trace_mmiotrace_rw(struct trace_array *tr, - struct trace_array_cpu *data, - struct mmiotrace_rw *rw); -extern void __trace_mmiotrace_map(struct trace_array *tr, - struct trace_array_cpu *data, - struct mmiotrace_map *map); -#endif - #ifdef CONFIG_FTRACE_STARTUP_TEST -#ifdef CONFIG_FTRACE extern int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_IRQSOFF_TRACER extern int trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_PREEMPT_TRACER extern int trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr); -#endif -#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_SCHED_TRACER extern int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_CONTEXT_SWITCH_TRACER +extern int trace_selftest_startup_nop(struct tracer *trace, + struct trace_array *tr); extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_SYSPROF_TRACER extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); -#endif #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern void trace_seq_print_cont(struct trace_seq *s, + struct trace_iterator *iter); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); extern long ns2usecs(cycle_t nsec); +extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern unsigned long trace_flags; @@ -334,6 +412,9 @@ enum trace_iterator_flags { TRACE_ITER_BLOCK = 0x80, TRACE_ITER_STACKTRACE = 0x100, TRACE_ITER_SCHED_TREE = 0x200, + TRACE_ITER_PRINTK = 0x400, }; +extern struct tracer nop_trace; + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c new file mode 100644 index 0000000..d0a5e50 --- /dev/null +++ b/kernel/trace/trace_boot.c @@ -0,0 +1,126 @@ +/* + * ring buffer based initcalls tracer + * + * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> + * + */ + +#include <linux/init.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/kallsyms.h> + +#include "trace.h" + +static struct trace_array *boot_trace; +static int trace_boot_enabled; + + +/* Should be started after do_pre_smp_initcalls() in init/main.c */ +void start_boot_trace(void) +{ + trace_boot_enabled = 1; +} + +void stop_boot_trace(void) +{ + trace_boot_enabled = 0; +} + +void reset_boot_trace(struct trace_array *tr) +{ + stop_boot_trace(); +} + +static void boot_trace_init(struct trace_array *tr) +{ + int cpu; + boot_trace = tr; + + trace_boot_enabled = 0; + + for_each_cpu_mask(cpu, cpu_possible_map) + tracing_reset(tr, cpu); +} + +static void boot_trace_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_boot_trace(); + else + stop_boot_trace(); +} + +static enum print_line_t initcall_print_line(struct trace_iterator *iter) +{ + int ret; + struct trace_entry *entry = iter->ent; + struct trace_boot *field = (struct trace_boot *)entry; + struct boot_trace *it = &field->initcall; + struct trace_seq *s = &iter->seq; + struct timespec calltime = ktime_to_timespec(it->calltime); + struct timespec rettime = ktime_to_timespec(it->rettime); + + if (entry->type == TRACE_BOOT) { + ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", + calltime.tv_sec, + calltime.tv_nsec, + it->func, it->caller); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " + "returned %d after %lld msecs\n", + rettime.tv_sec, + rettime.tv_nsec, + it->func, it->result, it->duration); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; + } + return TRACE_TYPE_UNHANDLED; +} + +struct tracer boot_tracer __read_mostly = +{ + .name = "initcall", + .init = boot_trace_init, + .reset = reset_boot_trace, + .ctrl_update = boot_trace_ctrl_update, + .print_line = initcall_print_line, +}; + +void trace_boot(struct boot_trace *it, initcall_t fn) +{ + struct ring_buffer_event *event; + struct trace_boot *entry; + struct trace_array_cpu *data; + unsigned long irq_flags; + struct trace_array *tr = boot_trace; + + if (!trace_boot_enabled) + return; + + /* Get its name now since this function could + * disappear because it is in the .init section. + */ + sprint_symbol(it->func, (unsigned long)fn); + preempt_disable(); + data = tr->data[smp_processor_id()]; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + entry->ent.type = TRACE_BOOT; + entry->initcall = *it; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); + + out: + preempt_enable(); +} diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 3121448..e90eb0c 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -23,7 +23,7 @@ static void function_reset(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); + tracing_reset(tr, cpu); } static void start_function_trace(struct trace_array *tr) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index ece6cfb..a7db7f0 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -95,7 +95,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, preempt_count()); atomic_dec(&data->disabled); } @@ -130,6 +130,7 @@ check_critical_timing(struct trace_array *tr, unsigned long latency, t0, t1; cycle_t T0, T1, delta; unsigned long flags; + int pc; /* * usecs conversion is slow so we try to delay the conversion @@ -141,6 +142,8 @@ check_critical_timing(struct trace_array *tr, local_save_flags(flags); + pc = preempt_count(); + if (!report_latency(delta)) goto out; @@ -150,7 +153,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out_unlock; - trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); latency = nsecs_to_usecs(delta); @@ -173,8 +176,8 @@ out_unlock: out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); - tracing_reset(data); - trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); + tracing_reset(tr, cpu); + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); } static inline void @@ -203,11 +206,11 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); data->critical_start = parent_ip ? : ip; - tracing_reset(data); + tracing_reset(tr, cpu); local_save_flags(flags); - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, preempt_count()); per_cpu(tracing_cpu, cpu) = 1; @@ -234,14 +237,14 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) data = tr->data[cpu]; - if (unlikely(!data) || unlikely(!head_page(data)) || + if (unlikely(!data) || !data->critical_start || atomic_read(&data->disabled)) return; atomic_inc(&data->disabled); local_save_flags(flags); - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, preempt_count()); check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index b13dc19..f284846 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -27,7 +27,7 @@ static void mmio_reset_data(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); + tracing_reset(tr, cpu); } static void mmio_trace_init(struct trace_array *tr) @@ -130,10 +130,14 @@ static unsigned long count_overruns(struct trace_iterator *iter) { int cpu; unsigned long cnt = 0; +/* FIXME: */ +#if 0 for_each_online_cpu(cpu) { cnt += iter->overrun[cpu]; iter->overrun[cpu] = 0; } +#endif + (void)cpu; return cnt; } @@ -171,17 +175,21 @@ print_out: return (ret == -EBUSY) ? 0 : ret; } -static int mmio_print_rw(struct trace_iterator *iter) +static enum print_line_t mmio_print_rw(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; - struct mmiotrace_rw *rw = &entry->mmiorw; + struct trace_mmiotrace_rw *field; + struct mmiotrace_rw *rw; struct trace_seq *s = &iter->seq; - unsigned long long t = ns2usecs(entry->t); + unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, 1000000ULL); unsigned secs = (unsigned long)t; int ret = 1; - switch (entry->mmiorw.opcode) { + trace_assign_type(field, entry); + rw = &field->rw; + + switch (rw->opcode) { case MMIO_READ: ret = trace_seq_printf(s, "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", @@ -209,21 +217,25 @@ static int mmio_print_rw(struct trace_iterator *iter) break; } if (ret) - return 1; - return 0; + return TRACE_TYPE_HANDLED; + return TRACE_TYPE_PARTIAL_LINE; } -static int mmio_print_map(struct trace_iterator *iter) +static enum print_line_t mmio_print_map(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; - struct mmiotrace_map *m = &entry->mmiomap; + struct trace_mmiotrace_map *field; + struct mmiotrace_map *m; struct trace_seq *s = &iter->seq; - unsigned long long t = ns2usecs(entry->t); + unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, 1000000ULL); unsigned secs = (unsigned long)t; - int ret = 1; + int ret; - switch (entry->mmiorw.opcode) { + trace_assign_type(field, entry); + m = &field->map; + + switch (m->opcode) { case MMIO_PROBE: ret = trace_seq_printf(s, "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", @@ -241,20 +253,43 @@ static int mmio_print_map(struct trace_iterator *iter) break; } if (ret) - return 1; - return 0; + return TRACE_TYPE_HANDLED; + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t mmio_print_mark(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct print_entry *print = (struct print_entry *)entry; + const char *msg = print->buf; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret; + + /* The trailing newline must be in the message. */ + ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + + return TRACE_TYPE_HANDLED; } -/* return 0 to abort printing without consuming current entry in pipe mode */ -static int mmio_print_line(struct trace_iterator *iter) +static enum print_line_t mmio_print_line(struct trace_iterator *iter) { switch (iter->ent->type) { case TRACE_MMIO_RW: return mmio_print_rw(iter); case TRACE_MMIO_MAP: return mmio_print_map(iter); + case TRACE_PRINT: + return mmio_print_mark(iter); default: - return 1; /* ignore unknown entries */ + return TRACE_TYPE_HANDLED; /* ignore unknown entries */ } } @@ -276,6 +311,27 @@ __init static int init_mmio_trace(void) } device_initcall(init_mmio_trace); +static void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_rw *rw) +{ + struct ring_buffer_event *event; + struct trace_mmiotrace_rw *entry; + unsigned long irq_flags; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, preempt_count()); + entry->ent.type = TRACE_MMIO_RW; + entry->rw = *rw; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); +} + void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; @@ -283,6 +339,27 @@ void mmio_trace_rw(struct mmiotrace_rw *rw) __trace_mmiotrace_rw(tr, data, rw); } +static void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_map *map) +{ + struct ring_buffer_event *event; + struct trace_mmiotrace_map *entry; + unsigned long irq_flags; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, preempt_count()); + entry->ent.type = TRACE_MMIO_MAP; + entry->map = *map; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); +} + void mmio_trace_mapping(struct mmiotrace_map *map) { struct trace_array *tr = mmio_trace_array; @@ -293,3 +370,8 @@ void mmio_trace_mapping(struct mmiotrace_map *map) __trace_mmiotrace_map(tr, data, map); preempt_enable(); } + +int mmio_trace_printk(const char *fmt, va_list args) +{ + return trace_vprintk(0, fmt, args); +} diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c new file mode 100644 index 0000000..4592b48 --- /dev/null +++ b/kernel/trace/trace_nop.c @@ -0,0 +1,64 @@ +/* + * nop tracer + * + * Copyright (C) 2008 Steven Noonan <steven@uplinklabs.net> + * + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> + +#include "trace.h" + +static struct trace_array *ctx_trace; + +static void start_nop_trace(struct trace_array *tr) +{ + /* Nothing to do! */ +} + +static void stop_nop_trace(struct trace_array *tr) +{ + /* Nothing to do! */ +} + +static void nop_trace_init(struct trace_array *tr) +{ + int cpu; + ctx_trace = tr; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + if (tr->ctrl) + start_nop_trace(tr); +} + +static void nop_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_nop_trace(tr); +} + +static void nop_trace_ctrl_update(struct trace_array *tr) +{ + /* When starting a new trace, reset the buffers */ + if (tr->ctrl) + start_nop_trace(tr); + else + stop_nop_trace(tr); +} + +struct tracer nop_trace __read_mostly = +{ + .name = "nop", + .init = nop_trace_init, + .reset = nop_trace_reset, + .ctrl_update = nop_trace_ctrl_update, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_nop, +#endif +}; + diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index cb817a2..b8f56be 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -9,8 +9,8 @@ #include <linux/debugfs.h> #include <linux/kallsyms.h> #include <linux/uaccess.h> -#include <linux/marker.h> #include <linux/ftrace.h> +#include <trace/sched.h> #include "trace.h" @@ -19,15 +19,16 @@ static int __read_mostly tracer_enabled; static atomic_t sched_ref; static void -sched_switch_func(void *private, void *__rq, struct task_struct *prev, +probe_sched_switch(struct rq *__rq, struct task_struct *prev, struct task_struct *next) { - struct trace_array **ptr = private; - struct trace_array *tr = *ptr; struct trace_array_cpu *data; unsigned long flags; - long disabled; int cpu; + int pc; + + if (!atomic_read(&sched_ref)) + return; tracing_record_cmdline(prev); tracing_record_cmdline(next); @@ -35,97 +36,41 @@ sched_switch_func(void *private, void *__rq, struct task_struct *prev, if (!tracer_enabled) return; + pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + data = ctx_trace->data[cpu]; - if (likely(disabled == 1)) - tracing_sched_switch_trace(tr, data, prev, next, flags); + if (likely(!atomic_read(&data->disabled))) + tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc); - atomic_dec(&data->disabled); local_irq_restore(flags); } -static notrace void -sched_switch_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct task_struct *prev; - struct task_struct *next; - struct rq *__rq; - - if (!atomic_read(&sched_ref)) - return; - - /* skip prev_pid %d next_pid %d prev_state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, int); - (void)va_arg(*args, long); - __rq = va_arg(*args, typeof(__rq)); - prev = va_arg(*args, typeof(prev)); - next = va_arg(*args, typeof(next)); - - /* - * If tracer_switch_func only points to the local - * switch func, it still needs the ptr passed to it. - */ - sched_switch_func(probe_data, __rq, prev, next); -} - static void -wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct - task_struct *curr) +probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) { - struct trace_array **ptr = private; - struct trace_array *tr = *ptr; struct trace_array_cpu *data; unsigned long flags; - long disabled; - int cpu; + int cpu, pc; - if (!tracer_enabled) + if (!likely(tracer_enabled)) return; - tracing_record_cmdline(curr); + pc = preempt_count(); + tracing_record_cmdline(current); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + data = ctx_trace->data[cpu]; - if (likely(disabled == 1)) - tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); + if (likely(!atomic_read(&data->disabled))) + tracing_sched_wakeup_trace(ctx_trace, data, wakee, current, + flags, pc); - atomic_dec(&data->disabled); local_irq_restore(flags); } -static notrace void -wake_up_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct task_struct *curr; - struct task_struct *task; - struct rq *__rq; - - if (likely(!tracer_enabled)) - return; - - /* Skip pid %d state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, long); - /* now get the meat: "rq %p task %p rq->curr %p" */ - __rq = va_arg(*args, typeof(__rq)); - task = va_arg(*args, typeof(task)); - curr = va_arg(*args, typeof(curr)); - - tracing_record_cmdline(task); - tracing_record_cmdline(curr); - - wakeup_func(probe_data, __rq, task, curr); -} - static void sched_switch_reset(struct trace_array *tr) { int cpu; @@ -133,67 +78,47 @@ static void sched_switch_reset(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); + tracing_reset(tr, cpu); } static int tracing_sched_register(void) { int ret; - ret = marker_probe_register("kernel_sched_wakeup", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &ctx_trace); + ret = register_trace_sched_wakeup(probe_sched_wakeup); if (ret) { - pr_info("wakeup trace: Couldn't add marker" + pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return ret; } - ret = marker_probe_register("kernel_sched_wakeup_new", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &ctx_trace); + ret = register_trace_sched_wakeup_new(probe_sched_wakeup); if (ret) { - pr_info("wakeup trace: Couldn't add marker" + pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } - ret = marker_probe_register("kernel_sched_schedule", - "prev_pid %d next_pid %d prev_state %ld " - "## rq %p prev %p next %p", - sched_switch_callback, - &ctx_trace); + ret = register_trace_sched_switch(probe_sched_switch); if (ret) { - pr_info("sched trace: Couldn't add marker" + pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_schedule\n"); goto fail_deprobe_wake_new; } return ret; fail_deprobe_wake_new: - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &ctx_trace); + unregister_trace_sched_wakeup_new(probe_sched_wakeup); fail_deprobe: - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &ctx_trace); + unregister_trace_sched_wakeup(probe_sched_wakeup); return ret; } static void tracing_sched_unregister(void) { - marker_probe_unregister("kernel_sched_schedule", - sched_switch_callback, - &ctx_trace); - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &ctx_trace); - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &ctx_trace); + unregister_trace_sched_switch(probe_sched_switch); + unregister_trace_sched_wakeup_new(probe_sched_wakeup); + unregister_trace_sched_wakeup(probe_sched_wakeup); } static void tracing_start_sched_switch(void) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e303ccb..fe4a252 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,7 +15,7 @@ #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/ftrace.h> -#include <linux/marker.h> +#include <trace/sched.h> #include "trace.h" @@ -44,10 +44,12 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) long disabled; int resched; int cpu; + int pc; if (likely(!wakeup_task)) return; + pc = preempt_count(); resched = need_resched(); preempt_disable_notrace(); @@ -70,7 +72,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) if (task_cpu(wakeup_task) != cpu) goto unlock; - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, pc); unlock: __raw_spin_unlock(&wakeup_lock); @@ -112,17 +114,18 @@ static int report_latency(cycle_t delta) } static void notrace -wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, +probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { unsigned long latency = 0, t0 = 0, t1 = 0; - struct trace_array **ptr = private; - struct trace_array *tr = *ptr; struct trace_array_cpu *data; cycle_t T0, T1, delta; unsigned long flags; long disabled; int cpu; + int pc; + + tracing_record_cmdline(prev); if (unlikely(!tracer_enabled)) return; @@ -139,12 +142,14 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, if (next != wakeup_task) return; + pc = preempt_count(); + /* The task we are waiting for is waking up */ - data = tr->data[wakeup_cpu]; + data = wakeup_trace->data[wakeup_cpu]; /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&tr->data[cpu]->disabled); + disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); if (likely(disabled != 1)) goto out; @@ -155,7 +160,7 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; - trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); + trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc); /* * usecs conversion is slow so we try to delay the conversion @@ -174,39 +179,14 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, t0 = nsecs_to_usecs(T0); t1 = nsecs_to_usecs(T1); - update_max_tr(tr, wakeup_task, wakeup_cpu); + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); out_unlock: - __wakeup_reset(tr); + __wakeup_reset(wakeup_trace); __raw_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: - atomic_dec(&tr->data[cpu]->disabled); -} - -static notrace void -sched_switch_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct task_struct *prev; - struct task_struct *next; - struct rq *__rq; - - /* skip prev_pid %d next_pid %d prev_state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, int); - (void)va_arg(*args, long); - __rq = va_arg(*args, typeof(__rq)); - prev = va_arg(*args, typeof(prev)); - next = va_arg(*args, typeof(next)); - - tracing_record_cmdline(prev); - - /* - * If tracer_switch_func only points to the local - * switch func, it still needs the ptr passed to it. - */ - wakeup_sched_switch(probe_data, __rq, prev, next); + atomic_dec(&wakeup_trace->data[cpu]->disabled); } static void __wakeup_reset(struct trace_array *tr) @@ -216,7 +196,7 @@ static void __wakeup_reset(struct trace_array *tr) for_each_possible_cpu(cpu) { data = tr->data[cpu]; - tracing_reset(data); + tracing_reset(tr, cpu); } wakeup_cpu = -1; @@ -240,19 +220,26 @@ static void wakeup_reset(struct trace_array *tr) } static void -wakeup_check_start(struct trace_array *tr, struct task_struct *p, - struct task_struct *curr) +probe_wakeup(struct rq *rq, struct task_struct *p) { int cpu = smp_processor_id(); unsigned long flags; long disabled; + int pc; + + if (likely(!tracer_enabled)) + return; + + tracing_record_cmdline(p); + tracing_record_cmdline(current); if (likely(!rt_task(p)) || p->prio >= wakeup_prio || - p->prio >= curr->prio) + p->prio >= current->prio) return; - disabled = atomic_inc_return(&tr->data[cpu]->disabled); + pc = preempt_count(); + disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); if (unlikely(disabled != 1)) goto out; @@ -264,7 +251,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, goto out_locked; /* reset the trace */ - __wakeup_reset(tr); + __wakeup_reset(wakeup_trace); wakeup_cpu = task_cpu(p); wakeup_prio = p->prio; @@ -274,74 +261,37 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, local_save_flags(flags); - tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); - trace_function(tr, tr->data[wakeup_cpu], - CALLER_ADDR1, CALLER_ADDR2, flags); + wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); + trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu], + CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: __raw_spin_unlock(&wakeup_lock); out: - atomic_dec(&tr->data[cpu]->disabled); -} - -static notrace void -wake_up_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct trace_array **ptr = probe_data; - struct trace_array *tr = *ptr; - struct task_struct *curr; - struct task_struct *task; - struct rq *__rq; - - if (likely(!tracer_enabled)) - return; - - /* Skip pid %d state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, long); - /* now get the meat: "rq %p task %p rq->curr %p" */ - __rq = va_arg(*args, typeof(__rq)); - task = va_arg(*args, typeof(task)); - curr = va_arg(*args, typeof(curr)); - - tracing_record_cmdline(task); - tracing_record_cmdline(curr); - - wakeup_check_start(tr, task, curr); + atomic_dec(&wakeup_trace->data[cpu]->disabled); } static void start_wakeup_tracer(struct trace_array *tr) { int ret; - ret = marker_probe_register("kernel_sched_wakeup", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &wakeup_trace); + ret = register_trace_sched_wakeup(probe_wakeup); if (ret) { - pr_info("wakeup trace: Couldn't add marker" + pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return; } - ret = marker_probe_register("kernel_sched_wakeup_new", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &wakeup_trace); + ret = register_trace_sched_wakeup_new(probe_wakeup); if (ret) { - pr_info("wakeup trace: Couldn't add marker" + pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } - ret = marker_probe_register("kernel_sched_schedule", - "prev_pid %d next_pid %d prev_state %ld " - "## rq %p prev %p next %p", - sched_switch_callback, - &wakeup_trace); + ret = register_trace_sched_switch(probe_wakeup_sched_switch); if (ret) { - pr_info("sched trace: Couldn't add marker" + pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_schedule\n"); goto fail_deprobe_wake_new; } @@ -363,28 +313,18 @@ static void start_wakeup_tracer(struct trace_array *tr) return; fail_deprobe_wake_new: - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &wakeup_trace); + unregister_trace_sched_wakeup_new(probe_wakeup); fail_deprobe: - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &wakeup_trace); + unregister_trace_sched_wakeup(probe_wakeup); } static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; unregister_ftrace_function(&trace_ops); - marker_probe_unregister("kernel_sched_schedule", - sched_switch_callback, - &wakeup_trace); - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &wakeup_trace); - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &wakeup_trace); + unregister_trace_sched_switch(probe_wakeup_sched_switch); + unregister_trace_sched_wakeup_new(probe_wakeup); + unregister_trace_sched_wakeup(probe_wakeup); } static void wakeup_tracer_init(struct trace_array *tr) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 0911b7e..09cf230 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -9,65 +9,29 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_FN: case TRACE_CTX: case TRACE_WAKE: + case TRACE_CONT: case TRACE_STACK: + case TRACE_PRINT: case TRACE_SPECIAL: return 1; } return 0; } -static int -trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) +static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) { - struct trace_entry *entries; - struct page *page; - int idx = 0; - int i; + struct ring_buffer_event *event; + struct trace_entry *entry; - BUG_ON(list_empty(&data->trace_pages)); - page = list_entry(data->trace_pages.next, struct page, lru); - entries = page_address(page); + while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { + entry = ring_buffer_event_data(event); - check_pages(data); - if (head_page(data) != entries) - goto failed; - - /* - * The starting trace buffer always has valid elements, - * if any element exists. - */ - entries = head_page(data); - - for (i = 0; i < tr->entries; i++) { - - if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) { + if (!trace_valid_entry(entry)) { printk(KERN_CONT ".. invalid entry %d ", - entries[idx].type); + entry->type); goto failed; } - - idx++; - if (idx >= ENTRIES_PER_PAGE) { - page = virt_to_page(entries); - if (page->lru.next == &data->trace_pages) { - if (i != tr->entries - 1) { - printk(KERN_CONT ".. entries buffer mismatch"); - goto failed; - } - } else { - page = list_entry(page->lru.next, struct page, lru); - entries = page_address(page); - } - idx = 0; - } } - - page = virt_to_page(entries); - if (page->lru.next != &data->trace_pages) { - printk(KERN_CONT ".. too many entries"); - goto failed; - } - return 0; failed: @@ -89,13 +53,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) /* Don't allow flipping of max traces now */ raw_local_irq_save(flags); __raw_spin_lock(&ftrace_max_lock); - for_each_possible_cpu(cpu) { - if (!head_page(tr->data[cpu])) - continue; - cnt += tr->data[cpu]->trace_idx; + cnt = ring_buffer_entries(tr->buffer); - ret = trace_test_buffer_cpu(tr, tr->data[cpu]); + for_each_possible_cpu(cpu) { + ret = trace_test_buffer_cpu(tr, cpu); if (ret) break; } @@ -120,11 +82,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, struct trace_array *tr, int (*func)(void)) { - unsigned long count; - int ret; int save_ftrace_enabled = ftrace_enabled; int save_tracer_enabled = tracer_enabled; + unsigned long count; char *func_name; + int ret; /* The ftrace test PASSED */ printk(KERN_CONT "PASSED\n"); @@ -157,6 +119,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* enable tracing */ tr->ctrl = 1; trace->init(tr); + /* Sleep for a 1/10 of a second */ msleep(100); @@ -212,10 +175,10 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) { - unsigned long count; - int ret; int save_ftrace_enabled = ftrace_enabled; int save_tracer_enabled = tracer_enabled; + unsigned long count; + int ret; /* make sure msleep has been recorded */ msleep(1); @@ -415,6 +378,15 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } #endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ +#ifdef CONFIG_NOP_TRACER +int +trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) +{ + /* What could possibly go wrong? */ + return 0; +} +#endif + #ifdef CONFIG_SCHED_TRACER static int trace_wakeup_test_thread(void *data) { @@ -486,6 +458,9 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) wake_up_process(p); + /* give a little time to let the thread wake up */ + msleep(100); + /* stop the tracing. */ tr->ctrl = 0; trace->ctrl_update(tr); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c new file mode 100644 index 0000000..74c5d9a --- /dev/null +++ b/kernel/trace/trace_stack.c @@ -0,0 +1,310 @@ +/* + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + * + */ +#include <linux/stacktrace.h> +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <linux/uaccess.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/fs.h> +#include "trace.h" + +#define STACK_TRACE_ENTRIES 500 + +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = + { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; +static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; + +static struct stack_trace max_stack_trace = { + .max_entries = STACK_TRACE_ENTRIES, + .entries = stack_dump_trace, +}; + +static unsigned long max_stack_size; +static raw_spinlock_t max_stack_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +static int stack_trace_disabled __read_mostly; +static DEFINE_PER_CPU(int, trace_active); + +static inline void check_stack(void) +{ + unsigned long this_size, flags; + unsigned long *p, *top, *start; + int i; + + this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); + this_size = THREAD_SIZE - this_size; + + if (this_size <= max_stack_size) + return; + + raw_local_irq_save(flags); + __raw_spin_lock(&max_stack_lock); + + /* a race could have already updated it */ + if (this_size <= max_stack_size) + goto out; + + max_stack_size = this_size; + + max_stack_trace.nr_entries = 0; + max_stack_trace.skip = 3; + + save_stack_trace(&max_stack_trace); + + /* + * Now find where in the stack these are. + */ + i = 0; + start = &this_size; + top = (unsigned long *) + (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); + + /* + * Loop through all the entries. One of the entries may + * for some reason be missed on the stack, so we may + * have to account for them. If they are all there, this + * loop will only happen once. This code only takes place + * on a new max, so it is far from a fast path. + */ + while (i < max_stack_trace.nr_entries) { + + stack_dump_index[i] = this_size; + p = start; + + for (; p < top && i < max_stack_trace.nr_entries; p++) { + if (*p == stack_dump_trace[i]) { + this_size = stack_dump_index[i++] = + (top - p) * sizeof(unsigned long); + /* Start the search from here */ + start = p + 1; + } + } + + i++; + } + + out: + __raw_spin_unlock(&max_stack_lock); + raw_local_irq_restore(flags); +} + +static void +stack_trace_call(unsigned long ip, unsigned long parent_ip) +{ + int cpu, resched; + + if (unlikely(!ftrace_enabled || stack_trace_disabled)) + return; + + resched = need_resched(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + /* no atomic needed, we only modify this variable by this cpu */ + if (per_cpu(trace_active, cpu)++ != 0) + goto out; + + check_stack(); + + out: + per_cpu(trace_active, cpu)--; + /* prevent recursion in schedule */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = stack_trace_call, +}; + +static ssize_t +stack_max_size_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned long *ptr = filp->private_data; + char buf[64]; + int r; + + r = snprintf(buf, sizeof(buf), "%ld\n", *ptr); + if (r > sizeof(buf)) + r = sizeof(buf); + return simple_read_from_buffer(ubuf, count, ppos, buf, r); +} + +static ssize_t +stack_max_size_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + long *ptr = filp->private_data; + unsigned long val, flags; + char buf[64]; + int ret; + + if (count >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, count)) + return -EFAULT; + + buf[count] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + raw_local_irq_save(flags); + __raw_spin_lock(&max_stack_lock); + *ptr = val; + __raw_spin_unlock(&max_stack_lock); + raw_local_irq_restore(flags); + + return count; +} + +static struct file_operations stack_max_size_fops = { + .open = tracing_open_generic, + .read = stack_max_size_read, + .write = stack_max_size_write, +}; + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + long i = (long)m->private; + + (*pos)++; + + i++; + + if (i >= max_stack_trace.nr_entries || + stack_dump_trace[i] == ULONG_MAX) + return NULL; + + m->private = (void *)i; + + return &m->private; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + void *t = &m->private; + loff_t l = 0; + + local_irq_disable(); + __raw_spin_lock(&max_stack_lock); + + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ + __raw_spin_unlock(&max_stack_lock); + local_irq_enable(); +} + +static int trace_lookup_stack(struct seq_file *m, long i) +{ + unsigned long addr = stack_dump_trace[i]; +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + + sprint_symbol(str, addr); + + return seq_printf(m, "%s\n", str); +#else + return seq_printf(m, "%p\n", (void*)addr); +#endif +} + +static int t_show(struct seq_file *m, void *v) +{ + long i = *(long *)v; + int size; + + if (i < 0) { + seq_printf(m, " Depth Size Location" + " (%d entries)\n" + " ----- ---- --------\n", + max_stack_trace.nr_entries); + return 0; + } + + if (i >= max_stack_trace.nr_entries || + stack_dump_trace[i] == ULONG_MAX) + return 0; + + if (i+1 == max_stack_trace.nr_entries || + stack_dump_trace[i+1] == ULONG_MAX) + size = stack_dump_index[i]; + else + size = stack_dump_index[i] - stack_dump_index[i+1]; + + seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size); + + trace_lookup_stack(m, i); + + return 0; +} + +static struct seq_operations stack_trace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int stack_trace_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &stack_trace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = (void *)-1; + } + + return ret; +} + +static struct file_operations stack_trace_fops = { + .open = stack_trace_open, + .read = seq_read, + .llseek = seq_lseek, +}; + +static __init int stack_trace_init(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("stack_max_size", 0644, d_tracer, + &max_stack_size, &stack_max_size_fops); + if (!entry) + pr_warning("Could not create debugfs 'stack_max_size' entry\n"); + + entry = debugfs_create_file("stack_trace", 0444, d_tracer, + NULL, &stack_trace_fops); + if (!entry) + pr_warning("Could not create debugfs 'stack_trace' entry\n"); + + register_ftrace_function(&trace_ops); + + return 0; +} + +device_initcall(stack_trace_init); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index db58fb6..9587d3b 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -241,7 +241,7 @@ static void stack_reset(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); + tracing_reset(tr, cpu); } static void start_stack_trace(struct trace_array *tr) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c new file mode 100644 index 0000000..f2b7c28 --- /dev/null +++ b/kernel/tracepoint.c @@ -0,0 +1,477 @@ +/* + * Copyright (C) 2008 Mathieu Desnoyers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/types.h> +#include <linux/jhash.h> +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/tracepoint.h> +#include <linux/err.h> +#include <linux/slab.h> + +extern struct tracepoint __start___tracepoints[]; +extern struct tracepoint __stop___tracepoints[]; + +/* Set to 1 to enable tracepoint debug output */ +static const int tracepoint_debug; + +/* + * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the + * builtin and module tracepoints and the hash table. + */ +static DEFINE_MUTEX(tracepoints_mutex); + +/* + * Tracepoint hash table, containing the active tracepoints. + * Protected by tracepoints_mutex. + */ +#define TRACEPOINT_HASH_BITS 6 +#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) + +/* + * Note about RCU : + * It is used to to delay the free of multiple probes array until a quiescent + * state is reached. + * Tracepoint entries modifications are protected by the tracepoints_mutex. + */ +struct tracepoint_entry { + struct hlist_node hlist; + void **funcs; + int refcount; /* Number of times armed. 0 if disarmed. */ + struct rcu_head rcu; + void *oldptr; + unsigned char rcu_pending:1; + char name[0]; +}; + +static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; + +static void free_old_closure(struct rcu_head *head) +{ + struct tracepoint_entry *entry = container_of(head, + struct tracepoint_entry, rcu); + kfree(entry->oldptr); + /* Make sure we free the data before setting the pending flag to 0 */ + smp_wmb(); + entry->rcu_pending = 0; +} + +static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) +{ + if (!old) + return; + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu_sched(&entry->rcu, free_old_closure); +} + +static void debug_print_probes(struct tracepoint_entry *entry) +{ + int i; + + if (!tracepoint_debug) + return; + + for (i = 0; entry->funcs[i]; i++) + printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); +} + +static void * +tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) +{ + int nr_probes = 0; + void **old, **new; + + WARN_ON(!probe); + + debug_print_probes(entry); + old = entry->funcs; + if (old) { + /* (N -> N+1), (N != 0, 1) probes */ + for (nr_probes = 0; old[nr_probes]; nr_probes++) + if (old[nr_probes] == probe) + return ERR_PTR(-EEXIST); + } + /* + 2 : one for new probe, one for NULL func */ + new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + if (old) + memcpy(new, old, nr_probes * sizeof(void *)); + new[nr_probes] = probe; + entry->refcount = nr_probes + 1; + entry->funcs = new; + debug_print_probes(entry); + return old; +} + +static void * +tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) +{ + int nr_probes = 0, nr_del = 0, i; + void **old, **new; + + old = entry->funcs; + + debug_print_probes(entry); + /* (N -> M), (N > 1, M >= 0) probes */ + for (nr_probes = 0; old[nr_probes]; nr_probes++) { + if ((!probe || old[nr_probes] == probe)) + nr_del++; + } + + if (nr_probes - nr_del == 0) { + /* N -> 0, (N > 1) */ + entry->funcs = NULL; + entry->refcount = 0; + debug_print_probes(entry); + return old; + } else { + int j = 0; + /* N -> M, (N > 1, M > 0) */ + /* + 1 for NULL */ + new = kzalloc((nr_probes - nr_del + 1) + * sizeof(void *), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + for (i = 0; old[i]; i++) + if ((probe && old[i] != probe)) + new[j++] = old[i]; + entry->refcount = nr_probes - nr_del; + entry->funcs = new; + } + debug_print_probes(entry); + return old; +} + +/* + * Get tracepoint if the tracepoint is present in the tracepoint hash table. + * Must be called with tracepoints_mutex held. + * Returns NULL if not present. + */ +static struct tracepoint_entry *get_tracepoint(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct tracepoint_entry *e; + u32 hash = jhash(name, strlen(name), 0); + + head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) + return e; + } + return NULL; +} + +/* + * Add the tracepoint to the tracepoint hash table. Must be called with + * tracepoints_mutex held. + */ +static struct tracepoint_entry *add_tracepoint(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct tracepoint_entry *e; + size_t name_len = strlen(name) + 1; + u32 hash = jhash(name, name_len-1, 0); + + head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) { + printk(KERN_NOTICE + "tracepoint %s busy\n", name); + return ERR_PTR(-EEXIST); /* Already there */ + } + } + /* + * Using kmalloc here to allocate a variable length element. Could + * cause some memory fragmentation if overused. + */ + e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + memcpy(&e->name[0], name, name_len); + e->funcs = NULL; + e->refcount = 0; + e->rcu_pending = 0; + hlist_add_head(&e->hlist, head); + return e; +} + +/* + * Remove the tracepoint from the tracepoint hash table. Must be called with + * mutex_lock held. + */ +static int remove_tracepoint(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct tracepoint_entry *e; + int found = 0; + size_t len = strlen(name) + 1; + u32 hash = jhash(name, len-1, 0); + + head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) { + found = 1; + break; + } + } + if (!found) + return -ENOENT; + if (e->refcount) + return -EBUSY; + hlist_del(&e->hlist); + /* Make sure the call_rcu_sched has been executed */ + if (e->rcu_pending) + rcu_barrier_sched(); + kfree(e); + return 0; +} + +/* + * Sets the probe callback corresponding to one tracepoint. + */ +static void set_tracepoint(struct tracepoint_entry **entry, + struct tracepoint *elem, int active) +{ + WARN_ON(strcmp((*entry)->name, elem->name) != 0); + + /* + * rcu_assign_pointer has a smp_wmb() which makes sure that the new + * probe callbacks array is consistent before setting a pointer to it. + * This array is referenced by __DO_TRACE from + * include/linux/tracepoints.h. A matching smp_read_barrier_depends() + * is used. + */ + rcu_assign_pointer(elem->funcs, (*entry)->funcs); + elem->state = active; +} + +/* + * Disable a tracepoint and its probe callback. + * Note: only waiting an RCU period after setting elem->call to the empty + * function insures that the original callback is not used anymore. This insured + * by preempt_disable around the call site. + */ +static void disable_tracepoint(struct tracepoint *elem) +{ + elem->state = 0; +} + +/** + * tracepoint_update_probe_range - Update a probe range + * @begin: beginning of the range + * @end: end of the range + * + * Updates the probe callback corresponding to a range of tracepoints. + */ +void tracepoint_update_probe_range(struct tracepoint *begin, + struct tracepoint *end) +{ + struct tracepoint *iter; + struct tracepoint_entry *mark_entry; + + mutex_lock(&tracepoints_mutex); + for (iter = begin; iter < end; iter++) { + mark_entry = get_tracepoint(iter->name); + if (mark_entry) { + set_tracepoint(&mark_entry, iter, + !!mark_entry->refcount); + } else { + disable_tracepoint(iter); + } + } + mutex_unlock(&tracepoints_mutex); +} + +/* + * Update probes, removing the faulty probes. + */ +static void tracepoint_update_probes(void) +{ + /* Core kernel tracepoints */ + tracepoint_update_probe_range(__start___tracepoints, + __stop___tracepoints); + /* tracepoints in modules. */ + module_update_tracepoints(); +} + +/** + * tracepoint_probe_register - Connect a probe to a tracepoint + * @name: tracepoint name + * @probe: probe handler + * + * Returns 0 if ok, error value on error. + * The probe address must at least be aligned on the architecture pointer size. + */ +int tracepoint_probe_register(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + int ret = 0; + void *old; + + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + if (!entry) { + entry = add_tracepoint(name); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + goto end; + } + } + /* + * If we detect that a call_rcu_sched is pending for this tracepoint, + * make sure it's executed now. + */ + if (entry->rcu_pending) + rcu_barrier_sched(); + old = tracepoint_entry_add_probe(entry, probe); + if (IS_ERR(old)) { + ret = PTR_ERR(old); + goto end; + } + mutex_unlock(&tracepoints_mutex); + tracepoint_update_probes(); /* may update entry */ + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + WARN_ON(!entry); + if (entry->rcu_pending) + rcu_barrier_sched(); + tracepoint_entry_free_old(entry, old); +end: + mutex_unlock(&tracepoints_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_register); + +/** + * tracepoint_probe_unregister - Disconnect a probe from a tracepoint + * @name: tracepoint name + * @probe: probe function pointer + * + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. + */ +int tracepoint_probe_unregister(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + void *old; + int ret = -ENOENT; + + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + if (!entry) + goto end; + if (entry->rcu_pending) + rcu_barrier_sched(); + old = tracepoint_entry_remove_probe(entry, probe); + mutex_unlock(&tracepoints_mutex); + tracepoint_update_probes(); /* may update entry */ + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + if (!entry) + goto end; + if (entry->rcu_pending) + rcu_barrier_sched(); + tracepoint_entry_free_old(entry, old); + remove_tracepoint(name); /* Ignore busy error message */ + ret = 0; +end: + mutex_unlock(&tracepoints_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); + +/** + * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. + * @tracepoint: current tracepoints (in), next tracepoint (out) + * @begin: beginning of the range + * @end: end of the range + * + * Returns whether a next tracepoint has been found (1) or not (0). + * Will return the first tracepoint in the range if the input tracepoint is + * NULL. + */ +int tracepoint_get_iter_range(struct tracepoint **tracepoint, + struct tracepoint *begin, struct tracepoint *end) +{ + if (!*tracepoint && begin != end) { + *tracepoint = begin; + return 1; + } + if (*tracepoint >= begin && *tracepoint < end) + return 1; + return 0; +} +EXPORT_SYMBOL_GPL(tracepoint_get_iter_range); + +static void tracepoint_get_iter(struct tracepoint_iter *iter) +{ + int found = 0; + + /* Core kernel tracepoints */ + if (!iter->module) { + found = tracepoint_get_iter_range(&iter->tracepoint, + __start___tracepoints, __stop___tracepoints); + if (found) + goto end; + } + /* tracepoints in modules. */ + found = module_get_iter_tracepoints(iter); +end: + if (!found) + tracepoint_iter_reset(iter); +} + +void tracepoint_iter_start(struct tracepoint_iter *iter) +{ + tracepoint_get_iter(iter); +} +EXPORT_SYMBOL_GPL(tracepoint_iter_start); + +void tracepoint_iter_next(struct tracepoint_iter *iter) +{ + iter->tracepoint++; + /* + * iter->tracepoint may be invalid because we blindly incremented it. + * Make sure it is valid by marshalling on the tracepoints, getting the + * tracepoints from following modules if necessary. + */ + tracepoint_get_iter(iter); +} +EXPORT_SYMBOL_GPL(tracepoint_iter_next); + +void tracepoint_iter_stop(struct tracepoint_iter *iter) +{ +} +EXPORT_SYMBOL_GPL(tracepoint_iter_stop); + +void tracepoint_iter_reset(struct tracepoint_iter *iter) +{ + iter->module = NULL; + iter->tracepoint = NULL; +} +EXPORT_SYMBOL_GPL(tracepoint_iter_reset); diff --git a/mm/memory.c b/mm/memory.c index 3a6c4a6..164951c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -64,8 +64,6 @@ #include "internal.h" -#include "internal.h" - #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index 8d7a27a..3e67d57 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -95,6 +95,7 @@ put_dentry: put_memory: return ERR_PTR(error); } +EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_zero_setup - setup a shared anonymous mapping diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 712ae47..65ae576 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -17,7 +17,6 @@ #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/debugobjects.h> -#include <linux/vmalloc.h> #include <linux/kallsyms.h> #include <linux/list.h> #include <linux/rbtree.h> @@ -175,6 +174,21 @@ static int vmap_page_range(unsigned long addr, unsigned long end, return nr; } +static inline int is_vmalloc_or_module_addr(const void *x) +{ + /* + * x86-64 and sparc64 put modules in a special place, + * and fall back on vmalloc() if that fails. Others + * just put it in the vmalloc space. + */ +#if defined(CONFIG_MODULES) && defined(MODULES_VADDR) + unsigned long addr = (unsigned long)x; + if (addr >= MODULES_VADDR && addr < MODULES_END) + return 1; +#endif + return is_vmalloc_addr(x); +} + /* * Walk a vmap address to the struct page it maps. */ @@ -188,8 +202,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) * XXX we might need to change this if we add VIRTUAL_BUG_ON for * architectures that do not vmalloc module space */ - VIRTUAL_BUG_ON(!is_vmalloc_addr(vmalloc_addr) && - !is_module_address(addr)); + VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); if (!pgd_none(*pgd)) { pud_t *pud = pud_offset(pgd, addr); diff --git a/samples/Kconfig b/samples/Kconfig index e1fb471..4b02f5a 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -13,6 +13,12 @@ config SAMPLE_MARKERS help This build markers example modules. +config SAMPLE_TRACEPOINTS + tristate "Build tracepoints examples -- loadable modules only" + depends on TRACEPOINTS && m + help + This build tracepoints example modules. + config SAMPLE_KOBJECT tristate "Build kobject examples" help diff --git a/samples/Makefile b/samples/Makefile index 2e02575..10eaca8 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,3 +1,3 @@ # Makefile for Linux samples code -obj-$(CONFIG_SAMPLES) += markers/ kobject/ kprobes/ +obj-$(CONFIG_SAMPLES) += markers/ kobject/ kprobes/ tracepoints/ diff --git a/samples/markers/probe-example.c b/samples/markers/probe-example.c index c8e099d..2dfb3b3 100644 --- a/samples/markers/probe-example.c +++ b/samples/markers/probe-example.c @@ -81,6 +81,7 @@ static void __exit probe_fini(void) probe_array[i].probe_func, &probe_array[i]); printk(KERN_INFO "Number of event b : %u\n", atomic_read(&eventb_count)); + marker_synchronize_unregister(); } module_init(probe_init); diff --git a/samples/tracepoints/Makefile b/samples/tracepoints/Makefile new file mode 100644 index 0000000..36479ad --- /dev/null +++ b/samples/tracepoints/Makefile @@ -0,0 +1,6 @@ +# builds the tracepoint example kernel modules; +# then to use one (as root): insmod <module_name.ko> + +obj-$(CONFIG_SAMPLE_TRACEPOINTS) += tracepoint-sample.o +obj-$(CONFIG_SAMPLE_TRACEPOINTS) += tracepoint-probe-sample.o +obj-$(CONFIG_SAMPLE_TRACEPOINTS) += tracepoint-probe-sample2.o diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h new file mode 100644 index 0000000..0216b55 --- /dev/null +++ b/samples/tracepoints/tp-samples-trace.h @@ -0,0 +1,13 @@ +#ifndef _TP_SAMPLES_TRACE_H +#define _TP_SAMPLES_TRACE_H + +#include <linux/proc_fs.h> /* for struct inode and struct file */ +#include <linux/tracepoint.h> + +DEFINE_TRACE(subsys_event, + TPPROTO(struct inode *inode, struct file *file), + TPARGS(inode, file)); +DEFINE_TRACE(subsys_eventb, + TPPROTO(void), + TPARGS()); +#endif diff --git a/samples/tracepoints/tracepoint-probe-sample.c b/samples/tracepoints/tracepoint-probe-sample.c new file mode 100644 index 0000000..55abfdd --- /dev/null +++ b/samples/tracepoints/tracepoint-probe-sample.c @@ -0,0 +1,55 @@ +/* + * tracepoint-probe-sample.c + * + * sample tracepoint probes. + */ + +#include <linux/module.h> +#include <linux/file.h> +#include <linux/dcache.h> +#include "tp-samples-trace.h" + +/* + * Here the caller only guarantees locking for struct file and struct inode. + * Locking must therefore be done in the probe to use the dentry. + */ +static void probe_subsys_event(struct inode *inode, struct file *file) +{ + path_get(&file->f_path); + dget(file->f_path.dentry); + printk(KERN_INFO "Event is encountered with filename %s\n", + file->f_path.dentry->d_name.name); + dput(file->f_path.dentry); + path_put(&file->f_path); +} + +static void probe_subsys_eventb(void) +{ + printk(KERN_INFO "Event B is encountered\n"); +} + +int __init tp_sample_trace_init(void) +{ + int ret; + + ret = register_trace_subsys_event(probe_subsys_event); + WARN_ON(ret); + ret = register_trace_subsys_eventb(probe_subsys_eventb); + WARN_ON(ret); + + return 0; +} + +module_init(tp_sample_trace_init); + +void __exit tp_sample_trace_exit(void) +{ + unregister_trace_subsys_eventb(probe_subsys_eventb); + unregister_trace_subsys_event(probe_subsys_event); +} + +module_exit(tp_sample_trace_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Mathieu Desnoyers"); +MODULE_DESCRIPTION("Tracepoint Probes Samples"); diff --git a/samples/tracepoints/tracepoint-probe-sample2.c b/samples/tracepoints/tracepoint-probe-sample2.c new file mode 100644 index 0000000..5e9fcf4 --- /dev/null +++ b/samples/tracepoints/tracepoint-probe-sample2.c @@ -0,0 +1,42 @@ +/* + * tracepoint-probe-sample2.c + * + * 2nd sample tracepoint probes. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include "tp-samples-trace.h" + +/* + * Here the caller only guarantees locking for struct file and struct inode. + * Locking must therefore be done in the probe to use the dentry. + */ +static void probe_subsys_event(struct inode *inode, struct file *file) +{ + printk(KERN_INFO "Event is encountered with inode number %lu\n", + inode->i_ino); +} + +int __init tp_sample_trace_init(void) +{ + int ret; + + ret = register_trace_subsys_event(probe_subsys_event); + WARN_ON(ret); + + return 0; +} + +module_init(tp_sample_trace_init); + +void __exit tp_sample_trace_exit(void) +{ + unregister_trace_subsys_event(probe_subsys_event); +} + +module_exit(tp_sample_trace_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Mathieu Desnoyers"); +MODULE_DESCRIPTION("Tracepoint Probes Samples"); diff --git a/samples/tracepoints/tracepoint-sample.c b/samples/tracepoints/tracepoint-sample.c new file mode 100644 index 0000000..4ae4b7f --- /dev/null +++ b/samples/tracepoints/tracepoint-sample.c @@ -0,0 +1,53 @@ +/* tracepoint-sample.c + * + * Executes a tracepoint when /proc/tracepoint-example is opened. + * + * (C) Copyright 2007 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + * + * This file is released under the GPLv2. + * See the file COPYING for more details. + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/proc_fs.h> +#include "tp-samples-trace.h" + +struct proc_dir_entry *pentry_example; + +static int my_open(struct inode *inode, struct file *file) +{ + int i; + + trace_subsys_event(inode, file); + for (i = 0; i < 10; i++) + trace_subsys_eventb(); + return -EPERM; +} + +static struct file_operations mark_ops = { + .open = my_open, +}; + +static int example_init(void) +{ + printk(KERN_ALERT "example init\n"); + pentry_example = proc_create("tracepoint-example", 0444, NULL, + &mark_ops); + if (!pentry_example) + return -EPERM; + return 0; +} + +static void example_exit(void) +{ + printk(KERN_ALERT "example exit\n"); + remove_proc_entry("tracepoint-example", NULL); +} + +module_init(example_init) +module_exit(example_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Mathieu Desnoyers"); +MODULE_DESCRIPTION("Tracepoint example"); diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 277cfe0..5ed4cbf 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -198,10 +198,17 @@ cmd_modversions = \ fi; endif +ifdef CONFIG_FTRACE_MCOUNT_RECORD +cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl \ + "$(ARCH)" "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" \ + "$(MV)" "$(@)"; +endif + define rule_cc_o_c $(call echo-cmd,checksrc) $(cmd_checksrc) \ $(call echo-cmd,cc_o_c) $(cmd_cc_o_c); \ $(cmd_modversions) \ + $(cmd_record_mcount) \ scripts/basic/fixdep $(depfile) $@ '$(call make-cmd,cc_o_c)' > \ $(dot-target).tmp; \ rm -f $(depfile); \ diff --git a/scripts/bootgraph.pl b/scripts/bootgraph.pl index 2243353..5e7316e 100644 --- a/scripts/bootgraph.pl +++ b/scripts/bootgraph.pl @@ -37,13 +37,13 @@ # dmesg | perl scripts/bootgraph.pl > output.svg # -my @rows; -my %start, %end, %row; +my %start, %end; my $done = 0; -my $rowcount = 0; my $maxtime = 0; my $firsttime = 100; my $count = 0; +my %pids; + while (<>) { my $line = $_; if ($line =~ /([0-9\.]+)\] calling ([a-zA-Z0-9\_]+)\+/) { @@ -54,14 +54,8 @@ while (<>) { $firsttime = $1; } } - $row{$func} = 1; if ($line =~ /\@ ([0-9]+)/) { - my $pid = $1; - if (!defined($rows[$pid])) { - $rowcount = $rowcount + 1; - $rows[$pid] = $rowcount; - } - $row{$func} = $rows[$pid]; + $pids{$func} = $1; } $count = $count + 1; } @@ -109,17 +103,25 @@ $styles[11] = "fill:rgb(128,255,255);fill-opacity:0.5;stroke-width:1;stroke:rgb( my $mult = 950.0 / ($maxtime - $firsttime); my $threshold = ($maxtime - $firsttime) / 60.0; my $stylecounter = 0; +my %rows; +my $rowscount = 1; while (($key,$value) = each %start) { my $duration = $end{$key} - $start{$key}; if ($duration >= $threshold) { my $s, $s2, $e, $y; + $pid = $pids{$key}; + + if (!defined($rows{$pid})) { + $rows{$pid} = $rowscount; + $rowscount = $rowscount + 1; + } $s = ($value - $firsttime) * $mult; $s2 = $s + 6; $e = ($end{$key} - $firsttime) * $mult; $w = $e - $s; - $y = $row{$key} * 150; + $y = $rows{$pid} * 150; $y2 = $y + 4; $style = $styles[$stylecounter]; diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e30bac1..f88bb3e 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# (c) 2001, Dave Jones. <davej@codemonkey.org.uk> (the file handling bit) +# (c) 2001, Dave Jones. <davej@redhat.com> (the file handling bit) # (c) 2005, Joel Schopp <jschopp@austin.ibm.com> (the ugly bit) # (c) 2007, Andy Whitcroft <apw@uk.ibm.com> (new conditions, test suite, etc) # Licensed under the terms of the GNU GPL License version 2 diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl new file mode 100755 index 0000000..f56d760 --- /dev/null +++ b/scripts/recordmcount.pl @@ -0,0 +1,395 @@ +#!/usr/bin/perl -w +# (c) 2008, Steven Rostedt <srostedt@redhat.com> +# Licensed under the terms of the GNU GPL License version 2 +# +# recordmcount.pl - makes a section called __mcount_loc that holds +# all the offsets to the calls to mcount. +# +# +# What we want to end up with is a section in vmlinux called +# __mcount_loc that contains a list of pointers to all the +# call sites in the kernel that call mcount. Later on boot up, the kernel +# will read this list, save the locations and turn them into nops. +# When tracing or profiling is later enabled, these locations will then +# be converted back to pointers to some function. +# +# This is no easy feat. This script is called just after the original +# object is compiled and before it is linked. +# +# The references to the call sites are offsets from the section of text +# that the call site is in. Hence, all functions in a section that +# has a call site to mcount, will have the offset from the beginning of +# the section and not the beginning of the function. +# +# The trick is to find a way to record the beginning of the section. +# The way we do this is to look at the first function in the section +# which will also be the location of that section after final link. +# e.g. +# +# .section ".text.sched" +# .globl my_func +# my_func: +# [...] +# call mcount (offset: 0x5) +# [...] +# ret +# other_func: +# [...] +# call mcount (offset: 0x1b) +# [...] +# +# Both relocation offsets for the mcounts in the above example will be +# offset from .text.sched. If we make another file called tmp.s with: +# +# .section __mcount_loc +# .quad my_func + 0x5 +# .quad my_func + 0x1b +# +# We can then compile this tmp.s into tmp.o, and link it to the original +# object. +# +# But this gets hard if my_func is not globl (a static function). +# In such a case we have: +# +# .section ".text.sched" +# my_func: +# [...] +# call mcount (offset: 0x5) +# [...] +# ret +# .globl my_func +# other_func: +# [...] +# call mcount (offset: 0x1b) +# [...] +# +# If we make the tmp.s the same as above, when we link together with +# the original object, we will end up with two symbols for my_func: +# one local, one global. After final compile, we will end up with +# an undefined reference to my_func. +# +# Since local objects can reference local variables, we need to find +# a way to make tmp.o reference the local objects of the original object +# file after it is linked together. To do this, we convert the my_func +# into a global symbol before linking tmp.o. Then after we link tmp.o +# we will only have a single symbol for my_func that is global. +# We can convert my_func back into a local symbol and we are done. +# +# Here are the steps we take: +# +# 1) Record all the local symbols by using 'nm' +# 2) Use objdump to find all the call site offsets and sections for +# mcount. +# 3) Compile the list into its own object. +# 4) Do we have to deal with local functions? If not, go to step 8. +# 5) Make an object that converts these local functions to global symbols +# with objcopy. +# 6) Link together this new object with the list object. +# 7) Convert the local functions back to local symbols and rename +# the result as the original object. +# End. +# 8) Link the object with the list object. +# 9) Move the result back to the original object. +# End. +# + +use strict; + +my $P = $0; +$P =~ s@.*/@@g; + +my $V = '0.1'; + +if ($#ARGV < 6) { + print "usage: $P arch objdump objcopy cc ld nm rm mv inputfile\n"; + print "version: $V\n"; + exit(1); +} + +my ($arch, $objdump, $objcopy, $cc, $ld, $nm, $rm, $mv, $inputfile) = @ARGV; + +$objdump = "objdump" if ((length $objdump) == 0); +$objcopy = "objcopy" if ((length $objcopy) == 0); +$cc = "gcc" if ((length $cc) == 0); +$ld = "ld" if ((length $ld) == 0); +$nm = "nm" if ((length $nm) == 0); +$rm = "rm" if ((length $rm) == 0); +$mv = "mv" if ((length $mv) == 0); + +#print STDERR "running: $P '$arch' '$objdump' '$objcopy' '$cc' '$ld' " . +# "'$nm' '$rm' '$mv' '$inputfile'\n"; + +my %locals; # List of local (static) functions +my %weak; # List of weak functions +my %convert; # List of local functions used that needs conversion + +my $type; +my $section_regex; # Find the start of a section +my $function_regex; # Find the name of a function + # (return offset and func name) +my $mcount_regex; # Find the call site to mcount (return offset) + +if ($arch eq "x86_64") { + $section_regex = "Disassembly of section"; + $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; + $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount([+-]0x[0-9a-zA-Z]+)?\$"; + $type = ".quad"; + + # force flags for this arch + $ld .= " -m elf_x86_64"; + $objdump .= " -M x86-64"; + $objcopy .= " -O elf64-x86-64"; + $cc .= " -m64"; + +} elsif ($arch eq "i386") { + $section_regex = "Disassembly of section"; + $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:"; + $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$"; + $type = ".long"; + + # force flags for this arch + $ld .= " -m elf_i386"; + $objdump .= " -M i386"; + $objcopy .= " -O elf32-i386"; + $cc .= " -m32"; + +} else { + die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD"; +} + +my $text_found = 0; +my $read_function = 0; +my $opened = 0; +my $mcount_section = "__mcount_loc"; + +my $dirname; +my $filename; +my $prefix; +my $ext; + +if ($inputfile =~ m,^(.*)/([^/]*)$,) { + $dirname = $1; + $filename = $2; +} else { + $dirname = "."; + $filename = $inputfile; +} + +if ($filename =~ m,^(.*)(\.\S),) { + $prefix = $1; + $ext = $2; +} else { + $prefix = $filename; + $ext = ""; +} + +my $mcount_s = $dirname . "/.tmp_mc_" . $prefix . ".s"; +my $mcount_o = $dirname . "/.tmp_mc_" . $prefix . ".o"; + +# +# --globalize-symbols came out in 2.17, we must test the version +# of objcopy, and if it is less than 2.17, then we can not +# record local functions. +my $use_locals = 01; +my $local_warn_once = 0; +my $found_version = 0; + +open (IN, "$objcopy --version |") || die "error running $objcopy"; +while (<IN>) { + if (/objcopy.*\s(\d+)\.(\d+)/) { + my $major = $1; + my $minor = $2; + + $found_version = 1; + if ($major < 2 || + ($major == 2 && $minor < 17)) { + $use_locals = 0; + } + last; + } +} +close (IN); + +if (!$found_version) { + print STDERR "WARNING: could not find objcopy version.\n" . + "\tDisabling local function references.\n"; +} + + +# +# Step 1: find all the local (static functions) and weak symbols. +# 't' is local, 'w/W' is weak (we never use a weak function) +# +open (IN, "$nm $inputfile|") || die "error running $nm"; +while (<IN>) { + if (/^[0-9a-fA-F]+\s+t\s+(\S+)/) { + $locals{$1} = 1; + } elsif (/^[0-9a-fA-F]+\s+([wW])\s+(\S+)/) { + $weak{$2} = $1; + } +} +close(IN); + +my @offsets; # Array of offsets of mcount callers +my $ref_func; # reference function to use for offsets +my $offset = 0; # offset of ref_func to section beginning + +## +# update_funcs - print out the current mcount callers +# +# Go through the list of offsets to callers and write them to +# the output file in a format that can be read by an assembler. +# +sub update_funcs +{ + return if ($#offsets < 0); + + defined($ref_func) || die "No function to reference"; + + # A section only had a weak function, to represent it. + # Unfortunately, a weak function may be overwritten by another + # function of the same name, making all these offsets incorrect. + # To be safe, we simply print a warning and bail. + if (defined $weak{$ref_func}) { + print STDERR + "$inputfile: WARNING: referencing weak function" . + " $ref_func for mcount\n"; + return; + } + + # is this function static? If so, note this fact. + if (defined $locals{$ref_func}) { + + # only use locals if objcopy supports globalize-symbols + if (!$use_locals) { + return; + } + $convert{$ref_func} = 1; + } + + # Loop through all the mcount caller offsets and print a reference + # to the caller based from the ref_func. + for (my $i=0; $i <= $#offsets; $i++) { + if (!$opened) { + open(FILE, ">$mcount_s") || die "can't create $mcount_s\n"; + $opened = 1; + print FILE "\t.section $mcount_section,\"a\",\@progbits\n"; + } + printf FILE "\t%s %s + %d\n", $type, $ref_func, $offsets[$i] - $offset; + } +} + +# +# Step 2: find the sections and mcount call sites +# +open(IN, "$objdump -dr $inputfile|") || die "error running $objdump"; + +my $text; + +while (<IN>) { + # is it a section? + if (/$section_regex/) { + $read_function = 1; + # print out any recorded offsets + update_funcs() if ($text_found); + + # reset all markers and arrays + $text_found = 0; + undef($ref_func); + undef(@offsets); + + # section found, now is this a start of a function? + } elsif ($read_function && /$function_regex/) { + $text_found = 1; + $offset = hex $1; + $text = $2; + + # if this is either a local function or a weak function + # keep looking for functions that are global that + # we can use safely. + if (!defined($locals{$text}) && !defined($weak{$text})) { + $ref_func = $text; + $read_function = 0; + } else { + # if we already have a function, and this is weak, skip it + if (!defined($ref_func) || !defined($weak{$text})) { + $ref_func = $text; + } + } + } + + # is this a call site to mcount? If so, record it to print later + if ($text_found && /$mcount_regex/) { + $offsets[$#offsets + 1] = hex $1; + } +} + +# dump out anymore offsets that may have been found +update_funcs() if ($text_found); + +# If we did not find any mcount callers, we are done (do nothing). +if (!$opened) { + exit(0); +} + +close(FILE); + +# +# Step 3: Compile the file that holds the list of call sites to mcount. +# +`$cc -o $mcount_o -c $mcount_s`; + +my @converts = keys %convert; + +# +# Step 4: Do we have sections that started with local functions? +# +if ($#converts >= 0) { + my $globallist = ""; + my $locallist = ""; + + foreach my $con (@converts) { + $globallist .= " --globalize-symbol $con"; + $locallist .= " --localize-symbol $con"; + } + + my $globalobj = $dirname . "/.tmp_gl_" . $filename; + my $globalmix = $dirname . "/.tmp_mx_" . $filename; + + # + # Step 5: set up each local function as a global + # + `$objcopy $globallist $inputfile $globalobj`; + + # + # Step 6: Link the global version to our list. + # + `$ld -r $globalobj $mcount_o -o $globalmix`; + + # + # Step 7: Convert the local functions back into local symbols + # + `$objcopy $locallist $globalmix $inputfile`; + + # Remove the temp files + `$rm $globalobj $globalmix`; + +} else { + + my $mix = $dirname . "/.tmp_mx_" . $filename; + + # + # Step 8: Link the object with our list of call sites object. + # + `$ld -r $inputfile $mcount_o -o $mix`; + + # + # Step 9: Move the result back to the original object. + # + `$mv $mix $inputfile`; +} + +# Clean up the temp files +`$rm $mcount_o $mcount_s`; + +exit(0); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 576e511..3e3fde7 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -75,6 +75,7 @@ #include <linux/string.h> #include <linux/selinux.h> #include <linux/mutex.h> +#include <linux/posix-timers.h> #include "avc.h" #include "objsec.h" @@ -2322,13 +2323,7 @@ static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm) initrlim = init_task.signal->rlim+i; rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); } - if (current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { - /* - * This will cause RLIMIT_CPU calculations - * to be refigured. - */ - current->it_prof_expires = jiffies_to_cputime(1); - } + update_rlimit_cpu(rlim->rlim_cur); } /* Wake up the parent if it is waiting so that it can |