path: root/arch/x86
author     Andrew Dodd <atd7@cornell.edu>  2013-02-16 18:41:04 -0500
committer  Andrew Dodd <atd7@cornell.edu>  2013-02-27 09:19:08 -0500
commit     b08797f2afdfc604c3143f8725d058aeef8ddcb2 (patch)
tree       c59e963bd6931d4e9f9526034ab402cc551f18ae /arch/x86
parent     cbfae70f1dcaf3cc6e93061179dad80caa1597fe (diff)
parent     54ea5b40f067cf098cac639973c6628c6944cfb2 (diff)
download   kernel_samsung_smdk4412-b08797f2afdfc604c3143f8725d058aeef8ddcb2.zip
           kernel_samsung_smdk4412-b08797f2afdfc604c3143f8725d058aeef8ddcb2.tar.gz
           kernel_samsung_smdk4412-b08797f2afdfc604c3143f8725d058aeef8ddcb2.tar.bz2
Merge remote-tracking branch 'kernelorg/linux-3.0.y' into 3_0_64
Conflicts:
	arch/arm/Kconfig
	arch/arm/include/asm/hwcap.h
	arch/arm/kernel/smp.c
	arch/arm/plat-samsung/adc.c
	drivers/gpu/drm/i915/i915_reg.h
	drivers/gpu/drm/i915/intel_drv.h
	drivers/mmc/core/sd.c
	drivers/net/tun.c
	drivers/net/usb/usbnet.c
	drivers/regulator/max8997.c
	drivers/usb/core/hub.c
	drivers/usb/host/xhci.h
	drivers/usb/serial/qcserial.c
	fs/jbd2/transaction.c
	include/linux/migrate.h
	kernel/sys.c
	kernel/time/timekeeping.c
	lib/genalloc.c
	mm/memory-failure.c
	mm/memory_hotplug.c
	mm/mempolicy.c
	mm/page_alloc.c
	mm/vmalloc.c
	mm/vmscan.c
	mm/vmstat.c
	scripts/Kbuild.include

Change-Id: I91e2d85c07320c7ccfc04cf98a448e89bed6ade6
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig                           |  9
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S          |  6
-rw-r--r--  arch/x86/ia32/ia32entry.S                  |  4
-rw-r--r--  arch/x86/include/asm/archrandom.h          | 75
-rw-r--r--  arch/x86/include/asm/cpufeature.h          |  2
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h      | 50
-rw-r--r--  arch/x86/include/asm/pgtable.h             | 11
-rw-r--r--  arch/x86/include/asm/processor.h           |  2
-rw-r--r--  arch/x86/include/asm/ptrace.h              | 15
-rw-r--r--  arch/x86/include/asm/system.h              |  7
-rw-r--r--  arch/x86/include/asm/traps.h               | 26
-rw-r--r--  arch/x86/kernel/acpi/boot.c                | 27
-rw-r--r--  arch/x86/kernel/alternative.c              |  4
-rw-r--r--  arch/x86/kernel/amd_nb.c                   |  8
-rw-r--r--  arch/x86/kernel/cpu/Makefile               |  1
-rw-r--r--  arch/x86/kernel/cpu/amd.c                  | 30
-rw-r--r--  arch/x86/kernel/cpu/common.c               |  7
-rw-r--r--  arch/x86/kernel/cpu/intel.c                |  2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c  | 16
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c           | 11
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c       | 60
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c   | 29
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c       | 11
-rw-r--r--  arch/x86/kernel/cpu/proc.c                 |  4
-rw-r--r--  arch/x86/kernel/cpu/rdrand.c               | 73
-rw-r--r--  arch/x86/kernel/cpu/scattered.c            |  2
-rw-r--r--  arch/x86/kernel/entry_32.S                 |  9
-rw-r--r--  arch/x86/kernel/entry_64.S                 |  2
-rw-r--r--  arch/x86/kernel/hpet.c                     |  4
-rw-r--r--  arch/x86/kernel/microcode_amd.c            |  4
-rw-r--r--  arch/x86/kernel/microcode_core.c           | 31
-rw-r--r--  arch/x86/kernel/msr.c                      |  3
-rw-r--r--  arch/x86/kernel/process.c                  | 24
-rw-r--r--  arch/x86/kernel/ptrace.c                   | 30
-rw-r--r--  arch/x86/kernel/reboot.c                   |  8
-rw-r--r--  arch/x86/kernel/setup.c                    | 97
-rw-r--r--  arch/x86/kernel/setup_percpu.c             | 14
-rw-r--r--  arch/x86/mm/hugetlbpage.c                  | 21
-rw-r--r--  arch/x86/mm/init.c                         | 69
-rw-r--r--  arch/x86/oprofile/nmi_int.c                |  2
-rw-r--r--  arch/x86/pci/fixup.c                       | 17
-rw-r--r--  arch/x86/platform/efi/efi_64.c             | 22
-rw-r--r--  arch/x86/xen/enlighten.c                   | 33
-rw-r--r--  arch/x86/xen/mmu.c                         |  7
-rw-r--r--  arch/x86/xen/p2m.c                         | 36
-rw-r--r--  arch/x86/xen/setup.c                       |  4
46 files changed, 733 insertions(+), 196 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 37357a5..a0e9bda 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1451,6 +1451,15 @@ config ARCH_USES_PG_UNCACHED
def_bool y
depends on X86_PAT
+config ARCH_RANDOM
+ def_bool y
+ prompt "x86 architectural random number generator" if EXPERT
+ ---help---
+ Enable the x86 architectural RDRAND instruction
+ (Intel Bull Mountain technology) to generate random numbers.
+ If supported, this is a high bandwidth, cryptographically
+ secure hardware random number generator.
+
config EFI
bool "EFI runtime service support"
depends on ACPI
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index be6d9e3..3470624 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -2460,10 +2460,12 @@ ENTRY(aesni_cbc_dec)
pxor IN3, STATE4
movaps IN4, IV
#else
- pxor (INP), STATE2
- pxor 0x10(INP), STATE3
pxor IN1, STATE4
movaps IN2, IV
+ movups (INP), IN1
+ pxor IN1, STATE2
+ movups 0x10(INP), IN2
+ pxor IN2, STATE3
#endif
movups STATE1, (OUTP)
movups STATE2, 0x10(OUTP)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index c1870dd..26af1e3 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -208,7 +208,7 @@ sysexit_from_sys_call:
testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
jnz ia32_ret_from_sys_call
TRACE_IRQS_ON
- sti
+ ENABLE_INTERRUPTS(CLBR_NONE)
movl %eax,%esi /* second arg, syscall return value */
cmpl $0,%eax /* is it < 0? */
setl %al /* 1 if so, 0 if not */
@@ -218,7 +218,7 @@ sysexit_from_sys_call:
GET_THREAD_INFO(%r10)
movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
- cli
+ DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl %edi,TI_flags(%r10)
jz \exit
diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h
new file mode 100644
index 0000000..0d9ec77
--- /dev/null
+++ b/arch/x86/include/asm/archrandom.h
@@ -0,0 +1,75 @@
+/*
+ * This file is part of the Linux kernel.
+ *
+ * Copyright (c) 2011, Intel Corporation
+ * Authors: Fenghua Yu <fenghua.yu@intel.com>,
+ * H. Peter Anvin <hpa@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#ifndef ASM_X86_ARCHRANDOM_H
+#define ASM_X86_ARCHRANDOM_H
+
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative.h>
+#include <asm/nops.h>
+
+#define RDRAND_RETRY_LOOPS 10
+
+#define RDRAND_INT ".byte 0x0f,0xc7,0xf0"
+#ifdef CONFIG_X86_64
+# define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0"
+#else
+# define RDRAND_LONG RDRAND_INT
+#endif
+
+#ifdef CONFIG_ARCH_RANDOM
+
+#define GET_RANDOM(name, type, rdrand, nop) \
+static inline int name(type *v) \
+{ \
+ int ok; \
+ alternative_io("movl $0, %0\n\t" \
+ nop, \
+ "\n1: " rdrand "\n\t" \
+ "jc 2f\n\t" \
+ "decl %0\n\t" \
+ "jnz 1b\n\t" \
+ "2:", \
+ X86_FEATURE_RDRAND, \
+ ASM_OUTPUT2("=r" (ok), "=a" (*v)), \
+ "0" (RDRAND_RETRY_LOOPS)); \
+ return ok; \
+}
+
+#ifdef CONFIG_X86_64
+
+GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP5);
+GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP4);
+
+#else
+
+GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP3);
+GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3);
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* CONFIG_ARCH_RANDOM */
+
+extern void x86_init_rdrand(struct cpuinfo_x86 *c);
+
+#endif /* ASM_X86_ARCHRANDOM_H */
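For reference, once the alternatives machinery selects the RDRAND path, the generated arch_get_random_long() boils down to the retry loop defined above. A minimal userspace analogue, for illustration only and not part of this patch (assumes an x86-64 gcc/clang build and checks CPUID before issuing the instruction):

#include <stdio.h>
#include <cpuid.h>

#define RDRAND_RETRY_LOOPS 10

/* Retry RDRAND a bounded number of times; non-zero return means *v is valid. */
static int rdrand_long(unsigned long *v)
{
	int ok;
	asm volatile("1: .byte 0x48,0x0f,0xc7,0xf0\n\t"	/* rdrand %%rax */
		     "jc 2f\n\t"
		     "decl %0\n\t"
		     "jnz 1b\n\t"
		     "2:"
		     : "=r" (ok), "=a" (*v)
		     : "0" (RDRAND_RETRY_LOOPS));
	return ok;
}

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned long r;

	/* CPUID.01H:ECX bit 30 advertises RDRAND; do not issue it blindly. */
	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx) || !(ecx & (1u << 30))) {
		puts("RDRAND not supported on this CPU");
		return 0;
	}

	if (rdrand_long(&r))
		printf("rdrand: %#lx\n", r);
	else
		puts("RDRAND present but returned no data");
	return 0;
}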
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 71cc380..c5d941f 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -173,7 +173,7 @@
#define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */
#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */
#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */
-#define X86_FEATURE_DTS (7*32+ 7) /* Digital Thermal Sensor */
+#define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index effff47..43876f1 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -31,6 +31,56 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
ptep->pte_low = pte.pte_low;
}
+#define pmd_read_atomic pmd_read_atomic
+/*
+ * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
+ * a "*pmdp" dereference done by gcc. Problem is, in certain places
+ * where pte_offset_map_lock is called, concurrent page faults are
+ * allowed, if the mmap_sem is held for reading. An example is mincore
+ * vs page faults vs MADV_DONTNEED. On the page fault side
+ * pmd_populate rightfully does a set_64bit, but if we're reading the
+ * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
+ * because gcc will not read the 64bit of the pmd atomically. To fix
+ * this all places running pmd_offset_map_lock() while holding the
+ * mmap_sem in read mode, shall read the pmdp pointer using this
+ * function to know if the pmd is null or not, and in turn to know if
+ * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
+ * operations.
+ *
+ * Without THP if the mmap_sem is held for reading, the
+ * pmd can only transition from null to not null while pmd_read_atomic runs.
+ * So there's no need of literally reading it atomically.
+ *
+ * With THP if the mmap_sem is held for reading, the pmd can become
+ * THP or null or point to a pte (and in turn become "stable") at any
+ * time under pmd_read_atomic, so it's mandatory to read it atomically
+ * with cmpxchg8b.
+ */
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+{
+ pmdval_t ret;
+ u32 *tmp = (u32 *)pmdp;
+
+ ret = (pmdval_t) (*tmp);
+ if (ret) {
+ /*
+ * If the low part is null, we must not read the high part
+ * or we can end up with a partial pmd.
+ */
+ smp_rmb();
+ ret |= ((pmdval_t)*(tmp + 1)) << 32;
+ }
+
+ return (pmd_t) { ret };
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+{
+ return (pmd_t) { atomic64_read((atomic64_t *)pmdp) };
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
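The non-THP pmd_read_atomic() above relies on ordering rather than a true 64-bit atomic read: the low word is sampled first, and the high word is read only if the low word was non-zero. A rough single-threaded userspace sketch of that protocol, for illustration only and not part of the patch (a real SMP reader additionally needs the smp_rmb() shown above):

#include <stdint.h>
#include <stdio.h>

union pmd_like {			/* stand-in for a PAE pmd_t */
	uint64_t whole;
	uint32_t half[2];		/* x86 is little-endian: half[0] is the low word */
};

static uint64_t read_low_then_high(const union pmd_like *p)
{
	uint64_t ret = p->half[0];	/* low word first */

	if (ret) {
		/* only look at the high word once the low word is non-zero;
		 * on SMP a read barrier belongs here */
		ret |= (uint64_t)p->half[1] << 32;
	}
	return ret;
}

int main(void)
{
	union pmd_like set  = { .whole = 0x1234567890abcdefULL };
	union pmd_like none = { .whole = 0 };

	printf("populated: %#llx\n", (unsigned long long)read_low_then_high(&set));
	printf("none:      %#llx\n", (unsigned long long)read_low_then_high(&none));
	return 0;
}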
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 18601c8..884507e 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -146,8 +146,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
static inline int pmd_large(pmd_t pte)
{
- return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
- (_PAGE_PSE | _PAGE_PRESENT);
+ return pmd_flags(pte) & _PAGE_PSE;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -415,7 +414,13 @@ static inline int pte_hidden(pte_t pte)
static inline int pmd_present(pmd_t pmd)
{
- return pmd_flags(pmd) & _PAGE_PRESENT;
+ /*
+ * Checking for _PAGE_PSE is needed too because
+ * split_huge_page will temporarily clear the present bit (but
+ * the _PAGE_PSE flag will remain set at all times while the
+ * _PAGE_PRESENT bit is clear).
+ */
+ return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
}
static inline int pmd_none(pmd_t pmd)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5d9c61d..e5f7248 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -99,7 +99,6 @@ struct cpuinfo_x86 {
u16 apicid;
u16 initial_apicid;
u16 x86_clflush_size;
-#ifdef CONFIG_SMP
/* number of cores as seen by the OS: */
u16 booted_cores;
/* Physical processor id: */
@@ -110,7 +109,6 @@ struct cpuinfo_x86 {
u8 compute_unit_id;
/* Index into per_cpu list: */
u16 cpu_index;
-#endif
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
#define X86_VENDOR_INTEL 0
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 94e7618..f332d64 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -187,21 +187,14 @@ static inline int v8086_mode(struct pt_regs *regs)
#endif
}
-/*
- * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
- * when it traps. The previous stack will be directly underneath the saved
- * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'.
- *
- * This is valid only for kernel mode traps.
- */
-static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
-{
#ifdef CONFIG_X86_32
- return (unsigned long)(&regs->sp);
+extern unsigned long kernel_stack_pointer(struct pt_regs *regs);
#else
+static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
+{
return regs->sp;
-#endif
}
+#endif
#define GET_IP(regs) ((regs)->ip)
#define GET_FP(regs) ((regs)->bp)
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index c2ff2a1..f0d89d9 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -93,10 +93,6 @@ do { \
"memory"); \
} while (0)
-/*
- * disable hlt during certain critical i/o operations
- */
-#define HAVE_DISABLE_HLT
#else
/* frame pointer must be last for get_wchan */
@@ -392,9 +388,6 @@ static inline void clflush(volatile void *__p)
#define nop() asm volatile ("nop")
-void disable_hlt(void);
-void enable_hlt(void);
-
void cpu_idle_wait(void);
extern unsigned long arch_align_stack(unsigned long sp);
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0310da6..1d44903 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -1,6 +1,7 @@
#ifndef _ASM_X86_TRAPS_H
#define _ASM_X86_TRAPS_H
+#include <linux/kprobes.h>
#include <asm/debugreg.h>
#include <asm/siginfo.h> /* TRAP_TRACE, ... */
@@ -87,4 +88,29 @@ asmlinkage void smp_thermal_interrupt(void);
asmlinkage void mce_threshold_interrupt(void);
#endif
+/* Interrupts/Exceptions */
+enum {
+ X86_TRAP_DE = 0, /* 0, Divide-by-zero */
+ X86_TRAP_DB, /* 1, Debug */
+ X86_TRAP_NMI, /* 2, Non-maskable Interrupt */
+ X86_TRAP_BP, /* 3, Breakpoint */
+ X86_TRAP_OF, /* 4, Overflow */
+ X86_TRAP_BR, /* 5, Bound Range Exceeded */
+ X86_TRAP_UD, /* 6, Invalid Opcode */
+ X86_TRAP_NM, /* 7, Device Not Available */
+ X86_TRAP_DF, /* 8, Double Fault */
+ X86_TRAP_OLD_MF, /* 9, Coprocessor Segment Overrun */
+ X86_TRAP_TS, /* 10, Invalid TSS */
+ X86_TRAP_NP, /* 11, Segment Not Present */
+ X86_TRAP_SS, /* 12, Stack Segment Fault */
+ X86_TRAP_GP, /* 13, General Protection Fault */
+ X86_TRAP_PF, /* 14, Page Fault */
+ X86_TRAP_SPURIOUS, /* 15, Spurious Interrupt */
+ X86_TRAP_MF, /* 16, x87 Floating-Point Exception */
+ X86_TRAP_AC, /* 17, Alignment Check */
+ X86_TRAP_MC, /* 18, Machine Check */
+ X86_TRAP_XF, /* 19, SIMD Floating-Point Exception */
+ X86_TRAP_IRET = 32, /* 32, IRET Exception */
+};
+
#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4558f0d..479d03c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -416,12 +416,14 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
return 0;
}
- if (intsrc->source_irq == 0 && intsrc->global_irq == 2) {
+ if (intsrc->source_irq == 0) {
if (acpi_skip_timer_override) {
- printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
+ printk(PREFIX "BIOS IRQ0 override ignored.\n");
return 0;
}
- if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
+
+ if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity
+ && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
}
@@ -1327,17 +1329,12 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
}
/*
- * Force ignoring BIOS IRQ0 pin2 override
+ * Force ignoring BIOS IRQ0 override
*/
static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
{
- /*
- * The ati_ixp4x0_rev() early PCI quirk should have set
- * the acpi_skip_timer_override flag already:
- */
if (!acpi_skip_timer_override) {
- WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
- pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
+ pr_notice("%s detected: Ignoring BIOS IRQ0 override\n",
d->ident);
acpi_skip_timer_override = 1;
}
@@ -1431,7 +1428,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
* is enabled. This input is incorrectly designated the
* ISA IRQ 0 via an interrupt source override even though
* it is wired to the output of the master 8259A and INTIN0
- * is not connected at all. Force ignoring BIOS IRQ0 pin2
+ * is not connected at all. Force ignoring BIOS IRQ0
* override in that cases.
*/
{
@@ -1466,6 +1463,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
},
},
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "FUJITSU SIEMENS",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"),
+ },
+ },
{}
};
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a81f2d5..4c734e6 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -161,7 +161,7 @@ static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
#endif
#ifdef P6_NOP1
-static const unsigned char __initconst_or_module p6nops[] =
+static const unsigned char p6nops[] =
{
P6_NOP1,
P6_NOP2,
@@ -220,7 +220,7 @@ void __init arch_init_ideal_nops(void)
ideal_nops = intel_nops;
#endif
}
-
+ break;
default:
#ifdef CONFIG_X86_64
ideal_nops = k8_nops;
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index bae1efe..be16854 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -154,16 +154,14 @@ int amd_get_subcaches(int cpu)
{
struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
unsigned int mask;
- int cuid = 0;
+ int cuid;
if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
return 0;
pci_read_config_dword(link, 0x1d4, &mask);
-#ifdef CONFIG_SMP
cuid = cpu_data(cpu).compute_unit_id;
-#endif
return (mask >> (4 * cuid)) & 0xf;
}
@@ -172,7 +170,7 @@ int amd_set_subcaches(int cpu, int mask)
static unsigned int reset, ban;
struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
unsigned int reg;
- int cuid = 0;
+ int cuid;
if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
return -EINVAL;
@@ -190,9 +188,7 @@ int amd_set_subcaches(int cpu, int mask)
pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
}
-#ifdef CONFIG_SMP
cuid = cpu_data(cpu).compute_unit_id;
-#endif
mask <<= 4 * cuid;
mask |= (0xf ^ (1 << cuid)) << 26;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 6042981..0e3a82a 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -15,6 +15,7 @@ CFLAGS_common.o := $(nostackp)
obj-y := intel_cacheinfo.o scattered.o topology.o
obj-y += proc.o capflags.o powerflags.o common.o
obj-y += vmware.o hypervisor.o sched.o mshyperv.o
+obj-y += rdrand.o
obj-$(CONFIG_X86_32) += bugs.o
obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index b13ed39..a93741d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -146,7 +146,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
return;
@@ -190,7 +189,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
valid_k7:
;
-#endif
}
static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
@@ -556,6 +554,34 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
}
}
+ /*
+ * The way access filter has a performance penalty on some workloads.
+ * Disable it on the affected CPUs.
+ */
+ if ((c->x86 == 0x15) &&
+ (c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
+ u64 val;
+
+ if (!rdmsrl_safe(0xc0011021, &val) && !(val & 0x1E)) {
+ val |= 0x1E;
+ checking_wrmsrl(0xc0011021, val);
+ }
+ }
+
+ /*
+ * The way access filter has a performance penalty on some workloads.
+ * Disable it on the affected CPUs.
+ */
+ if ((c->x86 == 0x15) &&
+ (c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
+ u64 val;
+
+ if (!rdmsrl_safe(0xc0011021, &val) && !(val & 0x1E)) {
+ val |= 0x1E;
+ checking_wrmsrl(0xc0011021, val);
+ }
+ }
+
cpu_detect_cache_sizes(c);
/* Multi core CPU? */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 22a073d..1579ab9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -15,6 +15,7 @@
#include <asm/stackprotector.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
+#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
#include <asm/sections.h>
@@ -675,9 +676,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
-#ifdef CONFIG_SMP
c->cpu_index = 0;
-#endif
filter_cpuid_features(c, false);
setup_smep(c);
@@ -760,10 +759,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
c->apicid = c->initial_apicid;
# endif
#endif
-
-#ifdef CONFIG_X86_HT
c->phys_proc_id = c->initial_apicid;
-#endif
}
setup_smep(c);
@@ -857,6 +853,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
#endif
init_hypervisor(c);
+ x86_init_rdrand(c);
/*
* Clear/Set all flags overriden by options, need do it
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index ed6086e..e0dc000 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -179,7 +179,6 @@ static void __cpuinit trap_init_f00f_bug(void)
static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
return;
@@ -196,7 +195,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
"with B stepping processors.\n");
}
-#endif
}
static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 1e8d66c..362190b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -101,15 +101,19 @@ static struct severity {
};
/*
- * If the EIPV bit is set, it means the saved IP is the
- * instruction which caused the MCE.
+ * If mcgstatus indicated that ip/cs on the stack were
+ * no good, then "m->cs" will be zero and we will have
+ * to assume the worst case (IN_KERNEL) as we actually
+ * have no idea what we were executing when the machine
+ * check hit.
+ * If we do have a good "m->cs" (or a faked one in the
+ * case we were executing in VM86 mode) we can use it to
+ * distinguish an exception taken in user mode from one
+ * taken in the kernel.
*/
static int error_context(struct mce *m)
{
- if (m->mcgstatus & MCG_STATUS_EIPV)
- return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
- /* Unknown, assume kernel */
- return IN_KERNEL;
+ return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
}
int mce_severity(struct mce *a, int tolerant, char **msg)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ff1ae9b..1396edf 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -122,9 +122,7 @@ void mce_setup(struct mce *m)
m->time = get_seconds();
m->cpuvendor = boot_cpu_data.x86_vendor;
m->cpuid = cpuid_eax(1);
-#ifdef CONFIG_SMP
m->socketid = cpu_data(m->extcpu).phys_proc_id;
-#endif
m->apicid = cpu_data(m->extcpu).initial_apicid;
rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}
@@ -453,6 +451,13 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
m->ip = regs->ip;
m->cs = regs->cs;
+ /*
+ * When in VM86 mode make the cs look like ring 3
+ * always. This is a lie, but it's better than passing
+ * the additional vm86 bit around everywhere.
+ */
+ if (v8086_mode(regs))
+ m->cs |= 3;
} else {
m->ip = 0;
m->cs = 0;
@@ -990,6 +995,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
*/
add_taint(TAINT_MACHINE_CHECK);
+ mce_get_rip(&m, regs);
severity = mce_severity(&m, tolerant, NULL);
/*
@@ -1028,7 +1034,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
mce_ring_add(m.addr >> PAGE_SHIFT);
- mce_get_rip(&m, regs);
mce_log(&m);
if (severity > worst) {
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index bb0adad..b97aa72 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -52,6 +52,7 @@ struct threshold_block {
unsigned int cpu;
u32 address;
u16 interrupt_enable;
+ bool interrupt_capable;
u16 threshold_limit;
struct kobject kobj;
struct list_head miscj;
@@ -64,11 +65,9 @@ struct threshold_bank {
};
static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
-#ifdef CONFIG_SMP
static unsigned char shared_bank[NR_BANKS] = {
0, 0, 0, 0, 1
};
-#endif
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
@@ -86,6 +85,21 @@ struct thresh_restart {
u16 old_limit;
};
+static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
+{
+ /*
+ * bank 4 supports APIC LVT interrupts implicitly since forever.
+ */
+ if (bank == 4)
+ return true;
+
+ /*
+ * IntP: interrupt present; if this bit is set, the thresholding
+ * bank can generate APIC LVT interrupts
+ */
+ return msr_high_bits & BIT(28);
+}
+
static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
{
int msr = (hi & MASK_LVTOFF_HI) >> 20;
@@ -107,8 +121,10 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
return 1;
};
-/* must be called with correct cpu affinity */
-/* Called via smp_call_function_single() */
+/*
+ * Called via smp_call_function_single(), must be called with correct
+ * cpu affinity.
+ */
static void threshold_restart_bank(void *_tr)
{
struct thresh_restart *tr = _tr;
@@ -131,6 +147,12 @@ static void threshold_restart_bank(void *_tr)
(new_count & THRESHOLD_MAX);
}
+ /* clear IntType */
+ hi &= ~MASK_INT_TYPE_HI;
+
+ if (!tr->b->interrupt_capable)
+ goto done;
+
if (tr->set_lvt_off) {
if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
/* set new lvt offset */
@@ -139,9 +161,10 @@ static void threshold_restart_bank(void *_tr)
}
}
- tr->b->interrupt_enable ?
- (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
- (hi &= ~MASK_INT_TYPE_HI);
+ if (tr->b->interrupt_enable)
+ hi |= INT_TYPE_APIC;
+
+ done:
hi |= MASK_COUNT_EN_HI;
wrmsr(tr->b->address, lo, hi);
@@ -202,18 +225,21 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (!block)
per_cpu(bank_map, cpu) |= (1 << bank);
-#ifdef CONFIG_SMP
+
if (shared_bank[bank] && c->cpu_core_id)
break;
-#endif
- offset = setup_APIC_mce(offset,
- (high & MASK_LVTOFF_HI) >> 20);
memset(&b, 0, sizeof(b));
- b.cpu = cpu;
- b.bank = bank;
- b.block = block;
- b.address = address;
+ b.cpu = cpu;
+ b.bank = bank;
+ b.block = block;
+ b.address = address;
+ b.interrupt_capable = lvt_interrupt_supported(bank, high);
+
+ if (b.interrupt_capable) {
+ int new = (high & MASK_LVTOFF_HI) >> 20;
+ offset = setup_APIC_mce(offset, new);
+ }
mce_threshold_block_init(&b, offset);
mce_threshold_vector = amd_threshold_interrupt;
@@ -313,6 +339,9 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
struct thresh_restart tr;
unsigned long new;
+ if (!b->interrupt_capable)
+ return -EINVAL;
+
if (strict_strtoul(buf, 0, &new) < 0)
return -EINVAL;
@@ -471,6 +500,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
b->cpu = cpu;
b->address = address;
b->interrupt_enable = 0;
+ b->interrupt_capable = lvt_interrupt_supported(bank, high);
b->threshold_limit = THRESHOLD_MAX;
INIT_LIST_HEAD(&b->miscj);
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 27c6251..99cd9d2 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -322,17 +322,6 @@ device_initcall(thermal_throttle_init_device);
#endif /* CONFIG_SYSFS */
-/*
- * Set up the most two significant bit to notify mce log that this thermal
- * event type.
- * This is a temp solution. May be changed in the future with mce log
- * infrasture.
- */
-#define CORE_THROTTLED (0)
-#define CORE_POWER_LIMIT ((__u64)1 << 62)
-#define PACKAGE_THROTTLED ((__u64)2 << 62)
-#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
-
static void notify_thresholds(__u64 msr_val)
{
/* check whether the interrupt handler is defined;
@@ -362,27 +351,23 @@ static void intel_thermal_interrupt(void)
if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
CORE_LEVEL) != 0)
- mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
+ mce_log_therm_throt_event(msr_val);
if (this_cpu_has(X86_FEATURE_PLN))
- if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
+ therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
- CORE_LEVEL) != 0)
- mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
+ CORE_LEVEL);
if (this_cpu_has(X86_FEATURE_PTS)) {
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
- if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
+ therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
- PACKAGE_LEVEL) != 0)
- mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
+ PACKAGE_LEVEL);
if (this_cpu_has(X86_FEATURE_PLN))
- if (therm_throt_process(msr_val &
+ therm_throt_process(msr_val &
PACKAGE_THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
- PACKAGE_LEVEL) != 0)
- mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
- | msr_val);
+ PACKAGE_LEVEL);
}
}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index fe29c1d..4b50c96 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -437,6 +437,7 @@ static __initconst const struct x86_pmu amd_pmu = {
* 0x023 DE PERF_CTL[2:0]
* 0x02D LS PERF_CTL[3]
* 0x02E LS PERF_CTL[3,0]
+ * 0x031 LS PERF_CTL[2:0] (**)
* 0x043 CU PERF_CTL[2:0]
* 0x045 CU PERF_CTL[2:0]
* 0x046 CU PERF_CTL[2:0]
@@ -450,10 +451,12 @@ static __initconst const struct x86_pmu amd_pmu = {
* 0x0DD LS PERF_CTL[5:0]
* 0x0DE LS PERF_CTL[5:0]
* 0x0DF LS PERF_CTL[5:0]
+ * 0x1C0 EX PERF_CTL[5:3]
* 0x1D6 EX PERF_CTL[5:0]
* 0x1D8 EX PERF_CTL[5:0]
*
- * (*) depending on the umask all FPU counters may be used
+ * (*) depending on the umask all FPU counters may be used
+ * (**) only one unitmask enabled at a time
*/
static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
@@ -503,6 +506,12 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev
return &amd_f15_PMC3;
case 0x02E:
return &amd_f15_PMC30;
+ case 0x031:
+ if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+ return &amd_f15_PMC20;
+ return &emptyconstraint;
+ case 0x1C0:
+ return &amd_f15_PMC53;
default:
return &amd_f15_PMC50;
}
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 62ac8cb..72c365a 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
static int show_cpuinfo(struct seq_file *m, void *v)
{
struct cpuinfo_x86 *c = v;
- unsigned int cpu = 0;
+ unsigned int cpu;
int i;
-#ifdef CONFIG_SMP
cpu = c->cpu_index;
-#endif
seq_printf(m, "processor\t: %u\n"
"vendor_id\t: %s\n"
"cpu family\t: %d\n"
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
new file mode 100644
index 0000000..feca286
--- /dev/null
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -0,0 +1,73 @@
+/*
+ * This file is part of the Linux kernel.
+ *
+ * Copyright (c) 2011, Intel Corporation
+ * Authors: Fenghua Yu <fenghua.yu@intel.com>,
+ * H. Peter Anvin <hpa@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/archrandom.h>
+#include <asm/sections.h>
+
+static int __init x86_rdrand_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_RDRAND);
+ return 1;
+}
+__setup("nordrand", x86_rdrand_setup);
+
+/* We can't use arch_get_random_long() here since alternatives haven't run */
+static inline int rdrand_long(unsigned long *v)
+{
+ int ok;
+ asm volatile("1: " RDRAND_LONG "\n\t"
+ "jc 2f\n\t"
+ "decl %0\n\t"
+ "jnz 1b\n\t"
+ "2:"
+ : "=r" (ok), "=a" (*v)
+ : "0" (RDRAND_RETRY_LOOPS));
+ return ok;
+}
+
+/*
+ * Force a reseed cycle; we are architecturally guaranteed a reseed
+ * after no more than 512 128-bit chunks of random data. This also
+ * acts as a test of the CPU capability.
+ */
+#define RESEED_LOOP ((512*128)/sizeof(unsigned long))
+
+void __cpuinit x86_init_rdrand(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_ARCH_RANDOM
+ unsigned long tmp;
+ int i, count, ok;
+
+ if (!cpu_has(c, X86_FEATURE_RDRAND))
+ return; /* Nothing to do */
+
+ for (count = i = 0; i < RESEED_LOOP; i++) {
+ ok = rdrand_long(&tmp);
+ if (ok)
+ count++;
+ }
+
+ if (count != RESEED_LOOP)
+ clear_cpu_cap(c, X86_FEATURE_RDRAND);
+#endif
+}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index c7f64e6..ea6106c 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -31,7 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
const struct cpuid_bit *cb;
static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
- { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 },
+ { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 },
{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
{ X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index edb3d46..2df1252 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1029,7 +1029,7 @@ ENTRY(xen_sysenter_target)
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
- pushl_cfi $0
+ pushl_cfi $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
TRACE_IRQS_OFF
@@ -1071,14 +1071,15 @@ ENTRY(xen_failsafe_callback)
2: mov 8(%esp),%es
3: mov 12(%esp),%fs
4: mov 16(%esp),%gs
+ /* EAX == 0 => Category 1 (Bad segment)
+ EAX != 0 => Category 2 (Bad IRET) */
testl %eax,%eax
popl_cfi %eax
lea 16(%esp),%esp
CFI_ADJUST_CFA_OFFSET -16
jz 5f
- addl $16,%esp
- jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
-5: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment)
+ jmp iret_exc
+5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
jmp ret_from_exception
CFI_ENDPROC
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8a445a0..dd4dba4 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1308,7 +1308,7 @@ ENTRY(xen_failsafe_callback)
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
- pushq_cfi $0
+ pushq_cfi $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
jmp error_exit
CFI_ENDPROC
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index aa083d3..0aa649e 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -427,7 +427,7 @@ void hpet_msi_unmask(struct irq_data *data)
/* unmask it */
cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
- cfg |= HPET_TN_FSB;
+ cfg |= HPET_TN_ENABLE | HPET_TN_FSB;
hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
}
@@ -438,7 +438,7 @@ void hpet_msi_mask(struct irq_data *data)
/* mask it */
cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
- cfg &= ~HPET_TN_FSB;
+ cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB);
hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
}
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index b727450..53ab9ff 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -162,6 +162,7 @@ static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
#define F1XH_MPB_MAX_SIZE 2048
#define F14H_MPB_MAX_SIZE 1824
#define F15H_MPB_MAX_SIZE 4096
+#define F16H_MPB_MAX_SIZE 3458
switch (c->x86) {
case 0x14:
@@ -170,6 +171,9 @@ static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
case 0x15:
max_size = F15H_MPB_MAX_SIZE;
break;
+ case 0x16:
+ max_size = F16H_MPB_MAX_SIZE;
+ break;
default:
max_size = F1XH_MPB_MAX_SIZE;
break;
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index f924280..c4e2465 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -297,20 +297,31 @@ static ssize_t reload_store(struct sys_device *dev,
const char *buf, size_t size)
{
unsigned long val;
- int cpu = dev->id;
- int ret = 0;
- char *end;
+ int cpu;
+ ssize_t ret = 0, tmp_ret;
- val = simple_strtoul(buf, &end, 0);
- if (end == buf)
+ /* allow reload only from the BSP */
+ if (boot_cpu_data.cpu_index != dev->id)
return -EINVAL;
- if (val == 1) {
- get_online_cpus();
- if (cpu_online(cpu))
- ret = reload_for_cpu(cpu);
- put_online_cpus();
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val != 1)
+ return size;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ tmp_ret = reload_for_cpu(cpu);
+ if (tmp_ret != 0)
+ pr_warn("Error reloading microcode on CPU %d\n", cpu);
+
+ /* save retval of the first encountered reload error */
+ if (!ret)
+ ret = tmp_ret;
}
+ put_online_cpus();
if (!ret)
ret = size;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 12fcbe2..f7d1a64 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -175,6 +175,9 @@ static int msr_open(struct inode *inode, struct file *file)
unsigned int cpu;
struct cpuinfo_x86 *c;
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
cpu = iminor(file->f_path.dentry->d_inode);
if (cpu >= nr_cpu_ids || !cpu_online(cpu))
return -ENXIO; /* No such CPU */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e1ba8cb..4272502 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -341,34 +341,10 @@ void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
#endif
-#ifdef CONFIG_X86_32
-/*
- * This halt magic was a workaround for ancient floppy DMA
- * wreckage. It should be safe to remove.
- */
-static int hlt_counter;
-void disable_hlt(void)
-{
- hlt_counter++;
-}
-EXPORT_SYMBOL(disable_hlt);
-
-void enable_hlt(void)
-{
- hlt_counter--;
-}
-EXPORT_SYMBOL(enable_hlt);
-
-static inline int hlt_use_halt(void)
-{
- return (!hlt_counter && boot_cpu_data.hlt_works_ok);
-}
-#else
static inline int hlt_use_halt(void)
{
return 1;
}
-#endif
/*
* We use this if we don't have any better
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 807c2a2..911e16d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
#include <linux/signal.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
+#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -164,6 +165,35 @@ static inline bool invalid_selector(u16 value)
#define FLAG_MASK FLAG_MASK_32
+/*
+ * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
+ * when it traps. The previous stack will be directly underneath the saved
+ * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'.
+ *
+ * Now, if the stack is empty, '&regs->sp' is out of range. In this
+ * case we try to take the previous stack. To always return a non-null
+ * stack pointer we fall back to regs as stack if no previous stack
+ * exists.
+ *
+ * This is valid only for kernel mode traps.
+ */
+unsigned long kernel_stack_pointer(struct pt_regs *regs)
+{
+ unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1);
+ unsigned long sp = (unsigned long)&regs->sp;
+ struct thread_info *tinfo;
+
+ if (context == (sp & ~(THREAD_SIZE - 1)))
+ return sp;
+
+ tinfo = (struct thread_info *)context;
+ if (tinfo->previous_esp)
+ return tinfo->previous_esp;
+
+ return (unsigned long)regs;
+}
+EXPORT_SYMBOL_GPL(kernel_stack_pointer);
+
static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
{
BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
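The new kernel_stack_pointer() leans on one piece of arithmetic: because the kernel stack (with its thread_info at the base) is THREAD_SIZE-sized and THREAD_SIZE-aligned, masking any address inside it with ~(THREAD_SIZE - 1) yields the base, so comparing masked values tells whether two addresses live on the same stack. A standalone illustration, not part of the patch (the THREAD_SIZE value here is only an assumption for the demo):

#include <stdio.h>

#define THREAD_SIZE 8192UL	/* illustrative; the real value is config/arch dependent */

int main(void)
{
	unsigned long sp    = 0xc15fe9a8UL;		/* some address inside a stack */
	unsigned long base  = sp & ~(THREAD_SIZE - 1);	/* base of that stack/thread_info */
	unsigned long other = sp + 200;

	printf("sp=%#lx -> base=%#lx\n", sp, base);
	printf("same stack as sp+200? %s\n",
	       (base == (other & ~(THREAD_SIZE - 1))) ? "yes" : "no");
	return 0;
}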
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d4a705f..89d6877 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -452,6 +452,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
},
},
+ { /* Handle problems with rebooting on the Precision M6600. */
+ .callback = set_pci_reboot,
+ .ident = "Dell OptiPlex 990",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
+ },
+ },
{ }
};
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index afaf384..6c4e9ff 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -631,6 +631,83 @@ static __init void reserve_ibft_region(void)
static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
+static bool __init snb_gfx_workaround_needed(void)
+{
+#ifdef CONFIG_PCI
+ int i;
+ u16 vendor, devid;
+ static const u16 snb_ids[] = {
+ 0x0102,
+ 0x0112,
+ 0x0122,
+ 0x0106,
+ 0x0116,
+ 0x0126,
+ 0x010a,
+ };
+
+ /* Assume no if something weird is going on with PCI */
+ if (!early_pci_allowed())
+ return false;
+
+ vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID);
+ if (vendor != 0x8086)
+ return false;
+
+ devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
+ for (i = 0; i < ARRAY_SIZE(snb_ids); i++)
+ if (devid == snb_ids[i])
+ return true;
+#endif
+
+ return false;
+}
+
+/*
+ * Sandy Bridge graphics has trouble with certain ranges, exclude
+ * them from allocation.
+ */
+static void __init trim_snb_memory(void)
+{
+ static const unsigned long bad_pages[] = {
+ 0x20050000,
+ 0x20110000,
+ 0x20130000,
+ 0x20138000,
+ 0x40004000,
+ };
+ int i;
+
+ if (!snb_gfx_workaround_needed())
+ return;
+
+ printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");
+
+ /*
+ * Reserve all memory below the 1 MB mark that has not
+ * already been reserved.
+ */
+ memblock_reserve(0, 1<<20);
+
+ for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
+ if (memblock_reserve(bad_pages[i], PAGE_SIZE))
+ printk(KERN_WARNING "failed to reserve 0x%08lx\n",
+ bad_pages[i]);
+ }
+}
+
+/*
+ * Here we put platform-specific memory range workarounds, i.e.
+ * memory known to be corrupt or otherwise in need to be reserved on
+ * specific platforms.
+ *
+ * If this gets used more widely it could use a real dispatch mechanism.
+ */
+static void __init trim_platform_memory_ranges(void)
+{
+ trim_snb_memory();
+}
+
static void __init trim_bios_range(void)
{
/*
@@ -651,6 +728,7 @@ static void __init trim_bios_range(void)
* take them out.
*/
e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
+
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
}
@@ -929,6 +1007,8 @@ void __init setup_arch(char **cmdline_p)
setup_trampolines();
+ trim_platform_memory_ranges();
+
init_gbpages();
/* max_pfn_mapped is updated here */
@@ -937,8 +1017,21 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_X86_64
if (max_pfn > max_low_pfn) {
- max_pfn_mapped = init_memory_mapping(1UL<<32,
- max_pfn<<PAGE_SHIFT);
+ int i;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (ei->addr + ei->size <= 1UL << 32)
+ continue;
+
+ if (ei->type == E820_RESERVED)
+ continue;
+
+ max_pfn_mapped = init_memory_mapping(
+ ei->addr < 1UL << 32 ? 1UL << 32 : ei->addr,
+ ei->addr + ei->size);
+ }
+
/* can we preseve max_low_pfn ?*/
max_low_pfn = max_pfn;
}
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 71f4727..5a98aa2 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -185,10 +185,22 @@ void __init setup_per_cpu_areas(void)
#endif
rc = -EINVAL;
if (pcpu_chosen_fc != PCPU_FC_PAGE) {
- const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
const size_t dyn_size = PERCPU_MODULE_RESERVE +
PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+ size_t atom_size;
+ /*
+ * On 64bit, use PMD_SIZE for atom_size so that embedded
+ * percpu areas are aligned to PMD. This, in the future,
+ * can also allow using PMD mappings in vmalloc area. Use
+ * PAGE_SIZE on 32bit as vmalloc space is highly contended
+ * and large vmalloc area allocs can easily fail.
+ */
+#ifdef CONFIG_X86_64
+ atom_size = PMD_SIZE;
+#else
+ atom_size = PAGE_SIZE;
+#endif
rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
dyn_size, atom_size,
pcpu_cpu_distance,
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index f581a18..df7d12c 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -56,9 +56,16 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
}
/*
- * search for a shareable pmd page for hugetlb.
+ * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
+ * and returns the corresponding pte. While this is not necessary for the
+ * !shared pmd case because we can allocate the pmd later as well, it makes the
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
*/
-static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+static pte_t *
+huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
struct vm_area_struct *vma = find_vma(mm, addr);
struct address_space *mapping = vma->vm_file->f_mapping;
@@ -68,9 +75,10 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
struct vm_area_struct *svma;
unsigned long saddr;
pte_t *spte = NULL;
+ pte_t *pte;
if (!vma_shareable(vma, addr))
- return;
+ return (pte_t *)pmd_alloc(mm, pud, addr);
mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
@@ -97,7 +105,9 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
put_page(virt_to_page(spte));
spin_unlock(&mm->page_table_lock);
out:
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
mutex_unlock(&mapping->i_mmap_mutex);
+ return pte;
}
/*
@@ -142,8 +152,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
} else {
BUG_ON(sz != PMD_SIZE);
if (pud_none(*pud))
- huge_pmd_share(mm, addr, pud);
- pte = (pte_t *) pmd_alloc(mm, pud, addr);
+ pte = huge_pmd_share(mm, addr, pud);
+ else
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
}
}
BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 87488b9..c22c423 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -28,36 +28,50 @@ int direct_gbpages
#endif
;
-static void __init find_early_table_space(unsigned long end, int use_pse,
- int use_gbpages)
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
+
+/*
+ * First calculate space needed for kernel direct mapping page tables to cover
+ * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB
+ * pages. Then find enough contiguous space for those page tables.
+ */
+static void __init find_early_table_space(struct map_range *mr, int nr_range)
{
- unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
+ int i;
+ unsigned long puds = 0, pmds = 0, ptes = 0, tables;
+ unsigned long start = 0, good_end;
phys_addr_t base;
- puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+ for (i = 0; i < nr_range; i++) {
+ unsigned long range, extra;
- if (use_gbpages) {
- unsigned long extra;
+ range = mr[i].end - mr[i].start;
+ puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
- extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
- pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
- } else
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-
- tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+ if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
+ extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
+ pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+ } else {
+ pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
+ }
- if (use_pse) {
- unsigned long extra;
-
- extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+ if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
+ extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
#ifdef CONFIG_X86_32
- extra += PMD_SIZE;
+ extra += PMD_SIZE;
#endif
- ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
- } else
- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ } else {
+ ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ }
+ }
+ tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+ tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
#ifdef CONFIG_X86_32
@@ -74,8 +88,9 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
pgt_buf_end = pgt_buf_start;
pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
+ printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n",
+ mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT,
+ (pgt_buf_top << PAGE_SHIFT) - 1);
}
void __init native_pagetable_reserve(u64 start, u64 end)
@@ -83,12 +98,6 @@ void __init native_pagetable_reserve(u64 start, u64 end)
memblock_x86_reserve_range(start, end, "PGTABLE");
}
-struct map_range {
- unsigned long start;
- unsigned long end;
- unsigned page_size_mask;
-};
-
#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
@@ -260,7 +269,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
* nodes are discovered.
*/
if (!after_bootmem)
- find_early_table_space(end, use_pse, use_gbpages);
+ find_early_table_space(mr, nr_range);
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
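The reworked find_early_table_space() is per-range rounding arithmetic. As a sanity check, here is the same calculation done standalone for one 4 GiB range mapped with 2 MiB pages, for illustration only (shift values assume the usual x86-64 4 KiB/2 MiB/1 GiB sizes):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PUD_SHIFT  30
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)
#define PMD_SIZE   (1ULL << PMD_SHIFT)
#define PUD_SIZE   (1ULL << PUD_SHIFT)

int main(void)
{
	unsigned long long start = 0, end = 4ULL << 30;		/* one 4 GiB map_range */
	unsigned long long range = end - start;

	unsigned long long puds = (range + PUD_SIZE - 1) >> PUD_SHIFT;
	unsigned long long pmds = (range + PMD_SIZE - 1) >> PMD_SHIFT;
	/* with PG_LEVEL_2M set, only the non-PMD-aligned tail needs ptes */
	unsigned long long extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
	unsigned long long ptes  = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;

	printf("puds=%llu pmds=%llu ptes=%llu\n", puds, pmds, ptes);	/* 4 2048 0 */
	return 0;
}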
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 68894fd..a00c588 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -55,7 +55,7 @@ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
val |= counter_config->extra;
event &= model->event_mask ? model->event_mask : 0xFF;
val |= event & 0xFF;
- val |= (event & 0x0F00) << 24;
+ val |= (u64)(event & 0x0F00) << 24;
return val;
}
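The one-character nmi_int.c change is the classic 32-bit-shift truncation: "event & 0x0F00" has type unsigned int, so without the cast the shift by 24 happens in 32 bits and the bits of interest are gone before the result is widened to u64. A small standalone demonstration, for illustration only:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int event = 0x0F00;				/* upper event-select bits */
	uint64_t without_cast = (event & 0x0F00) << 24;		/* 32-bit shift: bits lost */
	uint64_t with_cast    = (uint64_t)(event & 0x0F00) << 24;

	printf("without cast: %#llx\n", (unsigned long long)without_cast);	/* 0 */
	printf("with cast   : %#llx\n", (unsigned long long)with_cast);	/* 0xf00000000 */
	return 0;
}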
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 6dd8955..0951b81 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -521,3 +521,20 @@ static void sb600_disable_hpet_bar(struct pci_dev *dev)
}
}
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar);
+
+/*
+ * Twinhead H12Y needs us to block out a region otherwise we map devices
+ * there and any access kills the box.
+ *
+ * See: https://bugzilla.kernel.org/show_bug.cgi?id=10231
+ *
+ * Match off the LPC and svid/sdid (older kernels lose the bridge subvendor)
+ */
+static void __devinit twinhead_reserve_killing_zone(struct pci_dev *dev)
+{
+ if (dev->subsystem_vendor == 0x14FF && dev->subsystem_device == 0xA003) {
+ pr_info("Reserving memory on Twinhead H12Y\n");
+ request_mem_region(0xFFB00000, 0x100000, "twinhead");
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index ac3aa54..0fba86d 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -38,7 +38,7 @@
#include <asm/cacheflush.h>
#include <asm/fixmap.h>
-static pgd_t save_pgd __initdata;
+static pgd_t *save_pgd __initdata;
static unsigned long efi_flags __initdata;
static void __init early_code_mapping_set_exec(int executable)
@@ -61,12 +61,20 @@ static void __init early_code_mapping_set_exec(int executable)
void __init efi_call_phys_prelog(void)
{
unsigned long vaddress;
+ int pgd;
+ int n_pgds;
early_code_mapping_set_exec(1);
local_irq_save(efi_flags);
- vaddress = (unsigned long)__va(0x0UL);
- save_pgd = *pgd_offset_k(0x0UL);
- set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress));
+
+ n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
+ save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL);
+
+ for (pgd = 0; pgd < n_pgds; pgd++) {
+ save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE);
+ vaddress = (unsigned long)__va(pgd * PGDIR_SIZE);
+ set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
+ }
__flush_tlb_all();
}
@@ -75,7 +83,11 @@ void __init efi_call_phys_epilog(void)
/*
* After the lock is released, the original page table is restored.
*/
- set_pgd(pgd_offset_k(0x0UL), save_pgd);
+ int pgd;
+ int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
+ for (pgd = 0; pgd < n_pgds; pgd++)
+ set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]);
+ kfree(save_pgd);
__flush_tlb_all();
local_irq_restore(efi_flags);
early_code_mapping_set_exec(0);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0fb662a..9f808af 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -62,6 +62,7 @@
#include <asm/reboot.h>
#include <asm/stackprotector.h>
#include <asm/hypervisor.h>
+#include <asm/pci_x86.h>
#include "xen-ops.h"
#include "mmu.h"
@@ -197,6 +198,9 @@ static void __init xen_banner(void)
xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
}
+#define CPUID_THERM_POWER_LEAF 6
+#define APERFMPERF_PRESENT 0
+
static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
@@ -217,6 +221,11 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
maskedx = cpuid_leaf1_edx_mask;
break;
+ case CPUID_THERM_POWER_LEAF:
+ /* Disabling APERFMPERF for kernel usage */
+ maskecx = ~(1 << APERFMPERF_PRESENT);
+ break;
+
case 0xb:
/* Suppress extended topology stuff */
maskebx = 0;
@@ -794,7 +803,16 @@ static void xen_write_cr4(unsigned long cr4)
native_write_cr4(cr4);
}
-
+#ifdef CONFIG_X86_64
+static inline unsigned long xen_read_cr8(void)
+{
+ return 0;
+}
+static inline void xen_write_cr8(unsigned long val)
+{
+ BUG_ON(val);
+}
+#endif
static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
int ret;
@@ -959,6 +977,11 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_cr4_safe = native_read_cr4_safe,
.write_cr4 = xen_write_cr4,
+#ifdef CONFIG_X86_64
+ .read_cr8 = xen_read_cr8,
+ .write_cr8 = xen_write_cr8,
+#endif
+
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
@@ -966,6 +989,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
+ .read_tscp = native_read_tscp,
+
.iret = xen_iret,
.irq_enable_sysexit = xen_sysexit,
#ifdef CONFIG_X86_64
@@ -1259,8 +1284,10 @@ asmlinkage void __init xen_start_kernel(void)
/* Make sure ACS will be enabled */
pci_request_acs();
}
-
-
+#ifdef CONFIG_PCI
+ /* PCI BIOS service won't work from a PV guest. */
+ pci_probe &= ~PCI_PROBE_BIOS;
+#endif
xen_raw_console_write("about to get started...\n");
xen_setup_runstate_info(0);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5f76c0a..d957dce 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -320,8 +320,13 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
{
if (val & _PAGE_PRESENT) {
unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
+ unsigned long pfn = mfn_to_pfn(mfn);
+
pteval_t flags = val & PTE_FLAGS_MASK;
- val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
+ if (unlikely(pfn == ~0))
+ val = flags & ~_PAGE_PRESENT;
+ else
+ val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
}
return val;
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 58efeb9..2f7847d 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -683,6 +683,7 @@ int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte)
unsigned long uninitialized_var(address);
unsigned level;
pte_t *ptep = NULL;
+ int ret = 0;
pfn = page_to_pfn(page);
if (!PageHighMem(page)) {
@@ -706,6 +707,24 @@ int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte)
list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
spin_unlock_irqrestore(&m2p_override_lock, flags);
+ /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in
+ * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other
+ * pfn so that the following mfn_to_pfn(mfn) calls will return the
+ * pfn from the m2p_override (the backend pfn) instead.
+ * We need to do this because the pages shared by the frontend
+ * (xen-blkfront) can be already locked (lock_page, called by
+ * do_read_cache_page); when the userspace backend tries to use them
+ * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so
+ * do_blockdev_direct_IO is going to try to lock the same pages
+ * again resulting in a deadlock.
+ * As a side effect get_user_pages_fast might not be safe on the
+ * frontend pages while they are being shared with the backend,
+ * because mfn_to_pfn (that ends up being called by GUPF) will
+ * return the backend pfn rather than the frontend pfn. */
+ ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
+ if (ret == 0 && get_phys_to_machine(pfn) == mfn)
+ set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
+
return 0;
}
EXPORT_SYMBOL_GPL(m2p_add_override);
@@ -717,6 +736,7 @@ int m2p_remove_override(struct page *page, bool clear_pte)
unsigned long uninitialized_var(address);
unsigned level;
pte_t *ptep = NULL;
+ int ret = 0;
pfn = page_to_pfn(page);
mfn = get_phys_to_machine(pfn);
@@ -743,6 +763,22 @@ int m2p_remove_override(struct page *page, bool clear_pte)
/* No tlb flush necessary because the caller already
* left the pte unmapped. */
+ /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present
+ * somewhere in this domain, even before being added to the
+ * m2p_override (see comment above in m2p_add_override).
+ * If there are no other entries in the m2p_override corresponding
+ * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for
+ * the original pfn (the one shared by the frontend): the backend
+ * cannot do any IO on this page anymore because it has been
+ * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of
+ * the original pfn causes mfn_to_pfn(mfn) to return the frontend
+ * pfn again. */
+ mfn &= ~FOREIGN_FRAME_BIT;
+ ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
+ if (ret == 0 && get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) &&
+ m2p_find_override(mfn) == NULL)
+ set_phys_to_machine(pfn, mfn);
+
return 0;
}
EXPORT_SYMBOL_GPL(m2p_remove_override);
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index f8dcda4..5669564 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -15,6 +15,7 @@
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
+#include <asm/numa.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
@@ -463,4 +464,7 @@ void __init xen_arch_setup(void)
boot_option_idle_override = IDLE_HALT;
fiddle_vdso();
+#ifdef CONFIG_NUMA
+ numa_off = 1;
+#endif
}