aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorAndrea Arcangeli <aarcange@redhat.com>2011-01-13 15:47:10 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2011-01-13 17:32:46 -0800
commit8ee53820edfd1f3b6554c593f337148dd3d7fc91 (patch)
treeca69957e928cd3efa1b47f92dcfb00591702684c /arch/x86
parent4b7167b9ff9b7f3f528cbc4c7d02ebd275b9b10c (diff)
downloadkernel_samsung_smdk4412-8ee53820edfd1f3b6554c593f337148dd3d7fc91.zip
kernel_samsung_smdk4412-8ee53820edfd1f3b6554c593f337148dd3d7fc91.tar.gz
kernel_samsung_smdk4412-8ee53820edfd1f3b6554c593f337148dd3d7fc91.tar.bz2
thp: mmu_notifier_test_young
For GRU and EPT, we need gup-fast to set referenced bit too (this is why it's correct to return 0 when shadow_access_mask is zero, it requires gup-fast to set the referenced bit). qemu-kvm access already sets the young bit in the pte if it isn't zero-copy, if it's zero copy or a shadow paging EPT minor fault we relay on gup-fast to signal the page is in use... We also need to check the young bits on the secondary pagetables for NPT and not nested shadow mmu as the data may never get accessed again by the primary pte. Without this closer accuracy, we'd have to remove the heuristic that avoids collapsing hugepages in hugepage virtual regions that have not even a single subpage in use. ->test_young is full backwards compatible with GRU and other usages that don't have young bits in pagetables set by the hardware and that should nuke the secondary mmu mappings when ->clear_flush_young runs just like EPT does. Removing the heuristic that checks the young bit in khugepaged/collapse_huge_page completely isn't so bad either probably but I thought it was worth it and this makes it reliable. Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/include/asm/kvm_host.h1
-rw-r--r--arch/x86/kvm/mmu.c34
-rw-r--r--arch/x86/mm/gup.c3
3 files changed, 38 insertions, 0 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aa75f21..ffd7f8d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -822,6 +822,7 @@ extern bool kvm_rebooting;
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 47b2c32..f02b8ed 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -945,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
return young;
}
+static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
+{
+ u64 *spte;
+ int young = 0;
+
+ /*
+ * If there's no access bit in the secondary pte set by the
+ * hardware it's up to gup-fast/gup to set the access bit in
+ * the primary pte or in the page structure.
+ */
+ if (!shadow_accessed_mask)
+ goto out;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ u64 _spte = *spte;
+ BUG_ON(!(_spte & PT_PRESENT_MASK));
+ young = _spte & PT_ACCESSED_MASK;
+ if (young) {
+ young = 1;
+ break;
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+out:
+ return young;
+}
+
#define RMAP_RECYCLE_THRESHOLD 1000
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -965,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
}
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
+}
+
#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 269aa53..dbe34b9 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>
+#include <linux/swap.h>
#include <asm/pgtable.h>
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
get_page(page);
+ SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -103,6 +105,7 @@ static inline void get_head_page_multiple(struct page *page, int nr)
VM_BUG_ON(page != compound_head(page));
VM_BUG_ON(page_count(page) == 0);
atomic_add(nr, &page->_count);
+ SetPageReferenced(page);
}
static inline void get_huge_page_tail(struct page *page)