From 6aab341e0a28aff100a09831c5300a2994b8b986 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 28 Nov 2005 14:34:23 -0800 Subject: mm: re-architect the VM_UNPAGED logic This replaces the (in my opinion horrible) VM_UNMAPPED logic with very explicit support for a "remapped page range" aka VM_PFNMAP. It allows a VM area to contain an arbitrary range of page table entries that the VM never touches, and never considers to be normal pages. Any user of "remap_pfn_range()" automatically gets this new functionality, and doesn't even have to mark the pages reserved or indeed mark them any other way. It just works. As a side effect, doing mmap() on /dev/mem works for arbitrary ranges. Sparc update from David in the next commit. Signed-off-by: Linus Torvalds --- mm/fremap.c | 22 +++---- mm/madvise.c | 2 +- mm/memory.c | 189 ++++++++++++++++++++++++++++++--------------------------- mm/mempolicy.c | 12 ++-- mm/msync.c | 12 +--- mm/nommu.c | 2 +- mm/rmap.c | 14 +---- 7 files changed, 118 insertions(+), 135 deletions(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 007cbad..f851775 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -27,24 +27,20 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; if (pte_present(pte)) { - unsigned long pfn = pte_pfn(pte); - flush_cache_page(vma, addr, pfn); + flush_cache_page(vma, addr, pte_pfn(pte)); pte = ptep_clear_flush(vma, addr, ptep); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); - goto out; + page = vm_normal_page(vma, addr, pte); + if (page) { + if (pte_dirty(pte)) + set_page_dirty(page); + page_remove_rmap(page); + page_cache_release(page); } - page = pfn_to_page(pfn); - if (pte_dirty(pte)) - set_page_dirty(page); - page_remove_rmap(page); - page_cache_release(page); } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear(mm, addr, ptep); } -out: return !!page; } @@ -65,8 +61,6 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_t pte_val; spinlock_t *ptl; - BUG_ON(vma->vm_flags & VM_UNPAGED); - pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); if (!pud) @@ -122,8 +116,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, pte_t pte_val; spinlock_t *ptl; - BUG_ON(vma->vm_flags & VM_UNPAGED); - pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); if (!pud) diff --git a/mm/madvise.c b/mm/madvise.c index 328a3bc..2b7cf04 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { *prev = vma; - if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_UNPAGED)) + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { diff --git a/mm/memory.c b/mm/memory.c index d1f46f4..b57fbc6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) } /* - * This function is called to print an error when a pte in a - * !VM_UNPAGED region is found pointing to an invalid pfn (which - * is an error. + * This function is called to print an error when a bad pte + * is found. For example, we might have a PFN-mapped pte in + * a region that doesn't allow it. * * The calling function must still handle the error. 
*/ @@ -350,19 +350,56 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) } /* - * page_is_anon applies strict checks for an anonymous page belonging to - * this vma at this address. It is used on VM_UNPAGED vmas, which are - * usually populated with shared originals (which must not be counted), - * but occasionally contain private COWed copies (when !VM_SHARED, or - * perhaps via ptrace when VM_SHARED). An mmap of /dev/mem might window - * free pages, pages from other processes, or from other parts of this: - * it's tricky, but try not to be deceived by foreign anonymous pages. + * This function gets the "struct page" associated with a pte. + * + * NOTE! Some mappings do not have "struct pages". A raw PFN mapping + * will have each page table entry just pointing to a raw page frame + * number, and as far as the VM layer is concerned, those do not have + * pages associated with them - even if the PFN might point to memory + * that otherwise is perfectly fine and has a "struct page". + * + * The way we recognize those mappings is through the rules set up + * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, + * and the vm_pgoff will point to the first PFN mapped: thus every + * page that is a raw mapping will always honor the rule + * + * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) + * + * and if that isn't true, the page has been COW'ed (in which case it + * _does_ have a "struct page" associated with it even if it is in a + * VM_PFNMAP range). */ -static inline int page_is_anon(struct page *page, - struct vm_area_struct *vma, unsigned long addr) +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - return page && PageAnon(page) && page_mapped(page) && - page_address_in_vma(page, vma) == addr; + unsigned long pfn = pte_pfn(pte); + + if (vma->vm_flags & VM_PFNMAP) { + unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + } + + /* + * Add some anal sanity checks for now. Eventually, + * we should just do "return pfn_to_page(pfn)", but + * in the meantime we check that we get a valid pfn, + * and that the resulting page looks ok. + * + * Remove this test eventually! + */ + if (unlikely(!pfn_valid(pfn))) { + print_bad_pte(vma, pte, addr); + return NULL; + } + + /* + * NOTE! We still have PageReserved() pages in the page + * tables. + * + * The PAGE_ZERO() pages and various VDSO mappings can + * cause them to exist. + */ + return pfn_to_page(pfn); } /* @@ -379,7 +416,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; - unsigned long pfn; /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { @@ -397,22 +433,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_set_pte; } - pfn = pte_pfn(pte); - page = pfn_valid(pfn)? pfn_to_page(pfn): NULL; - - if (unlikely(vm_flags & VM_UNPAGED)) - if (!page_is_anon(page, vma, addr)) - goto out_set_pte; - - /* - * If the pte points outside of valid memory but - * the region is not VM_UNPAGED, we have a problem. 
- */ - if (unlikely(!page)) { - print_bad_pte(vma, pte, addr); - goto out_set_pte; /* try to do something sane */ - } - /* * If it's a COW mapping, write protect it both * in the parent and the child @@ -429,9 +449,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); - get_page(page); - page_dup_rmap(page); - rss[!!PageAnon(page)]++; + + page = vm_normal_page(vma, addr, pte); + if (page) { + get_page(page); + page_dup_rmap(page); + rss[!!PageAnon(page)]++; + } out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); @@ -543,7 +567,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_UNPAGED))) { + if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) { if (!vma->anon_vma) return 0; } @@ -584,19 +608,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, } if (pte_present(ptent)) { struct page *page; - unsigned long pfn; (*zap_work) -= PAGE_SIZE; - pfn = pte_pfn(ptent); - page = pfn_valid(pfn)? pfn_to_page(pfn): NULL; - - if (unlikely(vma->vm_flags & VM_UNPAGED)) { - if (!page_is_anon(page, vma, addr)) - page = NULL; - } else if (unlikely(!page)) - print_bad_pte(vma, ptent, addr); - + page = vm_normal_page(vma, addr, ptent); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -852,7 +867,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, /* * Do a quick page-table lookup for a single page. */ -struct page *follow_page(struct mm_struct *mm, unsigned long address, +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { pgd_t *pgd; @@ -860,8 +875,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address, pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; - unsigned long pfn; struct page *page; + struct mm_struct *mm = vma->vm_mm; page = follow_huge_addr(mm, address, flags & FOLL_WRITE); if (!IS_ERR(page)) { @@ -897,11 +912,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address, goto unlock; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; - pfn = pte_pfn(pte); - if (!pfn_valid(pfn)) + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) goto unlock; - page = pfn_to_page(pfn); if (flags & FOLL_GET) get_page(page); if (flags & FOLL_TOUCH) { @@ -974,8 +988,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? : -EFAULT; } if (pages) { - pages[i] = pte_page(*pte); - get_page(pages[i]); + struct page *page = vm_normal_page(vma, start, *pte); + pages[i] = page; + if (page) + get_page(page); } pte_unmap(pte); if (vmas) @@ -1010,7 +1026,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, foll_flags |= FOLL_WRITE; cond_resched(); - while (!(page = follow_page(mm, start, foll_flags))) { + while (!(page = follow_page(vma, start, foll_flags))) { int ret; ret = __handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); @@ -1214,11 +1230,12 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * in 2.6 the LRU scan won't even find its pages, so this * flag means no more than count its pages in reserved_vm, * and omit it from core dump, even when VM_IO turned off. - * VM_UNPAGED tells the core MM not to "manage" these pages - * (e.g. 
refcount, mapcount, try to swap them out): in - * particular, zap_pte_range does not try to free them. + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. */ - vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED; + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_pgoff = pfn; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -1273,6 +1290,26 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va) +{ + /* + * If the source page was a PFN mapping, we don't have + * a "struct page" for it. We do a best-effort copy by + * just copying from the original user address. If that + * fails, we just zero-fill it. Live with it. + */ + if (unlikely(!src)) { + void *kaddr = kmap_atomic(dst, KM_USER0); + unsigned long left = __copy_from_user_inatomic(kaddr, (void __user *)va, PAGE_SIZE); + if (left) + memset(kaddr, 0, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + return; + + } + copy_user_highpage(dst, src, va); +} + /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address @@ -1296,28 +1333,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl, pte_t orig_pte) { struct page *old_page, *src_page, *new_page; - unsigned long pfn = pte_pfn(orig_pte); pte_t entry; int ret = VM_FAULT_MINOR; - if (unlikely(!pfn_valid(pfn))) { - /* - * Page table corrupted: show pte and kill process. - * Or it's an attempt to COW an out-of-map VM_UNPAGED - * entry, which copy_user_highpage does not support. - */ - print_bad_pte(vma, orig_pte, address); - ret = VM_FAULT_OOM; - goto unlock; - } - old_page = pfn_to_page(pfn); + old_page = vm_normal_page(vma, address, orig_pte); src_page = old_page; - - if (unlikely(vma->vm_flags & VM_UNPAGED)) - if (!page_is_anon(old_page, vma, address)) { - old_page = NULL; - goto gotten; - } + if (!old_page) + goto gotten; if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { int reuse = can_share_swap_page(old_page); @@ -1351,7 +1373,7 @@ gotten: new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!new_page) goto oom; - copy_user_highpage(new_page, src_page, address); + cow_user_page(new_page, src_page, address); } /* @@ -1812,16 +1834,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; - /* - * A VM_UNPAGED vma will normally be filled with present ptes - * by remap_pfn_range, and never arrive here; but it might have - * holes, or if !VM_DONTEXPAND, mremap might have expanded it. - * It's weird enough handling anon pages in unpaged vmas, we do - * not want to worry about ZERO_PAGEs too (it may or may not - * matter if their counts wrap): just give them anon pages. - */ - - if (write_access || (vma->vm_flags & VM_UNPAGED)) { + if (write_access) { /* Allocate our own private page. 
*/ pte_unmap(page_table); @@ -1896,8 +1909,6 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, int anon = 0; pte_unmap(page_table); - BUG_ON(vma->vm_flags & VM_UNPAGED); - if (vma->vm_file) { mapping = vma->vm_file->f_mapping; sequence = mapping->truncate_count; @@ -1930,7 +1941,7 @@ retry: page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!page) goto oom; - copy_user_highpage(page, new_page, address); + cow_user_page(page, new_page, address); page_cache_release(new_page); new_page = page; anon = 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5609a31..bec88c8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -189,17 +189,15 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { - unsigned long pfn; + struct page *page; unsigned int nid; if (!pte_present(*pte)) continue; - pfn = pte_pfn(*pte); - if (!pfn_valid(pfn)) { - print_bad_pte(vma, *pte, addr); + page = vm_normal_page(vma, addr, *pte); + if (!page) continue; - } - nid = pfn_to_nid(pfn); + nid = page_to_nid(page); if (!node_isset(nid, *nodes)) break; } while (pte++, addr += PAGE_SIZE, addr != end); @@ -269,8 +267,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); - if (first->vm_flags & VM_UNPAGED) - return ERR_PTR(-EACCES); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { if (!vma->vm_next && vma->vm_end < end) diff --git a/mm/msync.c b/mm/msync.c index b3f4caf..1b5b6f6 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -27,7 +27,6 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, again: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { - unsigned long pfn; struct page *page; if (progress >= 64) { @@ -40,13 +39,9 @@ again: continue; if (!pte_maybe_dirty(*pte)) continue; - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, *pte, addr); + page = vm_normal_page(vma, addr, *pte); + if (!page) continue; - } - page = pfn_to_page(pfn); - if (ptep_clear_flush_dirty(vma, addr, pte) || page_test_and_clear_dirty(page)) set_page_dirty(page); @@ -97,9 +92,8 @@ static void msync_page_range(struct vm_area_struct *vma, /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need * to do anything more on an msync(). - * Can't do anything with VM_UNPAGED regions either. */ - if (vma->vm_flags & (VM_HUGETLB|VM_UNPAGED)) + if (vma->vm_flags & VM_HUGETLB) return; BUG_ON(addr >= end); diff --git a/mm/nommu.c b/mm/nommu.c index 6deb6ab..c119681 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1045,7 +1045,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); -struct page *follow_page(struct mm_struct *mm, unsigned long address, +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags) { return NULL; diff --git a/mm/rmap.c b/mm/rmap.c index 2e034a0..6389cda 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -226,8 +226,6 @@ vma_address(struct page *page, struct vm_area_struct *vma) /* * At what user virtual address is page expected in vma? checking that the * page matches the vma: currently only used on anon pages, by unuse_vma; - * and by extraordinary checks on anon pages in VM_UNPAGED vmas, taking - * care that an mmap of /dev/mem might window free and foreign pages. 
*/ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { @@ -614,7 +612,6 @@ static void try_to_unmap_cluster(unsigned long cursor, struct page *page; unsigned long address; unsigned long end; - unsigned long pfn; address = (vma->vm_start + cursor) & CLUSTER_MASK; end = address + CLUSTER_SIZE; @@ -643,15 +640,8 @@ static void try_to_unmap_cluster(unsigned long cursor, for (; address < end; pte++, address += PAGE_SIZE) { if (!pte_present(*pte)) continue; - - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, *pte, address); - continue; - } - - page = pfn_to_page(pfn); - BUG_ON(PageAnon(page)); + page = vm_normal_page(vma, address, *pte); + BUG_ON(!page || PageAnon(page)); if (ptep_clear_flush_young(vma, address, pte)) continue; -- cgit v1.1 From e0f39591cc178026607fcbbe9a53be435fe8285d Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 28 Nov 2005 13:43:44 -0800 Subject: [PATCH] Workaround for gcc 2.96 (undefined references) LD .tmp_vmlinux1 mm/built-in.o(.text+0x100d6): In function `copy_page_range': : undefined reference to `__pud_alloc' mm/built-in.o(.text+0x1010b): In function `copy_page_range': : undefined reference to `__pmd_alloc' mm/built-in.o(.text+0x11ef4): In function `__handle_mm_fault': : undefined reference to `__pud_alloc' fs/built-in.o(.text+0xc930): In function `install_arg_page': : undefined reference to `__pud_alloc' make: *** [.tmp_vmlinux1] Error 1 Those missing references in mm/memory.c arise from this code in include/linux/mm.h, combined with the fact that __PGTABLE_PMD_FOLDED and __PGTABLE_PUD_FOLDED are both set and __ARCH_HAS_4LEVEL_HACK is not: /* * The following ifdef needed to get the 4level-fixup.h header to work. * Remove it when 4level-fixup.h has been removed. */ #if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK) static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))? NULL: pud_offset(pgd, address); } static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? NULL: pmd_offset(pud, address); } #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ With my configuration the pgd_none and pud_none routines are inlines returning a constant 0. Apparently the old compiler avoids generating calls to __pud_alloc and __pmd_alloc but still lists them as undefined references in the module's symbol table. I don't know which change caused this problem. I think it was added somewhere between 2.6.14 and 2.6.15-rc1, because I remember building several 2.6.14-rc kernels without difficulty. However I can't point to an individual culprit. 
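The same pattern can be seen in a stand-alone sketch (helper() and folded_none() are invented names standing in for __pud_alloc() and the folded pgd_none()/pud_none()):

extern int helper(int);			/* hypothetical; plays the role of __pud_alloc() */

static inline int folded_none(void)	/* like pgd_none()/pud_none() on a folded level */
{
	return 0;
}

static inline int maybe_alloc(int x)
{
	/*
	 * Same shape as the pud_alloc()/pmd_alloc() inlines quoted above:
	 * the right-hand side of && is statically dead, so a current gcc
	 * never references helper() and this program links even though
	 * helper() has no definition.  gcc 2.96 reportedly still records
	 * the reference, which gives the undefined-symbol errors above;
	 * the patch below works around that with stub definitions.
	 */
	return (folded_none() && helper(x)) ? -1 : x;
}

int main(void)
{
	return maybe_alloc(0);
}
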
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index b57fbc6..9ab206b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2160,6 +2160,12 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) spin_unlock(&mm->page_table_lock); return 0; } +#else +/* Workaround for gcc 2.96 */ +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + return 0; +} #endif /* __PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_PMD_FOLDED @@ -2188,6 +2194,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) spin_unlock(&mm->page_table_lock); return 0; } +#else +/* Workaround for gcc 2.96 */ +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + return 0; +} #endif /* __PAGETABLE_PMD_FOLDED */ int make_pages_present(unsigned long addr, unsigned long end) -- cgit v1.1 From 3148890bfa4f36c9949871264e06ef4d449eeff9 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 28 Nov 2005 13:44:03 -0800 Subject: [PATCH] mm: __alloc_pages cleanup fix I believe this patch is required to fix breakage in the asynch reclaim watermark logic introduced by this patch: http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=7fb1d9fca5c6e3b06773b69165a73f3fb786b8ee Just some background of the watermark logic in case it isn't clear... Basically what we have is this: --- pages_high | | (a) | --- pages_low | | (b) | --- pages_min | | (c) | --- 0 Now when pages_low is reached, we want to kick asynch reclaim, which gives us an interval of "b" before we must start synch reclaim, and gives kswapd an interval of "a" before it need go back to sleep. When pages_min is reached, normal allocators must enter synch reclaim, but PF_MEMALLOC, ALLOC_HARDER, and ALLOC_HIGH (ie. atomic allocations, recursive allocations, etc.) get access to varying amounts of the reserve "c". Signed-off-by: Nick Piggin Cc: "Seth, Rohit" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1731236..b257720 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -773,9 +773,12 @@ again: } #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ -#define ALLOC_HARDER 0x02 /* try to alloc harder */ -#define ALLOC_HIGH 0x04 /* __GFP_HIGH set */ -#define ALLOC_CPUSET 0x08 /* check for correct cpuset */ +#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ +#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ +#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ /* * Return 1 if free pages are above 'mark'. 
This takes into account the order @@ -830,7 +833,14 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, continue; if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { - if (!zone_watermark_ok(*z, order, (*z)->pages_low, + unsigned long mark; + if (alloc_flags & ALLOC_WMARK_MIN) + mark = (*z)->pages_min; + else if (alloc_flags & ALLOC_WMARK_LOW) + mark = (*z)->pages_low; + else + mark = (*z)->pages_high; + if (!zone_watermark_ok(*z, order, mark, classzone_idx, alloc_flags)) continue; } @@ -871,7 +881,7 @@ restart: } page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, - zonelist, ALLOC_CPUSET); + zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); if (page) goto got_pg; @@ -888,7 +898,7 @@ restart: * cannot run direct reclaim, or if the caller has realtime scheduling * policy. */ - alloc_flags = 0; + alloc_flags = ALLOC_WMARK_MIN; if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) alloc_flags |= ALLOC_HARDER; if (gfp_mask & __GFP_HIGH) @@ -959,7 +969,7 @@ rebalance: * under heavy pressure. */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, - zonelist, ALLOC_CPUSET); + zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); if (page) goto got_pg; -- cgit v1.1 From f7b7fd8f3ebbb2810d6893295aa984acd0fd30db Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 28 Nov 2005 13:44:07 -0800 Subject: [PATCH] temporarily disable swap token on memory pressure Some users (hi Zwane) have seen a problem when running a workload that eats nearly all of physical memory - th system does an OOM kill, even when there is still a lot of swap free. The problem appears to be a very big task that is holding the swap token, and the VM has a very hard time finding any other page in the system that is swappable. Instead of ignoring the swap token when sc->priority reaches 0, we could simply take the swap token away from the memory hog and make sure we don't give it back to the memory hog for a few seconds. This patch resolves the problem Zwane ran into. Signed-off-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 26 ++++++++++---------------- mm/thrash.c | 10 +++++++--- mm/vmscan.c | 11 +++++++++-- 3 files changed, 26 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 6389cda..491ac35 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -290,7 +290,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, * repeatedly from either page_referenced_anon or page_referenced_file. */ static int page_referenced_one(struct page *page, - struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token) + struct vm_area_struct *vma, unsigned int *mapcount) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -311,7 +311,7 @@ static int page_referenced_one(struct page *page, /* Pretend the page is referenced if the task has the swap token and is in the middle of a page fault. 
*/ - if (mm != current->mm && !ignore_token && has_swap_token(mm) && + if (mm != current->mm && has_swap_token(mm) && rwsem_is_locked(&mm->mmap_sem)) referenced++; @@ -321,7 +321,7 @@ out: return referenced; } -static int page_referenced_anon(struct page *page, int ignore_token) +static int page_referenced_anon(struct page *page) { unsigned int mapcount; struct anon_vma *anon_vma; @@ -334,8 +334,7 @@ static int page_referenced_anon(struct page *page, int ignore_token) mapcount = page_mapcount(page); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - referenced += page_referenced_one(page, vma, &mapcount, - ignore_token); + referenced += page_referenced_one(page, vma, &mapcount); if (!mapcount) break; } @@ -354,7 +353,7 @@ static int page_referenced_anon(struct page *page, int ignore_token) * * This function is only called from page_referenced for object-based pages. */ -static int page_referenced_file(struct page *page, int ignore_token) +static int page_referenced_file(struct page *page) { unsigned int mapcount; struct address_space *mapping = page->mapping; @@ -392,8 +391,7 @@ static int page_referenced_file(struct page *page, int ignore_token) referenced++; break; } - referenced += page_referenced_one(page, vma, &mapcount, - ignore_token); + referenced += page_referenced_one(page, vma, &mapcount); if (!mapcount) break; } @@ -410,13 +408,10 @@ static int page_referenced_file(struct page *page, int ignore_token) * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. */ -int page_referenced(struct page *page, int is_locked, int ignore_token) +int page_referenced(struct page *page, int is_locked) { int referenced = 0; - if (!swap_token_default_timeout) - ignore_token = 1; - if (page_test_and_clear_young(page)) referenced++; @@ -425,15 +420,14 @@ int page_referenced(struct page *page, int is_locked, int ignore_token) if (page_mapped(page) && page->mapping) { if (PageAnon(page)) - referenced += page_referenced_anon(page, ignore_token); + referenced += page_referenced_anon(page); else if (is_locked) - referenced += page_referenced_file(page, ignore_token); + referenced += page_referenced_file(page); else if (TestSetPageLocked(page)) referenced++; else { if (page->mapping) - referenced += page_referenced_file(page, - ignore_token); + referenced += page_referenced_file(page); unlock_page(page); } } diff --git a/mm/thrash.c b/mm/thrash.c index eff3c18..f4c560b 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -57,14 +57,17 @@ void grab_swap_token(void) /* We have the token. Let others know we still need it. */ if (has_swap_token(current->mm)) { current->mm->recent_pagein = 1; + if (unlikely(!swap_token_default_timeout)) + disable_swap_token(); return; } if (time_after(jiffies, swap_token_check)) { - /* Can't get swapout protection if we exceed our RSS limit. */ - // if (current->mm->rss > current->mm->rlimit_rss) - // return; + if (!swap_token_default_timeout) { + swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; + return; + } /* ... or if we recently held the token. 
*/ if (time_before(jiffies, current->mm->swap_token_time)) @@ -95,6 +98,7 @@ void __put_swap_token(struct mm_struct *mm) { spin_lock(&swap_token_lock); if (likely(mm == swap_token_mm)) { + mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL; swap_token_mm = &init_mm; swap_token_check = jiffies; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 2813054..078cf92 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -407,7 +407,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) if (PageWriteback(page)) goto keep_locked; - referenced = page_referenced(page, 1, sc->priority <= 0); + referenced = page_referenced(page, 1); /* In active use or really unfreeable? Activate it. */ if (referenced && page_mapping_inuse(page)) goto activate_locked; @@ -756,7 +756,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) if (page_mapped(page)) { if (!reclaim_mapped || (total_swap_pages == 0 && PageAnon(page)) || - page_referenced(page, 0, sc->priority <= 0)) { + page_referenced(page, 0)) { list_add(&page->lru, &l_active); continue; } @@ -960,6 +960,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) sc.nr_reclaimed = 0; sc.priority = priority; sc.swap_cluster_max = SWAP_CLUSTER_MAX; + if (!priority) + disable_swap_token(); shrink_caches(zones, &sc); shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); if (reclaim_state) { @@ -1056,6 +1058,10 @@ loop_again: int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; + /* The swap token gets in the way of swapout... */ + if (!priority) + disable_swap_token(); + all_zones_ok = 1; if (nr_pages == 0) { @@ -1360,6 +1366,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) sc.nr_reclaimed = 0; /* scan at the highest priority */ sc.priority = 0; + disable_swap_token(); if (nr_pages > SWAP_CLUSTER_MAX) sc.swap_cluster_max = nr_pages; -- cgit v1.1 From ea164d73a7a0b2b2be3a1d8c2a8a4dab8999fa9c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 28 Nov 2005 13:44:15 -0800 Subject: [PATCH] shrinker->nr = LONG_MAX means deadlock for icache With Andrew Morton The slab scanning code tries to balance the scanning rate of slabs versus the scanning rate of LRU pages. To do this, it retains state concerning how many slabs have been scanned - if a particular slab shrinker didn't scan enough objects, we remember that for next time, and scan more objects on the next pass. The problem with this is that with (say) a huge number of GFP_NOIO direct-reclaim attempts, the number of objects which are to be scanned when we finally get a GFP_KERNEL request can be huge. Because some shrinker handlers just bail out if !__GFP_FS. So the patch clamps the number of objects-to-be-scanned to 2* the total number of objects in the slab cache. 
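A rough stand-alone model of the arithmetic (every number here is invented, and the loop only mimics the shrinker->nr bookkeeping, not shrink_slab() itself) shows why the clamp matters: passes that bail out keep adding their delta, and the deferred count soon dwarfs the cache itself.

#include <stdio.h>

int main(void)
{
	/* All values are invented for illustration. */
	unsigned long scanned = 128, lru_pages = 100000, seeks = 2;
	unsigned long max_pass = 50000;		/* objects currently in the cache */
	long nr = 0;				/* models shrinker->nr */
	int i;

	/* Many reclaim passes whose shrinker bails out (e.g. !__GFP_FS):
	 * each one still adds its share to the deferred count. */
	for (i = 0; i < 100000; i++) {
		unsigned long long delta = (4ULL * scanned) / seeks;
		delta *= max_pass;
		delta /= lru_pages + 1;
		nr += delta;
	}
	printf("deferred objects without clamp: %ld\n", nr);

	/* The fix: never try to free more than twice what is there. */
	if (nr > (long)(max_pass * 2))
		nr = max_pass * 2;
	printf("deferred objects with clamp:    %ld\n", nr);
	return 0;
}

With the figures above the unclamped total comes to roughly 12.7 million objects against a cache of 50,000; the clamp caps the next eligible pass at 100,000.
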
Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 078cf92..b0cd81c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -201,13 +201,25 @@ static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; unsigned long total_scan; + unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask); delta = (4 * scanned) / shrinker->seeks; - delta *= (*shrinker->shrinker)(0, gfp_mask); + delta *= max_pass; do_div(delta, lru_pages + 1); shrinker->nr += delta; - if (shrinker->nr < 0) - shrinker->nr = LONG_MAX; /* It wrapped! */ + if (shrinker->nr < 0) { + printk(KERN_ERR "%s: nr=%ld\n", + __FUNCTION__, shrinker->nr); + shrinker->nr = max_pass; + } + + /* + * Avoid risking looping forever due to too large nr value: + * never try to free more than twice the estimate number of + * freeable entries. + */ + if (shrinker->nr > max_pass * 2) + shrinker->nr = max_pass * 2; total_scan = shrinker->nr; shrinker->nr = 0; -- cgit v1.1 From fa2a455b028f3b6ca4dae129c6337d7edf21f12c Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 29 Nov 2005 18:43:17 +1100 Subject: [PATCH] Fix vma argument in get_usr_pages() for gate areas The system call gate area handling called vm_normal_page() with the wrong vma (which was always NULL, and caused an oops). Signed-off-by: Nick Piggin Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 9ab206b..6c1eac9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -988,7 +988,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? : -EFAULT; } if (pages) { - struct page *page = vm_normal_page(vma, start, *pte); + struct page *page = vm_normal_page(gate_vma, start, *pte); pages[i] = page; if (page) get_page(page); -- cgit v1.1 From eca351336acb2fa943611e0846562ce3997ef53b Mon Sep 17 00:00:00 2001 From: Ben Collins Date: Tue, 29 Nov 2005 11:45:26 -0800 Subject: [PATCH] Fix missing pfn variables caused by vm changes I image this showed up because of "unused var..." when the changes occured, because flush_cache_page() is a noop in most places. This showed up for me on parisc however, where flush_cache_page() is a real function. 
Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++-- mm/rmap.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 6c1eac9..74839b3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1345,7 +1345,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, int reuse = can_share_swap_page(old_page); unlock_page(old_page); if (reuse) { - flush_cache_page(vma, address, pfn); + flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); ptep_set_access_flags(vma, address, page_table, entry, 1); @@ -1389,7 +1389,7 @@ gotten: } } else inc_mm_counter(mm, anon_rss); - flush_cache_page(vma, address, pfn); + flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); ptep_establish(vma, address, page_table, entry); diff --git a/mm/rmap.c b/mm/rmap.c index 491ac35..f853c6d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -641,7 +641,7 @@ static void try_to_unmap_cluster(unsigned long cursor, continue; /* Nuke the page table entry. */ - flush_cache_page(vma, address, pfn); + flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush(vma, address, pte); /* If nonlinear, store the file page offset in the pte. */ -- cgit v1.1 From 238f58d898df941aa9d1cb390fb27ff4febe8965 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 29 Nov 2005 13:01:56 -0800 Subject: Support strange discontiguous PFN remappings These get created by some drivers that don't generally even want a pfn remapping at all, but would really mostly prefer to just map pages they've allocated individually instead. For now, create a helper function that turns such an incomplete PFN remapping call into a loop that does that explicit mapping. In the long run we almost certainly want to export a totally different interface for that, though. Signed-off-by: Linus Torvalds --- mm/memory.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 74839b3..990e7dc6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1147,6 +1147,95 @@ int zeromap_page_range(struct vm_area_struct *vma, } /* + * This is the old fallback for page remapping. + * + * For historical reasons, it only allows reserved pages. Only + * old drivers should use this, and they needed to mark their + * pages reserved for the old functions anyway. + */ +static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) +{ + int retval; + pgd_t * pgd; + pud_t * pud; + pmd_t * pmd; + pte_t * pte; + spinlock_t *ptl; + + retval = -EINVAL; + if (PageAnon(page) || !PageReserved(page)) + goto out; + retval = -ENOMEM; + flush_dcache_page(page); + pgd = pgd_offset(mm, addr); + pud = pud_alloc(mm, pgd, addr); + if (!pud) + goto out; + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + goto out; + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + goto out; + retval = -EBUSY; + if (!pte_none(*pte)) + goto out_unlock; + + /* Ok, finally just insert the thing.. */ + get_page(page); + inc_mm_counter(mm, file_rss); + page_add_file_rmap(page); + set_pte_at(mm, addr, pte, mk_pte(page, prot)); + + retval = 0; +out_unlock: + pte_unmap_unlock(pte, ptl); +out: + return retval; +} + +/* + * Somebody does a pfn remapping that doesn't actually work as a vma. + * + * Do it as individual pages instead, and warn about it. 
It's bad form, + * and very inefficient. + */ +static int incomplete_pfn_remap(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + static int warn = 10; + struct page *page; + int retval; + + if (!(vma->vm_flags & VM_INCOMPLETE)) { + if (warn) { + warn--; + printk("%s does an incomplete pfn remapping", current->comm); + dump_stack(); + } + } + vma->vm_flags |= VM_INCOMPLETE | VM_IO | VM_RESERVED; + + if (start < vma->vm_start || end > vma->vm_end) + return -EINVAL; + + if (!pfn_valid(pfn)) + return -EINVAL; + + retval = 0; + page = pfn_to_page(pfn); + while (start < end) { + retval = insert_page(vma->vm_mm, start, page, prot); + if (retval < 0) + break; + start += PAGE_SIZE; + page++; + } + return retval; +} + +/* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") @@ -1220,6 +1309,9 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, struct mm_struct *mm = vma->vm_mm; int err; + if (addr != vma->vm_start || end != vma->vm_end) + return incomplete_pfn_remap(vma, addr, end, pfn, prot); + /* * Physically remapped pages are special. Tell the * rest of the world about it: -- cgit v1.1 From c9cfcddfd65735437a4cb8563d6b66a6da8a5ed6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 29 Nov 2005 14:03:14 -0800 Subject: VM: add common helper function to create the page tables This logic was duplicated four times, for no good reason. Signed-off-by: Linus Torvalds --- mm/fremap.c | 24 ++---------------------- mm/memory.c | 26 ++++++++++++++------------ 2 files changed, 16 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index f851775..9f381e5 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -55,20 +55,10 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, pgoff_t size; int err = -ENOMEM; pte_t *pte; - pmd_t *pmd; - pud_t *pud; - pgd_t *pgd; pte_t pte_val; spinlock_t *ptl; - pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); - if (!pud) - goto out; - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - goto out; - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; @@ -110,20 +100,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, { int err = -ENOMEM; pte_t *pte; - pmd_t *pmd; - pud_t *pud; - pgd_t *pgd; pte_t pte_val; spinlock_t *ptl; - pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); - if (!pud) - goto out; - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - goto out; - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; diff --git a/mm/memory.c b/mm/memory.c index 990e7dc6..74f95ae 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1146,6 +1146,18 @@ int zeromap_page_range(struct vm_area_struct *vma, return err; } +pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) +{ + pgd_t * pgd = pgd_offset(mm, addr); + pud_t * pud = pud_alloc(mm, pgd, addr); + if (pud) { + pmd_t * pmd = pmd_alloc(mm, pgd, addr); + if (pmd) + return pte_alloc_map_lock(mm, pmd, addr, ptl); + } + return NULL; +} + /* * This is the old fallback for page remapping. 
* @@ -1156,10 +1168,7 @@ int zeromap_page_range(struct vm_area_struct *vma, static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) { int retval; - pgd_t * pgd; - pud_t * pud; - pmd_t * pmd; - pte_t * pte; + pte_t *pte; spinlock_t *ptl; retval = -EINVAL; @@ -1167,14 +1176,7 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa goto out; retval = -ENOMEM; flush_dcache_page(page); - pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); - if (!pud) - goto out; - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - goto out; - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; retval = -EBUSY; -- cgit v1.1 From 5d2a2dbbc1025dbf7998b9289574d9592b8f21cc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 29 Nov 2005 14:07:55 -0800 Subject: cow_user_page: fix page alignment High Dickins points out that the user virtual address passed to the page fault handler isn't necessarily page-aligned. Also, add a comment on why the copy could fail for the user address case. Signed-off-by: Linus Torvalds --- mm/memory.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 74f95ae..745b348 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1394,8 +1394,15 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo */ if (unlikely(!src)) { void *kaddr = kmap_atomic(dst, KM_USER0); - unsigned long left = __copy_from_user_inatomic(kaddr, (void __user *)va, PAGE_SIZE); - if (left) + void __user *uaddr = (void __user *)(va & PAGE_MASK); + + /* + * This really shouldn't fail, because the page is there + * in the page tables. But it might just be unreadable, + * in which case we just give up and fill the result with + * zeroes. + */ + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) memset(kaddr, 0, PAGE_SIZE); kunmap_atomic(kaddr, KM_USER0); return; -- cgit v1.1 From e5bbe4dfc8dbfc50ef89f8641e020616d4d1e69e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 Nov 2005 16:54:51 +0000 Subject: [PATCH] pfnmap: remove src_page from do_wp_page Clean away do_wp_page's "src_page": cow_user_page makes it unnecessary. 
Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 745b348..ae259b6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1433,12 +1433,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte) { - struct page *old_page, *src_page, *new_page; + struct page *old_page, *new_page; pte_t entry; int ret = VM_FAULT_MINOR; old_page = vm_normal_page(vma, address, orig_pte); - src_page = old_page; if (!old_page) goto gotten; @@ -1466,7 +1465,7 @@ gotten: if (unlikely(anon_vma_prepare(vma))) goto oom; - if (src_page == ZERO_PAGE(address)) { + if (old_page == ZERO_PAGE(address)) { new_page = alloc_zeroed_user_highpage(vma, address); if (!new_page) goto oom; @@ -1474,7 +1473,7 @@ gotten: new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!new_page) goto oom; - cow_user_page(new_page, src_page, address); + cow_user_page(new_page, old_page, address); } /* -- cgit v1.1 From 325f04dbca60a4cfe4ac25e7cf246edd07eb4c5f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 Nov 2005 16:55:48 +0000 Subject: [PATCH] pfnmap: do_no_page BUG_ON again Use copy_user_highpage directly instead of cow_user_page in do_no_page: in the immediately following page_cache_release, and elsewhere, it is assuming that new_page is normal. If any VM_PFNMAP driver can get to do_no_page, it's just a BUG (but not in the case of do_anonymous_page). Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index ae259b6..5bfa52a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2009,6 +2009,8 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, int anon = 0; pte_unmap(page_table); + BUG_ON(vma->vm_flags & VM_PFNMAP); + if (vma->vm_file) { mapping = vma->vm_file->f_mapping; sequence = mapping->truncate_count; @@ -2041,7 +2043,7 @@ retry: page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!page) goto oom; - cow_user_page(page, new_page, address); + copy_user_highpage(page, new_page, address); page_cache_release(new_page); new_page = page; anon = 1; -- cgit v1.1 From 49c91fb01ff3948285608c65754b3ffbf57d50f2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 29 Nov 2005 19:27:22 -0500 Subject: [PATCH] VM: Fix typos in get_locked_pte Signed-off-by: Trond Myklebust Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 5bfa52a..8d10b55 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1146,12 +1146,12 @@ int zeromap_page_range(struct vm_area_struct *vma, return err; } -pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) +pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pgd_t * pgd = pgd_offset(mm, addr); pud_t * pud = pud_alloc(mm, pgd, addr); if (pud) { - pmd_t * pmd = pmd_alloc(mm, pgd, addr); + pmd_t * pmd = pmd_alloc(mm, pud, addr); if (pmd) return pte_alloc_map_lock(mm, pmd, addr, ptl); } -- cgit v1.1
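
Taken together, the series leaves remap_pfn_range() as the single call a driver needs for a raw PFN window. A minimal sketch of an mmap() handler against this 2.6.15-era API (mydev_mmap() and mydev_phys are invented for illustration; a real driver would also validate the requested size and likely use pgprot_noncached()):

#include <linux/fs.h>
#include <linux/mm.h>

static unsigned long mydev_phys;	/* hypothetical physical base of the device window */

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	/*
	 * remap_pfn_range() marks the vma VM_IO | VM_RESERVED | VM_PFNMAP
	 * and stores the first pfn in vma->vm_pgoff, so every pte it
	 * installs obeys
	 *
	 *	pte_pfn(pte) == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
	 *
	 * No page in the range needs to be marked reserved.
	 */
	return remap_pfn_range(vma, vma->vm_start,
			       mydev_phys >> PAGE_SHIFT, size,
			       vma->vm_page_prot);
}

Because the vma carries VM_PFNMAP and vm_pgoff records the first pfn, vm_normal_page() returns NULL for every pte in the range and the core VM never touches those entries; only a COW'ed private copy, whose pfn no longer matches the rule, is handed back as a normal page.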