From 4f5e387650ca965426a7b4681120988b4ba0e116 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 27 Apr 2012 08:42:53 -0700
Subject: percpu: pcpu_embed_first_chunk() should free unused parts after all
 allocs are complete

commit 42b64281453249dac52861f9b97d18552a7ec62b upstream.

pcpu_embed_first_chunk() allocates memory for each node, copies percpu
data and frees unused portions of it before proceeding to the next
group.  This assumes that allocations for different nodes doesn't
overlap; however, depending on memory topology, the bootmem allocator
may end up allocating memory from a different node than the requested
one which may overlap with the portion freed from one of the previous
percpu areas.  This leads to percpu groups for different nodes
overlapping which is a serious bug.

This patch separates out copy & partial free from the allocation loop
such that all allocations are complete before partial frees happen.

This also fixes overlapping frees which could happen on allocation
failure path - out_free_areas path frees whole groups but the groups
could have portions freed at that point.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: "Pavel V. Panteleev" <pp_84@mail.ru>
Tested-by: "Pavel V. Panteleev" <pp_84@mail.ru>
LKML-Reference: <E1SNhwY-0007ui-V7.pp_84-mail-ru@f220.mail.ru>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/percpu.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'mm')

diff --git a/mm/percpu.c b/mm/percpu.c
index 0ae7a09..af0cc7a 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1630,6 +1630,16 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
 		areas[group] = ptr;
 
 		base = min(ptr, base);
+	}
+
+	/*
+	 * Copy data and free unused parts.  This should happen after all
+	 * allocations are complete; otherwise, we may end up with
+	 * overlapping groups.
+	 */
+	for (group = 0; group < ai->nr_groups; group++) {
+		struct pcpu_group_info *gi = &ai->groups[group];
+		void *ptr = areas[group];
 
 		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
 			if (gi->cpu_map[i] == NR_CPUS) {
-- 
cgit v1.1


From afe85051b486d5d558d93ab2e509dc3bfdffc2c5 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Thu, 10 May 2012 13:01:44 -0700
Subject: hugetlb: prevent BUG_ON in hugetlb_fault() -> hugetlb_cow()

commit 4998a6c0edce7fae9c0a5463f6ec3fa585258ee7 upstream.

Commit 66aebce747eaf ("hugetlb: fix race condition in hugetlb_fault()")
added code to avoid a race condition by elevating the page refcount in
hugetlb_fault() while calling hugetlb_cow().

However, one code path in hugetlb_cow() includes an assertion that the
page count is 1, whereas it may now also have the value 2 in this path.

The consensus is that this BUG_ON has served its purpose, so rather than
extending it to cover both cases, we just remove it.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Hillf Danton <dhillf@gmail.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/hugetlb.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f7001ac..00b0abb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2398,7 +2398,6 @@ retry_avoidcopy:
 		if (outside_reserve) {
 			BUG_ON(huge_pte_none(pte));
 			if (unmap_ref_private(mm, vma, old_page, address)) {
-				BUG_ON(page_count(old_page) != 1);
 				BUG_ON(huge_pte_none(pte));
 				spin_lock(&mm->page_table_lock);
 				goto retry_avoidcopy;
-- 
cgit v1.1


From c928e8c32d44cbd9082455c45fd6f288b310f0f7 Mon Sep 17 00:00:00 2001
From: Russ Anderson <rja@sgi.com>
Date: Thu, 10 May 2012 13:01:46 -0700
Subject: mm: nobootmem: fix sign extend problem in __free_pages_memory()

commit 6bc2e853c6b46a6041980d58200ad9b0a73a60ff upstream.

Systems with 8 TBytes of memory or greater can hit a problem where only
the the first 8 TB of memory shows up.  This is due to "int i" being
smaller than "unsigned long start_aligned", causing the high bits to be
dropped.

The fix is to change `i' to unsigned long to match start_aligned
and end_aligned.

Thanks to Jack Steiner for assistance tracking this down.

Signed-off-by: Russ Anderson <rja@sgi.com>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/nobootmem.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 6e93dc7..e39e3ef 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -83,8 +83,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
 
 static void __init __free_pages_memory(unsigned long start, unsigned long end)
 {
-	int i;
-	unsigned long start_aligned, end_aligned;
+	unsigned long i, start_aligned, end_aligned;
 	int order = ilog2(BITS_PER_LONG);
 
 	start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
-- 
cgit v1.1


From 37de6be49f6c88f7d1306594b7aae5aeee2fa499 Mon Sep 17 00:00:00 2001
From: Sha Zhengju <handai.szj@taobao.com>
Date: Thu, 10 May 2012 13:01:45 -0700
Subject: memcg: free spare array to avoid memory leak

commit 8c7577637ca31385e92769a77e2ab5b428e8b99c upstream.

When the last event is unregistered, there is no need to keep the spare
array anymore.  So free it to avoid memory leak.

Signed-off-by: Sha Zhengju <handai.szj@taobao.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/memcontrol.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 283068f..ffb99b4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4605,6 +4605,12 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
 swap_buffers:
 	/* Swap primary and spare array */
 	thresholds->spare = thresholds->primary;
+	/* If all events are unregistered, free the spare array */
+	if (!new) {
+		kfree(thresholds->spare);
+		thresholds->spare = NULL;
+	}
+
 	rcu_assign_pointer(thresholds->primary, new);
 
 	/* To be sure that nobody uses thresholds */
-- 
cgit v1.1


From beb9576530bc9a2685847fa1fcbf96b97686fcaf Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 23 May 2012 12:48:13 +0100
Subject: mm: mempolicy: Let vma_merge and vma_split handle vma->vm_policy
 linkages

commit 05f144a0d5c2207a0349348127f996e104ad7404 upstream.

Dave Jones' system call fuzz testing tool "trinity" triggered the
following bug error with slab debugging enabled

    =============================================================================
    BUG numa_policy (Not tainted): Poison overwritten
    -----------------------------------------------------------------------------

    INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b
    INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154
     __slab_alloc+0x3d3/0x445
     kmem_cache_alloc+0x29d/0x2b0
     mpol_new+0xa3/0x140
     sys_mbind+0x142/0x620
     system_call_fastpath+0x16/0x1b
    INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154
     __slab_free+0x2e/0x1de
     kmem_cache_free+0x25a/0x260
     __mpol_put+0x27/0x30
     remove_vma+0x68/0x90
     exit_mmap+0x118/0x140
     mmput+0x73/0x110
     exit_mm+0x108/0x130
     do_exit+0x162/0xb90
     do_group_exit+0x4f/0xc0
     sys_exit_group+0x17/0x20
     system_call_fastpath+0x16/0x1b
    INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x          (null) flags=0x20000000004080
    INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0

This implied a reference counting bug and the problem happened during
mbind().

mbind() applies a new memory policy to a range and uses mbind_range() to
merge existing VMAs or split them as necessary.  In the event of splits,
mpol_dup() will allocate a new struct mempolicy and maintain existing
reference counts whose rules are documented in
Documentation/vm/numa_memory_policy.txt .

The problem occurs with shared memory policies.  The vm_op->set_policy
increments the reference count if necessary and split_vma() and
vma_merge() have already handled the existing reference counts.
However, policy_vma() screws it up by replacing an existing
vma->vm_policy with one that potentially has the wrong reference count
leading to a premature free.  This patch removes the damage caused by
policy_vma().

With this patch applied Dave's trinity tool runs an mbind test for 5
minutes without error.  /proc/slabinfo reported that there are no
numa_policy or shared_policy_node objects allocated after the test
completed and the shared memory region was deleted.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Dave Jones <davej@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Stephen Wilson <wilsons@start.ca>
Cc: Christoph Lameter <cl@linux.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/mempolicy.c | 41 +++++++++++++++++------------------------
 1 file changed, 17 insertions(+), 24 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a85171d..3dac2d1 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -606,27 +606,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	return first;
 }
 
-/* Apply policy to a single VMA */
-static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
-{
-	int err = 0;
-	struct mempolicy *old = vma->vm_policy;
-
-	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
-		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
-		 vma->vm_ops, vma->vm_file,
-		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
-
-	if (vma->vm_ops && vma->vm_ops->set_policy)
-		err = vma->vm_ops->set_policy(vma, new);
-	if (!err) {
-		mpol_get(new);
-		vma->vm_policy = new;
-		mpol_put(old);
-	}
-	return err;
-}
-
 /* Step 2: apply policy to a range and do splits. */
 static int mbind_range(struct mm_struct *mm, unsigned long start,
 		       unsigned long end, struct mempolicy *new_pol)
@@ -666,9 +645,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 			if (err)
 				goto out;
 		}
-		err = policy_vma(vma, new_pol);
-		if (err)
-			goto out;
+
+		/*
+		 * Apply policy to a single VMA. The reference counting of
+		 * policy for vma_policy linkages has already been handled by
+		 * vma_merge and split_vma as necessary. If this is a shared
+		 * policy then ->set_policy will increment the reference count
+		 * for an sp node.
+		 */
+		pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+			vma->vm_start, vma->vm_end, vma->vm_pgoff,
+			vma->vm_ops, vma->vm_file,
+			vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+		if (vma->vm_ops && vma->vm_ops->set_policy) {
+			err = vma->vm_ops->set_policy(vma, new_pol);
+			if (err)
+				goto out;
+		}
 	}
 
  out:
-- 
cgit v1.1


From dce59c2faeb130855bd05d025d854ae79d8dbedd Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Tue, 29 May 2012 15:06:45 -0700
Subject: mm: consider all swapped back pages in used-once logic

commit e48982734ea0500d1eba4f9d96195acc5406cad6 upstream.

Commit 645747462435 ("vmscan: detect mapped file pages used only once")
made mapped pages have another round in inactive list because they might
be just short lived and so we could consider them again next time.  This
heuristic helps to reduce pressure on the active list with a streaming
IO worklods.

This patch fixes a regression introduced by this commit for heavy shmem
based workloads because unlike Anon pages, which are excluded from this
heuristic because they are usually long lived, shmem pages are handled
as a regular page cache.

This doesn't work quite well, unfortunately, if the workload is mostly
backed by shmem (in memory database sitting on 80% of memory) with a
streaming IO in the background (backup - up to 20% of memory).  Anon
inactive list is full of (dirty) shmem pages when watermarks are hit.
Shmem pages are kept in the inactive list (they are referenced) in the
first round and it is hard to reclaim anything else so we reach lower
scanning priorities very quickly which leads to an excessive swap out.

Let's fix this by excluding all swap backed pages (they tend to be long
lived wrt.  the regular page cache anyway) from used-once heuristic and
rather activate them if they are referenced.

The customer's workload is shmem backed database (80% of RAM) and they
are measuring transactions/s with an IO in the background (20%).
Transactions touch more or less random rows in the table.  The
transaction rate fell by a factor of 3 (in the worst case) because of
commit 64574746.  This patch restores the previous numbers.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Minchan Kim <minchan@kernel.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6072d74..769935d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -665,7 +665,7 @@ static enum page_references page_check_references(struct page *page,
 		return PAGEREF_RECLAIM;
 
 	if (referenced_ptes) {
-		if (PageAnon(page))
+		if (PageSwapBacked(page))
 			return PAGEREF_ACTIVATE;
 		/*
 		 * All mapped pages start out with page table
-- 
cgit v1.1


From 5c2d31dda012797578d012425a785d58e14d2053 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Tue, 10 Jan 2012 15:08:39 -0800
Subject: mm/vmalloc.c: change void* into explict vm_struct*

commit db1aecafef58b5dda39c4228debe2c845e4a27ab upstream.

vmap_area->private is void* but we don't use the field for various purpose
but use only for vm_struct.  So change it to a vm_struct* with naming to
improve for readability and type checking.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmalloc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 43b44db..3e927cc 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -256,7 +256,7 @@ struct vmap_area {
 	struct rb_node rb_node;		/* address sorted rbtree */
 	struct list_head list;		/* address sorted list */
 	struct list_head purge_list;	/* "lazy purge" list */
-	void *private;
+	struct vm_struct *vm;
 	struct rcu_head rcu_head;
 };
 
@@ -1274,7 +1274,7 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
 	vm->addr = (void *)va->va_start;
 	vm->size = va->va_end - va->va_start;
 	vm->caller = caller;
-	va->private = vm;
+	va->vm = vm;
 	va->flags |= VM_VM_AREA;
 }
 
@@ -1397,7 +1397,7 @@ static struct vm_struct *find_vm_area(const void *addr)
 
 	va = find_vmap_area((unsigned long)addr);
 	if (va && va->flags & VM_VM_AREA)
-		return va->private;
+		return va->vm;
 
 	return NULL;
 }
@@ -1416,7 +1416,7 @@ struct vm_struct *remove_vm_area(const void *addr)
 
 	va = find_vmap_area((unsigned long)addr);
 	if (va && va->flags & VM_VM_AREA) {
-		struct vm_struct *vm = va->private;
+		struct vm_struct *vm = va->vm;
 
 		if (!(vm->flags & VM_UNLIST)) {
 			struct vm_struct *tmp, **p;
-- 
cgit v1.1


From c201beec4842674cd4773771931d25c9a5d45d66 Mon Sep 17 00:00:00 2001
From: KyongHo <pullip.cho@samsung.com>
Date: Tue, 29 May 2012 15:06:49 -0700
Subject: mm: fix faulty initialization in vmalloc_init()

commit dbda591d920b4c7692725b13e3f68ecb251e9080 upstream.

The transfer of ->flags causes some of the static mapping virtual
addresses to be prematurely freed (before the mapping is removed) because
VM_LAZY_FREE gets "set" if tmp->flags has VM_IOREMAP set.  This might
cause subsequent vmalloc/ioremap calls to fail because it might allocate
one of the freed virtual address ranges that aren't unmapped.

va->flags has different types of flags from tmp->flags.  If a region with
VM_IOREMAP set is registered with vm_area_add_early(), it will be removed
by __purge_vmap_area_lazy().

Fix vmalloc_init() to correctly initialize vmap_area for the given
vm_struct.

Also initialise va->vm.  If it is not set, find_vm_area() for the early
vm regions will always fail.

Signed-off-by: KyongHo Cho <pullip.cho@samsung.com>
Cc: "Olav Haugan" <ohaugan@codeaurora.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmalloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3e927cc..bdb7004 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1174,9 +1174,10 @@ void __init vmalloc_init(void)
 	/* Import existing vmlist entries. */
 	for (tmp = vmlist; tmp; tmp = tmp->next) {
 		va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
-		va->flags = tmp->flags | VM_VM_AREA;
+		va->flags = VM_VM_AREA;
 		va->va_start = (unsigned long)tmp->addr;
 		va->va_end = va->va_start + tmp->size;
+		va->vm = tmp;
 		__insert_vmap_area(va);
 	}
 
-- 
cgit v1.1


From 2209ffb965c6b17602aae5e637961e4f0f8a4162 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Tue, 29 May 2012 15:06:46 -0700
Subject: hugetlb: fix resv_map leak in error path

commit c50ac050811d6485616a193eb0f37bfbd191cc89 and
4523e1458566a0e8ecfaff90f380dd23acc44d27 upstream.

When called for anonymous (non-shared) mappings, hugetlb_reserve_pages()
does a resv_map_alloc().  It depends on code in hugetlbfs's
vm_ops->close() to release that allocation.

However, in the mmap() failure path, we do a plain unmap_region() without
the remove_vma() which actually calls vm_ops->close().

This is a decent fix.  This leak could get reintroduced if new code (say,
after hugetlb_reserve_pages() in hugetlbfs_file_mmap()) decides to return
an error.  But, I think it would have to unroll the reservation anyway.

Christoph's test case:

	http://marc.info/?l=linux-mm&m=133728900729735

This patch applies to 3.4 and later.  A version for earlier kernels is at
https://lkml.org/lkml/2012/5/22/418.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reported-by: Christoph Lameter <cl@linux.com>
Tested-by: Christoph Lameter <cl@linux.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/hugetlb.c | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 00b0abb..05f8fd4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2060,6 +2060,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
 		kref_get(&reservations->refs);
 }
 
+static void resv_map_put(struct vm_area_struct *vma)
+{
+	struct resv_map *reservations = vma_resv_map(vma);
+
+	if (!reservations)
+		return;
+	kref_put(&reservations->refs, resv_map_release);
+}
+
 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
 	struct hstate *h = hstate_vma(vma);
@@ -2075,7 +2084,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 		reserve = (end - start) -
 			region_count(&reservations->regions, start, end);
 
-		kref_put(&reservations->refs, resv_map_release);
+		resv_map_put(vma);
 
 		if (reserve) {
 			hugetlb_acct_memory(h, -reserve);
@@ -2877,12 +2886,16 @@ int hugetlb_reserve_pages(struct inode *inode,
 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
 	}
 
-	if (chg < 0)
-		return chg;
+	if (chg < 0) {
+		ret = chg;
+		goto out_err;
+	}
 
 	/* There must be enough filesystem quota for the mapping */
-	if (hugetlb_get_quota(inode->i_mapping, chg))
-		return -ENOSPC;
+	if (hugetlb_get_quota(inode->i_mapping, chg)) {
+		ret = -ENOSPC;
+		goto out_err;
+	}
 
 	/*
 	 * Check enough hugepages are available for the reservation.
@@ -2891,7 +2904,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
 		hugetlb_put_quota(inode->i_mapping, chg);
-		return ret;
+		goto out_err;
 	}
 
 	/*
@@ -2908,6 +2921,10 @@ int hugetlb_reserve_pages(struct inode *inode,
 	if (!vma || vma->vm_flags & VM_MAYSHARE)
 		region_add(&inode->i_mapping->private_list, from, to);
 	return 0;
+out_err:
+	if (vma)
+		resv_map_put(vma);
+	return ret;
 }
 
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
-- 
cgit v1.1


From 32ef2126fabc8a984506a1a4e83f3459ba1a2075 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@huawei.com>
Date: Wed, 11 Jul 2012 14:01:52 -0700
Subject: memory hotplug: fix invalid memory access caused by stale kswapd
 pointer

commit d8adde17e5f858427504725218c56aef90e90fc7 upstream.

kswapd_stop() is called to destroy the kswapd work thread when all memory
of a NUMA node has been offlined.  But kswapd_stop() only terminates the
work thread without resetting NODE_DATA(nid)->kswapd to NULL.  The stale
pointer will prevent kswapd_run() from creating a new work thread when
adding memory to the memory-less NUMA node again.  Eventually the stale
pointer may cause invalid memory access.

An example stack dump as below. It's reproduced with 2.6.32, but latest
kernel has the same issue.

  BUG: unable to handle kernel NULL pointer dereference at (null)
  IP: [<ffffffff81051a94>] exit_creds+0x12/0x78
  PGD 0
  Oops: 0000 [#1] SMP
  last sysfs file: /sys/devices/system/memory/memory391/state
  CPU 11
  Modules linked in: cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq microcode fuse loop dm_mod tpm_tis rtc_cmos i2c_i801 rtc_core tpm serio_raw pcspkr sg tpm_bios igb i2c_core iTCO_wdt rtc_lib mptctl iTCO_vendor_support button dca bnx2 usbhid hid uhci_hcd ehci_hcd usbcore sd_mod crc_t10dif edd ext3 mbcache jbd fan ide_pci_generic ide_core ata_generic ata_piix libata thermal processor thermal_sys hwmon mptsas mptscsih mptbase scsi_transport_sas scsi_mod
  Pid: 7949, comm: sh Not tainted 2.6.32.12-qiuxishi-5-default #92 Tecal RH2285
  RIP: 0010:exit_creds+0x12/0x78
  RSP: 0018:ffff8806044f1d78  EFLAGS: 00010202
  RAX: 0000000000000000 RBX: ffff880604f22140 RCX: 0000000000019502
  RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000
  RBP: ffff880604f22150 R08: 0000000000000000 R09: ffffffff81a4dc10
  R10: 00000000000032a0 R11: ffff880006202500 R12: 0000000000000000
  R13: 0000000000c40000 R14: 0000000000008000 R15: 0000000000000001
  FS:  00007fbc03d066f0(0000) GS:ffff8800282e0000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
  CR2: 0000000000000000 CR3: 000000060f029000 CR4: 00000000000006e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
  Process sh (pid: 7949, threadinfo ffff8806044f0000, task ffff880603d7c600)
  Stack:
   ffff880604f22140 ffffffff8103aac5 ffff880604f22140 ffffffff8104d21e
   ffff880006202500 0000000000008000 0000000000c38000 ffffffff810bd5b1
   0000000000000000 ffff880603d7c600 00000000ffffdd29 0000000000000003
  Call Trace:
    __put_task_struct+0x5d/0x97
    kthread_stop+0x50/0x58
    offline_pages+0x324/0x3da
    memory_block_change_state+0x179/0x1db
    store_mem_state+0x9e/0xbb
    sysfs_write_file+0xd0/0x107
    vfs_write+0xad/0x169
    sys_write+0x45/0x6e
    system_call_fastpath+0x16/0x1b
  Code: ff 4d 00 0f 94 c0 84 c0 74 08 48 89 ef e8 1f fd ff ff 5b 5d 31 c0 41 5c c3 53 48 8b 87 20 06 00 00 48 89 fb 48 8b bf 18 06 00 00 <8b> 00 48 c7 83 18 06 00 00 00 00 00 00 f0 ff 0f 0f 94 c0 84 c0
  RIP  exit_creds+0x12/0x78
   RSP <ffff8806044f1d78>
  CR2: 0000000000000000

[akpm@linux-foundation.org: add pglist_data.kswapd locking comments]
Signed-off-by: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 769935d..1b0ed36 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2952,14 +2952,17 @@ int kswapd_run(int nid)
 }
 
 /*
- * Called by memory hotplug when all memory in a node is offlined.
+ * Called by memory hotplug when all memory in a node is offlined.  Caller must
+ * hold lock_memory_hotplug().
  */
 void kswapd_stop(int nid)
 {
 	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
 
-	if (kswapd)
+	if (kswapd) {
 		kthread_stop(kswapd);
+		NODE_DATA(nid)->kswapd = NULL;
+	}
 }
 
 static int __init kswapd_init(void)
-- 
cgit v1.1


From c58c52e0f44d2883ddc31ac021b88a121b332982 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 11 Jul 2012 14:02:13 -0700
Subject: mm, thp: abort compaction if migration page cannot be charged to
 memcg

commit 4bf2bba3750f10aa9e62e6949bc7e8329990f01b upstream.

If page migration cannot charge the temporary page to the memcg,
migrate_pages() will return -ENOMEM.  This isn't considered in memory
compaction however, and the loop continues to iterate over all
pageblocks trying to isolate and migrate pages.  If a small number of
very large memcgs happen to be oom, however, these attempts will mostly
be futile leading to an enormous amout of cpu consumption due to the
page migration failures.

This patch will short circuit and fail memory compaction if
migrate_pages() returns -ENOMEM.  COMPACT_PARTIAL is returned in case
some migrations were successful so that the page allocator will retry.

Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/compaction.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index c4bc5ac..adc5336 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -596,8 +596,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		if (err) {
 			putback_lru_pages(&cc->migratepages);
 			cc->nr_migratepages = 0;
+			if (err == -ENOMEM) {
+				ret = COMPACT_PARTIAL;
+				goto out;
+			}
 		}
-
 	}
 
 out:
-- 
cgit v1.1


From e12fcd38abe8a869cbabd77724008f1cf812a3e7 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Thu, 5 Jul 2012 16:00:11 -0700
Subject: mm: Hold a file reference in madvise_remove

commit 9ab4233dd08036fe34a89c7dc6f47a8bf2eb29eb upstream.

Otherwise the code races with munmap (causing a use-after-free
of the vma) or with close (causing a use-after-free of the struct
file).

The bug was introduced by commit 90ed52ebe481 ("[PATCH] holepunch: fix
mmap_sem i_mutex deadlock")

[bwh: Backported to 3.2:
 - Adjust context
 - madvise_remove() calls vmtruncate_range(), not do_fallocate()]
[luto: Backported to 3.0: Adjust context]

Cc: Hugh Dickins <hugh@veritas.com>
Cc: Miklos Szeredi <mszeredi@suse.cz>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/madvise.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491..deabe5f6 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -13,6 +13,7 @@
 #include <linux/hugetlb.h>
 #include <linux/sched.h>
 #include <linux/ksm.h>
+#include <linux/file.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -197,14 +198,16 @@ static long madvise_remove(struct vm_area_struct *vma,
 	struct address_space *mapping;
 	loff_t offset, endoff;
 	int error;
+	struct file *f;
 
 	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
 
 	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
 		return -EINVAL;
 
-	if (!vma->vm_file || !vma->vm_file->f_mapping
-		|| !vma->vm_file->f_mapping->host) {
+	f = vma->vm_file;
+
+	if (!f || !f->f_mapping || !f->f_mapping->host) {
 			return -EINVAL;
 	}
 
@@ -218,9 +221,16 @@ static long madvise_remove(struct vm_area_struct *vma,
 	endoff = (loff_t)(end - vma->vm_start - 1)
 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-	/* vmtruncate_range needs to take i_mutex and i_alloc_sem */
+	/*
+	 * vmtruncate_range may need to take i_mutex and i_alloc_sem.
+	 * We need to explicitly grab a reference because the vma (and
+	 * hence the vma's reference to the file) can go away as soon as
+	 * we drop mmap_sem.
+	 */
+	get_file(f);
 	up_read(&current->mm->mmap_sem);
 	error = vmtruncate_range(mapping->host, offset, endoff);
+	fput(f);
 	down_read(&current->mm->mmap_sem);
 	return error;
 }
-- 
cgit v1.1


From 6d40de834ce9bb2964a70b7f91a98406eceb0399 Mon Sep 17 00:00:00 2001
From: Aaditya Kumar <aaditya.kumar.30@gmail.com>
Date: Tue, 17 Jul 2012 15:48:07 -0700
Subject: mm: fix lost kswapd wakeup in kswapd_stop()

commit 1c7e7f6c0703d03af6bcd5ccc11fc15d23e5ecbe upstream.

Offlining memory may block forever, waiting for kswapd() to wake up
because kswapd() does not check the event kthread->should_stop before
sleeping.

The proper pattern, from Documentation/memory-barriers.txt, is:

   ---  waker  ---
   event_indicated = 1;
   wake_up_process(event_daemon);

   ---  sleeper  ---
   for (;;) {
      set_current_state(TASK_UNINTERRUPTIBLE);
      if (event_indicated)
         break;
      schedule();
   }

   set_current_state() may be wrapped by:
      prepare_to_wait();

In the kswapd() case, event_indicated is kthread->should_stop.

  === offlining memory (waker) ===
   kswapd_stop()
      kthread_stop()
         kthread->should_stop = 1
         wake_up_process()
         wait_for_completion()

  ===  kswapd_try_to_sleep (sleeper) ===
   kswapd_try_to_sleep()
      prepare_to_wait()
           .
           .
      schedule()
           .
           .
      finish_wait()

The schedule() needs to be protected by a test of kthread->should_stop,
which is wrapped by kthread_should_stop().

Reproducer:
   Do heavy file I/O in background.
   Do a memory offline/online in a tight loop

Signed-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1b0ed36..130fa32 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2695,7 +2695,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 		 * them before going back to sleep.
 		 */
 		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
-		schedule();
+
+		if (!kthread_should_stop())
+			schedule();
+
 		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
 	} else {
 		if (remaining)
-- 
cgit v1.1


From 9116bc4fb2f96d4a3190932bf17088174da04401 Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Mon, 31 Oct 2011 17:09:46 -0700
Subject: mm/vmstat.c: cache align vm_stat

commit a1cb2c60ddc98ff4e5246f410558805401ceee67 upstream.

Stable note: Not tracked on Bugzilla. This patch is known to make a big
        difference to tmpfs performance on larger machines.

This was found to adversely affect tmpfs I/O performance.

Tests run on a 640 cpu UV system.

With 120 threads doing parallel writes, each to different tmpfs mounts:
No patch:		~300 MB/sec
With vm_stat alignment:	~430 MB/sec

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Acked-by: Christoph Lameter <cl@gentwo.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 mm/vmstat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c18b7..6559013 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu)
  *
  * vm_stat contains the global counters
  */
-atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
 EXPORT_SYMBOL(vm_stat);
 
 #ifdef CONFIG_SMP
-- 
cgit v1.1


From 71a07f4cf29615d30369760c022972d4875758b3 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 10 Jan 2012 15:07:14 -0800
Subject: mm: reduce the amount of work done when updating min_free_kbytes

commit 938929f14cb595f43cd1a4e63e22d36cab1e4a1f upstream.

Stable note: Fixes https://bugzilla.novell.com/show_bug.cgi?id=726210 .
        Large machines with 1TB or more of RAM take a long time to boot
        without this patch and may spew out soft lockup warnings.

When min_free_kbytes is updated, some pageblocks are marked
MIGRATE_RESERVE.  Ordinarily, this work is unnoticable as it happens early
in boot but on large machines with 1TB of memory, this has been reported
to delay boot times, probably due to the NUMA distances involved.

The bulk of the work is due to calling calling pageblock_is_reserved() an
unnecessary amount of times and accessing far more struct page metadata
than is necessary.  This patch significantly reduces the amount of work
done by setup_zone_migrate_reserve() improving boot times on 1TB machines.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/page_alloc.c | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 947a7e9..70c049d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3418,25 +3418,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
 		if (page_to_nid(page) != zone_to_nid(zone))
 			continue;
 
-		/* Blocks with reserved pages will never free, skip them. */
-		block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
-		if (pageblock_is_reserved(pfn, block_end_pfn))
-			continue;
-
 		block_migratetype = get_pageblock_migratetype(page);
 
-		/* If this block is reserved, account for it */
-		if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
-			reserve--;
-			continue;
-		}
+		/* Only test what is necessary when the reserves are not met */
+		if (reserve > 0) {
+			/*
+			 * Blocks with reserved pages will never free, skip
+			 * them.
+			 */
+			block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+			if (pageblock_is_reserved(pfn, block_end_pfn))
+				continue;
 
-		/* Suitable for reserving if this block is movable */
-		if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
-			set_pageblock_migratetype(page, MIGRATE_RESERVE);
-			move_freepages_block(zone, page, MIGRATE_RESERVE);
-			reserve--;
-			continue;
+			/* If this block is reserved, account for it */
+			if (block_migratetype == MIGRATE_RESERVE) {
+				reserve--;
+				continue;
+			}
+
+			/* Suitable for reserving if this block is movable */
+			if (block_migratetype == MIGRATE_MOVABLE) {
+				set_pageblock_migratetype(page,
+							MIGRATE_RESERVE);
+				move_freepages_block(zone, page,
+							MIGRATE_RESERVE);
+				reserve--;
+				continue;
+			}
 		}
 
 		/*
-- 
cgit v1.1


From 33c17eafdeefb08fbb6ded946abcf024f76c9615 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <jweiner@redhat.com>
Date: Wed, 14 Sep 2011 16:21:52 -0700
Subject: mm: vmscan: fix force-scanning small targets without swap

commit a4d3e9e76337059406fcf3ead288c0df22a790e9 upstream.

Stable note: Not tracked in Bugzilla. This patch augments an earlier commit
        that avoids scanning priority being artificially raised. The older
	fix was particularly important for small memcgs to avoid calling
	wait_iff_congested() unnecessarily.

Without swap, anonymous pages are not scanned.  As such, they should not
count when considering force-scanning a small target if there is no swap.

Otherwise, targets are not force-scanned even when their effective scan
number is zero and the other conditions--kswapd/memcg--apply.

This fixes 246e87a93934 ("memcg: fix get_scan_count() for small
targets").

[akpm@linux-foundation.org: fix comment]
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: Ying Han <yinghan@google.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 130fa32..347bb447 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1747,23 +1747,15 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 	u64 fraction[2], denominator;
 	enum lru_list l;
 	int noswap = 0;
-	int force_scan = 0;
+	bool force_scan = false;
 	unsigned long nr_force_scan[2];
 
-
-	anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
-		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
-	file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
-		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
-
-	if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
-		/* kswapd does zone balancing and need to scan this zone */
-		if (scanning_global_lru(sc) && current_is_kswapd())
-			force_scan = 1;
-		/* memcg may have small limit and need to avoid priority drop */
-		if (!scanning_global_lru(sc))
-			force_scan = 1;
-	}
+	/* kswapd does zone balancing and needs to scan this zone */
+	if (scanning_global_lru(sc) && current_is_kswapd())
+		force_scan = true;
+	/* memcg may have small limit and need to avoid priority drop */
+	if (!scanning_global_lru(sc))
+		force_scan = true;
 
 	/* If we have no swap space, do not bother scanning anon pages. */
 	if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1776,6 +1768,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 		goto out;
 	}
 
+	anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
+		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
+	file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
+		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+
 	if (scanning_global_lru(sc)) {
 		free  = zone_page_state(zone, NR_FREE_PAGES);
 		/* If we have very few page cache pages,
-- 
cgit v1.1


From 564ea9dd5ab042cb2fe8373f4d627073706e1d4f Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Thu, 25 Aug 2011 15:59:12 -0700
Subject: vmscan: clear ZONE_CONGESTED for zone with good watermark

commit 439423f6894aa0dec22187526827456f5004baed upstream.

Stable note: Not tracked in Bugzilla. kswapd is responsible for clearing
	ZONE_CONGESTED after it balances a zone and this patch fixes a bug
	where that was failing to happen. Without this patch, processes
	can stall in wait_iff_congested unnecessarily. For users, this can
	look like an interactivity stall but some workloads would see it
	as sudden drop in throughput.

ZONE_CONGESTED is only cleared in kswapd, but pages can be freed in any
task.  It's possible ZONE_CONGESTED isn't cleared in some cases:

 1. the zone is already balanced just entering balance_pgdat() for
    order-0 because concurrent tasks free memory.  In this case, later
    check will skip the zone as it's balanced so the flag isn't cleared.

 2. high order balance fallbacks to order-0.  quote from Mel: At the
    end of balance_pgdat(), kswapd uses the following logic;

	If reclaiming at high order {
		for each zone {
			if all_unreclaimable
				skip
			if watermark is not met
				order = 0
				loop again

			/* watermark is met */
			clear congested
		}
	}

    i.e. it clears ZONE_CONGESTED if it the zone is balanced.  if not,
    it restarts balancing at order-0.  However, if the higher zones are
    balanced for order-0, kswapd will miss clearing ZONE_CONGESTED as
    that only happens after a zone is shrunk.  This can mean that
    wait_iff_congested() stalls unnecessarily.

This patch makes kswapd clear ZONE_CONGESTED during its initial
highmem->dma scan for zones that are already balanced.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 347bb447..6b0f8a6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2456,6 +2456,9 @@ loop_again:
 					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
 				break;
+			} else {
+				/* If balanced, clear the congested flag */
+				zone_clear_flag(zone, ZONE_CONGESTED);
 			}
 		}
 		if (i < 0)
-- 
cgit v1.1


From 5e5b3d2ed3aee6f8bbe38c0945876aacce11ff03 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 8 Jul 2011 14:14:34 +1000
Subject: vmscan: add shrink_slab tracepoints

commit 095760730c1047c69159ce88021a7fa3833502c8 upstream.

Stable note: This patch makes later patches easier to apply but otherwise
        has little to justify it. It is a diagnostic patch that was part
        of a series addressing excessive slab shrinking after GFP_NOFS
        failures. There is detailed information on the series' motivation
        at https://lkml.org/lkml/2011/6/2/42 .

It is impossible to understand what the shrinkers are actually doing
without instrumenting the code, so add a some tracepoints to allow
insight to be gained.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 mm/vmscan.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6b0f8a6..abc7981 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -250,6 +250,7 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		unsigned long long delta;
 		unsigned long total_scan;
 		unsigned long max_pass;
+		int shrink_ret = 0;
 
 		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
 		delta = (4 * nr_pages_scanned) / shrinker->seeks;
@@ -274,9 +275,12 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		total_scan = shrinker->nr;
 		shrinker->nr = 0;
 
+		trace_mm_shrink_slab_start(shrinker, shrink, total_scan,
+					nr_pages_scanned, lru_pages,
+					max_pass, delta, total_scan);
+
 		while (total_scan >= SHRINK_BATCH) {
 			long this_scan = SHRINK_BATCH;
-			int shrink_ret;
 			int nr_before;
 
 			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
@@ -293,6 +297,8 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		}
 
 		shrinker->nr += total_scan;
+		trace_mm_shrink_slab_end(shrinker, shrink_ret, total_scan,
+					 shrinker->nr);
 	}
 	up_read(&shrinker_rwsem);
 out:
-- 
cgit v1.1


From 6a5091a09f9278f8f821e3f33ac748633d143cea Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 8 Jul 2011 14:14:35 +1000
Subject: vmscan: shrinker->nr updates race and go wrong

commit acf92b485cccf028177f46918e045c0c4e80ee10 upstream.

Stable note: Not tracked in Bugzilla. This patch reduces excessive
	reclaim of slab objects reducing the amount of information
	that has to be brought back in from disk.

shrink_slab() allows shrinkers to be called in parallel so the
struct shrinker can be updated concurrently. It does not provide any
exclusio for such updates, so we can get the shrinker->nr value
increasing or decreasing incorrectly.

As a result, when a shrinker repeatedly returns a value of -1 (e.g.
a VFS shrinker called w/ GFP_NOFS), the shrinker->nr goes haywire,
sometimes updating with the scan count that wasn't used, sometimes
losing it altogether. Worse is when a shrinker does work and that
update is lost due to racy updates, which means the shrinker will do
the work again!

Fix this by making the total_scan calculations independent of
shrinker->nr, and making the shrinker->nr updates atomic w.r.t. to
other updates via cmpxchg loops.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 45 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index abc7981..bb7df1e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -251,17 +251,29 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		unsigned long total_scan;
 		unsigned long max_pass;
 		int shrink_ret = 0;
+		long nr;
+		long new_nr;
 
+		/*
+		 * copy the current shrinker scan count into a local variable
+		 * and zero it so that other concurrent shrinker invocations
+		 * don't also do this scanning work.
+		 */
+		do {
+			nr = shrinker->nr;
+		} while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+
+		total_scan = nr;
 		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
 		delta = (4 * nr_pages_scanned) / shrinker->seeks;
 		delta *= max_pass;
 		do_div(delta, lru_pages + 1);
-		shrinker->nr += delta;
-		if (shrinker->nr < 0) {
+		total_scan += delta;
+		if (total_scan < 0) {
 			printk(KERN_ERR "shrink_slab: %pF negative objects to "
 			       "delete nr=%ld\n",
-			       shrinker->shrink, shrinker->nr);
-			shrinker->nr = max_pass;
+			       shrinker->shrink, total_scan);
+			total_scan = max_pass;
 		}
 
 		/*
@@ -269,13 +281,10 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		 * never try to free more than twice the estimate number of
 		 * freeable entries.
 		 */
-		if (shrinker->nr > max_pass * 2)
-			shrinker->nr = max_pass * 2;
-
-		total_scan = shrinker->nr;
-		shrinker->nr = 0;
+		if (total_scan > max_pass * 2)
+			total_scan = max_pass * 2;
 
-		trace_mm_shrink_slab_start(shrinker, shrink, total_scan,
+		trace_mm_shrink_slab_start(shrinker, shrink, nr,
 					nr_pages_scanned, lru_pages,
 					max_pass, delta, total_scan);
 
@@ -296,9 +305,19 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 			cond_resched();
 		}
 
-		shrinker->nr += total_scan;
-		trace_mm_shrink_slab_end(shrinker, shrink_ret, total_scan,
-					 shrinker->nr);
+		/*
+		 * move the unused scan count back into the shrinker in a
+		 * manner that handles concurrent updates. If we exhausted the
+		 * scan, there is no need to do an update.
+		 */
+		do {
+			nr = shrinker->nr;
+			new_nr = total_scan + nr;
+			if (total_scan <= 0)
+				break;
+		} while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+
+		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
 	}
 	up_read(&shrinker_rwsem);
 out:
-- 
cgit v1.1


From 7554e3446a916363447a81a29f9300d3f2fbf503 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 8 Jul 2011 14:14:36 +1000
Subject: vmscan: reduce wind up shrinker->nr when shrinker can't do work

commit 3567b59aa80ac4417002bf58e35dce5c777d4164 upstream.

Stable note: Not tracked in Bugzilla. This patch reduces excessive
	reclaim of slab objects reducing the amount of information that
	has to be brought back in from disk. The third and fourth paragram
	in the series describes the impact.

When a shrinker returns -1 to shrink_slab() to indicate it cannot do
any work given the current memory reclaim requirements, it adds the
entire total_scan count to shrinker->nr. The idea ehind this is that
whenteh shrinker is next called and can do work, it will do the work
of the previously aborted shrinker call as well.

However, if a filesystem is doing lots of allocation with GFP_NOFS
set, then we get many, many more aborts from the shrinkers than we
do successful calls. The result is that shrinker->nr winds up to
it's maximum permissible value (twice the current cache size) and
then when the next shrinker call that can do work is issued, it
has enough scan count built up to free the entire cache twice over.

This manifests itself in the cache going from full to empty in a
matter of seconds, even when only a small part of the cache is
needed to be emptied to free sufficient memory.

Under metadata intensive workloads on ext4 and XFS, I'm seeing the
VFS caches increase memory consumption up to 75% of memory (no page
cache pressure) over a period of 30-60s, and then the shrinker
empties them down to zero in the space of 2-3s. This cycle repeats
over and over again, with the shrinker completely trashing the inode
and dentry caches every minute or so the workload continues.

This behaviour was made obvious by the shrink_slab tracepoints added
earlier in the series, and made worse by the patch that corrected
the concurrent accounting of shrinker->nr.

To avoid this problem, stop repeated small increments of the total
scan value from winding shrinker->nr up to a value that can cause
the entire cache to be freed. We still need to allow it to wind up,
so use the delta as the "large scan" threshold check - if the delta
is more than a quarter of the entire cache size, then it is a large
scan and allowed to cause lots of windup because we are clearly
needing to free lots of memory.

If it isn't a large scan then limit the total scan to half the size
of the cache so that windup never increases to consume the whole
cache. Reducing the total scan limit further does not allow enough
wind-up to maintain the current levels of performance, whilst a
higher threshold does not prevent the windup from freeing the entire
cache under sustained workloads.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index bb7df1e..d375de3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -277,6 +277,21 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		}
 
 		/*
+		 * We need to avoid excessive windup on filesystem shrinkers
+		 * due to large numbers of GFP_NOFS allocations causing the
+		 * shrinkers to return -1 all the time. This results in a large
+		 * nr being built up so when a shrink that can do some work
+		 * comes along it empties the entire cache due to nr >>>
+		 * max_pass.  This is bad for sustaining a working set in
+		 * memory.
+		 *
+		 * Hence only allow the shrinker to scan the entire cache when
+		 * a large delta change is calculated directly.
+		 */
+		if (delta < max_pass / 4)
+			total_scan = min(total_scan, max_pass / 2);
+
+		/*
 		 * Avoid risking looping forever due to too large nr value:
 		 * never try to free more than twice the estimate number of
 		 * freeable entries.
-- 
cgit v1.1


From 4d4724067d512e7f17010112da8ec64917c192e7 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Mon, 31 Oct 2011 17:09:31 -0700
Subject: vmscan: limit direct reclaim for higher order allocations

commit e0887c19b2daa140f20ca8104bdc5740f39dbb86 upstream.

Stable note: Not tracked on Bugzilla. THP and compaction was found to
	aggressively reclaim pages and stall systems under different
	situations that was addressed piecemeal over time.  Paragraph
	3 of this changelog is the motivation for this patch.

When suffering from memory fragmentation due to unfreeable pages, THP page
faults will repeatedly try to compact memory.  Due to the unfreeable
pages, compaction fails.

Needless to say, at that point page reclaim also fails to create free
contiguous 2MB areas.  However, that doesn't stop the current code from
trying, over and over again, and freeing a minimum of 4MB (2UL <<
sc->order pages) at every single invocation.

This resulted in my 12GB system having 2-3GB free memory, a corresponding
amount of used swap and very sluggish response times.

This can be avoided by having the direct reclaim code not reclaim from
zones that already have plenty of free memory available for compaction.

If compaction still fails due to unmovable memory, doing additional
reclaim will only hurt the system, not help.

[jweiner@redhat.com: change comment to explain the order check]
Signed-off-by: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <jweiner@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index d375de3..e1ae88b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2059,6 +2059,22 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 				continue;
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;	/* Let kswapd poll it */
+			if (COMPACTION_BUILD) {
+				/*
+				 * If we already have plenty of memory
+				 * free for compaction, don't free any
+				 * more.  Even though compaction is
+				 * invoked for any non-zero order,
+				 * only frequent costly order
+				 * reclamation is disruptive enough to
+				 * become a noticable problem, like
+				 * transparent huge page allocations.
+				 */
+				if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
+					(compaction_suitable(zone, sc->order) ||
+					 compaction_deferred(zone)))
+					continue;
+			}
 			/*
 			 * This steals pages from memory cgroups over softlimit
 			 * and returns the number of reclaimed pages and
-- 
cgit v1.1


From 4682e89d1455d66e2536d9efb2875d61a1f1f294 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 31 Oct 2011 17:09:33 -0700
Subject: vmscan: abort reclaim/compaction if compaction can proceed

commit e0c23279c9f800c403f37511484d9014ac83adec upstream.

Stable note: Not tracked on Bugzilla. THP and compaction was found to
	aggressively reclaim pages and stall systems under different
	situations that was addressed piecemeal over time.

If compaction can proceed, shrink_zones() stops doing any work but its
callers still call shrink_slab() which raises the priority and potentially
sleeps.  This is unnecessary and wasteful so this patch aborts direct
reclaim/compaction entirely if compaction can proceed.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: Johannes Weiner <jweiner@redhat.com>
Cc: Josh Boyer <jwboyer@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index e1ae88b..b146b42 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2037,14 +2037,19 @@ restart:
  *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
+ *
+ * This function returns true if a zone is being reclaimed for a costly
+ * high-order allocation and compaction is either ready to begin or deferred.
+ * This indicates to the caller that it should retry the allocation or fail.
  */
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static bool shrink_zones(int priority, struct zonelist *zonelist,
 					struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
+	bool should_abort_reclaim = false;
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 					gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2061,19 +2066,20 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 				continue;	/* Let kswapd poll it */
 			if (COMPACTION_BUILD) {
 				/*
-				 * If we already have plenty of memory
-				 * free for compaction, don't free any
-				 * more.  Even though compaction is
-				 * invoked for any non-zero order,
-				 * only frequent costly order
-				 * reclamation is disruptive enough to
-				 * become a noticable problem, like
-				 * transparent huge page allocations.
+				 * If we already have plenty of memory free for
+				 * compaction in this zone, don't free any more.
+				 * Even though compaction is invoked for any
+				 * non-zero order, only frequent costly order
+				 * reclamation is disruptive enough to become a
+				 * noticable problem, like transparent huge page
+				 * allocations.
 				 */
 				if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
 					(compaction_suitable(zone, sc->order) ||
-					 compaction_deferred(zone)))
+					 compaction_deferred(zone))) {
+					should_abort_reclaim = true;
 					continue;
+				}
 			}
 			/*
 			 * This steals pages from memory cgroups over softlimit
@@ -2092,6 +2098,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 
 		shrink_zone(priority, zone, sc);
 	}
+
+	return should_abort_reclaim;
 }
 
 static bool zone_reclaimable(struct zone *zone)
@@ -2156,7 +2164,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		sc->nr_scanned = 0;
 		if (!priority)
 			disable_swap_token(sc->mem_cgroup);
-		shrink_zones(priority, zonelist, sc);
+		if (shrink_zones(priority, zonelist, sc))
+			break;
+
 		/*
 		 * Don't shrink slabs when reclaiming memory from
 		 * over limit cgroups
-- 
cgit v1.1


From f665a680f89357a6773fb97684690c76933888f6 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan.kim@gmail.com>
Date: Mon, 31 Oct 2011 17:06:44 -0700
Subject: mm: compaction: trivial clean up in acct_isolated()

commit b9e84ac1536d35aee03b2601f19694949f0bd506 upstream.

Stable note: Not tracked in Bugzilla. This patch makes later patches
	easier to apply but has no other impact.

acct_isolated of compaction uses page_lru_base_type which returns only
base type of LRU list so it never returns LRU_ACTIVE_ANON or
LRU_ACTIVE_FILE.  In addtion, cc->nr_[anon|file] is used in only
acct_isolated so it doesn't have fields in conpact_control.

This patch removes fields from compact_control and makes clear function of
acct_issolated which counts the number of anon|file pages isolated.

Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/compaction.c | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index adc5336..d6ba037 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,10 +35,6 @@ struct compact_control {
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
 	bool sync;			/* Synchronous migration */
 
-	/* Account for isolated anon and file pages */
-	unsigned long nr_anon;
-	unsigned long nr_file;
-
 	unsigned int order;		/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone,
 static void acct_isolated(struct zone *zone, struct compact_control *cc)
 {
 	struct page *page;
-	unsigned int count[NR_LRU_LISTS] = { 0, };
+	unsigned int count[2] = { 0, };
 
-	list_for_each_entry(page, &cc->migratepages, lru) {
-		int lru = page_lru_base_type(page);
-		count[lru]++;
-	}
+	list_for_each_entry(page, &cc->migratepages, lru)
+		count[!!page_is_file_cache(page)]++;
 
-	cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
-	cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
-	__mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
-	__mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
 }
 
 /* Similar to reclaim, but different enough that they don't share logic */
-- 
cgit v1.1


From a15a3971cc49eefbde40b397a446c0fa9c5fed9c Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan.kim@gmail.com>
Date: Mon, 31 Oct 2011 17:06:47 -0700
Subject: mm: change isolate mode from #define to bitwise type

commit 4356f21d09283dc6d39a6f7287a65ddab61e2808 upstream.

Stable note: Not tracked in Bugzilla. This patch makes later patches
	easier to apply but has no other impact.

Change ISOLATE_XXX macro with bitwise isolate_mode_t type.  Normally,
macro isn't recommended as it's type-unsafe and making debugging harder as
symbol cannot be passed throught to the debugger.

Quote from Johannes
" Hmm, it would probably be cleaner to fully convert the isolation mode
into independent flags.  INACTIVE, ACTIVE, BOTH is currently a
tri-state among flags, which is a bit ugly."

This patch moves isolate mode from swap.h to mmzone.h by memcontrol.h

Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/compaction.c |  3 ++-
 mm/memcontrol.c |  3 ++-
 mm/vmscan.c     | 37 ++++++++++++++++++++-----------------
 3 files changed, 24 insertions(+), 19 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index d6ba037..26521a1 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -371,7 +371,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 		}
 
 		/* Try isolate the page */
-		if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
+		if (__isolate_lru_page(page,
+				ISOLATE_ACTIVE|ISOLATE_INACTIVE, 0) != 0)
 			continue;
 
 		VM_BUG_ON(PageTransCompound(page));
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ffb99b4..57cdf5a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1251,7 +1251,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 					struct list_head *dst,
 					unsigned long *scanned, int order,
-					int mode, struct zone *z,
+					isolate_mode_t mode,
+					struct zone *z,
 					struct mem_cgroup *mem_cont,
 					int active, int file)
 {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b146b42..9267ba1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1012,23 +1012,27 @@ keep_lumpy:
  *
  * returns 0 on success, -ve errno on failure.
  */
-int __isolate_lru_page(struct page *page, int mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
 {
+	bool all_lru_mode;
 	int ret = -EINVAL;
 
 	/* Only take pages on the LRU. */
 	if (!PageLRU(page))
 		return ret;
 
+	all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
+		(ISOLATE_ACTIVE|ISOLATE_INACTIVE);
+
 	/*
 	 * When checking the active state, we need to be sure we are
 	 * dealing with comparible boolean values.  Take the logical not
 	 * of each.
 	 */
-	if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+	if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
 		return ret;
 
-	if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
+	if (!all_lru_mode && !!page_is_file_cache(page) != file)
 		return ret;
 
 	/*
@@ -1076,7 +1080,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		struct list_head *src, struct list_head *dst,
-		unsigned long *scanned, int order, int mode, int file)
+		unsigned long *scanned, int order, isolate_mode_t mode,
+		int file)
 {
 	unsigned long nr_taken = 0;
 	unsigned long nr_lumpy_taken = 0;
@@ -1201,8 +1206,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 static unsigned long isolate_pages_global(unsigned long nr,
 					struct list_head *dst,
 					unsigned long *scanned, int order,
-					int mode, struct zone *z,
-					int active, int file)
+					isolate_mode_t mode,
+					struct zone *z,	int active, int file)
 {
 	int lru = LRU_BASE;
 	if (active)
@@ -1448,6 +1453,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 	unsigned long nr_taken;
 	unsigned long nr_anon;
 	unsigned long nr_file;
+	isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
 
 	while (unlikely(too_many_isolated(zone, file, sc))) {
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1458,15 +1464,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 	}
 
 	set_reclaim_mode(priority, sc, false);
+	if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
+		reclaim_mode |= ISOLATE_ACTIVE;
+
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
 
 	if (scanning_global_lru(sc)) {
-		nr_taken = isolate_pages_global(nr_to_scan,
-			&page_list, &nr_scanned, sc->order,
-			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-					ISOLATE_BOTH : ISOLATE_INACTIVE,
-			zone, 0, file);
+		nr_taken = isolate_pages_global(nr_to_scan, &page_list,
+			&nr_scanned, sc->order, reclaim_mode, zone, 0, file);
 		zone->pages_scanned += nr_scanned;
 		if (current_is_kswapd())
 			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1475,12 +1481,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 			__count_zone_vm_events(PGSCAN_DIRECT, zone,
 					       nr_scanned);
 	} else {
-		nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
-			&page_list, &nr_scanned, sc->order,
-			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-					ISOLATE_BOTH : ISOLATE_INACTIVE,
-			zone, sc->mem_cgroup,
-			0, file);
+		nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
+			&nr_scanned, sc->order, reclaim_mode, zone,
+			sc->mem_cgroup, 0, file);
 		/*
 		 * mem_cgroup_isolate_pages() keeps track of
 		 * scanned pages on its own.
-- 
cgit v1.1


From 19faec0520b3b16dfd58cde30938a3c4d3dcdd5b Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan.kim@gmail.com>
Date: Mon, 31 Oct 2011 17:06:51 -0700
Subject: mm: compaction: make isolate_lru_page() filter-aware

commit 39deaf8585152f1a35c1676d3d7dc6ae0fb65967 upstream.

Stable note: Not tracked in Bugzilla. THP and compaction disrupt the LRU
	list leading to poor reclaim decisions which has a variable
	performance impact.

In async mode, compaction doesn't migrate dirty or writeback pages.  So,
it's meaningless to pick the page and re-add it to lru list.

Of course, when we isolate the page in compaction, the page might be dirty
or writeback but when we try to migrate the page, the page would be not
dirty, writeback.  So it could be migrated.  But it's very unlikely as
isolate and migration cycle is much faster than writeout.

So, this patch helps cpu overhead and prevent unnecessary LRU churning.

Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/compaction.c | 7 +++++--
 mm/vmscan.c     | 3 +++
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 26521a1..b30c40f 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -261,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	unsigned long last_pageblock_nr = 0, pageblock_nr;
 	unsigned long nr_scanned = 0, nr_isolated = 0;
 	struct list_head *migratelist = &cc->migratepages;
+	isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
 
 	/* Do not scan outside zone boundaries */
 	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -370,9 +371,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 			continue;
 		}
 
+		if (!cc->sync)
+			mode |= ISOLATE_CLEAN;
+
 		/* Try isolate the page */
-		if (__isolate_lru_page(page,
-				ISOLATE_ACTIVE|ISOLATE_INACTIVE, 0) != 0)
+		if (__isolate_lru_page(page, mode, 0) != 0)
 			continue;
 
 		VM_BUG_ON(PageTransCompound(page));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9267ba1..6835735 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1045,6 +1045,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
 
 	ret = -EBUSY;
 
+	if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
+		return ret;
+
 	if (likely(get_page_unless_zero(page))) {
 		/*
 		 * Be careful not to clear PageLRU until after we're
-- 
cgit v1.1


From 5e02dde6aee7c4492b3a62ad93e7f1120877a019 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan.kim@gmail.com>
Date: Mon, 31 Oct 2011 17:06:55 -0700
Subject: mm: zone_reclaim: make isolate_lru_page() filter-aware

commit f80c0673610e36ae29d63e3297175e22f70dde5f upstream.

Stable note: Not tracked in Bugzilla. THP and compaction disrupt the LRU list
	leading to poor reclaim decisions which has a variable
	performance impact.

In __zone_reclaim case, we don't want to shrink mapped page.  Nonetheless,
we have isolated mapped page and re-add it into LRU's head.  It's
unnecessary CPU overhead and makes LRU churning.

Of course, when we isolate the page, the page might be mapped but when we
try to migrate the page, the page would be not mapped.  So it could be
migrated.  But race is rare and although it happens, it's no big deal.

Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6835735..0c78bd3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1048,6 +1048,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
 	if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
 		return ret;
 
+	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+		return ret;
+
 	if (likely(get_page_unless_zero(page))) {
 		/*
 		 * Be careful not to clear PageLRU until after we're
@@ -1471,6 +1474,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 		reclaim_mode |= ISOLATE_ACTIVE;
 
 	lru_add_drain();
+
+	if (!sc->may_unmap)
+		reclaim_mode |= ISOLATE_UNMAPPED;
+	if (!sc->may_writepage)
+		reclaim_mode |= ISOLATE_CLEAN;
+
 	spin_lock_irq(&zone->lru_lock);
 
 	if (scanning_global_lru(sc)) {
@@ -1588,19 +1597,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	struct page *page;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 	unsigned long nr_rotated = 0;
+	isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
 
 	lru_add_drain();
+
+	if (!sc->may_unmap)
+		reclaim_mode |= ISOLATE_UNMAPPED;
+	if (!sc->may_writepage)
+		reclaim_mode |= ISOLATE_CLEAN;
+
 	spin_lock_irq(&zone->lru_lock);
 	if (scanning_global_lru(sc)) {
 		nr_taken = isolate_pages_global(nr_pages, &l_hold,
 						&pgscanned, sc->order,
-						ISOLATE_ACTIVE, zone,
+						reclaim_mode, zone,
 						1, file);
 		zone->pages_scanned += pgscanned;
 	} else {
 		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
 						&pgscanned, sc->order,
-						ISOLATE_ACTIVE, zone,
+						reclaim_mode, zone,
 						sc->mem_cgroup, 1, file);
 		/*
 		 * mem_cgroup_isolate_pages() keeps track of
-- 
cgit v1.1


From 331fae62e66ac4209f23df0df66999932e513fff Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan.kim@gmail.com>
Date: Mon, 31 Oct 2011 17:06:57 -0700
Subject: mm: migration: clean up unmap_and_move()

commit 0dabec93de633a87adfbbe1d800a4c56cd19d73b upstream.

Stable note: Not tracked in Bugzilla. This patch makes later patches
	easier to apply but has no other impact.

unmap_and_move() is one a big messy function.  Clean it up.

Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/migrate.c | 75 ++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 40 insertions(+), 35 deletions(-)

(limited to 'mm')

diff --git a/mm/migrate.c b/mm/migrate.c
index 14d0a6a..33358f8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -621,38 +621,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 	return rc;
 }
 
-/*
- * Obtain the lock on page, remove all ptes and migrate the page
- * to the newly allocated page in newpage.
- */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-			struct page *page, int force, bool offlining, bool sync)
+static int __unmap_and_move(struct page *page, struct page *newpage,
+				int force, bool offlining, bool sync)
 {
-	int rc = 0;
-	int *result = NULL;
-	struct page *newpage = get_new_page(page, private, &result);
+	int rc = -EAGAIN;
 	int remap_swapcache = 1;
 	int charge = 0;
 	struct mem_cgroup *mem;
 	struct anon_vma *anon_vma = NULL;
 
-	if (!newpage)
-		return -ENOMEM;
-
-	if (page_count(page) == 1) {
-		/* page was freed from under us. So we are done. */
-		goto move_newpage;
-	}
-	if (unlikely(PageTransHuge(page)))
-		if (unlikely(split_huge_page(page)))
-			goto move_newpage;
-
-	/* prepare cgroup just returns 0 or -ENOMEM */
-	rc = -EAGAIN;
-
 	if (!trylock_page(page)) {
 		if (!force || !sync)
-			goto move_newpage;
+			goto out;
 
 		/*
 		 * It's not safe for direct compaction to call lock_page.
@@ -668,7 +648,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 		 * altogether.
 		 */
 		if (current->flags & PF_MEMALLOC)
-			goto move_newpage;
+			goto out;
 
 		lock_page(page);
 	}
@@ -785,27 +765,52 @@ uncharge:
 		mem_cgroup_end_migration(mem, page, newpage, rc == 0);
 unlock:
 	unlock_page(page);
+out:
+	return rc;
+}
 
-move_newpage:
+/*
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ */
+static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+			struct page *page, int force, bool offlining, bool sync)
+{
+	int rc = 0;
+	int *result = NULL;
+	struct page *newpage = get_new_page(page, private, &result);
+
+	if (!newpage)
+		return -ENOMEM;
+
+	if (page_count(page) == 1) {
+		/* page was freed from under us. So we are done. */
+		goto out;
+	}
+
+	if (unlikely(PageTransHuge(page)))
+		if (unlikely(split_huge_page(page)))
+			goto out;
+
+	rc = __unmap_and_move(page, newpage, force, offlining, sync);
+out:
 	if (rc != -EAGAIN) {
- 		/*
- 		 * A page that has been migrated has all references
- 		 * removed and will be freed. A page that has not been
- 		 * migrated will have kepts its references and be
- 		 * restored.
- 		 */
- 		list_del(&page->lru);
+		/*
+		 * A page that has been migrated has all references
+		 * removed and will be freed. A page that has not been
+		 * migrated will have kepts its references and be
+		 * restored.
+		 */
+		list_del(&page->lru);
 		dec_zone_page_state(page, NR_ISOLATED_ANON +
 				page_is_file_cache(page));
 		putback_lru_page(page);
 	}
-
 	/*
 	 * Move the new page to the LRU. If migration was not successful
 	 * then this will free the page.
 	 */
 	putback_lru_page(newpage);
-
 	if (result) {
 		if (rc)
 			*result = rc;
-- 
cgit v1.1


From ec46a9e8767b9fa37c5d18b18b24ea96a5d2695d Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 12 Jan 2012 17:19:22 -0800
Subject: mm: compaction: allow compaction to isolate dirty pages

commit a77ebd333cd810d7b680d544be88c875131c2bd3 upstream.

Stable note: Not tracked in Bugzilla. A fix aimed at preserving page aging
	information by reducing LRU list churning had the side-effect of
	reducing THP allocation success rates. This was part of a series
	to restore the success rates while preserving the reclaim fix.

Short summary: There are severe stalls when a USB stick using VFAT is
used with THP enabled that are reduced by this series.  If you are
experiencing this problem, please test and report back and considering I
have seen complaints from openSUSE and Fedora users on this as well as a
few private mails, I'm guessing it's a widespread issue.  This is a new
type of USB-related stall because it is due to synchronous compaction
writing where as in the past the big problem was dirty pages reaching
the end of the LRU and being written by reclaim.

Am cc'ing Andrew this time and this series would replace
mm-do-not-stall-in-synchronous-compaction-for-thp-allocations.patch.
I'm also cc'ing Dave Jones as he might have merged that patch to Fedora
for wider testing and ideally it would be reverted and replaced by this
series.

That said, the later patches could really do with some review.  If this
series is not the answer then a new direction needs to be discussed
because as it is, the stalls are unacceptable as the results in this
leader show.

For testers that try backporting this to 3.1, it won't work because
there is a non-obvious dependency on not writing back pages in direct
reclaim so you need those patches too.

Changelog since V5
o Rebase to 3.2-rc5
o Tidy up the changelogs a bit

Changelog since V4
o Added reviewed-bys, credited Andrea properly for sync-light
o Allow dirty pages without mappings to be considered for migration
o Bound the number of pages freed for compaction
o Isolate PageReclaim pages on their own LRU list

This is against 3.2-rc5 and follows on from discussions on "mm: Do
not stall in synchronous compaction for THP allocations" and "[RFC
PATCH 0/5] Reduce compaction-related stalls". Initially, the proposed
patch eliminated stalls due to compaction which sometimes resulted in
user-visible interactivity problems on browsers by simply never using
sync compaction. The downside was that THP success allocation rates
were lower because dirty pages were not being migrated as reported by
Andrea. His approach at fixing this was nacked on the grounds that
it reverted fixes from Rik merged that reduced the amount of pages
reclaimed as it severely impacted his workloads performance.

This series attempts to reconcile the requirements of maximising THP
usage, without stalling in a user-visible fashion due to compaction
or cheating by reclaiming an excessive number of pages.

Patch 1 partially reverts commit 39deaf85 to allow migration to isolate
	dirty pages. This is because migration can move some dirty
	pages without blocking.

Patch 2 notes that the /proc/sys/vm/compact_memory handler is not using
	synchronous compaction when it should be. This is unrelated
	to the reported stalls but is worth fixing.

Patch 3 checks if we isolated a compound page during lumpy scan and
	account for it properly. For the most part, this affects
	tracing so it's unrelated to the stalls but worth fixing.

Patch 4 notes that it is possible to abort reclaim early for compaction
	and return 0 to the page allocator potentially entering the
	"may oom" path. This has not been observed in practice but
	the rest of the series potentially makes it easier to happen.

Patch 5 adds a sync parameter to the migratepage callback and gives
	the callback responsibility for migrating the page without
	blocking if sync==false. For example, fallback_migrate_page
	will not call writepage if sync==false. This increases the
	number of pages that can be handled by asynchronous compaction
	thereby reducing stalls.

Patch 6 restores filter-awareness to isolate_lru_page for migration.
	In practice, it means that pages under writeback and pages
	without a ->migratepage callback will not be isolated
	for migration.

Patch 7 avoids calling direct reclaim if compaction is deferred but
	makes sure that compaction is only deferred if sync
	compaction was used.

Patch 8 introduces a sync-light migration mechanism that sync compaction
	uses. The objective is to allow some stalls but to not call
	->writepage which can lead to significant user-visible stalls.

Patch 9 notes that while we want to abort reclaim ASAP to allow
	compation to go ahead that we leave a very small window of
	opportunity for compaction to run. This patch allows more pages
	to be freed by reclaim but bounds the number to a reasonable
	level based on the high watermark on each zone.

Patch 10 allows slabs to be shrunk even after compaction_ready() is
	true for one zone. This is to avoid a problem whereby a single
	small zone can abort reclaim even though no pages have been
	reclaimed and no suitably large zone is in a usable state.

Patch 11 fixes a problem with the rate of page scanning. As reclaim is
	rarely stalling on pages under writeback it means that scan
	rates are very high. This is particularly true for direct
	reclaim which is not calling writepage. The vmstat figures
	implied that much of this was busy work with PageReclaim pages
	marked for immediate reclaim. This patch is a prototype that
	moves these pages to their own LRU list.

This has been tested and other than 2 USB keys getting trashed,
nothing horrible fell out. That said, I am a bit unhappy with the
rescue logic in patch 11 but did not find a better way around it. It
does significantly reduce scan rates and System CPU time indicating
it is the right direction to take.

What is of critical importance is that stalls due to compaction
are massively reduced even though sync compaction was still
allowed. Testing from people complaining about stalls copying to USBs
with THP enabled are particularly welcome.

The following tests all involve THP usage and USB keys in some
way. Each test follows this type of pattern

1. Read from some fast fast storage, be it raw device or file. Each time
   the copy finishes, start again until the test ends
2. Write a large file to a filesystem on a USB stick. Each time the copy
   finishes, start again until the test ends
3. When memory is low, start an alloc process that creates a mapping
   the size of physical memory to stress THP allocation. This is the
   "real" part of the test and the part that is meant to trigger
   stalls when THP is enabled. Copying continues in the background.
4. Record the CPU usage and time to execute of the alloc process
5. Record the number of THP allocs and fallbacks as well as the number of THP
   pages in use a the end of the test just before alloc exited
6. Run the test 5 times to get an idea of variability
7. Between each run, sync is run and caches dropped and the test
   waits until nr_dirty is a small number to avoid interference
   or caching between iterations that would skew the figures.

The individual tests were then

writebackCPDeviceBasevfat
	Disable THP, read from a raw device (sda), vfat on USB stick
writebackCPDeviceBaseext4
	Disable THP, read from a raw device (sda), ext4 on USB stick
writebackCPDevicevfat
	THP enabled, read from a raw device (sda), vfat on USB stick
writebackCPDeviceext4
	THP enabled, read from a raw device (sda), ext4 on USB stick
writebackCPFilevfat
	THP enabled, read from a file on fast storage and USB, both vfat
writebackCPFileext4
	THP enabled, read from a file on fast storage and USB, both ext4

The kernels tested were

3.1		3.1
vanilla		3.2-rc5
freemore	Patches 1-10
immediate	Patches 1-11
andrea		The 8 patches Andrea posted as a basis of comparison

The results are very long unfortunately. I'll start with the case
where we are not using THP at all

writebackCPDeviceBasevfat
                   3.1.0-vanilla         rc5-vanilla       freemore-v6r1        isolate-v6r1         andrea-v2r1
System Time         1.28 (    0.00%)   54.49 (-4143.46%)   48.63 (-3687.69%)    4.69 ( -265.11%)   51.88 (-3940.81%)
+/-                 0.06 (    0.00%)    2.45 (-4305.55%)    4.75 (-8430.57%)    7.46 (-13282.76%)    4.76 (-8440.70%)
User Time           0.09 (    0.00%)    0.05 (   40.91%)    0.06 (   29.55%)    0.07 (   15.91%)    0.06 (   27.27%)
+/-                 0.02 (    0.00%)    0.01 (   45.39%)    0.02 (   25.07%)    0.00 (   77.06%)    0.01 (   52.24%)
Elapsed Time      110.27 (    0.00%)   56.38 (   48.87%)   49.95 (   54.70%)   11.77 (   89.33%)   53.43 (   51.54%)
+/-                 7.33 (    0.00%)    3.77 (   48.61%)    4.94 (   32.63%)    6.71 (    8.50%)    4.76 (   35.03%)
THP Active          0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)
+/-                 0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)
Fault Alloc         0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)
+/-                 0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)
Fault Fallback      0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)
+/-                 0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)    0.00 (    0.00%)

The THP figures are obviously all 0 because THP was enabled. The
main thing to watch is the elapsed times and how they compare to
times when THP is enabled later. It's also important to note that
elapsed time is improved by this series as System CPu time is much
reduced.

writebackCPDevicevfat

                   3.1.0-vanilla         rc5-vanilla       freemore-v6r1        isolate-v6r1         andrea-v2r1
System Time         1.22 (    0.00%)   13.89 (-1040.72%)   46.40 (-3709.20%)    4.44 ( -264.37%)   47.37 (-3789.33%)
+/-                 0.06 (    0.00%)   22.82 (-37635.56%)    3.84 (-6249.44%)    6.48 (-10618.92%)    6.60
(-10818.53%)
User Time           0.06 (    0.00%)    0.06 (   -6.90%)    0.05 (   17.24%)    0.05 (   13.79%)    0.04 (   31.03%)
+/-                 0.01 (    0.00%)    0.01 (   33.33%)    0.01 (   33.33%)    0.01 (   39.14%)    0.01 (   25.46%)
Elapsed Time     10445.54 (    0.00%) 2249.92 (   78.46%)   70.06 (   99.33%)   16.59 (   99.84%)  472.43 (
95.48%)
+/-               643.98 (    0.00%)  811.62 (  -26.03%)   10.02 (   98.44%)    7.03 (   98.91%)   59.99 (   90.68%)
THP Active         15.60 (    0.00%)   35.20 (  225.64%)   65.00 (  416.67%)   70.80 (  453.85%)   62.20 (  398.72%)
+/-                18.48 (    0.00%)   51.29 (  277.59%)   15.99 (   86.52%)   37.91 (  205.18%)   22.02 (  119.18%)
Fault Alloc       121.80 (    0.00%)   76.60 (   62.89%)  155.40 (  127.59%)  181.20 (  148.77%)  286.60 (  235.30%)
+/-                73.51 (    0.00%)   61.11 (   83.12%)   34.89 (   47.46%)   31.88 (   43.36%)   68.13 (   92.68%)
Fault Fallback    881.20 (    0.00%)  926.60 (   -5.15%)  847.60 (    3.81%)  822.00 (    6.72%)  716.60 (   18.68%)
+/-                73.51 (    0.00%)   61.26 (   16.67%)   34.89 (   52.54%)   31.65 (   56.94%)   67.75 (    7.84%)
MMTests Statistics: duration
User/Sys Time Running Test (seconds)       3540.88   1945.37    716.04     64.97   1937.03
Total Elapsed Time (seconds)              52417.33  11425.90    501.02    230.95   2520.28

The first thing to note is the "Elapsed Time" for the vanilla kernels
of 2249 seconds versus 56 with THP disabled which might explain the
reports of USB stalls with THP enabled. Applying the patches brings
performance in line with THP-disabled performance while isolating
pages for immediate reclaim from the LRU cuts down System CPU time.

The "Fault Alloc" success rate figures are also improved. The vanilla
kernel only managed to allocate 76.6 pages on average over the course
of 5 iterations where as applying the series allocated 181.20 on
average albeit it is well within variance. It's worth noting that
applies the series at least descreases the amount of variance which
implies an improvement.

Andrea's series had a higher success rate for THP allocations but
at a severe cost to elapsed time which is still better than vanilla
but still much worse than disabling THP altogether. One can bring my
series close to Andrea's by removing this check

        /*
         * If compaction is deferred for high-order allocations, it is because
         * sync compaction recently failed. In this is the case and the caller
         * has requested the system not be heavily disrupted, fail the
         * allocation now instead of entering direct reclaim
         */
        if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
                goto nopage;

I didn't include a patch that removed the above check because hurting
overall performance to improve the THP figure is not what the average
user wants. It's something to consider though if someone really wants
to maximise THP usage no matter what it does to the workload initially.

This is summary of vmstat figures from the same test.

                                       3.1.0-vanilla rc5-vanilla freemore-v6r1 isolate-v6r1 andrea-v2r1
Page Ins                                  3257266139  1111844061    17263623    10901575   161423219
Page Outs                                   81054922    30364312     3626530     3657687     8753730
Swap Ins                                        3294        2851        6560        4964        4592
Swap Outs                                     390073      528094      620197      790912      698285
Direct pages scanned                      1077581700  3024951463  1764930052   115140570  5901188831
Kswapd pages scanned                        34826043     7112868     2131265     1686942     1893966
Kswapd pages reclaimed                      28950067     4911036     1246044      966475     1497726
Direct pages reclaimed                     805148398   280167837     3623473     2215044    40809360
Kswapd efficiency                                83%         69%         58%         57%         79%
Kswapd velocity                              664.399     622.521    4253.852    7304.360     751.490
Direct efficiency                                74%          9%          0%          1%          0%
Direct velocity                            20557.737  264745.137 3522673.849  498551.938 2341481.435
Percentage direct scans                          96%         99%         99%         98%         99%
Page writes by reclaim                        722646      529174      620319      791018      699198
Page writes file                              332573        1080         122         106         913
Page writes anon                              390073      528094      620197      790912      698285
Page reclaim immediate                             0  2552514720  1635858848   111281140  5478375032
Page rescued immediate                             0           0           0       87848           0
Slabs scanned                                  23552       23552        9216        8192        9216
Direct inode steals                              231           0           0           0           0
Kswapd inode steals                                0           0           0           0           0
Kswapd skipped wait                            28076         786           0          61           6
THP fault alloc                                  609         383         753         906        1433
THP collapse alloc                                12           6           0           0           6
THP splits                                       536         211         456         593        1136
THP fault fallback                              4406        4633        4263        4110        3583
THP collapse fail                                120         127           0           0           4
Compaction stalls                               1810         728         623         779        3200
Compaction success                               196          53          60          80         123
Compaction failures                             1614         675         563         699        3077
Compaction pages moved                        193158       53545      243185      333457      226688
Compaction move failure                         9952        9396       16424       23676       45070

The main things to look at are

1. Page In/out figures are much reduced by the series.

2. Direct page scanning is incredibly high (264745.137 pages scanned
   per second on the vanilla kernel) but isolating PageReclaim pages
   on their own list reduces the number of pages scanned significantly.

3. The fact that "Page rescued immediate" is a positive number implies
   that we sometimes race removing pages from the LRU_IMMEDIATE list
   that need to be put back on a normal LRU but it happens only for
   0.07% of the pages marked for immediate reclaim.

writebackCPDeviceext4
                   3.1.0-vanilla         rc5-vanilla       freemore-v6r1        isolate-v6r1         andrea-v2r1
System Time         1.51 (    0.00%)    1.77 (  -17.66%)    1.46 (    2.92%)    1.15 (   23.77%)    1.89 (  -25.63%)
+/-                 0.27 (    0.00%)    0.67 ( -148.52%)    0.33 (  -22.76%)    0.30 (  -11.15%)    0.19 (   30.16%)
User Time           0.03 (    0.00%)    0.04 (  -37.50%)    0.05 (  -62.50%)    0.07 ( -112.50%)    0.04 (  -18.75%)
+/-                 0.01 (    0.00%)    0.02 ( -146.64%)    0.02 (  -97.91%)    0.02 (  -75.59%)    0.02 (  -63.30%)
Elapsed Time      124.93 (    0.00%)  114.49 (    8.36%)   96.77 (   22.55%)   27.48 (   78.00%)  205.70 (  -64.65%)
+/-                20.20 (    0.00%)   74.39 ( -268.34%)   59.88 ( -196.48%)    7.72 (   61.79%)   25.03 (  -23.95%)
THP Active        161.80 (    0.00%)   83.60 (   51.67%)  141.20 (   87.27%)   84.60 (   52.29%)   82.60 (   51.05%)
+/-                71.95 (    0.00%)   43.80 (   60.88%)   26.91 (   37.40%)   59.02 (   82.03%)   52.13 (   72.45%)
Fault Alloc       471.40 (    0.00%)  228.60 (   48.49%)  282.20 (   59.86%)  225.20 (   47.77%)  388.40 (   82.39%)
+/-                88.07 (    0.00%)   87.42 (   99.26%)   73.79 (   83.78%)  109.62 (  124.47%)   82.62 (   93.81%)
Fault Fallback    531.60 (    0.00%)  774.60 (  -45.71%)  720.80 (  -35.59%)  777.80 (  -46.31%)  614.80 (  -15.65%)
+/-                88.07 (    0.00%)   87.26 (    0.92%)   73.79 (   16.22%)  109.62 (  -24.47%)   82.29 (    6.56%)
MMTests Statistics: duration
User/Sys Time Running Test (seconds)         50.22     33.76     30.65     24.14    128.45
Total Elapsed Time (seconds)               1113.73   1132.19   1029.45    759.49   1707.26

Similar test but the USB stick is using ext4 instead of vfat. As
ext4 does not use writepage for migration, the large stalls due to
compaction when THP is enabled are not observed. Still, isolating
PageReclaim pages on their own list helped completion time largely
by reducing the number of pages scanned by direct reclaim although
time spend in congestion_wait could also be a factor.

Again, Andrea's series had far higher success rates for THP allocation
at the cost of elapsed time. I didn't look too closely but a quick
look at the vmstat figures tells me kswapd reclaimed 8 times more pages
than the patch series and direct reclaim reclaimed roughly three times
as many pages. It follows that if memory is aggressively reclaimed,
there will be more available for THP.

writebackCPFilevfat
                   3.1.0-vanilla         rc5-vanilla       freemore-v6r1        isolate-v6r1         andrea-v2r1
System Time         1.76 (    0.00%)   29.10 (-1555.52%)   46.01 (-2517.18%)    4.79 ( -172.35%)   54.89 (-3022.53%)
+/-                 0.14 (    0.00%)   25.61 (-18185.17%)    2.15 (-1434.83%)    6.60 (-4610.03%)    9.75
(-6863.76%)
User Time           0.05 (    0.00%)    0.07 (  -45.83%)    0.05 (   -4.17%)    0.06 (  -29.17%)    0.06 (  -16.67%)
+/-                 0.02 (    0.00%)    0.02 (   20.11%)    0.02 (   -3.14%)    0.01 (   31.58%)    0.01 (   47.41%)
Elapsed Time     22520.79 (    0.00%) 1082.85 (   95.19%)   73.30 (   99.67%)   32.43 (   99.86%)  291.84 (  98.70%)
+/-              7277.23 (    0.00%)  706.29 (   90.29%)   19.05 (   99.74%)   17.05 (   99.77%)  125.55 (   98.27%)
THP Active         83.80 (    0.00%)   12.80 (   15.27%)   15.60 (   18.62%)   13.00 (   15.51%)    0.80 (    0.95%)
+/-                66.81 (    0.00%)   20.19 (   30.22%)    5.92 (    8.86%)   15.06 (   22.54%)    1.17 (    1.75%)
Fault Alloc       171.00 (    0.00%)   67.80 (   39.65%)   97.40 (   56.96%)  125.60 (   73.45%)  133.00 (   77.78%)
+/-                82.91 (    0.00%)   30.69 (   37.02%)   53.91 (   65.02%)   55.05 (   66.40%)   21.19 (   25.56%)
Fault Fallback    832.00 (    0.00%)  935.20 (  -12.40%)  906.00 (   -8.89%)  877.40 (   -5.46%)  870.20 (   -4.59%)
+/-                82.91 (    0.00%)   30.69 (   62.98%)   54.01 (   34.86%)   55.05 (   33.60%)   20.91 (   74.78%)
MMTests Statistics: duration
User/Sys Time Running Test (seconds)       7229.81    928.42    704.52     80.68   1330.76
Total Elapsed Time (seconds)             112849.04   5618.69    571.11    360.54   1664.28

In this case, the test is reading/writing only from filesystems but as
it's vfat, it's slow due to calling writepage during compaction. Little
to observe really - the time to complete the test goes way down
with the series applied and THP allocation success rates go up in
comparison to 3.2-rc5.  The success rates are lower than 3.1.0 but
the elapsed time for that kernel is abysmal so it is not really a
sensible comparison.

As before, Andrea's series allocates more THPs at the cost of overall
performance.

writebackCPFileext4
                   3.1.0-vanilla         rc5-vanilla       freemore-v6r1        isolate-v6r1         andrea-v2r1
System Time         1.51 (    0.00%)    1.77 (  -17.66%)    1.46 (    2.92%)    1.15 (   23.77%)    1.89 (  -25.63%)
+/-                 0.27 (    0.00%)    0.67 ( -148.52%)    0.33 (  -22.76%)    0.30 (  -11.15%)    0.19 (   30.16%)
User Time           0.03 (    0.00%)    0.04 (  -37.50%)    0.05 (  -62.50%)    0.07 ( -112.50%)    0.04 (  -18.75%)
+/-                 0.01 (    0.00%)    0.02 ( -146.64%)    0.02 (  -97.91%)    0.02 (  -75.59%)    0.02 (  -63.30%)
Elapsed Time      124.93 (    0.00%)  114.49 (    8.36%)   96.77 (   22.55%)   27.48 (   78.00%)  205.70 (  -64.65%)
+/-                20.20 (    0.00%)   74.39 ( -268.34%)   59.88 ( -196.48%)    7.72 (   61.79%)   25.03 (  -23.95%)
THP Active        161.80 (    0.00%)   83.60 (   51.67%)  141.20 (   87.27%)   84.60 (   52.29%)   82.60 (   51.05%)
+/-                71.95 (    0.00%)   43.80 (   60.88%)   26.91 (   37.40%)   59.02 (   82.03%)   52.13 (   72.45%)
Fault Alloc       471.40 (    0.00%)  228.60 (   48.49%)  282.20 (   59.86%)  225.20 (   47.77%)  388.40 (   82.39%)
+/-                88.07 (    0.00%)   87.42 (   99.26%)   73.79 (   83.78%)  109.62 (  124.47%)   82.62 (   93.81%)
Fault Fallback    531.60 (    0.00%)  774.60 (  -45.71%)  720.80 (  -35.59%)  777.80 (  -46.31%)  614.80 (  -15.65%)
+/-                88.07 (    0.00%)   87.26 (    0.92%)   73.79 (   16.22%)  109.62 (  -24.47%)   82.29 (    6.56%)
MMTests Statistics: duration
User/Sys Time Running Test (seconds)         50.22     33.76     30.65     24.14    128.45
Total Elapsed Time (seconds)               1113.73   1132.19   1029.45    759.49   1707.26

Same type of story - elapsed times go down. In this case, allocation
success rates are roughtly the same. As before, Andrea's has higher
success rates but takes a lot longer.

Overall the series does reduce latencies and while the tests are
inherency racy as alloc competes with the cp processes, the variability
was included. The THP allocation rates are not as high as they could
be but that is because we would have to be more aggressive about
reclaim and compaction impacting overall performance.

This patch:

Commit 39deaf85 ("mm: compaction: make isolate_lru_page() filter-aware")
noted that compaction does not migrate dirty or writeback pages and that
is was meaningless to pick the page and re-add it to the LRU list.

What was missed during review is that asynchronous migration moves dirty
pages if their ->migratepage callback is migrate_page() because these can
be moved without blocking.  This potentially impacted hugepage allocation
success rates by a factor depending on how many dirty pages are in the
system.

This patch partially reverts 39deaf85 to allow migration to isolate dirty
pages again.  This increases how much compaction disrupts the LRU but that
is addressed later in the series.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Andy Isaacson <adi@hexapodia.org>
Cc: Nai Xia <nai.xia@gmail.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/compaction.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index b30c40f..228f91b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -371,9 +371,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 			continue;
 		}
 
-		if (!cc->sync)
-			mode |= ISOLATE_CLEAN;
-
 		/* Try isolate the page */
 		if (__isolate_lru_page(page, mode, 0) != 0)
 			continue;
-- 
cgit v1.1


From 397d9c507ff1c9c5afc80c80ee245c2455d6a1db Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 12 Jan 2012 17:19:34 -0800
Subject: mm: compaction: determine if dirty pages can be migrated without
 blocking within ->migratepage

commit b969c4ab9f182a6e1b2a0848be349f99714947b0 upstream.

Stable note: Not tracked in Bugzilla. A fix aimed at preserving page
	aging information by reducing LRU list churning had the side-effect
	of reducing THP allocation success rates. This was part of a series
	to restore the success rates while preserving the reclaim fix.

Asynchronous compaction is used when allocating transparent hugepages to
avoid blocking for long periods of time.  Due to reports of stalling,
there was a debate on disabling synchronous compaction but this severely
impacted allocation success rates.  Part of the reason was that many dirty
pages are skipped in asynchronous compaction by the following check;

	if (PageDirty(page) && !sync &&
		mapping->a_ops->migratepage != migrate_page)
			rc = -EBUSY;

This skips over all mapping aops using buffer_migrate_page() even though
it is possible to migrate some of these pages without blocking.  This
patch updates the ->migratepage callback with a "sync" parameter.  It is
the responsibility of the callback to fail gracefully if migration would
block.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Andy Isaacson <adi@hexapodia.org>
Cc: Nai Xia <nai.xia@gmail.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/migrate.c | 129 ++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 92 insertions(+), 37 deletions(-)

(limited to 'mm')

diff --git a/mm/migrate.c b/mm/migrate.c
index 33358f8..d43689a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -220,6 +220,55 @@ out:
 	pte_unmap_unlock(ptep, ptl);
 }
 
+#ifdef CONFIG_BLOCK
+/* Returns true if all buffers are successfully locked */
+static bool buffer_migrate_lock_buffers(struct buffer_head *head, bool sync)
+{
+	struct buffer_head *bh = head;
+
+	/* Simple case, sync compaction */
+	if (sync) {
+		do {
+			get_bh(bh);
+			lock_buffer(bh);
+			bh = bh->b_this_page;
+
+		} while (bh != head);
+
+		return true;
+	}
+
+	/* async case, we cannot block on lock_buffer so use trylock_buffer */
+	do {
+		get_bh(bh);
+		if (!trylock_buffer(bh)) {
+			/*
+			 * We failed to lock the buffer and cannot stall in
+			 * async migration. Release the taken locks
+			 */
+			struct buffer_head *failed_bh = bh;
+			put_bh(failed_bh);
+			bh = head;
+			while (bh != failed_bh) {
+				unlock_buffer(bh);
+				put_bh(bh);
+				bh = bh->b_this_page;
+			}
+			return false;
+		}
+
+		bh = bh->b_this_page;
+	} while (bh != head);
+	return true;
+}
+#else
+static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
+								bool sync)
+{
+	return true;
+}
+#endif /* CONFIG_BLOCK */
+
 /*
  * Replace the page in the mapping.
  *
@@ -229,7 +278,8 @@ out:
  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
  */
 static int migrate_page_move_mapping(struct address_space *mapping,
-		struct page *newpage, struct page *page)
+		struct page *newpage, struct page *page,
+		struct buffer_head *head, bool sync)
 {
 	int expected_count;
 	void **pslot;
@@ -259,6 +309,19 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	}
 
 	/*
+	 * In the async migration case of moving a page with buffers, lock the
+	 * buffers using trylock before the mapping is moved. If the mapping
+	 * was moved, we later failed to lock the buffers and could not move
+	 * the mapping back due to an elevated page count, we would have to
+	 * block waiting on other references to be dropped.
+	 */
+	if (!sync && head && !buffer_migrate_lock_buffers(head, sync)) {
+		page_unfreeze_refs(page, expected_count);
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	/*
 	 * Now we know that no one else is looking at the page.
 	 */
 	get_page(newpage);	/* add cache reference */
@@ -415,13 +478,13 @@ EXPORT_SYMBOL(fail_migrate_page);
  * Pages are locked upon entry and exit.
  */
 int migrate_page(struct address_space *mapping,
-		struct page *newpage, struct page *page)
+		struct page *newpage, struct page *page, bool sync)
 {
 	int rc;
 
 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 
-	rc = migrate_page_move_mapping(mapping, newpage, page);
+	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, sync);
 
 	if (rc)
 		return rc;
@@ -438,28 +501,28 @@ EXPORT_SYMBOL(migrate_page);
  * exist.
  */
 int buffer_migrate_page(struct address_space *mapping,
-		struct page *newpage, struct page *page)
+		struct page *newpage, struct page *page, bool sync)
 {
 	struct buffer_head *bh, *head;
 	int rc;
 
 	if (!page_has_buffers(page))
-		return migrate_page(mapping, newpage, page);
+		return migrate_page(mapping, newpage, page, sync);
 
 	head = page_buffers(page);
 
-	rc = migrate_page_move_mapping(mapping, newpage, page);
+	rc = migrate_page_move_mapping(mapping, newpage, page, head, sync);
 
 	if (rc)
 		return rc;
 
-	bh = head;
-	do {
-		get_bh(bh);
-		lock_buffer(bh);
-		bh = bh->b_this_page;
-
-	} while (bh != head);
+	/*
+	 * In the async case, migrate_page_move_mapping locked the buffers
+	 * with an IRQ-safe spinlock held. In the sync case, the buffers
+	 * need to be locked now
+	 */
+	if (sync)
+		BUG_ON(!buffer_migrate_lock_buffers(head, sync));
 
 	ClearPagePrivate(page);
 	set_page_private(newpage, page_private(page));
@@ -536,10 +599,13 @@ static int writeout(struct address_space *mapping, struct page *page)
  * Default handling if a filesystem does not provide a migration function.
  */
 static int fallback_migrate_page(struct address_space *mapping,
-	struct page *newpage, struct page *page)
+	struct page *newpage, struct page *page, bool sync)
 {
-	if (PageDirty(page))
+	if (PageDirty(page)) {
+		if (!sync)
+			return -EBUSY;
 		return writeout(mapping, page);
+	}
 
 	/*
 	 * Buffers may be managed in a filesystem specific way.
@@ -549,7 +615,7 @@ static int fallback_migrate_page(struct address_space *mapping,
 	    !try_to_release_page(page, GFP_KERNEL))
 		return -EAGAIN;
 
-	return migrate_page(mapping, newpage, page);
+	return migrate_page(mapping, newpage, page, sync);
 }
 
 /*
@@ -585,29 +651,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 
 	mapping = page_mapping(page);
 	if (!mapping)
-		rc = migrate_page(mapping, newpage, page);
-	else {
+		rc = migrate_page(mapping, newpage, page, sync);
+	else if (mapping->a_ops->migratepage)
 		/*
-		 * Do not writeback pages if !sync and migratepage is
-		 * not pointing to migrate_page() which is nonblocking
-		 * (swapcache/tmpfs uses migratepage = migrate_page).
+		 * Most pages have a mapping and most filesystems provide a
+		 * migratepage callback. Anonymous pages are part of swap
+		 * space which also has its own migratepage callback. This
+		 * is the most common path for page migration.
 		 */
-		if (PageDirty(page) && !sync &&
-		    mapping->a_ops->migratepage != migrate_page)
-			rc = -EBUSY;
-		else if (mapping->a_ops->migratepage)
-			/*
-			 * Most pages have a mapping and most filesystems
-			 * should provide a migration function. Anonymous
-			 * pages are part of swap space which also has its
-			 * own migration function. This is the most common
-			 * path for page migration.
-			 */
-			rc = mapping->a_ops->migratepage(mapping,
-							newpage, page);
-		else
-			rc = fallback_migrate_page(mapping, newpage, page);
-	}
+		rc = mapping->a_ops->migratepage(mapping,
+						newpage, page, sync);
+	else
+		rc = fallback_migrate_page(mapping, newpage, page, sync);
 
 	if (rc) {
 		newpage->mapping = NULL;
-- 
cgit v1.1


From c17a36656685a2af6ea9e99fa243a7103b643d12 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 12 Jan 2012 17:19:41 -0800
Subject: mm: page allocator: do not call direct reclaim for THP allocations
 while compaction is deferred

commit 66199712e9eef5aede09dbcd9dfff87798a66917 upstream.

Stable note: Not tracked in Buzilla. This was part of a series that
	reduced interactivity stalls experienced when THP was enabled.

If compaction is deferred, direct reclaim is used to try to free enough
pages for the allocation to succeed.  For small high-orders, this has a
reasonable chance of success.  However, if the caller has specified
__GFP_NO_KSWAPD to limit the disruption to the system, it makes more sense
to fail the allocation rather than stall the caller in direct reclaim.
This patch skips direct reclaim if compaction is deferred and the caller
specifies __GFP_NO_KSWAPD.

Async compaction only considers a subset of pages so it is possible for
compaction to be deferred prematurely and not enter direct reclaim even in
cases where it should.  To compensate for this, this patch also defers
compaction only if sync compaction failed.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Minchan Kim <minchan.kim@gmail.com>
Reviewed-by: Rik van Riel<riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Andy Isaacson <adi@hexapodia.org>
Cc: Nai Xia <nai.xia@gmail.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/page_alloc.c | 45 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 35 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 70c049d..0d490ba 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1897,14 +1897,20 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, unsigned long *did_some_progress,
-	bool sync_migration)
+	int migratetype, bool sync_migration,
+	bool *deferred_compaction,
+	unsigned long *did_some_progress)
 {
 	struct page *page;
 
-	if (!order || compaction_deferred(preferred_zone))
+	if (!order)
 		return NULL;
 
+	if (compaction_deferred(preferred_zone)) {
+		*deferred_compaction = true;
+		return NULL;
+	}
+
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 						nodemask, sync_migration);
@@ -1932,7 +1938,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		 * but not enough to satisfy watermarks.
 		 */
 		count_vm_event(COMPACTFAIL);
-		defer_compaction(preferred_zone);
+
+		/*
+		 * As async compaction considers a subset of pageblocks, only
+		 * defer if the failure was a sync compaction failure.
+		 */
+		if (sync_migration)
+			defer_compaction(preferred_zone);
 
 		cond_resched();
 	}
@@ -1944,8 +1956,9 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, unsigned long *did_some_progress,
-	bool sync_migration)
+	int migratetype, bool sync_migration,
+	bool *deferred_compaction,
+	unsigned long *did_some_progress)
 {
 	return NULL;
 }
@@ -2095,6 +2108,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
 	bool sync_migration = false;
+	bool deferred_compaction = false;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2175,12 +2189,22 @@ rebalance:
 					zonelist, high_zoneidx,
 					nodemask,
 					alloc_flags, preferred_zone,
-					migratetype, &did_some_progress,
-					sync_migration);
+					migratetype, sync_migration,
+					&deferred_compaction,
+					&did_some_progress);
 	if (page)
 		goto got_pg;
 	sync_migration = true;
 
+	/*
+	 * If compaction is deferred for high-order allocations, it is because
+	 * sync compaction recently failed. In this is the case and the caller
+	 * has requested the system not be heavily disrupted, fail the
+	 * allocation now instead of entering direct reclaim
+	 */
+	if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+		goto nopage;
+
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
 					zonelist, high_zoneidx,
@@ -2243,8 +2267,9 @@ rebalance:
 					zonelist, high_zoneidx,
 					nodemask,
 					alloc_flags, preferred_zone,
-					migratetype, &did_some_progress,
-					sync_migration);
+					migratetype, sync_migration,
+					&deferred_compaction,
+					&did_some_progress);
 		if (page)
 			goto got_pg;
 	}
-- 
cgit v1.1


From a7e32d7a2a801b7838b4159e9d73ea86f68ae002 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 12 Jan 2012 17:19:38 -0800
Subject: mm: compaction: make isolate_lru_page() filter-aware again

commit c82449352854ff09e43062246af86bdeb628f0c3 upstream.

Stable note: Not tracked in Bugzilla. A fix aimed at preserving page aging
	information by reducing LRU list churning had the side-effect of
	reducing THP allocation success rates. This was part of a series
	to restore the success rates while preserving the reclaim fix.

Commit 39deaf85 ("mm: compaction: make isolate_lru_page() filter-aware")
noted that compaction does not migrate dirty or writeback pages and that
is was meaningless to pick the page and re-add it to the LRU list.  This
had to be partially reverted because some dirty pages can be migrated by
compaction without blocking.

This patch updates "mm: compaction: make isolate_lru_page" by skipping
over pages that migration has no possibility of migrating to minimise LRU
disruption.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel<riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Cc: Dave Jones <davej@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Andy Isaacson <adi@hexapodia.org>
Cc: Nai Xia <nai.xia@gmail.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/compaction.c |  3 +++
 mm/vmscan.c     | 35 +++++++++++++++++++++++++++++++++--
 2 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 228f91b..da44920 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -371,6 +371,9 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 			continue;
 		}
 
+		if (!cc->sync)
+			mode |= ISOLATE_ASYNC_MIGRATE;
+
 		/* Try isolate the page */
 		if (__isolate_lru_page(page, mode, 0) != 0)
 			continue;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0c78bd3..45c40d6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1045,8 +1045,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
 
 	ret = -EBUSY;
 
-	if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
-		return ret;
+	/*
+	 * To minimise LRU disruption, the caller can indicate that it only
+	 * wants to isolate pages it will be able to operate on without
+	 * blocking - clean pages for the most part.
+	 *
+	 * ISOLATE_CLEAN means that only clean pages should be isolated. This
+	 * is used by reclaim when it is cannot write to backing storage
+	 *
+	 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages
+	 * that it is possible to migrate without blocking
+	 */
+	if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
+		/* All the caller can do on PageWriteback is block */
+		if (PageWriteback(page))
+			return ret;
+
+		if (PageDirty(page)) {
+			struct address_space *mapping;
+
+			/* ISOLATE_CLEAN means only clean pages */
+			if (mode & ISOLATE_CLEAN)
+				return ret;
+
+			/*
+			 * Only pages without mappings or that have a
+			 * ->migratepage callback are possible to migrate
+			 * without blocking
+			 */
+			mapping = page_mapping(page);
+			if (mapping && !mapping->a_ops->migratepage)
+				return ret;
+		}
+	}
 
 	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
 		return ret;
-- 
cgit v1.1


From 5d62e5ca429b85ecadaa5042bdb1d8b88d4bfe80 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@intel.com>
Date: Mon, 31 Oct 2011 17:08:39 -0700
Subject: kswapd: avoid unnecessary rebalance after an unsuccessful balancing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit d2ebd0f6b89567eb93ead4e2ca0cbe03021f344b upstream.

Stable note: Fixes https://bugzilla.redhat.com/show_bug.cgi?id=712019.  This
	patch reduces kswapd CPU usage.

In commit 215ddd66 ("mm: vmscan: only read new_classzone_idx from pgdat
when reclaiming successfully") , Mel Gorman said kswapd is better to sleep
after a unsuccessful balancing if there is tighter reclaim request pending
in the balancing.  But in the following scenario, kswapd do something that
is not matched our expectation.  The patch fixes this issue.

1, Read pgdat request A (classzone_idx, order = 3)
2, balance_pgdat()
3, During pgdat, a new pgdat request B (classzone_idx, order = 5) is placed
4, balance_pgdat() returns but failed since returned order = 0
5, pgdat of request A assigned to balance_pgdat(), and do balancing again.
   While the expectation behavior of kswapd should try to sleep.

Signed-off-by: Alex Shi <alex.shi@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Tested-by: Pádraig Brady <P@draigBrady.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 45c40d6..5cc0f92 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2844,7 +2844,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 static int kswapd(void *p)
 {
 	unsigned long order, new_order;
+	unsigned balanced_order;
 	int classzone_idx, new_classzone_idx;
+	int balanced_classzone_idx;
 	pg_data_t *pgdat = (pg_data_t*)p;
 	struct task_struct *tsk = current;
 
@@ -2875,7 +2877,9 @@ static int kswapd(void *p)
 	set_freezable();
 
 	order = new_order = 0;
+	balanced_order = 0;
 	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+	balanced_classzone_idx = classzone_idx;
 	for ( ; ; ) {
 		int ret;
 
@@ -2884,7 +2888,8 @@ static int kswapd(void *p)
 		 * new request of a similar or harder type will succeed soon
 		 * so consider going to sleep on the basis we reclaimed at
 		 */
-		if (classzone_idx >= new_classzone_idx && order == new_order) {
+		if (balanced_classzone_idx >= new_classzone_idx &&
+					balanced_order == new_order) {
 			new_order = pgdat->kswapd_max_order;
 			new_classzone_idx = pgdat->classzone_idx;
 			pgdat->kswapd_max_order =  0;
@@ -2899,7 +2904,8 @@ static int kswapd(void *p)
 			order = new_order;
 			classzone_idx = new_classzone_idx;
 		} else {
-			kswapd_try_to_sleep(pgdat, order, classzone_idx);
+			kswapd_try_to_sleep(pgdat, balanced_order,
+						balanced_classzone_idx);
 			order = pgdat->kswapd_max_order;
 			classzone_idx = pgdat->classzone_idx;
 			pgdat->kswapd_max_order = 0;
@@ -2916,7 +2922,9 @@ static int kswapd(void *p)
 		 */
 		if (!ret) {
 			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-			order = balance_pgdat(pgdat, order, &classzone_idx);
+			balanced_classzone_idx = classzone_idx;
+			balanced_order = balance_pgdat(pgdat, order,
+						&balanced_classzone_idx);
 		}
 	}
 	return 0;
-- 
cgit v1.1


From 9203b3fa57cc16bf1ad1be7c64b01b5e45cc6151 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@intel.com>
Date: Mon, 31 Oct 2011 17:08:45 -0700
Subject: kswapd: assign new_order and new_classzone_idx after wakeup in
 sleeping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit f0dfcde099453aa4c0dc42473828d15a6d492936 upstream.

Stable note: Fixes https://bugzilla.redhat.com/show_bug.cgi?id=712019.  This
	patch reduces kswapd CPU usage.

There 2 places to read pgdat in kswapd.  One is return from a successful
balance, another is waked up from kswapd sleeping.  The new_order and
new_classzone_idx represent the balance input order and classzone_idx.

But current new_order and new_classzone_idx are not assigned after
kswapd_try_to_sleep(), that will cause a bug in the following scenario.

1: after a successful balance, kswapd goes to sleep, and new_order = 0;
   new_classzone_idx = __MAX_NR_ZONES - 1;

2: kswapd waked up with order = 3 and classzone_idx = ZONE_NORMAL

3: in the balance_pgdat() running, a new balance wakeup happened with
   order = 5, and classzone_idx = ZONE_NORMAL

4: the first wakeup(order = 3) finished successufly, return order = 3
   but, the new_order is still 0, so, this balancing will be treated as a
   failed balance.  And then the second tighter balancing will be missed.

So, to avoid the above problem, the new_order and new_classzone_idx need
to be assigned for later successful comparison.

Signed-off-by: Alex Shi <alex.shi@intel.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Tested-by: Pádraig Brady <P@draigBrady.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5cc0f92..870cbcf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2908,6 +2908,8 @@ static int kswapd(void *p)
 						balanced_classzone_idx);
 			order = pgdat->kswapd_max_order;
 			classzone_idx = pgdat->classzone_idx;
+			new_order = order;
+			new_classzone_idx = classzone_idx;
 			pgdat->kswapd_max_order = 0;
 			pgdat->classzone_idx = pgdat->nr_zones - 1;
 		}
-- 
cgit v1.1


From f869774c37710ef2b773d167d184b9936988d07f Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 12 Jan 2012 17:19:43 -0800
Subject: mm: compaction: introduce sync-light migration for use by compaction

commit a6bc32b899223a877f595ef9ddc1e89ead5072b8 upstream.

Stable note: Not tracked in Buzilla. This was part of a series that
	reduced interactivity stalls experienced when THP was enabled.
	These stalls were particularly noticable when copying data
	to a USB stick but the experiences for users varied a lot.

This patch adds a lightweight sync migrate operation MIGRATE_SYNC_LIGHT
mode that avoids writing back pages to backing storage.  Async compaction
maps to MIGRATE_ASYNC while sync compaction maps to MIGRATE_SYNC_LIGHT.
For other migrate_pages users such as memory hotplug, MIGRATE_SYNC is
used.

This avoids sync compaction stalling for an excessive length of time,
particularly when copying files to a USB stick where there might be a
large number of dirty pages backed by a filesystem that does not support
->writepages.

[aarcange@redhat.com: This patch is heavily based on Andrea's work]
[akpm@linux-foundation.org: fix fs/nfs/write.c build]
[akpm@linux-foundation.org: fix fs/btrfs/disk-io.c build]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Andy Isaacson <adi@hexapodia.org>
Cc: Nai Xia <nai.xia@gmail.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/compaction.c     |  2 +-
 mm/memory-failure.c |  2 +-
 mm/memory_hotplug.c |  2 +-
 mm/mempolicy.c      |  2 +-
 mm/migrate.c        | 78 +++++++++++++++++++++++++++++------------------------
 5 files changed, 47 insertions(+), 39 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index da44920..8ea7308 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -577,7 +577,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		nr_migrate = cc->nr_migratepages;
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
 				(unsigned long)cc, false,
-				cc->sync);
+				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
 		update_nr_listpages(cc);
 		nr_remaining = cc->nr_migratepages;
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 740c4f5..6496748 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1464,7 +1464,7 @@ int soft_offline_page(struct page *page, int flags)
 					    page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
 		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
-								0, true);
+							0, MIGRATE_SYNC);
 		if (ret) {
 			putback_lru_pages(&pagelist);
 			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c46887b..ae5a3f2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -747,7 +747,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		}
 		/* this function returns # of failed pages */
 		ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
-								true, true);
+							true, MIGRATE_SYNC);
 		if (ret)
 			putback_lru_pages(&source);
 	}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3dac2d1..dd5f874 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -926,7 +926,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, new_node_page, dest,
-								false, true);
+							false, MIGRATE_SYNC);
 		if (err)
 			putback_lru_pages(&pagelist);
 	}
diff --git a/mm/migrate.c b/mm/migrate.c
index d43689a..480714b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -222,12 +222,13 @@ out:
 
 #ifdef CONFIG_BLOCK
 /* Returns true if all buffers are successfully locked */
-static bool buffer_migrate_lock_buffers(struct buffer_head *head, bool sync)
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+							enum migrate_mode mode)
 {
 	struct buffer_head *bh = head;
 
 	/* Simple case, sync compaction */
-	if (sync) {
+	if (mode != MIGRATE_ASYNC) {
 		do {
 			get_bh(bh);
 			lock_buffer(bh);
@@ -263,7 +264,7 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head, bool sync)
 }
 #else
 static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
-								bool sync)
+							enum migrate_mode mode)
 {
 	return true;
 }
@@ -279,7 +280,7 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
  */
 static int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page,
-		struct buffer_head *head, bool sync)
+		struct buffer_head *head, enum migrate_mode mode)
 {
 	int expected_count;
 	void **pslot;
@@ -315,7 +316,8 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	 * the mapping back due to an elevated page count, we would have to
 	 * block waiting on other references to be dropped.
 	 */
-	if (!sync && head && !buffer_migrate_lock_buffers(head, sync)) {
+	if (mode == MIGRATE_ASYNC && head &&
+			!buffer_migrate_lock_buffers(head, mode)) {
 		page_unfreeze_refs(page, expected_count);
 		spin_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
@@ -478,13 +480,14 @@ EXPORT_SYMBOL(fail_migrate_page);
  * Pages are locked upon entry and exit.
  */
 int migrate_page(struct address_space *mapping,
-		struct page *newpage, struct page *page, bool sync)
+		struct page *newpage, struct page *page,
+		enum migrate_mode mode)
 {
 	int rc;
 
 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 
-	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, sync);
+	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
 
 	if (rc)
 		return rc;
@@ -501,17 +504,17 @@ EXPORT_SYMBOL(migrate_page);
  * exist.
  */
 int buffer_migrate_page(struct address_space *mapping,
-		struct page *newpage, struct page *page, bool sync)
+		struct page *newpage, struct page *page, enum migrate_mode mode)
 {
 	struct buffer_head *bh, *head;
 	int rc;
 
 	if (!page_has_buffers(page))
-		return migrate_page(mapping, newpage, page, sync);
+		return migrate_page(mapping, newpage, page, mode);
 
 	head = page_buffers(page);
 
-	rc = migrate_page_move_mapping(mapping, newpage, page, head, sync);
+	rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
 
 	if (rc)
 		return rc;
@@ -521,8 +524,8 @@ int buffer_migrate_page(struct address_space *mapping,
 	 * with an IRQ-safe spinlock held. In the sync case, the buffers
 	 * need to be locked now
 	 */
-	if (sync)
-		BUG_ON(!buffer_migrate_lock_buffers(head, sync));
+	if (mode != MIGRATE_ASYNC)
+		BUG_ON(!buffer_migrate_lock_buffers(head, mode));
 
 	ClearPagePrivate(page);
 	set_page_private(newpage, page_private(page));
@@ -599,10 +602,11 @@ static int writeout(struct address_space *mapping, struct page *page)
  * Default handling if a filesystem does not provide a migration function.
  */
 static int fallback_migrate_page(struct address_space *mapping,
-	struct page *newpage, struct page *page, bool sync)
+	struct page *newpage, struct page *page, enum migrate_mode mode)
 {
 	if (PageDirty(page)) {
-		if (!sync)
+		/* Only writeback pages in full synchronous migration */
+		if (mode != MIGRATE_SYNC)
 			return -EBUSY;
 		return writeout(mapping, page);
 	}
@@ -615,7 +619,7 @@ static int fallback_migrate_page(struct address_space *mapping,
 	    !try_to_release_page(page, GFP_KERNEL))
 		return -EAGAIN;
 
-	return migrate_page(mapping, newpage, page, sync);
+	return migrate_page(mapping, newpage, page, mode);
 }
 
 /*
@@ -630,7 +634,7 @@ static int fallback_migrate_page(struct address_space *mapping,
  *  == 0 - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page,
-					int remap_swapcache, bool sync)
+				int remap_swapcache, enum migrate_mode mode)
 {
 	struct address_space *mapping;
 	int rc;
@@ -651,7 +655,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 
 	mapping = page_mapping(page);
 	if (!mapping)
-		rc = migrate_page(mapping, newpage, page, sync);
+		rc = migrate_page(mapping, newpage, page, mode);
 	else if (mapping->a_ops->migratepage)
 		/*
 		 * Most pages have a mapping and most filesystems provide a
@@ -660,9 +664,9 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 		 * is the most common path for page migration.
 		 */
 		rc = mapping->a_ops->migratepage(mapping,
-						newpage, page, sync);
+						newpage, page, mode);
 	else
-		rc = fallback_migrate_page(mapping, newpage, page, sync);
+		rc = fallback_migrate_page(mapping, newpage, page, mode);
 
 	if (rc) {
 		newpage->mapping = NULL;
@@ -677,7 +681,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 }
 
 static int __unmap_and_move(struct page *page, struct page *newpage,
-				int force, bool offlining, bool sync)
+			int force, bool offlining, enum migrate_mode mode)
 {
 	int rc = -EAGAIN;
 	int remap_swapcache = 1;
@@ -686,7 +690,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 	struct anon_vma *anon_vma = NULL;
 
 	if (!trylock_page(page)) {
-		if (!force || !sync)
+		if (!force || mode == MIGRATE_ASYNC)
 			goto out;
 
 		/*
@@ -732,10 +736,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 
 	if (PageWriteback(page)) {
 		/*
-		 * For !sync, there is no point retrying as the retry loop
-		 * is expected to be too short for PageWriteback to be cleared
+		 * Only in the case of a full syncronous migration is it
+		 * necessary to wait for PageWriteback. In the async case,
+		 * the retry loop is too short and in the sync-light case,
+		 * the overhead of stalling is too much
 		 */
-		if (!sync) {
+		if (mode != MIGRATE_SYNC) {
 			rc = -EBUSY;
 			goto uncharge;
 		}
@@ -806,7 +812,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 
 skip_unmap:
 	if (!page_mapped(page))
-		rc = move_to_new_page(newpage, page, remap_swapcache, sync);
+		rc = move_to_new_page(newpage, page, remap_swapcache, mode);
 
 	if (rc && remap_swapcache)
 		remove_migration_ptes(page, page);
@@ -829,7 +835,8 @@ out:
  * to the newly allocated page in newpage.
  */
 static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-			struct page *page, int force, bool offlining, bool sync)
+			struct page *page, int force, bool offlining,
+			enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -847,7 +854,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 		if (unlikely(split_huge_page(page)))
 			goto out;
 
-	rc = __unmap_and_move(page, newpage, force, offlining, sync);
+	rc = __unmap_and_move(page, newpage, force, offlining, mode);
 out:
 	if (rc != -EAGAIN) {
 		/*
@@ -895,7 +902,8 @@ out:
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
 				unsigned long private, struct page *hpage,
-				int force, bool offlining, bool sync)
+				int force, bool offlining,
+				enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -908,7 +916,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	rc = -EAGAIN;
 
 	if (!trylock_page(hpage)) {
-		if (!force || !sync)
+		if (!force || mode != MIGRATE_SYNC)
 			goto out;
 		lock_page(hpage);
 	}
@@ -919,7 +927,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 
 	if (!page_mapped(hpage))
-		rc = move_to_new_page(new_hpage, hpage, 1, sync);
+		rc = move_to_new_page(new_hpage, hpage, 1, mode);
 
 	if (rc)
 		remove_migration_ptes(hpage, hpage);
@@ -962,7 +970,7 @@ out:
  */
 int migrate_pages(struct list_head *from,
 		new_page_t get_new_page, unsigned long private, bool offlining,
-		bool sync)
+		enum migrate_mode mode)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -983,7 +991,7 @@ int migrate_pages(struct list_head *from,
 
 			rc = unmap_and_move(get_new_page, private,
 						page, pass > 2, offlining,
-						sync);
+						mode);
 
 			switch(rc) {
 			case -ENOMEM:
@@ -1013,7 +1021,7 @@ out:
 
 int migrate_huge_pages(struct list_head *from,
 		new_page_t get_new_page, unsigned long private, bool offlining,
-		bool sync)
+		enum migrate_mode mode)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -1030,7 +1038,7 @@ int migrate_huge_pages(struct list_head *from,
 
 			rc = unmap_and_move_huge_page(get_new_page,
 					private, page, pass > 2, offlining,
-					sync);
+					mode);
 
 			switch(rc) {
 			case -ENOMEM:
@@ -1159,7 +1167,7 @@ set_status:
 	err = 0;
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, new_page_node,
-				(unsigned long)pm, 0, true);
+				(unsigned long)pm, 0, MIGRATE_SYNC);
 		if (err)
 			putback_lru_pages(&pagelist);
 	}
-- 
cgit v1.1


From d50462a3a29fc5f53ef4a5d74eb693b4d4cb1512 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 12 Jan 2012 17:19:45 -0800
Subject: mm: vmscan: when reclaiming for compaction, ensure there are
 sufficient free pages available

commit fe4b1b244bdb96136855f2c694071cb09d140766 upstream.

Stable note: Not tracked on Bugzilla. THP and compaction was found to
	aggressively reclaim pages and stall systems under different
	situations that was addressed piecemeal over time. This patch
	addresses a problem where the fix regressed THP allocation
	success rates.

In commit e0887c19 ("vmscan: limit direct reclaim for higher order
allocations"), Rik noted that reclaim was too aggressive when THP was
enabled.  In his initial patch he used the number of free pages to decide
if reclaim should abort for compaction.  My feedback was that reclaim and
compaction should be using the same logic when deciding if reclaim should
be aborted.

Unfortunately, this had the effect of reducing THP success rates when the
workload included something like streaming reads that continually
allocated pages.  The window during which compaction could run and return
a THP was too small.

This patch combines Rik's two patches together.  compaction_suitable() is
still used to decide if reclaim should be aborted to allow compaction is
used.  However, it will also ensure that there is a reasonable buffer of
free pages available.  This improves upon the THP allocation success rates
but bounds the number of pages that are freed for compaction.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel<riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Andy Isaacson <adi@hexapodia.org>
Cc: Nai Xia <nai.xia@gmail.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 44 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 870cbcf..eadab09 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2075,6 +2075,42 @@ restart:
 	throttle_vm_writeout(sc->gfp_mask);
 }
 
+/* Returns true if compaction should go ahead for a high-order request */
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
+{
+	unsigned long balance_gap, watermark;
+	bool watermark_ok;
+
+	/* Do not consider compaction for orders reclaim is meant to satisfy */
+	if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
+		return false;
+
+	/*
+	 * Compaction takes time to run and there are potentially other
+	 * callers using the pages just freed. Continue reclaiming until
+	 * there is a buffer of free pages available to give compaction
+	 * a reasonable chance of completing and allocating the page
+	 */
+	balance_gap = min(low_wmark_pages(zone),
+		(zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+			KSWAPD_ZONE_BALANCE_GAP_RATIO);
+	watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
+	watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+
+	/*
+	 * If compaction is deferred, reclaim up to a point where
+	 * compaction will have a chance of success when re-enabled
+	 */
+	if (compaction_deferred(zone))
+		return watermark_ok;
+
+	/* If compaction is not ready to start, keep reclaiming */
+	if (!compaction_suitable(zone, sc->order))
+		return false;
+
+	return watermark_ok;
+}
+
 /*
  * This is the direct reclaim path, for page-allocating processes.  We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -2092,8 +2128,8 @@ restart:
  * scan then give up on it.
  *
  * This function returns true if a zone is being reclaimed for a costly
- * high-order allocation and compaction is either ready to begin or deferred.
- * This indicates to the caller that it should retry the allocation or fail.
+ * high-order allocation and compaction is ready to begin. This indicates to
+ * the caller that it should retry the allocation or fail.
  */
 static bool shrink_zones(int priority, struct zonelist *zonelist,
 					struct scan_control *sc)
@@ -2127,9 +2163,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
 				 * noticable problem, like transparent huge page
 				 * allocations.
 				 */
-				if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
-					(compaction_suitable(zone, sc->order) ||
-					 compaction_deferred(zone))) {
+				if (compaction_ready(zone, sc)) {
 					should_abort_reclaim = true;
 					continue;
 				}
-- 
cgit v1.1


From da0dc52b5236e73d7d7b5a58e8e067236fd1323e Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 12 Jan 2012 17:19:33 -0800
Subject: mm: vmscan: do not OOM if aborting reclaim to start compaction

commit 7335084d446b83cbcb15da80497d03f0c1dc9e21 upstream.

Stable note: Not tracked in Bugzilla. This patch makes later patches
	easier to apply but otherwise has little to justify it. The
	problem it fixes was never observed but the source of the
	theoretical problem did not exist for very long.

During direct reclaim it is possible that reclaim will be aborted so that
compaction can be attempted to satisfy a high-order allocation.  If this
decision is made before any pages are reclaimed, it is possible that 0 is
returned to the page allocator potentially triggering an OOM.  This has
not been observed but it is a possibility so this patch addresses it.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Andy Isaacson <adi@hexapodia.org>
Cc: Nai Xia <nai.xia@gmail.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index eadab09..441f97e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2240,6 +2240,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	struct zoneref *z;
 	struct zone *zone;
 	unsigned long writeback_threshold;
+	bool should_abort_reclaim;
 
 	get_mems_allowed();
 	delayacct_freepages_start();
@@ -2251,7 +2252,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		sc->nr_scanned = 0;
 		if (!priority)
 			disable_swap_token(sc->mem_cgroup);
-		if (shrink_zones(priority, zonelist, sc))
+		should_abort_reclaim = shrink_zones(priority, zonelist, sc);
+		if (should_abort_reclaim)
 			break;
 
 		/*
@@ -2318,6 +2320,10 @@ out:
 	if (oom_killer_disabled)
 		return 0;
 
+	/* Aborting reclaim to try compaction? don't OOM, then */
+	if (should_abort_reclaim)
+		return 1;
+
 	/* top priority shrink_zones still had more to do? don't OOM, then */
 	if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
 		return 1;
-- 
cgit v1.1


From 9cad5d6a3ce8ffc5fee70c6514ccb7ce003b8792 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 12 Jan 2012 17:19:49 -0800
Subject: mm: vmscan: check if reclaim should really abort even if
 compaction_ready() is true for one zone

commit 0cee34fd72c582b4f8ad8ce00645b75fb4168199 upstream.

Stable note: Not tracked on Bugzilla. THP and compaction was found to
	aggressively reclaim pages and stall systems under different
	situations that was addressed piecemeal over time.

If compaction can proceed for a given zone, shrink_zones() does not
reclaim any more pages from it.  After commit [e0c2327: vmscan: abort
reclaim/compaction if compaction can proceed], do_try_to_free_pages()
tries to finish as soon as possible once one zone can compact.

This was intended to prevent slabs being shrunk unnecessarily but there
are side-effects.  One is that a small zone that is ready for compaction
will abort reclaim even if the chances of successfully allocating a THP
from that zone is small.  It also means that reclaim can return too early
even though sc->nr_to_reclaim pages were not reclaimed.

This partially reverts the commit until it is proven that slabs are really
being shrunk unnecessarily but preserves the check to return 1 to avoid
OOM if reclaim was aborted prematurely.

[aarcange@redhat.com: This patch replaces a revert from Andrea]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Andy Isaacson <adi@hexapodia.org>
Cc: Nai Xia <nai.xia@gmail.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 441f97e..f8c96c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2129,7 +2129,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
  *
  * This function returns true if a zone is being reclaimed for a costly
  * high-order allocation and compaction is ready to begin. This indicates to
- * the caller that it should retry the allocation or fail.
+ * the caller that it should consider retrying the allocation instead of
+ * further reclaim.
  */
 static bool shrink_zones(int priority, struct zonelist *zonelist,
 					struct scan_control *sc)
@@ -2138,7 +2139,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
 	struct zone *zone;
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
-	bool should_abort_reclaim = false;
+	bool aborted_reclaim = false;
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 					gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2164,7 +2165,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
 				 * allocations.
 				 */
 				if (compaction_ready(zone, sc)) {
-					should_abort_reclaim = true;
+					aborted_reclaim = true;
 					continue;
 				}
 			}
@@ -2186,7 +2187,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
 		shrink_zone(priority, zone, sc);
 	}
 
-	return should_abort_reclaim;
+	return aborted_reclaim;
 }
 
 static bool zone_reclaimable(struct zone *zone)
@@ -2240,7 +2241,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	struct zoneref *z;
 	struct zone *zone;
 	unsigned long writeback_threshold;
-	bool should_abort_reclaim;
+	bool aborted_reclaim;
 
 	get_mems_allowed();
 	delayacct_freepages_start();
@@ -2252,9 +2253,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		sc->nr_scanned = 0;
 		if (!priority)
 			disable_swap_token(sc->mem_cgroup);
-		should_abort_reclaim = shrink_zones(priority, zonelist, sc);
-		if (should_abort_reclaim)
-			break;
+		aborted_reclaim = shrink_zones(priority, zonelist, sc);
 
 		/*
 		 * Don't shrink slabs when reclaiming memory from
@@ -2320,8 +2319,8 @@ out:
 	if (oom_killer_disabled)
 		return 0;
 
-	/* Aborting reclaim to try compaction? don't OOM, then */
-	if (should_abort_reclaim)
+	/* Aborted reclaim to try compaction? don't OOM, then */
+	if (aborted_reclaim)
 		return 1;
 
 	/* top priority shrink_zones still had more to do? don't OOM, then */
-- 
cgit v1.1


From 03722816ec065d8c7a9306fc2d601251bc0c623e Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Tue, 10 Jan 2012 15:06:59 -0800
Subject: vmscan: promote shared file mapped pages

commit 34dbc67a644f11ab3475d822d72e25409911e760 upstream.

Stable note: Not tracked in Bugzilla. There were reports of shared
	mapped pages being unfairly reclaimed in comparison to older kernels.
	This is being addressed over time. The specific workload being
	addressed here in described in paragraph four and while paragraph
	five says it did not help performance as such, it made a difference
	to major page faults. I'm aware of at least one bug for a large
	vendor that was due to increased major faults.

Commit 645747462435 ("vmscan: detect mapped file pages used only once")
greatly decreases lifetime of single-used mapped file pages.
Unfortunately it also decreases life time of all shared mapped file
pages.  Because after commit bf3f3bc5e7347 ("mm: don't mark_page_accessed
in fault path") page-fault handler does not mark page active or even
referenced.

Thus page_check_references() activates file page only if it was used twice
while it stays in inactive list, meanwhile it activates anon pages after
first access.  Inactive list can be small enough, this way reclaimer can
accidentally throw away any widely used page if it wasn't used twice in
short period.

After this patch page_check_references() also activate file mapped page at
first inactive list scan if this page is already used multiple times via
several ptes.

I found this while trying to fix degragation in rhel6 (~2.6.32) from rhel5
(~2.6.18).  There a complete mess with >100 web/mail/spam/ftp containers,
they share all their files but there a lot of anonymous pages: ~500mb
shared file mapped memory and 15-20Gb non-shared anonymous memory.  In
this situation major-pagefaults are very costly, because all containers
share the same page.  In my load kernel created a disproportionate
pressure on the file memory, compared with the anonymous, they equaled
only if I raise swappiness up to 150 =)

These patches actually wasn't helped a lot in my problem, but I saw
noticable (10-20 times) reduce in count and average time of
major-pagefault in file-mapped areas.

Actually both patches are fixes for commit v2.6.33-5448-g6457474, because
it was aimed at one scenario (singly used pages), but it breaks the logic
in other scenarios (shared and/or executable pages)

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Acked-by: Pekka Enberg <penberg@kernel.org>
Acked-by: Minchan Kim <minchan.kim@gmail.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index f8c96c7..811141e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -723,7 +723,7 @@ static enum page_references page_check_references(struct page *page,
 		 */
 		SetPageReferenced(page);
 
-		if (referenced_page)
+		if (referenced_page || referenced_ptes > 1)
 			return PAGEREF_ACTIVATE;
 
 		return PAGEREF_KEEP;
-- 
cgit v1.1


From 4391b5f49e28bdb78ddf67495abb2f767474216d Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Tue, 10 Jan 2012 15:07:03 -0800
Subject: vmscan: activate executable pages after first usage

commit c909e99364c8b6ca07864d752950b6b4ecf6bef4 upstream.

Stable note: Not tracked in Bugzilla. There were reports of shared
	mapped pages being unfairly reclaimed in comparison to older kernels.
	This is being addressed over time.

Logic added in commit 8cab4754d24a0 ("vmscan: make mapped executable pages
the first class citizen") was noticeably weakened in commit
645747462435d84 ("vmscan: detect mapped file pages used only once").

Currently these pages can become "first class citizens" only after second
usage.  After this patch page_check_references() will activate they after
first usage, and executable code gets yet better chance to stay in memory.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 811141e..fed3022 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -726,6 +726,12 @@ static enum page_references page_check_references(struct page *page,
 		if (referenced_page || referenced_ptes > 1)
 			return PAGEREF_ACTIVATE;
 
+		/*
+		 * Activate file-backed executable pages after first usage.
+		 */
+		if (vm_flags & VM_EXEC)
+			return PAGEREF_ACTIVATE;
+
 		return PAGEREF_KEEP;
 	}
 
-- 
cgit v1.1


From 503e973ce4a57bc949ddbf35d4b4ecd1a90263f8 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Tue, 10 Jan 2012 15:08:18 -0800
Subject: mm/vmscan.c: consider swap space when deciding whether to continue
 reclaim

commit 86cfd3a45042ab242d47f3935a02811a402beab6 upstream.

Stable note: Not tracked in Bugzilla. This patch reduces kswapd CPU
	usage on swapless systems with high anonymous memory usage.

It's pointless to continue reclaiming when we have no swap space and lots
of anon pages in the inactive list.

Without this patch, it is possible when swap is disabled to continue
trying to reclaim when there are only anonymous pages in the system even
though that will not make any progress.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index fed3022..7a56981 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2008,8 +2008,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	 * inactive lists are large enough, continue reclaiming
 	 */
 	pages_for_compaction = (2UL << sc->order);
-	inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
-				zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+	inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+	if (nr_swap_pages > 0)
+		inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
 	if (sc->nr_reclaimed < pages_for_compaction &&
 			inactive_lru_pages > pages_for_compaction)
 		return true;
-- 
cgit v1.1


From d2b02236b85226c9f2cd28e32a9fdf197584e448 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 10 Jan 2012 15:08:33 -0800
Subject: mm: test PageSwapBacked in lumpy reclaim

commit 043bcbe5ec51e0478ef2b44acef17193e01d7f70 upstream.

Stable note: Not tracked in Bugzilla. There were reports of shared
	mapped pages being unfairly reclaimed in comparison to older kernels.
	This is being addressed over time. Even though the subject
	refers to lumpy reclaim, it impacts compaction as well.

Lumpy reclaim does well to stop at a PageAnon when there's no swap, but
better is to stop at any PageSwapBacked, which includes shmem/tmpfs too.

Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7a56981..39bb416 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1199,7 +1199,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 			 * anon page which don't already have a swap slot is
 			 * pointless.
 			 */
-			if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
+			if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
 			    !PageSwapCache(cursor_page))
 				break;
 
-- 
cgit v1.1


From 4d01a2e38a9c27d3de083ecc561b32b6a55fc7eb Mon Sep 17 00:00:00 2001
From: Johannes Weiner <jweiner@redhat.com>
Date: Thu, 12 Jan 2012 17:18:06 -0800
Subject: mm: vmscan: convert global reclaim to per-memcg LRU lists

commit b95a2f2d486d0d768a92879c023a03757b9c7e58 upstream - WARNING: this is a substitute patch.

Stable note: Not tracked in Bugzilla. This is a partial backport of an
	upstream commit addressing a completely different issue
	that accidentally contained an important fix. The workload
	this patch helps was memcached when IO is started in the
	background. memcached should stay resident but without this patch
	it gets swapped. Sometimes this manifests as a drop in throughput
	but mostly it was observed through /proc/vmstat.

Commit [246e87a9: memcg: fix get_scan_count() for small targets] was meant
to fix a problem whereby small scan targets on memcg were ignored causing
priority to raise too sharply. It forced scanning to take place if the
target was small, memcg or kswapd.

From the time it was introduced it caused excessive reclaim by kswapd
with workloads being pushed to swap that previously would have stayed
resident. This was accidentally fixed in commit [b95a2f2d: mm: vmscan:
convert global reclaim to per-memcg LRU lists] by making it harder for
kswapd to force scan small targets but that patchset is not suitable for
backporting. This was later changed again by commit [90126375: mm/vmscan:
push lruvec pointer into get_scan_count()] into a format that looks
like it would be a straight-forward backport but there is a subtle
difference due to the use of lruvecs.

The impact of the accidental fix is to make it harder for kswapd to force
scan small targets by taking zone->all_unreclaimable into account. This
patch is the closest equivalent available based on what is backported.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 39bb416..6697b7a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1850,7 +1850,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 	unsigned long nr_force_scan[2];
 
 	/* kswapd does zone balancing and needs to scan this zone */
-	if (scanning_global_lru(sc) && current_is_kswapd())
+	if (scanning_global_lru(sc) && current_is_kswapd() &&
+	    zone->all_unreclaimable)
 		force_scan = true;
 	/* memcg may have small limit and need to avoid priority drop */
 	if (!scanning_global_lru(sc))
-- 
cgit v1.1


From 627c5c60b4ac673e9f4be758858073071684dce9 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 21 Mar 2012 16:34:11 -0700
Subject: cpuset: mm: reduce large amounts of memory barrier related damage v3

commit cc9a6c8776615f9c194ccf0b63a0aa5628235545 upstream.

Stable note:  Not tracked in Bugzilla. [get|put]_mems_allowed() is extremely
	expensive and severely impacted page allocator performance. This
	is part of a series of patches that reduce page allocator overhead.

Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when
changing cpuset's mems") wins a super prize for the largest number of
memory barriers entered into fast paths for one commit.

[get|put]_mems_allowed is incredibly heavy with pairs of full memory
barriers inserted into a number of hot paths.  This was detected while
investigating at large page allocator slowdown introduced some time
after 2.6.32.  The largest portion of this overhead was shown by
oprofile to be at an mfence introduced by this commit into the page
allocator hot path.

For extra style points, the commit introduced the use of yield() in an
implementation of what looks like a spinning mutex.

This patch replaces the full memory barriers on both read and write
sides with a sequence counter with just read barriers on the fast path
side.  This is much cheaper on some architectures, including x86.  The
main bulk of the patch is the retry logic if the nodemask changes in a
manner that can cause a false failure.

While updating the nodemask, a check is made to see if a false failure
is a risk.  If it is, the sequence number gets bumped and parallel
allocators will briefly stall while the nodemask update takes place.

In a page fault test microbenchmark, oprofile samples from
__alloc_pages_nodemask went from 4.53% of all samples to 1.15%.  The
actual results were

                             3.3.0-rc3          3.3.0-rc3
                             rc3-vanilla        nobarrier-v2r1
    Clients   1 UserTime       0.07 (  0.00%)   0.08 (-14.19%)
    Clients   2 UserTime       0.07 (  0.00%)   0.07 (  2.72%)
    Clients   4 UserTime       0.08 (  0.00%)   0.07 (  3.29%)
    Clients   1 SysTime        0.70 (  0.00%)   0.65 (  6.65%)
    Clients   2 SysTime        0.85 (  0.00%)   0.82 (  3.65%)
    Clients   4 SysTime        1.41 (  0.00%)   1.41 (  0.32%)
    Clients   1 WallTime       0.77 (  0.00%)   0.74 (  4.19%)
    Clients   2 WallTime       0.47 (  0.00%)   0.45 (  3.73%)
    Clients   4 WallTime       0.38 (  0.00%)   0.37 (  1.58%)
    Clients   1 Flt/sec/cpu  497620.28 (  0.00%) 520294.53 (  4.56%)
    Clients   2 Flt/sec/cpu  414639.05 (  0.00%) 429882.01 (  3.68%)
    Clients   4 Flt/sec/cpu  257959.16 (  0.00%) 258761.48 (  0.31%)
    Clients   1 Flt/sec      495161.39 (  0.00%) 517292.87 (  4.47%)
    Clients   2 Flt/sec      820325.95 (  0.00%) 850289.77 (  3.65%)
    Clients   4 Flt/sec      1020068.93 (  0.00%) 1022674.06 (  0.26%)
    MMTests Statistics: duration
    Sys Time Running Test (seconds)             135.68    132.17
    User+Sys Time Running Test (seconds)         164.2    160.13
    Total Elapsed Time (seconds)                123.46    120.87

The overall improvement is small but the System CPU time is much
improved and roughly in correlation to what oprofile reported (these
performance figures are without profiling so skew is expected).  The
actual number of page faults is noticeably improved.

For benchmarks like kernel builds, the overall benefit is marginal but
the system CPU time is slightly reduced.

To test the actual bug the commit fixed I opened two terminals.  The
first ran within a cpuset and continually ran a small program that
faulted 100M of anonymous data.  In a second window, the nodemask of the
cpuset was continually randomised in a loop.

Without the commit, the program would fail every so often (usually
within 10 seconds) and obviously with the commit everything worked fine.
With this patch applied, it also worked fine so the fix should be
functionally equivalent.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/filemap.c    | 11 +++++++----
 mm/hugetlb.c    | 15 +++++++++++----
 mm/mempolicy.c  | 28 +++++++++++++++++++++-------
 mm/page_alloc.c | 33 +++++++++++++++++++++++----------
 mm/slab.c       | 13 ++++++++-----
 mm/slub.c       | 40 +++++++++++++++++++++++++---------------
 mm/vmscan.c     |  2 --
 7 files changed, 95 insertions(+), 47 deletions(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index b7d8603..10481eb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -516,10 +516,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
 	struct page *page;
 
 	if (cpuset_do_page_mem_spread()) {
-		get_mems_allowed();
-		n = cpuset_mem_spread_node();
-		page = alloc_pages_exact_node(n, gfp, 0);
-		put_mems_allowed();
+		unsigned int cpuset_mems_cookie;
+		do {
+			cpuset_mems_cookie = get_mems_allowed();
+			n = cpuset_mem_spread_node();
+			page = alloc_pages_exact_node(n, gfp, 0);
+		} while (!put_mems_allowed(cpuset_mems_cookie) && !page);
+
 		return page;
 	}
 	return alloc_pages(gfp, 0);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 05f8fd4..64f2b7a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -454,14 +454,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 				struct vm_area_struct *vma,
 				unsigned long address, int avoid_reserve)
 {
-	struct page *page = NULL;
+	struct page *page;
 	struct mempolicy *mpol;
 	nodemask_t *nodemask;
 	struct zonelist *zonelist;
 	struct zone *zone;
 	struct zoneref *z;
+	unsigned int cpuset_mems_cookie;
 
-	get_mems_allowed();
+retry_cpuset:
+	cpuset_mems_cookie = get_mems_allowed();
 	zonelist = huge_zonelist(vma, address,
 					htlb_alloc_mask, &mpol, &nodemask);
 	/*
@@ -488,10 +490,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 			}
 		}
 	}
-err:
+
 	mpol_cond_put(mpol);
-	put_mems_allowed();
+	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+		goto retry_cpuset;
 	return page;
+
+err:
+	mpol_cond_put(mpol);
+	return NULL;
 }
 
 static void update_and_free_page(struct hstate *h, struct page *page)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index dd5f874..cff919f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1810,18 +1810,24 @@ struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		unsigned long addr, int node)
 {
-	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+	struct mempolicy *pol;
 	struct zonelist *zl;
 	struct page *page;
+	unsigned int cpuset_mems_cookie;
+
+retry_cpuset:
+	pol = get_vma_policy(current, vma, addr);
+	cpuset_mems_cookie = get_mems_allowed();
 
-	get_mems_allowed();
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
 		mpol_cond_put(pol);
 		page = alloc_page_interleave(gfp, order, nid);
-		put_mems_allowed();
+		if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+			goto retry_cpuset;
+
 		return page;
 	}
 	zl = policy_zonelist(gfp, pol, node);
@@ -1832,7 +1838,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		struct page *page =  __alloc_pages_nodemask(gfp, order,
 						zl, policy_nodemask(gfp, pol));
 		__mpol_put(pol);
-		put_mems_allowed();
+		if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+			goto retry_cpuset;
 		return page;
 	}
 	/*
@@ -1840,7 +1847,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 	 */
 	page = __alloc_pages_nodemask(gfp, order, zl,
 				      policy_nodemask(gfp, pol));
-	put_mems_allowed();
+	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+		goto retry_cpuset;
 	return page;
 }
 
@@ -1867,11 +1875,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 {
 	struct mempolicy *pol = current->mempolicy;
 	struct page *page;
+	unsigned int cpuset_mems_cookie;
 
 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
 		pol = &default_policy;
 
-	get_mems_allowed();
+retry_cpuset:
+	cpuset_mems_cookie = get_mems_allowed();
+
 	/*
 	 * No reference counting needed for current->mempolicy
 	 * nor system default_policy
@@ -1882,7 +1893,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 		page = __alloc_pages_nodemask(gfp, order,
 				policy_zonelist(gfp, pol, numa_node_id()),
 				policy_nodemask(gfp, pol));
-	put_mems_allowed();
+
+	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+		goto retry_cpuset;
+
 	return page;
 }
 EXPORT_SYMBOL(alloc_pages_current);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0d490ba..9177aa3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2293,8 +2293,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	struct zone *preferred_zone;
-	struct page *page;
+	struct page *page = NULL;
 	int migratetype = allocflags_to_migratetype(gfp_mask);
+	unsigned int cpuset_mems_cookie;
 
 	gfp_mask &= gfp_allowed_mask;
 
@@ -2313,15 +2314,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (unlikely(!zonelist->_zonerefs->zone))
 		return NULL;
 
-	get_mems_allowed();
+retry_cpuset:
+	cpuset_mems_cookie = get_mems_allowed();
+
 	/* The preferred zone is used for statistics later */
 	first_zones_zonelist(zonelist, high_zoneidx,
 				nodemask ? : &cpuset_current_mems_allowed,
 				&preferred_zone);
-	if (!preferred_zone) {
-		put_mems_allowed();
-		return NULL;
-	}
+	if (!preferred_zone)
+		goto out;
 
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2331,9 +2332,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 		page = __alloc_pages_slowpath(gfp_mask, order,
 				zonelist, high_zoneidx, nodemask,
 				preferred_zone, migratetype);
-	put_mems_allowed();
 
 	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+
+out:
+	/*
+	 * When updating a task's mems_allowed, it is possible to race with
+	 * parallel threads in such a way that an allocation can fail while
+	 * the mask is being updated. If a page allocation is about to fail,
+	 * check if the cpuset changed during allocation and if so, retry.
+	 */
+	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+		goto retry_cpuset;
+
 	return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2557,13 +2568,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 bool skip_free_areas_node(unsigned int flags, int nid)
 {
 	bool ret = false;
+	unsigned int cpuset_mems_cookie;
 
 	if (!(flags & SHOW_MEM_FILTER_NODES))
 		goto out;
 
-	get_mems_allowed();
-	ret = !node_isset(nid, cpuset_current_mems_allowed);
-	put_mems_allowed();
+	do {
+		cpuset_mems_cookie = get_mems_allowed();
+		ret = !node_isset(nid, cpuset_current_mems_allowed);
+	} while (!put_mems_allowed(cpuset_mems_cookie));
 out:
 	return ret;
 }
diff --git a/mm/slab.c b/mm/slab.c
index d96e223..a67f812 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3218,12 +3218,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
 	if (in_interrupt() || (flags & __GFP_THISNODE))
 		return NULL;
 	nid_alloc = nid_here = numa_mem_id();
-	get_mems_allowed();
 	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
 		nid_alloc = cpuset_slab_spread_node();
 	else if (current->mempolicy)
 		nid_alloc = slab_node(current->mempolicy);
-	put_mems_allowed();
 	if (nid_alloc != nid_here)
 		return ____cache_alloc_node(cachep, flags, nid_alloc);
 	return NULL;
@@ -3246,14 +3244,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 	enum zone_type high_zoneidx = gfp_zone(flags);
 	void *obj = NULL;
 	int nid;
+	unsigned int cpuset_mems_cookie;
 
 	if (flags & __GFP_THISNODE)
 		return NULL;
 
-	get_mems_allowed();
-	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
 	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
 
+retry_cpuset:
+	cpuset_mems_cookie = get_mems_allowed();
+	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+
 retry:
 	/*
 	 * Look through allowed nodes for objects available
@@ -3306,7 +3307,9 @@ retry:
 			}
 		}
 	}
-	put_mems_allowed();
+
+	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
+		goto retry_cpuset;
 	return obj;
 }
 
diff --git a/mm/slub.c b/mm/slub.c
index 10ab233..ae6e80e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1457,6 +1457,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
 	struct zone *zone;
 	enum zone_type high_zoneidx = gfp_zone(flags);
 	struct page *page;
+	unsigned int cpuset_mems_cookie;
 
 	/*
 	 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1480,23 +1481,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
 			get_cycles() % 1024 > s->remote_node_defrag_ratio)
 		return NULL;
 
-	get_mems_allowed();
-	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-		struct kmem_cache_node *n;
-
-		n = get_node(s, zone_to_nid(zone));
-
-		if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
-				n->nr_partial > s->min_partial) {
-			page = get_partial_node(n);
-			if (page) {
-				put_mems_allowed();
-				return page;
+	do {
+		cpuset_mems_cookie = get_mems_allowed();
+		zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+			struct kmem_cache_node *n;
+
+			n = get_node(s, zone_to_nid(zone));
+
+			if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
+					n->nr_partial > s->min_partial) {
+				page = get_partial_node(n);
+				if (page) {
+					/*
+					 * Return the object even if
+					 * put_mems_allowed indicated that
+					 * the cpuset mems_allowed was
+					 * updated in parallel. It's a
+					 * harmless race between the alloc
+					 * and the cpuset update.
+					 */
+					put_mems_allowed(cpuset_mems_cookie);
+					return page;
+				}
 			}
 		}
-	}
-	put_mems_allowed();
+	} while (!put_mems_allowed(cpuset_mems_cookie));
 #endif
 	return NULL;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6697b7a..1378487 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2251,7 +2251,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	unsigned long writeback_threshold;
 	bool aborted_reclaim;
 
-	get_mems_allowed();
 	delayacct_freepages_start();
 
 	if (scanning_global_lru(sc))
@@ -2314,7 +2313,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
 out:
 	delayacct_freepages_end();
-	put_mems_allowed();
 
 	if (sc->nr_reclaimed)
 		return sc->nr_reclaimed;
-- 
cgit v1.1


From ad04b9e911d7bc1edaa599a85f44b5fc83f9e90e Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Wed, 25 Apr 2012 16:01:46 -0700
Subject: mm/hugetlb: fix warning in alloc_huge_page/dequeue_huge_page_vma

commit b1c12cbcd0a02527c180a862e8971e249d3b347d upstream.

Stable note: Not tracked in Bugzilla. [get|put]_mems_allowed() is extremely
	expensive and severely impacted page allocator performance. This
	is part of a series of patches that reduce page allocator overhead.

Fix a gcc warning (and bug?) introduced in cc9a6c877 ("cpuset: mm: reduce
large amounts of memory barrier related damage v3")

Local variable "page" can be uninitialized if the nodemask from vma policy
does not intersects with nodemask from cpuset.  Even if it doesn't happens
it is better to initialize this variable explicitly than to introduce
a kernel oops in a weird corner case.

mm/hugetlb.c: In function `alloc_huge_page':
mm/hugetlb.c:1135:5: warning: `page' may be used uninitialized in this function

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 64f2b7a..ae60a53 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -454,7 +454,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 				struct vm_area_struct *vma,
 				unsigned long address, int avoid_reserve)
 {
-	struct page *page;
+	struct page *page = NULL;
 	struct mempolicy *mpol;
 	nodemask_t *nodemask;
 	struct zonelist *zonelist;
-- 
cgit v1.1


From 909e0a4e5c3a9d3b60c1eecb34de75afa64ade95 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Thu, 8 Dec 2011 14:33:51 -0800
Subject: vmscan: fix initial shrinker size handling

commit 635697c663f38106063d5659f0cf2e45afcd4bb5 upstream.

Stable note: The commit [acf92b48: vmscan: shrinker->nr updates race and
	go wrong] aimed to reduce excessive reclaim of slab objects but
	had bug in how it treated shrinker functions that returned -1.

A shrinker function can return -1, means that it cannot do anything
without a risk of deadlock.  For example prune_super() does this if it
cannot grab a superblock refrence, even if nr_to_scan=0.  Currently we
interpret this -1 as a ULONG_MAX size shrinker and evaluate `total_scan'
according to this.  So the next time around this shrinker can cause
really big pressure.  Let's skip such shrinkers instead.

Also make total_scan signed, otherwise the check (total_scan < 0) below
never works.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1378487..5326f98 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -248,12 +248,16 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 
 	list_for_each_entry(shrinker, &shrinker_list, list) {
 		unsigned long long delta;
-		unsigned long total_scan;
-		unsigned long max_pass;
+		long total_scan;
+		long max_pass;
 		int shrink_ret = 0;
 		long nr;
 		long new_nr;
 
+		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+		if (max_pass <= 0)
+			continue;
+
 		/*
 		 * copy the current shrinker scan count into a local variable
 		 * and zero it so that other concurrent shrinker invocations
@@ -264,7 +268,6 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		} while (cmpxchg(&shrinker->nr, nr, 0) != nr);
 
 		total_scan = nr;
-		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
 		delta = (4 * nr_pages_scanned) / shrinker->seeks;
 		delta *= max_pass;
 		do_div(delta, lru_pages + 1);
-- 
cgit v1.1


From cad33da5ceedac56481dd3168e42580e9bec6343 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <js1304@gmail.com>
Date: Mon, 30 Jul 2012 14:39:04 -0700
Subject: mm: fix wrong argument of migrate_huge_pages() in
 soft_offline_huge_page()

commit dc32f63453f56d07a1073a697dcd843dd3098c09 upstream.

Commit a6bc32b89922 ("mm: compaction: introduce sync-light migration for
use by compaction") changed the declaration of migrate_pages() and
migrate_huge_pages().

But it missed changing the argument of migrate_huge_pages() in
soft_offline_huge_page().  In this case, we should call
migrate_huge_pages() with MIGRATE_SYNC.

Additionally, there is a mismatch between type the of argument and the
function declaration for migrate_pages().

Signed-off-by: Joonsoo Kim <js1304@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Mel Gorman <mgorman@suse.de>
Acked-by: David Rientjes <rientjes@google.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/memory-failure.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6496748..2f49dcf 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1334,8 +1334,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
 	/* Keep page count to indicate a given hugepage is isolated. */
 
 	list_add(&hpage->lru, &pagelist);
-	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
-				true);
+	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false,
+				MIGRATE_SYNC);
 	if (ret) {
 		struct page *page1, *page2;
 		list_for_each_entry_safe(page1, page2, &pagelist, lru)
@@ -1464,7 +1464,7 @@ int soft_offline_page(struct page *page, int flags)
 					    page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
 		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
-							0, MIGRATE_SYNC);
+							false, MIGRATE_SYNC);
 		if (ret) {
 			putback_lru_pages(&pagelist);
 			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
-- 
cgit v1.1


From 8bda26e33846b53e2c70a2ccff13e3f5b69ab067 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Tue, 31 Jul 2012 16:45:52 -0700
Subject: mm: mmu_notifier: fix freed page still mapped in secondary MMU

commit 3ad3d901bbcfb15a5e4690e55350db0899095a68 upstream.

mmu_notifier_release() is called when the process is exiting.  It will
delete all the mmu notifiers.  But at this time the page belonging to the
process is still present in page tables and is present on the LRU list, so
this race will happen:

      CPU 0                 CPU 1
mmu_notifier_release:    try_to_unmap:
   hlist_del_init_rcu(&mn->hlist);
                            ptep_clear_flush_notify:
                                  mmu nofifler not found
                            free page  !!!!!!
                            /*
                             * At the point, the page has been
                             * freed, but it is still mapped in
                             * the secondary MMU.
                             */

  mn->ops->release(mn, mm);

Then the box is not stable and sometimes we can get this bug:

[  738.075923] BUG: Bad page state in process migrate-perf  pfn:03bec
[  738.075931] page:ffffea00000efb00 count:0 mapcount:0 mapping:          (null) index:0x8076
[  738.075936] page flags: 0x20000000000014(referenced|dirty)

The same issue is present in mmu_notifier_unregister().

We can call ->release before deleting the notifier to ensure the page has
been unmapped from the secondary MMU before it is freed.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/mmu_notifier.c | 45 +++++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

(limited to 'mm')

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8d032de..71c7811 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
 void __mmu_notifier_release(struct mm_struct *mm)
 {
 	struct mmu_notifier *mn;
+	struct hlist_node *n;
+
+	/*
+	 * RCU here will block mmu_notifier_unregister until
+	 * ->release returns.
+	 */
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
+		/*
+		 * if ->release runs before mmu_notifier_unregister it
+		 * must be handled as it's the only way for the driver
+		 * to flush all existing sptes and stop the driver
+		 * from establishing any more sptes before all the
+		 * pages in the mm are freed.
+		 */
+		if (mn->ops->release)
+			mn->ops->release(mn, mm);
+	rcu_read_unlock();
 
 	spin_lock(&mm->mmu_notifier_mm->lock);
 	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
 		 * mmu_notifier_unregister to return.
 		 */
 		hlist_del_init_rcu(&mn->hlist);
-		/*
-		 * RCU here will block mmu_notifier_unregister until
-		 * ->release returns.
-		 */
-		rcu_read_lock();
-		spin_unlock(&mm->mmu_notifier_mm->lock);
-		/*
-		 * if ->release runs before mmu_notifier_unregister it
-		 * must be handled as it's the only way for the driver
-		 * to flush all existing sptes and stop the driver
-		 * from establishing any more sptes before all the
-		 * pages in the mm are freed.
-		 */
-		if (mn->ops->release)
-			mn->ops->release(mn, mm);
-		rcu_read_unlock();
-		spin_lock(&mm->mmu_notifier_mm->lock);
 	}
 	spin_unlock(&mm->mmu_notifier_mm->lock);
 
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 {
 	BUG_ON(atomic_read(&mm->mm_count) <= 0);
 
-	spin_lock(&mm->mmu_notifier_mm->lock);
 	if (!hlist_unhashed(&mn->hlist)) {
-		hlist_del_rcu(&mn->hlist);
-
 		/*
 		 * RCU here will force exit_mmap to wait ->release to finish
 		 * before freeing the pages.
 		 */
 		rcu_read_lock();
-		spin_unlock(&mm->mmu_notifier_mm->lock);
+
 		/*
 		 * exit_mmap will block in mmu_notifier_release to
 		 * guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 		if (mn->ops->release)
 			mn->ops->release(mn, mm);
 		rcu_read_unlock();
-	} else
+
+		spin_lock(&mm->mmu_notifier_mm->lock);
+		hlist_del_rcu(&mn->hlist);
 		spin_unlock(&mm->mmu_notifier_mm->lock);
+	}
 
 	/*
 	 * Wait any running method to finish, of course including
-- 
cgit v1.1


From 4c9682c5269e3c63ef9009ad20ea4e150370b7e0 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 31 Jul 2012 16:46:20 -0700
Subject: mm: hugetlbfs: close race during teardown of hugetlbfs shared page
 tables

commit d833352a4338dc31295ed832a30c9ccff5c7a183 upstream.

If a process creates a large hugetlbfs mapping that is eligible for page
table sharing and forks heavily with children some of whom fault and
others which destroy the mapping then it is possible for page tables to
get corrupted.  Some teardowns of the mapping encounter a "bad pmd" and
output a message to the kernel log.  The final teardown will trigger a
BUG_ON in mm/filemap.c.

This was reproduced in 3.4 but is known to have existed for a long time
and goes back at least as far as 2.6.37.  It was probably was introduced
in 2.6.20 by [39dde65c: shared page table for hugetlb page].  The messages
look like this;

[  ..........] Lots of bad pmd messages followed by this
[  127.164256] mm/memory.c:391: bad pmd ffff880412e04fe8(80000003de4000e7).
[  127.164257] mm/memory.c:391: bad pmd ffff880412e04ff0(80000003de6000e7).
[  127.164258] mm/memory.c:391: bad pmd ffff880412e04ff8(80000003de0000e7).
[  127.186778] ------------[ cut here ]------------
[  127.186781] kernel BUG at mm/filemap.c:134!
[  127.186782] invalid opcode: 0000 [#1] SMP
[  127.186783] CPU 7
[  127.186784] Modules linked in: af_packet cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf ext3 jbd dm_mod coretemp crc32c_intel usb_storage ghash_clmulni_intel aesni_intel i2c_i801 r8169 mii uas sr_mod cdrom sg iTCO_wdt iTCO_vendor_support shpchp serio_raw cryptd aes_x86_64 e1000e pci_hotplug dcdbas aes_generic container microcode ext4 mbcache jbd2 crc16 sd_mod crc_t10dif i915 drm_kms_helper drm i2c_algo_bit ehci_hcd ahci libahci usbcore rtc_cmos usb_common button i2c_core intel_agp video intel_gtt fan processor thermal thermal_sys hwmon ata_generic pata_atiixp libata scsi_mod
[  127.186801]
[  127.186802] Pid: 9017, comm: hugetlbfs-test Not tainted 3.4.0-autobuild #53 Dell Inc. OptiPlex 990/06D7TR
[  127.186804] RIP: 0010:[<ffffffff810ed6ce>]  [<ffffffff810ed6ce>] __delete_from_page_cache+0x15e/0x160
[  127.186809] RSP: 0000:ffff8804144b5c08  EFLAGS: 00010002
[  127.186810] RAX: 0000000000000001 RBX: ffffea000a5c9000 RCX: 00000000ffffffc0
[  127.186811] RDX: 0000000000000000 RSI: 0000000000000009 RDI: ffff88042dfdad00
[  127.186812] RBP: ffff8804144b5c18 R08: 0000000000000009 R09: 0000000000000003
[  127.186813] R10: 0000000000000000 R11: 000000000000002d R12: ffff880412ff83d8
[  127.186814] R13: ffff880412ff83d8 R14: 0000000000000000 R15: ffff880412ff83d8
[  127.186815] FS:  00007fe18ed2c700(0000) GS:ffff88042dce0000(0000) knlGS:0000000000000000
[  127.186816] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[  127.186817] CR2: 00007fe340000503 CR3: 0000000417a14000 CR4: 00000000000407e0
[  127.186818] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  127.186819] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  127.186820] Process hugetlbfs-test (pid: 9017, threadinfo ffff8804144b4000, task ffff880417f803c0)
[  127.186821] Stack:
[  127.186822]  ffffea000a5c9000 0000000000000000 ffff8804144b5c48 ffffffff810ed83b
[  127.186824]  ffff8804144b5c48 000000000000138a 0000000000001387 ffff8804144b5c98
[  127.186825]  ffff8804144b5d48 ffffffff811bc925 ffff8804144b5cb8 0000000000000000
[  127.186827] Call Trace:
[  127.186829]  [<ffffffff810ed83b>] delete_from_page_cache+0x3b/0x80
[  127.186832]  [<ffffffff811bc925>] truncate_hugepages+0x115/0x220
[  127.186834]  [<ffffffff811bca43>] hugetlbfs_evict_inode+0x13/0x30
[  127.186837]  [<ffffffff811655c7>] evict+0xa7/0x1b0
[  127.186839]  [<ffffffff811657a3>] iput_final+0xd3/0x1f0
[  127.186840]  [<ffffffff811658f9>] iput+0x39/0x50
[  127.186842]  [<ffffffff81162708>] d_kill+0xf8/0x130
[  127.186843]  [<ffffffff81162812>] dput+0xd2/0x1a0
[  127.186845]  [<ffffffff8114e2d0>] __fput+0x170/0x230
[  127.186848]  [<ffffffff81236e0e>] ? rb_erase+0xce/0x150
[  127.186849]  [<ffffffff8114e3ad>] fput+0x1d/0x30
[  127.186851]  [<ffffffff81117db7>] remove_vma+0x37/0x80
[  127.186853]  [<ffffffff81119182>] do_munmap+0x2d2/0x360
[  127.186855]  [<ffffffff811cc639>] sys_shmdt+0xc9/0x170
[  127.186857]  [<ffffffff81410a39>] system_call_fastpath+0x16/0x1b
[  127.186858] Code: 0f 1f 44 00 00 48 8b 43 08 48 8b 00 48 8b 40 28 8b b0 40 03 00 00 85 f6 0f 88 df fe ff ff 48 89 df e8 e7 cb 05 00 e9 d2 fe ff ff <0f> 0b 55 83 e2 fd 48 89 e5 48 83 ec 30 48 89 5d d8 4c 89 65 e0
[  127.186868] RIP  [<ffffffff810ed6ce>] __delete_from_page_cache+0x15e/0x160
[  127.186870]  RSP <ffff8804144b5c08>
[  127.186871] ---[ end trace 7cbac5d1db69f426 ]---

The bug is a race and not always easy to reproduce.  To reproduce it I was
doing the following on a single socket I7-based machine with 16G of RAM.

$ hugeadm --pool-pages-max DEFAULT:13G
$ echo $((18*1048576*1024)) > /proc/sys/kernel/shmmax
$ echo $((18*1048576*1024)) > /proc/sys/kernel/shmall
$ for i in `seq 1 9000`; do ./hugetlbfs-test; done

On my particular machine, it usually triggers within 10 minutes but
enabling debug options can change the timing such that it never hits.
Once the bug is triggered, the machine is in trouble and needs to be
rebooted.  The machine will respond but processes accessing proc like "ps
aux" will hang due to the BUG_ON.  shutdown will also hang and needs a
hard reset or a sysrq-b.

The basic problem is a race between page table sharing and teardown.  For
the most part page table sharing depends on i_mmap_mutex.  In some cases,
it is also taking the mm->page_table_lock for the PTE updates but with
shared page tables, it is the i_mmap_mutex that is more important.

Unfortunately it appears to be also insufficient. Consider the following
situation

Process A					Process B
---------					---------
hugetlb_fault					shmdt
  						LockWrite(mmap_sem)
    						  do_munmap
						    unmap_region
						      unmap_vmas
						        unmap_single_vma
						          unmap_hugepage_range
      						            Lock(i_mmap_mutex)
							    Lock(mm->page_table_lock)
							    huge_pmd_unshare/unmap tables <--- (1)
							    Unlock(mm->page_table_lock)
      						            Unlock(i_mmap_mutex)
  huge_pte_alloc				      ...
    Lock(i_mmap_mutex)				      ...
    vma_prio_walk, find svma, spte		      ...
    Lock(mm->page_table_lock)			      ...
    share spte					      ...
    Unlock(mm->page_table_lock)			      ...
    Unlock(i_mmap_mutex)			      ...
  hugetlb_no_page									  <--- (2)
						      free_pgtables
						        unlink_file_vma
							hugetlb_free_pgd_range
						    remove_vma_list

In this scenario, it is possible for Process A to share page tables with
Process B that is trying to tear them down.  The i_mmap_mutex on its own
does not prevent Process A walking Process B's page tables.  At (1) above,
the page tables are not shared yet so it unmaps the PMDs.  Process A sets
up page table sharing and at (2) faults a new entry.  Process B then trips
up on it in free_pgtables.

This patch fixes the problem by adding a new function
__unmap_hugepage_range_final that is only called when the VMA is about to
be destroyed.  This function clears VM_MAYSHARE during
unmap_hugepage_range() under the i_mmap_mutex.  This makes the VMA
ineligible for sharing and avoids the race.  Superficially this looks like
it would then be vunerable to truncate and madvise issues but hugetlbfs
has its own truncate handlers so does not use unmap_mapping_range() and
does not support madvise(DONTNEED).

This should be treated as a -stable candidate if it is merged.

Test program is as follows. The test case was mostly written by Michal
Hocko with a few minor changes to reproduce this bug.

==== CUT HERE ====

static size_t huge_page_size = (2UL << 20);
static size_t nr_huge_page_A = 512;
static size_t nr_huge_page_B = 5632;

unsigned int get_random(unsigned int max)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	srandom(tv.tv_usec);
	return random() % max;
}

static void play(void *addr, size_t size)
{
	unsigned char *start = addr,
		      *end = start + size,
		      *a;
	start += get_random(size/2);

	/* we could itterate on huge pages but let's give it more time. */
	for (a = start; a < end; a += 4096)
		*a = 0;
}

int main(int argc, char **argv)
{
	key_t key = IPC_PRIVATE;
	size_t sizeA = nr_huge_page_A * huge_page_size;
	size_t sizeB = nr_huge_page_B * huge_page_size;
	int shmidA, shmidB;
	void *addrA = NULL, *addrB = NULL;
	int nr_children = 300, n = 0;

	if ((shmidA = shmget(key, sizeA, IPC_CREAT|SHM_HUGETLB|0660)) == -1) {
		perror("shmget:");
		return 1;
	}

	if ((addrA = shmat(shmidA, addrA, SHM_R|SHM_W)) == (void *)-1UL) {
		perror("shmat");
		return 1;
	}
	if ((shmidB = shmget(key, sizeB, IPC_CREAT|SHM_HUGETLB|0660)) == -1) {
		perror("shmget:");
		return 1;
	}

	if ((addrB = shmat(shmidB, addrB, SHM_R|SHM_W)) == (void *)-1UL) {
		perror("shmat");
		return 1;
	}

fork_child:
	switch(fork()) {
		case 0:
			switch (n%3) {
			case 0:
				play(addrA, sizeA);
				break;
			case 1:
				play(addrB, sizeB);
				break;
			case 2:
				break;
			}
			break;
		case -1:
			perror("fork:");
			break;
		default:
			if (++n < nr_children)
				goto fork_child;
			play(addrA, sizeA);
			break;
	}
	shmdt(addrA);
	shmdt(addrB);
	do {
		wait(NULL);
	} while (--n > 0);
	shmctl(shmidA, IPC_RMID, NULL);
	shmctl(shmidB, IPC_RMID, NULL);
	return 0;
}

[akpm@linux-foundation.org: name the declaration's args, fix CONFIG_HUGETLBFS=n build]
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/hugetlb.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ae60a53..037f077 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2301,6 +2301,22 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 {
 	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
 	__unmap_hugepage_range(vma, start, end, ref_page);
+	/*
+	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
+	 * test will fail on a vma being torn down, and not grab a page table
+	 * on its way out.  We're lucky that the flag has such an appropriate
+	 * name, and can in fact be safely cleared here. We could clear it
+	 * before the __unmap_hugepage_range above, but all that's necessary
+	 * is to clear it before releasing the i_mmap_mutex below.
+	 *
+	 * This works because in the contexts this is called, the VMA is
+	 * going to be destroyed. It is not vunerable to madvise(DONTNEED)
+	 * because madvise is not supported on hugetlbfs. The same applies
+	 * for direct IO. unmap_hugepage_range() is only being called just
+	 * before free_pgtables() so clearing VM_MAYSHARE will not cause
+	 * surprises later.
+	 */
+	vma->vm_flags &= ~VM_MAYSHARE;
 	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
@@ -2853,9 +2869,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+	/*
+	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * may have cleared our pud entry and done put_page on the page table:
+	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * and that page table be reused and filled with junk.
+	 */
 	flush_tlb_range(vma, start, end);
+	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
 int hugetlb_reserve_pages(struct inode *inode,
-- 
cgit v1.1


From 3ee8d648935cc63a611c454103419f2100d45c56 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 6 Sep 2012 12:01:00 -0400
Subject: Remove user-triggerable BUG from mpol_to_str

commit 80de7c3138ee9fd86a98696fd2cf7ad89b995d0a upstream.

Trivially triggerable, found by trinity:

  kernel BUG at mm/mempolicy.c:2546!
  Process trinity-child2 (pid: 23988, threadinfo ffff88010197e000, task ffff88007821a670)
  Call Trace:
    show_numa_map+0xd5/0x450
    show_pid_numa_map+0x13/0x20
    traverse+0xf2/0x230
    seq_read+0x34b/0x3e0
    vfs_read+0xac/0x180
    sys_pread64+0xa2/0xc0
    system_call_fastpath+0x1a/0x1f
  RIP: mpol_to_str+0x156/0x360

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/mempolicy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cff919f..3f3cc56 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2500,7 +2500,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
 		break;
 
 	default:
-		BUG();
+		return -EINVAL;
 	}
 
 	l = strlen(policy_modes[mode]);
-- 
cgit v1.1


From 54fd21ca117a5dd36a1bf7e0b92bef2c89c2201e Mon Sep 17 00:00:00 2001
From: Li Haifeng <omycle@gmail.com>
Date: Mon, 17 Sep 2012 14:09:21 -0700
Subject: mm/page_alloc: fix the page address of higher page's buddy
 calculation

commit 0ba8f2d59304dfe69b59c034de723ad80f7ab9ac upstream.

The heuristic method for buddy has been introduced since commit
43506fad21ca ("mm/page_alloc.c: simplify calculation of combined index
of adjacent buddy lists").  But the page address of higher page's buddy
was wrongly calculated, which will lead page_is_buddy to fail for ever.
IOW, the heuristic method would be disabled with the wrong page address
of higher page's buddy.

Calculating the page address of higher page's buddy should be based
higher_page with the offset between index of higher page and index of
higher page's buddy.

Signed-off-by: Haifeng Li <omycle@gmail.com>
Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: KyongHo Cho <pullip.cho@samsung.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9177aa3..eb6b3fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -540,7 +540,7 @@ static inline void __free_one_page(struct page *page,
 		combined_idx = buddy_idx & page_idx;
 		higher_page = page + (combined_idx - page_idx);
 		buddy_idx = __find_buddy_index(combined_idx, order + 1);
-		higher_buddy = page + (buddy_idx - combined_idx);
+		higher_buddy = higher_page + (buddy_idx - combined_idx);
 		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
 			list_add_tail(&page->lru,
 				&zone->free_area[order].free_list[migratetype]);
-- 
cgit v1.1


From 2aab4667576b39a1cba50b0322da6d21bbdfc9dd Mon Sep 17 00:00:00 2001
From: qiuxishi <qiuxishi@gmail.com>
Date: Mon, 17 Sep 2012 14:09:24 -0700
Subject: memory hotplug: fix section info double registration bug

commit f14851af0ebb32745c6c5a2e400aa0549f9d20df upstream.

There may be a bug when registering section info.  For example, on my
Itanium platform, the pfn range of node0 includes the other nodes, so
other nodes' section info will be double registered, and memmap's page
count will equal to 3.

  node0: start_pfn=0x100,    spanned_pfn=0x20fb00, present_pfn=0x7f8a3, => 0x000100-0x20fc00
  node1: start_pfn=0x80000,  spanned_pfn=0x80000,  present_pfn=0x80000, => 0x080000-0x100000
  node2: start_pfn=0x100000, spanned_pfn=0x80000,  present_pfn=0x80000, => 0x100000-0x180000
  node3: start_pfn=0x180000, spanned_pfn=0x80000,  present_pfn=0x80000, => 0x180000-0x200000

  free_all_bootmem_node()
	register_page_bootmem_info_node()
		register_page_bootmem_info_section()

When hot remove memory, we can't free the memmap's page because
page_count() is 2 after put_page_bootmem().

  sparse_remove_one_section()
	free_section_usemap()
		free_map_bootmem()
			put_page_bootmem()

[akpm@linux-foundation.org: add code comment]
Signed-off-by: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/memory_hotplug.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ae5a3f2..e0a3e51 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -116,9 +116,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 	struct mem_section *ms;
 	struct page *page, *memmap;
 
-	if (!pfn_valid(start_pfn))
-		return;
-
 	section_nr = pfn_to_section_nr(start_pfn);
 	ms = __nr_to_section(section_nr);
 
@@ -177,9 +174,16 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 	end_pfn = pfn + pgdat->node_spanned_pages;
 
 	/* register_section info */
-	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
-		register_page_bootmem_info_section(pfn);
-
+	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		/*
+		 * Some platforms can assign the same pfn to multiple nodes - on
+		 * node0 as well as nodeN.  To avoid registering a pfn against
+		 * multiple nodes we check that this pfn does not already
+		 * reside in some other node.
+		 */
+		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
+			register_page_bootmem_info_section(pfn);
+	}
 }
 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
-- 
cgit v1.1


From 49996738e9a7a8d0192c80d210b3a08853cd1f6c Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 8 Oct 2012 16:33:14 -0700
Subject: mm: fix invalidate_complete_page2() lock ordering

commit ec4d9f626d5908b6052c2973f37992f1db52e967 upstream.

In fuzzing with trinity, lockdep protested "possible irq lock inversion
dependency detected" when isolate_lru_page() reenabled interrupts while
still holding the supposedly irq-safe tree_lock:

invalidate_inode_pages2
  invalidate_complete_page2
    spin_lock_irq(&mapping->tree_lock)
    clear_page_mlock
      isolate_lru_page
        spin_unlock_irq(&zone->lru_lock)

isolate_lru_page() is correct to enable interrupts unconditionally:
invalidate_complete_page2() is incorrect to call clear_page_mlock() while
holding tree_lock, which is supposed to nest inside lru_lock.

Both truncate_complete_page() and invalidate_complete_page() call
clear_page_mlock() before taking tree_lock to remove page from radix_tree.
 I guess invalidate_complete_page2() preferred to test PageDirty (again)
under tree_lock before committing to the munlock; but since the page has
already been unmapped, its state is already somewhat inconsistent, and no
worse if clear_page_mlock() moved up.

Reported-by: Sasha Levin <levinsasha928@gmail.com>
Deciphered-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/truncate.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/truncate.c b/mm/truncate.c
index e13f22e..3e9829f3 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -398,11 +398,12 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
 		return 0;
 
+	clear_page_mlock(page);
+
 	spin_lock_irq(&mapping->tree_lock);
 	if (PageDirty(page))
 		goto failed;
 
-	clear_page_mlock(page);
 	BUG_ON(page_has_private(page));
 	__delete_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
-- 
cgit v1.1


From bdd779425e01c7247230b23051b1ab2144f9226d Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Date: Mon, 8 Oct 2012 16:29:14 -0700
Subject: revert "mm: mempolicy: Let vma_merge and vma_split handle
 vma->vm_policy linkages"

commit 8d34694c1abf29df1f3c7317936b7e3e2e308d9b upstream.

Commit 05f144a0d5c2 ("mm: mempolicy: Let vma_merge and vma_split handle
vma->vm_policy linkages") removed vma->vm_policy updates code but it is
the purpose of mbind_range().  Now, mbind_range() is virtually a no-op
and while it does not allow memory corruption it is not the right fix.
This patch is a revert.

[mgorman@suse.de: Edited changelog]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: Josh Boyer <jwboyer@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/mempolicy.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3f3cc56..464b844 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -606,6 +606,27 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	return first;
 }
 
+/* Apply policy to a single VMA */
+static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
+{
+	int err = 0;
+	struct mempolicy *old = vma->vm_policy;
+
+	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
+		 vma->vm_ops, vma->vm_file,
+		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+
+	if (vma->vm_ops && vma->vm_ops->set_policy)
+		err = vma->vm_ops->set_policy(vma, new);
+	if (!err) {
+		mpol_get(new);
+		vma->vm_policy = new;
+		mpol_put(old);
+	}
+	return err;
+}
+
 /* Step 2: apply policy to a range and do splits. */
 static int mbind_range(struct mm_struct *mm, unsigned long start,
 		       unsigned long end, struct mempolicy *new_pol)
@@ -645,23 +666,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 			if (err)
 				goto out;
 		}
-
-		/*
-		 * Apply policy to a single VMA. The reference counting of
-		 * policy for vma_policy linkages has already been handled by
-		 * vma_merge and split_vma as necessary. If this is a shared
-		 * policy then ->set_policy will increment the reference count
-		 * for an sp node.
-		 */
-		pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
-			vma->vm_start, vma->vm_end, vma->vm_pgoff,
-			vma->vm_ops, vma->vm_file,
-			vma->vm_ops ? vma->vm_ops->set_policy : NULL);
-		if (vma->vm_ops && vma->vm_ops->set_policy) {
-			err = vma->vm_ops->set_policy(vma, new_pol);
-			if (err)
-				goto out;
-		}
+		err = policy_vma(vma, new_pol);
+		if (err)
+			goto out;
 	}
 
  out:
-- 
cgit v1.1


From e12681ffb14f5c3bcd25ace39b9fac3941ad6961 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Mon, 8 Oct 2012 16:29:16 -0700
Subject: mempolicy: remove mempolicy sharing

commit 869833f2c5c6e4dd09a5378cfc665ffb4615e5d2 upstream.

Dave Jones' system call fuzz testing tool "trinity" triggered the
following bug error with slab debugging enabled

    =============================================================================
    BUG numa_policy (Not tainted): Poison overwritten
    -----------------------------------------------------------------------------

    INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b
    INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154
     __slab_alloc+0x3d3/0x445
     kmem_cache_alloc+0x29d/0x2b0
     mpol_new+0xa3/0x140
     sys_mbind+0x142/0x620
     system_call_fastpath+0x16/0x1b

    INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154
     __slab_free+0x2e/0x1de
     kmem_cache_free+0x25a/0x260
     __mpol_put+0x27/0x30
     remove_vma+0x68/0x90
     exit_mmap+0x118/0x140
     mmput+0x73/0x110
     exit_mm+0x108/0x130
     do_exit+0x162/0xb90
     do_group_exit+0x4f/0xc0
     sys_exit_group+0x17/0x20
     system_call_fastpath+0x16/0x1b

    INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x          (null) flags=0x20000000004080
    INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0

The problem is that the structure is being prematurely freed due to a
reference count imbalance. In the following case mbind(addr, len) should
replace the memory policies of both vma1 and vma2 and thus they will
become to share the same mempolicy and the new mempolicy will have the
MPOL_F_SHARED flag.

  +-------------------+-------------------+
  |     vma1          |     vma2(shmem)   |
  +-------------------+-------------------+
  |                                       |
 addr                                 addr+len

alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for
maintaining the mempolicy reference count.  The current rule is that
get_vma_policy() only increments refcount for shmem VMA and
mpol_conf_put() only decrements refcount if the policy has
MPOL_F_SHARED.

In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED!
The reference count will be decreased even though was not increased
whenever alloc_page_vma() is called.  This has been broken since commit
[52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008.

There is another serious bug with the sharing of memory policies.
Currently, mempolicy rebind logic (it is called from cpuset rebinding)
ignores a refcount of mempolicy and override it forcibly.  Thus, any
mempolicy sharing may cause mempolicy corruption.  The bug was
introduced by commit [68860ec1: cpusets: automatic numa mempolicy
rebinding].

Ideally, the shared policy handling would be rewritten to either
properly handle COW of the policy structures or at least reference count
MPOL_F_SHARED based exclusively on information within the policy.
However, this patch takes the easier approach of disabling any policy
sharing between VMAs.  Each new range allocated with sp_alloc will
allocate a new policy, set the reference count to 1 and drop the
reference count of the old policy.  This increases the memory footprint
but is not expected to be a major problem as mbind() is unlikely to be
used for fine-grained ranges.  It is also inefficient because it means
we allocate a new policy even in cases where mbind_range() could use the
new_policy passed to it.  However, it is more straight-forward and the
change should be invisible to the user.

[mgorman@suse.de: Edited changelog]
Reported-by: Dave Jones <davej@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Josh Boyer <jwboyer@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/mempolicy.c | 52 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 464b844..52df0b5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -606,24 +606,39 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	return first;
 }
 
-/* Apply policy to a single VMA */
-static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
+/*
+ * Apply policy to a single VMA
+ * This must be called with the mmap_sem held for writing.
+ */
+static int vma_replace_policy(struct vm_area_struct *vma,
+						struct mempolicy *pol)
 {
-	int err = 0;
-	struct mempolicy *old = vma->vm_policy;
+	int err;
+	struct mempolicy *old;
+	struct mempolicy *new;
 
 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 		 vma->vm_ops, vma->vm_file,
 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 
-	if (vma->vm_ops && vma->vm_ops->set_policy)
+	new = mpol_dup(pol);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+
+	if (vma->vm_ops && vma->vm_ops->set_policy) {
 		err = vma->vm_ops->set_policy(vma, new);
-	if (!err) {
-		mpol_get(new);
-		vma->vm_policy = new;
-		mpol_put(old);
+		if (err)
+			goto err_out;
 	}
+
+	old = vma->vm_policy;
+	vma->vm_policy = new; /* protected by mmap_sem */
+	mpol_put(old);
+
+	return 0;
+ err_out:
+	mpol_put(new);
 	return err;
 }
 
@@ -666,7 +681,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 			if (err)
 				goto out;
 		}
-		err = policy_vma(vma, new_pol);
+		err = vma_replace_policy(vma, new_pol);
 		if (err)
 			goto out;
 	}
@@ -2091,15 +2106,24 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
 				struct mempolicy *pol)
 {
-	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+	struct sp_node *n;
+	struct mempolicy *newpol;
 
+	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
 	if (!n)
 		return NULL;
+
+	newpol = mpol_dup(pol);
+	if (IS_ERR(newpol)) {
+		kmem_cache_free(sn_cache, n);
+		return NULL;
+	}
+	newpol->flags |= MPOL_F_SHARED;
+
 	n->start = start;
 	n->end = end;
-	mpol_get(pol);
-	pol->flags |= MPOL_F_SHARED;	/* for unref */
-	n->policy = pol;
+	n->policy = newpol;
+
 	return n;
 }
 
-- 
cgit v1.1


From cedd186e31dacfb400ec74e0cdd59b02c3d55da8 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Oct 2012 16:29:17 -0700
Subject: mempolicy: fix a race in shared_policy_replace()

commit b22d127a39ddd10d93deee3d96e643657ad53a49 upstream.

shared_policy_replace() use of sp_alloc() is unsafe.  1) sp_node cannot
be dereferenced if sp->lock is not held and 2) another thread can modify
sp_node between spin_unlock for allocating a new sp node and next
spin_lock.  The bug was introduced before 2.6.12-rc2.

Kosaki's original patch for this problem was to allocate an sp node and
policy within shared_policy_replace and initialise it when the lock is
reacquired.  I was not keen on this approach because it partially
duplicates sp_alloc().  As the paths were sp->lock is taken are not that
performance critical this patch converts sp->lock to sp->mutex so it can
sleep when calling sp_alloc().

[kosaki.motohiro@jp.fujitsu.com: Original patch]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
Cc: Josh Boyer <jwboyer@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/mempolicy.c | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 52df0b5..a768692 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2021,7 +2021,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
  */
 
 /* lookup first element intersecting start-end */
-/* Caller holds sp->lock */
+/* Caller holds sp->mutex */
 static struct sp_node *
 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
 {
@@ -2085,13 +2085,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 
 	if (!sp->root.rb_node)
 		return NULL;
-	spin_lock(&sp->lock);
+	mutex_lock(&sp->mutex);
 	sn = sp_lookup(sp, idx, idx+1);
 	if (sn) {
 		mpol_get(sn->policy);
 		pol = sn->policy;
 	}
-	spin_unlock(&sp->lock);
+	mutex_unlock(&sp->mutex);
 	return pol;
 }
 
@@ -2131,10 +2131,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
 				 unsigned long end, struct sp_node *new)
 {
-	struct sp_node *n, *new2 = NULL;
+	struct sp_node *n;
+	int ret = 0;
 
-restart:
-	spin_lock(&sp->lock);
+	mutex_lock(&sp->mutex);
 	n = sp_lookup(sp, start, end);
 	/* Take care of old policies in the same range. */
 	while (n && n->start < end) {
@@ -2147,16 +2147,14 @@ restart:
 		} else {
 			/* Old policy spanning whole new range. */
 			if (n->end > end) {
+				struct sp_node *new2;
+				new2 = sp_alloc(end, n->end, n->policy);
 				if (!new2) {
-					spin_unlock(&sp->lock);
-					new2 = sp_alloc(end, n->end, n->policy);
-					if (!new2)
-						return -ENOMEM;
-					goto restart;
+					ret = -ENOMEM;
+					goto out;
 				}
 				n->end = start;
 				sp_insert(sp, new2);
-				new2 = NULL;
 				break;
 			} else
 				n->end = start;
@@ -2167,12 +2165,9 @@ restart:
 	}
 	if (new)
 		sp_insert(sp, new);
-	spin_unlock(&sp->lock);
-	if (new2) {
-		mpol_put(new2->policy);
-		kmem_cache_free(sn_cache, new2);
-	}
-	return 0;
+out:
+	mutex_unlock(&sp->mutex);
+	return ret;
 }
 
 /**
@@ -2190,7 +2185,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 	int ret;
 
 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
-	spin_lock_init(&sp->lock);
+	mutex_init(&sp->mutex);
 
 	if (mpol) {
 		struct vm_area_struct pvma;
@@ -2256,7 +2251,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
 
 	if (!p->root.rb_node)
 		return;
-	spin_lock(&p->lock);
+	mutex_lock(&p->mutex);
 	next = rb_first(&p->root);
 	while (next) {
 		n = rb_entry(next, struct sp_node, nd);
@@ -2265,7 +2260,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
 		mpol_put(n->policy);
 		kmem_cache_free(sn_cache, n);
 	}
-	spin_unlock(&p->lock);
+	mutex_unlock(&p->mutex);
 }
 
 /* assumes fs == KERNEL_DS */
-- 
cgit v1.1


From 29715fe22f6e7ea5d84c2872fd5dd2d407ed5083 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Mon, 8 Oct 2012 16:29:19 -0700
Subject: mempolicy: fix refcount leak in mpol_set_shared_policy()

commit 63f74ca21f1fad36d075e063f06dcc6d39fe86b2 upstream.

When shared_policy_replace() fails to allocate new->policy is not freed
correctly by mpol_set_shared_policy().  The problem is that shared
mempolicy code directly call kmem_cache_free() in multiple places where
it is easy to make a mistake.

This patch creates an sp_free wrapper function and uses it. The bug was
introduced pre-git age (IOW, before 2.6.12-rc2).

[mgorman@suse.de: Editted changelog]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Christoph Lameter <cl@linux.com>
Cc: Josh Boyer <jwboyer@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/mempolicy.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a768692..6a569cc 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2095,12 +2095,17 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 	return pol;
 }
 
+static void sp_free(struct sp_node *n)
+{
+	mpol_put(n->policy);
+	kmem_cache_free(sn_cache, n);
+}
+
 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 {
 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
 	rb_erase(&n->nd, &sp->root);
-	mpol_put(n->policy);
-	kmem_cache_free(sn_cache, n);
+	sp_free(n);
 }
 
 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
@@ -2239,7 +2244,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
 	}
 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
 	if (err && new)
-		kmem_cache_free(sn_cache, new);
+		sp_free(new);
 	return err;
 }
 
@@ -2256,9 +2261,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
 	while (next) {
 		n = rb_entry(next, struct sp_node, nd);
 		next = rb_next(&n->nd);
-		rb_erase(&n->nd, &p->root);
-		mpol_put(n->policy);
-		kmem_cache_free(sn_cache, n);
+		sp_delete(p, n);
 	}
 	mutex_unlock(&p->mutex);
 }
-- 
cgit v1.1


From d08719c499bb9996ea6edd30e2342b3bbb3826b4 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 8 Oct 2012 16:29:20 -0700
Subject: mempolicy: fix a memory corruption by refcount imbalance in
 alloc_pages_vma()

commit 00442ad04a5eac08a98255697c510e708f6082e2 upstream.

Commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier
related damage v3") introduced a potential memory corruption.
shmem_alloc_page() uses a pseudo vma and it has one significant unique
combination, vma->vm_ops=NULL and vma->policy->flags & MPOL_F_SHARED.

get_vma_policy() does NOT increase a policy ref when vma->vm_ops=NULL
and mpol_cond_put() DOES decrease a policy ref when a policy has
MPOL_F_SHARED.  Therefore, when a cpuset update race occurs,
alloc_pages_vma() falls in 'goto retry_cpuset' path, decrements the
reference count and frees the policy prematurely.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Christoph Lameter <cl@linux.com>
Cc: Josh Boyer <jwboyer@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/mempolicy.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6a569cc..5dce7d4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1511,8 +1511,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
 									addr);
 			if (vpol)
 				pol = vpol;
-		} else if (vma->vm_policy)
+		} else if (vma->vm_policy) {
 			pol = vma->vm_policy;
+
+			/*
+			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
+			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+			 * count on these policies which will be dropped by
+			 * mpol_cond_put() later
+			 */
+			if (mpol_needs_cond_ref(pol))
+				mpol_get(pol);
+		}
 	}
 	if (!pol)
 		pol = &default_policy;
-- 
cgit v1.1


From f38039a248831d279cca77ab1dab773684a96c1e Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sun, 7 Oct 2012 20:32:51 -0700
Subject: tmpfs,ceph,gfs2,isofs,reiserfs,xfs: fix fh_len checking

commit 35c2a7f4908d404c9124c2efc6ada4640ca4d5d5 upstream.

Fuzzing with trinity oopsed on the 1st instruction of shmem_fh_to_dentry(),
	u64 inum = fid->raw[2];
which is unhelpfully reported as at the end of shmem_alloc_inode():

BUG: unable to handle kernel paging request at ffff880061cd3000
IP: [<ffffffff812190d0>] shmem_alloc_inode+0x40/0x40
Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
Call Trace:
 [<ffffffff81488649>] ? exportfs_decode_fh+0x79/0x2d0
 [<ffffffff812d77c3>] do_handle_open+0x163/0x2c0
 [<ffffffff812d792c>] sys_open_by_handle_at+0xc/0x10
 [<ffffffff83a5f3f8>] tracesys+0xe1/0xe6

Right, tmpfs is being stupid to access fid->raw[2] before validating that
fh_len includes it: the buffer kmalloc'ed by do_sys_name_to_handle() may
fall at the end of a page, and the next page not be present.

But some other filesystems (ceph, gfs2, isofs, reiserfs, xfs) are being
careless about fh_len too, in fh_to_dentry() and/or fh_to_parent(), and
could oops in the same way: add the missing fh_len checks to those.

Reported-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Sage Weil <sage@inktank.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/shmem.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/shmem.c b/mm/shmem.c
index fcedf54..769941f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2348,12 +2348,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
 {
 	struct inode *inode;
 	struct dentry *dentry = NULL;
-	u64 inum = fid->raw[2];
-	inum = (inum << 32) | fid->raw[1];
+	u64 inum;
 
 	if (fh_len < 3)
 		return NULL;
 
+	inum = fid->raw[2];
+	inum = (inum << 32) | fid->raw[1];
+
 	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
 			shmem_match, fid->raw);
 	if (inode) {
-- 
cgit v1.1


From e418b3bbe9a34fc75a148ff890e0b3442628c5c7 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 25 Oct 2012 13:37:31 -0700
Subject: mm: fix XFS oops due to dirty pages without buffers on s390

commit ef5d437f71afdf4afdbab99213add99f4b1318fd upstream.

On s390 any write to a page (even from kernel itself) sets architecture
specific page dirty bit.  Thus when a page is written to via buffered
write, HW dirty bit gets set and when we later map and unmap the page,
page_remove_rmap() finds the dirty bit and calls set_page_dirty().

Dirtying of a page which shouldn't be dirty can cause all sorts of
problems to filesystems.  The bug we observed in practice is that
buffers from the page get freed, so when the page gets later marked as
dirty and writeback writes it, XFS crashes due to an assertion
BUG_ON(!PagePrivate(page)) in page_buffers() called from
xfs_count_page_state().

Similar problem can also happen when zero_user_segment() call from
xfs_vm_writepage() (or block_write_full_page() for that matter) set the
hardware dirty bit during writeback, later buffers get freed, and then
page unmapped.

Fix the issue by ignoring s390 HW dirty bit for page cache pages of
mappings with mapping_cap_account_dirty().  This is safe because for
such mappings when a page gets marked as writeable in PTE it is also
marked dirty in do_wp_page() or do_page_fault().  When the dirty bit is
cleared by clear_page_dirty_for_io(), the page gets writeprotected in
page_mkclean().  So pagecache page is writeable if and only if it is
dirty.

Thanks to Hugh Dickins for pointing out mapping has to have
mapping_cap_account_dirty() for things to work and proposing a cleaned
up variant of the patch.

The patch has survived about two hours of running fsx-linux on tmpfs
while heavily swapping and several days of running on out build machines
where the original problem was triggered.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/rmap.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/rmap.c b/mm/rmap.c
index 23295f6..30e44cb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -57,6 +57,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
 #include <linux/hugetlb.h>
+#include <linux/backing-dev.h>
 
 #include <asm/tlbflush.h>
 
@@ -936,11 +937,8 @@ int page_mkclean(struct page *page)
 
 	if (page_mapped(page)) {
 		struct address_space *mapping = page_mapping(page);
-		if (mapping) {
+		if (mapping)
 			ret = page_mkclean_file(mapping, page);
-			if (page_test_and_clear_dirty(page_to_pfn(page), 1))
-				ret = 1;
-		}
 	}
 
 	return ret;
@@ -1121,6 +1119,8 @@ void page_add_file_rmap(struct page *page)
  */
 void page_remove_rmap(struct page *page)
 {
+	struct address_space *mapping = page_mapping(page);
+
 	/* page still mapped by someone else? */
 	if (!atomic_add_negative(-1, &page->_mapcount))
 		return;
@@ -1131,8 +1131,19 @@ void page_remove_rmap(struct page *page)
 	 * this if the page is anon, so about to be freed; but perhaps
 	 * not if it's in swapcache - there might be another pte slot
 	 * containing the swap entry, but page not yet written to swap.
+	 *
+	 * And we can skip it on file pages, so long as the filesystem
+	 * participates in dirty tracking; but need to catch shm and tmpfs
+	 * and ramfs pages which have been modified since creation by read
+	 * fault.
+	 *
+	 * Note that mapping must be decided above, before decrementing
+	 * mapcount (which luckily provides a barrier): once page is unmapped,
+	 * it could be truncated and page->mapping reset to NULL at any moment.
+	 * Note also that we are relying on page_mapping(page) to set mapping
+	 * to &swapper_space when PageSwapCache(page).
 	 */
-	if ((!PageAnon(page) || PageSwapCache(page)) &&
+	if (mapping && !mapping_cap_account_dirty(mapping) &&
 	    page_test_and_clear_dirty(page_to_pfn(page), 1))
 		set_page_dirty(page);
 	/*
-- 
cgit v1.1


From 218440d541517f1c8c6c5349e105dbf23b371b26 Mon Sep 17 00:00:00 2001
From: Takamori Yamaguchi <takamori.yamaguchi@jp.sony.com>
Date: Thu, 8 Nov 2012 15:53:39 -0800
Subject: mm: bugfix: set current->reclaim_state to NULL while returning from
 kswapd()

commit b0a8cc58e6b9aaae3045752059e5e6260c0b94bc upstream.

In kswapd(), set current->reclaim_state to NULL before returning, as
current->reclaim_state holds reference to variable on kswapd()'s stack.

In rare cases, while returning from kswapd() during memory offlining,
__free_slab() and freepages() can access the dangling pointer of
current->reclaim_state.

Signed-off-by: Takamori Yamaguchi <takamori.yamaguchi@jp.sony.com>
Signed-off-by: Aaditya Kumar <aaditya.kumar@ap.sony.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/vmscan.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5326f98..2174733 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2977,6 +2977,8 @@ static int kswapd(void *p)
 						&balanced_classzone_idx);
 		}
 	}
+
+	current->reclaim_state = NULL;
 	return 0;
 }
 
-- 
cgit v1.1


From 8bec6507e7073b4573e29d4990b4eebd3aa32abf Mon Sep 17 00:00:00 2001
From: Jianguo Wu <wujianguo@huawei.com>
Date: Thu, 29 Nov 2012 13:54:21 -0800
Subject: mm/vmemmap: fix wrong use of virt_to_page

commit ae64ffcac35de0db628ba9631edf8ff34c5cd7ac upstream.

I enable CONFIG_DEBUG_VIRTUAL and CONFIG_SPARSEMEM_VMEMMAP, when doing
memory hotremove, there is a kernel BUG at arch/x86/mm/physaddr.c:20.

It is caused by free_section_usemap()->virt_to_page(), virt_to_page() is
only used for kernel direct mapping address, but sparse-vmemmap uses
vmemmap address, so it is going wrong here.

  ------------[ cut here ]------------
  kernel BUG at arch/x86/mm/physaddr.c:20!
  invalid opcode: 0000 [#1] SMP
  Modules linked in: acpihp_drv acpihp_slot edd cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf fuse vfat fat loop dm_mod coretemp kvm crc32c_intel ipv6 ixgbe igb iTCO_wdt i7core_edac edac_core pcspkr iTCO_vendor_support ioatdma microcode joydev sr_mod i2c_i801 dca lpc_ich mfd_core mdio tpm_tis i2c_core hid_generic tpm cdrom sg tpm_bios rtc_cmos button ext3 jbd mbcache usbhid hid uhci_hcd ehci_hcd usbcore usb_common sd_mod crc_t10dif processor thermal_sys hwmon scsi_dh_alua scsi_dh_hp_sw scsi_dh_rdac scsi_dh_emc scsi_dh ata_generic ata_piix libata megaraid_sas scsi_mod
  CPU 39
  Pid: 6454, comm: sh Not tainted 3.7.0-rc1-acpihp-final+ #45 QCI QSSC-S4R/QSSC-S4R
  RIP: 0010:[<ffffffff8103c908>]  [<ffffffff8103c908>] __phys_addr+0x88/0x90
  RSP: 0018:ffff8804440d7c08  EFLAGS: 00010006
  RAX: 0000000000000006 RBX: ffffea0012000000 RCX: 000000000000002c
  ...

Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Reviewd-by: Wen Congyang <wency@cn.fujitsu.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/sparse.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/sparse.c b/mm/sparse.c
index 4cd05e5..9054f83 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -619,7 +619,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
 {
 	return; /* XXX: Not implemented yet */
 }
-static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
 {
 }
 #else
@@ -660,10 +660,11 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
 			   get_order(sizeof(struct page) * nr_pages));
 }
 
-static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
 {
 	unsigned long maps_section_nr, removing_section_nr, i;
 	unsigned long magic;
+	struct page *page = virt_to_page(memmap);
 
 	for (i = 0; i < nr_pages; i++, page++) {
 		magic = (unsigned long) page->lru.next;
@@ -712,13 +713,10 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
 	 */
 
 	if (memmap) {
-		struct page *memmap_page;
-		memmap_page = virt_to_page(memmap);
-
 		nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
 			>> PAGE_SHIFT;
 
-		free_map_bootmem(memmap_page, nr_pages);
+		free_map_bootmem(memmap, nr_pages);
 	}
 }
 
-- 
cgit v1.1


From 11bcecc86d062fc22dcdf2c296b447bd21f25e8c Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Thu, 29 Nov 2012 13:54:34 -0800
Subject: mm: soft offline: split thp at the beginning of soft_offline_page()

commit 783657a7dc20e5c0efbc9a09a9dd38e238a723da upstream.

When we try to soft-offline a thp tail page, put_page() is called on the
tail page unthinkingly and VM_BUG_ON is triggered in put_compound_page().

This patch splits thp before going into the main body of soft-offlining.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/memory-failure.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'mm')

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2f49dcf..eace560 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1382,9 +1382,17 @@ int soft_offline_page(struct page *page, int flags)
 {
 	int ret;
 	unsigned long pfn = page_to_pfn(page);
+	struct page *hpage = compound_trans_head(page);
 
 	if (PageHuge(page))
 		return soft_offline_huge_page(page, flags);
+	if (PageTransHuge(hpage)) {
+		if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
+			pr_info("soft offline: %#lx: failed to split THP\n",
+				pfn);
+			return -EBUSY;
+		}
+	}
 
 	ret = get_any_page(page, pfn, flags);
 	if (ret < 0)
-- 
cgit v1.1


From b9f21c252726e320c3878d9206571e2178cafd10 Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski@samsung.com>
Date: Wed, 7 Nov 2012 15:37:07 +0100
Subject: mm: dmapool: use provided gfp flags for all dma_alloc_coherent()
 calls

commit 387870f2d6d679746020fa8e25ef786ff338dc98 upstream.

dmapool always calls dma_alloc_coherent() with GFP_ATOMIC flag,
regardless the flags provided by the caller. This causes excessive
pruning of emergency memory pools without any good reason. Additionaly,
on ARM architecture any driver which is using dmapools will sooner or
later  trigger the following error:
"ERROR: 256 KiB atomic DMA coherent pool is too small!
Please increase it with coherent_pool= kernel parameter!".
Increasing the coherent pool size usually doesn't help much and only
delays such error, because all GFP_ATOMIC DMA allocations are always
served from the special, very limited memory pool.

This patch changes the dmapool code to correctly use gfp flags provided
by the dmapool caller.

Reported-by: Soeren Moch <smoch@web.de>
Reported-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Andrew Lunn <andrew@lunn.ch>
Tested-by: Soeren Moch <smoch@web.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/dmapool.c | 31 +++++++------------------------
 1 file changed, 7 insertions(+), 24 deletions(-)

(limited to 'mm')

diff --git a/mm/dmapool.c b/mm/dmapool.c
index 03bf3bb..f8e675e 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -49,7 +49,6 @@ struct dma_pool {		/* the pool */
 	size_t allocation;
 	size_t boundary;
 	char name[32];
-	wait_queue_head_t waitq;
 	struct list_head pools;
 };
 
@@ -61,8 +60,6 @@ struct dma_page {		/* cacheable header for 'allocation' bytes */
 	unsigned int offset;
 };
 
-#define	POOL_TIMEOUT_JIFFIES	((100 /* msec */ * HZ) / 1000)
-
 static DEFINE_MUTEX(pools_lock);
 
 static ssize_t
@@ -171,7 +168,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
 	retval->size = size;
 	retval->boundary = boundary;
 	retval->allocation = allocation;
-	init_waitqueue_head(&retval->waitq);
 
 	if (dev) {
 		int ret;
@@ -226,7 +222,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
 		memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
 #endif
 		pool_initialise_page(pool, page);
-		list_add(&page->page_list, &pool->page_list);
 		page->in_use = 0;
 		page->offset = 0;
 	} else {
@@ -314,30 +309,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 	might_sleep_if(mem_flags & __GFP_WAIT);
 
 	spin_lock_irqsave(&pool->lock, flags);
- restart:
 	list_for_each_entry(page, &pool->page_list, page_list) {
 		if (page->offset < pool->allocation)
 			goto ready;
 	}
-	page = pool_alloc_page(pool, GFP_ATOMIC);
-	if (!page) {
-		if (mem_flags & __GFP_WAIT) {
-			DECLARE_WAITQUEUE(wait, current);
 
-			__set_current_state(TASK_UNINTERRUPTIBLE);
-			__add_wait_queue(&pool->waitq, &wait);
-			spin_unlock_irqrestore(&pool->lock, flags);
+	/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
+	spin_unlock_irqrestore(&pool->lock, flags);
 
-			schedule_timeout(POOL_TIMEOUT_JIFFIES);
+	page = pool_alloc_page(pool, mem_flags);
+	if (!page)
+		return NULL;
 
-			spin_lock_irqsave(&pool->lock, flags);
-			__remove_wait_queue(&pool->waitq, &wait);
-			goto restart;
-		}
-		retval = NULL;
-		goto done;
-	}
+	spin_lock_irqsave(&pool->lock, flags);
 
+	list_add(&page->page_list, &pool->page_list);
  ready:
 	page->in_use++;
 	offset = page->offset;
@@ -347,7 +333,6 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 #ifdef	DMAPOOL_DEBUG
 	memset(retval, POOL_POISON_ALLOCATED, pool->size);
 #endif
- done:
 	spin_unlock_irqrestore(&pool->lock, flags);
 	return retval;
 }
@@ -434,8 +419,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 	page->in_use--;
 	*(int *)vaddr = page->offset;
 	page->offset = offset;
-	if (waitqueue_active(&pool->waitq))
-		wake_up_locked(&pool->waitq);
 	/*
 	 * Resist a temptation to do
 	 *    if (!is_page_busy(page)) pool_free_page(pool, page);
-- 
cgit v1.1


From 15487eba1006de0db3c8d9f78ef16700a4d0a217 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 5 Dec 2012 14:01:41 -0800
Subject: tmpfs: fix shared mempolicy leak

commit 18a2f371f5edf41810f6469cb9be39931ef9deb9 upstream.

This fixes a regression in 3.7-rc, which has since gone into stable.

Commit 00442ad04a5e ("mempolicy: fix a memory corruption by refcount
imbalance in alloc_pages_vma()") changed get_vma_policy() to raise the
refcount on a shmem shared mempolicy; whereas shmem_alloc_page() went
on expecting alloc_page_vma() to drop the refcount it had acquired.
This deserves a rework: but for now fix the leak in shmem_alloc_page().

Hugh: shmem_swapin() did not need a fix, but surely it's clearer to use
the same refcounting there as in shmem_alloc_page(), delete its onstack
mempolicy, and the strange mpol_cond_copy() and __mpol_cond_copy() -
those were invented to let swapin_readahead() make an unknown number of
calls to alloc_pages_vma() with one mempolicy; but since 00442ad04a5e,
alloc_pages_vma() has kept refcount in balance, so now no problem.

Reported-and-tested-by: Tommi Rantala <tt.rantala@gmail.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/mempolicy.c | 22 ----------------------
 mm/shmem.c     | 22 +++++++++++++---------
 2 files changed, 13 insertions(+), 31 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5dce7d4..04282ba 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1973,28 +1973,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 	return new;
 }
 
-/*
- * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
- * eliminate the * MPOL_F_* flags that require conditional ref and
- * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
- * after return.  Use the returned value.
- *
- * Allows use of a mempolicy for, e.g., multiple allocations with a single
- * policy lookup, even if the policy needs/has extra ref on lookup.
- * shmem_readahead needs this.
- */
-struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
-						struct mempolicy *frompol)
-{
-	if (!mpol_needs_cond_ref(frompol))
-		return frompol;
-
-	*tompol = *frompol;
-	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
-	__mpol_put(frompol);
-	return tompol;
-}
-
 /* Slow path of a mempolicy comparison */
 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
diff --git a/mm/shmem.c b/mm/shmem.c
index 769941f..b952332 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1168,19 +1168,20 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
 			struct shmem_inode_info *info, unsigned long idx)
 {
-	struct mempolicy mpol, *spol;
 	struct vm_area_struct pvma;
 	struct page *page;
 
-	spol = mpol_cond_copy(&mpol,
-				mpol_shared_policy_lookup(&info->policy, idx));
-
 	/* Create a pseudo vma that just contains the policy */
 	pvma.vm_start = 0;
 	pvma.vm_pgoff = idx;
 	pvma.vm_ops = NULL;
-	pvma.vm_policy = spol;
+	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
+
 	page = swapin_readahead(entry, gfp, &pvma, 0);
+
+	/* Drop reference taken by mpol_shared_policy_lookup() */
+	mpol_cond_put(pvma.vm_policy);
+
 	return page;
 }
 
@@ -1188,6 +1189,7 @@ static struct page *shmem_alloc_page(gfp_t gfp,
 			struct shmem_inode_info *info, unsigned long idx)
 {
 	struct vm_area_struct pvma;
+	struct page *page;
 
 	/* Create a pseudo vma that just contains the policy */
 	pvma.vm_start = 0;
@@ -1195,10 +1197,12 @@ static struct page *shmem_alloc_page(gfp_t gfp,
 	pvma.vm_ops = NULL;
 	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
 
-	/*
-	 * alloc_page_vma() will drop the shared policy reference
-	 */
-	return alloc_page_vma(gfp, &pvma, 0);
+	page = alloc_page_vma(gfp, &pvma, 0);
+
+	/* Drop reference taken by mpol_shared_policy_lookup() */
+	mpol_cond_put(pvma.vm_policy);
+
+	return page;
 }
 #else /* !CONFIG_NUMA */
 #ifdef CONFIG_TMPFS
-- 
cgit v1.1


From 51afc625ccd2e38f76f690ccce93e11a21d9543b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Wed, 2 Jan 2013 02:01:33 -0800
Subject: tmpfs mempolicy: fix /proc/mounts corrupting memory

commit f2a07f40dbc603c15f8b06e6ec7f768af67b424f upstream.

Recently I suggested using "mount -o remount,mpol=local /tmp" in NUMA
mempolicy testing.  Very nasty.  Reading /proc/mounts, /proc/pid/mounts
or /proc/pid/mountinfo may then corrupt one bit of kernel memory, often
in a page table (causing "Bad swap" or "Bad page map" warning or "Bad
pagetable" oops), sometimes in a vm_area_struct or rbnode or somewhere
worse.  "mpol=prefer" and "mpol=prefer:Node" are equally toxic.

Recent NUMA enhancements are not to blame: this dates back to 2.6.35,
when commit e17f74af351c "mempolicy: don't call mpol_set_nodemask() when
no_context" skipped mpol_parse_str()'s call to mpol_set_nodemask(),
which used to initialize v.preferred_node, or set MPOL_F_LOCAL in flags.
With slab poisoning, you can then rely on mpol_to_str() to set the bit
for node 0x6b6b, probably in the next page above the caller's stack.

mpol_parse_str() is only called from shmem_parse_options(): no_context
is always true, so call it unused for now, and remove !no_context code.
Set v.nodes or v.preferred_node or MPOL_F_LOCAL as mpol_to_str() might
expect.  Then mpol_to_str() can ignore its no_context argument also,
the mpol being appropriately initialized whether contextualized or not.
Rename its no_context unused too, and let subsequent patch remove them
(that's not needed for stable backporting, which would involve rejects).

I don't understand why MPOL_LOCAL is described as a pseudo-policy:
it's a reasonable policy which suffers from a confusing implementation
in terms of MPOL_PREFERRED with MPOL_F_LOCAL.  I believe this would be
much more robust if MPOL_LOCAL were recognized in switch statements
throughout, MPOL_F_LOCAL deleted, and MPOL_PREFERRED use the (possibly
empty) nodes mask like everyone else, instead of its preferred_node
variant (I presume an optimization from the days before MPOL_LOCAL).
But that would take me too long to get right and fully tested.

Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/mempolicy.c | 64 ++++++++++++++++++++++++----------------------------------
 1 file changed, 26 insertions(+), 38 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 04282ba..0367beb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2308,8 +2308,7 @@ void numa_default_policy(void)
  */
 
 /*
- * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
- * Used only for mpol_parse_str() and mpol_to_str()
+ * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
  */
 #define MPOL_LOCAL MPOL_MAX
 static const char * const policy_modes[] =
@@ -2324,28 +2323,21 @@ static const char * const policy_modes[] =
 
 #ifdef CONFIG_TMPFS
 /**
- * mpol_parse_str - parse string to mempolicy
+ * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
  * @str:  string containing mempolicy to parse
  * @mpol:  pointer to struct mempolicy pointer, returned on success.
- * @no_context:  flag whether to "contextualize" the mempolicy
+ * @unused:  redundant argument, to be removed later.
  *
  * Format of input:
  *	<mode>[=<flags>][:<nodelist>]
  *
- * if @no_context is true, save the input nodemask in w.user_nodemask in
- * the returned mempolicy.  This will be used to "clone" the mempolicy in
- * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
- * mount option.  Note that if 'static' or 'relative' mode flags were
- * specified, the input nodemask will already have been saved.  Saving
- * it again is redundant, but safe.
- *
  * On success, returns 0, else 1
  */
-int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
+int mpol_parse_str(char *str, struct mempolicy **mpol, int unused)
 {
 	struct mempolicy *new = NULL;
 	unsigned short mode;
-	unsigned short uninitialized_var(mode_flags);
+	unsigned short mode_flags;
 	nodemask_t nodes;
 	char *nodelist = strchr(str, ':');
 	char *flags = strchr(str, '=');
@@ -2433,24 +2425,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 	if (IS_ERR(new))
 		goto out;
 
-	if (no_context) {
-		/* save for contextualization */
-		new->w.user_nodemask = nodes;
-	} else {
-		int ret;
-		NODEMASK_SCRATCH(scratch);
-		if (scratch) {
-			task_lock(current);
-			ret = mpol_set_nodemask(new, &nodes, scratch);
-			task_unlock(current);
-		} else
-			ret = -ENOMEM;
-		NODEMASK_SCRATCH_FREE(scratch);
-		if (ret) {
-			mpol_put(new);
-			goto out;
-		}
-	}
+	/*
+	 * Save nodes for mpol_to_str() to show the tmpfs mount options
+	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
+	 */
+	if (mode != MPOL_PREFERRED)
+		new->v.nodes = nodes;
+	else if (nodelist)
+		new->v.preferred_node = first_node(nodes);
+	else
+		new->flags |= MPOL_F_LOCAL;
+
+	/*
+	 * Save nodes for contextualization: this will be used to "clone"
+	 * the mempolicy in a specific context [cpuset] at a later time.
+	 */
+	new->w.user_nodemask = nodes;
+
 	err = 0;
 
 out:
@@ -2470,13 +2461,13 @@ out:
  * @buffer:  to contain formatted mempolicy string
  * @maxlen:  length of @buffer
  * @pol:  pointer to mempolicy to be formatted
- * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
+ * @unused:  redundant argument, to be removed later.
  *
  * Convert a mempolicy into a string.
  * Returns the number of characters in buffer (if positive)
  * or an error (negative)
  */
-int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int unused)
 {
 	char *p = buffer;
 	int l;
@@ -2502,7 +2493,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
 	case MPOL_PREFERRED:
 		nodes_clear(nodes);
 		if (flags & MPOL_F_LOCAL)
-			mode = MPOL_LOCAL;	/* pseudo-policy */
+			mode = MPOL_LOCAL;
 		else
 			node_set(pol->v.preferred_node, nodes);
 		break;
@@ -2510,10 +2501,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
 	case MPOL_BIND:
 		/* Fall through */
 	case MPOL_INTERLEAVE:
-		if (no_context)
-			nodes = pol->w.user_nodemask;
-		else
-			nodes = pol->v.nodes;
+		nodes = pol->v.nodes;
 		break;
 
 	default:
-- 
cgit v1.1


From c39096f1ae6551a0d43b84cb5dad309f0d392a38 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Fri, 4 Jan 2013 15:35:12 -0800
Subject: mm: limit mmu_gather batching to fix soft lockups on !CONFIG_PREEMPT

commit 53a59fc67f97374758e63a9c785891ec62324c81 upstream.

Since commit e303297e6c3a ("mm: extended batches for generic
mmu_gather") we are batching pages to be freed until either
tlb_next_batch cannot allocate a new batch or we are done.

This works just fine most of the time but we can get in troubles with
non-preemptible kernel (CONFIG_PREEMPT_NONE or CONFIG_PREEMPT_VOLUNTARY)
on large machines where too aggressive batching might lead to soft
lockups during process exit path (exit_mmap) because there are no
scheduling points down the free_pages_and_swap_cache path and so the
freeing can take long enough to trigger the soft lockup.

The lockup is harmless except when the system is setup to panic on
softlockup which is not that unusual.

The simplest way to work around this issue is to limit the maximum
number of batches in a single mmu_gather.  10k of collected pages should
be safe to prevent from soft lockups (we would have 2ms for one) even if
they are all freed without an explicit scheduling point.

This patch doesn't add any new explicit scheduling points because it
relies on zap_pmd_range during page tables zapping which calls
cond_resched per PMD.

The following lockup has been reported for 3.0 kernel with a huge
process (in order of hundreds gigs but I do know any more details).

  BUG: soft lockup - CPU#56 stuck for 22s! [kernel:31053]
  Modules linked in: af_packet nfs lockd fscache auth_rpcgss nfs_acl sunrpc mptctl mptbase autofs4 binfmt_misc dm_round_robin dm_multipath bonding cpufreq_conservative cpufreq_userspace cpufreq_powersave pcc_cpufreq mperf microcode fuse loop osst sg sd_mod crc_t10dif st qla2xxx scsi_transport_fc scsi_tgt netxen_nic i7core_edac iTCO_wdt joydev e1000e serio_raw pcspkr edac_core iTCO_vendor_support acpi_power_meter rtc_cmos hpwdt hpilo button container usbhid hid dm_mirror dm_region_hash dm_log linear uhci_hcd ehci_hcd usbcore usb_common scsi_dh_emc scsi_dh_alua scsi_dh_hp_sw scsi_dh_rdac scsi_dh dm_snapshot pcnet32 mii edd dm_mod raid1 ext3 mbcache jbd fan thermal processor thermal_sys hwmon cciss scsi_mod
  Supported: Yes
  CPU 56
  Pid: 31053, comm: kernel Not tainted 3.0.31-0.9-default #1 HP ProLiant DL580 G7
  RIP: 0010:  _raw_spin_unlock_irqrestore+0x8/0x10
  RSP: 0018:ffff883ec1037af0  EFLAGS: 00000206
  RAX: 0000000000000e00 RBX: ffffea01a0817e28 RCX: ffff88803ffd9e80
  RDX: 0000000000000200 RSI: 0000000000000206 RDI: 0000000000000206
  RBP: 0000000000000002 R08: 0000000000000001 R09: ffff887ec724a400
  R10: 0000000000000000 R11: dead000000200200 R12: ffffffff8144c26e
  R13: 0000000000000030 R14: 0000000000000297 R15: 000000000000000e
  FS:  00007ed834282700(0000) GS:ffff88c03f200000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
  CR2: 000000000068b240 CR3: 0000003ec13c5000 CR4: 00000000000006e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
  Process kernel (pid: 31053, threadinfo ffff883ec1036000, task ffff883ebd5d4100)
  Call Trace:
    release_pages+0xc5/0x260
    free_pages_and_swap_cache+0x9d/0xc0
    tlb_flush_mmu+0x5c/0x80
    tlb_finish_mmu+0xe/0x50
    exit_mmap+0xbd/0x120
    mmput+0x49/0x120
    exit_mm+0x122/0x160
    do_exit+0x17a/0x430
    do_group_exit+0x3d/0xb0
    get_signal_to_deliver+0x247/0x480
    do_signal+0x71/0x1b0
    do_notify_resume+0x98/0xb0
    int_signal+0x12/0x17
  DWARF2 unwinder stuck at int_signal+0x12/0x17

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/memory.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index d49b58a..7292acb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -205,10 +205,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
 		return 1;
 	}
 
+	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+		return 0;
+
 	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
 	if (!batch)
 		return 0;
 
+	tlb->batch_count++;
 	batch->next = NULL;
 	batch->nr   = 0;
 	batch->max  = MAX_GATHER_BATCH;
@@ -235,6 +239,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
 	tlb->local.nr   = 0;
 	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
 	tlb->active     = &tlb->local;
+	tlb->batch_count = 0;
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 	tlb->batch = NULL;
-- 
cgit v1.1


From 63c54425c02a07bfa162189f2839a3f204a37b5b Mon Sep 17 00:00:00 2001
From: Jason Liu <r64343@freescale.com>
Date: Fri, 11 Jan 2013 14:31:47 -0800
Subject: mm: compaction: fix echo 1 > compact_memory return error issue

commit 7964c06d66c76507d8b6b662bffea770c29ef0ce upstream.

when run the folloing command under shell, it will return error

  sh/$ echo 1 > /proc/sys/vm/compact_memory
  sh/$ sh: write error: Bad address

After strace, I found the following log:

  ...
  write(1, "1\n", 2)               = 3
  write(1, "", 4294967295)         = -1 EFAULT (Bad address)
  write(2, "echo: write error: Bad address\n", 31echo: write error: Bad address
  ) = 31

This tells system return 3(COMPACT_COMPLETE) after write data to
compact_memory.

The fix is to make the system just return 0 instead 3(COMPACT_COMPLETE)
from sysctl_compaction_handler after compaction_nodes finished.

Signed-off-by: Jason Liu <r64343@freescale.com>
Suggested-by: David Rientjes <rientjes@google.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/compaction.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 8ea7308..b4689f81 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -714,14 +714,12 @@ static int compact_node(int nid)
 }
 
 /* Compact all nodes in the system */
-static int compact_nodes(void)
+static void compact_nodes(void)
 {
 	int nid;
 
 	for_each_online_node(nid)
 		compact_node(nid);
-
-	return COMPACT_COMPLETE;
 }
 
 /* The written value is actually unused, all memory is compacted */
@@ -732,7 +730,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
 			void __user *buffer, size_t *length, loff_t *ppos)
 {
 	if (write)
-		return compact_nodes();
+		compact_nodes();
 
 	return 0;
 }
-- 
cgit v1.1


From da0e14f01594bc013022bf057bcd714566a7b99e Mon Sep 17 00:00:00 2001
From: Laura Abbott <lauraa@codeaurora.org>
Date: Fri, 11 Jan 2013 14:31:51 -0800
Subject: mm: use aligned zone start for pfn_to_bitidx calculation

commit c060f943d0929f3e429c5d9522290584f6281d6e upstream.

The current calculation in pfn_to_bitidx assumes that (pfn -
zone->zone_start_pfn) >> pageblock_order will return the same bit for
all pfn in a pageblock.  If zone_start_pfn is not aligned to
pageblock_nr_pages, this may not always be correct.

Consider the following with pageblock order = 10, zone start 2MB:

  pfn     | pfn - zone start | (pfn - zone start) >> page block order
  ----------------------------------------------------------------
  0x26000 | 0x25e00	   |  0x97
  0x26100 | 0x25f00	   |  0x97
  0x26200 | 0x26000	   |  0x98
  0x26300 | 0x26100	   |  0x98

This means that calling {get,set}_pageblock_migratetype on a single page
will not set the migratetype for the full block.  Fix this by rounding
down zone_start_pfn when doing the bitidx calculation.

For our use case, the effects of this bug were mostly tied to the fact
that CMA allocations would either take a long time or fail to happen.
Depending on the driver using CMA, this could result in anything from
visual glitches to application failures.

Signed-off-by: Laura Abbott <lauraa@codeaurora.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eb6b3fd..0ec869e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5492,7 +5492,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
 	pfn &= (PAGES_PER_SECTION-1);
 	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
 #else
-	pfn = pfn - zone->zone_start_pfn;
+	pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
 	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
 #endif /* CONFIG_SPARSEMEM */
 }
-- 
cgit v1.1


From 457a972f3038b8de2eba76dc4b7f9117f1084f7e Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 29 May 2012 15:06:23 -0700
Subject: thp, memcg: split hugepage for memcg oom on cow

commit 1f1d06c34f7675026326cd9f39ff91e4555cf355 upstream.

On COW, a new hugepage is allocated and charged to the memcg.  If the
system is oom or the charge to the memcg fails, however, the fault
handler will return VM_FAULT_OOM which results in an oom kill.

Instead, it's possible to fallback to splitting the hugepage so that the
COW results only in an order-0 page being allocated and charged to the
memcg which has a higher liklihood to succeed.  This is expensive
because the hugepage must be split in the page fault handler, but it is
much better than unnecessarily oom killing a process.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 mm/huge_memory.c |  3 +++
 mm/memory.c      | 18 +++++++++++++++---
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8cc11dd..a9ab45e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -920,6 +920,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(THP_FAULT_FALLBACK);
 		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
 						   pmd, orig_pmd, page, haddr);
+		if (ret & VM_FAULT_OOM)
+			split_huge_page(page);
 		put_page(page);
 		goto out;
 	}
@@ -927,6 +929,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 		put_page(new_page);
+		split_huge_page(page);
 		put_page(page);
 		ret |= VM_FAULT_OOM;
 		goto out;
diff --git a/mm/memory.c b/mm/memory.c
index 7292acb..4da0f8a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3470,6 +3470,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, flags);
 
+retry:
 	pgd = pgd_offset(mm, address);
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
@@ -3483,13 +3484,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 							  pmd, flags);
 	} else {
 		pmd_t orig_pmd = *pmd;
+		int ret;
+
 		barrier();
 		if (pmd_trans_huge(orig_pmd)) {
 			if (flags & FAULT_FLAG_WRITE &&
 			    !pmd_write(orig_pmd) &&
-			    !pmd_trans_splitting(orig_pmd))
-				return do_huge_pmd_wp_page(mm, vma, address,
-							   pmd, orig_pmd);
+			    !pmd_trans_splitting(orig_pmd)) {
+				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
+							  orig_pmd);
+				/*
+				 * If COW results in an oom, the huge pmd will
+				 * have been split, so retry the fault on the
+				 * pte for a smaller charge.
+				 */
+				if (unlikely(ret & VM_FAULT_OOM))
+					goto retry;
+				return ret;
+			}
 			return 0;
 		}
 	}
-- 
cgit v1.1