author	Andrew Dodd <atd7@cornell.edu>	2013-02-16 18:41:04 -0500
committer	Andrew Dodd <atd7@cornell.edu>	2013-02-27 09:19:08 -0500
commit	b08797f2afdfc604c3143f8725d058aeef8ddcb2 (patch)
tree	c59e963bd6931d4e9f9526034ab402cc551f18ae /mm
parent	cbfae70f1dcaf3cc6e93061179dad80caa1597fe (diff)
parent	54ea5b40f067cf098cac639973c6628c6944cfb2 (diff)
Merge remote-tracking branch 'kernelorg/linux-3.0.y' into 3_0_64
Conflicts:
	arch/arm/Kconfig
	arch/arm/include/asm/hwcap.h
	arch/arm/kernel/smp.c
	arch/arm/plat-samsung/adc.c
	drivers/gpu/drm/i915/i915_reg.h
	drivers/gpu/drm/i915/intel_drv.h
	drivers/mmc/core/sd.c
	drivers/net/tun.c
	drivers/net/usb/usbnet.c
	drivers/regulator/max8997.c
	drivers/usb/core/hub.c
	drivers/usb/host/xhci.h
	drivers/usb/serial/qcserial.c
	fs/jbd2/transaction.c
	include/linux/migrate.h
	kernel/sys.c
	kernel/time/timekeeping.c
	lib/genalloc.c
	mm/memory-failure.c
	mm/memory_hotplug.c
	mm/mempolicy.c
	mm/page_alloc.c
	mm/vmalloc.c
	mm/vmscan.c
	mm/vmstat.c
	scripts/Kbuild.include

Change-Id: I91e2d85c07320c7ccfc04cf98a448e89bed6ade6
Diffstat (limited to 'mm')
-rw-r--r--mm/compaction.c37
-rw-r--r--mm/dmapool.c31
-rw-r--r--mm/filemap.c11
-rw-r--r--mm/huge_memory.c3
-rw-r--r--mm/hugetlb.c68
-rw-r--r--mm/madvise.c16
-rw-r--r--mm/memcontrol.c9
-rw-r--r--mm/memory-failure.c16
-rw-r--r--mm/memory.c23
-rw-r--r--mm/memory_hotplug.c20
-rw-r--r--mm/mempolicy.c236
-rw-r--r--mm/migrate.c240
-rw-r--r--mm/mmu_notifier.c45
-rw-r--r--mm/nobootmem.c3
-rw-r--r--mm/page_alloc.c122
-rw-r--r--mm/percpu.c10
-rw-r--r--mm/rmap.c21
-rw-r--r--mm/shmem.c28
-rw-r--r--mm/slab.c13
-rw-r--r--mm/slub.c40
-rw-r--r--mm/sparse.c10
-rw-r--r--mm/truncate.c3
-rw-r--r--mm/vmalloc.c10
-rw-r--r--mm/vmscan.c299
24 files changed, 868 insertions, 446 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 051a04e..f550ecf 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,10 +35,6 @@ struct compact_control {
unsigned long migrate_pfn; /* isolate_migratepages search base */
bool sync; /* Synchronous migration */
- /* Account for isolated anon and file pages */
- unsigned long nr_anon;
- unsigned long nr_file;
-
unsigned int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone,
static void acct_isolated(struct zone *zone, struct compact_control *cc)
{
struct page *page;
- unsigned int count[NR_LRU_LISTS] = { 0, };
+ unsigned int count[2] = { 0, };
- list_for_each_entry(page, &cc->migratepages, lru) {
- int lru = page_lru_base_type(page);
- count[lru]++;
- }
+ list_for_each_entry(page, &cc->migratepages, lru)
+ count[!!page_is_file_cache(page)]++;
- cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
- cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
}
/* Similar to reclaim, but different enough that they don't share logic */
@@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
unsigned long last_pageblock_nr = 0, pageblock_nr;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
+ isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
/* Do not scan outside zone boundaries */
low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -378,8 +371,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
continue;
}
+ if (!cc->sync)
+ mode |= ISOLATE_ASYNC_MIGRATE;
+
/* Try isolate the page */
- if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
+ if (__isolate_lru_page(page, mode, 0) != 0)
continue;
VM_BUG_ON(PageTransCompound(page));
@@ -581,7 +577,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_migrate = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
(unsigned long)cc, false,
- cc->sync);
+ cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
@@ -596,8 +592,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
if (err) {
putback_lru_pages(&cc->migratepages);
cc->nr_migratepages = 0;
+ if (err == -ENOMEM) {
+ ret = COMPACT_PARTIAL;
+ goto out;
+ }
}
-
}
out:
@@ -719,14 +718,12 @@ static int compact_node(int nid)
}
/* Compact all nodes in the system */
-static int compact_nodes(void)
+static void compact_nodes(void)
{
int nid;
for_each_online_node(nid)
compact_node(nid);
-
- return COMPACT_COMPLETE;
}
/* The written value is actually unused, all memory is compacted */
@@ -737,7 +734,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
if (write)
- return compact_nodes();
+ compact_nodes();
return 0;
}
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 03bf3bb..f8e675e 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -49,7 +49,6 @@ struct dma_pool { /* the pool */
size_t allocation;
size_t boundary;
char name[32];
- wait_queue_head_t waitq;
struct list_head pools;
};
@@ -61,8 +60,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
unsigned int offset;
};
-#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000)
-
static DEFINE_MUTEX(pools_lock);
static ssize_t
@@ -171,7 +168,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->size = size;
retval->boundary = boundary;
retval->allocation = allocation;
- init_waitqueue_head(&retval->waitq);
if (dev) {
int ret;
@@ -226,7 +222,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
#endif
pool_initialise_page(pool, page);
- list_add(&page->page_list, &pool->page_list);
page->in_use = 0;
page->offset = 0;
} else {
@@ -314,30 +309,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
might_sleep_if(mem_flags & __GFP_WAIT);
spin_lock_irqsave(&pool->lock, flags);
- restart:
list_for_each_entry(page, &pool->page_list, page_list) {
if (page->offset < pool->allocation)
goto ready;
}
- page = pool_alloc_page(pool, GFP_ATOMIC);
- if (!page) {
- if (mem_flags & __GFP_WAIT) {
- DECLARE_WAITQUEUE(wait, current);
- __set_current_state(TASK_UNINTERRUPTIBLE);
- __add_wait_queue(&pool->waitq, &wait);
- spin_unlock_irqrestore(&pool->lock, flags);
+ /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
+ spin_unlock_irqrestore(&pool->lock, flags);
- schedule_timeout(POOL_TIMEOUT_JIFFIES);
+ page = pool_alloc_page(pool, mem_flags);
+ if (!page)
+ return NULL;
- spin_lock_irqsave(&pool->lock, flags);
- __remove_wait_queue(&pool->waitq, &wait);
- goto restart;
- }
- retval = NULL;
- goto done;
- }
+ spin_lock_irqsave(&pool->lock, flags);
+ list_add(&page->page_list, &pool->page_list);
ready:
page->in_use++;
offset = page->offset;
@@ -347,7 +333,6 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
#ifdef DMAPOOL_DEBUG
memset(retval, POOL_POISON_ALLOCATED, pool->size);
#endif
- done:
spin_unlock_irqrestore(&pool->lock, flags);
return retval;
}
@@ -434,8 +419,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
page->in_use--;
*(int *)vaddr = page->offset;
page->offset = offset;
- if (waitqueue_active(&pool->waitq))
- wake_up_locked(&pool->waitq);
/*
* Resist a temptation to do
* if (!is_page_busy(page)) pool_free_page(pool, page);
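
Net effect of the dmapool hunks: dma_pool_alloc() no longer parks a sleepable caller on pool->waitq for up to POOL_TIMEOUT_JIFFIES; it drops &pool->lock and lets pool_alloc_page() sleep in the page allocator with the caller's own gfp flags, and only links the new page into the pool once the lock is retaken. A hypothetical driver-side usage sketch (device, block size and name are placeholders, not taken from this tree):

	#include <linux/device.h>
	#include <linux/dmapool.h>

	static int example_setup(struct device *dev)
	{
		struct dma_pool *pool;
		dma_addr_t dma;
		void *buf;

		/* 512-byte blocks, 64-byte aligned; numbers are illustrative */
		pool = dma_pool_create("example", dev, 512, 64, 0);
		if (!pool)
			return -ENOMEM;

		/* With the change above, GFP_KERNEL sleeps inside the page
		 * allocator if the pool must grow, not on a pool waitqueue. */
		buf = dma_pool_alloc(pool, GFP_KERNEL, &dma);
		if (!buf) {
			dma_pool_destroy(pool);
			return -ENOMEM;
		}

		/* ... hand 'dma' to the hardware, access 'buf' from the CPU ... */

		dma_pool_free(pool, buf, dma);
		dma_pool_destroy(pool);
		return 0;
	}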
diff --git a/mm/filemap.c b/mm/filemap.c
index b7d8603..10481eb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -516,10 +516,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
struct page *page;
if (cpuset_do_page_mem_spread()) {
- get_mems_allowed();
- n = cpuset_mem_spread_node();
- page = alloc_pages_exact_node(n, gfp, 0);
- put_mems_allowed();
+ unsigned int cpuset_mems_cookie;
+ do {
+ cpuset_mems_cookie = get_mems_allowed();
+ n = cpuset_mem_spread_node();
+ page = alloc_pages_exact_node(n, gfp, 0);
+ } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
+
return page;
}
return alloc_pages(gfp, 0);
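
get_mems_allowed()/put_mems_allowed() change semantics here: instead of pinning the nodemask, get_mems_allowed() returns a cpuset seqcount cookie and put_mems_allowed(cookie) reports whether current->mems_allowed was rewritten while the allocation ran (the helper change itself is in the cpuset code, outside this diffstat). A sketch of the caller idiom, spelling out when the loop actually retries:

	int n;
	struct page *page;
	unsigned int cookie;

	do {
		cookie = get_mems_allowed();	/* snapshot the nodemask generation */
		n = cpuset_mem_spread_node();
		page = alloc_pages_exact_node(n, gfp, 0);
		/*
		 * Retry only if the allocation failed *and* the cpuset changed
		 * underneath us; a failure against a stable nodemask is
		 * returned to the caller as usual.
		 */
	} while (!put_mems_allowed(cookie) && !page);

The same cookie pattern recurs below in mm/hugetlb.c, mm/mempolicy.c, mm/page_alloc.c and mm/slab.c.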
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8cc11dd..a9ab45e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -920,6 +920,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
+ if (ret & VM_FAULT_OOM)
+ split_huge_page(page);
put_page(page);
goto out;
}
@@ -927,6 +929,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
+ split_huge_page(page);
put_page(page);
ret |= VM_FAULT_OOM;
goto out;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f7001ac..037f077 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -460,8 +460,10 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
+ unsigned int cpuset_mems_cookie;
- get_mems_allowed();
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
zonelist = huge_zonelist(vma, address,
htlb_alloc_mask, &mpol, &nodemask);
/*
@@ -488,10 +490,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
}
}
}
-err:
+
mpol_cond_put(mpol);
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
return page;
+
+err:
+ mpol_cond_put(mpol);
+ return NULL;
}
static void update_and_free_page(struct hstate *h, struct page *page)
@@ -2060,6 +2067,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
kref_get(&reservations->refs);
}
+static void resv_map_put(struct vm_area_struct *vma)
+{
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ if (!reservations)
+ return;
+ kref_put(&reservations->refs, resv_map_release);
+}
+
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
@@ -2075,7 +2091,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
reserve = (end - start) -
region_count(&reservations->regions, start, end);
- kref_put(&reservations->refs, resv_map_release);
+ resv_map_put(vma);
if (reserve) {
hugetlb_acct_memory(h, -reserve);
@@ -2285,6 +2301,22 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
{
mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
__unmap_hugepage_range(vma, start, end, ref_page);
+ /*
+ * Clear this flag so that x86's huge_pmd_share page_table_shareable
+ * test will fail on a vma being torn down, and not grab a page table
+ * on its way out. We're lucky that the flag has such an appropriate
+ * name, and can in fact be safely cleared here. We could clear it
+ * before the __unmap_hugepage_range above, but all that's necessary
+ * is to clear it before releasing the i_mmap_mutex below.
+ *
+ * This works because in the contexts this is called, the VMA is
+ * going to be destroyed. It is not vunerable to madvise(DONTNEED)
+ * because madvise is not supported on hugetlbfs. The same applies
+ * for direct IO. unmap_hugepage_range() is only being called just
+ * before free_pgtables() so clearing VM_MAYSHARE will not cause
+ * surprises later.
+ */
+ vma->vm_flags &= ~VM_MAYSHARE;
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
}
@@ -2398,7 +2430,6 @@ retry_avoidcopy:
if (outside_reserve) {
BUG_ON(huge_pte_none(pte));
if (unmap_ref_private(mm, vma, old_page, address)) {
- BUG_ON(page_count(old_page) != 1);
BUG_ON(huge_pte_none(pte));
spin_lock(&mm->page_table_lock);
goto retry_avoidcopy;
@@ -2838,9 +2869,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
}
}
spin_unlock(&mm->page_table_lock);
- mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+ /*
+ * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+ * may have cleared our pud entry and done put_page on the page table:
+ * once we release i_mmap_mutex, another task can do the final put_page
+ * and that page table be reused and filled with junk.
+ */
flush_tlb_range(vma, start, end);
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
}
int hugetlb_reserve_pages(struct inode *inode,
@@ -2878,12 +2914,16 @@ int hugetlb_reserve_pages(struct inode *inode,
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
}
- if (chg < 0)
- return chg;
+ if (chg < 0) {
+ ret = chg;
+ goto out_err;
+ }
/* There must be enough filesystem quota for the mapping */
- if (hugetlb_get_quota(inode->i_mapping, chg))
- return -ENOSPC;
+ if (hugetlb_get_quota(inode->i_mapping, chg)) {
+ ret = -ENOSPC;
+ goto out_err;
+ }
/*
* Check enough hugepages are available for the reservation.
@@ -2892,7 +2932,7 @@ int hugetlb_reserve_pages(struct inode *inode,
ret = hugetlb_acct_memory(h, chg);
if (ret < 0) {
hugetlb_put_quota(inode->i_mapping, chg);
- return ret;
+ goto out_err;
}
/*
@@ -2909,6 +2949,10 @@ int hugetlb_reserve_pages(struct inode *inode,
if (!vma || vma->vm_flags & VM_MAYSHARE)
region_add(&inode->i_mapping->private_list, from, to);
return 0;
+out_err:
+ if (vma)
+ resv_map_put(vma);
+ return ret;
}
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491..deabe5f6 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -13,6 +13,7 @@
#include <linux/hugetlb.h>
#include <linux/sched.h>
#include <linux/ksm.h>
+#include <linux/file.h>
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
@@ -197,14 +198,16 @@ static long madvise_remove(struct vm_area_struct *vma,
struct address_space *mapping;
loff_t offset, endoff;
int error;
+ struct file *f;
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
return -EINVAL;
- if (!vma->vm_file || !vma->vm_file->f_mapping
- || !vma->vm_file->f_mapping->host) {
+ f = vma->vm_file;
+
+ if (!f || !f->f_mapping || !f->f_mapping->host) {
return -EINVAL;
}
@@ -218,9 +221,16 @@ static long madvise_remove(struct vm_area_struct *vma,
endoff = (loff_t)(end - vma->vm_start - 1)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- /* vmtruncate_range needs to take i_mutex and i_alloc_sem */
+ /*
+ * vmtruncate_range may need to take i_mutex and i_alloc_sem.
+ * We need to explicitly grab a reference because the vma (and
+ * hence the vma's reference to the file) can go away as soon as
+ * we drop mmap_sem.
+ */
+ get_file(f);
up_read(&current->mm->mmap_sem);
error = vmtruncate_range(mapping->host, offset, endoff);
+ fput(f);
down_read(&current->mm->mmap_sem);
return error;
}
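
madvise_remove() has to drop mmap_sem before vmtruncate_range() may take i_mutex, but once mmap_sem is released the vma, and with it the vma's reference on the file, can disappear. The hunk pins the file explicitly; the same pattern applies to any code that keeps using vma->vm_file across an mmap_sem drop. Sketch of the pattern as used above:

	struct file *f = vma->vm_file;

	get_file(f);				/* own reference: the vma may vanish below */
	up_read(&current->mm->mmap_sem);

	error = vmtruncate_range(f->f_mapping->host, offset, endoff);

	fput(f);				/* drop our reference */
	down_read(&current->mm->mmap_sem);
	/* 'vma' must not be touched again without revalidating it (find_vma()) */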
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 283068f..57cdf5a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1251,7 +1251,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
struct list_head *dst,
unsigned long *scanned, int order,
- int mode, struct zone *z,
+ isolate_mode_t mode,
+ struct zone *z,
struct mem_cgroup *mem_cont,
int active, int file)
{
@@ -4605,6 +4606,12 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
swap_buffers:
/* Swap primary and spare array */
thresholds->spare = thresholds->primary;
+ /* If all events are unregistered, free the spare array */
+ if (!new) {
+ kfree(thresholds->spare);
+ thresholds->spare = NULL;
+ }
+
rcu_assign_pointer(thresholds->primary, new);
/* To be sure that nobody uses thresholds */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b4d627d..809823d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1338,8 +1338,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
/* Keep page count to indicate a given hugepage is isolated. */
list_add(&hpage->lru, &pagelist);
- ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
- true);
+ ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false,
+ MIGRATE_SYNC);
if (ret) {
struct page *page1, *page2;
list_for_each_entry_safe(page1, page2, &pagelist, lru)
@@ -1386,9 +1386,17 @@ int soft_offline_page(struct page *page, int flags)
{
int ret;
unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_trans_head(page);
if (PageHuge(page))
return soft_offline_huge_page(page, flags);
+ if (PageTransHuge(hpage)) {
+ if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
+ pr_info("soft offline: %#lx: failed to split THP\n",
+ pfn);
+ return -EBUSY;
+ }
+ }
ret = get_any_page(page, pfn, flags);
if (ret < 0)
@@ -1469,10 +1477,10 @@ int soft_offline_page(struct page *page, int flags)
list_add(&page->lru, &pagelist);
#ifndef CONFIG_DMA_CMA
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
- 0, true);
+ false, MIGRATE_SYNC);
#else
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
- 0, true, 0);
+ false, MIGRATE_SYNC, 0);
#endif
if (ret) {
putback_lru_pages(&pagelist);
diff --git a/mm/memory.c b/mm/memory.c
index 159985d..6204ce2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -208,10 +208,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
return 1;
}
+ if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+ return 0;
+
batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
if (!batch)
return 0;
+ tlb->batch_count++;
batch->next = NULL;
batch->nr = 0;
batch->max = MAX_GATHER_BATCH;
@@ -238,6 +242,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
tlb->local.nr = 0;
tlb->local.max = ARRAY_SIZE(tlb->__pages);
tlb->active = &tlb->local;
+ tlb->batch_count = 0;
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb->batch = NULL;
@@ -3568,6 +3573,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
+retry:
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
@@ -3581,13 +3587,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pmd, flags);
} else {
pmd_t orig_pmd = *pmd;
+ int ret;
+
barrier();
if (pmd_trans_huge(orig_pmd)) {
if (flags & FAULT_FLAG_WRITE &&
!pmd_write(orig_pmd) &&
- !pmd_trans_splitting(orig_pmd))
- return do_huge_pmd_wp_page(mm, vma, address,
- pmd, orig_pmd);
+ !pmd_trans_splitting(orig_pmd)) {
+ ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
+ orig_pmd);
+ /*
+ * If COW results in an oom, the huge pmd will
+ * have been split, so retry the fault on the
+ * pte for a smaller charge.
+ */
+ if (unlikely(ret & VM_FAULT_OOM))
+ goto retry;
+ return ret;
+ }
return 0;
}
}
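
Two separate fixes land in this file. The tlb_next_batch() hunk assumes a batch_count field and a MAX_GATHER_BATCH_COUNT cap added alongside the existing MAX_GATHER_BATCH in include/asm-generic/tlb.h (outside this diffstat); the upstream shape is roughly the following, with the exact numbers taken as an assumption for this tree:

	/*
	 * Cap how many batches one mmu_gather may allocate, so a huge unmap
	 * cannot keep allocating batch pages forever and trigger soft lockups
	 * on !CONFIG_PREEMPT; with the existing MAX_GATHER_BATCH this is
	 * roughly 10000 freed pages per TLB flush.
	 */
	#define MAX_GATHER_BATCH_COUNT	(10000UL / MAX_GATHER_BATCH)

	struct mmu_gather {
		/* ... existing fields ... */
		unsigned int	batch_count;	/* batches allocated so far */
	};

The handle_mm_fault() retry pairs with the mm/huge_memory.c hunks earlier: when COW of a huge pmd returns VM_FAULT_OOM the huge page has already been split, so the retried fault proceeds at pte granularity with a much smaller memcg charge instead of failing outright.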
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 348d7bf..9fcd1b9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -116,9 +116,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
struct mem_section *ms;
struct page *page, *memmap;
- if (!pfn_valid(start_pfn))
- return;
-
section_nr = pfn_to_section_nr(start_pfn);
ms = __nr_to_section(section_nr);
@@ -177,9 +174,16 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
end_pfn = pfn + pgdat->node_spanned_pages;
/* register_section info */
- for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
- register_page_bootmem_info_section(pfn);
-
+ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ /*
+ * Some platforms can assign the same pfn to multiple nodes - on
+ * node0 as well as nodeN. To avoid registering a pfn against
+ * multiple nodes we check that this pfn does not already
+ * reside in some other node.
+ */
+ if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
+ register_page_bootmem_info_section(pfn);
+ }
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
@@ -748,10 +752,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
/* this function returns # of failed pages */
#ifndef CONFIG_DMA_CMA
ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
- true, true);
+ true, MIGRATE_SYNC);
#else
ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
- true, true, 0);
+ true, MIGRATE_SYNC, 0);
#endif
if (ret)
putback_lru_pages(&source);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 703b42a..7065fb3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -606,24 +606,39 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
return first;
}
-/* Apply policy to a single VMA */
-static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
+/*
+ * Apply policy to a single VMA
+ * This must be called with the mmap_sem held for writing.
+ */
+static int vma_replace_policy(struct vm_area_struct *vma,
+ struct mempolicy *pol)
{
- int err = 0;
- struct mempolicy *old = vma->vm_policy;
+ int err;
+ struct mempolicy *old;
+ struct mempolicy *new;
pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
vma->vm_start, vma->vm_end, vma->vm_pgoff,
vma->vm_ops, vma->vm_file,
vma->vm_ops ? vma->vm_ops->set_policy : NULL);
- if (vma->vm_ops && vma->vm_ops->set_policy)
+ new = mpol_dup(pol);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ if (vma->vm_ops && vma->vm_ops->set_policy) {
err = vma->vm_ops->set_policy(vma, new);
- if (!err) {
- mpol_get(new);
- vma->vm_policy = new;
- mpol_put(old);
+ if (err)
+ goto err_out;
}
+
+ old = vma->vm_policy;
+ vma->vm_policy = new; /* protected by mmap_sem */
+ mpol_put(old);
+
+ return 0;
+ err_out:
+ mpol_put(new);
return err;
}
@@ -666,7 +681,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
if (err)
goto out;
}
- err = policy_vma(vma, new_pol);
+ err = vma_replace_policy(vma, new_pol);
if (err)
goto out;
}
@@ -934,10 +949,10 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
if (!list_empty(&pagelist)) {
#ifndef CONFIG_DMA_CMA
err = migrate_pages(&pagelist, new_node_page, dest,
- false, true);
+ false, MIGRATE_SYNC);
#else
err = migrate_pages(&pagelist, new_node_page, dest,
- false, true, 0);
+ false, MIGRATE_SYNC, 0);
#endif
if (err)
putback_lru_pages(&pagelist);
@@ -1507,8 +1522,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
addr);
if (vpol)
pol = vpol;
- } else if (vma->vm_policy)
+ } else if (vma->vm_policy) {
pol = vma->vm_policy;
+
+ /*
+ * shmem_alloc_page() passes MPOL_F_SHARED policy with
+ * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+ * count on these policies which will be dropped by
+ * mpol_cond_put() later
+ */
+ if (mpol_needs_cond_ref(pol))
+ mpol_get(pol);
+ }
}
if (!pol)
pol = &default_policy;
@@ -1828,18 +1853,24 @@ struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node)
{
- struct mempolicy *pol = get_vma_policy(current, vma, addr);
+ struct mempolicy *pol;
struct zonelist *zl;
struct page *page;
+ unsigned int cpuset_mems_cookie;
+
+retry_cpuset:
+ pol = get_vma_policy(current, vma, addr);
+ cpuset_mems_cookie = get_mems_allowed();
- get_mems_allowed();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
+
return page;
}
zl = policy_zonelist(gfp, pol, node);
@@ -1850,7 +1881,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
struct page *page = __alloc_pages_nodemask(gfp, order,
zl, policy_nodemask(gfp, pol));
__mpol_put(pol);
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
return page;
}
/*
@@ -1858,7 +1890,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
*/
page = __alloc_pages_nodemask(gfp, order, zl,
policy_nodemask(gfp, pol));
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
return page;
}
@@ -1885,11 +1918,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
struct page *page;
+ unsigned int cpuset_mems_cookie;
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
- get_mems_allowed();
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
@@ -1900,7 +1936,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
- put_mems_allowed();
+
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
+
return page;
}
EXPORT_SYMBOL(alloc_pages_current);
@@ -1945,28 +1984,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
return new;
}
-/*
- * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
- * eliminate the * MPOL_F_* flags that require conditional ref and
- * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
- * after return. Use the returned value.
- *
- * Allows use of a mempolicy for, e.g., multiple allocations with a single
- * policy lookup, even if the policy needs/has extra ref on lookup.
- * shmem_readahead needs this.
- */
-struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
- struct mempolicy *frompol)
-{
- if (!mpol_needs_cond_ref(frompol))
- return frompol;
-
- *tompol = *frompol;
- tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
- __mpol_put(frompol);
- return tompol;
-}
-
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
@@ -2003,7 +2020,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
*/
/* lookup first element intersecting start-end */
-/* Caller holds sp->lock */
+/* Caller holds sp->mutex */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
@@ -2067,36 +2084,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
if (!sp->root.rb_node)
return NULL;
- spin_lock(&sp->lock);
+ mutex_lock(&sp->mutex);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
- spin_unlock(&sp->lock);
+ mutex_unlock(&sp->mutex);
return pol;
}
+static void sp_free(struct sp_node *n)
+{
+ mpol_put(n->policy);
+ kmem_cache_free(sn_cache, n);
+}
+
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-l%lx\n", n->start, n->end);
rb_erase(&n->nd, &sp->root);
- mpol_put(n->policy);
- kmem_cache_free(sn_cache, n);
+ sp_free(n);
}
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
struct mempolicy *pol)
{
- struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+ struct sp_node *n;
+ struct mempolicy *newpol;
+ n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n)
return NULL;
+
+ newpol = mpol_dup(pol);
+ if (IS_ERR(newpol)) {
+ kmem_cache_free(sn_cache, n);
+ return NULL;
+ }
+ newpol->flags |= MPOL_F_SHARED;
+
n->start = start;
n->end = end;
- mpol_get(pol);
- pol->flags |= MPOL_F_SHARED; /* for unref */
- n->policy = pol;
+ n->policy = newpol;
+
return n;
}
@@ -2104,10 +2135,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
unsigned long end, struct sp_node *new)
{
- struct sp_node *n, *new2 = NULL;
+ struct sp_node *n;
+ int ret = 0;
-restart:
- spin_lock(&sp->lock);
+ mutex_lock(&sp->mutex);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
@@ -2120,16 +2151,14 @@ restart:
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
+ struct sp_node *new2;
+ new2 = sp_alloc(end, n->end, n->policy);
if (!new2) {
- spin_unlock(&sp->lock);
- new2 = sp_alloc(end, n->end, n->policy);
- if (!new2)
- return -ENOMEM;
- goto restart;
+ ret = -ENOMEM;
+ goto out;
}
n->end = start;
sp_insert(sp, new2);
- new2 = NULL;
break;
} else
n->end = start;
@@ -2140,12 +2169,9 @@ restart:
}
if (new)
sp_insert(sp, new);
- spin_unlock(&sp->lock);
- if (new2) {
- mpol_put(new2->policy);
- kmem_cache_free(sn_cache, new2);
- }
- return 0;
+out:
+ mutex_unlock(&sp->mutex);
+ return ret;
}
/**
@@ -2163,7 +2189,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
- spin_lock_init(&sp->lock);
+ mutex_init(&sp->mutex);
if (mpol) {
struct vm_area_struct pvma;
@@ -2217,7 +2243,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
}
err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
if (err && new)
- kmem_cache_free(sn_cache, new);
+ sp_free(new);
return err;
}
@@ -2229,16 +2255,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
if (!p->root.rb_node)
return;
- spin_lock(&p->lock);
+ mutex_lock(&p->mutex);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
- rb_erase(&n->nd, &p->root);
- mpol_put(n->policy);
- kmem_cache_free(sn_cache, n);
+ sp_delete(p, n);
}
- spin_unlock(&p->lock);
+ mutex_unlock(&p->mutex);
}
/* assumes fs == KERNEL_DS */
@@ -2295,8 +2319,7 @@ void numa_default_policy(void)
*/
/*
- * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
- * Used only for mpol_parse_str() and mpol_to_str()
+ * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
*/
#define MPOL_LOCAL MPOL_MAX
static const char * const policy_modes[] =
@@ -2311,28 +2334,21 @@ static const char * const policy_modes[] =
#ifdef CONFIG_TMPFS
/**
- * mpol_parse_str - parse string to mempolicy
+ * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
* @str: string containing mempolicy to parse
* @mpol: pointer to struct mempolicy pointer, returned on success.
- * @no_context: flag whether to "contextualize" the mempolicy
+ * @unused: redundant argument, to be removed later.
*
* Format of input:
* <mode>[=<flags>][:<nodelist>]
*
- * if @no_context is true, save the input nodemask in w.user_nodemask in
- * the returned mempolicy. This will be used to "clone" the mempolicy in
- * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
- * mount option. Note that if 'static' or 'relative' mode flags were
- * specified, the input nodemask will already have been saved. Saving
- * it again is redundant, but safe.
- *
* On success, returns 0, else 1
*/
-int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
+int mpol_parse_str(char *str, struct mempolicy **mpol, int unused)
{
struct mempolicy *new = NULL;
unsigned short mode;
- unsigned short uninitialized_var(mode_flags);
+ unsigned short mode_flags;
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
@@ -2420,24 +2436,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
if (IS_ERR(new))
goto out;
- if (no_context) {
- /* save for contextualization */
- new->w.user_nodemask = nodes;
- } else {
- int ret;
- NODEMASK_SCRATCH(scratch);
- if (scratch) {
- task_lock(current);
- ret = mpol_set_nodemask(new, &nodes, scratch);
- task_unlock(current);
- } else
- ret = -ENOMEM;
- NODEMASK_SCRATCH_FREE(scratch);
- if (ret) {
- mpol_put(new);
- goto out;
- }
- }
+ /*
+ * Save nodes for mpol_to_str() to show the tmpfs mount options
+ * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
+ */
+ if (mode != MPOL_PREFERRED)
+ new->v.nodes = nodes;
+ else if (nodelist)
+ new->v.preferred_node = first_node(nodes);
+ else
+ new->flags |= MPOL_F_LOCAL;
+
+ /*
+ * Save nodes for contextualization: this will be used to "clone"
+ * the mempolicy in a specific context [cpuset] at a later time.
+ */
+ new->w.user_nodemask = nodes;
+
err = 0;
out:
@@ -2457,13 +2472,13 @@ out:
* @buffer: to contain formatted mempolicy string
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
- * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
+ * @unused: redundant argument, to be removed later.
*
* Convert a mempolicy into a string.
* Returns the number of characters in buffer (if positive)
* or an error (negative)
*/
-int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int unused)
{
char *p = buffer;
int l;
@@ -2489,7 +2504,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
case MPOL_PREFERRED:
nodes_clear(nodes);
if (flags & MPOL_F_LOCAL)
- mode = MPOL_LOCAL; /* pseudo-policy */
+ mode = MPOL_LOCAL;
else
node_set(pol->v.preferred_node, nodes);
break;
@@ -2497,14 +2512,11 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
- if (no_context)
- nodes = pol->w.user_nodemask;
- else
- nodes = pol->v.nodes;
+ nodes = pol->v.nodes;
break;
default:
- BUG();
+ return -EINVAL;
}
l = strlen(policy_modes[mode]);
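
Several of the mempolicy hunks only make sense together with a header change that this 'mm'-limited diffstat cannot show: struct shared_policy trades its spinlock for a mutex, so shared_policy_replace() can call sp_alloc()/mpol_dup() (GFP_KERNEL, may sleep) while holding the lock instead of the old unlock-allocate-relock-restart dance. Assumed shape of the header side, per the upstream fix:

	/* include/linux/mempolicy.h (assumed companion change, not shown here) */
	struct shared_policy {
		struct rb_root	root;
		struct mutex	mutex;		/* was: spinlock_t lock */
	};

In the same spirit, vma_replace_policy() installs a private copy of the policy (mpol_dup()) rather than sharing the caller's mempolicy across every vma that mbind() touches, which is what previously corrupted the reference counts on shared policies.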
diff --git a/mm/migrate.c b/mm/migrate.c
index 14d0a6a..480714b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -220,6 +220,56 @@ out:
pte_unmap_unlock(ptep, ptl);
}
+#ifdef CONFIG_BLOCK
+/* Returns true if all buffers are successfully locked */
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+ enum migrate_mode mode)
+{
+ struct buffer_head *bh = head;
+
+ /* Simple case, sync compaction */
+ if (mode != MIGRATE_ASYNC) {
+ do {
+ get_bh(bh);
+ lock_buffer(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ return true;
+ }
+
+ /* async case, we cannot block on lock_buffer so use trylock_buffer */
+ do {
+ get_bh(bh);
+ if (!trylock_buffer(bh)) {
+ /*
+ * We failed to lock the buffer and cannot stall in
+ * async migration. Release the taken locks
+ */
+ struct buffer_head *failed_bh = bh;
+ put_bh(failed_bh);
+ bh = head;
+ while (bh != failed_bh) {
+ unlock_buffer(bh);
+ put_bh(bh);
+ bh = bh->b_this_page;
+ }
+ return false;
+ }
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+ return true;
+}
+#else
+static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
+ enum migrate_mode mode)
+{
+ return true;
+}
+#endif /* CONFIG_BLOCK */
+
/*
* Replace the page in the mapping.
*
@@ -229,7 +279,8 @@ out:
* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
*/
static int migrate_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ struct buffer_head *head, enum migrate_mode mode)
{
int expected_count;
void **pslot;
@@ -259,6 +310,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
}
/*
+ * In the async migration case of moving a page with buffers, lock the
+ * buffers using trylock before the mapping is moved. If the mapping
+ * was moved, we later failed to lock the buffers and could not move
+ * the mapping back due to an elevated page count, we would have to
+ * block waiting on other references to be dropped.
+ */
+ if (mode == MIGRATE_ASYNC && head &&
+ !buffer_migrate_lock_buffers(head, mode)) {
+ page_unfreeze_refs(page, expected_count);
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ /*
* Now we know that no one else is looking at the page.
*/
get_page(newpage); /* add cache reference */
@@ -415,13 +480,14 @@ EXPORT_SYMBOL(fail_migrate_page);
* Pages are locked upon entry and exit.
*/
int migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
int rc;
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- rc = migrate_page_move_mapping(mapping, newpage, page);
+ rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
if (rc)
return rc;
@@ -438,28 +504,28 @@ EXPORT_SYMBOL(migrate_page);
* exist.
*/
int buffer_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page, enum migrate_mode mode)
{
struct buffer_head *bh, *head;
int rc;
if (!page_has_buffers(page))
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
head = page_buffers(page);
- rc = migrate_page_move_mapping(mapping, newpage, page);
+ rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
if (rc)
return rc;
- bh = head;
- do {
- get_bh(bh);
- lock_buffer(bh);
- bh = bh->b_this_page;
-
- } while (bh != head);
+ /*
+ * In the async case, migrate_page_move_mapping locked the buffers
+ * with an IRQ-safe spinlock held. In the sync case, the buffers
+ * need to be locked now
+ */
+ if (mode != MIGRATE_ASYNC)
+ BUG_ON(!buffer_migrate_lock_buffers(head, mode));
ClearPagePrivate(page);
set_page_private(newpage, page_private(page));
@@ -536,10 +602,14 @@ static int writeout(struct address_space *mapping, struct page *page)
* Default handling if a filesystem does not provide a migration function.
*/
static int fallback_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page, enum migrate_mode mode)
{
- if (PageDirty(page))
+ if (PageDirty(page)) {
+ /* Only writeback pages in full synchronous migration */
+ if (mode != MIGRATE_SYNC)
+ return -EBUSY;
return writeout(mapping, page);
+ }
/*
* Buffers may be managed in a filesystem specific way.
@@ -549,7 +619,7 @@ static int fallback_migrate_page(struct address_space *mapping,
!try_to_release_page(page, GFP_KERNEL))
return -EAGAIN;
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
}
/*
@@ -564,7 +634,7 @@ static int fallback_migrate_page(struct address_space *mapping,
* == 0 - success
*/
static int move_to_new_page(struct page *newpage, struct page *page,
- int remap_swapcache, bool sync)
+ int remap_swapcache, enum migrate_mode mode)
{
struct address_space *mapping;
int rc;
@@ -585,29 +655,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
mapping = page_mapping(page);
if (!mapping)
- rc = migrate_page(mapping, newpage, page);
- else {
+ rc = migrate_page(mapping, newpage, page, mode);
+ else if (mapping->a_ops->migratepage)
/*
- * Do not writeback pages if !sync and migratepage is
- * not pointing to migrate_page() which is nonblocking
- * (swapcache/tmpfs uses migratepage = migrate_page).
+ * Most pages have a mapping and most filesystems provide a
+ * migratepage callback. Anonymous pages are part of swap
+ * space which also has its own migratepage callback. This
+ * is the most common path for page migration.
*/
- if (PageDirty(page) && !sync &&
- mapping->a_ops->migratepage != migrate_page)
- rc = -EBUSY;
- else if (mapping->a_ops->migratepage)
- /*
- * Most pages have a mapping and most filesystems
- * should provide a migration function. Anonymous
- * pages are part of swap space which also has its
- * own migration function. This is the most common
- * path for page migration.
- */
- rc = mapping->a_ops->migratepage(mapping,
- newpage, page);
- else
- rc = fallback_migrate_page(mapping, newpage, page);
- }
+ rc = mapping->a_ops->migratepage(mapping,
+ newpage, page, mode);
+ else
+ rc = fallback_migrate_page(mapping, newpage, page, mode);
if (rc) {
newpage->mapping = NULL;
@@ -621,38 +680,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
return rc;
}
-/*
- * Obtain the lock on page, remove all ptes and migrate the page
- * to the newly allocated page in newpage.
- */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
- struct page *page, int force, bool offlining, bool sync)
+static int __unmap_and_move(struct page *page, struct page *newpage,
+ int force, bool offlining, enum migrate_mode mode)
{
- int rc = 0;
- int *result = NULL;
- struct page *newpage = get_new_page(page, private, &result);
+ int rc = -EAGAIN;
int remap_swapcache = 1;
int charge = 0;
struct mem_cgroup *mem;
struct anon_vma *anon_vma = NULL;
- if (!newpage)
- return -ENOMEM;
-
- if (page_count(page) == 1) {
- /* page was freed from under us. So we are done. */
- goto move_newpage;
- }
- if (unlikely(PageTransHuge(page)))
- if (unlikely(split_huge_page(page)))
- goto move_newpage;
-
- /* prepare cgroup just returns 0 or -ENOMEM */
- rc = -EAGAIN;
-
if (!trylock_page(page)) {
- if (!force || !sync)
- goto move_newpage;
+ if (!force || mode == MIGRATE_ASYNC)
+ goto out;
/*
* It's not safe for direct compaction to call lock_page.
@@ -668,7 +707,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
* altogether.
*/
if (current->flags & PF_MEMALLOC)
- goto move_newpage;
+ goto out;
lock_page(page);
}
@@ -697,10 +736,12 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
if (PageWriteback(page)) {
/*
- * For !sync, there is no point retrying as the retry loop
- * is expected to be too short for PageWriteback to be cleared
+ * Only in the case of a full syncronous migration is it
+ * necessary to wait for PageWriteback. In the async case,
+ * the retry loop is too short and in the sync-light case,
+ * the overhead of stalling is too much
*/
- if (!sync) {
+ if (mode != MIGRATE_SYNC) {
rc = -EBUSY;
goto uncharge;
}
@@ -771,7 +812,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
skip_unmap:
if (!page_mapped(page))
- rc = move_to_new_page(newpage, page, remap_swapcache, sync);
+ rc = move_to_new_page(newpage, page, remap_swapcache, mode);
if (rc && remap_swapcache)
remove_migration_ptes(page, page);
@@ -785,27 +826,53 @@ uncharge:
mem_cgroup_end_migration(mem, page, newpage, rc == 0);
unlock:
unlock_page(page);
+out:
+ return rc;
+}
-move_newpage:
+/*
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ */
+static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+ struct page *page, int force, bool offlining,
+ enum migrate_mode mode)
+{
+ int rc = 0;
+ int *result = NULL;
+ struct page *newpage = get_new_page(page, private, &result);
+
+ if (!newpage)
+ return -ENOMEM;
+
+ if (page_count(page) == 1) {
+ /* page was freed from under us. So we are done. */
+ goto out;
+ }
+
+ if (unlikely(PageTransHuge(page)))
+ if (unlikely(split_huge_page(page)))
+ goto out;
+
+ rc = __unmap_and_move(page, newpage, force, offlining, mode);
+out:
if (rc != -EAGAIN) {
- /*
- * A page that has been migrated has all references
- * removed and will be freed. A page that has not been
- * migrated will have kepts its references and be
- * restored.
- */
- list_del(&page->lru);
+ /*
+ * A page that has been migrated has all references
+ * removed and will be freed. A page that has not been
+ * migrated will have kepts its references and be
+ * restored.
+ */
+ list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
putback_lru_page(page);
}
-
/*
* Move the new page to the LRU. If migration was not successful
* then this will free the page.
*/
putback_lru_page(newpage);
-
if (result) {
if (rc)
*result = rc;
@@ -835,7 +902,8 @@ move_newpage:
*/
static int unmap_and_move_huge_page(new_page_t get_new_page,
unsigned long private, struct page *hpage,
- int force, bool offlining, bool sync)
+ int force, bool offlining,
+ enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
@@ -848,7 +916,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
rc = -EAGAIN;
if (!trylock_page(hpage)) {
- if (!force || !sync)
+ if (!force || mode != MIGRATE_SYNC)
goto out;
lock_page(hpage);
}
@@ -859,7 +927,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
if (!page_mapped(hpage))
- rc = move_to_new_page(new_hpage, hpage, 1, sync);
+ rc = move_to_new_page(new_hpage, hpage, 1, mode);
if (rc)
remove_migration_ptes(hpage, hpage);
@@ -902,7 +970,7 @@ out:
*/
int migrate_pages(struct list_head *from,
new_page_t get_new_page, unsigned long private, bool offlining,
- bool sync)
+ enum migrate_mode mode)
{
int retry = 1;
int nr_failed = 0;
@@ -923,7 +991,7 @@ int migrate_pages(struct list_head *from,
rc = unmap_and_move(get_new_page, private,
page, pass > 2, offlining,
- sync);
+ mode);
switch(rc) {
case -ENOMEM:
@@ -953,7 +1021,7 @@ out:
int migrate_huge_pages(struct list_head *from,
new_page_t get_new_page, unsigned long private, bool offlining,
- bool sync)
+ enum migrate_mode mode)
{
int retry = 1;
int nr_failed = 0;
@@ -970,7 +1038,7 @@ int migrate_huge_pages(struct list_head *from,
rc = unmap_and_move_huge_page(get_new_page,
private, page, pass > 2, offlining,
- sync);
+ mode);
switch(rc) {
case -ENOMEM:
@@ -1099,7 +1167,7 @@ set_status:
err = 0;
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_page_node,
- (unsigned long)pm, 0, true);
+ (unsigned long)pm, 0, MIGRATE_SYNC);
if (err)
putback_lru_pages(&pagelist);
}
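
Every bool sync parameter above becomes an enum migrate_mode. The enum itself is expected to arrive via the merged include/linux/migrate.h (listed under Conflicts) from include/linux/migrate_mode.h, so the definition below is quoted from the upstream change rather than from this diff:

	/*
	 * MIGRATE_ASYNC       never blocks
	 * MIGRATE_SYNC_LIGHT  may block on most operations, but not on
	 *                     ->writepage, where the stall would be too long
	 * MIGRATE_SYNC        fully synchronous, may block anywhere
	 */
	enum migrate_mode {
		MIGRATE_ASYNC,
		MIGRATE_SYNC_LIGHT,
		MIGRATE_SYNC,
	};

Compaction (mm/compaction.c above) maps cc->sync onto MIGRATE_SYNC_LIGHT, while memory hot-remove, mempolicy migration and soft offlining ask for full MIGRATE_SYNC.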
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8d032de..71c7811 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
void __mmu_notifier_release(struct mm_struct *mm)
{
struct mmu_notifier *mn;
+ struct hlist_node *n;
+
+ /*
+ * RCU here will block mmu_notifier_unregister until
+ * ->release returns.
+ */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
+ /*
+ * if ->release runs before mmu_notifier_unregister it
+ * must be handled as it's the only way for the driver
+ * to flush all existing sptes and stop the driver
+ * from establishing any more sptes before all the
+ * pages in the mm are freed.
+ */
+ if (mn->ops->release)
+ mn->ops->release(mn, mm);
+ rcu_read_unlock();
spin_lock(&mm->mmu_notifier_mm->lock);
while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
* mmu_notifier_unregister to return.
*/
hlist_del_init_rcu(&mn->hlist);
- /*
- * RCU here will block mmu_notifier_unregister until
- * ->release returns.
- */
- rcu_read_lock();
- spin_unlock(&mm->mmu_notifier_mm->lock);
- /*
- * if ->release runs before mmu_notifier_unregister it
- * must be handled as it's the only way for the driver
- * to flush all existing sptes and stop the driver
- * from establishing any more sptes before all the
- * pages in the mm are freed.
- */
- if (mn->ops->release)
- mn->ops->release(mn, mm);
- rcu_read_unlock();
- spin_lock(&mm->mmu_notifier_mm->lock);
}
spin_unlock(&mm->mmu_notifier_mm->lock);
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
{
BUG_ON(atomic_read(&mm->mm_count) <= 0);
- spin_lock(&mm->mmu_notifier_mm->lock);
if (!hlist_unhashed(&mn->hlist)) {
- hlist_del_rcu(&mn->hlist);
-
/*
* RCU here will force exit_mmap to wait ->release to finish
* before freeing the pages.
*/
rcu_read_lock();
- spin_unlock(&mm->mmu_notifier_mm->lock);
+
/*
* exit_mmap will block in mmu_notifier_release to
* guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
if (mn->ops->release)
mn->ops->release(mn, mm);
rcu_read_unlock();
- } else
+
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ hlist_del_rcu(&mn->hlist);
spin_unlock(&mm->mmu_notifier_mm->lock);
+ }
/*
* Wait any running method to finish, of course including
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 6e93dc7..e39e3ef 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -83,8 +83,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
- int i;
- unsigned long start_aligned, end_aligned;
+ unsigned long i, start_aligned, end_aligned;
int order = ilog2(BITS_PER_LONG);
start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 54a4782..d1a7b6d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -565,7 +565,7 @@ static inline void __free_one_page(struct page *page,
combined_idx = buddy_idx & page_idx;
higher_page = page + (combined_idx - page_idx);
buddy_idx = __find_buddy_index(combined_idx, order + 1);
- higher_buddy = page + (buddy_idx - combined_idx);
+ higher_buddy = higher_page + (buddy_idx - combined_idx);
if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
@@ -1952,14 +1952,20 @@ static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress,
- bool sync_migration)
+ int migratetype, bool sync_migration,
+ bool *deferred_compaction,
+ unsigned long *did_some_progress)
{
struct page *page;
- if (!order || compaction_deferred(preferred_zone))
+ if (!order)
return NULL;
+ if (compaction_deferred(preferred_zone)) {
+ *deferred_compaction = true;
+ return NULL;
+ }
+
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
nodemask, sync_migration);
@@ -1987,7 +1993,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
* but not enough to satisfy watermarks.
*/
count_vm_event(COMPACTFAIL);
- defer_compaction(preferred_zone);
+
+ /*
+ * As async compaction considers a subset of pageblocks, only
+ * defer if the failure was a sync compaction failure.
+ */
+ if (sync_migration)
+ defer_compaction(preferred_zone);
cond_resched();
}
@@ -1999,8 +2011,9 @@ static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress,
- bool sync_migration)
+ int migratetype, bool sync_migration,
+ bool *deferred_compaction,
+ unsigned long *did_some_progress)
{
return NULL;
}
@@ -2153,6 +2166,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
#ifdef CONFIG_ANDROID_WIP
unsigned long start_tick = jiffies;
#endif
+ bool deferred_compaction = false;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -2236,12 +2250,22 @@ rebalance:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress,
- sync_migration);
+ migratetype, sync_migration,
+ &deferred_compaction,
+ &did_some_progress);
if (page)
goto got_pg;
sync_migration = true;
+ /*
+ * If compaction is deferred for high-order allocations, it is because
+ * sync compaction recently failed. In this is the case and the caller
+ * has requested the system not be heavily disrupted, fail the
+ * allocation now instead of entering direct reclaim
+ */
+ if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+ goto nopage;
+
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
@@ -2323,8 +2347,9 @@ rebalance:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress,
- sync_migration);
+ migratetype, sync_migration,
+ &deferred_compaction,
+ &did_some_progress);
if (page)
goto got_pg;
}
@@ -2352,8 +2377,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zone *preferred_zone;
- struct page *page;
+ struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
+ unsigned int cpuset_mems_cookie;
gfp_mask &= gfp_allowed_mask;
@@ -2372,15 +2398,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
- get_mems_allowed();
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+
/* The preferred zone is used for statistics later */
first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone);
- if (!preferred_zone) {
- put_mems_allowed();
- return NULL;
- }
+ if (!preferred_zone)
+ goto out;
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2390,9 +2416,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
- put_mems_allowed();
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+
+out:
+ /*
+ * When updating a task's mems_allowed, it is possible to race with
+ * parallel threads in such a way that an allocation can fail while
+ * the mask is being updated. If a page allocation is about to fail,
+ * check if the cpuset changed during allocation and if so, retry.
+ */
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
+
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2616,13 +2652,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
bool skip_free_areas_node(unsigned int flags, int nid)
{
bool ret = false;
+ unsigned int cpuset_mems_cookie;
if (!(flags & SHOW_MEM_FILTER_NODES))
goto out;
- get_mems_allowed();
- ret = !node_isset(nid, cpuset_current_mems_allowed);
- put_mems_allowed();
+ do {
+ cpuset_mems_cookie = get_mems_allowed();
+ ret = !node_isset(nid, cpuset_current_mems_allowed);
+ } while (!put_mems_allowed(cpuset_mems_cookie));
out:
return ret;
}
@@ -3533,25 +3571,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
if (page_to_nid(page) != zone_to_nid(zone))
continue;
- /* Blocks with reserved pages will never free, skip them. */
- block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
- if (pageblock_is_reserved(pfn, block_end_pfn))
- continue;
-
block_migratetype = get_pageblock_migratetype(page);
- /* If this block is reserved, account for it */
- if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
- reserve--;
- continue;
- }
+ /* Only test what is necessary when the reserves are not met */
+ if (reserve > 0) {
+ /*
+ * Blocks with reserved pages will never free, skip
+ * them.
+ */
+ block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+ if (pageblock_is_reserved(pfn, block_end_pfn))
+ continue;
- /* Suitable for reserving if this block is movable */
- if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
- set_pageblock_migratetype(page, MIGRATE_RESERVE);
- move_freepages_block(zone, page, MIGRATE_RESERVE);
- reserve--;
- continue;
+ /* If this block is reserved, account for it */
+ if (block_migratetype == MIGRATE_RESERVE) {
+ reserve--;
+ continue;
+ }
+
+ /* Suitable for reserving if this block is movable */
+ if (block_migratetype == MIGRATE_MOVABLE) {
+ set_pageblock_migratetype(page,
+ MIGRATE_RESERVE);
+ move_freepages_block(zone, page,
+ MIGRATE_RESERVE);
+ reserve--;
+ continue;
+ }
}
/*
@@ -5562,7 +5608,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
pfn &= (PAGES_PER_SECTION-1);
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#else
- pfn = pfn - zone->zone_start_pfn;
+ pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#endif /* CONFIG_SPARSEMEM */
}
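
The pfn_to_bitidx() one-liner is easy to misread: on !CONFIG_SPARSEMEM the pageblock-bitmap index was computed relative to zone->zone_start_pfn, so a zone that does not begin on a pageblock boundary makes two different pageblocks share, and corrupt, the same migratetype bits. A standalone arithmetic check with a hypothetical unaligned zone start and pageblock_order = 10 (numbers are illustrative only):

	#include <stdio.h>

	#define PAGEBLOCK_ORDER	10
	#define PAGEBLOCK_NR	(1UL << PAGEBLOCK_ORDER)

	static unsigned long round_down_pb(unsigned long pfn)
	{
		return pfn & ~(PAGEBLOCK_NR - 1);
	}

	int main(void)
	{
		unsigned long zone_start = 1027;	/* not pageblock aligned */
		unsigned long a = 2047;			/* last pfn of pageblock 1 */
		unsigned long b = 2048;			/* first pfn of pageblock 2 */

		/* old formula: both pfns land in bit group 0 and alias each other */
		printf("old: %lu %lu\n", (a - zone_start) >> PAGEBLOCK_ORDER,
					 (b - zone_start) >> PAGEBLOCK_ORDER);

		/* fixed formula: distinct pageblocks get distinct bit groups */
		printf("new: %lu %lu\n",
		       (a - round_down_pb(zone_start)) >> PAGEBLOCK_ORDER,
		       (b - round_down_pb(zone_start)) >> PAGEBLOCK_ORDER);
		return 0;
	}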
diff --git a/mm/percpu.c b/mm/percpu.c
index 0ae7a09..af0cc7a 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1630,6 +1630,16 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
areas[group] = ptr;
base = min(ptr, base);
+ }
+
+ /*
+ * Copy data and free unused parts. This should happen after all
+ * allocations are complete; otherwise, we may end up with
+ * overlapping groups.
+ */
+ for (group = 0; group < ai->nr_groups; group++) {
+ struct pcpu_group_info *gi = &ai->groups[group];
+ void *ptr = areas[group];
for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
if (gi->cpu_map[i] == NR_CPUS) {
diff --git a/mm/rmap.c b/mm/rmap.c
index 23295f6..30e44cb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -57,6 +57,7 @@
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
+#include <linux/backing-dev.h>
#include <asm/tlbflush.h>
@@ -936,11 +937,8 @@ int page_mkclean(struct page *page)
if (page_mapped(page)) {
struct address_space *mapping = page_mapping(page);
- if (mapping) {
+ if (mapping)
ret = page_mkclean_file(mapping, page);
- if (page_test_and_clear_dirty(page_to_pfn(page), 1))
- ret = 1;
- }
}
return ret;
@@ -1121,6 +1119,8 @@ void page_add_file_rmap(struct page *page)
*/
void page_remove_rmap(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
+
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
return;
@@ -1131,8 +1131,19 @@ void page_remove_rmap(struct page *page)
* this if the page is anon, so about to be freed; but perhaps
* not if it's in swapcache - there might be another pte slot
* containing the swap entry, but page not yet written to swap.
+ *
+ * And we can skip it on file pages, so long as the filesystem
+ * participates in dirty tracking; but need to catch shm and tmpfs
+ * and ramfs pages which have been modified since creation by read
+ * fault.
+ *
+ * Note that mapping must be decided above, before decrementing
+ * mapcount (which luckily provides a barrier): once page is unmapped,
+ * it could be truncated and page->mapping reset to NULL at any moment.
+ * Note also that we are relying on page_mapping(page) to set mapping
+ * to &swapper_space when PageSwapCache(page).
*/
- if ((!PageAnon(page) || PageSwapCache(page)) &&
+ if (mapping && !mapping_cap_account_dirty(mapping) &&
page_test_and_clear_dirty(page_to_pfn(page), 1))
set_page_dirty(page);
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index e76f74e..818947b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1177,19 +1177,20 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
struct shmem_inode_info *info, unsigned long idx)
{
- struct mempolicy mpol, *spol;
struct vm_area_struct pvma;
struct page *page;
- spol = mpol_cond_copy(&mpol,
- mpol_shared_policy_lookup(&info->policy, idx));
-
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
pvma.vm_pgoff = idx;
pvma.vm_ops = NULL;
- pvma.vm_policy = spol;
+ pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
+
page = swapin_readahead(entry, gfp, &pvma, 0);
+
+ /* Drop reference taken by mpol_shared_policy_lookup() */
+ mpol_cond_put(pvma.vm_policy);
+
return page;
}
@@ -1197,6 +1198,7 @@ static struct page *shmem_alloc_page(gfp_t gfp,
struct shmem_inode_info *info, unsigned long idx)
{
struct vm_area_struct pvma;
+ struct page *page;
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
@@ -1204,10 +1206,12 @@ static struct page *shmem_alloc_page(gfp_t gfp,
pvma.vm_ops = NULL;
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
- /*
- * alloc_page_vma() will drop the shared policy reference
- */
- return alloc_page_vma(gfp, &pvma, 0);
+ page = alloc_page_vma(gfp, &pvma, 0);
+
+ /* Drop reference taken by mpol_shared_policy_lookup() */
+ mpol_cond_put(pvma.vm_policy);
+
+ return page;
}
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
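
Both shmem hunks above switch to the same discipline: the shared-policy lookup hands back its result with a reference held, and the caller drops that reference once the allocation no longer needs the policy. A rough userspace analogue of that lookup-then-conditional-put contract; the struct and helper names below are invented for illustration and are not the mempolicy API:

#include <stdlib.h>

struct policy {
        int refcount;                   /* > 0 while someone may still use it */
};

/* Hands back the policy with an extra reference, like the lookup above. */
static struct policy *policy_lookup(struct policy *shared)
{
        if (shared)
                shared->refcount++;
        return shared;
}

/* Conditional put: tolerates NULL, frees on the last reference. */
static void policy_cond_put(struct policy *pol)
{
        if (pol && --pol->refcount == 0)
                free(pol);
}

static void allocate_under_policy(struct policy *shared)
{
        struct policy *pol = policy_lookup(shared);

        /* ... perform the allocation while the reference pins the policy ... */

        policy_cond_put(pol);           /* drop the reference the lookup took */
}

int main(void)
{
        struct policy *p = calloc(1, sizeof(*p));

        if (!p)
                return 1;
        p->refcount = 1;                /* reference held by the shared owner */
        allocate_under_policy(p);
        policy_cond_put(p);             /* owner's reference, frees the policy */
        return 0;
}
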
@@ -2357,12 +2361,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
{
struct inode *inode;
struct dentry *dentry = NULL;
- u64 inum = fid->raw[2];
- inum = (inum << 32) | fid->raw[1];
+ u64 inum;
if (fh_len < 3)
return NULL;
+ inum = fid->raw[2];
+ inum = (inum << 32) | fid->raw[1];
+
inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
shmem_match, fid->raw);
if (inode) {
diff --git a/mm/slab.c b/mm/slab.c
index d96e223..a67f812 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3218,12 +3218,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL;
nid_alloc = nid_here = numa_mem_id();
- get_mems_allowed();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
nid_alloc = cpuset_slab_spread_node();
else if (current->mempolicy)
nid_alloc = slab_node(current->mempolicy);
- put_mems_allowed();
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
@@ -3246,14 +3244,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
enum zone_type high_zoneidx = gfp_zone(flags);
void *obj = NULL;
int nid;
+ unsigned int cpuset_mems_cookie;
if (flags & __GFP_THISNODE)
return NULL;
- get_mems_allowed();
- zonelist = node_zonelist(slab_node(current->mempolicy), flags);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+ zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+
retry:
/*
* Look through allowed nodes for objects available
@@ -3306,7 +3307,9 @@ retry:
}
}
}
- put_mems_allowed();
+
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
+ goto retry_cpuset;
return obj;
}
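
The cookie returned by get_mems_allowed() and checked by put_mems_allowed() acts like a sequence count over the cpuset's allowed-node mask: if the mask changed while the allocation was in flight and the allocation came back empty, the whole attempt is retried. A stripped-down, single-threaded sketch of that retry shape; the helpers here are stand-ins with invented names, not the kernel API:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static unsigned int mems_seqcount;      /* bumped whenever the allowed-node mask changes */

static unsigned int get_cookie(void)    /* stand-in for get_mems_allowed() */
{
        return mems_seqcount;
}

static bool cookie_unchanged(unsigned int cookie)       /* stand-in for put_mems_allowed() */
{
        return cookie == mems_seqcount;
}

static void *try_alloc(void)            /* stand-in for the zonelist walk */
{
        return NULL;                    /* pretend every allowed node was exhausted */
}

static void *alloc_with_retry(void)
{
        unsigned int cookie;
        void *obj;

        do {
                cookie = get_cookie();
                obj = try_alloc();
                /*
                 * Retry only when the attempt failed and the mask moved
                 * underneath us; a successful allocation is kept even if
                 * it raced with a cpuset update.
                 */
        } while (!cookie_unchanged(cookie) && !obj);

        return obj;
}

int main(void)
{
        printf("%p\n", alloc_with_retry());
        return 0;
}
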
diff --git a/mm/slub.c b/mm/slub.c
index fa7c546..6b13a9c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1467,6 +1467,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(flags);
struct page *page;
+ unsigned int cpuset_mems_cookie;
/*
* The defrag ratio allows a configuration of the tradeoffs between
@@ -1490,23 +1491,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
get_cycles() % 1024 > s->remote_node_defrag_ratio)
return NULL;
- get_mems_allowed();
- zonelist = node_zonelist(slab_node(current->mempolicy), flags);
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
- struct kmem_cache_node *n;
-
- n = get_node(s, zone_to_nid(zone));
-
- if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
- n->nr_partial > s->min_partial) {
- page = get_partial_node(n);
- if (page) {
- put_mems_allowed();
- return page;
+ do {
+ cpuset_mems_cookie = get_mems_allowed();
+ zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ struct kmem_cache_node *n;
+
+ n = get_node(s, zone_to_nid(zone));
+
+ if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
+ n->nr_partial > s->min_partial) {
+ page = get_partial_node(n);
+ if (page) {
+ /*
+ * Return the object even if
+ * put_mems_allowed indicated that
+ * the cpuset mems_allowed was
+ * updated in parallel. It's a
+ * harmless race between the alloc
+ * and the cpuset update.
+ */
+ put_mems_allowed(cpuset_mems_cookie);
+ return page;
+ }
}
}
- }
- put_mems_allowed();
+ } while (!put_mems_allowed(cpuset_mems_cookie));
#endif
return NULL;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 4cd05e5..9054f83 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -619,7 +619,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
{
return; /* XXX: Not implemented yet */
}
-static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
}
#else
@@ -660,10 +660,11 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
get_order(sizeof(struct page) * nr_pages));
}
-static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
unsigned long maps_section_nr, removing_section_nr, i;
unsigned long magic;
+ struct page *page = virt_to_page(memmap);
for (i = 0; i < nr_pages; i++, page++) {
magic = (unsigned long) page->lru.next;
@@ -712,13 +713,10 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
*/
if (memmap) {
- struct page *memmap_page;
- memmap_page = virt_to_page(memmap);
-
nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
>> PAGE_SHIFT;
- free_map_bootmem(memmap_page, nr_pages);
+ free_map_bootmem(memmap, nr_pages);
}
}
diff --git a/mm/truncate.c b/mm/truncate.c
index ca54477..6a11acb 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -404,11 +404,12 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
+ clear_page_mlock(page);
+
spin_lock_irq(&mapping->tree_lock);
if (PageDirty(page))
goto failed;
- clear_page_mlock(page);
BUG_ON(page_has_private(page));
__delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d213fa1..a819c35 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -256,7 +256,7 @@ struct vmap_area {
struct rb_node rb_node; /* address sorted rbtree */
struct list_head list; /* address sorted list */
struct list_head purge_list; /* "lazy purge" list */
- void *private;
+ struct vm_struct *vm;
struct rcu_head rcu_head;
};
@@ -1174,9 +1174,10 @@ void __init vmalloc_init(void)
/* Import existing vmlist entries. */
for (tmp = vmlist; tmp; tmp = tmp->next) {
va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
- va->flags = tmp->flags | VM_VM_AREA;
+ va->flags = VM_VM_AREA;
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;
+ va->vm = tmp;
__insert_vmap_area(va);
}
@@ -1279,6 +1280,7 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
vm->task_name = current->comm;
#endif
va->private = vm;
+ va->vm = vm;
va->flags |= VM_VM_AREA;
}
@@ -1401,7 +1403,7 @@ static struct vm_struct *find_vm_area(const void *addr)
va = find_vmap_area((unsigned long)addr);
if (va && va->flags & VM_VM_AREA)
- return va->private;
+ return va->vm;
return NULL;
}
@@ -1420,7 +1422,7 @@ struct vm_struct *remove_vm_area(const void *addr)
va = find_vmap_area((unsigned long)addr);
if (va && va->flags & VM_VM_AREA) {
- struct vm_struct *vm = va->private;
+ struct vm_struct *vm = va->vm;
if (!(vm->flags & VM_UNLIST)) {
struct vm_struct *tmp, **p;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f6979dc..99082fa 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -248,35 +248,66 @@ unsigned long shrink_slab(struct shrink_control *shrink,
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
- unsigned long total_scan;
- unsigned long max_pass;
+ long total_scan;
+ long max_pass;
+ int shrink_ret = 0;
+ long nr;
+ long new_nr;
max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+ if (max_pass <= 0)
+ continue;
+
+ /*
+ * copy the current shrinker scan count into a local variable
+ * and zero it so that other concurrent shrinker invocations
+ * don't also do this scanning work.
+ */
+ do {
+ nr = shrinker->nr;
+ } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+
+ total_scan = nr;
delta = (4 * nr_pages_scanned) / shrinker->seeks;
delta *= max_pass;
do_div(delta, lru_pages + 1);
- shrinker->nr += delta;
- if (shrinker->nr < 0) {
+ total_scan += delta;
+ if (total_scan < 0) {
printk(KERN_ERR "shrink_slab: %pF negative objects to "
"delete nr=%ld\n",
- shrinker->shrink, shrinker->nr);
- shrinker->nr = max_pass;
+ shrinker->shrink, total_scan);
+ total_scan = max_pass;
}
/*
+ * We need to avoid excessive windup on filesystem shrinkers
+ * due to large numbers of GFP_NOFS allocations causing the
+ * shrinkers to return -1 all the time. This results in a large
+ * nr being built up so when a shrink that can do some work
+ * comes along it empties the entire cache due to nr >>>
+ * max_pass. This is bad for sustaining a working set in
+ * memory.
+ *
+ * Hence only allow the shrinker to scan the entire cache when
+ * a large delta change is calculated directly.
+ */
+ if (delta < max_pass / 4)
+ total_scan = min(total_scan, max_pass / 2);
+
+ /*
* Avoid risking looping forever due to too large nr value:
* never try to free more than twice the estimated number of
* freeable entries.
*/
- if (shrinker->nr > max_pass * 2)
- shrinker->nr = max_pass * 2;
+ if (total_scan > max_pass * 2)
+ total_scan = max_pass * 2;
- total_scan = shrinker->nr;
- shrinker->nr = 0;
+ trace_mm_shrink_slab_start(shrinker, shrink, nr,
+ nr_pages_scanned, lru_pages,
+ max_pass, delta, total_scan);
while (total_scan >= SHRINK_BATCH) {
long this_scan = SHRINK_BATCH;
- int shrink_ret;
int nr_before;
nr_before = do_shrinker_shrink(shrinker, shrink, 0);
@@ -292,7 +323,19 @@ unsigned long shrink_slab(struct shrink_control *shrink,
cond_resched();
}
- shrinker->nr += total_scan;
+ /*
+ * move the unused scan count back into the shrinker in a
+ * manner that handles concurrent updates. If we exhausted the
+ * scan, there is no need to do an update.
+ */
+ do {
+ nr = shrinker->nr;
+ new_nr = total_scan + nr;
+ if (total_scan <= 0)
+ break;
+ } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+
+ trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
}
up_read(&shrinker_rwsem);
out:
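
The new accounting takes the shrinker's pending scan count atomically (so two concurrent callers cannot both scan the same backlog), caps it to avoid the windup described above, and finally returns whatever went unscanned with a second compare-and-swap loop. A rough userspace analogue of that grab-and-return-remainder shape, using the GCC __sync builtin where the kernel uses cmpxchg(); the numbers in main() are made up:

#include <stdio.h>

static long shrinker_nr;                /* deferred work, shared with other callers */

/* Atomically take the whole backlog for ourselves. */
static long grab_backlog(void)
{
        long nr;

        do {
                nr = shrinker_nr;
        } while (__sync_val_compare_and_swap(&shrinker_nr, nr, 0) != nr);

        return nr;
}

/* Hand back whatever we did not get around to scanning. */
static void return_backlog(long leftover)
{
        long nr, new_nr;

        if (leftover <= 0)
                return;

        do {
                nr = shrinker_nr;
                new_nr = nr + leftover;
        } while (__sync_val_compare_and_swap(&shrinker_nr, nr, new_nr) != nr);
}

int main(void)
{
        long total, done;

        shrinker_nr = 1000;             /* pretend 1000 objects were deferred */
        total = grab_backlog();
        done  = 300;                    /* pretend only 300 of them were scanned */
        return_backlog(total - done);
        printf("deferred: %ld\n", shrinker_nr);         /* 700 */
        return 0;
}
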
@@ -665,7 +708,7 @@ static enum page_references page_check_references(struct page *page,
return PAGEREF_RECLAIM;
if (referenced_ptes) {
- if (PageAnon(page))
+ if (PageSwapBacked(page))
return PAGEREF_ACTIVATE;
/*
* All mapped pages start out with page table
@@ -985,23 +1028,27 @@ keep_lumpy:
*
* returns 0 on success, -ve errno on failure.
*/
-int __isolate_lru_page(struct page *page, int mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
{
+ bool all_lru_mode;
int ret = -EINVAL;
/* Only take pages on the LRU. */
if (!PageLRU(page))
return ret;
+ all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
+ (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
+
/*
* When checking the active state, we need to be sure we are
* dealing with comparable boolean values. Take the logical not
* of each.
*/
- if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+ if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
return ret;
- if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
+ if (!all_lru_mode && !!page_is_file_cache(page) != file)
return ret;
/*
@@ -1018,6 +1065,43 @@ int __isolate_lru_page(struct page *page, int mode, int file)
#endif
ret = -EBUSY;
+ /*
+ * To minimise LRU disruption, the caller can indicate that it only
+ * wants to isolate pages it will be able to operate on without
+ * blocking - clean pages for the most part.
+ *
+ * ISOLATE_CLEAN means that only clean pages should be isolated. This
+ * is used by reclaim when it cannot write to backing storage
+ *
+ * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
+ * that it is possible to migrate without blocking
+ */
+ if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
+ /* All the caller can do on PageWriteback is block */
+ if (PageWriteback(page))
+ return ret;
+
+ if (PageDirty(page)) {
+ struct address_space *mapping;
+
+ /* ISOLATE_CLEAN means only clean pages */
+ if (mode & ISOLATE_CLEAN)
+ return ret;
+
+ /*
+ * Only pages without mappings or that have a
+ * ->migratepage callback can be migrated
+ * without blocking
+ */
+ mapping = page_mapping(page);
+ if (mapping && !mapping->a_ops->migratepage)
+ return ret;
+ }
+ }
+
+ if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+ return ret;
+
if (likely(get_page_unless_zero(page))) {
/*
* Be careful not to clear PageLRU until after we're
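
Because isolate_mode_t is a bitmask, a caller can combine restrictions (for example inactive-only, unmapped excluded, clean pages only), and the all_lru_mode test above asks whether both list bits are set at once. A small self-contained sketch of that flag-testing style; the enum values are illustrative, not the kernel's definitions:

#include <stdbool.h>
#include <stdio.h>

enum my_isolate_mode {                  /* illustrative bit values */
        MY_ISOLATE_INACTIVE     = 1 << 0,
        MY_ISOLATE_ACTIVE       = 1 << 1,
        MY_ISOLATE_UNMAPPED     = 1 << 2,
        MY_ISOLATE_CLEAN        = 1 << 3,
};

/* True only when BOTH list bits are present, mirroring all_lru_mode above. */
static bool wants_both_lists(unsigned int mode)
{
        return (mode & (MY_ISOLATE_ACTIVE | MY_ISOLATE_INACTIVE)) ==
               (MY_ISOLATE_ACTIVE | MY_ISOLATE_INACTIVE);
}

int main(void)
{
        unsigned int mode = MY_ISOLATE_INACTIVE | MY_ISOLATE_CLEAN;

        printf("both lists? %d\n", wants_both_lists(mode));                     /* 0 */
        printf("both lists? %d\n", wants_both_lists(mode | MY_ISOLATE_ACTIVE)); /* 1 */
        return 0;
}
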
@@ -1053,7 +1137,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct list_head *src, struct list_head *dst,
- unsigned long *scanned, int order, int mode, int file)
+ unsigned long *scanned, int order, isolate_mode_t mode,
+ int file)
{
unsigned long nr_taken = 0;
unsigned long nr_lumpy_taken = 0;
@@ -1128,7 +1213,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* anon page which don't already have a swap slot is
* pointless.
*/
- if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
+ if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
!PageSwapCache(cursor_page))
break;
@@ -1178,8 +1263,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
static unsigned long isolate_pages_global(unsigned long nr,
struct list_head *dst,
unsigned long *scanned, int order,
- int mode, struct zone *z,
- int active, int file)
+ isolate_mode_t mode,
+ struct zone *z, int active, int file)
{
int lru = LRU_BASE;
if (active)
@@ -1462,6 +1547,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
unsigned long nr_taken;
unsigned long nr_anon;
unsigned long nr_file;
+ isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
while (unlikely(too_many_isolated(zone, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1472,15 +1558,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
}
set_reclaim_mode(priority, sc, false);
+ if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
+ reclaim_mode |= ISOLATE_ACTIVE;
+
lru_add_drain();
+
+ if (!sc->may_unmap)
+ reclaim_mode |= ISOLATE_UNMAPPED;
+ if (!sc->may_writepage)
+ reclaim_mode |= ISOLATE_CLEAN;
+
spin_lock_irq(&zone->lru_lock);
if (scanning_global_lru(sc)) {
- nr_taken = isolate_pages_global(nr_to_scan,
- &page_list, &nr_scanned, sc->order,
- sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
- ISOLATE_BOTH : ISOLATE_INACTIVE,
- zone, 0, file);
+ nr_taken = isolate_pages_global(nr_to_scan, &page_list,
+ &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
__count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1489,12 +1581,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
__count_zone_vm_events(PGSCAN_DIRECT, zone,
nr_scanned);
} else {
- nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
- &page_list, &nr_scanned, sc->order,
- sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
- ISOLATE_BOTH : ISOLATE_INACTIVE,
- zone, sc->mem_cgroup,
- 0, file);
+ nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
+ &nr_scanned, sc->order, reclaim_mode, zone,
+ sc->mem_cgroup, 0, file);
/*
* mem_cgroup_isolate_pages() keeps track of
* scanned pages on its own.
@@ -1634,19 +1723,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
struct page *page;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
unsigned long nr_rotated = 0;
+ isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
lru_add_drain();
+
+ if (!sc->may_unmap)
+ reclaim_mode |= ISOLATE_UNMAPPED;
+ if (!sc->may_writepage)
+ reclaim_mode |= ISOLATE_CLEAN;
+
spin_lock_irq(&zone->lru_lock);
if (scanning_global_lru(sc)) {
nr_taken = isolate_pages_global(nr_pages, &l_hold,
&pgscanned, sc->order,
- ISOLATE_ACTIVE, zone,
+ reclaim_mode, zone,
1, file);
zone->pages_scanned += pgscanned;
} else {
nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
&pgscanned, sc->order,
- ISOLATE_ACTIVE, zone,
+ reclaim_mode, zone,
sc->mem_cgroup, 1, file);
/*
* mem_cgroup_isolate_pages() keeps track of
@@ -1839,23 +1935,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
u64 fraction[2], denominator;
enum lru_list l;
int noswap = 0;
- int force_scan = 0;
+ bool force_scan = false;
unsigned long nr_force_scan[2];
-
- anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
- file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
-
- if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
- /* kswapd does zone balancing and need to scan this zone */
- if (scanning_global_lru(sc) && current_is_kswapd())
- force_scan = 1;
- /* memcg may have small limit and need to avoid priority drop */
- if (!scanning_global_lru(sc))
- force_scan = 1;
- }
+ /* kswapd does zone balancing and needs to scan this zone */
+ if (scanning_global_lru(sc) && current_is_kswapd() &&
+ zone->all_unreclaimable)
+ force_scan = true;
+ /* memcg may have small limit and need to avoid priority drop */
+ if (!scanning_global_lru(sc))
+ force_scan = true;
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1868,6 +1957,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
goto out;
}
+ anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
+ file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+
if (scanning_global_lru(sc)) {
free = zone_page_state(zone, NR_FREE_PAGES);
/* If we have very few page cache pages,
@@ -2004,8 +2098,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
- inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+ inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+ if (nr_swap_pages > 0)
+ inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
if (sc->nr_reclaimed < pages_for_compaction &&
inactive_lru_pages > pages_for_compaction)
return true;
@@ -2080,6 +2175,42 @@ restart:
throttle_vm_writeout(sc->gfp_mask);
}
+/* Returns true if compaction should go ahead for a high-order request */
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
+{
+ unsigned long balance_gap, watermark;
+ bool watermark_ok;
+
+ /* Do not consider compaction for orders reclaim is meant to satisfy */
+ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
+ return false;
+
+ /*
+ * Compaction takes time to run and there are potentially other
+ * callers using the pages just freed. Continue reclaiming until
+ * there is a buffer of free pages available to give compaction
+ * a reasonable chance of completing and allocating the page
+ */
+ balance_gap = min(low_wmark_pages(zone),
+ (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+ KSWAPD_ZONE_BALANCE_GAP_RATIO);
+ watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
+ watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+
+ /*
+ * If compaction is deferred, reclaim up to a point where
+ * compaction will have a chance of success when re-enabled
+ */
+ if (compaction_deferred(zone))
+ return watermark_ok;
+
+ /* If compaction is not ready to start, keep reclaiming */
+ if (!compaction_suitable(zone, sc->order))
+ return false;
+
+ return watermark_ok;
+}
+
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
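
The buffer compaction_ready() waits for is the high watermark plus a balance gap plus twice the request size. With assumed round numbers (a 1 GiB zone of 4 KiB pages, a balance-gap ratio of 100 and an order-9 request; none of these are read from a real zone) the target works out as in the sketch below:

#include <stdio.h>

#define MIN(a, b)       ((a) < (b) ? (a) : (b))

int main(void)
{
        unsigned long present_pages = 262144;   /* assumed: 1 GiB zone, 4 KiB pages */
        unsigned long low_wmark     = 1365;     /* assumed low watermark, in pages  */
        unsigned long high_wmark    = 1638;     /* assumed high watermark, in pages */
        unsigned long gap_ratio     = 100;      /* KSWAPD_ZONE_BALANCE_GAP_RATIO    */
        unsigned long order         = 9;        /* a 2 MiB huge-page sized request  */

        unsigned long balance_gap = MIN(low_wmark,
                                        (present_pages + gap_ratio - 1) / gap_ratio);
        unsigned long watermark   = high_wmark + balance_gap + (2UL << order);

        /* balance_gap = min(1365, 2622) = 1365; watermark = 1638 + 1365 + 1024 = 4027 */
        printf("reclaim until ~%lu free pages (~%lu MiB) before compacting\n",
               watermark, watermark * 4 / 1024);
        return 0;
}
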
@@ -2095,14 +2226,20 @@ restart:
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
+ *
+ * This function returns true if a zone is being reclaimed for a costly
+ * high-order allocation and compaction is ready to begin. This indicates to
+ * the caller that it should consider retrying the allocation instead of
+ * further reclaim.
*/
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static bool shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
+ bool aborted_reclaim = false;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2117,6 +2254,21 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
+ if (COMPACTION_BUILD) {
+ /*
+ * If we already have plenty of memory free for
+ * compaction in this zone, don't free any more.
+ * Even though compaction is invoked for any
+ * non-zero order, only frequent costly order
+ * reclamation is disruptive enough to become a
+ * noticeable problem, like transparent huge page
+ * allocations.
+ */
+ if (compaction_ready(zone, sc)) {
+ aborted_reclaim = true;
+ continue;
+ }
+ }
/*
* This steals pages from memory cgroups over softlimit
* and returns the number of reclaimed pages and
@@ -2134,6 +2286,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
shrink_zone(priority, zone, sc);
}
+
+ return aborted_reclaim;
}
static bool zone_reclaimable(struct zone *zone)
@@ -2187,8 +2341,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct zoneref *z;
struct zone *zone;
unsigned long writeback_threshold;
+ bool aborted_reclaim;
- get_mems_allowed();
delayacct_freepages_start();
if (scanning_global_lru(sc))
@@ -2198,7 +2352,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
sc->nr_scanned = 0;
if (!priority)
disable_swap_token(sc->mem_cgroup);
- shrink_zones(priority, zonelist, sc);
+ aborted_reclaim = shrink_zones(priority, zonelist, sc);
+
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@@ -2250,7 +2405,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
out:
delayacct_freepages_end();
- put_mems_allowed();
if (sc->nr_reclaimed)
return sc->nr_reclaimed;
@@ -2263,6 +2417,10 @@ out:
if (oom_killer_disabled)
return 0;
+ /* Aborted reclaim to try compaction? don't OOM, then */
+ if (aborted_reclaim)
+ return 1;
+
/* top priority shrink_zones still had more to do? don't OOM, then */
if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
return 1;
@@ -2554,6 +2712,9 @@ loop_again:
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
break;
+ } else {
+ /* If balanced, clear the congested flag */
+ zone_clear_flag(zone, ZONE_CONGESTED);
}
}
if (i < 0)
@@ -2790,7 +2951,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
* them before going back to sleep.
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
- schedule();
+
+ if (!kthread_should_stop())
+ schedule();
+
set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
} else {
if (remaining)
@@ -2817,7 +2981,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
static int kswapd(void *p)
{
unsigned long order, new_order;
+ unsigned balanced_order;
int classzone_idx, new_classzone_idx;
+ int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
@@ -2848,7 +3014,9 @@ static int kswapd(void *p)
set_freezable();
order = new_order = 0;
+ balanced_order = 0;
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+ balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
int ret;
@@ -2857,7 +3025,8 @@ static int kswapd(void *p)
* new request of a similar or harder type will succeed soon
* so consider going to sleep on the basis we reclaimed at
*/
- if (classzone_idx >= new_classzone_idx && order == new_order) {
+ if (balanced_classzone_idx >= new_classzone_idx &&
+ balanced_order == new_order) {
new_order = pgdat->kswapd_max_order;
new_classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
@@ -2872,9 +3041,12 @@ static int kswapd(void *p)
order = new_order;
classzone_idx = new_classzone_idx;
} else {
- kswapd_try_to_sleep(pgdat, order, classzone_idx);
+ kswapd_try_to_sleep(pgdat, balanced_order,
+ balanced_classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
+ new_order = order;
+ new_classzone_idx = classzone_idx;
pgdat->kswapd_max_order = 0;
pgdat->classzone_idx = pgdat->nr_zones - 1;
}
@@ -2889,9 +3061,13 @@ static int kswapd(void *p)
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- order = balance_pgdat(pgdat, order, &classzone_idx);
+ balanced_classzone_idx = classzone_idx;
+ balanced_order = balance_pgdat(pgdat, order,
+ &balanced_classzone_idx);
}
}
+
+ current->reclaim_state = NULL;
return 0;
}
@@ -3051,14 +3227,17 @@ int kswapd_run(int nid)
}
/*
- * Called by memory hotplug when all memory in a node is offlined.
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold lock_memory_hotplug().
*/
void kswapd_stop(int nid)
{
struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
- if (kswapd)
+ if (kswapd) {
kthread_stop(kswapd);
+ NODE_DATA(nid)->kswapd = NULL;
+ }
}
static int __init kswapd_init(void)
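
Clearing NODE_DATA(nid)->kswapd after kthread_stop() makes a second offline of the same node a harmless no-op rather than a stop on a task that is already gone. The same stop-and-forget idiom in plain C might look like the sketch below, with pthreads standing in for kthreads and all names invented for illustration:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

struct node_data {
        pthread_t     worker;
        bool          have_worker;      /* plays the role of the kswapd pointer  */
        volatile bool should_stop;      /* what kthread_should_stop() would poll */
};

static void *worker_fn(void *arg)
{
        struct node_data *node = arg;

        while (!node->should_stop)
                usleep(1000);           /* pretend to reclaim */
        return NULL;
}

/*
 * Stop the per-node worker and forget it, so a repeated call (a second
 * offline of the same node) finds nothing left to stop.
 */
static void node_worker_stop(struct node_data *node)
{
        if (!node->have_worker)
                return;
        node->should_stop = true;
        pthread_join(node->worker, NULL);
        node->have_worker = false;      /* like NODE_DATA(nid)->kswapd = NULL */
}

int main(void)
{
        struct node_data node = { .should_stop = false };

        pthread_create(&node.worker, NULL, worker_fn, &node);
        node.have_worker = true;

        node_worker_stop(&node);        /* first offline: stops the thread     */
        node_worker_stop(&node);        /* second offline: nothing left to do  */
        puts("stopped");
        return 0;
}
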