aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig6
-rw-r--r--mm/bootmem.c37
-rw-r--r--mm/bounce.c2
-rw-r--r--mm/filemap.c26
-rw-r--r--mm/filemap_xip.c65
-rw-r--r--mm/highmem.c5
-rw-r--r--mm/hugetlb.c62
-rw-r--r--mm/memcontrol.c20
-rw-r--r--mm/mempolicy.c1
-rw-r--r--mm/mm_init.c2
-rw-r--r--mm/mmap.c24
-rw-r--r--mm/mmzone.c2
-rw-r--r--mm/oom_kill.c6
-rw-r--r--mm/page_alloc.c24
-rw-r--r--mm/page_isolation.c13
-rw-r--r--mm/quicklist.c9
-rw-r--r--mm/rmap.c39
-rw-r--r--mm/shmem.c4
-rw-r--r--mm/slab.c1
-rw-r--r--mm/slob.c9
-rw-r--r--mm/slub.c32
-rw-r--r--mm/sparse.c1
-rw-r--r--mm/swap_state.c2
-rw-r--r--mm/tiny-shmem.c26
-rw-r--r--mm/truncate.c4
-rw-r--r--mm/util.c15
-rw-r--r--mm/vmalloc.c7
-rw-r--r--mm/vmstat.c19
28 files changed, 340 insertions, 123 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 446c658..91ee392 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -77,9 +77,6 @@ config FLAT_NODE_MEM_MAP
def_bool y
depends on !SPARSEMEM
-config HAVE_GET_USER_PAGES_FAST
- bool
-
#
# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
# to represent different areas of memory. This variable allows
@@ -190,6 +187,9 @@ config RESOURCES_64BIT
help
This option allows memory and IO resources to be 64 bit.
+config PHYS_ADDR_T_64BIT
+ def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
+
config ZONE_DMA_FLAG
int
default "0" if !ZONE_DMA
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 4af15d0..ad8eec6 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -405,6 +405,29 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
}
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
+static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
+ unsigned long step)
+{
+ unsigned long base = bdata->node_min_pfn;
+
+ /*
+ * Align the index with respect to the node start so that the
+ * combination of both satisfies the requested alignment.
+ */
+
+ return ALIGN(base + idx, step) - base;
+}
+
+static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
+ unsigned long align)
+{
+ unsigned long base = PFN_PHYS(bdata->node_min_pfn);
+
+ /* Same as align_idx for byte offsets */
+
+ return ALIGN(base + off, align) - base;
+}
+
static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
@@ -441,7 +464,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
else
start = ALIGN(min, step);
- sidx = start - bdata->node_min_pfn;;
+ sidx = start - bdata->node_min_pfn;
midx = max - bdata->node_min_pfn;
if (bdata->hint_idx > sidx) {
@@ -450,7 +473,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
* catch the fallback below.
*/
fallback = sidx + 1;
- sidx = ALIGN(bdata->hint_idx, step);
+ sidx = align_idx(bdata, bdata->hint_idx, step);
}
while (1) {
@@ -459,7 +482,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
unsigned long eidx, i, start_off, end_off;
find_block:
sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
- sidx = ALIGN(sidx, step);
+ sidx = align_idx(bdata, sidx, step);
eidx = sidx + PFN_UP(size);
if (sidx >= midx || eidx > midx)
@@ -467,15 +490,15 @@ find_block:
for (i = sidx; i < eidx; i++)
if (test_bit(i, bdata->node_bootmem_map)) {
- sidx = ALIGN(i, step);
+ sidx = align_idx(bdata, i, step);
if (sidx == i)
sidx += step;
goto find_block;
}
- if (bdata->last_end_off &&
+ if (bdata->last_end_off & (PAGE_SIZE - 1) &&
PFN_DOWN(bdata->last_end_off) + 1 == sidx)
- start_off = ALIGN(bdata->last_end_off, align);
+ start_off = align_off(bdata, bdata->last_end_off, align);
else
start_off = PFN_PHYS(sidx);
@@ -499,7 +522,7 @@ find_block:
}
if (fallback) {
- sidx = ALIGN(fallback - 1, step);
+ sidx = align_idx(bdata, fallback - 1, step);
fallback = 0;
goto find_block;
}
diff --git a/mm/bounce.c b/mm/bounce.c
index b6d2d0f..06722c4 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
/*
* Data-less bio, nothing to bounce
*/
- if (bio_empty_barrier(*bio_orig))
+ if (!bio_has_data(*bio_orig))
return;
/*
diff --git a/mm/filemap.c b/mm/filemap.c
index 54e9686..494ff20 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1100,8 +1100,9 @@ page_ok:
page_not_up_to_date:
/* Get exclusive access to the page ... */
- if (lock_page_killable(page))
- goto readpage_eio;
+ error = lock_page_killable(page);
+ if (unlikely(error))
+ goto readpage_error;
page_not_up_to_date_locked:
/* Did it get truncated before we got the lock? */
@@ -1130,8 +1131,9 @@ readpage:
}
if (!PageUptodate(page)) {
- if (lock_page_killable(page))
- goto readpage_eio;
+ error = lock_page_killable(page);
+ if (unlikely(error))
+ goto readpage_error;
if (!PageUptodate(page)) {
if (page->mapping == NULL) {
/*
@@ -1143,15 +1145,14 @@ readpage:
}
unlock_page(page);
shrink_readahead_size_eio(filp, ra);
- goto readpage_eio;
+ error = -EIO;
+ goto readpage_error;
}
unlock_page(page);
}
goto page_ok;
-readpage_eio:
- error = -EIO;
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
desc->error = error;
@@ -2129,13 +2130,20 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
* After a write we want buffered reads to be sure to go to disk to get
* the new data. We invalidate clean cached page from the region we're
* about to write. We do this *before* the write so that we can return
- * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
+ * without clobbering -EIOCBQUEUED from ->direct_IO().
*/
if (mapping->nrpages) {
written = invalidate_inode_pages2_range(mapping,
pos >> PAGE_CACHE_SHIFT, end);
- if (written)
+ /*
+ * If a page can not be invalidated, return 0 to fall back
+ * to buffered write.
+ */
+ if (written) {
+ if (written == -EBUSY)
+ return 0;
goto out;
+ }
}
written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 380ab40..b5167df 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -15,6 +15,8 @@
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
@@ -22,22 +24,18 @@
* We do use our own empty page to avoid interference with other users
* of ZERO_PAGE(), such as /dev/zero
*/
+static DEFINE_MUTEX(xip_sparse_mutex);
+static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
static struct page *__xip_sparse_page;
+/* called under xip_sparse_mutex */
static struct page *xip_sparse_page(void)
{
if (!__xip_sparse_page) {
struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
- if (page) {
- static DEFINE_SPINLOCK(xip_alloc_lock);
- spin_lock(&xip_alloc_lock);
- if (!__xip_sparse_page)
- __xip_sparse_page = page;
- else
- __free_page(page);
- spin_unlock(&xip_alloc_lock);
- }
+ if (page)
+ __xip_sparse_page = page;
}
return __xip_sparse_page;
}
@@ -174,18 +172,23 @@ __xip_unmap (struct address_space * mapping,
pte_t pteval;
spinlock_t *ptl;
struct page *page;
+ unsigned count;
+ int locked = 0;
+
+ count = read_seqcount_begin(&xip_sparse_seq);
page = __xip_sparse_page;
if (!page)
return;
+retry:
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
mm = vma->vm_mm;
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- pte = page_check_address(page, mm, address, &ptl);
+ pte = page_check_address(page, mm, address, &ptl, 1);
if (pte) {
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
@@ -198,6 +201,14 @@ __xip_unmap (struct address_space * mapping,
}
}
spin_unlock(&mapping->i_mmap_lock);
+
+ if (locked) {
+ mutex_unlock(&xip_sparse_mutex);
+ } else if (read_seqcount_retry(&xip_sparse_seq, count)) {
+ mutex_lock(&xip_sparse_mutex);
+ locked = 1;
+ goto retry;
+ }
}
/*
@@ -218,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
int error;
/* XXX: are VM_FAULT_ codes OK? */
-
+again:
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (vmf->pgoff >= size)
return VM_FAULT_SIGBUS;
@@ -237,8 +248,10 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
int err;
/* maybe shared writable, allocate new block */
+ mutex_lock(&xip_sparse_mutex);
error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
&xip_mem, &xip_pfn);
+ mutex_unlock(&xip_sparse_mutex);
if (error)
return VM_FAULT_SIGBUS;
/* unmap sparse mappings at pgoff from all other vmas */
@@ -252,14 +265,34 @@ found:
BUG_ON(err);
return VM_FAULT_NOPAGE;
} else {
+ int err, ret = VM_FAULT_OOM;
+
+ mutex_lock(&xip_sparse_mutex);
+ write_seqcount_begin(&xip_sparse_seq);
+ error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+ &xip_mem, &xip_pfn);
+ if (unlikely(!error)) {
+ write_seqcount_end(&xip_sparse_seq);
+ mutex_unlock(&xip_sparse_mutex);
+ goto again;
+ }
+ if (error != -ENODATA)
+ goto out;
/* not shared and writable, use xip_sparse_page() */
page = xip_sparse_page();
if (!page)
- return VM_FAULT_OOM;
+ goto out;
+ err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
+ page);
+ if (err == -ENOMEM)
+ goto out;
- page_cache_get(page);
- vmf->page = page;
- return 0;
+ ret = VM_FAULT_NOPAGE;
+out:
+ write_seqcount_end(&xip_sparse_seq);
+ mutex_unlock(&xip_sparse_mutex);
+
+ return ret;
}
}
@@ -308,8 +341,10 @@ __xip_file_write(struct file *filp, const char __user *buf,
&xip_mem, &xip_pfn);
if (status == -ENODATA) {
/* we allocate a new page unmap it */
+ mutex_lock(&xip_sparse_mutex);
status = a_ops->get_xip_mem(mapping, index, 1,
&xip_mem, &xip_pfn);
+ mutex_unlock(&xip_sparse_mutex);
if (!status)
/* unmap page at pgoff from all other vmas */
__xip_unmap(mapping, index);
diff --git a/mm/highmem.c b/mm/highmem.c
index e16e152..b36b83b 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -70,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
static void flush_all_zero_pkmaps(void)
{
int i;
+ int need_flush = 0;
flush_cache_kmaps();
@@ -101,8 +102,10 @@ static void flush_all_zero_pkmaps(void)
&pkmap_page_table[i]);
set_page_address(page, NULL);
+ need_flush = 1;
}
- flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+ if (need_flush)
+ flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
}
/**
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 757ca98..67a7119 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -565,7 +565,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
huge_page_order(h));
if (page) {
if (arch_prepare_hugepage(page)) {
- __free_pages(page, HUGETLB_PAGE_ORDER);
+ __free_pages(page, huge_page_order(h));
return NULL;
}
prep_new_huge_page(h, page, nid);
@@ -665,6 +665,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
+ if (page && arch_prepare_hugepage(page)) {
+ __free_pages(page, huge_page_order(h));
+ return NULL;
+ }
+
spin_lock(&hugetlb_lock);
if (page) {
/*
@@ -1937,6 +1942,18 @@ retry:
lock_page(page);
}
+ /*
+ * If we are going to COW a private mapping later, we examine the
+ * pending reservations for this page now. This will ensure that
+ * any allocations necessary to record that reservation occur outside
+ * the spinlock.
+ */
+ if (write_access && !(vma->vm_flags & VM_SHARED))
+ if (vma_needs_reservation(h, vma, address) < 0) {
+ ret = VM_FAULT_OOM;
+ goto backout_unlocked;
+ }
+
spin_lock(&mm->page_table_lock);
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
@@ -1962,6 +1979,7 @@ out:
backout:
spin_unlock(&mm->page_table_lock);
+backout_unlocked:
unlock_page(page);
put_page(page);
goto out;
@@ -1973,6 +1991,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *ptep;
pte_t entry;
int ret;
+ struct page *pagecache_page = NULL;
static DEFINE_MUTEX(hugetlb_instantiation_mutex);
struct hstate *h = hstate_vma(vma);
@@ -1989,25 +2008,44 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
entry = huge_ptep_get(ptep);
if (huge_pte_none(entry)) {
ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
- mutex_unlock(&hugetlb_instantiation_mutex);
- return ret;
+ goto out_unlock;
}
ret = 0;
+ /*
+ * If we are going to COW the mapping later, we examine the pending
+ * reservations for this page now. This will ensure that any
+ * allocations necessary to record that reservation occur outside the
+ * spinlock. For private mappings, we also lookup the pagecache
+ * page now as it is used to determine if a reservation has been
+ * consumed.
+ */
+ if (write_access && !pte_write(entry)) {
+ if (vma_needs_reservation(h, vma, address) < 0) {
+ ret = VM_FAULT_OOM;
+ goto out_unlock;
+ }
+
+ if (!(vma->vm_flags & VM_SHARED))
+ pagecache_page = hugetlbfs_pagecache_page(h,
+ vma, address);
+ }
+
spin_lock(&mm->page_table_lock);
/* Check for a racing update before calling hugetlb_cow */
if (likely(pte_same(entry, huge_ptep_get(ptep))))
- if (write_access && !pte_write(entry)) {
- struct page *page;
- page = hugetlbfs_pagecache_page(h, vma, address);
- ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
- if (page) {
- unlock_page(page);
- put_page(page);
- }
- }
+ if (write_access && !pte_write(entry))
+ ret = hugetlb_cow(mm, vma, address, ptep, entry,
+ pagecache_page);
spin_unlock(&mm->page_table_lock);
+
+ if (pagecache_page) {
+ unlock_page(pagecache_page);
+ put_page(pagecache_page);
+ }
+
+out_unlock:
mutex_unlock(&hugetlb_instantiation_mutex);
return ret;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7056c3b..36896f3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -250,6 +250,14 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
+ /*
+ * mm_update_next_owner() may clear mm->owner to NULL
+ * if it races with swapoff, page migration, etc.
+ * So this can be called with p == NULL.
+ */
+ if (unlikely(!p))
+ return NULL;
+
return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
struct mem_cgroup, css);
}
@@ -549,6 +557,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
if (likely(!memcg)) {
rcu_read_lock();
mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!mem)) {
+ rcu_read_unlock();
+ kmem_cache_free(page_cgroup_cache, pc);
+ return 0;
+ }
/*
* For every charge from the cgroup, increment reference count
*/
@@ -796,14 +809,21 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
if (mem_cgroup_subsys.disabled)
return 0;
+ if (!mm)
+ return 0;
rcu_read_lock();
mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!mem)) {
+ rcu_read_unlock();
+ return 0;
+ }
css_get(&mem->css);
rcu_read_unlock();
do {
progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
+ progress += res_counter_check_under_limit(&mem->res);
} while (!progress && --retry);
css_put(&mem->css);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e550bec..8336905 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -803,7 +803,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
int do_migrate_pages(struct mm_struct *mm,
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
- LIST_HEAD(pagelist);
int busy = 0;
int err = 0;
nodemask_t tmp;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 936ef2e..4e0e265 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -12,7 +12,7 @@
#include "internal.h"
#ifdef CONFIG_DEBUG_MEMORY_INIT
-int __meminitdata mminit_loglevel;
+int mminit_loglevel;
#ifndef SECTIONS_SHIFT
#define SECTIONS_SHIFT 0
diff --git a/mm/mmap.c b/mm/mmap.c
index 971d0ed..e7a5a68 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1030,6 +1030,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
+ /*
+ * Ignore pgoff.
+ */
+ pgoff = 0;
vm_flags |= VM_SHARED | VM_MAYSHARE;
break;
case MAP_PRIVATE:
@@ -2273,14 +2277,14 @@ int install_special_mapping(struct mm_struct *mm,
static DEFINE_MUTEX(mm_all_locks_mutex);
-static void vm_lock_anon_vma(struct anon_vma *anon_vma)
+static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
/*
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- spin_lock(&anon_vma->lock);
+ spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
/*
* We can safely modify head.next after taking the
* anon_vma->lock. If some other vma in this mm shares
@@ -2296,7 +2300,7 @@ static void vm_lock_anon_vma(struct anon_vma *anon_vma)
}
}
-static void vm_lock_mapping(struct address_space *mapping)
+static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
{
if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
/*
@@ -2310,7 +2314,7 @@ static void vm_lock_mapping(struct address_space *mapping)
*/
if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
BUG();
- spin_lock(&mapping->i_mmap_lock);
+ spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
}
}
@@ -2358,11 +2362,17 @@ int mm_take_all_locks(struct mm_struct *mm)
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (signal_pending(current))
goto out_unlock;
- if (vma->anon_vma)
- vm_lock_anon_vma(vma->anon_vma);
if (vma->vm_file && vma->vm_file->f_mapping)
- vm_lock_mapping(vma->vm_file->f_mapping);
+ vm_lock_mapping(mm, vma->vm_file->f_mapping);
+ }
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->anon_vma)
+ vm_lock_anon_vma(mm, vma->anon_vma);
}
+
ret = 0;
out_unlock:
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 486ed59..16ce8b9 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
(z->zone && !zref_in_nodemask(z, nodes)))
z++;
- *zone = zonelist_zone(z++);
+ *zone = zonelist_zone(z);
return z;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8a5467e..64e5b4b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,6 +26,7 @@
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
+#include <linux/security.h>
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
@@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
* Superuser processes are usually more important, so we make it
* less likely that we kill those.
*/
- if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE))
+ if (has_capability(p, CAP_SYS_ADMIN) ||
+ has_capability(p, CAP_SYS_RESOURCE))
points /= 4;
/*
@@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
* tend to only have this flag set on applications they think
* of as important.
*/
- if (__capable(p, CAP_SYS_RAWIO))
+ if (has_capability(p, CAP_SYS_RAWIO))
points /= 4;
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 401d104..27b8681 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -268,13 +268,14 @@ void prep_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
+ struct page *p = page + 1;
set_compound_page_dtor(page, free_compound_page);
set_compound_order(page, order);
__SetPageHead(page);
- for (i = 1; i < nr_pages; i++) {
- struct page *p = page + i;
-
+ for (i = 1; i < nr_pages; i++, p++) {
+ if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
+ p = pfn_to_page(page_to_pfn(page) + i);
__SetPageTail(p);
p->first_page = page;
}
@@ -284,6 +285,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
+ struct page *p = page + 1;
if (unlikely(compound_order(page) != order))
bad_page(page);
@@ -291,8 +293,9 @@ static void destroy_compound_page(struct page *page, unsigned long order)
if (unlikely(!PageHead(page)))
bad_page(page);
__ClearPageHead(page);
- for (i = 1; i < nr_pages; i++) {
- struct page *p = page + i;
+ for (i = 1; i < nr_pages; i++, p++) {
+ if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
+ p = pfn_to_page(page_to_pfn(page) + i);
if (unlikely(!PageTail(p) |
(p->first_page != page)))
@@ -694,6 +697,9 @@ static int move_freepages(struct zone *zone,
#endif
for (page = start_page; page <= end_page;) {
+ /* Make sure we are not inadvertently changing nodes */
+ VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
+
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
@@ -2516,6 +2522,10 @@ static void setup_zone_migrate_reserve(struct zone *zone)
continue;
page = pfn_to_page(pfn);
+ /* Watch out for overlapping nodes */
+ if (page_to_nid(page) != zone_to_nid(zone))
+ continue;
+
/* Blocks with reserved pages will never free, skip them. */
if (PageReserved(page))
continue;
@@ -4064,7 +4074,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] };
+struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
EXPORT_SYMBOL(contig_page_data);
#endif
@@ -4437,7 +4447,7 @@ void *__init alloc_large_system_hash(const char *tablename,
do {
size = bucketsize << log2qty;
if (flags & HASH_EARLY)
- table = alloc_bootmem(size);
+ table = alloc_bootmem_nopanic(size);
else if (hashdist)
table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
else {
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 3444b58..b70a7fe 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -2,7 +2,6 @@
* linux/mm/page_isolation.c
*/
-#include <stddef.h>
#include <linux/mm.h>
#include <linux/page-isolation.h>
#include <linux/pageblock-flags.h>
@@ -115,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long pfn;
+ unsigned long pfn, flags;
struct page *page;
+ struct zone *zone;
+ int ret;
pfn = start_pfn;
/*
@@ -132,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
if (pfn < end_pfn)
return -EBUSY;
/* Check all pages are free or Marked as ISOLATED */
- if (__test_page_isolated_in_pageblock(start_pfn, end_pfn))
- return 0;
- return -EBUSY;
+ zone = page_zone(pfn_to_page(pfn));
+ spin_lock_irqsave(&zone->lock, flags);
+ ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret ? 0 : -EBUSY;
}
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 3f703f7..8dbb680 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
static unsigned long max_pages(unsigned long min_pages)
{
unsigned long node_free_pages, max;
- struct zone *zones = NODE_DATA(numa_node_id())->node_zones;
+ int node = numa_node_id();
+ struct zone *zones = NODE_DATA(node)->node_zones;
+ int num_cpus_on_node;
+ node_to_cpumask_ptr(cpumask_on_node, node);
node_free_pages =
#ifdef CONFIG_ZONE_DMA
@@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages)
zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
max = node_free_pages / FRACTION_OF_NODE_MEM;
+
+ num_cpus_on_node = cpus_weight_nr(*cpumask_on_node);
+ max /= num_cpus_on_node;
+
return max(max, min_pages);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 1ea4e6f..0383acf 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -224,10 +224,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
/*
* Check that @page is mapped at @address into @mm.
*
+ * If @sync is false, page_check_address may perform a racy check to avoid
+ * the page table lock when the pte is not present (helpful when reclaiming
+ * highly shared pages).
+ *
* On success returns with pte mapped and locked.
*/
pte_t *page_check_address(struct page *page, struct mm_struct *mm,
- unsigned long address, spinlock_t **ptlp)
+ unsigned long address, spinlock_t **ptlp, int sync)
{
pgd_t *pgd;
pud_t *pud;
@@ -249,7 +253,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
pte = pte_offset_map(pmd, address);
/* Make a quick check before getting the lock */
- if (!pte_present(*pte)) {
+ if (!sync && !pte_present(*pte)) {
pte_unmap(pte);
return NULL;
}
@@ -281,7 +285,7 @@ static int page_referenced_one(struct page *page,
if (address == -EFAULT)
goto out;
- pte = page_check_address(page, mm, address, &ptl);
+ pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
@@ -450,7 +454,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
if (address == -EFAULT)
goto out;
- pte = page_check_address(page, mm, address, &ptl);
+ pte = page_check_address(page, mm, address, &ptl, 1);
if (!pte)
goto out;
@@ -659,23 +663,30 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
}
/*
- * It would be tidy to reset the PageAnon mapping here,
- * but that might overwrite a racing page_add_anon_rmap
- * which increments mapcount after us but sets mapping
- * before us: so leave the reset to free_hot_cold_page,
- * and remember that it's only reliable while mapped.
- * Leaving it set also helps swapoff to reinstate ptes
- * faster for those pages still in swapcache.
+ * Now that the last pte has gone, s390 must transfer dirty
+ * flag from storage key to struct page. We can usually skip
+ * this if the page is anon, so about to be freed; but perhaps
+ * not if it's in swapcache - there might be another pte slot
+ * containing the swap entry, but page not yet written to swap.
*/
if ((!PageAnon(page) || PageSwapCache(page)) &&
page_test_dirty(page)) {
page_clear_dirty(page);
set_page_dirty(page);
}
- mem_cgroup_uncharge_page(page);
+ mem_cgroup_uncharge_page(page);
__dec_zone_page_state(page,
- PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
+ PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
+ /*
+ * It would be tidy to reset the PageAnon mapping here,
+ * but that might overwrite a racing page_add_anon_rmap
+ * which increments mapcount after us but sets mapping
+ * before us: so leave the reset to free_hot_cold_page,
+ * and remember that it's only reliable while mapped.
+ * Leaving it set also helps swapoff to reinstate ptes
+ * faster for those pages still in swapcache.
+ */
}
}
@@ -697,7 +708,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (address == -EFAULT)
goto out;
- pte = page_check_address(page, mm, address, &ptl);
+ pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
diff --git a/mm/shmem.c b/mm/shmem.c
index 04fb4f1..bf66d01 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -50,14 +50,12 @@
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
+#include <linux/magic.h>
#include <asm/uaccess.h>
#include <asm/div64.h>
#include <asm/pgtable.h>
-/* This magic number is used in glibc for posix shared memory */
-#define TMPFS_MAGIC 0x01021994
-
#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
diff --git a/mm/slab.c b/mm/slab.c
index 918f04f..e76eee4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4472,4 +4472,3 @@ size_t ksize(const void *objp)
return obj_size(virt_to_cache(objp));
}
-EXPORT_SYMBOL(ksize);
diff --git a/mm/slob.c b/mm/slob.c
index d8fbd4d..cb675d1 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -514,12 +514,13 @@ size_t ksize(const void *block)
return 0;
sp = (struct slob_page *)virt_to_page(block);
- if (slob_page(sp))
- return ((slob_t *)block - 1)->units + SLOB_UNIT;
- else
+ if (slob_page(sp)) {
+ int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ unsigned int *m = (unsigned int *)(block - align);
+ return SLOB_UNITS(*m) * SLOB_UNIT;
+ } else
return sp->page.private;
}
-EXPORT_SYMBOL(ksize);
struct kmem_cache {
unsigned int size, align;
diff --git a/mm/slub.c b/mm/slub.c
index b7e2cd5..0c83e6a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1329,7 +1329,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
n = get_node(s, zone_to_nid(zone));
if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
- n->nr_partial > MIN_PARTIAL) {
+ n->nr_partial > n->min_partial) {
page = get_partial_node(n);
if (page)
return page;
@@ -1381,7 +1381,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
slab_unlock(page);
} else {
stat(c, DEACTIVATE_EMPTY);
- if (n->nr_partial < MIN_PARTIAL) {
+ if (n->nr_partial < n->min_partial) {
/*
* Adding an empty slab to the partial slabs in order
* to avoid page allocator overhead. This slab needs
@@ -1913,13 +1913,26 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
#endif
}
-static void init_kmem_cache_node(struct kmem_cache_node *n)
+static void
+init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
{
n->nr_partial = 0;
+
+ /*
+ * The larger the object size is, the more pages we want on the partial
+ * list to avoid pounding the page allocator excessively.
+ */
+ n->min_partial = ilog2(s->size);
+ if (n->min_partial < MIN_PARTIAL)
+ n->min_partial = MIN_PARTIAL;
+ else if (n->min_partial > MAX_PARTIAL)
+ n->min_partial = MAX_PARTIAL;
+
spin_lock_init(&n->list_lock);
INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
atomic_long_set(&n->nr_slabs, 0);
+ atomic_long_set(&n->total_objects, 0);
INIT_LIST_HEAD(&n->full);
#endif
}
@@ -2087,7 +2100,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
init_object(kmalloc_caches, n, 1);
init_tracking(kmalloc_caches, n);
#endif
- init_kmem_cache_node(n);
+ init_kmem_cache_node(n, kmalloc_caches);
inc_slabs_node(kmalloc_caches, node, page->objects);
/*
@@ -2144,7 +2157,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
}
s->node[node] = n;
- init_kmem_cache_node(n);
+ init_kmem_cache_node(n, s);
}
return 1;
}
@@ -2155,7 +2168,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
{
- init_kmem_cache_node(&s->local_node);
+ init_kmem_cache_node(&s->local_node, s);
return 1;
}
#endif
@@ -2300,7 +2313,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
s->refcount = 1;
#ifdef CONFIG_NUMA
- s->remote_node_defrag_ratio = 100;
+ s->remote_node_defrag_ratio = 1000;
#endif
if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
goto error;
@@ -2715,7 +2728,6 @@ size_t ksize(const void *object)
*/
return s->size;
}
-EXPORT_SYMBOL(ksize);
void kfree(const void *x)
{
@@ -2890,7 +2902,7 @@ static int slab_mem_going_online_callback(void *arg)
ret = -ENOMEM;
goto out;
}
- init_kmem_cache_node(n);
+ init_kmem_cache_node(n, s);
s->node[nid] = n;
}
out:
@@ -4047,7 +4059,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
if (err)
return err;
- if (ratio < 100)
+ if (ratio <= 100)
s->remote_node_defrag_ratio = ratio * 10;
return length;
diff --git a/mm/sparse.c b/mm/sparse.c
index 5d9dbbb..39db301 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -12,7 +12,6 @@
#include <asm/dma.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
-#include "internal.h"
/*
* Permanent SPARSEMEM data:
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 167cf2d..797c383 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -60,7 +60,7 @@ void show_swap_cache_info(void)
printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
swap_cache_info.add_total, swap_cache_info.del_total,
swap_cache_info.find_success, swap_cache_info.find_total);
- printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
+ printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10));
printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index ae532f5..8d7a27a 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -65,31 +65,31 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
if (!dentry)
goto put_memory;
+ error = -ENFILE;
+ file = get_empty_filp();
+ if (!file)
+ goto put_dentry;
+
error = -ENOSPC;
inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
if (!inode)
- goto put_dentry;
+ goto close_file;
d_instantiate(dentry, inode);
- error = -ENFILE;
- file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
- &ramfs_file_operations);
- if (!file)
- goto put_dentry;
-
+ inode->i_size = size;
inode->i_nlink = 0; /* It is unlinked */
+ init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+ &ramfs_file_operations);
- /* notify everyone as to the change of file size */
- error = do_truncate(dentry, size, 0, file);
- if (error < 0)
+#ifndef CONFIG_MMU
+ error = ramfs_nommu_expand_for_mapping(inode, size);
+ if (error)
goto close_file;
-
+#endif
return file;
close_file:
put_filp(file);
- return ERR_PTR(error);
-
put_dentry:
dput(dentry);
put_memory:
diff --git a/mm/truncate.c b/mm/truncate.c
index 2505050..6650c1d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -380,7 +380,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
- * Returns -EIO if any pages could not be invalidated.
+ * Returns -EBUSY if any pages could not be invalidated.
*/
int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
@@ -440,7 +440,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
ret2 = do_launder_page(mapping, page);
if (ret2 == 0) {
if (!invalidate_complete_page2(mapping, page))
- ret2 = -EIO;
+ ret2 = -EBUSY;
}
if (ret2 < 0)
ret = ret2;
diff --git a/mm/util.c b/mm/util.c
index 9341ca7..cb00b74 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -171,3 +171,18 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
mm->unmap_area = arch_unmap_area;
}
#endif
+
+int __attribute__((weak)) get_user_pages_fast(unsigned long start,
+ int nr_pages, int write, struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ int ret;
+
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages(current, mm, start, nr_pages,
+ write, 0, pages, NULL);
+ up_read(&mm->mmap_sem);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(get_user_pages_fast);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 85b9a0d..bba06c4 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -180,6 +180,13 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
pmd_t *pmd;
pte_t *ptep, pte;
+ /*
+ * XXX we might need to change this if we add VIRTUAL_BUG_ON for
+ * architectures that do not vmalloc module space
+ */
+ VIRTUAL_BUG_ON(!is_vmalloc_addr(vmalloc_addr) &&
+ !is_module_address(addr));
+
if (!pgd_none(*pgd)) {
pud = pud_offset(pgd, addr);
if (!pud_none(*pud)) {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b0d08e6..d7826af 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -516,9 +516,26 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
continue;
page = pfn_to_page(pfn);
+#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES
+ /*
+ * Ordinarily, memory holes in flatmem still have a valid
+ * memmap for the PFN range. However, an architecture for
+ * embedded systems (e.g. ARM) can free up the memmap backing
+ * holes to save memory on the assumption the memmap is
+ * never used. The page_zone linkages are then broken even
+ * though pfn_valid() returns true. Skip the page if the
+ * linkages are broken. Even if this test passed, the impact
+ * is that the counters for the movable type are off but
+ * fragmentation monitoring is likely meaningless on small
+ * systems.
+ */
+ if (page_zone(page) != zone)
+ continue;
+#endif
mtype = get_pageblock_migratetype(page);
- count[mtype]++;
+ if (mtype < MIGRATE_TYPES)
+ count[mtype]++;
}
/* Print counts */