aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/allocpercpu.c2
-rw-r--r--mm/backing-dev.c12
-rw-r--r--mm/bootmem.c6
-rw-r--r--mm/filemap.c7
-rw-r--r--mm/hugetlb.c2
-rw-r--r--mm/memory.c111
-rw-r--r--mm/memory_hotplug.c86
-rw-r--r--mm/mempolicy.c6
-rw-r--r--mm/migrate.c12
-rw-r--r--mm/mmap.c12
-rw-r--r--mm/mprotect.c11
-rw-r--r--mm/nommu.c21
-rw-r--r--mm/page_alloc.c46
-rw-r--r--mm/pagewalk.c42
-rw-r--r--mm/pdflush.c4
-rw-r--r--mm/slab.c5
-rw-r--r--mm/slob.c5
-rw-r--r--mm/slub.c69
-rw-r--r--mm/sparse-vmemmap.c2
-rw-r--r--mm/swap.c4
-rw-r--r--mm/vmscan.c2
-rw-r--r--mm/vmstat.c2
22 files changed, 301 insertions, 168 deletions
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index f4026ba..05f2b40 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -1,7 +1,7 @@
/*
* linux/mm/allocpercpu.c
*
- * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com>
+ * Separated from slab.c August 11, 2006 Christoph Lameter
*/
#include <linux/mm.h>
#include <linux/module.h>
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7c4f9e0..f2e574d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -172,30 +172,22 @@ postcore_initcall(bdi_class_init);
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
- char *name;
va_list args;
int ret = 0;
struct device *dev;
va_start(args, fmt);
- name = kvasprintf(GFP_KERNEL, fmt, args);
+ dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
va_end(args);
-
- if (!name)
- return -ENOMEM;
-
- dev = device_create(bdi_class, parent, MKDEV(0, 0), name);
if (IS_ERR(dev)) {
ret = PTR_ERR(dev);
goto exit;
}
bdi->dev = dev;
- dev_set_drvdata(bdi->dev, bdi);
- bdi_debug_register(bdi, name);
+ bdi_debug_register(bdi, dev_name(dev));
exit:
- kfree(name);
return ret;
}
EXPORT_SYMBOL(bdi_register);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index e8fb927..8d9f60e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -442,15 +442,17 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
return init_bootmem_core(pgdat, freepfn, startpfn, endpfn);
}
-void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size, int flags)
{
int ret;
ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
if (ret < 0)
- return;
+ return -ENOMEM;
reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
+
+ return 0;
}
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
diff --git a/mm/filemap.c b/mm/filemap.c
index 239d361..1e6a7d3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1461,6 +1461,11 @@ page_not_uptodate:
*/
ClearPageError(page);
error = mapping->a_ops->readpage(file, page);
+ if (!error) {
+ wait_on_page_locked(page);
+ if (!PageUptodate(page))
+ error = -EIO;
+ }
page_cache_release(page);
if (!error || error == AOP_TRUNCATED_PAGE)
@@ -1655,7 +1660,7 @@ int should_remove_suid(struct dentry *dentry)
}
EXPORT_SYMBOL(should_remove_suid);
-int __remove_suid(struct dentry *dentry, int kill)
+static int __remove_suid(struct dentry *dentry, int kill)
{
struct iattr newattrs;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bbf953e..ab17127 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -785,7 +785,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
continue;
spin_lock(&dst->page_table_lock);
- spin_lock(&src->page_table_lock);
+ spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
if (!huge_pte_none(huge_ptep_get(src_pte))) {
if (cow)
huge_ptep_set_wrprotect(src, addr, src_pte);
diff --git a/mm/memory.c b/mm/memory.c
index bbab1e3..2302d22 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -311,6 +311,21 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
if (!new)
return -ENOMEM;
+ /*
+ * Ensure all pte setup (eg. pte page lock and page clearing) are
+ * visible before the pte is made visible to other CPUs by being
+ * put into page tables.
+ *
+ * The other side of the story is the pointer chasing in the page
+ * table walking code (when walking the page table without locking;
+ * ie. most of the time). Fortunately, these data accesses consist
+ * of a chain of data-dependent loads, meaning most CPUs (alpha
+ * being the notable exception) will already guarantee loads are
+ * seen in-order. See the alpha page table accessors for the
+ * smp_read_barrier_depends() barriers in page table walking code.
+ */
+ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
+
spin_lock(&mm->page_table_lock);
if (!pmd_present(*pmd)) { /* Has another populated it ? */
mm->nr_ptes++;
@@ -329,6 +344,8 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
if (!new)
return -ENOMEM;
+ smp_wmb(); /* See comment in __pte_alloc */
+
spin_lock(&init_mm.page_table_lock);
if (!pmd_present(*pmd)) { /* Has another populated it ? */
pmd_populate_kernel(&init_mm, pmd, new);
@@ -969,7 +986,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
goto no_page_table;
pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+ if (pmd_none(*pmd))
goto no_page_table;
if (pmd_huge(*pmd)) {
@@ -978,18 +995,19 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
goto out;
}
+ if (unlikely(pmd_bad(*pmd)))
+ goto no_page_table;
+
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (!ptep)
- goto out;
pte = *ptep;
if (!pte_present(pte))
- goto unlock;
+ goto no_page;
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
page = vm_normal_page(vma, address, pte);
if (unlikely(!page))
- goto unlock;
+ goto bad_page;
if (flags & FOLL_GET)
get_page(page);
@@ -1004,6 +1022,15 @@ unlock:
out:
return page;
+bad_page:
+ pte_unmap_unlock(ptep, ptl);
+ return ERR_PTR(-EFAULT);
+
+no_page:
+ pte_unmap_unlock(ptep, ptl);
+ if (!pte_none(pte))
+ return page;
+ /* Fall through to ZERO_PAGE handling */
no_page_table:
/*
* When core dumping an enormous anonymous area that nobody
@@ -1018,6 +1045,26 @@ no_page_table:
return page;
}
+/* Can we do the FOLL_ANON optimization? */
+static inline int use_zero_page(struct vm_area_struct *vma)
+{
+ /*
+ * We don't want to optimize FOLL_ANON for make_pages_present()
+ * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
+ * we want to get the page from the page tables to make sure
+ * that we serialize and update with any other user of that
+ * mapping.
+ */
+ if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
+ return 0;
+ /*
+ * And if we have a fault or a nopfn routine, it's not an
+ * anonymous region.
+ */
+ return !vma->vm_ops ||
+ (!vma->vm_ops->fault && !vma->vm_ops->nopfn);
+}
+
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
@@ -1092,8 +1139,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
foll_flags = FOLL_TOUCH;
if (pages)
foll_flags |= FOLL_GET;
- if (!write && !(vma->vm_flags & VM_LOCKED) &&
- (!vma->vm_ops || !vma->vm_ops->fault))
+ if (!write && use_zero_page(vma))
foll_flags |= FOLL_ANON;
do {
@@ -1105,7 +1151,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
* be processed until returning to user space.
*/
if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
- return -ENOMEM;
+ return i ? i : -ENOMEM;
if (write)
foll_flags |= FOLL_WRITE;
@@ -1139,6 +1185,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
cond_resched();
}
+ if (IS_ERR(page))
+ return i ? i : PTR_ERR(page);
if (pages) {
pages[i] = page;
@@ -1649,8 +1697,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *dirty_page = NULL;
old_page = vm_normal_page(vma, address, orig_pte);
- if (!old_page)
+ if (!old_page) {
+ /*
+ * VM_MIXEDMAP !pfn_valid() case
+ *
+ * We should not cow pages in a shared writeable mapping.
+ * Just mark the pages writable as we can't do any dirty
+ * accounting on raw pfn maps.
+ */
+ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))
+ goto reuse;
goto gotten;
+ }
/*
* Take out anonymous pages first, anonymous shared vmas are
@@ -1703,6 +1762,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
if (reuse) {
+reuse:
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -1737,7 +1797,6 @@ gotten:
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
- page_remove_rmap(old_page, vma);
if (!PageAnon(old_page)) {
dec_mm_counter(mm, file_rss);
inc_mm_counter(mm, anon_rss);
@@ -1759,6 +1818,32 @@ gotten:
lru_cache_add_active(new_page);
page_add_new_anon_rmap(new_page, vma, address);
+ if (old_page) {
+ /*
+ * Only after switching the pte to the new page may
+ * we remove the mapcount here. Otherwise another
+ * process may come and find the rmap count decremented
+ * before the pte is switched to the new page, and
+ * "reuse" the old page writing into it while our pte
+ * here still points into it and can be read by other
+ * threads.
+ *
+ * The critical issue is to order this
+ * page_remove_rmap with the ptp_clear_flush above.
+ * Those stores are ordered by (if nothing else,)
+ * the barrier present in the atomic_add_negative
+ * in page_remove_rmap.
+ *
+ * Then the TLB flush in ptep_clear_flush ensures that
+ * no process can access the old page before the
+ * decremented mapcount is visible. And the old page
+ * cannot be reused until after the decremented
+ * mapcount is visible. So transitively, TLBs to
+ * old page will be flushed before it can be reused.
+ */
+ page_remove_rmap(old_page, vma);
+ }
+
/* Free the old page.. */
new_page = old_page;
ret |= VM_FAULT_WRITE;
@@ -2275,8 +2360,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vmf.flags = flags;
vmf.page = NULL;
- BUG_ON(vma->vm_flags & VM_PFNMAP);
-
ret = vma->vm_ops->fault(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
@@ -2616,6 +2699,8 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
if (!new)
return -ENOMEM;
+ smp_wmb(); /* See comment in __pte_alloc */
+
spin_lock(&mm->page_table_lock);
if (pgd_present(*pgd)) /* Another has populated it */
pud_free(mm, new);
@@ -2637,6 +2722,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
if (!new)
return -ENOMEM;
+ smp_wmb(); /* See comment in __pte_alloc */
+
spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_4LEVEL_HACK
if (pud_present(*pud)) /* Another has populated it */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b17dca7..833f854 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -159,21 +159,58 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long old_zone_end_pfn;
+
+ zone_span_writelock(zone);
+
+ old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ if (start_pfn < zone->zone_start_pfn)
+ zone->zone_start_pfn = start_pfn;
+
+ zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
+ zone->zone_start_pfn;
+
+ zone_span_writeunlock(zone);
+}
+
+static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long old_pgdat_end_pfn =
+ pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+ if (start_pfn < pgdat->node_start_pfn)
+ pgdat->node_start_pfn = start_pfn;
+
+ pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
+ pgdat->node_start_pfn;
+}
+
static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nr_pages = PAGES_PER_SECTION;
int nid = pgdat->node_id;
int zone_type;
+ unsigned long flags;
zone_type = zone - pgdat->node_zones;
if (!zone->wait_table) {
- int ret = 0;
+ int ret;
+
ret = init_currently_empty_zone(zone, phys_start_pfn,
nr_pages, MEMMAP_HOTPLUG);
- if (ret < 0)
+ if (ret)
return ret;
}
+ pgdat_resize_lock(zone->zone_pgdat, &flags);
+ grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
+ grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
+ phys_start_pfn + nr_pages);
+ pgdat_resize_unlock(zone->zone_pgdat, &flags);
memmap_init_zone(nr_pages, nid, zone_type,
phys_start_pfn, MEMMAP_HOTPLUG);
return 0;
@@ -299,36 +336,6 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
}
EXPORT_SYMBOL_GPL(__remove_pages);
-static void grow_zone_span(struct zone *zone,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned long old_zone_end_pfn;
-
- zone_span_writelock(zone);
-
- old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
- if (start_pfn < zone->zone_start_pfn)
- zone->zone_start_pfn = start_pfn;
-
- zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
- zone->zone_start_pfn;
-
- zone_span_writeunlock(zone);
-}
-
-static void grow_pgdat_span(struct pglist_data *pgdat,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned long old_pgdat_end_pfn =
- pgdat->node_start_pfn + pgdat->node_spanned_pages;
-
- if (start_pfn < pgdat->node_start_pfn)
- pgdat->node_start_pfn = start_pfn;
-
- pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
- pgdat->node_start_pfn;
-}
-
void online_page(struct page *page)
{
totalram_pages++;
@@ -367,7 +374,6 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
int online_pages(unsigned long pfn, unsigned long nr_pages)
{
- unsigned long flags;
unsigned long onlined_pages = 0;
struct zone *zone;
int need_zonelists_rebuild = 0;
@@ -395,11 +401,6 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
* memory_block->state_mutex.
*/
zone = page_zone(pfn_to_page(pfn));
- pgdat_resize_lock(zone->zone_pgdat, &flags);
- grow_zone_span(zone, pfn, pfn + nr_pages);
- grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
-
/*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
@@ -408,8 +409,15 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
if (!populated_zone(zone))
need_zonelists_rebuild = 1;
- walk_memory_resource(pfn, nr_pages, &onlined_pages,
+ ret = walk_memory_resource(pfn, nr_pages, &onlined_pages,
online_pages_range);
+ if (ret) {
+ printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
+ nr_pages, pfn);
+ memory_notify(MEM_CANCEL_ONLINE, &arg);
+ return ret;
+ }
+
zone->present_pages += onlined_pages;
zone->zone_pgdat->node_present_pages += onlined_pages;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a37a503..c94e58b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -729,7 +729,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
} else {
*policy = pol == &default_policy ? MPOL_DEFAULT :
pol->mode;
- *policy |= pol->flags;
+ /*
+ * Internal mempolicy flags must be masked off before exposing
+ * the policy to userspace.
+ */
+ *policy |= (pol->flags & MPOL_MODE_FLAGS);
}
if (vma) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 449d77d..55bd355 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -9,7 +9,7 @@
* IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
* Hirokazu Takahashi <taka@valinux.co.jp>
* Dave Hansen <haveblue@us.ibm.com>
- * Christoph Lameter <clameter@sgi.com>
+ * Christoph Lameter
*/
#include <linux/migrate.h>
@@ -865,6 +865,11 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
goto set_status;
page = follow_page(vma, pp->addr, FOLL_GET);
+
+ err = PTR_ERR(page);
+ if (IS_ERR(page))
+ goto set_status;
+
err = -ENOENT;
if (!page)
goto set_status;
@@ -928,6 +933,11 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
goto set_status;
page = follow_page(vma, pm->addr, 0);
+
+ err = PTR_ERR(page);
+ if (IS_ERR(page))
+ goto set_status;
+
err = -ENOENT;
/* Use PageReserved to check for zero page */
if (!page || PageReserved(page))
diff --git a/mm/mmap.c b/mm/mmap.c
index fac6633..3354fdd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-atomic_t vm_committed_space = ATOMIC_INIT(0);
+atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
/*
* Check that a process has enough memory to allocate a new virtual
@@ -177,7 +177,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
* cast `allowed' as a signed long because vm_committed_space
* sometimes has a negative value
*/
- if (atomic_read(&vm_committed_space) < (long)allowed)
+ if (atomic_long_read(&vm_committed_space) < (long)allowed)
return 0;
error:
vm_unacct_memory(pages);
@@ -245,10 +245,16 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
unsigned long rlim, retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
+ unsigned long min_brk;
down_write(&mm->mmap_sem);
- if (brk < mm->start_brk)
+#ifdef CONFIG_COMPAT_BRK
+ min_brk = mm->end_code;
+#else
+ min_brk = mm->start_brk;
+#endif
+ if (brk < min_brk)
goto out;
/*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4de5468..a5bf31c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -26,6 +26,13 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#ifndef pgprot_modify
+static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+{
+ return newprot;
+}
+#endif
+
static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
@@ -192,7 +199,9 @@ success:
* held in write mode.
*/
vma->vm_flags = newflags;
- vma->vm_page_prot = vm_get_page_prot(newflags);
+ vma->vm_page_prot = pgprot_modify(vma->vm_page_prot,
+ vm_get_page_prot(newflags));
+
if (vma_wants_writenotify(vma)) {
vma->vm_page_prot = vm_get_page_prot(newflags & ~VM_SHARED);
dirty_accountable = 1;
diff --git a/mm/nommu.c b/mm/nommu.c
index ef8c62c..4462b6a 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -39,7 +39,7 @@ struct page *mem_map;
unsigned long max_mapnr;
unsigned long num_physpages;
unsigned long askedalloc, realalloc;
-atomic_t vm_committed_space = ATOMIC_INIT(0);
+atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
@@ -109,16 +109,23 @@ unsigned int kobjsize(const void *objp)
* If the object we have should not have ksize performed on it,
* return size of 0
*/
- if (!objp || (unsigned long)objp >= memory_end || !((page = virt_to_page(objp))))
+ if (!objp || !virt_addr_valid(objp))
return 0;
+ page = virt_to_head_page(objp);
+
+ /*
+ * If the allocator sets PageSlab, we know the pointer came from
+ * kmalloc().
+ */
if (PageSlab(page))
return ksize(objp);
- BUG_ON(page->index < 0);
- BUG_ON(page->index >= MAX_ORDER);
-
- return (PAGE_SIZE << page->index);
+ /*
+ * The ksize() function is only guaranteed to work for pointers
+ * returned by kmalloc(). So handle arbitrary pointers here.
+ */
+ return PAGE_SIZE << compound_order(page);
}
/*
@@ -1410,7 +1417,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
* cast `allowed' as a signed long because vm_committed_space
* sometimes has a negative value
*/
- if (atomic_read(&vm_committed_space) < (long)allowed)
+ if (atomic_long_read(&vm_committed_space) < (long)allowed)
return 0;
error:
vm_unacct_memory(pages);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bdd5c43..f32fae3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -237,16 +237,7 @@ static void bad_page(struct page *page)
printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
KERN_EMERG "Backtrace:\n");
dump_stack();
- page->flags &= ~(1 << PG_lru |
- 1 << PG_private |
- 1 << PG_locked |
- 1 << PG_active |
- 1 << PG_dirty |
- 1 << PG_reclaim |
- 1 << PG_slab |
- 1 << PG_swapcache |
- 1 << PG_writeback |
- 1 << PG_buddy );
+ page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD;
set_page_count(page, 0);
reset_page_mapcount(page);
page->mapping = NULL;
@@ -463,16 +454,7 @@ static inline int free_pages_check(struct page *page)
(page->mapping != NULL) |
(page_get_page_cgroup(page) != NULL) |
(page_count(page) != 0) |
- (page->flags & (
- 1 << PG_lru |
- 1 << PG_private |
- 1 << PG_locked |
- 1 << PG_active |
- 1 << PG_slab |
- 1 << PG_swapcache |
- 1 << PG_writeback |
- 1 << PG_reserved |
- 1 << PG_buddy ))))
+ (page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
bad_page(page);
if (PageDirty(page))
__ClearPageDirty(page);
@@ -616,17 +598,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
(page->mapping != NULL) |
(page_get_page_cgroup(page) != NULL) |
(page_count(page) != 0) |
- (page->flags & (
- 1 << PG_lru |
- 1 << PG_private |
- 1 << PG_locked |
- 1 << PG_active |
- 1 << PG_dirty |
- 1 << PG_slab |
- 1 << PG_swapcache |
- 1 << PG_writeback |
- 1 << PG_reserved |
- 1 << PG_buddy ))))
+ (page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
bad_page(page);
/*
@@ -1396,6 +1368,9 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
(void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
&preferred_zone);
+ if (!preferred_zone)
+ return NULL;
+
classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
@@ -2353,7 +2328,6 @@ static void build_zonelists(pg_data_t *pgdat)
static void build_zonelist_cache(pg_data_t *pgdat)
{
pgdat->node_zonelists[0].zlcache_ptr = NULL;
- pgdat->node_zonelists[1].zlcache_ptr = NULL;
}
#endif /* CONFIG_NUMA */
@@ -2804,7 +2778,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
alloc_size = zone->wait_table_hash_nr_entries
* sizeof(wait_queue_head_t);
- if (system_state == SYSTEM_BOOTING) {
+ if (!slab_is_available()) {
zone->wait_table = (wait_queue_head_t *)
alloc_bootmem_node(pgdat, alloc_size);
} else {
@@ -2862,8 +2836,6 @@ __meminit int init_currently_empty_zone(struct zone *zone,
zone->zone_start_pfn = zone_start_pfn;
- memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
-
zone_init_free_lists(zone);
return 0;
@@ -3380,7 +3352,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
- memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
+ memmap_pages =
+ PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
if (realsize >= memmap_pages) {
realsize -= memmap_pages;
printk(KERN_DEBUG
@@ -3433,6 +3406,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
ret = init_currently_empty_zone(zone, zone_start_pfn,
size, MEMMAP_EARLY);
BUG_ON(ret);
+ memmap_init(size, nid, j, zone_start_pfn);
zone_start_pfn += size;
}
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 0afd238..d5878be 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -3,14 +3,14 @@
#include <linux/sched.h>
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
- const struct mm_walk *walk, void *private)
+ struct mm_walk *walk)
{
pte_t *pte;
int err = 0;
pte = pte_offset_map(pmd, addr);
for (;;) {
- err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private);
+ err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
if (err)
break;
addr += PAGE_SIZE;
@@ -24,7 +24,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
- const struct mm_walk *walk, void *private)
+ struct mm_walk *walk)
{
pmd_t *pmd;
unsigned long next;
@@ -35,15 +35,15 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd)) {
if (walk->pte_hole)
- err = walk->pte_hole(addr, next, private);
+ err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
if (walk->pmd_entry)
- err = walk->pmd_entry(pmd, addr, next, private);
+ err = walk->pmd_entry(pmd, addr, next, walk);
if (!err && walk->pte_entry)
- err = walk_pte_range(pmd, addr, next, walk, private);
+ err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
} while (pmd++, addr = next, addr != end);
@@ -52,7 +52,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
}
static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
- const struct mm_walk *walk, void *private)
+ struct mm_walk *walk)
{
pud_t *pud;
unsigned long next;
@@ -63,15 +63,15 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud)) {
if (walk->pte_hole)
- err = walk->pte_hole(addr, next, private);
+ err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
if (walk->pud_entry)
- err = walk->pud_entry(pud, addr, next, private);
+ err = walk->pud_entry(pud, addr, next, walk);
if (!err && (walk->pmd_entry || walk->pte_entry))
- err = walk_pmd_range(pud, addr, next, walk, private);
+ err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);
@@ -85,15 +85,15 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
* @addr: starting address
* @end: ending address
* @walk: set of callbacks to invoke for each level of the tree
- * @private: private data passed to the callback function
*
* Recursively walk the page table for the memory area in a VMA,
* calling supplied callbacks. Callbacks are called in-order (first
* PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
* etc.). If lower-level callbacks are omitted, walking depth is reduced.
*
- * Each callback receives an entry pointer, the start and end of the
- * associated range, and a caller-supplied private data pointer.
+ * Each callback receives an entry pointer and the start and end of the
+ * associated range, and a copy of the original mm_walk for access to
+ * the ->private or ->mm fields.
*
* No locks are taken, but the bottom level iterator will map PTE
* directories from highmem if necessary.
@@ -101,9 +101,8 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
* If any callback returns a non-zero value, the walk is aborted and
* the return value is propagated back to the caller. Otherwise 0 is returned.
*/
-int walk_page_range(const struct mm_struct *mm,
- unsigned long addr, unsigned long end,
- const struct mm_walk *walk, void *private)
+int walk_page_range(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
pgd_t *pgd;
unsigned long next;
@@ -112,21 +111,24 @@ int walk_page_range(const struct mm_struct *mm,
if (addr >= end)
return err;
- pgd = pgd_offset(mm, addr);
+ if (!walk->mm)
+ return -EINVAL;
+
+ pgd = pgd_offset(walk->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) {
if (walk->pte_hole)
- err = walk->pte_hole(addr, next, private);
+ err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
if (walk->pgd_entry)
- err = walk->pgd_entry(pgd, addr, next, private);
+ err = walk->pgd_entry(pgd, addr, next, walk);
if (!err &&
(walk->pud_entry || walk->pmd_entry || walk->pte_entry))
- err = walk_pud_range(pgd, addr, next, walk, private);
+ err = walk_pud_range(pgd, addr, next, walk);
if (err)
break;
} while (pgd++, addr = next, addr != end);
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 1c96cfc..9d834aa 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -207,7 +207,6 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
spin_lock_irqsave(&pdflush_lock, flags);
if (list_empty(&pdflush_list)) {
- spin_unlock_irqrestore(&pdflush_lock, flags);
ret = -1;
} else {
struct pdflush_work *pdf;
@@ -219,8 +218,9 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
pdf->fn = fn;
pdf->arg0 = arg0;
wake_up_process(pdf->who);
- spin_unlock_irqrestore(&pdflush_lock, flags);
}
+ spin_unlock_irqrestore(&pdflush_lock, flags);
+
return ret;
}
diff --git a/mm/slab.c b/mm/slab.c
index 06236e4..046607f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3263,9 +3263,12 @@ retry:
if (cpuset_zone_allowed_hardwall(zone, flags) &&
cache->nodelists[nid] &&
- cache->nodelists[nid]->free_objects)
+ cache->nodelists[nid]->free_objects) {
obj = ____cache_alloc_node(cache,
flags | GFP_THISNODE, nid);
+ if (obj)
+ break;
+ }
}
if (!obj) {
diff --git a/mm/slob.c b/mm/slob.c
index 6038cba..a3ad667 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -469,8 +469,9 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
return ZERO_SIZE_PTR;
m = slob_alloc(size + align, gfp, align, node);
- if (m)
- *m = size;
+ if (!m)
+ return NULL;
+ *m = size;
return (void *)m + align;
} else {
void *ret;
diff --git a/mm/slub.c b/mm/slub.c
index 32b6262..315c392 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5,7 +5,7 @@
* The allocator synchronizes using per slab locks and only
* uses a centralized lock to manage a pool of partial slabs.
*
- * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
+ * (C) 2007 SGI, Christoph Lameter
*/
#include <linux/mm.h>
@@ -217,7 +217,7 @@ struct track {
enum track_item { TRACK_ALLOC, TRACK_FREE };
-#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
+#ifdef CONFIG_SLUB_DEBUG
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);
@@ -814,7 +814,8 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
return search == NULL;
}
-static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc)
+static void trace(struct kmem_cache *s, struct page *page, void *object,
+ int alloc)
{
if (s->flags & SLAB_TRACE) {
printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
@@ -1267,8 +1268,7 @@ static void add_partial(struct kmem_cache_node *n,
spin_unlock(&n->list_lock);
}
-static void remove_partial(struct kmem_cache *s,
- struct page *page)
+static void remove_partial(struct kmem_cache *s, struct page *page)
{
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
@@ -1283,7 +1283,8 @@ static void remove_partial(struct kmem_cache *s,
*
* Must hold list_lock.
*/
-static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page)
+static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
+ struct page *page)
{
if (slab_trylock(page)) {
list_del(&page->lru);
@@ -1420,8 +1421,8 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
* so that the others get filled first. That way the
* size of the partial list stays small.
*
- * kmem_cache_shrink can reclaim any empty slabs from the
- * partial list.
+ * kmem_cache_shrink can reclaim any empty slabs from
+ * the partial list.
*/
add_partial(n, page, 1);
slab_unlock(page);
@@ -1627,9 +1628,11 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
void **object;
struct kmem_cache_cpu *c;
unsigned long flags;
+ unsigned int objsize;
local_irq_save(flags);
c = get_cpu_slab(s, smp_processor_id());
+ objsize = c->objsize;
if (unlikely(!c->freelist || !node_match(c, node)))
object = __slab_alloc(s, gfpflags, node, addr, c);
@@ -1642,7 +1645,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
local_irq_restore(flags);
if (unlikely((gfpflags & __GFP_ZERO) && object))
- memset(object, 0, c->objsize);
+ memset(object, 0, objsize);
return object;
}
@@ -2725,9 +2728,10 @@ size_t ksize(const void *object)
page = virt_to_head_page(object);
- if (unlikely(!PageSlab(page)))
+ if (unlikely(!PageSlab(page))) {
+ WARN_ON(!PageCompound(page));
return PAGE_SIZE << compound_order(page);
-
+ }
s = page->slab;
#ifdef CONFIG_SLUB_DEBUG
@@ -2909,7 +2913,7 @@ static int slab_mem_going_online_callback(void *arg)
return 0;
/*
- * We are bringing a node online. No memory is availabe yet. We must
+ * We are bringing a node online. No memory is available yet. We must
* allocate a kmem_cache_node structure in order to bring the node
* online.
*/
@@ -2993,8 +2997,6 @@ void __init kmem_cache_init(void)
create_kmalloc_cache(&kmalloc_caches[1],
"kmalloc-96", 96, GFP_KERNEL);
caches++;
- }
- if (KMALLOC_MIN_SIZE <= 128) {
create_kmalloc_cache(&kmalloc_caches[2],
"kmalloc-192", 192, GFP_KERNEL);
caches++;
@@ -3024,6 +3026,16 @@ void __init kmem_cache_init(void)
for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
+ if (KMALLOC_MIN_SIZE == 128) {
+ /*
+ * The 192 byte sized cache is not used if the alignment
+ * is 128 byte. Redirect kmalloc to use the 256 byte cache
+ * instead.
+ */
+ for (i = 128 + 8; i <= 192; i += 8)
+ size_index[(i - 1) / 8] = 8;
+ }
+
slab_state = UP;
/* Provide the correct kmalloc names now that the caches are up */
@@ -3246,7 +3258,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
return slab_alloc(s, gfpflags, node, caller);
}
-#if (defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)) || defined(CONFIG_SLABINFO)
+#ifdef CONFIG_SLUB_DEBUG
static unsigned long count_partial(struct kmem_cache_node *n,
int (*get_count)(struct page *))
{
@@ -3275,9 +3287,7 @@ static int count_free(struct page *page)
{
return page->objects - page->inuse;
}
-#endif
-#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
static int validate_slab(struct kmem_cache *s, struct page *page,
unsigned long *map)
{
@@ -3763,7 +3773,7 @@ static int any_slab_objects(struct kmem_cache *s)
if (!n)
continue;
- if (atomic_read(&n->total_objects))
+ if (atomic_long_read(&n->total_objects))
return 1;
}
return 0;
@@ -3812,7 +3822,12 @@ SLAB_ATTR_RO(objs_per_slab);
static ssize_t order_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- int order = simple_strtoul(buf, NULL, 10);
+ unsigned long order;
+ int err;
+
+ err = strict_strtoul(buf, 10, &order);
+ if (err)
+ return err;
if (order > slub_max_order || order < slub_min_order)
return -EINVAL;
@@ -4065,10 +4080,16 @@ static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- int n = simple_strtoul(buf, NULL, 10);
+ unsigned long ratio;
+ int err;
+
+ err = strict_strtoul(buf, 10, &ratio);
+ if (err)
+ return err;
+
+ if (ratio < 100)
+ s->remote_node_defrag_ratio = ratio * 10;
- if (n < 100)
- s->remote_node_defrag_ratio = n * 10;
return length;
}
SLAB_ATTR(remote_node_defrag_ratio);
@@ -4425,8 +4446,8 @@ __initcall(slab_sysfs_init);
*/
#ifdef CONFIG_SLABINFO
-ssize_t slabinfo_write(struct file *file, const char __user * buffer,
- size_t count, loff_t *ppos)
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
{
return -EINVAL;
}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 99c4f36..a91b5f8 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -1,7 +1,7 @@
/*
* Virtual Memory Map support
*
- * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>.
+ * (C) 2007 sgi. Christoph Lameter.
*
* Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
* virt_to_page, page_address() to be implemented as a base offset
diff --git a/mm/swap.c b/mm/swap.c
index 91e1944..45c9f25 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -503,7 +503,7 @@ void vm_acct_memory(long pages)
local = &__get_cpu_var(committed_space);
*local += pages;
if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
- atomic_add(*local, &vm_committed_space);
+ atomic_long_add(*local, &vm_committed_space);
*local = 0;
}
preempt_enable();
@@ -520,7 +520,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
committed = &per_cpu(committed_space, (long)hcpu);
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- atomic_add(*committed, &vm_committed_space);
+ atomic_long_add(*committed, &vm_committed_space);
*committed = 0;
drain_cpu_pagevecs((long)hcpu);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9a29901..967d30c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1307,7 +1307,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
int priority;
- int ret = 0;
+ unsigned long ret = 0;
unsigned long total_scanned = 0;
unsigned long nr_reclaimed = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1a32130..db9eabb 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -41,7 +41,9 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
*/
void all_vm_events(unsigned long *ret)
{
+ get_online_cpus();
sum_vm_events(ret, &cpu_online_map);
+ put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);