Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c | 500
1 file changed, 330 insertions(+), 170 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ab98dc6..99082fa 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -95,6 +95,8 @@ struct scan_control {
/* Can pages be swapped as part of reclaim? */
int may_swap;
+ int swappiness;
+
int order;
/*
@@ -171,8 +173,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
struct scan_control *sc, enum lru_list lru)
{
if (!scanning_global_lru(sc))
- return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
- zone_to_nid(zone), zone_idx(zone), BIT(lru));
+ return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru);
return zone_page_state(zone, NR_LRU_BASE + lru);
}
@@ -183,7 +184,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
*/
void register_shrinker(struct shrinker *shrinker)
{
- atomic_long_set(&shrinker->nr_in_batch, 0);
+ shrinker->nr = 0;
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem);
@@ -252,8 +253,6 @@ unsigned long shrink_slab(struct shrink_control *shrink,
int shrink_ret = 0;
long nr;
long new_nr;
- long batch_size = shrinker->batch ? shrinker->batch
- : SHRINK_BATCH;
max_pass = do_shrinker_shrink(shrinker, shrink, 0);
if (max_pass <= 0)
@@ -264,7 +263,9 @@ unsigned long shrink_slab(struct shrink_control *shrink,
* and zero it so that other concurrent shrinker invocations
* don't also do this scanning work.
*/
- nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
+ do {
+ nr = shrinker->nr;
+ } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
total_scan = nr;
delta = (4 * nr_pages_scanned) / shrinker->seeks;
@@ -305,18 +306,19 @@ unsigned long shrink_slab(struct shrink_control *shrink,
nr_pages_scanned, lru_pages,
max_pass, delta, total_scan);
- while (total_scan >= batch_size) {
+ while (total_scan >= SHRINK_BATCH) {
+ long this_scan = SHRINK_BATCH;
int nr_before;
nr_before = do_shrinker_shrink(shrinker, shrink, 0);
shrink_ret = do_shrinker_shrink(shrinker, shrink,
- batch_size);
+ this_scan);
if (shrink_ret == -1)
break;
if (shrink_ret < nr_before)
ret += nr_before - shrink_ret;
- count_vm_events(SLABS_SCANNED, batch_size);
- total_scan -= batch_size;
+ count_vm_events(SLABS_SCANNED, this_scan);
+ total_scan -= this_scan;
cond_resched();
}
@@ -326,11 +328,12 @@ unsigned long shrink_slab(struct shrink_control *shrink,
* manner that handles concurrent updates. If we exhausted the
* scan, there is no need to do an update.
*/
- if (total_scan > 0)
- new_nr = atomic_long_add_return(total_scan,
- &shrinker->nr_in_batch);
- else
- new_nr = atomic_long_read(&shrinker->nr_in_batch);
+ do {
+ nr = shrinker->nr;
+ new_nr = total_scan + nr;
+ if (total_scan <= 0)
+ break;
+ } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
}
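
The hunk above replaces the atomic_long_xchg()/atomic_long_add_return() helpers with open-coded cmpxchg() loops, but the pattern is the same grab-and-zero / add-back scheme: one shrinker invocation claims the whole deferred count (so concurrent invocations do not rescan the same work) and later adds back whatever it did not get to. Below is a minimal userspace sketch of that pattern using C11 atomics; all names are illustrative and none of this is kernel code.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long deferred;			/* stands in for shrinker->nr */

/* Claim the whole deferred count and zero it, like the first cmpxchg loop. */
static long grab_deferred(void)
{
	long nr = atomic_load(&deferred);

	while (!atomic_compare_exchange_weak(&deferred, &nr, 0))
		;				/* nr is reloaded on failure */
	return nr;
}

/* Return the work we did not do, like the second cmpxchg loop. */
static void give_back(long unscanned)
{
	long nr = atomic_load(&deferred);

	while (unscanned > 0 &&
	       !atomic_compare_exchange_weak(&deferred, &nr, nr + unscanned))
		;
}

int main(void)
{
	atomic_store(&deferred, 128);
	long nr = grab_deferred();		/* nr == 128, deferred == 0 */
	give_back(nr - 100);			/* pretend 100 objects were scanned */
	printf("deferred now %ld\n", atomic_load(&deferred));
	return 0;
}
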
@@ -495,6 +498,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
return PAGE_ACTIVATE;
}
+ /*
+ * Wait on writeback if requested to. This happens when
+ * direct reclaiming a large contiguous area and the
+ * first attempt to free a range of pages fails.
+ */
+ if (PageWriteback(page) &&
+ (sc->reclaim_mode & RECLAIM_MODE_SYNC))
+ wait_on_page_writeback(page);
+
if (!PageWriteback(page)) {
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
@@ -633,14 +645,13 @@ redo:
lru = LRU_UNEVICTABLE;
add_page_to_unevictable_list(page);
/*
- * When racing with an mlock or AS_UNEVICTABLE clearing
- * (page is unlocked) make sure that if the other thread
- * does not observe our setting of PG_lru and fails
- * isolation/check_move_unevictable_pages,
- * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
+ * When racing with an mlock clearing (page is
+ * unlocked), make sure that if the other thread does
+ * not observe our setting of PG_lru and fails
+ * isolation, we see PG_mlocked cleared below and move
* the page back to the evictable list.
*
- * The other side is TestClearPageMlocked() or shmem_lock().
+ * The other side is TestClearPageMlocked().
*/
smp_mb();
}
@@ -715,15 +726,19 @@ static enum page_references page_check_references(struct page *page,
*/
SetPageReferenced(page);
+#ifndef CONFIG_DMA_CMA
+ if (referenced_page)
+ return PAGEREF_ACTIVATE;
+#else
if (referenced_page || referenced_ptes > 1)
return PAGEREF_ACTIVATE;
/*
* Activate file-backed executable pages after first usage.
- */
+ */
if (vm_flags & VM_EXEC)
return PAGEREF_ACTIVATE;
-
+#endif
return PAGEREF_KEEP;
}
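
For readers comparing the two sides of the #ifdef above (which applies to a file-backed page that already has at least one young pte): the non-CONFIG_DMA_CMA vendor branch activates only pages that also carried PG_referenced, while the CONFIG_DMA_CMA branch keeps the upstream heuristics of activating on a second pte reference or on VM_EXEC mappings. A condensed, standalone restatement follows; it is a sketch, not the kernel's page_check_references().

enum page_references { PAGEREF_KEEP, PAGEREF_ACTIVATE };

/* Policy for a file-backed page with referenced_ptes > 0; dma_cma selects
 * which branch of the #ifdef above is compiled in. */
enum page_references referenced_file_page_policy(int referenced_page,
						 int referenced_ptes,
						 int vm_exec, int dma_cma)
{
	if (!dma_cma)
		/* vendor default: only a prior PG_referenced mark activates */
		return referenced_page ? PAGEREF_ACTIVATE : PAGEREF_KEEP;

	/* CONFIG_DMA_CMA keeps the upstream checks */
	if (referenced_page || referenced_ptes > 1)
		return PAGEREF_ACTIVATE;
	if (vm_exec)		/* executable file pages: activate after first use */
		return PAGEREF_ACTIVATE;
	return PAGEREF_KEEP;
}
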
@@ -755,12 +770,12 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
/*
* shrink_page_list() returns the number of reclaimed pages
*/
-static unsigned long shrink_page_list(struct list_head *page_list,
+#ifndef CONFIG_ZRAM_FOR_ANDROID
+static
+#endif /* CONFIG_ZRAM_FOR_ANDROID */
+unsigned long shrink_page_list(struct list_head *page_list,
struct zone *zone,
- struct scan_control *sc,
- int priority,
- unsigned long *ret_nr_dirty,
- unsigned long *ret_nr_writeback)
+ struct scan_control *sc)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
@@ -768,7 +783,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
unsigned long nr_dirty = 0;
unsigned long nr_congested = 0;
unsigned long nr_reclaimed = 0;
- unsigned long nr_writeback = 0;
cond_resched();
@@ -805,12 +819,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
if (PageWriteback(page)) {
- nr_writeback++;
/*
- * Synchronous reclaim cannot queue pages for
- * writeback due to the possibility of stack overflow
- * but if it encounters a page under writeback, wait
- * for the IO to complete.
+ * Synchronous reclaim is performed in two passes,
+ * first an asynchronous pass over the list to
+ * start parallel writeback, and a second synchronous
+ * pass to wait for the IO to complete. Wait here
+ * for any page for which writeback has already
+ * started.
*/
if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
may_enter_fs)
@@ -866,25 +881,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (PageDirty(page)) {
nr_dirty++;
- /*
- * Only kswapd can writeback filesystem pages to
- * avoid risk of stack overflow but do not writeback
- * unless under significant pressure.
- */
- if (page_is_file_cache(page) &&
- (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
- /*
- * Immediately reclaim when written back.
- * Similar in principal to deactivate_page()
- * except we already have the page isolated
- * and know it's dirty
- */
- inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
- SetPageReclaim(page);
-
- goto keep_locked;
- }
-
if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
if (!may_enter_fs)
@@ -1019,8 +1015,6 @@ keep_lumpy:
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
- *ret_nr_dirty += nr_dirty;
- *ret_nr_writeback += nr_writeback;
return nr_reclaimed;
}
@@ -1063,8 +1057,12 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
* unevictable; only give shrink_page_list evictable pages.
*/
if (PageUnevictable(page))
+#ifndef CONFIG_DMA_CMA
return ret;
-
+#else
+ printk(KERN_ERR "%s[%d] Unevictable page %p\n",
+ __func__, __LINE__, page);
+#endif
ret = -EBUSY;
/*
@@ -1281,7 +1279,10 @@ static unsigned long isolate_pages_global(unsigned long nr,
* clear_active_flags() is a helper for shrink_active_list(), clearing
* any active bits from the pages in the list.
*/
-static unsigned long clear_active_flags(struct list_head *page_list,
+#ifndef CONFIG_ZRAM_FOR_ANDROID
+static
+#endif /* CONFIG_ZRAM_FOR_ANDROID */
+unsigned long clear_active_flags(struct list_head *page_list,
unsigned int *count)
{
int nr_active = 0;
@@ -1351,6 +1352,40 @@ int isolate_lru_page(struct page *page)
return ret;
}
+#ifdef CONFIG_ZRAM_FOR_ANDROID
+/**
+ * isolate_lru_page_compcache - tries to isolate a page for compcache
+ * @page: page to isolate from its LRU list
+ *
+ * Isolates a @page from an LRU list and clears PageLRU, but
+ * does not adjust the vmstat statistics.
+ * Returns 0 if the page was removed from an LRU list.
+ * Returns -EBUSY if the page was not on an LRU list.
+ */
+int isolate_lru_page_compcache(struct page *page)
+{
+ int ret = -EBUSY;
+
+ VM_BUG_ON(!page_count(page));
+
+ if (PageLRU(page)) {
+ struct zone *zone = page_zone(page);
+
+ spin_lock_irq(&zone->lru_lock);
+ if (PageLRU(page)) {
+ int lru = page_lru(page);
+ ret = 0;
+ get_page(page);
+ ClearPageLRU(page);
+ list_del(&page->lru);
+ mem_cgroup_del_lru_list(page, lru);
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ }
+ return ret;
+}
+#endif
+
/*
* Are there way too many processes in the direct reclaim path already?
*/
@@ -1458,7 +1493,7 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
}
/*
- * Returns true if a direct reclaim should wait on pages under writeback.
+ * Returns true if the caller should wait to clean dirty/writeback pages.
*
* If we are direct reclaiming for contiguous pages and we do not reclaim
* everything in the list, try again and wait for writeback IO to complete.
@@ -1480,7 +1515,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
return false;
- /* If we have reclaimed everything on the isolated list, no stall */
+ /* If we have reclaimed everything on the isolated list, no stall */
if (nr_freed == nr_taken)
return false;
@@ -1512,8 +1547,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
unsigned long nr_taken;
unsigned long nr_anon;
unsigned long nr_file;
- unsigned long nr_dirty = 0;
- unsigned long nr_writeback = 0;
isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
while (unlikely(too_many_isolated(zone, file, sc))) {
@@ -1566,14 +1599,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
spin_unlock_irq(&zone->lru_lock);
- nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
- &nr_dirty, &nr_writeback);
+ nr_reclaimed = shrink_page_list(&page_list, zone, sc);
/* Check if we should syncronously wait for writeback */
if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
set_reclaim_mode(priority, sc, true);
- nr_reclaimed += shrink_page_list(&page_list, zone, sc,
- priority, &nr_dirty, &nr_writeback);
+ nr_reclaimed += shrink_page_list(&page_list, zone, sc);
}
local_irq_disable();
@@ -1583,32 +1614,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
- /*
- * If reclaim is isolating dirty pages under writeback, it implies
- * that the long-lived page allocation rate is exceeding the page
- * laundering rate. Either the global limits are not being effective
- * at throttling processes due to the page distribution throughout
- * zones or there is heavy usage of a slow backing device. The
- * only option is to throttle from reclaim context which is not ideal
- * as there is no guarantee the dirtying process is throttled in the
- * same way balance_dirty_pages() manages.
- *
- * This scales the number of dirty pages that must be under writeback
- * before throttling depending on priority. It is a simple backoff
- * function that has the most effect in the range DEF_PRIORITY to
- * DEF_PRIORITY-2 which is the priority reclaim is considered to be
- * in trouble and reclaim is considered to be in trouble.
- *
- * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
- * DEF_PRIORITY-1 50% must be PageWriteback
- * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
- * ...
- * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
- * isolated page is PageWriteback
- */
- if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
- wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
-
trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
zone_idx(zone),
nr_scanned, nr_reclaimed,
@@ -1617,6 +1622,44 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
return nr_reclaimed;
}
+#ifdef CONFIG_ZRAM_FOR_ANDROID
+unsigned long
+zone_id_shrink_pagelist(struct zone *zone, struct list_head *page_list)
+{
+ unsigned long nr_reclaimed = 0;
+ unsigned long nr_anon;
+ unsigned long nr_file;
+
+ struct scan_control sc = {
+ .gfp_mask = GFP_USER,
+ .may_writepage = 1,
+ .nr_to_reclaim = SWAP_CLUSTER_MAX,
+ .may_unmap = 1,
+ .may_swap = 1,
+ .swappiness = vm_swappiness,
+ .order = 0,
+ .mem_cgroup = NULL,
+ .nodemask = NULL,
+ };
+
+ spin_lock_irq(&zone->lru_lock);
+
+ update_isolated_counts(zone, &sc, &nr_anon, &nr_file, page_list);
+
+ spin_unlock_irq(&zone->lru_lock);
+
+ nr_reclaimed = shrink_page_list(page_list, zone, &sc);
+
+ __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
+
+ putback_lru_pages(zone, &sc, nr_anon, nr_file, page_list);
+
+ return nr_reclaimed;
+}
+
+EXPORT_SYMBOL(zone_id_shrink_pagelist);
+#endif /* CONFIG_ZRAM_FOR_ANDROID */
+
/*
* This moves pages from the active list to the inactive list.
*
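
The CONFIG_ZRAM_FOR_ANDROID block added above exports zone_id_shrink_pagelist() (and, earlier in this patch, isolate_lru_page_compcache()) so that a compcache/zram-style driver can pull specific pages off the LRU and push them through shrink_page_list(). The following is only a hedged sketch of how such a caller might combine the two, assuming kernel context and that both declarations are visible (e.g. via linux/swap.h in this tree); it is not taken from any real driver.

#include <linux/mm.h>
#include <linux/swap.h>

/* Illustrative only: isolate one page and feed it to the exported helper.
 * The caller is assumed to hold a reference on @page. */
static unsigned long example_compcache_reclaim(struct page *page)
{
	struct zone *zone = page_zone(page);
	LIST_HEAD(page_list);

	if (isolate_lru_page_compcache(page))
		return 0;	/* page was not on an LRU list */

	list_add(&page->lru, &page_list);

	/* Frees what it can and puts the remainder back on the LRU. */
	return zone_id_shrink_pagelist(zone, &page_list);
}
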
@@ -1804,7 +1847,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
if (scanning_global_lru(sc))
low = inactive_anon_is_low_global(zone);
else
- low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
+ low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
return low;
}
#else
@@ -1847,7 +1890,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
if (scanning_global_lru(sc))
low = inactive_file_is_low_global(zone);
else
- low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
+ low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
return low;
}
@@ -1874,13 +1917,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
}
-static int vmscan_swappiness(struct scan_control *sc)
-{
- if (scanning_global_lru(sc))
- return vm_swappiness;
- return mem_cgroup_swappiness(sc->mem_cgroup);
-}
-
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned. The relative value of each set of LRU lists is determined
@@ -1900,20 +1936,13 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
enum lru_list l;
int noswap = 0;
bool force_scan = false;
+ unsigned long nr_force_scan[2];
- /*
- * If the zone or memcg is small, nr[l] can be 0. This
- * results in no scanning on this priority and a potential
- * priority drop. Global direct reclaim can go to the next
- * zone and tends to have no problems. Global kswapd is for
- * zone balancing and it needs to scan a minimum amount. When
- * reclaiming for a memcg, a priority drop can cause high
- * latencies, so it's better to scan a minimum amount there as
- * well.
- */
+ /* kswapd does zone balancing and needs to scan this zone */
if (scanning_global_lru(sc) && current_is_kswapd() &&
zone->all_unreclaimable)
force_scan = true;
+ /* memcg may have a small limit and needs to avoid priority drops */
if (!scanning_global_lru(sc))
force_scan = true;
@@ -1923,6 +1952,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
fraction[0] = 0;
fraction[1] = 1;
denominator = 1;
+ nr_force_scan[0] = 0;
+ nr_force_scan[1] = SWAP_CLUSTER_MAX;
goto out;
}
@@ -1939,6 +1970,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
fraction[0] = 1;
fraction[1] = 0;
denominator = 1;
+ nr_force_scan[0] = SWAP_CLUSTER_MAX;
+ nr_force_scan[1] = 0;
goto out;
}
}
@@ -1947,8 +1980,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
* With swappiness at 100, anonymous and file have the same priority.
* This scanning priority is essentially the inverse of IO cost.
*/
- anon_prio = vmscan_swappiness(sc);
- file_prio = 200 - vmscan_swappiness(sc);
+ anon_prio = sc->swappiness;
+ file_prio = 200 - sc->swappiness;
/*
* OK, so we have swap space and a fair amount of page cache
@@ -1977,28 +2010,43 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
* proportional to the fraction of recently scanned pages on
* each list that were recently referenced and in active use.
*/
- ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
+ ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
ap /= reclaim_stat->recent_rotated[0] + 1;
- fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
+ fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
spin_unlock_irq(&zone->lru_lock);
fraction[0] = ap;
fraction[1] = fp;
denominator = ap + fp + 1;
+ if (force_scan) {
+ unsigned long scan = SWAP_CLUSTER_MAX;
+ nr_force_scan[0] = div64_u64(scan * ap, denominator);
+ nr_force_scan[1] = div64_u64(scan * fp, denominator);
+ }
out:
for_each_evictable_lru(l) {
int file = is_file_lru(l);
unsigned long scan;
scan = zone_nr_lru_pages(zone, sc, l);
- if (priority || noswap || !vmscan_swappiness(sc)) {
+ if (priority || noswap) {
scan >>= priority;
- if (!scan && force_scan)
- scan = SWAP_CLUSTER_MAX;
scan = div64_u64(scan * fraction[file], denominator);
}
+
+ /*
+ * If the zone or the memcg is small, nr[l] can be 0, which means
+ * no scanning at this priority and a priority drop. Global direct
+ * reclaim can visit the next zone and tends not to have problems.
+ * Global kswapd does zone balancing and needs to scan a minimum
+ * amount. When reclaiming for a memcg, a priority drop can cause
+ * high latency, so it's better to scan a minimum amount there as
+ * well. See force_scan above.
+ */
+ if (!scan && force_scan)
+ scan = nr_force_scan[file];
nr[l] = scan;
}
}
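
To make the proportional split in get_scan_count() concrete, the standalone program below mirrors the arithmetic of the hunk above: swappiness weights the anon list against the file list, and recent_rotated deflates a list whose scanned pages keep getting re-referenced. All input numbers are invented for illustration and the variables are local to the sketch.

#include <stdio.h>

int main(void)
{
	/* Inputs: all values are made up for illustration. */
	unsigned long long swappiness = 60;		/* sc->swappiness */
	unsigned long long anon_prio  = swappiness;
	unsigned long long file_prio  = 200 - swappiness;

	unsigned long long recent_scanned[2] = { 1000, 4000 };	/* [0]=anon, [1]=file */
	unsigned long long recent_rotated[2] = {  800,  200 };

	/* Same arithmetic as the hunk above: weight by swappiness, deflate
	 * lists whose scanned pages were mostly rotated back. */
	unsigned long long ap = (anon_prio + 1) * (recent_scanned[0] + 1)
				/ (recent_rotated[0] + 1);
	unsigned long long fp = (file_prio + 1) * (recent_scanned[1] + 1)
				/ (recent_rotated[1] + 1);
	unsigned long long denominator = ap + fp + 1;

	unsigned long long lru_pages = 1ULL << 20;	/* pages on one LRU */
	unsigned int priority = 12;			/* DEF_PRIORITY */
	unsigned long long scan = lru_pages >> priority;

	printf("anon: scan %llu of %llu\n", scan * ap / denominator, scan);
	printf("file: scan %llu of %llu\n", scan * fp / denominator, scan);
	return 0;
}
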
@@ -2339,8 +2387,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
*/
writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
if (total_scanned > writeback_threshold) {
- wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
- WB_REASON_TRY_TO_FREE_PAGES);
+ wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
sc->may_writepage = 1;
}
@@ -2391,6 +2438,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_unmap = 1,
.may_swap = 1,
+ .swappiness = vm_swappiness,
.order = order,
.mem_cgroup = NULL,
.nodemask = nodemask,
@@ -2414,6 +2462,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
+ unsigned int swappiness,
struct zone *zone,
unsigned long *nr_scanned)
{
@@ -2423,6 +2472,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
+ .swappiness = swappiness,
.order = 0,
.mem_cgroup = mem,
};
@@ -2451,7 +2501,8 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
gfp_t gfp_mask,
- bool noswap)
+ bool noswap,
+ unsigned int swappiness)
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
@@ -2461,6 +2512,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
.may_unmap = 1,
.may_swap = !noswap,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
+ .swappiness = swappiness,
.order = 0,
.mem_cgroup = mem_cont,
.nodemask = NULL, /* we don't care the placement */
@@ -2611,6 +2663,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
* we want to put equal scanning pressure on each zone.
*/
.nr_to_reclaim = ULONG_MAX,
+ .swappiness = vm_swappiness,
.order = order,
.mem_cgroup = NULL,
};
@@ -2852,8 +2905,6 @@ out:
/* If balanced, clear the congested flag */
zone_clear_flag(zone, ZONE_CONGESTED);
- if (i <= *classzone_idx)
- balanced += zone->present_pages;
}
}
@@ -3016,10 +3067,7 @@ static int kswapd(void *p)
}
}
- tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
current->reclaim_state = NULL;
- lockdep_clear_current_reclaim_state();
-
return 0;
}
@@ -3098,11 +3146,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
struct reclaim_state reclaim_state;
struct scan_control sc = {
.gfp_mask = GFP_HIGHUSER_MOVABLE,
+#if defined(CONFIG_SLP) && defined(CONFIG_FULL_PAGE_RECLAIM)
+ .may_swap = 0,
+#else
.may_swap = 1,
+#endif
.may_unmap = 1,
.may_writepage = 1,
.nr_to_reclaim = nr_to_reclaim,
.hibernation_mode = 1,
+ .swappiness = vm_swappiness,
.order = 0,
};
struct shrink_control shrink = {
@@ -3292,6 +3345,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.nr_to_reclaim = max_t(unsigned long, nr_pages,
SWAP_CLUSTER_MAX),
.gfp_mask = gfp_mask,
+ .swappiness = vm_swappiness,
.order = order,
};
struct shrink_control shrink = {
@@ -3440,66 +3494,158 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
return 1;
}
-#ifdef CONFIG_SHMEM
/**
- * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
- * @pages: array of pages to check
- * @nr_pages: number of pages to check
+ * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
+ * @page: page to check evictability and move to appropriate lru list
+ * @zone: zone page is in
*
- * Checks pages for evictability and moves them to the appropriate lru list.
+ * Checks a page for evictability and moves the page to the appropriate
+ * zone lru list.
*
- * This function is only used for SysV IPC SHM_UNLOCK.
+ * Restrictions: zone->lru_lock must be held, page must be on LRU and must
+ * have PageUnevictable set.
*/
-void check_move_unevictable_pages(struct page **pages, int nr_pages)
+static void check_move_unevictable_page(struct page *page, struct zone *zone)
{
- struct zone *zone = NULL;
- int pgscanned = 0;
- int pgrescued = 0;
- int i;
+ VM_BUG_ON(PageActive(page));
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pages[i];
- struct zone *pagezone;
+retry:
+ ClearPageUnevictable(page);
+ if (page_evictable(page, NULL)) {
+ enum lru_list l = page_lru_base_type(page);
- pgscanned++;
- pagezone = page_zone(page);
- if (pagezone != zone) {
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
- }
+ __dec_zone_state(zone, NR_UNEVICTABLE);
+ list_move(&page->lru, &zone->lru[l].list);
+ mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
+ __inc_zone_state(zone, NR_INACTIVE_ANON + l);
+ __count_vm_event(UNEVICTABLE_PGRESCUED);
+ } else {
+ /*
+ * rotate unevictable list
+ */
+ SetPageUnevictable(page);
+ list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
+ mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
+ if (page_evictable(page, NULL))
+ goto retry;
+ }
+}
- if (!PageLRU(page) || !PageUnevictable(page))
- continue;
+/**
+ * scan_mapping_unevictable_pages - scan an address space for evictable pages
+ * @mapping: struct address_space to scan for evictable pages
+ *
+ * Scan all pages in mapping. Check unevictable pages for
+ * evictability and move them to the appropriate zone lru list.
+ */
+void scan_mapping_unevictable_pages(struct address_space *mapping)
+{
+ pgoff_t next = 0;
+ pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ struct zone *zone;
+ struct pagevec pvec;
- if (page_evictable(page, NULL)) {
- enum lru_list lru = page_lru_base_type(page);
+ if (mapping->nrpages == 0)
+ return;
- VM_BUG_ON(PageActive(page));
- ClearPageUnevictable(page);
- __dec_zone_state(zone, NR_UNEVICTABLE);
- list_move(&page->lru, &zone->lru[lru].list);
- mem_cgroup_move_lists(page, LRU_UNEVICTABLE, lru);
- __inc_zone_state(zone, NR_INACTIVE_ANON + lru);
- pgrescued++;
+ pagevec_init(&pvec, 0);
+ while (next < end &&
+ pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ int i;
+ int pg_scanned = 0;
+
+ zone = NULL;
+
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+ pgoff_t page_index = page->index;
+ struct zone *pagezone = page_zone(page);
+
+ pg_scanned++;
+ if (page_index > next)
+ next = page_index;
+ next++;
+
+ if (pagezone != zone) {
+ if (zone)
+ spin_unlock_irq(&zone->lru_lock);
+ zone = pagezone;
+ spin_lock_irq(&zone->lru_lock);
+ }
+
+ if (PageLRU(page) && PageUnevictable(page))
+ check_move_unevictable_page(page, zone);
}
+ if (zone)
+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_release(&pvec);
+
+ count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
}
- if (zone) {
- __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
- __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
+}
+
+/**
+ * scan_zone_unevictable_pages - check unevictable list for evictable pages
+ * @zone - zone whose unevictable list is to be scanned
+ *
+ * Scan @zone's unevictable LRU lists to check for pages that have become
+ * evictable. Move those that have to @zone's inactive list where they
+ * become candidates for reclaim, unless shrink_inactive_zone() decides
+ * to reactivate them. Pages that are still unevictable are rotated
+ * back onto @zone's unevictable list.
+ */
+#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
+static void scan_zone_unevictable_pages(struct zone *zone)
+{
+ struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
+ unsigned long scan;
+ unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
+
+ while (nr_to_scan > 0) {
+ unsigned long batch_size = min(nr_to_scan,
+ SCAN_UNEVICTABLE_BATCH_SIZE);
+
+ spin_lock_irq(&zone->lru_lock);
+ for (scan = 0; scan < batch_size; scan++) {
+ struct page *page = lru_to_page(l_unevictable);
+
+ if (!trylock_page(page))
+ continue;
+
+ prefetchw_prev_lru_page(page, l_unevictable, flags);
+
+ if (likely(PageLRU(page) && PageUnevictable(page)))
+ check_move_unevictable_page(page, zone);
+
+ unlock_page(page);
+ }
spin_unlock_irq(&zone->lru_lock);
+
+ nr_to_scan -= batch_size;
}
}
-#endif /* CONFIG_SHMEM */
-static void warn_scan_unevictable_pages(void)
+
+/**
+ * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
+ *
+ * A really big hammer: scan all zones' unevictable LRU lists to check for
+ * pages that have become evictable. Move those back to the zones'
+ * inactive list where they become candidates for reclaim.
+ * This occurs when, e.g., we have unswappable pages on the unevictable lists,
+ * and we add swap to the system. As such, it runs in the context of a task
+ * that has possibly/probably made some previously unevictable pages
+ * evictable.
+ */
+static void scan_all_zones_unevictable_pages(void)
{
- printk_once(KERN_WARNING
- "The scan_unevictable_pages sysctl/node-interface has been "
- "disabled for lack of a legitimate use case. If you have "
- "one, please send an email to linux-mm@kvack.org.\n");
+ struct zone *zone;
+
+ for_each_zone(zone) {
+ scan_zone_unevictable_pages(zone);
+ }
}
/*
@@ -3512,8 +3658,11 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
void __user *buffer,
size_t *length, loff_t *ppos)
{
- warn_scan_unevictable_pages();
proc_doulongvec_minmax(table, write, buffer, length, ppos);
+
+ if (write && *(unsigned long *)table->data)
+ scan_all_zones_unevictable_pages();
+
scan_unevictable_pages = 0;
return 0;
}
@@ -3528,7 +3677,6 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev,
struct sysdev_attribute *attr,
char *buf)
{
- warn_scan_unevictable_pages();
return sprintf(buf, "0\n"); /* always zero; should fit... */
}
@@ -3536,7 +3684,19 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev,
struct sysdev_attribute *attr,
const char *buf, size_t count)
{
- warn_scan_unevictable_pages();
+ struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
+ struct zone *zone;
+ unsigned long res;
+ int err = strict_strtoul(buf, 10, &res);
+
+ if (err || !res)
+ return 1; /* parse error or zero is a no-op */
+
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (!populated_zone(zone))
+ continue;
+ scan_zone_unevictable_pages(zone);
+ }
return 1;
}